-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_data.py
196 lines (165 loc) · 6.49 KB
/
parse_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/env python
import urllib
import datetime
import re
import os
import sys
# Number of days in each month, indexed by month number (1-12). Index 0 is a
# placeholder so that CALENDAR[month] needs no offset. February is fixed at
# 28 days.
CALENDAR = [
    0,
    31, 28, 31, 30, 31, 30,
    31, 31, 30, 31, 30, 31,
]
# Exclusive upper bound of the page loop in run_app: range(1, PAGES).
PAGES = 2
# Command-line parameters, in order: year, month, day, output directory.
# The first three must parse as integers; run_app treats a day of 0 as
# "process every day of the month".
year, month, set_day = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])
target_dir = sys.argv[4]
# Photo ids and titles are extracted from the explore page source, one line
# of HTML at a time, with the two compiled patterns below.
#
# re_link matches "<owner>/<photo_id>/": the part of the href that follows
# "/photos/" on an anchor carrying data-track="thumb".
# NOTE(review): \w does not match "@" or "-", so owner path segments that
# contain those characters will not match -- confirm against real page source.
link_pattern = r'(?<=data-track="thumb" href="/photos/)(\w)*/(\w)*/'
# re_title matches the value of the first title="..." attribute on the line.
# The match stops at the attribute's own closing quote; a greedy "any
# character" match would run on to the last double quote on the line and
# swallow every attribute that follows the title.
title_pattern = r'(?<=title=")[^"]*(?=")'
re_link = re.compile(link_pattern)
re_title = re.compile(title_pattern)
# ****************************************
# Flickr API Params
# ****************************************
# Base URL and REST endpoint path; _doget concatenates them to build the
# request URL.
HOST = 'http://flickr.com'
API = '/services/rest'
# Credentials can be supplied through the FLICKR_API_KEY and
# FLICKR_API_SECRET environment variables. The literals are kept only as
# backward-compatible defaults.
# NOTE(review): credentials committed to source control should be treated as
# exposed -- consider rotating them and dropping the defaults.
API_KEY = os.environ.get("FLICKR_API_KEY", "e9c26bc8a4a20f2a15aab8ac659f25e2")
API_SECRET = os.environ.get("FLICKR_API_SECRET", "57940bdb914aa0a8")
# ***************************************
# Flickr API Helpers
# ***************************************
class Bag:
    """Plain attribute holder.

    unmarshal() creates instances of this class and attaches XML
    attributes, child elements and text to them with setattr().
    """
def safe_unicode(obj, *args):
    """Convert obj to a unicode string, tolerating undecodable bytes.

    Any extra positional arguments are forwarded unchanged to the Python 2
    built-in unicode(). If that call raises UnicodeDecodeError, obj is
    converted with str(), run through the 'string_escape' codec, and the
    escaped text is converted to unicode instead.
    """
    try:
        converted = unicode(obj, *args)
    except UnicodeDecodeError:
        escaped = str(obj).encode('string_escape')
        converted = unicode(escaped)
    return converted
#unmarshal taken and modified from pyamazon.py
#makes the xml easy to work with
from xml.dom import minidom
from urllib import urlencode, urlopen
def unmarshal(element):
    """Recursively convert a minidom node into a tree of Bag objects.

    XML attributes become attributes on the Bag (only when the node is a
    minidom.Element). Each child element is stored under its tag name:
    a single child is stored as a Bag, and a tag name that occurs more
    than once is stored as a list of Bags. A node without child elements
    gets its concatenated text nodes stored under ``text``.
    """
    bag = Bag()

    # Attributes are copied only from Element nodes.
    if isinstance(element, minidom.Element):
        attrs = element.attributes
        for name in attrs.keys():
            setattr(bag, name, attrs[name].value)

    children = [node for node in element.childNodes
                if isinstance(node, minidom.Element)]

    if not children:
        # The main content of the element is kept in .text; this clashes
        # with a child tag that is itself named <text>.
        pieces = [node.data for node in element.childNodes
                  if isinstance(node, minidom.Text)]
        setattr(bag, 'text', "".join(pieces))
        return bag

    for child in children:
        tag = child.tagName
        if hasattr(bag, tag):
            # Second or later occurrence of this name: promote the stored
            # value to a list (if it is not one already) and append.
            existing = getattr(bag, tag)
            if not isinstance(existing, list):
                existing = [existing]
            setattr(bag, tag, existing + [unmarshal(child)])
        elif tag == 'Details':
            # The first Details element is stored as a list straight away,
            # so a query with a single result still yields a list of
            # Details objects.
            setattr(bag, tag, [unmarshal(child)])
        else:
            setattr(bag, tag, unmarshal(child))
    return bag
class FlickrError(Exception):
    """Raised by _doget when the Flickr API reports a status other than 'ok'."""


def _doget(method, auth=False, **params):
    """Call a Flickr REST method with an HTTP GET and return the parsed reply.

    Parameters:
        method: Flickr API method name, e.g. 'flickr.photos.getInfo'.
        auth: when true, email/password are appended to the URL.
        **params: extra query parameters; list values are joined with ','.

    Returns:
        The response document converted by unmarshal() into Bag objects.

    Raises:
        FlickrError: if the ``stat`` attribute of <rsp> is not 'ok'.
    """
    #uncomment to check you aren't killing the flickr server
    #print "***** do get %s" % method

    # Convert lists to strings with ',' between items. Iterate over a
    # snapshot because values are reassigned inside the loop.
    for key, value in list(params.items()):
        if isinstance(value, list):
            params[key] = ','.join(value)

    url = '%s%s/?api_key=%s&method=%s&%s' % (
        HOST, API, API_KEY, method, urlencode(params))
    if auth:
        # NOTE(review): `email` and `password` are not defined in this
        # file, so this branch raises NameError unless they are provided
        # as module globals -- confirm before calling with auth=True.
        url = url + '&email=%s&password=%s' % (email, password)

    # Close the HTTP response once it has been parsed, even if parsing fails.
    response = urlopen(url)
    try:
        xml = minidom.parse(response)
    finally:
        response.close()

    data = unmarshal(xml)
    if data.rsp.stat != 'ok':
        msg = "ERROR [%s]: %s" % (data.rsp.err.code, data.rsp.err.msg)
        raise FlickrError(msg)
    return data
# ***************************************
# Flickr API Helpers
# ***************************************
def get_tags(id):
    """Return the tags of one Flickr photo as UTF-8 encoded strings.

    Calls flickr.photos.getInfo through _doget() for the photo id given
    and collects the text of every <tag> element. Each tag is also
    printed to stdout as it is read.

    Returns:
        A list of UTF-8 byte strings; empty when the photo has no tags.

    Raises:
        Whatever _doget() raises for a failed request.
    """
    data = _doget('flickr.photos.getInfo', photo_id=id)
    photo = data.rsp.photo
    if not hasattr(photo.tags, "tag"):
        return []

    tag_nodes = photo.tags.tag
    # unmarshal() stores a lone child element as a single Bag and only
    # switches to a list when the same tag name occurs again, so a photo
    # with exactly one tag has to be wrapped before iterating.
    if not isinstance(tag_nodes, list):
        tag_nodes = [tag_nodes]

    tags = []
    for tag in tag_nodes:
        utf8_text = safe_unicode(tag.text).encode('utf-8')
        # Print the encoded bytes rather than the unicode object: printing
        # unicode can raise UnicodeEncodeError for non-ASCII tags when
        # stdout has no encoding (e.g. when output is piped).
        print(utf8_text)
        tags.append(utf8_text)
    return tags
#response = urllib.urlopen(page).read()
#lines = response.split("\n")
#tags = []
#for line in lines:
# line = line.strip()
# if (line[:12] == "Y.photo.init"):
# tag_substr = line.split("\"tags\":[")
# if (len(tag_substr) > 1):
# tag_list_string = tag_substr[1].split("]")[0].strip("\{\}")
# split_to_tag_pairs = tag_list_string.split(",")
#
# for tag_pair in split_to_tag_pairs:
# if (len(tag_pair) > 1 and tag_pair[0] == '{'):
# stag_pair = tag_pair.split(":")
# if (stag_pair > 1):
# tag = stag_pair[1].strip("\"")
# tags.append(tag)
#return tags
def process_page(page, f):
    """Download one explore page and append a record per photo to f.

    Parameters:
        page: URL of the explore page to fetch.
        f: writable file object that receives the records.

    For every line of the page that matches re_link, the record written is:
    the matched link, the title (or ``__noname__``), one line per tag
    returned by get_tags(), and a blank line as separator.
    """
    # Read the whole page, then release the connection even if read() fails.
    response = urllib.urlopen(page)
    try:
        html = response.read()
    finally:
        response.close()

    for line in html.split("\n"):
        m_link = re_link.search(line)
        if not m_link:
            continue

        # The link has the form "<owner>/<photo_id>/"; the id is the second
        # path segment.
        link = m_link.group()
        photo_id = link.split("/")[1]
        # Fetch the tags before writing anything, so a failed lookup does
        # not leave a half-written record behind.
        tags = get_tags(photo_id)

        # The title attribute reads "<title> by <owner>"; keep the part
        # before the first " by " unless it is blank.
        title = "__noname__"
        m_title = re_title.search(line)
        if m_title:
            candidate = m_title.group().split(" by ")[0]
            if candidate.strip():
                title = candidate

        f.write(link + "\n")
        f.write(title + "\n")
        for tag in tags:
            f.write(tag + "\n")
        f.write("\n")
def _scrape_day(year, month, day, target_dir):
    """Write the explore listing for one date to its own text file."""
    output_file = "%s/explore_%d_%02d_%02d.txt" % (target_dir, year, month, day)
    # The with-block closes the file even when a page fails to process.
    with open(output_file, "w") as f:
        # NOTE(review): range(1, PAGES) stops before PAGES, so with
        # PAGES = 2 only page 1 is fetched -- confirm this is intended.
        for page in range(1, PAGES):
            current_page = "http://www.flickr.com/explore/interesting/%d/%02d/%02d/page%02d/" % (year, month, day, page)
            process_page(current_page, f)


def run_app(year, month, set_day, target_dir):
    """Scrape Flickr explore pages for one day or for a whole month.

    Parameters:
        year, month: the calendar month to scrape.
        set_day: day of the month to scrape, or 0 for every day of the month.
        target_dir: output directory; created if it does not exist.

    One file named explore_<year>_<month>_<day>.txt is written per day.

    Raises:
        ValueError: from calendar.monthrange when set_day is 0 and month is
            outside 1-12.
    """
    from calendar import monthrange

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    if set_day == 0:
        # Days run from 1 to the real length of the month, leap years
        # included. Counting from 0 would request an invalid "day 0" and
        # skip the last day.
        days_in_month = monthrange(year, month)[1]
        days = range(1, days_in_month + 1)
    else:
        days = [set_day]

    for day in days:
        _scrape_day(year, month, day, target_dir)
# Start the scrape only when the file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    run_app(year, month, set_day, target_dir)