# Main functions for this "geopull" project
# Description:
# Twitter collects a large amount of geocoding data volunteered
# from its users. I wanted to:
# 1. find out what type of data is collected
# 2. systematically collect some of that data without being detected.
# (this project mines potentially valuable/proprietary data)
# 3. examine how "complete" this data is - how close are the points
# from the queried location?
# 8-12 hours, approximately
# import standard packages
import csv, os, time, pickle, pprint, random
# import non-standard classes
import arcpy
from twitter import Twitter, OAuth
# import our classes
import points
# this program requires
# pyshp (shapefile) http://code.google.com/p/pyshp/
# Python Twitter Tools (twitter) http://mike.verdone.ca/twitter/
# and arcpy, which ships with Esri ArcGIS Desktop
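# (pyshp and twitter are most likely installable with pip, e.g.
# "pip install pyshp twitter" - an assumption; check the project pages above)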
# STATIC VARIABLES
SEED_POINT_FILE = 'seedpoints.csv'
POINT_SEQUENCE_DATA = 'seedpoints.save'
RETURNED_POINTS_FILE = 'returnpoints.csv'
RETURNED_POINTS_DATA = 'returnpoints.save'
SPATIAL_REF_ID = 4326 # WGS 84
RESULTS_NAME = 'resultpoints.shp'
SEEDS_NAME = 'querypoints.shp'
LINES_NAME = 'queryresultlines.shp'
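# output shapefiles: resultpoints.shp holds every returned point of interest,
# querypoints.shp holds the seed (query) points with per-point stats, and
# queryresultlines.shp connects each seed point to each of its results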
# twitter rate limiting window, in seconds.
TWITTER_TIME_BLOCK = 15*60
# twitter max requests per window
TWITTER_MAX_REQ = 15
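# the sleep in searchSaveLoop() paces requests at TWITTER_TIME_BLOCK /
# TWITTER_MAX_REQ = 60 seconds apart, plus up to 15 seconds of random jitter;
# at that pace a full 2000-point run needs roughly 33-42 hours of querying,
# which is why progress is saved and the script can be rerun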
# set these auth keys (from dev.twitter.com)
OAUTH_TOKEN = ''
OAUTH_SECRET = ''
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
# METHODS
def createSeedPoints(pts):
# generate and save random points to check in twitter
print 'creating ' + str(pts) + ' points...'
    # points.generatePoints() is a helper that generates 'pts' points
    # randomly distributed in x-y, using the shoreline shapefile
seed_points = points.generatePoints(pts, 'Shorelines/WGS84/shorelines')
print 'writing points to ' + SEED_POINT_FILE + '...'
with open(SEED_POINT_FILE, 'wb') as csvfile:
csvhandle = csv.writer(csvfile, dialect='excel')
csvhandle.writerow(['ID','long','lat'])
for p_i in range(pts):
csvhandle.writerow([p_i, seed_points[p_i][0], seed_points[p_i][1]])
    return [(p_i, seed_points[p_i][0], seed_points[p_i][1]) for p_i in range(pts)]
def getSeedPoints():
# load random points to check in twitter from CSV file
print 'reading points...'
seed_points = []
with open(SEED_POINT_FILE, 'r') as csvfile:
csvhandle = csv.reader(csvfile, dialect='excel')
csvhandle.next() # first row contains header
for row in csvhandle:
            # ID, long, lat
seed_points.append((int(row[0]), float(row[1]), float(row[2])))
return seed_points
def searchSaveLoop(sorted_points):
    # the main loop of our program: cycle through the points, querying
    # twitter for geodata at each one
# open a twitter object with authentication
t = Twitter(auth=OAuth(OAUTH_TOKEN, OAUTH_SECRET,
CONSUMER_KEY, CONSUMER_SECRET))
    # we expect to have to rerun this program several times to get through
    # all the points, so load previous progress here or create new save files
if os.path.exists(RETURNED_POINTS_FILE) and os.path.exists(RETURNED_POINTS_DATA):
print 'found previous feed saved, loading...'
call_dictionary = pickle.load(open(RETURNED_POINTS_DATA,'rb'))
csvhandle = csv.writer(open(RETURNED_POINTS_FILE,'ab'), dialect='excel')
else:
print 'found no feed, creating new save files...'
call_dictionary = {} # dictionary that contains subject and response for each API Query
csvhandle = csv.writer(open(RETURNED_POINTS_FILE,'wb'), dialect='excel')
csvhandle.writerow(['search_id','search_long','search_lat','street_address','full_name','name','twitter_id','place_type','url','long','lat'])
    # resume from the old position in the points list; points are processed
    # in order, so the number of saved queries is the index of the next point
    current_position = len(call_dictionary)
# start looping through search points
for point in sorted_points[current_position:]:
print 'querying point: ' + str(point)
# perform query
query = geoSearch(point[1], point[2], t)
# take the results of the query that contain place data
place_list = query['result']['places']
# print results of query
print(query)
        # we pickle the place data, in case we want to come back later
        # and retrieve other data from the API call
to_pickle = []
print('saving output...')
for place in place_list:
# save each place to the csv
csvhandle.writerow(list(point) + placeToRow(place))
            # save each place to the pickle list as a plain dict
to_pickle.append(dict(place))
# keeps dictionary of query place results
call_dictionary[point[0]] = to_pickle
# save returned points data to pickle.
with open(RETURNED_POINTS_DATA, 'wb') as picklefile:
pickle.dump(call_dictionary, picklefile)
# pause here to prevent exceeding query limit
print('waiting...')
time.sleep(float(TWITTER_TIME_BLOCK)/TWITTER_MAX_REQ +
random.random() * 15)
def placeToRow(place_dictionary):
    # produces a flat list (one table row) from a twitter api place dictionary
# ['street_address','full_name','name','twitter_id','place_type','url','long','lat']
keys = ["[u'attributes'][u'street_address']",
"['full_name']",
"['name']",
"['id']",
"['place_type']",
"['url']",
"['bounding_box']['coordinates'][0][0][0]", # we take one coordinate
"['bounding_box']['coordinates'][0][0][1]"] # pair of the bounding box
out_row = []
# we cycle through various dictionary lookups using eval()
# the api specifies that not all fields have to be returned, hence
# the try/except pattern
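    # for example, key = "['full_name']" makes the first eval below execute:
    #   out_row.append(place_dictionary['full_name'].encode('ascii'))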
for key in keys:
try:
eval("out_row.append(place_dictionary" + key + ".encode('ascii'))")
except:
try:
eval("out_row.append(str(place_dictionary" + key + ").encode('ascii'))")
except:
out_row.append(' ')
return out_row
def geoSearch(x, y, t):
    # one-liner to return the results of a geo search via the twitter API;
    # note x is longitude and y is latitude
return t.geo.search(lat=y, long=x, granularity='poi', max_results=20)
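# a sketch of the (partial) response shape this script relies on, inferred
# from the fields read in searchSaveLoop() and placeToRow(); the API does not
# guarantee every field:
# {'result': {'places': [{'id': ..., 'name': ..., 'full_name': ...,
#                         'place_type': 'poi', 'url': ...,
#                         'attributes': {'street_address': ...},
#                         'bounding_box': {'coordinates': [[[long, lat], ...]]}},
#                        ...]}}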
def loadDatatoSHPs(seed_points):
    # turn our result data into shapefile points, with some summary statistics
results_list = []
    # open csv of returned results
with open(RETURNED_POINTS_FILE, 'rU') as csvfile:
#['search_id','search_long','search_lat','street_address','full_name','name','twitter_id','place_type','url','long','lat']
csvhandle = csv.reader(csvfile, dialect='excel')
        csvhandle.next() # skip header row
for row in csvhandle:
results_list.append(row)
# find out how many points were queried
queried_points = [row[0] for row in results_list]
# only some results are actually points of interest, rather than city descriptions
results_list_poi = [row for row in results_list if row[7] == 'poi']
# determine the points of interest
pois = [row[6] for row in results_list_poi]
# print some statistics
print 'Results: ' + str(len(results_list))
print 'PoI Results: ' + str(len(results_list_poi))
print 'Queried Points: ' + str(len(set(queried_points)))
print 'Unique PoIs: ' + str(len(set(pois)))
sr = arcpy.SpatialReference(SPATIAL_REF_ID)
# create feature classes for storage
arcpy.CreateFeatureclass_management(os.getcwd(), RESULTS_NAME, geometry_type='POINT', spatial_reference=sr)
arcpy.CreateFeatureclass_management(os.getcwd(), SEEDS_NAME, geometry_type='POINT', spatial_reference=sr)
arcpy.CreateFeatureclass_management(os.getcwd(), LINES_NAME, geometry_type='POLYLINE', spatial_reference=sr)
# add fields
fields = ['search_id','str_addr','full_name','name','twitter_id']
    types = ['LONG','TEXT','TEXT','TEXT','TEXT'] # arcpy AddField uses TEXT, not STRING
for field, typ in zip(fields, types):
arcpy.AddField_management(RESULTS_NAME, field, typ)
fields2 = ['search_id','count','avgdist']
types2 = ['LONG','LONG','DOUBLE']
for field, typ in zip(fields2, types2):
arcpy.AddField_management(SEEDS_NAME, field, typ)
fields3 = ['search_id','twitter_id']
    types3 = ['LONG','TEXT']
for field, typ in zip(fields3, types3):
arcpy.AddField_management(LINES_NAME, field, typ)
# populate results
with arcpy.da.InsertCursor(RESULTS_NAME, fields + ['SHAPE@X', 'SHAPE@Y']) as cursor:
for row in results_list_poi:
            # cast search_id to int for the LONG field; csv rows are strings
            cursor.insertRow((int(row[0]), row[3], row[4], row[5], row[6], float(row[9]), float(row[10])))
# populate seed points/lines
total_distance = 0.0
line_cursor = arcpy.da.InsertCursor(LINES_NAME, fields3 + ['SHAPE@'])
with arcpy.da.InsertCursor(SEEDS_NAME, fields2 + ['SHAPE@X', 'SHAPE@Y']) as cursor:
for row in seed_points:
# calculate count/avg distance of query results
d = 0.0
matching_results = [r for r in results_list_poi if str(r[0])==str(row[0])]
for result in matching_results:
d = d + points.getDistance((row[1], row[2]), (float(result[9]), float(result[10])))
            try:
                avg = d/len(matching_results)
            except ZeroDivisionError:
                # no results came back for this seed point; flag with -1
                avg = -1.0
            # keep track of the total poi-to-query distance
total_distance = total_distance + d
# update seed point
cursor.insertRow((row[0], len(matching_results), avg, float(row[1]), float(row[2])))
# add lines connecting queries and seed points
for result in matching_results:
p1 = arcpy.Point(float(row[1]), float(row[2]))
p2 = arcpy.Point(float(result[9]), float(result[10]))
line_cursor.insertRow((row[0], result[6], arcpy.Polyline(arcpy.Array([p1, p2]), sr)))
del line_cursor
print 'Average Query to Result distance (deg): ' + str(total_distance/len(results_list_poi))
# MAIN FUNCTION
def main():
# check if we've already generated and sorted the "seed" points
if os.path.exists(POINT_SEQUENCE_DATA):
# if we have, load them
with open(POINT_SEQUENCE_DATA, 'rb') as picklefile:
sp = pickle.load(picklefile)
print 'loaded points...'
else:
        # otherwise, load them from the saved CSV or create them from scratch
if os.path.exists(SEED_POINT_FILE):
sp = getSeedPoints()
else:
sp = createSeedPoints(2000)
        # in case twitter cuts off our access, we want good coverage early:
        # points.getBestSequence() orders points to maximize the initial spread
print 'sorting points...'
sp = points.getBestSequence(sp)
        # we save the sorted sequence to a pickle file
with open(POINT_SEQUENCE_DATA, 'wb') as picklefile:
pickle.dump(sp, picklefile)
# execute search loop
searchSaveLoop(sp)
# get some statistics about the data and load it into a shapefile
loadDatatoSHPs(sp)
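# run the search when this script is executed directly
if __name__ == '__main__':
    main()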