#!/usr/bin/env python3
#-- pip3 install -U SPARQLWrapper
#-- pip3 install -U fiona
#-- pip3 install -U hanzidentifier
"""
Fetch Wikidata Labels
Wikidata limitations:
    population: many constraint violations; see
        https://www.wikidata.org/wiki/Property_talk:P1082#Querying_for_the_latest_value
    elevation: temporarily disabled because of a SPARQL performance problem.
"""
import argparse
import csv
import sys
import time
import hanzidentifier
#import requests
from SPARQLWrapper import SPARQLWrapper, JSON, SPARQLExceptions
import fiona
parser = argparse.ArgumentParser(description='Fetch wikidata labels for Natural-Earth ')
parser.add_argument('-input_shape_name',
default='../../10m_cultural/ne_10m_populated_places.shp',
help='input natural-earth shape file - with wikidataid columns')
parser.add_argument('-input_lettercase',
default='uppercase',
                    help='variables in the shape file - lowercase or uppercase')
parser.add_argument('-output_csv_name',
default='ne_10m_populated_places.csv',
help='output csv file with wikidata labels')
args = parser.parse_args()
def get_sparql_value(sresult, variable_id):
"""
Get SPARQL value from the sresult
"""
val = ''
if variable_id in sresult:
val = sresult[variable_id]['value']
return val
def get_sparql_label(sresult, variable_id):
"""
Get SPARQL label from the sresult
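    For example, a raw label such as 'Vienna (Austria)' or 'Wien, Österreich'
    comes back as 'Vienna' / 'Wien': the value is cut at the first '#', '('
    and ',' and then stripped.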
"""
val = ''
if variable_id in sresult:
val = sresult[variable_id]['value'].split('#')[0].split('(')[0].split(',')[0]
return val.strip()
def get_sparql_numvalue(sresult, variable_id):
"""
Get SPARQL numeric value from the sresult
"""
val = -1
if variable_id in sresult:
val = float(sresult[variable_id]['value'])
return val
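
# Note: get_sparql_numvalue is currently unused in this script; it appears to
# be kept for the temporarily disabled elevation fetch (see module docstring).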
def post_process_wd_zh(properties):
""" First check whether name_zh (Simplified) and name_zht(Traditional)
are set already, if not we use the name_zh-default to backfill them.
During the backfill, if there is no Simplified Chinese, Traditional
Chinese will be used to further backfill, and vice versa
It also deletes the intermediate property `zh-default`
"""
    if args.input_lettercase == "lowercase":
        name_en_default = properties.get('name_en', u'')
        zh_Hans_fallback = properties.get('name_zh_hans', u'')
        zh_Hant_fallback = properties.get('name_zh_hant', u'')
    else:
        name_en_default = properties.get('NAME_EN', u'')
        zh_Hans_fallback = properties.get('NAME_ZH_HANS', u'')
        zh_Hant_fallback = properties.get('NAME_ZH_HANT', u'')
# sometimes the default Chinese name has several values in a list
if 'name_zh_default' in properties:
names = properties['name_zh_default'].split('/')
for name in names:
if hanzidentifier.is_simplified(name) and \
len(zh_Hans_fallback) == 0:
zh_Hans_fallback = name
#print('found simplified name')
if hanzidentifier.is_traditional(name) and \
len(zh_Hant_fallback) == 0:
zh_Hant_fallback = name
#print('found traditional name')
# make sure we don't shove English values into Chinese namespace
if (zh_Hans_fallback == name_en_default) and len(name_en_default) > 0:
zh_Hans_fallback = u''
if (zh_Hant_fallback == name_en_default) and len(name_en_default) > 0:
zh_Hant_fallback = u''
# now make traditional and simplified Chinese name assignments
if 'name_zhs' not in properties:
if len(zh_Hans_fallback) != 0:
properties['name_zhs'] = zh_Hans_fallback
elif len(zh_Hant_fallback) != 0:
properties['name_zhs'] = zh_Hant_fallback
else:
properties['name_zhs'] = u''
if 'name_zht' not in properties:
if len(zh_Hant_fallback) != 0:
properties['name_zht'] = zh_Hant_fallback
elif len(zh_Hans_fallback) != 0:
properties['name_zht'] = zh_Hans_fallback
else:
properties['name_zht'] = u''
# only select one of the options if the field is separated by "/"
# for example if the field is "旧金山市县/三藩市市縣/舊金山市郡" only the first
# one 旧金山市县 will be preserved
    for key in ('name_zh', 'name_zht', 'NAME_ZH', 'NAME_ZHT'):
        if key in properties and len(properties[key]) > 0:
            properties[key] = properties[key].split('/')[0].strip()
return properties
def fetchwikidata(a_wid):
"""
Fetch wikidata with SPARQL
"""
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql",
                           agent='natural_earth_name_localizer v1.1.1 (github.com/nvkelso/natural-earth-vector)')
query_template = """
SELECT
?e ?i ?r ?population
?name_ar
?name_bn
?name_de
?name_el
?name_en
?name_es
?name_fa
?name_fr
?name_he
?name_hi
?name_hu
?name_id
?name_it
?name_ja
?name_ko
?name_nl
?name_pl
?name_pt
?name_ru
?name_sv
?name_tr
?name_uk
?name_ur
?name_vi
?name_zh
?name_zh_hans
?name_zh_hant
WHERE {
{
SELECT DISTINCT ?e ?i ?r
WHERE{
VALUES ?i { wd:Q2102493 wd:Q1781 }
OPTIONAL{ ?i owl:sameAs ?r. }
BIND(COALESCE(?r, ?i) AS ?e).
}
}
SERVICE wikibase:label {bd:serviceParam wikibase:language "en".}
OPTIONAL{?e wdt:P1082 ?population .}
OPTIONAL{?e rdfs:label ?name_ar FILTER((LANG(?name_ar))="ar").}
OPTIONAL{?e rdfs:label ?name_bn FILTER((LANG(?name_bn))="bn").}
OPTIONAL{?e rdfs:label ?name_de FILTER((LANG(?name_de))="de").}
OPTIONAL{?e rdfs:label ?name_el FILTER((LANG(?name_el))="el").}
OPTIONAL{?e rdfs:label ?name_en FILTER((LANG(?name_en))="en").}
OPTIONAL{?e rdfs:label ?name_es FILTER((LANG(?name_es))="es").}
OPTIONAL{?e rdfs:label ?name_fa FILTER((LANG(?name_fa))="fa").}
OPTIONAL{?e rdfs:label ?name_fr FILTER((LANG(?name_fr))="fr").}
OPTIONAL{?e rdfs:label ?name_he FILTER((LANG(?name_he))="he").}
OPTIONAL{?e rdfs:label ?name_hi FILTER((LANG(?name_hi))="hi").}
OPTIONAL{?e rdfs:label ?name_hu FILTER((LANG(?name_hu))="hu").}
OPTIONAL{?e rdfs:label ?name_id FILTER((LANG(?name_id))="id").}
OPTIONAL{?e rdfs:label ?name_it FILTER((LANG(?name_it))="it").}
OPTIONAL{?e rdfs:label ?name_ja FILTER((LANG(?name_ja))="ja").}
OPTIONAL{?e rdfs:label ?name_ko FILTER((LANG(?name_ko))="ko").}
OPTIONAL{?e rdfs:label ?name_nl FILTER((LANG(?name_nl))="nl").}
OPTIONAL{?e rdfs:label ?name_pl FILTER((LANG(?name_pl))="pl").}
OPTIONAL{?e rdfs:label ?name_pt FILTER((LANG(?name_pt))="pt").}
OPTIONAL{?e rdfs:label ?name_ru FILTER((LANG(?name_ru))="ru").}
OPTIONAL{?e rdfs:label ?name_sv FILTER((LANG(?name_sv))="sv").}
OPTIONAL{?e rdfs:label ?name_tr FILTER((LANG(?name_tr))="tr").}
OPTIONAL{?e rdfs:label ?name_uk FILTER((LANG(?name_uk))="uk").}
OPTIONAL{?e rdfs:label ?name_ur FILTER((LANG(?name_ur))="ur").}
OPTIONAL{?e rdfs:label ?name_vi FILTER((LANG(?name_vi))="vi").}
OPTIONAL{?e rdfs:label ?name_zh FILTER((LANG(?name_zh))="zh").}
OPTIONAL{?e rdfs:label ?name_zh_hans FILTER((LANG(?name_zh_hans))="zh-hans").}
OPTIONAL{?e rdfs:label ?name_zh_hant FILTER((LANG(?name_zh_hant))="zh-hant").}
}
"""
wikidata_sparql_ids = ""
for wid in a_wid:
wikidata_sparql_ids += " wd:"+wid
print("fetch: ", wikidata_sparql_ids.split()[1], "... ", wikidata_sparql_ids.split()[-1])
ne_query = query_template.replace('wd:Q2102493 wd:Q1781', wikidata_sparql_ids)
# compress the Query - removing the extra spaces
while ' ' in ne_query:
ne_query = ne_query.replace(' ', ' ')
results = None
retries = 0
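    # Retry policy: up to 8 attempts. Endpoint errors back off for 30 s,
    # timeouts retry after 1 s, other errors back off incrementally, and a
    # malformed query aborts immediately.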
while results is None and retries < 8:
try:
results = None
sparql.setQuery(ne_query)
sparql.setTimeout(1000)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
except SPARQLExceptions.EndPointNotFound:
print('ERRwikidata-SPARQLExceptions-EndPointNotFound: Retrying in 30 seconds.')
time.sleep(30)
retries += 1
continue
except SPARQLExceptions.EndPointInternalError as e:
print("ERRwikidata-SPARQLExceptions-EndPointInternalError: Retrying in 30 seconds.",e)
time.sleep(30)
retries += 1
continue
except SPARQLExceptions.QueryBadFormed as e:
print("ERRwikidata-SPARQLExceptions-QueryBadFormed : Check! ",e)
return "error"
        except TimeoutError as e:
            print("ERRwikidata-TimeoutError: Retrying in 1 second.", e)
time.sleep(1)
retries += 1
continue
except KeyboardInterrupt:
# quit
sys.exit()
        except Exception:
            wait = (retries + 1) * 5
            print("ERRwikidata: other error. Retrying in " + str(wait) + " seconds.")
            print('error: %s ' % sys.exc_info()[0])
            time.sleep(wait)
            retries += 1
            continue
if results is None and retries >= 8:
print("Wikidata request failed ; system stopped! ")
sys.exit(1)
return results
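
# A minimal usage sketch (Q64 and Q84 are illustrative Wikidata ids):
#   results = fetchwikidata(['Q64', 'Q84'])
#   rows = results['results']['bindings']
# Each row is a dict of SPARQL variables, read below via get_sparql_value()
# and get_sparql_label().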
print('- Start fetching Natural-Earth wikidata labels via SPARQL query - ')
with open(args.output_csv_name, "w", encoding='utf-8') as f:
writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
writer.writerow((
"wd_id",
"wd_id_new",
"population",
#"elevation",
"name_ar",
"name_bn",
"name_de",
"name_el",
"name_en",
"name_es",
"name_fa",
"name_fr",
"name_he",
"name_hi",
"name_hu",
"name_id",
"name_it",
"name_ja",
"name_ko",
"name_nl",
"name_pl",
"name_pt",
"name_ru",
"name_sv",
"name_tr",
"name_uk",
"name_ur",
"name_vi",
"name_zh",
"name_zht"
))
with fiona.open(args.input_shape_name, 'r') as shape_input:
i = 0
REC_IN_SHAPE = len(shape_input)
wikidata_chunk = list()
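        # Collect wikidataids and query them in batches of up to 200 per
        # SPARQL request; the final partial batch is flushed when the last
        # shape record is reached (i >= REC_IN_SHAPE).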
for pt in shape_input:
i = i+1
if args.input_lettercase == "lowercase":
ne_wikidataid = pt['properties']['wikidataid']
else:
ne_wikidataid = pt['properties']['WIKIDATAID']
ne_fid = pt['id']
if ne_wikidataid:
if ne_wikidataid[0] == 'Q':
wikidata_chunk.append(ne_wikidataid)
else:
print("ERROR: Bad formatted wikidataid , skip", ne_wikidataid)
if (len(wikidata_chunk) >= 200) or (i >= REC_IN_SHAPE):
sparql_results = fetchwikidata(wikidata_chunk)
wikidata_chunk = []
for result in sparql_results["results"]["bindings"]:
#print(result)
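                # entity URIs look like http://www.wikidata.org/entity/Q64,
                # so the QID is the fifth '/'-separated segment (index 4)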
wd_id_label = get_sparql_value(result, 'e').split('/')[4]
wd_id = get_sparql_value(result, 'i').split('/')[4]
wd_id_new = get_sparql_value(result, 'r')
if wd_id_new:
wd_id_new = wd_id_new.split('/')[4]
print('Redirected:', wd_id, wd_id_new)
population = get_sparql_value(result, 'population')
#elevation =get_sparql_value(result, 'elevation')
name_ar = get_sparql_label(result, 'name_ar')
name_bn = get_sparql_label(result, 'name_bn')
name_de = get_sparql_label(result, 'name_de')
name_el = get_sparql_label(result, 'name_el')
name_en = get_sparql_label(result, 'name_en')
name_es = get_sparql_label(result, 'name_es')
name_fa = get_sparql_label(result, 'name_fa')
name_fr = get_sparql_label(result, 'name_fr')
name_he = get_sparql_label(result, 'name_he')
name_hi = get_sparql_label(result, 'name_hi')
name_hu = get_sparql_label(result, 'name_hu')
name_id = get_sparql_label(result, 'name_id')
name_it = get_sparql_label(result, 'name_it')
name_ja = get_sparql_label(result, 'name_ja')
name_ko = get_sparql_label(result, 'name_ko')
name_nl = get_sparql_label(result, 'name_nl')
name_pl = get_sparql_label(result, 'name_pl')
name_pt = get_sparql_label(result, 'name_pt')
name_ru = get_sparql_label(result, 'name_ru')
name_sv = get_sparql_label(result, 'name_sv')
name_tr = get_sparql_label(result, 'name_tr')
name_uk = get_sparql_label(result, 'name_uk')
name_ur = get_sparql_label(result, 'name_ur')
name_vi = get_sparql_label(result, 'name_vi')
                # not all Wikidata places carry every name (label) translation;
                # name_en is already read above, so only the Chinese variants
                # need guarding here
                try:
                    name_zh_default = get_sparql_label(result, 'name_zh')
                except Exception:
                    name_zh_default = u''
                try:
                    name_zh_hans = get_sparql_label(result, 'name_zh_hans')
                except Exception:
                    name_zh_hans = u''
                try:
                    name_zh_hant = get_sparql_label(result, 'name_zh_hant')
                except Exception:
                    name_zh_hant = u''
                chinese_names = {'name_en': name_en,
                                 'name_zh_default': name_zh_default,
                                 'name_zh_hans': name_zh_hans,
                                 'name_zh_hant': name_zh_hant}
                processed_chinese_names = post_process_wd_zh(chinese_names)
                try:
                    name_zh = processed_chinese_names['name_zhs']
                except Exception:
                    name_zh = u''
                try:
                    name_zht = processed_chinese_names['name_zht']
                except Exception:
                    name_zht = u''
writer.writerow((
wd_id,
wd_id_new,
population,
#elevation,
name_ar,
name_bn,
name_de,
name_el,
name_en,
name_es,
name_fa,
name_fr,
name_he,
name_hi,
name_hu,
name_id,
name_it,
name_ja,
name_ko,
name_nl,
name_pl,
name_pt,
name_ru,
name_sv,
name_tr,
name_uk,
name_ur,
name_vi,
name_zh,
name_zht
))
print(' - JOB end -')