#!/usr/bin/env python3
#-- pip3 install -U SPARQLWrapper
#-- pip3 install -U fiona
#-- pip3 install -U hanzidentifier
"""
Fetch Wikidata Labels
Wikidata limitations:
    population: many constraint violations; see
        https://www.wikidata.org/wiki/Property_talk:P1082#Querying_for_the_latest_value
    elevation: temporarily disabled because of a SPARQL performance problem.
"""
import argparse
import csv
import sys
import time
import hanzidentifier
#import requests
from SPARQLWrapper import SPARQLWrapper, JSON, SPARQLExceptions
import fiona
parser = argparse.ArgumentParser(description='Fetch wikidata labels for Natural-Earth ')
parser.add_argument('-input_shape_name',
default='../../10m_cultural/ne_10m_populated_places.shp',
help='input natural-earth shape file - with wikidataid columns')
parser.add_argument('-input_lettercase',
default='uppercase',
                    help='variables in the shape file - lowercase or uppercase')
parser.add_argument('-output_csv_name',
default='ne_10m_populated_places.csv',
help='output csv file with wikidata labels')
args = parser.parse_args()
def get_sparql_value(sresult, variable_id):
"""
Get SPARQL value from the sresult
"""
val = ''
if variable_id in sresult:
val = sresult[variable_id]['value']
return val
def get_sparql_label(sresult, variable_id):
"""
Get SPARQL label from the sresult
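    For example, a raw label such as 'Vienna (Austria)' or 'Wien, Österreich'
    comes back as 'Vienna' / 'Wien': the value is cut at the first '#', '('
    and ',' and then stripped.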
"""
val = ''
if variable_id in sresult:
val = sresult[variable_id]['value'].split('#')[0].split('(')[0].split(',')[0]
return val.strip()
def get_sparql_numvalue(sresult, variable_id):
"""
Get SPARQL numeric value from the sresult
"""
val = -1
if variable_id in sresult:
val = float(sresult[variable_id]['value'])
return val
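
# Note: get_sparql_numvalue is currently unused in this script; it appears to
# be kept for the temporarily disabled elevation fetch (see module docstring).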
def post_process_wd_zh(properties):
""" First check whether name_zh (Simplified) and name_zht(Traditional)
are set already, if not we use the name_zh-default to backfill them.
During the backfill, if there is no Simplified Chinese, Traditional
Chinese will be used to further backfill, and vice versa
It also deletes the intermediate property `zh-default`
"""
    if args.input_lettercase == "lowercase":
        name_en_default = properties.get('name_en', u'')
        zh_Hans_fallback = properties.get('name_zh_hans', u'')
        zh_Hant_fallback = properties.get('name_zh_hant', u'')
    else:
        name_en_default = properties.get('NAME_EN', u'')
        zh_Hans_fallback = properties.get('NAME_ZH_HANS', u'')
        zh_Hant_fallback = properties.get('NAME_ZH_HANT', u'')
# sometimes the default Chinese name has several values in a list
if 'name_zh_default' in properties:
names = properties['name_zh_default'].split('/')
for name in names:
if hanzidentifier.is_simplified(name) and \
len(zh_Hans_fallback) == 0:
zh_Hans_fallback = name
#print('found simplified name')
if hanzidentifier.is_traditional(name) and \
len(zh_Hant_fallback) == 0:
zh_Hant_fallback = name
#print('found traditional name')
# make sure we don't shove English values into Chinese namespace
if (zh_Hans_fallback == name_en_default) and len(name_en_default) > 0:
zh_Hans_fallback = u''
if (zh_Hant_fallback == name_en_default) and len(name_en_default) > 0:
zh_Hant_fallback = u''
# now make traditional and simplified Chinese name assignments
if 'name_zhs' not in properties:
if len(zh_Hans_fallback) != 0:
properties['name_zhs'] = zh_Hans_fallback
elif len(zh_Hant_fallback) != 0:
properties['name_zhs'] = zh_Hant_fallback
else:
properties['name_zhs'] = u''
if 'name_zht' not in properties:
if len(zh_Hant_fallback) != 0:
properties['name_zht'] = zh_Hant_fallback
elif len(zh_Hans_fallback) != 0:
properties['name_zht'] = zh_Hans_fallback
else:
properties['name_zht'] = u''
# only select one of the options if the field is separated by "/"
# for example if the field is "旧金山市县/三藩市市縣/舊金山市郡" only the first
# one 旧金山市县 will be preserved
    for key in ('name_zh', 'name_zht', 'NAME_ZH', 'NAME_ZHT'):
        if key in properties and len(properties[key]) > 0:
            properties[key] = properties[key].split('/')[0].strip()
return properties
def fetchwikidata(a_wid):
"""
Fetch wikidata with SPARQL
"""
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql",
                           agent='natural_earth_name_localizer v1.1.1 (github.com/nvkelso/natural-earth-vector)')
query_template = """
SELECT
?e ?i ?r ?population
?name_ar
?name_bn
?name_de
?name_el
?name_en
?name_es
?name_fa
?name_fr
?name_he
?name_hi
?name_hu
?name_id
?name_it
?name_ja
?name_ko
?name_nl
?name_pl
?name_pt
?name_ru
?name_sv
?name_tr
?name_uk
?name_ur
?name_vi
?name_zh
?name_zh_hans
?name_zh_hant
WHERE {
{
SELECT DISTINCT ?e ?i ?r
WHERE{
VALUES ?i { wd:Q2102493 wd:Q1781 }
OPTIONAL{ ?i owl:sameAs ?r. }
BIND(COALESCE(?r, ?i) AS ?e).
}
}
SERVICE wikibase:label {bd:serviceParam wikibase:language "en".}
OPTIONAL{?e wdt:P1082 ?population .}
OPTIONAL{?e rdfs:label ?name_ar FILTER((LANG(?name_ar))="ar").}
OPTIONAL{?e rdfs:label ?name_bn FILTER((LANG(?name_bn))="bn").}
OPTIONAL{?e rdfs:label ?name_de FILTER((LANG(?name_de))="de").}
OPTIONAL{?e rdfs:label ?name_el FILTER((LANG(?name_el))="el").}
OPTIONAL{?e rdfs:label ?name_en FILTER((LANG(?name_en))="en").}
OPTIONAL{?e rdfs:label ?name_es FILTER((LANG(?name_es))="es").}
OPTIONAL{?e rdfs:label ?name_fa FILTER((LANG(?name_fa))="fa").}
OPTIONAL{?e rdfs:label ?name_fr FILTER((LANG(?name_fr))="fr").}
OPTIONAL{?e rdfs:label ?name_he FILTER((LANG(?name_he))="he").}
OPTIONAL{?e rdfs:label ?name_hi FILTER((LANG(?name_hi))="hi").}
OPTIONAL{?e rdfs:label ?name_hu FILTER((LANG(?name_hu))="hu").}
OPTIONAL{?e rdfs:label ?name_id FILTER((LANG(?name_id))="id").}
OPTIONAL{?e rdfs:label ?name_it FILTER((LANG(?name_it))="it").}
OPTIONAL{?e rdfs:label ?name_ja FILTER((LANG(?name_ja))="ja").}
OPTIONAL{?e rdfs:label ?name_ko FILTER((LANG(?name_ko))="ko").}
OPTIONAL{?e rdfs:label ?name_nl FILTER((LANG(?name_nl))="nl").}
OPTIONAL{?e rdfs:label ?name_pl FILTER((LANG(?name_pl))="pl").}
OPTIONAL{?e rdfs:label ?name_pt FILTER((LANG(?name_pt))="pt").}
OPTIONAL{?e rdfs:label ?name_ru FILTER((LANG(?name_ru))="ru").}
OPTIONAL{?e rdfs:label ?name_sv FILTER((LANG(?name_sv))="sv").}
OPTIONAL{?e rdfs:label ?name_tr FILTER((LANG(?name_tr))="tr").}
OPTIONAL{?e rdfs:label ?name_uk FILTER((LANG(?name_uk))="uk").}
OPTIONAL{?e rdfs:label ?name_ur FILTER((LANG(?name_ur))="ur").}
OPTIONAL{?e rdfs:label ?name_vi FILTER((LANG(?name_vi))="vi").}
OPTIONAL{?e rdfs:label ?name_zh FILTER((LANG(?name_zh))="zh").}
OPTIONAL{?e rdfs:label ?name_zh_hans FILTER((LANG(?name_zh_hans))="zh-hans").}
OPTIONAL{?e rdfs:label ?name_zh_hant FILTER((LANG(?name_zh_hant))="zh-hant").}
}
"""
wikidata_sparql_ids = ""
for wid in a_wid:
wikidata_sparql_ids += " wd:"+wid
print("fetch: ", wikidata_sparql_ids.split()[1], "... ", wikidata_sparql_ids.split()[-1])
ne_query = query_template.replace('wd:Q2102493 wd:Q1781', wikidata_sparql_ids)
# compress the Query - removing the extra spaces
while ' ' in ne_query:
ne_query = ne_query.replace(' ', ' ')
results = None
retries = 0
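    # Retry policy: up to 8 attempts. Endpoint errors back off for 30 s,
    # timeouts retry after 1 s, other errors back off incrementally, and a
    # malformed query aborts immediately.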
while results is None and retries < 8:
try:
results = None
sparql.setQuery(ne_query)
sparql.setTimeout(1000)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
except SPARQLExceptions.EndPointNotFound:
print('ERRwikidata-SPARQLExceptions-EndPointNotFound: Retrying in 30 seconds.')
time.sleep(30)
retries += 1
continue
except SPARQLExceptions.EndPointInternalError as e:
print("ERRwikidata-SPARQLExceptions-EndPointInternalError: Retrying in 30 seconds.",e)
time.sleep(30)
retries += 1
continue
except SPARQLExceptions.QueryBadFormed as e:
print("ERRwikidata-SPARQLExceptions-QueryBadFormed : Check! ",e)
return "error"
        except TimeoutError as e:
            print("ERRwikidata-TimeoutError: Retrying in 1 second.", e)
time.sleep(1)
retries += 1
continue
except KeyboardInterrupt:
# quit
sys.exit()
        except Exception:
            wait = (retries + 1) * 5
            print("ERRwikidata: other error. Retrying in " + str(wait) + " seconds.")
            print('error: %s ' % sys.exc_info()[0])
            time.sleep(wait)
            retries += 1
            continue
if results is None and retries >= 8:
print("Wikidata request failed ; system stopped! ")
sys.exit(1)
return results
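
# A minimal usage sketch (Q64 and Q84 are illustrative Wikidata ids):
#   results = fetchwikidata(['Q64', 'Q84'])
#   rows = results['results']['bindings']
# Each row is a dict of SPARQL variables, read below via get_sparql_value()
# and get_sparql_label().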
print('- Start fetching Natural-Earth wikidata labels via SPARQL query - ')
with open(args.output_csv_name, "w", encoding='utf-8') as f:
writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
writer.writerow((
"wd_id",
"wd_id_new",
"population",
#"elevation",
"name_ar",
"name_bn",
"name_de",
"name_el",
"name_en",
"name_es",
"name_fa",
"name_fr",
"name_he",
"name_hi",
"name_hu",
"name_id",
"name_it",
"name_ja",
"name_ko",
"name_nl",
"name_pl",
"name_pt",
"name_ru",
"name_sv",
"name_tr",
"name_uk",
"name_ur",
"name_vi",
"name_zh",
"name_zht"
))
with fiona.open(args.input_shape_name, 'r') as shape_input:
i = 0
REC_IN_SHAPE = len(shape_input)
wikidata_chunk = list()
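        # Collect wikidataids and query them in batches of up to 200 per
        # SPARQL request; the final partial batch is flushed when the last
        # shape record is reached (i >= REC_IN_SHAPE).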
for pt in shape_input:
i = i+1
if args.input_lettercase == "lowercase":
ne_wikidataid = pt['properties']['wikidataid']
else:
ne_wikidataid = pt['properties']['WIKIDATAID']
ne_fid = pt['id']
if ne_wikidataid:
if ne_wikidataid[0] == 'Q':
wikidata_chunk.append(ne_wikidataid)
else:
print("ERROR: Bad formatted wikidataid , skip", ne_wikidataid)
if (len(wikidata_chunk) >= 200) or (i >= REC_IN_SHAPE):
sparql_results = fetchwikidata(wikidata_chunk)
wikidata_chunk = []
for result in sparql_results["results"]["bindings"]:
#print(result)
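                # entity URIs look like http://www.wikidata.org/entity/Q64,
                # so the QID is the fifth '/'-separated segment (index 4)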
wd_id_label = get_sparql_value(result, 'e').split('/')[4]
wd_id = get_sparql_value(result, 'i').split('/')[4]
wd_id_new = get_sparql_value(result, 'r')
if wd_id_new:
wd_id_new = wd_id_new.split('/')[4]
print('Redirected:', wd_id, wd_id_new)
population = get_sparql_value(result, 'population')
#elevation =get_sparql_value(result, 'elevation')
name_ar = get_sparql_label(result, 'name_ar')
name_bn = get_sparql_label(result, 'name_bn')
name_de = get_sparql_label(result, 'name_de')
name_el = get_sparql_label(result, 'name_el')
name_en = get_sparql_label(result, 'name_en')
name_es = get_sparql_label(result, 'name_es')
name_fa = get_sparql_label(result, 'name_fa')
name_fr = get_sparql_label(result, 'name_fr')
name_he = get_sparql_label(result, 'name_he')
name_hi = get_sparql_label(result, 'name_hi')
name_hu = get_sparql_label(result, 'name_hu')
name_id = get_sparql_label(result, 'name_id')
name_it = get_sparql_label(result, 'name_it')
name_ja = get_sparql_label(result, 'name_ja')
name_ko = get_sparql_label(result, 'name_ko')
name_nl = get_sparql_label(result, 'name_nl')
name_pl = get_sparql_label(result, 'name_pl')
name_pt = get_sparql_label(result, 'name_pt')
name_ru = get_sparql_label(result, 'name_ru')
name_sv = get_sparql_label(result, 'name_sv')
name_tr = get_sparql_label(result, 'name_tr')
name_uk = get_sparql_label(result, 'name_uk')
name_ur = get_sparql_label(result, 'name_ur')
name_vi = get_sparql_label(result, 'name_vi')
                # not all Wikidata places carry every name (label) translation;
                # name_en is already read above, so only the Chinese variants
                # need guarding here
                try:
                    name_zh_default = get_sparql_label(result, 'name_zh')
                except Exception:
                    name_zh_default = u''
                try:
                    name_zh_hans = get_sparql_label(result, 'name_zh_hans')
                except Exception:
                    name_zh_hans = u''
                try:
                    name_zh_hant = get_sparql_label(result, 'name_zh_hant')
                except Exception:
                    name_zh_hant = u''
                chinese_names = {'name_en': name_en,
                                 'name_zh_default': name_zh_default,
                                 'name_zh_hans': name_zh_hans,
                                 'name_zh_hant': name_zh_hant}
                processed_chinese_names = post_process_wd_zh(chinese_names)
                try:
                    name_zh = processed_chinese_names['name_zhs']
                except Exception:
                    name_zh = u''
                try:
                    name_zht = processed_chinese_names['name_zht']
                except Exception:
                    name_zht = u''
writer.writerow((
wd_id,
wd_id_new,
population,
#elevation,
name_ar,
name_bn,
name_de,
name_el,
name_en,
name_es,
name_fa,
name_fr,
name_he,
name_hi,
name_hu,
name_id,
name_it,
name_ja,
name_ko,
name_nl,
name_pl,
name_pt,
name_ru,
name_sv,
name_tr,
name_uk,
name_ur,
name_vi,
name_zh,
name_zht
))
print(' - JOB end -')