-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWikimediaUtilities.py
27 lines (22 loc) · 1.23 KB
/
WikimediaUtilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from urllib.request import urlopen, quote
import Utilities
# Marker that precedes the file name inside a media page URL; directUrlOfFile
# takes everything after it as the file name.
FILENAME_CUE: str = 'File:'

# Base of direct-file URLs; directUrlOfFile prepends it to the path it extracts.
IMAGE_LOCATION_URL_START: str = 'https://upload.wikimedia.org/wikipedia/commons/'

# Prefix a line of the page HTML must start with to be treated as the link to
# the full-resolution file (it ends with the same base URL as above).
IMAGE_LOCATION_CUE: str = '<div class="fullMedia"><a href="https://upload.wikimedia.org/wikipedia/commons/'
def directUrlOfFile(mediaPageURL):
    """Find the direct file URL behind a Wikimedia Commons media page.

    The file name is taken as everything after ``FILENAME_CUE`` in
    ``mediaPageURL``.  The page is then downloaded and scanned line by line
    for one that starts with ``IMAGE_LOCATION_CUE`` and mentions that file
    name; the path found in that line's ``href`` is appended to
    ``IMAGE_LOCATION_URL_START``.

    Args:
        mediaPageURL: URL of the media description page; it must contain
            ``FILENAME_CUE`` followed by the file name.

    Returns:
        ``(True, url)`` when a matching link is found, otherwise
        ``(False, None)``.  A URL without ``FILENAME_CUE`` yields
        ``(False, None)`` without any network access.

    Raises:
        Whatever ``urlopen`` raises when the page cannot be fetched.
    """
    # Local import: the module header only brings in ``urlopen`` and ``quote``.
    from urllib.parse import unquote

    cuePosition = mediaPageURL.find(FILENAME_CUE)
    if cuePosition == -1:
        # No file name can be derived, so there is nothing to match against.
        return False, None

    filename = mediaPageURL[cuePosition + len(FILENAME_CUE):]
    # Decode first so a URL that is already percent-encoded is not
    # double-encoded ("%20" -> "%2520"), which could never match the page.
    filename_percent_encoded = quote(unquote(filename))
    print(filename, filename_percent_encoded)

    # Spaces and underscores are used inconsistently between the page URL and
    # the page markup, so both sides are compared with them removed.
    wanted = filename_percent_encoded.replace('_', '').replace(' ', '')
    pathStart = len(IMAGE_LOCATION_CUE)

    # The context manager closes the HTTP response on every exit path.
    with urlopen(mediaPageURL) as response:
        for rawLine in response:
            # errors='replace' keeps one bad byte from aborting the whole scan.
            line = rawLine.decode('utf-8', errors='replace')
            # Protocol-relative links are rewritten so the cue can match them.
            line = line.replace('href="//', 'href="https://')
            if not line.startswith(IMAGE_LOCATION_CUE):
                continue
            if wanted not in line.replace('_', '').replace(' ', ''):
                continue
            pathEnd = line.find('"', pathStart)
            if pathEnd == -1:
                # Unterminated href: no reliable path to extract from this line.
                continue
            return True, IMAGE_LOCATION_URL_START + line[pathStart:pathEnd]
    return False, None