-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwiki_list_url_scraper.py
41 lines (38 loc) · 1.49 KB
/
wiki_list_url_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import http.client
import os
import string
import urllib.request
# Scrape the per-letter "List of biblical names starting with X" pages and
# append the names found on them to a local word-list file.

# --- Configuration ---------------------------------------------------------

home_path = os.path.expanduser('~/')
# File the collected names are appended to (opened in append mode, so
# repeated runs add to it rather than overwrite it).
output_file = home_path + 'word_lists/new_list.txt'

# Browser-like headers sent with every request.
# NOTE(review): presumably needed because the default urllib User-Agent is
# rejected by the server -- confirm before removing.
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

# The uppercase letter A-Z is appended to this prefix to form each page URL.
url_base = 'https://en.wikipedia.org/wiki/List_of_biblical_names_starting_with_'


def fetch_page(url, timeout=30):
    """Download *url* using the ``hdr`` headers and return its body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait on the socket before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        The response body decoded to ``str``.

    Raises:
        OSError: On network/HTTP failures (``urllib.error.URLError`` and
            timeouts are ``OSError`` subclasses).
        http.client.HTTPException: On a malformed or truncated response.
    """
    request = urllib.request.Request(url, headers=hdr)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read()
    # Decode the bytes properly instead of parsing str(bytes); the page is
    # assumed to be UTF-8 and undecodable bytes are replaced, not fatal.
    return body.decode('utf-8', errors='replace')


def extract_names(html, letter):
    """Return the names found in the ``<li>`` lines of *html*.

    A line is considered only if, after removing tab characters and
    ``<strong>`` tags, it starts with ``<li>``. The line is split on ``>``
    and every fragment except the trailing one is inspected; a fragment
    that starts with *letter* is a candidate name once ``</a`` has been
    removed from it. At most one name is recorded per line: the last
    candidate on that line.

    Args:
        html: Page markup as text.
        letter: The initial a fragment must start with to count as a name.

    Returns:
        A list of names in document order. Lines without a candidate
        contribute nothing (no stale value is carried over from earlier
        lines).
    """
    names = []
    for raw_line in html.split('\n'):
        line = raw_line.replace('\t', '').replace('<strong>', '')
        if not line.startswith('<li>'):
            continue
        # Drop the trailing fragment (text after the final '>').
        fragments = line.split('>')[:-1]
        # startswith() is safe on empty fragments, unlike indexing [0].
        candidates = [
            fragment.replace('</a', '')
            for fragment in fragments
            if fragment.startswith(letter)
        ]
        if candidates:
            names.append(candidates[-1])
    return names


def save_names(names, path):
    """Append *names* to the file at *path*, one per line.

    The parent directory is created if it does not exist. The file is
    opened (and therefore created) even when *names* is empty.
    """
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, 'a', encoding='utf-8') as out:
        out.writelines(name + '\n' for name in names)


def main():
    """Scrape the pages for A-Z and append all names to ``output_file``.

    A page that cannot be downloaded is reported and skipped so the
    remaining letters are still processed.
    """
    collected = []
    for c in string.ascii_uppercase:
        print(c)
        url = url_base + c
        try:
            page = fetch_page(url)
        except (OSError, http.client.HTTPException) as err:
            # Best effort: report the failing page and move on.
            print('error with ' + url + ': ' + str(err))
            continue
        names = extract_names(page, c)
        for name in names:
            print(name)
        collected.extend(names)
    save_names(collected, output_file)


if __name__ == '__main__':
    main()