forked from tanyav2/red-alert
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
81 lines (78 loc) · 3.38 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
# Scrape the parlor name, phone number and address of every entry on the
# rubmaps.ch advanced-search listing (pages 1..1681) and write the result to
# massage_parlors.csv.

# Location of the chromedriver binary used to drive headless Chrome.
CHROMEDRIVER_PATH = "/usr/local/bin/chromedriver"
# Listing URL; the page number is appended to it.
SEARCH_URL = "https://www.rubmaps.ch/?i_agree_notice&to=%2Fadvanced-search-2694152-"
FIRST_PAGE = 1
LAST_PAGE = 1681
# Number of leading characters dropped from the node that follows a <br>.
# NOTE(review): presumably a fixed-width prefix (indentation) in the page
# source -- confirm against the live markup.
ADDRESS_TEXT_OFFSET = 16


def _address_windows(page):
    """Return the <br> windows that carry address text on listing page *page*.

    Each window is a tuple ``(low, high, modulus, first_remainder)``. A <br>
    whose 1-based position ``p`` on the page satisfies ``low < p < high``:

    * starts a new address when ``p % modulus == first_remainder``;
    * continues the current address when
      ``p % modulus == (first_remainder + 1) % modulus``.

    The numbers are hand-tuned per page range; pages 784, 928, 1556 and 1681
    need their own windows, and three of them switch pattern part-way through
    the page.
    """
    if page == 784:
        return [(27, 66, 4, 0), (66, 81, 3, 1)]
    if page == 928:
        return [(27, 60, 3, 1), (60, 76, 4, 2)]
    if page == 1556:
        return [(27, 46, 4, 0), (46, 76, 3, 2)]
    if page == 1681:
        return [(27, 48, 3, 1)]
    if 784 < page < 928 or page > 1556:
        return [(27, 72, 3, 1)]
    # Remaining pages: below 784, or strictly between 928 and 1556.
    return [(27, 86, 4, 0)]


def _collect_addresses(soup, page, addresses):
    """Append the addresses found on one parsed listing page to *addresses*.

    An address is assembled from the nodes following two consecutive <br>
    tags, joined with "; ". Which <br> tags count is decided by
    ``_address_windows(page)``.

    Raises whatever slicing the sibling raises when a selected <br> is not
    followed by a text node; this is deliberately not swallowed, because a
    skipped entry would silently misalign addresses with names and phones.
    """
    windows = _address_windows(page)
    for position, br in enumerate(soup.find_all('br'), start=1):
        for low, high, modulus, first_remainder in windows:
            if not low < position < high:
                continue
            remainder = position % modulus
            if remainder == first_remainder:
                addresses.append(br.next_sibling[ADDRESS_TEXT_OFFSET:].strip())
            elif remainder == (first_remainder + 1) % modulus:
                second_part = br.next_sibling[ADDRESS_TEXT_OFFSET:].strip()
                addresses[-1] = addresses[-1] + "; " + second_part
            # Windows of one page never overlap, so no other can match.
            break


def _collect_names_and_phones(soup, names, phones):
    """Append the name and phone number of every 'main-row' div on the page."""
    for row in soup.find_all('div', attrs={'class': 'main-row'}):
        name = row.find('a', attrs={'class': 'th-a'})
        phone = row.find('i', attrs={'class': 'phonenumberrow'})
        names.append(name.text.strip())
        phones.append(phone.text.strip())


options = webdriver.ChromeOptions()
options.add_argument("--headless")
# NOTE(review): passing the driver path positionally is only accepted by older
# Selenium releases; newer ones expect a Service object -- confirm the pinned
# Selenium version before changing this call.
driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
names = []
phones = []
addresses = []
try:
    for i in range(FIRST_PAGE, LAST_PAGE + 1):
        driver.get(SEARCH_URL + str(i))
        soup = BeautifulSoup(driver.page_source, features="html.parser")
        _collect_addresses(soup, i, addresses)
        _collect_names_and_phones(soup, names, phones)
        print("scraping page " + str(i) + "...")
finally:
    # Always shut the headless browser down, even when a page fails to load
    # or parse; otherwise the Chrome/chromedriver processes are left running.
    driver.quit()

# pandas would reject unequal columns anyway; failing here with the three
# counts makes it obvious which column drifted.
if not len(names) == len(phones) == len(addresses):
    raise ValueError(
        "scraped columns differ in length: names=" + str(len(names))
        + ", phones=" + str(len(phones))
        + ", addresses=" + str(len(addresses))
    )
df = pd.DataFrame({'Parlor Name': names, 'Phone Number': phones, 'Address': addresses})
df.to_csv('massage_parlors.csv', index=False, encoding='utf-8')