-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
91 lines (79 loc) · 3.14 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import csv
import requests
import datetime
import time
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
def write_to_file(data, method, delimiter, output_file='data.csv'):
    """Write one row of parsed data to a CSV file.

    Args:
        data: Sequence of cell values that make up a single row.
        method: File mode handed to ``open`` ('w' truncates, 'a' appends).
        delimiter: One-character field separator used by the CSV writer.
        output_file: Path of the file to write. Defaults to 'data.csv',
            the path that used to be hard-coded, so existing three-argument
            calls behave exactly as before.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(output_file, method, newline='', encoding='utf-8') as file:
        csv.writer(file, delimiter=delimiter).writerow(data)
def get_page(date, timeout=30):
    """Request the results page for a single day and pass it to the parser.

    Two preliminary requests are made on a shared session before the query
    is posted (presumably so the server sets the cookies the query needs --
    TODO confirm against the site). The response to the final POST is handed
    to ``get_data_from_table``.

    Args:
        date: Day to query, as a string; it is sent unchanged as both
            'FEC_Desde' and 'FEC_Hasta'.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout a stalled server would block the calling
            worker indefinitely.

    Returns:
        None in every case. On a request failure "Connection error" is
        printed and the page is skipped.
    """
    url = 'https://laboral.pjud.cl'
    data = {"FLG_Autoconsulta": 1}
    form_data = {'TIP_Consulta': 5,
                 'TIP_Lengueta': 'tdDos',
                 'SeleccionL': 0,
                 'ERA_Causa': 0,
                 'RUC_Tribunal': 4,
                 'FEC_Desde': date,
                 'FEC_Hasta': date,
                 'SEL_Trabajadores': 0,
                 'irAccionAtPublico': 'Consulta',
                 'COD_Tribunal': 0}
    try:
        # The context manager closes the session's pooled connections.
        with requests.Session() as session:
            session.post(url + '/SITLAPORWEB/InicioAplicacionPortal.do',
                         data=data, timeout=timeout)
            session.get(url + '/SITLAPORWEB/AtPublicoViewAccion.do?tipoMenuATP=1',
                        timeout=timeout)
            page = session.post(url + '/SITLAPORWEB/AtPublicoDAction.do',
                                data=form_data, timeout=timeout)
    except requests.exceptions.RequestException:
        # requests' ConnectionError/Timeout derive from RequestException,
        # not from the builtin ConnectionError, so the builtin never matched.
        print("Connection error")
        return None
    get_data_from_table(page)
    return None
def get_data_from_table(page):
    """Extract the rows of the results table and append them to the CSV file.

    The HTML in ``page.text`` is searched for the table with id 'filaSel'.
    For every ``<tr>`` in it, the stripped text of each ``<td>`` is collected
    and written as one tab-separated row via ``write_to_file`` in append mode.

    Args:
        page: Response-like object whose ``text`` attribute holds the HTML.

    Returns:
        None. If the page has no 'filaSel' table, nothing is written.
    """
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.find('table', id='filaSel')
    if table is None:
        # find() returns None when the table is absent; without this guard
        # the loop below would raise AttributeError.
        return
    for row in table.find_all('tr'):
        cells = [column.get_text().strip() for column in row.find_all('td')]
        write_to_file(cells, 'a', '\t')
def generate_date(date):
    """Yield every calendar day from ``date`` up to and including today.

    Args:
        date: Start day as a 'dd/mm/YYYY' string.

    Yields:
        Each day in ascending order, formatted as 'dd/mm/YYYY'. Nothing is
        yielded when ``date`` lies after today.

    Raises:
        ValueError: If ``date`` does not match the 'dd/mm/YYYY' format.
    """
    date_format = '%d/%m/%Y'
    first_day = datetime.datetime.strptime(date, date_format).date()
    # Signed day difference: a future start date gives a negative span and
    # therefore an empty range, instead of days beyond today.
    span = (datetime.date.today() - first_day).days
    for offset in range(span + 1):
        yield (first_day + datetime.timedelta(days=offset)).strftime(date_format)
def main():
    """Scrape every day from the fixed start date to today into the CSV file.

    Writes the header row (truncating any previous output), then fetches the
    page for each generated date on a pool of 20 worker threads, and finally
    prints the elapsed wall-clock time.
    """
    start = time.time()
    write_to_file(['RIT', 'RUC', 'F. Ingreso', 'Caratulado', 'Tribunal'], 'w', '\t')
    date = '01/07/2018'
    # The context manager terminates the pool's worker threads on exit;
    # map() has already waited for every task by then.
    with ThreadPool(20) as pool:
        pool.map(get_page, generate_date(date))
    print('Time of working script:', time.time() - start, 'sec')
# Start the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()