-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcollect_motion_pages.py
37 lines (28 loc) · 1018 Bytes
/
collect_motion_pages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import requests
import parslepy
import tqdm
import asyncio
# Absolute URLs of the motion pages found so far. main() adds to this set and
# the end of the script writes its contents to motion_links.txt.
links = set()

# parslepy extraction rules applied to every search-result page. Per
# parslepy's rule syntax, the "moties" entry should come back as a list with
# one item per element matching `.m-card__main`, each item carrying under
# "link" the `href` of the `.h-link-inverse` element found inside it.
parselet_list = parslepy.Parselet(
    {"moties(.m-card__main)": [{"link": ".h-link-inverse::attr(href)"}]}
)
# Seconds to wait on a single page request before giving up; without a
# timeout, requests can block forever on a stalled connection.
_REQUEST_TIMEOUT = 30


def _fetch_page(offset):
    """Download the motions search-result page whose ``sta`` value is *offset*."""
    return requests.get(
        f'https://www.tweedekamer.nl/kamerstukken/moties?qry=*&cfg=tksearch&fld_tk_categorie=Kamerstukken&fld_prl_kamerstuk=Moties&srt=date%3Adesc%3Adate&sta={offset}',
        timeout=_REQUEST_TIMEOUT,
    )


async def main():
    """Crawl the motions search results and collect motion URLs into ``links``.

    Requests the search page for every ``sta`` value in ``range(1, 5000, 15)``
    (presumably the result offset, 15 results per page - confirm against the
    site), extracts the link of each motion card with ``parselet_list`` and
    adds it, prefixed with the site root, to the module-level ``links`` set.

    A page that cannot be downloaded or that answers with an HTTP error status
    is reported and skipped, so one bad page does not discard the links
    already gathered from the others.
    """
    loop = asyncio.get_running_loop()

    # requests is a blocking library, so every download is handed to the
    # loop's default executor; all of them are queued before the first await.
    pending = [
        (offset, loop.run_in_executor(None, _fetch_page, offset))
        for offset in range(1, 5000, 15)
    ]

    for offset, future in tqdm.tqdm(pending):
        try:
            response = await future
        except requests.RequestException as exc:
            # tqdm.write prints the message without corrupting the progress bar.
            tqdm.tqdm.write(f'Skipping sta={offset}: {exc}')
            continue
        if not response.ok:
            tqdm.tqdm.write(f'Skipping sta={offset}: HTTP {response.status_code}')
            continue

        # Parse each page as soon as it arrives instead of keeping every
        # response in memory until the last download has finished.
        extracted = parselet_list.parse_fromstring(response.content)
        for item in extracted.get('moties') or []:
            link = item.get('link')
            if link:
                links.add(f'https://www.tweedekamer.nl{link}')
# Run the crawl to completion. asyncio.run creates, runs and closes the event
# loop itself; calling asyncio.get_event_loop() here, with no loop running,
# is deprecated and fails on recent Python versions.
asyncio.run(main())

# Write one URL per line. Sorting makes the file reproducible between runs;
# iterating the set directly yields an arbitrary order.
with open('motion_links.txt', 'w', encoding='utf-8') as f:
    f.writelines(link + '\n' for link in sorted(links))