00-scrape.py
"""
This script scrapes the TSA's FOIA reading room, saving any newly-discovered
PDF files containing the keyword "Contact Center" to the pdfs/ directory.
"""
import pathlib
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

KEYWORD = "Contact Center"
PDF_SAVE_PREFIX = "pdfs/tsa-contact-center-traveler-complaints-report"
BASE_URL = "https://www.tsa.gov/foia/readingroom"
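
# Manual title overrides, keyed by PDF URL, for links whose text presumably
# can't be parsed into a month and year (note the misspelled "septermber" file).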
TITLE_FIXES = {
"https://www.tsa.gov/sites/default/files/foia-readingroom/tsa_contact_center_traveler_complaints_report_septermber.pdf": "September 2019" # noqa: E501
}


def process_link(link):
"""
For each hyperlink in the TSA Reading Room that's a PDF file containing the
keyword "Contact Center" assign it a standardized filename (e.g.
tsa-contact-center-traveler-complaints-report-2019-02.pdf) and write it to
the pdfs/ directory if it's not already there.
Arguments:
- link: the URL of the file to check
"""
href = link["href"]
if KEYWORD not in link.text or not href.endswith(".pdf"):
return
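    # Resolve the (possibly relative) href against the reading-room URL.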
pdf_url = urljoin(BASE_URL, href)
title = TITLE_FIXES.get(pdf_url, link.text)
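    # The title is assumed to end in "<Month> <Year>", e.g. "... March 2019".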
elements = title.split()
year = elements[-1]
# Converts, e.g., March->3
month = datetime.strptime(elements[-2], "%B").month
dest = pathlib.Path(f"{PDF_SAVE_PREFIX}-{year}-{month:02d}.pdf")
if dest.exists():
return
print(f"Downloading {pdf_url}")
pdf_response = requests.get(pdf_url)
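    # Write the raw PDF bytes; this assumes the pdfs/ directory already exists.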
with open(dest, "wb") as pdf_file:
        pdf_file.write(pdf_response.content)


def check_and_download(page):
"""
for a single page in the TSA Reading Room, step through each hyperlink
calling "process_link" for each
arguments:
page -- page # in the reading room
"""
response = requests.get(BASE_URL, params=dict(page=page))
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all("a", href=True):
        process_link(link)


def main():
"""
step through up to 25 pages of posts in the TSA FOIA Reading Room
calling check_and_download for each page
"""
for i in range(25):
print(f"Checking page {i}")
        check_and_download(i)


if __name__ == "__main__":
main()