web_scraper.py
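"""Download NHANES data files from the CDC website.

Walks each continuous-NHANES survey cycle page, follows every public
(non-LimitedAccess) component section, and saves the .XPT data files
under NHANES/<year>/<section>/.
"""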
import requests
from bs4 import BeautifulSoup
import os
from os.path import exists


# Scrape and parse the HTML content of a page
def soupify(URL):
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    return soup

# Gets all the sections for the year provided
def get_section(yearURL):
    dataset = soupify(baseURL + yearURL).find('ul', {'class': 'mb-0 block-list'})
    for data in dataset:
        newsoup = BeautifulSoup(str(data), 'html.parser')
        links = newsoup.findAll('a', {'class': 'list-title td-none td-ul-hover'})
        for link in links:
            # Drop the leading ".." from the relative href; note that
            # strip("..") would remove any run of leading/trailing dots,
            # not the literal ".." prefix
            link = link.get("href")
            if link.startswith(".."):
                link = link[2:]
            year = link[-4:]
            section = link[link.find("Component=") + len("Component="):link.find("&")]
            if section != 'LimitedAccess':
                get_download_link(link, year, section)
                print("FINISHED DOWNLOADING " + section + "\n")

# Gets the download link for each file in the corresponding year and section
def get_download_link(sectionURL, year, section):
    dataset = soupify(baseURL + sectionURL).find('tbody')
    for data in dataset:
        newsoup = BeautifulSoup(str(data), 'html.parser')
        links = newsoup.findAll('a')
        for link in links:
            dataFileLink = link.get("href")
            # Guard against anchors without an href before checking the suffix
            if dataFileLink and dataFileLink.endswith('.XPT'):
                download('https://wwwn.cdc.gov/' + dataFileLink, dest_folder="NHANES/" + year + "/" + section)

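# Illustrative (assumed) data-file href picked up above; the actual paths
# come from the CDC page itself:
#   Nchs/Nhanes/2013-2014/DEMO_H.XPT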
# Saves the file locally to a folder called NHANES
def download(url: str, dest_folder: str):
    if not exists(dest_folder):
        os.makedirs(dest_folder)  # create folder if it does not exist
    filename = url.split('/')[-1].replace(" ", "_")
    file_path = os.path.join(dest_folder, filename)
    if not exists(file_path):
        r = requests.get(url, stream=True)
        if r.ok:
            print("Saving to", os.path.abspath(file_path))
            with open(file_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 8):
                    if chunk:
                        f.write(chunk)
                        f.flush()
                        os.fsync(f.fileno())
        else:  # HTTP status code 4XX/5XX
            print("Download failed: status code {}\n{}".format(r.status_code, r.text))

baseURL = "https://wwwn.cdc.gov/nchs/nhanes"

# Walk the survey cycles from newest to oldest; continuous NHANES cycles
# begin in odd years (1999-2000 through 2013-2014), so step back by two
for year in range(2013, 1998, -2):
    get_section('/continuousnhanes/default.aspx?BeginYear=' + str(year))
    print("--FINISHED DOWNLOADING " + str(year) + "--\n\n")