-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathncdc_data_scraper.py
125 lines (99 loc) · 4.71 KB
/
ncdc_data_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import os
# Directory containing this script; output CSVs are written next to it.
CWD = os.path.dirname(os.path.abspath(__file__))
# Source CSVs from the Kamparia/nigeria-covid19-data GitHub repository.
STATES_DAILY_CASES_URL = 'https://github.com/Kamparia/nigeria-covid19-data/raw/master/data/csv/ncdc-covid19-states-daily-cases.csv'
STATES_DAILY_DEATHS_URL = 'https://raw.githubusercontent.com/Kamparia/nigeria-covid19-data/master/data/csv/ncdc-covid19-states-daily-deaths.csv'
STATES_DAILY_RECOVERED = 'https://raw.githubusercontent.com/Kamparia/nigeria-covid19-data/master/data/csv/ncdc-covid19-states-daily-recovered.csv'
# NOTE(review): name is a typo for COORD_URL; kept as-is in case other
# modules import it. Not referenced in this file's visible code.
COORD_ULR = 'https://github.com/Kamparia/nigeria-covid19-data/raw/master/data/csv/ncdc-covid19-states.csv'
GEN_UPDATE = 'https://github.com/Kamparia/nigeria-covid19-data/raw/master/data/csv/ncdc-covid19-dailyupdates.csv'
# WHO global daily COVID-19 dataset (all countries; filtered to Nigeria later).
WHO_URL ="https://covid19.who.int/WHO-COVID-19-global-data.csv"
def scrape_ncdc_data():
    """
    Scrape the NCDC COVID-19 dashboard table into a pandas DataFrame.

    Returns a DataFrame with columns ['state', 'confirmed', 'active',
    'recovered', 'deaths'], sorted by state, with the numeric columns
    cast to int. Also writes the result to ``ncdc_latest.csv`` next to
    this script as a side effect.
    """
    PAGE_URL = "https://covid19.ncdc.gov.ng/"
    # Fail fast on network stalls / HTTP errors instead of silently
    # parsing an error page (the original had no timeout or status check).
    response = requests.get(PAGE_URL, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    content_table = soup.find("table", id="custom1")
    # Build {state: row_cells} from the table body; the first <td> of
    # each row is the state name, used as the row key.
    data_dict = {}
    for row in content_table.tbody.find_all('tr'):
        cells = [td.string.strip() for td in row.find_all("td")]
        data_dict[cells[0]] = cells
    # Transpose so each table row becomes a DataFrame row.
    data = pd.DataFrame(data_dict).T
    data.columns = ['state', 'confirmed', 'active', 'recovered', 'deaths']
    # ignore_index=True already yields a fresh 0..n-1 index, so no
    # separate reset_index call is needed.
    data = data.sort_values(by='state', ignore_index=True)
    # Strip thousands separators (e.g. "6,239") and cast to int.
    for col in ('confirmed', 'active', 'recovered', 'deaths'):
        data[col] = data[col].apply(lambda x: int(re.sub(r"[^0-9]", "", x)))
    data_path = os.path.join(CWD, "ncdc_latest.csv")
    data.to_csv(data_path, index=False)
    return data
def get_data():
    """
    Download all needed datasets and return them in a list so they can
    be accessed by index:

        0 = NCDC daily updates
        1 = states daily cases
        2 = states daily deaths
        3 = states daily recoveries
        4 = WHO daily data, filtered to Nigeria

    (The original docstring still listed the retired ``states_csv``
    dataset at index 0, shifting every documented index by one.)
    """
    data = []
    # ncdc-covid19-states.csv is no longer collected: the NCDC site it
    # was scraped from (see scrape_ncdc_data) has been taken down.
    data.append(pd.read_csv(GEN_UPDATE))
    data.append(pd.read_csv(STATES_DAILY_CASES_URL))
    data.append(pd.read_csv(STATES_DAILY_DEATHS_URL))
    data.append(pd.read_csv(STATES_DAILY_RECOVERED))
    # WHO global file: keep only the columns we use, then only Nigeria.
    who_cols = ['Date_reported', 'Country', 'New_cases', 'Cumulative_cases',
                'New_deaths', 'Cumulative_deaths']
    who = pd.read_csv(WHO_URL)[who_cols]
    # The original reassigned identical column names here (a no-op);
    # that dead statement has been removed.
    who_daily = who.loc[who['Country'] == 'Nigeria'].reset_index(drop=True)
    data.append(who_daily)
    return data
def store_data():
    """
    Download all datasets via get_data() and write the first four
    (daily updates, states daily cases/deaths/recoveries) to CSV files
    under ``<script dir>/data_latest/``.
    """
    data = get_data()
    out_dir = os.path.join(CWD, "data_latest")
    # Original crashed if data_latest/ did not exist; create it up front.
    os.makedirs(out_dir, exist_ok=True)
    filenames = [
        "daily_update_latest.csv",
        "states_daily_latest.csv",
        "states_daily_death_latest.csv",
        "states_daily_recovery_latest.csv",
    ]
    # zip stops at the shorter sequence, so the WHO frame (index 4) is
    # intentionally not written — matching the original behavior.
    for df, name in zip(data, filenames):
        df.to_csv(os.path.join(out_dir, name), index=False)
if __name__ == "__main__":
get_data()
store_data()