Skip to content

Commit 094e5b3

Browse files
author
KS
authored
Merge pull request #19 from michaelosthege/country-independent-data-loading
Move US-specific data loading into its own module
2 parents f9580fd + 4508611 commit 094e5b3

File tree

3 files changed

+159
-94
lines changed

3 files changed

+159
-94
lines changed

covid/data.py

+43-94
Original file line numberDiff line numberDiff line change
@@ -1,103 +1,52 @@
1-
import requests
1+
import typing
22
import pandas as pd
3-
import numpy as np
43
import arviz as az
54

6-
idx = pd.IndexSlice
7-
8-
9-
def get_raw_covidtracking_data():
10-
""" Gets the current daily CSV from COVIDTracking """
11-
url = "https://covidtracking.com/api/v1/states/daily.csv"
12-
data = pd.read_csv(url)
13-
return data
14-
15-
16-
def process_covidtracking_data(data: pd.DataFrame, run_date: pd.Timestamp):
17-
""" Processes raw COVIDTracking data to be in a form for the GenerativeModel.
18-
In many cases, we need to correct data errors or obvious outliers."""
19-
data = data.rename(columns={"state": "region"})
20-
data["date"] = pd.to_datetime(data["date"], format="%Y%m%d")
21-
data = data.set_index(["region", "date"]).sort_index()
22-
data = data[["positive", "total"]]
23-
24-
# Too little data or unreliable reporting in the data source.
25-
data = data.drop(["MP", "GU", "AS", "PR", "VI"])
26-
27-
# On Jun 5 Covidtracking started counting probable cases too
28-
# which increases the amount by 5014.
29-
# https://covidtracking.com/screenshots/MI/MI-20200605-184320.png
30-
data.loc[idx["MI", pd.Timestamp("2020-06-05") :], "positive"] -= 5014
31-
32-
# From CT: On June 19th, LDH removed 1666 duplicate and non resident cases
33-
# after implementing a new de-duplication process.
34-
data.loc[idx["LA", pd.Timestamp("2020-06-19") :], :] += 1666
35-
36-
# Now work with daily counts
37-
data = data.diff().dropna().clip(0, None).sort_index()
38-
39-
# Michigan missed 6/18 totals and lumped them into 6/19 so we've
40-
# divided the totals in two and equally distributed to both days.
41-
data.loc[idx["MI", pd.Timestamp("2020-06-18")], "total"] = 14871
42-
data.loc[idx["MI", pd.Timestamp("2020-06-19")], "total"] = 14871
43-
44-
# Note that when we set total to zero, the model ignores that date. See
45-
# the likelihood function in GenerativeModel.build
46-
47-
# Huge outlier in NJ causing sampling issues.
48-
data.loc[idx["NJ", pd.Timestamp("2020-05-11")], :] = 0
49-
50-
# Huge outlier in CA causing sampling issues.
51-
data.loc[idx["CA", pd.Timestamp("2020-04-22")], :] = 0
52-
53-
# Huge outlier in SC causing sampling issues.
54-
# TODO: generally should handle when # tests == # positives and that
55-
# is not an indication of positive rate.
56-
data.loc[idx["SC", pd.Timestamp("2020-06-26")], :] = 0
57-
58-
# Two days of no new data then lumped sum on third day with lack of new total tests
59-
data.loc[idx["OR", pd.Timestamp("2020-06-26") : pd.Timestamp("2020-06-28")], 'positive'] = 174
60-
data.loc[idx["OR", pd.Timestamp("2020-06-26") : pd.Timestamp("2020-06-28")], 'total'] = 3296
61-
62-
63-
#https://twitter.com/OHdeptofhealth/status/1278768987292209154
64-
data.loc[idx["OH", pd.Timestamp("2020-07-01")], :] = 0
65-
66-
# Nevada didn't report total tests this day
67-
data.loc[idx["NV", pd.Timestamp("2020-07-02")], :] = 0
68-
69-
# A bunch of incorrect values for WA data so nulling them out.
70-
data.loc[idx["WA", pd.Timestamp("2020-06-05") : pd.Timestamp("2020-06-07")], :] = 0
71-
data.loc[idx["WA", pd.Timestamp("2020-06-20") : pd.Timestamp("2020-06-21")], :] = 0
72-
73-
# Outlier dates in PA
74-
data.loc[
75-
idx[
76-
"PA",
77-
[
78-
pd.Timestamp("2020-06-03"),
79-
pd.Timestamp("2020-04-21"),
80-
pd.Timestamp("2020-05-20"),
81-
],
82-
],
83-
:,
84-
] = 0
85-
86-
# At the real time of `run_date`, the data for `run_date` is not yet available!
87-
# Cutting it away is important for backtesting!
88-
return data.loc[idx[:, :(run_date - pd.DateOffset(1))], ["positive", "total"]]
89-
90-
91-
def get_and_process_covidtracking_data(run_date: pd.Timestamp):
92-
""" Helper function for getting and processing COVIDTracking data at once """
93-
data = get_raw_covidtracking_data()
94-
data = process_covidtracking_data(data, run_date)
95-
return data
5+
from . data_us import (
6+
get_and_process_covidtracking_data,
7+
get_raw_covidtracking_data,
8+
process_covidtracking_data,
9+
)
10+
11+
# Registry of data loader functions, keyed by the short code of the country.
# The US loader is pre-registered; third-party modules may register loaders
# for additional countries when they are imported.
# Each loader takes the run date and returns the model input DataFrame.
# Data cleaning must be done by the data loader function!
LOADERS: typing.Dict[str, typing.Callable[[pd.Timestamp], pd.DataFrame]] = {
    "us": get_and_process_covidtracking_data,
}
18+
19+
20+
def get_data(country: str, run_date: pd.Timestamp) -> pd.DataFrame:
    """ Retrieves data for a country using the registered data loader method.

    Parameters
    ----------
    country : str
        short code of the country (key in LOADERS dict)
    run_date : pd.Timestamp
        date when the analysis is performed

    Returns
    -------
    model_input : pd.DataFrame
        Data as returned by data loader function.
        Ideally "as it was on `run_date`", meaning that information such as corrections
        that became available after `run_date` should not be taken into account.
        This is important to realistically back-test how the model would have performed at `run_date`.

    Raises
    ------
    KeyError
        If no data loader is registered for `country`.
    AssertionError
        If the loader returned something other than a DataFrame indexed by
        ("region", "date") with "positive" and "total" columns.
    """
    if country not in LOADERS:
        raise KeyError(f"No data loader for '{country}' is registered.")
    result = LOADERS[country](run_date)

    # The loader contract is checked with explicit raises instead of `assert`
    # statements, because `assert` is stripped when Python runs with -O.
    if not isinstance(result, pd.DataFrame):
        raise AssertionError(
            f"Data loader for '{country}' returned {type(result).__name__} instead of a DataFrame."
        )
    if tuple(result.index.names) != ("region", "date"):
        raise AssertionError(
            f"Data loader for '{country}' returned index levels {list(result.index.names)}, "
            "expected ['region', 'date']."
        )
    for column in ("positive", "total"):
        if column not in result.columns:
            raise AssertionError(
                f"Data loader for '{country}' returned no '{column}' column."
            )
    return result
9646

9747

9848
def summarize_inference_data(inference_data: az.InferenceData):
99-
""" Summarizes an inference_data object into the form that we publish on
100-
rt.live """
49+
""" Summarizes an inference_data object into the form that we publish on rt.live """
10150
posterior = inference_data.posterior
10251
hdi_mass = 80
10352
hpdi = az.hdi(posterior.r_t, hdi_prob=hdi_mass / 100).r_t

covid/data_us.py

+97
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
"""
2+
This module contains all US-specific data loading and data cleaning routines.
3+
"""
4+
import requests
5+
import pandas as pd
6+
import numpy as np
7+
8+
idx = pd.IndexSlice
9+
10+
11+
def get_raw_covidtracking_data():
    """ Downloads the current daily states CSV from COVIDTracking and returns it as a DataFrame """
    return pd.read_csv("https://covidtracking.com/api/v1/states/daily.csv")
16+
17+
18+
def process_covidtracking_data(data: pd.DataFrame, run_date: pd.Timestamp):
    """ Processes raw COVIDTracking data to be in a form for the GenerativeModel.
    In many cases, we need to correct data errors or obvious outliers.

    Parameters
    ----------
    data : pd.DataFrame
        Raw table with (at least) the columns "state", "date" (formatted as %Y%m%d),
        "positive" and "total". "positive" and "total" are treated as cumulative
        counts and are differenced into daily counts. The input is not modified.
    run_date : pd.Timestamp
        date when the analysis is performed

    Returns
    -------
    daily : pd.DataFrame
        Daily "positive" and "total" counts indexed by ("region", "date"), containing
        dates up to and including one day before `run_date`.
        Differencing is done per region, so the first available date of every
        region has no daily value and is not part of the result.
    """
    # Bound locally so that the function does not depend on a module-level alias.
    idx = pd.IndexSlice

    data = data.rename(columns={"state": "region"})
    data["date"] = pd.to_datetime(data["date"], format="%Y%m%d")
    data = data.set_index(["region", "date"]).sort_index()
    data = data[["positive", "total"]]

    # Too little data or unreliable reporting in the data source.
    data = data.drop(["MP", "GU", "AS", "PR", "VI"])

    # On Jun 5 Covidtracking started counting probable cases too
    # which increases the amount by 5014.
    # https://covidtracking.com/screenshots/MI/MI-20200605-184320.png
    data.loc[idx["MI", pd.Timestamp("2020-06-05") :], "positive"] -= 5014

    # From CT: On June 19th, LDH removed 1666 duplicate and non resident cases
    # after implementing a new de-duplication process.
    data.loc[idx["LA", pd.Timestamp("2020-06-19") :], :] += 1666

    # Now work with daily counts.
    # The difference is taken within each region: a plain `data.diff()` would
    # subtract the last cumulative value of the previous region from the first
    # value of the next region and produce a bogus first-day count.
    data = data.groupby(level="region").diff().dropna().clip(0, None).sort_index()

    # Michigan missed 6/18 totals and lumped them into 6/19 so we've
    # divided the totals in two and equally distributed to both days.
    data.loc[idx["MI", pd.Timestamp("2020-06-18")], "total"] = 14871
    data.loc[idx["MI", pd.Timestamp("2020-06-19")], "total"] = 14871

    # Note that when we set total to zero, the model ignores that date. See
    # the likelihood function in GenerativeModel.build

    # Huge outlier in NJ causing sampling issues.
    data.loc[idx["NJ", pd.Timestamp("2020-05-11")], :] = 0

    # Huge outlier in CA causing sampling issues.
    data.loc[idx["CA", pd.Timestamp("2020-04-22")], :] = 0

    # Huge outlier in SC causing sampling issues.
    # TODO: generally should handle when # tests == # positives and that
    # is not an indication of positive rate.
    data.loc[idx["SC", pd.Timestamp("2020-06-26")], :] = 0

    # Two days of no new data then lumped sum on third day with lack of new total tests
    data.loc[idx["OR", pd.Timestamp("2020-06-26") : pd.Timestamp("2020-06-28")], "positive"] = 174
    data.loc[idx["OR", pd.Timestamp("2020-06-26") : pd.Timestamp("2020-06-28")], "total"] = 3296

    # https://twitter.com/OHdeptofhealth/status/1278768987292209154
    data.loc[idx["OH", pd.Timestamp("2020-07-01")], :] = 0

    # Nevada didn't report total tests this day
    data.loc[idx["NV", pd.Timestamp("2020-07-02")], :] = 0

    # A bunch of incorrect values for WA data so nulling them out.
    data.loc[idx["WA", pd.Timestamp("2020-06-05") : pd.Timestamp("2020-06-07")], :] = 0
    data.loc[idx["WA", pd.Timestamp("2020-06-20") : pd.Timestamp("2020-06-21")], :] = 0

    # Outlier dates in PA
    data.loc[
        idx[
            "PA",
            [
                pd.Timestamp("2020-06-03"),
                pd.Timestamp("2020-04-21"),
                pd.Timestamp("2020-05-20"),
            ],
        ],
        :,
    ] = 0

    # At the real time of `run_date`, the data for `run_date` is not yet available!
    # Cutting it away is important for backtesting!
    return data.loc[idx[:, :(run_date - pd.DateOffset(1))], ["positive", "total"]]
91+
92+
93+
def get_and_process_covidtracking_data(run_date: pd.Timestamp):
    """ Convenience wrapper that fetches the raw COVIDTracking data and processes it for `run_date` """
    raw = get_raw_covidtracking_data()
    return process_covidtracking_data(raw, run_date)

covid/tests.py

+19
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,32 @@ def test_process(self):
1919
run_date = pandas.Timestamp('2020-06-25')
2020
df_processed = covid.data.process_covidtracking_data(df_raw, run_date)
2121
assert isinstance(df_processed, pandas.DataFrame)
22+
assert df_processed.index.names == ("region", "date")
2223
# the last entry in the data is the day before `run_date`!
2324
assert df_processed.xs('NY').index[-1] < run_date
2425
assert df_processed.xs('NY').index[-1] == (run_date - pandas.DateOffset(1))
2526
assert "positive" in df_processed.columns
2627
assert "total" in df_processed.columns
2728

2829

30+
class TestDataGeneralized:
    """ Tests for the country-independent `covid.data.get_data` entry point. """

    def test_get_unsupported(self):
        # A country code without a registered loader must raise a KeyError.
        with pytest.raises(KeyError):
            covid.data.get_data(country="not_a_country", run_date=pandas.Timestamp("2020-06-20"))

    def test_get_us(self):
        import covid.data_us
        # The US loader is expected to be registered already.
        assert "us" in covid.data.LOADERS
        when = pandas.Timestamp('2020-06-25')
        df_us = covid.data.get_data("us", when)
        assert isinstance(df_us, pandas.DataFrame)
        assert df_us.index.names == ("region", "date")
        # the last entry in the data is the day before the run date
        assert df_us.xs('NY').index[-1] < when
        assert df_us.xs('NY').index[-1] == (when - pandas.DateOffset(1))
        for column in ("positive", "total"):
            assert column in df_us.columns
46+
47+
2948
class TestGenerative:
3049
def test_build(self):
3150
df_raw = covid.data.get_raw_covidtracking_data()

0 commit comments

Comments
 (0)