|
1 |
| -import requests |
| 1 | +import typing |
2 | 2 | import pandas as pd
|
3 |
| -import numpy as np |
4 | 3 | import arviz as az
|
5 | 4 |
|
6 |
| -idx = pd.IndexSlice |
7 |
| - |
8 |
| - |
9 |
def get_raw_covidtracking_data():
    """ Download the current daily per-state CSV published by COVIDTracking. """
    url = "https://covidtracking.com/api/v1/states/daily.csv"
    return pd.read_csv(url)
14 |
| - |
15 |
| - |
16 |
def process_covidtracking_data(data: pd.DataFrame, run_date: pd.Timestamp):
    """ Processes raw COVIDTracking data to be in a form for the GenerativeModel.
    In many cases, we need to correct data errors or obvious outliers.

    Parameters
    ----------
    data : pd.DataFrame
        raw table as returned by get_raw_covidtracking_data()
    run_date : pd.Timestamp
        date when the analysis is performed; data on/after this date is cut away

    Returns
    -------
    pd.DataFrame
        daily "positive" and "total" counts indexed by ("region", "date")

    Note: relies on the module-level `idx = pd.IndexSlice` alias.
    """
    data = data.rename(columns={"state": "region"})
    data["date"] = pd.to_datetime(data["date"], format="%Y%m%d")
    data = data.set_index(["region", "date"]).sort_index()
    # Keep only cumulative positives and cumulative total tests.
    data = data[["positive", "total"]]

    # Too little data or unreliable reporting in the data source.
    data = data.drop(["MP", "GU", "AS", "PR", "VI"])

    # On Jun 5 Covidtracking started counting probable cases too
    # which increases the amount by 5014.
    # https://covidtracking.com/screenshots/MI/MI-20200605-184320.png
    data.loc[idx["MI", pd.Timestamp("2020-06-05") :], "positive"] -= 5014

    # From CT: On June 19th, LDH removed 1666 duplicate and non resident cases
    # after implementing a new de-duplication process.
    # NOTE(review): this adds 1666 to BOTH "positive" and "total" columns —
    # confirm that "total" should also be shifted, not just "positive".
    data.loc[idx["LA", pd.Timestamp("2020-06-19") :], :] += 1666

    # Now work with daily counts (diff the cumulative series; negative
    # daily values from upstream corrections are clipped to zero).
    data = data.diff().dropna().clip(0, None).sort_index()

    # Michigan missed 6/18 totals and lumped them into 6/19 so we've
    # divided the totals in two and equally distributed to both days.
    data.loc[idx["MI", pd.Timestamp("2020-06-18")], "total"] = 14871
    data.loc[idx["MI", pd.Timestamp("2020-06-19")], "total"] = 14871

    # Note that when we set total to zero, the model ignores that date. See
    # the likelihood function in GenerativeModel.build

    # Huge outlier in NJ causing sampling issues.
    data.loc[idx["NJ", pd.Timestamp("2020-05-11")], :] = 0

    # Huge outlier in CA causing sampling issues.
    data.loc[idx["CA", pd.Timestamp("2020-04-22")], :] = 0

    # Huge outlier in SC causing sampling issues.
    # TODO: generally should handle when # tests == # positives and that
    # is not an indication of positive rate.
    data.loc[idx["SC", pd.Timestamp("2020-06-26")], :] = 0

    # Two days of no new data then lumped sum on third day with lack of new total tests
    data.loc[idx["OR", pd.Timestamp("2020-06-26") : pd.Timestamp("2020-06-28")], 'positive'] = 174
    data.loc[idx["OR", pd.Timestamp("2020-06-26") : pd.Timestamp("2020-06-28")], 'total'] = 3296

    # https://twitter.com/OHdeptofhealth/status/1278768987292209154
    data.loc[idx["OH", pd.Timestamp("2020-07-01")], :] = 0

    # Nevada didn't report total tests this day
    data.loc[idx["NV", pd.Timestamp("2020-07-02")], :] = 0

    # A bunch of incorrect values for WA data so nulling them out.
    data.loc[idx["WA", pd.Timestamp("2020-06-05") : pd.Timestamp("2020-06-07")], :] = 0
    data.loc[idx["WA", pd.Timestamp("2020-06-20") : pd.Timestamp("2020-06-21")], :] = 0

    # Outlier dates in PA
    data.loc[
        idx[
            "PA",
            [
                pd.Timestamp("2020-06-03"),
                pd.Timestamp("2020-04-21"),
                pd.Timestamp("2020-05-20"),
            ],
        ],
        :,
    ] = 0

    # At the real time of `run_date`, the data for `run_date` is not yet available!
    # Cutting it away is important for backtesting!
    return data.loc[idx[:, :(run_date - pd.DateOffset(1))], ["positive", "total"]]
89 |
| - |
90 |
| - |
91 |
def get_and_process_covidtracking_data(run_date: pd.Timestamp):
    """ Convenience wrapper: download the raw COVIDTracking table and clean it. """
    raw = get_raw_covidtracking_data()
    return process_covidtracking_data(raw, run_date)
| 5 | +from . data_us import ( |
| 6 | + get_and_process_covidtracking_data, |
| 7 | + get_raw_covidtracking_data, |
| 8 | + process_covidtracking_data, |
| 9 | +) |
| 10 | + |
# Registry of per-country data loader functions, keyed by country short code.
# The US loader is pre-registered; additional countries may be registered
# when third-party modules are imported.
# Each loader function is responsible for cleaning the data it returns!
LOADERS: typing.Dict[str, typing.Callable[[pd.Timestamp], pd.DataFrame]] = {
    'us': get_and_process_covidtracking_data,
}
| 18 | + |
| 19 | + |
def get_data(country: str, run_date: pd.Timestamp) -> pd.DataFrame:
    """ Retrieves data for a country using the registered data loader method.

    Parameters
    ----------
    country : str
        short code of the country (key in LOADERS dict)
    run_date : pd.Timestamp
        date when the analysis is performed

    Returns
    -------
    model_input : pd.DataFrame
        Data as returned by data loader function.
        Ideally "as it was on `run_date`", meaning that information such as corrections
        that became available after `run_date` should not be taken into account.
        This is important to realistically back-test how the model would have performed at `run_date`.

    Raises
    ------
    KeyError
        if no data loader is registered for `country`
    TypeError
        if the loader did not return a DataFrame
    ValueError
        if the result lacks the ("region", "date") index or expected columns
    """
    if country not in LOADERS:
        raise KeyError(f"No data loader for '{country}' is registered.")
    result = LOADERS[country](run_date)
    # Validate the loader's contract with real exceptions: `assert` statements
    # are stripped under `python -O` and would silently skip these checks.
    if not isinstance(result, pd.DataFrame):
        raise TypeError(f"Loader for '{country}' returned {type(result)}, expected a DataFrame.")
    # Coerce to tuple: `Index.names` is a pandas FrozenList, so comparing
    # against a tuple relies on pandas-specific equality semantics.
    if tuple(result.index.names) != ("region", "date"):
        raise ValueError(f"Expected a ('region', 'date') index, got {result.index.names}.")
    missing = {"positive", "total"} - set(result.columns)
    if missing:
        raise ValueError(f"Loader result is missing columns: {missing}.")
    return result
96 | 46 |
|
97 | 47 |
|
98 | 48 | def summarize_inference_data(inference_data: az.InferenceData):
|
99 |
| - """ Summarizes an inference_data object into the form that we publish on |
100 |
| - rt.live """ |
| 49 | + """ Summarizes an inference_data object into the form that we publish on rt.live """ |
101 | 50 | posterior = inference_data.posterior
|
102 | 51 | hdi_mass = 80
|
103 | 52 | hpdi = az.hdi(posterior.r_t, hdi_prob=hdi_mass / 100).r_t
|
|
0 commit comments