Merge pull request #19 from michaelosthege/country-independent-data-loading

KS · web-flow · commit 094e5b3598ca · 2020-07-05T20:11:08.000-07:00
Move US-specific data loading into its own module
diff --git a/covid/data.py b/covid/data.py
@@ -1,103 +1,52 @@
-import requests
+import typing
 import pandas as pd
-import numpy as np
 import arviz as az
 
-idx = pd.IndexSlice
-
-
-def get_raw_covidtracking_data():
-    """ Gets the current daily CSV from COVIDTracking """
-    url = "https://covidtracking.com/api/v1/states/daily.csv"
-    data = pd.read_csv(url)
-    return data
-
-
-def process_covidtracking_data(data: pd.DataFrame, run_date: pd.Timestamp):
-    """ Processes raw COVIDTracking data to be in a form for the GenerativeModel.
-        In many cases, we need to correct data errors or obvious outliers."""
-    data = data.rename(columns={"state": "region"})
-    data["date"] = pd.to_datetime(data["date"], format="%Y%m%d")
-    data = data.set_index(["region", "date"]).sort_index()
-    data = data[["positive", "total"]]
-
-    # Too little data or unreliable reporting in the data source.
-    data = data.drop(["MP", "GU", "AS", "PR", "VI"])
-
-    # On Jun 5 Covidtracking started counting probable cases too
-    # which increases the amount by 5014.
-    # https://covidtracking.com/screenshots/MI/MI-20200605-184320.png
-    data.loc[idx["MI", pd.Timestamp("2020-06-05") :], "positive"] -= 5014
-
-    # From CT: On June 19th, LDH removed 1666 duplicate and non resident cases
-    # after implementing a new de-duplicaton process.
-    data.loc[idx["LA", pd.Timestamp("2020-06-19") :], :] += 1666
-
-    # Now work with daily counts
-    data = data.diff().dropna().clip(0, None).sort_index()
-
-    # Michigan missed 6/18 totals and lumped them into 6/19 so we've
-    # divided the totals in two and equally distributed to both days.
-    data.loc[idx["MI", pd.Timestamp("2020-06-18")], "total"] = 14871
-    data.loc[idx["MI", pd.Timestamp("2020-06-19")], "total"] = 14871
-
-    # Note that when we set total to zero, the model ignores that date. See
-    # the likelihood function in GenerativeModel.build
-
-    # Huge outlier in NJ causing sampling issues.
-    data.loc[idx["NJ", pd.Timestamp("2020-05-11")], :] = 0
-
-    # Huge outlier in CA causing sampling issues.
-    data.loc[idx["CA", pd.Timestamp("2020-04-22")], :] = 0
-
-    # Huge outlier in CA causing sampling issues.
-    # TODO: generally should handle when # tests == # positives and that
-    # is not an indication of positive rate.
-    data.loc[idx["SC", pd.Timestamp("2020-06-26")], :] = 0
-
-    # Two days of no new data then lumped sum on third day with lack of new total tests
-    data.loc[idx["OR", pd.Timestamp("2020-06-26") : pd.Timestamp("2020-06-28")], 'positive'] = 174
-    data.loc[idx["OR", pd.Timestamp("2020-06-26") : pd.Timestamp("2020-06-28")], 'total'] = 3296
-
-
-    #https://twitter.com/OHdeptofhealth/status/1278768987292209154
-    data.loc[idx["OH", pd.Timestamp("2020-07-01")], :] = 0
-
-    # Nevada didn't report total tests this day
-    data.loc[idx["NV", pd.Timestamp("2020-07-02")], :] = 0
-
-    # A bunch of incorrect values for WA data so nulling them out.
-    data.loc[idx["WA", pd.Timestamp("2020-06-05") : pd.Timestamp("2020-06-07")], :] = 0
-    data.loc[idx["WA", pd.Timestamp("2020-06-20") : pd.Timestamp("2020-06-21")], :] = 0
-
-    # Outlier dates in PA
-    data.loc[
-        idx[
-            "PA",
-            [
-                pd.Timestamp("2020-06-03"),
-                pd.Timestamp("2020-04-21"),
-                pd.Timestamp("2020-05-20"),
-            ],
-        ],
-        :,
-    ] = 0
-
-    # At the real time of `run_date`, the data for `run_date` is not yet available!
-    # Cutting it away is important for backtesting!
-    return data.loc[idx[:, :(run_date - pd.DateOffset(1))], ["positive", "total"]]
-
-
-def get_and_process_covidtracking_data(run_date: pd.Timestamp):
-    """ Helper function for getting and processing COVIDTracking data at once """
-    data = get_raw_covidtracking_data()
-    data = process_covidtracking_data(data, run_date)
-    return data
+from . data_us import (
+    get_and_process_covidtracking_data,
+    get_raw_covidtracking_data,
+    process_covidtracking_data,
+)
+
+# Data loading functions for different countries may be registered here.
+# For the US, the data loader is pre-registered. Additional countries may be
+# registered upon import of third-party modules.
+# Data cleaning must be done by the data loader function!
+LOADERS:typing.Dict[str, typing.Callable[[pd.Timestamp], pd.DataFrame]] = {
+    'us': get_and_process_covidtracking_data,
+}
+
+
+def get_data(country: str, run_date: pd.Timestamp) -> pd.DataFrame:
+    """ Retrieves data for a country using the registered data loader method.
+
+    Parameters
+    ----------
+    country : str
+        short code of the country (key in LOADERS dict)
+    run_date : pd.Timestamp
+        date when the analysis is performed
+
+    Returns
+    -------
+    model_input : pd.DataFrame
+        Data as returned by data loader function.
+        Ideally "as it was on `run_date`", meaning that information such as corrections
+        that became available after `run_date` should not be taken into account.
+        This is important to realistically back-test how the model would have performed at `run_date`.
+    """
+    if not country in LOADERS:
+        raise KeyError(f"No data loader for '{country}' is registered.")
+    result = LOADERS[country](run_date)
+    assert isinstance(result, pd.DataFrame)
+    assert result.index.names == ("region", "date")
+    assert "positive" in result.columns
+    assert "total" in result.columns
+    return result
 
 
 def summarize_inference_data(inference_data: az.InferenceData):
-    """ Summarizes an inference_data object into the form that we publish on
-        rt.live """
+    """ Summarizes an inference_data object into the form that we publish on rt.live """
     posterior = inference_data.posterior
     hdi_mass = 80
     hpdi = az.hdi(posterior.r_t, hdi_prob=hdi_mass / 100).r_t
diff --git a/covid/data_us.py b/covid/data_us.py
@@ -0,0 +1,97 @@
+"""
+This module contains all US-specific data loading and data cleaning routines.
+"""
+import requests
+import pandas as pd
+import numpy as np
+
+idx = pd.IndexSlice
+
+
+def get_raw_covidtracking_data():
+    """ Gets the current daily CSV from COVIDTracking """
+    url = "https://covidtracking.com/api/v1/states/daily.csv"
+    data = pd.read_csv(url)
+    return data
+
+
+def process_covidtracking_data(data: pd.DataFrame, run_date: pd.Timestamp):
+    """ Processes raw COVIDTracking data to be in a form for the GenerativeModel.
+        In many cases, we need to correct data errors or obvious outliers."""
+    data = data.rename(columns={"state": "region"})
+    data["date"] = pd.to_datetime(data["date"], format="%Y%m%d")
+    data = data.set_index(["region", "date"]).sort_index()
+    data = data[["positive", "total"]]
+
+    # Too little data or unreliable reporting in the data source.
+    data = data.drop(["MP", "GU", "AS", "PR", "VI"])
+
+    # On Jun 5 Covidtracking started counting probable cases too
+    # which increases the amount by 5014.
+    # https://covidtracking.com/screenshots/MI/MI-20200605-184320.png
+    data.loc[idx["MI", pd.Timestamp("2020-06-05") :], "positive"] -= 5014
+
+    # From CT: On June 19th, LDH removed 1666 duplicate and non resident cases
+    # after implementing a new de-duplication process.
+    data.loc[idx["LA", pd.Timestamp("2020-06-19") :], :] += 1666
+
+    # Now work with daily counts
+    data = data.diff().dropna().clip(0, None).sort_index()
+
+    # Michigan missed 6/18 totals and lumped them into 6/19 so we've
+    # divided the totals in two and equally distributed to both days.
+    data.loc[idx["MI", pd.Timestamp("2020-06-18")], "total"] = 14871
+    data.loc[idx["MI", pd.Timestamp("2020-06-19")], "total"] = 14871
+
+    # Note that when we set total to zero, the model ignores that date. See
+    # the likelihood function in GenerativeModel.build
+
+    # Huge outlier in NJ causing sampling issues.
+    data.loc[idx["NJ", pd.Timestamp("2020-05-11")], :] = 0
+
+    # Huge outlier in CA causing sampling issues.
+    data.loc[idx["CA", pd.Timestamp("2020-04-22")], :] = 0
+
+    # Huge outlier in SC causing sampling issues.
+    # TODO: generally should handle when # tests == # positives and that
+    # is not an indication of positive rate.
+    data.loc[idx["SC", pd.Timestamp("2020-06-26")], :] = 0
+
+    # Two days of no new data, then a lump sum on the third day and no new total tests
+    data.loc[idx["OR", pd.Timestamp("2020-06-26") : pd.Timestamp("2020-06-28")], 'positive'] = 174
+    data.loc[idx["OR", pd.Timestamp("2020-06-26") : pd.Timestamp("2020-06-28")], 'total'] = 3296
+
+
+    #https://twitter.com/OHdeptofhealth/status/1278768987292209154
+    data.loc[idx["OH", pd.Timestamp("2020-07-01")], :] = 0
+
+    # Nevada didn't report total tests this day
+    data.loc[idx["NV", pd.Timestamp("2020-07-02")], :] = 0
+
+    # A bunch of incorrect values for WA data so nulling them out.
+    data.loc[idx["WA", pd.Timestamp("2020-06-05") : pd.Timestamp("2020-06-07")], :] = 0
+    data.loc[idx["WA", pd.Timestamp("2020-06-20") : pd.Timestamp("2020-06-21")], :] = 0
+
+    # Outlier dates in PA
+    data.loc[
+        idx[
+            "PA",
+            [
+                pd.Timestamp("2020-06-03"),
+                pd.Timestamp("2020-04-21"),
+                pd.Timestamp("2020-05-20"),
+            ],
+        ],
+        :,
+    ] = 0
+
+    # At the real time of `run_date`, the data for `run_date` is not yet available!
+    # Cutting it away is important for backtesting!
+    return data.loc[idx[:, :(run_date - pd.DateOffset(1))], ["positive", "total"]]
+
+
+def get_and_process_covidtracking_data(run_date: pd.Timestamp):
+    """ Helper function for getting and processing COVIDTracking data at once """
+    data = get_raw_covidtracking_data()
+    data = process_covidtracking_data(data, run_date)
+    return data
diff --git a/covid/tests.py b/covid/tests.py
@@ -19,13 +19,32 @@ def test_process(self):
         run_date = pandas.Timestamp('2020-06-25')
         df_processed = covid.data.process_covidtracking_data(df_raw, run_date)
         assert isinstance(df_processed, pandas.DataFrame)
+        assert df_processed.index.names == ("region", "date")
         # the last entry in the data is the day before `run_date`!
         assert df_processed.xs('NY').index[-1] < run_date
         assert df_processed.xs('NY').index[-1] == (run_date - pandas.DateOffset(1))
         assert "positive" in df_processed.columns
         assert "total" in df_processed.columns
 
 
+class TestDataGeneralized:
+    def test_get_unsupported(self):
+        with pytest.raises(KeyError):
+            covid.data.get_data(country="not_a_country", run_date=pandas.Timestamp("2020-06-20"))
+
+    def test_get_us(self):
+        import covid.data_us
+        assert "us" in covid.data.LOADERS
+        run_date = pandas.Timestamp('2020-06-25')
+        result = covid.data.get_data("us", run_date)
+        assert isinstance(result, pandas.DataFrame)
+        assert result.index.names == ("region", "date")
+        assert result.xs('NY').index[-1] < run_date
+        assert result.xs('NY').index[-1] == (run_date - pandas.DateOffset(1))
+        assert "positive" in result.columns
+        assert "total" in result.columns
+
+
 class TestGenerative:
     def test_build(self):
         df_raw = covid.data.get_raw_covidtracking_data()