Skip to content

Commit

Permalink
Feature: Add functionality for loading monpol statements dataset (#20)
Browse files Browse the repository at this point in the history
* Feature: Add functionality for loading monopol statements dataset

* Docs: Update docs and fix missing import

* Fix: Typo 'monopol' to 'monpol'

* Doc: Implement improvements suggested in PR
  • Loading branch information
robin-mader-bis authored May 16, 2024
1 parent 7890368 commit e7008e0
Show file tree
Hide file tree
Showing 14 changed files with 507 additions and 215 deletions.
36 changes: 35 additions & 1 deletion 00_datasets.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ from fastcore.test import test_fail
import matplotlib.pyplot as plt
import pandas as pd
from utils import show_doc
from gingado.datasets import load_BarroLee_1994, load_CB_speeches
from gingado.datasets import load_BarroLee_1994, load_CB_speeches, load_monpol_statements
```

## Real datasets
Expand Down Expand Up @@ -64,6 +64,7 @@ X.head()
y.plot.hist(title='GDP growth', bins=30)
```


<!-- load_CB_speeches -->
```{python}
#| output: asis
Expand Down Expand Up @@ -93,6 +94,39 @@ speeches = load_CB_speeches(2020)
speeches.head()
```


<!-- load_monpol_statements -->
```{python}
#| output: asis
#| echo: false
show_doc(load_monpol_statements)
```

This function downloads monetary policy statements from 26 emerging market central banks
(Armenia, Brazil, Chile, Colombia, Czech Republic, Egypt, Georgia, Hungary,
Israel, India, Kazakhstan, Malaysia, Mongolia, Mexico, Nigeria, Pakistan, Peru,
Philippines, Poland, Romania, Russia, South Africa, South Korea, Thailand,
Türkiye, Ukraine) as well as the Fed and the ECB (press-conference introductory statements).
The dataset includes official English versions of statements for 1998-2023
(starting date varies depending on data availability).
If you use this data in your work, please cite the dataset, as follows:

```
@article{emcbcom,
author = {Tatiana Evdokimova and Piroska Nagy Mohácsi and Olga Ponomarenko and Elina Ribakova},
title = {Central banks and policy communication: How emerging markets have outperformed the Fed and ECB},
year = {2023},
  institution = {Peterson Institute for International Economics},
  url = {https://www.piie.com/publications/working-papers/central-banks-and-policy-communication-how-emerging-markets-have}
}
```

```{python}
# Load monpol statements for 2020
speeches = load_monpol_statements(2020)
speeches.head()
```

## Simulated datasets

:::{.callout-note}
Expand Down
Binary file not shown.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
432 changes: 262 additions & 170 deletions docs/datasets.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/search.json

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions docs/site_libs/bootstrap/bootstrap-dark.min.css

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions docs/site_libs/bootstrap/bootstrap.min.css

Large diffs are not rendered by default.

124 changes: 95 additions & 29 deletions gingado/datasets.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
from __future__ import annotations # Allows forward annotations in Python < 3.10

from io import BytesIO
import os
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
import requests
import numpy as np
from inspect import signature
from sklearn.utils import check_random_state

from gingado.internals import generate_timestamped_file_path, get_latest_timestamped_file_path
from gingado.settings import CACHE_DIRECTORY, CB_SPEECHES_CSV_BASE_FILENAME, CB_SPEECHES_BASE_URL, CB_SPEECHES_ZIP_BASE_FILENAME
from gingado.internals import download_csv, try_read_cached_csv, verify_cached_csv
from gingado.settings import (
CACHE_DIRECTORY,
CB_SPEECHES_CSV_BASE_FILENAME,
CB_SPEECHES_BASE_URL,
CB_SPEECHES_ZIP_BASE_FILENAME,
MONPOL_STATEMENTS_BASE_URL,
MONPOL_STATEMENTS_CSV_BASE_FILENAME
)

__all__ = ['load_BarroLee_1994', 'make_causal_effect']

Expand Down Expand Up @@ -42,7 +46,10 @@ def load_BarroLee_1994(


def load_CB_speeches(
year: str | int | list = 'all', cache: bool = True, timeout: float | None = 120, **kwargs
year: str | int | list = 'all',
cache: bool = True,
timeout: float | None = 120,
**kwargs
) -> pd.DataFrame:
"""Load Central Bankers speeches dataset from
Bank for International Settlements (2024). Central bank speeches, all years,
Expand Down Expand Up @@ -84,12 +91,7 @@ def load_CB_speeches(
# Try to read the CSV file from cache
cb_speeches_year_df: pd.DataFrame | None = None
if cache:
try:
timestamped_file_path = get_latest_timestamped_file_path(cb_speeches_file_path)
cb_speeches_year_df = pd.read_csv(timestamped_file_path, **kwargs)
except FileNotFoundError:
# File is not in cache
cb_speeches_year_df = None
cb_speeches_year_df = try_read_cached_csv(cb_speeches_file_path, **kwargs)

# Download the CSV file, if it could not be loaded from cache
if cb_speeches_year_df is None:
Expand All @@ -99,25 +101,17 @@ def load_CB_speeches(
)
zip_url = CB_SPEECHES_BASE_URL + filename_no_extension + '.zip'

# Download the zip file, unzip it and parse the CSV file:
with requests.get(zip_url, timeout=timeout) as response:
zip_file_content = response.content
speeches_zip = ZipFile(BytesIO(zip_file_content))
with speeches_zip.open(filename_no_extension + '.csv', 'r') as speeches_zip_file:
cb_speeches_year_df = pd.read_csv(speeches_zip_file, **kwargs)

# Write file to cache
timestamped_file_path = generate_timestamped_file_path(cb_speeches_file_path)
Path(cb_speeches_file_path).parent.mkdir(exist_ok=True) # Ensure parent dir exists
cb_speeches_year_df.to_csv(timestamped_file_path, index=False)
# Download the zip file, unzip it and parse the CSV file
cb_speeches_year_df = download_csv(
zip_url,
zipped_filename=filename_no_extension + '.csv',
cache_filename=cb_speeches_file_path,
timeout=timeout,
**kwargs
)

# Verify that the file in the cache is valid
try:
timestamped_file_path = get_latest_timestamped_file_path(cb_speeches_file_path)
verify_df = pd.read_csv(timestamped_file_path)
assert len(verify_df) > 0, f'CB speeches dataset at {timestamped_file_path} is empty.'
except Exception as ex:
raise RuntimeError('Verification error. See previous exception.') from ex
verify_cached_csv(cb_speeches_file_path)

# Add dataframe for year to aggregated list of dataframes
cb_speeches_dfs.append(cb_speeches_year_df)
Expand All @@ -126,6 +120,78 @@ def load_CB_speeches(
return pd.concat(cb_speeches_dfs)


def load_monpol_statements(
    year: str | int | list = 'all',
    cache: bool = True,
    timeout: float | None = 120,
    **kwargs
) -> pd.DataFrame:
    """Load monetary policy statements from multiple central banks.

    The dataset covers 26 emerging market central banks as well as the Fed and
    the ECB, with official English versions of statements (starting date varies
    depending on data availability).

    Args:
        year: Either 'all' to download all available monetary policy statements
            or the year(s) to download. Defaults to 'all'.
        cache: If False, cached data will be ignored and the dataset will be
            downloaded again. Defaults to True.
        timeout: The timeout (in seconds) for downloading each statements file.
            Set to `None` to disable the timeout. Defaults to 120.
        **kwargs: Additional keyword arguments which will be passed to pandas
            `read_csv` function.

    Returns:
        A pandas DataFrame containing the dataset.

    Usage:
        >>> load_monpol_statements()
        >>> load_monpol_statements('2020')
        >>> load_monpol_statements([2020, 2021, 2022])
    """
    # Ensure year is list[str] for uniform handling ('all' stays a single entry)
    if not isinstance(year, list):
        year = [year]
    year = [str(y) for y in year]

    # Load data for each year (or the single combined 'all' file)
    monpol_statements_dfs = []
    for y in year:
        # Both 'all' and a concrete year map to '<base>_<y>.csv', so the
        # filename needs no special-casing.
        filename_csv = f'{MONPOL_STATEMENTS_CSV_BASE_FILENAME}_{y}.csv'

        # Get the file path of the CSV in the local cache
        monpol_statements_file_path = str(Path(CACHE_DIRECTORY) / filename_csv)

        # Try to read the CSV file from cache
        monpol_statements_year_df: pd.DataFrame | None = None
        if cache:
            monpol_statements_year_df = try_read_cached_csv(
                monpol_statements_file_path, **kwargs
            )

        # Download the CSV file, if it could not be loaded from cache
        if monpol_statements_year_df is None:
            # Get CSV file URL
            file_url = MONPOL_STATEMENTS_BASE_URL + filename_csv

            # Download CSV (also writes it to the cache)
            monpol_statements_year_df = download_csv(
                file_url,
                cache_filename=monpol_statements_file_path,
                timeout=timeout,
                **kwargs
            )

            # Verify that the file in the cache is valid
            verify_cached_csv(monpol_statements_file_path)

        # Add dataframe for year to aggregated list of dataframes
        monpol_statements_dfs.append(monpol_statements_year_df)

    # Concat all dataframes into single dataframe and return
    return pd.concat(monpol_statements_dfs)


def make_causal_effect(
n_samples:int=100,
n_features:int=100,
Expand Down
88 changes: 88 additions & 0 deletions gingado/internals.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
"""Functions intended for library internal use."""
import io
import os
from pathlib import Path
import re
import time
from zipfile import ZipFile

import pandas as pd
import requests


def get_latest_timestamped_file_path(file_path: str) -> str:
Expand Down Expand Up @@ -79,3 +84,86 @@ def generate_timestamped_file_path(file_path: str, exists_ok: bool = True) -> st

return timestamped_file_path


def try_read_cached_csv(filename: str, **kwargs) -> pd.DataFrame | None:
    """Try to read a CSV from cache.

    Args:
        filename: The name of the file (NOT the name of the cache file!).

    Returns:
        The contents of the CSV as a dataframe or None if the file is not
        found in cache.
    """
    cached_df: pd.DataFrame | None = None
    try:
        latest_path = get_latest_timestamped_file_path(filename)
        cached_df = pd.read_csv(latest_path, **kwargs)
    except FileNotFoundError:
        pass  # Nothing cached yet; signal this by returning None

    return cached_df


def download_csv(
    url: str,
    cache_filename: str | None = None,
    zipped_filename: str | None = None,
    timeout: float | None = 120,
    **kwargs
) -> pd.DataFrame:
    """Download a CSV and load it into a dataframe.

    Args:
        url: The full HTTP(S) URL of the CSV file.
        cache_filename: The name of the file the downloaded CSV should be cached to. \
            If None, no caching is performed. Defaults to None.
        zipped_filename: If the given file is a zip, this should be set to the file \
            path of the desired CSV in the zip. Defaults to None.
        timeout: The timeout (in seconds) for the HTTP request. Set to `None` to \
            disable the timeout. Defaults to 120.
    Returns:
        The CSV data as a pandas DataFrame.
    Raises:
        requests.HTTPError: If the server responds with an error status code.
    """
    # Get CSV file from URL
    with requests.get(url, timeout=timeout) as response:
        # Fail fast on HTTP errors; otherwise an error page would be parsed
        # (and potentially cached) as if it were the dataset.
        response.raise_for_status()

        if zipped_filename is not None:
            # CSV file is in a zip, get zip file and extract CSV file in memory
            zip_archive = ZipFile(io.BytesIO(response.content))
            csv_io = zip_archive.open(zipped_filename, 'r')
        else:
            # Read CSV file
            # WORKAROUND: File may contain characters not compatible with utf8 encoding. "Fix" them
            # by ignoring them during decode.
            csv_io = io.StringIO(response.content.decode("utf8", errors="ignore"))

        # Read CSV into dataframe
        with csv_io as csv_file:
            df = pd.read_csv(csv_file, **kwargs)

    # Write file to cache
    if cache_filename is not None:
        timestamped_file_path = generate_timestamped_file_path(cache_filename)
        # parents=True: the cache directory may be several levels deep
        Path(timestamped_file_path).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(timestamped_file_path, index=False)

    return df


def verify_cached_csv(file_path: str) -> None:
    """Verify that a given file exists in the cache and can be read as CSV.

    Args:
        file_path: The name of the file (NOT the name of the cache file!)

    Raises:
        RuntimeError: Raised if verification fails.
    """
    try:
        timestamped_file_path = get_latest_timestamped_file_path(file_path)
        verify_df = pd.read_csv(timestamped_file_path)
        # Explicit raise instead of `assert`: asserts are stripped when Python
        # runs with -O, which would silently skip this check.
        if len(verify_df) == 0:
            raise ValueError(
                f'Cached CSV at {timestamped_file_path} is empty. '
                'Manually remove the file or replace with correct dataset.'
            )
    except Exception as ex:
        raise RuntimeError('Verification error. See previous exception.') from ex
10 changes: 9 additions & 1 deletion gingado/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,12 @@
CB_SPEECHES_ZIP_BASE_FILENAME = 'speeches'

# Base path used for storing the speeches files on disk
CB_SPEECHES_CSV_BASE_FILENAME = 'cb_speeches'
CB_SPEECHES_CSV_BASE_FILENAME = 'cb_speeches'

## MONPOL STATEMENTS SETTINGS

# Base URL of monpol statements files (should end in a slash)
MONPOL_STATEMENTS_BASE_URL = 'https://raw.githubusercontent.com/bis-med-it/gingado/main/assets/'

# Base path used for storing the monpol statements files on disk
MONPOL_STATEMENTS_CSV_BASE_FILENAME = 'monpol_statements'

0 comments on commit e7008e0

Please sign in to comment.