Skip to content

Commit

Permalink
Feature: Add functionality for loading monpol statements dataset (#20)
Browse files Browse the repository at this point in the history
* Feature: Add functionality for loading monopol statements dataset

* Docs: Update docs and fix missing import

* Fix: Typo 'monopol' to 'monpol'

* Doc: Implement improvements suggested in PR
  • Loading branch information
robin-mader-bis authored May 16, 2024
1 parent 7890368 commit e7008e0
Show file tree
Hide file tree
Showing 14 changed files with 507 additions and 215 deletions.
36 changes: 35 additions & 1 deletion 00_datasets.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ from fastcore.test import test_fail
import matplotlib.pyplot as plt
import pandas as pd
from utils import show_doc
from gingado.datasets import load_BarroLee_1994, load_CB_speeches
from gingado.datasets import load_BarroLee_1994, load_CB_speeches, load_monpol_statements
```

## Real datasets
Expand Down Expand Up @@ -64,6 +64,7 @@ X.head()
y.plot.hist(title='GDP growth', bins=30)
```


<!-- load_CB_speeches -->
```{python}
#| output: asis
Expand Down Expand Up @@ -93,6 +94,39 @@ speeches = load_CB_speeches(2020)
speeches.head()
```


<!-- load_monpol_statements -->
```{python}
#| output: asis
#| echo: false
show_doc(load_monpol_statements)
```

This function downloads monetary policy statements from 26 emerging market central banks
(Armenia, Brazil, Chile, Colombia, Czech Republic, Egypt, Georgia, Hungary,
Israel, India, Kazakhstan, Malaysia, Mongolia, Mexico, Nigeria, Pakistan, Peru,
Philippines, Poland, Romania, Russia, South Africa, South Korea, Thailand,
Türkiye, Ukraine) as well as the Fed and the ECB (press-conference introductory statements).
The dataset includes official English versions of statements for 1998-2023
(starting date varies depending on data availability).
If you use this data in your work, please cite the dataset, as follows:

```
@article{emcbcom,
author = {Tatiana Evdokimova and Piroska Nagy Mohácsi and Olga Ponomarenko and Elina Ribakova},
title = {Central banks and policy communication: How emerging markets have outperformed the Fed and ECB},
year = {2023},
  institution = {Peterson Institute for International Economics},
  url = {https://www.piie.com/publications/working-papers/central-banks-and-policy-communication-how-emerging-markets-have}
}
```

```{python}
# Load monpol statements for 2020
speeches = load_monpol_statements(2020)
speeches.head()
```

## Simulated datasets

:::{.callout-note}
Expand Down
Binary file not shown.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
432 changes: 262 additions & 170 deletions docs/datasets.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/search.json

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions docs/site_libs/bootstrap/bootstrap-dark.min.css

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions docs/site_libs/bootstrap/bootstrap.min.css

Large diffs are not rendered by default.

124 changes: 95 additions & 29 deletions gingado/datasets.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
from __future__ import annotations # Allows forward annotations in Python < 3.10

from io import BytesIO
import os
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
import requests
import numpy as np
from inspect import signature
from sklearn.utils import check_random_state

from gingado.internals import generate_timestamped_file_path, get_latest_timestamped_file_path
from gingado.settings import CACHE_DIRECTORY, CB_SPEECHES_CSV_BASE_FILENAME, CB_SPEECHES_BASE_URL, CB_SPEECHES_ZIP_BASE_FILENAME
from gingado.internals import download_csv, try_read_cached_csv, verify_cached_csv
from gingado.settings import (
CACHE_DIRECTORY,
CB_SPEECHES_CSV_BASE_FILENAME,
CB_SPEECHES_BASE_URL,
CB_SPEECHES_ZIP_BASE_FILENAME,
MONPOL_STATEMENTS_BASE_URL,
MONPOL_STATEMENTS_CSV_BASE_FILENAME
)

__all__ = ['load_BarroLee_1994', 'make_causal_effect']

Expand Down Expand Up @@ -42,7 +46,10 @@ def load_BarroLee_1994(


def load_CB_speeches(
year: str | int | list = 'all', cache: bool = True, timeout: float | None = 120, **kwargs
year: str | int | list = 'all',
cache: bool = True,
timeout: float | None = 120,
**kwargs
) -> pd.DataFrame:
"""Load Central Bankers speeches dataset from
Bank for International Settlements (2024). Central bank speeches, all years,
Expand Down Expand Up @@ -84,12 +91,7 @@ def load_CB_speeches(
# Try to read the CSV file from cache
cb_speeches_year_df: pd.DataFrame | None = None
if cache:
try:
timestamped_file_path = get_latest_timestamped_file_path(cb_speeches_file_path)
cb_speeches_year_df = pd.read_csv(timestamped_file_path, **kwargs)
except FileNotFoundError:
# File is not in cache
cb_speeches_year_df = None
cb_speeches_year_df = try_read_cached_csv(cb_speeches_file_path, **kwargs)

# Download the CSV file, if it could not be loaded from cache
if cb_speeches_year_df is None:
Expand All @@ -99,25 +101,17 @@ def load_CB_speeches(
)
zip_url = CB_SPEECHES_BASE_URL + filename_no_extension + '.zip'

# Download the zip file, unzip it and parse the CSV file:
with requests.get(zip_url, timeout=timeout) as response:
zip_file_content = response.content
speeches_zip = ZipFile(BytesIO(zip_file_content))
with speeches_zip.open(filename_no_extension + '.csv', 'r') as speeches_zip_file:
cb_speeches_year_df = pd.read_csv(speeches_zip_file, **kwargs)

# Write file to cache
timestamped_file_path = generate_timestamped_file_path(cb_speeches_file_path)
Path(cb_speeches_file_path).parent.mkdir(exist_ok=True) # Ensure parent dir exists
cb_speeches_year_df.to_csv(timestamped_file_path, index=False)
# Download the zip file, unzip it and parse the CSV file
cb_speeches_year_df = download_csv(
zip_url,
zipped_filename=filename_no_extension + '.csv',
cache_filename=cb_speeches_file_path,
timeout=timeout,
**kwargs
)

# Verify that the file in the cache is valid
try:
timestamped_file_path = get_latest_timestamped_file_path(cb_speeches_file_path)
verify_df = pd.read_csv(timestamped_file_path)
assert len(verify_df) > 0, f'CB speeches dataset at {timestamped_file_path} is empty.'
except Exception as ex:
raise RuntimeError('Verification error. See previous exception.') from ex
verify_cached_csv(cb_speeches_file_path)

# Add dataframe for year to aggregated list of dataframes
cb_speeches_dfs.append(cb_speeches_year_df)
Expand All @@ -126,6 +120,78 @@ def load_CB_speeches(
return pd.concat(cb_speeches_dfs)


def load_monpol_statements(
    year: str | int | list = 'all',
    cache: bool = True,
    timeout: float | None = 120,
    **kwargs
) -> pd.DataFrame:
    """Load monetary policy statements from multiple central banks.

    The dataset covers 26 emerging market central banks as well as the Fed and
    the ECB, with official English versions of statements (starting date varies
    depending on data availability).

    Args:
        year: Either 'all' to download all available monetary policy statements
            or the year(s) to download. Defaults to 'all'.
        cache: If False, cached data will be ignored and the dataset will be
            downloaded again. Defaults to True.
        timeout: The timeout (in seconds) for downloading each statements file.
            Set to `None` to disable the timeout. Defaults to 120.
        **kwargs: Additional keyword arguments which will be passed to pandas
            `read_csv` function.

    Returns:
        A pandas DataFrame containing the dataset.

    Usage:
        >>> load_monpol_statements()
        >>> load_monpol_statements('2020')
        >>> load_monpol_statements([2020, 2021, 2022])
    """
    # Ensure year is list[str] for uniform handling ('all' stays a single entry)
    if not isinstance(year, list):
        year = [year]
    year = [str(y) for y in year]

    # Load data for each year (or the single combined 'all' file)
    monpol_statements_dfs = []
    for y in year:
        # Both 'all' and a concrete year map to '<base>_<y>.csv', so the
        # filename needs no special-casing.
        filename_csv = f'{MONPOL_STATEMENTS_CSV_BASE_FILENAME}_{y}.csv'

        # Get the file path of the CSV in the local cache
        monpol_statements_file_path = str(Path(CACHE_DIRECTORY) / filename_csv)

        # Try to read the CSV file from cache
        monpol_statements_year_df: pd.DataFrame | None = None
        if cache:
            monpol_statements_year_df = try_read_cached_csv(
                monpol_statements_file_path, **kwargs
            )

        # Download the CSV file, if it could not be loaded from cache
        if monpol_statements_year_df is None:
            # Get CSV file URL
            file_url = MONPOL_STATEMENTS_BASE_URL + filename_csv

            # Download CSV (also writes it to the cache)
            monpol_statements_year_df = download_csv(
                file_url,
                cache_filename=monpol_statements_file_path,
                timeout=timeout,
                **kwargs
            )

            # Verify that the file in the cache is valid
            verify_cached_csv(monpol_statements_file_path)

        # Add dataframe for year to aggregated list of dataframes
        monpol_statements_dfs.append(monpol_statements_year_df)

    # Concat all dataframes into single dataframe and return
    return pd.concat(monpol_statements_dfs)


def make_causal_effect(
n_samples:int=100,
n_features:int=100,
Expand Down
88 changes: 88 additions & 0 deletions gingado/internals.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
"""Functions intended for library internal use."""
import io
import os
from pathlib import Path
import re
import time
from zipfile import ZipFile

import pandas as pd
import requests


def get_latest_timestamped_file_path(file_path: str) -> str:
Expand Down Expand Up @@ -79,3 +84,86 @@ def generate_timestamped_file_path(file_path: str, exists_ok: bool = True) -> st

return timestamped_file_path


def try_read_cached_csv(filename: str, **kwargs) -> pd.DataFrame | None:
    """Try to read a CSV from cache.

    Args:
        filename: The name of the file (NOT the name of the cache file!).

    Returns:
        The contents of the CSV as a dataframe or None if the file is not
        found in cache.
    """
    cached_df: pd.DataFrame | None = None
    try:
        latest_path = get_latest_timestamped_file_path(filename)
        cached_df = pd.read_csv(latest_path, **kwargs)
    except FileNotFoundError:
        pass  # Nothing cached yet; signal this by returning None

    return cached_df


def download_csv(
    url: str,
    cache_filename: str | None = None,
    zipped_filename: str | None = None,
    timeout: float | None = 120,
    **kwargs
) -> pd.DataFrame:
    """Download a CSV and load it into a dataframe.

    Args:
        url: The full HTTP(S) URL of the CSV file.
        cache_filename: The name of the file the downloaded CSV should be cached to. \
            If None, no caching is performed. Defaults to None.
        zipped_filename: If the given file is a zip, this should be set to the file \
            path of the desired CSV in the zip. Defaults to None.
        timeout: The timeout (in seconds) for the HTTP request. Set to `None` to \
            disable the timeout. Defaults to 120.
    Returns:
        The CSV data as a pandas DataFrame.
    Raises:
        requests.HTTPError: If the server responds with an error status code.
    """
    # Get CSV file from URL
    with requests.get(url, timeout=timeout) as response:
        # Fail fast on HTTP errors; otherwise an error page would be parsed
        # (and potentially cached) as if it were the dataset.
        response.raise_for_status()

        if zipped_filename is not None:
            # CSV file is in a zip, get zip file and extract CSV file in memory
            zip_archive = ZipFile(io.BytesIO(response.content))
            csv_io = zip_archive.open(zipped_filename, 'r')
        else:
            # Read CSV file
            # WORKAROUND: File may contain characters not compatible with utf8 encoding. "Fix" them
            # by ignoring them during decode.
            csv_io = io.StringIO(response.content.decode("utf8", errors="ignore"))

        # Read CSV into dataframe
        with csv_io as csv_file:
            df = pd.read_csv(csv_file, **kwargs)

    # Write file to cache
    if cache_filename is not None:
        timestamped_file_path = generate_timestamped_file_path(cache_filename)
        # parents=True: the cache directory may be several levels deep
        Path(timestamped_file_path).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(timestamped_file_path, index=False)

    return df


def verify_cached_csv(file_path: str) -> None:
    """Verify that a given file exists in the cache and can be read as CSV.

    Args:
        file_path: The name of the file (NOT the name of the cache file!)

    Raises:
        RuntimeError: Raised if verification fails.
    """
    try:
        timestamped_file_path = get_latest_timestamped_file_path(file_path)
        verify_df = pd.read_csv(timestamped_file_path)
        # Explicit raise instead of `assert`: asserts are stripped when Python
        # runs with -O, which would silently skip this check.
        if len(verify_df) == 0:
            raise ValueError(
                f'Cached CSV at {timestamped_file_path} is empty. '
                'Manually remove the file or replace with correct dataset.'
            )
    except Exception as ex:
        raise RuntimeError('Verification error. See previous exception.') from ex
10 changes: 9 additions & 1 deletion gingado/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,12 @@
CB_SPEECHES_ZIP_BASE_FILENAME = 'speeches'

# Base path used for storing the speeches files on disk
CB_SPEECHES_CSV_BASE_FILENAME = 'cb_speeches'
CB_SPEECHES_CSV_BASE_FILENAME = 'cb_speeches'

## MONPOL STATEMENTS SETTINGS

# Base URL of monpol statements files (should end in a slash)
MONPOL_STATEMENTS_BASE_URL = 'https://raw.githubusercontent.com/bis-med-it/gingado/main/assets/'

# Base path used for storing the monpol statements files on disk
MONPOL_STATEMENTS_CSV_BASE_FILENAME = 'monpol_statements'

0 comments on commit e7008e0

Please sign in to comment.