|
5 | 5 | A few popular time series datasets
|
6 | 6 | """
|
7 | 7 |
|
| 8 | +import os |
8 | 9 | from pathlib import Path
|
9 |
| -from typing import List |
| 10 | +from typing import List, Literal, Optional |
10 | 11 |
|
11 | 12 | import numpy as np
|
12 | 13 | import pandas as pd
|
@@ -813,3 +814,111 @@ def _to_multi_series(self, series: pd.DataFrame) -> List[TimeSeries]:
|
813 | 814 | Load the WeatherDataset dataset as a list of univariate timeseries, one for weather indicator.
|
814 | 815 | """
|
815 | 816 | return [TimeSeries.from_series(series[label]) for label in series]
|
| 817 | + |
| 818 | + |
class ElectricityConsumptionZurichDataset(DatasetLoaderCSV):
    """
    Electricity Consumption of households & SMEs (low voltage) and businesses & services (medium voltage) in the
    city of Zurich [1]_, with values recorded every 15 minutes.

    The electricity consumption is combined with weather measurements recorded by three different
    stations in the city of Zurich with an hourly frequency [2]_. The missing time stamps are filled with NaN.
    The original weather data is recorded every hour. Before adding the features to the electricity consumption,
    the data is resampled to 15 minutes frequency, and missing values are interpolated.

    To simplify the dataset, the measurements from the Zch_Schimmelstrasse and Zch_Rosengartenstrasse weather
    stations are discarded to keep only the data recorded in the Zch_Stampfenbachstrasse station.

    Both dataset sources are updated continuously, but this dataset only retains values between 2015 and 2022.
    The time index was converted from CET time zone to UTC.

    Components Descriptions:

    * Value_NE5 : Households & SMEs electricity consumption (low voltage, grid level 7) in kWh
    * Value_NE7 : Business and services electricity consumption (medium voltage, grid level 5) in kWh
    * Hr [%Hr] : Relative humidity
    * RainDur [min] : Duration of precipitation (divided by 4 for conversion from hourly to quarter-hourly records)
    * T [°C] : Temperature
    * WD [°] : Wind direction
    * WVv [m/s] : Wind vector speed
    * p [hPa] : Air pressure
    * WVs [m/s] : Wind scalar speed
    * StrGlo [W/m2] : Global solar irradiation

    Note: before 2018, the scalar speeds were calculated from the 30 minutes vector data.

    References
    ----------
    .. [1] https://data.stadt-zuerich.ch/dataset/ewz_stromabgabe_netzebenen_stadt_zuerich
    .. [2] https://data.stadt-zuerich.ch/dataset/ugz_meteodaten_stundenmittelwerte
    """

    # NOTE(review): in the component descriptions above, ``Value_NE5`` is described as
    # "grid level 7" and ``Value_NE7`` as "grid level 5". The suffixes and the quoted grid
    # levels do not match -- confirm against the data provider's documentation.

    def __init__(self) -> None:
        """Register the dataset metadata together with its pre-processing callback."""

        def pre_process_dataset(dataset_path) -> None:
            """Restrict the time axis, add the weather data and export the result as CSV.

            Parameters
            ----------
            dataset_path
                Location of the raw electricity consumption CSV file.
            """
            df = pd.read_csv(dataset_path, index_col=0)
            # convert the time index from CET to UTC and extract the pre-determined period
            df = self._cet_index_to_utc_period(df)
            # download and preprocess the weather information
            df_weather = self._download_weather_data()
            # add weather data as additional features (column-wise, aligned on the time index)
            df = pd.concat([df, df_weather], axis=1)
            # fill the gaps; the interpolation is applied to every column of the combined frame
            df = df.interpolate()
            # raining duration is given in minutes -> we divide by 4 from hourly to quarter-hourly records
            df["RainDur [min]"] = df["RainDur [min]"] / 4

            # round Electricity cols to 4 decimals, other columns to 2 decimals
            cols_precise = ["Value_NE5", "Value_NE7"]
            df = df.round(
                decimals={col: (4 if col in cols_precise else 2) for col in df.columns}
            )

            # export the dataset
            df.index.name = "Timestamp"
            df.to_csv(self._get_path_dataset())

        # hash value for dataset with weather data
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "zurich_electricity_consumption.csv",
                uri=(
                    "https://data.stadt-zuerich.ch/dataset/"
                    "ewz_stromabgabe_netzebenen_stadt_zuerich/"
                    "download/ewz_stromabgabe_netzebenen_stadt_zuerich.csv"
                ),
                hash="c2fea1a0974611ff1c276abcc1d34619",
                header_time="Timestamp",
                freq="15min",
                pre_process_csv_fn=pre_process_dataset,
            )
        )

    @staticmethod
    def _cet_index_to_utc_period(df: pd.DataFrame) -> pd.DataFrame:
        """Convert the CET index of `df` to naive UTC and keep only the retained period.

        The index of the frame passed in is replaced in place; both callers hand over a
        frame they own. Rows are kept when their converted time stamp lies between
        ``2015-01-01`` and ``2022-12-31`` (both bounds are midnight time stamps, inclusive).

        Parameters
        ----------
        df
            Frame whose index holds time stamps expressed in the CET time zone.

        Returns
        -------
        pd.DataFrame
            The rows of `df` that fall inside the retained period, indexed by
            time-zone-naive UTC time stamps.
        """
        df.index = (
            pd.DatetimeIndex(df.index, tz="CET").tz_convert("UTC").tz_localize(None)
        )
        return df.loc[
            (pd.Timestamp("2015-01-01") <= df.index)
            & (df.index <= pd.Timestamp("2022-12-31"))
        ]

    @staticmethod
    def _download_weather_data() -> pd.DataFrame:
        """Concatenate the yearly csv files into a single dataframe and reshape it.

        Returns
        -------
        pd.DataFrame
            One row per time stamp (naive UTC) and one column per measurement, named
            ``"<Parameter> [<Einheit>]"``, for the Zch_Stampfenbachstrasse station only.
        """
        # download the csv from the url, one file per year
        base_url = "https://data.stadt-zuerich.ch/dataset/ugz_meteodaten_stundenmittelwerte/download/"
        filenames = [f"ugz_ogd_meteo_h1_{year}.csv" for year in range(2015, 2023)]
        df = pd.concat([pd.read_csv(base_url + fname) for fname in filenames])
        # retain only one weather station; copy so that the column added below is written
        # to an independent frame and not to a slice of the concatenated one
        df = df.loc[df["Standort"] == "Zch_Stampfenbachstrasse"].copy()
        # pivot the df to get all measurements as columns
        df["param_name"] = df["Parameter"] + " [" + df["Einheit"] + "]"
        df = df.pivot(index="Datum", columns="param_name", values="Wert")
        # convert the time index from CET to UTC and extract the required time range
        return ElectricityConsumptionZurichDataset._cet_index_to_utc_period(df)
0 commit comments