From e0a4b24d7f2004f8a0b6cd7dd868e2b6137fba8d Mon Sep 17 00:00:00 2001 From: Ivelin Ivanov Date: Thu, 1 Feb 2024 08:09:26 -0600 Subject: [PATCH 1/7] Create hfhub.py util for HuggingFace Hub model storage HFHub is a convenient cloud service for storing models, datasets and demo spaces. It would make it easier for timeseries ML engineers to build and deploy demos of Darts models and timeseries datasets if they can be saved and loaded from hfhub. Example usage (see cell 56): https://github.com/ivelin/canswim/blob/pypackage/model_sandbox.ipynb --- darts/utils/hfhub.py | 51 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 darts/utils/hfhub.py diff --git a/darts/utils/hfhub.py b/darts/utils/hfhub.py new file mode 100644 index 0000000000..2382141883 --- /dev/null +++ b/darts/utils/hfhub.py @@ -0,0 +1,51 @@ +import pandas as pd +from dotenv import load_dotenv +import os +import tempfile +from typing import Optional +from darts.models.forecasting.forecasting_model import ForecastingModel +from huggingface_hub import snapshot_download, upload_folder, create_repo + + +class HFHub: + """ + HuggingFace Hub integration using official HF API. + https://huggingface.co/docs/huggingface_hub/v0.20.3/en/guides/integrations + """ + + def __init__(self, api_key: Optional[str] = None): + if api_key is None: + # load from .env file or OS vars if available + load_dotenv(override=True) + api_key = os.getenv("HF_TOKEN") + assert ( + api_key is not None + ), "Could not find HF_TOKEN in OS environment. Cannot interact with HF Hub." 
+ self.HF_TOKEN = api_key + + def upload_model( + self, + model: ForecastingModel = None, + repo_id: str = None, + private: Optional[bool] = True, + ): + # Create repo if not existing yet and get the associated repo_id + create_repo(repo_id=repo_id, repo_type="model", private=private, exist_ok=True) + + with tempfile.TemporaryDirectory() as tmpdirname: + # print("created temporary directory for model", tmpdirname) + model.save(path=f"{tmpdirname}/{model.model_name}") + upload_folder(repo_id=repo_id, folder_path=tmpdirname, token=self.HF_TOKEN) + + def download_model( + self, + repo_id: str = None, + model_name: str = None, + model_class: object = None, + ) -> ForecastingModel: + with tempfile.TemporaryDirectory() as tmpdirname: + snapshot_download( + repo_id=repo_id, local_dir=tmpdirname, token=self.HF_TOKEN + ) + model = model_class.load(path=f"{tmpdirname}/{model_name}") + return model From 1d3cae1f694b223d82ce2298ade10fdba4224815 Mon Sep 17 00:00:00 2001 From: Ivelin Ivanov Date: Thu, 1 Feb 2024 08:17:04 -0600 Subject: [PATCH 2/7] Update CHANGELOG.md --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d23107988e..beae08298f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ but cannot always guarantee backwards compatibility. Changes that may **break co ## [Unreleased](https://github.com/unit8co/darts/tree/master) +- Added utility functions for Huggingface Hub integration. Upload/download Darts Forecasting models. [#001](https://https://github.com/unit8co/darts/pull/001) + by [Ivelin Ivanov](https://github.com/ivelin). 
+ + [Full Changelog](https://github.com/unit8co/darts/compare/0.27.2...master) ### For users of the library: From b8a578507fafd8af7db79a5031bca84a1c3c2dc0 Mon Sep 17 00:00:00 2001 From: Ivelin Ivanov Date: Thu, 1 Feb 2024 08:22:32 -0600 Subject: [PATCH 3/7] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index beae08298f..992a144cdd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ but cannot always guarantee backwards compatibility. Changes that may **break co ## [Unreleased](https://github.com/unit8co/darts/tree/master) -- Added utility functions for Huggingface Hub integration. Upload/download Darts Forecasting models. [#001](https://https://github.com/unit8co/darts/pull/001) +- Added utility functions for Huggingface Hub integration. Upload/download Darts Forecasting models. [#2201](https://github.com/unit8co/darts/pull/2201) by [Ivelin Ivanov](https://github.com/ivelin). From 9d50973382688ea188347f9165310da6d603c9f3 Mon Sep 17 00:00:00 2001 From: Ivelin Ivanov Date: Thu, 1 Feb 2024 12:10:50 -0600 Subject: [PATCH 4/7] Update CHANGELOG.md add timeseries info --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 992a144cdd..63a2fb00d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ but cannot always guarantee backwards compatibility. Changes that may **break co ## [Unreleased](https://github.com/unit8co/darts/tree/master) -- Added utility functions for Huggingface Hub integration. Upload/download Darts Forecasting models. [#2201](https://github.com/unit8co/darts/pull/2201) +- Added utility functions for Huggingface Hub integration. Upload/download Darts TimeSeries and ForecastingModel instances. [#2201](https://github.com/unit8co/darts/pull/2201) by [Ivelin Ivanov](https://github.com/ivelin). 
From d05929fa0c23a755610980bbbd8ea3108d7ab814 Mon Sep 17 00:00:00 2001 From: Ivelin Ivanov Date: Thu, 1 Feb 2024 12:12:08 -0600 Subject: [PATCH 5/7] add timeseries upload and download methods --- darts/utils/hfhub.py | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/darts/utils/hfhub.py b/darts/utils/hfhub.py index 2382141883..2debd92288 100644 --- a/darts/utils/hfhub.py +++ b/darts/utils/hfhub.py @@ -3,6 +3,7 @@ import os import tempfile from typing import Optional +from darts import TimeSeries from darts.models.forecasting.forecasting_model import ForecastingModel from huggingface_hub import snapshot_download, upload_folder, create_repo @@ -25,8 +26,8 @@ def __init__(self, api_key: Optional[str] = None): def upload_model( self, - model: ForecastingModel = None, repo_id: str = None, + model: ForecastingModel = None, private: Optional[bool] = True, ): # Create repo if not existing yet and get the associated repo_id @@ -49,3 +50,44 @@ def download_model( ) model = model_class.load(path=f"{tmpdirname}/{model_name}") return model + + def upload_timeseries( + self, + repo_id: str = None, + series: TimeSeries = None, + series_name: str = None, + private: Optional[bool] = True, + ): + # Create repo if not existing yet and get the associated repo_id + repo_info = create_repo( + repo_id=repo_id, repo_type="dataset", private=private, exist_ok=True + ) + # print(f"repo_info: ", repo_info) + df = series.pd_dataframe() + with tempfile.TemporaryDirectory() as tmpdirname: + df.to_parquet(path=f"{tmpdirname}/{series_name}.parquet") + upload_folder( + repo_id=repo_id, + repo_type="dataset", + folder_path=tmpdirname, + token=self.HF_TOKEN, + ) + + def download_timeseries( + self, + repo_id: str = None, + series_name: str = None, + ) -> TimeSeries: + with tempfile.TemporaryDirectory() as tmpdirname: + snapshot_download( + repo_id=repo_id, + repo_type="dataset", + local_dir=tmpdirname, + token=self.HF_TOKEN, + ) + 
print(os.listdir(tmpdirname)) + df = pd.read_parquet( + f"{tmpdirname}/{series_name}.parquet", engine="pyarrow" + ) + ts = TimeSeries.from_dataframe(df) + return ts From 4e5d993b788c1120c55830977d395da280ef0aab Mon Sep 17 00:00:00 2001 From: Ivelin Ivanov Date: Thu, 1 Feb 2024 13:24:59 -0600 Subject: [PATCH 6/7] replace print with logger Addressing review comment by @VascoSch92 https://github.com/unit8co/darts/pull/2201#discussion_r1474996216 --- darts/utils/hfhub.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/darts/utils/hfhub.py b/darts/utils/hfhub.py index 2debd92288..5b2b3b04ab 100644 --- a/darts/utils/hfhub.py +++ b/darts/utils/hfhub.py @@ -6,7 +6,9 @@ from darts import TimeSeries from darts.models.forecasting.forecasting_model import ForecastingModel from huggingface_hub import snapshot_download, upload_folder, create_repo +from darts.logging import get_logger +logger = get_logger(__name__) class HFHub: """ @@ -45,6 +47,10 @@ def download_model( model_class: object = None, ) -> ForecastingModel: with tempfile.TemporaryDirectory() as tmpdirname: + logger.info( + "HFHub model files downloaded to local temp dir: ", + os.listdir(tmpdirname) + ) snapshot_download( repo_id=repo_id, local_dir=tmpdirname, token=self.HF_TOKEN ) @@ -85,7 +91,10 @@ def download_timeseries( local_dir=tmpdirname, token=self.HF_TOKEN, ) - print(os.listdir(tmpdirname)) + logger.info( + "HFHub data files downloaded to local temp dir: ", + os.listdir(tmpdirname) + ) df = pd.read_parquet( f"{tmpdirname}/{series_name}.parquet", engine="pyarrow" ) From 5bcb523265335a9185dfd1c4669eabf6c783c5a4 Mon Sep 17 00:00:00 2001 From: Ivelin Ivanov Date: Fri, 2 Feb 2024 14:46:37 -0600 Subject: [PATCH 7/7] add token param to create_repo --- darts/utils/hfhub.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/darts/utils/hfhub.py b/darts/utils/hfhub.py index 5b2b3b04ab..fb0eadd3c7 100644 --- a/darts/utils/hfhub.py +++ b/darts/utils/hfhub.py @@ -33,7 
+33,8 @@ def upload_model( private: Optional[bool] = True, ): # Create repo if not existing yet and get the associated repo_id - create_repo(repo_id=repo_id, repo_type="model", private=private, exist_ok=True) + create_repo(repo_id=repo_id, repo_type="model", private=private, exist_ok=True, + token=self.HF_TOKEN) with tempfile.TemporaryDirectory() as tmpdirname: # print("created temporary directory for model", tmpdirname) @@ -66,7 +67,8 @@ def upload_timeseries( ): # Create repo if not existing yet and get the associated repo_id repo_info = create_repo( - repo_id=repo_id, repo_type="dataset", private=private, exist_ok=True + repo_id=repo_id, repo_type="dataset", private=private, exist_ok=True, + token=self.HF_TOKEN ) # print(f"repo_info: ", repo_info) df = series.pd_dataframe()