Create a reusable ML model loader #325

leandro-lucarella-frequenz · 2023-04-12T12:41:40Z

leandro-lucarella-frequenz
Apr 12, 2023
Maintainer

We will soon need to load pre-trained models to be used for predictions, so it would be good if the SDK can provide some framework to do this.

Requirements

The models should be loaded from a common model folder that's mounted into the container.
The models can be updated while the container is running and would then need to be updated in memory as well
The models are serialized data files using Python's pickle module
There will be also a model serialization functionality required in the future for the training code to dump its models to disk
SDK namespace for machine model loading and dumping

Possible base implementation

The below example model loader needs to be adjusted in a way that it is general enough for other forecasts with different loading mechanics.

from abc import ABC, abstractmethod
from typing import Dict, Type, TypeVar, Union
from typing import Optional
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer

from example.model import ModelType

import pandas as pd
import os
import pickle

ModelType = TypeVar("ModelType")

class ModelUpdateHandler(FileSystemEventHandler):
    """Forward file-modification events to a user-supplied callback."""

    def __init__(self, update_callback):
        """Remember the callable to invoke when a file is modified.

        Args:
            update_callback: Callable that receives the path of the modified file.
        """
        super().__init__()
        self.update_callback = update_callback

    def on_modified(self, event):
        """Pass the path of a modified file to the callback.

        Events that refer to directories are ignored.

        Args:
            event: Event object exposing `is_directory` and `src_path`.
        """
        if not event.is_directory:
            self.update_callback(event.src_path)

class ModelHandler(ABC):  # pylint: disable=too-few-public-methods
    """Interface for storing models and watching their files for changes."""

    def __init__(
        self,
        model_cls: Optional[Type[ModelType]] = None,
        model_init_args: Optional[Dict[str, str]] = None,
    ) -> None:
        """Store the model class and its initialization arguments.

        Args:
            model_cls (Type[ModelType], optional): Model class. Defaults to None,
                which allows subclasses to call ``super().__init__()`` without
                arguments.
            model_init_args (Dict[str, str], optional): Additional arguments for model
                initialization. Defaults to None.
        """
        self.model_cls = model_cls
        self.model_init_args = model_init_args
        # Defined up front so stop_model_monitor() is safe to call even when
        # start_model_monitor() was never invoked.
        self._observer = None

    @abstractmethod
    def get_model(self, timestamp: pd.Timestamp) -> ModelType:
        """Return model for the given timestamp.

        Args:
            timestamp (pd.Timestamp): Timestamp for the model.

        Returns:
            ModelType: Model instance.
        """

    def start_model_monitor(self, model_path: str):
        """Start monitoring the directory that holds a model for changes.

        The method may be called several times; each call adds one more watched
        directory to the same observer.

        Args:
            model_path (str): Path to a model file, or to the directory that
                contains it.
        """
        # An existing directory is watched as-is; for a file path the parent
        # directory is watched ("." when the path has no directory part).
        if os.path.isdir(model_path):
            watch_dir = model_path
        else:
            watch_dir = os.path.dirname(model_path) or "."

        event_handler = ModelUpdateHandler(self.reload_model)
        if self._observer is not None:
            # Already running: add a watch instead of replacing (and leaking)
            # the observer that was started earlier.
            self._observer.schedule(event_handler, watch_dir, recursive=False)
            return

        self._observer = Observer()
        self._observer.schedule(event_handler, watch_dir, recursive=False)
        self._observer.start()

    def stop_model_monitor(self):
        """Stop monitoring the model directories for changes.

        Does nothing when monitoring was never started.
        """
        if self._observer:
            self._observer.stop()
            self._observer.join()
            # Forget the stopped observer so a later start creates a new one.
            self._observer = None

    def reload_model(self, model_path: str):
        """Reload the model when the model file is updated.

        Args:
            model_path (str): Path of the file that changed.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError("Subclasses must implement this method.")

class SingleModel(ModelHandler):  # pylint: disable=too-few-public-methods
    """Store a single model, independently of time."""

    def __init__(
        self,
        model_dir: str,
    ) -> None:
        """Handle and return a single model, independently of time.

        It loads only one model (``model.pkl`` inside `model_dir`) and returns it
        independently from the timestamp.

        Args:
            model_dir (str): Path to the directory with model definition.
        """
        super().__init__()
        self.model_path = os.path.join(model_dir, "model.pkl")
        self._load_model()
        # The monitor is given the model *file*: it watches the directory part
        # of the path it receives, so passing `model_dir` itself would end up
        # watching the parent directory and miss changes to model.pkl.
        self.start_model_monitor(self.model_path)

    def _load_model(self):
        """Deserialize the model file into ``self._model``."""
        # NOTE: pickle can execute arbitrary code while loading; the model
        # directory must only contain trusted files.
        with open(self.model_path, "rb") as f:
            self._model = pickle.load(f)

    def get_model(self, timestamp: pd.Timestamp) -> ModelType:
        """Return stored model.

        Args:
            timestamp (pd.Timestamp): Not used.

        Returns:
            ModelType: Model instance.
        """
        return self._model

    def reload_model(self, model_path: str):
        """Reload the model if the changed file is this handler's model file.

        Args:
            model_path (str): Path of the file reported as modified.
        """
        # Compare normalized absolute paths so that an event path spelled
        # differently (relative vs. absolute, "./" prefix) still matches.
        if os.path.abspath(model_path) == os.path.abspath(self.model_path):
            self._load_model()

class DayModel(ModelHandler):  # pylint: disable=too-few-public-methods
    """Store and return a separate model for each time period."""

    default_day_to_number: Dict[str, int] = {
        "mon": 0,
        "tue": 1,
        "wed": 2,
        "thu": 3,
        "fri": 4,
        "sat": 5,
        "sun": 6,
    }

    def __init__(
        self,
        model_dir_prefix: str,
        model_dir_suffix: str,
        period_to_number: Optional[Dict[str, int]] = None,
    ) -> None:
        """Store and return separate models for each time period.

        Args:
            model_dir_prefix (str): Prefix path to the directory with model definition.
            model_dir_suffix (str): Suffix path to the directory with model definition.
            period_to_number (Dict[str, int], optional): Custom mapping of time periods to
                numbers. Defaults to None, in which case `default_day_to_number` is used.

        Note:
            model_dir_prefix + time_period + model_dir_suffix should be the path to the
                directory with the model definition.
        """
        super().__init__()
        self.period_to_number = (
            period_to_number if period_to_number is not None else self.default_day_to_number
        )
        self.model_paths = {
            period: os.path.join(model_dir_prefix + period + model_dir_suffix, "model.pkl")
            for period in self.period_to_number
        }
        self._load_models()
        # The monitor is given each model *file*: it watches the directory part
        # of the path it receives, so passing the period directory itself would
        # watch its parent and miss changes to the model.pkl inside it.
        for model_path in self.model_paths.values():
            self.start_model_monitor(model_path)

    def _load_models(self):
        """Load the pickled model of every period into ``self._models``.

        The resulting dictionary is keyed by the period number taken from
        ``self.period_to_number``.
        """
        models = {}
        for period, period_num in self.period_to_number.items():
            # NOTE: pickle can execute arbitrary code while loading; the model
            # directories must only contain trusted files.
            with open(self.model_paths[period], "rb") as f:
                models[period_num] = pickle.load(f)
        # Assigning the finished dictionary also creates the attribute, which
        # is not initialised anywhere else.
        self._models = models

    def get_model(self, timestamp: pd.Timestamp) -> ModelType:
        """Return model for the time period in the timestamp.

You could invoke these classes as shown further below, but there are a few possibilities for how to use them:

The day-to-number mapping can be passed as an optional argument with a default value. This way, the user can provide a custom mapping if needed.
The optional model_init_args argument can be used to pass additional arguments when initialising models. This allows for custom parameters when working with models that require more than just a directory path. @idlir-shkurti-frequenz not sure if that's needed, as I recall that we might also dump the model (hyper) parameters to a file too.

mathias-baumann-frequenz · 2023-04-12T12:53:07Z

mathias-baumann-frequenz
Apr 12, 2023

It might be worth checking out how the transformers library handles those things. It makes all aspects around models (training, datasets, architectures, downloading, etc) very accessible..

https://huggingface.co/docs/transformers/pipeline_tutorial

We could even use huggingface.co to store models and have them available for download with just one python call.

2 replies

thomas-nicolai-frequenz Apr 12, 2023
Maintainer

Its something we can support later on. Lets not do this right now please.

mathias-baumann-frequenz Apr 12, 2023

The idea was more that we can look at how they design the API and borrow those concepts for ourselves

thomas-nicolai-frequenz · 2023-04-12T15:50:40Z

thomas-nicolai-frequenz
Apr 12, 2023
Maintainer

The above example would assume the following directory structure for the models

models/
    consumption/
        single_model/
            model.pkl
        day_models/
            mon/
                model.pkl
            tue/
                model.pkl
            wed/
                model.pkl
            thu/
                model.pkl
            fri/
                model.pkl
            sat/
                model.pkl
            sun/
                model.pkl
    production/
        single_model/
             ....
        day_models/
             .....

Usage of the above example code might look like this

import pandas as pd
from model_handler import SingleModel, DayModel

# One SingleModel handler per signal. Constructing a handler loads
# <model_dir>/model.pkl and starts watching for changes to it.
consumption_single_model_handler = SingleModel(model_dir="models/consumption/single_model")
production_single_model_handler = SingleModel(model_dir="models/production/single_model")

# One DayModel handler per signal. Each period's model is read from
# <model_dir_prefix><period><model_dir_suffix>/model.pkl, so the prefix must
# end with a path separator when it names the parent directory.
consumption_day_model_handler = DayModel(
    model_dir_prefix="models/consumption/day_models/",
    model_dir_suffix=""
)
production_day_model_handler = DayModel(
    model_dir_prefix="models/production/day_models/",
    model_dir_suffix=""
)

# SingleModel ignores the timestamp and always returns its only model.
timestamp = pd.Timestamp("2023-04-12 12:00:00")
consumption_single_model = consumption_single_model_handler.get_model(timestamp)
production_single_model = production_single_model_handler.get_model(timestamp)

# DayModel selects the model from the timestamp.
# NOTE(review): presumably by weekday (2023-04-12 is a Wednesday) - confirm
# against DayModel.get_model.
consumption_day_model = consumption_day_model_handler.get_model(timestamp)
production_day_model = production_day_model_handler.get_model(timestamp)

# Use the obtained model instances for predictions or any other task
# consumption_single_model.predict(...)
# consumption_day_model.predict(...)
# production_single_model.predict(...)
# production_day_model.predict(...)

0 replies

thomas-nicolai-frequenz · 2023-04-13T10:13:06Z

thomas-nicolai-frequenz
Apr 13, 2023
Maintainer

Here is an extension when having a model for each 15-min window of a given day:

class IntervalModel(ModelHandler):  # pylint: disable=too-few-public-methods
    """Store and return a separate model for each 15-minute interval."""

    def __init__(
        self,
        model_dir_prefix: str,
        model_dir_suffix: str,
    ) -> None:
        """Store and return separate models for each 15-minute interval.

        Args:
            model_dir_prefix (str): Prefix path to the directory with model definition.
            model_dir_suffix (str): Suffix path to the directory with model definition.

        Note:
            model_dir_prefix + interval + model_dir_suffix should be the path to the
                directory with the model definition, where interval is the start of
                the interval formatted as HHMM (0000, 0015, ..., 2345).
        """
        super().__init__()
        self.model_paths = {
            f"{i:02d}{j:02d}": os.path.join(
                model_dir_prefix + f"{i:02d}{j:02d}" + model_dir_suffix, "model.pkl"
            )
            for i in range(0, 24)
            for j in range(0, 60, 15)
        }
        self._load_models()
        # The monitor is given each model *file*: it watches the directory part
        # of the path it receives, so passing the interval directory itself
        # would watch its parent and miss changes to the model.pkl inside it.
        for model_path in self.model_paths.values():
            self.start_model_monitor(model_path)

    def _load_models(self):
        """Load the pickled model of every interval into ``self._models``.

        The dictionary is keyed by the HHMM interval label, which is the key
        `get_model` looks up.
        """
        models = {}
        for interval, model_path in self.model_paths.items():
            # NOTE: pickle can execute arbitrary code while loading; the model
            # directories must only contain trusted files.
            with open(model_path, "rb") as f:
                models[interval] = pickle.load(f)
        self._models = models

    def get_model(self, timestamp: pd.Timestamp) -> ModelType:
        """Return model for the 15-minute interval in the timestamp.

        Args:
            timestamp (pd.Timestamp): Timestamp for the model. It does not need to
                lie on a 15-minute boundary; the interval containing it is used.

        Returns:
            ModelType: Model instance for the given timestamp.
        """
        # Round the minute down to the start of its 15-minute interval so that
        # e.g. 12:07 selects the "1200" model instead of raising KeyError.
        interval_minute = timestamp.minute - timestamp.minute % 15
        interval = f"{timestamp.hour:02d}{interval_minute:02d}"
        return self._models[interval]

    def reload_model(self, model_path: str):
        """Reload the model whose file was modified.

        Paths that do not belong to any interval model are ignored.

        Args:
            model_path (str): Path of the file reported as modified.
        """
        changed = os.path.abspath(model_path)
        for interval, path in self.model_paths.items():
            if os.path.abspath(path) == changed:
                with open(path, "rb") as f:
                    self._models[interval] = pickle.load(f)
                return

The directory structure could look like the following:

models/
    consumption/
        single_model/
            ....
        day_models/
            ...
        interval_models/
            0000/
                model.pkl
            0015/
                model.pkl
            ...
            2345/
                model.pkl

0 replies

idlir-shkurti-frequenz · 2023-04-25T13:55:39Z

idlir-shkurti-frequenz
Apr 25, 2023

Model handler

The following is the current implementation of the model handler in the 24hr forecast actor. This model handler loads all existing models required for the 24hr forecast in a dictionary and uses them one by one to make forecasts for the next 24 hours.

""" Model handler for daily forecast actor. """

from abc import ABC, abstractmethod
from typing import Dict
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer

import os
import pickle
from xgboost.sklearn import XGBRegressor

class ModelUpdateHandler(FileSystemEventHandler):
    """Handler for model updates.

    This class inherits from the FileSystemEventHandler class from the watchdog.events
    module. The purpose of this class is to handle events related to changes in the
    file system, specifically modifications or creations of files.
    """

    def __init__(self, reload_model_func):
        """Create handler for model updates.

        Args:
            reload_model_func: Callable invoked with the path of a modified file.
        """
        # Initialise the base handler as well, so the class keeps working if the
        # base class defines its own state.
        super().__init__()
        self.reload_model_func = reload_model_func

    def on_any_event(self, event):
        """Handle any event.

        Directory events are ignored, file creations are only reported and file
        modifications trigger a reload.

        Args:
            event (watchdog.events.FileSystemEvent): Event that occurred.
        """
        if event.is_directory:
            return
        if event.event_type == 'created':
            print(f"Model file {event.src_path} has been created.")
        elif event.event_type == 'modified':
            print(f"Model file {event.src_path} has been modified. Reloading model...")
            self.reload_model_func(event.src_path)


class ModelHandler(ABC):
    """Interface for storing models.

    Abstract class defining an interface for storing and retrieving machine learning models.
    """

    def __init__(self, *args: str) -> None:
        """Create models and stores internally.

        Args:
            args (str): arguments needed to specify the model directories. Only
                the first one is used (as the model directory prefix); it may be
                omitted.
        """
        self._observer = None
        self._model = None
        # Subclasses may call super().__init__() without arguments, so do not
        # index into an empty tuple.
        self.model_dir_prefix = args[0] if args else None

    @abstractmethod
    def get_model(self, i_ahead: int) -> XGBRegressor:
        """Return model for the given 15min window ahead.

        Args:
            i_ahead (int): 15min window ahead.

        Returns:
            XGBRegressor: Model instance.
        """

    def start_model_monitor(self, model_dir: str):
        """Start monitoring the model directory for changes.

        The method may be called several times; each call adds one more watched
        directory to the same observer.

        Args:
            model_dir (str): Path to the directory with model definition, or to
                a model file inside it.
        """
        print(f"Monitoring model directory {model_dir} for changes...")
        # An existing directory is watched as-is, with or without a trailing
        # slash; for any other path the directory part is watched.
        if os.path.isdir(model_dir):
            watch_dir = model_dir
        else:
            watch_dir = os.path.dirname(model_dir) or "."

        event_handler = ModelUpdateHandler(self.reload_model)
        if self._observer is not None:
            # Already running: add a watch instead of replacing (and leaking)
            # the observer that was started earlier.
            self._observer.schedule(event_handler, watch_dir, recursive=False)
            return

        self._observer = Observer()
        self._observer.schedule(event_handler, watch_dir, recursive=False)
        self._observer.start()

    def stop_model_monitor(self):
        """Stop monitoring the model directory for changes.

        Does nothing when monitoring was never started.
        """
        if self._observer:
            self._observer.stop()
            self._observer.join()
            # Forget the stopped observer so a later start creates a new one.
            self._observer = None

    def reload_model(self, model_path: str):
        """Reload the model when the model file is updated.

        This function reloads the model by deserializing it from the updated file
        and stores it in ``self._model``. Once the model has been reloaded, a
        message is printed to the console indicating that the model has been
        reloaded.

        Args:
            model_path (str): Path to the model file.

        Note:
            model_path should be the path to the model file, not the path to the
            directory containing the model file.
        """
        # NOTE: pickle can execute arbitrary code while loading; the model
        # directory must only contain trusted files.
        with open(model_path, 'rb') as f:  #TODO: Check if pickle accepts extra metadata. No need for new file.
            self._model = pickle.load(f)
        print("Model reloaded.")


class SingleModel(ModelHandler):
    """Retrieve one model per 15-minute window ahead.

    The class loads a set of pre-trained models during initialization and
    stores them in a dictionary, where each key corresponds to the number
    of 15-minute intervals ahead of the current time.
    """

    def __init__(self, model_dir_prefix: str) -> None:
        """Store and return a model for a particular 15min window ahead.

        Args:
            model_dir_prefix (str): Prefix path to the directory with model definition.

        Note:
            model_dir_prefix + 'xgb_model_' + <i_ahead> + '.pkl' should be path to the
            saved model definition.
        """
        # The base initializer takes the directory prefix as its first argument.
        super().__init__(model_dir_prefix)
        self._model_dir_prefix = model_dir_prefix
        self._models: Dict[int, XGBRegressor] = self._load_models()

    def _model_path(self, i_ahead: int) -> str:
        """Return the file path of the model for the given window ahead."""
        return self._model_dir_prefix + 'xgb_model_' + str(i_ahead) + '.pkl'

    def _load_models(
        self
    ) -> Dict[int, XGBRegressor]:
        """Load all models.

        Note:
            self._model_dir_prefix + 'xgb_model_' + <i_ahead> + '.pkl' should be path
                to the saved model definition, for i_ahead in 1..96.

        Returns:
            Dict[int, XGBRegressor]: Dictionary with models for each 15min window.
        """
        models: Dict[int, XGBRegressor] = {}

        for model_name_suffix in range(1, 97):
            # NOTE: pickle can execute arbitrary code while loading; the model
            # directory must only contain trusted files.
            with open(self._model_path(model_name_suffix), "rb") as pickle_file:
                models[model_name_suffix] = pickle.load(pickle_file)
        return models

    def get_model(self, i_ahead: int) -> XGBRegressor:
        """Return a stored model.

        The i_ahead is an integer representing the number of 15-minute intervals
        ahead of the current time. The method returns the stored model
        instance for the specified interval.

        Args:
            i_ahead (int): 15min window ahead.

        Returns:
            XGBRegressor: Model instance.
        """
        return self._models[i_ahead]

    def reload_model(self, model_path: str):
        """Reload the model whose file was modified.

        The entry of ``self._models`` that belongs to the changed file is
        replaced, so `get_model` serves the new model afterwards. Files that are
        not one of the loaded model files are ignored.

        Args:
            model_path (str): Path to the model file that changed.
        """
        changed = os.path.abspath(model_path)
        for i_ahead in self._models:
            if os.path.abspath(self._model_path(i_ahead)) == changed:
                with open(model_path, 'rb') as pickle_file:
                    # Replacing the value of an existing key is safe while
                    # iterating, and the loop ends right after.
                    self._models[i_ahead] = pickle.load(pickle_file)
                print("Model reloaded.")
                return

The handler could be used/tested the following way (note that this will probably have to be run in an infinite loop to be able to constantly monitor the directory):

def test_model_handler():
    '''Test model handler.

    Loads the models, starts the directory monitor and checks that the model
    for the first 15-minute window is an XGBRegressor.
    '''
    _loaded_models_dir = '/dir/to/loaded/models/'
    model_handler = SingleModel(_loaded_models_dir)
    model_handler.start_model_monitor(_loaded_models_dir)
    try:
        model = model_handler.get_model(1)
        assert model is not None
        assert model.__class__.__name__ == 'XGBRegressor'
        print(model)
    finally:
        # Always stop the observer, also when an assertion above fails.
        model_handler.stop_model_monitor()

As discussed with @daniel-zullo-frequenz earlier, one way to load the models at the moment could be that we save the models into specific directories as follows:

-models
    -24h_consumption_models
        -15min_ahead
            15min_ahead_12_06_2022.pkl
            15min_ahead_12_06_2022_metadata.json
            15min_ahead_01_11_2022.pkl
            15min_ahead_01_11_2022_metadata.json
            ...
        -30min_ahead
            30min_ahead_19_07_2022.pkl
            30min_ahead_19_07_2022_metadata.json
            30min_ahead_21_01_2023.pkl
            30min_ahead_21_01_2023_metadata.json
            ...     
        - 45min_ahead
           ...

Monitoring (with `watchdog`)

Each directory will have the consumption models for a specific time ahead forecast and it will be saved with the re-training date in the model name. Priority will be given to more recently trained models. Using watchdog, if we get an alert that a new file has been added to the directory, we check the date when this model was created. If we have added a new model to the directory, we then check the metadata of the new model to see if the version of the trained model is accepted by the current forecasting actor. If so, we reload the new model, otherwise we stick to the existing model.

7 replies

cwasicki May 8, 2023
Collaborator

What will the metadata include? Only the model version? I tend to have the metadata as a separate file in a human-readable format. This provides more flexibility to add metadata we feel useful and also to still edit it independently of the training run. In particular beyond the scope of this model handler that setup has less limitations, e.g. it could define dependencies, is language-agnostic and I could imagine that it integrates easier with tools like mlflow.

If there are concerns about keeping them together, we could gzip both files.

leandro-lucarella-frequenz May 9, 2023
Maintainer Author

While I agree plain text is convenient, reading a pickle file is as simple as python -m pickle [FILE]. Also having 2 files (even if they are zipped together) opens up the possibility of adding inconsistencies (where a metadata doesn't really correspond to the model, because there was a mistake when packing both files together). Because of this, I'm not convinced the added complexity of having 2 separate files is worth it.

$ python 
Python 3.8.13 (default, Jul 21 2022, 13:48:11) 
[GCC 11.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import pickle
>>> pickle.dump(("hello", "world"), open('/tmp/p', 'wb'))
>>>
$ python -m pickle /tmp/p
('hello', 'world')
$ # If we only want to get the metadata because the model is too large
$ # or something, it is a bit uglier, but still pretty simple 
$ python -c 'import pickle; print(pickle.load(open("/tmp/p", "rb"))[0])'
hello

thomas-nicolai-frequenz May 9, 2023
Maintainer

If there are concerns about keeping them together, we could gzip both files.

doesn't feel like a good idea. it needs to be unzipped etc.

thomas-nicolai-frequenz May 9, 2023
Maintainer

also to still edit it independently of the training run

oh no. lets not go for that. the metadata might be related to a specific model version and thus manually editing might break the model. what Gigi used this for was certain parameters that have been used to train the model but will also be needed to load the model and prepare the data etc.

thomas-nicolai-frequenz May 9, 2023
Maintainer

My gut says what @leandro-lucarella-frequenz is saying is probably the best way to go....

daniel-zullo-frequenz · 2023-05-08T07:53:53Z

daniel-zullo-frequenz
May 8, 2023
Maintainer

Also we should use FileWatcher which is a channel receiver that watches for file events (file added, modified and deleted) given a list of Paths to watch for changes.

0 replies

daniel-zullo-frequenz · 2023-05-21T22:41:57Z

daniel-zullo-frequenz
May 21, 2023
Maintainer

To sum up this is the generic directories/paths structure for storing/loading models considering the use-cases in the discussion so far:

models/
    {consumption,production}/
        single/
            (prefix)_model_(suffix).pkl
        day/
            [mon,tue,wed,thu,fri,sat,sun]/
                (prefix)_model_(suffix).pkl
        interval/
            [0000,0015,0030,0045,0100,..,2345]/
                (prefix)_model_(suffix).pkl

I've implemented the single, day and interval models (draft) using FileWatcher to monitor and report changes in the model files. I've also slightly changed the ModelHandler interface to provide the internal function _load_model(), which is common to all models.

ModelHandler

"""Load, update, monitor and retrieve machine learning models."""

from __future__ import annotations

import asyncio
import os
import pickle
from abc import ABC, abstractmethod
from datetime import datetime
from typing import TypeVar

from frequenz.channels.util import FileWatcher, Select
from frequenz.sdk._internal._asyncio import cancel_and_await

T = TypeVar("T")


class ModelHandler(ABC):
    """Interface for loading, updating, and monitoring machine learning models."""

    def __init__(self) -> None:
        """Handle machine learning models."""
        self._select: Select | None = None
        self._monitoring_task: asyncio.Task[None] | None = None

    @abstractmethod
    def get_model(self, index: int | str | datetime) -> T:
        """Get the model for the given index.

        Args:
            index: the index to get the model.

        Returns:
            the model instance.
        """

    @abstractmethod
    def reload_model(self, model_path: str) -> None:
        """Reload the model when the model file is updated.

        Args:
            model_path: the model path to be reloaded.
        """

    async def start_model_monitor(self, model_paths: list[str]) -> None:
        """Start monitoring the model paths for changes.

        Runs until the underlying select is stopped; `reload_model` is called
        for every modification event.

        Args:
            model_paths: the list of model paths to be monitored.
        """
        print(f"Monitoring model paths for changes: {model_paths}")
        # Materialize the paths so that any iterable of paths (for example a
        # dict key view) is handed over as a real list.
        file_watcher = FileWatcher(paths=list(model_paths))

        self._select = Select(file_watcher=file_watcher)

        while await self._select.ready():
            if msg := self._select.file_watcher:
                event = msg.inner
                if event.type == FileWatcher.EventType.CREATE:
                    print(f"Model has been created: {str(event.path)}")
                if event.type == FileWatcher.EventType.MODIFY:
                    print(
                        f"Model file {str(event.path)} has been modified. Reloading model..."
                    )
                    self.reload_model(str(event.path))

    async def stop_model_monitor(self) -> None:
        """Stop monitoring the model paths for changes.

        Does nothing for the parts of the monitor that were never started.
        """
        if self._select:
            await self._select.stop()
        # The task is only set once a subclass has started monitoring.
        if self._monitoring_task is not None:
            await cancel_and_await(self._monitoring_task)

    async def join(self) -> None:
        """Await the monitoring task, and return when the task completes."""
        if self._monitoring_task and not self._monitoring_task.done():
            await self._monitoring_task

    def _load_model(self, model_path: str) -> T:
        """Load the model file.

        Args:
            model_path: the model path to be loaded.

        Returns:
            the model instance.

        Raises:
            FileNotFoundError: if the model path does not exist.
        """
        # Explicit check instead of `assert`, which is removed under `python -O`.
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"The model path {model_path} does not exist.")

        # NOTE: pickle can execute arbitrary code while loading; only load
        # model files from trusted locations.
        with open(model_path, "rb") as file_obj:
            return pickle.load(file_obj)

And these are the models:

SingleModel

from __future__ import annotations

import asyncio
from datetime import datetime
from typing import TypeVar

from ._handler import ModelHandler

T = TypeVar("T")


class SingleModel(ModelHandler):
    """Keep one model in memory and refresh it when its file changes."""

    def __init__(
        self,
        model_path: str,
    ) -> None:
        """Load the model and schedule the monitoring of its file.

        Args:
            model_path: the path to the model file.
        """
        super().__init__()
        self._model_path: str = model_path
        self._model: T = self._load_model(model_path)
        watched_paths = [self._model_path]
        self._monitoring_task: asyncio.Task[None] = asyncio.create_task(
            self.start_model_monitor(watched_paths)
        )

    def get_model(self, index: int | str | datetime = 0) -> T:
        """Return the stored model.

        Args:
            index: accepted for interface compatibility; it is ignored.

        Returns:
            the model instance.
        """
        return self._model

    def reload_model(self, model_path: str):
        """Load the model again if the given path is the handled model file.

        Args:
            model_path: the path of the file that was updated.
        """
        if model_path != self._model_path:
            return
        self._model = self._load_model(model_path)

DayModel

from __future__ import annotations

import asyncio
import calendar
import os
from datetime import datetime
from typing import TypeVar

from ._handler import ModelHandler

T = TypeVar("T")


class DayModel(ModelHandler):
    """Load, update, store and retrieve a separate model for each day period."""

    # Spelled out instead of derived from `calendar.day_abbr`: those names
    # follow the current locale, while the model directories are always named
    # with the English abbreviations. Monday is 0, as in `datetime.weekday()`.
    weekday_to_number: dict[str, int] = {
        day: num
        for num, day in enumerate(("mon", "tue", "wed", "thu", "fri", "sat", "sun"))
    }

    def __init__(
        self,
        model_dir_prefix: str,
        model_dir_suffix: str = "",
        model_file_name: str = "model.pkl",
        period_to_number: dict[str, int] | None = None,
    ) -> None:
        """Handle separate models for each time period.

        Args:
            model_dir_prefix: prefix path to the directory with the models definition.
            model_dir_suffix: suffix path to the directory with the models definition.
            model_file_name: the model file name.
            period_to_number: custom mapping of day periods to numbers. When
                omitted, `weekday_to_number` is used.

        Note:
            model_dir_prefix, time_period, model_dir_suffix and model_file_name
            joined as path components should be the path to the file with the
            model definition.
        """
        super().__init__()
        self._period_to_number = (
            self.weekday_to_number if period_to_number is None else period_to_number
        )

        # Maps each model file path to the period it belongs to.
        self._model_paths: dict[str, str] = {
            os.path.join(
                model_dir_prefix, period, model_dir_suffix, model_file_name
            ): period
            for period in self._period_to_number
        }

        self._models: dict[int, T] = {}
        for model_path, period in self._model_paths.items():
            period_num = self._period_to_number[period]
            self._models[period_num] = self._load_model(model_path)

        # The monitor is declared to take a list of paths, so pass a real list
        # rather than a dict key view.
        self._monitoring_task: asyncio.Task[None] = asyncio.create_task(
            self.start_model_monitor(list(self._model_paths))
        )

    def get_model(self, index: int | str | datetime = 0) -> T:
        """Get the model for the given index.

        Args:
            index: selects the model. An int is used directly as the period
                number, a str is looked up in the period mapping, and any other
                value is expected to provide `weekday()` (e.g. a datetime).

        Returns:
            the model instance.

        Raises:
            KeyError: if the period name or number is unknown.
        """
        if isinstance(index, int):
            return self._models[index]
        if isinstance(index, str):
            # Explicit check instead of `assert`, which is removed under
            # `python -O`.
            if index not in self._period_to_number:
                raise KeyError(f"Unknown period: {index}")
            return self._models[self._period_to_number[index]]

        return self._models[index.weekday()]

    def reload_model(self, model_path: str):
        """Reload the model when the model file is updated.

        Paths that do not belong to any handled model are ignored.

        Args:
            model_path: the model path to be reloaded.
        """
        period = self._model_paths.get(model_path)
        if period is None:
            return
        period_num = self._period_to_number[period]
        self._models[period_num] = self._load_model(model_path)

IntervalModel

from __future__ import annotations

import asyncio
import os
from datetime import datetime, timedelta
from typing import TypeVar

from ._handler import ModelHandler

T = TypeVar("T")


class IntervalModel(ModelHandler):
    """Load, update, store and retrieve a separate model for each interval period.

    A day is split into equally sized intervals. Each interval is identified
    by an "HHMM" string (its start time) and by a consecutive number starting
    at 0 for "0000". One model file is expected per interval.
    """

    def __init__(
        self,
        model_dir_prefix: str,
        model_dir_suffix: str = "",
        model_file_name: str = "model.pkl",
        interval: timedelta = timedelta(minutes=15),
    ) -> None:
        """Handle separate models for each time period.

        All models are loaded eagerly and a task is created to monitor the
        model files, so this must be called while an asyncio event loop is
        running (`asyncio.create_task` requires one).

        Args:
            model_dir_prefix: prefix path to the directory with the models definition.
            model_dir_suffix: suffix path to the directory with the models definition.
            model_file_name: the model file name.
            interval: the interval period in which models are defined. It must
                be positive, a whole number of minutes, and divide one hour
                evenly (e.g. 5, 15, 30 or 60 minutes).

        Raises:
            ValueError: if `interval` does not satisfy the constraints above.

        Note:
            The path to each model file is built by joining model_dir_prefix,
            the "HHMM" period name, model_dir_suffix and model_file_name.
        """
        zero = timedelta(0)
        one_hour = timedelta(hours=1)
        one_minute = timedelta(minutes=1)
        # Explicit validation instead of `assert` (stripped under -O). The
        # whole-minute requirement is needed because period names only have
        # minute resolution: a sub-minute interval would otherwise produce a
        # zero range step or several names sharing the same number.
        if (
            interval <= zero
            or one_hour % interval != zero
            or interval % one_minute != zero
        ):
            raise ValueError(
                "interval must be a positive whole number of minutes that "
                f"divides one hour evenly, got {interval!r}"
            )

        super().__init__()

        intervals_per_hour = one_hour // interval
        minutes_per_interval = interval // one_minute
        # Map "HHMM" -> consecutive interval number within the day.
        self._interval_to_number: dict[str, int] = {
            f"{h:02d}{m:02d}": h * intervals_per_hour + m // minutes_per_interval
            for h in range(24)
            for m in range(0, 60, minutes_per_interval)
        }

        self._interval: timedelta = interval

        # Map model file path -> "HHMM" period name.
        self._model_paths: dict[str, str] = {
            os.path.join(
                model_dir_prefix, period, model_dir_suffix, model_file_name
            ): period
            for period in self._interval_to_number
        }

        # Map interval number -> loaded model.
        # NOTE(review): `_load_model` and `start_model_monitor` are not defined
        # in this class; presumably they are provided by `ModelHandler`.
        self._models: dict[int, T] = {
            self._interval_to_number[period]: self._load_model(model_path)
            for model_path, period in self._model_paths.items()
        }

        self._monitoring_task: asyncio.Task[None] = asyncio.create_task(
            self.start_model_monitor(self._model_paths.keys())
        )

    def get_model(self, index: int | str | datetime = 0) -> T:
        """Get the model for the given index.

        Args:
            index: selects the model to return. An `int` is used directly as
                the interval number, a `str` is an "HHMM" period name, and any
                other value is treated as a `datetime` whose time of day is
                rounded down to the start of its interval.

        Returns:
            the model instance stored for the resolved interval.

        Raises:
            KeyError: if the period name is unknown or if no model is stored
                for the resolved interval number.
        """
        if isinstance(index, int):
            return self._models[index]
        if isinstance(index, str):
            return self.get_model(self._interval_to_number[index])

        # Round the minute down to the start of the interval it falls into,
        # e.g. 16:48 with a 15 minute interval resolves to "1645".
        minutes_per_interval = self._interval // timedelta(minutes=1)
        minutes = (index.minute // minutes_per_interval) * minutes_per_interval
        return self.get_model(f"{index.hour:02d}{minutes:02d}")

    def reload_model(self, model_path: str) -> None:
        """Reload the model when the model file is updated.

        Paths that are not tracked by this handler are ignored.

        Args:
            model_path: the model path to be reloaded.
        """
        if model_path in self._model_paths:
            period = self._model_paths[model_path]
            period_num = self._interval_to_number[period]
            self._models[period_num] = self._load_model(model_path)

Usage examples:

# Single
single_model = SingleModel(model_path="models/consumption/single/model.pkl")
model = single_model.get_model()

# Day
day_model = DayModel(model_dir_prefix="models/consumption/day")
model_mon = day_model.get_model(0)
model_tue = day_model.get_model("tue")
# 2023-05-21 is a Sunday, so the datetime resolves to the Sunday model.
model_sun = day_model.get_model(datetime(2023, 5, 21))

# Interval
interval_model = IntervalModel("models/consumption/interval")
# With the default 15 minute interval, number 4 is the period starting at 01:00.
model_4 = interval_model.get_model(4)
model_0100 = interval_model.get_model("0100")
assert model_4 == model_0100
model_1715 = interval_model.get_model("1715")
# A datetime is rounded down to the start of its interval: 16:48 -> "1645".
model_1645 = interval_model.get_model(datetime(2023, 5, 21, 16, 45))
model_1648 = interval_model.get_model(datetime(2023, 5, 21, 16, 48))
assert model_1645 == model_1648

6 replies

cwasicki May 22, 2023
Collaborator

@daniel-zullo-frequenz The interval model is different from the setup @idlir-shkurti-frequenz is currently using. AFAIK he uses one model per time ahead of current time while the interval model is assuming one model per time window of the day.

I wonder whether this kind of logic (Interval model, weekday model) should be part of the SDK at all, since there can be other valid model setups beyond those (e.g. one model for each 15 min window in the next 6h and another model for the remaining 15 min windows). But we cannot cover all potential use-cases.

Alternatively I think the app should be responsible for the logic which model to pick. The model handler in the SDK would only be passed a dictionary with model identifier keys and the corresponding folder to the model. Does that make sense?

thomas-nicolai-frequenz May 23, 2023
Maintainer

since there can be other valid model setups beyond those

That's just a matter of abstraction imho.

But we cannot cover all potential use-cases.

Yes, surely this can be extended but it does not mean that this shouldn't live in the SDK. If it does not live in the SDK each app is going to implement almost the same and I suggest we aim as much as possible to abstract/unify this.

daniel-zullo-frequenz May 23, 2023
Maintainer

I think at this stage it would be best to move this to an actual PR, because it looks like that we have a (initial :) final proposal of code we could actually end up merging, and it is much easier to review code in a PR than a discussion. Once you create the PR link it here and close the discussion (we can reopen it if needed).

created the initial draft PR in #397

daniel-zullo-frequenz May 23, 2023
Maintainer

The interval model is different from the setup @idlir-shkurti-frequenz is currently using. AFAIK he uses one model per time ahead of current time while the interval model is assuming one model per time window of the day.

Yes, you're right. IIUC we can solve that by providing a custom mapping of interval periods to numbers (indexes), as was done for DayModel; that way the app can use models ahead of or behind the current time. I think I just forgot that, and IntervalModel only allows customizing the interval at the moment. I'll update it in PR #397.

daniel-zullo-frequenz May 23, 2023
Maintainer

I wonder whether this kind of logic (Interval model, weekday model) should be part of the SDK at all, since there can be other valid model setups beyond those (e.g. one model for each 15 min window in the next 6h and another model for the remaining 15 min windows). But we cannot cover all potential use-cases.

I think they should live in the SDK as IMO they are the common model handlers, they allow customization but indeed they won't cover all the special use-cases. So in that case, the app should provide the specific model handler. Ideally ModelHandler should provide all the boiler-plate functionality so that it is simple to implement a new type of model handler if needed.

Alternatively I think the app should be responsible for the logic which model to pick. The model handler in the SDK would only be passed a dictionary with model identifier keys and the corresponding folder to the model. Does that make sense?

In the initial draft the app can use the different indexes to pick the model (int index, string or datetime) and the app can provide a custom mapping (day or interval) as I mentioned before. I'll work out examples in the PR so that it is easier to see and review the different solutions.

llucax · 2024-06-19T12:07:58Z

llucax
Jun 19, 2024
Maintainer

We already have several WIP, so I'm closing this, we can discuss in the PRs.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Create a reusable ML model loader #325

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 7 comments 15 replies

{{title}}

{{title}}

{{title}}

{{title}}

{{editor}}'s edit

{{editor}}'s edit

{{title}}

{{title}}

{{title}}

{{title}}

{{editor}}'s edit

{{editor}}'s edit

{{title}}

{{title}}

{{title}}

{{title}}

{{title}}

{{title}}

{{title}}

{{title}}

{{editor}}'s edit

{{editor}}'s edit

{{title}}

{{editor}}'s edit

{{editor}}'s edit

{{title}}

{{editor}}'s edit

{{editor}}'s edit

{{title}}

Select a reply

Create a reusable ML model loader #325

leandro-lucarella-frequenz Apr 12, 2023 Maintainer

Requirements

Possible base implementation

Replies: 7 comments · 15 replies

mathias-baumann-frequenz Apr 12, 2023

thomas-nicolai-frequenz Apr 12, 2023 Maintainer

mathias-baumann-frequenz Apr 12, 2023

thomas-nicolai-frequenz Apr 12, 2023 Maintainer

thomas-nicolai-frequenz Apr 13, 2023 Maintainer

idlir-shkurti-frequenz Apr 25, 2023

Model handler

Monitoring (with watchdog)

cwasicki May 8, 2023 Collaborator

leandro-lucarella-frequenz May 9, 2023 Maintainer Author

thomas-nicolai-frequenz May 9, 2023 Maintainer

thomas-nicolai-frequenz May 9, 2023 Maintainer

thomas-nicolai-frequenz May 9, 2023 Maintainer

daniel-zullo-frequenz May 8, 2023 Maintainer

daniel-zullo-frequenz May 21, 2023 Maintainer

cwasicki May 22, 2023 Collaborator

thomas-nicolai-frequenz May 23, 2023 Maintainer

daniel-zullo-frequenz May 23, 2023 Maintainer

daniel-zullo-frequenz May 23, 2023 Maintainer

daniel-zullo-frequenz May 23, 2023 Maintainer

llucax Jun 19, 2024 Maintainer

leandro-lucarella-frequenz
Apr 12, 2023
Maintainer

Replies: 7 comments 15 replies

mathias-baumann-frequenz
Apr 12, 2023

thomas-nicolai-frequenz Apr 12, 2023
Maintainer

thomas-nicolai-frequenz
Apr 12, 2023
Maintainer

thomas-nicolai-frequenz
Apr 13, 2023
Maintainer

idlir-shkurti-frequenz
Apr 25, 2023

Monitoring (with `watchdog`)

cwasicki May 8, 2023
Collaborator

leandro-lucarella-frequenz May 9, 2023
Maintainer Author

thomas-nicolai-frequenz May 9, 2023
Maintainer

thomas-nicolai-frequenz May 9, 2023
Maintainer

thomas-nicolai-frequenz May 9, 2023
Maintainer

daniel-zullo-frequenz
May 8, 2023
Maintainer

daniel-zullo-frequenz
May 21, 2023
Maintainer

cwasicki May 22, 2023
Collaborator

thomas-nicolai-frequenz May 23, 2023
Maintainer

daniel-zullo-frequenz May 23, 2023
Maintainer

daniel-zullo-frequenz May 23, 2023
Maintainer

daniel-zullo-frequenz May 23, 2023
Maintainer

llucax
Jun 19, 2024
Maintainer