|
26 | 26 | When static covariates are present, they are appended to the lagged features. When multiple time series are passed,
|
27 | 27 | if their static covariates do not have the same size, the shorter ones are padded with 0 valued features.
|
28 | 28 | """
|
29 |
| - |
30 | 29 | from collections import OrderedDict
|
31 | 30 | from typing import List, Optional, Sequence, Tuple, Union
|
32 | 31 |
|
|
38 | 37 | from darts.timeseries import TimeSeries
|
39 | 38 | from darts.utils.data.tabularization import create_lagged_training_data
|
40 | 39 | from darts.utils.multioutput import MultiOutputRegressor
|
41 |
| -from darts.utils.utils import _check_quantiles, seq2series, series2seq |
| 40 | +from darts.utils.utils import ( |
| 41 | + _check_quantiles, |
| 42 | + get_single_series, |
| 43 | + seq2series, |
| 44 | + series2seq, |
| 45 | +) |
42 | 46 |
|
43 | 47 | logger = get_logger(__name__)
|
44 | 48 |
|
@@ -1029,3 +1033,299 @@ def _quantile_sampling(self, model_output: np.ndarray) -> np.ndarray:
|
class _QuantileModelContainer(OrderedDict):
    """
    Insertion-ordered mapping that always starts out empty.

    NOTE(review): judging by the name, this presumably stores one fitted model per quantile;
    confirm against the callers, which are outside this chunk.
    """

    def __init__(self) -> None:
        # No initial items are accepted: entries are added after construction only.
        OrderedDict.__init__(self)
|
| 1036 | + |
| 1037 | + |
class RegressionModelWithCategoricalCovariates(RegressionModel):
    """
    `RegressionModel` variant that tells the wrapped regressor which lagged features are categorical.

    The names of the categorical past, future and static covariates are declared at model creation. At fit
    time they are validated against the data, translated into column indices of the tabularized feature
    matrix, and handed to the underlying model's `fit` method through the keyword argument named by
    `_categorical_fit_param_name`.
    """

    def __init__(
        self,
        lags: Optional[Union[int, list]] = None,
        lags_past_covariates: Optional[Union[int, List[int]]] = None,
        lags_future_covariates: Optional[Union[Tuple[int, int], List[int]]] = None,
        output_chunk_length: int = 1,
        add_encoders: Optional[dict] = None,
        model=None,
        multi_models: Optional[bool] = True,
        categorical_past_covariates: Optional[Union[str, List[str]]] = None,
        categorical_future_covariates: Optional[Union[str, List[str]]] = None,
        categorical_static_covariates: Optional[Union[str, List[str]]] = None,
    ):
        """
        Extension of `RegressionModel` for regression models that support categorical covariates.

        Parameters
        ----------
        lags
            Lagged target values used to predict the next time step. If an integer is given the last `lags` past lags
            are used (from -1 backward). Otherwise, a list of integers with lags is required (each lag must be < 0).
        lags_past_covariates
            Number of lagged past_covariates values used to predict the next time step. If an integer is given the last
            `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers
            with lags < 0 is required.
        lags_future_covariates
            Number of lagged future_covariates values used to predict the next time step. If a tuple (past, future) is
            given the last `past` lags in the past are used (inclusive, starting from lag -1) along with the first
            `future` future lags (starting from 0 - the prediction time - up to `future - 1` included). Otherwise a list
            of integers with lags is required.
        output_chunk_length
            Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast
            horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may
            be useful if the covariates don't extend far enough into the future.
        add_encoders
            A large number of past and future covariates can be automatically generated with `add_encoders`.
            This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that
            will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to
            transform the generated covariates. This happens all under one hood and only needs to be specified at
            model creation.
            Read :meth:`SequentialEncoder <darts.dataprocessing.encoders.SequentialEncoder>` to find out more about
            ``add_encoders``. Default: ``None``. An example showing some of ``add_encoders`` features:

            .. highlight:: python
            .. code-block:: python

                add_encoders={
                    'cyclic': {'future': ['month']},
                    'datetime_attribute': {'future': ['hour', 'dayofweek']},
                    'position': {'past': ['relative'], 'future': ['relative']},
                    'custom': {'past': [lambda idx: (idx.year - 1950) / 50]},
                    'transformer': Scaler()
                }
            ..
        model
            Scikit-learn-like model with ``fit()`` and ``predict()`` methods. Also possible to use model that doesn't
            support multi-output regression for multivariate timeseries, in which case one regressor
            will be used per component in the multivariate series.
            If None, defaults to: ``sklearn.linear_model.LinearRegression(n_jobs=-1)``.
        multi_models
            If True, a separate model will be trained for each future lag to predict. If False, a single model is
            trained to predict at step 'output_chunk_length' in the future. Default: True.
        categorical_past_covariates
            Optionally, component name or list of component names specifying the past covariates that should be treated
            as categorical.
        categorical_future_covariates
            Optionally, component name or list of component names specifying the future covariates that should be
            treated as categorical.
        categorical_static_covariates
            Optionally, string or list of strings specifying the static covariates that should be treated as
            categorical.
        """
        super().__init__(
            lags=lags,
            lags_past_covariates=lags_past_covariates,
            lags_future_covariates=lags_future_covariates,
            output_chunk_length=output_chunk_length,
            add_encoders=add_encoders,
            model=model,
            multi_models=multi_models,
        )
        # A single name is accepted as shorthand for a one-element list; `None` and lists are stored unchanged.
        self.categorical_past_covariates = self._wrap_single_name(
            categorical_past_covariates
        )
        self.categorical_future_covariates = self._wrap_single_name(
            categorical_future_covariates
        )
        self.categorical_static_covariates = self._wrap_single_name(
            categorical_static_covariates
        )

    @staticmethod
    def _wrap_single_name(
        names: Optional[Union[str, List[str]]]
    ) -> Optional[List[str]]:
        """Returns `[names]` when `names` is a single string, otherwise returns `names` untouched."""
        return [names] if isinstance(names, str) else names

    def fit(
        self,
        series: Union[TimeSeries, Sequence[TimeSeries]],
        past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
        future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
        max_samples_per_ts: Optional[int] = None,
        n_jobs_multioutput_wrapper: Optional[int] = None,
        **kwargs,
    ):
        """
        Validates the declared categorical covariates against the passed data, then delegates to the parent `fit()`.

        All parameters are forwarded unchanged to the parent implementation.

        Returns
        -------
        Whatever the parent `fit()` returns, so that call chaining keeps working if the parent returns the model.

        Raises
        ------
        ValueError
            If categorical covariates were declared at model creation but the matching covariates (or some of the
            declared component names) are missing from the data passed here.
        """
        self._validate_categorical_covariates(
            series=series,
            past_covariates=past_covariates,
            future_covariates=future_covariates,
        )
        # Propagate the parent's return value instead of silently dropping it.
        return super().fit(
            series=series,
            past_covariates=past_covariates,
            future_covariates=future_covariates,
            max_samples_per_ts=max_samples_per_ts,
            n_jobs_multioutput_wrapper=n_jobs_multioutput_wrapper,
            **kwargs,
        )

    @property
    def _categorical_fit_param_name(self) -> str:
        """
        Returns the name of the parameter of the model's `fit` method that specifies the categorical features.
        Can be overridden in subclasses.
        """
        return "categorical_feature"

    def _validate_categorical_covariates(
        self,
        series: Union[TimeSeries, Sequence[TimeSeries]],
        past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
        future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
    ) -> None:
        """
        Checks that the categorical covariates are valid. Specifically, checks that the categorical covariates
        of the model are a subset of all covariates.

        Only the single series returned by `get_single_series` is inspected for each input.

        Parameters
        ----------
        series
            TimeSeries or Sequence[TimeSeries] object containing the target values.
        past_covariates
            Optionally, a series or sequence of series specifying past-observed covariates
        future_covariates
            Optionally, a series or sequence of series specifying future-known covariates

        Raises
        ------
        ValueError
            If categorical covariates were declared but the corresponding covariates are absent, or if some declared
            names are not found among the components / static covariate columns.
        """
        for categorical_covariates, covariates, cov_type in zip(
            [self.categorical_past_covariates, self.categorical_future_covariates],
            [past_covariates, future_covariates],
            ["past_covariates", "future_covariates"],
        ):
            if not categorical_covariates:
                continue
            if not covariates:
                raise_log(
                    ValueError(
                        f"`categorical_{cov_type}` were declared at model creation but no "
                        f"`{cov_type}` are passed to the `fit()` call."
                    ),
                )
            s = get_single_series(covariates)
            missing = set(categorical_covariates) - set(s.components)
            if missing:
                raise_log(
                    ValueError(
                        f"Some `categorical_{cov_type}` components "
                        f"({missing}) "
                        f"declared at model creation are not present in the `{cov_type}` "
                        f"passed to the `fit()` call."
                    )
                )

        if self.categorical_static_covariates:
            s = get_single_series(series)
            if not s.has_static_covariates:
                raise_log(
                    ValueError(
                        "`categorical_static_covariates` were declared at model creation but `series` "
                        "passed to the `fit()` call does not contain `static_covariates`."
                    ),
                )
            missing = set(self.categorical_static_covariates) - set(
                s.static_covariates.columns
            )
            if missing:
                raise_log(
                    ValueError(
                        f"Some `categorical_static_covariates` components "
                        f"({missing}) "
                        f"declared at model creation are not present in the series' `static_covariates` "
                        f"passed to the `fit()` call."
                    )
                )

    def _get_categorical_features(
        self,
        series: Union[List[TimeSeries], TimeSeries],
        past_covariates: Optional[Union[List[TimeSeries], TimeSeries]] = None,
        future_covariates: Optional[Union[List[TimeSeries], TimeSeries]] = None,
    ) -> Tuple[List[int], List[str]]:
        """
        Returns the indices and column names of the categorical features in the regression model.

        Steps:
        1. Build the list of features used in the model, tagging each one as categorical or not while it is
           created. We keep the creation order of the different lags/features in create_lagged_data.
        2. Collect the positions and names of the tagged features.

        A feature is categorical only if its component (or static covariate column) name exactly equals a name
        declared for that same covariate type. Target lags are never categorical. Each index is returned at
        most once.

        Returns
        -------
        Tuple[List[int], List[str]]
            The sorted feature indices and the matching feature names; two empty lists if no categorical covariates
            were declared.
        """
        past_names = set(self.categorical_past_covariates or [])
        future_names = set(self.categorical_future_covariates or [])
        static_names = set(self.categorical_static_covariates or [])

        if not (past_names or future_names or static_names):
            return [], []

        target_ts = get_single_series(series)
        past_covs_ts = get_single_series(past_covariates)
        fut_covs_ts = get_single_series(future_covariates)

        # (feature name, is_categorical) pairs, lag-major then component, in the order
        # target -> past covariates -> future covariates -> static covariates.
        # The inner `components` lookups only run when the corresponding lags exist, so absent
        # covariates are never dereferenced.
        features = [
            (f"target_{component}_lag{lag}", False)
            for lag in self.lags.get("target", [])
            for component in target_ts.components
        ]
        features += [
            (f"past_cov_{component}_lag{lag}", component in past_names)
            for lag in self.lags.get("past", [])
            for component in past_covs_ts.components
        ]
        features += [
            (f"fut_cov_{component}_lag{lag}", component in future_names)
            for lag in self.lags.get("future", [])
            for component in fut_covs_ts.components
        ]
        if target_ts.has_static_covariates:
            # NOTE(review): assumes one feature per static covariate column - confirm for multivariate targets.
            features += [
                (column, column in static_names)
                for column in target_ts.static_covariates.columns
            ]

        indices = [
            position
            for position, (_, is_categorical) in enumerate(features)
            if is_categorical
        ]
        col_names = [features[position][0] for position in indices]

        return indices, col_names

    def _fit_model(
        self,
        target_series,
        past_covariates,
        future_covariates,
        max_samples_per_ts,
        **kwargs,
    ):
        """
        Custom fit function for `RegressionModelWithCategoricalCovariates` models, adding logic to let the model
        handle categorical features directly.

        The categorical column indices are added to `kwargs` under the key given by
        `_categorical_fit_param_name`, overwriting any value already supplied under that key, before delegating
        to the parent `_fit_model`.
        """
        cat_col_indices, _ = self._get_categorical_features(
            target_series,
            past_covariates,
            future_covariates,
        )

        kwargs[self._categorical_fit_param_name] = cat_col_indices
        super()._fit_model(
            target_series=target_series,
            past_covariates=past_covariates,
            future_covariates=future_covariates,
            max_samples_per_ts=max_samples_per_ts,
            **kwargs,
        )
0 commit comments