Skip to content

Commit 0233756

Browse files
gnwhrhrzn
andauthored
Feat/probabilistic-lgbm-linreg (#831)
* moved from forked branch * added likelihoods to linear model, updated tests, refactored lgbm quantile regression * fixed docstring * fixed test case * Apply suggestions from code review Co-authored-by: Julien Herzen <[email protected]> * applied suggestions from code review, adjusted test and moved _check_quantiles to utils/utils.py * removed unnecessary enumerate * inserted else statement for clarity Co-authored-by: Julien Herzen <[email protected]>
1 parent 9cb3fd7 commit 0233756

File tree

6 files changed

+510
-35
lines changed

6 files changed

+510
-35
lines changed

darts/models/forecasting/gradient_boosted_model.py

+109-3
Original file line numberDiff line numberDiff line change
@@ -11,21 +11,25 @@
1111
from typing import List, Optional, Sequence, Tuple, Union
1212

1313
import lightgbm as lgb
14+
import numpy as np
1415

1516
from darts.logging import get_logger
16-
from darts.models.forecasting.regression_model import RegressionModel
17+
from darts.models.forecasting.regression_model import RegressionModel, _LikelihoodMixin
1718
from darts.timeseries import TimeSeries
1819

1920
logger = get_logger(__name__)
2021

2122

22-
class LightGBMModel(RegressionModel):
23+
class LightGBMModel(RegressionModel, _LikelihoodMixin):
2324
def __init__(
2425
self,
2526
lags: Union[int, list] = None,
2627
lags_past_covariates: Union[int, List[int]] = None,
2728
lags_future_covariates: Union[Tuple[int, int], List[int]] = None,
2829
output_chunk_length: int = 1,
30+
likelihood: str = None,
31+
quantiles: List[float] = None,
32+
random_state: Optional[int] = None,
2933
**kwargs,
3034
):
3135
"""Light Gradient Boosted Model
@@ -48,10 +52,35 @@ def __init__(
4852
Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast
4953
horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may
5054
be useful if the covariates don't extend far enough into the future.
55+
likelihood
56+
Can be set to `quantile` or `poisson`. If set, the model will be probabilistic, allowing sampling at
57+
prediction time.
58+
quantiles
59+
Fit the model to these quantiles if the `likelihood` is set to `quantile`.
60+
random_state
61+
Control the randomness in the fitting procedure and for sampling.
62+
Default: ``None``.
5163
**kwargs
5264
Additional keyword arguments passed to `lightgbm.LGBMRegressor`.
5365
"""
66+
kwargs["random_state"] = random_state # seed for tree learner
5467
self.kwargs = kwargs
68+
self._median_idx = None
69+
self._model_container = None
70+
self.quantiles = None
71+
self.likelihood = likelihood
72+
self._rng = None
73+
74+
# parse likelihood
75+
available_likelihoods = ["quantile", "poisson"] # to be extended
76+
if likelihood is not None:
77+
self._check_likelihood(likelihood, available_likelihoods)
78+
self.kwargs["objective"] = likelihood
79+
self._rng = np.random.default_rng(seed=random_state) # seed for sampling
80+
81+
if likelihood == "quantile":
82+
self.quantiles, self._median_idx = self._prepare_quantiles(quantiles)
83+
self._model_container = self._get_model_container()
5584

5685
super().__init__(
5786
lags=lags,
@@ -102,14 +131,32 @@ def fit(
102131
"""
103132

104133
if val_series is not None:
105-
106134
kwargs["eval_set"] = self._create_lagged_data(
107135
target_series=val_series,
108136
past_covariates=val_past_covariates,
109137
future_covariates=val_future_covariates,
110138
max_samples_per_ts=max_samples_per_ts,
111139
)
112140

141+
if self.likelihood == "quantile":
142+
# empty model container in case of multiple calls to fit, e.g. when backtesting
143+
self._model_container.clear()
144+
for quantile in self.quantiles:
145+
self.kwargs["alpha"] = quantile
146+
self.model = lgb.LGBMRegressor(**self.kwargs)
147+
148+
super().fit(
149+
series=series,
150+
past_covariates=past_covariates,
151+
future_covariates=future_covariates,
152+
max_samples_per_ts=max_samples_per_ts,
153+
**kwargs,
154+
)
155+
156+
self._model_container[quantile] = self.model
157+
158+
return self
159+
113160
super().fit(
114161
series=series,
115162
past_covariates=past_covariates,
@@ -119,3 +166,62 @@ def fit(
119166
)
120167

121168
return self
169+
170+
def predict(
    self,
    n: int,
    series: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
    past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
    future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
    num_samples: int = 1,
    **kwargs,
) -> Union[TimeSeries, Sequence[TimeSeries]]:
    """Forecast `n` time steps after the end of the series.

    Depending on `self.likelihood` the forecast is produced in one of three ways:

    * ``"quantile"``: every model stored in `self._model_container` is used in turn to produce a forecast,
      the forecasts are concatenated along the last axis and handed to `_sample_quantiles`.
    * ``"poisson"``: a single forecast is produced and handed to `_sample_poisson`.
    * anything else: the call is delegated to the parent class, including `num_samples`.

    Parameters
    ----------
    n : int
        Forecast horizon - the number of time steps after the end of the series for which to produce predictions.
    series : TimeSeries or list of TimeSeries, optional
        Optionally, one or several input `TimeSeries`, representing the history of the target series whose future
        is to be predicted. If specified, the method returns the forecasts of these series. Otherwise, the method
        returns the forecast of the (single) training series.
    past_covariates : TimeSeries or list of TimeSeries, optional
        Optionally, the past-observed covariates series needed as inputs for the model.
        They must match the covariates used for training in terms of dimension and type.
    future_covariates : TimeSeries or list of TimeSeries, optional
        Optionally, the future-known covariates series needed as inputs for the model.
        They must match the covariates used for training in terms of dimension and type.
    num_samples : int, default: 1
        Specifies the number of samples to obtain from the model. Should be set to 1 if no `likelihood` is
        specified.
    **kwargs : dict, optional
        Additional keyword arguments passed to the `predict` method of the model. Only works with
        univariate target series.

    Returns
    -------
    TimeSeries or sequence of TimeSeries
        For the probabilistic likelihoods, whatever `_ts_like` builds from the drawn samples; otherwise the
        result of the parent class' `predict`.
    """
    likelihood = self.likelihood

    # No probabilistic handling needed here: let the parent class do everything.
    if likelihood not in ("quantile", "poisson"):
        return super().predict(
            n, series, past_covariates, future_covariates, num_samples, **kwargs
        )

    if likelihood == "poisson":
        forecast = super().predict(
            n, series, past_covariates, future_covariates, **kwargs
        )
        rates = np.array(forecast.all_values(copy=False))
        drawn = self._sample_poisson(rates, num_samples)
        # build timeseries from samples
        return self._ts_like(forecast, drawn)

    # likelihood == "quantile": one forecast per stored quantile model.
    # NOTE: `self.model` is reassigned on every iteration and stays bound to the
    # last model of the container once this method returns.
    per_quantile_values = []
    for _quantile, quantile_model in self._model_container.items():
        self.model = quantile_model
        forecast = super().predict(
            n, series, past_covariates, future_covariates, **kwargs
        )
        per_quantile_values.append(forecast.all_values(copy=False))

    stacked = np.concatenate(per_quantile_values, axis=-1)
    drawn = self._sample_quantiles(stacked, num_samples)
    # build timeseries from samples (the last forecast serves as the template)
    return self._ts_like(forecast, drawn)

darts/models/forecasting/linear_regression_model.py

+188-6
Original file line numberDiff line numberDiff line change
@@ -5,23 +5,29 @@
55
A forecasting model using a linear regression of some of the target series' lags, as well as optionally some
66
covariate series' lags in order to obtain a forecast.
77
"""
8-
from typing import List, Tuple, Union
8+
from typing import List, Optional, Sequence, Tuple, Union
99

10-
from sklearn.linear_model import LinearRegression
10+
import numpy as np
11+
from scipy.optimize import linprog
12+
from sklearn.linear_model import LinearRegression, PoissonRegressor, QuantileRegressor
1113

1214
from darts.logging import get_logger
13-
from darts.models.forecasting.regression_model import RegressionModel
15+
from darts.models.forecasting.regression_model import RegressionModel, _LikelihoodMixin
16+
from darts.timeseries import TimeSeries
1417

1518
logger = get_logger(__name__)
1619

1720

18-
class LinearRegressionModel(RegressionModel):
21+
class LinearRegressionModel(RegressionModel, _LikelihoodMixin):
1922
def __init__(
    self,
    lags: Union[int, list] = None,
    lags_past_covariates: Union[int, List[int]] = None,
    lags_future_covariates: Union[Tuple[int, int], List[int]] = None,
    output_chunk_length: int = 1,
    likelihood: str = None,
    quantiles: List[float] = None,
    random_state: Optional[int] = None,
    **kwargs,
):
    """Linear regression model.

    Parameters
    ----------
    lags
        Lags of the target series used as regression inputs. Passed unchanged to the parent
        `RegressionModel` constructor, which defines the accepted formats.
    lags_past_covariates
        Lags of the past covariates used as regression inputs. Passed unchanged to the parent
        `RegressionModel` constructor.
    lags_future_covariates
        Lags of the future covariates used as regression inputs. Passed unchanged to the parent
        `RegressionModel` constructor.
    output_chunk_length
        Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast
        horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may
        be useful if the covariates don't extend far enough into the future.
    likelihood
        Can be set to `quantile` or `poisson`. If set, the model will be probabilistic, allowing sampling at
        prediction time. If set to `quantile`, the `sklearn.linear_model.QuantileRegressor` is used. Similarly, if
        set to `poisson`, the `sklearn.linear_model.PoissonRegressor` is used.
    quantiles
        Fit the model to these quantiles if the `likelihood` is set to `quantile`.
    random_state
        Control the randomness of the sampling. Used as seed for
        `numpy.random.Generator
        <https://numpy.org/doc/stable/reference/random/generator.html#numpy.random.Generator>`_ . Ignored when
        no `likelihood` is set.
        Default: ``None``.
    **kwargs
        Additional keyword arguments passed to `sklearn.linear_model.LinearRegression` (by default), to
        `sklearn.linear_model.PoissonRegressor` (if `likelihood="poisson"`), or to
        `sklearn.linear_model.QuantileRegressor` (if `likelihood="quantile"`).
    """
    self.kwargs = kwargs
    self._median_idx = None
    self._model_container = None
    self.quantiles = None
    self.likelihood = likelihood
    self._rng = None

    available_likelihoods = ["quantile", "poisson"]  # to be extended

    # Pick exactly one estimator. The branches are mutually exclusive on purpose:
    # a chain of independent `if` statements followed by a trailing `else` would let
    # the `else` replace an already selected PoissonRegressor with LinearRegression.
    if likelihood is None:
        model = LinearRegression(**kwargs)
    else:
        # `_check_likelihood` is expected to reject anything outside
        # `available_likelihoods` - TODO confirm against `_LikelihoodMixin`.
        self._check_likelihood(likelihood, available_likelihoods)
        self._rng = np.random.default_rng(seed=random_state)  # seed for sampling

        if likelihood == "poisson":
            model = PoissonRegressor(**kwargs)
        elif likelihood == "quantile":
            # Placeholder estimator; `fit` builds one QuantileRegressor per quantile.
            model = QuantileRegressor(**kwargs)
            self.quantiles, self._median_idx = self._prepare_quantiles(quantiles)
            self._model_container = self._get_model_container()

    super().__init__(
        lags=lags,
        lags_past_covariates=lags_past_covariates,
        lags_future_covariates=lags_future_covariates,
        output_chunk_length=output_chunk_length,
        model=model,
    )
5898

5999
def __str__(self):
    """Return a short label for this model that includes its `lags` attribute."""
    label = f"LinearRegression(lags={self.lags})"
    return label
101+
102+
def fit(
    self,
    series: Union[TimeSeries, Sequence[TimeSeries]],
    past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
    future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
    max_samples_per_ts: Optional[int] = None,
    n_jobs_multioutput_wrapper: Optional[int] = None,
    **kwargs,
):
    """
    Fit/train the model on one or multiple series.

    When `likelihood` is ``"quantile"``, one `QuantileRegressor` is trained per entry of `self.quantiles`
    and stored in `self._model_container` under that quantile. Otherwise the parent class' `fit` is called
    once with the estimator selected at construction time.

    Parameters
    ----------
    series
        TimeSeries or Sequence[TimeSeries] object containing the target values.
    past_covariates
        Optionally, a series or sequence of series specifying past-observed covariates
    future_covariates
        Optionally, a series or sequence of series specifying future-known covariates
    max_samples_per_ts
        This is an integer upper bound on the number of tuples that can be produced
        per time series. It can be used in order to have an upper bound on the total size of the dataset and
        ensure proper sampling. If `None`, it will read all of the individual time series in advance (at dataset
        creation) to know their sizes, which might be expensive on big datasets.
        If some series turn out to have a length that would allow more than `max_samples_per_ts`, only the
        most recent `max_samples_per_ts` samples will be considered.
    n_jobs_multioutput_wrapper
        Number of jobs of the MultiOutputRegressor wrapper to run in parallel. Only used if the model doesn't
        support multi-output regression natively.
    **kwargs
        Additional keyword arguments passed to the `fit` method of the model.

    Returns
    -------
    self
        The fitted model instance.
    """
    # NOTE(review): `n_jobs_multioutput_wrapper` is accepted but not forwarded to
    # `super().fit()` anywhere in this method - confirm whether that is intended.
    shared_fit_args = dict(
        series=series,
        past_covariates=past_covariates,
        future_covariates=future_covariates,
        max_samples_per_ts=max_samples_per_ts,
    )

    if self.likelihood != "quantile":
        super().fit(**shared_fit_args, **kwargs)
        return self

    # empty model container in case of multiple calls to fit, e.g. when backtesting
    self._model_container.clear()

    # use the fast "highs" solver unless the caller picked one explicitly
    self.kwargs.setdefault("solver", "highs")

    # probe the chosen solver with a dummy linear program; `linprog` raising a
    # ValueError is taken to mean the solver is unavailable in this scipy version
    try:
        linprog(c=[1], method=self.kwargs["solver"])
    except ValueError as ve:
        logger.warning(
            f"{ve}. Upgrading scipy enables significantly faster solvers"
        )
        # set solver to slow legacy
        self.kwargs["solver"] = "interior-point"

    for target_quantile in self.quantiles:
        self.kwargs["quantile"] = target_quantile
        self.model = QuantileRegressor(**self.kwargs)
        super().fit(**shared_fit_args, **kwargs)
        # store whatever `self.model` is after fitting under its quantile
        self._model_container[target_quantile] = self.model

    return self
181+
182+
def predict(
    self,
    n: int,
    series: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
    past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
    future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
    num_samples: int = 1,
    **kwargs,
) -> Union[TimeSeries, Sequence[TimeSeries]]:
    """Forecast `n` time steps after the end of the series.

    Depending on `self.likelihood` the forecast is produced in one of three ways:

    * ``"quantile"``: every model stored in `self._model_container` is used in turn to produce a forecast,
      the forecasts are concatenated along the last axis and handed to `_sample_quantiles`.
    * ``"poisson"``: a single forecast is produced and handed to `_sample_poisson`.
    * anything else: the call is delegated to the parent class, including `num_samples`.

    Parameters
    ----------
    n : int
        Forecast horizon - the number of time steps after the end of the series for which to produce predictions.
    series : TimeSeries or list of TimeSeries, optional
        Optionally, one or several input `TimeSeries`, representing the history of the target series whose future
        is to be predicted. If specified, the method returns the forecasts of these series. Otherwise, the method
        returns the forecast of the (single) training series.
    past_covariates : TimeSeries or list of TimeSeries, optional
        Optionally, the past-observed covariates series needed as inputs for the model.
        They must match the covariates used for training in terms of dimension and type.
    future_covariates : TimeSeries or list of TimeSeries, optional
        Optionally, the future-known covariates series needed as inputs for the model.
        They must match the covariates used for training in terms of dimension and type.
    num_samples : int, default: 1
        Specifies the number of samples to obtain from the model. Should be set to 1 if no `likelihood` is
        specified.
    **kwargs : dict, optional
        Additional keyword arguments passed to the `predict` method of the model. Only works with
        univariate target series.

    Returns
    -------
    TimeSeries or sequence of TimeSeries
        For the probabilistic likelihoods, whatever `_ts_like` builds from the drawn samples; otherwise the
        result of the parent class' `predict`.
    """
    likelihood = self.likelihood

    # No probabilistic handling needed here: let the parent class do everything.
    if likelihood not in ("quantile", "poisson"):
        return super().predict(
            n, series, past_covariates, future_covariates, num_samples, **kwargs
        )

    if likelihood == "poisson":
        forecast = super().predict(
            n, series, past_covariates, future_covariates, **kwargs
        )
        rates = np.array(forecast.all_values(copy=False))
        drawn = self._sample_poisson(rates, num_samples)
        # build timeseries from samples
        return self._ts_like(forecast, drawn)

    # likelihood == "quantile": one forecast per stored quantile model.
    # NOTE: `self.model` is reassigned on every iteration and stays bound to the
    # last model of the container once this method returns.
    per_quantile_values = []
    for _quantile, quantile_model in self._model_container.items():
        self.model = quantile_model
        forecast = super().predict(
            n, series, past_covariates, future_covariates, **kwargs
        )
        per_quantile_values.append(forecast.all_values(copy=False))

    stacked = np.concatenate(per_quantile_values, axis=-1)
    drawn = self._sample_quantiles(stacked, num_samples)
    # build timeseries from samples (the last forecast serves as the template)
    return self._ts_like(forecast, drawn)

0 commit comments

Comments
 (0)