From be3b9c6fed82f51f3f637bf292ddbb1bf95a94cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Tue, 28 Feb 2023 14:43:59 -0500 Subject: [PATCH 01/66] pr_cal now created within SPI + refactoring --- xclim/indices/_agro.py | 131 ++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 73 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 48b8e71a5..17d1fbb9f 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -886,12 +886,11 @@ def water_budget( @declare_units( pr="[precipitation]", - pr_cal="[precipitation]", ) def standardized_precipitation_index( pr: xarray.DataArray, - pr_cal: Quantified, - freq: str = "MS", + dates: tuple, + freq: str | None = "MS", window: int = 1, dist: str = "gamma", method: str = "APP", @@ -902,8 +901,8 @@ def standardized_precipitation_index( ---------- pr : xarray.DataArray Daily precipitation. - pr_cal : xarray.DataArray - Daily precipitation used for calibration. Usually this is a temporal subset of `pr` over some reference period. + dates: tuple + Calibration dates freq : str Resampling frequency. A monthly or daily frequency is expected. window : int @@ -932,15 +931,18 @@ def standardized_precipitation_index( >>> from xclim.indices import standardized_precipitation_index >>> ds = xr.open_dataset(path_to_pr_file) >>> pr = ds.pr - >>> pr_cal = pr.sel(time=slice(datetime(1990, 5, 1), datetime(1990, 8, 31))) >>> spi_3 = standardized_precipitation_index( - ... pr, pr_cal, freq="MS", window=3, dist="gamma", method="ML" + ... pr, + ... ("1990-05-01", "1990-08-31"), + ... freq="MS", + ... window=3, + ... dist="gamma", + ... method="ML", ... ) # Computing SPI-3 months using a gamma distribution for the fit References ---------- :cite:cts:`mckee_relationship_1993` - """ # "WPM" method doesn't seem to work for gamma or pearson3 dist_and_methods = {"gamma": ["ML", "APP"], "fisk": ["ML", "APP"]} @@ -951,80 +953,63 @@ def standardized_precipitation_index( f"The method `{method}` is not supported for distribution `{dist}`." ) - # calibration period - cal_period = pr_cal.time[[0, -1]].dt.strftime("%Y-%m-%dT%H:%M:%S").values.tolist() + # freq is None allows to use monthly precips + if freq is None: + group = "time.month" - # Determine group type - if freq == "D" or freq is None: - freq = "D" - group = "time.dayofyear" else: - _, base, _, _ = parse_offset(freq) - if base in ["M"]: - group = "time.month" + # Determine group type + if freq == "D" or freq is None: + freq = "D" + group = "time.dayofyear" else: - raise NotImplementedError(f"Resampling frequency `{freq}` not supported.") - - # Resampling precipitations - if freq != "D": - pr = pr.resample(time=freq).mean(keep_attrs=True) - pr_cal = pr_cal.resample(time=freq).mean(keep_attrs=True) - - def needs_rechunking(da): - if uses_dask(da) and len(da.chunks[da.get_axis_num("time")]) > 1: - warnings.warn( - "The input data is chunked on time dimension and must be fully rechunked to" - " run `fit` on groups ." - " Beware, this operation can significantly increase the number of tasks dask" - " has to handle.", - stacklevel=2, + _, base, _, _ = parse_offset(freq) + if base in ["M"]: + group = "time.month" + else: + raise NotImplementedError( + f"Resampling frequency `{freq}` not supported." ) - return True - return False - if needs_rechunking(pr): - pr = pr.chunk({"time": -1}) - if needs_rechunking(pr_cal): - pr_cal = pr_cal.chunk({"time": -1}) + # Resampling precipitations + if freq != "D": + pr = pr.resample(time=freq).mean(keep_attrs=True) + + if uses_dask(pr) and len(pr.chunks[pr.get_axis_num("time")]) > 1: + warnings.warn( + "The input data is chunked on time dimension and must be fully rechunked to" + " run `fit` on groups ." + " Beware, this operation can significantly increase the number of tasks dask" + " has to handle.", + stacklevel=2, + ) + pr = pr.chunk({"time": -1}) # Rolling precipitations if window > 1: pr = pr.rolling(time=window).mean(skipna=False, keep_attrs=True) - pr_cal = pr_cal.rolling(time=window).mean(skipna=False, keep_attrs=True) - - # Obtain fitting params and expand along time dimension - def resample_to_time(da, da_ref): - if freq == "D": - da = resample_doy(da, da_ref) - else: - da = da.rename(month="time").reindex(time=da_ref.time.dt.month) - da["time"] = da_ref.time - return da - params = pr_cal.groupby(group).map(lambda x: fit(x, dist, method)) - params = resample_to_time(params, pr) - - # ppf to cdf - if dist in ["gamma", "fisk"]: - prob_pos = dist_method("cdf", params, pr.where(pr > 0)) - prob_zero = resample_to_time( - pr.groupby(group).map( - lambda x: (x == 0).sum("time") / x.notnull().sum("time") - ), - pr, + def get_sub_spi(pr): + pr_cal = pr.sel(time=slice(dates[0], dates[1])) + params = fit(pr_cal, dist, method) + # ppf to cdf + if dist in ["gamma", "fisk"]: + prob_pos = dist_method("cdf", params, pr.where(pr > 0)) + prob_zero = (pr == 0).sum("time") / pr.notnull().sum("time") + prob = prob_zero + (1 - prob_zero) * prob_pos + # Invert to normal distribution with ppf and obtain SPI + params_norm = xarray.DataArray( + [0, 1], + dims=["dparams"], + coords=dict(dparams=(["loc", "scale"])), + attrs=dict(scipy_dist="norm"), ) - prob = prob_zero + (1 - prob_zero) * prob_pos - - # Invert to normal distribution with ppf and obtain SPI - params_norm = xarray.DataArray( - [0, 1], - dims=["dparams"], - coords=dict(dparams=(["loc", "scale"])), - attrs=dict(scipy_dist="norm"), - ) - spi = dist_method("ppf", params_norm, prob) + spi_month = dist_method("ppf", params_norm, prob) + return spi_month + + spi = pr.groupby(group).map(get_sub_spi) spi.attrs["units"] = "" - spi.attrs["calibration_period"] = cal_period + spi.attrs["calibration_period"] = dates return spi @@ -1035,7 +1020,7 @@ def resample_to_time(da, da_ref): ) def standardized_precipitation_evapotranspiration_index( wb: xarray.DataArray, - wb_cal: Quantified, + dates: tuple, freq: str = "MS", window: int = 1, dist: str = "gamma", @@ -1085,9 +1070,9 @@ def standardized_precipitation_evapotranspiration_index( # library is taken offset = convert_units_to("1 mm/d", wb.units, context="hydro") with xarray.set_options(keep_attrs=True): - wb, wb_cal = wb + offset, wb_cal + offset + wb = wb + offset - spei = standardized_precipitation_index(wb, wb_cal, freq, window, dist, method) + spei = standardized_precipitation_index(wb, dates, freq, window, dist, method) return spei From 2d5b94233a73e75351b3660aee46aa6cff25d8dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Tue, 28 Feb 2023 14:50:19 -0500 Subject: [PATCH 02/66] Better variable name --- xclim/indices/_agro.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 17d1fbb9f..47519d6d5 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -889,7 +889,7 @@ def water_budget( ) def standardized_precipitation_index( pr: xarray.DataArray, - dates: tuple, + cal_range: tuple, freq: str | None = "MS", window: int = 1, dist: str = "gamma", @@ -901,7 +901,7 @@ def standardized_precipitation_index( ---------- pr : xarray.DataArray Daily precipitation. - dates: tuple + cal_range: tuple Calibration dates freq : str Resampling frequency. A monthly or daily frequency is expected. @@ -990,7 +990,7 @@ def standardized_precipitation_index( pr = pr.rolling(time=window).mean(skipna=False, keep_attrs=True) def get_sub_spi(pr): - pr_cal = pr.sel(time=slice(dates[0], dates[1])) + pr_cal = pr.sel(time=slice(cal_range[0], cal_range[1])) params = fit(pr_cal, dist, method) # ppf to cdf if dist in ["gamma", "fisk"]: @@ -1009,7 +1009,7 @@ def get_sub_spi(pr): spi = pr.groupby(group).map(get_sub_spi) spi.attrs["units"] = "" - spi.attrs["calibration_period"] = dates + spi.attrs["calibration_period"] = cal_range return spi @@ -1036,8 +1036,8 @@ def standardized_precipitation_evapotranspiration_index( ---------- wb : xarray.DataArray Daily water budget (pr - pet). - wb_cal : xarray.DataArray - Daily water budget used for calibration. + cal_range: tuple + Calibration dates freq : str Resampling frequency. A monthly or daily frequency is expected. window : int From a939edb36c2ca85983cc7e9428be81a1f56b6cc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 1 Mar 2023 11:30:35 -0500 Subject: [PATCH 03/66] params can be used as input, and computed beforehand (`get_params`) --- xclim/indices/_agro.py | 55 +++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 47519d6d5..7b3ca43bf 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -889,11 +889,13 @@ def water_budget( ) def standardized_precipitation_index( pr: xarray.DataArray, - cal_range: tuple, + cal_range: tuple | None, + params: Quantified | None = None, freq: str | None = "MS", window: int = 1, dist: str = "gamma", method: str = "APP", + get_params: bool = False, ) -> xarray.DataArray: r"""Standardized Precipitation Index (SPI). @@ -901,8 +903,10 @@ def standardized_precipitation_index( ---------- pr : xarray.DataArray Daily precipitation. - cal_range: tuple - Calibration dates + cal_range: tuple | None + Calibration dates. `None` means that the full range of `pr` is used. + params: xarray.DataArray + Fit parameters. freq : str Resampling frequency. A monthly or daily frequency is expected. window : int @@ -922,8 +926,10 @@ def standardized_precipitation_index( Notes ----- - The length `N` of the N-month SPI is determined by choosing the `window = N`. - Supported statistical distributions are: ["gamma"] + * The length `N` of the N-month SPI is determined by choosing the `window = N`. + * Supported statistical distributions are: ["gamma", "fisk"], where "fisk" is scipy's implementation of + a log-logistic distribution + * If `params` is given as input, it overrides the `cal_range` option. Example ------- @@ -944,6 +950,8 @@ def standardized_precipitation_index( ---------- :cite:cts:`mckee_relationship_1993` """ + uses_params = params is not None + uses_range = cal_range is not None # "WPM" method doesn't seem to work for gamma or pearson3 dist_and_methods = {"gamma": ["ML", "APP"], "fisk": ["ML", "APP"]} if dist not in dist_and_methods: @@ -989,12 +997,28 @@ def standardized_precipitation_index( if window > 1: pr = pr.rolling(time=window).mean(skipna=False, keep_attrs=True) + if get_params: + if uses_range: + pr = pr.sel(time=slice(cal_range[0], cal_range[1])) + return pr.groupby(group).map(fit, (dist, method)) + + if uses_params: + paramsd = dict(params.groupby(group.rsplit(".")[1])) + def get_sub_spi(pr): - pr_cal = pr.sel(time=slice(cal_range[0], cal_range[1])) - params = fit(pr_cal, dist, method) + if uses_params: + groupk = pr[group][0].values.item() + sub_params = paramsd[groupk] + else: + if uses_range: + pr_cal = pr.sel(time=slice(cal_range[0], cal_range[1])) + else: + pr_cal = pr + sub_params = fit(pr_cal, dist, method) + # ppf to cdf if dist in ["gamma", "fisk"]: - prob_pos = dist_method("cdf", params, pr.where(pr > 0)) + prob_pos = dist_method("cdf", sub_params, pr.where(pr > 0)) prob_zero = (pr == 0).sum("time") / pr.notnull().sum("time") prob = prob_zero + (1 - prob_zero) * prob_pos # Invert to normal distribution with ppf and obtain SPI @@ -1004,23 +1028,24 @@ def get_sub_spi(pr): coords=dict(dparams=(["loc", "scale"])), attrs=dict(scipy_dist="norm"), ) - spi_month = dist_method("ppf", params_norm, prob) - return spi_month + sub_spi = dist_method("ppf", params_norm, prob) + return sub_spi spi = pr.groupby(group).map(get_sub_spi) spi.attrs["units"] = "" - spi.attrs["calibration_period"] = cal_range + + spi.attrs["calibration_data"] = "Input parameters" if uses_params else cal_range return spi @declare_units( wb="[precipitation]", - wb_cal="[precipitation]", ) def standardized_precipitation_evapotranspiration_index( wb: xarray.DataArray, - dates: tuple, + cal_range: tuple, + params: Quantified | None = None, freq: str = "MS", window: int = 1, dist: str = "gamma", @@ -1038,6 +1063,8 @@ def standardized_precipitation_evapotranspiration_index( Daily water budget (pr - pet). cal_range: tuple Calibration dates + params: xarray.DataArray + Fit parameters. freq : str Resampling frequency. A monthly or daily frequency is expected. window : int @@ -1072,7 +1099,7 @@ def standardized_precipitation_evapotranspiration_index( with xarray.set_options(keep_attrs=True): wb = wb + offset - spei = standardized_precipitation_index(wb, dates, freq, window, dist, method) + spei = standardized_precipitation_index(wb, cal_range, freq, window, dist, method) return spei From aeb8038acaeaccddfe11c8baaf7f7efa13c25c86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 1 Mar 2023 11:38:34 -0500 Subject: [PATCH 04/66] Clearer var names and better doc --- xclim/indices/_agro.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 7b3ca43bf..0fcc5b143 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -937,14 +937,27 @@ def standardized_precipitation_index( >>> from xclim.indices import standardized_precipitation_index >>> ds = xr.open_dataset(path_to_pr_file) >>> pr = ds.pr + >>> cal_range = ("1990-05-01", "1990-08-31") >>> spi_3 = standardized_precipitation_index( ... pr, - ... ("1990-05-01", "1990-08-31"), + ... cal_range, ... freq="MS", ... window=3, ... dist="gamma", ... method="ML", ... ) # Computing SPI-3 months using a gamma distribution for the fit + >>> # Fitting parameters can also be obtained ... + >>> params = standardized_precipitation_index( + ... pr, + ... cal_range, + ... freq="MS", + ... window=3, + ... dist="gamma", + ... method="ML", + ... get_params=True, + ... ) # First getting params + >>> # ... and used as input + >>> spi_3 = standardized_precipitation_index(pr, None, params=params) References ---------- @@ -1003,12 +1016,12 @@ def standardized_precipitation_index( return pr.groupby(group).map(fit, (dist, method)) if uses_params: - paramsd = dict(params.groupby(group.rsplit(".")[1])) + params_dict = dict(params.groupby(group.rsplit(".")[1])) def get_sub_spi(pr): if uses_params: - groupk = pr[group][0].values.item() - sub_params = paramsd[groupk] + group_key = pr[group][0].values.item() + sub_params = params_dict[group_key] else: if uses_range: pr_cal = pr.sel(time=slice(cal_range[0], cal_range[1])) From 3c5ac712166184441793376582fcaf8669ab1ed6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 1 Mar 2023 12:58:33 -0500 Subject: [PATCH 05/66] Remove uses_range --- xclim/indices/_agro.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 0fcc5b143..f52bd9988 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -964,7 +964,6 @@ def standardized_precipitation_index( :cite:cts:`mckee_relationship_1993` """ uses_params = params is not None - uses_range = cal_range is not None # "WPM" method doesn't seem to work for gamma or pearson3 dist_and_methods = {"gamma": ["ML", "APP"], "fisk": ["ML", "APP"]} if dist not in dist_and_methods: @@ -1011,7 +1010,7 @@ def standardized_precipitation_index( pr = pr.rolling(time=window).mean(skipna=False, keep_attrs=True) if get_params: - if uses_range: + if cal_range: pr = pr.sel(time=slice(cal_range[0], cal_range[1])) return pr.groupby(group).map(fit, (dist, method)) @@ -1023,7 +1022,7 @@ def get_sub_spi(pr): group_key = pr[group][0].values.item() sub_params = params_dict[group_key] else: - if uses_range: + if cal_range: pr_cal = pr.sel(time=slice(cal_range[0], cal_range[1])) else: pr_cal = pr From ed4164dc294a084bc26d29535bd027d5261c9b7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 1 Mar 2023 15:23:00 -0500 Subject: [PATCH 06/66] cal_range type changed, new warnings --- xclim/core/utils.py | 6 ++++++ xclim/indices/_agro.py | 27 ++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/xclim/core/utils.py b/xclim/core/utils.py index 6d0310f04..3c6b4ebfb 100644 --- a/xclim/core/utils.py +++ b/xclim/core/utils.py @@ -566,6 +566,12 @@ class InputKind(IntEnum): Annotation : ``bool``, may be optional. """ + DATE_TUPLE = 10 + """A tuple of dates in the YYYY-MM-DD format, may include a time. + + !!! not sure how to write this !!! + Annotation : `Tuple[xclim.core.utils.DateStr, xclim.core.utils.DateStr]` + """ KWARGS = 50 """A mapping from argument name to value. diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index f52bd9988..f1a6bea72 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -2,6 +2,7 @@ from __future__ import annotations import warnings +from typing import Tuple import numpy as np import xarray @@ -16,7 +17,7 @@ rate2amount, to_agg_units, ) -from xclim.core.utils import DayOfYearStr, Quantified, uses_dask +from xclim.core.utils import DateStr, DayOfYearStr, Quantified, uses_dask from xclim.indices._threshold import ( first_day_temperature_above, first_day_temperature_below, @@ -886,10 +887,11 @@ def water_budget( @declare_units( pr="[precipitation]", + params="[]", ) def standardized_precipitation_index( pr: xarray.DataArray, - cal_range: tuple | None, + cal_range: tuple[DateStr, DateStr] | None = None, params: Quantified | None = None, freq: str | None = "MS", window: int = 1, @@ -903,8 +905,9 @@ def standardized_precipitation_index( ---------- pr : xarray.DataArray Daily precipitation. - cal_range: tuple | None - Calibration dates. `None` means that the full range of `pr` is used. + cal_range: Tuple[DateStr, DateStr] | None + Dates used to subset `pr` and obtain dataset for calibration. The tuple is formed by two `DateStr`, i.e. a `str` in format `"YYYY-MM-DD"`. + Default option `None` means that the full range of `pr` is used. params: xarray.DataArray Fit parameters. freq : str @@ -918,6 +921,8 @@ def standardized_precipitation_index( method : {'APP', 'ML'} Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. + get_params : bool + If `True`, the function only outputs the fit parameters from the calibration. The output Returns ------- @@ -964,6 +969,12 @@ def standardized_precipitation_index( :cite:cts:`mckee_relationship_1993` """ uses_params = params is not None + if cal_range and uses_params: + raise ValueError( + "Inputing both calibration dates (`cal_range`) and calibration parameters (`params`) is not accepted," + "input only one or neither of those options (the latter case reverts to default behaviour which performs a calibration" + "with the full input dataset)." + ) # "WPM" method doesn't seem to work for gamma or pearson3 dist_and_methods = {"gamma": ["ML", "APP"], "fisk": ["ML", "APP"]} if dist not in dist_and_methods: @@ -1053,6 +1064,7 @@ def get_sub_spi(pr): @declare_units( wb="[precipitation]", + params="[]", ) def standardized_precipitation_evapotranspiration_index( wb: xarray.DataArray, @@ -1062,6 +1074,7 @@ def standardized_precipitation_evapotranspiration_index( window: int = 1, dist: str = "gamma", method: str = "APP", + get_params: bool = False, ) -> xarray.DataArray: r"""Standardized Precipitation Evapotranspiration Index (SPEI). @@ -1088,6 +1101,8 @@ def standardized_precipitation_evapotranspiration_index( Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. Available methods vary with the distribution: 'gamma':{'APP', 'ML'}, 'fisk':{'ML'} + get_params : bool + If `True`, the function only outputs the fit parameters from the calibration. The output Returns ------- @@ -1111,7 +1126,9 @@ def standardized_precipitation_evapotranspiration_index( with xarray.set_options(keep_attrs=True): wb = wb + offset - spei = standardized_precipitation_index(wb, cal_range, freq, window, dist, method) + spei = standardized_precipitation_index( + wb, cal_range, params, freq, window, dist, method, get_params + ) return spei From 8f22ff1a85bd057cbbdff0bf792b789bc6046e55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 1 Mar 2023 15:26:46 -0500 Subject: [PATCH 07/66] cal_range type change 2/2 --- xclim/indices/_agro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index f1a6bea72..3dd0e12a3 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1068,7 +1068,7 @@ def get_sub_spi(pr): ) def standardized_precipitation_evapotranspiration_index( wb: xarray.DataArray, - cal_range: tuple, + cal_range: tuple[DateStr, DateStr] | None = None, params: Quantified | None = None, freq: str = "MS", window: int = 1, From cbca142e100113fbf3f9d3a010ea9c40fedbd754 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 1 Mar 2023 17:07:46 -0500 Subject: [PATCH 08/66] fitting/rolling/resampling defined in a separate function --- xclim/indices/_agro.py | 100 ++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 52 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 3dd0e12a3..aa84e6340 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -885,6 +885,48 @@ def water_budget( return out +# this might be useful elsewhere? +def _get_group(da, freq): + if freq is None: + _get_group(da, xarray.infer_freq(da.time)) + else: + _, base, _, _ = parse_offset(freq) + try: + group = {"D": "time.dayofyear", "M": "time.month"}[base] + except KeyError(): + raise ValueError(f"Resampling frequency `{freq}` not supported.") + return group + + +def _preprocess_spx(pr, freq, window): + if freq: + pr = pr.resample(time=freq).mean(keep_attrs=True) + if uses_dask(pr) and len(pr.chunks[pr.get_axis_num("time")]) > 1: + warnings.warn( + "The input data is chunked on time dimension and must be fully rechunked to" + " run `fit` on groups ." + " Beware, this operation can significantly increase the number of tasks dask" + " has to handle.", + stacklevel=2, + ) + pr = pr.chunk({"time": -1}) + + # Rolling precipitations + if window > 1: + pr = pr.rolling(time=window).mean(skipna=False, keep_attrs=True) + pr.attrs["_processed"] = True + return pr + + +def _compute_spx_fit_params(pr, cal_range, freq, window, dist, method, group=None): + group = group if group else _get_group(pr, freq) + if pr.attrs["_processed_"]: + pr = _preprocess_spx(pr, freq, window) + if cal_range: + pr = pr.sel(time=slice(cal_range[0], cal_range[1])) + return pr.groupby(group).map(fit, (dist, method)) + + @declare_units( pr="[precipitation]", params="[]", @@ -897,7 +939,6 @@ def standardized_precipitation_index( window: int = 1, dist: str = "gamma", method: str = "APP", - get_params: bool = False, ) -> xarray.DataArray: r"""Standardized Precipitation Index (SPI). @@ -921,8 +962,6 @@ def standardized_precipitation_index( method : {'APP', 'ML'} Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. - get_params : bool - If `True`, the function only outputs the fit parameters from the calibration. The output Returns ------- @@ -984,46 +1023,8 @@ def standardized_precipitation_index( f"The method `{method}` is not supported for distribution `{dist}`." ) - # freq is None allows to use monthly precips - if freq is None: - group = "time.month" - - else: - # Determine group type - if freq == "D" or freq is None: - freq = "D" - group = "time.dayofyear" - else: - _, base, _, _ = parse_offset(freq) - if base in ["M"]: - group = "time.month" - else: - raise NotImplementedError( - f"Resampling frequency `{freq}` not supported." - ) - - # Resampling precipitations - if freq != "D": - pr = pr.resample(time=freq).mean(keep_attrs=True) - - if uses_dask(pr) and len(pr.chunks[pr.get_axis_num("time")]) > 1: - warnings.warn( - "The input data is chunked on time dimension and must be fully rechunked to" - " run `fit` on groups ." - " Beware, this operation can significantly increase the number of tasks dask" - " has to handle.", - stacklevel=2, - ) - pr = pr.chunk({"time": -1}) - - # Rolling precipitations - if window > 1: - pr = pr.rolling(time=window).mean(skipna=False, keep_attrs=True) - - if get_params: - if cal_range: - pr = pr.sel(time=slice(cal_range[0], cal_range[1])) - return pr.groupby(group).map(fit, (dist, method)) + group = _get_group(pr, freq) + pr = _preprocess_spx(pr, freq, window) if uses_params: params_dict = dict(params.groupby(group.rsplit(".")[1])) @@ -1033,11 +1034,9 @@ def get_sub_spi(pr): group_key = pr[group][0].values.item() sub_params = params_dict[group_key] else: - if cal_range: - pr_cal = pr.sel(time=slice(cal_range[0], cal_range[1])) - else: - pr_cal = pr - sub_params = fit(pr_cal, dist, method) + sub_params = _compute_spx_fit_params( + pr, cal_range, freq, window, dist, method, group=group + ) # ppf to cdf if dist in ["gamma", "fisk"]: @@ -1074,7 +1073,6 @@ def standardized_precipitation_evapotranspiration_index( window: int = 1, dist: str = "gamma", method: str = "APP", - get_params: bool = False, ) -> xarray.DataArray: r"""Standardized Precipitation Evapotranspiration Index (SPEI). @@ -1101,8 +1099,6 @@ def standardized_precipitation_evapotranspiration_index( Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. Available methods vary with the distribution: 'gamma':{'APP', 'ML'}, 'fisk':{'ML'} - get_params : bool - If `True`, the function only outputs the fit parameters from the calibration. The output Returns ------- @@ -1127,7 +1123,7 @@ def standardized_precipitation_evapotranspiration_index( wb = wb + offset spei = standardized_precipitation_index( - wb, cal_range, params, freq, window, dist, method, get_params + wb, cal_range, params, freq, window, dist, method ) return spei From 16f4b6b5572dbc0b8ce80184bceb66604d32fa26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Thu, 2 Mar 2023 11:13:08 -0500 Subject: [PATCH 09/66] resampling/rolling/fitting out of SPI --- xclim/indices/_agro.py | 128 ++++++++++++++++++++--------------------- 1 file changed, 62 insertions(+), 66 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index aa84e6340..2ec8ed8b5 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -885,23 +885,35 @@ def water_budget( return out -# this might be useful elsewhere? -def _get_group(da, freq): - if freq is None: - _get_group(da, xarray.infer_freq(da.time)) - else: - _, base, _, _ = parse_offset(freq) - try: - group = {"D": "time.dayofyear", "M": "time.month"}[base] - except KeyError(): - raise ValueError(f"Resampling frequency `{freq}` not supported.") - return group +def _preprocess_spx(da, freq, window): + _, base, _, _ = parse_offset(freq or xarray.infer_freq(da.time)) + try: + group = {"D": "time.dayofyear", "M": "time.month"}[base] + except KeyError(): + raise ValueError(f"Standardized index with frequency `{freq}` not supported.") + if freq: + da = da.resample(time=freq).mean(keep_attrs=True) + if window > 1: + da = da.rolling(time=window).mean(skipna=False, keep_attrs=True) + return da, group -def _preprocess_spx(pr, freq, window): - if freq: - pr = pr.resample(time=freq).mean(keep_attrs=True) - if uses_dask(pr) and len(pr.chunks[pr.get_axis_num("time")]) > 1: +def _compute_spx_fit_params(da, cal_range, freq, window, dist, method, group=None): + # "WPM" method doesn't seem to work for gamma or pearson3 + dist_and_methods = {"gamma": ["ML", "APP"], "fisk": ["ML", "APP"]} + if dist not in dist_and_methods: + raise NotImplementedError(f"The distribution `{dist}` is not supported.") + if method not in dist_and_methods[dist]: + raise NotImplementedError( + f"The method `{method}` is not supported for distribution `{dist}`." + ) + if group is None: + da, group = _preprocess_spx(da, freq, window) + + if cal_range: + da = da.sel(time=slice(cal_range[0], cal_range[1])) + + if uses_dask(da) and len(da.chunks[da.get_axis_num("time")]) > 1: warnings.warn( "The input data is chunked on time dimension and must be fully rechunked to" " run `fit` on groups ." @@ -909,22 +921,16 @@ def _preprocess_spx(pr, freq, window): " has to handle.", stacklevel=2, ) - pr = pr.chunk({"time": -1}) - - # Rolling precipitations - if window > 1: - pr = pr.rolling(time=window).mean(skipna=False, keep_attrs=True) - pr.attrs["_processed"] = True - return pr - - -def _compute_spx_fit_params(pr, cal_range, freq, window, dist, method, group=None): - group = group if group else _get_group(pr, freq) - if pr.attrs["_processed_"]: - pr = _preprocess_spx(pr, freq, window) - if cal_range: - pr = pr.sel(time=slice(cal_range[0], cal_range[1])) - return pr.groupby(group).map(fit, (dist, method)) + da = da.chunk({"time": -1}) + params = da.groupby(group).map(fit, (dist, method)) + params.attrs.update( + { + "Calibration period": str(cal_range), + "Distribution": dist, + "Method": method, + } + ) + return params @declare_units( @@ -947,8 +953,8 @@ def standardized_precipitation_index( pr : xarray.DataArray Daily precipitation. cal_range: Tuple[DateStr, DateStr] | None - Dates used to subset `pr` and obtain dataset for calibration. The tuple is formed by two `DateStr`, i.e. a `str` in format `"YYYY-MM-DD"`. - Default option `None` means that the full range of `pr` is used. + Dates used to take a subset the input dataset for calibration. The tuple is formed by two `DateStr`, + i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. params: xarray.DataArray Fit parameters. freq : str @@ -984,59 +990,46 @@ def standardized_precipitation_index( >>> cal_range = ("1990-05-01", "1990-08-31") >>> spi_3 = standardized_precipitation_index( ... pr, - ... cal_range, + ... cal_range=cal_range, ... freq="MS", ... window=3, ... dist="gamma", ... method="ML", ... ) # Computing SPI-3 months using a gamma distribution for the fit >>> # Fitting parameters can also be obtained ... - >>> params = standardized_precipitation_index( + >>> params = _compute_spx_fit_params( ... pr, ... cal_range, ... freq="MS", ... window=3, ... dist="gamma", ... method="ML", - ... get_params=True, ... ) # First getting params >>> # ... and used as input - >>> spi_3 = standardized_precipitation_index(pr, None, params=params) + >>> spi_3 = standardized_precipitation_index(pr, params=params) References ---------- :cite:cts:`mckee_relationship_1993` """ - uses_params = params is not None - if cal_range and uses_params: + uses_input_params = params is not None + if cal_range and uses_input_params: raise ValueError( "Inputing both calibration dates (`cal_range`) and calibration parameters (`params`) is not accepted," "input only one or neither of those options (the latter case reverts to default behaviour which performs a calibration" "with the full input dataset)." ) - # "WPM" method doesn't seem to work for gamma or pearson3 - dist_and_methods = {"gamma": ["ML", "APP"], "fisk": ["ML", "APP"]} - if dist not in dist_and_methods: - raise NotImplementedError(f"The distribution `{dist}` is not supported.") - if method not in dist_and_methods[dist]: - raise NotImplementedError( - f"The method `{method}` is not supported for distribution `{dist}`." - ) - group = _get_group(pr, freq) - pr = _preprocess_spx(pr, freq, window) - - if uses_params: - params_dict = dict(params.groupby(group.rsplit(".")[1])) + pr, group = _preprocess_spx(pr, freq, window) + if uses_input_params is False: + params = _compute_spx_fit_params( + pr, cal_range, freq, window, dist, method, group=group + ) + params_dict = dict(params.groupby(group.rsplit(".")[1])) def get_sub_spi(pr): - if uses_params: - group_key = pr[group][0].values.item() - sub_params = params_dict[group_key] - else: - sub_params = _compute_spx_fit_params( - pr, cal_range, freq, window, dist, method, group=group - ) + group_key = pr[group][0].values.item() + sub_params = params_dict[group_key] # ppf to cdf if dist in ["gamma", "fisk"]: @@ -1054,9 +1047,11 @@ def get_sub_spi(pr): return sub_spi spi = pr.groupby(group).map(get_sub_spi) - spi.attrs["units"] = "" - - spi.attrs["calibration_data"] = "Input parameters" if uses_params else cal_range + spi.attrs = params.attrs + if uses_input_params: + spi.attrs["Calibration period"] = ( + spi.attrs["Calibration period"] + "(input parameters)" + ) return spi @@ -1071,8 +1066,8 @@ def standardized_precipitation_evapotranspiration_index( params: Quantified | None = None, freq: str = "MS", window: int = 1, - dist: str = "gamma", - method: str = "APP", + dist: str = "fisk", + method: str = "ML", ) -> xarray.DataArray: r"""Standardized Precipitation Evapotranspiration Index (SPEI). @@ -1084,8 +1079,9 @@ def standardized_precipitation_evapotranspiration_index( ---------- wb : xarray.DataArray Daily water budget (pr - pet). - cal_range: tuple - Calibration dates + cal_range: Tuple[DateStr, DateStr] | None + Dates used to take a subset the input dataset for calibration. The tuple is formed by two `DateStr`, + i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. params: xarray.DataArray Fit parameters. freq : str From e9904063cfe19f747061c59dfb74472a80bb08ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Thu, 2 Mar 2023 11:26:06 -0500 Subject: [PATCH 10/66] Update doc and formatting --- xclim/core/formatting.py | 1 + xclim/indices/_agro.py | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/xclim/core/formatting.py b/xclim/core/formatting.py index d0e84f477..8e7d05fff 100644 --- a/xclim/core/formatting.py +++ b/xclim/core/formatting.py @@ -535,6 +535,7 @@ def unprefix_attrs(source: dict, keys: Sequence, prefix: str): InputKind.STRING: "str", InputKind.DAY_OF_YEAR: "date (string, MM-DD)", InputKind.DATE: "date (string, YYYY-MM-DD)", + InputKind.DATE_TUPLE: "date tuple ((string_1, YYYY-MM-DD), (string_2, YYYY-MM-DD))", InputKind.BOOL: "boolean", InputKind.DATASET: "Dataset, optional", InputKind.KWARGS: "", diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 2ec8ed8b5..899fe814f 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -957,8 +957,9 @@ def standardized_precipitation_index( i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. params: xarray.DataArray Fit parameters. - freq : str - Resampling frequency. A monthly or daily frequency is expected. + freq : str | None + Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling + has already been applied input dataset and will skip the resampling step. window : int Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, i.e. a monthly resampling, the window is an integer number of months. @@ -1084,8 +1085,9 @@ def standardized_precipitation_evapotranspiration_index( i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. params: xarray.DataArray Fit parameters. - freq : str - Resampling frequency. A monthly or daily frequency is expected. + freq : str | None + Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling + has already been applied input dataset and will skip the resampling step. window : int Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, i.e. a monthly resampling, the window is an integer number of months. From ca577c214a85a864e8d1cb5f53e9cc14e9b5df88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Thu, 2 Mar 2023 15:36:37 -0500 Subject: [PATCH 11/66] indexer support --- xclim/indices/_agro.py | 70 ++++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 17 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 899fe814f..4cc94f164 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -885,7 +885,7 @@ def water_budget( return out -def _preprocess_spx(da, freq, window): +def _preprocess_spx(da, freq, window, **indexer): _, base, _, _ = parse_offset(freq or xarray.infer_freq(da.time)) try: group = {"D": "time.dayofyear", "M": "time.month"}[base] @@ -895,10 +895,13 @@ def _preprocess_spx(da, freq, window): da = da.resample(time=freq).mean(keep_attrs=True) if window > 1: da = da.rolling(time=window).mean(skipna=False, keep_attrs=True) + da = select_time(da, **indexer) return da, group -def _compute_spx_fit_params(da, cal_range, freq, window, dist, method, group=None): +def _compute_spx_fit_params( + da, cal_range, freq, window, dist, method, group=None, **indexer +): # "WPM" method doesn't seem to work for gamma or pearson3 dist_and_methods = {"gamma": ["ML", "APP"], "fisk": ["ML", "APP"]} if dist not in dist_and_methods: @@ -907,6 +910,7 @@ def _compute_spx_fit_params(da, cal_range, freq, window, dist, method, group=Non raise NotImplementedError( f"The method `{method}` is not supported for distribution `{dist}`." ) + if group is None: da, group = _preprocess_spx(da, freq, window) @@ -922,14 +926,25 @@ def _compute_spx_fit_params(da, cal_range, freq, window, dist, method, group=Non stacklevel=2, ) da = da.chunk({"time": -1}) - params = da.groupby(group).map(fit, (dist, method)) - params.attrs.update( - { - "Calibration period": str(cal_range), - "Distribution": dist, - "Method": method, - } - ) + + da = select_time(da, **indexer) + + def wrap_fit(da): + if indexer != {}: + if da.isnull.all(): + select_dims = {d: 0 for d in da.dims if d != "time"} + with xarray.set_options(keep_attrs=True): + params = ( + fit(da.isel(time=slice(0, 2))[select_dims], dist, method) + * da.isel(time=0, drop=True) + * np.NaN + ) + return params + return fit(da, dist, method) + + params = da.groupby(group).map(wrap_fit) + params.attrs["Calibration period"] = str(cal_range) + return params @@ -945,6 +960,7 @@ def standardized_precipitation_index( window: int = 1, dist: str = "gamma", method: str = "APP", + **indexer, ) -> xarray.DataArray: r"""Standardized Precipitation Index (SPI). @@ -969,6 +985,10 @@ def standardized_precipitation_index( method : {'APP', 'ML'} Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. + indexer + Indexing parameters to compute the indicator on a temporal subset of the data. + It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. + Indexing is done after finding the dry days, but before finding the spells. Returns ------- @@ -1021,20 +1041,17 @@ def standardized_precipitation_index( "with the full input dataset)." ) - pr, group = _preprocess_spx(pr, freq, window) + pr, group = _preprocess_spx(pr, freq, window, **indexer) if uses_input_params is False: params = _compute_spx_fit_params( pr, cal_range, freq, window, dist, method, group=group ) params_dict = dict(params.groupby(group.rsplit(".")[1])) - def get_sub_spi(pr): - group_key = pr[group][0].values.item() - sub_params = params_dict[group_key] - + def ppf_to_cdf(da, params): # ppf to cdf if dist in ["gamma", "fisk"]: - prob_pos = dist_method("cdf", sub_params, pr.where(pr > 0)) + prob_pos = dist_method("cdf", params, pr.where(pr > 0)) prob_zero = (pr == 0).sum("time") / pr.notnull().sum("time") prob = prob_zero + (1 - prob_zero) * prob_pos # Invert to normal distribution with ppf and obtain SPI @@ -1047,6 +1064,20 @@ def get_sub_spi(pr): sub_spi = dist_method("ppf", params_norm, prob) return sub_spi + def get_sub_spi(pr): + group_key = pr[group][0].values.item() + sub_params = params_dict[group_key] + if indexer != {}: + if pr.isnull().all(): + select_dims = {d: 0 for d in pr.dims if d != "time"} + sub_spi = ppf_to_cdf( + pr.isel(time=slice(0, 2))[select_dims], sub_params[select_dims] + ) + with xarray.set_options(keep_attrs=True): + sub_spi = sub_spi * pr * np.NaN + return sub_spi + return ppf_to_cdf(pr, sub_params) + spi = pr.groupby(group).map(get_sub_spi) spi.attrs = params.attrs if uses_input_params: @@ -1069,6 +1100,7 @@ def standardized_precipitation_evapotranspiration_index( window: int = 1, dist: str = "fisk", method: str = "ML", + **indexer, ) -> xarray.DataArray: r"""Standardized Precipitation Evapotranspiration Index (SPEI). @@ -1097,6 +1129,10 @@ def standardized_precipitation_evapotranspiration_index( Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. Available methods vary with the distribution: 'gamma':{'APP', 'ML'}, 'fisk':{'ML'} + indexer + Indexing parameters to compute the indicator on a temporal subset of the data. + It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. + Indexing is done after finding the dry days, but before finding the spells. Returns ------- @@ -1121,7 +1157,7 @@ def standardized_precipitation_evapotranspiration_index( wb = wb + offset spei = standardized_precipitation_index( - wb, cal_range, params, freq, window, dist, method + wb, cal_range, params, freq, window, dist, method, **indexer ) return spei From fde244139b07a61e95872856799fe55c94d28c39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Thu, 2 Mar 2023 15:49:39 -0500 Subject: [PATCH 12/66] update doc --- xclim/indices/_agro.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 4cc94f164..197a8b0a3 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -988,7 +988,6 @@ def standardized_precipitation_index( indexer Indexing parameters to compute the indicator on a temporal subset of the data. It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. - Indexing is done after finding the dry days, but before finding the spells. Returns ------- @@ -1132,7 +1131,6 @@ def standardized_precipitation_evapotranspiration_index( indexer Indexing parameters to compute the indicator on a temporal subset of the data. It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. - Indexing is done after finding the dry days, but before finding the spells. Returns ------- From 0fa83f4205ef8d182581640f4446e87608e29ac1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 15 Mar 2023 16:25:55 -0400 Subject: [PATCH 13/66] typo function isnull --- xclim/indices/_agro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 197a8b0a3..1ed08db67 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -931,7 +931,7 @@ def _compute_spx_fit_params( def wrap_fit(da): if indexer != {}: - if da.isnull.all(): + if da.isnull().all(): select_dims = {d: 0 for d in da.dims if d != "time"} with xarray.set_options(keep_attrs=True): params = ( From f747dea597f5e043a1edc1d1f275009ec6217814 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Mon, 19 Jun 2023 10:31:37 -0400 Subject: [PATCH 14/66] More documentation, remove useless step --- xclim/indices/_agro.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 1ed08db67..f28e05260 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -927,19 +927,20 @@ def _compute_spx_fit_params( ) da = da.chunk({"time": -1}) - da = select_time(da, **indexer) + # I don't think that's necessary? At this point, da is already preprocessed and time selected + # da = select_time(da, **indexer) + # Should I apply the indexer at the very end? Would that avoid unecessary computations? def wrap_fit(da): + # this could be simplified as: + # if group representative completely in the excluded time period of indexer, + # just put NaNs there. I wonder if this can be done with less lines of code if indexer != {}: if da.isnull().all(): select_dims = {d: 0 for d in da.dims if d != "time"} with xarray.set_options(keep_attrs=True): - params = ( - fit(da.isel(time=slice(0, 2))[select_dims], dist, method) - * da.isel(time=0, drop=True) - * np.NaN - ) - return params + template = fit(da.isel(time=slice(0, 2))[select_dims], dist, method) + return template * da.isel(time=0, drop=True) * np.NaN return fit(da, dist, method) params = da.groupby(group).map(wrap_fit) @@ -948,6 +949,16 @@ def wrap_fit(da): return params +# TODO : ADD +""" +streamflow index (SSI) GEV, log-logistic (Vincente-Serrano et al., 2012) Tweedie (Barker et al., 2016) (Station de suivi) +groundwater index (SGI) log-normal, gamma, GEV (Bloomfield et Marchant, 2013) (Station de suivi) +NEW dist: GEV, log-normal +https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.genextreme.html#scipy.stats.genextreme +https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.lognorm.html +""" + + @declare_units( pr="[precipitation]", params="[]", From 2b6dfe951fd2b544be66fc7d79aab87d9ffc6078 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Thu, 22 Jun 2023 13:05:58 -0400 Subject: [PATCH 15/66] Refactoring, simplifications, shorter msgs --- xclim/indices/_agro.py | 56 +++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index f28e05260..cc57d2344 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -899,9 +899,7 @@ def _preprocess_spx(da, freq, window, **indexer): return da, group -def _compute_spx_fit_params( - da, cal_range, freq, window, dist, method, group=None, **indexer -): +def _spx_fit_params(da, cal_range, freq, window, dist, method, group=None, **indexer): # "WPM" method doesn't seem to work for gamma or pearson3 dist_and_methods = {"gamma": ["ML", "APP"], "fisk": ["ML", "APP"]} if dist not in dist_and_methods: @@ -932,15 +930,13 @@ def _compute_spx_fit_params( # Should I apply the indexer at the very end? Would that avoid unecessary computations? def wrap_fit(da): - # this could be simplified as: - # if group representative completely in the excluded time period of indexer, - # just put NaNs there. I wonder if this can be done with less lines of code - if indexer != {}: - if da.isnull().all(): - select_dims = {d: 0 for d in da.dims if d != "time"} - with xarray.set_options(keep_attrs=True): - template = fit(da.isel(time=slice(0, 2))[select_dims], dist, method) - return template * da.isel(time=0, drop=True) * np.NaN + if indexer != {} and da.isnull().all(): + fitted = fit( + da[{d: 0 if d != "time" else [0, 1] for d in da.dims}], dist, method + ) + return (fitted * da.isel(time=0, drop=True) * np.NaN).assign_attrs( + fitted.attrs + ) return fit(da, dist, method) params = da.groupby(group).map(wrap_fit) @@ -1043,26 +1039,22 @@ def standardized_precipitation_index( ---------- :cite:cts:`mckee_relationship_1993` """ - uses_input_params = params is not None - if cal_range and uses_input_params: - raise ValueError( - "Inputing both calibration dates (`cal_range`) and calibration parameters (`params`) is not accepted," - "input only one or neither of those options (the latter case reverts to default behaviour which performs a calibration" - "with the full input dataset)." + uses_params = params is not None + if cal_range and uses_params: + warnings.warn( + "Expected either `cal_range` or `params`, got both. Proceeding with `params`." ) pr, group = _preprocess_spx(pr, freq, window, **indexer) - if uses_input_params is False: - params = _compute_spx_fit_params( - pr, cal_range, freq, window, dist, method, group=group - ) + if uses_params is False: + params = _spx_fit_params(pr, cal_range, freq, window, dist, method, group=group) params_dict = dict(params.groupby(group.rsplit(".")[1])) def ppf_to_cdf(da, params): # ppf to cdf if dist in ["gamma", "fisk"]: - prob_pos = dist_method("cdf", params, pr.where(pr > 0)) - prob_zero = (pr == 0).sum("time") / pr.notnull().sum("time") + prob_pos = dist_method("cdf", params, da.where(da > 0)) + prob_zero = (da == 0).sum("time") / da.notnull().sum("time") prob = prob_zero + (1 - prob_zero) * prob_pos # Invert to normal distribution with ppf and obtain SPI params_norm = xarray.DataArray( @@ -1077,20 +1069,16 @@ def ppf_to_cdf(da, params): def get_sub_spi(pr): group_key = pr[group][0].values.item() sub_params = params_dict[group_key] - if indexer != {}: - if pr.isnull().all(): - select_dims = {d: 0 for d in pr.dims if d != "time"} - sub_spi = ppf_to_cdf( - pr.isel(time=slice(0, 2))[select_dims], sub_params[select_dims] - ) - with xarray.set_options(keep_attrs=True): - sub_spi = sub_spi * pr * np.NaN - return sub_spi + # put NaNs if outside time selection + if indexer != {} and pr.isnull().all(): + select_dims = {d: 0 for d in pr.dims if d != "time"} + sub_spi = ppf_to_cdf(pr[select_dims][{"time": [0, 1]}], params[select_dims]) + return sub_spi.broadcast_like(pr.isel(time=0, drop=True)) return ppf_to_cdf(pr, sub_params) spi = pr.groupby(group).map(get_sub_spi) spi.attrs = params.attrs - if uses_input_params: + if uses_params: spi.attrs["Calibration period"] = ( spi.attrs["Calibration period"] + "(input parameters)" ) From 09d72b3815b0d6241a57c601a2fe0da29832d54b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Thu, 22 Jun 2023 14:29:38 -0400 Subject: [PATCH 16/66] More documentation, some cleaning --- xclim/indices/_agro.py | 59 +++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index cc57d2344..60f39c527 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -44,6 +44,7 @@ "huglin_index", "latitude_temperature_index", "qian_weighted_mean_average", + "spx_fit_params", "standardized_precipitation_evapotranspiration_index", "standardized_precipitation_index", "water_budget", @@ -886,6 +887,12 @@ def water_budget( def _preprocess_spx(da, freq, window, **indexer): + """ + Resample and roll operations involved when computing a standardized index. + + The `group` variable returned is used spx computations and to keep track if + preprocessing was already done or not. + """ _, base, _, _ = parse_offset(freq or xarray.infer_freq(da.time)) try: group = {"D": "time.dayofyear", "M": "time.month"}[base] @@ -895,11 +902,46 @@ def _preprocess_spx(da, freq, window, **indexer): da = da.resample(time=freq).mean(keep_attrs=True) if window > 1: da = da.rolling(time=window).mean(skipna=False, keep_attrs=True) + + # The time reduction must be done after the rolling da = select_time(da, **indexer) return da, group -def _spx_fit_params(da, cal_range, freq, window, dist, method, group=None, **indexer): +# TODO: Find a more generic place for this, SPX indices are not exclusive to _agro +def spx_fit_params( + da, cal_range, freq, window, dist, method, group=None, **indexer +) -> xarray.DataArray: + """Standardized Index fitting parameters. + + A standardized index measures the deviation of a variable averaged over a rolling temporal window and + fitted with a given distribution `dist` with respect to a calibration dataset. The comparison is done by porting + back results to a normalized distribution. The fitting parameters of the calibration dataset fitted with `dist` + are obtained here. + + Parameters + ---------- + da : xarray.DataArray + Input array. + cal_range: Tuple[DateStr, DateStr] | None + Dates used to take a subset the input dataset for calibration. The tuple is formed by two `DateStr`, + i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. + freq : str | None + Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling + has already been applied input dataset and will skip the resampling step. + window : int + Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, + i.e. a monthly resampling, the window is an integer number of months. + dist : str + Name of the univariate distribution. + (see :py:mod:`scipy.stats`). + method : str + Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method + uses a deterministic function that doesn't involve any optimization. + indexer + Indexing parameters to compute the indicator on a temporal subset of the data. + It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. + """ # "WPM" method doesn't seem to work for gamma or pearson3 dist_and_methods = {"gamma": ["ML", "APP"], "fisk": ["ML", "APP"]} if dist not in dist_and_methods: @@ -925,18 +967,11 @@ def _spx_fit_params(da, cal_range, freq, window, dist, method, group=None, **ind ) da = da.chunk({"time": -1}) - # I don't think that's necessary? At this point, da is already preprocessed and time selected - # da = select_time(da, **indexer) - # Should I apply the indexer at the very end? Would that avoid unecessary computations? - def wrap_fit(da): if indexer != {} and da.isnull().all(): - fitted = fit( - da[{d: 0 if d != "time" else [0, 1] for d in da.dims}], dist, method - ) - return (fitted * da.isel(time=0, drop=True) * np.NaN).assign_attrs( - fitted.attrs - ) + select_dims = {d: 0 if d != "time" else [0, 1] for d in da.dims} + fitted = fit(da[select_dims], dist, method) + return fitted.broadcast_like(da.isel(time=0, drop=True)) return fit(da, dist, method) params = da.groupby(group).map(wrap_fit) @@ -1047,7 +1082,7 @@ def standardized_precipitation_index( pr, group = _preprocess_spx(pr, freq, window, **indexer) if uses_params is False: - params = _spx_fit_params(pr, cal_range, freq, window, dist, method, group=group) + params = spx_fit_params(pr, cal_range, freq, window, dist, method, group=group) params_dict = dict(params.groupby(group.rsplit(".")[1])) def ppf_to_cdf(da, params): From 17c1463f56eb862aaa4aea8b03633d7a365ce67c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 28 Jun 2023 09:24:46 -0400 Subject: [PATCH 17/66] SPI accepts da & params as sufficient arguments --- xclim/indices/_agro.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 60f39c527..00725b4c3 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -975,8 +975,13 @@ def wrap_fit(da): return fit(da, dist, method) params = da.groupby(group).map(wrap_fit) - params.attrs["Calibration period"] = str(cal_range) - + params.attrs = { + "Calibration period": str(cal_range), + "freq": freq, + "window": window, + "dist": dist, + "method": method, + } return params @@ -997,11 +1002,11 @@ def wrap_fit(da): def standardized_precipitation_index( pr: xarray.DataArray, cal_range: tuple[DateStr, DateStr] | None = None, - params: Quantified | None = None, freq: str | None = "MS", - window: int = 1, - dist: str = "gamma", - method: str = "APP", + window: int | None = 1, + dist: str | None = "gamma", + method: str | None = "APP", + params: Quantified | None = None, **indexer, ) -> xarray.DataArray: r"""Standardized Precipitation Index (SPI). @@ -1013,8 +1018,6 @@ def standardized_precipitation_index( cal_range: Tuple[DateStr, DateStr] | None Dates used to take a subset the input dataset for calibration. The tuple is formed by two `DateStr`, i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. - params: xarray.DataArray - Fit parameters. freq : str | None Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling has already been applied input dataset and will skip the resampling step. @@ -1027,6 +1030,8 @@ def standardized_precipitation_index( method : {'APP', 'ML'} Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. + params: xarray.DataArray + Fit parameters. If `params` is given as input, it overrides the `cal_range` option. indexer Indexing parameters to compute the indicator on a temporal subset of the data. It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. @@ -1041,7 +1046,7 @@ def standardized_precipitation_index( * The length `N` of the N-month SPI is determined by choosing the `window = N`. * Supported statistical distributions are: ["gamma", "fisk"], where "fisk" is scipy's implementation of a log-logistic distribution - * If `params` is given as input, it overrides the `cal_range` option. + * If `params` is given as input, it overrides the `cal_range`, `freq` and `window`, `dist` and `method` options. Example ------- @@ -1079,6 +1084,11 @@ def standardized_precipitation_index( warnings.warn( "Expected either `cal_range` or `params`, got both. Proceeding with `params`." ) + # Because default parameters for `window` and other options are not None, I can't + # make a similar warning for other params. + freq, window, dist, method = ( + params.attrs[s] for s in ["freq", "window", "dist", "method"] + ) pr, group = _preprocess_spx(pr, freq, window, **indexer) if uses_params is False: @@ -1113,6 +1123,7 @@ def get_sub_spi(pr): spi = pr.groupby(group).map(get_sub_spi) spi.attrs = params.attrs + spi.attrs["units"] = "" if uses_params: spi.attrs["Calibration period"] = ( spi.attrs["Calibration period"] + "(input parameters)" From d2dd196fa2981b2fe7b817c288451850b26410f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Fri, 30 Jun 2023 14:22:47 -0400 Subject: [PATCH 18/66] Remove problem in declare_units and move functions --- xclim/indices/_agro.py | 219 ++++++++++++++++++++--------------------- 1 file changed, 109 insertions(+), 110 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 7cbe6ac7b..5c281faa3 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -887,115 +887,6 @@ def water_budget( return out -def _preprocess_spx(da, freq, window, **indexer): - """ - Resample and roll operations involved when computing a standardized index. - - The `group` variable returned is used spx computations and to keep track if - preprocessing was already done or not. - """ - _, base, _, _ = parse_offset(freq or xarray.infer_freq(da.time)) - try: - group = {"D": "time.dayofyear", "M": "time.month"}[base] - except KeyError(): - raise ValueError(f"Standardized index with frequency `{freq}` not supported.") - if freq: - da = da.resample(time=freq).mean(keep_attrs=True) - if window > 1: - da = da.rolling(time=window).mean(skipna=False, keep_attrs=True) - - # The time reduction must be done after the rolling - da = select_time(da, **indexer) - return da, group - - -# TODO: Find a more generic place for this, SPX indices are not exclusive to _agro -def spx_fit_params( - da, cal_range, freq, window, dist, method, group=None, **indexer -) -> xarray.DataArray: - """Standardized Index fitting parameters. - - A standardized index measures the deviation of a variable averaged over a rolling temporal window and - fitted with a given distribution `dist` with respect to a calibration dataset. The comparison is done by porting - back results to a normalized distribution. The fitting parameters of the calibration dataset fitted with `dist` - are obtained here. - - Parameters - ---------- - da : xarray.DataArray - Input array. - cal_range: Tuple[DateStr, DateStr] | None - Dates used to take a subset the input dataset for calibration. The tuple is formed by two `DateStr`, - i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. - freq : str | None - Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling - has already been applied input dataset and will skip the resampling step. - window : int - Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, - i.e. a monthly resampling, the window is an integer number of months. - dist : str - Name of the univariate distribution. - (see :py:mod:`scipy.stats`). - method : str - Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method - uses a deterministic function that doesn't involve any optimization. - indexer - Indexing parameters to compute the indicator on a temporal subset of the data. - It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. - """ - # "WPM" method doesn't seem to work for gamma or pearson3 - dist_and_methods = {"gamma": ["ML", "APP"], "fisk": ["ML", "APP"]} - if dist not in dist_and_methods: - raise NotImplementedError(f"The distribution `{dist}` is not supported.") - if method not in dist_and_methods[dist]: - raise NotImplementedError( - f"The method `{method}` is not supported for distribution `{dist}`." - ) - - if group is None: - da, group = _preprocess_spx(da, freq, window) - - if cal_range: - da = da.sel(time=slice(cal_range[0], cal_range[1])) - - if uses_dask(da) and len(da.chunks[da.get_axis_num("time")]) > 1: - warnings.warn( - "The input data is chunked on time dimension and must be fully rechunked to" - " run `fit` on groups ." - " Beware, this operation can significantly increase the number of tasks dask" - " has to handle.", - stacklevel=2, - ) - da = da.chunk({"time": -1}) - - def wrap_fit(da): - if indexer != {} and da.isnull().all(): - select_dims = {d: 0 if d != "time" else [0, 1] for d in da.dims} - fitted = fit(da[select_dims], dist, method) - return fitted.broadcast_like(da.isel(time=0, drop=True)) - return fit(da, dist, method) - - params = da.groupby(group).map(wrap_fit) - params.attrs = { - "Calibration period": str(cal_range), - "freq": freq, - "window": window, - "dist": dist, - "method": method, - } - return params - - -# TODO : ADD -""" -streamflow index (SSI) GEV, log-logistic (Vincente-Serrano et al., 2012) Tweedie (Barker et al., 2016) (Station de suivi) -groundwater index (SGI) log-normal, gamma, GEV (Bloomfield et Marchant, 2013) (Station de suivi) -NEW dist: GEV, log-normal -https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.genextreme.html#scipy.stats.genextreme -https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.lognorm.html -""" - - @declare_units( pr="[precipitation]", thresh_wet_start="[length]", @@ -1178,10 +1069,118 @@ def _get_rain_season(pram): return out["rain_season_start"], out["rain_season_end"], out["rain_season_length"] +def _preprocess_spx(da, freq, window, **indexer): + """ + Resample and roll operations involved when computing a standardized index. + + The `group` variable returned is used spx computations and to keep track if + preprocessing was already done or not. + """ + _, base, _, _ = parse_offset(freq or xarray.infer_freq(da.time)) + try: + group = {"D": "time.dayofyear", "M": "time.month"}[base] + except KeyError(): + raise ValueError(f"Standardized index with frequency `{freq}` not supported.") + if freq: + da = da.resample(time=freq).mean(keep_attrs=True) + if window > 1: + da = da.rolling(time=window).mean(skipna=False, keep_attrs=True) + + # The time reduction must be done after the rolling + da = select_time(da, **indexer) + return da, group + + +# TODO: Find a more generic place for this, SPX indices are not exclusive to _agro +def spx_fit_params( + da, cal_range, freq, window, dist, method, group=None, **indexer +) -> xarray.DataArray: + """Standardized Index fitting parameters. + + A standardized index measures the deviation of a variable averaged over a rolling temporal window and + fitted with a given distribution `dist` with respect to a calibration dataset. The comparison is done by porting + back results to a normalized distribution. The fitting parameters of the calibration dataset fitted with `dist` + are obtained here. + + Parameters + ---------- + da : xarray.DataArray + Input array. + cal_range: Tuple[DateStr, DateStr] | None + Dates used to take a subset the input dataset for calibration. The tuple is formed by two `DateStr`, + i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. + freq : str | None + Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling + has already been applied input dataset and will skip the resampling step. + window : int + Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, + i.e. a monthly resampling, the window is an integer number of months. + dist : str + Name of the univariate distribution. + (see :py:mod:`scipy.stats`). + method : str + Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method + uses a deterministic function that doesn't involve any optimization. + indexer + Indexing parameters to compute the indicator on a temporal subset of the data. + It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. + """ + # "WPM" method doesn't seem to work for gamma or pearson3 + dist_and_methods = {"gamma": ["ML", "APP"], "fisk": ["ML", "APP"]} + if dist not in dist_and_methods: + raise NotImplementedError(f"The distribution `{dist}` is not supported.") + if method not in dist_and_methods[dist]: + raise NotImplementedError( + f"The method `{method}` is not supported for distribution `{dist}`." + ) + + if group is None: + da, group = _preprocess_spx(da, freq, window) + + if cal_range: + da = da.sel(time=slice(cal_range[0], cal_range[1])) + + if uses_dask(da) and len(da.chunks[da.get_axis_num("time")]) > 1: + warnings.warn( + "The input data is chunked on time dimension and must be fully rechunked to" + " run `fit` on groups ." + " Beware, this operation can significantly increase the number of tasks dask" + " has to handle.", + stacklevel=2, + ) + da = da.chunk({"time": -1}) + + def wrap_fit(da): + if indexer != {} and da.isnull().all(): + select_dims = {d: 0 if d != "time" else [0, 1] for d in da.dims} + fitted = fit(da[select_dims], dist, method) + return fitted.broadcast_like(da.isel(time=0, drop=True)) + return fit(da, dist, method) + + params = da.groupby(group).map(wrap_fit) + params.attrs = { + "Calibration period": str(cal_range), + "freq": freq, + "window": window, + "dist": dist, + "method": method, + } + return params + + +# TODO : ADD +""" +streamflow index (SSI) GEV, log-logistic (Vincente-Serrano et al., 2012) Tweedie (Barker et al., 2016) (Station de suivi) +groundwater index (SGI) log-normal, gamma, GEV (Bloomfield et Marchant, 2013) (Station de suivi) +NEW dist: GEV, log-normal +https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.genextreme.html#scipy.stats.genextreme +https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.lognorm.html +""" + + @declare_units( pr="[precipitation]", pr_cal="[precipitation]", - params: Quantified | None = None, ) def standardized_precipitation_index( pr: xarray.DataArray, From 50b7f7154b3ec7d6892319706c46172e8cf6731c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Fri, 30 Jun 2023 14:28:29 -0400 Subject: [PATCH 19/66] remvoe pr_cal from declare_units --- xclim/indices/_agro.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 5c281faa3..d110f60af 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1180,7 +1180,6 @@ def wrap_fit(da): @declare_units( pr="[precipitation]", - pr_cal="[precipitation]", ) def standardized_precipitation_index( pr: xarray.DataArray, From fdef0846d228d1b65215adacafc8346428d55e1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Sat, 1 Jul 2023 15:43:47 -0400 Subject: [PATCH 20/66] Refactor functions names, no more "spx" --- xclim/indices/_agro.py | 57 +++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index d110f60af..8a06dd041 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -45,7 +45,7 @@ "latitude_temperature_index", "qian_weighted_mean_average", "rain_season", - "spx_fit_params", + "standardized_index_fit_params", "standardized_precipitation_evapotranspiration_index", "standardized_precipitation_index", "water_budget", @@ -1069,11 +1069,11 @@ def _get_rain_season(pram): return out["rain_season_start"], out["rain_season_end"], out["rain_season_length"] -def _preprocess_spx(da, freq, window, **indexer): +def _preprocess_standardized_index(da, freq, window, **indexer): """ Resample and roll operations involved when computing a standardized index. - The `group` variable returned is used spx computations and to keep track if + The `group` variable returned is used in standardized index computations and to keep track if preprocessing was already done or not. """ _, base, _, _ = parse_offset(freq or xarray.infer_freq(da.time)) @@ -1092,7 +1092,7 @@ def _preprocess_spx(da, freq, window, **indexer): # TODO: Find a more generic place for this, SPX indices are not exclusive to _agro -def spx_fit_params( +def standardized_index_fit_params( da, cal_range, freq, window, dist, method, group=None, **indexer ) -> xarray.DataArray: """Standardized Index fitting parameters. @@ -1135,7 +1135,7 @@ def spx_fit_params( ) if group is None: - da, group = _preprocess_spx(da, freq, window) + da, group = _preprocess_standardized_index(da, freq, window) if cal_range: da = da.sel(time=slice(cal_range[0], cal_range[1])) @@ -1168,6 +1168,23 @@ def wrap_fit(da): return params +def _get_standardized_index(da, params): + # pdf to cdf + # Is this way of treating the special case of 0 always the right way? + if params.attrs["dist"] in ["gamma", "fisk"]: + prob_pos = dist_method("cdf", params, da.where(da > 0)) + prob_zero = (da == 0).sum("time") / da.notnull().sum("time") + prob = prob_zero + (1 - prob_zero) * prob_pos + # Invert to normal distribution with normal ppf to get standardized index + params_norm = xarray.DataArray( + [0, 1], + dims=["dparams"], + coords=dict(dparams=(["loc", "scale"])), + attrs=dict(scipy_dist="norm"), + ) + return dist_method("ppf", params_norm, prob) + + # TODO : ADD """ streamflow index (SSI) GEV, log-logistic (Vincente-Serrano et al., 2012) Tweedie (Barker et al., 2016) (Station de suivi) @@ -1246,7 +1263,7 @@ def standardized_precipitation_index( ... method="ML", ... ) # Computing SPI-3 months using a gamma distribution for the fit >>> # Fitting parameters can also be obtained ... - >>> params = _compute_spx_fit_params( + >>> params = _standardized_index_fit_params( ... pr, ... cal_range, ... freq="MS", @@ -1272,26 +1289,12 @@ def standardized_precipitation_index( params.attrs[s] for s in ["freq", "window", "dist", "method"] ) - pr, group = _preprocess_spx(pr, freq, window, **indexer) + pr, group = _preprocess_standardized_index(pr, freq, window, **indexer) if uses_params is False: - params = spx_fit_params(pr, cal_range, freq, window, dist, method, group=group) - params_dict = dict(params.groupby(group.rsplit(".")[1])) - - def ppf_to_cdf(da, params): - # ppf to cdf - if dist in ["gamma", "fisk"]: - prob_pos = dist_method("cdf", params, da.where(da > 0)) - prob_zero = (da == 0).sum("time") / da.notnull().sum("time") - prob = prob_zero + (1 - prob_zero) * prob_pos - # Invert to normal distribution with ppf and obtain SPI - params_norm = xarray.DataArray( - [0, 1], - dims=["dparams"], - coords=dict(dparams=(["loc", "scale"])), - attrs=dict(scipy_dist="norm"), + params = standardized_index_fit_params( + pr, cal_range, freq, window, dist, method, group=group ) - sub_spi = dist_method("ppf", params_norm, prob) - return sub_spi + params_dict = dict(params.groupby(group.rsplit(".")[1])) def get_sub_spi(pr): group_key = pr[group][0].values.item() @@ -1299,9 +1302,11 @@ def get_sub_spi(pr): # put NaNs if outside time selection if indexer != {} and pr.isnull().all(): select_dims = {d: 0 for d in pr.dims if d != "time"} - sub_spi = ppf_to_cdf(pr[select_dims][{"time": [0, 1]}], params[select_dims]) + sub_spi = _get_standardized_index( + pr[select_dims][{"time": [0, 1]}], params[select_dims] + ) return sub_spi.broadcast_like(pr.isel(time=0, drop=True)) - return ppf_to_cdf(pr, sub_params) + return _get_standardized_index(pr, sub_params) spi = pr.groupby(group).map(get_sub_spi) spi.attrs = params.attrs From 3f90a30212096231a434b3f0f26ad5b734c35002 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Tue, 11 Jul 2023 13:03:14 -0400 Subject: [PATCH 21/66] faster _get_standardized_index & refactoring --- tests/test_indices.py | 11 ++- xclim/indices/_agro.py | 185 +++++++++++++++++++++++++---------------- 2 files changed, 119 insertions(+), 77 deletions(-) diff --git a/tests/test_indices.py b/tests/test_indices.py index d10d2b58f..8f52dc04d 100644 --- a/tests/test_indices.py +++ b/tests/test_indices.py @@ -526,10 +526,10 @@ def test_standardized_precipitation_index( ds = open_dataset("sdba/CanESM2_1950-2100.nc").isel(location=1) pr = ds.pr.sel(time=slice("1998", "2000")) pr_cal = ds.pr.sel(time=slice("1950", "1980")) - spi = xci.standardized_precipitation_index( - pr, pr_cal, freq, window, dist, method + params = xci.standardized_index_fit_params( + pr_cal, freq=freq, window=window, dist=dist, method=method ) - + spi = xci.standardized_precipitation_index(pr, params=params) # Only a few moments before year 2000 are tested spi = spi.isel(time=slice(-11, -1, 2)) @@ -613,8 +613,11 @@ def test_standardized_precipitation_evapotranspiration_index( wb_cal = wb.sel(time=slice("1950", "1980")) wb = wb.sel(time=slice("1998", "2000")) + params = xci.standardized_index_fit_params( + wb_cal, freq=freq, window=window, dist=dist, method=method + ) spei = xci.standardized_precipitation_evapotranspiration_index( - wb, wb_cal, freq, window, dist, method + wb, params=params ) # Only a few moments before year 2000 are tested diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 8a06dd041..0ffaceba8 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -6,6 +6,7 @@ import numpy as np import xarray +from scipy.stats import norm import xclim.indices.run_length as rl from xclim.core.calendar import parse_offset, resample_doy, select_time @@ -25,7 +26,7 @@ ) from xclim.indices.generic import aggregate_between_dates, get_zones from xclim.indices.helpers import _gather_lat, day_lengths -from xclim.indices.stats import dist_method, fit +from xclim.indices.stats import dist_method, fit, get_dist # Frequencies : YS: year start, QS-DEC: seasons starting in december, MS: month start # See https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html @@ -1081,8 +1082,20 @@ def _preprocess_standardized_index(da, freq, window, **indexer): group = {"D": "time.dayofyear", "M": "time.month"}[base] except KeyError(): raise ValueError(f"Standardized index with frequency `{freq}` not supported.") + if freq: da = da.resample(time=freq).mean(keep_attrs=True) + + if uses_dask(da) and len(da.chunks[da.get_axis_num("time")]) > 1: + warnings.warn( + "The input data is chunked on time dimension and must be fully rechunked to" + " run `fit` on groups ." + " Beware, this operation can significantly increase the number of tasks dask" + " has to handle.", + stacklevel=2, + ) + da = da.chunk({"time": -1}) + if window > 1: da = da.rolling(time=window).mean(skipna=False, keep_attrs=True) @@ -1093,7 +1106,13 @@ def _preprocess_standardized_index(da, freq, window, **indexer): # TODO: Find a more generic place for this, SPX indices are not exclusive to _agro def standardized_index_fit_params( - da, cal_range, freq, window, dist, method, group=None, **indexer + da: xarray.DataArray, + freq: str | None, + window: int | None, + dist: str | None, + method: str | None, + cal_range: tuple[DateStr, DateStr] | None = None, + **indexer, ) -> xarray.DataArray: """Standardized Index fitting parameters. @@ -1134,22 +1153,11 @@ def standardized_index_fit_params( f"The method `{method}` is not supported for distribution `{dist}`." ) - if group is None: - da, group = _preprocess_standardized_index(da, freq, window) + da, group = _preprocess_standardized_index(da, freq, window, **indexer) if cal_range: da = da.sel(time=slice(cal_range[0], cal_range[1])) - if uses_dask(da) and len(da.chunks[da.get_axis_num("time")]) > 1: - warnings.warn( - "The input data is chunked on time dimension and must be fully rechunked to" - " run `fit` on groups ." - " Beware, this operation can significantly increase the number of tasks dask" - " has to handle.", - stacklevel=2, - ) - da = da.chunk({"time": -1}) - def wrap_fit(da): if indexer != {} and da.isnull().all(): select_dims = {d: 0 if d != "time" else [0, 1] for d in da.dims} @@ -1159,30 +1167,54 @@ def wrap_fit(da): params = da.groupby(group).map(wrap_fit) params.attrs = { - "Calibration period": str(cal_range), + "Calibration period": cal_range, "freq": freq, "window": window, - "dist": dist, + "scipy_dist": dist, "method": method, + "group": group, + "time_indexer": indexer, } return params -def _get_standardized_index(da, params): - # pdf to cdf - # Is this way of treating the special case of 0 always the right way? - if params.attrs["dist"] in ["gamma", "fisk"]: - prob_pos = dist_method("cdf", params, da.where(da > 0)) - prob_zero = (da == 0).sum("time") / da.notnull().sum("time") - prob = prob_zero + (1 - prob_zero) * prob_pos - # Invert to normal distribution with normal ppf to get standardized index - params_norm = xarray.DataArray( - [0, 1], - dims=["dparams"], - coords=dict(dparams=(["loc", "scale"])), - attrs=dict(scipy_dist="norm"), +def _get_standardized_index(da, params, **indexer): + scipy_dist = get_dist(params.attrs["scipy_dist"]) + group_name = params.attrs["group"].rsplit(".")[-1] + param_names = params.dparams.values + + def wrap_cdf_ppf(da, params, idxs): + if indexer != {} and np.isnan(da).all(): + return np.zeros_like(da) * np.NaN + + spi = np.zeros_like(da) + # loop over groups + for ii, v in enumerate(list(dict.fromkeys(group_idxs).keys())): + pars = params[ii, :] + indices = np.argwhere(group_idxs == v) + vals = da[indices] + zeros = (vals == 0).sum(axis=0) + vals[vals == 0] = np.NaN + paramsd = {param_names[jj]: pars[jj] for jj in range(len(pars))} + probs_of_zero = zeros / vals.shape[0] + dist_probs = scipy_dist.cdf(vals, **paramsd) + probs = probs_of_zero + ((1 - probs_of_zero) * dist_probs) + spi[indices] = norm.ppf(probs) + return spi + + group_idxs = da.time.dt.month if group_name == "month" else da.time.dt.dayofyear + return xarray.apply_ufunc( + wrap_cdf_ppf, + da, + params, + group_idxs, + input_core_dims=[["time"], [group_name, "dparams"], ["time"]], + output_core_dims=[["time"]], + dask="parallelized", + dask_gufunc_kwargs={"allow_rechunk": True}, + output_dtypes=[da.dtype], + vectorize=True, ) - return dist_method("ppf", params_norm, prob) # TODO : ADD @@ -1197,14 +1229,16 @@ def _get_standardized_index(da, params): @declare_units( pr="[precipitation]", + pr_cal="[precipitation]", ) def standardized_precipitation_index( pr: xarray.DataArray, - cal_range: tuple[DateStr, DateStr] | None = None, + pr_cal: Quantified | None = None, freq: str | None = "MS", window: int | None = 1, dist: str | None = "gamma", method: str | None = "APP", + cal_range: tuple[DateStr, DateStr] | None = None, params: Quantified | None = None, **indexer, ) -> xarray.DataArray: @@ -1214,9 +1248,9 @@ def standardized_precipitation_index( ---------- pr : xarray.DataArray Daily precipitation. - cal_range: Tuple[DateStr, DateStr] | None - Dates used to take a subset the input dataset for calibration. The tuple is formed by two `DateStr`, - i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. + pr_cal : xarray.DataArray + Daily precipitation used for calibration. Usually this is a temporal subset of `pr` over some reference period. This + option will be removed in xclim==0.46.0. Two behaviour will be possible (see below) freq : str | None Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling has already been applied input dataset and will skip the resampling step. @@ -1229,8 +1263,12 @@ def standardized_precipitation_index( method : {'APP', 'ML'} Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. + cal_range: Tuple[DateStr, DateStr] | None + Dates used to take a subset the input dataset for calibration. The tuple is formed by two `DateStr`, + i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. params: xarray.DataArray - Fit parameters. If `params` is given as input, it overrides the `cal_range` option. + Fit parameters. The `params` can be computed using ``xclim.indices.standardized_index_fit_params`` in advance. + The ouput can be given here as input, and it overrides other options (among others, `cal_range`). indexer Indexing parameters to compute the indicator on a temporal subset of the data. It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. @@ -1278,59 +1316,54 @@ def standardized_precipitation_index( ---------- :cite:cts:`mckee_relationship_1993` """ - uses_params = params is not None - if cal_range and uses_params: + if params is not None and pr_cal is None: + freq, window = (params.attrs[s] for s in ["freq", "window"]) + if cal_range: + warnings.warn( + "Expected either `cal_range` or `params`, got both. The `params` input will be used, and" + "`freq`, `window`, and `dist` used to obtain `params` will be used here." + ) + + if pr_cal is not None: warnings.warn( - "Expected either `cal_range` or `params`, got both. Proceeding with `params`." + "Inputing `pr_cal` will be deprecated in xclim==0.46.0. If `pr_cal` is a subset of `pr`, then instead of:\n" + "`standardized_precipitation_index(pr=pr,pr_cal=pr.sel(time=slice(t0,t1)),...)`,\n one can call:\n" + "`standardized_precipitation_index(pr=pr,cal_range=(t0,t1),...).\n" + "If for some reason `pr_cal` is not a subset of `pr`, then the following approach will still be possible:\n" + "`params = standardized_index_fit_params(da=pr_cal, freq=freq, window=window, dist=dist, method=method)`.\n" + "`spi = standardized_precipitation_index(pr=pr, params=params)`.\n" + "This approach can be used in both scenarios to break up the computations in two, i.e. get params, then compute " + "standardized indices." ) - # Because default parameters for `window` and other options are not None, I can't - # make a similar warning for other params. - freq, window, dist, method = ( - params.attrs[s] for s in ["freq", "window", "dist", "method"] + params = standardized_index_fit_params( + pr_cal, freq=freq, window=window, dist=dist, method=method ) - pr, group = _preprocess_standardized_index(pr, freq, window, **indexer) - if uses_params is False: + pr, _ = _preprocess_standardized_index(pr, freq=freq, window=window, **indexer) + if params is None: params = standardized_index_fit_params( - pr, cal_range, freq, window, dist, method, group=group + pr, cal_range=cal_range, freq=None, window=1, dist=dist, method=method ) - params_dict = dict(params.groupby(group.rsplit(".")[1])) - - def get_sub_spi(pr): - group_key = pr[group][0].values.item() - sub_params = params_dict[group_key] - # put NaNs if outside time selection - if indexer != {} and pr.isnull().all(): - select_dims = {d: 0 for d in pr.dims if d != "time"} - sub_spi = _get_standardized_index( - pr[select_dims][{"time": [0, 1]}], params[select_dims] - ) - return sub_spi.broadcast_like(pr.isel(time=0, drop=True)) - return _get_standardized_index(pr, sub_params) - - spi = pr.groupby(group).map(get_sub_spi) + spi = _get_standardized_index(pr, params, **indexer) spi.attrs = params.attrs spi.attrs["units"] = "" - if uses_params: - spi.attrs["Calibration period"] = ( - spi.attrs["Calibration period"] + "(input parameters)" - ) - return spi @declare_units( wb="[precipitation]", + wb_cal="[precipitation]", params="[]", ) def standardized_precipitation_evapotranspiration_index( wb: xarray.DataArray, + wb_cal: Quantified | None = None, + freq: str | None = "MS", + window: int | None = 1, + dist: str | None = "gamma", + method: str | None = "APP", cal_range: tuple[DateStr, DateStr] | None = None, params: Quantified | None = None, - freq: str = "MS", - window: int = 1, - dist: str = "fisk", - method: str = "ML", **indexer, ) -> xarray.DataArray: r"""Standardized Precipitation Evapotranspiration Index (SPEI). @@ -1343,9 +1376,9 @@ def standardized_precipitation_evapotranspiration_index( ---------- wb : xarray.DataArray Daily water budget (pr - pet). - cal_range: Tuple[DateStr, DateStr] | None - Dates used to take a subset the input dataset for calibration. The tuple is formed by two `DateStr`, - i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. + wb_cal : xarray.DataArray + Daily water budget used for calibration. Usually this is a temporal subset of `wb` over some reference period. This + option will be removed in xclim==0.46.0. Two behaviour will be possible (see below) params: xarray.DataArray Fit parameters. freq : str | None @@ -1360,6 +1393,12 @@ def standardized_precipitation_evapotranspiration_index( Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. Available methods vary with the distribution: 'gamma':{'APP', 'ML'}, 'fisk':{'ML'} + cal_range: Tuple[DateStr, DateStr] | None + Dates used to take a subset the input dataset for calibration. The tuple is formed by two `DateStr`, + i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. + params: xarray.DataArray + Fit parameters. The `params` can be computed using ``xclim.indices.standardized_index_fit_params`` in advance. + The ouput can be given here as input, and it overrides other options (among others, `cal_range`). indexer Indexing parameters to compute the indicator on a temporal subset of the data. It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. @@ -1387,7 +1426,7 @@ def standardized_precipitation_evapotranspiration_index( wb = wb + offset spei = standardized_precipitation_index( - wb, cal_range, params, freq, window, dist, method, **indexer + wb, wb_cal, freq, window, dist, method, cal_range, params, **indexer ) return spei From b02750bfc02439830bd6dc3bdf56d0a725030c0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Tue, 11 Jul 2023 13:22:55 -0400 Subject: [PATCH 22/66] fix variable names --- xclim/indices/_agro.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 0ffaceba8..683928679 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1183,15 +1183,15 @@ def _get_standardized_index(da, params, **indexer): group_name = params.attrs["group"].rsplit(".")[-1] param_names = params.dparams.values - def wrap_cdf_ppf(da, params, idxs): + def wrap_cdf_ppf(da, params, group_idxs): if indexer != {} and np.isnan(da).all(): return np.zeros_like(da) * np.NaN spi = np.zeros_like(da) # loop over groups - for ii, v in enumerate(list(dict.fromkeys(group_idxs).keys())): + for ii, group_idx in enumerate(list(dict.fromkeys(group_idxs).keys())): pars = params[ii, :] - indices = np.argwhere(group_idxs == v) + indices = np.argwhere(group_idxs == group_idx) vals = da[indices] zeros = (vals == 0).sum(axis=0) vals[vals == 0] = np.NaN From e81ad734f417b1593ada5a192e9ac8350a32cb6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Tue, 11 Jul 2023 14:48:10 -0400 Subject: [PATCH 23/66] Fixing SPEI units --- xclim/indices/_agro.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 683928679..2c2d7d82b 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1230,6 +1230,7 @@ def wrap_cdf_ppf(da, params, group_idxs): @declare_units( pr="[precipitation]", pr_cal="[precipitation]", + params="[]", ) def standardized_precipitation_index( pr: xarray.DataArray, @@ -1326,7 +1327,7 @@ def standardized_precipitation_index( if pr_cal is not None: warnings.warn( - "Inputing `pr_cal` will be deprecated in xclim==0.46.0. If `pr_cal` is a subset of `pr`, then instead of:\n" + "Inputing a calibration array will be deprecated in xclim==0.46.0. For example, if `pr_cal` is a subset of `pr`, then instead of say:\n" "`standardized_precipitation_index(pr=pr,pr_cal=pr.sel(time=slice(t0,t1)),...)`,\n one can call:\n" "`standardized_precipitation_index(pr=pr,cal_range=(t0,t1),...).\n" "If for some reason `pr_cal` is not a subset of `pr`, then the following approach will still be possible:\n" @@ -1429,7 +1430,7 @@ def standardized_precipitation_evapotranspiration_index( wb, wb_cal, freq, window, dist, method, cal_range, params, **indexer ) - return spei + return spei.attrs({"units": ""}) @declare_units(tas="[temperature]") From 5b7d4833294e3af766b2fc6ad0153d9259a57d33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Tue, 11 Jul 2023 16:47:23 -0400 Subject: [PATCH 24/66] add issue number --- CHANGES.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 46032726f..7bd632de3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -14,6 +14,10 @@ Internal changes ^^^^^^^^^^^^^^^^ * Tolerance thresholds for error in ``test_stats::test_fit`` have been relaxed to allow for more variation in the results. Previously untested ``*_moving_yearly_window`` functions are now tested. (:issue:`1400`, :pull:`1402`). +New features and enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +* Standardized indices (``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index``) are faster. (:issue:`1270` ,:pull:`1311`). + v0.44.0 (2023-06-23) -------------------- Contributors to this version: Éric Dupuis (:user:`coxipi`), Trevor James Smith (:user:`Zeitsperre`), Pascal Bourgault (:user:`aulemahal`), Ludwig Lierhammer (:user:`ludwiglierhammer`), David Huard (:user:`huard`). From f8bae10583f43f398dc1fb7cbbeedd565466263a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Fri, 14 Jul 2023 12:04:12 -0400 Subject: [PATCH 25/66] cal_range -> cal_{start|end} && offset in SPEI public --- tests/test_indices.py | 14 +++++--- xclim/indices/_agro.py | 76 ++++++++++++++++++++++++++---------------- 2 files changed, 57 insertions(+), 33 deletions(-) diff --git a/tests/test_indices.py b/tests/test_indices.py index 8f52dc04d..8cf795b37 100644 --- a/tests/test_indices.py +++ b/tests/test_indices.py @@ -613,11 +613,17 @@ def test_standardized_precipitation_evapotranspiration_index( wb_cal = wb.sel(time=slice("1950", "1980")) wb = wb.sel(time=slice("1998", "2000")) - params = xci.standardized_index_fit_params( - wb_cal, freq=freq, window=window, dist=dist, method=method - ) + # params = xci.standardized_index_fit_params( + # wb, wb_cal, freq=freq, window=window, dist=dist, method=method,offset="1 mm/d" + # ) spei = xci.standardized_precipitation_evapotranspiration_index( - wb, params=params + wb, + wb_cal, + freq=freq, + window=window, + dist=dist, + method=method, + offset="1 mm/d", ) # Only a few moments before year 2000 are tested diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 2c2d7d82b..a8c0e8ed1 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1111,7 +1111,6 @@ def standardized_index_fit_params( window: int | None, dist: str | None, method: str | None, - cal_range: tuple[DateStr, DateStr] | None = None, **indexer, ) -> xarray.DataArray: """Standardized Index fitting parameters. @@ -1125,9 +1124,6 @@ def standardized_index_fit_params( ---------- da : xarray.DataArray Input array. - cal_range: Tuple[DateStr, DateStr] | None - Dates used to take a subset the input dataset for calibration. The tuple is formed by two `DateStr`, - i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. freq : str | None Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling has already been applied input dataset and will skip the resampling step. @@ -1155,9 +1151,6 @@ def standardized_index_fit_params( da, group = _preprocess_standardized_index(da, freq, window, **indexer) - if cal_range: - da = da.sel(time=slice(cal_range[0], cal_range[1])) - def wrap_fit(da): if indexer != {} and da.isnull().all(): select_dims = {d: 0 if d != "time" else [0, 1] for d in da.dims} @@ -1165,6 +1158,10 @@ def wrap_fit(da): return fitted.broadcast_like(da.isel(time=0, drop=True)) return fit(da, dist, method) + cal_range = ( + da.time.min().dt.strftime("%Y-%m-%d"), + da.time.max().dt.strftime("%Y-%m-%d"), + ) params = da.groupby(group).map(wrap_fit) params.attrs = { "Calibration period": cal_range, @@ -1174,6 +1171,7 @@ def wrap_fit(da): "method": method, "group": group, "time_indexer": indexer, + "units": "", } return params @@ -1239,7 +1237,8 @@ def standardized_precipitation_index( window: int | None = 1, dist: str | None = "gamma", method: str | None = "APP", - cal_range: tuple[DateStr, DateStr] | None = None, + cal_start: DateStr | None = None, + cal_end: DateStr | None = None, params: Quantified | None = None, **indexer, ) -> xarray.DataArray: @@ -1264,12 +1263,15 @@ def standardized_precipitation_index( method : {'APP', 'ML'} Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. - cal_range: Tuple[DateStr, DateStr] | None - Dates used to take a subset the input dataset for calibration. The tuple is formed by two `DateStr`, - i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. + cal_start: DateStr | None + Start date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. Default option `None` + means that the calibration period begins at the start of the input dataset. + cal_end: DateStr | None + End date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. Default option `None` + means that the calibration period finishes at the end of the input dataset. params: xarray.DataArray Fit parameters. The `params` can be computed using ``xclim.indices.standardized_index_fit_params`` in advance. - The ouput can be given here as input, and it overrides other options (among others, `cal_range`). + The ouput can be given here as input, and it overrides other options. indexer Indexing parameters to compute the indicator on a temporal subset of the data. It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. @@ -1284,7 +1286,7 @@ def standardized_precipitation_index( * The length `N` of the N-month SPI is determined by choosing the `window = N`. * Supported statistical distributions are: ["gamma", "fisk"], where "fisk" is scipy's implementation of a log-logistic distribution - * If `params` is given as input, it overrides the `cal_range`, `freq` and `window`, `dist` and `method` options. + * If `params` is given as input, it overrides the `cal_start`, `cal_end`, `freq` and `window`, `dist` and `method` options. Example ------- @@ -1292,19 +1294,19 @@ def standardized_precipitation_index( >>> from xclim.indices import standardized_precipitation_index >>> ds = xr.open_dataset(path_to_pr_file) >>> pr = ds.pr - >>> cal_range = ("1990-05-01", "1990-08-31") + >>> cal_start, cal_end = "1990-05-01", "1990-08-31" >>> spi_3 = standardized_precipitation_index( ... pr, - ... cal_range=cal_range, ... freq="MS", ... window=3, ... dist="gamma", ... method="ML", + ... cal_start=cal_start, + ... cal_end=cal_end, ... ) # Computing SPI-3 months using a gamma distribution for the fit >>> # Fitting parameters can also be obtained ... >>> params = _standardized_index_fit_params( - ... pr, - ... cal_range, + ... pr.sel(time=slice(cal_start, cal_end)), ... freq="MS", ... window=3, ... dist="gamma", @@ -1319,10 +1321,10 @@ def standardized_precipitation_index( """ if params is not None and pr_cal is None: freq, window = (params.attrs[s] for s in ["freq", "window"]) - if cal_range: + if cal_start or cal_end: warnings.warn( - "Expected either `cal_range` or `params`, got both. The `params` input will be used, and" - "`freq`, `window`, and `dist` used to obtain `params` will be used here." + "Expected either `cal_{start|end}` or `params`, got both. The `params` input overrides other inputs." + "If `cal_start`, `cal_end`, `freq`, `window`, and/or `dist` were given as input, they will be ignored." ) if pr_cal is not None: @@ -1343,7 +1345,13 @@ def standardized_precipitation_index( pr, _ = _preprocess_standardized_index(pr, freq=freq, window=window, **indexer) if params is None: params = standardized_index_fit_params( - pr, cal_range=cal_range, freq=None, window=1, dist=dist, method=method + pr, + freq=None, + window=1, + dist=dist, + method=method, + cal_start=cal_start, + cal_end=cal_end, ) spi = _get_standardized_index(pr, params, **indexer) spi.attrs = params.attrs @@ -1354,6 +1362,7 @@ def standardized_precipitation_index( @declare_units( wb="[precipitation]", wb_cal="[precipitation]", + offset="[precipitation]", params="[]", ) def standardized_precipitation_evapotranspiration_index( @@ -1363,7 +1372,9 @@ def standardized_precipitation_evapotranspiration_index( window: int | None = 1, dist: str | None = "gamma", method: str | None = "APP", - cal_range: tuple[DateStr, DateStr] | None = None, + offset: Quantified = "1 mm/d", + cal_start: DateStr | None = None, + cal_end: DateStr | None = None, params: Quantified | None = None, **indexer, ) -> xarray.DataArray: @@ -1394,12 +1405,18 @@ def standardized_precipitation_evapotranspiration_index( Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. Available methods vary with the distribution: 'gamma':{'APP', 'ML'}, 'fisk':{'ML'} - cal_range: Tuple[DateStr, DateStr] | None - Dates used to take a subset the input dataset for calibration. The tuple is formed by two `DateStr`, - i.e. a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the full range of the input dataset is used. + cal_start: DateStr | None + Start date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. Default option `None` + means that the calibration period begins at the start of the input dataset. + cal_end: DateStr | None + End date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. Default option `None` + means that the calibration period finishes at the end of the input dataset. params: xarray.DataArray Fit parameters. The `params` can be computed using ``xclim.indices.standardized_index_fit_params`` in advance. - The ouput can be given here as input, and it overrides other options (among others, `cal_range`). + The ouput can be given here as input, and it overrides other options. + offset: xarray.DataArray + For distributions bounded by zero (e.g. "gamma", "fisk"): Water budget must be shifted, only positive values are allowed. + This can be given as a precipitation flux or a rate. indexer Indexing parameters to compute the indicator on a temporal subset of the data. It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. @@ -1422,15 +1439,16 @@ def standardized_precipitation_evapotranspiration_index( # Distributions bounded by zero: Water budget must be shifted, only positive values # are allowed. The offset choice is arbitrary and the same offset as the monocongo # library is taken - offset = convert_units_to("1 mm/d", wb.units, context="hydro") + offset = convert_units_to(offset, wb.units, context="hydro") with xarray.set_options(keep_attrs=True): wb = wb + offset + wb_cal = wb_cal + offset spei = standardized_precipitation_index( - wb, wb_cal, freq, window, dist, method, cal_range, params, **indexer + wb, wb_cal, freq, window, dist, method, cal_start, cal_end, params, **indexer ) - return spei.attrs({"units": ""}) + return spei @declare_units(tas="[temperature]") From 900225ed5ad2112fe6b418e64e574e90a35b171d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Fri, 14 Jul 2023 12:06:37 -0400 Subject: [PATCH 26/66] remove DATE_TUPLE type, etc --- xclim/core/formatting.py | 1 - xclim/core/utils.py | 6 ------ 2 files changed, 7 deletions(-) diff --git a/xclim/core/formatting.py b/xclim/core/formatting.py index 80bc5bbb3..fc2b13f20 100644 --- a/xclim/core/formatting.py +++ b/xclim/core/formatting.py @@ -532,7 +532,6 @@ def unprefix_attrs(source: dict, keys: Sequence, prefix: str): InputKind.STRING: "str", InputKind.DAY_OF_YEAR: "date (string, MM-DD)", InputKind.DATE: "date (string, YYYY-MM-DD)", - InputKind.DATE_TUPLE: "date tuple ((string_1, YYYY-MM-DD), (string_2, YYYY-MM-DD))", InputKind.BOOL: "boolean", InputKind.DATASET: "Dataset, optional", InputKind.KWARGS: "", diff --git a/xclim/core/utils.py b/xclim/core/utils.py index 0420ab471..9c286cc33 100644 --- a/xclim/core/utils.py +++ b/xclim/core/utils.py @@ -565,12 +565,6 @@ class InputKind(IntEnum): Annotation : ``bool``, may be optional. """ - DATE_TUPLE = 10 - """A tuple of dates in the YYYY-MM-DD format, may include a time. - - !!! not sure how to write this !!! - Annotation : `Tuple[xclim.core.utils.DateStr, xclim.core.utils.DateStr]` - """ KWARGS = 50 """A mapping from argument name to value. From b60cd2d330cbd5b6745c7ca5a03a2fbda079403d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Thu, 20 Jul 2023 13:25:57 -0400 Subject: [PATCH 27/66] Add offset as possible input in standardized_fit_params --- tests/test_indices.py | 13 +++++------- xclim/indices/_agro.py | 43 +++++++++++++++++++++++++++++++-------- xclim/sdba/_adjustment.py | 17 ++++++++++++++++ 3 files changed, 57 insertions(+), 16 deletions(-) diff --git a/tests/test_indices.py b/tests/test_indices.py index 8cf795b37..ef466a33d 100644 --- a/tests/test_indices.py +++ b/tests/test_indices.py @@ -610,21 +610,18 @@ def test_standardized_precipitation_evapotranspiration_index( tas = tasmax - 2.5 tasmin = tasmax - 5 wb = xci.water_budget(pr, None, tasmin, tasmax, tas) - wb_cal = wb.sel(time=slice("1950", "1980")) - wb = wb.sel(time=slice("1998", "2000")) - # params = xci.standardized_index_fit_params( - # wb, wb_cal, freq=freq, window=window, dist=dist, method=method,offset="1 mm/d" - # ) - spei = xci.standardized_precipitation_evapotranspiration_index( - wb, - wb_cal, + params = xci.standardized_index_fit_params( + wb.sel(time=slice("1950", "1980")), freq=freq, window=window, dist=dist, method=method, offset="1 mm/d", ) + spei = xci.standardized_precipitation_evapotranspiration_index( + wb.sel(time=slice("1998", "2000")), params=params + ) # Only a few moments before year 2000 are tested spei = spei.isel(time=slice(-11, -1, 2)) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index a8c0e8ed1..ab6d48f33 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -2,7 +2,6 @@ from __future__ import annotations import warnings -from typing import Tuple import numpy as np import xarray @@ -1111,6 +1110,7 @@ def standardized_index_fit_params( window: int | None, dist: str | None, method: str | None, + offset: Quantified | None = None, **indexer, ) -> xarray.DataArray: """Standardized Index fitting parameters. @@ -1136,6 +1136,9 @@ def standardized_index_fit_params( method : str Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. + offset: Quantified + Distributions bounded by zero (e.g. "gamma", "fisk") can be used for datasets with negative values by using a offset: + `da + offset` indexer Indexing parameters to compute the indicator on a temporal subset of the data. It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. @@ -1149,6 +1152,10 @@ def standardized_index_fit_params( f"The method `{method}` is not supported for distribution `{dist}`." ) + if offset: + with xarray.set_options(keep_attrs=True): + da = da + convert_units_to(offset, da, context="hydro") + da, group = _preprocess_standardized_index(da, freq, window, **indexer) def wrap_fit(da): @@ -1172,6 +1179,7 @@ def wrap_fit(da): "group": group, "time_indexer": indexer, "units": "", + "offset": offset, } return params @@ -1414,7 +1422,7 @@ def standardized_precipitation_evapotranspiration_index( params: xarray.DataArray Fit parameters. The `params` can be computed using ``xclim.indices.standardized_index_fit_params`` in advance. The ouput can be given here as input, and it overrides other options. - offset: xarray.DataArray + offset: Quantified For distributions bounded by zero (e.g. "gamma", "fisk"): Water budget must be shifted, only positive values are allowed. This can be given as a precipitation flux or a rate. indexer @@ -1434,15 +1442,34 @@ def standardized_precipitation_evapotranspiration_index( ----- See Standardized Precipitation Index (SPI) for more details on usage. """ + uses_default_offset = offset == "1 mm/d" + offset = 0 if offset is None else convert_units_to(offset, wb, context="hydro") + if params is not None: + params_offset = ( + 0 + if params.attrs["offset"] is None + else convert_units_to(params.attrs["offset"], wb, context="hydro") + ) + if params_offset != offset: + extra_msg = ( + " (using default SPEI value 1 mm/d)" if uses_default_offset else "" + ) + warnings.warn( + f"The offset in `params` differs from the input `offset`{extra_msg}." + "Procceding with the value given in `params`." + ) + offset = params_offset # Allowed distributions are constrained by the SPI function - if dist in ["gamma", "fisk"]: - # Distributions bounded by zero: Water budget must be shifted, only positive values - # are allowed. The offset choice is arbitrary and the same offset as the monocongo - # library is taken - offset = convert_units_to(offset, wb.units, context="hydro") + if dist in ["gamma", "fisk"] and offset <= 0: + raise ValueError( + "The water budget must be shifted towards positive values to be used with `gamma` and `fisk` " + f"distributions which are bounded by zero. A positive offset is required. Current value: {offset}{wb.attrs['units']}" + ) + if offset != 0: with xarray.set_options(keep_attrs=True): wb = wb + offset - wb_cal = wb_cal + offset + if wb_cal is not None: + wb_cal = wb_cal + offset spei = standardized_precipitation_index( wb, wb_cal, freq, window, dist, method, cal_start, cal_end, params, **indexer diff --git a/xclim/sdba/_adjustment.py b/xclim/sdba/_adjustment.py index 407b61e1f..06987aecd 100644 --- a/xclim/sdba/_adjustment.py +++ b/xclim/sdba/_adjustment.py @@ -46,6 +46,22 @@ def dqm_train(ds, *, dim, kind, quantiles) -> xr.Dataset: return xr.Dataset(data_vars=dict(af=af, hist_q=hist_q, scaling=scaling)) +# @ensure_longest_doy +# def get_correction2(x: xr.DataArray, y: xr.DataArray, kind: str) -> xr.DataArray: +# """Return the additive or multiplicative correction/adjustment factors.""" +# with xr.set_options(keep_attrs=True): +# if kind == ADDITIVE: +# out = y - x +# elif kind == MULTIPLICATIVE: +# out = y / x +# else: +# raise ValueError("kind must be + or *.") + +# if isinstance(out, xr.DataArray): +# out.attrs["kind"] = kind +# return out + + @map_groups( af=[Grouper.PROP, "quantiles"], hist_q=[Grouper.PROP, "quantiles"], @@ -57,6 +73,7 @@ def eqm_train(ds, *, dim, kind, quantiles) -> xr.Dataset: ref : training target hist : training data """ + # af, hist_q = nbu.quantile_mapping(ds.ref, ds.hist, quantiles, kind, dim) ref_q = nbu.quantile(ds.ref, quantiles, dim) hist_q = nbu.quantile(ds.hist, quantiles, dim) From 4a293a3e4026605cd4cd32dc52d125af8cfdb4e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Thu, 20 Jul 2023 14:33:07 -0400 Subject: [PATCH 28/66] new test on modularity and fix API error in SPI --- tests/test_indices.py | 45 ++++++++++++++++++++++++++++++++++++++++++ xclim/indices/_agro.py | 6 ++---- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/tests/test_indices.py b/tests/test_indices.py index ef466a33d..82290b9da 100644 --- a/tests/test_indices.py +++ b/tests/test_indices.py @@ -631,6 +631,51 @@ def test_standardized_precipitation_evapotranspiration_index( np.testing.assert_allclose(spei.values, values, rtol=0, atol=diff_tol) + def test_standardized_index_modularity(self, open_dataset): + freq, window, dist, method = "MS", 6, "gamma", "APP" + ds = ( + open_dataset("sdba/CanESM2_1950-2100.nc") + .isel(location=1) + .sel(time=slice("1950", "2000")) + ) + pr = ds.pr + # generate water budget + with xr.set_options(keep_attrs=True): + tasmax = ds.tasmax + tas = tasmax - 2.5 + tasmin = tasmax - 5 + wb = xci.water_budget(pr, None, tasmin, tasmax, tas) + + params = xci.standardized_index_fit_params( + wb.sel(time=slice("1950", "1980")), + freq=freq, + window=window, + dist=dist, + method=method, + offset="1 mm/d", + ) + spei1 = xci.standardized_precipitation_evapotranspiration_index( + wb.sel(time=slice("1998", "2000")), params=params + ) + + spei2 = xci.standardized_precipitation_evapotranspiration_index( + wb, + freq=freq, + window=window, + dist=dist, + method=method, + offset="1 mm/d", + cal_start="1950", + cal_end="1980", + ).sel(time=slice("1998", "2000")) + + # In the previous computation, the first {window-1} values are NaN because the rolling is performed on the period [1998,2000]. + # Here, the computation is performed on the period [1950,2000], *then* subsetted to [1998,2000], so it doesn't have NaNs + # for the first values + spei2[{"time": slice(0, window - 1)}] = np.nan + + np.testing.assert_allclose(spei1.values, spei2.values, rtol=0, atol=1e-4) + class TestDailyFreezeThawCycles: @pytest.mark.parametrize( diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index ab6d48f33..7159f56bb 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1194,7 +1194,7 @@ def wrap_cdf_ppf(da, params, group_idxs): return np.zeros_like(da) * np.NaN spi = np.zeros_like(da) - # loop over groups + # loop over groups (similar to np.groupies idea, but np.groupies can't be used with custom functions) for ii, group_idx in enumerate(list(dict.fromkeys(group_idxs).keys())): pars = params[ii, :] indices = np.argwhere(group_idxs == group_idx) @@ -1353,13 +1353,11 @@ def standardized_precipitation_index( pr, _ = _preprocess_standardized_index(pr, freq=freq, window=window, **indexer) if params is None: params = standardized_index_fit_params( - pr, + pr.sel(time=slice(cal_start, cal_end)), freq=None, window=1, dist=dist, method=method, - cal_start=cal_start, - cal_end=cal_end, ) spi = _get_standardized_index(pr, params, **indexer) spi.attrs = params.attrs From fec71df816093260c1ed80c71c96483a90869877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Fri, 21 Jul 2023 15:57:56 -0400 Subject: [PATCH 29/66] update CHANGES --- CHANGES.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 7bd632de3..7e2b7802d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,7 +4,7 @@ Changelog v0.45.0 (unreleased) -------------------- -Contributors to this version: David Huard (:user:`huard`), Trevor James Smith (:user:`Zeitsperre`). +Contributors to this version: David Huard (:user:`huard`), Trevor James Smith (:user:`Zeitsperre`), Éric Dupuis (:user:`coxipi`). New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -17,6 +17,11 @@ Internal changes New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * Standardized indices (``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index``) are faster. (:issue:`1270` ,:pull:`1311`). +* Standardized indicators can be computed in two steps: First compute fit parameters with ``xclim.indices.standardized_index_fit_params``, then use the output in the standardized indices functions. (:issue:`1270` ,:pull:`1311`). + +Breaking changes +^^^^^^^^^^^^^^^^ +* ``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index`` will no longer accept two datasets (data and calibration data). Instead, a single dataset along start and end of calibration dates (`cal_start`, `cal_end`) is expected. (:issue:`1270` ,:pull:`1311`) v0.44.0 (2023-06-23) -------------------- From 993b639c27b4a200ce01272ffc44f5ea5f5b3ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Fri, 21 Jul 2023 16:13:09 -0400 Subject: [PATCH 30/66] remove comments --- xclim/indices/_agro.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 7159f56bb..864bccbbd 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1223,16 +1223,6 @@ def wrap_cdf_ppf(da, params, group_idxs): ) -# TODO : ADD -""" -streamflow index (SSI) GEV, log-logistic (Vincente-Serrano et al., 2012) Tweedie (Barker et al., 2016) (Station de suivi) -groundwater index (SGI) log-normal, gamma, GEV (Bloomfield et Marchant, 2013) (Station de suivi) -NEW dist: GEV, log-normal -https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.genextreme.html#scipy.stats.genextreme -https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.lognorm.html -""" - - @declare_units( pr="[precipitation]", pr_cal="[precipitation]", From c8891c9197b530a36496c87d10828426da135ae5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Thu, 27 Jul 2023 09:33:49 -0400 Subject: [PATCH 31/66] clipped std_index values --- CHANGES.rst | 6 ++++-- xclim/indices/_agro.py | 11 +++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 7e2b7802d..d2f131848 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -16,8 +16,10 @@ Internal changes New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Standardized indices (``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index``) are faster. (:issue:`1270` ,:pull:`1311`). -* Standardized indicators can be computed in two steps: First compute fit parameters with ``xclim.indices.standardized_index_fit_params``, then use the output in the standardized indices functions. (:issue:`1270` ,:pull:`1311`). +* Standardized indices (``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index``) (:issue:`1270`, :issue:`1416`, ,:pull:`1311`) were changed: + * More optimized + * Can be computed in two steps: First compute fit parameters with ``xclim.indices.standardized_index_fit_params``, then use the output in the standardized indices functions. + * The standardized index values are now clipped by ± 8.21. This reflects the float64 precision of the computation when cdf values are inverted to a normal distribution. Breaking changes ^^^^^^^^^^^^^^^^ diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 864bccbbd..5d8104fcf 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1209,7 +1209,7 @@ def wrap_cdf_ppf(da, params, group_idxs): return spi group_idxs = da.time.dt.month if group_name == "month" else da.time.dt.dayofyear - return xarray.apply_ufunc( + std_index = xarray.apply_ufunc( wrap_cdf_ppf, da, params, @@ -1221,6 +1221,11 @@ def wrap_cdf_ppf(da, params, group_idxs): output_dtypes=[da.dtype], vectorize=True, ) + # A cdf value of 0 or 1 gives ±np.inf when inverted to the normal distribution. + # The neighbouring values 0.00...1 and 0.99...9 with a float64 give a standardized index of ± 8.21 + # We use this index as reference for maximal/minimal standardized values + std_index = std_index.clip(-8.21, 8.21) + return std_index @declare_units( @@ -1285,6 +1290,8 @@ def standardized_precipitation_index( * Supported statistical distributions are: ["gamma", "fisk"], where "fisk" is scipy's implementation of a log-logistic distribution * If `params` is given as input, it overrides the `cal_start`, `cal_end`, `freq` and `window`, `dist` and `method` options. + * The standardized index is bounded by ±8.21. 8.21 is the largest standardized index as constrained by the float64 precision in \ + the inversion to the normal distribution. Example ------- @@ -1327,7 +1334,7 @@ def standardized_precipitation_index( if pr_cal is not None: warnings.warn( - "Inputing a calibration array will be deprecated in xclim==0.46.0. For example, if `pr_cal` is a subset of `pr`, then instead of say:\n" + "Inputing a calibration array will be deprecated in xclim>=0.46.0. For example, if `pr_cal` is a subset of `pr`, then instead of say:\n" "`standardized_precipitation_index(pr=pr,pr_cal=pr.sel(time=slice(t0,t1)),...)`,\n one can call:\n" "`standardized_precipitation_index(pr=pr,cal_range=(t0,t1),...).\n" "If for some reason `pr_cal` is not a subset of `pr`, then the following approach will still be possible:\n" From 486d5fe02432c896e33ff8dd2c5203c70ececd1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= <71575674+coxipi@users.noreply.github.com> Date: Thu, 27 Jul 2023 12:38:46 -0400 Subject: [PATCH 32/66] format docstring Co-authored-by: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> --- xclim/indices/_agro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 5d8104fcf..85c0da1a8 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1290,7 +1290,7 @@ def standardized_precipitation_index( * Supported statistical distributions are: ["gamma", "fisk"], where "fisk" is scipy's implementation of a log-logistic distribution * If `params` is given as input, it overrides the `cal_start`, `cal_end`, `freq` and `window`, `dist` and `method` options. - * The standardized index is bounded by ±8.21. 8.21 is the largest standardized index as constrained by the float64 precision in \ + * The standardized index is bounded by ±8.21. 8.21 is the largest standardized index as constrained by the float64 precision in the inversion to the normal distribution. Example From 852bb07a0c2d8ec3849f40efc19e5bb14ae838d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= <71575674+coxipi@users.noreply.github.com> Date: Thu, 27 Jul 2023 12:44:50 -0400 Subject: [PATCH 33/66] Better description CHANGES.rst Co-authored-by: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> --- CHANGES.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index c33363ba6..8979b16e2 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -24,10 +24,10 @@ Internal changes New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Standardized indices (``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index``) (:issue:`1270`, :issue:`1416`, ,:pull:`1311`) were changed: - * More optimized +* Standardized indices (``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index``) (:issue:`1270`, :issue:`1416`, :pull:`1311`) were changed: + * More optimized and noticeably faster calculation. * Can be computed in two steps: First compute fit parameters with ``xclim.indices.standardized_index_fit_params``, then use the output in the standardized indices functions. - * The standardized index values are now clipped by ± 8.21. This reflects the float64 precision of the computation when cdf values are inverted to a normal distribution. + * The standardized index values are now clipped to ± 8.21. This reflects the ``float64`` precision of the computation when cumulative distributed function values are inverted to a normal distribution. Breaking changes ^^^^^^^^^^^^^^^^ From f62cf04b5a3490310a3c6a5ab08093fc5737bc11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Tue, 1 Aug 2023 10:39:52 -0400 Subject: [PATCH 34/66] remove unrelated comments --- xclim/sdba/_adjustment.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/xclim/sdba/_adjustment.py b/xclim/sdba/_adjustment.py index 06987aecd..6ee01d656 100644 --- a/xclim/sdba/_adjustment.py +++ b/xclim/sdba/_adjustment.py @@ -46,22 +46,6 @@ def dqm_train(ds, *, dim, kind, quantiles) -> xr.Dataset: return xr.Dataset(data_vars=dict(af=af, hist_q=hist_q, scaling=scaling)) -# @ensure_longest_doy -# def get_correction2(x: xr.DataArray, y: xr.DataArray, kind: str) -> xr.DataArray: -# """Return the additive or multiplicative correction/adjustment factors.""" -# with xr.set_options(keep_attrs=True): -# if kind == ADDITIVE: -# out = y - x -# elif kind == MULTIPLICATIVE: -# out = y / x -# else: -# raise ValueError("kind must be + or *.") - -# if isinstance(out, xr.DataArray): -# out.attrs["kind"] = kind -# return out - - @map_groups( af=[Grouper.PROP, "quantiles"], hist_q=[Grouper.PROP, "quantiles"], From 64052a509a612a0f1764aab60d34d5d1847a587e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Tue, 1 Aug 2023 10:57:29 -0400 Subject: [PATCH 35/66] more docuemntation on std_index bounds --- xclim/indices/_agro.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 85c0da1a8..d7d219526 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1223,7 +1223,11 @@ def wrap_cdf_ppf(da, params, group_idxs): ) # A cdf value of 0 or 1 gives ±np.inf when inverted to the normal distribution. # The neighbouring values 0.00...1 and 0.99...9 with a float64 give a standardized index of ± 8.21 - # We use this index as reference for maximal/minimal standardized values + # We use this index as reference for maximal/minimal standardized values. + # Smaller values of standardized index could also be used as bounds. For example, [Guttman, 1999] + # notes that precipitation events (over a month/season) generally occur less than 100 times in the US. + # Values of standardized index outside the range [-3.09, 3.09] correspond to probabilities smaller than 0.001 + # or greater than 0.999, which could be considered non-statistically relevant for this sample size. std_index = std_index.clip(-8.21, 8.21) return std_index From abdc00b31d1e75f8be3e7b466de85d18e2e25cb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 16 Aug 2023 10:53:50 -0400 Subject: [PATCH 36/66] Broadcast params if needed --- xclim/indices/_agro.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index d7d219526..5394b2c78 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1360,6 +1360,13 @@ def standardized_precipitation_index( dist=dist, method=method, ) + + # If params only contains a subset of main dataset time grouping + # (e.g. 8/12 months, etc.), it needs to be broadcasted + paramsd = {k: v for k, v in params.sizes.items() if k != "dparams"} + if paramsd != pr.sizes: + params = params.broadcast_like(pr) + spi = _get_standardized_index(pr, params, **indexer) spi.attrs = params.attrs spi.attrs["units"] = "" From 2b26e88a3f48d9db18183e46f75b6718d8530362 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 16 Aug 2023 11:08:12 -0400 Subject: [PATCH 37/66] Use template to broadcast --- xclim/indices/_agro.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 5394b2c78..867c405c2 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1363,9 +1363,10 @@ def standardized_precipitation_index( # If params only contains a subset of main dataset time grouping # (e.g. 8/12 months, etc.), it needs to be broadcasted + template = pr.groupby(params.attrs["group"]).apply(lambda x: x.isel(time=0)) paramsd = {k: v for k, v in params.sizes.items() if k != "dparams"} - if paramsd != pr.sizes: - params = params.broadcast_like(pr) + if paramsd != template.sizes: + params = params.broadcast_like(template) spi = _get_standardized_index(pr, params, **indexer) spi.attrs = params.attrs From 6bba2015dd31ef0f11fd98185c15bad58eb8739b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 16 Aug 2023 11:09:17 -0400 Subject: [PATCH 38/66] better notation --- xclim/indices/_agro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 867c405c2..4ac296470 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1363,7 +1363,7 @@ def standardized_precipitation_index( # If params only contains a subset of main dataset time grouping # (e.g. 8/12 months, etc.), it needs to be broadcasted - template = pr.groupby(params.attrs["group"]).apply(lambda x: x.isel(time=0)) + template = pr.groupby(params.attrs["group"]).apply(lambda da: da.isel(time=0)) paramsd = {k: v for k, v in params.sizes.items() if k != "dparams"} if paramsd != template.sizes: params = params.broadcast_like(template) From ea1d3ea4dbd630903d68522e6884292208e1a61f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 16 Aug 2023 11:24:55 -0400 Subject: [PATCH 39/66] correct function name in docstring test --- xclim/indices/_agro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 4ac296470..ed657b214 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1314,7 +1314,7 @@ def standardized_precipitation_index( ... cal_end=cal_end, ... ) # Computing SPI-3 months using a gamma distribution for the fit >>> # Fitting parameters can also be obtained ... - >>> params = _standardized_index_fit_params( + >>> params = standardized_index_fit_params( ... pr.sel(time=slice(cal_start, cal_end)), ... freq="MS", ... window=3, From 485e71896b9aebceab91c7de24a2d88a85c32af8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= <71575674+coxipi@users.noreply.github.com> Date: Mon, 11 Sep 2023 15:55:25 -0400 Subject: [PATCH 40/66] Improve description of SPEI offset Co-authored-by: David Huard --- xclim/indices/_agro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index ed657b214..5785ddb93 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1430,7 +1430,7 @@ def standardized_precipitation_evapotranspiration_index( Fit parameters. The `params` can be computed using ``xclim.indices.standardized_index_fit_params`` in advance. The ouput can be given here as input, and it overrides other options. offset: Quantified - For distributions bounded by zero (e.g. "gamma", "fisk"): Water budget must be shifted, only positive values are allowed. + For distributions bounded by zero (e.g. "gamma", "fisk"), an offset must be added to the water budget to make sure there are no negative values. Keep the offset as small as possible to minimize its influence on the results. This can be given as a precipitation flux or a rate. indexer Indexing parameters to compute the indicator on a temporal subset of the data. From 8f1868ef72e5277426409e49229e1b0b2ea7bad6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Sep 2023 19:56:58 +0000 Subject: [PATCH 41/66] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xclim/indices/_agro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 5785ddb93..3108acab6 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1430,7 +1430,7 @@ def standardized_precipitation_evapotranspiration_index( Fit parameters. The `params` can be computed using ``xclim.indices.standardized_index_fit_params`` in advance. The ouput can be given here as input, and it overrides other options. offset: Quantified - For distributions bounded by zero (e.g. "gamma", "fisk"), an offset must be added to the water budget to make sure there are no negative values. Keep the offset as small as possible to minimize its influence on the results. + For distributions bounded by zero (e.g. "gamma", "fisk"), an offset must be added to the water budget to make sure there are no negative values. Keep the offset as small as possible to minimize its influence on the results. This can be given as a precipitation flux or a rate. indexer Indexing parameters to compute the indicator on a temporal subset of the data. From 06271eb0121390a57ac229dd5db3cdc0a6256ee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Mon, 11 Sep 2023 19:44:29 -0400 Subject: [PATCH 42/66] add notes about NaNs probable origin in SPEI --- xclim/indices/_agro.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 3108acab6..564eb559f 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1447,6 +1447,8 @@ def standardized_precipitation_evapotranspiration_index( Notes ----- + If results include NaNs, check that the `offset` parameter is larger than the minimum water budget values. + See Standardized Precipitation Index (SPI) for more details on usage. """ uses_default_offset = offset == "1 mm/d" From 18f346de06ee333a92e8fd3cf762dd43ec2b828c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Tue, 12 Sep 2023 09:04:47 -0400 Subject: [PATCH 43/66] Update CHANGES --- CHANGES.rst | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index acd25b3de..f4ab96a25 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,10 +6,22 @@ v0.46.0 -------------------- Contributors to this version: Éric Dupuis (:user:`coxipi`). +New features and enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +* Standardized indices (``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index``) (:issue:`1270`, :issue:`1416`, :issue:`1474`, :pull:`1311`) were changed: + * More optimized and noticeably faster calculation. + * Can be computed in two steps: First compute fit parameters with ``xclim.indices.standardized_index_fit_params``, then use the output in the standardized indices functions. + * The standardized index values are now clipped to ± 8.21. This reflects the ``float64`` precision of the computation when cumulative distributed function values are inverted to a normal distribution. + * An offset parameter is now accessible ``xclim.indices.standardized_precipitation_evapotranspiration_index``. + Internal changes ^^^^^^^^^^^^^^^^ * Change "degK" to "K" used to design Kelvin units. (:pull:`1475`). +Breaking changes +^^^^^^^^^^^^^^^^ +* ``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index`` will no longer accept two datasets (data and calibration data). Instead, a single dataset along start and end of calibration dates (`cal_start`, `cal_end`) is expected. (:issue:`1270` ,:pull:`1311`) + v0.45.0 (2023-09-05) -------------------- Contributors to this version: David Huard (:user:`huard`), Trevor James Smith (:user:`Zeitsperre`), Pascal Bourgault (:user:`aulemahal`), Juliette Lavoie (:user:`juliettelavoie`), Gabriel Rondeau-Genesse (:user:`RondeauG`), Marco Braun (:user:`vindelico`), Éric Dupuis (:user:`coxipi`). @@ -67,17 +79,6 @@ Breaking changes * Fix and adapt ``percentile_doy`` for an error raised by xarray > 2023.7.0. (:issue:`1417`, :pull:`1450`). * `integral` replaces `prod` and `delta_prod` as possible input in ``xclim.core.units.to_agg_units`` (:pull:`1443`, :issue:`1386`). -New features and enhancements -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Standardized indices (``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index``) (:issue:`1270`, :issue:`1416`, :pull:`1311`) were changed: - * More optimized and noticeably faster calculation. - * Can be computed in two steps: First compute fit parameters with ``xclim.indices.standardized_index_fit_params``, then use the output in the standardized indices functions. - * The standardized index values are now clipped to ± 8.21. This reflects the ``float64`` precision of the computation when cumulative distributed function values are inverted to a normal distribution. - -Breaking changes -^^^^^^^^^^^^^^^^ -* ``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index`` will no longer accept two datasets (data and calibration data). Instead, a single dataset along start and end of calibration dates (`cal_start`, `cal_end`) is expected. (:issue:`1270` ,:pull:`1311`) - v0.44.0 (2023-06-23) -------------------- Contributors to this version: Éric Dupuis (:user:`coxipi`), Trevor James Smith (:user:`Zeitsperre`), Pascal Bourgault (:user:`aulemahal`), Ludwig Lierhammer (:user:`ludwiglierhammer`), David Huard (:user:`huard`). From b15e5669cdc088919772d3ff910a59e5f52c8444 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= <71575674+coxipi@users.noreply.github.com> Date: Tue, 12 Sep 2023 13:56:18 -0400 Subject: [PATCH 44/66] Improve documentation (review) Co-authored-by: David Huard --- CHANGES.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index f4ab96a25..97dfd4f1b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -9,10 +9,10 @@ Contributors to this version: Éric Dupuis (:user:`coxipi`). New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * Standardized indices (``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index``) (:issue:`1270`, :issue:`1416`, :issue:`1474`, :pull:`1311`) were changed: - * More optimized and noticeably faster calculation. + * Optimized and noticeably faster calculation. * Can be computed in two steps: First compute fit parameters with ``xclim.indices.standardized_index_fit_params``, then use the output in the standardized indices functions. - * The standardized index values are now clipped to ± 8.21. This reflects the ``float64`` precision of the computation when cumulative distributed function values are inverted to a normal distribution. - * An offset parameter is now accessible ``xclim.indices.standardized_precipitation_evapotranspiration_index``. + * The standardized index values are now clipped to ±8.21. This reflects the ``float64`` precision of the computation when cumulative distributed function values are inverted to a normal distribution. + * An offset parameter is now available to account for negative water balance values``xclim.indices.standardized_precipitation_evapotranspiration_index``. Internal changes ^^^^^^^^^^^^^^^^ @@ -20,7 +20,7 @@ Internal changes Breaking changes ^^^^^^^^^^^^^^^^ -* ``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index`` will no longer accept two datasets (data and calibration data). Instead, a single dataset along start and end of calibration dates (`cal_start`, `cal_end`) is expected. (:issue:`1270` ,:pull:`1311`) +* ``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index`` will no longer accept two datasets (data and calibration data). Instead, a single dataset covering both the calibration and evaluation periods is expected. (:issue:`1270` ,:pull:`1311`) v0.45.0 (2023-09-05) -------------------- From 25c1d0111f96a4aa88ff5f148cdb98df13ca1af4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= <71575674+coxipi@users.noreply.github.com> Date: Tue, 12 Sep 2023 13:58:57 -0400 Subject: [PATCH 45/66] Improve documentation (review) 2/2 --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 97dfd4f1b..787c07882 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -11,7 +11,7 @@ New features and enhancements * Standardized indices (``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index``) (:issue:`1270`, :issue:`1416`, :issue:`1474`, :pull:`1311`) were changed: * Optimized and noticeably faster calculation. * Can be computed in two steps: First compute fit parameters with ``xclim.indices.standardized_index_fit_params``, then use the output in the standardized indices functions. - * The standardized index values are now clipped to ±8.21. This reflects the ``float64`` precision of the computation when cumulative distributed function values are inverted to a normal distribution. + * The standardized index values are now clipped to ±8.21. This reflects the ``float64`` precision of the computation when cumulative distributed function values are inverted to a normal distribution and avoids returning infinite values. * An offset parameter is now available to account for negative water balance values``xclim.indices.standardized_precipitation_evapotranspiration_index``. Internal changes From a2283bf30465b5febcb89bb75041d8a78a263214 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Tue, 12 Sep 2023 14:24:41 -0400 Subject: [PATCH 46/66] add docstring for _get_standardized_index --- xclim/indices/_agro.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 20db02ef2..76ffc9fbf 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1185,6 +1185,23 @@ def wrap_fit(da): def _get_standardized_index(da, params, **indexer): + """Compute standardized index for given fit parameters + + This computes standardized indices which measure the deviation of variables in the dataset compared + to a reference distribution. The reference is statistical distribution computed with fitting parameters `params` + over a given calibration period of the dataset. Those fitting parameters are obtained with + ``xclim.standardized_index_fit_params``. + + Parameters + ---------- + da : xarray.DataArray + Input array. + params: xarray.DataArray + Fit parameters computed using ``xclim.indices.standardized_index_fit_params``. + indexer + Indexing parameters to compute the indicator on a temporal subset of the data. + It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. + """ scipy_dist = get_dist(params.attrs["scipy_dist"]) group_name = params.attrs["group"].rsplit(".")[-1] param_names = params.dparams.values From 693f5881ae4bd518b7e627ddd75fde82edd74157 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Tue, 12 Sep 2023 14:26:42 -0400 Subject: [PATCH 47/66] typo in docsring --- xclim/indices/_agro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 76ffc9fbf..6853bc69a 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1188,7 +1188,7 @@ def _get_standardized_index(da, params, **indexer): """Compute standardized index for given fit parameters This computes standardized indices which measure the deviation of variables in the dataset compared - to a reference distribution. The reference is statistical distribution computed with fitting parameters `params` + to a reference distribution. The reference is a statistical distribution computed with fitting parameters `params` over a given calibration period of the dataset. Those fitting parameters are obtained with ``xclim.standardized_index_fit_params``. From 211c955fc279f6d05dccbaada6c7283ec2bdb25e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 18 Oct 2023 14:42:02 -0400 Subject: [PATCH 48/66] simpler loop over group_idx --- xclim/indices/_agro.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 6853bc69a..a8d19bf5b 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1070,11 +1070,19 @@ def _get_rain_season(pram): def _preprocess_standardized_index(da, freq, window, **indexer): - """ - Resample and roll operations involved when computing a standardized index. + """Perform resample and roll operations involved in computing a standardized index. - The `group` variable returned is used in standardized index computations and to keep track if - preprocessing was already done or not. + da : xarray.DataArray + Input array. + freq : str | None + Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling + has already been applied input dataset and will skip the resampling step. + window : int + Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, + i.e. a monthly resampling, the window is an integer number of months. + indexer + Indexing parameters to compute the indicator on a temporal subset of the data. + It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. """ _, base, _, _ = parse_offset(freq or xarray.infer_freq(da.time)) try: @@ -1165,11 +1173,11 @@ def wrap_fit(da): return fitted.broadcast_like(da.isel(time=0, drop=True)) return fit(da, dist, method) + params = da.groupby(group).map(wrap_fit) cal_range = ( da.time.min().dt.strftime("%Y-%m-%d"), da.time.max().dt.strftime("%Y-%m-%d"), ) - params = da.groupby(group).map(wrap_fit) params.attrs = { "Calibration period": cal_range, "freq": freq, @@ -1212,8 +1220,8 @@ def wrap_cdf_ppf(da, params, group_idxs): spi = np.zeros_like(da) # loop over groups (similar to np.groupies idea, but np.groupies can't be used with custom functions) - for ii, group_idx in enumerate(list(dict.fromkeys(group_idxs).keys())): - pars = params[ii, :] + for group_idx in list(dict.fromkeys(group_idxs).keys()): + pars = params[group_idx - 1] indices = np.argwhere(group_idxs == group_idx) vals = da[indices] zeros = (vals == 0).sum(axis=0) From b3d3713dfa446572c228296c5198b4ed5256d01e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 18 Oct 2023 14:51:11 -0400 Subject: [PATCH 49/66] more comments, update docstrings --- xclim/indices/_agro.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index f2a7e7a96..028da757f 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1179,6 +1179,7 @@ def standardized_index_fit_params( da, group = _preprocess_standardized_index(da, freq, window, **indexer) def wrap_fit(da): + # if all values are null, obtain a template of the fit and broadcast if indexer != {} and da.isnull().all(): select_dims = {d: 0 if d != "time" else [0, 1] for d in da.dims} fitted = fit(da[select_dims], dist, method) @@ -1226,6 +1227,8 @@ def _get_standardized_index(da, params, **indexer): group_name = params.attrs["group"].rsplit(".")[-1] param_names = params.dparams.values + # transform `da` values to a cdf distribution with fitted parameters `params` + # then invert back to a normal distrubtion (ppf) def wrap_cdf_ppf(da, params, group_idxs): if indexer != {} and np.isnan(da).all(): return np.zeros_like(da) * np.NaN @@ -1245,6 +1248,7 @@ def wrap_cdf_ppf(da, params, group_idxs): spi[indices] = norm.ppf(probs) return spi + # only datasets with daily or monthly values are supported group_idxs = da.time.dt.month if group_name == "month" else da.time.dt.dayofyear std_index = xarray.apply_ufunc( wrap_cdf_ppf, @@ -1294,7 +1298,7 @@ def standardized_precipitation_index( Daily precipitation. pr_cal : xarray.DataArray Daily precipitation used for calibration. Usually this is a temporal subset of `pr` over some reference period. This - option will be removed in xclim==0.46.0. Two behaviour will be possible (see below) + option will be removed in xclim >=0.47.0. Two behaviour will be possible (see below) freq : str | None Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling has already been applied input dataset and will skip the resampling step. @@ -1375,7 +1379,7 @@ def standardized_precipitation_index( if pr_cal is not None: warnings.warn( - "Inputing a calibration array will be deprecated in xclim>=0.46.0. For example, if `pr_cal` is a subset of `pr`, then instead of say:\n" + "Inputing a calibration array will be deprecated in xclim>=0.47.0. For example, if `pr_cal` is a subset of `pr`, then instead of say:\n" "`standardized_precipitation_index(pr=pr,pr_cal=pr.sel(time=slice(t0,t1)),...)`,\n one can call:\n" "`standardized_precipitation_index(pr=pr,cal_range=(t0,t1),...).\n" "If for some reason `pr_cal` is not a subset of `pr`, then the following approach will still be possible:\n" @@ -1442,7 +1446,7 @@ def standardized_precipitation_evapotranspiration_index( Daily water budget (pr - pet). wb_cal : xarray.DataArray Daily water budget used for calibration. Usually this is a temporal subset of `wb` over some reference period. This - option will be removed in xclim==0.46.0. Two behaviour will be possible (see below) + option will be removed in xclim >=0.47.0. Two behaviour will be possible (see below) params: xarray.DataArray Fit parameters. freq : str | None From f6acc4564f03e99c6469e0836f4f48a0f9adb87f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 18 Oct 2023 15:14:28 -0400 Subject: [PATCH 50/66] more simple management of the SPEI offset --- xclim/indices/_agro.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 028da757f..84a46068a 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1428,7 +1428,7 @@ def standardized_precipitation_evapotranspiration_index( window: int | None = 1, dist: str | None = "gamma", method: str | None = "APP", - offset: Quantified = "1 mm/d", + offset: Quantified = "1.000 mm/d", cal_start: DateStr | None = None, cal_end: DateStr | None = None, params: Quantified | None = None, @@ -1492,29 +1492,25 @@ def standardized_precipitation_evapotranspiration_index( See Standardized Precipitation Index (SPI) for more details on usage. """ - uses_default_offset = offset == "1 mm/d" - offset = 0 if offset is None else convert_units_to(offset, wb, context="hydro") + uses_default_offset = offset == "1.000 mm/d" if params is not None: - params_offset = ( - 0 - if params.attrs["offset"] is None - else convert_units_to(params.attrs["offset"], wb, context="hydro") - ) - if params_offset != offset: - extra_msg = ( - " (using default SPEI value 1 mm/d)" if uses_default_offset else "" - ) + params_offset = params.attrs["offset"] + if uses_default_offset is False and offset != params_offset: warnings.warn( - f"The offset in `params` differs from the input `offset`{extra_msg}." + "The offset in `params` differs from the input `offset`." "Procceding with the value given in `params`." ) - offset = params_offset + offset = params_offset + offset = 0 if offset is None else convert_units_to(offset, wb, context="hydro") # Allowed distributions are constrained by the SPI function if dist in ["gamma", "fisk"] and offset <= 0: raise ValueError( "The water budget must be shifted towards positive values to be used with `gamma` and `fisk` " f"distributions which are bounded by zero. A positive offset is required. Current value: {offset}{wb.attrs['units']}" ) + # Note that the default behaviour would imply an offset for any distribution, even those distributions + # that can accomodate negative values of the water budget. This needs to be changed in future versions + # of the index. if offset != 0: with xarray.set_options(keep_attrs=True): wb = wb + offset From c717169e17a9ca8625303d7196bff6f1ce82dd6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= <71575674+coxipi@users.noreply.github.com> Date: Wed, 18 Oct 2023 17:40:03 -0400 Subject: [PATCH 51/66] Better doc & simplifications (review) Co-authored-by: Pascal Bourgault --- xclim/indices/_agro.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 1f0ed1c07..b18be2ce4 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1092,7 +1092,7 @@ def _preprocess_standardized_index(da, freq, window, **indexer): da : xarray.DataArray Input array. - freq : str | None + freq : {D, MS}, optional Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling has already been applied input dataset and will skip the resampling step. window : int @@ -1198,7 +1198,7 @@ def wrap_fit(da): da.time.max().dt.strftime("%Y-%m-%d"), ) params.attrs = { - "Calibration period": cal_range, + "calibration_period": cal_range, "freq": freq, "window": window, "scipy_dist": dist, @@ -1230,7 +1230,7 @@ def _get_standardized_index(da, params, **indexer): It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. """ scipy_dist = get_dist(params.attrs["scipy_dist"]) - group_name = params.attrs["group"].rsplit(".")[-1] + group_name = params.attrs["group"].split(".")[-1] param_names = params.dparams.values # transform `da` values to a cdf distribution with fitted parameters `params` @@ -1410,7 +1410,7 @@ def standardized_precipitation_index( # If params only contains a subset of main dataset time grouping # (e.g. 8/12 months, etc.), it needs to be broadcasted - template = pr.groupby(params.attrs["group"]).apply(lambda da: da.isel(time=0)) + template = pr.groupby(params.attrs["group"]).first() paramsd = {k: v for k, v in params.sizes.items() if k != "dparams"} if paramsd != template.sizes: params = params.broadcast_like(template) From 19deb92610b2a09b742b3c02bb7196a16214982e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= <71575674+coxipi@users.noreply.github.com> Date: Thu, 19 Oct 2023 09:49:40 -0400 Subject: [PATCH 52/66] only accept "D" & "MS" freqs Co-authored-by: Pascal Bourgault --- xclim/indices/_agro.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index b18be2ce4..5090861f2 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1102,9 +1102,8 @@ def _preprocess_standardized_index(da, freq, window, **indexer): Indexing parameters to compute the indicator on a temporal subset of the data. It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. """ - _, base, _, _ = parse_offset(freq or xarray.infer_freq(da.time)) try: - group = {"D": "time.dayofyear", "M": "time.month"}[base] + group = {"D": "time.dayofyear", "MS": "time.month", None: None}[freq] except KeyError(): raise ValueError(f"Standardized index with frequency `{freq}` not supported.") From 2aefe81c2f33eadd2c724a5f593a3241cd0874dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Thu, 19 Oct 2023 18:20:33 -0400 Subject: [PATCH 53/66] put std_index functions in stats.py, get rid of group_idx loop --- xclim/indices/_agro.py | 211 +++-------------------------------------- xclim/indices/stats.py | 208 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 219 insertions(+), 200 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 5090861f2..ca0084558 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -5,10 +5,9 @@ import numpy as np import xarray -from scipy.stats import norm import xclim.indices.run_length as rl -from xclim.core.calendar import parse_offset, resample_doy, select_time +from xclim.core.calendar import parse_offset, select_time from xclim.core.units import ( amount2lwethickness, convert_units_to, @@ -16,7 +15,7 @@ rate2amount, to_agg_units, ) -from xclim.core.utils import DateStr, DayOfYearStr, Quantified, uses_dask +from xclim.core.utils import DateStr, DayOfYearStr, Quantified from xclim.indices._conversion import potential_evapotranspiration from xclim.indices._simple import tn_min from xclim.indices._threshold import ( @@ -25,7 +24,11 @@ ) from xclim.indices.generic import aggregate_between_dates, get_zones from xclim.indices.helpers import _gather_lat, day_lengths -from xclim.indices.stats import dist_method, fit, get_dist +from xclim.indices.stats import ( + preprocess_standardized_index, + standardized_index, + standardized_index_fit_params, +) # Frequencies : YS: year start, QS-DEC: seasons starting in december, MS: month start # See https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html @@ -1087,197 +1090,6 @@ def _get_rain_season(pram): return out["rain_season_start"], out["rain_season_end"], out["rain_season_length"] -def _preprocess_standardized_index(da, freq, window, **indexer): - """Perform resample and roll operations involved in computing a standardized index. - - da : xarray.DataArray - Input array. - freq : {D, MS}, optional - Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling - has already been applied input dataset and will skip the resampling step. - window : int - Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, - i.e. a monthly resampling, the window is an integer number of months. - indexer - Indexing parameters to compute the indicator on a temporal subset of the data. - It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. - """ - try: - group = {"D": "time.dayofyear", "MS": "time.month", None: None}[freq] - except KeyError(): - raise ValueError(f"Standardized index with frequency `{freq}` not supported.") - - if freq: - da = da.resample(time=freq).mean(keep_attrs=True) - - if uses_dask(da) and len(da.chunks[da.get_axis_num("time")]) > 1: - warnings.warn( - "The input data is chunked on time dimension and must be fully rechunked to" - " run `fit` on groups ." - " Beware, this operation can significantly increase the number of tasks dask" - " has to handle.", - stacklevel=2, - ) - da = da.chunk({"time": -1}) - - if window > 1: - da = da.rolling(time=window).mean(skipna=False, keep_attrs=True) - - # The time reduction must be done after the rolling - da = select_time(da, **indexer) - return da, group - - -# TODO: Find a more generic place for this, SPX indices are not exclusive to _agro -def standardized_index_fit_params( - da: xarray.DataArray, - freq: str | None, - window: int | None, - dist: str | None, - method: str | None, - offset: Quantified | None = None, - **indexer, -) -> xarray.DataArray: - """Standardized Index fitting parameters. - - A standardized index measures the deviation of a variable averaged over a rolling temporal window and - fitted with a given distribution `dist` with respect to a calibration dataset. The comparison is done by porting - back results to a normalized distribution. The fitting parameters of the calibration dataset fitted with `dist` - are obtained here. - - Parameters - ---------- - da : xarray.DataArray - Input array. - freq : str | None - Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling - has already been applied input dataset and will skip the resampling step. - window : int - Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, - i.e. a monthly resampling, the window is an integer number of months. - dist : str - Name of the univariate distribution. - (see :py:mod:`scipy.stats`). - method : str - Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method - uses a deterministic function that doesn't involve any optimization. - offset: Quantified - Distributions bounded by zero (e.g. "gamma", "fisk") can be used for datasets with negative values by using a offset: - `da + offset` - indexer - Indexing parameters to compute the indicator on a temporal subset of the data. - It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. - """ - # "WPM" method doesn't seem to work for gamma or pearson3 - dist_and_methods = {"gamma": ["ML", "APP"], "fisk": ["ML", "APP"]} - if dist not in dist_and_methods: - raise NotImplementedError(f"The distribution `{dist}` is not supported.") - if method not in dist_and_methods[dist]: - raise NotImplementedError( - f"The method `{method}` is not supported for distribution `{dist}`." - ) - - if offset: - with xarray.set_options(keep_attrs=True): - da = da + convert_units_to(offset, da, context="hydro") - - da, group = _preprocess_standardized_index(da, freq, window, **indexer) - - def wrap_fit(da): - # if all values are null, obtain a template of the fit and broadcast - if indexer != {} and da.isnull().all(): - select_dims = {d: 0 if d != "time" else [0, 1] for d in da.dims} - fitted = fit(da[select_dims], dist, method) - return fitted.broadcast_like(da.isel(time=0, drop=True)) - return fit(da, dist, method) - - params = da.groupby(group).map(wrap_fit) - cal_range = ( - da.time.min().dt.strftime("%Y-%m-%d"), - da.time.max().dt.strftime("%Y-%m-%d"), - ) - params.attrs = { - "calibration_period": cal_range, - "freq": freq, - "window": window, - "scipy_dist": dist, - "method": method, - "group": group, - "time_indexer": indexer, - "units": "", - "offset": offset, - } - return params - - -def _get_standardized_index(da, params, **indexer): - """Compute standardized index for given fit parameters - - This computes standardized indices which measure the deviation of variables in the dataset compared - to a reference distribution. The reference is a statistical distribution computed with fitting parameters `params` - over a given calibration period of the dataset. Those fitting parameters are obtained with - ``xclim.standardized_index_fit_params``. - - Parameters - ---------- - da : xarray.DataArray - Input array. - params: xarray.DataArray - Fit parameters computed using ``xclim.indices.standardized_index_fit_params``. - indexer - Indexing parameters to compute the indicator on a temporal subset of the data. - It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. - """ - scipy_dist = get_dist(params.attrs["scipy_dist"]) - group_name = params.attrs["group"].split(".")[-1] - param_names = params.dparams.values - - # transform `da` values to a cdf distribution with fitted parameters `params` - # then invert back to a normal distrubtion (ppf) - def wrap_cdf_ppf(da, params, group_idxs): - if indexer != {} and np.isnan(da).all(): - return np.zeros_like(da) * np.NaN - - spi = np.zeros_like(da) - # loop over groups (similar to np.groupies idea, but np.groupies can't be used with custom functions) - for group_idx in list(dict.fromkeys(group_idxs).keys()): - pars = params[group_idx - 1] - indices = np.argwhere(group_idxs == group_idx) - vals = da[indices] - zeros = (vals == 0).sum(axis=0) - vals[vals == 0] = np.NaN - paramsd = {param_names[jj]: pars[jj] for jj in range(len(pars))} - probs_of_zero = zeros / vals.shape[0] - dist_probs = scipy_dist.cdf(vals, **paramsd) - probs = probs_of_zero + ((1 - probs_of_zero) * dist_probs) - spi[indices] = norm.ppf(probs) - return spi - - # only datasets with daily or monthly values are supported - group_idxs = da.time.dt.month if group_name == "month" else da.time.dt.dayofyear - std_index = xarray.apply_ufunc( - wrap_cdf_ppf, - da, - params, - group_idxs, - input_core_dims=[["time"], [group_name, "dparams"], ["time"]], - output_core_dims=[["time"]], - dask="parallelized", - dask_gufunc_kwargs={"allow_rechunk": True}, - output_dtypes=[da.dtype], - vectorize=True, - ) - # A cdf value of 0 or 1 gives ±np.inf when inverted to the normal distribution. - # The neighbouring values 0.00...1 and 0.99...9 with a float64 give a standardized index of ± 8.21 - # We use this index as reference for maximal/minimal standardized values. - # Smaller values of standardized index could also be used as bounds. For example, [Guttman, 1999] - # notes that precipitation events (over a month/season) generally occur less than 100 times in the US. - # Values of standardized index outside the range [-3.09, 3.09] correspond to probabilities smaller than 0.001 - # or greater than 0.999, which could be considered non-statistically relevant for this sample size. - std_index = std_index.clip(-8.21, 8.21) - return std_index - - @declare_units( pr="[precipitation]", pr_cal="[precipitation]", @@ -1394,10 +1206,10 @@ def standardized_precipitation_index( "standardized indices." ) params = standardized_index_fit_params( - pr_cal, freq=freq, window=window, dist=dist, method=method + pr_cal, freq=freq, window=window, dist=dist, method=method, **indexer ) - pr, _ = _preprocess_standardized_index(pr, freq=freq, window=window, **indexer) + pr, _ = preprocess_standardized_index(pr, freq=freq, window=window, **indexer) if params is None: params = standardized_index_fit_params( pr.sel(time=slice(cal_start, cal_end)), @@ -1414,7 +1226,7 @@ def standardized_precipitation_index( if paramsd != template.sizes: params = params.broadcast_like(template) - spi = _get_standardized_index(pr, params, **indexer) + spi = standardized_index(pr, params, **indexer) spi.attrs = params.attrs spi.attrs["units"] = "" return spi @@ -1476,7 +1288,8 @@ def standardized_precipitation_evapotranspiration_index( Fit parameters. The `params` can be computed using ``xclim.indices.standardized_index_fit_params`` in advance. The ouput can be given here as input, and it overrides other options. offset: Quantified - For distributions bounded by zero (e.g. "gamma", "fisk"), an offset must be added to the water budget to make sure there are no negative values. Keep the offset as small as possible to minimize its influence on the results. + For distributions bounded by zero (e.g. "gamma", "fisk"), an offset must be added to the water budget + to make sure there are no negative values. Keep the offset as small as possible to minimize its influence on the results. This can be given as a precipitation flux or a rate. indexer Indexing parameters to compute the indicator on a temporal subset of the data. diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index 9df05a5be..8e4eb61a8 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -1,14 +1,19 @@ """Statistic-related functions. See the `frequency_analysis` notebook for examples.""" from __future__ import annotations +import warnings from typing import Any, Sequence +import flox.xarray as floxx import lmoments3.distr import numpy as np import xarray as xr +from scipy.stats import norm +from xclim.core.calendar import resample_doy, select_time from xclim.core.formatting import prefix_attrs, unprefix_attrs, update_history -from xclim.core.utils import uses_dask +from xclim.core.units import convert_units_to +from xclim.core.utils import Quantified, uses_dask from . import generic @@ -23,6 +28,9 @@ "get_lm3_dist", "parametric_cdf", "parametric_quantile", + "preprocess_standardized_index", + "standardized_index", + "standardized_index_fit_params", ] @@ -602,3 +610,201 @@ def dist_method( output_dtypes=[float], dask="parallelized", ) + + +def preprocess_standardized_index(da, freq, window, **indexer): + """Perform resample and roll operations involved in computing a standardized index. + + da : xarray.DataArray + Input array. + freq : {D, MS}, optional + Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling + has already been applied input dataset and will skip the resampling step. + window : int + Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, + i.e. a monthly resampling, the window is an integer number of months. + indexer + Indexing parameters to compute the indicator on a temporal subset of the data. + It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. + + Returns + ------- + xarray.DataArray, str + Processed array and time grouping corresponding to the final time frequency (following resampling if applicable). + """ + # We could allow a more general frequency in this function and move + # the constraint {"D", "MS"} in specific indices such as SPI / SPEI. + final_freq = freq or xr.infer_freq(da) + try: + group = {"D": "time.dayofyear", "MS": "time.month"}[final_freq] + except KeyError(): + raise ValueError( + f"The input (following resampling if applicable) has a frequency `{final_freq}`" + "which is not supported for standardized indices." + ) + + if freq: + da = da.resample(time=freq).mean(keep_attrs=True) + + if uses_dask(da) and len(da.chunks[da.get_axis_num("time")]) > 1: + warnings.warn( + "The input data is chunked on time dimension and must be fully rechunked to" + " run `fit` on groups ." + " Beware, this operation can significantly increase the number of tasks dask" + " has to handle.", + stacklevel=2, + ) + da = da.chunk({"time": -1}) + + if window > 1: + da = da.rolling(time=window).mean(skipna=False, keep_attrs=True) + + # The time reduction must be done after the rolling + da = select_time(da, **indexer) + return da, group + + +def standardized_index_fit_params( + da: xr.DataArray, + freq: str | None, + window: int | None, + dist: str | None, + method: str | None, + offset: Quantified | None = None, + **indexer, +) -> xr.DataArray: + """Standardized Index fitting parameters. + + A standardized index measures the deviation of a variable averaged over a rolling temporal window and + fitted with a given distribution `dist` with respect to a calibration dataset. The comparison is done by porting + back results to a normalized distribution. The fitting parameters of the calibration dataset fitted with `dist` + are obtained here. + + Parameters + ---------- + da : xarray.DataArray + Input array. + freq : str | None + Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling + has already been applied input dataset and will skip the resampling step. + window : int + Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, + i.e. a monthly resampling, the window is an integer number of months. + dist : str + Name of the univariate distribution. + (see :py:mod:`scipy.stats`). + method : str + Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method + uses a deterministic function that doesn't involve any optimization. + offset: Quantified + Distributions bounded by zero (e.g. "gamma", "fisk") can be used for datasets with negative values by using a offset: + `da + offset` + indexer + Indexing parameters to compute the indicator on a temporal subset of the data. + It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. + + Returns + ------- + xarray.DataArray + Standardized Index fitting parameters. The time dimension of the initial array is reduced to + + Notes + ----- + Supported combinations of `dist` and `method` are: + * Gamma ("gamma") : "ML", "APP", "PWM" + * Log-logistic ("fisk") : "ML", "APP" + + """ + # "WPM" method doesn't seem to work for gamma or pearson3 + dist_and_methods = {"gamma": ["ML", "APP", "PWM"], "fisk": ["ML", "APP"]} + if dist not in dist_and_methods: + raise NotImplementedError(f"The distribution `{dist}` is not supported.") + if method not in dist_and_methods[dist]: + raise NotImplementedError( + f"The method `{method}` is not supported for distribution `{dist}`." + ) + + if offset: + with xr.set_options(keep_attrs=True): + da = da + convert_units_to(offset, da, context="hydro") + + da, group = preprocess_standardized_index(da, freq, window, **indexer) + params = da.groupby(group).map(fit, dist=dist, method=method) + cal_range = ( + da.time.min().dt.strftime("%Y-%m-%d"), + da.time.max().dt.strftime("%Y-%m-%d"), + ) + params.attrs = { + "calibration_period": cal_range, + "freq": freq, + "window": window, + "scipy_dist": dist, + "method": method, + "group": group, + "time_indexer": indexer, + "units": "", + "offset": offset, + } + return params + + +def standardized_index(da, params): + """Compute standardized index for given fit parameters + + This computes standardized indices which measure the deviation of variables in the dataset compared + to a reference distribution. The reference is a statistical distribution computed with fitting parameters `params` + over a given calibration period of the dataset. Those fitting parameters are obtained with + ``xclim.standardized_index_fit_params``. + + Parameters + ---------- + da : xarray.DataArray + Input array. + params: xarray.DataArray + Fit parameters computed using ``xclim.indices.standardized_index_fit_params``. + """ + freq = params.attrs["freq"] or xr.infer_freq(da) + + def resample_to_time(da, da_ref): + if freq == "D": + da = resample_doy(da, da_ref) + elif freq == "MS": + da = da.rename(month="time").reindex(time=da_ref.time.dt.month) + da["time"] = da_ref.time + else: + raise NotImplementedError( + f"The frequency of the input `{freq}` is not supported." + ) + return da.chunk({"time": -1}) + + idxs = da.time.dt.month + probs_of_zero = floxx.xarray_reduce((da + 1) * (da == 0), idxs, func="mean") + params, probs_of_zero = ( + resample_to_time(dax, da) for dax in [params, probs_of_zero] + ) + + def wrap_cdf_ppf(da, pars, probs_of_zero): + dist_probs = get_dist(params.attrs["scipy_dist"]).cdf(da[:], *pars) + probs = probs_of_zero + ((1 - probs_of_zero) * dist_probs) + return norm.ppf(probs) + + std_index = xr.apply_ufunc( + wrap_cdf_ppf, + da, + params, + probs_of_zero, + input_core_dims=[["time"], ["dparams", "time"], ["time"]], + output_core_dims=[["time"]], + vectorize=True, + dask="parallelized", + ) + + # A cdf value of 0 or 1 gives ±np.inf when inverted to the normal distribution. + # The neighbouring values 0.00...1 and 0.99...9 with a float64 give a standardized index of ± 8.21 + # We use this index as reference for maximal/minimal standardized values. + # Smaller values of standardized index could also be used as bounds. For example, [Guttman, 1999] + # notes that precipitation events (over a month/season) generally occur less than 100 times in the US. + # Values of standardized index outside the range [-3.09, 3.09] correspond to probabilities smaller than 0.001 + # or greater than 0.999, which could be considered non-statistically relevant for this sample size. + std_index = std_index.clip(-8.21, 8.21) + return std_index From 96e320a471e893369a86c04aca8583992b2386c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Thu, 19 Oct 2023 18:26:11 -0400 Subject: [PATCH 54/66] non-hardcoded definition of group indices --- xclim/indices/stats.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index 8e4eb61a8..5df02eecd 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -777,7 +777,9 @@ def resample_to_time(da, da_ref): ) return da.chunk({"time": -1}) - idxs = da.time.dt.month + idxs = ( + da.time.dt.month if "month" in params.attrs["group"] else da.time.dt.dayofyear + ) probs_of_zero = floxx.xarray_reduce((da + 1) * (da == 0), idxs, func="mean") params, probs_of_zero = ( resample_to_time(dax, da) for dax in [params, probs_of_zero] From ffa1e1081de4a22620fcce2e2701e10dbdb2572e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Fri, 20 Oct 2023 10:39:40 -0400 Subject: [PATCH 55/66] correct infer_freq usage --- xclim/indices/stats.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index 5df02eecd..b5f6e9518 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -634,7 +634,7 @@ def preprocess_standardized_index(da, freq, window, **indexer): """ # We could allow a more general frequency in this function and move # the constraint {"D", "MS"} in specific indices such as SPI / SPEI. - final_freq = freq or xr.infer_freq(da) + final_freq = freq or xr.infer_freq(da.time) try: group = {"D": "time.dayofyear", "MS": "time.month"}[final_freq] except KeyError(): @@ -785,6 +785,7 @@ def resample_to_time(da, da_ref): resample_to_time(dax, da) for dax in [params, probs_of_zero] ) + # TODO: Change `dist_method` to reproduce the speed obtained below def wrap_cdf_ppf(da, pars, probs_of_zero): dist_probs = get_dist(params.attrs["scipy_dist"]).cdf(da[:], *pars) probs = probs_of_zero + ((1 - probs_of_zero) * dist_probs) From 3e28bbb3faef9f449410c7489945b94a07ea86b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Fri, 20 Oct 2023 21:00:15 -0400 Subject: [PATCH 56/66] optimized dist_method to simplify std_index functions --- xclim/indices/stats.py | 85 ++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 56 deletions(-) diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index b5f6e9518..f5f290d44 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -4,11 +4,9 @@ import warnings from typing import Any, Sequence -import flox.xarray as floxx import lmoments3.distr import numpy as np import xarray as xr -from scipy.stats import norm from xclim.core.calendar import resample_doy, select_time from xclim.core.formatting import prefix_attrs, unprefix_attrs, update_history @@ -530,9 +528,7 @@ def _fit_start(x, dist: str, **fitkwargs: Any) -> tuple[tuple, dict]: return (), {} -def _dist_method_1D( - params: Sequence[float], arg=None, *, dist: str, function: str, **kwargs: Any -) -> xr.DataArray: +def _dist_method_1D(*args, dist: str, function: str, **kwargs: Any) -> xr.DataArray: r"""Statistical function for given argument on given distribution initialized with params. See :py:ref:`scipy:scipy.stats.rv_continuous` for all available functions and their arguments. @@ -540,10 +536,8 @@ def _dist_method_1D( Parameters ---------- - params : 1D sequence of floats - Distribution parameters, in the same order as given by :py:func:`fit`. - arg : array_like, optional - The argument for the requested function. + args + The arguments for the requested scipy function. dist : str The scipy name of the distribution. function : str @@ -554,10 +548,8 @@ def _dist_method_1D( Returns ------- array_like - Same shape as arg in most cases. """ dist = get_dist(dist) - args = ([arg] if arg is not None else []) + list(params) return getattr(dist, function)(*args, **kwargs) @@ -580,7 +572,7 @@ def dist_method( Distribution parameters are along `dparams`, in the same order as given by :py:func:`fit`. Must have a `scipy_dist` attribute with the name of the distribution fitted. arg : array_like, optional - The argument for the requested function. + The first argument for the requested function if different from `fit_params`. \*\*kwargs Other parameters to pass to the function call. @@ -593,20 +585,15 @@ def dist_method( -------- scipy:scipy.stats.rv_continuous : for all available functions and their arguments. """ - args = [fit_params] - input_core_dims = [["dparams"]] - - if arg is not None: - args.append(arg) - input_core_dims.append([]) - + # Typically the data to be transformed + arg = [arg] if arg is not None else [] + args = arg + [fit_params.sel(dparams=dp) for dp in fit_params.dparams.values] return xr.apply_ufunc( _dist_method_1D, *args, - input_core_dims=input_core_dims, + input_core_dims=[[]] * len(args), output_core_dims=[[]], kwargs={"dist": fit_params.attrs["scipy_dist"], "function": function, **kwargs}, - vectorize=True, output_dtypes=[float], dask="parallelized", ) @@ -731,8 +718,8 @@ def standardized_index_fit_params( da, group = preprocess_standardized_index(da, freq, window, **indexer) params = da.groupby(group).map(fit, dist=dist, method=method) cal_range = ( - da.time.min().dt.strftime("%Y-%m-%d"), - da.time.max().dt.strftime("%Y-%m-%d"), + da.time.min().dt.strftime("%Y-%m-%d").item(), + da.time.max().dt.strftime("%Y-%m-%d").item(), ) params.attrs = { "calibration_period": cal_range, @@ -759,49 +746,35 @@ def standardized_index(da, params): Parameters ---------- da : xarray.DataArray - Input array. + Input array. Resampling and rolling operations should have already been applied using ``xclim.indices.preprocess_standardized_index``. params: xarray.DataArray Fit parameters computed using ``xclim.indices.standardized_index_fit_params``. """ - freq = params.attrs["freq"] or xr.infer_freq(da) + group = params.attrs["group"] - def resample_to_time(da, da_ref): - if freq == "D": + def reindex_time(da, da_ref): + if group == "time.dayofyear": + da = da.rename(day="time").reindex(time=da_ref.time.dt.dayofyear) + da["time"] = da_ref.time da = resample_doy(da, da_ref) - elif freq == "MS": + elif group == "time.month": da = da.rename(month="time").reindex(time=da_ref.time.dt.month) da["time"] = da_ref.time - else: - raise NotImplementedError( - f"The frequency of the input `{freq}` is not supported." - ) return da.chunk({"time": -1}) - idxs = ( - da.time.dt.month if "month" in params.attrs["group"] else da.time.dt.dayofyear - ) - probs_of_zero = floxx.xarray_reduce((da + 1) * (da == 0), idxs, func="mean") - params, probs_of_zero = ( - resample_to_time(dax, da) for dax in [params, probs_of_zero] + probs_of_zero = da.groupby(group).map( + lambda x: (x == 0).sum("time") / x.notnull().sum("time") ) - - # TODO: Change `dist_method` to reproduce the speed obtained below - def wrap_cdf_ppf(da, pars, probs_of_zero): - dist_probs = get_dist(params.attrs["scipy_dist"]).cdf(da[:], *pars) - probs = probs_of_zero + ((1 - probs_of_zero) * dist_probs) - return norm.ppf(probs) - - std_index = xr.apply_ufunc( - wrap_cdf_ppf, - da, - params, - probs_of_zero, - input_core_dims=[["time"], ["dparams", "time"], ["time"]], - output_core_dims=[["time"]], - vectorize=True, - dask="parallelized", - ) - + params, probs_of_zero = (reindex_time(dax, da) for dax in [params, probs_of_zero]) + dist_probs = dist_method("cdf", params, da) + probs = probs_of_zero + ((1 - probs_of_zero) * dist_probs) + + params_norm = xr.concat( + [0 * params.sel(dparams="loc"), 0 * params.sel(dparams="scale") + 1], + dim="dparams", + ).chunk({"dparams": -1}) + params_norm.attrs = dict(scipy_dist="norm") + std_index = dist_method("ppf", params_norm, probs) # A cdf value of 0 or 1 gives ±np.inf when inverted to the normal distribution. # The neighbouring values 0.00...1 and 0.99...9 with a float64 give a standardized index of ± 8.21 # We use this index as reference for maximal/minimal standardized values. From b809ab6e3a3e1b5e3247e5c8852176fecc180230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Fri, 20 Oct 2023 21:26:21 -0400 Subject: [PATCH 57/66] remove uneeded function name in header --- xclim/indices/_agro.py | 7 +++---- xclim/indices/stats.py | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index ca0084558..f7f0331d4 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -48,7 +48,6 @@ "latitude_temperature_index", "qian_weighted_mean_average", "rain_season", - "standardized_index_fit_params", "standardized_precipitation_evapotranspiration_index", "standardized_precipitation_index", "water_budget", @@ -1275,9 +1274,9 @@ def standardized_precipitation_evapotranspiration_index( dist : {'gamma', 'fisk'} Name of the univariate distribution. (see :py:mod:`scipy.stats`). method : {'APP', 'ML'} - Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method - uses a deterministic function that doesn't involve any optimization. Available methods - vary with the distribution: 'gamma':{'APP', 'ML'}, 'fisk':{'ML'} + Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate), `PWM` (probability weighted moments). + The approximate method uses a deterministic function that doesn't involve any optimization. Available methods + vary with the distribution: 'gamma':{'APP', 'ML', 'PWM'}, 'fisk':{'APP', 'ML'} cal_start: DateStr | None Start date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the calibration period begins at the start of the input dataset. diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index f5f290d44..15fb1b28c 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -536,7 +536,7 @@ def _dist_method_1D(*args, dist: str, function: str, **kwargs: Any) -> xr.DataAr Parameters ---------- - args + \*args The arguments for the requested scipy function. dist : str The scipy name of the distribution. @@ -677,10 +677,10 @@ def standardized_index_fit_params( window : int Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, i.e. a monthly resampling, the window is an integer number of months. - dist : str + dist : {'gamma', 'fisk'} Name of the univariate distribution. (see :py:mod:`scipy.stats`). - method : str + method : {'ML', 'APP', 'PWM'} Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. offset: Quantified From cc8b3e6e062be6520a32b9ffd757662ae6846818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Mon, 23 Oct 2023 10:36:52 -0400 Subject: [PATCH 58/66] fix xci -> xci.stat where appropriate --- tests/test_indices.py | 8 ++++---- xclim/indices/_agro.py | 4 ++-- xclim/indices/stats.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_indices.py b/tests/test_indices.py index 1a72eb836..f39b4a684 100644 --- a/tests/test_indices.py +++ b/tests/test_indices.py @@ -526,7 +526,7 @@ def test_standardized_precipitation_index( ds = open_dataset("sdba/CanESM2_1950-2100.nc").isel(location=1) pr = ds.pr.sel(time=slice("1998", "2000")) pr_cal = ds.pr.sel(time=slice("1950", "1980")) - params = xci.standardized_index_fit_params( + params = xci.stats.standardized_index_fit_params( pr_cal, freq=freq, window=window, dist=dist, method=method ) spi = xci.standardized_precipitation_index(pr, params=params) @@ -611,7 +611,7 @@ def test_standardized_precipitation_evapotranspiration_index( tasmin = tasmax - 5 wb = xci.water_budget(pr, None, tasmin, tasmax, tas) - params = xci.standardized_index_fit_params( + params = xci.stats.standardized_index_fit_params( wb.sel(time=slice("1950", "1980")), freq=freq, window=window, @@ -646,7 +646,7 @@ def test_standardized_index_modularity(self, open_dataset): tasmin = tasmax - 5 wb = xci.water_budget(pr, None, tasmin, tasmax, tas) - params = xci.standardized_index_fit_params( + params = xci.stats.standardized_index_fit_params( wb.sel(time=slice("1950", "1980")), freq=freq, window=window, @@ -654,7 +654,7 @@ def test_standardized_index_modularity(self, open_dataset): method=method, offset="1 mm/d", ) - spei1 = xci.standardized_precipitation_evapotranspiration_index( + spei1 = xci.stats.standardized_precipitation_evapotranspiration_index( wb.sel(time=slice("1998", "2000")), params=params ) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index f7f0331d4..f22338c8c 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1134,7 +1134,7 @@ def standardized_precipitation_index( End date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the calibration period finishes at the end of the input dataset. params: xarray.DataArray - Fit parameters. The `params` can be computed using ``xclim.indices.standardized_index_fit_params`` in advance. + Fit parameters. The `params` can be computed using ``xclim.indices.stats.standardized_index_fit_params`` in advance. The ouput can be given here as input, and it overrides other options. indexer Indexing parameters to compute the indicator on a temporal subset of the data. @@ -1284,7 +1284,7 @@ def standardized_precipitation_evapotranspiration_index( End date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the calibration period finishes at the end of the input dataset. params: xarray.DataArray - Fit parameters. The `params` can be computed using ``xclim.indices.standardized_index_fit_params`` in advance. + Fit parameters. The `params` can be computed using ``xclim.indices.stats.standardized_index_fit_params`` in advance. The ouput can be given here as input, and it overrides other options. offset: Quantified For distributions bounded by zero (e.g. "gamma", "fisk"), an offset must be added to the water budget diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index 15fb1b28c..31d2b46d0 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -748,7 +748,7 @@ def standardized_index(da, params): da : xarray.DataArray Input array. Resampling and rolling operations should have already been applied using ``xclim.indices.preprocess_standardized_index``. params: xarray.DataArray - Fit parameters computed using ``xclim.indices.standardized_index_fit_params``. + Fit parameters computed using ``xclim.indices.stats.standardized_index_fit_params``. """ group = params.attrs["group"] From 3b4b8f2a65ed8916cca8fbbf61dd2b59160849e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Mon, 23 Oct 2023 10:55:32 -0400 Subject: [PATCH 59/66] typo --- tests/test_indices.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_indices.py b/tests/test_indices.py index f39b4a684..f78bb83bc 100644 --- a/tests/test_indices.py +++ b/tests/test_indices.py @@ -654,7 +654,7 @@ def test_standardized_index_modularity(self, open_dataset): method=method, offset="1 mm/d", ) - spei1 = xci.stats.standardized_precipitation_evapotranspiration_index( + spei1 = xci.standardized_precipitation_evapotranspiration_index( wb.sel(time=slice("1998", "2000")), params=params ) From 9ec645a552f1daf82e534caf9e5eb99fb30642a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Mon, 23 Oct 2023 11:14:19 -0400 Subject: [PATCH 60/66] remove useless input/output core_dims from xr.apply_ufunc --- xclim/indices/stats.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index 31d2b46d0..db1b27338 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -591,8 +591,6 @@ def dist_method( return xr.apply_ufunc( _dist_method_1D, *args, - input_core_dims=[[]] * len(args), - output_core_dims=[[]], kwargs={"dist": fit_params.attrs["scipy_dist"], "function": function, **kwargs}, output_dtypes=[float], dask="parallelized", From 845cac4de45178ee04da9289527c9dc888ac2b93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Mon, 23 Oct 2023 11:36:07 -0400 Subject: [PATCH 61/66] no need to broadcast `params_norm` --- xclim/indices/stats.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index db1b27338..a3500d3a1 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -767,11 +767,12 @@ def reindex_time(da, da_ref): dist_probs = dist_method("cdf", params, da) probs = probs_of_zero + ((1 - probs_of_zero) * dist_probs) - params_norm = xr.concat( - [0 * params.sel(dparams="loc"), 0 * params.sel(dparams="scale") + 1], - dim="dparams", - ).chunk({"dparams": -1}) - params_norm.attrs = dict(scipy_dist="norm") + params_norm = xr.DataArray( + [0, 1], + dims=["dparams"], + coords=dict(dparams=(["loc", "scale"])), + attrs=dict(scipy_dist="norm"), + ) std_index = dist_method("ppf", params_norm, probs) # A cdf value of 0 or 1 gives ±np.inf when inverted to the normal distribution. # The neighbouring values 0.00...1 and 0.99...9 with a float64 give a standardized index of ± 8.21 From c702e480097a7142193542ca0f1a972ea704599c Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 23 Oct 2023 12:58:13 -0400 Subject: [PATCH 62/66] fix merge of CHANGES.rst --- CHANGES.rst | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 98c36d92e..efe5d6904 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -20,6 +20,11 @@ New features and enhancements * Indicator ``generic.stats`` now accepts any frequency (previously only daily). (:pull:`1498`). * Added argument ``out_units`` to ``select_resample_op`` to bypass limitations of ``to_agg_units`` in custom indicators. Add "var" to supported operations in ``to_agg_units``. (:pull:`1498`). * `adapt_freq_thresh` argument added `sdba` training functions, allowing to perform frequency adaptation appropriately in each map block. (:pull:`1407`). +* Standardized indices (``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index``) (:issue:`1270`, :issue:`1416`, :issue:`1474`, :pull:`1311`) were changed: + * Optimized and noticeably faster calculation. + * Can be computed in two steps: First compute fit parameters with ``xclim.indices.standardized_index_fit_params``, then use the output in the standardized indices functions. + * The standardized index values are now clipped to ±8.21. This reflects the ``float64`` precision of the computation when cumulative distributed function values are inverted to a normal distribution and avoids returning infinite values. + * An offset parameter is now available to account for negative water balance values``xclim.indices.standardized_precipitation_evapotranspiration_index``. Bug fixes ^^^^^^^^^ @@ -41,14 +46,7 @@ Breaking changes * Default threshold in ``xclim.indices.snw_season_{start|length|end}`` changed form `20 kg m-2` to `4 kg m-2`. (:pull:`1505`). * `xclim` development dependencies now include `ruff`. `pycodestyle` and `pydocstyle` have been replaced by `ruff` and removed from the `dev` installation recipe. (:pull:`1504`). * The `mf_file` call signature found in ``xclim.ensembles.create_ensemble`` (and ``xclim.ensembles._ens_align_dataset``) has been removed (deprecated since `xclim` v0.43.0). (:pull:`1506`). - -New features and enhancements -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Standardized indices (``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index``) (:issue:`1270`, :issue:`1416`, :issue:`1474`, :pull:`1311`) were changed: - * Optimized and noticeably faster calculation. - * Can be computed in two steps: First compute fit parameters with ``xclim.indices.standardized_index_fit_params``, then use the output in the standardized indices functions. - * The standardized index values are now clipped to ±8.21. This reflects the ``float64`` precision of the computation when cumulative distributed function values are inverted to a normal distribution and avoids returning infinite values. - * An offset parameter is now available to account for negative water balance values``xclim.indices.standardized_precipitation_evapotranspiration_index``. +* ``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index`` will no longer accept two datasets (data and calibration data). Instead, a single dataset covering both the calibration and evaluation periods is expected. (:issue:`1270`, :pull:`1311`). Internal changes ^^^^^^^^^^^^^^^^ @@ -69,10 +67,6 @@ Internal changes * Deprecation wrapper ``xclim.core.utils.deprecated`` are added to help with deprecation warnings. (:pull:`1505`). * `xclim` now uses `ruff` to format the codebase with `make lint` and `pre-commit`. `flake8` is still used for the time being, solely to enforce docstring linting (with `flake8-rst-docstrings`) and alphabetical `__all__` entries (with `flake8-alphabetize`). (:pull:`1504`). -Breaking changes -^^^^^^^^^^^^^^^^ -* ``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index`` will no longer accept two datasets (data and calibration data). Instead, a single dataset covering both the calibration and evaluation periods is expected. (:issue:`1270` ,:pull:`1311`) - v0.45.0 (2023-09-05) -------------------- Contributors to this version: David Huard (:user:`huard`), Trevor James Smith (:user:`Zeitsperre`), Pascal Bourgault (:user:`aulemahal`), Juliette Lavoie (:user:`juliettelavoie`), Gabriel Rondeau-Genesse (:user:`RondeauG`), Marco Braun (:user:`vindelico`), Éric Dupuis (:user:`coxipi`). From b3b9d7ca850c15576b5ad2c70b617239a6361314 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 23 Oct 2023 13:30:52 -0400 Subject: [PATCH 63/66] docstring and typing adjustments --- xclim/indices/_agro.py | 140 ++++++++++++++++++++++------------------- xclim/indices/stats.py | 55 ++++++++-------- 2 files changed, 104 insertions(+), 91 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index f22338c8c..6b40739e4 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -926,54 +926,54 @@ def rain_season( ): """Find the length of the rain season and the day of year of its start and its end. - The rain season begins when two conditions are met: 1) There must be a number of wet days with - precipitations above or equal to a given threshold; 2) There must be another sequence following, where, for a given period in time, there are no - dry sequence (i.e. a certain number of days where precipitations are below or equal to a certain threshold). The rain season ends - when there is a dry sequence. + The rain season begins when two conditions are met: 1) There must be a number of wet days with precipitations above + or equal to a given threshold; 2) There must be another sequence following, where, for a given period in time, there + are no dry sequence (i.e. a certain number of days where precipitations are below or equal to a certain threshold). + The rain season ends when there is a dry sequence. Parameters ---------- - pr: xr.DataArray + pr : xr.DataArray Precipitation data. - thresh_wet_start: Quantified + thresh_wet_start : Quantified Accumulated precipitation threshold associated with `window_wet_start`. - window_wet_start: int + window_wet_start : int Number of days when accumulated precipitation is above `thresh_wet_start`. Defines the first condition to start the rain season - window_not_dry_start: int + window_not_dry_start : int Number of days, after `window_wet_start` days, during which no dry period must be found as a second and last condition to start the rain season. A dry sequence is defined with `thresh_dry_start`, `window_dry_start` and `method_dry_start`. - thresh_dry_start: Quantified + thresh_dry_start : Quantified Threshold length defining a dry day in the sequence related to `window_dry_start`. - window_dry_start: int + window_dry_start : int Number of days used to define a dry sequence in the start of the season. Daily precipitations lower than `thresh_dry_start` during `window_dry_start` days are considered a dry sequence. The precipitations must be lower than `thresh_dry_start` for either every day in the sequence (`method_dry_start == "per_day"`) or for the total (`method_dry_start == "total"`). - method_dry_start: {"per_day", "total"} + method_dry_start : {"per_day", "total"} Method used to define a dry sequence associated with `window_dry_start`. The threshold `thresh_dry_start` is either compared to every daily precipitation (`method_dry_start == "per_day"`) or to total precipitations (`method_dry_start == "total"`) in the sequence `window_dry_start` days. - date_min_start: DayOfYearStr + date_min_start : DayOfYearStr First day of year when season can start ("mm-dd"). - date_max_start: DayOfYearStr + date_max_start : DayOfYearStr Last day of year when season can start ("mm-dd"). - thresh_dry_end: str + thresh_dry_end : str Threshold length defining a dry day in the sequence related to `window_dry_end`. - window_dry_end: int + window_dry_end : int Number of days used to define a dry sequence in the end of the season. Daily precipitations lower than `thresh_dry_end` during `window_dry_end` days are considered a dry sequence. The precipitations must be lower than `thresh_dry_end` for either every day in the sequence (`method_dry_end == "per_day"`) or for the total (`method_dry_end == "total"`). - method_dry_end: {"per_day", "total"} + method_dry_end : {"per_day", "total"} Method used to define a dry sequence associated with `window_dry_end`. The threshold `thresh_dry_end` is either compared to every daily precipitation (`method_dry_end == "per_day"`) or to total precipitations (`method_dry_end == "total"`) in the sequence `window_dry` days. - date_min_end: DayOfYearStr + date_min_end : DayOfYearStr First day of year when season can end ("mm-dd"). - date_max_end: DayOfYearStr + date_max_end : DayOfYearStr Last day of year when season can end ("mm-dd"). freq : str Resampling frequency. @@ -1037,6 +1037,7 @@ def _get_first_run_start(pram): return _get_first_run(run_positions, date_min_start, date_max_start) # Find the end of the rain season + # FIXME: This function mixes local and parent-level variables. It should be refactored. def _get_first_run_end(pram): if method_dry_end == "per_day": da_stop = pram <= thresh_dry_end @@ -1048,12 +1049,14 @@ def _get_first_run_end(pram): return _get_first_run(run_positions, date_min_end, date_max_end) # Get start, end and length of rain season. Written as a function so it can be resampled + # FIXME: This function mixes local and parent-level variables. It should be refactored. def _get_rain_season(pram): start = _get_first_run_start(pram) # masking value before start of the season (end of season should be after) # Get valid integer indexer of the day after the first run starts. - # `start != NaN` only possible if a condition on next few time steps is respected. Thus, `start+1` exists if `start != NaN` + # `start != NaN` only possible if a condition on next few time steps is respected. + # Thus, `start+1` exists if `start != NaN` start_ind = (start + 1).fillna(-1).astype(int) mask = pram * np.NaN # Put "True" on the day of run start @@ -1098,9 +1101,9 @@ def standardized_precipitation_index( pr: xarray.DataArray, pr_cal: Quantified | None = None, freq: str | None = "MS", - window: int | None = 1, - dist: str | None = "gamma", - method: str | None = "APP", + window: int = 1, + dist: str = "gamma", + method: str = "APP", cal_start: DateStr | None = None, cal_end: DateStr | None = None, params: Quantified | None = None, @@ -1112,31 +1115,31 @@ def standardized_precipitation_index( ---------- pr : xarray.DataArray Daily precipitation. - pr_cal : xarray.DataArray - Daily precipitation used for calibration. Usually this is a temporal subset of `pr` over some reference period. This - option will be removed in xclim >=0.47.0. Two behaviour will be possible (see below) - freq : str | None + pr_cal : xarray.DataArray, optional + Daily precipitation used for calibration. Usually this is a temporal subset of `pr` over some reference period. + This option will be removed in xclim >=0.47.0. Two behaviour will be possible (see below) + freq : str, optional Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling has already been applied input dataset and will skip the resampling step. window : int Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, i.e. a monthly resampling, the window is an integer number of months. dist : {"gamma", "fisk"} - Name of the univariate distribution. - (see :py:mod:`scipy.stats`). + Name of the univariate distribution. (see :py:mod:`scipy.stats`). method : {'APP', 'ML'} Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. - cal_start: DateStr | None - Start date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. Default option `None` - means that the calibration period begins at the start of the input dataset. - cal_end: DateStr | None - End date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. Default option `None` - means that the calibration period finishes at the end of the input dataset. + cal_start: DateStr, optional + Start date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. + Default option `None` means that the calibration period begins at the start of the input dataset. + cal_end: DateStr, optional + End date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. + Default option `None` means that the calibration period finishes at the end of the input dataset. params: xarray.DataArray - Fit parameters. The `params` can be computed using ``xclim.indices.stats.standardized_index_fit_params`` in advance. + Fit parameters. + The `params` can be computed using ``xclim.indices.stats.standardized_index_fit_params`` in advance. The ouput can be given here as input, and it overrides other options. - indexer + \*\*indexer Indexing parameters to compute the indicator on a temporal subset of the data. It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. @@ -1195,14 +1198,16 @@ def standardized_precipitation_index( if pr_cal is not None: warnings.warn( - "Inputing a calibration array will be deprecated in xclim>=0.47.0. For example, if `pr_cal` is a subset of `pr`, then instead of say:\n" - "`standardized_precipitation_index(pr=pr,pr_cal=pr.sel(time=slice(t0,t1)),...)`,\n one can call:\n" + "Inputting a calibration array will be deprecated in xclim>=0.47.0. " + "For example, if `pr_cal` is a subset of `pr`, then instead of say:\n" + "`standardized_precipitation_index(pr=pr,pr_cal=pr.sel(time=slice(t0,t1)),...)`,\n" + "one can call:\n" "`standardized_precipitation_index(pr=pr,cal_range=(t0,t1),...).\n" "If for some reason `pr_cal` is not a subset of `pr`, then the following approach will still be possible:\n" "`params = standardized_index_fit_params(da=pr_cal, freq=freq, window=window, dist=dist, method=method)`.\n" "`spi = standardized_precipitation_index(pr=pr, params=params)`.\n" - "This approach can be used in both scenarios to break up the computations in two, i.e. get params, then compute " - "standardized indices." + "This approach can be used in both scenarios to break up the computations in two," + "i.e. get params, then compute standardized indices." ) params = standardized_index_fit_params( pr_cal, freq=freq, window=window, dist=dist, method=method, **indexer @@ -1241,9 +1246,9 @@ def standardized_precipitation_evapotranspiration_index( wb: xarray.DataArray, wb_cal: Quantified | None = None, freq: str | None = "MS", - window: int | None = 1, - dist: str | None = "gamma", - method: str | None = "APP", + window: int = 1, + dist: str = "gamma", + method: str = "APP", offset: Quantified = "1.000 mm/d", cal_start: DateStr | None = None, cal_end: DateStr | None = None, @@ -1260,12 +1265,10 @@ def standardized_precipitation_evapotranspiration_index( ---------- wb : xarray.DataArray Daily water budget (pr - pet). - wb_cal : xarray.DataArray - Daily water budget used for calibration. Usually this is a temporal subset of `wb` over some reference period. This - option will be removed in xclim >=0.47.0. Two behaviour will be possible (see below) - params: xarray.DataArray - Fit parameters. - freq : str | None + wb_cal : xarray.DataArray, optional + Daily water budget used for calibration. Usually this is a temporal subset of `wb` over some reference period. + This option will be removed in xclim >=0.47.0. Two behaviours will be possible (see below). + freq : str, optional Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling has already been applied input dataset and will skip the resampling step. window : int @@ -1274,23 +1277,26 @@ def standardized_precipitation_evapotranspiration_index( dist : {'gamma', 'fisk'} Name of the univariate distribution. (see :py:mod:`scipy.stats`). method : {'APP', 'ML'} - Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate), `PWM` (probability weighted moments). + Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate), or + `PWM` (probability weighted moments). The approximate method uses a deterministic function that doesn't involve any optimization. Available methods vary with the distribution: 'gamma':{'APP', 'ML', 'PWM'}, 'fisk':{'APP', 'ML'} - cal_start: DateStr | None - Start date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. Default option `None` - means that the calibration period begins at the start of the input dataset. - cal_end: DateStr | None - End date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. Default option `None` - means that the calibration period finishes at the end of the input dataset. - params: xarray.DataArray - Fit parameters. The `params` can be computed using ``xclim.indices.stats.standardized_index_fit_params`` in advance. - The ouput can be given here as input, and it overrides other options. - offset: Quantified + cal_start : DateStr, optional + Start date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. + Default option `None` means that the calibration period begins at the start of the input dataset. + cal_end : DateStr, optional + End date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. + Default option `None` means that the calibration period finishes at the end of the input dataset. + params : xarray.DataArray, optional + Fit parameters. + The `params` can be computed using ``xclim.indices.stats.standardized_index_fit_params`` in advance. + The output can be given here as input, and it overrides other options. + offset : Quantified For distributions bounded by zero (e.g. "gamma", "fisk"), an offset must be added to the water budget - to make sure there are no negative values. Keep the offset as small as possible to minimize its influence on the results. + to make sure there are no negative values. + Keep the offset as small as possible to minimize its influence on the results. This can be given as a precipitation flux or a rate. - indexer + \*\*indexer Indexing parameters to compute the indicator on a temporal subset of the data. It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. @@ -1315,7 +1321,7 @@ def standardized_precipitation_evapotranspiration_index( if uses_default_offset is False and offset != params_offset: warnings.warn( "The offset in `params` differs from the input `offset`." - "Procceding with the value given in `params`." + "Proceeding with the value given in `params`." ) offset = params_offset offset = 0 if offset is None else convert_units_to(offset, wb, context="hydro") @@ -1323,10 +1329,11 @@ def standardized_precipitation_evapotranspiration_index( if dist in ["gamma", "fisk"] and offset <= 0: raise ValueError( "The water budget must be shifted towards positive values to be used with `gamma` and `fisk` " - f"distributions which are bounded by zero. A positive offset is required. Current value: {offset}{wb.attrs['units']}" + "distributions which are bounded by zero. A positive offset is required. Current value: " + f"{offset}{wb.attrs['units']}." ) # Note that the default behaviour would imply an offset for any distribution, even those distributions - # that can accomodate negative values of the water budget. This needs to be changed in future versions + # that can accommodate negative values of the water budget. This needs to be changed in future versions # of the index. if offset != 0: with xarray.set_options(keep_attrs=True): @@ -1403,7 +1410,8 @@ def effective_growing_degree_days( ) -> xarray.DataArray: r"""Effective growing degree days. - Growing degree days based on a dynamic start and end of the growing season, as defined in :cite:p:`bootsma_impacts_2005`. + Growing degree days based on a dynamic start and end of the growing season, + as defined in :cite:p:`bootsma_impacts_2005`. Parameters ---------- diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index 31d2b46d0..03f778641 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -135,6 +135,8 @@ def fit( dc = get_dist(dist) if method == "PWM": lm3dc = get_lm3_dist(dist) + else: + lm3dc = None shape_params = [] if dc.shapes is None else dc.shapes.split(",") dist_params = shape_params + ["loc", "scale"] @@ -322,7 +324,7 @@ def fa( ---------- da : xr.DataArray Maximized/minimized input data with a `time` dimension. - t : Union[int, Sequence] + t : int or Sequence of int Return period. The period depends on the resolution of the input data. If the input array's resolution is yearly, then the return period is in years. dist : str @@ -392,7 +394,7 @@ def frequency_analysis( Name of the univariate distribution, e.g. `beta`, `expon`, `genextreme`, `gamma`, `gumbel_r`, `lognorm`, `norm`. window : int Averaging window length (days). - freq : str + freq : str, optional Resampling frequency. If None, the frequency is assumed to be 'YS' unless the indexer is season='DJF', in which case `freq` would be set to `AS-DEC`. method : {"ML" or "MLE", "MOM", "PWM", "APP"} @@ -431,7 +433,7 @@ def frequency_analysis( return fa(sel, t, dist=dist, mode=mode, method=method) -def get_dist(dist): +def get_dist(dist: str): """Return a distribution object from `scipy.stats`.""" from scipy import stats # pylint: disable=import-outside-toplevel @@ -442,7 +444,7 @@ def get_dist(dist): return dc -def get_lm3_dist(dist): +def get_lm3_dist(dist: str): """Return a distribution object from `lmoments3.distr`.""" if dist not in _lm3_dist_map: raise ValueError( @@ -599,8 +601,10 @@ def dist_method( ) -def preprocess_standardized_index(da, freq, window, **indexer): - """Perform resample and roll operations involved in computing a standardized index. +def preprocess_standardized_index( + da: xr.DataArray, freq: str | None, window: int, **indexer +): + r"""Perform resample and roll operations involved in computing a standardized index. da : xarray.DataArray Input array. @@ -610,14 +614,15 @@ def preprocess_standardized_index(da, freq, window, **indexer): window : int Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, i.e. a monthly resampling, the window is an integer number of months. - indexer + \*\*indexer Indexing parameters to compute the indicator on a temporal subset of the data. It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. Returns ------- xarray.DataArray, str - Processed array and time grouping corresponding to the final time frequency (following resampling if applicable). + Processed array and time grouping corresponding to the final time frequency + (following resampling if applicable). """ # We could allow a more general frequency in this function and move # the constraint {"D", "MS"} in specific indices such as SPI / SPEI. @@ -630,7 +635,7 @@ def preprocess_standardized_index(da, freq, window, **indexer): "which is not supported for standardized indices." ) - if freq: + if freq is not None: da = da.resample(time=freq).mean(keep_attrs=True) if uses_dask(da) and len(da.chunks[da.get_axis_num("time")]) > 1: @@ -654,13 +659,13 @@ def preprocess_standardized_index(da, freq, window, **indexer): def standardized_index_fit_params( da: xr.DataArray, freq: str | None, - window: int | None, - dist: str | None, - method: str | None, + window: int, + dist: str, + method: str, offset: Quantified | None = None, **indexer, ) -> xr.DataArray: - """Standardized Index fitting parameters. + r"""Standardized Index fitting parameters. A standardized index measures the deviation of a variable averaged over a rolling temporal window and fitted with a given distribution `dist` with respect to a calibration dataset. The comparison is done by porting @@ -671,22 +676,21 @@ def standardized_index_fit_params( ---------- da : xarray.DataArray Input array. - freq : str | None + freq : str, optional Resampling frequency. A monthly or daily frequency is expected. Option `None` assumes that desired resampling has already been applied input dataset and will skip the resampling step. window : int Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, i.e. a monthly resampling, the window is an integer number of months. dist : {'gamma', 'fisk'} - Name of the univariate distribution. - (see :py:mod:`scipy.stats`). + Name of the univariate distribution. (see :py:mod:`scipy.stats`). method : {'ML', 'APP', 'PWM'} Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. offset: Quantified - Distributions bounded by zero (e.g. "gamma", "fisk") can be used for datasets with negative values by using a offset: - `da + offset` - indexer + Distributions bounded by zero (e.g. "gamma", "fisk") can be used for datasets with negative values + by using an offset: `da + offset`. + \*\*indexer Indexing parameters to compute the indicator on a temporal subset of the data. It accepts the same arguments as :py:func:`xclim.indices.generic.select_time`. @@ -700,7 +704,6 @@ def standardized_index_fit_params( Supported combinations of `dist` and `method` are: * Gamma ("gamma") : "ML", "APP", "PWM" * Log-logistic ("fisk") : "ML", "APP" - """ # "WPM" method doesn't seem to work for gamma or pearson3 dist_and_methods = {"gamma": ["ML", "APP", "PWM"], "fisk": ["ML", "APP"]} @@ -711,7 +714,7 @@ def standardized_index_fit_params( f"The method `{method}` is not supported for distribution `{dist}`." ) - if offset: + if offset is not None: with xr.set_options(keep_attrs=True): da = da + convert_units_to(offset, da, context="hydro") @@ -735,8 +738,8 @@ def standardized_index_fit_params( return params -def standardized_index(da, params): - """Compute standardized index for given fit parameters +def standardized_index(da: xr.DataArray, params: xr.DataArray): + """Compute standardized index for given fit parameters. This computes standardized indices which measure the deviation of variables in the dataset compared to a reference distribution. The reference is a statistical distribution computed with fitting parameters `params` @@ -746,8 +749,10 @@ def standardized_index(da, params): Parameters ---------- da : xarray.DataArray - Input array. Resampling and rolling operations should have already been applied using ``xclim.indices.preprocess_standardized_index``. - params: xarray.DataArray + Input array. + Resampling and rolling operations should have already been applied using + ``xclim.indices.preprocess_standardized_index``. + params : xarray.DataArray Fit parameters computed using ``xclim.indices.stats.standardized_index_fit_params``. """ group = params.attrs["group"] From 69ab14aae0892432eea5afc2dd6c905a565e61c4 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 23 Oct 2023 14:07:57 -0400 Subject: [PATCH 64/66] whitespace --- xclim/indices/_agro.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xclim/indices/_agro.py b/xclim/indices/_agro.py index 6b40739e4..af70b63a4 100644 --- a/xclim/indices/_agro.py +++ b/xclim/indices/_agro.py @@ -1129,13 +1129,13 @@ def standardized_precipitation_index( method : {'APP', 'ML'} Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). The approximate method uses a deterministic function that doesn't involve any optimization. - cal_start: DateStr, optional + cal_start : DateStr, optional Start date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the calibration period begins at the start of the input dataset. - cal_end: DateStr, optional + cal_end : DateStr, optional End date of the calibration period. A `DateStr` is expected, that is a `str` in format `"YYYY-MM-DD"`. Default option `None` means that the calibration period finishes at the end of the input dataset. - params: xarray.DataArray + params : xarray.DataArray Fit parameters. The `params` can be computed using ``xclim.indices.stats.standardized_index_fit_params`` in advance. The ouput can be given here as input, and it overrides other options. From 7f468561e4e97c39bf904822bccb93dd5f6f19dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= <71575674+coxipi@users.noreply.github.com> Date: Mon, 23 Oct 2023 14:36:11 -0400 Subject: [PATCH 65/66] Update CHANGES.rst (review) Co-authored-by: Pascal Bourgault --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index efe5d6904..36d10c6a0 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -22,7 +22,7 @@ New features and enhancements * `adapt_freq_thresh` argument added `sdba` training functions, allowing to perform frequency adaptation appropriately in each map block. (:pull:`1407`). * Standardized indices (``xclim.indices.standardized_precipitation_index`` and ``xclim.indices.standardized_precipitation_evapotranspiration_index``) (:issue:`1270`, :issue:`1416`, :issue:`1474`, :pull:`1311`) were changed: * Optimized and noticeably faster calculation. - * Can be computed in two steps: First compute fit parameters with ``xclim.indices.standardized_index_fit_params``, then use the output in the standardized indices functions. + * Can be computed in two steps: First compute fit parameters with ``xclim.indices.stats.standardized_index_fit_params``, then use the output in the standardized indices functions. * The standardized index values are now clipped to ±8.21. This reflects the ``float64`` precision of the computation when cumulative distributed function values are inverted to a normal distribution and avoids returning infinite values. * An offset parameter is now available to account for negative water balance values``xclim.indices.standardized_precipitation_evapotranspiration_index``. From 59e7df6f53addfe0477dcb03604965f9cdbb807d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= <71575674+coxipi@users.noreply.github.com> Date: Mon, 23 Oct 2023 14:37:17 -0400 Subject: [PATCH 66/66] only chunk if necessary, check if dask used (review) Co-authored-by: Pascal Bourgault --- xclim/indices/stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index 110377d06..2fe52b7e9 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -763,7 +763,7 @@ def reindex_time(da, da_ref): elif group == "time.month": da = da.rename(month="time").reindex(time=da_ref.time.dt.month) da["time"] = da_ref.time - return da.chunk({"time": -1}) + return da if not uses_dask(da) else da.chunk({"time": -1}) probs_of_zero = da.groupby(group).map( lambda x: (x == 0).sum("time") / x.notnull().sum("time")