From 7b5934c2c557448c6cbcd306d5f60f1498f0f2d0 Mon Sep 17 00:00:00 2001 From: SarahG-579462 Date: Fri, 14 Feb 2025 16:56:12 -0500 Subject: [PATCH 01/12] Add fit method: MSE (maximum product of spacings) --- src/xclim/indices/stats.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/xclim/indices/stats.py b/src/xclim/indices/stats.py index ca2b2874a..1505e5fb4 100644 --- a/src/xclim/indices/stats.py +++ b/src/xclim/indices/stats.py @@ -55,6 +55,9 @@ def _fitfunc_1d(arr, *, dist, nparams, method, **fitkwargs): params = dist.fit(x, *args, method="mle", **kwargs, **fitkwargs) elif method == "MM": params = dist.fit(x, method="mm", **fitkwargs) + elif method == "MSE": + fitresult = scipy.stats.fit(dist, x, method='mse', **kwargs, **fitkwargs) + params = fitresult.params elif method == "PWM": # lmoments3 will raise an error if only dist.numargs + 2 values are provided if len(x) <= dist.numargs + 2: @@ -107,10 +110,10 @@ def fit( dist : str or rv_continuous distribution object Name of the univariate distribution, such as beta, expon, genextreme, gamma, gumbel_r, lognorm, norm (see :py:mod:scipy.stats for full list) or the distribution object itself. - method : {"ML", "MLE", "MM", "PWM", "APP"} - Fitting method, either maximum likelihood (ML or MLE), method of moments (MM) or approximate method (APP). + method : {"ML", "MLE", "MM", "PWM", "APP", "MSE"} + Fitting method, either maximum likelihood (ML or MLE), method of moments (MM), maximum product of spacings (MSE) or approximate method (APP). Can also be the probability weighted moments (PWM), also called L-Moments, if a compatible `dist` object is passed. - The PWM method is usually more robust to outliers. + The PWM method is usually more robust to outliers. The MSE method is more consistant than the MLE method, although it can be more sensitive to repeated data. dim : str The dimension upon which to perform the indexing (default: "time"). **fitkwargs : dict @@ -131,8 +134,9 @@ def fit( "ML": "maximum likelihood", "MM": "method of moments", "MLE": "maximum likelihood", + "MSE": "maximum product of spacings", "PWM": "probability weighted moments", - "APP": "approximative method", + "APP": "approximative method" } if method not in method_name: raise ValueError(f"Fitting method not recognized: {method}") From 70b3d4ef07cb9c2ab56ddac9373c672c93417523 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Feb 2025 22:03:36 +0000 Subject: [PATCH 02/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/xclim/indices/stats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xclim/indices/stats.py b/src/xclim/indices/stats.py index 1505e5fb4..89d1474db 100644 --- a/src/xclim/indices/stats.py +++ b/src/xclim/indices/stats.py @@ -56,7 +56,7 @@ def _fitfunc_1d(arr, *, dist, nparams, method, **fitkwargs): elif method == "MM": params = dist.fit(x, method="mm", **fitkwargs) elif method == "MSE": - fitresult = scipy.stats.fit(dist, x, method='mse', **kwargs, **fitkwargs) + fitresult = scipy.stats.fit(dist, x, method="mse", **kwargs, **fitkwargs) params = fitresult.params elif method == "PWM": # lmoments3 will raise an error if only dist.numargs + 2 values are provided @@ -136,7 +136,7 @@ def fit( "MLE": "maximum likelihood", "MSE": "maximum product of spacings", "PWM": "probability weighted moments", - "APP": "approximative method" + "APP": "approximative method", } if method not in method_name: raise ValueError(f"Fitting method not recognized: {method}") From 096715e7d05fe9c31cd56246f72f83393f5dc6fa Mon Sep 17 00:00:00 2001 From: SarahG-579462 Date: Fri, 14 Feb 2025 17:08:02 -0500 Subject: [PATCH 03/12] alias MPS to MSE --- src/xclim/indices/stats.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/xclim/indices/stats.py b/src/xclim/indices/stats.py index 1505e5fb4..a7c982451 100644 --- a/src/xclim/indices/stats.py +++ b/src/xclim/indices/stats.py @@ -111,7 +111,7 @@ def fit( Name of the univariate distribution, such as beta, expon, genextreme, gamma, gumbel_r, lognorm, norm (see :py:mod:scipy.stats for full list) or the distribution object itself. method : {"ML", "MLE", "MM", "PWM", "APP", "MSE"} - Fitting method, either maximum likelihood (ML or MLE), method of moments (MM), maximum product of spacings (MSE) or approximate method (APP). + Fitting method, either maximum likelihood (ML or MLE), method of moments (MM), maximum product of spacings (MSE or MPS) or approximate method (APP). Can also be the probability weighted moments (PWM), also called L-Moments, if a compatible `dist` object is passed. The PWM method is usually more robust to outliers. The MSE method is more consistant than the MLE method, although it can be more sensitive to repeated data. dim : str @@ -135,6 +135,7 @@ def fit( "MM": "method of moments", "MLE": "maximum likelihood", "MSE": "maximum product of spacings", + "MPS": "maximum product of spacings", "PWM": "probability weighted moments", "APP": "approximative method" } From 059c102d8f391445819f11b75f4a5b64c8fec093 Mon Sep 17 00:00:00 2001 From: SarahG-579462 Date: Fri, 14 Feb 2025 17:12:40 -0500 Subject: [PATCH 04/12] Changelog updated, doc updated --- CHANGELOG.rst | 1 + src/xclim/indices/stats.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index bf16832f8..357805c4f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -30,6 +30,7 @@ New features and enhancements * New ``xclim.indices.fao_allen98``, exporting the FAO-56 Penman-Monteith equation for potential evapotranspiration (:issue:`2004`, :pull:`2067`). * Missing values method "pct" and "at_least_n" now accept a new "subfreq" option that allows to compute the missing mask in two steps. When given, the algorithm is applied at this "subfreq" resampling frequency first and then the result is resampled at the target indicator frequency. In the output, a period is invalid if any of its subgroup where flagged as invalid by the chosen method. (:pull:`2058`, :issue:`1820`). * `rv_continuous` functions can now be given directly as the `dist` argument in ``standardized_precipitation_index`` and ``standardized_precipitation_evapotranspiration_index`` indicators. This includes `lmoments3` distribution when specifying `method="PWM"`. (:issue:`2043`, :pull:`2045`). +* Maximum Spacing Estimation method for distribution fitting has been added to `xclim.indices.stats.fit` (:issue:`2078`, :pull:`2077`) Internal changes ^^^^^^^^^^^^^^^^ diff --git a/src/xclim/indices/stats.py b/src/xclim/indices/stats.py index e880e33d4..bd32bbcbe 100644 --- a/src/xclim/indices/stats.py +++ b/src/xclim/indices/stats.py @@ -110,7 +110,7 @@ def fit( dist : str or rv_continuous distribution object Name of the univariate distribution, such as beta, expon, genextreme, gamma, gumbel_r, lognorm, norm (see :py:mod:scipy.stats for full list) or the distribution object itself. - method : {"ML", "MLE", "MM", "PWM", "APP", "MSE"} + method : {"ML", "MLE", "MM", "PWM", "APP", "MSE", "MPS"} Fitting method, either maximum likelihood (ML or MLE), method of moments (MM), maximum product of spacings (MSE or MPS) or approximate method (APP). Can also be the probability weighted moments (PWM), also called L-Moments, if a compatible `dist` object is passed. The PWM method is usually more robust to outliers. The MSE method is more consistant than the MLE method, although it can be more sensitive to repeated data. From f8378c900d14416186c8c9c36eb8d55f7be2d628 Mon Sep 17 00:00:00 2001 From: Sarah Gammon <91751417+SarahG-579462@users.noreply.github.com> Date: Mon, 17 Feb 2025 09:59:48 -0500 Subject: [PATCH 05/12] Update src/xclim/indices/stats.py Co-authored-by: Pascal Bourgault --- src/xclim/indices/stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xclim/indices/stats.py b/src/xclim/indices/stats.py index bd32bbcbe..34aa83069 100644 --- a/src/xclim/indices/stats.py +++ b/src/xclim/indices/stats.py @@ -55,7 +55,7 @@ def _fitfunc_1d(arr, *, dist, nparams, method, **fitkwargs): params = dist.fit(x, *args, method="mle", **kwargs, **fitkwargs) elif method == "MM": params = dist.fit(x, method="mm", **fitkwargs) - elif method == "MSE": + elif method in ["MSE", "MPS"]: fitresult = scipy.stats.fit(dist, x, method="mse", **kwargs, **fitkwargs) params = fitresult.params elif method == "PWM": From 5016a6add50360923710a72deefca532dde54def Mon Sep 17 00:00:00 2001 From: SarahG-579462 Date: Mon, 17 Feb 2025 12:38:04 -0500 Subject: [PATCH 06/12] add tests, add docs for bounds, use initial guess --- src/xclim/indices/stats.py | 8 +++++++- tests/test_stats.py | 4 ++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/xclim/indices/stats.py b/src/xclim/indices/stats.py index 34aa83069..dad1df19b 100644 --- a/src/xclim/indices/stats.py +++ b/src/xclim/indices/stats.py @@ -56,7 +56,12 @@ def _fitfunc_1d(arr, *, dist, nparams, method, **fitkwargs): elif method == "MM": params = dist.fit(x, method="mm", **fitkwargs) elif method in ["MSE", "MPS"]: - fitresult = scipy.stats.fit(dist, x, method="mse", **kwargs, **fitkwargs) + args, guess = _fit_start(x, dist.name, **fitkwargs) + param_info = dist.shapes + for i,arg in enumerate(args): + guess[param_info[i]] = arg + + fitresult = scipy.stats.fit(dist=dist, data=x, method="mse", guess=guess, **fitkwargs) params = fitresult.params elif method == "PWM": # lmoments3 will raise an error if only dist.numargs + 2 values are provided @@ -114,6 +119,7 @@ def fit( Fitting method, either maximum likelihood (ML or MLE), method of moments (MM), maximum product of spacings (MSE or MPS) or approximate method (APP). Can also be the probability weighted moments (PWM), also called L-Moments, if a compatible `dist` object is passed. The PWM method is usually more robust to outliers. The MSE method is more consistant than the MLE method, although it can be more sensitive to repeated data. + For the MSE method, each variable parameter must be given finite bounds (provided with keyword argument bounds={'param_name':(min,max),...}). dim : str The dimension upon which to perform the indexing (default: "time"). **fitkwargs : dict diff --git a/tests/test_stats.py b/tests/test_stats.py index 29a40b3f0..1c621cd85 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -156,6 +156,10 @@ def test_genextreme_fit(genextreme): p = stats.fit(genextreme, "genextreme") np.testing.assert_allclose(p, (0.20949, 297.954091, 75.7911863), 1e-5) +def test_mse_fit(genextreme): + """Check MSE fit with a series that leads to poor values without good initial conditions.""" + p = stats.fit(genextreme, "genextreme", "MSE", bounds=dict(c=(0,1), scale=(0,100), loc=(200,400))) + np.testing.assert_allclose(p, (0.18397, 293.6795784, 86.60649), 1e-5) def test_fa(fitda): T = 10 From f1bfdde2bf1fe9a4979faecb7be4ffbe4a05e697 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 17 Feb 2025 18:07:08 +0000 Subject: [PATCH 07/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/xclim/indices/stats.py | 8 +++++--- tests/test_stats.py | 9 ++++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/xclim/indices/stats.py b/src/xclim/indices/stats.py index dad1df19b..8d02d42e3 100644 --- a/src/xclim/indices/stats.py +++ b/src/xclim/indices/stats.py @@ -58,10 +58,12 @@ def _fitfunc_1d(arr, *, dist, nparams, method, **fitkwargs): elif method in ["MSE", "MPS"]: args, guess = _fit_start(x, dist.name, **fitkwargs) param_info = dist.shapes - for i,arg in enumerate(args): + for i, arg in enumerate(args): guess[param_info[i]] = arg - fitresult = scipy.stats.fit(dist=dist, data=x, method="mse", guess=guess, **fitkwargs) + fitresult = scipy.stats.fit( + dist=dist, data=x, method="mse", guess=guess, **fitkwargs + ) params = fitresult.params elif method == "PWM": # lmoments3 will raise an error if only dist.numargs + 2 values are provided @@ -119,7 +121,7 @@ def fit( Fitting method, either maximum likelihood (ML or MLE), method of moments (MM), maximum product of spacings (MSE or MPS) or approximate method (APP). Can also be the probability weighted moments (PWM), also called L-Moments, if a compatible `dist` object is passed. The PWM method is usually more robust to outliers. The MSE method is more consistant than the MLE method, although it can be more sensitive to repeated data. - For the MSE method, each variable parameter must be given finite bounds (provided with keyword argument bounds={'param_name':(min,max),...}). + For the MSE method, each variable parameter must be given finite bounds (provided with keyword argument bounds={'param_name':(min,max),...}). dim : str The dimension upon which to perform the indexing (default: "time"). **fitkwargs : dict diff --git a/tests/test_stats.py b/tests/test_stats.py index 1c621cd85..0a524620a 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -156,11 +156,18 @@ def test_genextreme_fit(genextreme): p = stats.fit(genextreme, "genextreme") np.testing.assert_allclose(p, (0.20949, 297.954091, 75.7911863), 1e-5) + def test_mse_fit(genextreme): """Check MSE fit with a series that leads to poor values without good initial conditions.""" - p = stats.fit(genextreme, "genextreme", "MSE", bounds=dict(c=(0,1), scale=(0,100), loc=(200,400))) + p = stats.fit( + genextreme, + "genextreme", + "MSE", + bounds=dict(c=(0, 1), scale=(0, 100), loc=(200, 400)), + ) np.testing.assert_allclose(p, (0.18397, 293.6795784, 86.60649), 1e-5) + def test_fa(fitda): T = 10 q = stats.fa(fitda, T, "lognorm") From 5eff2cd8ac33f69f25680f1ece7e040efc4429d0 Mon Sep 17 00:00:00 2001 From: SarahG-579462 Date: Mon, 17 Feb 2025 13:11:03 -0500 Subject: [PATCH 08/12] lint --- src/xclim/indices/stats.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/xclim/indices/stats.py b/src/xclim/indices/stats.py index 8d02d42e3..04ab6b603 100644 --- a/src/xclim/indices/stats.py +++ b/src/xclim/indices/stats.py @@ -118,10 +118,13 @@ def fit( Name of the univariate distribution, such as beta, expon, genextreme, gamma, gumbel_r, lognorm, norm (see :py:mod:scipy.stats for full list) or the distribution object itself. method : {"ML", "MLE", "MM", "PWM", "APP", "MSE", "MPS"} - Fitting method, either maximum likelihood (ML or MLE), method of moments (MM), maximum product of spacings (MSE or MPS) or approximate method (APP). + Fitting method, either maximum likelihood (ML or MLE), method of moments (MM), maximum product of spacings (MSE or MPS) + or approximate method (APP). Can also be the probability weighted moments (PWM), also called L-Moments, if a compatible `dist` object is passed. - The PWM method is usually more robust to outliers. The MSE method is more consistant than the MLE method, although it can be more sensitive to repeated data. - For the MSE method, each variable parameter must be given finite bounds (provided with keyword argument bounds={'param_name':(min,max),...}). + The PWM method is usually more robust to outliers. The MSE method is more consistant than the MLE method, although + it can be more sensitive to repeated data. + For the MSE method, each variable parameter must be given finite bounds + (provided with keyword argument bounds={'param_name':(min,max),...}). dim : str The dimension upon which to perform the indexing (default: "time"). **fitkwargs : dict From de7923dbaa2401dd9188f40c0fce9d8a74f381e5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 17 Feb 2025 18:12:38 +0000 Subject: [PATCH 09/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/xclim/indices/stats.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/xclim/indices/stats.py b/src/xclim/indices/stats.py index 04ab6b603..1ef2e3f1f 100644 --- a/src/xclim/indices/stats.py +++ b/src/xclim/indices/stats.py @@ -118,12 +118,12 @@ def fit( Name of the univariate distribution, such as beta, expon, genextreme, gamma, gumbel_r, lognorm, norm (see :py:mod:scipy.stats for full list) or the distribution object itself. method : {"ML", "MLE", "MM", "PWM", "APP", "MSE", "MPS"} - Fitting method, either maximum likelihood (ML or MLE), method of moments (MM), maximum product of spacings (MSE or MPS) + Fitting method, either maximum likelihood (ML or MLE), method of moments (MM), maximum product of spacings (MSE or MPS) or approximate method (APP). Can also be the probability weighted moments (PWM), also called L-Moments, if a compatible `dist` object is passed. - The PWM method is usually more robust to outliers. The MSE method is more consistant than the MLE method, although + The PWM method is usually more robust to outliers. The MSE method is more consistant than the MLE method, although it can be more sensitive to repeated data. - For the MSE method, each variable parameter must be given finite bounds + For the MSE method, each variable parameter must be given finite bounds (provided with keyword argument bounds={'param_name':(min,max),...}). dim : str The dimension upon which to perform the indexing (default: "time"). From eba6b0814e3d20548ef825004ba7821fef769928 Mon Sep 17 00:00:00 2001 From: SarahG-579462 Date: Mon, 17 Feb 2025 13:15:01 -0500 Subject: [PATCH 10/12] spelling --- src/xclim/indices/stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xclim/indices/stats.py b/src/xclim/indices/stats.py index 04ab6b603..144616e0f 100644 --- a/src/xclim/indices/stats.py +++ b/src/xclim/indices/stats.py @@ -121,7 +121,7 @@ def fit( Fitting method, either maximum likelihood (ML or MLE), method of moments (MM), maximum product of spacings (MSE or MPS) or approximate method (APP). Can also be the probability weighted moments (PWM), also called L-Moments, if a compatible `dist` object is passed. - The PWM method is usually more robust to outliers. The MSE method is more consistant than the MLE method, although + The PWM method is usually more robust to outliers. The MSE method is more consistent than the MLE method, although it can be more sensitive to repeated data. For the MSE method, each variable parameter must be given finite bounds (provided with keyword argument bounds={'param_name':(min,max),...}). From 438278afaf34dbcdcff941b83c03ed4d0974b516 Mon Sep 17 00:00:00 2001 From: SarahG-579462 Date: Mon, 17 Feb 2025 13:44:23 -0500 Subject: [PATCH 11/12] update test values to repeatable and less stringent. --- tests/test_stats.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test_stats.py b/tests/test_stats.py index 0a524620a..4f258571f 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -6,6 +6,8 @@ import pytest import xarray as xr from scipy.stats import lognorm, norm +from scipy.optimize import differential_evolution +from functools import partial from xclim.indices import stats @@ -159,13 +161,18 @@ def test_genextreme_fit(genextreme): def test_mse_fit(genextreme): """Check MSE fit with a series that leads to poor values without good initial conditions.""" + # Use fixed-seed rng to remove randomness of differential_evolution + # (alternative: change optimizer to non-stochastic gradient-based) + # TODO: change when minimum scipy exceeds 1.15.0 to rng=0 + optimizer = partial(differential_evolution, seed=0) p = stats.fit( genextreme, "genextreme", "MSE", bounds=dict(c=(0, 1), scale=(0, 100), loc=(200, 400)), + optimizer=optimizer, ) - np.testing.assert_allclose(p, (0.18397, 293.6795784, 86.60649), 1e-5) + np.testing.assert_allclose(p, (0.18435517630019815, 293.61049928703073, 86.70937297745427), 1e-3) def test_fa(fitda): From 95ac085a4843e1a9845a6b562ed4429efe91415b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 17 Feb 2025 18:45:30 +0000 Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_stats.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_stats.py b/tests/test_stats.py index 4f258571f..96289d717 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -2,12 +2,13 @@ from __future__ import annotations +from functools import partial + import numpy as np import pytest import xarray as xr -from scipy.stats import lognorm, norm from scipy.optimize import differential_evolution -from functools import partial +from scipy.stats import lognorm, norm from xclim.indices import stats @@ -172,7 +173,9 @@ def test_mse_fit(genextreme): bounds=dict(c=(0, 1), scale=(0, 100), loc=(200, 400)), optimizer=optimizer, ) - np.testing.assert_allclose(p, (0.18435517630019815, 293.61049928703073, 86.70937297745427), 1e-3) + np.testing.assert_allclose( + p, (0.18435517630019815, 293.61049928703073, 86.70937297745427), 1e-3 + ) def test_fa(fitda):