From ddc352faa6de91f266a1749773d08ae8d6f09683 Mon Sep 17 00:00:00 2001
From: Sam Levang <39069044+slevang@users.noreply.github.com>
Date: Wed, 31 Mar 2021 12:55:53 -0400
Subject: [PATCH] Basic curvefit implementation (#4849)

---
 doc/api.rst                    |   3 +-
 doc/user-guide/computation.rst |  83 ++++++++++++
 doc/whats-new.rst              |   2 +
 xarray/core/dataarray.py       |  78 +++++++++++
 xarray/core/dataset.py         | 233 +++++++++++++++++++++++++++++++++
 xarray/tests/test_dataarray.py |  54 ++++++++
 6 files changed, 452 insertions(+), 1 deletion(-)

diff --git a/doc/api.rst b/doc/api.rst
index a140d9e2b81..6288ce01803 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -179,6 +179,7 @@ Computation
    Dataset.integrate
    Dataset.map_blocks
    Dataset.polyfit
+   Dataset.curvefit

 **Aggregation**:
 :py:attr:`~Dataset.all`
@@ -375,7 +376,7 @@ Computation
    DataArray.integrate
    DataArray.polyfit
    DataArray.map_blocks
-
+   DataArray.curvefit

 **Aggregation**:
 :py:attr:`~DataArray.all`
diff --git a/doc/user-guide/computation.rst b/doc/user-guide/computation.rst
index 28804bcba6f..2a894f72fdf 100644
--- a/doc/user-guide/computation.rst
+++ b/doc/user-guide/computation.rst
@@ -444,6 +444,89 @@ The inverse operation is done with :py:meth:`~xarray.polyval`,

 .. note::
     These methods replicate the behaviour of :py:func:`numpy.polyfit` and
     :py:func:`numpy.polyval`.
+
+.. _compute.curvefit:
+
+Fitting arbitrary functions
+===========================
+
+Xarray objects also provide an interface for fitting more complex functions using
+:py:func:`scipy.optimize.curve_fit`. :py:meth:`~xarray.DataArray.curvefit` accepts
+user-defined functions and can fit along multiple coordinates.
+
+For example, we can fit a relationship between two ``DataArray`` objects, maintaining
+a unique fit at each spatial coordinate but aggregating over the time dimension:
+
+.. ipython:: python
+
+    def exponential(x, a, xc):
+        return np.exp((x - xc) / a)
+
+
+    x = np.arange(-5, 5, 0.1)
+    t = np.arange(-5, 5, 0.1)
+    X, T = np.meshgrid(x, t)
+    Z1 = np.random.uniform(low=-5, high=5, size=X.shape)
+    Z2 = exponential(Z1, 3, X)
+    Z3 = exponential(Z1, 1, -X)
+
+    ds = xr.Dataset(
+        data_vars=dict(
+            var1=(["t", "x"], Z1), var2=(["t", "x"], Z2), var3=(["t", "x"], Z3)
+        ),
+        coords={"t": t, "x": x},
+    )
+    ds[["var2", "var3"]].curvefit(
+        coords=ds.var1,
+        func=exponential,
+        reduce_dims="t",
+        bounds={"a": (0.5, 5), "xc": (-5, 5)},
+    )
+
+We can also fit multi-dimensional functions, and even use a wrapper function to
+simultaneously fit a sum of several functions, such as this field containing
+two Gaussian peaks:
+
+.. ipython:: python
+
+    def gaussian_2d(coords, a, xc, yc, xalpha, yalpha):
+        x, y = coords
+        z = a * np.exp(
+            -np.square(x - xc) / 2 / np.square(xalpha)
+            - np.square(y - yc) / 2 / np.square(yalpha)
+        )
+        return z
+
+
+    def multi_peak(coords, *args):
+        z = np.zeros(coords[0].shape)
+        for i in range(len(args) // 5):
+            z += gaussian_2d(coords, *args[i * 5 : i * 5 + 5])
+        return z
+
+
+    x = np.arange(-5, 5, 0.1)
+    y = np.arange(-5, 5, 0.1)
+    X, Y = np.meshgrid(x, y)
+
+    n_peaks = 2
+    names = ["a", "xc", "yc", "xalpha", "yalpha"]
+    names = [f"{name}{i}" for i in range(n_peaks) for name in names]
+    Z = gaussian_2d((X, Y), 3, 1, 1, 2, 1) + gaussian_2d((X, Y), 2, -1, -2, 1, 1)
+    Z += np.random.normal(scale=0.1, size=Z.shape)
+
+    da = xr.DataArray(Z, dims=["y", "x"], coords={"y": y, "x": x})
+    da.curvefit(
+        coords=["x", "y"],
+        func=multi_peak,
+        param_names=names,
+        kwargs={"maxfev": 10000},
+    )
+
+.. note::
+    This method replicates the behavior of :py:func:`scipy.optimize.curve_fit`.
+
+
 .. _compute.broadcasting:

 Broadcasting by dimension name
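The documentation above stops short of showing what to do with the covariance output. As a sketch (not part of the patch; it reuses ``ds`` and ``exponential`` from the example above, and the diagonal-extraction idiom is just one option), the one-sigma parameter uncertainties can be pulled from the diagonal of ``[var]_curvefit_covariance``:

```python
import numpy as np
import xarray as xr

# Fit as in the documentation example above.
fit = ds[["var2"]].curvefit(
    coords=ds.var1,
    func=exponential,
    reduce_dims="t",
    bounds={"a": (0.5, 5), "xc": (-5, 5)},
)

# "cov_i"/"cov_j" are labeled with the parameter names, so the variance of
# each parameter sits on the diagonal of the covariance matrix.
cov = fit["var2_curvefit_covariance"]
stderr = xr.concat(
    [np.sqrt(cov.sel(cov_i=p, cov_j=p)) for p in cov["cov_i"].values],
    dim="param",
).assign_coords(param=list(cov["cov_i"].values))
```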
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 408b59d3c6a..5e394236096 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -65,6 +65,8 @@ New Features
   :py:meth:`~pandas.core.groupby.GroupBy.get_group`.
   By `Deepak Cherian `_.
 - Disable the `cfgrib` backend if the `eccodes` library is not installed (:pull:`5083`).
   By `Baudouin Raoult `_.
+- Added :py:meth:`DataArray.curvefit` and :py:meth:`Dataset.curvefit` for general curve fitting applications (:issue:`4300`, :pull:`4849`).
+  By `Sam Levang `_.

 Breaking changes
 ~~~~~~~~~~~~~~~~
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 1f82b48d7f8..3b15144df4e 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -4418,6 +4418,84 @@ def query(
         )
         return ds[self.name]

+    def curvefit(
+        self,
+        coords: Union[Union[str, "DataArray"], Iterable[Union[str, "DataArray"]]],
+        func: Callable[..., Any],
+        reduce_dims: Union[Hashable, Iterable[Hashable]] = None,
+        skipna: bool = True,
+        p0: Dict[str, Any] = None,
+        bounds: Dict[str, Any] = None,
+        param_names: Sequence[str] = None,
+        kwargs: Dict[str, Any] = None,
+    ):
+        """
+        Curve fitting optimization for arbitrary functions.
+
+        Wraps `scipy.optimize.curve_fit` with `apply_ufunc`.
+
+        Parameters
+        ----------
+        coords : DataArray, str, or sequence of DataArray or str
+            Independent coordinate(s) over which to perform the curve fitting. Must share
+            at least one dimension with the calling object. When fitting multi-dimensional
+            functions, supply `coords` as a sequence in the same order as arguments in
+            `func`. To fit along existing dimensions of the calling object, `coords` can
+            also be specified as a str or sequence of strs.
+        func : callable
+            User-specified function in the form `f(x, *params)` which returns a numpy
+            array of length `len(x)`. `params` are the fittable parameters which are
+            optimized by scipy curve_fit. `x` can also be specified as a sequence
+            containing multiple coordinates, e.g. `f((x0, x1), *params)`.
+        reduce_dims : str or sequence of str, optional
+            Additional dimension(s) over which to aggregate while fitting. For example,
+            calling `ds.curvefit(coords='time', reduce_dims=['lat', 'lon'], ...)` will
+            aggregate all lat and lon points and fit the specified function along the
+            time dimension.
+        skipna : bool, optional
+            Whether to skip missing values when fitting. Default is True.
+        p0 : dict, optional
+            Optional dictionary of parameter names to initial guesses passed to the
+            `curve_fit` `p0` arg. If none or only some parameters are passed, the rest
+            will be assigned initial values following the default scipy behavior.
+        bounds : dict, optional
+            Optional dictionary of parameter names to bounding values passed to the
+            `curve_fit` `bounds` arg. If none or only some parameters are passed, the
+            rest will be unbounded following the default scipy behavior.
+        param_names : sequence of str, optional
+            Sequence of names for the fittable parameters of `func`. If not supplied,
+            this will be automatically determined by arguments of `func`. `param_names`
+            should be manually supplied when fitting a function that takes a variable
+            number of parameters.
+        kwargs : dict, optional
+            Additional keyword arguments passed to scipy curve_fit.
+
+        Returns
+        -------
+        curvefit_results : Dataset
+            A single dataset which contains:
+
+            [var]_curvefit_coefficients
+                The coefficients of the best fit.
+            [var]_curvefit_covariance
+                The covariance matrix of the coefficient estimates.
+
+        See also
+        --------
+        DataArray.polyfit
+        scipy.optimize.curve_fit
+        """
+        return self._to_temp_dataset().curvefit(
+            coords,
+            func,
+            reduce_dims=reduce_dims,
+            skipna=skipna,
+            p0=p0,
+            bounds=bounds,
+            param_names=param_names,
+            kwargs=kwargs,
+        )
+
     # this needs to be at the end, or mypy will confuse with `str`
     # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
     str = utils.UncachedAccessor(StringAccessor)
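A minimal usage sketch of the method added above (data and names are illustrative, mirroring the exponential-decay test added later in this patch):

```python
import numpy as np
import xarray as xr


def exp_decay(t, n0, tau):
    return n0 * np.exp(-t / tau)


t = np.arange(0, 5, 0.5)
da = xr.DataArray(exp_decay(t, 5, 3), dims="t", coords={"t": t})

# Fit along the existing dimension "t"; the parameter names "n0" and "tau"
# are inferred from the signature of exp_decay.
fit = da.curvefit(coords="t", func=exp_decay, p0={"n0": 4})
print(fit["curvefit_coefficients"].sel(param="tau"))
```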
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index e959135e8d9..8beacbf8c42 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -1,6 +1,7 @@
 import copy
 import datetime
 import functools
+import inspect
 import sys
 import warnings
 from collections import defaultdict
@@ -456,6 +457,63 @@ def as_dataset(obj: Any) -> "Dataset":
     return obj


+def _get_func_args(func, param_names):
+    """Use `inspect.signature` to try accessing `func` args. Otherwise, ensure
+    they are provided by the user.
+    """
+    try:
+        func_args = inspect.signature(func).parameters
+    except ValueError:
+        func_args = {}
+        if not param_names:
+            raise ValueError(
+                "Unable to inspect `func` signature, and `param_names` was not provided."
+            )
+    if param_names:
+        params = param_names
+    else:
+        params = list(func_args)[1:]
+        if any(
+            p.kind in [p.VAR_POSITIONAL, p.VAR_KEYWORD] for p in func_args.values()
+        ):
+            raise ValueError(
+                "`param_names` must be provided because `func` takes variable length arguments."
+            )
+    return params, func_args
+
+
+def _initialize_curvefit_params(params, p0, bounds, func_args):
+    """Set initial guess and bounds for curvefit.
+    Priority: 1) passed args 2) func signature 3) scipy defaults
+    """
+
+    def _initialize_feasible(lb, ub):
+        # Mimics functionality of scipy.optimize.minpack._initialize_feasible
+        lb_finite = np.isfinite(lb)
+        ub_finite = np.isfinite(ub)
+        p0 = np.nansum(
+            [
+                0.5 * (lb + ub) * int(lb_finite & ub_finite),
+                (lb + 1) * int(lb_finite & ~ub_finite),
+                (ub - 1) * int(~lb_finite & ub_finite),
+            ]
+        )
+        return p0
+
+    param_defaults = {p: 1 for p in params}
+    bounds_defaults = {p: (-np.inf, np.inf) for p in params}
+    for p in params:
+        if p in func_args and func_args[p].default is not func_args[p].empty:
+            param_defaults[p] = func_args[p].default
+        if p in bounds:
+            bounds_defaults[p] = tuple(bounds[p])
+            if param_defaults[p] < bounds[p][0] or param_defaults[p] > bounds[p][1]:
+                param_defaults[p] = _initialize_feasible(bounds[p][0], bounds[p][1])
+        if p in p0:
+            param_defaults[p] = p0[p]
+    return param_defaults, bounds_defaults
+
+
 class DataVariables(Mapping[Hashable, "DataArray"]):
     __slots__ = ("_dataset",)
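To make the priority scheme concrete, here is a small illustration of the two helpers (it mirrors the unit tests added at the end of this patch; these are private functions, so this is exposition only):

```python
import numpy as np


def exp_decay(t, n0, tau=1):
    return n0 * np.exp(-t / tau)


# The first argument of `func` is the coordinate; the rest are parameters.
params, func_args = _get_func_args(exp_decay, [])
assert params == ["n0", "tau"]

p0, bounds = _initialize_curvefit_params(
    params, {"n0": 4}, {"tau": [5, np.inf]}, func_args
)
# n0 == 4 comes from the user guess (priority 1). tau's signature default
# of 1 violates its lower bound of 5, so _initialize_feasible picks
# lb + 1 == 6 (finite lower bound, infinite upper bound).
assert p0 == {"n0": 4, "tau": 6}
assert bounds == {"n0": (-np.inf, np.inf), "tau": (5, np.inf)}
```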
@@ -7074,5 +7132,180 @@ def query(
         # apply the selection
         return self.isel(indexers, missing_dims=missing_dims)

+    def curvefit(
+        self,
+        coords: Union[Union[str, "DataArray"], Iterable[Union[str, "DataArray"]]],
+        func: Callable[..., Any],
+        reduce_dims: Union[Hashable, Iterable[Hashable]] = None,
+        skipna: bool = True,
+        p0: Dict[str, Any] = None,
+        bounds: Dict[str, Any] = None,
+        param_names: Sequence[str] = None,
+        kwargs: Dict[str, Any] = None,
+    ):
+        """
+        Curve fitting optimization for arbitrary functions.
+
+        Wraps `scipy.optimize.curve_fit` with `apply_ufunc`.
+
+        Parameters
+        ----------
+        coords : DataArray, str, or sequence of DataArray or str
+            Independent coordinate(s) over which to perform the curve fitting. Must share
+            at least one dimension with the calling object. When fitting multi-dimensional
+            functions, supply `coords` as a sequence in the same order as arguments in
+            `func`. To fit along existing dimensions of the calling object, `coords` can
+            also be specified as a str or sequence of strs.
+        func : callable
+            User-specified function in the form `f(x, *params)` which returns a numpy
+            array of length `len(x)`. `params` are the fittable parameters which are
+            optimized by scipy curve_fit. `x` can also be specified as a sequence
+            containing multiple coordinates, e.g. `f((x0, x1), *params)`.
+        reduce_dims : str or sequence of str, optional
+            Additional dimension(s) over which to aggregate while fitting. For example,
+            calling `ds.curvefit(coords='time', reduce_dims=['lat', 'lon'], ...)` will
+            aggregate all lat and lon points and fit the specified function along the
+            time dimension.
+        skipna : bool, optional
+            Whether to skip missing values when fitting. Default is True.
+        p0 : dict, optional
+            Optional dictionary of parameter names to initial guesses passed to the
+            `curve_fit` `p0` arg. If none or only some parameters are passed, the rest
+            will be assigned initial values following the default scipy behavior.
+        bounds : dict, optional
+            Optional dictionary of parameter names to bounding values passed to the
+            `curve_fit` `bounds` arg. If none or only some parameters are passed, the
+            rest will be unbounded following the default scipy behavior.
+        param_names : sequence of str, optional
+            Sequence of names for the fittable parameters of `func`. If not supplied,
+            this will be automatically determined by arguments of `func`. `param_names`
+            should be manually supplied when fitting a function that takes a variable
+            number of parameters.
+        kwargs : dict, optional
+            Additional keyword arguments passed to scipy curve_fit.
+
+        Returns
+        -------
+        curvefit_results : Dataset
+            A single dataset which contains:
+
+            [var]_curvefit_coefficients
+                The coefficients of the best fit.
+            [var]_curvefit_covariance
+                The covariance matrix of the coefficient estimates.
+
+        See also
+        --------
+        Dataset.polyfit
+        scipy.optimize.curve_fit
+        """
+        from scipy.optimize import curve_fit
+
+        if p0 is None:
+            p0 = {}
+        if bounds is None:
+            bounds = {}
+        if kwargs is None:
+            kwargs = {}
+
+        if not reduce_dims:
+            reduce_dims_ = []
+        elif isinstance(reduce_dims, str) or not isinstance(reduce_dims, Iterable):
+            reduce_dims_ = [reduce_dims]
+        else:
+            reduce_dims_ = list(reduce_dims)
+
+        if (
+            isinstance(coords, str)
+            or isinstance(coords, xr.DataArray)
+            or not isinstance(coords, Iterable)
+        ):
+            coords = [coords]
+        coords_ = [self[coord] if isinstance(coord, str) else coord for coord in coords]
+
+        # Determine whether any coords are dims on self
+        for coord in coords_:
+            reduce_dims_ += [c for c in self.dims if coord.equals(self[c])]
+        reduce_dims_ = list(set(reduce_dims_))
+        preserved_dims = list(set(self.dims) - set(reduce_dims_))
+        if not reduce_dims_:
+            raise ValueError(
+                "No arguments to `coords` were identified as a dimension on the calling "
+                "object, and no dims were supplied to `reduce_dims`. This would result "
+                "in fitting on scalar data."
+            )
+
+        # Broadcast all coords with each other
+        coords_ = xr.broadcast(*coords_)
+        coords_ = [
+            coord.broadcast_like(self, exclude=preserved_dims) for coord in coords_
+        ]
+
+        params, func_args = _get_func_args(func, param_names)
+        param_defaults, bounds_defaults = _initialize_curvefit_params(
+            params, p0, bounds, func_args
+        )
+        n_params = len(params)
+        kwargs.setdefault("p0", [param_defaults[p] for p in params])
+        kwargs.setdefault(
+            "bounds",
+            [
+                [bounds_defaults[p][0] for p in params],
+                [bounds_defaults[p][1] for p in params],
+            ],
+        )
+
+        def _wrapper(Y, *coords_, **kwargs):
+            # Wrap curve_fit with raveled coordinates and pointwise NaN handling
+            x = np.vstack([c.ravel() for c in coords_])
+            y = Y.ravel()
+            if skipna:
+                mask = np.all([np.any(~np.isnan(x), axis=0), ~np.isnan(y)], axis=0)
+                x = x[:, mask]
+                y = y[mask]
+                if not len(y):
+                    popt = np.full([n_params], np.nan)
+                    pcov = np.full([n_params, n_params], np.nan)
+                    return popt, pcov
+            x = np.squeeze(x)
+            popt, pcov = curve_fit(func, x, y, **kwargs)
+            return popt, pcov
+
+        result = xr.Dataset()
+        for name, da in self.data_vars.items():
+            if name is xr.core.dataarray._THIS_ARRAY:
+                name = ""
+            else:
+                name = f"{str(name)}_"
+
+            popt, pcov = xr.apply_ufunc(
+                _wrapper,
+                da,
+                *coords_,
+                vectorize=True,
+                dask="parallelized",
+                input_core_dims=[reduce_dims_ for _ in range(len(coords_) + 1)],
+                output_core_dims=[["param"], ["cov_i", "cov_j"]],
+                dask_gufunc_kwargs={
+                    "output_sizes": {
+                        "param": n_params,
+                        "cov_i": n_params,
+                        "cov_j": n_params,
+                    },
+                },
+                output_dtypes=(np.float64, np.float64),
+                exclude_dims=set(reduce_dims_),
+                kwargs=kwargs,
+            )
+            result[name + "curvefit_coefficients"] = popt
+            result[name + "curvefit_covariance"] = pcov
+
+        result = result.assign_coords(
+            {"param": params, "cov_i": params, "cov_j": params}
+        )
+        result.attrs = self.attrs.copy()
+
+        return result
+

 ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False)
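The pointwise NaN handling inside `_wrapper` above can be seen in isolation with plain numpy (a sketch, not part of the patch): a sample survives if its data value is finite and at least one of its coordinates is finite.

```python
import numpy as np

# Two raveled coordinates with NaNs in different places.
x = np.vstack(
    [
        np.array([0.0, 1.0, np.nan, np.nan]),
        np.array([0.0, np.nan, 2.0, np.nan]),
    ]
)
y = np.array([1.0, 2.0, 3.0, 4.0])

# Same mask expression as in _wrapper above.
mask = np.all([np.any(~np.isnan(x), axis=0), ~np.isnan(y)], axis=0)
print(mask)        # [ True  True  True False]: only the all-NaN column drops
print(x[:, mask])  # remaining coordinate columns
print(y[mask])     # remaining data values
```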
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index 20e357543d6..c18f9acfca1 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -4690,6 +4690,60 @@ def test_query(self, backend, engine, parser):
         with pytest.raises(UndefinedVariableError):
             aa.query(x="spam > 50")  # name not present

+    @requires_scipy
+    @pytest.mark.parametrize("use_dask", [True, False])
+    def test_curvefit(self, use_dask):
+        if use_dask and not has_dask:
+            pytest.skip("requires dask")
+
+        def exp_decay(t, n0, tau=1):
+            return n0 * np.exp(-t / tau)
+
+        t = np.arange(0, 5, 0.5)
+        da = DataArray(
+            np.stack([exp_decay(t, 3, 3), exp_decay(t, 5, 4), np.nan * t], axis=-1),
+            dims=("t", "x"),
+            coords={"t": t, "x": [0, 1, 2]},
+        )
+        da[0, 0] = np.nan
+
+        expected = DataArray(
+            [[3, 3], [5, 4], [np.nan, np.nan]],
+            dims=("x", "param"),
+            coords={"x": [0, 1, 2], "param": ["n0", "tau"]},
+        )
+
+        if use_dask:
+            da = da.chunk({"x": 1})
+
+        fit = da.curvefit(
+            coords=[da.t], func=exp_decay, p0={"n0": 4}, bounds={"tau": [2, 6]}
+        )
+        assert_allclose(fit.curvefit_coefficients, expected, rtol=1e-3)
+
+        da = da.compute()
+        fit = da.curvefit(coords="t", func=np.power, reduce_dims="x", param_names=["a"])
+        assert "a" in fit.param
+        assert "x" not in fit.dims
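The second fit in the test above exercises `reduce_dims`. As a hedged sketch of the semantics (variable names illustrative, not part of the patch): pooling a dimension yields one shared fit rather than one fit per index.

```python
import numpy as np
import xarray as xr


def linear(t, a, b):
    return a * t + b


rng = np.random.default_rng(0)
t = np.arange(10.0)
ds = xr.Dataset(
    {"temp": (("t", "x"), 2.0 * t[:, None] + rng.normal(size=(10, 3)))},
    coords={"t": t},
)

per_x = ds.curvefit(coords="t", func=linear)
pooled = ds.curvefit(coords="t", func=linear, reduce_dims="x")
# per_x["temp_curvefit_coefficients"] has dims ("x", "param"): one fit per x.
# pooled["temp_curvefit_coefficients"] has dims ("param",): all x samples
# are aggregated into a single fit along t.
```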
+
+    def test_curvefit_helpers(self):
+        def exp_decay(t, n0, tau=1):
+            return n0 * np.exp(-t / tau)
+
+        params, func_args = xr.core.dataset._get_func_args(exp_decay, [])
+        assert params == ["n0", "tau"]
+        param_defaults, bounds_defaults = xr.core.dataset._initialize_curvefit_params(
+            params, {"n0": 4}, {"tau": [5, np.inf]}, func_args
+        )
+        assert param_defaults == {"n0": 4, "tau": 6}
+        assert bounds_defaults == {"n0": (-np.inf, np.inf), "tau": (5, np.inf)}
+
+        param_names = ["a"]
+        params, func_args = xr.core.dataset._get_func_args(np.power, param_names)
+        assert params == param_names
+        with pytest.raises(ValueError):
+            xr.core.dataset._get_func_args(np.power, [])
+

 class TestReduce:
     @pytest.fixture(autouse=True)