From ddc352faa6de91f266a1749773d08ae8d6f09683 Mon Sep 17 00:00:00 2001
From: Sam Levang <39069044+slevang@users.noreply.github.com>
Date: Wed, 31 Mar 2021 12:55:53 -0400
Subject: [PATCH] Basic curvefit implementation (#4849)

---
 doc/api.rst                    |   3 +-
 doc/user-guide/computation.rst |  83 ++++++++++++
 doc/whats-new.rst              |   2 +
 xarray/core/dataarray.py       |  78 +++++++++++
 xarray/core/dataset.py         | 233 +++++++++++++++++++++++++++++++++
 xarray/tests/test_dataarray.py |  54 ++++++++
 6 files changed, 452 insertions(+), 1 deletion(-)

diff --git a/doc/api.rst b/doc/api.rst
index a140d9e2b81..6288ce01803 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -179,6 +179,7 @@ Computation
    Dataset.integrate
    Dataset.map_blocks
    Dataset.polyfit
+   Dataset.curvefit

 **Aggregation**:
 :py:attr:`~Dataset.all`
@@ -375,7 +376,7 @@ Computation
    DataArray.integrate
    DataArray.polyfit
    DataArray.map_blocks
-
+   DataArray.curvefit

 **Aggregation**:
 :py:attr:`~DataArray.all`
diff --git a/doc/user-guide/computation.rst b/doc/user-guide/computation.rst
index 28804bcba6f..2a894f72fdf 100644
--- a/doc/user-guide/computation.rst
+++ b/doc/user-guide/computation.rst
@@ -444,6 +444,89 @@ The inverse operation is done with :py:meth:`~xarray.polyval`,

 .. note::
     These methods replicate the behaviour of :py:func:`numpy.polyfit` and
     :py:func:`numpy.polyval`.
+
+.. _compute.curvefit:
+
+Fitting arbitrary functions
+===========================
+
+Xarray objects also provide an interface for fitting more complex functions using
+:py:func:`scipy.optimize.curve_fit`. :py:meth:`~xarray.DataArray.curvefit` accepts
+user-defined functions and can fit along multiple coordinates.
+
+For example, we can fit a relationship between two ``DataArray`` objects, maintaining
+a unique fit at each spatial coordinate but aggregating over the time dimension:
+
+.. ipython:: python
+
+    def exponential(x, a, xc):
+        return np.exp((x - xc) / a)
+
+
+    x = np.arange(-5, 5, 0.1)
+    t = np.arange(-5, 5, 0.1)
+    X, T = np.meshgrid(x, t)
+    Z1 = np.random.uniform(low=-5, high=5, size=X.shape)
+    Z2 = exponential(Z1, 3, X)
+    Z3 = exponential(Z1, 1, -X)
+
+    ds = xr.Dataset(
+        data_vars=dict(
+            var1=(["t", "x"], Z1), var2=(["t", "x"], Z2), var3=(["t", "x"], Z3)
+        ),
+        coords={"t": t, "x": x},
+    )
+    ds[["var2", "var3"]].curvefit(
+        coords=ds.var1,
+        func=exponential,
+        reduce_dims="t",
+        bounds={"a": (0.5, 5), "xc": (-5, 5)},
+    )
+
+We can also fit multi-dimensional functions, and even use a wrapper function to
+simultaneously fit a sum of several functions, such as this field containing
+two Gaussian peaks:
+
+.. ipython:: python
+
+    def gaussian_2d(coords, a, xc, yc, xalpha, yalpha):
+        x, y = coords
+        z = a * np.exp(
+            -np.square(x - xc) / 2 / np.square(xalpha)
+            - np.square(y - yc) / 2 / np.square(yalpha)
+        )
+        return z
+
+
+    def multi_peak(coords, *args):
+        z = np.zeros(coords[0].shape)
+        for i in range(len(args) // 5):
+            z += gaussian_2d(coords, *args[i * 5 : i * 5 + 5])
+        return z
+
+
+    x = np.arange(-5, 5, 0.1)
+    y = np.arange(-5, 5, 0.1)
+    X, Y = np.meshgrid(x, y)
+
+    n_peaks = 2
+    names = ["a", "xc", "yc", "xalpha", "yalpha"]
+    names = [f"{name}{i}" for i in range(n_peaks) for name in names]
+    Z = gaussian_2d((X, Y), 3, 1, 1, 2, 1) + gaussian_2d((X, Y), 2, -1, -2, 1, 1)
+    Z += np.random.normal(scale=0.1, size=Z.shape)
+
+    da = xr.DataArray(Z, dims=["y", "x"], coords={"y": y, "x": x})
+    da.curvefit(
+        coords=["x", "y"],
+        func=multi_peak,
+        param_names=names,
+        kwargs={"maxfev": 10000},
+    )
+
+.. note::
+    This method replicates the behavior of :py:func:`scipy.optimize.curve_fit`.
+
+
 .. _compute.broadcasting:

 Broadcasting by dimension name
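The documentation above stops short of showing what to do with the covariance output. As a sketch (not part of the patch; it reuses ``ds`` and ``exponential`` from the example above, and the diagonal-extraction idiom is just one option), the one-sigma parameter uncertainties can be pulled from the diagonal of ``[var]_curvefit_covariance``:

```python
import numpy as np
import xarray as xr

# Fit as in the documentation example above.
fit = ds[["var2"]].curvefit(
    coords=ds.var1,
    func=exponential,
    reduce_dims="t",
    bounds={"a": (0.5, 5), "xc": (-5, 5)},
)

# "cov_i"/"cov_j" are labeled with the parameter names, so the variance of
# each parameter sits on the diagonal of the covariance matrix.
cov = fit["var2_curvefit_covariance"]
stderr = xr.concat(
    [np.sqrt(cov.sel(cov_i=p, cov_j=p)) for p in cov["cov_i"].values],
    dim="param",
).assign_coords(param=list(cov["cov_i"].values))
```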
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 408b59d3c6a..5e394236096 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -65,6 +65,8 @@ New Features
   :py:meth:`~pandas.core.groupby.GroupBy.get_group`.
   By `Deepak Cherian `_.
 - Disable the `cfgrib` backend if the `eccodes` library is not installed (:pull:`5083`).
   By `Baudouin Raoult `_.
+- Added :py:meth:`DataArray.curvefit` and :py:meth:`Dataset.curvefit` for general curve fitting applications (:issue:`4300`, :pull:`4849`).
+  By `Sam Levang `_.

 Breaking changes
 ~~~~~~~~~~~~~~~~
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 1f82b48d7f8..3b15144df4e 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -4418,6 +4418,84 @@ def query(
         )
         return ds[self.name]

+    def curvefit(
+        self,
+        coords: Union[Union[str, "DataArray"], Iterable[Union[str, "DataArray"]]],
+        func: Callable[..., Any],
+        reduce_dims: Union[Hashable, Iterable[Hashable]] = None,
+        skipna: bool = True,
+        p0: Dict[str, Any] = None,
+        bounds: Dict[str, Any] = None,
+        param_names: Sequence[str] = None,
+        kwargs: Dict[str, Any] = None,
+    ):
+        """
+        Curve fitting optimization for arbitrary functions.
+
+        Wraps `scipy.optimize.curve_fit` with `apply_ufunc`.
+
+        Parameters
+        ----------
+        coords : DataArray, str, or sequence of DataArray or str
+            Independent coordinate(s) over which to perform the curve fitting. Must share
+            at least one dimension with the calling object. When fitting multi-dimensional
+            functions, supply `coords` as a sequence in the same order as arguments in
+            `func`. To fit along existing dimensions of the calling object, `coords` can
+            also be specified as a str or sequence of strs.
+        func : callable
+            User-specified function in the form `f(x, *params)` which returns a numpy
+            array of length `len(x)`. `params` are the fittable parameters which are
+            optimized by scipy curve_fit. `x` can also be specified as a sequence
+            containing multiple coordinates, e.g. `f((x0, x1), *params)`.
+        reduce_dims : str or sequence of str, optional
+            Additional dimension(s) over which to aggregate while fitting. For example,
+            calling `ds.curvefit(coords='time', reduce_dims=['lat', 'lon'], ...)` will
+            aggregate all lat and lon points and fit the specified function along the
+            time dimension.
+        skipna : bool, optional
+            Whether to skip missing values when fitting. Default is True.
+        p0 : dict, optional
+            Optional dictionary of parameter names to initial guesses passed to the
+            `curve_fit` `p0` arg. If none or only some parameters are passed, the rest
+            will be assigned initial values following the default scipy behavior.
+        bounds : dict, optional
+            Optional dictionary of parameter names to bounding values passed to the
+            `curve_fit` `bounds` arg. If none or only some parameters are passed, the
+            rest will be unbounded following the default scipy behavior.
+        param_names : sequence of str, optional
+            Sequence of names for the fittable parameters of `func`. If not supplied,
+            this will be automatically determined by arguments of `func`. `param_names`
+            should be manually supplied when fitting a function that takes a variable
+            number of parameters.
+        kwargs : dict, optional
+            Additional keyword arguments passed to scipy curve_fit.
+
+        Returns
+        -------
+        curvefit_results : Dataset
+            A single dataset which contains:
+
+            [var]_curvefit_coefficients
+                The coefficients of the best fit.
+            [var]_curvefit_covariance
+                The covariance matrix of the coefficient estimates.
+
+        See also
+        --------
+        DataArray.polyfit
+        scipy.optimize.curve_fit
+        """
+        return self._to_temp_dataset().curvefit(
+            coords,
+            func,
+            reduce_dims=reduce_dims,
+            skipna=skipna,
+            p0=p0,
+            bounds=bounds,
+            param_names=param_names,
+            kwargs=kwargs,
+        )
+
     # this needs to be at the end, or mypy will confuse with `str`
     # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
     str = utils.UncachedAccessor(StringAccessor)
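A minimal usage sketch of the method added above (data and names are illustrative, mirroring the exponential-decay test added later in this patch):

```python
import numpy as np
import xarray as xr


def exp_decay(t, n0, tau):
    return n0 * np.exp(-t / tau)


t = np.arange(0, 5, 0.5)
da = xr.DataArray(exp_decay(t, 5, 3), dims="t", coords={"t": t})

# Fit along the existing dimension "t"; the parameter names "n0" and "tau"
# are inferred from the signature of exp_decay.
fit = da.curvefit(coords="t", func=exp_decay, p0={"n0": 4})
print(fit["curvefit_coefficients"].sel(param="tau"))
```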
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index e959135e8d9..8beacbf8c42 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -1,6 +1,7 @@
 import copy
 import datetime
 import functools
+import inspect
 import sys
 import warnings
 from collections import defaultdict
@@ -456,6 +457,63 @@ def as_dataset(obj: Any) -> "Dataset":
     return obj


+def _get_func_args(func, param_names):
+    """Use `inspect.signature` to try accessing `func` args. Otherwise, ensure
+    they are provided by the user.
+    """
+    try:
+        func_args = inspect.signature(func).parameters
+    except ValueError:
+        func_args = {}
+        if not param_names:
+            raise ValueError(
+                "Unable to inspect `func` signature, and `param_names` was not provided."
+            )
+    if param_names:
+        params = param_names
+    else:
+        params = list(func_args)[1:]
+        if any(
+            p.kind in [p.VAR_POSITIONAL, p.VAR_KEYWORD] for p in func_args.values()
+        ):
+            raise ValueError(
+                "`param_names` must be provided because `func` takes variable length arguments."
+            )
+    return params, func_args
+
+
+def _initialize_curvefit_params(params, p0, bounds, func_args):
+    """Set initial guess and bounds for curvefit.
+    Priority: 1) passed args 2) func signature 3) scipy defaults
+    """
+
+    def _initialize_feasible(lb, ub):
+        # Mimics functionality of scipy.optimize.minpack._initialize_feasible
+        lb_finite = np.isfinite(lb)
+        ub_finite = np.isfinite(ub)
+        p0 = np.nansum(
+            [
+                0.5 * (lb + ub) * int(lb_finite & ub_finite),
+                (lb + 1) * int(lb_finite & ~ub_finite),
+                (ub - 1) * int(~lb_finite & ub_finite),
+            ]
+        )
+        return p0
+
+    param_defaults = {p: 1 for p in params}
+    bounds_defaults = {p: (-np.inf, np.inf) for p in params}
+    for p in params:
+        if p in func_args and func_args[p].default is not func_args[p].empty:
+            param_defaults[p] = func_args[p].default
+        if p in bounds:
+            bounds_defaults[p] = tuple(bounds[p])
+            if param_defaults[p] < bounds[p][0] or param_defaults[p] > bounds[p][1]:
+                param_defaults[p] = _initialize_feasible(bounds[p][0], bounds[p][1])
+        if p in p0:
+            param_defaults[p] = p0[p]
+    return param_defaults, bounds_defaults
+
+
 class DataVariables(Mapping[Hashable, "DataArray"]):
     __slots__ = ("_dataset",)
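To make the priority scheme concrete, here is a small illustration of the two helpers (it mirrors the unit tests added at the end of this patch; these are private functions, so this is exposition only):

```python
import numpy as np


def exp_decay(t, n0, tau=1):
    return n0 * np.exp(-t / tau)


# The first argument of `func` is the coordinate; the rest are parameters.
params, func_args = _get_func_args(exp_decay, [])
assert params == ["n0", "tau"]

p0, bounds = _initialize_curvefit_params(
    params, {"n0": 4}, {"tau": [5, np.inf]}, func_args
)
# n0 == 4 comes from the user guess (priority 1). tau's signature default
# of 1 violates its lower bound of 5, so _initialize_feasible picks
# lb + 1 == 6 (finite lower bound, infinite upper bound).
assert p0 == {"n0": 4, "tau": 6}
assert bounds == {"n0": (-np.inf, np.inf), "tau": (5, np.inf)}
```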
@@ -7074,5 +7132,180 @@ def query(
         # apply the selection
         return self.isel(indexers, missing_dims=missing_dims)

+    def curvefit(
+        self,
+        coords: Union[Union[str, "DataArray"], Iterable[Union[str, "DataArray"]]],
+        func: Callable[..., Any],
+        reduce_dims: Union[Hashable, Iterable[Hashable]] = None,
+        skipna: bool = True,
+        p0: Dict[str, Any] = None,
+        bounds: Dict[str, Any] = None,
+        param_names: Sequence[str] = None,
+        kwargs: Dict[str, Any] = None,
+    ):
+        """
+        Curve fitting optimization for arbitrary functions.
+
+        Wraps `scipy.optimize.curve_fit` with `apply_ufunc`.
+
+        Parameters
+        ----------
+        coords : DataArray, str, or sequence of DataArray or str
+            Independent coordinate(s) over which to perform the curve fitting. Must share
+            at least one dimension with the calling object. When fitting multi-dimensional
+            functions, supply `coords` as a sequence in the same order as arguments in
+            `func`. To fit along existing dimensions of the calling object, `coords` can
+            also be specified as a str or sequence of strs.
+        func : callable
+            User-specified function in the form `f(x, *params)` which returns a numpy
+            array of length `len(x)`. `params` are the fittable parameters which are
+            optimized by scipy curve_fit. `x` can also be specified as a sequence
+            containing multiple coordinates, e.g. `f((x0, x1), *params)`.
+        reduce_dims : str or sequence of str, optional
+            Additional dimension(s) over which to aggregate while fitting. For example,
+            calling `ds.curvefit(coords='time', reduce_dims=['lat', 'lon'], ...)` will
+            aggregate all lat and lon points and fit the specified function along the
+            time dimension.
+        skipna : bool, optional
+            Whether to skip missing values when fitting. Default is True.
+        p0 : dict, optional
+            Optional dictionary of parameter names to initial guesses passed to the
+            `curve_fit` `p0` arg. If none or only some parameters are passed, the rest
+            will be assigned initial values following the default scipy behavior.
+        bounds : dict, optional
+            Optional dictionary of parameter names to bounding values passed to the
+            `curve_fit` `bounds` arg. If none or only some parameters are passed, the
+            rest will be unbounded following the default scipy behavior.
+        param_names : sequence of str, optional
+            Sequence of names for the fittable parameters of `func`. If not supplied,
+            this will be automatically determined by arguments of `func`. `param_names`
+            should be manually supplied when fitting a function that takes a variable
+            number of parameters.
+        kwargs : dict, optional
+            Additional keyword arguments passed to scipy curve_fit.
+
+        Returns
+        -------
+        curvefit_results : Dataset
+            A single dataset which contains:
+
+            [var]_curvefit_coefficients
+                The coefficients of the best fit.
+            [var]_curvefit_covariance
+                The covariance matrix of the coefficient estimates.
+
+        See also
+        --------
+        Dataset.polyfit
+        scipy.optimize.curve_fit
+        """
+        from scipy.optimize import curve_fit
+
+        if p0 is None:
+            p0 = {}
+        if bounds is None:
+            bounds = {}
+        if kwargs is None:
+            kwargs = {}
+
+        if not reduce_dims:
+            reduce_dims_ = []
+        elif isinstance(reduce_dims, str) or not isinstance(reduce_dims, Iterable):
+            reduce_dims_ = [reduce_dims]
+        else:
+            reduce_dims_ = list(reduce_dims)
+
+        if (
+            isinstance(coords, str)
+            or isinstance(coords, xr.DataArray)
+            or not isinstance(coords, Iterable)
+        ):
+            coords = [coords]
+        coords_ = [self[coord] if isinstance(coord, str) else coord for coord in coords]
+
+        # Determine whether any coords are dims on self
+        for coord in coords_:
+            reduce_dims_ += [c for c in self.dims if coord.equals(self[c])]
+        reduce_dims_ = list(set(reduce_dims_))
+        preserved_dims = list(set(self.dims) - set(reduce_dims_))
+        if not reduce_dims_:
+            raise ValueError(
+                "No arguments to `coords` were identified as a dimension on the calling "
+                "object, and no dims were supplied to `reduce_dims`. This would result "
+                "in fitting on scalar data."
+            )
+
+        # Broadcast all coords with each other
+        coords_ = xr.broadcast(*coords_)
+        coords_ = [
+            coord.broadcast_like(self, exclude=preserved_dims) for coord in coords_
+        ]
+
+        params, func_args = _get_func_args(func, param_names)
+        param_defaults, bounds_defaults = _initialize_curvefit_params(
+            params, p0, bounds, func_args
+        )
+        n_params = len(params)
+        kwargs.setdefault("p0", [param_defaults[p] for p in params])
+        kwargs.setdefault(
+            "bounds",
+            [
+                [bounds_defaults[p][0] for p in params],
+                [bounds_defaults[p][1] for p in params],
+            ],
+        )
+
+        def _wrapper(Y, *coords_, **kwargs):
+            # Wrap curve_fit with raveled coordinates and pointwise NaN handling
+            x = np.vstack([c.ravel() for c in coords_])
+            y = Y.ravel()
+            if skipna:
+                mask = np.all([np.any(~np.isnan(x), axis=0), ~np.isnan(y)], axis=0)
+                x = x[:, mask]
+                y = y[mask]
+                if not len(y):
+                    popt = np.full([n_params], np.nan)
+                    pcov = np.full([n_params, n_params], np.nan)
+                    return popt, pcov
+            x = np.squeeze(x)
+            popt, pcov = curve_fit(func, x, y, **kwargs)
+            return popt, pcov
+
+        result = xr.Dataset()
+        for name, da in self.data_vars.items():
+            if name is xr.core.dataarray._THIS_ARRAY:
+                name = ""
+            else:
+                name = f"{str(name)}_"
+
+            popt, pcov = xr.apply_ufunc(
+                _wrapper,
+                da,
+                *coords_,
+                vectorize=True,
+                dask="parallelized",
+                input_core_dims=[reduce_dims_ for _ in range(len(coords_) + 1)],
+                output_core_dims=[["param"], ["cov_i", "cov_j"]],
+                dask_gufunc_kwargs={
+                    "output_sizes": {
+                        "param": n_params,
+                        "cov_i": n_params,
+                        "cov_j": n_params,
+                    },
+                },
+                output_dtypes=(np.float64, np.float64),
+                exclude_dims=set(reduce_dims_),
+                kwargs=kwargs,
+            )
+            result[name + "curvefit_coefficients"] = popt
+            result[name + "curvefit_covariance"] = pcov
+
+        result = result.assign_coords(
+            {"param": params, "cov_i": params, "cov_j": params}
+        )
+        result.attrs = self.attrs.copy()
+
+        return result
+

 ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False)
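The pointwise NaN handling inside `_wrapper` above can be seen in isolation with plain numpy (a sketch, not part of the patch): a sample survives if its data value is finite and at least one of its coordinates is finite.

```python
import numpy as np

# Two raveled coordinates with NaNs in different places.
x = np.vstack(
    [
        np.array([0.0, 1.0, np.nan, np.nan]),
        np.array([0.0, np.nan, 2.0, np.nan]),
    ]
)
y = np.array([1.0, 2.0, 3.0, 4.0])

# Same mask expression as in _wrapper above.
mask = np.all([np.any(~np.isnan(x), axis=0), ~np.isnan(y)], axis=0)
print(mask)        # [ True  True  True False]: only the all-NaN column drops
print(x[:, mask])  # remaining coordinate columns
print(y[mask])     # remaining data values
```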
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index 20e357543d6..c18f9acfca1 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -4690,6 +4690,60 @@ def test_query(self, backend, engine, parser):
         with pytest.raises(UndefinedVariableError):
             aa.query(x="spam > 50")  # name not present

+    @requires_scipy
+    @pytest.mark.parametrize("use_dask", [True, False])
+    def test_curvefit(self, use_dask):
+        if use_dask and not has_dask:
+            pytest.skip("requires dask")
+
+        def exp_decay(t, n0, tau=1):
+            return n0 * np.exp(-t / tau)
+
+        t = np.arange(0, 5, 0.5)
+        da = DataArray(
+            np.stack([exp_decay(t, 3, 3), exp_decay(t, 5, 4), np.nan * t], axis=-1),
+            dims=("t", "x"),
+            coords={"t": t, "x": [0, 1, 2]},
+        )
+        da[0, 0] = np.nan
+
+        expected = DataArray(
+            [[3, 3], [5, 4], [np.nan, np.nan]],
+            dims=("x", "param"),
+            coords={"x": [0, 1, 2], "param": ["n0", "tau"]},
+        )
+
+        if use_dask:
+            da = da.chunk({"x": 1})
+
+        fit = da.curvefit(
+            coords=[da.t], func=exp_decay, p0={"n0": 4}, bounds={"tau": [2, 6]}
+        )
+        assert_allclose(fit.curvefit_coefficients, expected, rtol=1e-3)
+
+        da = da.compute()
+        fit = da.curvefit(coords="t", func=np.power, reduce_dims="x", param_names=["a"])
+        assert "a" in fit.param
+        assert "x" not in fit.dims
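The second fit in the test above exercises `reduce_dims`. As a hedged sketch of the semantics (variable names illustrative, not part of the patch): pooling a dimension yields one shared fit rather than one fit per index.

```python
import numpy as np
import xarray as xr


def linear(t, a, b):
    return a * t + b


rng = np.random.default_rng(0)
t = np.arange(10.0)
ds = xr.Dataset(
    {"temp": (("t", "x"), 2.0 * t[:, None] + rng.normal(size=(10, 3)))},
    coords={"t": t},
)

per_x = ds.curvefit(coords="t", func=linear)
pooled = ds.curvefit(coords="t", func=linear, reduce_dims="x")
# per_x["temp_curvefit_coefficients"] has dims ("x", "param"): one fit per x.
# pooled["temp_curvefit_coefficients"] has dims ("param",): all x samples
# are aggregated into a single fit along t.
```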
+
+    def test_curvefit_helpers(self):
+        def exp_decay(t, n0, tau=1):
+            return n0 * np.exp(-t / tau)
+
+        params, func_args = xr.core.dataset._get_func_args(exp_decay, [])
+        assert params == ["n0", "tau"]
+        param_defaults, bounds_defaults = xr.core.dataset._initialize_curvefit_params(
+            params, {"n0": 4}, {"tau": [5, np.inf]}, func_args
+        )
+        assert param_defaults == {"n0": 4, "tau": 6}
+        assert bounds_defaults == {"n0": (-np.inf, np.inf), "tau": (5, np.inf)}
+
+        param_names = ["a"]
+        params, func_args = xr.core.dataset._get_func_args(np.power, param_names)
+        assert params == param_names
+        with pytest.raises(ValueError):
+            xr.core.dataset._get_func_args(np.power, [])
+

 class TestReduce:
     @pytest.fixture(autouse=True)