Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Coarsen #2612

Merged
merged 30 commits into from
Jan 6, 2019
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
3525b9c
Added variable.coarsen
fujiisoup Dec 15, 2018
5ff3102
Added DataArray.coarsen and Dataset.coarsen
fujiisoup Dec 16, 2018
6f3cf0c
pep8
fujiisoup Dec 16, 2018
f1f4804
a bugfix for mpa3
fujiisoup Dec 17, 2018
ab5d2f6
Support mean for datatime dtype
fujiisoup Dec 17, 2018
9123fd4
nanmean for DateTime
fujiisoup Dec 17, 2018
c85d18a
API updatedd via comments
fujiisoup Dec 20, 2018
0aa7a37
bug fix in tests
fujiisoup Dec 20, 2018
b656d62
updated docs
fujiisoup Dec 20, 2018
2ffcb23
Merge branch 'master' into corsen
fujiisoup Dec 20, 2018
04773eb
use pd.isnull rather than isnat
fujiisoup Dec 21, 2018
b33020b
support Variable in datetime_to_numeric
fujiisoup Dec 21, 2018
b13af18
use pd.isnull instead of numpy.isnat in test
fujiisoup Dec 21, 2018
24f3061
Merge branch 'master' into corsen
fujiisoup Dec 24, 2018
d806c96
Added an example to doc.
fujiisoup Dec 24, 2018
96bf29b
coordinate_func -> coord_func. Support 0d-array mean with datetime
fujiisoup Dec 24, 2018
b70996a
Added an two dimensional example
fujiisoup Dec 24, 2018
827794e
flake8
fujiisoup Dec 24, 2018
a354005
Merge branch 'master' into corsen
fujiisoup Dec 25, 2018
82c08af
flake8
fujiisoup Dec 25, 2018
d73d1d5
a potential bug fix
fujiisoup Dec 25, 2018
a92c431
Update via comments
fujiisoup Dec 30, 2018
0e53c7b
Always use datetime64[ns] in mean
fujiisoup Dec 30, 2018
07b8060
Added tests for 2d coarsen with value check
fujiisoup Dec 31, 2018
aa41f39
update via comment
fujiisoup Jan 3, 2019
4c347af
Merge branch 'master' into corsen
fujiisoup Jan 3, 2019
2a06b05
whats new
fujiisoup Jan 3, 2019
50fa6aa
Merge branch 'master' into corsen
fujiisoup Jan 3, 2019
1d04bdd
typo fix
fujiisoup Jan 4, 2019
1523292
Merge branch 'master' into corsen
shoyer Jan 6, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 50 additions & 1 deletion xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,55 @@ def rolling(self, dim=None, min_periods=None, center=False, **dim_kwargs):
return self._rolling_cls(self, dim, min_periods=min_periods,
center=center)

def coarsen(self, dim=None, side='left', trim_excess=False,
coordinate_func=None, **dim_kwargs):
"""
Coarsen object.

Parameters
----------
dim: dict, optional
Mapping from the dimension name to the window size.
side : 'left' or 'right', or dict
If 'left', coarsening windows start from the 0th index, and any excess
entries at the right end are removed (if trim_excess is True).
If 'right', coarsening windows end at the rightmost entry, and any
excess entries at the left end are removed.
trim_excess : boolean, default False
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to have an API that lets us express at least three options:

  • trim excess entries
  • pad with NaN
  • raise an error if the shape does not divide exactly (this is probably the safest default behavior)

Maybe a string valued keyword argument would work better here, e.g., boundary='trim', boundary='pad' and boundary='exact'?

I would also suggest putting this argument before side, since side is only used for a particular (non-default) value of this argument.

If True, the excess entries are trimmed. If False, the data is padded
with np.nan.
**dim_kwargs : optional
The keyword arguments form of ``dim``.
One of dim or dim_kwargs must be provided.

Returns
-------
Coarsen object (core.rolling.DataArrayCoarsen for DataArray,
core.rolling.DatasetCoarsen for Dataset.)


Examples
--------
Coarsen the long time series by averaging for every seven data.

>>> da = xr.DataArray(np.linspace(0, 365, num=365),
... dims='time',
... coords={'time': pd.date_range(
... '15/12/1999', periods=365)})
>>> da
# TODO add example

See Also
--------
core.rolling.DataArrayCoarsen
core.rolling.DatasetCoarsen
"""
dim = either_dict_or_kwargs(dim, dim_kwargs, 'coarsen')
return self._coarsen_cls(
self, dim, side=side, trim_excess=trim_excess,
coordinate_func=coordinate_func)


def resample(self, indexer=None, skipna=None, closed=None, label=None,
base=0, keep_attrs=None, **indexer_kwargs):
"""Returns a Resample object for performing resampling operations.
Expand Down Expand Up @@ -673,7 +722,7 @@ def resample(self, indexer=None, skipna=None, closed=None, label=None,
raise TypeError('resample() no longer supports the `how` or '
'`dim` arguments. Instead call methods on resample '
"objects, e.g., data.resample(time='1D').mean()")

indexer = either_dict_or_kwargs(indexer, indexer_kwargs, 'resample')

if len(indexer) != 1:
Expand Down
1 change: 1 addition & 0 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ class DataArray(AbstractArray, DataWithCoords):
"""
_groupby_cls = groupby.DataArrayGroupBy
_rolling_cls = rolling.DataArrayRolling
_coarsen_cls = rolling.DataArrayCoarsen
_resample_cls = resample.DataArrayResample

dt = property(DatetimeAccessor)
Expand Down
1 change: 1 addition & 0 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ class Dataset(Mapping, ImplementsDatasetReduce, DataWithCoords,
"""
_groupby_cls = groupby.DatasetGroupBy
_rolling_cls = rolling.DatasetRolling
_coarsen_cls = rolling.DatasetCoarsen
_resample_cls = resample.DatasetResample

def __init__(self, data_vars=None, coords=None, attrs=None,
Expand Down
26 changes: 26 additions & 0 deletions xarray/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,20 @@
New {da_or_ds} object with `{name}` applied along its rolling dimnension.
"""

_COARSEN_REDUCE_DOCSTRING_TEMPLATE = """\
Coarsen this object by applying `{name}` along its dimensions.

Parameters
----------
**kwargs : dict
Additional keyword arguments passed on to `{name}`.

Returns
-------
reduced : DataArray or Dataset
New object with `{name}` applied along its coasen dimnensions.
"""


def fillna(data, other, join="left", dataset_join="left"):
"""Fill missing values in this object with data from the other object.
Expand Down Expand Up @@ -378,3 +392,15 @@ def inject_datasetrolling_methods(cls):
func.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format(
name=func.__name__, da_or_ds='Dataset')
setattr(cls, 'count', func)


def inject_coarsen_methods(cls):
    """Attach one reduction method per entry of NAN_REDUCE_METHODS to ``cls``.

    Each reducer is looked up by name on ``duck_array_ops``, wrapped with
    ``cls._reduce_method`` and stored on ``cls`` under the same name, with
    ``__name__`` and ``__doc__`` filled in from the coarsen template.
    """
    # standard numpy reduce methods; all of them are resolved before the
    # class is touched, exactly as the lookups are ordered below.
    pending = []
    for method_name in NAN_REDUCE_METHODS:
        pending.append((method_name, getattr(duck_array_ops, method_name)))

    for method_name, reducer in pending:
        wrapped = cls._reduce_method(reducer)
        wrapped.__name__ = method_name
        wrapped.__doc__ = _COARSEN_REDUCE_DOCSTRING_TEMPLATE.format(
            name=wrapped.__name__)
        setattr(cls, method_name, wrapped)
144 changes: 121 additions & 23 deletions xarray/core/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,14 @@

import numpy as np

from . import dtypes
from . import dtypes, utils
from .dask_array_ops import dask_rolling_wrapper
from .ops import (
bn, has_bottleneck, inject_bottleneck_rolling_methods,
inject_datasetrolling_methods)
bn, has_bottleneck, inject_coarsen_methods,
inject_bottleneck_rolling_methods, inject_datasetrolling_methods)
from .pycompat import OrderedDict, dask_array_type, zip


def _get_new_dimname(dims, new_dim):
""" Get an new dimension name based on new_dim, that is not used in dims.
If the same name exists, we add an underscore(s) in the head.

Example1:
dims: ['a', 'b', 'c']
new_dim: ['_rolling']
-> ['_rolling']
Example2:
dims: ['a', 'b', 'c', '_rolling']
new_dim: ['_rolling']
-> ['__rolling']
"""
while new_dim in dims:
new_dim = '_' + new_dim
return new_dim


class Rolling(object):
"""A object that implements the moving window pattern.

Expand Down Expand Up @@ -231,7 +213,7 @@ def reduce(self, func, **kwargs):
reduced : DataArray
Array with summarized data.
"""
rolling_dim = _get_new_dimname(self.obj.dims, '_rolling_dim')
rolling_dim = utils.get_temp_dimname(self.obj.dims, '_rolling_dim')
windows = self.construct(rolling_dim)
result = windows.reduce(func, dim=rolling_dim, **kwargs)

Expand All @@ -242,7 +224,7 @@ def reduce(self, func, **kwargs):
def _counts(self):
""" Number of non-nan entries in each rolling window. """

rolling_dim = _get_new_dimname(self.obj.dims, '_rolling_dim')
rolling_dim = utils.get_temp_dimname(self.obj.dims, '_rolling_dim')
# We use False as the fill_value instead of np.nan, since boolean
# array is faster to be reduced than object array.
# The use of skipna==False is also faster since it does not need to
Expand Down Expand Up @@ -454,5 +436,121 @@ def construct(self, window_dim, stride=1, fill_value=dtypes.NA):
**{self.dim: slice(None, None, stride)})


class Coarsen(object):
"""A object that implements the coarsen.

See Also
--------
Dataset.coarsen
DataArray.coarsen
"""

_attributes = ['windows', 'side', 'trim_excess']

def __init__(self, obj, windows, side, trim_excess, coordinate_func):
"""
Moving window object.

Parameters
----------
obj : Dataset or DataArray
Object to window.
windows : A mapping from a dimension name to window size
dim : str
Name of the dimension to create the rolling iterator
along (e.g., `time`).
window : int
Size of the moving window.
side : 'left' or 'right' or mapping from dimension to 'left' or 'right'
coordinate_func: mapping from coordinate name to func.

trim_excess : boolean, or dict of boolean default False
Set the labels at the center of the window.

Returns
-------
coarsen
"""
self.obj = obj
self.windows = windows
self.side = side
self.trim_excess = trim_excess

if coordinate_func is None:
coordinate_func = {}
for c in self.obj.coords:
if c not in coordinate_func:
coordinate_func[c] = np.mean
self.coordinate_func = coordinate_func

def __repr__(self):
"""provide a nice str repr of our rolling object"""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rolling -> coarsen


attrs = ["{k}->{v}".format(k=k, v=getattr(self, k))
for k in self._attributes
if getattr(self, k, None) is not None]
return "{klass} [{attrs}]".format(klass=self.__class__.__name__,
attrs=','.join(attrs))


class DataArrayCoarsen(Coarsen):
    @classmethod
    def _reduce_method(cls, func):
        """
        Return a method that coarsens ``self.obj`` with the reducer ``func``.
        see ops.inject_coarsen_methods
        """
        def wrapped_func(self, **kwargs):
            import functools
            from .dataarray import DataArray

            # The injected docstring promises that keyword arguments are
            # passed on to the reducer, so bind them to `func` here;
            # Variable.coarsen itself only supplies `axis`.
            if kwargs:
                reducer = functools.partial(func, **kwargs)
            else:
                reducer = func

            reduced = self.obj.variable.coarsen(
                self.windows, reducer, self.side, self.trim_excess)
            coords = {}
            for c, v in self.obj.coords.items():
                if c == self.obj.name:
                    # coordinate with the same name as the array itself:
                    # reuse the already coarsened data
                    coords[c] = reduced
                elif any(d in self.windows for d in v.dims):
                    # coordinates along a coarsened dimension are reduced
                    # with their own function, not with `func`
                    coords[c] = v.variable.coarsen(
                        self.windows, self.coordinate_func[c], self.side,
                        self.trim_excess)
                else:
                    coords[c] = v
            return DataArray(reduced, dims=self.obj.dims, coords=coords)

        return wrapped_func


class DatasetCoarsen(Coarsen):
@classmethod
def _reduce_method(cls, func):
"""
Return a wrapped function for injecting numpy and bottleneck methods.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bottoleneck -> bottleneck

see ops.inject_coarsen_methods
"""
def wrapped_func(self, **kwargs):
    import functools
    from .dataset import Dataset

    # The injected docstring promises that keyword arguments are passed
    # on to the reducer, so bind them to `func` here; Variable.coarsen
    # itself only supplies `axis`.
    if kwargs:
        reducer = functools.partial(func, **kwargs)
    else:
        reducer = func

    reduced = OrderedDict()
    for key, da in self.obj.data_vars.items():
        reduced[key] = da.variable.coarsen(
            self.windows, reducer, self.side, self.trim_excess)

    coords = {}
    for c, v in self.obj.coords.items():
        if any(d in self.windows for d in v.dims):
            # coordinates along a coarsened dimension are reduced with
            # their own function, not with `func`
            coords[c] = v.variable.coarsen(
                self.windows, self.coordinate_func[c], self.side,
                self.trim_excess)
        else:
            coords[c] = v.variable
    return Dataset(reduced, coords=coords)

return wrapped_func



inject_bottleneck_rolling_methods(DataArrayRolling)
inject_datasetrolling_methods(DatasetRolling)
inject_coarsen_methods(DataArrayCoarsen)
inject_coarsen_methods(DatasetCoarsen)
18 changes: 18 additions & 0 deletions xarray/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,3 +629,21 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
if datetime_unit:
return (array / np.timedelta64(1, datetime_unit)).astype(dtype)
return array.astype(dtype)


def get_temp_dimname(dims, new_dim):
    """Return a dimension name based on ``new_dim`` that is unused in ``dims``.

    If the name is already taken, underscores are added at its head, one
    at a time, until the result is not contained in ``dims``.

    Example1:
        dims: ['a', 'b', 'c']
        new_dim: '_rolling'
        -> '_rolling'
    Example2:
        dims: ['a', 'b', 'c', '_rolling']
        new_dim: '_rolling'
        -> '__rolling'
    """
    name = new_dim
    while True:
        if name not in dims:
            return name
        name = '_' + name
67 changes: 67 additions & 0 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1632,6 +1632,73 @@ def rolling_window(self, dim, window, window_dim, center=False,
array, axis=self.get_axis_num(dim), window=window,
center=center, fill_value=fill_value))

def coarsen(self, windows, func, side='left', trim_excess=False):
    """
    Apply ``func`` over non-overlapping windows along the given dimensions.

    Parameters
    ----------
    windows : mapping from dimension name to window size
        Entries whose dimension is not in ``self.dims`` are ignored.
    func : callable
        Called as ``func(reshaped, axis=axis)``, where ``reshaped`` is the
        result of ``_coarsen_reshape`` and ``axis`` is the tuple of axes
        of the temporary within-window dimensions.
    side : 'left' or 'right', or mapping from dimension name to either
        Forwarded to ``_coarsen_reshape``.
    trim_excess : boolean, or mapping from dimension name to boolean
        Forwarded to ``_coarsen_reshape``.

    Returns
    -------
    Object of the same type as ``self`` built from ``self.dims``, the
    result of ``func`` and ``self._attrs``.
    """
    windows = {k: v for k, v in windows.items() if k in self.dims}

    # Temporary names for the within-window dimensions. Names that have
    # already been handed out are added to `taken`, so that two coarsened
    # dimensions (e.g. 'x' and '_x') can never receive the same name.
    taken = list(self.dims)
    new_dimensions = {}
    for k in windows:
        new_dimensions[k] = utils.get_temp_dimname(taken, k)
        taken.append(new_dimensions[k])

    reshaped = self._coarsen_reshape(windows, side, trim_excess,
                                     new_dimensions)

    axis = tuple(reshaped.get_axis_num(d)
                 for d in new_dimensions.values())
    return type(self)(self.dims, func(reshaped, axis=axis), self._attrs)

def _coarsen_reshape(self, windows, side, trim_excess, coarsen_dimensions):
    """
    Construct a reshaped variable for coarsen.

    Every dimension ``d`` in ``windows`` is first trimmed or padded so
    that its length is a multiple of ``windows[d]``. It is then split
    into two adjacent dimensions: ``d`` (one entry per window) followed
    by ``coarsen_dimensions[d]`` (the entries inside one window).

    Parameters
    ----------
    windows : mapping from dimension name to window size
    side : 'left' or 'right', or mapping from dimension name to either
        With 'left', entries are trimmed from (or padding is added to)
        the end of the dimension; otherwise the start is used.
    trim_excess : boolean, or mapping from dimension name to boolean
        If true the excess entries are dropped, otherwise the dimension
        is padded with ``pad_with_fill_value``.
    coarsen_dimensions : mapping from dimension name to the name of the
        new within-window dimension.

    Returns
    -------
    Variable holding the reshaped data and ``self._attrs``. If ``windows``
    is empty, an object of ``type(self)`` with unchanged dims and data.

    Raises
    ------
    ValueError
        If a window size is not positive.
    """
    if not utils.is_dict_like(side):
        side = {d: side for d in windows.keys()}

    if not utils.is_dict_like(trim_excess):
        trim_excess = {d: trim_excess for d in windows.keys()}

    # remove unrelated dimensions
    side = {k: v for k, v in side.items() if k in self.dims}
    trim_excess = {k: v for k, v in trim_excess.items() if k in self.dims}

    if not windows:
        return type(self)(self.dims, self.data, self._attrs)

    for d, window in windows.items():
        if window <= 0:
            raise ValueError('window must be > 0')

    variable = self
    for d, window in windows.items():
        # trim or pad the object
        size = variable.shape[self._get_axis_num(d)]
        n = size // window  # number of complete windows
        excess = size - window * n  # entries left over after them
        if trim_excess[d]:
            if side[d] == 'left':
                variable = variable.isel({d: slice(window * n)})
            else:
                variable = variable.isel({d: slice(excess, None)})
        else:  # pad
            # Only complete the last, partial window. When the size is
            # already a multiple of the window no entries are added
            # (previously a whole extra window of fill values appeared).
            pad = window - excess if excess else 0
            if side[d] == 'left':
                pad_widths = {d: (0, pad)}
            else:
                pad_widths = {d: (pad, 0)}
            variable = variable.pad_with_fill_value(pad_widths)

    shape = []
    dims = []
    for i, d in enumerate(variable.dims):
        if d in windows:
            # split the axis into (number of windows, window size)
            size = variable.shape[i]
            shape.append(size // windows[d])
            shape.append(windows[d])
            dims.append(d)
            dims.append(coarsen_dimensions[d])
        else:
            shape.append(variable.shape[i])
            dims.append(d)

    return Variable(dims, variable.data.reshape(shape), self._attrs)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth making an actual xarray.Variable object here rather than just returning data? With the need to come up with new dimension names, I think it might be cleaner avoiding that.


@property
def real(self):
    """New object of the same type holding the real part of the data.

    ``self.dims`` and ``self._attrs`` are passed to the constructor
    unchanged; only ``self.data`` is replaced by ``self.data.real``.
    """
    cls = type(self)
    dims = self.dims
    real_part = self.data.real
    return cls(dims, real_part, self._attrs)
Expand Down
Loading