WIP: progress toward making groupby work with multiple arguments #924

Closed · wants to merge 2 commits

6 changes: 4 additions & 2 deletions xarray/core/common.py
@@ -341,14 +341,16 @@ def groupby(self, group, squeeze=True):
"""
if isinstance(group, basestring):
group = self[group]
elif isinstance(group, (list, tuple)):
group = [self[g] if isinstance(g, basestring) else g for g in group]

Contributor: Ok, I think I see... here is where the list of group names is converted to a list of DataArrays.
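
A minimal sketch of what this new branch does, using a hypothetical toy Dataset (`str` stands in for the Python 2 `basestring` spelling used in the diff):

```python
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"temp": (("x", "y"), np.arange(6.0).reshape(2, 3))},
    coords={"lat": ("x", [10.0, 20.0]), "lon": ("y", [1.0, 2.0, 3.0])},
)

# Mirrors the new elif: names are looked up on the object, while
# DataArray/Variable items pass through unchanged.
group = ["lat", ds["lon"]]
group = [ds[g] if isinstance(g, str) else g for g in group]
# -> [<DataArray 'lat' (x: 2)>, <DataArray 'lon' (y: 3)>]
```
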

         return self.groupby_cls(self, group, squeeze=squeeze)

     def groupby_bins(self, group, bins, right=True, labels=None, precision=3,
                      include_lowest=False, squeeze=True):
         """Returns a GroupBy object for performing grouped operations.

-        Rather than using all unique values of `group`, the values are discretized
-        first by applying `pandas.cut` [1]_ to `group`.
+        Rather than using all unique values of `group`, the values are
+        discretized first by applying `pandas.cut` [1]_ to `group`.

         Parameters
         ----------
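As background for the `groupby_bins` docstring above, a small illustration (hypothetical data) of how `pandas.cut` discretizes values:

```python
import numpy as np
import pandas as pd

# pandas.cut assigns each value to an interval; groupby_bins then groups
# by these interval labels instead of by exact unique values.
values = np.array([1.0, 4.0, 9.0, 15.0])
binned = pd.cut(values, bins=[0, 5, 10, 20], right=True)
# -> (0, 5], (0, 5], (5, 10], (10, 20]
```
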
6 changes: 4 additions & 2 deletions xarray/core/dataarray.py
@@ -286,9 +286,11 @@ def _to_dataset_whole(self, name=None, shallow_copy=True):
         if name is None:
             raise ValueError('unable to convert unnamed DataArray to a '
                              'Dataset without providing an explicit name')
-        if name in self.coords:
+        if (name in self.coords and
+                not self.variable.identical(self._coords[name])):
             raise ValueError('cannot create a Dataset from a DataArray with '
-                             'the same name as one of its coordinates')
+                             'the same name as one of its coordinates '
+                             'unless they are identical')
         # use private APIs here for speed: this is called by _to_temp_dataset(),
         # which is used in the guts of a lot of operations (e.g., reindex)
         variables = self._coords.copy()
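A minimal sketch of the case the relaxed check above is meant to permit (hypothetical data): a dimension coordinate extracted as a DataArray has a variable identical to its like-named coordinate, so converting it no longer raises:

```python
import xarray as xr

da = xr.DataArray([1, 2], dims="x", coords={"x": [10, 20]}, name="data")
x = da["x"]          # a DataArray named 'x' whose only coordinate is 'x' itself
ds = x.to_dataset()  # name clashes with a coordinate, but the variable is
                     # identical to it, so under this patch it succeeds
```
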
149 changes: 109 additions & 40 deletions xarray/core/groupby.py
@@ -4,11 +4,13 @@

 from . import nputils
 from . import ops
+from .alignment import broadcast
 from .combine import concat
 from .common import (
     ImplementsArrayReduce, ImplementsDatasetReduce, _maybe_promote,
 )
-from .pycompat import zip
+from .merge import merge
+from .pycompat import zip, OrderedDict
 from .utils import peek_at, maybe_wrap_array, safe_cast_to_index
 from .variable import as_variable, Variable, Coordinate

@@ -19,22 +21,28 @@ def unique_value_groups(ar):
     Parameters
     ----------
     ar : array-like
-        Input array. This will be flattened if it is not already 1-D.
+        One dimensional array-like.

     Returns
     -------
-    values : np.ndarray
-        Sorted, unique values as returned by `np.unique`.
+    values : pd.Index
+        Sorted, unique values as returned by `pd.factorize`.
     indices : list of lists of int
         Each element provides the integer indices in `ar` with values given by
         the corresponding value in `unique_values`.
     """
-    inverse, values = pd.factorize(ar, sort=True)
+    index = safe_cast_to_index(ar)
+    inverse, values = pd.factorize(index, sort=True)
     groups = [[] for _ in range(len(values))]
     for n, g in enumerate(inverse):
         if g >= 0:
             # pandas uses -1 to mark NaN, but doesn't include them in values
             groups[g].append(n)
+
+    if isinstance(values, pd.MultiIndex):
+        # restore level names
+        values = values.set_names(index.names)
+
     return values, groups
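
A quick illustration (hypothetical input) of the NaN handling the loop above relies on: `pd.factorize` marks missing values with -1 and leaves them out of the uniques:

```python
import numpy as np
import pandas as pd

codes, uniques = pd.factorize(pd.Index(["b", "a", np.nan, "b"]), sort=True)
# codes   -> array([ 1,  0, -1,  1])   (-1 marks the missing value)
# uniques -> Index(['a', 'b'], dtype='object')
```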


@@ -114,6 +122,11 @@ def _inverse_permutation_indices(positions):
     return indices


+def _is_monotonic_unique(group):
+    index = safe_cast_to_index(group)
+    return index.is_monotonic and index.is_unique
+
+
 class GroupBy(object):
     """A object that implements the split-apply-combine pattern.

@@ -131,7 +144,7 @@ class GroupBy(object):
     DataArray.groupby
     """
     def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
-                 cut_kwargs={}):
+                 cut_kwargs={}):

Contributor: 😳 these PEP8 violations are from my PR. Sorry! I have since started linting...


"""Create a GroupBy object

Parameters
@@ -152,44 +165,104 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
         cut_kwargs : dict, optional
             Extra keyword arguments to pass to `pandas.cut`
         """
-        from .dataset import as_dataset
+        from .dataset import Dataset
         from .dataarray import DataArray

-        if getattr(group, 'name', None) is None:
-            raise ValueError('`group` must have a name')
-        self._stacked_dim = None
-        if group.ndim != 1:
+        def check_valid_group(group_obj):
+            if not isinstance(group_obj, (DataArray, Variable)):
+                raise TypeError('`group` must be a DataArray, Variable or list '
+                                'of DataArrays and/or Variables')
+            if getattr(group_obj, 'name', None) is None:
+                raise ValueError('each item in `group` must have a name')
+
+        if grouper is not None and bins is not None:
+            raise TypeError("Can't specify both `grouper` and `bins`.")
+
+        if isinstance(group, (list, tuple)):
+            if not group:
+                raise ValueError('must supply at least one item to groupby')
+            for g in group:
+                check_valid_group(g)
+            group_names = [g.name for g in group]
+            # we merge multiple groupby variables into Dataset, so they can be
+            # stacked if they use multiple dimensions
+            group = merge(group)

Contributor: I'm trying to understand what happens here if the group is a list of dimension names (e.g. group=['x', 'y']). What will merge return?

Contributor: Never mind... I get it now. By this point, all the items in the list should already be DataArrays or Variables. Still not sure I can visualize what merge is doing, though.
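
A minimal sketch answering the question above (hypothetical group variables; `xr.merge` is assumed here as the public counterpart of the internal `merge` used in the diff):

```python
import xarray as xr

# Two 1-D group variables on different dimensions.
lat = xr.DataArray([10.0, 20.0], dims="x", name="lat")
lon = xr.DataArray([1.0, 2.0, 3.0], dims="y", name="lon")

ds = xr.merge([lat, lon])
# <xarray.Dataset> with dimensions (x: 2, y: 3) and data variables
# 'lat' and 'lon': every group variable now lives in one object, so a
# single .stack() can later put them on a common dimension.
```
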

+        else:
+            check_valid_group(group)
+            group_names = []
+
+        orig_dims = []
+        stacked_dim_name = None
+        if len(group.dims) > 1:
             # try to stack the dims of the group into a single dim
             # TODO: figure out how to exclude dimensions from the stacking
             # (e.g. group over space dims but leave time dim intact)
-            orig_dims = group.dims
+            orig_dims = tuple(group.dims)
             stacked_dim_name = 'stacked_' + '_'.join(orig_dims)
             # the copy is necessary here, otherwise read only array raises error
             # in pandas: https://github.com/pydata/pandas/issues/12813
             group = group.stack(**{stacked_dim_name: orig_dims}).copy()
             obj = obj.stack(**{stacked_dim_name: orig_dims})
-            self._stacked_dim = stacked_dim_name
-            self._unstacked_dims = orig_dims
-        if not hasattr(group, 'dims'):
-            raise ValueError("`group` must have a 'dims' attribute")
-        group_dim, = group.dims
-
-        try:
-            expected_size = obj.dims[group_dim]
-        except TypeError:
-            expected_size = obj.shape[obj.get_axis_num(group_dim)]
+        grouped_dim_name = None
+
+        if isinstance(group, Dataset):
+            # list or tuple input is now a 1-dimensional Dataset
+
+            unstacked_group_names = [g for g in group_names
+                                     if g not in orig_dims]
+            stacked_group_names = [g for g in group_names if g in orig_dims]
+
+            levels = []
+            labels = []
+            names = []
+            if unstacked_group_names:
+                if len(unstacked_group_names) == 1:
+                    # MultiIndex.from_arrays returns a normal Index when passed
+                    # a single argument, so we use factorize instead.
+                    unstacked_name, = unstacked_group_names
+                    label, level = pd.factorize(
+                        group[unstacked_name].to_index())
+                    levels.append(level)
+                    labels.append(label)
+                    names.append(unstacked_name)
+                else:
+                    index = pd.MultiIndex.from_arrays(
+                        [group[name].to_index()
+                         for name in unstacked_group_names],
+                        names=unstacked_group_names)
+                    levels.extend(index.levels)
+                    labels.extend(index.labels)
+                    names.extend(index.names)
+
+            if stacked_group_names:
+                index = group.coords[stacked_dim_name].to_index()
+                for level, label, name in zip(
+                        index.levels, index.labels, index.names):
+                    if name in stacked_group_names:
+                        levels.append(level)
+                        labels.append(label)
+                        names.append(name)
+
+            group_index = pd.MultiIndex(levels, labels, names=names)
+            grouped_dim_name = 'grouped_' + '_'.join(group_names)
+            group = DataArray(group_index, group.coords,
+                              dims=list(group.dims), name=grouped_dim_name)
+
+        group_dim, = group.dims
+        expected_size = obj.coords[group_dim].size
         if group.size != expected_size:
             raise ValueError('the group variable\'s length does not '
                              'match the length of this variable along its '
                              'dimension')
         full_index = None

-        if grouper is not None and bins is not None:
-            raise TypeError("Can't specify both `grouper` and `bins`.")
         if bins is not None:
             binned = pd.cut(group.values, bins, **cut_kwargs)
             new_dim_name = group.name + '_bins'
             group = DataArray(binned, group.coords, name=new_dim_name)

         if grouper is not None:
             index = safe_cast_to_index(group)
             if not index.is_monotonic:
@@ -205,13 +278,10 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
             group_indices = ([slice(i, j) for i, j in zip(sbins[:-1], sbins[1:])] +
                              [slice(sbins[-1], None)])
             unique_coord = Coordinate(group.name, first_items.index)
-        elif group.name in obj.dims and bins is None:
-            # assume that group already has sorted, unique values
-            # (if using bins, the group will have the same name as a dimension
-            # but different values)
-            if group.dims != (group.name,):
-                raise ValueError('`group` is required to be a coordinate if '
-                                 '`group.name` is a dimension in `obj`')
+        elif group.name in obj.dims and _is_monotonic_unique(group):
+            # TODO(shoyer): Figure out how to handle cases where group is a
+            # dimension coordinate, but not monotonic unique. How should we
+            # handle squeeze?
             group_indices = np.arange(group.size)
             if not squeeze:
                 # group_indices = group_indices.reshape(-1, 1)
@@ -230,6 +300,8 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
         self.unique_coord = unique_coord
         self._groups = None
         self._full_index = full_index
+        self._stacked_dim = stacked_dim_name
+        self._grouped_dim = grouped_dim_name
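
To make the `levels`/`labels` bookkeeping in the constructor above concrete, a small sketch with hypothetical group values:

```python
import pandas as pd

# One MultiIndex level per group variable; each group is one (lat, season) pair.
index = pd.MultiIndex.from_arrays(
    [[10.0, 10.0, 20.0, 20.0], ["DJF", "JJA", "DJF", "JJA"]],
    names=["lat", "season"],
)
# index.levels -> [[10.0, 20.0], ['DJF', 'JJA']]
# index.labels -> [[0, 0, 1, 1], [0, 1, 0, 1]]  (renamed `codes` in later pandas)
```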

@property
def groups(self):
@@ -296,16 +368,20 @@ def _maybe_restore_empty_groups(self, combined):
"""Our index contained empty groups (e.g., from a resampling). If we
reduced on that dimension, we want to restore the full index.
"""
-        if (self._full_index is not None and self.group.name in combined.dims):
+        if self._full_index is not None and self.group.name in combined.dims:
indexers = {self.group.name: self._full_index}
combined = combined.reindex(**indexers)
return combined
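
For context on the reindex above, a small sketch (hypothetical resample-like index) of how empty groups come back:

```python
import pandas as pd
import xarray as xr

full_index = pd.date_range("2000-01-01", periods=4, freq="D")
# Suppose two of the four days had no data, so they vanished during groupby.
combined = xr.DataArray([1.0, 3.0], coords=[("time", full_index[[0, 2]])])
restored = combined.reindex(time=full_index)  # missing days return as NaN
```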

def _maybe_unstack_array(self, arr):
"""This gets called if we are applying on an array with a
multidimensional group."""
-        if self._stacked_dim is not None and self._stacked_dim in arr.dims:
-            arr = arr.unstack(self._stacked_dim)
+        if self._stacked_dim is not None:
+            if self._stacked_dim in arr.dims:
+                arr = arr.unstack(self._stacked_dim)
+            elif (self._grouped_dim is not None
+                  and self._grouped_dim in arr.dims):
+                arr = arr.unstack(self._grouped_dim)
return arr
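
And for intuition about the unstack step above, a tiny stack/unstack round trip (hypothetical 2-D array):

```python
import numpy as np
import xarray as xr

arr = xr.DataArray(np.arange(6).reshape(2, 3), dims=("x", "y"),
                   coords={"x": [0, 1], "y": ["a", "b", "c"]})
stacked = arr.stack(stacked_x_y=("x", "y"))  # 1-D, indexed by a MultiIndex
restored = stacked.unstack("stacked_x_y")    # back to dims ('x', 'y')
```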

def fillna(self, value):
@@ -426,12 +502,6 @@ def lookup_order(dimension):
new_order = sorted(stacked.dims, key=lookup_order)
return stacked.transpose(*new_order)

-    def _restore_multiindex(self, combined):
-        if self._stacked_dim is not None and self._stacked_dim in combined.dims:
-            stacked_dim = self.group[self._stacked_dim]
-            combined[self._stacked_dim] = stacked_dim
-        return combined
-
def apply(self, func, shortcut=False, **kwargs):
"""Apply a function over each array in the group and concatenate them
together into a new array.
@@ -490,7 +560,6 @@ def _concat(self, applied, shortcut=False):
         combined = _maybe_reorder(combined, concat_dim, positions)
         if isinstance(combined, type(self.obj)):
             combined = self._restore_dim_order(combined)
-        combined = self._restore_multiindex(combined)
         return combined

def reduce(self, func, dim=None, axis=None, keep_attrs=False,