WIP: progress toward making groupby work with multiple arguments #924
```diff
@@ -4,11 +4,13 @@
 from . import nputils
 from . import ops
 from .alignment import broadcast
 from .combine import concat
 from .common import (
     ImplementsArrayReduce, ImplementsDatasetReduce, _maybe_promote,
 )
-from .pycompat import zip
+from .merge import merge
+from .pycompat import zip, OrderedDict
 from .utils import peek_at, maybe_wrap_array, safe_cast_to_index
 from .variable import as_variable, Variable, Coordinate
```
```diff
@@ -19,22 +21,28 @@ def unique_value_groups(ar):
     Parameters
     ----------
     ar : array-like
-        Input array. This will be flattened if it is not already 1-D.
+        One dimensional array-like.
 
     Returns
     -------
-    values : np.ndarray
-        Sorted, unique values as returned by `np.unique`.
+    values : pd.Index
+        Sorted, unique values as returned by `pd.factorize`.
     indices : list of lists of int
         Each element provides the integer indices in `ar` with values given by
         the corresponding value in `unique_values`.
     """
-    inverse, values = pd.factorize(ar, sort=True)
+    index = safe_cast_to_index(ar)
+    inverse, values = pd.factorize(index, sort=True)
     groups = [[] for _ in range(len(values))]
     for n, g in enumerate(inverse):
         if g >= 0:
             # pandas uses -1 to mark NaN, but doesn't include them in values
             groups[g].append(n)
+
+    if isinstance(values, pd.MultiIndex):
+        # restore level names
+        values = values.set_names(index.names)
+
     return values, groups
```
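For reference, a minimal sketch (not part of the diff) of the `pd.factorize` behavior the loop above relies on: missing values are coded as -1 and left out of the unique values.

```python
import numpy as np
import pandas as pd

# NaN is encoded as -1 in the codes and excluded from the uniques,
# which is why the loop above skips g < 0
codes, uniques = pd.factorize(np.array([3.0, 1.0, np.nan, 3.0]), sort=True)
print(codes)    # [ 1  0 -1  1]
print(uniques)  # [1. 3.]
```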
```diff
@@ -114,6 +122,11 @@ def _inverse_permutation_indices(positions):
     return indices
 
 
+def _is_monotonic_unique(group):
+    index = safe_cast_to_index(group)
+    return index.is_monotonic and index.is_unique
+
+
 class GroupBy(object):
     """A object that implements the split-apply-combine pattern.
```
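A quick illustration (assumed, not part of the PR) of the two pandas `Index` properties this helper combines; note that `is_monotonic` was an alias for `is_monotonic_increasing` in the pandas of this era and has since been removed.

```python
import pandas as pd

# non-decreasing, so monotonic; contains a repeat, so not unique
idx = pd.Index([1, 2, 2, 3])
print(idx.is_monotonic_increasing)  # True
print(idx.is_unique)                # False
```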
```diff
@@ -131,7 +144,7 @@ class GroupBy(object):
     DataArray.groupby
     """
     def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
-                cut_kwargs={}):
+                 cut_kwargs={}):
         """Create a GroupBy object
 
         Parameters
```

Review comment on the `cut_kwargs={}` continuation line: 😳 these PEP8 violations are from my PR. Sorry! I have since started linting...
@@ -152,44 +165,104 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None, | |
cut_kwargs : dict, optional | ||
Extra keyword arguments to pass to `pandas.cut` | ||
""" | ||
from .dataset import as_dataset | ||
from .dataset import Dataset | ||
from .dataarray import DataArray | ||
|
||
if getattr(group, 'name', None) is None: | ||
raise ValueError('`group` must have a name') | ||
self._stacked_dim = None | ||
if group.ndim != 1: | ||
def check_valid_group(group_obj): | ||
if not isinstance(group_obj, (DataArray, Variable)): | ||
raise TypeError('`group` must be a DataArray, Variable or list ' | ||
'of DataArrays and/or Variables') | ||
if getattr(group_obj, 'name', None) is None: | ||
raise ValueError('each item in `group` must have a name') | ||
|
||
if grouper is not None and bins is not None: | ||
raise TypeError("Can't specify both `grouper` and `bins`.") | ||
|
||
if isinstance(group, (list, tuple)): | ||
if not group: | ||
raise ValueError('must supply at least one item to groupby') | ||
for g in group: | ||
check_valid_group(g) | ||
group_names = [g.name for g in group] | ||
# we merge multiple groupby variables into Dataset, so they can be | ||
# stacked if they use multiple dimensions | ||
group = merge(group) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm trying to understand what happens here if the group is a list of dimension names (e.g. group=['x', 'y']). What will merge return? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Never mind...I get it now. By this point, all the items in the list should already be DataArrays or Variables. Still not sure I can visualize what merge is doing though. |
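To the reviewer's question, a minimal sketch (assumed names and data, using the public `xr.merge`, which wraps the same machinery) of what merging two groupby variables produces: a Dataset holding both as data variables, aligned on their shared dimensions.

```python
import xarray as xr

a = xr.DataArray([0, 0, 1, 1], dims='x', name='a')
b = xr.DataArray([0, 1, 0, 1], dims='x', name='b')

# merge aligns both named arrays on 'x' and returns a Dataset with
# data variables 'a' and 'b'
merged = xr.merge([a, b])
print(merged.data_vars)
```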
```diff
+        else:
+            check_valid_group(group)
+            group_names = []
+
```
```diff
+        orig_dims = []
+        stacked_dim_name = None
+        if len(group.dims) > 1:
             # try to stack the dims of the group into a single dim
             # TODO: figure out how to exclude dimensions from the stacking
             # (e.g. group over space dims but leave time dim intact)
-            orig_dims = group.dims
+            orig_dims = tuple(group.dims)
             stacked_dim_name = 'stacked_' + '_'.join(orig_dims)
 
             # the copy is necessary here, otherwise read only array raises error
             # in pandas: https://github.com/pydata/pandas/issues/12813
             group = group.stack(**{stacked_dim_name: orig_dims}).copy()
             obj = obj.stack(**{stacked_dim_name: orig_dims})
-            self._stacked_dim = stacked_dim_name
-            self._unstacked_dims = orig_dims
-        if not hasattr(group, 'dims'):
-            raise ValueError("`group` must have a 'dims' attribute")
-        group_dim, = group.dims
-
-        try:
-            expected_size = obj.dims[group_dim]
-        except TypeError:
-            expected_size = obj.shape[obj.get_axis_num(group_dim)]
+
+        grouped_dim_name = None
```
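For intuition, a small sketch (assumed data) of the stacking step: two dims collapse into a single `stacked_*` dim backed by a pandas MultiIndex.

```python
import numpy as np
import xarray as xr

arr = xr.DataArray(np.arange(6).reshape(2, 3), dims=('y', 'x'), name='grp')
stacked = arr.stack(stacked_y_x=('y', 'x'))
print(stacked.dims)                    # ('stacked_y_x',)
print(stacked.indexes['stacked_y_x'])  # MultiIndex with levels 'y' and 'x'
```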
```diff
+        if isinstance(group, Dataset):
+            # list or tuple input is now a 1-dimensional Dataset
+
+            unstacked_group_names = [g for g in group_names
+                                     if g not in orig_dims]
+            stacked_group_names = [g for g in group_names if g in orig_dims]
+
+            levels = []
+            labels = []
+            names = []
+            if unstacked_group_names:
+                if len(unstacked_group_names) == 1:
+                    # MultiIndex.from_arrays returns a normal Index when passed
+                    # a single argument, so we use factorize instead.
+                    unstacked_name, = unstacked_group_names
+                    label, level = pd.factorize(
+                        group[unstacked_name].to_index())
+                    levels.append(level)
+                    labels.append(label)
+                    names.append(unstacked_name)
+                else:
+                    index = pd.MultiIndex.from_arrays(
+                        [group[name].to_index()
+                         for name in unstacked_group_names],
+                        names=unstacked_group_names)
+                    levels.extend(index.levels)
+                    labels.extend(index.labels)
+                    names.extend(index.names)
+
+            if stacked_group_names:
+                index = group.coords[stacked_dim_name].to_index()
+                for level, label, name in zip(
+                        index.levels, index.labels, index.names):
+                    if name in stacked_group_names:
+                        levels.append(level)
+                        labels.append(label)
+                        names.append(name)
+
+            group_index = pd.MultiIndex(levels, labels, names=names)
+            grouped_dim_name = 'grouped_' + '_'.join(group_names)
+            group = DataArray(group_index, group.coords,
+                              dims=list(group.dims), name=grouped_dim_name)
```
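A sketch of the index assembly above, with assumed data. Note the diff uses the old positional `pd.MultiIndex(levels, labels, ...)` constructor; the `labels` argument was renamed to `codes` in pandas 0.24.

```python
import pandas as pd

# factorize each grouping variable into (integer codes, unique levels)
codes_a, levels_a = pd.factorize(pd.Index(['x', 'y', 'x', 'y']))
codes_b, levels_b = pd.factorize(pd.Index([10, 10, 20, 20]))

# combine them into a single MultiIndex, as the diff does
idx = pd.MultiIndex(levels=[levels_a, levels_b],
                    codes=[codes_a, codes_b],
                    names=['a', 'b'])
print(list(idx))  # [('x', 10), ('y', 10), ('x', 20), ('y', 20)]
```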
```diff
+        group_dim, = group.dims
+        expected_size = obj.coords[group_dim].size
         if group.size != expected_size:
             raise ValueError('the group variable\'s length does not '
                              'match the length of this variable along its '
                              'dimension')
         full_index = None
 
-        if grouper is not None and bins is not None:
-            raise TypeError("Can't specify both `grouper` and `bins`.")
         if bins is not None:
             binned = pd.cut(group.values, bins, **cut_kwargs)
             new_dim_name = group.name + '_bins'
             group = DataArray(binned, group.coords, name=new_dim_name)
 
         if grouper is not None:
             index = safe_cast_to_index(group)
             if not index.is_monotonic:
```
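The bins branch is unchanged context, but for reference, a sketch (assumed data) of how `pd.cut` labels values with right-closed interval categories.

```python
import pandas as pd

# each value is assigned to a right-closed interval bin
binned = pd.cut([1, 7, 5, 4], bins=[0, 5, 10])
print(list(binned))  # [(0, 5], (5, 10], (0, 5], (0, 5]]
```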
```diff
@@ -205,13 +278,10 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
             group_indices = ([slice(i, j) for i, j in zip(sbins[:-1], sbins[1:])] +
                              [slice(sbins[-1], None)])
             unique_coord = Coordinate(group.name, first_items.index)
-        elif group.name in obj.dims and bins is None:
-            # assume that group already has sorted, unique values
-            # (if using bins, the group will have the same name as a dimension
-            # but different values)
-            if group.dims != (group.name,):
-                raise ValueError('`group` is required to be a coordinate if '
-                                 '`group.name` is a dimension in `obj`')
+        elif group.name in obj.dims and _is_monotonic_unique(group):
+            # TODO(shoyer): Figure out how to handle cases where group is a
+            # dimension coordinate, but not monotonic unique. How should we
+            # handle squeeze?
             group_indices = np.arange(group.size)
             if not squeeze:
                 # group_indices = group_indices.reshape(-1, 1)
```
```diff
@@ -230,6 +300,8 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
         self.unique_coord = unique_coord
         self._groups = None
         self._full_index = full_index
+        self._stacked_dim = stacked_dim_name
+        self._grouped_dim = grouped_dim_name
 
     @property
     def groups(self):
```
```diff
@@ -296,16 +368,20 @@ def _maybe_restore_empty_groups(self, combined):
         """Our index contained empty groups (e.g., from a resampling). If we
         reduced on that dimension, we want to restore the full index.
         """
-        if (self._full_index is not None and self.group.name in combined.dims):
+        if self._full_index is not None and self.group.name in combined.dims:
             indexers = {self.group.name: self._full_index}
             combined = combined.reindex(**indexers)
         return combined
 
     def _maybe_unstack_array(self, arr):
         """This gets called if we are applying on an array with a
         multidimensional group."""
-        if self._stacked_dim is not None and self._stacked_dim in arr.dims:
-            arr = arr.unstack(self._stacked_dim)
+        if self._stacked_dim is not None:
+            if self._stacked_dim in arr.dims:
+                arr = arr.unstack(self._stacked_dim)
+        elif (self._grouped_dim is not None
+              and self._grouped_dim in arr.dims):
+            arr = arr.unstack(self._grouped_dim)
         return arr
 
     def fillna(self, value):
```
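A round-trip sketch (assumed data): `unstack` reverses the stacking applied in `__init__`, which is how the original dimensions reappear after apply/combine.

```python
import numpy as np
import xarray as xr

arr = xr.DataArray(np.arange(6).reshape(2, 3), dims=('y', 'x'))
roundtrip = arr.stack(stacked_y_x=('y', 'x')).unstack('stacked_y_x')
print(roundtrip.dims)  # ('y', 'x')
```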
```diff
@@ -426,12 +502,6 @@ def lookup_order(dimension):
             new_order = sorted(stacked.dims, key=lookup_order)
             return stacked.transpose(*new_order)
 
-    def _restore_multiindex(self, combined):
-        if self._stacked_dim is not None and self._stacked_dim in combined.dims:
-            stacked_dim = self.group[self._stacked_dim]
-            combined[self._stacked_dim] = stacked_dim
-        return combined
-
     def apply(self, func, shortcut=False, **kwargs):
         """Apply a function over each array in the group and concatenate them
         together into a new array.
```
```diff
@@ -490,7 +560,6 @@ def _concat(self, applied, shortcut=False):
         combined = _maybe_reorder(combined, concat_dim, positions)
         if isinstance(combined, type(self.obj)):
             combined = self._restore_dim_order(combined)
-        combined = self._restore_multiindex(combined)
         return combined
 
     def reduce(self, func, dim=None, axis=None, keep_attrs=False,
```
Review comment: Ok, I think I see...here is where the list of group names is converted to a list of DataArrays.