WIP: progress toward making groupby work with multiple arguments #924
```diff
@@ -4,11 +4,13 @@
 from . import nputils
 from . import ops
 from .alignment import broadcast
 from .combine import concat
 from .common import (
     ImplementsArrayReduce, ImplementsDatasetReduce, _maybe_promote,
 )
-from .pycompat import zip
+from .merge import merge
+from .pycompat import zip, OrderedDict
 from .utils import peek_at, maybe_wrap_array, safe_cast_to_index
 from .variable import as_variable, Variable, Coordinate
```
```diff
@@ -19,22 +21,28 @@ def unique_value_groups(ar):
     Parameters
     ----------
     ar : array-like
-        Input array. This will be flattened if it is not already 1-D.
+        One dimensional array-like.
 
     Returns
     -------
-    values : np.ndarray
-        Sorted, unique values as returned by `np.unique`.
+    values : pd.Index
+        Sorted, unique values as returned by `pd.factorize`.
     indices : list of lists of int
         Each element provides the integer indices in `ar` with values given by
         the corresponding value in `unique_values`.
     """
-    inverse, values = pd.factorize(ar, sort=True)
+    index = safe_cast_to_index(ar)
+    inverse, values = pd.factorize(index, sort=True)
     groups = [[] for _ in range(len(values))]
     for n, g in enumerate(inverse):
         if g >= 0:
             # pandas uses -1 to mark NaN, but doesn't include them in values
             groups[g].append(n)
+
+    if isinstance(values, pd.MultiIndex):
+        # restore level names
+        values = values.set_names(index.names)
+
     return values, groups
```
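For reference, a minimal sketch (not part of the diff) of the `pd.factorize` behavior the loop above relies on: missing values are coded as -1 and left out of the unique values.

```python
import numpy as np
import pandas as pd

# NaN is encoded as -1 in the codes and excluded from the uniques,
# which is why the loop above skips g < 0
codes, uniques = pd.factorize(np.array([3.0, 1.0, np.nan, 3.0]), sort=True)
print(codes)    # [ 1  0 -1  1]
print(uniques)  # [1. 3.]
```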
```diff
@@ -114,6 +122,11 @@ def _inverse_permutation_indices(positions):
     return indices
 
 
+def _is_monotonic_unique(group):
+    index = safe_cast_to_index(group)
+    return index.is_monotonic and index.is_unique
+
+
 class GroupBy(object):
     """A object that implements the split-apply-combine pattern.
```
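A quick illustration (assumed, not part of the PR) of the two pandas `Index` properties this helper combines; note that `is_monotonic` was an alias for `is_monotonic_increasing` in the pandas of this era and has since been removed.

```python
import pandas as pd

# non-decreasing, so monotonic; contains a repeat, so not unique
idx = pd.Index([1, 2, 2, 3])
print(idx.is_monotonic_increasing)  # True
print(idx.is_unique)                # False
```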
```diff
@@ -131,7 +144,7 @@ class GroupBy(object):
     DataArray.groupby
     """
     def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
-                cut_kwargs={}):
+                 cut_kwargs={}):
         """Create a GroupBy object
 
         Parameters
```

Review comment on the `cut_kwargs={}` continuation line: 😳 these PEP8 violations are from my PR. Sorry! I have since started linting...
@@ -152,44 +165,104 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None, | |
cut_kwargs : dict, optional | ||
Extra keyword arguments to pass to `pandas.cut` | ||
""" | ||
from .dataset import as_dataset | ||
from .dataset import Dataset | ||
from .dataarray import DataArray | ||
|
||
if getattr(group, 'name', None) is None: | ||
raise ValueError('`group` must have a name') | ||
self._stacked_dim = None | ||
if group.ndim != 1: | ||
def check_valid_group(group_obj): | ||
if not isinstance(group_obj, (DataArray, Variable)): | ||
raise TypeError('`group` must be a DataArray, Variable or list ' | ||
'of DataArrays and/or Variables') | ||
if getattr(group_obj, 'name', None) is None: | ||
raise ValueError('each item in `group` must have a name') | ||
|
||
if grouper is not None and bins is not None: | ||
raise TypeError("Can't specify both `grouper` and `bins`.") | ||
|
||
if isinstance(group, (list, tuple)): | ||
if not group: | ||
raise ValueError('must supply at least one item to groupby') | ||
for g in group: | ||
check_valid_group(g) | ||
group_names = [g.name for g in group] | ||
# we merge multiple groupby variables into Dataset, so they can be | ||
# stacked if they use multiple dimensions | ||
group = merge(group) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm trying to understand what happens here if the group is a list of dimension names (e.g. group=['x', 'y']). What will merge return? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Never mind...I get it now. By this point, all the items in the list should already be DataArrays or Variables. Still not sure I can visualize what merge is doing though. |
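To the reviewer's question, a minimal sketch (assumed names and data, using the public `xr.merge`, which wraps the same machinery) of what merging two groupby variables produces: a Dataset holding both as data variables, aligned on their shared dimensions.

```python
import xarray as xr

a = xr.DataArray([0, 0, 1, 1], dims='x', name='a')
b = xr.DataArray([0, 1, 0, 1], dims='x', name='b')

# merge aligns both named arrays on 'x' and returns a Dataset with
# data variables 'a' and 'b'
merged = xr.merge([a, b])
print(merged.data_vars)
```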
```diff
+        else:
+            check_valid_group(group)
+            group_names = []
+
```
```diff
+        orig_dims = []
+        stacked_dim_name = None
+        if len(group.dims) > 1:
             # try to stack the dims of the group into a single dim
             # TODO: figure out how to exclude dimensions from the stacking
             # (e.g. group over space dims but leave time dim intact)
-            orig_dims = group.dims
+            orig_dims = tuple(group.dims)
             stacked_dim_name = 'stacked_' + '_'.join(orig_dims)
 
             # the copy is necessary here, otherwise read only array raises error
             # in pandas: https://github.com/pydata/pandas/issues/12813
             group = group.stack(**{stacked_dim_name: orig_dims}).copy()
             obj = obj.stack(**{stacked_dim_name: orig_dims})
-            self._stacked_dim = stacked_dim_name
-            self._unstacked_dims = orig_dims
-        if not hasattr(group, 'dims'):
-            raise ValueError("`group` must have a 'dims' attribute")
-        group_dim, = group.dims
-
-        try:
-            expected_size = obj.dims[group_dim]
-        except TypeError:
-            expected_size = obj.shape[obj.get_axis_num(group_dim)]
+
+        grouped_dim_name = None
```
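For intuition, a small sketch (assumed data) of the stacking step: two dims collapse into a single `stacked_*` dim backed by a pandas MultiIndex.

```python
import numpy as np
import xarray as xr

arr = xr.DataArray(np.arange(6).reshape(2, 3), dims=('y', 'x'), name='grp')
stacked = arr.stack(stacked_y_x=('y', 'x'))
print(stacked.dims)                    # ('stacked_y_x',)
print(stacked.indexes['stacked_y_x'])  # MultiIndex with levels 'y' and 'x'
```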
```diff
+        if isinstance(group, Dataset):
+            # list or tuple input is now a 1-dimensional Dataset
+
+            unstacked_group_names = [g for g in group_names
+                                     if g not in orig_dims]
+            stacked_group_names = [g for g in group_names if g in orig_dims]
+
+            levels = []
+            labels = []
+            names = []
+            if unstacked_group_names:
+                if len(unstacked_group_names) == 1:
+                    # MultiIndex.from_arrays returns a normal Index when passed
+                    # a single argument, so we use factorize instead.
+                    unstacked_name, = unstacked_group_names
+                    label, level = pd.factorize(
+                        group[unstacked_name].to_index())
+                    levels.append(level)
+                    labels.append(label)
+                    names.append(unstacked_name)
+                else:
+                    index = pd.MultiIndex.from_arrays(
+                        [group[name].to_index()
+                         for name in unstacked_group_names],
+                        names=unstacked_group_names)
+                    levels.extend(index.levels)
+                    labels.extend(index.labels)
+                    names.extend(index.names)
+
+            if stacked_group_names:
+                index = group.coords[stacked_dim_name].to_index()
+                for level, label, name in zip(
+                        index.levels, index.labels, index.names):
+                    if name in stacked_group_names:
+                        levels.append(level)
+                        labels.append(label)
+                        names.append(name)
+
+            group_index = pd.MultiIndex(levels, labels, names=names)
+            grouped_dim_name = 'grouped_' + '_'.join(group_names)
+            group = DataArray(group_index, group.coords,
+                              dims=list(group.dims), name=grouped_dim_name)
```
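A sketch of the index assembly above, with assumed data. Note the diff uses the old positional `pd.MultiIndex(levels, labels, ...)` constructor; the `labels` argument was renamed to `codes` in pandas 0.24.

```python
import pandas as pd

# factorize each grouping variable into (integer codes, unique levels)
codes_a, levels_a = pd.factorize(pd.Index(['x', 'y', 'x', 'y']))
codes_b, levels_b = pd.factorize(pd.Index([10, 10, 20, 20]))

# combine them into a single MultiIndex, as the diff does
idx = pd.MultiIndex(levels=[levels_a, levels_b],
                    codes=[codes_a, codes_b],
                    names=['a', 'b'])
print(list(idx))  # [('x', 10), ('y', 10), ('x', 20), ('y', 20)]
```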
```diff
+        group_dim, = group.dims
+        expected_size = obj.coords[group_dim].size
         if group.size != expected_size:
             raise ValueError('the group variable\'s length does not '
                              'match the length of this variable along its '
                              'dimension')
         full_index = None
 
-        if grouper is not None and bins is not None:
-            raise TypeError("Can't specify both `grouper` and `bins`.")
         if bins is not None:
             binned = pd.cut(group.values, bins, **cut_kwargs)
             new_dim_name = group.name + '_bins'
             group = DataArray(binned, group.coords, name=new_dim_name)
 
         if grouper is not None:
             index = safe_cast_to_index(group)
             if not index.is_monotonic:
```
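The bins branch is unchanged context, but for reference, a sketch (assumed data) of how `pd.cut` labels values with right-closed interval categories.

```python
import pandas as pd

# each value is assigned to a right-closed interval bin
binned = pd.cut([1, 7, 5, 4], bins=[0, 5, 10])
print(list(binned))  # [(0, 5], (5, 10], (0, 5], (0, 5]]
```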
```diff
@@ -205,13 +278,10 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
             group_indices = ([slice(i, j) for i, j in zip(sbins[:-1], sbins[1:])] +
                              [slice(sbins[-1], None)])
             unique_coord = Coordinate(group.name, first_items.index)
-        elif group.name in obj.dims and bins is None:
-            # assume that group already has sorted, unique values
-            # (if using bins, the group will have the same name as a dimension
-            # but different values)
-            if group.dims != (group.name,):
-                raise ValueError('`group` is required to be a coordinate if '
-                                 '`group.name` is a dimension in `obj`')
+        elif group.name in obj.dims and _is_monotonic_unique(group):
+            # TODO(shoyer): Figure out how to handle cases where group is a
+            # dimension coordinate, but not monotonic unique. How should we
+            # handle squeeze?
             group_indices = np.arange(group.size)
             if not squeeze:
                 # group_indices = group_indices.reshape(-1, 1)
```
```diff
@@ -230,6 +300,8 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
         self.unique_coord = unique_coord
         self._groups = None
         self._full_index = full_index
+        self._stacked_dim = stacked_dim_name
+        self._grouped_dim = grouped_dim_name
 
     @property
     def groups(self):
```
```diff
@@ -296,16 +368,20 @@ def _maybe_restore_empty_groups(self, combined):
         """Our index contained empty groups (e.g., from a resampling). If we
         reduced on that dimension, we want to restore the full index.
         """
-        if (self._full_index is not None and self.group.name in combined.dims):
+        if self._full_index is not None and self.group.name in combined.dims:
             indexers = {self.group.name: self._full_index}
             combined = combined.reindex(**indexers)
         return combined
 
     def _maybe_unstack_array(self, arr):
         """This gets called if we are applying on an array with a
         multidimensional group."""
-        if self._stacked_dim is not None and self._stacked_dim in arr.dims:
-            arr = arr.unstack(self._stacked_dim)
+        if self._stacked_dim is not None:
+            if self._stacked_dim in arr.dims:
+                arr = arr.unstack(self._stacked_dim)
+        elif (self._grouped_dim is not None
+              and self._grouped_dim in arr.dims):
+            arr = arr.unstack(self._grouped_dim)
         return arr
 
     def fillna(self, value):
```
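A round-trip sketch (assumed data): `unstack` reverses the stacking applied in `__init__`, which is how the original dimensions reappear after apply/combine.

```python
import numpy as np
import xarray as xr

arr = xr.DataArray(np.arange(6).reshape(2, 3), dims=('y', 'x'))
roundtrip = arr.stack(stacked_y_x=('y', 'x')).unstack('stacked_y_x')
print(roundtrip.dims)  # ('y', 'x')
```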
```diff
@@ -426,12 +502,6 @@ def lookup_order(dimension):
             new_order = sorted(stacked.dims, key=lookup_order)
             return stacked.transpose(*new_order)
 
-    def _restore_multiindex(self, combined):
-        if self._stacked_dim is not None and self._stacked_dim in combined.dims:
-            stacked_dim = self.group[self._stacked_dim]
-            combined[self._stacked_dim] = stacked_dim
-        return combined
-
     def apply(self, func, shortcut=False, **kwargs):
         """Apply a function over each array in the group and concatenate them
         together into a new array.
```
```diff
@@ -490,7 +560,6 @@ def _concat(self, applied, shortcut=False):
         combined = _maybe_reorder(combined, concat_dim, positions)
         if isinstance(combined, type(self.obj)):
             combined = self._restore_dim_order(combined)
-        combined = self._restore_multiindex(combined)
         return combined
 
     def reduce(self, func, dim=None, axis=None, keep_attrs=False,
```
Review comment: Ok, I think I see...here is where the list of group names is converted to a list of DataArrays.