Skip to content

Commit

Permalink
Fixed Inconsistent GroupBy Output Shape with Duplicate Column Labels (p…
Browse files Browse the repository at this point in the history
  • Loading branch information
WillAyd authored and jacobaustin123 committed Nov 20, 2019
1 parent 517221e commit ded70c8
Show file tree
Hide file tree
Showing 7 changed files with 255 additions and 65 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupBy.agg` not able to use lambda function with named aggregation (:issue:`27519`)
- Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`)
- Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`)
- Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`)

Reshaping
^^^^^^^^^
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/groupby/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@
hold the whitelist of methods that are exposed on the
SeriesGroupBy and the DataFrameGroupBy objects.
"""
import collections

from pandas.core.dtypes.common import is_list_like, is_scalar

OutputKey = collections.namedtuple("OutputKey", ["label", "position"])


class GroupByMixin:
"""
Expand Down
175 changes: 137 additions & 38 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,17 @@
from functools import partial
from textwrap import dedent
import typing
from typing import Any, Callable, FrozenSet, Iterable, Sequence, Type, Union, cast
from typing import (
Any,
Callable,
FrozenSet,
Iterable,
Mapping,
Sequence,
Type,
Union,
cast,
)

import numpy as np

Expand Down Expand Up @@ -309,28 +319,91 @@ def _aggregate_multiple_funcs(self, arg):

return DataFrame(results, columns=columns)

def _wrap_series_output(self, output, index, names=None):
""" common agg/transform wrapping logic """
output = output[self._selection_name]
def _wrap_series_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index,
) -> Union[Series, DataFrame]:
"""
Wraps the output of a SeriesGroupBy operation into the expected result.
Parameters
----------
output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
Data to wrap.
index : pd.Index
Index to apply to the output.
if names is not None:
return DataFrame(output, index=index, columns=names)
Returns
-------
Series or DataFrame
Notes
-----
In the vast majority of cases output and columns will only contain one
element. The exception is operations that expand dimensions, like ohlc.
"""
indexed_output = {key.position: val for key, val in output.items()}
columns = Index(key.label for key in output)

result: Union[Series, DataFrame]
if len(output) > 1:
result = DataFrame(indexed_output, index=index)
result.columns = columns
else:
name = self._selection_name
if name is None:
name = self._selected_obj.name
return Series(output, index=index, name=name)
result = Series(indexed_output[0], index=index, name=columns[0])

return result

def _wrap_aggregated_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
) -> Union[Series, DataFrame]:
"""
Wraps the output of a SeriesGroupBy aggregation into the expected result.
def _wrap_aggregated_output(self, output, names=None):
Parameters
----------
output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
Data to wrap.
Returns
-------
Series or DataFrame
Notes
-----
In the vast majority of cases output will only contain one element.
The exception is operations that expand dimensions, like ohlc.
"""
result = self._wrap_series_output(
output=output, index=self.grouper.result_index, names=names
output=output, index=self.grouper.result_index
)
return self._reindex_output(result)._convert(datetime=True)

def _wrap_transformed_output(self, output, names=None):
return self._wrap_series_output(
output=output, index=self.obj.index, names=names
)
def _wrap_transformed_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
) -> Series:
"""
Wraps the output of a SeriesGroupBy transformation into the expected result.
Parameters
----------
output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
Mapping with a single entry: an OutputKey at position 0 and the result values.
Returns
-------
Series
Notes
-----
output should always contain one element. It is specified as a dict
for consistency with DataFrame methods and _wrap_aggregated_output.
"""
assert len(output) == 1
result = self._wrap_series_output(output=output, index=self.obj.index)

# No transformations increase the ndim of the result
assert isinstance(result, Series)
return result

def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if len(keys) == 0:
Expand Down Expand Up @@ -1084,17 +1157,6 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:

return DataFrame(result, columns=result_columns)

def _decide_output_index(self, output, labels):
if len(output) == len(labels):
output_keys = labels
else:
output_keys = sorted(output)

if isinstance(labels, MultiIndex):
output_keys = MultiIndex.from_tuples(output_keys, names=labels.names)

return output_keys

def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if len(keys) == 0:
return DataFrame(index=keys)
Expand Down Expand Up @@ -1561,27 +1623,62 @@ def _insert_inaxis_grouper_inplace(self, result):
if in_axis:
result.insert(0, name, lev)

def _wrap_aggregated_output(self, output, names=None):
agg_axis = 0 if self.axis == 1 else 1
agg_labels = self._obj_with_exclusions._get_axis(agg_axis)
def _wrap_aggregated_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
) -> DataFrame:
"""
Wraps the output of DataFrameGroupBy aggregations into the expected result.
output_keys = self._decide_output_index(output, agg_labels)
Parameters
----------
output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
Data to wrap.
Returns
-------
DataFrame
"""
indexed_output = {key.position: val for key, val in output.items()}
columns = Index(key.label for key in output)

result = DataFrame(indexed_output)
result.columns = columns

if not self.as_index:
result = DataFrame(output, columns=output_keys)
self._insert_inaxis_grouper_inplace(result)
result = result._consolidate()
else:
index = self.grouper.result_index
result = DataFrame(output, index=index, columns=output_keys)
result.index = index

if self.axis == 1:
result = result.T

return self._reindex_output(result)._convert(datetime=True)

def _wrap_transformed_output(self, output, names=None) -> DataFrame:
return DataFrame(output, index=self.obj.index)
def _wrap_transformed_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
) -> DataFrame:
"""
Wraps the output of DataFrameGroupBy transformations into the expected result.
Parameters
----------
output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
Data to wrap.
Returns
-------
DataFrame
"""
indexed_output = {key.position: val for key, val in output.items()}
columns = Index(key.label for key in output)

result = DataFrame(indexed_output)
result.columns = columns
result.index = self.obj.index

return result

def _wrap_agged_blocks(self, items, blocks):
if not self.as_index:
Expand Down Expand Up @@ -1701,9 +1798,11 @@ def groupby_series(obj, col=None):
if isinstance(obj, Series):
results = groupby_series(obj)
else:
# TODO: this is duplicative of how GroupBy naturally works
# Try to consolidate with normal wrapping functions
from pandas.core.reshape.concat import concat

results = [groupby_series(obj[col], col) for col in obj.columns]
results = [groupby_series(content, label) for label, content in obj.items()]
results = concat(results, axis=1)
results.columns.names = obj.columns.names

Expand Down Expand Up @@ -1745,7 +1844,7 @@ def _normalize_keyword_aggregation(kwargs):
"""
Normalize user-provided "named aggregation" kwargs.
Transforms from the new ``Dict[str, NamedAgg]`` style kwargs
Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs
to the old ``OrderedDict[str, List[scalar]]``.
Parameters
Expand All @@ -1766,7 +1865,7 @@ def _normalize_keyword_aggregation(kwargs):
>>> _normalize_keyword_aggregation({'output': ('input', 'sum')})
(OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')])
"""
# Normalize the aggregation functions as Dict[column, List[func]],
# Normalize the aggregation functions as Mapping[column, List[func]],
# process normally, then fixup the names.
# TODO(Py35): When we drop python 3.5, change this to
# defaultdict(list)
Expand Down
Loading

0 comments on commit ded70c8

Please sign in to comment.