# Patch summary (review commentary; the diff itself starts at the first
# "diff --git" line below).
#
# Purpose, per the release note added here: stop DataFrame.groupby from
# dropping ``nan`` groups with dropna=False when grouping over a single column
# (issues 35646 and 35542).
#
# Per-file changes:
#   doc/source/whatsnew/v1.2.0.rst
#       Adds one release-note bullet under "Groupby/resample/rolling".
#   pandas/_libs/lib.pyx (indices_fast)
#       When k == 1 the dict key is the bare value keys[0][...] instead of a
#       1-tuple; for any other k the tuple is still built with PyTuple_New.
#       The same branch is applied at both places a key is built (the
#       `i - 1` lookup and the final `n - 1` lookup).
#   pandas/core/groupby/ops.py (indices)
#       Drops the `len(self.groupings) == 1` shortcut that returned
#       self.groupings[0].indices; every case now calls get_indexer_dict.
#   pandas/core/sorting.py (get_indexer_dict)
#       Adds type annotations (plus the `Dict` import) and returns {} early
#       when every entry of group_index is -1.
#   pandas/tests/groupby/test_groupby.py
#       New test: grouping on two all-NaN key columns yields empty `.indices`.
#   pandas/tests/groupby/test_groupby_dropna.py
#       Imports `tm` from pandas._testing instead of pandas.testing; new test
#       that `.indices` keeps the NaN key with dropna=False.
#   pandas/tests/window/test_rolling.py
#       New test for groupby(..., dropna=False).rolling(...).mean() with NaN
#       group keys.
#
# NOTE(review): this patch arrived with all line breaks and indentation
#   collapsed. The layout below was reconstructed from the hunk line counts and
#   from Python/Cython syntax -- run `git apply --check` before relying on it.
# NOTE(review): in the whatsnew context line for issue 29485 the two :meth:
#   roles end in "quantile() `" (space before the closing backtick); this looks
#   like a stripped "<target>" part -- confirm against the real file, otherwise
#   that context line will not match.
# NOTE(review): get_indexer_dict is annotated as returning
#   Dict[Union[str, Tuple], np.ndarray], yet test_groupby_nan_included expects
#   an np.nan key -- the key type probably needs widening; TODO confirm.
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 7c850ffedfcab..1cb8710799d30 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -519,6 +519,7 @@ Groupby/resample/rolling
 - Using :meth:`Rolling.var()` instead of :meth:`Rolling.std()` avoids numerical issues for :meth:`Rolling.corr()` when :meth:`Rolling.var()` is still within floating point precision while :meth:`Rolling.std()` is not (:issue:`31286`)
 - Bug in :meth:`df.groupby(..).quantile() ` and :meth:`df.resample(..).quantile() ` raised ``TypeError`` when values were of type ``Timedelta`` (:issue:`29485`)
 - Bug in :meth:`Rolling.median` and :meth:`Rolling.quantile` returned wrong values for :class:`BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`)
+- Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index e493e5e9d41d3..0b0334d52c1e9 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -896,21 +896,28 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys,
 
         if lab != cur:
             if lab != -1:
-                tup = PyTuple_New(k)
-                for j in range(k):
-                    val = keys[j][sorted_labels[j][i - 1]]
-                    PyTuple_SET_ITEM(tup, j, val)
-                    Py_INCREF(val)
-
+                if k == 1:
+                    # When k = 1 we do not want to return a tuple as key
+                    tup = keys[0][sorted_labels[0][i - 1]]
+                else:
+                    tup = PyTuple_New(k)
+                    for j in range(k):
+                        val = keys[j][sorted_labels[j][i - 1]]
+                        PyTuple_SET_ITEM(tup, j, val)
+                        Py_INCREF(val)
                 result[tup] = index[start:i]
             start = i
         cur = lab
 
-    tup = PyTuple_New(k)
-    for j in range(k):
-        val = keys[j][sorted_labels[j][n - 1]]
-        PyTuple_SET_ITEM(tup, j, val)
-        Py_INCREF(val)
+    if k == 1:
+        # When k = 1 we do not want to return a tuple as key
+        tup = keys[0][sorted_labels[0][n - 1]]
+    else:
+        tup = PyTuple_New(k)
+        for j in range(k):
+            val = keys[j][sorted_labels[j][n - 1]]
+            PyTuple_SET_ITEM(tup, j, val)
+            Py_INCREF(val)
     result[tup] = index[start:]
 
     return result
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index bca71b5c9646b..ccf23a6f24c42 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -229,12 +229,9 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
     @cache_readonly
     def indices(self):
         """ dict {group name -> group indices} """
-        if len(self.groupings) == 1:
-            return self.groupings[0].indices
-        else:
-            codes_list = [ping.codes for ping in self.groupings]
-            keys = [ping.group_index for ping in self.groupings]
-            return get_indexer_dict(codes_list, keys)
+        codes_list = [ping.codes for ping in self.groupings]
+        keys = [ping.group_index for ping in self.groupings]
+        return get_indexer_dict(codes_list, keys)
 
     @property
     def codes(self) -> List[np.ndarray]:
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 2e32a7572adc7..e390229b5dcba 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -4,6 +4,7 @@
     TYPE_CHECKING,
     Callable,
     DefaultDict,
+    Dict,
     Iterable,
     List,
     Optional,
@@ -528,16 +529,22 @@ def get_flattened_list(
     return [tuple(array) for array in arrays.values()]
 
 
-def get_indexer_dict(label_list, keys):
+def get_indexer_dict(
+    label_list: List[np.ndarray], keys: List["Index"]
+) -> Dict[Union[str, Tuple], np.ndarray]:
     """
     Returns
     -------
-    dict
+    dict:
         Labels mapped to indexers.
     """
     shape = [len(x) for x in keys]
 
     group_index = get_group_index(label_list, shape, sort=True, xnull=True)
+    if np.all(group_index == -1):
+        # When all keys are nan and dropna=True, indices_fast can't handle this
+        # and the return is empty anyway
+        return {}
     ngroups = (
         ((group_index.size and group_index.max()) + 1)
         if is_int64_overflow_possible(shape)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 2563eeeb68672..a0c228200e73a 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -1298,6 +1298,13 @@ def test_groupby_nat_exclude():
         grouped.get_group(pd.NaT)
 
 
+def test_groupby_two_group_keys_all_nan():
+    # GH #36842: Grouping over two group keys shouldn't raise an error
+    df = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]})
+    result = df.groupby(["a", "b"]).indices
+    assert result == {}
+
+
 def test_groupby_2d_malformed():
     d = DataFrame(index=range(2))
     d["group"] = ["g1", "g2"]
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 29a8f883f0ff5..02ce4dcf2ae2b 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -2,7 +2,7 @@
 import pytest
 
 import pandas as pd
-import pandas.testing as tm
+import pandas._testing as tm
 
 
 @pytest.mark.parametrize(
@@ -335,3 +335,21 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data,
 
     expected = pd.DataFrame(selected_data, index=mi)
     tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_nan_included():
+    # GH 35646
+    data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
+    df = pd.DataFrame(data)
+    grouped = df.groupby("group", dropna=False)
+    result = grouped.indices
+    dtype = "int64"
+    expected = {
+        "g1": np.array([0, 2], dtype=dtype),
+        "g2": np.array([3], dtype=dtype),
+        np.nan: np.array([1, 4], dtype=dtype),
+    }
+    for result_values, expected_values in zip(result.values(), expected.values()):
+        tm.assert_numpy_array_equal(result_values, expected_values)
+    assert np.isnan(list(result.keys())[2])
+    assert list(result.keys())[0:2] == ["g1", "g2"]
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
index 2c8439aae75e5..02bcfab8d3388 100644
--- a/pandas/tests/window/test_rolling.py
+++ b/pandas/tests/window/test_rolling.py
@@ -1087,3 +1087,18 @@ def test_rolling_corr_timedelta_index(index, window):
     result = x.rolling(window).corr(y)
     expected = Series([np.nan, np.nan, 1, 1, 1], index=index)
     tm.assert_almost_equal(result, expected)
+
+
+def test_groupby_rolling_nan_included():
+    # GH 35542
+    data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
+    df = DataFrame(data)
+    result = df.groupby("group", dropna=False).rolling(1, min_periods=1).mean()
+    expected = DataFrame(
+        {"B": [0.0, 2.0, 3.0, 1.0, 4.0]},
+        index=pd.MultiIndex.from_tuples(
+            [("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)],
+            names=["group", None],
+        ),
+    )
+    tm.assert_frame_equal(result, expected)