From d3be81ad595c5338781bed9963c729a9702e6611 Mon Sep 17 00:00:00 2001 From: kernc Date: Tue, 26 Sep 2017 21:01:39 +0200 Subject: [PATCH] BUG: Fix/test SparseSeries/SparseDataFrame stack/unstack (#16616) --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/categorical.py | 10 ++- pandas/core/internals.py | 97 ++++++++++++++++++++++++++++- pandas/core/reshape/reshape.py | 49 +++++---------- pandas/tests/sparse/test_reshape.py | 38 +++++++++++ 5 files changed, 159 insertions(+), 37 deletions(-) create mode 100644 pandas/tests/sparse/test_reshape.py diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index b6bd86bd79a1f..06f19782682b0 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -607,7 +607,7 @@ Sparse - Bug in ``SparseSeries`` raises ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`) - Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`) - +- Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 011aa74632296..d79937829cf3f 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -125,10 +125,16 @@ def f(self, other): return f -def maybe_to_categorical(array): - """ coerce to a categorical if a series is given """ +def _maybe_to_categorical(array): + """ + Coerce to a categorical if a series is given. + + Internal use ONLY. + """ if isinstance(array, (ABCSeries, ABCCategoricalIndex)): return array._values + elif isinstance(array, np.ndarray): + return Categorical(array) return array diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e6f61a22e3137..9e348819ce5a3 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -56,7 +56,7 @@ from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import maybe_convert_indices, length_of_indexer -from pandas.core.categorical import Categorical, maybe_to_categorical +from pandas.core.categorical import Categorical, _maybe_to_categorical from pandas.core.indexes.datetimes import DatetimeIndex from pandas.io.formats.printing import pprint_thing @@ -1484,6 +1484,35 @@ def equals(self, other): return False return array_equivalent(self.values, other.values) + def _unstack(self, unstacker_func, new_columns): + """Return a list of unstacked blocks of self + + Parameters + ---------- + unstacker_func : callable + Partially applied unstacker. + new_columns : Index + All columns of the unstacked BlockManager. + + Returns + ------- + blocks : list of Block + New blocks of unstacked values. + mask : array_like of bool + The mask of columns of `blocks` we should keep. + """ + unstacker = unstacker_func(self.values.T) + new_items = unstacker.get_new_columns() + new_placement = new_columns.get_indexer(new_items) + new_values, mask = unstacker.get_new_values() + + mask = mask.any(0) + new_values = new_values.T[mask] + new_placement = new_placement[mask] + + blocks = [make_block(new_values, placement=new_placement)] + return blocks, mask + def quantile(self, qs, interpolation='linear', axis=0, mgr=None): """ compute the quantiles of the @@ -1712,6 +1741,38 @@ def _slice(self, slicer): def _try_cast_result(self, result, dtype=None): return result + def _unstack(self, unstacker_func, new_columns): + """Return a list of unstacked blocks of self + + Parameters + ---------- + unstacker_func : callable + Partially applied unstacker. + new_columns : Index + All columns of the unstacked BlockManager. + + Returns + ------- + blocks : list of Block + New blocks of unstacked values. + mask : array_like of bool + The mask of columns of `blocks` we should keep. + """ + # NonConsolidatable blocks can have a single item only, so we return + # one block per item + unstacker = unstacker_func(self.values.T) + new_items = unstacker.get_new_columns() + new_placement = new_columns.get_indexer(new_items) + new_values, mask = unstacker.get_new_values() + + mask = mask.any(0) + new_values = new_values.T[mask] + new_placement = new_placement[mask] + + blocks = [self.make_block_same_class(vals, [place]) + for vals, place in zip(new_values, new_placement)] + return blocks, mask + class NumericBlock(Block): __slots__ = () @@ -2227,7 +2288,7 @@ class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock): def __init__(self, values, placement, fastpath=False, **kwargs): # coerce to categorical if we can - super(CategoricalBlock, self).__init__(maybe_to_categorical(values), + super(CategoricalBlock, self).__init__(_maybe_to_categorical(values), fastpath=True, placement=placement, **kwargs) @@ -4192,6 +4253,38 @@ def canonicalize(block): return all(block.equals(oblock) for block, oblock in zip(self_blocks, other_blocks)) + def unstack(self, unstacker_func): + """Return a blockmanager with all blocks unstacked. + + Parameters + ---------- + unstacker_func : callable + A (partially-applied) ``pd.core.reshape._Unstacker`` class. + + Returns + ------- + unstacked : BlockManager + """ + dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items) + new_columns = dummy.get_new_columns() + new_index = dummy.get_new_index() + new_blocks = [] + columns_mask = [] + + for blk in self.blocks: + blocks, mask = blk._unstack( + partial(unstacker_func, + value_columns=self.items[blk.mgr_locs.indexer]), + new_columns) + + new_blocks.extend(blocks) + columns_mask.extend(mask) + + new_columns = new_columns[columns_mask] + + bm = BlockManager(new_blocks, [new_columns, new_index]) + return bm + class SingleBlockManager(BlockManager): """ manage a single block with """ diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index bff09be6149f3..d280c4f3f73d7 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -2,6 +2,7 @@ # pylint: disable=W0703,W0622,W0613,W0201 from pandas.compat import range, text_type, zip from pandas import compat +from functools import partial import itertools import re @@ -10,7 +11,7 @@ from pandas.core.dtypes.common import ( _ensure_platform_int, is_list_like, is_bool_dtype, - needs_i8_conversion) + needs_i8_conversion, is_sparse) from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.missing import notna import pandas.core.dtypes.concat as _concat @@ -75,10 +76,15 @@ def __init__(self, values, index, level=-1, value_columns=None, fill_value=None): self.is_categorical = None + self.is_sparse = is_sparse(values) if values.ndim == 1: if isinstance(values, Categorical): self.is_categorical = values values = np.array(values) + elif self.is_sparse: + # XXX: Makes SparseArray *dense*, but it's supposedly + # a single column at a time, so it's "doable" + values = values.values values = values[:, np.newaxis] self.values = values self.value_columns = value_columns @@ -177,7 +183,8 @@ def get_result(self): ordered=ordered) for i in range(values.shape[-1])] - return DataFrame(values, index=index, columns=columns) + klass = SparseDataFrame if self.is_sparse else DataFrame + return klass(values, index=index, columns=columns) def get_new_values(self): values = self.values @@ -469,36 +476,12 @@ def unstack(obj, level, fill_value=None): def _unstack_frame(obj, level, fill_value=None): - from pandas.core.internals import BlockManager, make_block - if obj._is_mixed_type: - unstacker = _Unstacker(np.empty(obj.shape, dtype=bool), # dummy - obj.index, level=level, - value_columns=obj.columns) - new_columns = unstacker.get_new_columns() - new_index = unstacker.get_new_index() - new_axes = [new_columns, new_index] - - new_blocks = [] - mask_blocks = [] - for blk in obj._data.blocks: - blk_items = obj._data.items[blk.mgr_locs.indexer] - bunstacker = _Unstacker(blk.values.T, obj.index, level=level, - value_columns=blk_items, - fill_value=fill_value) - new_items = bunstacker.get_new_columns() - new_placement = new_columns.get_indexer(new_items) - new_values, mask = bunstacker.get_new_values() - - mblk = make_block(mask.T, placement=new_placement) - mask_blocks.append(mblk) - - newb = make_block(new_values.T, placement=new_placement) - new_blocks.append(newb) - - result = DataFrame(BlockManager(new_blocks, new_axes)) - mask_frame = DataFrame(BlockManager(mask_blocks, new_axes)) - return result.loc[:, mask_frame.sum(0) > 0] + unstacker = partial(_Unstacker, index=obj.index, + level=level, fill_value=fill_value) + blocks = obj._data.unstack(unstacker) + klass = type(obj) + return klass(blocks) else: unstacker = _Unstacker(obj.values, obj.index, level=level, value_columns=obj.columns, @@ -559,7 +542,9 @@ def factorize(index): mask = notna(new_values) new_values = new_values[mask] new_index = new_index[mask] - return Series(new_values, index=new_index) + + klass = type(frame)._constructor_sliced + return klass(new_values, index=new_index) def stack_multiple(frame, level, dropna=True): diff --git a/pandas/tests/sparse/test_reshape.py b/pandas/tests/sparse/test_reshape.py new file mode 100644 index 0000000000000..b492c47375bcf --- /dev/null +++ b/pandas/tests/sparse/test_reshape.py @@ -0,0 +1,38 @@ +import pytest +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + + +@pytest.fixture +def sparse_df(): + return pd.SparseDataFrame({0: {0: 1}, 1: {1: 1}, 2: {2: 1}}) # eye + + +@pytest.fixture +def multi_index3(): + return pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)]) + + +def test_sparse_frame_stack(sparse_df, multi_index3): + ss = sparse_df.stack() + expected = pd.SparseSeries(np.ones(3), index=multi_index3) + tm.assert_sp_series_equal(ss, expected) + + +def test_sparse_frame_unstack(sparse_df): + mi = pd.MultiIndex.from_tuples([(0, 0), (1, 0), (1, 2)]) + sparse_df.index = mi + arr = np.array([[1, np.nan, np.nan], + [np.nan, 1, np.nan], + [np.nan, np.nan, 1]]) + unstacked_df = pd.DataFrame(arr, index=mi).unstack() + unstacked_sdf = sparse_df.unstack() + + tm.assert_numpy_array_equal(unstacked_df.values, unstacked_sdf.values) + + +def test_sparse_series_unstack(sparse_df, multi_index3): + frame = pd.SparseSeries(np.ones(3), index=multi_index3).unstack() + tm.assert_sp_frame_equal(frame, sparse_df)