Skip to content

Commit

Permalink
BUG: Fix/test SparseSeries/SparseDataFrame stack/unstack (#16616)
Browse files Browse the repository at this point in the history
  • Loading branch information
kernc authored and jreback committed Sep 26, 2017
1 parent 66f4cc1 commit d3be81a
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 37 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -607,7 +607,7 @@ Sparse

- Bug in ``SparseSeries`` raises ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`)
- Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`)

- Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`)

Reshaping
^^^^^^^^^
Expand Down
10 changes: 8 additions & 2 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,10 +125,16 @@ def f(self, other):
return f


def maybe_to_categorical(array):
""" coerce to a categorical if a series is given """
def _maybe_to_categorical(array):
"""
Coerce to a categorical if a series is given.
Internal use ONLY.
"""
if isinstance(array, (ABCSeries, ABCCategoricalIndex)):
return array._values
elif isinstance(array, np.ndarray):
return Categorical(array)
return array


Expand Down
97 changes: 95 additions & 2 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@

from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import maybe_convert_indices, length_of_indexer
from pandas.core.categorical import Categorical, maybe_to_categorical
from pandas.core.categorical import Categorical, _maybe_to_categorical
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.io.formats.printing import pprint_thing

Expand Down Expand Up @@ -1484,6 +1484,35 @@ def equals(self, other):
return False
return array_equivalent(self.values, other.values)

def _unstack(self, unstacker_func, new_columns):
    """
    Return a list of unstacked blocks of self.

    Parameters
    ----------
    unstacker_func : callable
        Partially applied unstacker; called with this block's
        transposed values.
    new_columns : Index
        All columns of the unstacked BlockManager.

    Returns
    -------
    blocks : list of Block
        A single-element list holding the block of unstacked values.
    mask : array_like of bool
        The mask of columns of `blocks` we should keep.
    """
    reshaper = unstacker_func(self.values.T)
    # Locate this block's unstacked columns inside the full column index.
    placement = new_columns.get_indexer(reshaper.get_new_columns())
    unstacked, value_mask = reshaper.get_new_values()

    # Keep only the columns whose mask has at least one true entry.
    keep = value_mask.any(0)
    kept_block = make_block(unstacked.T[keep], placement=placement[keep])
    return [kept_block], keep

def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
"""
compute the quantiles of the
Expand Down Expand Up @@ -1712,6 +1741,38 @@ def _slice(self, slicer):
def _try_cast_result(self, result, dtype=None):
return result

def _unstack(self, unstacker_func, new_columns):
"""Return a list of unstacked blocks of self
Parameters
----------
unstacker_func : callable
Partially applied unstacker.
new_columns : Index
All columns of the unstacked BlockManager.
Returns
-------
blocks : list of Block
New blocks of unstacked values.
mask : array_like of bool
The mask of columns of `blocks` we should keep.
"""
# NonConsolidatable blocks can have a single item only, so we return
# one block per item
unstacker = unstacker_func(self.values.T)
new_items = unstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = unstacker.get_new_values()

mask = mask.any(0)
new_values = new_values.T[mask]
new_placement = new_placement[mask]

blocks = [self.make_block_same_class(vals, [place])
for vals, place in zip(new_values, new_placement)]
return blocks, mask


class NumericBlock(Block):
__slots__ = ()
Expand Down Expand Up @@ -2227,7 +2288,7 @@ class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock):
def __init__(self, values, placement, fastpath=False, **kwargs):

# coerce to categorical if we can
super(CategoricalBlock, self).__init__(maybe_to_categorical(values),
super(CategoricalBlock, self).__init__(_maybe_to_categorical(values),
fastpath=True,
placement=placement, **kwargs)

Expand Down Expand Up @@ -4192,6 +4253,38 @@ def canonicalize(block):
return all(block.equals(oblock)
for block, oblock in zip(self_blocks, other_blocks))

def unstack(self, unstacker_func):
    """
    Return a blockmanager with all blocks unstacked.

    Parameters
    ----------
    unstacker_func : callable
        A (partially-applied) ``pd.core.reshape._Unstacker`` class.

    Returns
    -------
    unstacked : BlockManager
    """
    # An unstacker over an empty array is built only to obtain the
    # resulting columns and index.
    axes_probe = unstacker_func(np.empty((0, 0)), value_columns=self.items)
    all_columns = axes_probe.get_new_columns()
    result_index = axes_probe.get_new_index()

    result_blocks = []
    keep_flags = []
    for block in self.blocks:
        # Bind this block's own items as the unstacker's value columns.
        block_items = self.items[block.mgr_locs.indexer]
        block_unstacker = partial(unstacker_func, value_columns=block_items)
        pieces, piece_mask = block._unstack(block_unstacker, all_columns)

        result_blocks.extend(pieces)
        keep_flags.extend(piece_mask)

    kept_columns = all_columns[keep_flags]
    return BlockManager(result_blocks, [kept_columns, result_index])


class SingleBlockManager(BlockManager):
""" manage a single block with """
Expand Down
49 changes: 17 additions & 32 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# pylint: disable=W0703,W0622,W0613,W0201
from pandas.compat import range, text_type, zip
from pandas import compat
from functools import partial
import itertools
import re

Expand All @@ -10,7 +11,7 @@
from pandas.core.dtypes.common import (
_ensure_platform_int,
is_list_like, is_bool_dtype,
needs_i8_conversion)
needs_i8_conversion, is_sparse)
from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.missing import notna
import pandas.core.dtypes.concat as _concat
Expand Down Expand Up @@ -75,10 +76,15 @@ def __init__(self, values, index, level=-1, value_columns=None,
fill_value=None):

self.is_categorical = None
self.is_sparse = is_sparse(values)
if values.ndim == 1:
if isinstance(values, Categorical):
self.is_categorical = values
values = np.array(values)
elif self.is_sparse:
# XXX: Makes SparseArray *dense*, but it's supposedly
# a single column at a time, so it's "doable"
values = values.values
values = values[:, np.newaxis]
self.values = values
self.value_columns = value_columns
Expand Down Expand Up @@ -177,7 +183,8 @@ def get_result(self):
ordered=ordered)
for i in range(values.shape[-1])]

return DataFrame(values, index=index, columns=columns)
klass = SparseDataFrame if self.is_sparse else DataFrame
return klass(values, index=index, columns=columns)

def get_new_values(self):
values = self.values
Expand Down Expand Up @@ -469,36 +476,12 @@ def unstack(obj, level, fill_value=None):


def _unstack_frame(obj, level, fill_value=None):
from pandas.core.internals import BlockManager, make_block

if obj._is_mixed_type:
unstacker = _Unstacker(np.empty(obj.shape, dtype=bool), # dummy
obj.index, level=level,
value_columns=obj.columns)
new_columns = unstacker.get_new_columns()
new_index = unstacker.get_new_index()
new_axes = [new_columns, new_index]

new_blocks = []
mask_blocks = []
for blk in obj._data.blocks:
blk_items = obj._data.items[blk.mgr_locs.indexer]
bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
value_columns=blk_items,
fill_value=fill_value)
new_items = bunstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = bunstacker.get_new_values()

mblk = make_block(mask.T, placement=new_placement)
mask_blocks.append(mblk)

newb = make_block(new_values.T, placement=new_placement)
new_blocks.append(newb)

result = DataFrame(BlockManager(new_blocks, new_axes))
mask_frame = DataFrame(BlockManager(mask_blocks, new_axes))
return result.loc[:, mask_frame.sum(0) > 0]
unstacker = partial(_Unstacker, index=obj.index,
level=level, fill_value=fill_value)
blocks = obj._data.unstack(unstacker)
klass = type(obj)
return klass(blocks)
else:
unstacker = _Unstacker(obj.values, obj.index, level=level,
value_columns=obj.columns,
Expand Down Expand Up @@ -559,7 +542,9 @@ def factorize(index):
mask = notna(new_values)
new_values = new_values[mask]
new_index = new_index[mask]
return Series(new_values, index=new_index)

klass = type(frame)._constructor_sliced
return klass(new_values, index=new_index)


def stack_multiple(frame, level, dropna=True):
Expand Down
38 changes: 38 additions & 0 deletions pandas/tests/sparse/test_reshape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pytest
import numpy as np

import pandas as pd
import pandas.util.testing as tm


@pytest.fixture
def sparse_df():
    """Return a SparseDataFrame whose only entries are ones at (i, i), i = 0..2."""
    diagonal = {i: {i: 1} for i in range(3)}
    return pd.SparseDataFrame(diagonal)  # eye


@pytest.fixture
def multi_index3():
    """Return a two-level MultiIndex with the tuples (0, 0), (1, 1), (2, 2)."""
    diagonal_pairs = [(i, i) for i in range(3)]
    return pd.MultiIndex.from_tuples(diagonal_pairs)


def test_sparse_frame_stack(sparse_df, multi_index3):
    """Stacking the sparse frame must equal a SparseSeries of ones on ``multi_index3``."""
    stacked = sparse_df.stack()
    tm.assert_sp_series_equal(
        stacked,
        pd.SparseSeries(np.ones(3), index=multi_index3),
    )


def test_sparse_frame_unstack(sparse_df):
    """Unstacking the sparse frame must give the same values as its dense twin."""
    row_index = pd.MultiIndex.from_tuples([(0, 0), (1, 0), (1, 2)])
    sparse_df.index = row_index

    # Dense counterpart: NaN everywhere except ones on the diagonal.
    dense_values = np.full((3, 3), np.nan)
    np.fill_diagonal(dense_values, 1)

    dense_result = pd.DataFrame(dense_values, index=row_index).unstack()
    sparse_result = sparse_df.unstack()

    tm.assert_numpy_array_equal(dense_result.values, sparse_result.values)


def test_sparse_series_unstack(sparse_df, multi_index3):
    """Unstacking a SparseSeries of ones on ``multi_index3`` must equal ``sparse_df``."""
    ones_series = pd.SparseSeries(np.ones(3), index=multi_index3)
    unstacked = ones_series.unstack()
    tm.assert_sp_frame_equal(unstacked, sparse_df)

0 comments on commit d3be81a

Please sign in to comment.