Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix/test SparseSeries/SparseDataFrame stack/unstack #16616

Merged
merged 8 commits into from
Sep 26, 2017
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ Sparse

- Bug in ``SparseSeries`` raising ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`)
- Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`)

- Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`)

Reshaping
^^^^^^^^^
Expand Down
10 changes: 8 additions & 2 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,16 @@ def f(self, other):
return f


def maybe_to_categorical(array):
""" coerce to a categorical if a series is given """
def _maybe_to_categorical(array):
"""
Coerce to a categorical if a series is given.

Internal use ONLY.
"""
if isinstance(array, (ABCSeries, ABCCategoricalIndex)):
return array._values
elif isinstance(array, np.ndarray):
return Categorical(array)
return array


Expand Down
97 changes: 95 additions & 2 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@

from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import maybe_convert_indices, length_of_indexer
from pandas.core.categorical import Categorical, maybe_to_categorical
from pandas.core.categorical import Categorical, _maybe_to_categorical
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.io.formats.printing import pprint_thing

Expand Down Expand Up @@ -1484,6 +1484,35 @@ def equals(self, other):
return False
return array_equivalent(self.values, other.values)

def _unstack(self, unstacker_func, new_columns):
    """Unstack this block's values and return the resulting blocks.

    Parameters
    ----------
    unstacker_func : callable
        Partially applied unstacker; it is called with this block's
        transposed values.
    new_columns : Index
        All columns of the unstacked BlockManager.

    Returns
    -------
    blocks : list of Block
        New blocks of unstacked values.
    mask : array_like of bool
        The mask of columns of `blocks` we should keep.
    """
    block_unstacker = unstacker_func(self.values.T)
    # positions of this block's unstacked columns within `new_columns`
    locs = new_columns.get_indexer(block_unstacker.get_new_columns())
    unstacked, value_mask = block_unstacker.get_new_values()

    # collapse the value mask along axis 0 into one flag per new column
    keep = value_mask.any(0)
    kept_block = make_block(unstacked.T[keep], placement=locs[keep])
    return [kept_block], keep

def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
"""
compute the quantiles of the
Expand Down Expand Up @@ -1712,6 +1741,38 @@ def _slice(self, slicer):
def _try_cast_result(self, result, dtype=None):
return result

def _unstack(self, unstacker_func, new_columns):
    """Unstack this block's values, producing one block per new item.

    Parameters
    ----------
    unstacker_func : callable
        Partially applied unstacker; it is called with this block's
        transposed values.
    new_columns : Index
        All columns of the unstacked BlockManager.

    Returns
    -------
    blocks : list of Block
        New blocks of unstacked values.
    mask : array_like of bool
        The mask of columns of `blocks` we should keep.
    """
    block_unstacker = unstacker_func(self.values.T)
    # positions of this block's unstacked columns within `new_columns`
    locs = new_columns.get_indexer(block_unstacker.get_new_columns())
    unstacked, value_mask = block_unstacker.get_new_values()

    # collapse the value mask along axis 0 into one flag per new column
    keep = value_mask.any(0)
    kept_values = unstacked.T[keep]
    kept_locs = locs[keep]

    # NonConsolidatable blocks can have a single item only, so every kept
    # item gets a block of its own
    blocks = []
    for item_values, loc in zip(kept_values, kept_locs):
        blocks.append(self.make_block_same_class(item_values, [loc]))
    return blocks, keep


class NumericBlock(Block):
__slots__ = ()
Expand Down Expand Up @@ -2227,7 +2288,7 @@ class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock):
def __init__(self, values, placement, fastpath=False, **kwargs):

# coerce to categorical if we can
super(CategoricalBlock, self).__init__(maybe_to_categorical(values),
super(CategoricalBlock, self).__init__(_maybe_to_categorical(values),
fastpath=True,
placement=placement, **kwargs)

Expand Down Expand Up @@ -4167,6 +4228,38 @@ def canonicalize(block):
return all(block.equals(oblock)
for block, oblock in zip(self_blocks, other_blocks))

def unstack(self, unstacker_func):
    """Return a blockmanager with all blocks unstacked.

    Parameters
    ----------
    unstacker_func : callable
        A (partially-applied) ``pd.core.reshape._Unstacker`` class.

    Returns
    -------
    unstacked : BlockManager
    """
    # An unstacker over an empty array is only used to obtain the new axes.
    axes_probe = unstacker_func(np.empty((0, 0)), value_columns=self.items)
    new_columns = axes_probe.get_new_columns()
    new_index = axes_probe.get_new_index()

    unstacked_blocks = []
    keep_columns = []
    for block in self.blocks:
        # bind the unstacker to the items that this block holds
        block_items = self.items[block.mgr_locs.indexer]
        block_unstacker = partial(unstacker_func, value_columns=block_items)

        pieces, piece_mask = block._unstack(block_unstacker, new_columns)
        unstacked_blocks.extend(pieces)
        keep_columns.extend(piece_mask)

    kept_columns = new_columns[keep_columns]
    return BlockManager(unstacked_blocks, [kept_columns, new_index])


class SingleBlockManager(BlockManager):
""" manage a single block with """
Expand Down
49 changes: 17 additions & 32 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# pylint: disable=W0703,W0622,W0613,W0201
from pandas.compat import range, text_type, zip
from pandas import compat
from functools import partial
import itertools
import re

Expand All @@ -10,7 +11,7 @@
from pandas.core.dtypes.common import (
_ensure_platform_int,
is_list_like, is_bool_dtype,
needs_i8_conversion)
needs_i8_conversion, is_sparse)
from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.missing import notna
import pandas.core.dtypes.concat as _concat
Expand Down Expand Up @@ -75,10 +76,15 @@ def __init__(self, values, index, level=-1, value_columns=None,
fill_value=None):

self.is_categorical = None
self.is_sparse = is_sparse(values)
if values.ndim == 1:
if isinstance(values, Categorical):
self.is_categorical = values
values = np.array(values)
elif self.is_sparse:
# XXX: Makes SparseArray *dense*, but it's supposedly
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this a TODO? or a comment?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A comment, with a mild TODO hint for whoever eventually refactors the whole thing to take the note into consideration.

# a single column at a time, so it's "doable"
values = values.values
values = values[:, np.newaxis]
self.values = values
self.value_columns = value_columns
Expand Down Expand Up @@ -177,7 +183,8 @@ def get_result(self):
ordered=ordered)
for i in range(values.shape[-1])]

return DataFrame(values, index=index, columns=columns)
klass = SparseDataFrame if self.is_sparse else DataFrame
return klass(values, index=index, columns=columns)

def get_new_values(self):
values = self.values
Expand Down Expand Up @@ -469,36 +476,12 @@ def unstack(obj, level, fill_value=None):


def _unstack_frame(obj, level, fill_value=None):
from pandas.core.internals import BlockManager, make_block

if obj._is_mixed_type:
unstacker = _Unstacker(np.empty(obj.shape, dtype=bool), # dummy
obj.index, level=level,
value_columns=obj.columns)
new_columns = unstacker.get_new_columns()
new_index = unstacker.get_new_index()
new_axes = [new_columns, new_index]

new_blocks = []
mask_blocks = []
for blk in obj._data.blocks:
blk_items = obj._data.items[blk.mgr_locs.indexer]
bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
value_columns=blk_items,
fill_value=fill_value)
new_items = bunstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = bunstacker.get_new_values()

mblk = make_block(mask.T, placement=new_placement)
mask_blocks.append(mblk)

newb = make_block(new_values.T, placement=new_placement)
new_blocks.append(newb)

result = DataFrame(BlockManager(new_blocks, new_axes))
mask_frame = DataFrame(BlockManager(mask_blocks, new_axes))
return result.loc[:, mask_frame.sum(0) > 0]
unstacker = partial(_Unstacker, index=obj.index,
level=level, fill_value=fill_value)
blocks = obj._data.unstack(unstacker)
klass = type(obj)
return klass(blocks)
else:
unstacker = _Unstacker(obj.values, obj.index, level=level,
value_columns=obj.columns,
Expand Down Expand Up @@ -559,7 +542,9 @@ def factorize(index):
mask = notna(new_values)
new_values = new_values[mask]
new_index = new_index[mask]
return Series(new_values, index=new_index)

klass = type(frame)._constructor_sliced
return klass(new_values, index=new_index)


def stack_multiple(frame, level, dropna=True):
Expand Down
38 changes: 38 additions & 0 deletions pandas/tests/sparse/test_reshape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pytest
import numpy as np

import pandas as pd
import pandas.util.testing as tm


@pytest.fixture
def sparse_df():
    """SparseDataFrame whose only entries are ones on the diagonal (eye)."""
    eye = {0: {0: 1}, 1: {1: 1}, 2: {2: 1}}
    return pd.SparseDataFrame(eye)


@pytest.fixture
def multi_index3():
    """Two-level MultiIndex built from the tuples (0, 0), (1, 1), (2, 2)."""
    diagonal = [(0, 0), (1, 1), (2, 2)]
    return pd.MultiIndex.from_tuples(diagonal)


def test_sparse_frame_stack(sparse_df, multi_index3):
    """Stacking the sparse frame is compared against a SparseSeries of ones."""
    stacked = sparse_df.stack()
    ones = np.ones(3)
    tm.assert_sp_series_equal(stacked,
                              pd.SparseSeries(ones, index=multi_index3))


def test_sparse_frame_unstack(sparse_df):
    """Unstacked sparse frame values are compared with the dense result."""
    mi = pd.MultiIndex.from_tuples([(0, 0), (1, 0), (1, 2)])
    sparse_df.index = mi

    nan = np.nan
    dense_values = np.array([[1, nan, nan],
                             [nan, 1, nan],
                             [nan, nan, 1]])
    dense_unstacked = pd.DataFrame(dense_values, index=mi).unstack()
    sparse_unstacked = sparse_df.unstack()

    tm.assert_numpy_array_equal(dense_unstacked.values,
                                sparse_unstacked.values)


def test_sparse_series_unstack(sparse_df, multi_index3):
    """Unstacking a SparseSeries of ones is compared with the eye frame."""
    series = pd.SparseSeries(np.ones(3), index=multi_index3)
    unstacked = series.unstack()
    tm.assert_sp_frame_equal(unstacked, sparse_df)