REF: refactor MultiIndex to not store tuples, be more efficient. testing and compatibility checks. close #1467
wesm committed Jun 14, 2012
1 parent 09118d2 commit e79f481
Showing 12 changed files with 127 additions and 52 deletions.
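The heart of the refactor: a MultiIndex now keeps only its levels (the unique values per level) and labels (integer codes into those levels), and materializes the object-dtype array of tuples lazily, on first access. A minimal standalone sketch of that storage scheme (LazyTupleIndex is an illustrative name, not pandas code):

    import numpy as np

    class LazyTupleIndex(object):
        def __init__(self, levels, labels):
            self.levels = levels    # unique values, one array per level
            self.labels = labels    # integer codes into each level
            self._tuples = None     # cache, filled on first access

        def __len__(self):
            return len(self.labels[0])

        @property
        def values(self):
            if self._tuples is None:
                cols = [np.asarray(lev).take(lab)
                        for lev, lab in zip(self.levels, self.labels)]
                out = np.empty(len(self), dtype=object)
                out[:] = list(zip(*cols))  # 1-D object array of tuples
                self._tuples = out
            return self._tuples

    idx = LazyTupleIndex([np.array(['a', 'b']), np.array([1, 2])],
                         [np.array([0, 0, 1]), np.array([0, 1, 0])])
    # idx.values -> array([('a', 1), ('a', 2), ('b', 1)], dtype=object)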
6 changes: 3 additions & 3 deletions pandas/core/algorithms.py
@@ -78,21 +78,21 @@ def _count_generic(values, table_type, type_caster):
from pandas.core.series import Series

values = type_caster(values)
table = table_type(len(values))
table = table_type(min(len(values), 1000000))
uniques, labels, counts = table.factorize(values)

return Series(counts, index=uniques)

def _match_generic(values, index, table_type, type_caster):
values = type_caster(values)
index = type_caster(index)
table = table_type(len(index))
table = table_type(min(len(index), 1000000))
table.map_locations(index)
return table.lookup(values)

def _unique_generic(values, table_type, type_caster):
values = type_caster(values)
table = table_type(len(values))
table = table_type(min(len(values), 1000000))
uniques = table.unique(values)
return type_caster(uniques)
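All three call sites in this file apply the same fix: cap the hash table's preallocation at one million slots instead of sizing it to the full input, so very large inputs no longer pay a huge up-front allocation (klib-style tables grow on demand anyway). The pattern, sketched with a hypothetical table_type:

    _SIZE_CUTOFF = 1000000  # same hardcoded cap as in the diff

    def make_table(table_type, n):
        # preallocate at most _SIZE_CUTOFF buckets; the table resizes
        # itself later if more than that many entries actually arrive
        return table_type(min(n, _SIZE_CUTOFF))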

2 changes: 2 additions & 0 deletions pandas/core/common.py
@@ -489,6 +489,8 @@ def _possibly_cast_item(obj, item, dtype):

def _is_bool_indexer(key):
if isinstance(key, np.ndarray) and key.dtype == np.object_:
key = np.asarray(key)

if not lib.is_bool_array(key):
if isnull(key).any():
raise ValueError('cannot index with vector containing '
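The added np.asarray call normalizes the object-dtype key before lib.is_bool_array inspects it; a mask containing NA/NaN is rejected because a missing value is neither True nor False. A rough Python-level sketch of the rule being enforced (not the Cython implementation):

    import numpy as np

    def sketch_is_bool_indexer(key):
        key = np.asarray(key)
        if key.dtype == np.object_:
            if all(isinstance(v, (bool, np.bool_)) for v in key):
                return True
            if any(v != v for v in key):  # NaN is the only value != itself
                raise ValueError('cannot index with vector containing '
                                 'NA / NaN values')
            return False
        return key.dtype == np.bool_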
4 changes: 3 additions & 1 deletion pandas/core/frame.py
@@ -3514,8 +3514,10 @@ def _apply_standard(self, func, axis, ignore_failures=False):
values = self.values
dummy = Series(np.nan, index=self._get_axis(axis),
dtype=values.dtype)

labels = self._get_agg_axis(axis)
result = lib.reduce(values, func, axis=axis, dummy=dummy,
labels=self._get_agg_axis(axis))
labels=labels)
return Series(result, index=self._get_agg_axis(axis))
except Exception:
pass
2 changes: 1 addition & 1 deletion pandas/core/groupby.py
@@ -2084,7 +2084,7 @@ def _compress_group_index(group_index, sort=True):
"""

uniques = []
table = lib.Int64HashTable(len(group_index))
table = lib.Int64HashTable(min(1000000, len(group_index)))

group_index = com._ensure_int64(group_index)

94 changes: 63 additions & 31 deletions pandas/core/index.py
@@ -230,7 +230,7 @@ def _cleanup(self):
@cache_readonly
def _engine(self):
# property, for now, slow to look up
return self._engine_type(weakref.ref(self))
return self._engine_type(lambda: self.values, len(self))

def _get_level_number(self, level):
if not isinstance(level, int):
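Previously the engine held a weakref to the whole Index; now it receives a callable that returns just the values, plus the length up front, so the engine never touches the Index object itself. The engine side of this contract appears in the engines.pyx hunk further down; in plain Python it amounts to:

    class EngineSketch(object):
        SIZE_CUTOFF = 1000000

        def __init__(self, vgetter, n):
            # vgetter: zero-argument callable yielding the index values;
            # n: length, known at construction, used for the size heuristic
            self.vgetter = vgetter
            self.over_size_threshold = n >= self.SIZE_CUTOFF

        def _get_index_values(self):
            return self.vgetter()

    # wired up as in Index._engine above:
    # engine = EngineSketch(lambda: self.values, len(self))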
@@ -752,7 +752,10 @@ def isin(self, values):
is_contained : ndarray (boolean dtype)
"""
value_set = set(values)
return lib.ismember(self, value_set)
return lib.ismember(self._array_values(), value_set)

def _array_values(self):
return self

def _get_method(self, method):
if method:
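_array_values is a new hook: the base Index hands lib.ismember its own ndarray, while MultiIndex (below) overrides it to return the materialized tuple array, so membership tests always see hashable scalars. Equivalent pure-Python behavior, for illustration only:

    import numpy as np

    def isin_sketch(index, values):
        value_set = set(values)
        arr = index._array_values()  # an array of tuples for a MultiIndex
        return np.array([v in value_set for v in arr], dtype=bool)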
@@ -1223,14 +1226,8 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None):
levels = [_ensure_index(lev) for lev in levels]
labels = [np.asarray(labs, dtype=np.int_) for labs in labels]

values = [ndtake(lev.values, lab)
for lev, lab in zip(levels, labels)]

# Need to box timestamps, etc.
values = _clean_arrays(values)

subarr = lib.fast_zip(values).view(cls)

# v3, 0.8.0
subarr = np.empty(0, dtype=object).view(cls)
subarr.levels = levels
subarr.labels = labels
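Construction no longer zips the levels into tuples: the ndarray buffer backing the index is now a zero-length object array (the '# v3, 0.8.0' marker), and levels/labels are attached as attributes. The ndarray-subclass trick, in a self-contained sketch:

    import numpy as np

    class MiniMultiIndex(np.ndarray):
        def __new__(cls, levels, labels):
            # empty buffer: the ndarray is a stub, the real data
            # lives entirely in .levels and .labels
            subarr = np.empty(0, dtype=object).view(cls)
            subarr.levels = levels
            subarr.labels = labels
            return subarr

        def __len__(self):
            return len(self.labels[0])  # not the buffer's length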

@@ -1267,14 +1264,42 @@ def copy(self, order='C'):
cp.sortorder = self.sortorder
return cp

def _array_values(self):
# hack for various methods
return self.values

@property
def dtype(self):
return np.dtype('O')

def __repr__(self):
output = 'MultiIndex\n%s'

options = np.get_printoptions()
np.set_printoptions(threshold=50)

if len(self) > 100:
values = np.concatenate([self[:50].values,
self[-50:].values])
else:
values = self.values
summary = np.array2string(values, max_line_width=70)

np.set_printoptions(threshold=options['threshold'])

return output % summary

def __len__(self):
return len(self.labels[0])

@property
def _constructor(self):
return MultiIndex.from_tuples

@cache_readonly
def inferred_type(self):
return 'mixed'

@staticmethod
def _from_elements(values, labels=None, levels=None, names=None,
sortorder=None):
@@ -1302,21 +1327,35 @@ def _get_level_number(self, level):
% (self.nlevels, level))
return level

_tuples = None

@property
def values(self):
if self._is_legacy_format:
# for legacy MultiIndex
values = [ndtake(np.asarray(lev), lab)
for lev, lab in zip(self.levels, self.labels)]
return lib.fast_zip(values)
else:
if self._is_v2:
return self.view(np.ndarray)
else:
if self._tuples is not None:
return self._tuples

values = [ndtake(lev.values, lab)
for lev, lab in zip(self.levels, self.labels)]

# Need to box timestamps, etc.
values = _clean_arrays(values)
self._tuples = lib.fast_zip(values)
return self._tuples

# fml
@property
def _is_legacy_format(self):
def _is_v1(self):
contents = self.view(np.ndarray)
return len(contents) > 0 and not isinstance(contents[0], tuple)

@property
def _is_v2(self):
contents = self.view(np.ndarray)
return len(contents) > 0 and isinstance(contents[0], tuple)

@property
def _has_complex_internals(self):
# to disable groupby tricks
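This hunk distinguishes three pickle generations: v1 (the legacy format, non-tuple contents in the buffer), v2 (0.7.x, tuples stored in the ndarray buffer itself), and v3 (this commit, an empty buffer plus the lazy _tuples cache shown in the values property). Version sniffing reduces to inspecting the raw buffer, roughly:

    def sniff_mi_version(contents):
        # contents: the raw ndarray buffer of an unpickled MultiIndex
        if len(contents) == 0:
            return 'v3'                     # nothing stored inline
        if isinstance(contents[0], tuple):
            return 'v2'                     # _is_v2 above
        return 'v1'                         # _is_v1 above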
@@ -1458,7 +1497,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
-------
index : MultiIndex
"""
from pandas.core.categorical import Factor
from pandas.core.categorical import Categorical

if len(arrays) == 1:
name = None if names is None else names[0]
@@ -1467,7 +1506,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
levels = []
labels = []
for arr in arrays:
factor = Factor.from_array(arr)
factor = Categorical.from_array(arr)
levels.append(factor.levels)
labels.append(factor.labels)
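Factor became Categorical around 0.8; from_array factorizes one column into unique levels plus integer labels, which is exactly the per-level decomposition a MultiIndex stores. np.unique with return_inverse is a reasonable stand-in for the same decomposition:

    import numpy as np

    def factorize_column(arr):
        # levels: sorted uniques; labels: integer code of each element
        levels, labels = np.unique(np.asarray(arr), return_inverse=True)
        return levels, labels

    levels, labels = factorize_column(['b', 'a', 'b', 'c'])
    # levels -> array(['a', 'b', 'c'], ...); labels -> array([1, 0, 1, 2])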

@@ -1539,7 +1578,6 @@ def __setstate__(self, state):
self.sortorder = sortorder

def __getitem__(self, key):
arr_idx = self.view(np.ndarray)
if np.isscalar(key):
return tuple(lev[lab[key]]
for lev, lab in zip(self.levels, self.labels))
@@ -1551,11 +1589,10 @@ def __getitem__(self, key):
# cannot be sure whether the result will be sorted
sortorder = None

new_tuples = arr_idx[key]
result = np.empty(0, dtype=object).view(type(self))
new_labels = [lab[key] for lab in self.labels]

# an optimization
result = new_tuples.view(MultiIndex)
result.levels = list(self.levels)
result.labels = new_labels
result.sortorder = sortorder
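Indexing now works purely on labels: the key slices each level's label array, the level arrays are shared unchanged, and no tuple array is consulted (the old arr_idx view is gone). Reusing the MiniMultiIndex sketch from earlier:

    import numpy as np

    def getitem_sketch(mi, key):
        if np.isscalar(key):
            # a single position materializes one tuple on the fly
            return tuple(lev[lab[key]]
                         for lev, lab in zip(mi.levels, mi.labels))
        new_labels = [lab[key] for lab in mi.labels]
        return MiniMultiIndex(list(mi.levels), new_labels)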
@@ -1759,11 +1796,8 @@ def sortlevel(self, level=0, ascending=True):
indexer = com._ensure_platform_int(indexer)
new_labels = [lab.take(indexer) for lab in self.labels]

new_index = MultiIndex._from_elements(self.values.take(indexer),
labels=new_labels,
levels=self.levels,
names=self.names,
sortorder=level)
new_index = MultiIndex(labels=new_labels, levels=self.levels,
names=self.names, sortorder=level)

return new_index, indexer

@@ -1800,15 +1834,13 @@ def get_indexer(self, target, method=None, limit=None):
target = _ensure_index(target)

target_index = target
if isinstance(target, MultiIndex) and target._is_legacy_format:
if isinstance(target, MultiIndex):
target_index = target.get_tuple_index()

if target_index.dtype != object:
return np.ones(len(target_index)) * -1

self_index = self
if self._is_legacy_format:
self_index = self.get_tuple_index()
self_index = self.get_tuple_index()

if method == 'pad':
assert(self.is_unique and self.is_monotonic)
3 changes: 1 addition & 2 deletions pandas/core/internals.py
@@ -863,8 +863,7 @@ def delete(self, item):
i, _ = self._find_block(item)
loc = self.items.get_loc(item)

new_items = self.items._constructor(
np.delete(np.asarray(self.items), loc))
new_items = self.items.delete(loc)

self._delete_from_block(i, item)
self.set_items_norename(new_items)
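Index.delete keeps the operation inside the index instead of round-tripping through np.delete and _constructor, which for the new MultiIndex would mean materializing tuples just to rebuild from them. The method is positional and returns a new Index:

    import pandas as pd

    items = pd.Index(['a', 'b', 'c'])
    items.delete(1)  # Index(['a', 'c'], dtype='object')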
4 changes: 2 additions & 2 deletions pandas/core/series.py
@@ -312,7 +312,7 @@ def __new__(cls, data=None, index=None, dtype=None, name=None,
elif isinstance(index, PeriodIndex):
data = [data.get(i, nan) for i in index]
else:
data = lib.fast_multiget(data, index, default=np.nan)
data = lib.fast_multiget(data, index.values, default=np.nan)
except TypeError:
data = [data.get(i, nan) for i in index]

@@ -763,7 +763,7 @@ def __repr__(self):
width, height = get_terminal_size()
max_rows = (height if fmt.print_config.max_rows == 0
else fmt.print_config.max_rows)
if len(self.index) > max_rows:
if len(self.index) > (max_rows or 1000):
result = self._tidy_repr(min(30, max_rows - 4))
elif len(self.index) > 0:
result = self._get_repr(print_header=True,
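The (max_rows or 1000) guard appears to cover print_config.max_rows being None (only 0 is remapped to the terminal height on the line above): `or` substitutes a fallback threshold rather than comparing a length against None. The idiom in isolation:

    max_rows = None                  # option unset
    threshold = max_rows or 1000     # 1000 whenever max_rows is None or 0
    # len(index) > threshold is now a well-defined comparison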
12 changes: 6 additions & 6 deletions pandas/src/engines.pyx
@@ -52,18 +52,18 @@ cdef int _SIZE_CUTOFF = 1000000
cdef class IndexEngine:

cdef readonly:
object index_weakref
object vgetter
HashTable mapping
bint over_size_threshold

cdef:
bint unique, monotonic
bint initialized, monotonic_check, unique_check

def __init__(self, index_weakref):
self.index_weakref = index_weakref
def __init__(self, vgetter, n):
self.vgetter = vgetter

self.over_size_threshold = len(index_weakref()) >= _SIZE_CUTOFF
self.over_size_threshold = n >= _SIZE_CUTOFF

self.initialized = 0
self.monotonic_check = 0
@@ -206,7 +206,7 @@ cdef class IndexEngine:
self.monotonic_check = 1

cdef _get_index_values(self):
return self.index_weakref().values
return self.vgetter()

cdef inline _do_unique_check(self):
self._ensure_mapping_populated()
@@ -370,7 +370,7 @@ cdef class DatetimeEngine(Int64Engine):
return _to_i8(val) in self.mapping

cdef _get_index_values(self):
return self.index_weakref().values.view('i8')
return self.vgetter().view('i8')

def _call_monotonic(self, values):
return _algos.is_monotonic_int64(values)
3 changes: 3 additions & 0 deletions pandas/src/reduce.pyx
@@ -358,5 +358,8 @@ cdef class Slider:
self.buf.data = self.orig_data

def reduce(arr, f, axis=0, dummy=None, labels=None):
if labels._has_complex_internals:
raise Exception('Cannot use shortcut')

reducer = Reducer(arr, f, axis=axis, dummy=dummy, labels=labels)
return reducer.get_result()
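The new guard makes the Cython reducer refuse any labels object with complex internals (the MultiIndex above advertises _has_complex_internals = True); the caller in frame.py, in the _apply_standard hunk earlier, catches the exception and falls back to the plain Python path. The control flow, sketched with hypothetical fast/slow paths:

    def reduce_with_fallback(arr, f, labels, fast_path, slow_path):
        try:
            if getattr(labels, '_has_complex_internals', False):
                raise Exception('Cannot use shortcut')
            return fast_path(arr, f, labels)   # Cython reducer
        except Exception:
            return slow_path(arr, f)           # plain Python apply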
25 changes: 23 additions & 2 deletions pandas/tests/test_index.py
@@ -5,6 +5,7 @@
import pickle
import unittest
import nose
import os

import numpy as np
from numpy.testing import assert_array_equal
@@ -13,6 +14,7 @@
from pandas.core.index import Index, Int64Index, MultiIndex
from pandas.util.testing import assert_almost_equal
from pandas.util import py3compat
import pandas.core.common as com

import pandas.util.testing as tm

@@ -895,15 +897,34 @@ def test_legacy_pickle(self):
if py3compat.PY3:
raise nose.SkipTest

import os
def curpath():
pth, _ = os.path.split(os.path.abspath(__file__))
return pth

ppath = os.path.join(curpath(), 'data/multiindex_v1.pickle')
obj = pickle.load(open(ppath, 'r'))

self.assert_(obj._is_legacy_format)
self.assert_(obj._is_v1)

obj2 = MultiIndex.from_tuples(obj.values)
self.assert_(obj.equals(obj2))

res = obj.get_indexer(obj)
exp = np.arange(len(obj))
assert_almost_equal(res, exp)

res = obj.get_indexer(obj2[::-1])
exp = obj.get_indexer(obj[::-1])
exp2 = obj2.get_indexer(obj2[::-1])
assert_almost_equal(res, exp)
assert_almost_equal(exp, exp2)

def test_legacy_v2_unpickle(self):
# 0.7.3 -> 0.8.0 format manage
pth, _ = os.path.split(os.path.abspath(__file__))
filepath = os.path.join(pth, 'data', 'mindex_073.pickle')

obj = com.load(filepath)

obj2 = MultiIndex.from_tuples(obj.values)
self.assert_(obj.equals(obj2))