ENH: re #720, added alternative private constructor
adamklein committed Feb 1, 2012
1 parent b6ee864 commit ce3c4fa
Showing 4 changed files with 44 additions and 10 deletions.
12 changes: 9 additions & 3 deletions pandas/core/frame.py
@@ -4055,12 +4055,18 @@ def complete_dataframe(obj, prev_completions):
     except Exception:
         pass
 
-def _indexer_from_factorized(labels, shape):
+def _indexer_from_factorized(labels, shape, compress=True):
     from pandas.core.groupby import get_group_index, _compress_group_index
 
     group_index = get_group_index(labels, shape)
-    comp_ids, obs_ids = _compress_group_index(group_index)
-    max_group = len(obs_ids)
+
+    if compress:
+        comp_ids, obs_ids = _compress_group_index(group_index)
+        max_group = len(obs_ids)
+    else:
+        comp_ids = group_index
+        max_group = np.prod(shape)
+
     indexer, _ = lib.groupsort_indexer(comp_ids.astype('i4'), max_group)
 
     return indexer
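The new compress=False branch skips _compress_group_index when the caller knows the full cartesian space (np.prod(shape)) is small enough to counting-sort directly. A rough standalone NumPy sketch of the idea, not the pandas implementation, with invented label arrays:

import numpy as np

# codes for two levels and the number of possible labels per level
labels = [np.array([1, 0, 1, 0]),
          np.array([2, 2, 0, 1])]
shape = (2, 3)

# offsets into the cartesian product of both levels: label0 * 3 + label1
group_index = labels[0] * shape[1] + labels[1]

# a stable sort of those raw offsets is equivalent to lexsorting the labels
# with level 0 as the primary key, which is what the uncompressed path relies on
indexer = np.argsort(group_index, kind='mergesort')
assert (indexer == np.lexsort(labels[::-1])).all()
print(indexer)   # [3 1 2 0]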
21 changes: 18 additions & 3 deletions pandas/core/groupby.py
@@ -1324,8 +1324,9 @@ def _get_slice(slob):
 
 def get_group_index(label_list, shape):
     """
-    Gets the offsets into what would be the cartesian product of all
-    possible labels given the label_list.
+    For the particular label_list, gets the offsets into the hypothetical list
+    representing the totally ordered cartesian product of all possible label
+    combinations.
     """
     if len(label_list) == 1:
         return label_list[0]
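A small worked example of the offsets this docstring describes, written as a hedged sketch rather than the library routine (naive_group_index and the label arrays are invented for illustration): each row's labels act as digits of a mixed-radix number, so every possible label combination maps to a distinct integer.

import numpy as np

def naive_group_index(label_list, shape):
    # fold the per-level labels into one offset, least-significant level last
    group_index = np.zeros(len(label_list[0]), dtype='i8')
    stride = 1
    for labels, size in zip(reversed(label_list), reversed(shape)):
        group_index += labels * stride
        stride *= size
    return group_index

labels = [np.array([0, 1, 1]), np.array([2, 0, 2]), np.array([1, 1, 0])]
shape = (2, 3, 2)
print(naive_group_index(labels, shape))   # [ 5  7 10]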
@@ -1409,24 +1410,38 @@ def cython_aggregate(values, group_index, ngroups, how='add'):
 # sorting levels...cleverly?
 
 def _compress_group_index(group_index, sort=True):
+    """
+    Group_index is offsets into cartesian product of all possible labels. This
+    space can be huge, so this function compresses it, by computing offsets
+    (comp_ids) into the list of unique labels (obs_group_ids).
+    """
+
     uniques = []
     table = lib.Int64HashTable(len(group_index))
 
     group_index = _ensure_int64(group_index)
 
+    # note, group labels come out ascending (ie, 1,2,3 etc)
     comp_ids = table.get_labels_groupby(group_index, uniques)
 
-    # these are the ones we observed
+    # these are the unique ones we observed, in the order we observed them
    obs_group_ids = np.array(uniques, dtype='i8')
 
     if sort and len(obs_group_ids) > 0:
+        # sorter is index where elements ought to go
         sorter = obs_group_ids.argsort()
+
+        # reverse_indexer is where elements came from
         reverse_indexer = np.empty(len(sorter), dtype='i4')
         reverse_indexer.put(sorter, np.arange(len(sorter)))
 
         mask = comp_ids < 0
+
+        # move comp_ids to right locations (ie, unsort ascending labels)
         comp_ids = reverse_indexer.take(comp_ids)
         np.putmask(comp_ids, mask, -1)
+
+        # sort observed ids
         obs_group_ids = obs_group_ids.take(sorter)
 
     return comp_ids, obs_group_ids
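The compression step can be pictured with np.unique, which is a rough analogue (illustration only, with made-up offsets) of what _compress_group_index returns when sort=True: the sorted offsets that actually occur, plus each row's position within that short list.

import numpy as np

group_index = np.array([1000003, 17, 1000003, 52, 17], dtype='i8')

# obs_group_ids: the observed offsets in sorted order
# comp_ids: for each element, its index into obs_group_ids
obs_group_ids, comp_ids = np.unique(group_index, return_inverse=True)
print(obs_group_ids)   # [     17      52 1000003]
print(comp_ids)        # [2 0 2 1 0]

The real function differs in that it also preserves -1 sentinels for missing values and can return the observed ids in first-seen order when sort=False.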
19 changes: 16 additions & 3 deletions pandas/core/index.py
@@ -1095,6 +1095,16 @@ def copy(self, order='C'):
     def dtype(self):
         return np.dtype('O')
 
+    @staticmethod
+    def _from_elements(values, labels=None, levels=None, names=None,
+                       sortorder=None):
+        index = values.view(MultiIndex)
+        index.levels = levels
+        index.labels = labels
+        index.names = names
+        index.sortorder = sortorder
+        return index
+
     def _get_level_number(self, level):
         try:
             count = self.names.count(level)
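The private constructor above works because MultiIndex is an ndarray subclass, so an existing array of tuples can be re-viewed as a MultiIndex without copying, and the caller attaches levels, labels and names it has already validated. The same pattern in isolation, as a sketch on an invented TaggedArray class rather than pandas code:

import numpy as np

class TaggedArray(np.ndarray):
    # minimal ndarray subclass that can carry extra metadata, like Index does
    pass

def _from_elements(values, tag=None):
    arr = values.view(TaggedArray)   # reinterpret, no data copied
    arr.tag = tag                    # attach pre-validated metadata by hand
    return arr

a = _from_elements(np.arange(4), tag='precomputed')
print(type(a).__name__, a.tag)   # TaggedArray precomputed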
@@ -1527,14 +1537,17 @@ def sortlevel(self, level=0, ascending=True):
         level = self._get_level_number(level)
         primary = labels.pop(level)
         indexer = _indexer_from_factorized((primary,) + tuple(labels),
-                                           self.levshape)
+                                           self.levshape, compress=False)
         if not ascending:
             indexer = indexer[::-1]
 
         new_labels = [lab.take(indexer) for lab in self.labels]
 
-        new_index = MultiIndex(levels=self.levels, labels=new_labels,
-                               names=self.names, sortorder=level)
+        new_index = self._from_elements(self.values.take(indexer),
+                                        labels = new_labels,
+                                        levels = self.levels,
+                                        names = self.names,
+                                        sortorder = level)
 
         return new_index, indexer
 
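The rewritten path serves MultiIndex.sortlevel, which returns both the sorted index and the indexer used to reorder it. A hedged usage example (the data is invented; sortlevel is the method this commit touches):

import pandas as pd

index = pd.MultiIndex.from_tuples([('b', 2), ('a', 1), ('b', 1), ('a', 2)],
                                  names=['key', 'num'])
sorted_index, indexer = index.sortlevel(level=0)
print(indexer)        # positions taken from the original index, e.g. [1 3 0 2]
print(sorted_index)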
2 changes: 1 addition & 1 deletion vb_suite/suite.py
@@ -44,7 +44,7 @@
 """
 dependencies = ['pandas_vb_common.py']
 
-START_DATE = datetime(2010, 6, 1)
+START_DATE = datetime(2012, 1, 20)
 
 repo = GitRepo(REPO_PATH)
 
