diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py
index 6330a2b36c516..41208125e8f32 100644
--- a/asv_bench/benchmarks/replace.py
+++ b/asv_bench/benchmarks/replace.py
@@ -44,15 +44,15 @@ class Convert(object):
goal_time = 0.5
params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta'])
- param_names = ['contructor', 'replace_data']
+ param_names = ['constructor', 'replace_data']
- def setup(self, contructor, replace_data):
+ def setup(self, constructor, replace_data):
N = 10**3
data = {'Series': pd.Series(np.random.randint(N, size=N)),
'DataFrame': pd.DataFrame({'A': np.random.randint(N, size=N),
'B': np.random.randint(N, size=N)})}
self.to_replace = {i: getattr(pd, replace_data) for i in range(N)}
- self.data = data[contructor]
+ self.data = data[constructor]
- def time_replace(self, contructor, replace_data):
+ def time_replace(self, constructor, replace_data):
self.data.replace(self.to_replace)
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
index 45142c53dcd01..59cf7d090a622 100644
--- a/asv_bench/benchmarks/rolling.py
+++ b/asv_bench/benchmarks/rolling.py
@@ -12,14 +12,14 @@ class Methods(object):
['int', 'float'],
['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
'sum', 'corr', 'cov'])
- param_names = ['contructor', 'window', 'dtype', 'method']
+ param_names = ['constructor', 'window', 'dtype', 'method']
- def setup(self, contructor, window, dtype, method):
+ def setup(self, constructor, window, dtype, method):
N = 10**5
arr = np.random.random(N).astype(dtype)
- self.roll = getattr(pd, contructor)(arr).rolling(window)
+ self.roll = getattr(pd, constructor)(arr).rolling(window)
- def time_rolling(self, contructor, window, dtype, method):
+ def time_rolling(self, constructor, window, dtype, method):
getattr(self.roll, method)()
@@ -30,12 +30,12 @@ class Quantile(object):
[10, 1000],
['int', 'float'],
[0, 0.5, 1])
- param_names = ['contructor', 'window', 'dtype', 'percentile']
+ param_names = ['constructor', 'window', 'dtype', 'percentile']
- def setup(self, contructor, window, dtype, percentile):
+ def setup(self, constructor, window, dtype, percentile):
N = 10**5
arr = np.random.random(N).astype(dtype)
- self.roll = getattr(pd, contructor)(arr).rolling(window)
+ self.roll = getattr(pd, constructor)(arr).rolling(window)
- def time_quantile(self, contructor, window, dtype, percentile):
+ def time_quantile(self, constructor, window, dtype, percentile):
self.roll.quantile(percentile)
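
For reference (not part of the patch): asv passes every combination of ``params`` positionally, in ``param_names`` order, to ``setup`` and to each ``time_*`` method, which is why the misspelled ``contructor`` argument has to be renamed consistently in all three places. A minimal sketch of that expansion:

import itertools

# asv builds one benchmark invocation per combination of `params`,
# binding the values to arguments named by `param_names`:
params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta'])
param_names = ['constructor', 'replace_data']

for combo in itertools.product(*params):
    print(dict(zip(param_names, combo)))  # one invocation each
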
diff --git a/doc/source/api.rst b/doc/source/api.rst
index ddd09327935ce..44f87aa3e1cec 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -2500,7 +2500,7 @@ Scalar introspection
Extensions
----------
-These are primarily intented for library authors looking to extend pandas
+These are primarily intended for library authors looking to extend pandas
objects.
.. currentmodule:: pandas
diff --git a/doc/source/io.rst b/doc/source/io.rst
index ae04996b4fddf..60dc89f8fd495 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2675,7 +2675,7 @@ file, and the ``sheet_name`` indicating which sheet to parse.
+++++++++++++++++++
To facilitate working with multiple sheets from the same file, the ``ExcelFile``
-class can be used to wrap the file and can be be passed into ``read_excel``
+class can be used to wrap the file and can be passed into ``read_excel``.
There will be a performance benefit for reading multiple sheets as the file is
read into memory only once.
@@ -4537,7 +4537,7 @@ See the documentation for `pyarrow `__ and
.. note::
These engines are very similar and should read/write nearly identical parquet format files.
- Currently ``pyarrow`` does not support timedelta data, and ``fastparquet`` does not support timezone aware datetimes (they are coerced to UTC).
+ Currently ``pyarrow`` does not support timedelta data, while ``fastparquet>=0.1.4`` supports timezone aware datetimes.
These libraries differ by having different underlying dependencies (``fastparquet`` by using ``numba``, while ``pyarrow`` uses a c-library).
.. ipython:: python
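
A quick illustration of the updated note (a sketch, assuming ``fastparquet>=0.1.4`` is installed; the file name is arbitrary):

import pandas as pd

# With fastparquet>=0.1.4, timezone-aware datetimes round-trip intact
# instead of being coerced to UTC-naive values:
df = pd.DataFrame({'a': pd.date_range('20130101', periods=3,
                                      tz='US/Eastern')})
df.to_parquet('example.parquet', engine='fastparquet')
print(pd.read_parquet('example.parquet', engine='fastparquet').dtypes)
# a    datetime64[ns, US/Eastern]
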
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index edde49a1d144f..a08199438a8d7 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -431,6 +431,7 @@ Performance Improvements
- Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`)
- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
+- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)
.. _whatsnew_0230.docs:
@@ -528,7 +529,11 @@ MultiIndex
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
- Bug in :func:`MultiIndex.from_tuples`` which would fail to take zipped tuples in python3 (:issue:`18434`)
--
+- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`)
+- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`)
+- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing ``NaN`` (:issue:`18485`)
+- Bug in :func:`MultiIndex.get_loc` in a large :class:`MultiIndex`, which would fail when levels had different dtypes (:issue:`18520`)
+
I/O
^^^
diff --git a/doc/sphinxext/numpydoc/tests/test_docscrape.py b/doc/sphinxext/numpydoc/tests/test_docscrape.py
index b682504e1618f..b412124d774bb 100755
--- a/doc/sphinxext/numpydoc/tests/test_docscrape.py
+++ b/doc/sphinxext/numpydoc/tests/test_docscrape.py
@@ -42,7 +42,7 @@
-------
out : ndarray
The drawn samples, arranged according to `shape`. If the
- shape given is (m,n,...), then the shape of `out` is is
+ shape given is (m,n,...), then the shape of `out` is
(m,n,...,N).
In other words, each entry ``out[i,j,...,:]`` is an N-dimensional
@@ -222,7 +222,7 @@ def test_str():
-------
out : ndarray
The drawn samples, arranged according to `shape`. If the
- shape given is (m,n,...), then the shape of `out` is is
+ shape given is (m,n,...), then the shape of `out` is
(m,n,...,N).
In other words, each entry ``out[i,j,...,:]`` is an N-dimensional
@@ -340,7 +340,7 @@ def test_sphinx_str():
**out** : ndarray
The drawn samples, arranged according to `shape`. If the
- shape given is (m,n,...), then the shape of `out` is is
+ shape given is (m,n,...), then the shape of `out` is
(m,n,...,N).
In other words, each entry ``out[i,j,...,:]`` is an N-dimensional
diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd
index 014da22df3382..d735b3c0673b2 100644
--- a/pandas/_libs/hashtable.pxd
+++ b/pandas/_libs/hashtable.pxd
@@ -31,15 +31,6 @@ cdef class PyObjectHashTable(HashTable):
cpdef get_item(self, object val)
cpdef set_item(self, object key, Py_ssize_t val)
-cdef class MultiIndexHashTable(HashTable):
- cdef:
- kh_uint64_t *table
- object mi
-
- cpdef get_item(self, object val)
- cpdef set_item(self, object key, Py_ssize_t val)
- cdef inline void _check_for_collision(self, Py_ssize_t loc, object label)
-
cdef class StringHashTable(HashTable):
cdef kh_str_t *table
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index bd9dd1f9bae37..bca4e388f3279 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -899,139 +899,3 @@ cdef class PyObjectHashTable(HashTable):
count += 1
return np.asarray(labels)
-
-
-cdef class MultiIndexHashTable(HashTable):
-
- def __init__(self, size_hint=1):
- self.table = kh_init_uint64()
- self.mi = None
- kh_resize_uint64(self.table, size_hint)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_uint64(self.table)
- self.table = NULL
-
- def __len__(self):
- return self.table.size
-
- def sizeof(self, deep=False):
- """ return the size of my table in bytes """
- return self.table.n_buckets * (sizeof(uint64_t) + # keys
- sizeof(size_t) + # vals
- sizeof(uint32_t)) # flags
-
- def _check_for_collisions(self, int64_t[:] locs, object mi):
- # validate that the locs map to the actual values
- # provided in the mi
- # we can only check if we *don't* have any missing values
- # :<
- cdef:
- ndarray[int64_t] alocs
-
- alocs = np.asarray(locs)
- if (alocs != -1).all():
-
- result = self.mi.take(locs)
- if isinstance(mi, tuple):
- from pandas import Index
- mi = Index([mi])
- if not result.equals(mi):
- raise AssertionError(
- "hash collision\nlocs:\n{}\n"
- "result:\n{}\nmi:\n{}".format(alocs, result, mi))
-
- cdef inline void _check_for_collision(self, Py_ssize_t loc, object label):
- # validate that the loc maps to the actual value
- # version of _check_for_collisions above for single label (tuple)
-
- result = self.mi[loc]
-
- if not all(l == r or (is_null_datetimelike(l)
- and is_null_datetimelike(r))
- for l, r in zip(result, label)):
- raise AssertionError(
- "hash collision\nloc:\n{}\n"
- "result:\n{}\nmi:\n{}".format(loc, result, label))
-
- def __contains__(self, object key):
- try:
- self.get_item(key)
- return True
- except (KeyError, ValueError, TypeError):
- return False
-
- cpdef get_item(self, object key):
- cdef:
- khiter_t k
- uint64_t value
- int64_t[:] locs
- Py_ssize_t loc
-
- value = self.mi._hashed_indexing_key(key)
- k = kh_get_uint64(self.table, value)
- if k != self.table.n_buckets:
- loc = self.table.vals[k]
- self._check_for_collision(loc, key)
- return loc
- else:
- raise KeyError(key)
-
- cpdef set_item(self, object key, Py_ssize_t val):
- raise NotImplementedError
-
- @cython.boundscheck(False)
- def map_locations(self, object mi):
- cdef:
- Py_ssize_t i, n
- ndarray[uint64_t] values
- uint64_t val
- int ret = 0
- khiter_t k
-
- self.mi = mi
- n = len(mi)
- values = mi._hashed_values
-
- with nogil:
- for i in range(n):
- val = values[i]
- k = kh_put_uint64(self.table, val, &ret)
- self.table.vals[k] = i
-
- @cython.boundscheck(False)
- def lookup(self, object mi):
- # look up with a target mi
- cdef:
- Py_ssize_t i, n
- ndarray[uint64_t] values
- int ret = 0
- uint64_t val
- khiter_t k
- int64_t[:] locs
-
- n = len(mi)
- values = mi._hashed_values
-
- locs = np.empty(n, dtype=np.int64)
-
- with nogil:
- for i in range(n):
- val = values[i]
- k = kh_get_uint64(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- self._check_for_collisions(locs, mi)
- return np.asarray(locs)
-
- def unique(self, object mi):
- raise NotImplementedError
-
- def get_labels(self, object mi, ObjectVector uniques,
- Py_ssize_t count_prior, int64_t na_sentinel,
- bint check_null=True):
- raise NotImplementedError
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index bfea4ff9915ac..6b23e487aad3a 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -26,11 +26,12 @@ from hashtable cimport HashTable
from pandas._libs import algos, hashtable as _hash
from pandas._libs.tslibs import period as periodlib
from pandas._libs.tslib import Timestamp, Timedelta
+from pandas._libs.missing import checknull
cdef int64_t iNaT = util.get_nat()
-cdef inline is_definitely_invalid_key(object val):
+cdef inline bint is_definitely_invalid_key(object val):
if PyTuple_Check(val):
try:
hash(val)
@@ -585,70 +586,137 @@ cpdef convert_scalar(ndarray arr, object value):
return value
-cdef class MultiIndexObjectEngine(ObjectEngine):
+cdef class BaseMultiIndexCodesEngine:
"""
- provide the same interface as the MultiIndexEngine
- but use the IndexEngine for computation
-
- This provides good performance with samller MI's
+ Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
+ represent each label in a MultiIndex as an integer, by juxtaposing the bits
+ encoding each level, with appropriate offsets.
+
+ For instance: if 3 levels have respectively 3, 6 and 1 possible values,
+ then their labels can be represented using respectively 2, 3 and 1 bits,
+ as follows:
+ _ _ _ _____ _ __ __ __
+ |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
+ — — — ————— — —— —— ——
+ |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
+ — — — ————— — —— —— ——
+ |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
+ ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
+ and the resulting unsigned integer representation will be:
+ _ _ _ _____ _ __ __ __ __ __ __
+ |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
+ ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾
+
+ Offsets are calculated at initialization, labels are transformed by method
+ _codes_to_ints.
+
+ Keys are located by first locating each component against the respective
+ level, then locating (the integer representation of) codes.
"""
- def get_indexer(self, values):
- # convert a MI to an ndarray
- if hasattr(values, 'values'):
- values = values.values
- return super(MultiIndexObjectEngine, self).get_indexer(values)
+ def __init__(self, object levels, object labels,
+ ndarray[uint64_t, ndim=1] offsets):
+ """
+ Parameters
+ ----------
+ levels : list-like of numpy arrays
+ Levels of the MultiIndex
+ labels : list-like of numpy arrays of integer dtype
+ Labels of the MultiIndex
+ offsets : numpy array of uint64 dtype
+ Pre-calculated offsets, one for each level of the index
+ """
- cpdef get_loc(self, object val):
+ self.levels = levels
+ self.offsets = offsets
- # convert a MI to an ndarray
- if hasattr(val, 'values'):
- val = val.values
- return super(MultiIndexObjectEngine, self).get_loc(val)
+ # Transform labels in a single array, and add 1 so that we are working
+ # with positive integers (-1 for NaN becomes 0):
+ codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
+ copy=False)
+ # Map each codes combination in the index to an integer unambiguously
+ # (no collisions possible), based on the "offsets", which describe the
+ # number of bits to switch labels for each level:
+ lab_ints = self._codes_to_ints(codes)
-cdef class MultiIndexHashEngine(ObjectEngine):
- """
- Use a hashing based MultiIndex impl
- but use the IndexEngine for computation
+ # Initialize underlying index (e.g. libindex.UInt64Engine) with
+ # integers representing labels: we will use its get_loc and get_indexer
+ self._base.__init__(self, lambda: lab_ints, len(lab_ints))
- This provides good performance with larger MI's
- """
+ def _extract_level_codes(self, object target, object method=None):
+ """
+ Map the requested list of (tuple) keys to their integer representations
+ for searching in the underlying integer index.
+
+ Parameters
+ ----------
+ target : list-like of keys
+ Each key is a tuple, with a label for each level of the index.
+
+ Returns
+ -------
+ int_keys : 1-dimensional array of dtype uint64 or object
+ Integers representing one combination each
+ """
- def _call_monotonic(self, object mi):
- # defer these back to the mi iteself
- return (mi.is_monotonic_increasing,
- mi.is_monotonic_decreasing,
- mi.is_unique)
+ level_codes = [lev.get_indexer(codes) + 1 for lev, codes
+ in zip(self.levels, zip(*target))]
+ return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)
+
+ def get_indexer(self, object target, object method=None,
+ object limit=None):
+ lab_ints = self._extract_level_codes(target)
+
+ # All methods (exact, backfill, pad) directly map to the respective
+ # methods of the underlying (integers) index...
+ if method is not None:
+ # but underlying backfill and pad methods require index and keys
+ # to be sorted. The index already is (checked in
+ # Index._get_fill_indexer), sort (integer representations of) keys:
+ order = np.argsort(lab_ints)
+ lab_ints = lab_ints[order]
+ indexer = (getattr(self._base, 'get_{}_indexer'.format(method))
+ (self, lab_ints, limit=limit))
+ indexer = indexer[order]
+ else:
+ indexer = self._base.get_indexer(self, lab_ints)
- def get_backfill_indexer(self, other, limit=None):
- # we coerce to ndarray-of-tuples
- values = np.array(self._get_index_values())
- return algos.backfill_object(values, other, limit=limit)
+ return indexer
- def get_pad_indexer(self, other, limit=None):
- # we coerce to ndarray-of-tuples
- values = np.array(self._get_index_values())
- return algos.pad_object(values, other, limit=limit)
+ def get_loc(self, object key):
+ if is_definitely_invalid_key(key):
+ raise TypeError("'{key}' is an invalid key".format(key=key))
+ if not PyTuple_Check(key):
+ raise KeyError(key)
+ try:
+ indices = [0 if checknull(v) else lev.get_loc(v) + 1
+ for lev, v in zip(self.levels, key)]
+ except KeyError:
+ raise KeyError(key)
- cpdef get_loc(self, object val):
- if is_definitely_invalid_key(val):
- raise TypeError("'{val}' is an invalid key".format(val=val))
+ # Transform indices into single integer:
+ lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))
- self._ensure_mapping_populated()
- if not self.unique:
- return self._get_loc_duplicates(val)
+ return self._base.get_loc(self, lab_int)
- try:
- return self.mapping.get_item(val)
- except TypeError:
- raise KeyError(val)
+ def get_indexer_non_unique(self, object target):
+ # This needs to be overridden just because the default one works on
+ # target._values, and target can itself be a MultiIndex.
- def get_indexer(self, values):
- self._ensure_mapping_populated()
- return self.mapping.lookup(values)
+ lab_ints = self._extract_level_codes(target)
+ indexer = self._base.get_indexer_non_unique(self, lab_ints)
+
+ return indexer
+
+ def __contains__(self, object val):
+ # Default __contains__ looks in the underlying mapping, which in this
+ # case only contains integer representations.
+ try:
+ self.get_loc(val)
+ return True
+ except (KeyError, TypeError, ValueError):
+ return False
- cdef _make_hash_table(self, n):
- return _hash.MultiIndexHashTable(n)
# Generated from template.
include "index_class_helper.pxi"
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index 1e6ea7794dfff..37693068e0974 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -897,7 +897,7 @@ class Timedelta(_Timedelta):
Represents a duration, the difference between two dates or times.
Timedelta is the pandas equivalent of python's ``datetime.timedelta``
- and is interchangable with it in most cases.
+ and is interchangeable with it in most cases.
Parameters
----------
diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx
index c22e0b8e555a3..215ae9ce087ee 100644
--- a/pandas/_libs/tslibs/timezones.pyx
+++ b/pandas/_libs/tslibs/timezones.pyx
@@ -295,7 +295,7 @@ cpdef bint tz_compare(object start, object end):
timezones. For example
`` and
`` are essentially same
- timezones but aren't evaluted such, but the string representation
+ timezones but aren't evaluated such, but the string representation
for both of these is `'Europe/Paris'`.
This exists only to add a notion of equality to pytz-style zones
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 7328cd336babf..788b236b0ec59 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4115,7 +4115,7 @@ def combine(self, other, func, fill_value=None, overwrite=True):
series[this_mask] = fill_value
otherSeries[other_mask] = fill_value
- # if we have different dtypes, possibily promote
+ # if we have different dtypes, possibly promote
new_dtype = this_dtype
if not is_dtype_equal(this_dtype, other_dtype):
new_dtype = find_common_type([this_dtype, other_dtype])
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index f43c6dc567f69..8e77c7a7fa48c 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -332,7 +332,7 @@ def freqstr(self):
@cache_readonly
def inferred_freq(self):
"""
- Trys to return a string representing a frequency guess,
+ Tries to return a string representing a frequency guess,
generated by infer_freq. Returns None if it can't autodetect the
frequency.
"""
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 797774832aaa5..510f7245cebd8 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -45,6 +45,87 @@
target_klass='MultiIndex or list of tuples'))
+class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine,
+ libindex.UInt64Engine):
+ """
+ This class manages a MultiIndex by mapping label combinations to positive
+ integers.
+ """
+ _base = libindex.UInt64Engine
+
+ def _codes_to_ints(self, codes):
+ """
+ Transform combination(s) of uint64 into one uint64 (each), in a strictly
+ monotonic way (i.e. respecting the lexicographic order of integer
+ combinations): see BaseMultiIndexCodesEngine documentation.
+
+ Parameters
+ ----------
+ codes : 1- or 2-dimensional array of dtype uint64
+ Combinations of integers (one per row)
+
+ Returns
+ -------
+ int_keys : scalar or 1-dimensional array, of dtype uint64
+ Integer(s) representing one combination (each)
+ """
+ # Shift the representation of each level by the pre-calculated number
+ # of bits:
+ codes <<= self.offsets
+
+ # Now sum and OR are in fact interchangeable. This is a simple
+ # composition of the (disjunct) significant bits of each level (i.e.
+ # each column in "codes") in a single positive integer:
+ if codes.ndim == 1:
+ # Single key
+ return np.bitwise_or.reduce(codes)
+
+ # Multiple keys
+ return np.bitwise_or.reduce(codes, axis=1)
+
+
+class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine,
+ libindex.ObjectEngine):
+ """
+ This class manages those (extreme) cases in which the number of possible
+ label combinations overflows the 64 bits integers, and uses an ObjectEngine
+ containing Python integers.
+ """
+ _base = libindex.ObjectEngine
+
+ def _codes_to_ints(self, codes):
+ """
+ Transform combination(s) of uint64 into one Python integer (each), in a
+ strictly monotonic way (i.e. respecting the lexicographic order of
+ integer combinations): see BaseMultiIndexCodesEngine documentation.
+
+ Parameters
+ ----------
+ codes : 1- or 2-dimensional array of dtype uint64
+ Combinations of integers (one per row)
+
+ Returns
+ -------
+ int_keys : int, or 1-dimensional array of dtype object
+ Integer(s) representing one combination (each)
+ """
+
+ # Shift the representation of each level by the pre-calculated number
+ # of bits. Since this can overflow uint64, first make sure we are
+ # working with Python integers:
+ codes = codes.astype('object') << self.offsets
+
+ # Now sum and OR are in fact interchangeable. This is a simple
+ # composition of the (disjunct) significant bits of each level (i.e.
+ # each column in "codes") in a single positive integer (per row):
+ if codes.ndim == 1:
+ # Single key
+ return np.bitwise_or.reduce(codes)
+
+ # Multiple keys
+ return np.bitwise_or.reduce(codes, axis=1)
+
+
class MultiIndex(Index):
"""
A multi-level, or hierarchical, index object for pandas objects
@@ -687,16 +768,25 @@ def _get_level_number(self, level):
@cache_readonly
def _engine(self):
-
- # choose our engine based on our size
- # the hashing based MultiIndex for larger
- # sizes, and the MultiIndexOjbect for smaller
- # xref: https://github.com/pandas-dev/pandas/pull/16324
- l = len(self)
- if l > 10000:
- return libindex.MultiIndexHashEngine(lambda: self, l)
-
- return libindex.MultiIndexObjectEngine(lambda: self.values, l)
+ # Calculate the number of bits needed to represent labels in each
+ # level, as log2 of their sizes (including -1 for NaN):
+ sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels]))
+
+ # Sum bit counts, starting from the _right_....
+ lev_bits = np.cumsum(sizes[::-1])[::-1]
+
+ # ... in order to obtain offsets such that sorting the combination of
+ # shifted codes (one for each level, resulting in a unique integer) is
+ # equivalent to sorting lexicographically the codes themselves. Notice
+ # that each level needs to be shifted by the number of bits needed to
+ # represent the _previous_ ones:
+ offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint64')
+
+ # Check the total number of bits needed for our representation:
+ if lev_bits[0] > 64:
+ # The levels would overflow a 64 bit uint - use Python integers:
+ return MultiIndexPyIntEngine(self.levels, self.labels, offsets)
+ return MultiIndexUIntEngine(self.levels, self.labels, offsets)
@property
def values(self):
@@ -1885,16 +1975,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
if tolerance is not None:
raise NotImplementedError("tolerance not implemented yet "
'for MultiIndex')
- indexer = self._get_fill_indexer(target, method, limit)
+ indexer = self._engine.get_indexer(target, method, limit)
elif method == 'nearest':
raise NotImplementedError("method='nearest' not implemented yet "
'for MultiIndex; see GitHub issue 9365')
else:
- # we may not compare equally because of hashing if we
- # don't have the same dtypes
- if self._inferred_type_levels != target._inferred_type_levels:
- return Index(self.values).get_indexer(target.values)
-
indexer = self._engine.get_indexer(target)
return _ensure_platform_int(indexer)
@@ -2131,17 +2216,6 @@ def _maybe_to_slice(loc):
''.format(keylen, self.nlevels))
if keylen == self.nlevels and self.is_unique:
-
- def _maybe_str_to_time_stamp(key, lev):
- if lev.is_all_dates and not isinstance(key, Timestamp):
- try:
- return Timestamp(key, tz=getattr(lev, 'tz', None))
- except Exception:
- pass
- return key
-
- key = com._values_from_object(key)
- key = tuple(map(_maybe_str_to_time_stamp, key, self.levels))
return self._engine.get_loc(key)
# -- partial selection or non-unique index
@@ -2274,34 +2348,9 @@ def partial_selection(key, indexer=None):
return indexer, maybe_droplevels(indexer, ilevels,
drop_level)
- if len(key) == self.nlevels:
-
- if self.is_unique:
-
- # here we have a completely specified key, but are
- # using some partial string matching here
- # GH4758
- all_dates = ((l.is_all_dates and
- not isinstance(k, compat.string_types))
- for k, l in zip(key, self.levels))
- can_index_exactly = any(all_dates)
- if (any(l.is_all_dates
- for k, l in zip(key, self.levels)) and
- not can_index_exactly):
- indexer = self.get_loc(key)
-
- # we have a multiple selection here
- if (not isinstance(indexer, slice) or
- indexer.stop - indexer.start != 1):
- return partial_selection(key, indexer)
-
- key = tuple(self[indexer].tolist()[0])
-
- return (self._engine.get_loc(
- com._values_from_object(key)), None)
-
- else:
- return partial_selection(key)
+ if len(key) == self.nlevels and self.is_unique:
+ # Complete key in unique index -> standard get_loc
+ return (self._engine.get_loc(key), None)
else:
return partial_selection(key)
else:
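
The user-visible effect of the new engine, as described by the whatsnew entries (a sketch, not part of the patch):

import numpy as np
import pandas as pd

# GH 18818 / GH 15994: int keys now match float levels (and vice-versa):
mi = pd.MultiIndex.from_product([[0.0, 1.0], ['c', 'd']])
assert mi.get_loc((1, 'd')) == 3

# GH 18485: keys containing NaN can now be located:
mi = pd.MultiIndex.from_product([['a', 'b'], [np.nan, 'd']])
assert mi.get_loc(('b', np.nan)) == 2
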
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index c2d3d0852384c..ec884035fe0c4 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -224,12 +224,17 @@ def make_block_scalar(self, values):
"""
return ScalarBlock(values)
- def make_block_same_class(self, values, placement=None, ndim=None):
+ def make_block_same_class(self, values, placement=None, ndim=None,
+ dtype=None):
""" Wrap given values in a block of same type as self. """
+ if dtype is not None:
+ # issue 19431 fastparquet is passing this
+ warnings.warn("dtype argument is deprecated, will be removed "
+ "in a future release.", FutureWarning)
if placement is None:
placement = self.mgr_locs
return make_block(values, placement=placement, ndim=ndim,
- klass=self.__class__)
+ klass=self.__class__, dtype=dtype)
def __unicode__(self):
diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py
index 99bf0d5b7ac51..91dc44e3f185e 100644
--- a/pandas/core/sparse/frame.py
+++ b/pandas/core/sparse/frame.py
@@ -120,7 +120,7 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
if dtype is not None:
mgr = mgr.astype(dtype)
else:
- msg = ('SparseDataFrame called with unkown type "{data_type}" '
+ msg = ('SparseDataFrame called with unknown type "{data_type}" '
'for data argument')
raise TypeError(msg.format(data_type=type(data).__name__))
diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py
index 4e207f9d1838c..1c23527cf57c4 100644
--- a/pandas/core/sparse/series.py
+++ b/pandas/core/sparse/series.py
@@ -493,7 +493,7 @@ def _set_value(self, label, value, takeable=False):
values = self.to_dense()
# if the label doesn't exist, we will create a new object here
- # and possibily change the index
+ # and possibly change the index
new_values = values._set_value(label, value, takeable=takeable)
if new_values is not None:
values = new_values
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 5c31b9a5668ff..12c7feb5f2b15 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -1395,7 +1395,7 @@ def _validate(data):
elif isinstance(data, Index):
# can't use ABCIndex to exclude non-str
- # see scc/inferrence.pyx which can contain string values
+ # see src/inference.pyx which can contain string values
allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer')
if data.inferred_type not in allowed_types:
message = ("Can only use .str accessor with string values "
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 0c82773b75c28..7edb5b16ce77a 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -210,7 +210,7 @@ def _hash_categorical(c, encoding, hash_key):
# we have uint64, as we don't directly support missing values
# we don't want to use take_nd which will coerce to float
- # instead, directly construt the result with a
+ # instead, directly construct the result with a
# max(np.uint64) as the missing value indicator
#
# TODO: GH 15362
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index 2293032ebb8a1..bca0b64cb53fe 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -1961,7 +1961,7 @@ def formatter(value):
def get_result_as_array(self):
"""
Returns the float values converted into strings using
- the parameters given at initalisation, as a numpy array
+ the parameters given at initialisation, as a numpy array
"""
if self.formatter is not None:
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 106823199ee93..5376473f83f22 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -3763,7 +3763,7 @@ def write(self, **kwargs):
class LegacyTable(Table):
""" an appendable table: allow append/query/delete operations to a
- (possibily) already existing appendable table this table ALLOWS
+ (possibly) already existing appendable table this table ALLOWS
append (but doesn't require them), and stores the data in a format
that can be easily searched
diff --git a/pandas/tests/categorical/test_constructors.py b/pandas/tests/categorical/test_constructors.py
index b29d75bed5c6f..6cc34770a65e0 100644
--- a/pandas/tests/categorical/test_constructors.py
+++ b/pandas/tests/categorical/test_constructors.py
@@ -382,7 +382,7 @@ def test_constructor_from_categorical_with_unknown_dtype(self):
ordered=True)
tm.assert_categorical_equal(result, expected)
- def test_contructor_from_categorical_string(self):
+ def test_constructor_from_categorical_string(self):
values = Categorical(['a', 'b', 'd'])
# use categories, ordered
result = Categorical(values, categories=['a', 'b', 'c'], ordered=True,
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 8b57e96e6fa06..b24ae22162a34 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -543,7 +543,7 @@ def test_nested_dict_frame_constructor(self):
tm.assert_frame_equal(result, df)
def _check_basic_constructor(self, empty):
- # mat: 2d matrix with shpae (3, 2) to input. empty - makes sized
+ # mat: 2d matrix with shape (3, 2) to input. empty - makes sized
# objects
mat = empty((2, 3), dtype=float)
# 2-D input
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
index 9664d73651185..aedc957ec67da 100644
--- a/pandas/tests/indexes/test_multi.py
+++ b/pandas/tests/indexes/test_multi.py
@@ -1258,6 +1258,17 @@ def test_get_loc_level(self):
assert result == expected
assert new_index.equals(index.droplevel(0))
+ @pytest.mark.parametrize('level', [0, 1])
+ @pytest.mark.parametrize('null_val', [np.nan, pd.NaT, None])
+ def test_get_loc_nan(self, level, null_val):
+ # GH 18485 : NaN in MultiIndex
+ levels = [['a', 'b'], ['c', 'd']]
+ key = ['b', 'd']
+ levels[level] = np.array([0, null_val], dtype=type(null_val))
+ key[level] = null_val
+ idx = MultiIndex.from_product(levels)
+ assert idx.get_loc(tuple(key)) == 3
+
def test_get_loc_missing_nan(self):
# GH 8569
idx = MultiIndex.from_arrays([[1.0, 2.0], [3.0, 4.0]])
@@ -1266,6 +1277,38 @@ def test_get_loc_missing_nan(self):
pytest.raises(KeyError, idx.get_loc, np.nan)
pytest.raises(KeyError, idx.get_loc, [np.nan])
+ @pytest.mark.parametrize('dtype1', [int, float, bool, str])
+ @pytest.mark.parametrize('dtype2', [int, float, bool, str])
+ def test_get_loc_multiple_dtypes(self, dtype1, dtype2):
+ # GH 18520
+ levels = [np.array([0, 1]).astype(dtype1),
+ np.array([0, 1]).astype(dtype2)]
+ idx = pd.MultiIndex.from_product(levels)
+ assert idx.get_loc(idx[2]) == 2
+
+ @pytest.mark.parametrize('level', [0, 1])
+ @pytest.mark.parametrize('dtypes', [[int, float], [float, int]])
+ def test_get_loc_implicit_cast(self, level, dtypes):
+ # GH 18818, GH 15994 : as flat index, cast int to float and vice-versa
+ levels = [['a', 'b'], ['c', 'd']]
+ key = ['b', 'd']
+ lev_dtype, key_dtype = dtypes
+ levels[level] = np.array([0, 1], dtype=lev_dtype)
+ key[level] = key_dtype(1)
+ idx = MultiIndex.from_product(levels)
+ assert idx.get_loc(tuple(key)) == 3
+
+ def test_get_loc_cast_bool(self):
+ # GH 19086 : int is cast to bool, but not vice-versa
+ levels = [[False, True], np.arange(2, dtype='int64')]
+ idx = MultiIndex.from_product(levels)
+
+ assert idx.get_loc((0, 1)) == 1
+ assert idx.get_loc((1, 0)) == 2
+
+ pytest.raises(KeyError, idx.get_loc, (False, True))
+ pytest.raises(KeyError, idx.get_loc, (True, False))
+
def test_slice_locs(self):
df = tm.makeTimeDataFrame()
stacked = df.stack()
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
index 57884e9816ed3..f17306b8b52f9 100644
--- a/pandas/tests/internals/test_internals.py
+++ b/pandas/tests/internals/test_internals.py
@@ -285,6 +285,13 @@ def test_delete(self):
with pytest.raises(Exception):
newb.delete(3)
+ def test_make_block_same_class(self):
+ # issue 19431
+ block = create_block('M8[ns, US/Eastern]', [3])
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ block.make_block_same_class(block.values, dtype=block.values.dtype)
+
class TestDatetimeBlock(object):
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py
index b277d8256e612..e0ce27de5c31f 100644
--- a/pandas/tests/io/formats/test_format.py
+++ b/pandas/tests/io/formats/test_format.py
@@ -2531,7 +2531,7 @@ def test_date_tz(self):
[datetime(2013, 1, 1), pd.NaT], utc=True).format()
assert formatted[0] == "2013-01-01 00:00:00+00:00"
- def test_date_explict_date_format(self):
+ def test_date_explicit_date_format(self):
formatted = pd.to_datetime([datetime(2003, 2, 1), pd.NaT]).format(
date_format="%m-%d-%Y", na_rep="UT")
assert formatted[0] == "02-01-2003"
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 6c172c80514e7..11cbea8ce6331 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -71,6 +71,15 @@ def fp():
return 'fastparquet'
+@pytest.fixture
+def fp_lt_014():
+ if not _HAVE_FASTPARQUET:
+ pytest.skip("fastparquet is not installed")
+ if LooseVersion(fastparquet.__version__) >= LooseVersion('0.1.4'):
+ pytest.skip("fastparquet is >= 0.1.4")
+ return 'fastparquet'
+
+
@pytest.fixture
def df_compat():
return pd.DataFrame({'A': [1, 2, 3], 'B': 'foo'})
@@ -435,8 +444,10 @@ def test_basic(self, fp, df_full):
df = df_full
# additional supported types for fastparquet
+ if LooseVersion(fastparquet.__version__) >= LooseVersion('0.1.4'):
+ df['datetime_tz'] = pd.date_range('20130101', periods=3,
+ tz='US/Eastern')
df['timedelta'] = pd.timedelta_range('1 day', periods=3)
-
check_round_trip(df, fp)
@pytest.mark.skip(reason="not supported")
@@ -468,14 +479,15 @@ def test_categorical(self, fp):
df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
check_round_trip(df, fp)
- def test_datetime_tz(self, fp):
- # doesn't preserve tz
+ def test_datetime_tz(self, fp_lt_014):
+
+ # fastparquet<0.1.4 doesn't preserve tz
df = pd.DataFrame({'a': pd.date_range('20130101', periods=3,
tz='US/Eastern')})
-
# warns on the coercion
with catch_warnings(record=True):
- check_round_trip(df, fp, expected=df.astype('datetime64[ns]'))
+ check_round_trip(df, fp_lt_014,
+ expected=df.astype('datetime64[ns]'))
def test_filter_row_groups(self, fp):
d = {'a': list(range(0, 3))}
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index f2b7c20b774b0..0e6e44e839464 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -43,7 +43,7 @@ def test_empty(self, method, unit, use_bottleneck):
result = getattr(s, method)()
assert result == unit
- # Explict
+ # Explicit
result = getattr(s, method)(min_count=0)
assert result == unit
diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py
index 7505e6b0cec3b..38e5753d1752d 100644
--- a/pandas/tests/series/test_operators.py
+++ b/pandas/tests/series/test_operators.py
@@ -1163,7 +1163,7 @@ def test_timedelta_floordiv(self, scalar_td):
('NCC1701D', 'NCC1701D', 'NCC1701D')])
def test_td64_series_with_tdi(self, names):
# GH#17250 make sure result dtype is correct
- # GH#19043 make sure names are propogated correctly
+ # GH#19043 make sure names are propagated correctly
tdi = pd.TimedeltaIndex(['0 days', '1 day'], name=names[0])
ser = Series([Timedelta(hours=3), Timedelta(hours=4)], name=names[1])
expected = Series([Timedelta(hours=3), Timedelta(days=1, hours=4)],
diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py
index 2b589ebd4735e..0b7948cc32d24 100644
--- a/pandas/tests/sparse/frame/test_frame.py
+++ b/pandas/tests/sparse/frame/test_frame.py
@@ -218,7 +218,7 @@ def test_constructor_from_unknown_type(self):
class Unknown:
pass
with pytest.raises(TypeError,
- message='SparseDataFrame called with unkown type '
+ message='SparseDataFrame called with unknown type '
'"Unknown" for data argument'):
SparseDataFrame(Unknown())
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index 424ba6aab9a56..65332ae7153e2 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -1590,6 +1590,38 @@ def test_unstack_group_index_overflow(self):
result = s.unstack(4)
assert result.shape == (500, 2)
+ def test_pyint_engine(self):
+ # GH 18519 : when combinations of codes cannot be represented in 64
+ # bits, the index underlying the MultiIndex engine works with Python
+ # integers, rather than uint64.
+ N = 5
+ keys = [tuple(l) for l in [[0] * 10 * N,
+ [1] * 10 * N,
+ [2] * 10 * N,
+ [np.nan] * N + [2] * 9 * N,
+ [0] * N + [2] * 9 * N,
+ [np.nan] * N + [2] * 8 * N + [0] * N]]
+ # Each level contains 4 elements (including NaN), so it is represented
+ # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a
+ # 64 bit engine and truncating the first levels, the fourth and fifth
+ # keys would collide; if truncating the last levels, the fifth and
+ # sixth; if rotating bits rather than shifting, the third and fifth.
+
+ for idx in range(len(keys)):
+ index = MultiIndex.from_tuples(keys)
+ assert index.get_loc(keys[idx]) == idx
+
+ expected = np.arange(idx + 1, dtype=np.intp)
+ result = index.get_indexer([keys[i] for i in expected])
+ tm.assert_numpy_array_equal(result, expected)
+
+ # With missing key:
+ idces = range(len(keys))
+ expected = np.array([-1] + list(idces), dtype='int64')
+ missing = tuple([0, 1] * 5 * N)
+ result = index.get_indexer([missing] + [keys[i] for i in idces])
+ tm.assert_numpy_array_equal(result, expected)
+
def test_getitem_lowerdim_corner(self):
pytest.raises(KeyError, self.frame.loc.__getitem__,
(('bar', 'three'), 'B'))
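
A back-of-the-envelope check of why ``test_pyint_engine`` needs the Python-integer fallback (a sketch; the numbers follow the comment in the test):

import numpy as np
import pandas as pd

# 10*N = 50 levels, each holding values {0, 1, 2} plus the NaN
# sentinel, need ceil(log2(4)) = 2 bits apiece: 100 bits in total,
# which overflows uint64 and triggers MultiIndexPyIntEngine.
N = 5
mi = pd.MultiIndex.from_tuples([tuple([0] * 10 * N),
                                tuple([1] * 10 * N),
                                tuple([2] * 10 * N)])
sizes = np.ceil(np.log2([len(lev) + 1 for lev in mi.levels]))
print(int(sizes.sum()))  # 100 > 64
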
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index 803cb62b70f55..94fb5555c0a56 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -2427,7 +2427,7 @@ class for all warnings. To check that no warning is returned,
into errors.
Valid values are:
- * "error" - turns matching warnings into exeptions
+ * "error" - turns matching warnings into exceptions
* "ignore" - discard the warning
* "always" - always emit a warning
* "default" - print the warning the first time it is generated