diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 6330a2b36c516..41208125e8f32 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -44,15 +44,15 @@ class Convert(object): goal_time = 0.5 params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta']) - param_names = ['contructor', 'replace_data'] + param_names = ['constructor', 'replace_data'] - def setup(self, contructor, replace_data): + def setup(self, constructor, replace_data): N = 10**3 data = {'Series': pd.Series(np.random.randint(N, size=N)), 'DataFrame': pd.DataFrame({'A': np.random.randint(N, size=N), 'B': np.random.randint(N, size=N)})} self.to_replace = {i: getattr(pd, replace_data) for i in range(N)} - self.data = data[contructor] + self.data = data[constructor] - def time_replace(self, contructor, replace_data): + def time_replace(self, constructor, replace_data): self.data.replace(self.to_replace) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 45142c53dcd01..59cf7d090a622 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -12,14 +12,14 @@ class Methods(object): ['int', 'float'], ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', 'sum', 'corr', 'cov']) - param_names = ['contructor', 'window', 'dtype', 'method'] + param_names = ['constructor', 'window', 'dtype', 'method'] - def setup(self, contructor, window, dtype, method): + def setup(self, constructor, window, dtype, method): N = 10**5 arr = np.random.random(N).astype(dtype) - self.roll = getattr(pd, contructor)(arr).rolling(window) + self.roll = getattr(pd, constructor)(arr).rolling(window) - def time_rolling(self, contructor, window, dtype, method): + def time_rolling(self, constructor, window, dtype, method): getattr(self.roll, method)() @@ -30,12 +30,12 @@ class Quantile(object): [10, 1000], ['int', 'float'], [0, 0.5, 1]) - param_names = ['contructor', 'window', 'dtype', 'percentile'] + param_names = ['constructor', 'window', 'dtype', 'percentile'] - def setup(self, contructor, window, dtype, percentile): + def setup(self, constructor, window, dtype, percentile): N = 10**5 arr = np.random.random(N).astype(dtype) - self.roll = getattr(pd, contructor)(arr).rolling(window) + self.roll = getattr(pd, constructor)(arr).rolling(window) - def time_quantile(self, contructor, window, dtype, percentile): + def time_quantile(self, constructor, window, dtype, percentile): self.roll.quantile(percentile) diff --git a/doc/source/api.rst b/doc/source/api.rst index ddd09327935ce..44f87aa3e1cec 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -2500,7 +2500,7 @@ Scalar introspection Extensions ---------- -These are primarily intented for library authors looking to extend pandas +These are primarily intended for library authors looking to extend pandas objects. .. currentmodule:: pandas diff --git a/doc/source/io.rst b/doc/source/io.rst index ae04996b4fddf..60dc89f8fd495 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2675,7 +2675,7 @@ file, and the ``sheet_name`` indicating which sheet to parse. +++++++++++++++++++ To facilitate working with multiple sheets from the same file, the ``ExcelFile`` -class can be used to wrap the file and can be be passed into ``read_excel`` +class can be used to wrap the file and can be passed into ``read_excel`` There will be a performance benefit for reading multiple sheets as the file is read into memory only once. @@ -4537,7 +4537,7 @@ See the documentation for `pyarrow `__ and .. 
note:: These engines are very similar and should read/write nearly identical parquet format files. - Currently ``pyarrow`` does not support timedelta data, and ``fastparquet`` does not support timezone aware datetimes (they are coerced to UTC). + Currently ``pyarrow`` does not support timedelta data, while ``fastparquet>=0.1.4`` supports timezone aware datetimes. These libraries differ by having different underlying dependencies (``fastparquet`` by using ``numba``, while ``pyarrow`` uses a c-library). .. ipython:: python diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index edde49a1d144f..a08199438a8d7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -431,6 +431,7 @@ Performance Improvements - Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`) - :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`) - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) +- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`) .. _whatsnew_0230.docs: @@ -528,7 +529,11 @@ MultiIndex - Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`) - Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`) - Bug in :func:`MultiIndex.from_tuples`` which would fail to take zipped tuples in python3 (:issue:`18434`) -- +- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`) +- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`) +- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing ``NaN`` (:issue:`18485`) +- Bug in :func:`MultiIndex.get_loc` which would fail in a large :class:`MultiIndex` when levels had different dtypes (:issue:`18520`) + I/O ^^^ diff --git a/doc/sphinxext/numpydoc/tests/test_docscrape.py b/doc/sphinxext/numpydoc/tests/test_docscrape.py index b682504e1618f..b412124d774bb 100755 --- a/doc/sphinxext/numpydoc/tests/test_docscrape.py +++ b/doc/sphinxext/numpydoc/tests/test_docscrape.py @@ -42,7 +42,7 @@ ------- out : ndarray The drawn samples, arranged according to `shape`. If the - shape given is (m,n,...), then the shape of `out` is is + shape given is (m,n,...), then the shape of `out` is (m,n,...,N). In other words, each entry ``out[i,j,...,:]`` is an N-dimensional @@ -222,7 +222,7 @@ def test_str(): ------- out : ndarray The drawn samples, arranged according to `shape`. If the - shape given is (m,n,...), then the shape of `out` is is + shape given is (m,n,...), then the shape of `out` is (m,n,...,N). In other words, each entry ``out[i,j,...,:]`` is an N-dimensional @@ -340,7 +340,7 @@ def test_sphinx_str(): **out** : ndarray The drawn samples, arranged according to `shape`. If the - shape given is (m,n,...), then the shape of `out` is is + shape given is (m,n,...), then the shape of `out` is (m,n,...,N).
In other words, each entry ``out[i,j,...,:]`` is an N-dimensional diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 014da22df3382..d735b3c0673b2 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -31,15 +31,6 @@ cdef class PyObjectHashTable(HashTable): cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) -cdef class MultiIndexHashTable(HashTable): - cdef: - kh_uint64_t *table - object mi - - cpdef get_item(self, object val) - cpdef set_item(self, object key, Py_ssize_t val) - cdef inline void _check_for_collision(self, Py_ssize_t loc, object label) - cdef class StringHashTable(HashTable): cdef kh_str_t *table diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index bd9dd1f9bae37..bca4e388f3279 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -899,139 +899,3 @@ cdef class PyObjectHashTable(HashTable): count += 1 return np.asarray(labels) - - -cdef class MultiIndexHashTable(HashTable): - - def __init__(self, size_hint=1): - self.table = kh_init_uint64() - self.mi = None - kh_resize_uint64(self.table, size_hint) - - def __dealloc__(self): - if self.table is not NULL: - kh_destroy_uint64(self.table) - self.table = NULL - - def __len__(self): - return self.table.size - - def sizeof(self, deep=False): - """ return the size of my table in bytes """ - return self.table.n_buckets * (sizeof(uint64_t) + # keys - sizeof(size_t) + # vals - sizeof(uint32_t)) # flags - - def _check_for_collisions(self, int64_t[:] locs, object mi): - # validate that the locs map to the actual values - # provided in the mi - # we can only check if we *don't* have any missing values - # :< - cdef: - ndarray[int64_t] alocs - - alocs = np.asarray(locs) - if (alocs != -1).all(): - - result = self.mi.take(locs) - if isinstance(mi, tuple): - from pandas import Index - mi = Index([mi]) - if not result.equals(mi): - raise AssertionError( - "hash collision\nlocs:\n{}\n" - "result:\n{}\nmi:\n{}".format(alocs, result, mi)) - - cdef inline void _check_for_collision(self, Py_ssize_t loc, object label): - # validate that the loc maps to the actual value - # version of _check_for_collisions above for single label (tuple) - - result = self.mi[loc] - - if not all(l == r or (is_null_datetimelike(l) - and is_null_datetimelike(r)) - for l, r in zip(result, label)): - raise AssertionError( - "hash collision\nloc:\n{}\n" - "result:\n{}\nmi:\n{}".format(loc, result, label)) - - def __contains__(self, object key): - try: - self.get_item(key) - return True - except (KeyError, ValueError, TypeError): - return False - - cpdef get_item(self, object key): - cdef: - khiter_t k - uint64_t value - int64_t[:] locs - Py_ssize_t loc - - value = self.mi._hashed_indexing_key(key) - k = kh_get_uint64(self.table, value) - if k != self.table.n_buckets: - loc = self.table.vals[k] - self._check_for_collision(loc, key) - return loc - else: - raise KeyError(key) - - cpdef set_item(self, object key, Py_ssize_t val): - raise NotImplementedError - - @cython.boundscheck(False) - def map_locations(self, object mi): - cdef: - Py_ssize_t i, n - ndarray[uint64_t] values - uint64_t val - int ret = 0 - khiter_t k - - self.mi = mi - n = len(mi) - values = mi._hashed_values - - with nogil: - for i in range(n): - val = values[i] - k = kh_put_uint64(self.table, val, &ret) - self.table.vals[k] = i - - @cython.boundscheck(False) - def lookup(self, object mi): - # look up with a target mi - cdef: - 
Py_ssize_t i, n - ndarray[uint64_t] values - int ret = 0 - uint64_t val - khiter_t k - int64_t[:] locs - - n = len(mi) - values = mi._hashed_values - - locs = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_uint64(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - self._check_for_collisions(locs, mi) - return np.asarray(locs) - - def unique(self, object mi): - raise NotImplementedError - - def get_labels(self, object mi, ObjectVector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True): - raise NotImplementedError diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index bfea4ff9915ac..6b23e487aad3a 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -26,11 +26,12 @@ from hashtable cimport HashTable from pandas._libs import algos, hashtable as _hash from pandas._libs.tslibs import period as periodlib from pandas._libs.tslib import Timestamp, Timedelta +from pandas._libs.missing import checknull cdef int64_t iNaT = util.get_nat() -cdef inline is_definitely_invalid_key(object val): +cdef inline bint is_definitely_invalid_key(object val): if PyTuple_Check(val): try: hash(val) @@ -585,70 +586,137 @@ cpdef convert_scalar(ndarray arr, object value): return value -cdef class MultiIndexObjectEngine(ObjectEngine): +cdef class BaseMultiIndexCodesEngine: """ - provide the same interface as the MultiIndexEngine - but use the IndexEngine for computation - - This provides good performance with samller MI's + Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which + represent each label in a MultiIndex as an integer, by juxtaposing the bits + encoding each level, with appropriate offsets. + + For instance: if 3 levels have respectively 3, 6 and 1 possible values, + then their labels can be represented using respectively 2, 3 and 1 bits, + as follows: + _ _ _ _____ _ __ __ __ + |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level) + — — — ————— — —— —— —— + |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level) + — — — ————— — —— —— —— + |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels) + ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ + and the resulting unsigned integer representation will be: + _ _ _ _____ _ __ __ __ __ __ __ + |0|0|0| ... |0|c0|b2|b1|b0|a1|a0| + ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ + + Offsets are calculated at initialization, labels are transformed by method + _codes_to_ints. + + Keys are located by first locating each component against the respective + level, then locating (the integer representation of) codes. 
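To make the scheme concrete, here is a rough, self-contained NumPy sketch of the same encoding (the level sizes and keys are illustrative; no pandas internals are used). It mirrors the shift-and-OR composition implemented in ``_codes_to_ints`` and the offset computation added to ``MultiIndex._engine`` further down in this diff, where the first level receives the largest shift so that the integer order of the encoded keys matches their lexicographic order:

import numpy as np

# Bits per level, with one extra slot per level for NaN (code -1 becomes 0):
sizes = np.ceil(np.log2([3 + 1, 6 + 1, 1 + 1]))                 # [2., 3., 1.]
# Summing the bit counts from the right gives each level the shift needed
# to clear all levels after it:
lev_bits = np.cumsum(sizes[::-1])[::-1]                         # [6., 4., 1.]
offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint64')  # [4, 1, 0]

# Encode two keys, given as one row of level codes each (-1 meaning NaN):
labels = np.array([[0, 5, 0],
                   [2, -1, 0]])
codes = (labels + 1).astype('uint64')
ints = np.bitwise_or.reduce(codes << offsets, axis=1)
print(ints)  # [29 49]: distinct integers, ordered like the original keys

With enough levels, ``lev_bits[0]`` exceeds 64; this is exactly the check ``MultiIndex._engine`` uses below to fall back to Python integers.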
""" - def get_indexer(self, values): - # convert a MI to an ndarray - if hasattr(values, 'values'): - values = values.values - return super(MultiIndexObjectEngine, self).get_indexer(values) + def __init__(self, object levels, object labels, + ndarray[uint64_t, ndim=1] offsets): + """ + Parameters + ---------- + levels : list-like of numpy arrays + Levels of the MultiIndex + labels : list-like of numpy arrays of integer dtype + Labels of the MultiIndex + offsets : numpy array of uint64 dtype + Pre-calculated offsets, one for each level of the index + """ - cpdef get_loc(self, object val): + self.levels = levels + self.offsets = offsets - # convert a MI to an ndarray - if hasattr(val, 'values'): - val = val.values - return super(MultiIndexObjectEngine, self).get_loc(val) + # Transform labels in a single array, and add 1 so that we are working + # with positive integers (-1 for NaN becomes 0): + codes = (np.array(labels, dtype='int64').T + 1).astype('uint64', + copy=False) + # Map each codes combination in the index to an integer unambiguously + # (no collisions possible), based on the "offsets", which describe the + # number of bits to switch labels for each level: + lab_ints = self._codes_to_ints(codes) -cdef class MultiIndexHashEngine(ObjectEngine): - """ - Use a hashing based MultiIndex impl - but use the IndexEngine for computation + # Initialize underlying index (e.g. libindex.UInt64Engine) with + # integers representing labels: we will use its get_loc and get_indexer + self._base.__init__(self, lambda: lab_ints, len(lab_ints)) - This provides good performance with larger MI's - """ + def _extract_level_codes(self, object target, object method=None): + """ + Map the requested list of (tuple) keys to their integer representations + for searching in the underlying integer index. + + Parameters + ---------- + target : list-like of keys + Each key is a tuple, with a label for each level of the index. + + Returns + ------ + int_keys : 1-dimensional array of dtype uint64 or object + Integers representing one combination each + """ - def _call_monotonic(self, object mi): - # defer these back to the mi iteself - return (mi.is_monotonic_increasing, - mi.is_monotonic_decreasing, - mi.is_unique) + level_codes = [lev.get_indexer(codes) + 1 for lev, codes + in zip(self.levels, zip(*target))] + return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) + + def get_indexer(self, object target, object method=None, + object limit=None): + lab_ints = self._extract_level_codes(target) + + # All methods (exact, backfill, pad) directly map to the respective + # methods of the underlying (integers) index... + if method is not None: + # but underlying backfill and pad methods require index and keys + # to be sorted. 
The index already is (checked in + # Index._get_fill_indexer), sort (integer representations of) keys: + order = np.argsort(lab_ints) + lab_ints = lab_ints[order] + indexer = (getattr(self._base, 'get_{}_indexer'.format(method)) + (self, lab_ints, limit=limit)) + indexer = indexer[order] + else: + indexer = self._base.get_indexer(self, lab_ints) - def get_backfill_indexer(self, other, limit=None): - # we coerce to ndarray-of-tuples - values = np.array(self._get_index_values()) - return algos.backfill_object(values, other, limit=limit) + return indexer - def get_pad_indexer(self, other, limit=None): - # we coerce to ndarray-of-tuples - values = np.array(self._get_index_values()) - return algos.pad_object(values, other, limit=limit) + def get_loc(self, object key): + if is_definitely_invalid_key(key): + raise TypeError("'{key}' is an invalid key".format(key=key)) + if not PyTuple_Check(key): + raise KeyError(key) + try: + indices = [0 if checknull(v) else lev.get_loc(v) + 1 + for lev, v in zip(self.levels, key)] + except KeyError: + raise KeyError(key) - cpdef get_loc(self, object val): - if is_definitely_invalid_key(val): - raise TypeError("'{val}' is an invalid key".format(val=val)) + # Transform indices into single integer: + lab_int = self._codes_to_ints(np.array(indices, dtype='uint64')) - self._ensure_mapping_populated() - if not self.unique: - return self._get_loc_duplicates(val) + return self._base.get_loc(self, lab_int) - try: - return self.mapping.get_item(val) - except TypeError: - raise KeyError(val) + def get_indexer_non_unique(self, object target): + # This needs to be overridden just because the default one works on + # target._values, and target can be itself a MultiIndex. - def get_indexer(self, values): - self._ensure_mapping_populated() - return self.mapping.lookup(values) + lab_ints = self._extract_level_codes(target) + indexer = self._base.get_indexer_non_unique(self, lab_ints) + + return indexer + + def __contains__(self, object val): + # Default __contains__ looks in the underlying mapping, which in this + # case only contains integer representations. + try: + self.get_loc(val) + return True + except (KeyError, TypeError, ValueError): + return False - cdef _make_hash_table(self, n): - return _hash.MultiIndexHashTable(n) # Generated from template. include "index_class_helper.pxi" diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 1e6ea7794dfff..37693068e0974 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -897,7 +897,7 @@ class Timedelta(_Timedelta): Represents a duration, the difference between two dates or times. Timedelta is the pandas equivalent of python's ``datetime.timedelta`` - and is interchangable with it in most cases. + and is interchangeable with it in most cases. Parameters ---------- diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index c22e0b8e555a3..215ae9ce087ee 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -295,7 +295,7 @@ cpdef bint tz_compare(object start, object end): timezones. For example `` and `` are essentially same - timezones but aren't evaluted such, but the string representation + timezones but aren't evaluated such, but the string representation for both of these is `'Europe/Paris'`. 
This exists only to add a notion of equality to pytz-style zones diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7328cd336babf..788b236b0ec59 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4115,7 +4115,7 @@ def combine(self, other, func, fill_value=None, overwrite=True): series[this_mask] = fill_value otherSeries[other_mask] = fill_value - # if we have different dtypes, possibily promote + # if we have different dtypes, possibly promote new_dtype = this_dtype if not is_dtype_equal(this_dtype, other_dtype): new_dtype = find_common_type([this_dtype, other_dtype]) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index f43c6dc567f69..8e77c7a7fa48c 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -332,7 +332,7 @@ def freqstr(self): @cache_readonly def inferred_freq(self): """ - Trys to return a string representing a frequency guess, + Tries to return a string representing a frequency guess, generated by infer_freq. Returns None if it can't autodetect the frequency. """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 797774832aaa5..510f7245cebd8 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -45,6 +45,87 @@ target_klass='MultiIndex or list of tuples')) +class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, + libindex.UInt64Engine): + """ + This class manages a MultiIndex by mapping label combinations to positive + integers. + """ + _base = libindex.UInt64Engine + + def _codes_to_ints(self, codes): + """ + Transform combination(s) of uint64 into one uint64 (each), in a strictly + monotonic way (i.e. respecting the lexicographic order of integer + combinations): see BaseMultiIndexCodesEngine documentation. + + Parameters + ---------- + codes : 1- or 2-dimensional array of dtype uint64 + Combinations of integers (one per row) + + Returns + ------- + int_keys : scalar or 1-dimensional array, of dtype uint64 + Integer(s) representing one combination (each) + """ + # Shift the representation of each level by the pre-calculated number + # of bits: + codes <<= self.offsets + + # Now sum and OR are in fact interchangeable. This is a simple + # composition of the (disjunct) significant bits of each level (i.e. + # each column in "codes") in a single positive integer: + if codes.ndim == 1: + # Single key + return np.bitwise_or.reduce(codes) + + # Multiple keys + return np.bitwise_or.reduce(codes, axis=1) + + +class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, + libindex.ObjectEngine): + """ + This class manages those (extreme) cases in which the number of possible + label combinations overflows 64-bit integers, and uses an ObjectEngine + containing Python integers. + """ + _base = libindex.ObjectEngine + + def _codes_to_ints(self, codes): + """ + Transform combination(s) of uint64 into one Python integer (each), in a + strictly monotonic way (i.e. respecting the lexicographic order of + integer combinations): see BaseMultiIndexCodesEngine documentation. + + Parameters + ---------- + codes : 1- or 2-dimensional array of dtype uint64 + Combinations of integers (one per row) + + Returns + ------- + int_keys : int, or 1-dimensional array of dtype object + Integer(s) representing one combination (each) + """ + + # Shift the representation of each level by the pre-calculated number + # of bits.
Since this can overflow uint64, first make sure we are + # working with Python integers: + codes = codes.astype('object') << self.offsets + + # Now sum and OR are in fact interchangeable. This is a simple + # composition of the (disjunct) significant bits of each level (i.e. + # each column in "codes") in a single positive integer (per row): + if codes.ndim == 1: + # Single key + return np.bitwise_or.reduce(codes) + + # Multiple keys + return np.bitwise_or.reduce(codes, axis=1) + + class MultiIndex(Index): """ A multi-level, or hierarchical, index object for pandas objects @@ -687,16 +768,25 @@ def _get_level_number(self, level): @cache_readonly def _engine(self): - - # choose our engine based on our size - # the hashing based MultiIndex for larger - # sizes, and the MultiIndexOjbect for smaller - # xref: https://github.com/pandas-dev/pandas/pull/16324 - l = len(self) - if l > 10000: - return libindex.MultiIndexHashEngine(lambda: self, l) - - return libindex.MultiIndexObjectEngine(lambda: self.values, l) + # Calculate the number of bits needed to represent labels in each + # level, as log2 of their sizes (including -1 for NaN): + sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels])) + + # Sum bit counts, starting from the _right_.... + lev_bits = np.cumsum(sizes[::-1])[::-1] + + # ... in order to obtain offsets such that sorting the combination of + # shifted codes (one for each level, resulting in a unique integer) is + # equivalent to sorting lexicographically the codes themselves. Notice + # that each level needs to be shifted by the number of bits needed to + # represent the _previous_ ones: + offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint64') + + # Check the total number of bits needed for our representation: + if lev_bits[0] > 64: + # The levels would overflow a 64 bit uint - use Python integers: + return MultiIndexPyIntEngine(self.levels, self.labels, offsets) + return MultiIndexUIntEngine(self.levels, self.labels, offsets) @property def values(self): @@ -1885,16 +1975,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): if tolerance is not None: raise NotImplementedError("tolerance not implemented yet " 'for MultiIndex') - indexer = self._get_fill_indexer(target, method, limit) + indexer = self._engine.get_indexer(target, method, limit) elif method == 'nearest': raise NotImplementedError("method='nearest' not implemented yet " 'for MultiIndex; see GitHub issue 9365') else: - # we may not compare equally because of hashing if we - # don't have the same dtypes - if self._inferred_type_levels != target._inferred_type_levels: - return Index(self.values).get_indexer(target.values) - indexer = self._engine.get_indexer(target) return _ensure_platform_int(indexer) @@ -2131,17 +2216,6 @@ def _maybe_to_slice(loc): ''.format(keylen, self.nlevels)) if keylen == self.nlevels and self.is_unique: - - def _maybe_str_to_time_stamp(key, lev): - if lev.is_all_dates and not isinstance(key, Timestamp): - try: - return Timestamp(key, tz=getattr(lev, 'tz', None)) - except Exception: - pass - return key - - key = com._values_from_object(key) - key = tuple(map(_maybe_str_to_time_stamp, key, self.levels)) return self._engine.get_loc(key) # -- partial selection or non-unique index @@ -2274,34 +2348,9 @@ def partial_selection(key, indexer=None): return indexer, maybe_droplevels(indexer, ilevels, drop_level) - if len(key) == self.nlevels: - - if self.is_unique: - - # here we have a completely specified key, but are - # using some partial string matching here - # 
GH4758 - all_dates = ((l.is_all_dates and - not isinstance(k, compat.string_types)) - for k, l in zip(key, self.levels)) - can_index_exactly = any(all_dates) - if (any(l.is_all_dates - for k, l in zip(key, self.levels)) and - not can_index_exactly): - indexer = self.get_loc(key) - - # we have a multiple selection here - if (not isinstance(indexer, slice) or - indexer.stop - indexer.start != 1): - return partial_selection(key, indexer) - - key = tuple(self[indexer].tolist()[0]) - - return (self._engine.get_loc( - com._values_from_object(key)), None) - - else: - return partial_selection(key) + if len(key) == self.nlevels and self.is_unique: + # Complete key in unique index -> standard get_loc + return (self._engine.get_loc(key), None) else: return partial_selection(key) else: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c2d3d0852384c..ec884035fe0c4 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -224,12 +224,17 @@ def make_block_scalar(self, values): """ return ScalarBlock(values) - def make_block_same_class(self, values, placement=None, ndim=None): + def make_block_same_class(self, values, placement=None, ndim=None, + dtype=None): """ Wrap given values in a block of same type as self. """ + if dtype is not None: + # issue 19431 fastparquet is passing this + warnings.warn("dtype argument is deprecated, will be removed " + "in a future release.", FutureWarning) if placement is None: placement = self.mgr_locs return make_block(values, placement=placement, ndim=ndim, - klass=self.__class__) + klass=self.__class__, dtype=dtype) def __unicode__(self): diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 99bf0d5b7ac51..91dc44e3f185e 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -120,7 +120,7 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None, if dtype is not None: mgr = mgr.astype(dtype) else: - msg = ('SparseDataFrame called with unkown type "{data_type}" ' + msg = ('SparseDataFrame called with unknown type "{data_type}" ' 'for data argument') raise TypeError(msg.format(data_type=type(data).__name__)) diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 4e207f9d1838c..1c23527cf57c4 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -493,7 +493,7 @@ def _set_value(self, label, value, takeable=False): values = self.to_dense() # if the label doesn't exist, we will create a new object here - # and possibily change the index + # and possibly change the index new_values = values._set_value(label, value, takeable=takeable) if new_values is not None: values = new_values diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 5c31b9a5668ff..12c7feb5f2b15 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1395,7 +1395,7 @@ def _validate(data): elif isinstance(data, Index): # can't use ABCIndex to exclude non-str - # see scc/inferrence.pyx which can contain string values + # see src/inference.pyx which can contain string values allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') if data.inferred_type not in allowed_types: message = ("Can only use .str accessor with string values " diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 0c82773b75c28..7edb5b16ce77a 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -210,7 +210,7 @@ def _hash_categorical(c, encoding, hash_key): # we have uint64, as we don't directly support missing values 
# we don't want to use take_nd which will coerce to float - # instead, directly construt the result with a + # instead, directly construct the result with a # max(np.uint64) as the missing value indicator # # TODO: GH 15362 diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 2293032ebb8a1..bca0b64cb53fe 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1961,7 +1961,7 @@ def formatter(value): def get_result_as_array(self): """ Returns the float values converted into strings using - the parameters given at initalisation, as a numpy array + the parameters given at initialisation, as a numpy array """ if self.formatter is not None: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 106823199ee93..5376473f83f22 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3763,7 +3763,7 @@ def write(self, **kwargs): class LegacyTable(Table): """ an appendable table: allow append/query/delete operations to a - (possibily) already existing appendable table this table ALLOWS + (possibly) already existing appendable table this table ALLOWS append (but doesn't require them), and stores the data in a format that can be easily searched diff --git a/pandas/tests/categorical/test_constructors.py b/pandas/tests/categorical/test_constructors.py index b29d75bed5c6f..6cc34770a65e0 100644 --- a/pandas/tests/categorical/test_constructors.py +++ b/pandas/tests/categorical/test_constructors.py @@ -382,7 +382,7 @@ def test_constructor_from_categorical_with_unknown_dtype(self): ordered=True) tm.assert_categorical_equal(result, expected) - def test_contructor_from_categorical_string(self): + def test_constructor_from_categorical_string(self): values = Categorical(['a', 'b', 'd']) # use categories, ordered result = Categorical(values, categories=['a', 'b', 'c'], ordered=True, diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 8b57e96e6fa06..b24ae22162a34 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -543,7 +543,7 @@ def test_nested_dict_frame_constructor(self): tm.assert_frame_equal(result, df) def _check_basic_constructor(self, empty): - # mat: 2d matrix with shpae (3, 2) to input. empty - makes sized + # mat: 2d matrix with shape (2, 3) to input.
empty - makes sized # objects mat = empty((2, 3), dtype=float) # 2-D input diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 9664d73651185..aedc957ec67da 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1258,6 +1258,17 @@ def test_get_loc_level(self): assert result == expected assert new_index.equals(index.droplevel(0)) + @pytest.mark.parametrize('level', [0, 1]) + @pytest.mark.parametrize('null_val', [np.nan, pd.NaT, None]) + def test_get_loc_nan(self, level, null_val): + # GH 18485 : NaN in MultiIndex + levels = [['a', 'b'], ['c', 'd']] + key = ['b', 'd'] + levels[level] = np.array([0, null_val], dtype=type(null_val)) + key[level] = null_val + idx = MultiIndex.from_product(levels) + assert idx.get_loc(tuple(key)) == 3 + def test_get_loc_missing_nan(self): # GH 8569 idx = MultiIndex.from_arrays([[1.0, 2.0], [3.0, 4.0]]) @@ -1266,6 +1277,38 @@ def test_get_loc_missing_nan(self): pytest.raises(KeyError, idx.get_loc, np.nan) pytest.raises(KeyError, idx.get_loc, [np.nan]) + @pytest.mark.parametrize('dtype1', [int, float, bool, str]) + @pytest.mark.parametrize('dtype2', [int, float, bool, str]) + def test_get_loc_multiple_dtypes(self, dtype1, dtype2): + # GH 18520 + levels = [np.array([0, 1]).astype(dtype1), + np.array([0, 1]).astype(dtype2)] + idx = pd.MultiIndex.from_product(levels) + assert idx.get_loc(idx[2]) == 2 + + @pytest.mark.parametrize('level', [0, 1]) + @pytest.mark.parametrize('dtypes', [[int, float], [float, int]]) + def test_get_loc_implicit_cast(self, level, dtypes): + # GH 18818, GH 15994 : as flat index, cast int to float and vice-versa + levels = [['a', 'b'], ['c', 'd']] + key = ['b', 'd'] + lev_dtype, key_dtype = dtypes + levels[level] = np.array([0, 1], dtype=lev_dtype) + key[level] = key_dtype(1) + idx = MultiIndex.from_product(levels) + assert idx.get_loc(tuple(key)) == 3 + + def test_get_loc_cast_bool(self): + # GH 19086 : int is cast to bool, but not vice-versa + levels = [[False, True], np.arange(2, dtype='int64')] + idx = MultiIndex.from_product(levels) + + assert idx.get_loc((0, 1)) == 1 + assert idx.get_loc((1, 0)) == 2 + + pytest.raises(KeyError, idx.get_loc, (False, True)) + pytest.raises(KeyError, idx.get_loc, (True, False)) + def test_slice_locs(self): df = tm.makeTimeDataFrame() stacked = df.stack() diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 57884e9816ed3..f17306b8b52f9 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -285,6 +285,13 @@ def test_delete(self): with pytest.raises(Exception): newb.delete(3) + def test_make_block_same_class(self): + # issue 19431 + block = create_block('M8[ns, US/Eastern]', [3]) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + block.make_block_same_class(block.values, dtype=block.values.dtype) + class TestDatetimeBlock(object): diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index b277d8256e612..e0ce27de5c31f 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2531,7 +2531,7 @@ def test_date_tz(self): [datetime(2013, 1, 1), pd.NaT], utc=True).format() assert formatted[0] == "2013-01-01 00:00:00+00:00" - def test_date_explict_date_format(self): + def test_date_explicit_date_format(self): formatted = pd.to_datetime([datetime(2003, 2, 1), pd.NaT]).format( date_format="%m-%d-%Y", na_rep="UT") assert formatted[0] ==
"02-01-2003" diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 6c172c80514e7..11cbea8ce6331 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -71,6 +71,15 @@ def fp(): return 'fastparquet' +@pytest.fixture +def fp_lt_014(): + if not _HAVE_FASTPARQUET: + pytest.skip("fastparquet is not installed") + if LooseVersion(fastparquet.__version__) >= LooseVersion('0.1.4'): + pytest.skip("fastparquet is >= 0.1.4") + return 'fastparquet' + + @pytest.fixture def df_compat(): return pd.DataFrame({'A': [1, 2, 3], 'B': 'foo'}) @@ -435,8 +444,10 @@ def test_basic(self, fp, df_full): df = df_full # additional supported types for fastparquet + if LooseVersion(fastparquet.__version__) >= LooseVersion('0.1.4'): + df['datetime_tz'] = pd.date_range('20130101', periods=3, + tz='US/Eastern') df['timedelta'] = pd.timedelta_range('1 day', periods=3) - check_round_trip(df, fp) @pytest.mark.skip(reason="not supported") @@ -468,14 +479,15 @@ def test_categorical(self, fp): df = pd.DataFrame({'a': pd.Categorical(list('abc'))}) check_round_trip(df, fp) - def test_datetime_tz(self, fp): - # doesn't preserve tz + def test_datetime_tz(self, fp_lt_014): + + # fastparquet<0.1.4 doesn't preserve tz df = pd.DataFrame({'a': pd.date_range('20130101', periods=3, tz='US/Eastern')}) - # warns on the coercion with catch_warnings(record=True): - check_round_trip(df, fp, expected=df.astype('datetime64[ns]')) + check_round_trip(df, fp_lt_014, + expected=df.astype('datetime64[ns]')) def test_filter_row_groups(self, fp): d = {'a': list(range(0, 3))} diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index f2b7c20b774b0..0e6e44e839464 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -43,7 +43,7 @@ def test_empty(self, method, unit, use_bottleneck): result = getattr(s, method)() assert result == unit - # Explict + # Explicit result = getattr(s, method)(min_count=0) assert result == unit diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 7505e6b0cec3b..38e5753d1752d 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -1163,7 +1163,7 @@ def test_timedelta_floordiv(self, scalar_td): ('NCC1701D', 'NCC1701D', 'NCC1701D')]) def test_td64_series_with_tdi(self, names): # GH#17250 make sure result dtype is correct - # GH#19043 make sure names are propogated correctly + # GH#19043 make sure names are propagated correctly tdi = pd.TimedeltaIndex(['0 days', '1 day'], name=names[0]) ser = Series([Timedelta(hours=3), Timedelta(hours=4)], name=names[1]) expected = Series([Timedelta(hours=3), Timedelta(days=1, hours=4)], diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 2b589ebd4735e..0b7948cc32d24 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -218,7 +218,7 @@ def test_constructor_from_unknown_type(self): class Unknown: pass with pytest.raises(TypeError, - message='SparseDataFrame called with unkown type ' + message='SparseDataFrame called with unknown type ' '"Unknown" for data argument'): SparseDataFrame(Unknown()) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 424ba6aab9a56..65332ae7153e2 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1590,6 +1590,38 @@ def test_unstack_group_index_overflow(self): result = s.unstack(4) 
assert result.shape == (500, 2) + def test_pyint_engine(self): + # GH 18519 : when combinations of codes cannot be represented in 64 + # bits, the index underlying the MultiIndex engine works with Python + # integers, rather than uint64. + N = 5 + keys = [tuple(l) for l in [[0] * 10 * N, + [1] * 10 * N, + [2] * 10 * N, + [np.nan] * N + [2] * 9 * N, + [0] * N + [2] * 9 * N, + [np.nan] * N + [2] * 8 * N + [0] * N]] + # Each level contains 4 elements (including NaN), so it is represented + # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a + # 64 bit engine and truncating the first levels, the fourth and fifth + # keys would collide; if truncating the last levels, the fifth and + # sixth; if rotating bits rather than shifting, the third and fifth. + + for idx in range(len(keys)): + index = MultiIndex.from_tuples(keys) + assert index.get_loc(keys[idx]) == idx + + expected = np.arange(idx + 1, dtype=np.intp) + result = index.get_indexer([keys[i] for i in expected]) + tm.assert_numpy_array_equal(result, expected) + + # With missing key: + idces = range(len(keys)) + expected = np.array([-1] + list(idces), dtype='int64') + missing = tuple([0, 1] * 5 * N) + result = index.get_indexer([missing] + [keys[i] for i in idces]) + tm.assert_numpy_array_equal(result, expected) + def test_getitem_lowerdim_corner(self): pytest.raises(KeyError, self.frame.loc.__getitem__, (('bar', 'three'), 'B')) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 803cb62b70f55..94fb5555c0a56 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -2427,7 +2427,7 @@ class for all warnings. To check that no warning is returned, into errors. Valid values are: - * "error" - turns matching warnings into exeptions + * "error" - turns matching warnings into exceptions * "ignore" - discard the warning * "always" - always emit a warning * "default" - print the warning the first time it is generated
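To round off, the engine-selection arithmetic that test_pyint_engine exercises can be checked directly; the following is a minimal sketch (assuming a pandas build with this patch applied, with the key layout borrowed from the test above). Fifty levels of up to three distinct values each need ceil(log2(3 + 1)) = 2 bits apiece once the NaN slot is counted, i.e. 100 bits in total, which forces the Python-integer engine:

import numpy as np
import pandas as pd

N = 5
keys = [tuple(l) for l in [[0] * 10 * N, [1] * 10 * N, [2] * 10 * N]]
mi = pd.MultiIndex.from_tuples(keys)

# Total bit count, computed as for lev_bits[0] in MultiIndex._engine:
bits_needed = np.sum(np.ceil(np.log2([len(l) + 1 for l in mi.levels])))
print(bits_needed)          # 100.0 > 64 -> MultiIndexPyIntEngine
print(mi.get_loc(keys[1]))  # 1, resolved through the object-dtype engine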