Merge branch 'master' of https://github.com/pandas-dev/pandas into div_zero2
jbrockmendel committed Jan 29, 2018
2 parents 78de1a4 + f483321 commit 1ef3a6c
Showing 32 changed files with 369 additions and 293 deletions.
8 changes: 4 additions & 4 deletions asv_bench/benchmarks/replace.py
@@ -44,15 +44,15 @@ class Convert(object):

    goal_time = 0.5
    params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta'])
-    param_names = ['contructor', 'replace_data']
+    param_names = ['constructor', 'replace_data']

-    def setup(self, contructor, replace_data):
+    def setup(self, constructor, replace_data):
        N = 10**3
        data = {'Series': pd.Series(np.random.randint(N, size=N)),
                'DataFrame': pd.DataFrame({'A': np.random.randint(N, size=N),
                                           'B': np.random.randint(N, size=N)})}
        self.to_replace = {i: getattr(pd, replace_data) for i in range(N)}
-        self.data = data[contructor]
+        self.data = data[constructor]

-    def time_replace(self, contructor, replace_data):
+    def time_replace(self, constructor, replace_data):
        self.data.replace(self.to_replace)
16 changes: 8 additions & 8 deletions asv_bench/benchmarks/rolling.py
@@ -12,14 +12,14 @@ class Methods(object):
              ['int', 'float'],
              ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
               'sum', 'corr', 'cov'])
-    param_names = ['contructor', 'window', 'dtype', 'method']
+    param_names = ['constructor', 'window', 'dtype', 'method']

-    def setup(self, contructor, window, dtype, method):
+    def setup(self, constructor, window, dtype, method):
        N = 10**5
        arr = np.random.random(N).astype(dtype)
-        self.roll = getattr(pd, contructor)(arr).rolling(window)
+        self.roll = getattr(pd, constructor)(arr).rolling(window)

-    def time_rolling(self, contructor, window, dtype, method):
+    def time_rolling(self, constructor, window, dtype, method):
        getattr(self.roll, method)()


@@ -30,12 +30,12 @@ class Quantile(object):
              [10, 1000],
              ['int', 'float'],
              [0, 0.5, 1])
-    param_names = ['contructor', 'window', 'dtype', 'percentile']
+    param_names = ['constructor', 'window', 'dtype', 'percentile']

-    def setup(self, contructor, window, dtype, percentile):
+    def setup(self, constructor, window, dtype, percentile):
        N = 10**5
        arr = np.random.random(N).astype(dtype)
-        self.roll = getattr(pd, contructor)(arr).rolling(window)
+        self.roll = getattr(pd, constructor)(arr).rolling(window)

-    def time_quantile(self, contructor, window, dtype, percentile):
+    def time_quantile(self, constructor, window, dtype, percentile):
        self.roll.quantile(percentile)
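The operation these benchmarks time, shown on a single small example (values arbitrary; the ``DataFrame`` constructor behaves the same way):

```python
import numpy as np
import pandas as pd

arr = np.random.random(10**5).astype('float64')
roll = pd.Series(arr).rolling(1000)  # fixed-size moving window

roll.mean()          # Methods benchmark with method='mean'
roll.quantile(0.5)   # Quantile benchmark with percentile=0.5
```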
2 changes: 1 addition & 1 deletion doc/source/api.rst
@@ -2500,7 +2500,7 @@ Scalar introspection
Extensions
----------

-These are primarily intented for library authors looking to extend pandas
+These are primarily intended for library authors looking to extend pandas
objects.

.. currentmodule:: pandas
4 changes: 2 additions & 2 deletions doc/source/io.rst
@@ -2675,7 +2675,7 @@ file, and the ``sheet_name`` indicating which sheet to parse.
+++++++++++++++++++

To facilitate working with multiple sheets from the same file, the ``ExcelFile``
-class can be used to wrap the file and can be be passed into ``read_excel``
+class can be used to wrap the file and can be passed into ``read_excel``.
There will be a performance benefit for reading multiple sheets as the file is
read into memory only once.

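A minimal sketch of that pattern (the file name and sheet names are placeholders):

```python
import pandas as pd

# The file is parsed once; each read_excel call reuses the in-memory copy.
xlsx = pd.ExcelFile('path_to_file.xlsx')
df1 = pd.read_excel(xlsx, 'Sheet1')
df2 = pd.read_excel(xlsx, 'Sheet2')
```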
@@ -4537,7 +4537,7 @@ See the documentation for `pyarrow <http://arrow.apache.org/docs/python/>`__ and
.. note::

   These engines are very similar and should read/write nearly identical parquet format files.
-   Currently ``pyarrow`` does not support timedelta data, and ``fastparquet`` does not support timezone aware datetimes (they are coerced to UTC).
+   Currently ``pyarrow`` does not support timedelta data; ``fastparquet>=0.1.4`` supports timezone aware datetimes.
   These libraries differ in their underlying dependencies (``fastparquet`` uses ``numba``, while ``pyarrow`` uses a C library).

.. ipython:: python
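For illustration, round-tripping a frame with either engine (assumes the respective library is installed; file names are placeholders):

```python
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})

# Both engines should produce nearly identical parquet files.
df.to_parquet('df_pa.parquet', engine='pyarrow')
df.to_parquet('df_fp.parquet', engine='fastparquet')

result = pd.read_parquet('df_pa.parquet', engine='pyarrow')
```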
7 changes: 6 additions & 1 deletion doc/source/whatsnew/v0.23.0.txt
@@ -431,6 +431,7 @@ Performance Improvements
- Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`)
- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
+- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)


.. _whatsnew_0230.docs:
@@ -528,7 +529,11 @@ MultiIndex
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in python3 (:issue:`18434`)
--
+- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`)
+- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`)
+- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing ``NaN`` (:issue:`18485`)
+- Bug in :func:`MultiIndex.get_loc` in a large :class:`MultiIndex` which would fail when levels had different dtypes (:issue:`18520`)
+


I/O
^^^
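A small illustration of the ``get_loc`` fixes listed above (behaviour as of this commit; data is arbitrary):

```python
import numpy as np
import pandas as pd

mi = pd.MultiIndex.from_arrays([[1.0, 2.0], ['a', 'b']])
# Integer keys now match equivalent float level values (:issue:`18818`).
mi.get_loc((2, 'b'))  # -> 1

mi_nan = pd.MultiIndex.from_arrays([[np.nan, 1.5], ['x', 'y']])
# Keys containing NaN are now located instead of raising (:issue:`18485`).
mi_nan.get_loc((np.nan, 'x'))  # -> 0
```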
6 changes: 3 additions & 3 deletions doc/sphinxext/numpydoc/tests/test_docscrape.py
@@ -42,7 +42,7 @@
-------
out : ndarray
    The drawn samples, arranged according to `shape`. If the
-    shape given is (m,n,...), then the shape of `out` is is
+    shape given is (m,n,...), then the shape of `out` is
    (m,n,...,N).
    In other words, each entry ``out[i,j,...,:]`` is an N-dimensional
@@ -222,7 +222,7 @@ def test_str():
-------
out : ndarray
    The drawn samples, arranged according to `shape`. If the
-    shape given is (m,n,...), then the shape of `out` is is
+    shape given is (m,n,...), then the shape of `out` is
    (m,n,...,N).
    In other words, each entry ``out[i,j,...,:]`` is an N-dimensional
@@ -340,7 +340,7 @@ def test_sphinx_str():
**out** : ndarray
    The drawn samples, arranged according to `shape`. If the
-    shape given is (m,n,...), then the shape of `out` is is
+    shape given is (m,n,...), then the shape of `out` is
    (m,n,...,N).
    In other words, each entry ``out[i,j,...,:]`` is an N-dimensional
9 changes: 0 additions & 9 deletions pandas/_libs/hashtable.pxd
@@ -31,15 +31,6 @@ cdef class PyObjectHashTable(HashTable):
    cpdef get_item(self, object val)
    cpdef set_item(self, object key, Py_ssize_t val)

-cdef class MultiIndexHashTable(HashTable):
-    cdef:
-        kh_uint64_t *table
-        object mi
-
-    cpdef get_item(self, object val)
-    cpdef set_item(self, object key, Py_ssize_t val)
-    cdef inline void _check_for_collision(self, Py_ssize_t loc, object label)
-

cdef class StringHashTable(HashTable):
    cdef kh_str_t *table
136 changes: 0 additions & 136 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -899,139 +899,3 @@ cdef class PyObjectHashTable(HashTable):
                count += 1

        return np.asarray(labels)
-
-
-cdef class MultiIndexHashTable(HashTable):
-
-    def __init__(self, size_hint=1):
-        self.table = kh_init_uint64()
-        self.mi = None
-        kh_resize_uint64(self.table, size_hint)
-
-    def __dealloc__(self):
-        if self.table is not NULL:
-            kh_destroy_uint64(self.table)
-            self.table = NULL
-
-    def __len__(self):
-        return self.table.size
-
-    def sizeof(self, deep=False):
-        """ return the size of my table in bytes """
-        return self.table.n_buckets * (sizeof(uint64_t) +  # keys
-                                       sizeof(size_t) +  # vals
-                                       sizeof(uint32_t))  # flags
-
-    def _check_for_collisions(self, int64_t[:] locs, object mi):
-        # validate that the locs map to the actual values
-        # provided in the mi
-        # we can only check if we *don't* have any missing values
-        # :<
-        cdef:
-            ndarray[int64_t] alocs
-
-        alocs = np.asarray(locs)
-        if (alocs != -1).all():
-
-            result = self.mi.take(locs)
-            if isinstance(mi, tuple):
-                from pandas import Index
-                mi = Index([mi])
-            if not result.equals(mi):
-                raise AssertionError(
-                    "hash collision\nlocs:\n{}\n"
-                    "result:\n{}\nmi:\n{}".format(alocs, result, mi))
-
-    cdef inline void _check_for_collision(self, Py_ssize_t loc, object label):
-        # validate that the loc maps to the actual value
-        # version of _check_for_collisions above for single label (tuple)
-
-        result = self.mi[loc]
-
-        if not all(l == r or (is_null_datetimelike(l)
-                              and is_null_datetimelike(r))
-                   for l, r in zip(result, label)):
-            raise AssertionError(
-                "hash collision\nloc:\n{}\n"
-                "result:\n{}\nmi:\n{}".format(loc, result, label))
-
-    def __contains__(self, object key):
-        try:
-            self.get_item(key)
-            return True
-        except (KeyError, ValueError, TypeError):
-            return False
-
-    cpdef get_item(self, object key):
-        cdef:
-            khiter_t k
-            uint64_t value
-            int64_t[:] locs
-            Py_ssize_t loc
-
-        value = self.mi._hashed_indexing_key(key)
-        k = kh_get_uint64(self.table, value)
-        if k != self.table.n_buckets:
-            loc = self.table.vals[k]
-            self._check_for_collision(loc, key)
-            return loc
-        else:
-            raise KeyError(key)
-
-    cpdef set_item(self, object key, Py_ssize_t val):
-        raise NotImplementedError
-
-    @cython.boundscheck(False)
-    def map_locations(self, object mi):
-        cdef:
-            Py_ssize_t i, n
-            ndarray[uint64_t] values
-            uint64_t val
-            int ret = 0
-            khiter_t k
-
-        self.mi = mi
-        n = len(mi)
-        values = mi._hashed_values
-
-        with nogil:
-            for i in range(n):
-                val = values[i]
-                k = kh_put_uint64(self.table, val, &ret)
-                self.table.vals[k] = i
-
-    @cython.boundscheck(False)
-    def lookup(self, object mi):
-        # look up with a target mi
-        cdef:
-            Py_ssize_t i, n
-            ndarray[uint64_t] values
-            int ret = 0
-            uint64_t val
-            khiter_t k
-            int64_t[:] locs
-
-        n = len(mi)
-        values = mi._hashed_values
-
-        locs = np.empty(n, dtype=np.int64)
-
-        with nogil:
-            for i in range(n):
-                val = values[i]
-                k = kh_get_uint64(self.table, val)
-                if k != self.table.n_buckets:
-                    locs[i] = self.table.vals[k]
-                else:
-                    locs[i] = -1
-
-        self._check_for_collisions(locs, mi)
-        return np.asarray(locs)
-
-    def unique(self, object mi):
-        raise NotImplementedError
-
-    def get_labels(self, object mi, ObjectVector uniques,
-                   Py_ssize_t count_prior, int64_t na_sentinel,
-                   bint check_null=True):
-        raise NotImplementedError
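For readers unfamiliar with the removed class, a pure-Python sketch of the idea it implemented: hash each tuple of the index to a 64-bit key, map that hash to a position, and re-check the actual tuple on lookup to guard against collisions. ``hash_key`` below is a hypothetical stand-in for pandas' hashing of an indexing key:

```python
def build_table(tuples, hash_key):
    # analogue of map_locations: hash every tuple to its position
    return {hash_key(t): i for i, t in enumerate(tuples)}

def get_item(table, tuples, key, hash_key):
    # analogue of get_item: look up by hash, then verify the match,
    # mirroring _check_for_collision above
    loc = table.get(hash_key(key))
    if loc is None:
        raise KeyError(key)
    if tuples[loc] != key:
        raise AssertionError("hash collision")
    return loc
```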
