Merge branch 'master' of https://github.com/pandas-dev/pandas into div_zero2
jbrockmendel committed Jan 29, 2018
2 parents 78de1a4 + f483321 commit 1ef3a6c
Showing 32 changed files with 369 additions and 293 deletions.
8 changes: 4 additions & 4 deletions asv_bench/benchmarks/replace.py
@@ -44,15 +44,15 @@ class Convert(object):

    goal_time = 0.5
    params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta'])
-    param_names = ['contructor', 'replace_data']
+    param_names = ['constructor', 'replace_data']

-    def setup(self, contructor, replace_data):
+    def setup(self, constructor, replace_data):
        N = 10**3
        data = {'Series': pd.Series(np.random.randint(N, size=N)),
                'DataFrame': pd.DataFrame({'A': np.random.randint(N, size=N),
                                           'B': np.random.randint(N, size=N)})}
        self.to_replace = {i: getattr(pd, replace_data) for i in range(N)}
-        self.data = data[contructor]
+        self.data = data[constructor]

-    def time_replace(self, contructor, replace_data):
+    def time_replace(self, constructor, replace_data):
        self.data.replace(self.to_replace)
16 changes: 8 additions & 8 deletions asv_bench/benchmarks/rolling.py
@@ -12,14 +12,14 @@ class Methods(object):
              ['int', 'float'],
              ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
               'sum', 'corr', 'cov'])
-    param_names = ['contructor', 'window', 'dtype', 'method']
+    param_names = ['constructor', 'window', 'dtype', 'method']

-    def setup(self, contructor, window, dtype, method):
+    def setup(self, constructor, window, dtype, method):
        N = 10**5
        arr = np.random.random(N).astype(dtype)
-        self.roll = getattr(pd, contructor)(arr).rolling(window)
+        self.roll = getattr(pd, constructor)(arr).rolling(window)

-    def time_rolling(self, contructor, window, dtype, method):
+    def time_rolling(self, constructor, window, dtype, method):
        getattr(self.roll, method)()


@@ -30,12 +30,12 @@ class Quantile(object):
              [10, 1000],
              ['int', 'float'],
              [0, 0.5, 1])
-    param_names = ['contructor', 'window', 'dtype', 'percentile']
+    param_names = ['constructor', 'window', 'dtype', 'percentile']

-    def setup(self, contructor, window, dtype, percentile):
+    def setup(self, constructor, window, dtype, percentile):
        N = 10**5
        arr = np.random.random(N).astype(dtype)
-        self.roll = getattr(pd, contructor)(arr).rolling(window)
+        self.roll = getattr(pd, constructor)(arr).rolling(window)

-    def time_quantile(self, contructor, window, dtype, percentile):
+    def time_quantile(self, constructor, window, dtype, percentile):
        self.roll.quantile(percentile)
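The operation these benchmarks time, shown on a single small example (values arbitrary; the ``DataFrame`` constructor behaves the same way):

```python
import numpy as np
import pandas as pd

arr = np.random.random(10**5).astype('float64')
roll = pd.Series(arr).rolling(1000)  # fixed-size moving window

roll.mean()          # Methods benchmark with method='mean'
roll.quantile(0.5)   # Quantile benchmark with percentile=0.5
```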
2 changes: 1 addition & 1 deletion doc/source/api.rst
@@ -2500,7 +2500,7 @@ Scalar introspection
Extensions
----------

-These are primarily intented for library authors looking to extend pandas
+These are primarily intended for library authors looking to extend pandas
objects.

.. currentmodule:: pandas
4 changes: 2 additions & 2 deletions doc/source/io.rst
@@ -2675,7 +2675,7 @@ file, and the ``sheet_name`` indicating which sheet to parse.
+++++++++++++++++++

To facilitate working with multiple sheets from the same file, the ``ExcelFile``
-class can be used to wrap the file and can be be passed into ``read_excel``
+class can be used to wrap the file and can be passed into ``read_excel``.
There will be a performance benefit for reading multiple sheets as the file is
read into memory only once.

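A minimal sketch of that pattern (the file name and sheet names are placeholders):

```python
import pandas as pd

# The file is parsed once; each read_excel call reuses the in-memory copy.
xlsx = pd.ExcelFile('path_to_file.xlsx')
df1 = pd.read_excel(xlsx, 'Sheet1')
df2 = pd.read_excel(xlsx, 'Sheet2')
```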
@@ -4537,7 +4537,7 @@ See the documentation for `pyarrow <http://arrow.apache.org/docs/python/>`__ and
.. note::

   These engines are very similar and should read/write nearly identical parquet format files.
-   Currently ``pyarrow`` does not support timedelta data, and ``fastparquet`` does not support timezone aware datetimes (they are coerced to UTC).
+   Currently ``pyarrow`` does not support timedelta data; ``fastparquet>=0.1.4`` supports timezone aware datetimes.
   These libraries differ in their underlying dependencies (``fastparquet`` uses ``numba``, while ``pyarrow`` uses a C library).

.. ipython:: python
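For illustration, round-tripping a frame with either engine (assumes the respective library is installed; file names are placeholders):

```python
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})

# Both engines should produce nearly identical parquet files.
df.to_parquet('df_pa.parquet', engine='pyarrow')
df.to_parquet('df_fp.parquet', engine='fastparquet')

result = pd.read_parquet('df_pa.parquet', engine='pyarrow')
```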
7 changes: 6 additions & 1 deletion doc/source/whatsnew/v0.23.0.txt
@@ -431,6 +431,7 @@ Performance Improvements
- Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`)
- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
+- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)


.. _whatsnew_0230.docs:
@@ -528,7 +529,11 @@ MultiIndex
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in python3 (:issue:`18434`)
--
+- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`)
+- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`)
+- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing ``NaN`` (:issue:`18485`)
+- Bug in :func:`MultiIndex.get_loc` in a large :class:`MultiIndex` which would fail when levels had different dtypes (:issue:`18520`)
+


I/O
^^^
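A small illustration of the ``get_loc`` fixes listed above (behaviour as of this commit; data is arbitrary):

```python
import numpy as np
import pandas as pd

mi = pd.MultiIndex.from_arrays([[1.0, 2.0], ['a', 'b']])
# Integer keys now match equivalent float level values (:issue:`18818`).
mi.get_loc((2, 'b'))  # -> 1

mi_nan = pd.MultiIndex.from_arrays([[np.nan, 1.5], ['x', 'y']])
# Keys containing NaN are now located instead of raising (:issue:`18485`).
mi_nan.get_loc((np.nan, 'x'))  # -> 0
```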
6 changes: 3 additions & 3 deletions doc/sphinxext/numpydoc/tests/test_docscrape.py
@@ -42,7 +42,7 @@
-------
out : ndarray
    The drawn samples, arranged according to `shape`. If the
-    shape given is (m,n,...), then the shape of `out` is is
+    shape given is (m,n,...), then the shape of `out` is
    (m,n,...,N).
    In other words, each entry ``out[i,j,...,:]`` is an N-dimensional
@@ -222,7 +222,7 @@ def test_str():
-------
out : ndarray
    The drawn samples, arranged according to `shape`. If the
-    shape given is (m,n,...), then the shape of `out` is is
+    shape given is (m,n,...), then the shape of `out` is
    (m,n,...,N).
    In other words, each entry ``out[i,j,...,:]`` is an N-dimensional
@@ -340,7 +340,7 @@ def test_sphinx_str():
**out** : ndarray
    The drawn samples, arranged according to `shape`. If the
-    shape given is (m,n,...), then the shape of `out` is is
+    shape given is (m,n,...), then the shape of `out` is
    (m,n,...,N).
    In other words, each entry ``out[i,j,...,:]`` is an N-dimensional
9 changes: 0 additions & 9 deletions pandas/_libs/hashtable.pxd
@@ -31,15 +31,6 @@ cdef class PyObjectHashTable(HashTable):
    cpdef get_item(self, object val)
    cpdef set_item(self, object key, Py_ssize_t val)

-cdef class MultiIndexHashTable(HashTable):
-    cdef:
-        kh_uint64_t *table
-        object mi
-
-    cpdef get_item(self, object val)
-    cpdef set_item(self, object key, Py_ssize_t val)
-    cdef inline void _check_for_collision(self, Py_ssize_t loc, object label)
-

cdef class StringHashTable(HashTable):
    cdef kh_str_t *table
136 changes: 0 additions & 136 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -899,139 +899,3 @@ cdef class PyObjectHashTable(HashTable):
                count += 1

        return np.asarray(labels)
-
-
-cdef class MultiIndexHashTable(HashTable):
-
-    def __init__(self, size_hint=1):
-        self.table = kh_init_uint64()
-        self.mi = None
-        kh_resize_uint64(self.table, size_hint)
-
-    def __dealloc__(self):
-        if self.table is not NULL:
-            kh_destroy_uint64(self.table)
-            self.table = NULL
-
-    def __len__(self):
-        return self.table.size
-
-    def sizeof(self, deep=False):
-        """ return the size of my table in bytes """
-        return self.table.n_buckets * (sizeof(uint64_t) +  # keys
-                                       sizeof(size_t) +  # vals
-                                       sizeof(uint32_t))  # flags
-
-    def _check_for_collisions(self, int64_t[:] locs, object mi):
-        # validate that the locs map to the actual values
-        # provided in the mi
-        # we can only check if we *don't* have any missing values
-        # :<
-        cdef:
-            ndarray[int64_t] alocs
-
-        alocs = np.asarray(locs)
-        if (alocs != -1).all():
-
-            result = self.mi.take(locs)
-            if isinstance(mi, tuple):
-                from pandas import Index
-                mi = Index([mi])
-            if not result.equals(mi):
-                raise AssertionError(
-                    "hash collision\nlocs:\n{}\n"
-                    "result:\n{}\nmi:\n{}".format(alocs, result, mi))
-
-    cdef inline void _check_for_collision(self, Py_ssize_t loc, object label):
-        # validate that the loc maps to the actual value
-        # version of _check_for_collisions above for single label (tuple)
-
-        result = self.mi[loc]
-
-        if not all(l == r or (is_null_datetimelike(l)
-                              and is_null_datetimelike(r))
-                   for l, r in zip(result, label)):
-            raise AssertionError(
-                "hash collision\nloc:\n{}\n"
-                "result:\n{}\nmi:\n{}".format(loc, result, label))
-
-    def __contains__(self, object key):
-        try:
-            self.get_item(key)
-            return True
-        except (KeyError, ValueError, TypeError):
-            return False
-
-    cpdef get_item(self, object key):
-        cdef:
-            khiter_t k
-            uint64_t value
-            int64_t[:] locs
-            Py_ssize_t loc
-
-        value = self.mi._hashed_indexing_key(key)
-        k = kh_get_uint64(self.table, value)
-        if k != self.table.n_buckets:
-            loc = self.table.vals[k]
-            self._check_for_collision(loc, key)
-            return loc
-        else:
-            raise KeyError(key)
-
-    cpdef set_item(self, object key, Py_ssize_t val):
-        raise NotImplementedError
-
-    @cython.boundscheck(False)
-    def map_locations(self, object mi):
-        cdef:
-            Py_ssize_t i, n
-            ndarray[uint64_t] values
-            uint64_t val
-            int ret = 0
-            khiter_t k
-
-        self.mi = mi
-        n = len(mi)
-        values = mi._hashed_values
-
-        with nogil:
-            for i in range(n):
-                val = values[i]
-                k = kh_put_uint64(self.table, val, &ret)
-                self.table.vals[k] = i
-
-    @cython.boundscheck(False)
-    def lookup(self, object mi):
-        # look up with a target mi
-        cdef:
-            Py_ssize_t i, n
-            ndarray[uint64_t] values
-            int ret = 0
-            uint64_t val
-            khiter_t k
-            int64_t[:] locs
-
-        n = len(mi)
-        values = mi._hashed_values
-
-        locs = np.empty(n, dtype=np.int64)
-
-        with nogil:
-            for i in range(n):
-                val = values[i]
-                k = kh_get_uint64(self.table, val)
-                if k != self.table.n_buckets:
-                    locs[i] = self.table.vals[k]
-                else:
-                    locs[i] = -1
-
-        self._check_for_collisions(locs, mi)
-        return np.asarray(locs)
-
-    def unique(self, object mi):
-        raise NotImplementedError
-
-    def get_labels(self, object mi, ObjectVector uniques,
-                   Py_ssize_t count_prior, int64_t na_sentinel,
-                   bint check_null=True):
-        raise NotImplementedError
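For readers unfamiliar with the removed class, a pure-Python sketch of the idea it implemented: hash each tuple of the index to a 64-bit key, map that hash to a position, and re-check the actual tuple on lookup to guard against collisions. ``hash_key`` below is a hypothetical stand-in for pandas' hashing of an indexing key:

```python
def build_table(tuples, hash_key):
    # analogue of map_locations: hash every tuple to its position
    return {hash_key(t): i for i, t in enumerate(tuples)}

def get_item(table, tuples, key, hash_key):
    # analogue of get_item: look up by hash, then verify the match,
    # mirroring _check_for_collision above
    loc = table.get(hash_key(key))
    if loc is None:
        raise KeyError(key)
    if tuples[loc] != key:
        raise AssertionError("hash collision")
    return loc
```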
