From 6ad3f12ea29cafc6ff32875134c96eaa3a382930 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 24 Jul 2018 00:35:39 +0100 Subject: [PATCH 1/9] improve performance of Series.searchsorted --- asv_bench/benchmarks/series_methods.py | 19 +++++++++++++++++++ doc/source/whatsnew/v0.25.0.rst | 3 ++- pandas/core/series.py | 6 ++++-- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index f7d0083b86a01..3303483c50e20 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -124,6 +124,25 @@ def time_dropna(self, dtype): self.s.dropna() +class SearchSorted(object): + + goal_time = 0.2 + params = ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64', + 'float16', 'float32', 'float64', + 'str'] + param_names = ['dtype'] + + def setup(self, dtype): + N = 10**5 + data = np.array([1] * N + [2] * N + [3] * N).astype(dtype) + self.s = Series(data) + + def time_searchsorted(self, dtype): + key = '2' if dtype == 'str' else 2 + self.s.searchsorted(key) + + class Map(object): params = ['dict', 'Series'] diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 6e225185ecf84..3f68887811765 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -64,7 +64,8 @@ Performance Improvements - Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) - `DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) -- +- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is int8/int16/int32 and the searched key is within + the integer bounds for the dtype(:issue:`22034`) .. 
_whatsnew_0250.bug_fixes: diff --git a/pandas/core/series.py b/pandas/core/series.py index a5dfe8d43c336..4a9b762c7e454 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2394,9 +2394,11 @@ def __rmatmul__(self, other): def searchsorted(self, value, side='left', sorter=None): if sorter is not None: sorter = ensure_platform_int(sorter) - result = self._values.searchsorted(Series(value)._values, - side=side, sorter=sorter) + if not is_extension_type(self._values): + value = np.asarray(value, dtype=self._values.dtype) + value = value[..., np.newaxis] if value.ndim == 0 else value + result = self._values.searchsorted(value, side=side, sorter=sorter) return result[0] if is_scalar(value) else result # ------------------------------------------------------------------- From 60742c3debe85163a60476518fb25f5834f2b92f Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 25 Jul 2018 10:19:30 +0100 Subject: [PATCH 2/9] added explanation --- pandas/core/series.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index 4a9b762c7e454..c989438732e98 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2395,6 +2395,9 @@ def searchsorted(self, value, side='left', sorter=None): if sorter is not None: sorter = ensure_platform_int(sorter) if not is_extension_type(self._values): + # numpy searchsorted is only fast if value is of same dtype as the + # searched array. Below we ensure that value has the right dtype, + # and is not 0-dimensional. value = np.asarray(value, dtype=self._values.dtype) value = value[..., np.newaxis] if value.ndim == 0 else value From 672802da72495692b89287f6d1bc9692e1a2b03c Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Jul 2018 18:55:24 +0100 Subject: [PATCH 3/9] Make common impl. 
with Index.searchsorted --- pandas/core/base.py | 8 +++- pandas/core/common.py | 95 ++++++++++++++++++++++++++++++++++++++++--- pandas/core/series.py | 17 +++----- 3 files changed, 101 insertions(+), 19 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 7fdc64a8d9f85..246e6e09d5f9f 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1525,8 +1525,12 @@ def factorize(self, sort=False, na_sentinel=-1): @Substitution(klass='IndexOpsMixin') @Appender(_shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): - # needs coercion on the key (DatetimeIndex does already) - return self._values.searchsorted(value, side=side, sorter=sorter) + result = com.searchsorted(self._values, value, + side=side, sorter=sorter) + + if is_scalar(value): + return result if is_scalar(result) else result[0] + return result def drop_duplicates(self, keep='first', inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') diff --git a/pandas/core/common.py b/pandas/core/common.py index 5b83cb344b1e7..ac0ad652f86fa 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -13,13 +13,16 @@ import numpy as np from pandas._libs import lib, tslibs -import pandas.compat as compat -from pandas.compat import PY36, iteritems - from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike -from pandas.core.dtypes.common import ( - is_array_like, is_bool_dtype, is_extension_array_dtype, is_integer) -from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas import compat +from pandas.compat import iteritems, PY2, PY36, OrderedDict +from pandas.core.dtypes.generic import ABCSeries, ABCIndex, ABCIndexClass +from pandas.core.dtypes.common import (is_integer, is_integer_dtype, + is_bool_dtype, is_extension_array_dtype, + is_array_like, + is_float_dtype, is_object_dtype, + is_categorical_dtype, is_numeric_dtype, + is_scalar, ensure_platform_int) from pandas.core.dtypes.inference import 
_iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa @@ -482,3 +485,83 @@ def f(x): f = mapper return f + + +def ensure_integer_dtype(arr, value): + """ + Ensure optimal dtype for :func:`searchsorted_integer` is returned. + + Parameters + ---------- + arr : a numpy integer array + value : a number or array of numbers + + Returns + ------- + dtype : an numpy integer dtype + + Raises + ------ + TypeError : if value is not a number + """ + value_arr = np.array([value]) if is_scalar(value) else np.array(value) + + if PY2 and not is_numeric_dtype(value_arr): + # python 2 allows "a" < 1, avoid such nonsense + msg = "value must be numeric, was type {}" + raise TypeError(msg.format(value)) + + iinfo = np.iinfo(arr.dtype) + if not ((value_arr < iinfo.min).any() or (value_arr > iinfo.max).any()): + return arr.dtype + else: + return value_arr.dtype + + +def searchsorted_integer(arr, value, side="left", sorter=None): + """ + searchsorted implementation, but only for integer arrays. + + We get a speedup if the dtype of arr and value is the same. + + See :func:`searchsorted` for a more general searchsorted implementation. + """ + if sorter is not None: + sorter = ensure_platform_int(sorter) + + dtype = ensure_integer_dtype(arr, value) + + if is_integer(value) or is_integer_dtype(value): + value = np.asarray(value, dtype=dtype) + elif hasattr(value, 'is_integer') and value.is_integer(): + # float 2.0 can be converted to int 2 for better speed, + # but float 2.2 should *not* be converted to int 2 + value = np.asarray(value, dtype=dtype) + + return np.searchsorted(arr, value, side=side, sorter=sorter) + + +def searchsorted(arr, value, side="left", sorter=None): + """ + Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted array-like `arr` such that, if the + corresponding elements in `value` were inserted before the indices, + the order of `arr` would be preserved. 
+ + See :class:`IndexOpsMixin.searchsorted` for more details and examples. + """ + if sorter is not None: + sorter = ensure_platform_int(sorter) + + if is_integer_dtype(arr): + return searchsorted_integer(arr, value, side=side, sorter=sorter) + elif (is_object_dtype(arr) or is_float_dtype(arr) or + is_categorical_dtype(arr)): + return arr.searchsorted(value, side=side, sorter=sorter) + else: + # fallback solution. E.g. arr is an array with dtype='datetime64[ns]' + # and value is a pd.Timestamp, need to convert value + from pandas.core.series import Series + value = Series(value)._values + return arr.searchsorted(value, side=side, sorter=sorter) diff --git a/pandas/core/series.py b/pandas/core/series.py index c989438732e98..106c10d81289a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2392,17 +2392,12 @@ def __rmatmul__(self, other): @Substitution(klass='Series') @Appender(base._shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): - if sorter is not None: - sorter = ensure_platform_int(sorter) - if not is_extension_type(self._values): - # numpy searchsorted is only fast if value is of same dtype as the - # searched array. Below we ensure that value has the right dtype, - # and is not 0-dimensional. 
- value = np.asarray(value, dtype=self._values.dtype) - value = value[..., np.newaxis] if value.ndim == 0 else value - - result = self._values.searchsorted(value, side=side, sorter=sorter) - return result[0] if is_scalar(value) else result + result = com.searchsorted(self._values, value, + side=side, sorter=sorter) + + if is_scalar(value): + return result if is_scalar(result) else result[0] + return result # ------------------------------------------------------------------- # Combination From c1a337c070b63503e1a01a2dbf39fe96c79ee281 Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 19 Nov 2018 21:21:20 +0000 Subject: [PATCH 4/9] Simplify implementation --- pandas/core/common.py | 109 +++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 55 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index ac0ad652f86fa..8c0a2b2c4d703 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -15,14 +15,12 @@ from pandas._libs import lib, tslibs from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas import compat -from pandas.compat import iteritems, PY2, PY36, OrderedDict +from pandas.compat import iteritems, PY36, OrderedDict from pandas.core.dtypes.generic import ABCSeries, ABCIndex, ABCIndexClass -from pandas.core.dtypes.common import (is_integer, is_integer_dtype, - is_bool_dtype, is_extension_array_dtype, - is_array_like, - is_float_dtype, is_object_dtype, - is_categorical_dtype, is_numeric_dtype, - is_scalar, ensure_platform_int) +from pandas.core.dtypes.common import ( + is_integer, is_integer_dtype, is_bool_dtype, + is_extension_array_dtype, is_array_like, is_object_dtype, + is_categorical_dtype, is_numeric_dtype, is_scalar, ensure_platform_int) from pandas.core.dtypes.inference import _iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa @@ -487,58 +485,47 @@ def f(x): return f -def ensure_integer_dtype(arr, value): +def searchsorted_integer(arr, value, 
side="left", sorter=None): """ - Ensure optimal dtype for :func:`searchsorted_integer` is returned. + searchsorted implementation for searching integer arrays. + + We get a speedup if we ensure the dtype of arr and value are the same + (if possible) before searchingm as numpy implicitly converts the dtypes + if they're different, which would cause a slowdown. + + See :func:`searchsorted` for a more general searchsorted implementation. Parameters ---------- - arr : a numpy integer array - value : a number or array of numbers + arr : numpy.array + a numpy array of integers + value : int or numpy.array + an integer or an array of integers that we want to find the + location(s) for in `arr` + side : str + One of {'left', 'right'} + sorter : numpy.array, optional Returns ------- - dtype : an numpy integer dtype - - Raises - ------ - TypeError : if value is not a number - """ - value_arr = np.array([value]) if is_scalar(value) else np.array(value) - - if PY2 and not is_numeric_dtype(value_arr): - # python 2 allows "a" < 1, avoid such nonsense - msg = "value must be numeric, was type {}" - raise TypeError(msg.format(value)) - - iinfo = np.iinfo(arr.dtype) - if not ((value_arr < iinfo.min).any() or (value_arr > iinfo.max).any()): - return arr.dtype - else: - return value_arr.dtype - - -def searchsorted_integer(arr, value, side="left", sorter=None): - """ - searchsorted implementation, but only for integer arrays. - - We get a speedup if the dtype of arr and value is the same. - - See :func:`searchsorted` for a more general searchsorted implementation. + int or numpy.array + The locations(s) of `value` in `arr`. 
""" if sorter is not None: sorter = ensure_platform_int(sorter) - dtype = ensure_integer_dtype(arr, value) - - if is_integer(value) or is_integer_dtype(value): - value = np.asarray(value, dtype=dtype) - elif hasattr(value, 'is_integer') and value.is_integer(): - # float 2.0 can be converted to int 2 for better speed, - # but float 2.2 should *not* be converted to int 2 - value = np.asarray(value, dtype=dtype) + # below we try to give `value` the same dtype as `arr`, while guarding + # against integer overflows. If the value of `value` is outside of the + # bound of `arr`, `arr` would be recast by numpy, causing a slower search. + value_arr = np.array([value]) if is_scalar(value) else np.array(value) + iinfo = np.iinfo(arr.dtype) + if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all(): + dtype = arr.dtype + else: + dtype = value_arr.dtype + value = np.asarray(value, dtype=dtype) - return np.searchsorted(arr, value, side=side, sorter=sorter) + return arr.searchsorted(value, side=side, sorter=sorter) def searchsorted(arr, value, side="left", sorter=None): @@ -550,18 +537,30 @@ def searchsorted(arr, value, side="left", sorter=None): the order of `arr` would be preserved. See :class:`IndexOpsMixin.searchsorted` for more details and examples. + + Parameters + ---------- + arr : numpy.array or ExtensionArray + value : scalar or numpy.array + side : str + One of {'left', 'right'} + sorter : numpy.array, optional + + Returns + ------- + int or numpy.array + The locations(s) of `value` in `arr`. """ if sorter is not None: sorter = ensure_platform_int(sorter) - if is_integer_dtype(arr): + if is_integer_dtype(arr) and ( + is_integer(value) or is_integer_dtype(value)): return searchsorted_integer(arr, value, side=side, sorter=sorter) - elif (is_object_dtype(arr) or is_float_dtype(arr) or - is_categorical_dtype(arr)): - return arr.searchsorted(value, side=side, sorter=sorter) - else: - # fallback solution. E.g. 
arr is an array with dtype='datetime64[ns]' - # and value is a pd.Timestamp, need to convert value + if not (is_object_dtype(arr) or is_numeric_dtype(arr) or + is_categorical_dtype(arr)): + # E.g. if `arr` is an array with dtype='datetime64[ns]' + # and `value` is a pd.Timestamp, we may need to convert value from pandas.core.series import Series value = Series(value)._values - return arr.searchsorted(value, side=side, sorter=sorter) + return arr.searchsorted(value, side=side, sorter=sorter) From 686a0a1473bc36a9934cbdf458d56634d78f9e40 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 27 Jan 2019 00:29:41 +0000 Subject: [PATCH 5/9] rebase --- pandas/core/common.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 8c0a2b2c4d703..0474cd300db1d 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -511,6 +511,7 @@ def searchsorted_integer(arr, value, side="left", sorter=None): int or numpy.array The locations(s) of `value` in `arr`. """ + from .arrays.array_ import array if sorter is not None: sorter = ensure_platform_int(sorter) @@ -518,12 +519,16 @@ def searchsorted_integer(arr, value, side="left", sorter=None): # against integer overflows. If the value of `value` is outside of the # bound of `arr`, `arr` would be recast by numpy, causing a slower search. 
value_arr = np.array([value]) if is_scalar(value) else np.array(value) - iinfo = np.iinfo(arr.dtype) + iinfo = np.iinfo(arr.dtype.type) if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all(): dtype = arr.dtype else: dtype = value_arr.dtype - value = np.asarray(value, dtype=dtype) + + if is_scalar(value): + value = dtype.type(value) + else: + value = array(value, dtype=dtype) return arr.searchsorted(value, side=side, sorter=sorter) From ea8280ea7d9b8d0b31e9c4348af9fe34d5fb986b Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 27 Jan 2019 08:18:34 +0000 Subject: [PATCH 6/9] collect into one function --- pandas/core/arrays/base.py | 18 ++-- pandas/core/arrays/numpy_.py | 8 +- pandas/core/base.py | 10 +-- pandas/core/common.py | 134 ++++++++++++++---------------- pandas/core/series.py | 8 +- pandas/tests/arrays/test_array.py | 43 ++++++++++ 6 files changed, 128 insertions(+), 93 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7aaefef3d03e5..e770281596134 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -555,17 +555,17 @@ def searchsorted(self, value, side="left", sorter=None): .. versionadded:: 0.24.0 Find the indices into a sorted array `self` (a) such that, if the - corresponding elements in `v` were inserted before the indices, the - order of `self` would be preserved. + corresponding elements in `value` were inserted before the indices, + the order of `self` would be preserved. 
- Assuming that `a` is sorted: + Assuming that `self` is sorted: - ====== ============================ + ====== ================================ `side` returned index `i` satisfies - ====== ============================ - left ``self[i-1] < v <= self[i]`` - right ``self[i-1] <= v < self[i]`` - ====== ============================ + ====== ================================ + left ``self[i-1] < value <= self[i]`` + right ``self[i-1] <= value < self[i]`` + ====== ================================ Parameters ---------- @@ -581,7 +581,7 @@ def searchsorted(self, value, side="left", sorter=None): Returns ------- - indices : array of ints + array of ints Array of insertion points with the same shape as `value`. See Also diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 791ff44303e96..11b04b8afc3b2 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -4,6 +4,7 @@ from pandas._libs import lib from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype @@ -11,7 +12,7 @@ from pandas.core.dtypes.inference import is_array_like, is_list_like from pandas import compat -from pandas.core import nanops +from pandas.core import common as com, nanops from pandas.core.missing import backfill_1d, pad_1d from .base import ExtensionArray, ExtensionOpsMixin @@ -423,6 +424,11 @@ def to_numpy(self, dtype=None, copy=False): return result + @Appender(ExtensionArray.searchsorted.__doc__) + def searchsorted(self, value, side='left', sorter=None): + return com.searchsorted(self.to_numpy(), value, + side=side, sorter=sorter) + # ------------------------------------------------------------------------ # Ops diff --git a/pandas/core/base.py b/pandas/core/base.py index 246e6e09d5f9f..66dce0560c5ef 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1522,15 +1522,11 @@ def factorize(self, 
sort=False, na_sentinel=-1): array([3]) """) - @Substitution(klass='IndexOpsMixin') + @Substitution(klass='Index') @Appender(_shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): - result = com.searchsorted(self._values, value, - side=side, sorter=sorter) - - if is_scalar(value): - return result if is_scalar(result) else result[0] - return result + return com.searchsorted(self._values, value, + side=side, sorter=sorter) def drop_duplicates(self, keep='first', inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') diff --git a/pandas/core/common.py b/pandas/core/common.py index 0474cd300db1d..b8e4580eabd22 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -13,17 +13,19 @@ import numpy as np from pandas._libs import lib, tslibs +from pandas.compat import PY36, OrderedDict, iteritems + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike -from pandas import compat -from pandas.compat import iteritems, PY36, OrderedDict -from pandas.core.dtypes.generic import ABCSeries, ABCIndex, ABCIndexClass from pandas.core.dtypes.common import ( - is_integer, is_integer_dtype, is_bool_dtype, - is_extension_array_dtype, is_array_like, is_object_dtype, - is_categorical_dtype, is_numeric_dtype, is_scalar, ensure_platform_int) + ensure_platform_int, is_array_like, is_bool_dtype, is_categorical_dtype, + is_extension_array_dtype, is_integer, is_integer_dtype, is_numeric_dtype, + is_object_dtype, is_scalar) +from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import _iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa +from pandas import compat + class SettingWithCopyError(ValueError): pass @@ -485,87 +487,79 @@ def f(x): return f -def searchsorted_integer(arr, value, side="left", sorter=None): - """ - searchsorted implementation for searching integer arrays. 
- - We get a speedup if we ensure the dtype of arr and value are the same - (if possible) before searchingm as numpy implicitly converts the dtypes - if they're different, which would cause a slowdown. - - See :func:`searchsorted` for a more general searchsorted implementation. - - Parameters - ---------- - arr : numpy.array - a numpy array of integers - value : int or numpy.array - an integer or an array of integers that we want to find the - location(s) for in `arr` - side : str - One of {'left', 'right'} - sorter : numpy.array, optional - - Returns - ------- - int or numpy.array - The locations(s) of `value` in `arr`. - """ - from .arrays.array_ import array - if sorter is not None: - sorter = ensure_platform_int(sorter) - - # below we try to give `value` the same dtype as `arr`, while guarding - # against integer overflows. If the value of `value` is outside of the - # bound of `arr`, `arr` would be recast by numpy, causing a slower search. - value_arr = np.array([value]) if is_scalar(value) else np.array(value) - iinfo = np.iinfo(arr.dtype.type) - if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all(): - dtype = arr.dtype - else: - dtype = value_arr.dtype - - if is_scalar(value): - value = dtype.type(value) - else: - value = array(value, dtype=dtype) - - return arr.searchsorted(value, side=side, sorter=sorter) - - def searchsorted(arr, value, side="left", sorter=None): """ Find indices where elements should be inserted to maintain order. - Find the indices into a sorted array-like `arr` such that, if the + .. versionadded:: 0.25.0 + + Find the indices into a sorted array `self` (a) such that, if the corresponding elements in `value` were inserted before the indices, - the order of `arr` would be preserved. + the order of `self` would be preserved. + + Assuming that `self` is sorted: - See :class:`IndexOpsMixin.searchsorted` for more details and examples. 
+ ====== ================================ + `side` returned index `i` satisfies + ====== ================================ + left ``self[i-1] < value <= self[i]`` + right ``self[i-1] <= value < self[i]`` + ====== ================================ Parameters ---------- - arr : numpy.array or ExtensionArray - value : scalar or numpy.array - side : str - One of {'left', 'right'} - sorter : numpy.array, optional + arr: numpy.array or ExtensionArray + array to search in. Cannot be Index, Series or PandasArray, as that + would cause a RecursionError. + value : array_like + Values to insert into `arr`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort array a into ascending + order. They are typically the result of argsort. Returns ------- - int or numpy.array - The locations(s) of `value` in `arr`. + array of ints + Array of insertion points with the same shape as `value`. + + See Also + -------- + numpy.searchsorted : Similar method from NumPy. """ if sorter is not None: sorter = ensure_platform_int(sorter) if is_integer_dtype(arr) and ( is_integer(value) or is_integer_dtype(value)): - return searchsorted_integer(arr, value, side=side, sorter=sorter) - if not (is_object_dtype(arr) or is_numeric_dtype(arr) or - is_categorical_dtype(arr)): + from .arrays.array_ import array + # if `arr` and `value` have different dtypes, `arr` would be + # recast by numpy, causing a slow search. + # Before searching below, we therefore try to give `value` the + # same dtype as `arr`, while guarding against integer overflows. 
+ iinfo = np.iinfo(arr.dtype.type) + value_arr = np.array([value]) if is_scalar(value) else np.array(value) + if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all(): + # value within bounds, so no overflow, so can convert value dtype + # to dtype of arr + dtype = arr.dtype + else: + dtype = value_arr.dtype + + if is_scalar(value): + value = dtype.type(value) + else: + value = array(value, dtype=dtype) + elif not (is_object_dtype(arr) or is_numeric_dtype(arr) or + is_categorical_dtype(arr)): + from pandas.core.series import Series # E.g. if `arr` is an array with dtype='datetime64[ns]' # and `value` is a pd.Timestamp, we may need to convert value - from pandas.core.series import Series - value = Series(value)._values - return arr.searchsorted(value, side=side, sorter=sorter) + value_ser = Series(value)._values + value = value_ser[0] if is_scalar(value) else value_ser + + result = arr.searchsorted(value, side=side, sorter=sorter) + return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 106c10d81289a..3584b91464e98 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2392,12 +2392,8 @@ def __rmatmul__(self, other): @Substitution(klass='Series') @Appender(base._shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): - result = com.searchsorted(self._values, value, - side=side, sorter=sorter) - - if is_scalar(value): - return result if is_scalar(result) else result[0] - return result + return com.searchsorted(self._values, value, + side=side, sorter=sorter) # ------------------------------------------------------------------- # Combination diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 9fea1989e46df..fb9e4f51646f0 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -9,6 +9,7 @@ import pandas as pd from pandas.api.extensions import register_extension_dtype +from pandas.api.types import is_scalar from 
pandas.core.arrays import PandasArray, integer_array, period_array from pandas.tests.extension.decimal import ( DecimalArray, DecimalDtype, to_decimal) @@ -254,3 +255,45 @@ def test_array_not_registered(registry_without_decimal): result = pd.array(data, dtype=DecimalDtype) expected = DecimalArray._from_sequence(data) tm.assert_equal(result, expected) + + +class TestArrayAnalytics(object): + def test_searchsorted(self, string_dtype): + arr = pd.array(['a', 'b', 'c'], dtype=string_dtype) + + result = arr.searchsorted('a', side='left') + assert is_scalar(result) + assert result == 0 + + result = arr.searchsorted('a', side='right') + assert is_scalar(result) + assert result == 1 + + def test_searchsorted_numeric_dtypes_scalar(self, any_real_dtype): + arr = pd.array([1, 3, 90], dtype=any_real_dtype) + result = arr.searchsorted(30) + assert is_scalar(result) + assert result == 2 + + result = arr.searchsorted([30]) + expected = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_searchsorted_numeric_dtypes_vector(self, any_real_dtype): + arr = pd.array([1, 3, 90], dtype=any_real_dtype) + result = arr.searchsorted([2, 30]) + expected = np.array([1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_search_sorted_datetime64_scalar(self): + arr = pd.array(pd.date_range('20120101', periods=10, freq='2D')) + val = pd.Timestamp('20120102') + result = arr.searchsorted(val) + assert is_scalar(result) + assert result == 1 + + def test_searchsorted_sorter(self, any_real_dtype): + arr = pd.array([3, 1, 2], dtype=any_real_dtype) + result = arr.searchsorted([0, 3], sorter=np.argsort(arr)) + expected = np.array([0, 2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) From a9905fd69d79788ca0859e3aa002588c0c663403 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 27 Jan 2019 20:39:19 +0000 Subject: [PATCH 7/9] move searchsorted to algorithms.py --- pandas/core/algorithms.py | 84 
+++++++++++++++++++++++++++++++++++- pandas/core/arrays/numpy_.py | 7 +-- pandas/core/base.py | 4 +- pandas/core/common.py | 82 +---------------------------------- pandas/core/series.py | 4 +- 5 files changed, 92 insertions(+), 89 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c5c8f47ad6dba..4d6a471b630a8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -19,7 +19,7 @@ ensure_float64, ensure_int64, ensure_object, ensure_platform_int, ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype, is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, - is_datetimelike, is_extension_array_dtype, is_float_dtype, + is_datetimelike, is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype, @@ -1724,6 +1724,88 @@ def func(arr, indexer, out, fill_value=np.nan): return out +# ---- # +# searchsorted # +# ---- # + +def searchsorted(arr, value, side="left", sorter=None): + """ + Find indices where elements should be inserted to maintain order. + + .. versionadded:: 0.25.0 + + Find the indices into a sorted array `self` (a) such that, if the + corresponding elements in `value` were inserted before the indices, + the order of `self` would be preserved. + + Assuming that `self` is sorted: + + ====== ================================ + `side` returned index `i` satisfies + ====== ================================ + left ``self[i-1] < value <= self[i]`` + right ``self[i-1] <= value < self[i]`` + ====== ================================ + + Parameters + ---------- + arr: numpy.array or ExtensionArray + array to search in. Cannot be Index, Series or PandasArray, as that + would cause a RecursionError. + value : array_like + Values to insert into `arr`. 
+ side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort array a into ascending + order. They are typically the result of argsort. + + Returns + ------- + array of ints + Array of insertion points with the same shape as `value`. + + See Also + -------- + numpy.searchsorted : Similar method from NumPy. + """ + if sorter is not None: + sorter = ensure_platform_int(sorter) + + if is_integer_dtype(arr) and ( + is_integer(value) or is_integer_dtype(value)): + from .arrays.array_ import array + # if `arr` and `value` have different dtypes, `arr` would be + # recast by numpy, causing a slow search. + # Before searching below, we therefore try to give `value` the + # same dtype as `arr`, while guarding against integer overflows. + iinfo = np.iinfo(arr.dtype.type) + value_arr = np.array([value]) if is_scalar(value) else np.array(value) + if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all(): + # value within bounds, so no overflow, so can convert value dtype + # to dtype of arr + dtype = arr.dtype + else: + dtype = value_arr.dtype + + if is_scalar(value): + value = dtype.type(value) + else: + value = array(value, dtype=dtype) + elif not (is_object_dtype(arr) or is_numeric_dtype(arr) or + is_categorical_dtype(arr)): + from pandas.core.series import Series + # E.g. 
if `arr` is an array with dtype='datetime64[ns]' + # and `value` is a pd.Timestamp, we may need to convert value + value_ser = Series(value)._values + value = value_ser[0] if is_scalar(value) else value_ser + + result = arr.searchsorted(value, side=side, sorter=sorter) + return result + + # ---- # # diff # # ---- # diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 11b04b8afc3b2..8e2ab586cacb6 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -12,7 +12,8 @@ from pandas.core.dtypes.inference import is_array_like, is_list_like from pandas import compat -from pandas.core import common as com, nanops +from pandas.core import nanops +from pandas.core.algorithms import searchsorted from pandas.core.missing import backfill_1d, pad_1d from .base import ExtensionArray, ExtensionOpsMixin @@ -426,8 +427,8 @@ def to_numpy(self, dtype=None, copy=False): @Appender(ExtensionArray.searchsorted.__doc__) def searchsorted(self, value, side='left', sorter=None): - return com.searchsorted(self.to_numpy(), value, - side=side, sorter=sorter) + return searchsorted(self.to_numpy(), value, + side=side, sorter=sorter) # ------------------------------------------------------------------------ # Ops diff --git a/pandas/core/base.py b/pandas/core/base.py index 66dce0560c5ef..f896596dd5216 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1525,8 +1525,8 @@ def factorize(self, sort=False, na_sentinel=-1): @Substitution(klass='Index') @Appender(_shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): - return com.searchsorted(self._values, value, - side=side, sorter=sorter) + return algorithms.searchsorted(self._values, value, + side=side, sorter=sorter) def drop_duplicates(self, keep='first', inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') diff --git a/pandas/core/common.py b/pandas/core/common.py index b8e4580eabd22..099e0947a33df 100644 --- a/pandas/core/common.py +++ 
b/pandas/core/common.py @@ -17,9 +17,7 @@ from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( - ensure_platform_int, is_array_like, is_bool_dtype, is_categorical_dtype, - is_extension_array_dtype, is_integer, is_integer_dtype, is_numeric_dtype, - is_object_dtype, is_scalar) + is_array_like, is_bool_dtype, is_extension_array_dtype, is_integer) from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import _iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa @@ -485,81 +483,3 @@ def f(x): f = mapper return f - - -def searchsorted(arr, value, side="left", sorter=None): - """ - Find indices where elements should be inserted to maintain order. - - .. versionadded:: 0.25.0 - - Find the indices into a sorted array `self` (a) such that, if the - corresponding elements in `value` were inserted before the indices, - the order of `self` would be preserved. - - Assuming that `self` is sorted: - - ====== ================================ - `side` returned index `i` satisfies - ====== ================================ - left ``self[i-1] < value <= self[i]`` - right ``self[i-1] <= value < self[i]`` - ====== ================================ - - Parameters - ---------- - arr: numpy.array or ExtensionArray - array to search in. Cannot be Index, Series or PandasArray, as that - would cause a RecursionError. - value : array_like - Values to insert into `arr`. - side : {'left', 'right'}, optional - If 'left', the index of the first suitable location found is given. - If 'right', return the last such index. If there is no suitable - index, return either 0 or N (where N is the length of `self`). - sorter : 1-D array_like, optional - Optional array of integer indices that sort array a into ascending - order. They are typically the result of argsort. 
- - Returns - ------- - array of ints - Array of insertion points with the same shape as `value`. - - See Also - -------- - numpy.searchsorted : Similar method from NumPy. - """ - if sorter is not None: - sorter = ensure_platform_int(sorter) - - if is_integer_dtype(arr) and ( - is_integer(value) or is_integer_dtype(value)): - from .arrays.array_ import array - # if `arr` and `value` have different dtypes, `arr` would be - # recast by numpy, causing a slow search. - # Before searching below, we therefore try to give `value` the - # same dtype as `arr`, while guarding against integer overflows. - iinfo = np.iinfo(arr.dtype.type) - value_arr = np.array([value]) if is_scalar(value) else np.array(value) - if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all(): - # value within bounds, so no overflow, so can convert value dtype - # to dtype of arr - dtype = arr.dtype - else: - dtype = value_arr.dtype - - if is_scalar(value): - value = dtype.type(value) - else: - value = array(value, dtype=dtype) - elif not (is_object_dtype(arr) or is_numeric_dtype(arr) or - is_categorical_dtype(arr)): - from pandas.core.series import Series - # E.g. 
if `arr` is an array with dtype='datetime64[ns]' - # and `value` is a pd.Timestamp, we may need to convert value - value_ser = Series(value)._values - value = value_ser[0] if is_scalar(value) else value_ser - - result = arr.searchsorted(value, side=side, sorter=sorter) - return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 3584b91464e98..ad7c6af21f637 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2392,8 +2392,8 @@ def __rmatmul__(self, other): @Substitution(klass='Series') @Appender(base._shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): - return com.searchsorted(self._values, value, - side=side, sorter=sorter) + return algorithms.searchsorted(self._values, value, + side=side, sorter=sorter) # ------------------------------------------------------------------- # Combination From 9e6ed434a72eee9ed8c000eec623167ad4a10926 Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 28 Jan 2019 22:52:31 +0000 Subject: [PATCH 8/9] Guard against IntegerArray + cleanups --- doc/source/whatsnew/v0.25.0.rst | 4 ++-- pandas/core/algorithms.py | 6 +++--- pandas/core/common.py | 5 ++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 3f68887811765..18e0d8f242697 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -64,8 +64,8 @@ Performance Improvements - Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) - `DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) -- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is int8/int16/int32 and the searched key is within - the integer bounds for the dtype(:issue:`22034`) +- Improved performance of :meth:`Series.searchsorted`. 
The speedup is especially large when the dtype is + int8/int16/int32 and the searched key is within the integer bounds for the dtype(:issue:`22034`) .. _whatsnew_0250.bug_fixes: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4d6a471b630a8..1ca6712a374c9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1724,9 +1724,9 @@ def func(arr, indexer, out, fill_value=np.nan): return out -# ---- # +# ------------ # # searchsorted # -# ---- # +# ------------ # def searchsorted(arr, value, side="left", sorter=None): """ @@ -1774,7 +1774,7 @@ def searchsorted(arr, value, side="left", sorter=None): if sorter is not None: sorter = ensure_platform_int(sorter) - if is_integer_dtype(arr) and ( + if isinstance(arr, np.ndarray) and is_integer_dtype(arr) and ( is_integer(value) or is_integer_dtype(value)): from .arrays.array_ import array # if `arr` and `value` have different dtypes, `arr` would be diff --git a/pandas/core/common.py b/pandas/core/common.py index 099e0947a33df..5b83cb344b1e7 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -13,7 +13,8 @@ import numpy as np from pandas._libs import lib, tslibs -from pandas.compat import PY36, OrderedDict, iteritems +import pandas.compat as compat +from pandas.compat import PY36, iteritems from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -22,8 +23,6 @@ from pandas.core.dtypes.inference import _iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa -from pandas import compat - class SettingWithCopyError(ValueError): pass From bcbe22624b6d815f9d946e21d1d4e00c282dc96d Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 23 Feb 2019 20:20:20 +0000 Subject: [PATCH 9/9] cleanups --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/algorithms.py | 17 +++++++++-------- pandas/tests/arrays/test_array.py | 12 +++++++++--- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git 
a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 18e0d8f242697..bc4778a3cce2d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -65,7 +65,7 @@ Performance Improvements - Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) - `DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) - Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is - int8/int16/int32 and the searched key is within the integer bounds for the dtype(:issue:`22034`) + int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) .. _whatsnew_0250.bug_fixes: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1ca6712a374c9..b056a357d0a51 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1734,24 +1734,25 @@ def searchsorted(arr, value, side="left", sorter=None): .. versionadded:: 0.25.0 - Find the indices into a sorted array `self` (a) such that, if the + Find the indices into a sorted array `arr` (a) such that, if the corresponding elements in `value` were inserted before the indices, - the order of `self` would be preserved. + the order of `arr` would be preserved. - Assuming that `self` is sorted: + Assuming that `arr` is sorted: ====== ================================ `side` returned index `i` satisfies ====== ================================ - left ``self[i-1] < value <= self[i]`` - right ``self[i-1] <= value < self[i]`` + left ``arr[i-1] < value <= arr[i]`` + right ``arr[i-1] <= value < arr[i]`` ====== ================================ Parameters ---------- - arr: numpy.array or ExtensionArray - array to search in. Cannot be Index, Series or PandasArray, as that - would cause a RecursionError. + arr : array-like + Input array.
If `sorter` is None, then it must be sorted in + ascending order, otherwise `sorter` must be an array of indices + that sort it. value : array_like Values to insert into `arr`. side : {'left', 'right'}, optional diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index fb9e4f51646f0..b68ec2bf348b4 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -285,9 +285,15 @@ def test_searchsorted_numeric_dtypes_vector(self, any_real_dtype): expected = np.array([1, 2], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - def test_search_sorted_datetime64_scalar(self): - arr = pd.array(pd.date_range('20120101', periods=10, freq='2D')) - val = pd.Timestamp('20120102') + @pytest.mark.parametrize('arr, val', [ + [pd.date_range('20120101', periods=10, freq='2D'), + pd.Timestamp('20120102')], + [pd.date_range('20120101', periods=10, freq='2D', tz='Asia/Hong_Kong'), + pd.Timestamp('20120102', tz='Asia/Hong_Kong')], + [pd.timedelta_range(start='1 day', end='10 days', periods=10), + pd.Timedelta('2 days')]]) + def test_search_sorted_datetime64_scalar(self, arr, val): + arr = pd.array(arr) result = arr.searchsorted(val) assert is_scalar(result) assert result == 1