Skip to content

Commit

Permalink
PERF: performance regression in Series.asof (#14476)
Browse files Browse the repository at this point in the history
* Fix performance regression in Series.asof by avoiding pre-computing nulls and returning value by indexing the underlying ndarray.
  • Loading branch information
laudney authored and jorisvandenbossche committed Oct 26, 2016
1 parent d1d75d7 commit e3d943d
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 44 deletions.
81 changes: 51 additions & 30 deletions asv_bench/benchmarks/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,56 +284,77 @@ class timeseries_asof(object):
goal_time = 0.2

def setup(self):
self.N = 100000
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
if hasattr(Series, 'convert'):
Series.resample = Series.convert
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.N = 10000
self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s')
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.ts2 = self.ts.copy()
self.ts2[250:5000] = np.nan
self.ts3 = self.ts.copy()
self.ts3[-5000:] = np.nan

def time_timeseries_asof(self):
# test speed of pre-computing NAs.
def time_asof_list(self):
self.ts.asof(self.dates)

# should be roughly the same as above.
def time_asof_nan_list(self):
self.ts2.asof(self.dates)

class timeseries_asof_nan(object):
goal_time = 0.2
# test speed of the code path for a scalar index
# without *while* loop
def time_asof_single(self):
self.ts.asof(self.dates[0])

def setup(self):
self.N = 100000
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
if hasattr(Series, 'convert'):
Series.resample = Series.convert
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.N = 10000
self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s')
self.ts[250:5000] = np.nan
# test speed of the code path for a scalar index
# before the start. should be the same as above.
def time_asof_single_early(self):
self.ts.asof(self.dates[0] - dt.timedelta(10))

def time_timeseries_asof_nan(self):
self.ts.asof(self.dates)
# test the speed of the code path for a scalar index
# with a long *while* loop. should still be much
# faster than pre-computing all the NAs.
def time_asof_nan_single(self):
self.ts3.asof(self.dates[-1])


class timeseries_asof_single(object):
class timeseries_dataframe_asof(object):
goal_time = 0.2

def setup(self):
self.N = 100000
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
if hasattr(Series, 'convert'):
Series.resample = Series.convert
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.N = 10000
self.M = 100
self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s')
self.ts = DataFrame(np.random.randn(self.N, self.M), index=self.rng)
self.ts2 = self.ts.copy()
self.ts2.iloc[250:5000] = np.nan
self.ts3 = self.ts.copy()
self.ts3.iloc[-5000:] = np.nan

# test speed of pre-computing NAs.
def time_asof_list(self):
self.ts.asof(self.dates)

def time_timeseries_asof_single(self):
# should be roughly the same as above.
def time_asof_nan_list(self):
self.ts2.asof(self.dates)

# test speed of the code path for a scalar index
# with pre-computing all NAs.
def time_asof_single(self):
self.ts.asof(self.dates[0])

# should be roughly the same as above.
def time_asof_nan_single(self):
self.ts3.asof(self.dates[-1])

# test speed of the code path for a scalar index
# before the start. should be without the cost of
# pre-computing all the NAs.
def time_asof_single_early(self):
self.ts.asof(self.dates[0] - dt.timedelta(10))


class timeseries_custom_bday_apply(object):
goal_time = 0.2
Expand Down
5 changes: 3 additions & 2 deletions doc/source/whatsnew/v0.19.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
- Improved Performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`)

- Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`)
- Improved performance in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461`)
- Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461`)



Expand Down
34 changes: 22 additions & 12 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3735,10 +3735,10 @@ def asof(self, where, subset=None):
if not self.index.is_monotonic:
raise ValueError("asof requires a sorted index")

if isinstance(self, ABCSeries):
is_series = isinstance(self, ABCSeries)
if is_series:
if subset is not None:
raise ValueError("subset is not valid for Series")
nulls = self.isnull()
elif self.ndim > 2:
raise NotImplementedError("asof is not implemented "
"for {type}".format(type(self)))
Expand All @@ -3747,9 +3747,9 @@ def asof(self, where, subset=None):
subset = self.columns
if not is_list_like(subset):
subset = [subset]
nulls = self[subset].isnull().any(1)

if not is_list_like(where):
is_list = is_list_like(where)
if not is_list:
start = self.index[0]
if isinstance(self.index, PeriodIndex):
where = Period(where, freq=self.index.freq).ordinal
Expand All @@ -3758,24 +3758,34 @@ def asof(self, where, subset=None):
if where < start:
return np.nan

loc = self.index.searchsorted(where, side='right')
if loc > 0:
loc -= 1
while nulls[loc] and loc > 0:
loc -= 1
return self.iloc[loc]
# It's always much faster to use a *while* loop here for
# Series than pre-computing all the NAs. However a
# *while* loop is extremely expensive for DataFrame
# so we later pre-compute all the NAs and use the same
# code path whether *where* is a scalar or list.
# See PR: https://github.com/pandas-dev/pandas/pull/14476
if is_series:
loc = self.index.searchsorted(where, side='right')
if loc > 0:
loc -= 1

values = self._values
while loc > 0 and isnull(values[loc]):
loc -= 1
return values[loc]

if not isinstance(where, Index):
where = Index(where)
where = Index(where) if is_list else Index([where])

nulls = self.isnull() if is_series else self[subset].isnull().any(1)
locs = self.index.asof_locs(where, ~(nulls.values))

# mask the missing
missing = locs == -1
data = self.take(locs, is_copy=False)
data.index = where
data.loc[missing] = np.nan
return data
return data if is_list else data.iloc[-1]

# ----------------------------------------------------------------------
# Action Methods
Expand Down

0 comments on commit e3d943d

Please sign in to comment.