Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: ignore_failures in BlockManager.reduce #35881

Merged
merged 42 commits into from
Oct 10, 2020
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
4c5eddd
REF: remove unnecessary try/except
jbrockmendel Aug 21, 2020
c632c9f
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Aug 21, 2020
9e64be3
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Aug 21, 2020
42649fb
TST: add test for agg on ordered categorical cols (#35630)
mathurk1 Aug 21, 2020
47121dd
TST: resample does not yield empty groups (#10603) (#35799)
tkmz-n Aug 21, 2020
1decb3e
revert accidental rebase
jbrockmendel Aug 22, 2020
57c5dd3
Merge branch 'master' of https://github.com/pandas-dev/pandas into ma…
jbrockmendel Aug 22, 2020
a358463
Merge branch 'master' of https://github.com/pandas-dev/pandas into ma…
jbrockmendel Aug 23, 2020
ffa7ad7
Merge branch 'master' of https://github.com/pandas-dev/pandas into ma…
jbrockmendel Aug 23, 2020
5281ce7
REF: implement Block.reduce
jbrockmendel Aug 24, 2020
cdcc1a0
remove outdated comment
jbrockmendel Aug 24, 2020
b04e023
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Aug 24, 2020
356cdf7
DOC: Updated aggregate docstring (#35042)
gurukiran07 Aug 24, 2020
e29283b
REF: BlockManager.reduce with ignore_failures
jbrockmendel Aug 24, 2020
4b52eda
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Aug 25, 2020
a58fdf0
de-duplicate
jbrockmendel Aug 25, 2020
313280f
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Aug 25, 2020
cdad23d
mypy fixup
jbrockmendel Aug 25, 2020
028a0b7
mypy fixup
jbrockmendel Aug 25, 2020
3467913
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Aug 27, 2020
2f10b72
comment
jbrockmendel Aug 27, 2020
8f2a047
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Aug 30, 2020
699b96b
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Sep 2, 2020
e128da8
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Sep 2, 2020
fd964d9
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Sep 2, 2020
253c0ea
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Sep 2, 2020
fbe67f4
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Sep 4, 2020
23e3a6a
update tested behavior
jbrockmendel Sep 4, 2020
37e2f99
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Sep 6, 2020
146b322
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Sep 7, 2020
ccea5a5
whatsnew
jbrockmendel Sep 7, 2020
c6b4e2c
rewords whatsnew
jbrockmendel Sep 10, 2020
1165129
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Sep 11, 2020
11126bc
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Sep 20, 2020
a5df009
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Sep 21, 2020
e1c1a5b
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Sep 22, 2020
d7099fe
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Sep 23, 2020
97376bf
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Sep 24, 2020
0a7fa6f
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Sep 25, 2020
c5de076
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Oct 3, 2020
6a30fcf
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Oct 6, 2020
f349ef7
Revert behavior-changing component
jbrockmendel Oct 6, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -8598,6 +8598,8 @@ def _reduce(
cols = self.columns[~dtype_is_dt]
self = self[cols]

any_object = self.dtypes.apply(is_object_dtype).any()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is partly the culprit of the slowdown. See also the top post of #33252, which shows that `self.dtypes.apply(..)` is slower than the method used a few lines above for `dtype_is_dt`.


if axis is None and filter_type == "bool":
labels = None
constructor = None
Expand Down Expand Up @@ -8630,7 +8632,9 @@ def _get_data(axis_matters):
raise NotImplementedError(msg)
return data

if numeric_only is not None and axis in [0, 1]:
if (numeric_only is not None and axis in [0, 1]) or (
numeric_only is None and axis == 0 and not any_object
):
df = self
if numeric_only is True:
df = _get_data(axis_matters=True)
Expand All @@ -8639,6 +8643,7 @@ def _get_data(axis_matters):
axis = 0

out_dtype = "bool" if filter_type == "bool" else None
ignore_failures = numeric_only is None

def blk_func(values):
if isinstance(values, ExtensionArray):
Expand All @@ -8648,12 +8653,14 @@ def blk_func(values):

# After possibly _get_data and transposing, we are now in the
# simple case where we can use BlockManager.reduce
res = df._mgr.reduce(blk_func)
out = df._constructor(res,).iloc[0].rename(None)
res, indexer = df._mgr.reduce(blk_func, ignore_failures=ignore_failures)
out = df._constructor(res).iloc[0].rename(None)
if out_dtype is not None:
out = out.astype(out_dtype)
if axis == 0 and is_object_dtype(out.dtype):
out[:] = coerce_to_dtypes(out.values, df.dtypes)
# GH#35865 careful to cast explicitly to object
nvs = coerce_to_dtypes(out.values, df.dtypes.iloc[np.sort(indexer)])
out[:] = np.array(nvs, dtype=object)
return out

if not self._is_homogeneous_type or self._mgr.any_extension_types:
Expand Down
31 changes: 25 additions & 6 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,18 +330,31 @@ def _verify_integrity(self) -> None:
f"tot_items: {tot_items}"
)

def reduce(self: T, func) -> T:
def reduce(self: T, func, ignore_failures: bool = False) -> Tuple[T, np.ndarray]:
# If 2D, we assume that we're operating column-wise
assert self.ndim == 2

res_blocks = []
for blk in self.blocks:
nbs = blk.reduce(func)
res_blocks: List[Block] = []
skipped: List[int] = []
for i, blk in enumerate(self.blocks):
try:
nbs = blk.reduce(func)
except TypeError:
if ignore_failures:
skipped.append(i)
continue
raise
res_blocks.extend(nbs)

if res_blocks:
indexer = np.concatenate([blk.mgr_locs.as_array for blk in res_blocks])
else:
indexer = []

new_items = self.reset_dropped_locs(res_blocks, skipped)
index = Index([0]) # placeholder
new_mgr = BlockManager.from_blocks(res_blocks, [self.items, index])
return new_mgr
new_mgr = BlockManager.from_blocks(res_blocks, [new_items, index])
return new_mgr, indexer

def operate_blockwise(self, other: "BlockManager", array_op) -> "BlockManager":
"""
Expand Down Expand Up @@ -1499,6 +1512,12 @@ def reset_dropped_locs(self, blocks: List[Block], skipped: List[int]) -> Index:
-----
Alters each block's mgr_locs inplace.
"""
if not skipped:
return self.items.copy()
elif not blocks:
# empty index with same dtype and name
return self.items[:0]

ncols = len(self)

new_locs = [blk.mgr_locs.as_array for blk in blocks]
Expand Down