-
-
Notifications
You must be signed in to change notification settings - Fork 18.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PERF: improve 2D array access / transpose() for masked dtypes #52083
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -107,7 +107,10 @@ | |
needs_i8_conversion, | ||
pandas_dtype, | ||
) | ||
from pandas.core.dtypes.dtypes import ExtensionDtype | ||
from pandas.core.dtypes.dtypes import ( | ||
BaseMaskedDtype, | ||
ExtensionDtype, | ||
) | ||
from pandas.core.dtypes.missing import ( | ||
isna, | ||
notna, | ||
|
@@ -2505,6 +2508,7 @@ def _from_arrays( | |
index, | ||
dtype: Dtype | None = None, | ||
verify_integrity: bool = True, | ||
is_1d_ea_only: bool = False, | ||
) -> Self: | ||
""" | ||
Create DataFrame from a list of arrays corresponding to the columns. | ||
|
@@ -2544,6 +2548,7 @@ def _from_arrays( | |
dtype=dtype, | ||
verify_integrity=verify_integrity, | ||
typ=manager, | ||
is_1d_ea_only=is_1d_ea_only, | ||
) | ||
return cls(mgr) | ||
|
||
|
@@ -3610,11 +3615,21 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: | |
# We have EAs with the same dtype. We can preserve that dtype in transpose. | ||
dtype = dtypes[0] | ||
arr_type = dtype.construct_array_type() | ||
values = self.values | ||
if isinstance(dtype, BaseMaskedDtype): | ||
data, mask = self._mgr.as_array_masked() | ||
new_values = [arr_type(data[i], mask[i]) for i in range(self.shape[0])] | ||
else: | ||
values = self.values | ||
new_values = [ | ||
arr_type._from_sequence(row, dtype=dtype) for row in values | ||
] | ||
Comment on lines
+3618
to
+3625
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So this is the main change (bringing the large part of the speed-up for transpose) |
||
|
||
new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] | ||
result = type(self)._from_arrays( | ||
new_values, index=self.columns, columns=self.index | ||
new_values, | ||
index=self.columns, | ||
columns=self.index, | ||
verify_integrity=False, | ||
is_1d_ea_only=True, | ||
) | ||
|
||
else: | ||
|
@@ -10935,7 +10950,23 @@ def _get_data() -> DataFrame: | |
df = _get_data() | ||
if axis is None: | ||
return func(df.values) | ||
elif axis == 1: | ||
|
||
# if len(df._mgr) > 0: | ||
# common_dtype = find_common_type(list(df._mgr.get_dtypes())) | ||
# is_masked_ea = isinstance(common_dtype, BaseMaskedDtype) | ||
# is_np = isinstance(common_dtype, np.dtype) | ||
# else: | ||
# common_dtype = None | ||
|
||
# if axis == 1 and common_dtype and is_masked_ea: | ||
# data, mask = self._mgr.as_array_masked() | ||
# ea2d = common_dtype.construct_array_type()(data, mask) | ||
# result = ea2d._reduce(name, axis=axis, skipna=skipna, **kwds) | ||
# labels = self._get_agg_axis(axis) | ||
# result = self._constructor_sliced(result, index=labels, copy=False) | ||
# return result | ||
Comment on lines
+10954
to
+10967
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @rhshadrach this relates to #51923, and making use of the Manually running one of the benchmarks you mentioned in #51955 (the bottom row of
Now, this is probably a bit too optimistic, because the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
As long as we're not doing inference, the transpose is guaranteed to be a single dtype. This is because two different dtypes in the transpose could only arise from two different dtypes in the rows of the original. So we can determine the common dtype, cast to a single dtype, and then transpose. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doing a cast beforehand might indeed be the easiest, then |
||
|
||
if axis == 1: | ||
if len(df.index) == 0: | ||
# Taking a transpose would result in no columns, losing the dtype. | ||
# In the empty case, reducing along axis 0 or 1 gives the same | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1747,6 +1747,40 @@ def as_array( | |
|
||
return arr.transpose() | ||
|
||
def as_array_masked(self) -> np.ndarray: | ||
""" | ||
Convert the blockmanager data into an numpy array. | ||
|
||
Returns | ||
------- | ||
arr : ndarray | ||
""" | ||
# # TODO(CoW) handle case where resulting array is a view | ||
# if len(self.blocks) == 0: | ||
# arr = np.empty(self.shape, dtype=float) | ||
# return arr.transpose() | ||
Comment on lines
+1758
to
+1761
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jorisvandenbossche - I don't follow this todo - can you elaborate? |
||
|
||
# TODO we already established we only have a single dtype, but this | ||
# could be generalized to be a mix of all masked dtypes | ||
dtype = self.blocks[0].dtype.numpy_dtype | ||
|
||
result_data = np.empty(self.shape, dtype=dtype) | ||
result_mask = np.empty(self.shape, dtype="bool") | ||
|
||
itemmask = np.zeros(self.shape[0]) | ||
|
||
for blk in self.blocks: | ||
rl = blk.mgr_locs | ||
arr = blk.values | ||
result_data[rl.indexer] = arr._data | ||
result_mask[rl.indexer] = arr._mask | ||
itemmask[rl.indexer] = 1 | ||
|
||
if not itemmask.all(): | ||
raise AssertionError("Some items were not contained in blocks") | ||
|
||
return result_data.transpose(), result_mask.transpose() | ||
|
||
def _interleave( | ||
self, | ||
dtype: np.dtype | None = None, | ||
|
@@ -2130,6 +2164,7 @@ def create_block_manager_from_column_arrays( | |
axes: list[Index], | ||
consolidate: bool, | ||
refs: list, | ||
is_1d_ea_only: bool = False, | ||
) -> BlockManager: | ||
# Assertions disabled for performance (caller is responsible for verifying) | ||
# assert isinstance(axes, list) | ||
|
@@ -2143,7 +2178,7 @@ def create_block_manager_from_column_arrays( | |
# verify_integrity=False below. | ||
|
||
try: | ||
blocks = _form_blocks(arrays, consolidate, refs) | ||
blocks = _form_blocks(arrays, consolidate, refs, is_1d_ea_only) | ||
mgr = BlockManager(blocks, axes, verify_integrity=False) | ||
except ValueError as e: | ||
raise_construction_error(len(arrays), arrays[0].shape, axes, e) | ||
|
@@ -2197,9 +2232,17 @@ def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, bool, DtypeObj]: | |
return sep, isinstance(dtype, np.dtype), dtype | ||
|
||
|
||
def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]: | ||
def _form_blocks( | ||
arrays: list[ArrayLike], consolidate: bool, refs: list, is_1d_ea_only: bool | ||
) -> list[Block]: | ||
tuples = list(enumerate(arrays)) | ||
|
||
if is_1d_ea_only: | ||
block_type = get_block_type(arrays[0].dtype) | ||
return [ | ||
block_type(arr, placement=BlockPlacement(i), ndim=2) for i, arr in tuples | ||
] | ||
|
||
if not consolidate: | ||
nbs = _tuples_to_blocks_no_consolidate(tuples, refs) | ||
return nbs | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
arrays_to_mgr already has a consolidate keyword. could we use that instead?