From f250ea43aa256aff1873616ed46fe4d441e4461a Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 21 Apr 2023 18:35:35 +0100 Subject: [PATCH 01/18] PERF: Faster transposition of masked arrays --- asv_bench/benchmarks/reshape.py | 33 ++++++++++++++++++++++++++++++++ doc/source/whatsnew/v2.1.0.rst | 2 ++ pandas/core/arrays/masked.py | 34 +++++++++++++++++++++++++++++++++ pandas/core/frame.py | 24 ++++++++++++++++------- 4 files changed, 86 insertions(+), 7 deletions(-) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 551af7ccb40bc..5063f1c07466c 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -93,6 +93,39 @@ def time_transpose(self, dtype): self.df.T +class ReshapeMaskedArrayDtype: + params = ["Int64", "Float64"] + param_names = ["dtype"] + + def setup(self, dtype): + lev = pd.Index(list("ABCDEFGHIJ")) + ri = pd.Index(range(1000)) + mi = MultiIndex.from_product([lev, ri], names=["foo", "bar"]) + + values = np.random.randn(10_000).astype(int) + + ser = pd.Series(values, dtype=dtype, index=mi) + df = ser.unstack("bar") + # roundtrips -> df.stack().equals(ser) + + self.ser = ser + self.df = df + + def time_stack(self, dtype): + self.df.stack() + + def time_unstack_fast(self, dtype): + # last level -> doesn't have to make copies + self.ser.unstack("bar") + + def time_unstack_slow(self, dtype): + # first level -> must make copies + self.ser.unstack("foo") + + def time_transpose(self, dtype): + self.df.T + + class Unstack: params = ["int", "category"] diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 2c5263f447951..00491b054c0ec 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -305,6 +305,8 @@ Performance improvements - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`) - Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) - Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`) +- Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64` (:issue:`xxxxx`) +- .. --------------------------------------------------------------------------- .. _whatsnew_210.bug_fixes: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index df9224e192140..b61a6c2ea390f 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -6,6 +6,7 @@ Iterator, Literal, Sequence, + TypeVar, overload, ) import warnings @@ -92,6 +93,8 @@ from pandas.compat.numpy import function as nv +_BaseMaskedArrayT = TypeVar("_BaseMaskedArrayT", bound="BaseMaskedArray") + class BaseMaskedArray(OpsMixin, ExtensionArray): """ @@ -1424,3 +1427,34 @@ def _groupby_op( # res_values should already have the correct dtype, we just need to # wrap in a MaskedArray return self._maybe_mask_result(res_values, result_mask) + + +def transpose_homogenous_masked_arrays( + masked_arrays: list[_BaseMaskedArrayT], +) -> list[_BaseMaskedArrayT]: + """Transpose masked arrays in a list, but faster. + + Input should be a list of 1-dim masked arrays of equal length and all have the + same dtype. The caller is responsible for ensuring validity of input data. + """ + transposed_shape = len(masked_arrays), len(masked_arrays[0]) + values = [arr._data for arr in masked_arrays] + transposed_values = np.empty(transposed_shape, dtype=values[0].dtype) + for i, val in enumerate(values): + transposed_values[i, :] = val + transposed_values = transposed_values.copy(order="F") + + masks = [arr._mask for arr in masked_arrays] + transposed_masks = np.empty(transposed_shape, dtype=masks[0].dtype) + for i, mask in enumerate(masks): + transposed_masks[i, :] = mask + transposed_masks = transposed_masks.copy(order="F") + + dtype = masked_arrays[0].dtype + arr_type = dtype.construct_array_type() + transposed_arrays = [] + for i in range(transposed_values.shape[1]): + transposed_arr = arr_type(transposed_values[:, i], mask=transposed_masks[:, i]) + transposed_arrays.append(transposed_arr) + + return transposed_arrays diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f1521e78be0d4..2ee2b93f735b7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -107,6 +107,7 @@ ) from pandas.core.dtypes.dtypes import ( ArrowDtype, + BaseMaskedDtype, ExtensionDtype, ) from pandas.core.dtypes.missing import ( @@ -3587,20 +3588,29 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: ) if using_copy_on_write() and len(self) > 0: result._mgr.add_references(self._mgr) # type: ignore[arg-type] - elif ( self._is_homogeneous_type and dtypes and isinstance(dtypes[0], ExtensionDtype) ): - # We have EAs with the same dtype. We can preserve that dtype in transpose. - dtype = dtypes[0] - arr_type = dtype.construct_array_type() - values = self.values + if isinstance(dtypes[0], BaseMaskedDtype): + # We have masked arrays with the same dtype. We can transpose faster. + from pandas.core.arrays.masked import transpose_homogenous_masked_arrays + + masked_arrays = [blk.values for blk in self._mgr.blocks] + new_values = transpose_homogenous_masked_arrays(masked_arrays) + else: + # We have other EAs with the same dtype. We preserve dtype in transpose. + dtyp = dtypes[0] + arr_typ = dtyp.construct_array_type() + values = self.values + new_values = [arr_typ._from_sequence(row, dtype=dtyp) for row in values] - new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] result = type(self)._from_arrays( - new_values, index=self.columns, columns=self.index + new_values, + index=self.columns, + columns=self.index, + verify_integrity=False, ) else: From deaa6bbc3d80560c0c16011de0834e12582f8a1f Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 21 Apr 2023 19:55:19 +0100 Subject: [PATCH 02/18] simplify ASVs --- asv_bench/benchmarks/reshape.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 5063f1c07466c..f80e9fd9ff256 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -93,7 +93,7 @@ def time_transpose(self, dtype): self.df.T -class ReshapeMaskedArrayDtype: +class ReshapeMaskedArrayDtype(ReshapeExtensionDtype): params = ["Int64", "Float64"] param_names = ["dtype"] @@ -111,20 +111,6 @@ def setup(self, dtype): self.ser = ser self.df = df - def time_stack(self, dtype): - self.df.stack() - - def time_unstack_fast(self, dtype): - # last level -> doesn't have to make copies - self.ser.unstack("bar") - - def time_unstack_slow(self, dtype): - # first level -> must make copies - self.ser.unstack("foo") - - def time_transpose(self, dtype): - self.df.T - class Unstack: params = ["int", "category"] From 3e56638cfec11dbd7551047f835c4913a60677f4 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 21 Apr 2023 21:24:41 +0100 Subject: [PATCH 03/18] typing failed, less typing --- pandas/core/arrays/masked.py | 6 +----- pandas/core/frame.py | 6 ++++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index b61a6c2ea390f..0d1272eed2835 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -93,8 +93,6 @@ from pandas.compat.numpy import function as nv -_BaseMaskedArrayT = TypeVar("_BaseMaskedArrayT", bound="BaseMaskedArray") - class BaseMaskedArray(OpsMixin, ExtensionArray): """ @@ -1429,9 +1427,7 @@ def _groupby_op( return self._maybe_mask_result(res_values, result_mask) -def transpose_homogenous_masked_arrays( - masked_arrays: list[_BaseMaskedArrayT], -) -> list[_BaseMaskedArrayT]: +def transpose_homogenous_masked_arrays(masked_arrays): """Transpose masked arrays in a list, but faster. Input should be a list of 1-dim masked arrays of equal length and all have the diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2ee2b93f735b7..10aedd0e67ef9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3596,8 +3596,10 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: if isinstance(dtypes[0], BaseMaskedDtype): # We have masked arrays with the same dtype. We can transpose faster. from pandas.core.arrays.masked import transpose_homogenous_masked_arrays - - masked_arrays = [blk.values for blk in self._mgr.blocks] + if isinstance(self._mgr, ArrayManager): + masked_arrays = [arr for arr in self._mgr.arrays] + else: + masked_arrays = [blk.values for blk in self._mgr.blocks] new_values = transpose_homogenous_masked_arrays(masked_arrays) else: # We have other EAs with the same dtype. We preserve dtype in transpose. From 95230dd31d24498eb4106a6b33f13ab0e0be63ed Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 21 Apr 2023 23:26:14 +0100 Subject: [PATCH 04/18] pre-commit fixes --- pandas/core/arrays/masked.py | 1 - pandas/core/frame.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 0d1272eed2835..1021358712069 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -6,7 +6,6 @@ Iterator, Literal, Sequence, - TypeVar, overload, ) import warnings diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 10aedd0e67ef9..f844ca0055e9c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3596,8 +3596,9 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: if isinstance(dtypes[0], BaseMaskedDtype): # We have masked arrays with the same dtype. We can transpose faster. from pandas.core.arrays.masked import transpose_homogenous_masked_arrays + if isinstance(self._mgr, ArrayManager): - masked_arrays = [arr for arr in self._mgr.arrays] + masked_arrays = self._mgr.arrays else: masked_arrays = [blk.values for blk in self._mgr.blocks] new_values = transpose_homogenous_masked_arrays(masked_arrays) From 3847c486217d8c0f0fe474e0028640c7de2cb992 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 25 Apr 2023 23:55:57 +0100 Subject: [PATCH 05/18] update --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/frame.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 00491b054c0ec..98cb89f34a863 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -298,6 +298,7 @@ Performance improvements - Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`) - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) +- Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64` (:issue:`52836`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) - Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) @@ -305,7 +306,6 @@ Performance improvements - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`) - Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) - Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`) -- Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64` (:issue:`xxxxx`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f844ca0055e9c..a3de6301bcea4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3588,6 +3588,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: ) if using_copy_on_write() and len(self) > 0: result._mgr.add_references(self._mgr) # type: ignore[arg-type] + elif ( self._is_homogeneous_type and dtypes From e87ad15be724bf936ae276a69fb0e0e1b7f6188e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 4 May 2023 19:43:38 +0200 Subject: [PATCH 06/18] Improve performance when selecting rows and columns (#53014) * Improve performance when selecting rows and columns * Update doc/source/whatsnew/v2.1.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update indexing.py * Update v2.1.0.rst --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 98cb89f34a863..a2a851e5ad3d1 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -299,6 +299,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64` (:issue:`52836`) +- Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) - Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) From cb7fbe96e8abcbaf452a472be0e4b50d91b555e3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 5 May 2023 19:59:41 +0200 Subject: [PATCH 07/18] PERF: Improve performance when accessing GroupBy.groups (#53088) * PERF: Improve performance when accessing GroupBy.groups * Update v2.1.0.rst * Fix --- doc/source/whatsnew/v2.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index a2a851e5ad3d1..069329d21ef6f 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -299,6 +299,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64` (:issue:`52836`) +- Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`) - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) From e065f4dfd5fb99aa4c29b8b928537d9697bf9644 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 7 May 2023 10:55:20 +0100 Subject: [PATCH 08/18] changes according to comments --- pandas/core/arrays/masked.py | 21 +++++++-------------- pandas/core/frame.py | 2 +- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 1021358712069..661399c9906d4 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1426,28 +1426,21 @@ def _groupby_op( return self._maybe_mask_result(res_values, result_mask) -def transpose_homogenous_masked_arrays(masked_arrays): +def transpose_homogenous_masked_arrays(masked_arrays) -> list[BaseMaskedArray]: """Transpose masked arrays in a list, but faster. Input should be a list of 1-dim masked arrays of equal length and all have the same dtype. The caller is responsible for ensuring validity of input data. """ - transposed_shape = len(masked_arrays), len(masked_arrays[0]) - values = [arr._data for arr in masked_arrays] - transposed_values = np.empty(transposed_shape, dtype=values[0].dtype) - for i, val in enumerate(values): - transposed_values[i, :] = val - transposed_values = transposed_values.copy(order="F") - - masks = [arr._mask for arr in masked_arrays] - transposed_masks = np.empty(transposed_shape, dtype=masks[0].dtype) - for i, mask in enumerate(masks): - transposed_masks[i, :] = mask - transposed_masks = transposed_masks.copy(order="F") + values = [arr._data.reshape(1, -1) for arr in masked_arrays] + transposed_values = np.concatenate(values, axis=0) + + masks = [arr._mask.reshape(1, -1) for arr in masked_arrays] + transposed_masks = np.concatenate(masks, axis=0) dtype = masked_arrays[0].dtype arr_type = dtype.construct_array_type() - transposed_arrays = [] + transposed_arrays: list[BaseMaskedArray] = [] for i in range(transposed_values.shape[1]): transposed_arr = arr_type(transposed_values[:, i], mask=transposed_masks[:, i]) transposed_arrays.append(transposed_arr) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a3de6301bcea4..2fbda42be7c90 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3601,7 +3601,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: if isinstance(self._mgr, ArrayManager): masked_arrays = self._mgr.arrays else: - masked_arrays = [blk.values for blk in self._mgr.blocks] + masked_arrays = [*self._iter_column_arrays()] new_values = transpose_homogenous_masked_arrays(masked_arrays) else: # We have other EAs with the same dtype. We preserve dtype in transpose. From b664503c1bd41e5d7e2e95b8f21d3bdccb04109e Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 7 May 2023 11:31:28 +0100 Subject: [PATCH 09/18] fix precommit --- doc/source/whatsnew/v2.1.0.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 069329d21ef6f..98cb89f34a863 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -299,8 +299,6 @@ Performance improvements - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64` (:issue:`52836`) -- Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`) -- Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) - Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) From 29c03c6a26d59bfcd5a6b724eb436a0de08815e5 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 21 Apr 2023 18:35:35 +0100 Subject: [PATCH 10/18] PERF: Faster transposition of masked arrays --- pandas/core/arrays/masked.py | 3 +++ pandas/core/frame.py | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 661399c9906d4..a05dda88c3358 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -6,6 +6,7 @@ Iterator, Literal, Sequence, + TypeVar, overload, ) import warnings @@ -92,6 +93,8 @@ from pandas.compat.numpy import function as nv +_BaseMaskedArrayT = TypeVar("_BaseMaskedArrayT", bound="BaseMaskedArray") + class BaseMaskedArray(OpsMixin, ExtensionArray): """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2fbda42be7c90..2a45f7cc3b484 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3588,7 +3588,6 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: ) if using_copy_on_write() and len(self) > 0: result._mgr.add_references(self._mgr) # type: ignore[arg-type] - elif ( self._is_homogeneous_type and dtypes From 2af2fc3280c808cf0d8a1b29ce672d26a89c31d6 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 25 Apr 2023 23:55:57 +0100 Subject: [PATCH 11/18] update --- pandas/core/frame.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2a45f7cc3b484..2fbda42be7c90 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3588,6 +3588,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: ) if using_copy_on_write() and len(self) > 0: result._mgr.add_references(self._mgr) # type: ignore[arg-type] + elif ( self._is_homogeneous_type and dtypes From e56f987c3cb2ac404fe8b3f950ded1a56c4912e9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 4 May 2023 19:43:38 +0200 Subject: [PATCH 12/18] Improve performance when selecting rows and columns (#53014) * Improve performance when selecting rows and columns * Update doc/source/whatsnew/v2.1.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update indexing.py * Update v2.1.0.rst --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 98cb89f34a863..a2a851e5ad3d1 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -299,6 +299,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64` (:issue:`52836`) +- Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) - Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) From 22f52384c3e8143dbd455c9133c27745f566ff44 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 5 May 2023 19:59:41 +0200 Subject: [PATCH 13/18] PERF: Improve performance when accessing GroupBy.groups (#53088) * PERF: Improve performance when accessing GroupBy.groups * Update v2.1.0.rst * Fix --- doc/source/whatsnew/v2.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index a2a851e5ad3d1..069329d21ef6f 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -299,6 +299,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64` (:issue:`52836`) +- Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`) - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) From 1852bb28fc152ec81cb2b2dcd0e3283c65b9160d Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 7 May 2023 11:31:28 +0100 Subject: [PATCH 14/18] fix precommit --- doc/source/whatsnew/v2.1.0.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 069329d21ef6f..98cb89f34a863 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -299,8 +299,6 @@ Performance improvements - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64` (:issue:`52836`) -- Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`) -- Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) - Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) From 502bcee88f051fcbe4e5abe30b133f796348b883 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 27 May 2023 08:57:24 +0100 Subject: [PATCH 15/18] fix pre-commit --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 98cb89f34a863..b37e51b3ffb93 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -297,8 +297,8 @@ Performance improvements - Performance improvement in :func:`concat` when ``axis=1`` and objects have different indexes (:issue:`52541`) - Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`) - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) -- Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64` (:issue:`52836`) +- Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) - Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) From 07663afb84045574f3b36122da8e45da2cee6484 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 15 Jul 2023 09:29:06 +0100 Subject: [PATCH 16/18] update according to comments --- pandas/core/arrays/masked.py | 6 +- pandas/core/arrays/object.py | 556 ----------------------------------- pandas/core/frame.py | 2 +- test.csv | 4 - 4 files changed, 5 insertions(+), 563 deletions(-) delete mode 100644 pandas/core/arrays/object.py delete mode 100644 test.csv diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 1033a57a0fea9..4d7bd77c88aca 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -99,7 +99,7 @@ from pandas.compat.numpy import function as nv -_BaseMaskedArrayT = TypeVar("_BaseMaskedArrayT", bound="BaseMaskedArray") +BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray") class BaseMaskedArray(OpsMixin, ExtensionArray): @@ -1467,7 +1467,9 @@ def _groupby_op( return self._maybe_mask_result(res_values, result_mask) -def transpose_homogenous_masked_arrays(masked_arrays) -> list[BaseMaskedArray]: +def transpose_homogenous_masked_arrays( + masked_arrays: list[BaseMaskedArrayT], +) -> list[BaseMaskedArrayT]: """Transpose masked arrays in a list, but faster. Input should be a list of 1-dim masked arrays of equal length and all have the diff --git a/pandas/core/arrays/object.py b/pandas/core/arrays/object.py deleted file mode 100644 index 7e0318b080c72..0000000000000 --- a/pandas/core/arrays/object.py +++ /dev/null @@ -1,556 +0,0 @@ -from __future__ import annotations - -import re - -from typing import ( - TYPE_CHECKING, - Literal, -) - -import numpy as np - -from pandas._config import get_option - -from pandas._libs import ( - lib, - missing as libmissing, -) -from pandas._libs.arrays import NDArrayBacked -from pandas.compat import pa_version_under7p0 -from pandas.compat.numpy import function as nv -from pandas.util._decorators import doc - -from pandas.core.dtypes.base import ( - ExtensionDtype, - StorageExtensionDtype, - register_extension_dtype, -) -from pandas.core.dtypes.common import ( - is_array_like, - is_bool_dtype, - is_integer_dtype, - is_object_dtype, - is_string_dtype, - pandas_dtype, -) - -from pandas.core import ops -from pandas.core.array_algos import masked_reductions -from pandas.core.arrays.base import ExtensionArray -from pandas.core.arrays.floating import ( - FloatingArray, - FloatingDtype, -) -from pandas.core.arrays.integer import ( - IntegerArray, - IntegerDtype, -) -from pandas.core.arrays.numpy_ import PandasArray -from pandas.core.construction import extract_array -from pandas.core.indexers import check_array_indexer -from pandas.core.missing import isna - -if TYPE_CHECKING: - import pyarrow - - from pandas._typing import ( - AxisInt, - Dtype, - NumpySorter, - NumpyValueArrayLike, - Scalar, - npt, - type_t, - ) - - from pandas import Series - - -@register_extension_dtype -class ObjectDtype(ExtensionDtype): - """ - Extension dtype for string data. - - .. warning:: - - StringDtype is considered experimental. The implementation and - parts of the API may change without warning. - - Parameters - ---------- - storage : {"python", "pyarrow"}, optional - If not given, the value of ``pd.options.mode.string_storage``. - - Attributes - ---------- - None - - Methods - ------- - None - - Examples - -------- - >>> pd.ObjectDtype(str) - object[str] - - >>> import decimal - >>> pd.ObjectDtype(decimal.Decimal) - string[Decimal] - """ - - name = "object" - _match = re.compile(r"object\[(?P.+)\]") - - @property - def na_value(self) -> libmissing.NAType: - return libmissing.NA - - def __init__(self, typ: type) -> None: - if not isinstance(typ, type): - raise TypeError( - f"type must be python class. Got {type(typ)} instead." - ) - self._type = typ - - @property - def type(self) -> type_t: - return self._type - - def __repr__(self) -> str: - return f"{self.name}[{self._type.__name__}]" - - @classmethod - def construct_from_string(cls, string: str): - """ - Construct a ObjectDtype from a string. - - Parameters - ---------- - string : str - The type of the name. - - Returns - ------- - ObjectDtype - - Raise - ----- - TypeError - If the string is not a valid option. - """ - if not isinstance(string, str): - raise TypeError( - f"'construct_from_string' expects a string, got {type(string)}" - ) - match = cls._match.match(string) - if match is None: - raise TypeError(f"{repr(string)} not a valid string") - d = match.groupdict() - type_name = d.get("type") - type_ = __builtins__.get(type_name, None) - if type_ is None or not isinstance(type_, type): - raise TypeError(f"{repr(string)} not a valid string") - else: - return cls(type_) - - # https://github.com/pandas-dev/pandas/issues/36126 - # error: Signature of "construct_array_type" incompatible with supertype - # "ExtensionDtype" - def construct_array_type( # type: ignore[override] - self, - ) -> type_t[ObjectArray]: - """ - Return the array type associated with this dtype. - - Returns - ------- - type - """ - return ObjectArray - - -# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is -# incompatible with definition in base class "ExtensionArray" -class ObjectArray(PandasArray): # type: ignore[misc] - """ - Extension array for object data. - - .. warning:: - - ObjectArray is considered experimental. The implementation and - parts of the API may change without warning. - - Parameters - ---------- - values : array-like - The array of data. - - .. warning:: - - Currently, this expects an object-dtype ndarray - where the elements are Python strings - or nan-likes (``None``, ``np.nan``, ``NA``). - This may change without warning in the future. Use - :meth:`pandas.array` with ``dtype="string"`` for a stable way of - creating a `StringArray` from any sequence. - - .. versionchanged:: 1.5.0 - - StringArray now accepts array-likes containing - nan-likes(``None``, ``np.nan``) for the ``values`` parameter - in addition to strings and :attr:`pandas.NA` - - copy : bool, default False - Whether to copy the array of data. - - Attributes - ---------- - None - - Methods - ------- - None - - See Also - -------- - :func:`pandas.array` - The recommended function for creating a StringArray. - Series.str - The string methods are available on Series backed by - a StringArray. - - Notes - ----- - StringArray returns a BooleanArray for comparison methods. - - Examples - -------- - >>> pd.array(['This is', 'some text', None, 'data.'], dtype="object[str]") - - ['This is', 'some text', , 'data.'] - Length: 4, dtype: object[str] - - Unlike arrays instantiated with ``dtype="object"``, ``ObjectArray`` - will convert the values to given type. - - >>> pd.array(['1', 1], dtype="object") - - ['1', 1] - Length: 2, dtype: object - >>> pd.array(['1', 1], dtype="object[int]") - - [1, 1] - Length: 2, dtype: object[int] - - However, instantiating ObjectArrays directly with incompatible objects will raise an - error. - - For comparison methods, `ObjectArray` returns a :class:`pandas.BooleanArray`: - - >>> pd.array(["a", None, "c"], dtype="object[str]") == "a" - - [True, , False] - Length: 3, dtype: boolean - """ - - # undo the PandasArray hack - _typ = "extension" - - def __init__(self, values, typ: type, copy: bool = False) -> None: - values = extract_array(values) - - super().__init__(values, copy=copy) - if not isinstance(values, type(self)): - self._validate() - NDArrayBacked.__init__(self, self._ndarray, ObjectDtype(typ)) - - def _validate(self): - """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != "object": - raise ValueError( - "StringArray requires a sequence of strings or pandas.NA. Got " - f"'{self._ndarray.dtype}' dtype instead." - ) - # Check to see if need to convert Na values to pd.NA - if self._ndarray.ndim > 2: - # Ravel if ndims > 2 b/c no cythonized version available - lib.convert_nans_to_NA(self._ndarray.ravel("K")) - else: - lib.convert_nans_to_NA(self._ndarray) - - @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): - if dtype and not (isinstance(dtype, str) and dtype == "string"): - dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "python" - - from pandas.core.arrays.masked import BaseMaskedArray - - if isinstance(scalars, BaseMaskedArray): - # avoid costly conversion to object dtype - na_values = scalars._mask - result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = libmissing.NA - - else: - if lib.is_pyarrow_array(scalars): - # pyarrow array; we cannot rely on the "to_numpy" check in - # ensure_string_array because calling scalars.to_numpy would set - # zero_copy_only to True which caused problems see GH#52076 - scalars = np.array(scalars) - # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) - - # Manually creating new array avoids the validation step in the __init__, so is - # faster. Refactor need for validation? - new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) - - return new_string_array - - @classmethod - def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False - ): - return cls._from_sequence(strings, dtype=dtype, copy=copy) - - @classmethod - def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=object) - values[:] = libmissing.NA - return cls(values).astype(dtype, copy=False) - - def __arrow_array__(self, type=None): - """ - Convert myself into a pyarrow Array. - """ - import pyarrow as pa - - if type is None: - type = pa.string() - - values = self._ndarray.copy() - values[self.isna()] = None - return pa.array(values, type=type, from_pandas=True) - - def _values_for_factorize(self): - arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = None - return arr, None - - def __setitem__(self, key, value): - value = extract_array(value, extract_numpy=True) - if isinstance(value, type(self)): - # extract_array doesn't extract PandasArray subclasses - value = value._ndarray - - key = check_array_indexer(self, key) - scalar_key = lib.is_scalar(key) - scalar_value = lib.is_scalar(value) - if scalar_key and not scalar_value: - raise ValueError("setting an array element with a sequence.") - - # validate new items - if scalar_value: - if isna(value): - value = libmissing.NA - elif not isinstance(value, str): - raise TypeError( - f"Cannot set non-string value '{value}' into a StringArray." - ) - else: - if not is_array_like(value): - value = np.asarray(value, dtype=object) - if len(value) and not lib.is_string_array(value, skipna=True): - raise TypeError("Must provide strings.") - - mask = isna(value) - if mask.any(): - value = value.copy() - value[isna(value)] = libmissing.NA - - super().__setitem__(key, value) - - def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: - # the super() method NDArrayBackedExtensionArray._putmask uses - # np.putmask which doesn't properly handle None/pd.NA, so using the - # base class implementation that uses __setitem__ - ExtensionArray._putmask(self, mask, value) - - def astype(self, dtype, copy: bool = True): - dtype = pandas_dtype(dtype) - - if dtype == self.dtype: - if copy: - return self.copy() - return self - - elif isinstance(dtype, IntegerDtype): - arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = 0 - values = arr.astype(dtype.numpy_dtype) - return IntegerArray(values, mask, copy=False) - elif isinstance(dtype, FloatingDtype): - arr = self.copy() - mask = self.isna() - arr[mask] = "0" - values = arr.astype(dtype.numpy_dtype) - return FloatingArray(values, mask, copy=False) - elif isinstance(dtype, ExtensionDtype): - # Skip the PandasArray.astype method - return ExtensionArray.astype(self, dtype, copy) - elif np.issubdtype(dtype, np.floating): - arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = 0 - values = arr.astype(dtype) - values[mask] = np.nan - return values - - return super().astype(dtype, copy) - - def _reduce( - self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs - ): - if name in ["min", "max"]: - return getattr(self, name)(skipna=skipna, axis=axis) - - raise TypeError(f"Cannot perform reduction '{name}' with string dtype") - - def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: - nv.validate_min((), kwargs) - result = masked_reductions.min( - values=self.to_numpy(), mask=self.isna(), skipna=skipna - ) - return self._wrap_reduction_result(axis, result) - - def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: - nv.validate_max((), kwargs) - result = masked_reductions.max( - values=self.to_numpy(), mask=self.isna(), skipna=skipna - ) - return self._wrap_reduction_result(axis, result) - - def value_counts(self, dropna: bool = True) -> Series: - from pandas.core.algorithms import value_counts_internal as value_counts - - result = value_counts(self._ndarray, dropna=dropna).astype("Int64") - result.index = result.index.astype(self.dtype) - return result - - def memory_usage(self, deep: bool = False) -> int: - result = self._ndarray.nbytes - if deep: - return result + lib.memory_usage_of_objects(self._ndarray) - return result - - @doc(ExtensionArray.searchsorted) - def searchsorted( - self, - value: NumpyValueArrayLike | ExtensionArray, - side: Literal["left", "right"] = "left", - sorter: NumpySorter = None, - ) -> npt.NDArray[np.intp] | np.intp: - if self._hasna: - raise ValueError( - "searchsorted requires array to be sorted, which is impossible " - "with NAs present." - ) - return super().searchsorted(value=value, side=side, sorter=sorter) - - def _cmp_method(self, other, op): - from pandas.arrays import BooleanArray - - if isinstance(other, StringArray): - other = other._ndarray - - mask = isna(self) | isna(other) - valid = ~mask - - if not lib.is_scalar(other): - if len(other) != len(self): - # prevent improper broadcasting when other is 2D - raise ValueError( - f"Lengths of operands do not match: {len(self)} != {len(other)}" - ) - - other = np.asarray(other) - other = other[valid] - - if op.__name__ in ops.ARITHMETIC_BINOPS: - result = np.empty_like(self._ndarray, dtype="object") - result[mask] = libmissing.NA - result[valid] = op(self._ndarray[valid], other) - return StringArray(result) - else: - # logical - result = np.zeros(len(self._ndarray), dtype="bool") - result[valid] = op(self._ndarray[valid], other) - return BooleanArray(result, mask) - - _arith_method = _cmp_method - - # ------------------------------------------------------------------------ - # String methods interface - # error: Incompatible types in assignment (expression has type "NAType", - # base class "PandasArray" defined the type as "float") - _str_na_value = libmissing.NA # type: ignore[assignment] - - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - from pandas.arrays import BooleanArray - - if dtype is None: - dtype = StringDtype(storage="python") - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray] | type[BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected - # "Type[object]" - dtype=np.dtype(dtype), # type: ignore[arg-type] - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - return StringArray(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, f, mask.view("uint8")) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 449a1de5d433f..f4fecdc2f14d5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3627,7 +3627,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: if isinstance(self._mgr, ArrayManager): masked_arrays = self._mgr.arrays else: - masked_arrays = [*self._iter_column_arrays()] + masked_arrays = list(self._iter_column_arrays()) new_values = transpose_homogenous_masked_arrays(masked_arrays) else: # We have other EAs with the same dtype. We preserve dtype in transpose. diff --git a/test.csv b/test.csv deleted file mode 100644 index 56dc351a43492..0000000000000 --- a/test.csv +++ /dev/null @@ -1,4 +0,0 @@ -a,b,c -1,h,A -2,i,N/A -3,j,BBB From 094ee0a02c7c54732d8404b1fcc118902b30c03b Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 15 Jul 2023 13:48:32 +0100 Subject: [PATCH 17/18] update according to comments II --- pandas/core/arrays/masked.py | 7 ++----- pandas/core/frame.py | 5 ++++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 4d7bd77c88aca..f2225efe4093c 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -4,7 +4,6 @@ TYPE_CHECKING, Any, Literal, - TypeVar, overload, ) import warnings @@ -99,8 +98,6 @@ from pandas.compat.numpy import function as nv -BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray") - class BaseMaskedArray(OpsMixin, ExtensionArray): """ @@ -1468,8 +1465,8 @@ def _groupby_op( def transpose_homogenous_masked_arrays( - masked_arrays: list[BaseMaskedArrayT], -) -> list[BaseMaskedArrayT]: + masked_arrays: Sequence[BaseMaskedArray], +) -> list[BaseMaskedArray]: """Transpose masked arrays in a list, but faster. Input should be a list of 1-dim masked arrays of equal length and all have the diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f4fecdc2f14d5..ce07d008dc555 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -132,6 +132,7 @@ ExtensionArray, PeriodArray, TimedeltaArray, + BaseMaskedArray, ) from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.construction import ( @@ -3628,7 +3629,9 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: masked_arrays = self._mgr.arrays else: masked_arrays = list(self._iter_column_arrays()) - new_values = transpose_homogenous_masked_arrays(masked_arrays) + new_values = transpose_homogenous_masked_arrays( + cast(list[BaseMaskedArray], masked_arrays) + ) else: # We have other EAs with the same dtype. We preserve dtype in transpose. dtyp = dtypes[0] From 7234b03af3e814ed6afd86d9ed95f8786db04c4c Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 15 Jul 2023 16:28:13 +0100 Subject: [PATCH 18/18] fix pre-commit --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ce07d008dc555..a399507694fcc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -128,11 +128,11 @@ from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ( + BaseMaskedArray, DatetimeArray, ExtensionArray, PeriodArray, TimedeltaArray, - BaseMaskedArray, ) from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.construction import (