-
-
Notifications
You must be signed in to change notification settings - Fork 18.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: add BooleanArray extension array #29555
Changes from 5 commits
640dac9
b9597bb
fa77b7a
29415a9
c4a53f2
b1182bc
94c5a90
1861602
ad6c477
67bf21a
f153fb2
e24c097
f0d0c6e
a3e1e93
1717583
5ce67e2
8c0abe6
031a113
90558d6
af82754
0eb3ca2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
is_integer_dtype, | ||
is_list_like, | ||
is_scalar, | ||
pandas_dtype, | ||
) | ||
from pandas.core.dtypes.dtypes import register_extension_dtype | ||
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries | ||
|
@@ -103,12 +104,11 @@ def _is_boolean(self) -> bool: | |
|
||
def coerce_to_array(values, mask=None, copy=False): | ||
""" | ||
Coerce the input values array to numpy arrays with a mask | ||
Coerce the input values array to numpy arrays with a mask. | ||
|
||
Parameters | ||
---------- | ||
values : 1D list-like | ||
dtype : integer dtype | ||
mask : bool 1D array, optional | ||
copy : bool, default False | ||
if True, copy the input | ||
|
@@ -131,11 +131,27 @@ def coerce_to_array(values, mask=None, copy=False): | |
if copy: | ||
values = values.copy() | ||
else: | ||
# TODO conversion from integer/float ndarray can be done more efficiently | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# (avoid roundtrip through object) | ||
values_object = np.asarray(values, dtype=object) | ||
mask_values = isna(values) | ||
|
||
inferred_dtype = lib.infer_dtype(values_object, skipna=True) | ||
integer_like = ("floating", "integer", "mixed-integer-float") | ||
if inferred_dtype not in ("boolean", "empty") + integer_like: | ||
raise TypeError("Need to pass bool-like values") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. tests hit here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, there is a test that passes all kinds of non-boolean-like values In general, I ran locally pytest with coverage, and there is 97% coverage for this file. The main non-covered things are some parts of the ufunc related code, and some length mismatch errors in the ops code. |
||
|
||
mask_values = isna(values_object) | ||
values = np.zeros(len(values), dtype=bool) | ||
values[~mask_values] = values_object[~mask_values].astype(bool) | ||
|
||
# if the values were integer-like, validate it were actually 0/1's | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if inferred_dtype in integer_like: | ||
if not np.all( | ||
values[~mask_values].astype(float) | ||
== values_object[~mask_values].astype(float) | ||
): | ||
raise TypeError("Need to pass bool-like values") | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if mask is None and mask_values is None: | ||
mask = np.zeros(len(values), dtype=bool) | ||
elif mask is None: | ||
|
@@ -153,9 +169,9 @@ def coerce_to_array(values, mask=None, copy=False): | |
mask = mask | mask_values | ||
|
||
if not values.ndim == 1: | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
raise TypeError("values must be a 1D list-like") | ||
raise ValueError("values must be a 1D list-like") | ||
if not mask.ndim == 1: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. test hit all this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes |
||
raise TypeError("mask must be a 1D list-like") | ||
raise ValueError("mask must be a 1D list-like") | ||
|
||
return values, mask | ||
|
||
|
@@ -211,7 +227,7 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin): | |
Length: 3, dtype: boolean | ||
""" | ||
|
||
def __init__(self, values, mask, copy=False): | ||
def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): | ||
if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): | ||
raise TypeError( | ||
"values should be boolean numpy array. Use " | ||
|
@@ -222,6 +238,10 @@ def __init__(self, values, mask, copy=False): | |
"mask should be boolean numpy array. Use " | ||
"the 'array' function instead" | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't you need an isinstance check on values that it must be a BooleanArray? or ndim==1 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There is already a isinstance check for the values being a boolean ndarray (so not a BooleanArray). But will add a check for ndim==1 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
only when you actually coerce, not here though this is something we should check (here and in IntegerArray) as if you accidently pass a non ndim==1 then it would be an error (can be a followup) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Not fully sure I understand. We don't do any coercing in this
As following your comment, I already added a ndim check; I now raise an error if ndim is not 1 (on the lines below) |
||
if not values.ndim == 1: | ||
raise ValueError("values must be a 1D array") | ||
if not mask.ndim == 1: | ||
raise ValueError("mask must be a 1D array") | ||
|
||
if copy: | ||
values = values.copy() | ||
|
@@ -261,9 +281,15 @@ def __getitem__(self, item): | |
return self._data[item] | ||
return type(self)(self._data[item], self._mask[item]) | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def _coerce_to_ndarray(self, force_bool=False): | ||
def _coerce_to_ndarray(self, force_bool: bool = False): | ||
""" | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
coerce to an ndarary of object dtype | ||
Coerce to an ndarary of object dtype or bool dtype (if force_bool=True). | ||
|
||
Parameters | ||
---------- | ||
force_bool : bool, default False | ||
If True, return bool array or raise error if not possible (in | ||
presence of missing values) | ||
""" | ||
if force_bool: | ||
if not self.isna().any(): | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
@@ -286,8 +312,11 @@ def __array__(self, dtype=None): | |
if dtype is not None: | ||
if is_bool_dtype(dtype): | ||
return self._coerce_to_ndarray(force_bool=True) | ||
# TODO can optimize this to not go through object dtype for | ||
# numeric dtypes | ||
arr = self._coerce_to_ndarray() | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return arr.astype(dtype, copy=False) | ||
# by default (no dtype specified), return an object array | ||
return self._coerce_to_ndarray() | ||
|
||
def __arrow_array__(self, type=None): | ||
|
@@ -414,7 +443,7 @@ def _concat_same_type(cls, to_concat): | |
|
||
def astype(self, dtype, copy=True): | ||
""" | ||
Cast to a NumPy array or BooleanArray with 'dtype'. | ||
Cast to a NumPy array or ExtensionArray with 'dtype'. | ||
|
||
Parameters | ||
---------- | ||
|
@@ -427,15 +456,16 @@ def astype(self, dtype, copy=True): | |
|
||
Returns | ||
------- | ||
array : ndarray or BooleanArray | ||
NumPy ndarray or IntergerArray with 'dtype' for its dtype. | ||
array : ndarray or ExtensionArray | ||
NumPy ndarray, BooleanArray or IntergerArray with 'dtype' for its dtype. | ||
|
||
Raises | ||
------ | ||
TypeError | ||
if incompatible type with an IntegerDtype, equivalent of same_kind | ||
if incompatible type with an BooleanDtype, equivalent of same_kind | ||
casting | ||
""" | ||
dtype = pandas_dtype(dtype) | ||
|
||
if isinstance(dtype, BooleanDtype): | ||
values, mask = coerce_to_array(self, copy=copy) | ||
|
@@ -446,7 +476,7 @@ def astype(self, dtype, copy=True): | |
if self.isna().any(): | ||
raise ValueError("cannot convert float NaN to bool") | ||
else: | ||
return self._data.astype("bool", copy=copy) | ||
return self._data.astype(dtype, copy=copy) | ||
if is_extension_array_dtype(dtype) and is_integer_dtype(dtype): | ||
from pandas.core.arrays import IntegerArray | ||
|
||
|
@@ -457,19 +487,6 @@ def astype(self, dtype, copy=True): | |
data = self._coerce_to_ndarray() | ||
return astype_nansafe(data, dtype, copy=None) | ||
|
||
@property | ||
def _ndarray_values(self) -> np.ndarray: | ||
""" | ||
Internal pandas method for lossy conversion to a NumPy ndarray. | ||
|
||
This method is not part of the pandas interface. | ||
|
||
The expectation is that this is cheap to compute, and is primarily | ||
used for interacting with our indexers. | ||
""" | ||
raise NotImplementedError | ||
# return self._data | ||
|
||
def value_counts(self, dropna=True): | ||
""" | ||
Returns a Series containing counts of each category. | ||
|
@@ -629,18 +646,21 @@ def _reduce(self, name, skipna=True, **kwargs): | |
data[mask] = self._na_value | ||
|
||
op = getattr(nanops, "nan" + name) | ||
result = op(data, axis=0, skipna=skipna, mask=mask) | ||
result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) | ||
|
||
# if we have a boolean op, don't coerce | ||
if name in ["any", "all"]: | ||
pass | ||
|
||
# if we have numeric op that would result in an int, coerce to int if possible | ||
elif name in ["sum", "min", "max", "prod"] and notna(result): | ||
int_result = int(result) | ||
elif name in ["sum", "prod"] and notna(result): | ||
int_result = np.int64(result) | ||
if int_result == result: | ||
result = int_result | ||
|
||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
elif name in ["min", "max"] and notna(result): | ||
result = np.bool_(result) | ||
|
||
return result | ||
|
||
def _maybe_mask_result(self, result, mask, other, op_name): | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you type these as much as possible
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not really sure how to type those (I don't see anything in
pandas._typing
for "list like" ?)