-
-
Notifications
You must be signed in to change notification settings - Fork 18.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: Add Numpy Array Interface support to Pandas objects #8321
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -217,7 +217,7 @@ def _isnull_new(obj): | |
return _isnull_ndarraylike(obj) | ||
elif isinstance(obj, ABCGeneric): | ||
return obj._constructor(obj._data.isnull(func=isnull)) | ||
elif isinstance(obj, list) or hasattr(obj, '__array__'): | ||
elif isinstance(obj, list) or is_array_like(obj): | ||
return _isnull_ndarraylike(np.asarray(obj)) | ||
else: | ||
return obj is None | ||
|
@@ -243,7 +243,7 @@ def _isnull_old(obj): | |
return _isnull_ndarraylike_old(obj) | ||
elif isinstance(obj, ABCGeneric): | ||
return obj._constructor(obj._data.isnull(func=_isnull_old)) | ||
elif isinstance(obj, list) or hasattr(obj, '__array__'): | ||
elif isinstance(obj, list) or is_array_like(obj): | ||
return _isnull_ndarraylike_old(np.asarray(obj)) | ||
else: | ||
return obj is None | ||
|
@@ -2266,7 +2266,7 @@ def _asarray_tuplesafe(values, dtype=None): | |
from pandas.core.index import Index | ||
|
||
if not (isinstance(values, (list, tuple)) | ||
or hasattr(values, '__array__')): | ||
or is_array_like(values)): | ||
values = list(values) | ||
elif isinstance(values, Index): | ||
return values.values | ||
|
@@ -2489,6 +2489,38 @@ def is_list_like(arg): | |
return (hasattr(arg, '__iter__') and | ||
not isinstance(arg, compat.string_and_binary_types)) | ||
|
||
def is_array_like(obj): | ||
""" | ||
Check if object provides access to a data buffer via one of the numpy | ||
array apis. | ||
|
||
http://docs.scipy.org/doc/numpy/reference/arrays.classes.html | ||
http://docs.scipy.org/doc/numpy/reference/arrays.interface.html | ||
|
||
Parameters | ||
---------- | ||
obj : Object | ||
|
||
Note | ||
---- | ||
Remember that ndarrays and NDFrames are array-like. | ||
""" | ||
# numpy ndarray subclass api | ||
tmp = getattr(obj, '__array__', None) | ||
if callable(tmp): | ||
return True | ||
|
||
# Python side | ||
# __array_interface__ is a dict | ||
tmp = getattr(obj, '__array_interface__', None) | ||
if isinstance(tmp, dict): | ||
return True | ||
|
||
# C-struct access | ||
if hasattr(obj, '__array_struct__'): | ||
return True | ||
|
||
return False | ||
|
||
def _is_sequence(x): | ||
try: | ||
|
@@ -3105,3 +3137,39 @@ def _maybe_match_name(a, b): | |
if a_name == b_name: | ||
return a_name | ||
return None | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this function is superfluous There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hm, should Categorical have a The Series constructor has the |
||
def _unhandled_array_interface(obj): | ||
""" | ||
Checks whether an object: | ||
1) Implements the array interface | ||
2) Is not an object type that pandas handles natively | ||
|
||
#2 is a moving target. Essentially any 3rd party module can implement the | ||
NumPy Array Interface and should be treated as array-like. For example, | ||
the rpy2 SexpVector implements `__array_struct__` which we do not | ||
explicitly handle. | ||
|
||
In the future, if we add explicit handling for the SexpVector, this | ||
function would have to account for that. | ||
|
||
Parameters | ||
---------- | ||
obj : Object | ||
|
||
Usage | ||
----- | ||
|
||
``` | ||
if com._unhandled_array_interface(data): | ||
data = np.asarray(data) | ||
``` | ||
|
||
""" | ||
if isinstance(obj, (np.ndarray)): | ||
return False | ||
|
||
import pandas.core.base as base | ||
if isinstance(obj, (base.PandasObject)): | ||
return False | ||
|
||
return is_array_like(obj) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -197,6 +197,10 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, | |
if dtype is not None: | ||
dtype = self._validate_dtype(dtype) | ||
|
||
# convert unhandled array-like objects | ||
if com._unhandled_array_interface(data): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same here, this needs to be part of the if-else. I would remove the |
||
data = np.asarray(data) | ||
|
||
if isinstance(data, DataFrame): | ||
data = data._data | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -145,6 +145,10 @@ def _init_data(self, data, copy, dtype, **kwargs): | |
if dtype is not None: | ||
dtype = self._validate_dtype(dtype) | ||
|
||
# convert unhandled array-like objects | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. why is this necessary? the problem is there aren't enough tests to validate this (e.g. only certain types of objects can actually make it here; that's why it's an if-else, which this is not part of) |
||
if com._unhandled_array_interface(data): | ||
data = np.asarray(data) | ||
|
||
passed_axes = [kwargs.get(a) for a in self._AXIS_ORDERS] | ||
axes = None | ||
if isinstance(data, BlockManager): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
perhaps this function could just be:
I don't think you need to check the types of the attributes
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's only appropriate to check for `__array__`; the others are not necessary and are C-level anyhow.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think that is true. PIL exposes its data to numpy via `__array_interface__` and does not have an `__array__`. rpy2 implements `__array_struct__` and not `__array__`. Looking around GitHub, there are projects that only implement `__array_interface__`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nobody writes ndarray-like objects to spec :).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tell me about it. Problem is, it's hard to disentangle "should work" from "does work". At this point, they are probably the same thing since there's enough code out there depending on current behaviors. :/
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is too complicated. isn't this:
equivalent / simpler / faster? (this is going to be called a lot, pls show a perf check as well)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jreback look at my first comment in this thread!
also, you definitely want to do the generator comprehension `any(...)` rather than the list comprehension `any([...])` (the latter will always do every check).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@shoyer I did see that but forgot :)
yes, @dalejung I think that expression is the correct one (with generators)