Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add Numpy Array Interface support to Pandas objects #8321

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 71 additions & 3 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ def _isnull_new(obj):
return _isnull_ndarraylike(obj)
elif isinstance(obj, ABCGeneric):
return obj._constructor(obj._data.isnull(func=isnull))
elif isinstance(obj, list) or hasattr(obj, '__array__'):
elif isinstance(obj, list) or is_array_like(obj):
return _isnull_ndarraylike(np.asarray(obj))
else:
return obj is None
Expand All @@ -243,7 +243,7 @@ def _isnull_old(obj):
return _isnull_ndarraylike_old(obj)
elif isinstance(obj, ABCGeneric):
return obj._constructor(obj._data.isnull(func=_isnull_old))
elif isinstance(obj, list) or hasattr(obj, '__array__'):
elif isinstance(obj, list) or is_array_like(obj):
return _isnull_ndarraylike_old(np.asarray(obj))
else:
return obj is None
Expand Down Expand Up @@ -2266,7 +2266,7 @@ def _asarray_tuplesafe(values, dtype=None):
from pandas.core.index import Index

if not (isinstance(values, (list, tuple))
or hasattr(values, '__array__')):
or is_array_like(values)):
values = list(values)
elif isinstance(values, Index):
return values.values
Expand Down Expand Up @@ -2489,6 +2489,38 @@ def is_list_like(arg):
return (hasattr(arg, '__iter__') and
not isinstance(arg, compat.string_and_binary_types))

def is_array_like(obj):
"""
Check if object provides access to a data buffer via one of the numpy
array apis.

http://docs.scipy.org/doc/numpy/reference/arrays.classes.html
http://docs.scipy.org/doc/numpy/reference/arrays.interface.html

Parameters
----------
obj : Object

Note
----
Remember that ndarrays and NDFrames are array-like.
"""
# numpy ndarray subclass api
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

perhaps this function could just be:

array_like_attrs = ['__array__', '__array_interface__', '__array_struct__']
return any(hasattr(obj, attr) for attr in array_like_attrs)

I don't think you need to check the types of the attributes

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's only appropriate to check for `__array__`; the others are not necessary and are C-level anyhow.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that is true. PIL exposes its data to numpy via __array_interface__ and does not have a __array__. rpy2 implements __array_struct__ and not __array__. Looking around github there are projects that only implement __array_interface__.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nobody writes ndarray-like objects to spec :).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tell me about it. The problem is that it's hard to disentangle "should work" from "does work". At this point, they are probably the same thing, since there's enough code out there depending on current behaviors. :/

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is too complicated. isn't this:

return any([ hasattr(obj, attr) for attr in ['__array__','__array_interface__','__array_struct__'] ])

equivalent / simpler / faster? (this is going to be called a lot, pls show a perf check as well)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback look at my first comment in this thread!

also, you definitely want to use the generator expression any(...) rather than the list comprehension any([...]) (the latter will always do every check).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@shoyer I did see that but forgot :)

yes, @dalejung I think that expression is the correct one (with generators)

tmp = getattr(obj, '__array__', None)
if callable(tmp):
return True

# Python side
# __array_interface__ is a dict
tmp = getattr(obj, '__array_interface__', None)
if isinstance(tmp, dict):
return True

# C-struct access
if hasattr(obj, '__array_struct__'):
return True

return False

def _is_sequence(x):
try:
Expand Down Expand Up @@ -3105,3 +3137,39 @@ def _maybe_match_name(a, b):
if a_name == b_name:
return a_name
return None

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this function is superfluous
in the Series/DataFrame constructor, is_array_like is the last thing checked (or you can simply check whether it has `__array__` and not `_typ` — having `_typ` is the same thing as being a PandasObject)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hm, should Categorical have a _typ?

The Series constructor has the is_array_like call in the else. However, that's because the np.ndarray check is a no-op. The Frame constructor has the is_array_like check above the big if block because we actually do stuff with ndarrays and the data needs to be transformed to trigger that conditional. I'll see how it looks checking for is_array_like within the isinstance(data, (np.ndarray, Series, Index)) check and just np.asarray as needed within that block.

def _unhandled_array_interface(obj):
    """
    Report whether ``obj`` is a foreign array-like that needs conversion.

    The answer is True only when both of the following hold:

    1) ``obj`` implements the NumPy array interface (per ``is_array_like``)
    2) ``obj`` is not a type that pandas already handles natively, i.e. it
       is neither an ``np.ndarray`` nor a ``PandasObject``

    Condition 2 is a moving target. Any 3rd party module can implement the
    NumPy Array Interface and should be treated as array-like. For example,
    the rpy2 SexpVector implements `__array_struct__`, which pandas does not
    handle explicitly. If explicit handling for such a type is added later,
    this function has to be updated to exclude it.

    Parameters
    ----------
    obj : Object

    Returns
    -------
    bool

    Examples
    --------
    Typical use at the top of a constructor::

        if com._unhandled_array_interface(data):
            data = np.asarray(data)
    """
    # plain ndarrays are handled natively, nothing to convert
    if isinstance(obj, np.ndarray):
        return False

    # function-level import, as in the original
    # NOTE(review): presumably avoids a circular import -- confirm
    from pandas.core.base import PandasObject

    handled_natively = isinstance(obj, PandasObject)
    return False if handled_natively else is_array_like(obj)
4 changes: 4 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,10 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
if dtype is not None:
dtype = self._validate_dtype(dtype)

# convert unhandled array-like objects
if com._unhandled_array_interface(data):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here, this needs to be part of the if-else. I would remove the com._unhandled_array_interface and simply use is_array_like (and yes, Categorical IS array_like, and if passed to the DataFrame constructor as a bare object it WILL be converted to an ndarray; e.g. is_array_like is usually paired with an ndim check!)

data = np.asarray(data)

if isinstance(data, DataFrame):
data = data._data

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
if copy:
subarr = subarr.copy()

elif hasattr(data, '__array__'):
elif com.is_array_like(data):
return Index(np.asarray(data), dtype=dtype, copy=copy, name=name,
**kwargs)
elif data is None or np.isscalar(data):
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,10 @@ def _init_data(self, data, copy, dtype, **kwargs):
if dtype is not None:
dtype = self._validate_dtype(dtype)

# convert unhandled array-like objects
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this necessary? The problem is that there aren't enough tests to validate this (e.g. only certain types of objects can actually make it here; that's why it's an if-else, which this is not part of)

if com._unhandled_array_interface(data):
data = np.asarray(data)

passed_axes = [kwargs.get(a) for a in self._AXIS_ORDERS]
axes = None
if isinstance(data, BlockManager):
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,9 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
raise TypeError("{0!r} type is unordered"
"".format(data.__class__.__name__))
else:
# unhandled array-like objects
if com.is_array_like(data):
data = np.asarray(data)

# handle sparse passed here (and force conversion)
if isinstance(data, ABCSparseArray):
Expand Down
25 changes: 25 additions & 0 deletions pandas/rpy/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,31 @@ def test_factor(self):
result = com.load_data(name)
assert np.equal(result, factors)

def test_pandas_constructor_compat(self):
    """
    Check that the pandas object constructors accept an rpy2 SexpVector.

    For each of Series, DataFrame and Panel an R array of matching
    dimensionality is built and passed straight to the constructor; the
    resulting values must match what numpy makes of the same R object.
    """
    constructors = (pd.Series, pd.DataFrame, pd.Panel)
    rnorm = r['rnorm']
    r_template = "test_arr = rnorm({N}); dim(test_arr) = c({shape});test_arr"

    for typ in constructors:
        ndim = typ._AXIS_LEN
        shape = ndim * [10]
        N = 10 ** ndim

        # create array on the R side
        dims = ','.join(str(extent) for extent in shape)
        test_arr = r(r_template.format(N=N, shape=dims))

        # sanity check: numpy itself reads the array interface correctly
        npy_arr = np.array(test_arr)
        assert npy_arr.ndim == ndim
        assert npy_arr.size == N

        # the pandas constructor must arrive at the same values
        assert isinstance(test_arr, robj.SexpVector)
        pobj = typ(test_arr)
        tm.assert_almost_equal(pobj.values, np.array(test_arr))
        tm.assert_almost_equal(pobj.values, npy_arr)

if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
# '--with-coverage', '--cover-package=pandas.core'],
Expand Down
65 changes: 64 additions & 1 deletion pandas/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def test_get_callable_name():
from functools import partial
getname = com._get_callable_name

def fn(x):
def fn(x):
return x
lambda_ = lambda x: x
part1 = partial(fn)
Expand Down Expand Up @@ -890,6 +890,69 @@ def test_2d_datetime64(self):
expected[:, [2, 4]] = datetime(2007, 1, 1)
tm.assert_almost_equal(result, expected)

class FakeArrArray(object):
    """
    Test double that exposes its data only through ``__array__``.

    Parameters
    ----------
    arr : object
        Wrapped object; its own ``__array__`` method is delegated to.
    """

    def __init__(self, arr):
        self.arr = arr

    def __array__(self, dtype=None, copy=None):
        """
        Return the wrapped data as an array.

        Parameters
        ----------
        dtype : data-type, optional
            Forwarded to the wrapped object's ``__array__`` when given, so
            that ``np.asarray(obj, dtype=...)`` works.
        copy : bool, optional
            When true, a copy of the result is returned. ``None`` and
            ``False`` return whatever the wrapped object hands back.

        Returns
        -------
        The result of the wrapped object's ``__array__``.
        """
        # Calling with no arguments stays on the original code path.
        if dtype is None:
            out = self.arr.__array__()
        else:
            out = self.arr.__array__(dtype)
        if copy:
            out = out.copy()
        return out

class FakeArrInterface(object):
    """
    Test double that exposes its data only through ``__array_interface__``.

    Parameters
    ----------
    arr : object
        Wrapped object whose ``__array_interface__`` is re-exported.
    """

    def __init__(self, arr):
        self.arr = arr

    # read-only attribute that forwards to the wrapped object on each access
    __array_interface__ = property(
        lambda self: self.arr.__array_interface__)

class FakeArrStruct(object):
    """
    Test double that exposes its data only through ``__array_struct__``.

    Parameters
    ----------
    arr : object
        Wrapped object whose ``__array_struct__`` is re-exported.
    """

    def __init__(self, arr):
        self.arr = arr

    # read-only attribute that forwards to the wrapped object on each access
    __array_struct__ = property(
        lambda self: self.arr.__array_struct__)

def test_is_array_like():
    """
    is_array_like must recognise every flavour of the array interface.

    Interface reference:
    http://docs.scipy.org/doc/numpy/reference/arrays.interface.html

    The wrapper objects are not ndarray subclasses; each exposes exactly
    one of __array__, __array_interface__ and __array_struct__.
    """
    arr = np.arange(10)

    # a real ndarray
    assert com.is_array_like(arr) is True

    # __array__, __array_interface__, __array_struct__ -- in that order
    for wrapper in (FakeArrArray, FakeArrInterface, FakeArrStruct):
        assert com.is_array_like(wrapper(arr)) is True

def test_unhandled_array_interface():
    """
    ndarrays and pandas objects count as handled; objects that only
    implement the array interface count as unhandled.
    """
    arr = np.arange(10)

    # skip the structures we already explicitly handle
    series = Series(arr)
    frame = tm.makeDataFrame()
    for handled in (series, frame, arr):
        assert not com._unhandled_array_interface(handled)

    # __array_interface__ and __array_struct__ -- in that order
    for wrapper in (FakeArrInterface, FakeArrStruct):
        assert com._unhandled_array_interface(wrapper(arr)) is True

if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,41 @@ def f(dtype):
f('float64')
f('M8[ns]')

def test_constructor_array_interface(self):
    """
    Objects that implement the NumPy Array Interface must be accepted
    by the constructor exactly like arrays.
    """
    class FakeArrInterface(object):
        # exposes the wrapped array through __array_interface__ only
        def __init__(self, arr):
            self.arr = arr

        @property
        def __array_interface__(self):
            return self.arr.__array_interface__

    class FakeArrStruct(object):
        # exposes the wrapped array through __array_struct__ only
        def __init__(self, arr):
            self.arr = arr

        @property
        def __array_struct__(self):
            return self.arr.__array_struct__

    shape = [10] * self._ndim

    # fresh random data per wrapper, drawn in the original order
    for wrapper in (FakeArrInterface, FakeArrStruct):
        arr = np.random.randn(*shape)
        wrapped = wrapper(arr)
        pobj = self._typ(wrapped)
        assert_almost_equal(pobj.values, arr)
        assert_almost_equal(pobj.values, np.array(wrapped))


def check_metadata(self, x, y=None):
for m in x._metadata:
v = getattr(x,m,None)
Expand Down