Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for query(...).df[] indexing #411

Merged
merged 1 commit into from
Nov 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tiledb/libtiledb.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -1230,6 +1230,7 @@ cdef class Query(object):
cdef object order
cdef DomainIndexer domain_index
cdef object multi_index
cdef object df

cdef class ReadQuery(object):
cdef object _buffers
Expand Down
13 changes: 9 additions & 4 deletions tiledb/libtiledb.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ def stats_dump():
import tiledb.core
print(tiledb.core.python_internal_stats())


cpdef unicode ustring(object s):
"""Coerce a python object to a unicode string"""

Expand Down Expand Up @@ -3633,8 +3633,6 @@ cdef class Array(object):
self.last_fragment_info = dict()
self.meta = Metadata(self)



def __cinit__(self):
self.ptr = NULL

Expand Down Expand Up @@ -4184,8 +4182,9 @@ cdef class Query(object):
self.order = order
self.domain_index = DomainIndexer(array, query=self)
# Delayed to avoid circular import
from .multirange_indexing import MultiRangeIndexer
from .multirange_indexing import MultiRangeIndexer, DataFrameIndexer
self.multi_index = MultiRangeIndexer(array, query=self)
self.df = DataFrameIndexer(array, query=self)

def __getitem__(self, object selection):
return self.array.subarray(selection,
Expand Down Expand Up @@ -4222,6 +4221,12 @@ cdef class Query(object):
"""Apply Array.multi_index with query parameters."""
return self.multi_index

@property
def df(self):
"""Apply Array.multi_index with query parameters and return result
as a Pandas dataframe.

This property hands back the indexer object stored on the query; the
data is read when that object is subscripted, e.g.
``A.query(attrs=[...]).df[start:stop]``.
"""
# NOTE(review): inside this cdef class, `self.df` presumably resolves to the
# C-level `cdef object df` attribute declared in libtiledb.pxd (assigned a
# DataFrameIndexer elsewhere in this class), not to this Python-level
# property, so this should not recurse -- confirm against the Cython docs.
return self.df


# work around https://github.com/cython/cython/issues/2757
def _create_densearray(cls, sta):
Expand Down
1 change: 0 additions & 1 deletion tiledb/multirange_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,6 @@ class DataFrameIndexer(MultiRangeIndexer):
Implements `.df[]` indexing to directly return a dataframe
[] operator uses multi_index semantics.
"""

def __getitem__(self, idx):
from .dataframe_ import _tiledb_result_as_dataframe

Expand Down
16 changes: 15 additions & 1 deletion tiledb/tests/test_pandas_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,9 @@ def test_dataframe_multiindex_dims(self):
df_idx_res = A.df[slice(*ned_time), :]
tm.assert_frame_equal(df_idx_res, df)

# test .df[] indexing with query
df_idx_res = A.query(attrs=['int_vals']).df[slice(*ned_time), :]
tm.assert_frame_equal(df_idx_res, df)

def test_csv_dense(self):
col_size = 10
Expand Down Expand Up @@ -505,6 +508,17 @@ def test_dataframe_csv_chunked(self):
df_idx_res = A.df[int(ned[0]):int(ned[1])]
tm.assert_frame_equal(df_idx_res, df)

# test .df[] indexing with query
df_idx_res = A.query(attrs=['time']).df[int(ned[0]):int(ned[1])]
tm.assert_frame_equal(df_idx_res, df[['time']])

df_idx_res = A.query(attrs=['double_range']).df[int(ned[0]):int(ned[1])]
tm.assert_frame_equal(df_idx_res, df[['double_range']])

# disable coordinate dimension/index
df_idx_res = A.query(coords=False).df[int(ned[0]):int(ned[1])]
tm.assert_frame_equal(df_idx_res, df.reset_index(drop=True))

def test_csv_fillna(self):
col_size = 10
data = np.random.rand(10) * 100 # make some integers for the 2nd test
Expand All @@ -525,7 +539,7 @@ def check_array(path, df):
df['v'][4] = 0

with tiledb.open(path) as A:
df_bk = A.df[:] #pd.DataFrame.from_dict(res)
df_bk = A.df[:]
tm.assert_frame_equal(df_bk, df)

check_array(tmp_array, copy.deepcopy(df))
Expand Down