Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add support for 'capacity' as kwarg and 'tile' as dict of extents (from_csv/pandas) #391

Merged
merged 3 commits into from
Oct 8, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
significantly reduces complications and bugs with python multiprocessing fork mode.
- Support coalescing subarray ranges to give major performance boosts.

## Packaging Notes
## Packaging Notes
* TileDB-Py 0.7 packages on PyPI support macOS 10.13+ and manylinux10-compatible Linux distributions only.
For now, wheels could be produced supporting older systems but without Google Cloud Support; if needed,
please contact us to discuss.
Expand All @@ -16,6 +16,9 @@
* Added ".df[]" indexer tiledb.Array: directly returns a Pandas dataframe from a query (uses `multi_index` indexing behavior) [#390](https://github.com/TileDB-Inc/TileDB-Py/pull/389)
* Added wrapping and support for TileDB checksumming filters: `ChecksumMD5Filter` and `ChecksumSHA256Filter` [#389](https://github.com/TileDB-Inc/TileDB-Py/pull/389)
* Removed TBB install from default setup.py, corresponding to TileDB Embedded changes [#389](https://github.com/TileDB-Inc/TileDB-Py/pull/389)
* Add support for 'capacity' kwarg to `from_csv`/`from_pandas` [#391](https://github.com/TileDB-Inc/TileDB-Py/pull/391)
* Add support for 'tile' kwarg to `from_csv`/`from_pandas` to customize Dim tile extent [#391](https://github.com/TileDB-Inc/TileDB-Py/pull/391)


## Misc Updates
* Added round-trip tests for all filter `repr` objects [#389](https://github.com/TileDB-Inc/TileDB-Py/pull/389)
Expand Down
72 changes: 59 additions & 13 deletions tiledb/dataframe_.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
'row_start_idx': None,
'fillna': None,
'column_types': None,
'capacity': None,
'date_spec': None,
'cell_order': 'row-major',
'tile_order': 'row-major',
Expand Down Expand Up @@ -241,6 +242,17 @@ def create_dims(ctx, dataframe, index_dims,
index_dict = OrderedDict()
index_dtype = None

per_dim_tile = False
if tile is not None:
if isinstance(tile, dict):
per_dim_tile = True

# input check, can't do until after per_dim_tile
if (per_dim_tile and not all(map(lambda x: isinstance(x,(int,float)), tile.values()))) or \
(per_dim_tile is False and not isinstance(tile, int)):
raise ValueError("Invalid tile kwarg: expected int or tuple of ints "
"got '{}'".format(tile))

if isinstance(index, pd.MultiIndex):
for name in index.names:
index_dict[name] = dataframe.index.get_level_values(name)
Expand All @@ -257,12 +269,22 @@ def create_dims(ctx, dataframe, index_dims,
else:
raise ValueError("Unhandled index type {}".format(type(index)))

dim_types = list(
dim_info_for_column(ctx, dataframe, values,
tile=tile, full_domain=full_domain,
index_dtype=index_dtype)
for values in index_dict.values()
)
# create list of dim types
# we need to know all the types in order to validate before creating Dims
dim_types = list()
for idx,(name, values) in enumerate(index_dict.items()):
if per_dim_tile and name in tile:
dim_tile = tile[name]
elif per_dim_tile:
# in this case we fall back to the default
dim_tile = None
else:
# in this case we use a scalar (type-checked earlier)
dim_tile = tile

dim_types.append(dim_info_for_column(ctx, dataframe, values,
tile=dim_tile, full_domain=full_domain,
index_dtype=index_dtype))

if any([d.dtype in (np.bytes_, np.unicode_) for d in dim_types]):
if sparse is False:
Expand All @@ -279,17 +301,34 @@ def create_dims(ctx, dataframe, index_dims,

ndim = len(dim_types)

dims = list(
dim_for_column(ctx, name, dim_types[i], values,
tile=tile, full_domain=full_domain, ndim=ndim)
for i, (name, values) in enumerate(index_dict.items())
)
dims = list()
for idx, (name, values) in enumerate(index_dict.items()):
if per_dim_tile and name in tile:
dim_tile = tile[name]
elif per_dim_tile:
# in this case we fall back to the default
dim_tile = None
else:
# in this case we use a scalar (type-checked earlier)
dim_tile = tile

dims.append(dim_for_column(ctx, name, dim_types[idx], values,
tile=dim_tile, full_domain=full_domain, ndim=ndim))

if index_dims:
for name in index_dims:
if per_dim_tile and name in tile:
dim_tile = tile[name]
elif per_dim_tile:
# in this case we fall back to the default
dim_tile = None
else:
# in this case we use a scalar (type-checked earlier)
dim_tile = tile

col = dataframe[name]
dims.append(
dim_for_column(ctx, dataframe, col.values, name)
dim_for_column(ctx, dataframe, col.values, name, tile=dim_tile)
)

return dims, sparse
Expand Down Expand Up @@ -342,6 +381,7 @@ def from_pandas(uri, dataframe, **kwargs):
attrs_filters = args.get('attrs_filters', None)
coords_filters = args.get('coords_filters', None)
full_domain = args.get('full_domain', False)
capacity = args.get('capacity', False)
tile = args.get('tile', None)
nrows = args.get('nrows', None)
row_start_idx = args.get('row_start_idx', None)
Expand All @@ -359,6 +399,9 @@ def from_pandas(uri, dataframe, **kwargs):
elif mode != 'ingest':
raise TileDBError("Invalid mode specified ('{}')".format(mode))

if capacity is None:
capacity = 0 # this will use the libtiledb internal default

if ctx is None:
ctx = tiledb.default_ctx()

Expand Down Expand Up @@ -398,6 +441,7 @@ def from_pandas(uri, dataframe, **kwargs):
tile_order=tile_order,
coords_filters=coords_filters,
allows_duplicates=allows_duplicates,
capacity=capacity,
sparse=sparse
)

Expand Down Expand Up @@ -523,7 +567,9 @@ def from_csv(uri, csv_file, **kwargs):
* ``attrs_filters``: FilterList to apply to all Attributes
* ``coords_filters``: FilterList to apply to all coordinates (Dimensions)
* ``sparse``: (default True) Create sparse schema
* ``tile``: Schema tiling (capacity)
* ``tile``: Dimension tiling: accepts either a single int (applied to every dimension) or a
  dict mapping dimension name to int tile extent, applied to the generated ArraySchema.
* ``capacity``: Schema capacity
* ``date_spec``: Dictionary of {``column_name``: format_spec} to apply to date/time
columns which are not correctly inferred by pandas 'parse_dates'.
Format must be specified using the Python format codes:
Expand Down
13 changes: 12 additions & 1 deletion tiledb/tests/test_pandas_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,11 +377,21 @@ def test_csv_schema_only(self):
# the arg is correctly parsed/passed
coords_filters = tiledb.FilterList([tiledb.ZstdFilter(7)])

tmp_assert_dir = os.path.join(tmp_dir, "array")
# this should raise an error
with self.assertRaises(ValueError):
tiledb.from_csv(tmp_assert_dir, tmp_csv, tile=1.0)

with self.assertRaises(ValueError):
tiledb.from_csv(tmp_assert_dir, tmp_csv, tile=(3,1.0))

tmp_array = os.path.join(tmp_dir, "array")
tiledb.from_csv(tmp_array, tmp_csv,
index_col=['time', 'double_range'],
parse_dates=['time'],
mode='schema_only',
capacity=1001,
tile={'time': 5},
coords_filters=coords_filters)

t0, t1 = df.time.min(), df.time.max()
Expand All @@ -390,7 +400,7 @@ def test_csv_schema_only(self):
ref_schema = tiledb.ArraySchema(
domain=tiledb.Domain(*[
tiledb.Dim(name='time', domain=(t0.to_datetime64(), t1.to_datetime64()),
tile=1000, dtype='datetime64[ns]'),
tile=5, dtype='datetime64[ns]'),
tiledb.Dim(name='double_range', domain=(-1000.0, 1000.0), tile=1000, dtype='float64'),
]),
attrs=[
Expand All @@ -399,6 +409,7 @@ def test_csv_schema_only(self):
coords_filters=coords_filters,
cell_order='row-major',
tile_order='row-major',
capacity=1001,
sparse=True,
allows_duplicates=False)
# note: filters omitted
Expand Down