From 5bee82566923468a1eaddfaa3fe8ca84f456fa99 Mon Sep 17 00:00:00 2001 From: Isaiah Norton Date: Wed, 7 Oct 2020 00:21:15 -0400 Subject: [PATCH] Add support for 'capacity' kwarg in from_csv and from_pandas --- HISTORY.md | 1 + tiledb/dataframe_.py | 8 +++++++- tiledb/tests/test_pandas_dataframe.py | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index 48f3cf7486..77f59ccc57 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -16,6 +16,7 @@ * Added ".df[]" indexer tiledb.Array: directly returns a Pandas dataframe from a query (uses `multi_index` indexing behavior) [#390](https://github.com/TileDB-Inc/TileDB-Py/pull/389) * Added wrapping and support for TileDB checksumming filters: `ChecksumMD5Filter` and `ChecksumSHA256Filter` [#389](https://github.com/TileDB-Inc/TileDB-Py/pull/389) * Removed TBB install from default setup.py, corresponding to TileDB Embedded changes [#389](https://github.com/TileDB-Inc/TileDB-Py/pull/389) +* Add support for 'capacity' kwarg to `from_csv`/`from_pandas` [#391](https://github.com/TileDB-Inc/TileDB-Py/pull/391) ## Misc Updates * Added round-trip tests for all filter `repr` objects [#389](https://github.com/TileDB-Inc/TileDB-Py/pull/389) diff --git a/tiledb/dataframe_.py b/tiledb/dataframe_.py index e11a44ef25..83c8efde43 100644 --- a/tiledb/dataframe_.py +++ b/tiledb/dataframe_.py @@ -33,6 +33,7 @@ 'row_start_idx': None, 'fillna': None, 'column_types': None, + 'capacity': None, 'date_spec': None, 'cell_order': 'row-major', 'tile_order': 'row-major', @@ -342,6 +343,7 @@ def from_pandas(uri, dataframe, **kwargs): attrs_filters = args.get('attrs_filters', None) coords_filters = args.get('coords_filters', None) full_domain = args.get('full_domain', False) + capacity = args.get('capacity', False) tile = args.get('tile', None) nrows = args.get('nrows', None) row_start_idx = args.get('row_start_idx', None) @@ -359,6 +361,9 @@ def from_pandas(uri, dataframe, **kwargs): elif mode != 'ingest': raise 
TileDBError("Invalid mode specified ('{}')".format(mode)) + if capacity is None: + capacity = 0 # this will use the libtiledb internal default + if ctx is None: ctx = tiledb.default_ctx() @@ -398,6 +403,7 @@ def from_pandas(uri, dataframe, **kwargs): tile_order=tile_order, coords_filters=coords_filters, allows_duplicates=allows_duplicates, + capacity=capacity, sparse=sparse ) @@ -523,7 +529,7 @@ def from_csv(uri, csv_file, **kwargs): * ``attrs_filters``: FilterList to apply to all Attributes * ``coords_filters``: FilterList to apply to all coordinates (Dimensions) * ``sparse``: (default True) Create sparse schema - * ``tile``: Schema tiling (capacity) + * ``capacity``: Schema tiling (capacity) * ``date_spec``: Dictionary of {``column_name``: format_spec} to apply to date/time columns which are not correctly inferred by pandas 'parse_dates'. Format must be specified using the Python format codes: diff --git a/tiledb/tests/test_pandas_dataframe.py b/tiledb/tests/test_pandas_dataframe.py index 883e6a2e7e..d109b100c9 100644 --- a/tiledb/tests/test_pandas_dataframe.py +++ b/tiledb/tests/test_pandas_dataframe.py @@ -382,6 +382,7 @@ def test_csv_schema_only(self): index_col=['time', 'double_range'], parse_dates=['time'], mode='schema_only', + capacity=1001, coords_filters=coords_filters) t0, t1 = df.time.min(), df.time.max() @@ -399,6 +400,7 @@ def test_csv_schema_only(self): coords_filters=coords_filters, cell_order='row-major', tile_order='row-major', + capacity=1001, sparse=True, allows_duplicates=False) # note: filters omitted