Add support for 'capacity' kwarg in from_csv and from_pandas

TileDB-Inc · Oct 7, 2020 · 5bee825 · 5bee825
1 parent f240836
commit 5bee825
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 1 deletion.
diff --git a/HISTORY.md b/HISTORY.md
@@ -16,6 +16,7 @@
 * Added ".df[]" indexer tiledb.Array: directly returns a Pandas dataframe from a query (uses `multi_index` indexing behavior) [#390](https://github.com/TileDB-Inc/TileDB-Py/pull/389)
 * Added wrapping and support for TileDB checksumming filters: `ChecksumMD5Filter` and `ChecksumSHA256Filter` [#389](https://github.com/TileDB-Inc/TileDB-Py/pull/389)
 * Removed TBB install from default setup.py, corresponding to TileDB Embedded changes [#389](https://github.com/TileDB-Inc/TileDB-Py/pull/389)
+* Added support for the `capacity` kwarg in `from_csv`/`from_pandas` [#391](https://github.com/TileDB-Inc/TileDB-Py/pull/391)
 
 ## Misc Updates
 * Added round-trip tests for all filter `repr` objects [#389](https://github.com/TileDB-Inc/TileDB-Py/pull/389)

diff --git a/tiledb/dataframe_.py b/tiledb/dataframe_.py
@@ -33,6 +33,7 @@
     'row_start_idx': None,
     'fillna': None,
     'column_types': None,
+    'capacity': None,
     'date_spec': None,
     'cell_order': 'row-major',
     'tile_order': 'row-major',
@@ -342,6 +343,7 @@ def from_pandas(uri, dataframe, **kwargs):
     attrs_filters = args.get('attrs_filters', None)
     coords_filters = args.get('coords_filters', None)
     full_domain = args.get('full_domain', False)
+    capacity = args.get('capacity', False)
     tile = args.get('tile', None)
     nrows = args.get('nrows', None)
     row_start_idx = args.get('row_start_idx', None)
@@ -359,6 +361,9 @@ def from_pandas(uri, dataframe, **kwargs):
         elif mode != 'ingest':
             raise TileDBError("Invalid mode specified ('{}')".format(mode))
 
+    if capacity is None:
+        capacity = 0 # this will use the libtiledb internal default
+
     if ctx is None:
         ctx = tiledb.default_ctx()
 
@@ -398,6 +403,7 @@ def from_pandas(uri, dataframe, **kwargs):
             tile_order=tile_order,
             coords_filters=coords_filters,
             allows_duplicates=allows_duplicates,
+            capacity=capacity,
             sparse=sparse
         )
 
@@ -523,7 +529,7 @@ def from_csv(uri, csv_file, **kwargs):
             * ``attrs_filters``: FilterList to apply to all Attributes
             * ``coords_filters``: FilterList to apply to all coordinates (Dimensions)
             * ``sparse``: (default True) Create sparse schema
-            * ``tile``: Schema tiling (capacity)
+            * ``capacity``: Schema capacity (number of cells per data tile in a sparse array)
             * ``date_spec``: Dictionary of {``column_name``: format_spec} to apply to date/time
               columns which are not correctly inferred by pandas 'parse_dates'.
               Format must be specified using the Python format codes:

diff --git a/tiledb/tests/test_pandas_dataframe.py b/tiledb/tests/test_pandas_dataframe.py
@@ -382,6 +382,7 @@ def test_csv_schema_only(self):
                         index_col=['time', 'double_range'],
                         parse_dates=['time'],
                         mode='schema_only',
+                        capacity=1001,
                         coords_filters=coords_filters)
 
         t0, t1 = df.time.min(), df.time.max()
@@ -399,6 +400,7 @@ def test_csv_schema_only(self):
                         coords_filters=coords_filters,
                         cell_order='row-major',
                         tile_order='row-major',
+                        capacity=1001,
                         sparse=True,
                         allows_duplicates=False)
                         # note: filters omitted