Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add support for 'capacity' as kwarg and 'tile' as dict of extents (from_csv/pandas) #391

Merged
merged 3 commits into from
Oct 8, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
significantly reduces complications and bugs with python multiprocessing fork mode.
- Support coalescing subarray ranges to give major performance boosts.

## Packaging Notes
## Packaging Notes
* TileDB-Py 0.7 packages on PyPI support macOS 10.13+ and manylinux10-compatible Linux distributions only.
For now, wheels could be produced supporting older systems but without Google Cloud Support; if needed,
please contact us to discuss.
Expand All @@ -16,6 +16,9 @@
* Added ".df[]" indexer tiledb.Array: directly returns a Pandas dataframe from a query (uses `multi_index` indexing behavior) [#390](https://github.com/TileDB-Inc/TileDB-Py/pull/389)
* Added wrapping and support for TileDB checksumming filters: `ChecksumMD5Filter` and `ChecksumSHA256Filter` [#389](https://github.com/TileDB-Inc/TileDB-Py/pull/389)
* Removed TBB install from default setup.py, corresponding to TileDB Embedded changes [#389](https://github.com/TileDB-Inc/TileDB-Py/pull/389)
* Add support for 'capacity' kwarg to `from_csv`/`from_pandas` [#391](https://github.com/TileDB-Inc/TileDB-Py/pull/391)
* Add support for 'tile' kwarg to `from_csv`/`from_pandas` to customize Dim tile extent [#391](https://github.com/TileDB-Inc/TileDB-Py/pull/391)


## Misc Updates
* Added round-trip tests for all filter `repr` objects [#389](https://github.com/TileDB-Inc/TileDB-Py/pull/389)
Expand Down
72 changes: 59 additions & 13 deletions tiledb/dataframe_.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
'row_start_idx': None,
'fillna': None,
'column_types': None,
'capacity': None,
'date_spec': None,
'cell_order': 'row-major',
'tile_order': 'row-major',
Expand Down Expand Up @@ -241,6 +242,17 @@ def create_dims(ctx, dataframe, index_dims,
index_dict = OrderedDict()
index_dtype = None

per_dim_tile = False
if tile is not None:
if isinstance(tile, dict):
per_dim_tile = True

# input check, can't do until after per_dim_tile
if (per_dim_tile and not all(map(lambda x: isinstance(x,(int,float)), tile.values()))) or \
(per_dim_tile is False and not isinstance(tile, int)):
raise ValueError("Invalid tile kwarg: expected int or tuple of ints "
"got '{}'".format(tile))

if isinstance(index, pd.MultiIndex):
for name in index.names:
index_dict[name] = dataframe.index.get_level_values(name)
Expand All @@ -257,12 +269,22 @@ def create_dims(ctx, dataframe, index_dims,
else:
raise ValueError("Unhandled index type {}".format(type(index)))

dim_types = list(
dim_info_for_column(ctx, dataframe, values,
tile=tile, full_domain=full_domain,
index_dtype=index_dtype)
for values in index_dict.values()
)
# create list of dim types
# we need to know all the types in order to validate before creating Dims
dim_types = list()
for idx,(name, values) in enumerate(index_dict.items()):
if per_dim_tile and name in tile:
dim_tile = tile[name]
elif per_dim_tile:
# in this case we fall back to the default
dim_tile = None
else:
# in this case we use a scalar (type-checked earlier)
dim_tile = tile

dim_types.append(dim_info_for_column(ctx, dataframe, values,
tile=dim_tile, full_domain=full_domain,
index_dtype=index_dtype))

if any([d.dtype in (np.bytes_, np.unicode_) for d in dim_types]):
if sparse is False:
Expand All @@ -279,17 +301,34 @@ def create_dims(ctx, dataframe, index_dims,

ndim = len(dim_types)

dims = list(
dim_for_column(ctx, name, dim_types[i], values,
tile=tile, full_domain=full_domain, ndim=ndim)
for i, (name, values) in enumerate(index_dict.items())
)
dims = list()
for idx, (name, values) in enumerate(index_dict.items()):
if per_dim_tile and name in tile:
dim_tile = tile[name]
elif per_dim_tile:
# in this case we fall back to the default
dim_tile = None
else:
# in this case we use a scalar (type-checked earlier)
dim_tile = tile

dims.append(dim_for_column(ctx, name, dim_types[idx], values,
tile=dim_tile, full_domain=full_domain, ndim=ndim))

if index_dims:
for name in index_dims:
if per_dim_tile and name in tile:
dim_tile = tile[name]
elif per_dim_tile:
# in this case we fall back to the default
dim_tile = None
else:
# in this case we use a scalar (type-checked earlier)
dim_tile = tile

col = dataframe[name]
dims.append(
dim_for_column(ctx, dataframe, col.values, name)
dim_for_column(ctx, dataframe, col.values, name, tile=dim_tile)
)

return dims, sparse
Expand Down Expand Up @@ -342,6 +381,7 @@ def from_pandas(uri, dataframe, **kwargs):
attrs_filters = args.get('attrs_filters', None)
coords_filters = args.get('coords_filters', None)
full_domain = args.get('full_domain', False)
capacity = args.get('capacity', False)
tile = args.get('tile', None)
nrows = args.get('nrows', None)
row_start_idx = args.get('row_start_idx', None)
Expand All @@ -359,6 +399,9 @@ def from_pandas(uri, dataframe, **kwargs):
elif mode != 'ingest':
raise TileDBError("Invalid mode specified ('{}')".format(mode))

if capacity is None:
capacity = 0 # this will use the libtiledb internal default

if ctx is None:
ctx = tiledb.default_ctx()

Expand Down Expand Up @@ -398,6 +441,7 @@ def from_pandas(uri, dataframe, **kwargs):
tile_order=tile_order,
coords_filters=coords_filters,
allows_duplicates=allows_duplicates,
capacity=capacity,
sparse=sparse
)

Expand Down Expand Up @@ -523,7 +567,9 @@ def from_csv(uri, csv_file, **kwargs):
* ``attrs_filters``: FilterList to apply to all Attributes
* ``coords_filters``: FilterList to apply to all coordinates (Dimensions)
* ``sparse``: (default True) Create sparse schema
* ``tile``: Schema tiling (capacity)
* ``tile``: Dimension tiling: accepts either a single int (applied to every dimension) or a
  dict mapping dimension name to int tile extent, applied to the generated ArraySchema.
* ``capacity``: Schema capacity
* ``date_spec``: Dictionary of {``column_name``: format_spec} to apply to date/time
columns which are not correctly inferred by pandas 'parse_dates'.
Format must be specified using the Python format codes:
Expand Down
13 changes: 12 additions & 1 deletion tiledb/tests/test_pandas_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,11 +377,21 @@ def test_csv_schema_only(self):
# the arg is correctly parsed/passed
coords_filters = tiledb.FilterList([tiledb.ZstdFilter(7)])

tmp_assert_dir = os.path.join(tmp_dir, "array")
# this should raise an error
with self.assertRaises(ValueError):
tiledb.from_csv(tmp_assert_dir, tmp_csv, tile=1.0)

with self.assertRaises(ValueError):
tiledb.from_csv(tmp_assert_dir, tmp_csv, tile=(3,1.0))

tmp_array = os.path.join(tmp_dir, "array")
tiledb.from_csv(tmp_array, tmp_csv,
index_col=['time', 'double_range'],
parse_dates=['time'],
mode='schema_only',
capacity=1001,
tile={'time': 5},
coords_filters=coords_filters)

t0, t1 = df.time.min(), df.time.max()
Expand All @@ -390,7 +400,7 @@ def test_csv_schema_only(self):
ref_schema = tiledb.ArraySchema(
domain=tiledb.Domain(*[
tiledb.Dim(name='time', domain=(t0.to_datetime64(), t1.to_datetime64()),
tile=1000, dtype='datetime64[ns]'),
tile=5, dtype='datetime64[ns]'),
tiledb.Dim(name='double_range', domain=(-1000.0, 1000.0), tile=1000, dtype='float64'),
]),
attrs=[
Expand All @@ -399,6 +409,7 @@ def test_csv_schema_only(self):
coords_filters=coords_filters,
cell_order='row-major',
tile_order='row-major',
capacity=1001,
sparse=True,
allows_duplicates=False)
# note: filters omitted
Expand Down