rapidsai · kkraus14 · Jan 7, 2020 · Oct 18, 2019 · Oct 21, 2019 · Oct 22, 2019
diff --git a/.gitignore b/.gitignore
@@ -23,6 +23,9 @@ python/*/build
 python/cudf/*/_lib/**/*.cpp
 python/cudf/*/_lib/**/*.h
 python/cudf/*/_lib/.nfs*
+python/cudf/*/_libxx/**/*.cpp
+python/cudf/*/_libxx/**/*.h
+python/cudf/*/_libxx/.nfs*
 python/cudf/*.ipynb
 python/cudf/.ipynb_checkpoints
 python/*/record.txt

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 - PR #3224 Define and implement new join APIs.
 - PR #3284 Add gpu-accelerated parquet writer
+- PR #3254 Python redesign for libcudf++
 - PR #3336 Add `from_dlpack` and `to_dlpack`
 - PR #3555 Add column names support to libcudf++ io readers and writers
 - PR #3610 Add memory_usage to DataFrame and Series APIs

@@ -27,6 +27,7 @@ requirements:
     - dlpack
     - pyarrow 0.15.0.*
     - libcudf {{ version }}
+    - rmm {{ minor_version }}.*
   run:
     - python
     - pandas>=0.24.2,<0.25

@@ -108,7 +108,7 @@ class table_view_base {
    *
    * @throws std::out_of_range
    * If `column_index` is out of the range [0, num_columns)
-   * 
+   *
    * @param column_index The index of the desired column
    * @return A reference to the desired column
    *---------------------------------------------------------------------------**/
@@ -124,7 +124,7 @@ class table_view_base {
    *---------------------------------------------------------------------------**/
   size_type num_rows() const noexcept { return _num_rows; }
 
-  table_view_base() = delete;
+  table_view_base() = default;
 
   ~table_view_base() = default;
 
@@ -148,6 +148,8 @@ class table_view : public detail::table_view_base<column_view> {
 public:
   using ColumnView = column_view;
 
+  table_view() = default;
+
   /**---------------------------------------------------------------------------*
    * @brief Construct a table from a vector of table views
    *
@@ -193,6 +195,8 @@ class mutable_table_view : public detail::table_view_base<mutable_column_view> {
 public:
   using ColumnView = mutable_column_view;
 
+  mutable_table_view() = default;
+
   mutable_column_view& column(size_type column_index) const {
     return const_cast<mutable_column_view&>(table_view_base::column(column_index));
   }

@@ -7,17 +7,19 @@
 
 from cudf._lib.cudf cimport *
 from cudf._lib.cudf import *
+from cudf._libxx.column cimport Column
 from libc.stdlib cimport free
 from libcpp.vector cimport vector
 
 from cudf._lib.includes.concat cimport gdf_column_concat
 
 
-def _column_concat(cols_to_concat, output_col):
+def _column_concat(cols_to_concat, Column output_col):
     cdef gdf_column* c_output_col = column_view_from_column(output_col)
     cdef vector[gdf_column*] c_input_cols
     cdef int num_cols = len(cols_to_concat)
 
+    cdef Column col
     for col in cols_to_concat:
         c_input_cols.push_back(column_view_from_column(col))
 

@@ -5,12 +5,13 @@
 # cython: embedsignature = True
 # cython: language_level = 3
 
+import cudf
 from cudf.core.buffer import Buffer
 from cudf._lib.cudf cimport *
 from cudf._lib.cudf import *
 
 import cudf.utils.utils as utils
-from cudf.utils.dtypes import is_string_dtype
+from cudf.utils.dtypes import is_string_dtype, is_categorical_dtype
 from cudf._lib.utils cimport (
     columns_from_table,
     table_from_columns,
@@ -48,7 +49,7 @@ def clone_columns_with_size(in_cols, row_size):
     for col in in_cols:
         o_col = column.column_empty_like(col,
                                          dtype=col.dtype,
-                                         masked=col.has_null_mask,
+                                         masked=col.mask,
                                          newsize=row_size)
         out_cols.append(o_col)
 
@@ -60,7 +61,7 @@ def _normalize_maps(maps, size):
 
     maps = column.as_column(maps).astype("int32")
     maps = maps.binary_operator("mod", np.int32(size))
-    maps = maps.data.mem
+    maps = maps.data_array_view
     return maps
 
 
@@ -88,10 +89,7 @@ def gather(source, maps, bounds_check=True):
     for i, in_col in enumerate(in_cols):
         in_cols[i] = column.as_column(in_cols[i])
 
-    if is_string_dtype(in_cols[0]):
-        in_size = in_cols[0].data.size()
-    else:
-        in_size = in_cols[0].data.size
+    in_size = in_cols[0].size
 
     maps = column.as_column(maps)
 
@@ -110,11 +108,12 @@ def gather(source, maps, bounds_check=True):
 
     for i, in_col in enumerate(in_cols):
         if isinstance(in_col, CategoricalColumn):
-            out_cols[i] = CategoricalColumn(
-                data=out_cols[i].data,
-                mask=out_cols[i].mask,
+            out_cols[i] = column.build_categorical_column(
                 categories=in_col.cat().categories,
-                ordered=in_col.cat().ordered)
+                codes=out_cols[i],
+                mask=out_cols[i].mask,
+                ordered=in_col.cat().ordered
+            )
 
     free_column(c_maps)
     free_table(c_in_table)
@@ -163,6 +162,15 @@ def scatter(source, maps, target, bounds_check=True):
 
     result_cols = columns_from_table(&c_result_table)
 
+    for i, in_col in enumerate(target_cols):
+        if is_categorical_dtype(in_col.dtype):
+            result_cols[i] = column.build_categorical_column(
+                categories=in_col.cat().categories,
+                codes=result_cols[i],
+                mask=result_cols[i].mask,
+                ordered=in_col.cat().ordered
+            )
+
     del c_source_table
     del c_target_table
 
@@ -189,7 +197,8 @@ def copy_column(input_col):
 
 def copy_range(out_col, in_col, int out_begin, int out_end,
                int in_begin):
-    from cudf.core.column import Column
+
+    from cudf.core.column import as_column
 
     if abs(out_end - out_begin) <= 1:
         return out_col
@@ -202,17 +211,15 @@ def copy_range(out_col, in_col, int out_begin, int out_end,
     if out_begin > out_end:
         return out_col
 
-    if out_col.null_count == 0 and in_col.has_null_mask:
+    if not out_col.has_nulls and in_col.nullable:
         mask = utils.make_mask(len(out_col))
         cudautils.fill_value(mask, 0xff)
-        out_col._mask = Buffer(mask)
-        out_col._null_count = 0
+        out_col.mask = Buffer(mask)
 
-    if in_col.null_count == 0 and out_col.has_null_mask:
+    if not in_col.has_nulls and out_col.nullable:
         mask = utils.make_mask(len(in_col))
         cudautils.fill_value(mask, 0xff)
-        in_col._mask = Buffer(mask)
-        in_col._null_count = 0
+        in_col.mask = Buffer(mask)
 
     cdef gdf_column* c_out_col = column_view_from_column(out_col)
     cdef gdf_column* c_in_col = column_view_from_column(in_col)
@@ -224,20 +231,23 @@ def copy_range(out_col, in_col, int out_begin, int out_end,
                        out_end,
                        in_begin)
 
-    out_col._update_null_count(c_out_col.null_count)
-
     if is_string_dtype(out_col) and len(out_col) > 0:
-        update_nvstrings_col(
-            out_col,
-            <uintptr_t>c_out_col.dtype_info.category)
+        nvcat_ptr = int(<uintptr_t>c_out_col.dtype_info.category)
+        nvcat_obj = None
+        if nvcat_ptr:
+            nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
+            nvstr_obj = nvcat_obj.to_strings()
+        else:
+            nvstr_obj = nvstrings.to_device([])
+        out_col = as_column(nvstr_obj)
 
     free_column(c_in_col)
     free_column(c_out_col)
 
     return out_col
 
 
-def scatter_to_frames(source, maps, index=None):
+def scatter_to_frames(source, maps, index=None, names=None, index_names=None):
     """
     Scatters rows to 'n' dataframes according to maps
 
@@ -251,39 +261,37 @@ def scatter_to_frames(source, maps, index=None):
     -------
     list of scattered dataframes
     """
-    from cudf.core.column import column, CategoricalColumn
+    from cudf.core.column import column, build_column, build_categorical_column
     from cudf.core.series import Series
-
     in_cols = source
+
     if index:
-        ind_names = [ind.name for ind in index]
-        ind_names_tmp = [(ind_name or "_tmp_index") for ind_name in ind_names]
+        ind_names_tmp = [(ind_name or "_tmp_index")
+                         for ind_name in index_names]
         for i in range(len(index)):
-            index[i].name = ind_names_tmp[i]
             in_cols.append(index[i])
+            names.append(ind_names_tmp[i])
+
     col_count=len(in_cols)
     if col_count == 0:
         return []
 
     cats = {}
     for i, in_col in enumerate(in_cols):
         in_cols[i] = column.as_column(in_cols[i])
-        if isinstance(in_cols[i], CategoricalColumn):
-            cats[in_cols[i].name] = (
-                Series(in_cols[i]._categories),
-                in_cols[i]._ordered
+        if is_categorical_dtype(in_cols[i]):
+            cats[names[i]] = (
+                Series(in_cols[i].categories),
+                in_cols[i].ordered
             )
 
-    if is_string_dtype(in_cols[0]):
-        in_size = in_cols[0].data.size()
-    else:
-        in_size = in_cols[0].data.size
+    in_size = in_cols[0].size
 
     maps = column.as_column(maps).astype("int32")
     gather_count = len(maps)
     assert(gather_count == in_size)
 
-    cdef gdf_column** c_in_cols = cols_view_from_cols(in_cols)
+    cdef gdf_column** c_in_cols = cols_view_from_cols(in_cols, names)
     cdef cudf_table* c_in_table = new cudf_table(c_in_cols, col_count)
     cdef gdf_column* c_maps = column_view_from_column(maps)
     cdef vector[cudf_table] c_out_tables
@@ -295,18 +303,22 @@ def scatter_to_frames(source, maps, index=None):
     for tab in c_out_tables:
         df = table_to_dataframe(&tab, int_col_names=False)
         for name, cat_info in cats.items():
+            # Re-wrap the scattered column as categorical codes.
+            if is_categorical_dtype(df[name].dtype):
+                data_dtype = df[name].codes.dtype
+            else:
+                data_dtype = df[name].dtype
             df[name] = Series(
-                CategoricalColumn(
-                    data=df[name].data,
+                build_categorical_column(
                     categories=cat_info[0],
-                    ordered=cat_info[1],
+                    codes=df[name]._column,
+                    ordered=cat_info[1]
                 )
             )
-
         if index:
             df = df.set_index(ind_names_tmp)
             if len(index) == 1:
-                df.index.name = ind_names[0]
+                df.index.name = index_names[0]
         out_tables.append(df)
 
     free_table(c_in_table, c_in_cols)

@@ -298,7 +298,7 @@ cpdef write_csv(
             if col_name not in cols:
                 raise NameError('column {!r} does not exist in DataFrame'
                                 .format(col_name))
-            col = cols[col_name]._column
+            col = cols[col_name]
             check_gdf_compatibility(col)
             # Workaround for string columns
             if col.dtype.type == np.object_:
@@ -308,7 +308,6 @@ cpdef write_csv(
             list_cols.push_back(c_col)
     else:
         for idx, (col_name, col) in enumerate(cols.items()):
-            col = col._column
             check_gdf_compatibility(col)
             # Workaround for string columns
             if col.dtype.type == np.object_:

@@ -17,6 +17,8 @@ from libc.stdint cimport (  # noqa: E211
 )
 from libcpp.vector cimport vector
 
+from cudf._libxx.column cimport Column
+
 # Utility functions to build gdf_columns, gdf_context and error handling
 
 cpdef get_ctype_ptr(obj)
@@ -31,20 +33,12 @@ cdef np_dtype_from_gdf_column(gdf_column* col)
 
 cdef get_scalar_value(gdf_scalar scalar, dtype)
 
-cdef gdf_column* column_view_from_column(col, col_name=*) except? NULL
-cdef gdf_column* column_view_from_NDArrays(
-    size,
-    data,
-    mask,
-    dtype,
-    null_count
-) except? NULL
+cdef gdf_column* column_view_from_column(Column col, col_name=*) except? NULL
 cdef gdf_scalar* gdf_scalar_from_scalar(val, dtype=*) except? NULL
-cdef gdf_column_to_column(gdf_column* c_col, int_col_name=*)
-cdef gdf_column_to_column_mem(gdf_column* input_col)
-cdef update_nvstrings_col(col, uintptr_t category_ptr)
-cdef gdf_column* column_view_from_string_column(col, col_name=*) except? NULL
-cdef gdf_column** cols_view_from_cols(cols) except ? NULL
+cdef Column gdf_column_to_column(gdf_column* c_col)
+cdef gdf_column* column_view_from_string_column(Column col,
+                                                col_name=*) except? NULL
+cdef gdf_column** cols_view_from_cols(cols, names=*) except ? NULL
 cdef free_table(cudf_table* table0, gdf_column** cols=*)
 cdef free_column(gdf_column* c_col)
 
@@ -382,4 +376,6 @@ cdef extern from "cudf/legacy/table.hpp" namespace "cudf" nogil:
 #        gdf_column const* const* end() const
 #        gdf_column const* get_column(size_type index) const except +
 
-cpdef gdf_dtype gdf_dtype_from_value(col, dtype=*) except? GDF_invalid
+cdef gdf_dtype gdf_dtype_from_dtype(dtype) except? GDF_invalid
+
+cdef char* py_to_c_str(object py_str) except? NULL