
Fix GPU error messages for fuzzy deduplication #387

Open · wants to merge 10 commits into main
3 changes: 3 additions & 0 deletions nemo_curator/datasets/doc_dataset.py
@@ -38,6 +38,9 @@ def __len__(self) -> int:
def persist(self) -> "DocumentDataset":
return DocumentDataset(self.df.persist())

def to_backend(self, backend: Optional[str] = None) -> "DocumentDataset":
return DocumentDataset(self.df.to_backend(backend))

@wraps(dd.DataFrame.repartition)
def repartition(self, *args, **kwargs) -> "DocumentDataset":
return self.__class__(self.df.repartition(*args, **kwargs))
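A minimal usage sketch of the new to_backend pass-through. It forwards the backend string to dask.dataframe.DataFrame.to_backend, whose registered GPU backend (provided by dask_cudf) is named "cudf"; the error messages added elsewhere in this PR refer to .to_backend("gpu"), so the exact accepted string may depend on the final implementation. The file path below is illustrative.

import dask.dataframe as dd
from nemo_curator.datasets import DocumentDataset

# Illustrative path; start with a CPU (pandas-backed) Dask DataFrame.
dataset = DocumentDataset(dd.read_parquet("/path/to/docs.parquet"))

# Move the underlying Dask DataFrame to the cuDF backend before running
# GPU-only modules such as MinHash, LSH, or FuzzyDuplicates.
gpu_dataset = dataset.to_backend("cudf")

# Convert back to pandas for CPU-only stages if needed.
cpu_dataset = gpu_dataset.to_backend("pandas")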
6 changes: 6 additions & 0 deletions nemo_curator/modules/fuzzy_dedup/bucketstoedges.py
@@ -124,6 +124,12 @@ def buckets_to_edges(
return result_df

def __call__(self, dataset: DocumentDataset) -> DocumentDataset:
if "cudf" not in str(type(dataset.df)):
Collaborator:

Since this pattern is repeated, it might be better to move it to a separate helper (e.g. checks.py) so it can be re-used in different places.

Something like ensure_cudf(dataset, module="buckets_to_edges"), which would print something like:

A Dask DataFrame with a cuDF backend is required to run 'buckets_to_edges'. Please convert using....

Collaborator (Author):

Thanks, I see what you mean, but for now I'm not sure whether it's worth adding a new script for this. I will ask others to review as well, thank you.


raise TypeError(
"Dask-cuDF DataFrame is required to run buckets to edges. "
'Please convert your DocumentDataset by using .to_backend("gpu").'
)

buckets_df = dataset.df
self._logger.info(f"Starting conversion of LSH Buckets to Graph Edgelist")
if len(self.id_fields) > 1:
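The helper proposed in the review thread above is not part of this PR; a rough sketch of what it could look like, assuming it lives in a shared module such as checks.py and raises a TypeError to match the checks added here:

from nemo_curator.datasets import DocumentDataset


def ensure_cudf(dataset: DocumentDataset, module: str) -> None:
    # Hypothetical helper based on the review suggestion; not part of this PR.
    if "cudf" not in str(type(dataset.df)):
        raise TypeError(
            f"A Dask DataFrame with a cuDF backend is required to run '{module}'. "
            'Please convert your DocumentDataset by using .to_backend("gpu").'
        )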
11 changes: 9 additions & 2 deletions nemo_curator/modules/fuzzy_dedup/connectedcomponents.py
@@ -58,7 +58,7 @@ def __init__(
else:
self._logger = logger

def cc_workflow(self, output_path):
def __call__(self, output_path):
deduped_parsed_id_path = self._write_dedup_parsed_id()
encoded_jaccard_pair_path = self._write_encoded_jaccard_pair(
deduped_parsed_id_path
@@ -81,8 +81,15 @@ def _run_connected_components(
with performance_report_if_with_ts_suffix(
self.profile_dir, "connected-components-run"
):
try:
Comms.initialize(p2p=False)
except ValueError:
raise TypeError(
"A GPU-based Dask client is required to run connected components. "
'Please initialize your client with get_client(cluster_type="gpu") '
"or with a LocalCUDACluster."
)

Comms.initialize(p2p=False)
df = dask_cudf.read_parquet(
deduped_encoded_jaccard_path, blocksize="1GB", aggregate_files=True
)
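For context, a sketch of how a GPU-based Dask client can be set up so that Comms.initialize succeeds; get_client is the helper referenced in the new error message, and LocalCUDACluster is the manual alternative it mentions.

from nemo_curator.utils.distributed_utils import get_client

# Option 1: let NeMo Curator start a GPU cluster (typically one worker per visible GPU).
client = get_client(cluster_type="gpu")

# Option 2: manage the CUDA cluster directly with dask_cuda.
# from dask.distributed import Client
# from dask_cuda import LocalCUDACluster
# client = Client(LocalCUDACluster())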
29 changes: 18 additions & 11 deletions nemo_curator/modules/fuzzy_dedup/fuzzyduplicates.py
@@ -141,6 +141,11 @@ def __call__(self, dataset: DocumentDataset):
DocumentDataset containing IDs of all documents and the corresponding duplicate group
they belong to. Documents in the same group are near duplicates.
"""
if "cudf" not in str(type(dataset.df)):
raise TypeError(
"Dask-cuDF DataFrame is required to run fuzzy deduplication. "
'Please convert your DocumentDataset by using .to_backend("gpu").'
)

# Minhash + LSH
stage_num = 1
@@ -175,14 +180,14 @@ def __call__(self, dataset: DocumentDataset):
mapped_buckets_w_anchors_path, write_index=False, overwrite=True
)
self._logger.info(
f"Time taken for Map_buckets : {time.time() - t0}s and output written at {mapped_buckets_w_anchors_path}"
f"Time taken for Map_Buckets : {time.time() - t0}s and output written at {mapped_buckets_w_anchors_path}"
)

print(f"Stage {stage_num} (False Postive Check): Map_Buckets Complete!")
print(f"Stage {stage_num} (False Positive Check): Map_Buckets complete!")
stage_num += 1

# Shuffle documents based on mapped buckets
print(f"Stage {stage_num} (False Postive Check): Shuffle docs")
print(f"Stage {stage_num} (False Positive Check): Shuffle Documents")
shuffled_docs_path = os.path.join(
self.config.cache_dir, "shuffled_docs.parquet"
)
Expand All @@ -194,12 +199,14 @@ def __call__(self, dataset: DocumentDataset):
parts_per_worker=self.config.parts_per_worker,
bucket_parts_per_worker=self.config.bucket_parts_per_worker,
)
print(f"Stage {stage_num} (False Postive Check): Shuffle docs complete!")
print(
f"Stage {stage_num} (False Positive Check): Shuffle Documents complete!"
)
stage_num += 1

# jaccard comparison within buckets
print(
f"Stage {stage_num} (False Postive Check): Jaccard Similarity in Buckets"
f"Stage {stage_num} (False Positive Check): Jaccard Similarity in Buckets"
)
jaccard_pairs_path = os.path.join(
self.config.cache_dir, "jaccard_similarity_results.parquet"
@@ -219,11 +226,11 @@ def __call__(self, dataset: DocumentDataset):
overwrite=True,
)
self._logger.info(
f"Time taken for Jaccard Similarity = {time.time()-t0}s and output written at {jaccard_pairs_path}"
f"Time taken for Jaccard Similarity: {time.time()-t0}s and output written at {jaccard_pairs_path}"
)

print(
f"Stage {stage_num} (False Postive Check): Jaccard Similarity in Buckets Complete!"
f"Stage {stage_num} (False Positive Check): Jaccard Similarity in Buckets complete!"
)
stage_num += 1

@@ -232,15 +239,15 @@ def __call__(self, dataset: DocumentDataset):
print(f"Stage {stage_num}: Starting LSH Buckets to Graph Edgelist")
self.buckets_to_edges(buckets_df)
print(
f"Stage {stage_num}: Starting LSH Buckets to Graph Edgelist Complete!"
f"Stage {stage_num}: Starting LSH Buckets to Graph Edgelist complete!"
)
stage_num += 1

# Connected components across buckets
print(f"Stage {stage_num}: Connected Components across buckets")
print(f"Stage {stage_num}: Connected Components Across Buckets")
cc_path = os.path.join(self.config.cache_dir, "connected_components.parquet")
self.connected_components.cc_workflow(cc_path)
print(f"Stage {stage_num}: Connected Components across buckets complete!")
self.connected_components(cc_path)
print(f"Stage {stage_num}: Connected Components Across Buckets complete!")
stage_num += 1

return DocumentDataset(dask_cudf.read_parquet(cc_path, split_row_groups=False))
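An illustrative end-to-end sketch of running the fuzzy-deduplication pipeline on a GPU-backed dataset; the paths, field names, and config values below are assumptions, not taken from this diff.

from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.distributed_utils import get_client

client = get_client(cluster_type="gpu")  # GPU client required by the checks above

config = FuzzyDuplicatesConfig(
    cache_dir="/tmp/fuzzy_dedup_cache",  # illustrative path
    id_field="id",
    text_field="text",
    false_positive_check=True,
)

dataset = DocumentDataset.read_parquet("/path/to/docs.parquet", backend="cudf")
# For an existing CPU-backed dataset, convert first:
# dataset = dataset.to_backend("cudf")

# Returns a DocumentDataset of document IDs and their duplicate-group labels;
# raises the new TypeError if the dataset is not cuDF-backed.
duplicates = FuzzyDuplicates(config=config)(dataset)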
3 changes: 0 additions & 3 deletions nemo_curator/modules/fuzzy_dedup/jaccardsimilarity.py
@@ -37,9 +37,6 @@ def __init__(
self.right_id = f"{self.id_field}_y"
self.ngram_width = ngram_width

def __call__(DocumentDataset):
raise NotImplementedError

def jaccard_compute(self, shuffled_docs_path):
paths = [
entry.path
6 changes: 6 additions & 0 deletions nemo_curator/modules/fuzzy_dedup/lsh.py
@@ -272,6 +272,12 @@ def _write_bucket_parquet(
return wrote_buckets, are_buckets_empty

def __call__(self, dataset: DocumentDataset) -> DocumentDataset:
if "cudf" not in str(type(dataset.df)):
raise TypeError(
"Dask-cuDF DataFrame is required to run locality-sensitive hashing. "
'Please convert your DocumentDataset by using .to_backend("gpu").'
)

df = dataset.df

write_path = os.path.join(self.cache_dir, "_buckets.parquet")
7 changes: 7 additions & 0 deletions nemo_curator/modules/fuzzy_dedup/minhash.py
@@ -169,6 +169,7 @@ def minhash64(
"""
if not isinstance(ser, cudf.Series):
raise TypeError("Expected data of type cudf.Series")

if MINHASH_DEPRECATED_API:
warnings.warn(
"Using an outdated minhash implementation, please update to cuDF version 24.12 "
@@ -202,6 +203,12 @@ def __call__(self, dataset: DocumentDataset) -> Union[str, DocumentDataset]:
-------
DocumentDataset containing IDs of all documents and the corresponding MinHash Signature
"""
if "cudf" not in str(type(dataset.df)):
raise TypeError(
"Dask-cuDF DataFrame is required to run minhashes. "
'Please convert your DocumentDataset by using .to_backend("gpu").'
)

result = dataset.df[[self.id_field]]
result["_minhash_signature"] = dataset.df[self.text_field].map_partitions(
self.minhash_method,
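A small sketch of how the new validation could be exercised in a test, assuming a pandas-backed Dask DataFrame and default MinHash settings; the construction details are illustrative.

import dask.dataframe as dd
import pandas as pd
import pytest

from nemo_curator import MinHash
from nemo_curator.datasets import DocumentDataset

cpu_dataset = DocumentDataset(
    dd.from_pandas(pd.DataFrame({"id": [0, 1], "text": ["foo", "bar"]}), npartitions=1)
)

with pytest.raises(TypeError):
    MinHash()(cpu_dataset)  # pandas backend -> the new error message is raised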
@@ -43,7 +43,7 @@ def main(args):
logger=args.log_dir,
profile_dir=args.profile_path,
)
components_stage.cc_workflow(output_path=output_path)
components_stage(output_path=output_path)
print(f"All done in {time.time()-st:.1f} seconds")
print(f"Results written to {output_path}")

@@ -2737,7 +2737,7 @@
" id_column=id_field,\n",
" jaccard_threshold=jaccard_threshold,\n",
")\n",
"components_stage.cc_workflow(output_path=output_path)\n",
"components_stage(output_path=output_path)\n",
"print(f\"Connected Component took {time.time()-t0} seconds\")"
]
},
@@ -4441,7 +4441,7 @@
" id_column=id_field,\n",
" jaccard_threshold=jaccard_threshold,\n",
")\n",
"components_stage.cc_workflow(output_path=output_path)\n",
"components_stage(output_path=output_path)\n",
"print(f\"Connected Component took {time.time()-t0} seconds\")"
]
},
2 changes: 1 addition & 1 deletion tutorials/single_node_tutorial/single_gpu_tutorial.ipynb
@@ -1752,7 +1752,7 @@
")\n",
"\n",
"#Load and run connected component\n",
"components_stage.cc_workflow(output_path=connected_component_output_path)\n",
"components_stage(output_path=connected_component_output_path)\n",
"print(f\"Time taken for Connected Component: {time.time()-t0} s\")"
]
},
@@ -41,5 +41,5 @@
)

# Load and run connected components
components_stage.cc_workflow(output_path=connected_component_output_path)
components_stage(output_path=connected_component_output_path)
logging.info(f"Time taken for Connected Components: {time.time() - t0:.2f} s")