Fix bugs in retriever sdg notebook (NVIDIA#522)
* fixed qa bug 5008113

* bug fixes for generator

* fixed precommit

* fixed filters

* fixed all issues

* fixed bug with document id

* check if filtering pipeline is present

* fixed notebook

* added functionality to filter pre-generated datasets

* separated generation & filtering pipelines

* fixed pre-commit

* minor changes

* fixed Ryan Wolf's comments

* fixed minor bugs in configs

* removed commented code in main.py

* added CLI flags for generation & filtering, removed code duplication

* minor fix to quickstart notebook

* removed filter.py & generate.py

---------

Signed-off-by: viraman <[email protected]>
Signed-off-by: Vinay Raman <[email protected]>
vinay-raman authored Feb 19, 2025
1 parent d8f99f9 commit a46fb87
Showing 9 changed files with 332 additions and 180 deletions.
51 changes: 32 additions & 19 deletions nemo_curator/filters/synthetic.py
@@ -24,6 +24,15 @@

from nemo_curator.filters.doc_filter import DocumentFilter
from nemo_curator.utils.decorators import batched
from nemo_curator.utils.distributed_utils import NoWorkerError, load_object_on_worker


def create_client(base_url, api_key):
    openai_client = OpenAI(
        base_url=base_url,
        api_key=api_key,
    )
    return openai_client


# ----------------------------------------------------------------------------80
@@ -52,16 +61,21 @@ def __init__(
        self.percentile = percentile
        if truncate:
            self.truncate = truncate
        try:
            self.client = OpenAI(base_url=self.base_url, api_key=self.api_key)
        except Exception as e:
            print(f"Error accessing NIM model: {e}")
        self.batch_size = batch_size
        self.text_fields = text_fields

    @batched
    def score_document(self, df: pd.DataFrame):

        try:
            self.client = load_object_on_worker(
                attr="openai_client_easiness",
                load_object_function=create_client,
                load_object_kwargs={"base_url": self.base_url, "api_key": self.api_key},
            )
        except NoWorkerError:
            return pd.Series(np.ones(len(df)), dtype=float)

        document_score = self._calc_similarity_nim(
            df[self.text_fields[0]].to_list(), df[self.text_fields[1]].to_list()
        )
@@ -90,7 +104,7 @@ def _get_nim_embedding(self, text, input_type):
            print(f"Error: {e}")
            response = None

        if response:
        if response and not isinstance(response, str):
            if isinstance(text, list):
                embeddings = [r.embedding for r in response.data]
            elif isinstance(text, str):
@@ -116,9 +130,6 @@ def _calc_similarity_nim(self, context, question):

        return sim

    def __dask_tokenize__(self):
        return normalize_token(EasinessFilter)


# ----------------------------------------------------------------------------80
# ----------------------- Answerability Filter ---------------------------------
@@ -149,19 +160,24 @@ def __init__(
        self.system_prompt = answerability_system_prompt
        self.user_prompt_template = answerability_user_prompt_template
        self.num_criteria = num_criteria

        try:
            self.client = OpenAI(base_url=self.base_url, api_key=self.api_key)
        except Exception as e:
            print(f"Error accessing NIM model: {e}")

        self.text_fields = text_fields

    @batched
    def score_document(self, df: pd.DataFrame):
        return df.apply(

        try:
            self.client = load_object_on_worker(
                attr="openai_client_answerability",
                load_object_function=create_client,
                load_object_kwargs={"base_url": self.base_url, "api_key": self.api_key},
            )
        except NoWorkerError:
            return pd.Series(["string"] * len(df))

        return df.progress_apply(
            lambda row: self._llm_as_judge(
                row[self.text_fields[0]], row[self.text_fields[1]]
                row[self.text_fields[0]],
                row[self.text_fields[1]],
            ),
            axis=1,
        )
@@ -212,8 +228,5 @@ def _llm_as_judge(self, context: str, question: str):

        return generation

    def __dask_tokenize__(self):
        return normalize_token(AnswerabilityFilter)


# ----------------------------------------------------------------------------80
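The common thread in both filters above is that the OpenAI client is no longer built eagerly in `__init__`; each batch instead obtains (or reuses) a per-worker client via `load_object_on_worker`, falling back to a neutral result when no Dask worker is available (`NoWorkerError`). Below is a minimal sketch of that pattern outside the filter classes; the function name `score_batch`, the cache key `my_filter_client`, and the dummy scores are illustrative only, not part of this commit.

```python
import numpy as np
import pandas as pd
from openai import OpenAI

from nemo_curator.utils.distributed_utils import NoWorkerError, load_object_on_worker


def create_client(base_url, api_key):
    # OpenAI-compatible client pointed at the NIM endpoint.
    return OpenAI(base_url=base_url, api_key=api_key)


def score_batch(df: pd.DataFrame, base_url: str, api_key: str) -> pd.Series:
    try:
        # Create the client once per Dask worker and cache it under `attr`;
        # later batches on the same worker reuse the cached object.
        client = load_object_on_worker(
            attr="my_filter_client",  # illustrative cache key
            load_object_function=create_client,
            load_object_kwargs={"base_url": base_url, "api_key": api_key},
        )
    except NoWorkerError:
        # Not running on a worker (e.g. while the Dask graph is being built):
        # return neutral scores instead of failing.
        return pd.Series(np.ones(len(df)), dtype=float)

    # A real filter would call `client` here to score each row; dummy scores
    # keep the sketch self-contained.
    return pd.Series(np.ones(len(df)), dtype=float)
```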
29 changes: 21 additions & 8 deletions tutorials/nemo-retriever-synthetic-data-generation/README.md
@@ -45,22 +45,35 @@ Navigate to the [quick start notebook](notebooks/quickstart.ipynb) and follow th

### Run Pipeline (CLI)

The pipeline can be run with datasets in rawdoc (only text, title and ids if any) format. To test the pipeline, you can use the provided example data at ```sample_data_rawdoc.jsonl```
The pipeline can be run with datasets in ```jsonl``` format (containing text, title, and optionally ids). To test the pipeline, you can use the provided example data at ```sample_data/sample_data_rawdoc.jsonl```

Navigate to the top level of this project directory and run the following command in your command line. It will take roughly 5-10 minutes.
To use the jsonl format, provide your data in one or more `.jsonl` files. Each record should follow this format: `{"text": <document>, "title": <title>}`. If the documents already have document ids, the input file can also contain them, and the same ids will be persisted in the generated data. In that case, the accepted format is `{"_id": <document_id>, "text": <document>, "title": <title>}`.
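For illustration, a minimal two-record input file could look like this (the ids, titles, and text are placeholders; drop the `_id` field if your documents have no pre-assigned ids):

```
{"_id": "doc-001", "text": "An LLC that hires employees generally needs an EIN.", "title": "EIN requirements"}
{"_id": "doc-002", "text": "A second short document used only to illustrate the format.", "title": "Second document"}
```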

- `Rawdoc format`

To use rawdoc format, provide your data in a `.jsonl` file. The structure of the data should follow this format: `{"text": <document>, "title": <title>}`. Additionally, if the documents already have a document id, the input file can also contain document ids. The same ids will be persisted in the generated data as well. Another accepted format is `{"_id": <document_id>, "text": <document>, "title": <title>}`.
The pipeline can be run in two modes: generation and filtering. To run the full pipeline in generation mode, use the script ```main.py``` with the flag ```--pipeline-type=generate```:
```
python tutorials/nemo-retriever-synthetic-data-generation/main.py \
--api-key=<API Key> \
--input-dir=tutorials/nemo-retriever-synthetic-data-generation/sample_data \
--pipeline-config=tutorials/nemo-retriever-synthetic-data-generation/config/config.yaml \
--input-format=jsonl \
--pipeline-type=generate \
--output-dir=tutorials/nemo-retriever-synthetic-data-generation/outputs/sample_data_rawdoc \
--save-format=jsonl \
--n-partitions=5
```
The data can be saved in two formats (jsonl and beir). Additionally, the user can pass the ```--n-partitions``` flag to speed up generation for large datasets.

In order to run the pipeline, use the script ```main.py```
To filter pre-generated data, run ```main.py``` with ```--pipeline-type=filter```
Note the change in ```--input-dir```: it must point to the previously generated data in jsonl format.
```
python tutorials/nemo-retriever-synthetic-data-generation/main.py \
--api-key=<API Key> \
--input-file=tutorials/nemo-retriever-synthetic-data-generation/data/sample_data_rawdoc.jsonl \
--input-dir=tutorials/nemo-retriever-synthetic-data-generation/outputs/sample_data_rawdoc/jsonl \
--pipeline-config=tutorials/nemo-retriever-synthetic-data-generation/config/config.yaml \
--input-format=rawdoc \
--input-format=jsonl \
--pipeline-type=filter \
--output-dir=tutorials/nemo-retriever-synthetic-data-generation/outputs/sample_data_rawdoc \
--save-format=jsonl
```

For more information about the expected structure of the data, see the [quick start notebook](notebooks/quickstart.ipynb).
@@ -55,7 +55,7 @@ generator_system_prompt: |
  Do I need a new EIN since I am hiring employees for my LLC?
user_prompt_template: |
  Generate {num_questions} questions and corresponding answers based on Input Document.
  Generate {n_openlines} questions and corresponding answers based on Input Document.
  Input Document:
  {document}
@@ -72,7 +72,7 @@ percentile: 70 # Percentile for threshold calculation (float) [0, 100]
batch_size: 1

#Answerability filter (LLM-as-judge)
answerability_filter: "meta/llama3-70b-instruct"
answerability_filter: "meta/llama-3.1-70b-instruct"
num_criteria: 4 # Number of criteria to parse from the response. It must be aligned with the prompt template
answerability_system_prompt: |
You are an evaluator who is rating questions to given context passages based on the given criteria. Assess the given question for clarity and answerability given enough domain knowledge, consider the following evaluation criterion:
@@ -72,7 +72,7 @@ percentile: 70 # Percentile for threshold calculation (float) [0, 100]
batch_size: 1

#Answerability filter (LLM-as-judge)
answerability_filter: "meta/llama3-70b-instruct"
answerability_filter: "meta/llama-3.1-70b-instruct"
num_criteria: 4 # Number of criteria to parse from the response. It must be aligned with the prompt template
answerability_system_prompt: |
You are an evaluator who is rating questions to given context passages based on the given criteria. Assess the given question for clarity and answerability given enough domain knowledge, consider the following evaluation criterion:
@@ -63,7 +63,7 @@ percentile: 70 # Percentile for threshold calculation (float) [0, 100]
batch_size: 1

#Answerability filter (LLM-as-judge)
answerability_filter: "meta/llama3-70b-instruct"
answerability_filter: "meta/llama-3.1-70b-instruct"
num_criteria: 4 # Number of criteria to parse from the response. It must be aligned with the prompt template
answerability_system_prompt: |
You are an evaluator who is rating questions to given context passages based on the given criteria. Assess the given question for clarity and answerability given enough domain knowledge, consider the following evaluation criterion: