Skip to content

Commit

Permalink
Fixing the refiner input prompt to something simpler that doesn't dep…
Browse files Browse the repository at this point in the history
…end on the training data. Fixing beaker job workspace and bumping priority to high.
  • Loading branch information
jakep-allenai committed Sep 27, 2024
1 parent 22b765e commit decfd7f
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 7 deletions.
16 changes: 15 additions & 1 deletion pdelfin/train/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,24 @@ def extract_openai_batch_query(query: Dict[str, Any]) -> Dict[str, Any]:
except IndexError:
input_prompt_image_base64 = ""

# At this point, the input_prompt_text is the raw text that was passed to the OpenAI model
# to generate our silver data. But, we want to have a simplified prompt for this fine-tune,
# so we're going to extract out just the raw extracted prompt text
pattern = r"RAW_TEXT_START\s*\n(.*?)\nRAW_TEXT_END"

# Use re.DOTALL to ensure that the dot matches newline characters
match = re.search(pattern, input_prompt_text, re.DOTALL)

if match:
raw_page_text = match.group(1).strip()
else:
raw_page_text = ""

return {
"custom_id": custom_id,
"input_prompt_text": input_prompt_text,
"input_prompt_image_base64": input_prompt_image_base64,
"raw_page_text": raw_page_text,
}


Expand Down Expand Up @@ -223,7 +237,7 @@ def pick_image_sizes(x):
final_dataset = final_dataset.filter(pick_image_sizes)

# Limit the size of the input text not to explode the context size
final_dataset = final_dataset.filter(lambda x: len(x["input_prompt_text"]) < 4000)
final_dataset = final_dataset.filter(lambda x: len(x["raw_page_text"]) < 4000)

return final_dataset

Expand Down
12 changes: 11 additions & 1 deletion pdelfin/train/dataprep.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,16 @@ def filter_by_max_seq_len(example, processor, max_prompt_len: int=2000, max_resp
return True


# This is a base prompt that will be used for training and running the fine tuned model
# It's simplified from the prompt which was used to generate the silver data, and can change from dataset to dataset
def _build_finetuning_prompt(base_text: str) -> str:
return (
f"Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. "
f"Just return the plain text representation of this document as if you were reading it naturally.\n"
f"Do not hallucinate.\n"
f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
)


def prepare_data_for_qwen2_training(example, processor, add_batch_dim=False):
# Prepare messages
Expand All @@ -26,7 +36,7 @@ def prepare_data_for_qwen2_training(example, processor, add_batch_dim=False):
"type": "image",
"image": example["input_prompt_image_base64"] # Placeholder
},
{"type": "text", "text": example["input_prompt_text"]},
{"type": "text", "text": _build_finetuning_prompt(example["raw_page_text"])},
],
}
]
Expand Down
4 changes: 2 additions & 2 deletions scripts/qwen2vl-7b-gantry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ gantry run \
--task-name "${run_name}"\
--allow-dirty \
--host-networking \
--workspace ai2/oe-data-pdf \
--workspace ai2/oe-data-model-based-cleanup \
--beaker-image 'jakep/jakep-pdf-finetunev1.1' \
--venv 'base' \
--pip gantry-requirements.txt \
--priority normal \
--priority high \
--gpus 8 \
--preemptible \
--cluster "ai2/${CLUSTER}*" \
Expand Down
2 changes: 1 addition & 1 deletion tests/test_dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def testPlotSequenceLengthHistogram(self):
fig.write_image("sequence_lengths_histogram.png")

def testExtractBatch(self):
query_data = load_jsonl_from_s3("s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl", first_n_files=3)
query_data = load_jsonl_from_s3("s3://ai2-oe-data/jakep/openai_batch_data_v2_mini/*.jsonl", first_n_files=3)
query_data = query_data["train"]
query_data = query_data.map(extract_openai_batch_query, remove_columns=query_data.column_names)

Expand Down
9 changes: 7 additions & 2 deletions tests/test_dataprep.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
)

from pdelfin.train.dataprep import (
prepare_data_for_qwen2_training
prepare_data_for_qwen2_training, _build_finetuning_prompt
)


Expand All @@ -32,7 +32,7 @@ def testTokenizationMatches(self):
"type": "image",
"image": example["input_prompt_image_base64"] # Placeholder
},
{"type": "text", "text": example["input_prompt_text"]},
{"type": "text", "text": _build_finetuning_prompt(example["raw_page_text"])},
],
},

Expand All @@ -47,6 +47,11 @@ def testTokenizationMatches(self):
# Decode image from base64
main_image = Image.open(BytesIO(base64.b64decode(example["input_prompt_image_base64"])))

width, height = main_image.size
assert 1800 <= max(width, height) <= 2200, f"Image size {width}x{height} invalid"
main_image = main_image.resize((width // 2, height // 2), Image.LANCZOS)


# Process inputs using processor
inference_inputs = processor(
text=[text],
Expand Down

0 comments on commit decfd7f

Please sign in to comment.