A few notes, starting to test dataloader with new structured response format
allenai · Oct 2, 2024 · b340ae5 · b340ae5
1 parent 8315162
commit b340ae5
Show file tree

Hide file tree

Showing 5 changed files with 43 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -15,6 +15,6 @@ Things supported:
 You will probably need to install some fonts on your computer so that any pdfs you render come out looking nice.
 
 ```
-sudo apt-get install ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts
+sudo apt-get install ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
 
 ```
diff --git a/pdelfin/silver_data/buildsilver.py b/pdelfin/silver_data/buildsilver.py
@@ -71,7 +71,14 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
     # )
     # print(response)
 
-    # Construct OpenAI Batch API request format
+    # Construct OpenAI Batch API request format
+    # There are a few tricks to know when doing data processing with OpenAI's APIs.
+    # First, use the batch query system: it's half the price with exactly the same performance.
+    # Second, use structured outputs. If your application is not an actual chatbot, use structured outputs!
+    # Even if the last 10 queries you ran with the regular chat API returned exactly what you wanted without extra "LLM fluff text", that doesn't mean this will hold across thousands of queries.
+    # Structured outputs also let you cheat: the order in which fields appear in your schema is the order in which the model will answer them, so you can have it answer some "preparatory" or "chain of thought" style questions first before going into the meat of your response, which is going to give better answers.
+    # Check your prompt for typos; it makes a performance difference!
+    # Ask for logprobs; it's not any more expensive, and you can use them later to help identify problematic responses.
     return {
         "custom_id": f"{pretty_pdf_path}-{page}",
         "method": "POST",

diff --git a/pdelfin/train/utils.py b/pdelfin/train/utils.py
@@ -149,4 +149,6 @@ def get_local_dir(output_dir: str):
             yield output_dir
         else:
             yield tmp_dir
-            copy_dir(tmp_dir, output_dir)
+            copy_dir(tmp_dir, output_dir)
+
+
diff --git a/tests/gnarly_pdfs/small_page_size.pdf b/tests/gnarly_pdfs/small_page_size.pdf
diff --git a/tests/test_anchor.py b/tests/test_anchor.py
@@ -37,4 +37,34 @@ def testAnchorImage(self):
 
         print(report)
 
-        print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))
+        print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))
+
+    def testSmallPage(self):
+        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")
+
+        report = _pdf_report(local_pdf_path, 1)
+
+        print(report)
+
+        print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))
+
+class BuildSilverTest(unittest.TestCase):
+    def testSmallPage(self):
+        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")
+
+        from pdelfin.silver_data.buildsilver import build_page_query
+
+        result = build_page_query(local_pdf_path, "s3://test.pdf", 1)
+
+        from pdelfin.train.dataloader import get_png_dimensions_from_base64
+
+        base64data = result["body"]["messages"][0]["content"][1]["image_url"]["url"]
+
+        if base64data.startswith("data:image/png;base64,"):
+            base64data = base64data[22:]
+
+        width, height = get_png_dimensions_from_base64(base64data)
+
+        print(width, height)
+
+        assert max(width, height) == 2048