450 tok/sec/core with SmolLM, which appears to work well

allenai · Sep 17, 2024 · af2126d · af2126d
1 parent 2f71cb9
commit af2126d
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 4 deletions.
diff --git a/pdelfin/filter/coherency.py b/pdelfin/filter/coherency.py
@@ -5,7 +5,7 @@
 @lru_cache()
 def load_coherency_model(model_name: str = "HuggingFaceTB/SmolLM-135M"):
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name)
+    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16)  # NOTE(review): needs `torch` imported in this module; device_map presumably requires `accelerate` — confirm
     model.eval()  # Set the model to evaluation mode
 
     return tokenizer, model

diff --git a/tests/gnarly_pdfs/bws_book.pdf b/tests/gnarly_pdfs/bws_book.pdf
diff --git a/tests/gnarly_pdfs/ti89_guidebook.pdf b/tests/gnarly_pdfs/ti89_guidebook.pdf
diff --git a/tests/test_coherency.py b/tests/test_coherency.py
@@ -1,6 +1,8 @@
 import os
+import time
 
 import unittest
+import multiprocessing
 
 from pdelfin.filter.coherency import get_document_coherency
 from pdelfin.extract_text import get_document_text, get_page_text
@@ -16,6 +18,24 @@ def testBadOcr1(self):
         print("Bad1", get_document_coherency(ocr1_text))
         print("Bad2", get_document_coherency(ocr2_text))
 
+    def testHugeBookCoherencySpeed(self):
+        """Print the coherency score and chars/sec/core for the first 40,000 chars of ti89_guidebook.pdf (no assertions)."""
+        base_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ti89_guidebook.pdf"))
+        print(f"ti89 book length: {len(base_text):,}")
+
+        # Untimed warm-up call; presumably keeps one-off model loading out of the measurement (confirm).
+        get_document_coherency(base_text[:1000])
+        base_text = base_text[:40000]
+
+        start = time.perf_counter()
+        score = get_document_coherency(base_text)
+        end = time.perf_counter()
+
+        # Normalise throughput by multiprocessing.cpu_count() to report a per-core figure.
+        char_per_sec = len(base_text) / (end - start) / multiprocessing.cpu_count()
+        print(f"ti89 book score {score:.2f}")
+        print(f"{char_per_sec:.2f} chars per second per core")
+
     def testTwoColumnMisparse(self):
         pdftotext_text = get_page_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), page_num=2, pdf_engine="pdftotext")
         pymupdf_text = get_page_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), page_num=2, pdf_engine="pymupdf")
@@ -24,8 +44,10 @@ def testTwoColumnMisparse(self):
         # pdftotext_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pdftotext")
         # pymupdf_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pymupdf")
 
-        print("pdftotext_text", get_document_coherency(pdftotext_text))
-        print("pymupdf_text", get_document_coherency(pymupdf_text))
-        print("pdfium_text", get_document_coherency(pdfium_text))
+        print("pdftotext_text", pdftotext_score := get_document_coherency(pdftotext_text))
+        print("pymupdf_text", pymupdf_score := get_document_coherency(pymupdf_text))
+        print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text))
+
+        self.assertLess(pdftotext_score, pymupdf_score)