Use SmolLM-135M as the coherency model (seems a lot better than distilgpt2 and is able to pass some tests); add a pdfium text-extraction engine

allenai · Sep 17, 2024 · 2f71cb9
1 parent 57e80aa
commit 2f71cb9
Show file tree

Hide file tree

Showing 4 changed files with 25 additions and 2 deletions.
diff --git a/pdelfin/extract_text.py b/pdelfin/extract_text.py
@@ -1,6 +1,6 @@
 import subprocess
 import pymupdf
-
+import pypdfium2 as pdfium
 from typing import Literal
 
 
@@ -26,6 +26,13 @@ def get_page_text(local_pdf_path: str, page_num: int, pdf_engine: Literal["pdfto
     elif pdf_engine == "pymupdf":
         pm_doc = pymupdf.open(local_pdf_path)
         return pm_doc[page_num - 1].get_text()
+    elif pdf_engine == "pdfium":
+        pdf = pdfium.PdfDocument(local_pdf_path)
+        page = pdf[page_num - 1]
+        textpage = page.get_textpage()
+
+        # Extract text from the whole page
+        return textpage.get_text_range()
     else:
         raise NotImplementedError()
 
@@ -53,6 +60,16 @@ def get_document_text(local_pdf_path: str, pdf_engine: Literal["pdftotext", "pym
             result += page.get_text()
             result += "\n"
 
+        return result
+    elif pdf_engine == "pdfium":
+        pdf = pdfium.PdfDocument(local_pdf_path)
+        result = ""
+
+        for page in pdf:
+            textpage = page.get_textpage()
+            result += textpage.get_text_range()
+            result += "\n"
+
         return result
     else:
         raise NotImplementedError()
diff --git a/pdelfin/filter/coherency.py b/pdelfin/filter/coherency.py
@@ -3,7 +3,7 @@
 import torch
 
 @lru_cache()
-def load_coherency_model(model_name: str = "distilgpt2"):
+def load_coherency_model(model_name: str = "HuggingFaceTB/SmolLM-135M"):
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(model_name)
     model.eval()  # Set the model to evaluation mode

diff --git a/pyproject.toml b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
   "cached-path",
   "pypdf",
   "pymupdf",
+  "pypdfium2",
   "lingua-language-detector",
   "https://github.com/kpu/kenlm/archive/master.zip",
 ]

diff --git a/tests/test_coherency.py b/tests/test_coherency.py
@@ -19,8 +19,13 @@ def testBadOcr1(self):
     def testTwoColumnMisparse(self):
         pdftotext_text = get_page_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), page_num=2, pdf_engine="pdftotext")
         pymupdf_text = get_page_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), page_num=2, pdf_engine="pymupdf")
+        pdfium_text = get_page_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), page_num=2, pdf_engine="pdfium")
+
+        # pdftotext_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pdftotext")
+        # pymupdf_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pymupdf")
 
         print("pdftotext_text", get_document_coherency(pdftotext_text))
         print("pymupdf_text", get_document_coherency(pymupdf_text))
+        print("pdfium_text", get_document_coherency(pdfium_text))