Skip to content

Commit

Permalink
Using SmolLM, seems a lot better and is able to pass some tests
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Sep 17, 2024
1 parent 57e80aa commit 2f71cb9
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 2 deletions.
19 changes: 18 additions & 1 deletion pdelfin/extract_text.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import subprocess
import pymupdf

import pypdfium2 as pdfium
from typing import Literal


Expand All @@ -26,6 +26,13 @@ def get_page_text(local_pdf_path: str, page_num: int, pdf_engine: Literal["pdfto
elif pdf_engine == "pymupdf":
pm_doc = pymupdf.open(local_pdf_path)
return pm_doc[page_num - 1].get_text()
elif pdf_engine == "pdfium":
pdf = pdfium.PdfDocument(local_pdf_path)
page = pdf[page_num - 1]
textpage = page.get_textpage()

# Extract text from the whole page
return textpage.get_text_range()
else:
raise NotImplementedError()

Expand Down Expand Up @@ -53,6 +60,16 @@ def get_document_text(local_pdf_path: str, pdf_engine: Literal["pdftotext", "pym
result += page.get_text()
result += "\n"

return result
elif pdf_engine == "pdfium":
pdf = pdfium.PdfDocument(local_pdf_path)
result = ""

for page in pdf:
textpage = page.get_textpage()
result += textpage.get_text_range()
result += "\n"

return result
else:
raise NotImplementedError()
2 changes: 1 addition & 1 deletion pdelfin/filter/coherency.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import torch

@lru_cache()
def load_coherency_model(model_name: str = "distilgpt2"):
def load_coherency_model(model_name: str = "HuggingFaceTB/SmolLM-135M"):
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval() # Set the model to evaluation mode
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dependencies = [
"cached-path",
"pypdf",
"pymupdf",
"pypdfium2",
"lingua-language-detector",
"https://github.com/kpu/kenlm/archive/master.zip",
]
Expand Down
5 changes: 5 additions & 0 deletions tests/test_coherency.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,13 @@ def testBadOcr1(self):
def testTwoColumnMisparse(self):
    """Print the coherency score of page 2 of the two-column sample PDF, once per extraction engine."""
    sample_pdf = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf")

    # Run every extraction before any scoring, in the same engine order as before.
    page_texts = {
        engine: get_page_text(sample_pdf, page_num=2, pdf_engine=engine)
        for engine in ("pdftotext", "pymupdf", "pdfium")
    }

    # NOTE: get_document_text(sample_pdf, pdf_engine=...) can be substituted above
    # to score the whole document instead of a single page.
    for engine, text in page_texts.items():
        print(f"{engine}_text", get_document_coherency(text))


0 comments on commit 2f71cb9

Please sign in to comment.