Skip to content

Commit

Permalink
450tok/sec/core with smollm that appears to work well
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Sep 17, 2024
1 parent 2f71cb9 commit af2126d
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 4 deletions.
2 changes: 1 addition & 1 deletion pdelfin/filter/coherency.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
@lru_cache()
def load_coherency_model(model_name: str = "HuggingFaceTB/SmolLM-135M"):
    """Load the tokenizer and causal language model used for coherency scoring.

    The result is memoized by ``lru_cache``, so repeated calls with the same
    ``model_name`` reuse the already-loaded objects instead of loading again.

    Args:
        model_name: Model identifier passed to ``from_pretrained`` for both
            the tokenizer and the model.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been put in eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load the weights exactly once. A preceding plain
    # ``from_pretrained(model_name)`` call would be thrown away immediately
    # by this assignment, costing a second full model load for nothing.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    model.eval()  # Set the model to evaluation mode

    return tokenizer, model
Expand Down
Binary file added tests/gnarly_pdfs/bws_book.pdf
Binary file not shown.
Binary file added tests/gnarly_pdfs/ti89_guidebook.pdf
Binary file not shown.
28 changes: 25 additions & 3 deletions tests/test_coherency.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
import time

import unittest
import multiprocessing

from pdelfin.filter.coherency import get_document_coherency
from pdelfin.extract_text import get_document_text, get_page_text
Expand All @@ -16,6 +18,24 @@ def testBadOcr1(self):
print("Bad1", get_document_coherency(ocr1_text))
print("Bad2", get_document_coherency(ocr2_text))

def testHugeBookCoherencySpeed(self):
    """Report coherency-scoring throughput on the TI-89 guidebook PDF.

    Extracts the document text, scores the first 40,000 characters, and
    prints the score plus characters per second divided by the CPU count.
    Nothing is asserted; the figures are only printed.
    """
    pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ti89_guidebook.pdf")
    full_text = get_document_text(pdf_path)
    print(f"ti89 book length: {len(full_text):,}")

    # Untimed warm-up on a short prefix; the result is not needed.
    # NOTE(review): presumably this keeps one-time model loading out of the
    # timed section below -- confirm against get_document_coherency.
    get_document_coherency(full_text[:1000])

    sample = full_text[:40000]

    started = time.perf_counter()
    score = get_document_coherency(sample)
    elapsed = time.perf_counter() - started

    # Normalize raw throughput by the number of CPUs reported by the OS.
    char_per_sec = len(sample) / elapsed / multiprocessing.cpu_count()

    print(f"ti89 book score {score:.2f}")
    print(f"{char_per_sec:.2f} chars per second per core")

def testTwoColumnMisparse(self):
pdftotext_text = get_page_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), page_num=2, pdf_engine="pdftotext")
pymupdf_text = get_page_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), page_num=2, pdf_engine="pymupdf")
Expand All @@ -24,8 +44,10 @@ def testTwoColumnMisparse(self):
# pdftotext_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pdftotext")
# pymupdf_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pymupdf")

print("pdftotext_text", get_document_coherency(pdftotext_text))
print("pymupdf_text", get_document_coherency(pymupdf_text))
print("pdfium_text", get_document_coherency(pdfium_text))
print("pdftotext_text", pdftotext_score := get_document_coherency(pdftotext_text))
print("pymupdf_text", pymupdf_score := get_document_coherency(pymupdf_text))
print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text))

self.assertLess(pdftotext_score, pymupdf_score)


0 comments on commit af2126d

Please sign in to comment.