Skip to content

Commit

Permalink
Removing pymupdf
Browse files (browse the repository at this point in the history)
  • Loading branch information
jakep-allenai committed Jan 30, 2025
1 parent ddeea92 commit 2ab7cb2
Show file tree
Hide file tree
Showing 4 changed files with 3 additions and 25 deletions.
1 change: 0 additions & 1 deletion gantry-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ torchvision
cached-path
smart_open
pypdf
pymupdf
pypdfium2
lingua-language-detector
Pillow
Expand Down
12 changes: 1 addition & 11 deletions olmocr/prompts/anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

# pdftotext
# pdfium
# pymupdf
# pypdf

import random
Expand All @@ -16,7 +15,6 @@
from typing import List, Literal

import ftfy
import pymupdf
import pypdfium2 as pdfium
from pypdf import PdfReader
from pypdf.generic import RectangleObject
Expand All @@ -25,7 +23,7 @@


def get_anchor_text(
local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pymupdf", "pypdf", "topcoherency", "pdfreport"], target_length: int = 4000
local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pypdf", "topcoherency", "pdfreport"], target_length: int = 4000
) -> str:
assert page > 0, "Pages are 1-indexed in pdf-land"

Expand All @@ -35,12 +33,9 @@ def get_anchor_text(
return _get_pdfium(local_pdf_path, page)
elif pdf_engine == "pypdf":
return _get_pypdf_raw(local_pdf_path, page)
elif pdf_engine == "pymupdf":
return _get_pymupdf(local_pdf_path, page)
elif pdf_engine == "topcoherency":
options = {
"pdftotext": _get_pdftotext(local_pdf_path, page),
"pymupdf": _get_pymupdf(local_pdf_path, page),
"pdfium": _get_pdfium(local_pdf_path, page),
"pypdf_raw": _get_pypdf_raw(local_pdf_path, page),
}
Expand Down Expand Up @@ -70,11 +65,6 @@ def _get_pdftotext(local_pdf_path: str, page: int) -> str:
return pdftotext_result.stdout.decode("utf-8")


def _get_pymupdf(local_pdf_path: str, page: int) -> str:
pm_doc = pymupdf.open(local_pdf_path)
return pm_doc[page - 1].get_text()


def _get_pypdf_raw(local_pdf_path: str, page: int) -> str:
reader = PdfReader(local_pdf_path)
pypage = reader.pages[page - 1]
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ dependencies = [
"cached-path",
"smart_open",
"pypdf>=5.2.0",
"pymupdf",
"pypdfium2",
"cryptography",
"lingua-language-detector",
Expand Down
14 changes: 2 additions & 12 deletions tests/test_coherency.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,27 +43,17 @@ def testTwoColumnMisparse(self):
page=2,
pdf_engine="pdftotext",
)
pymupdf_text = get_anchor_text(
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
page=2,
pdf_engine="pymupdf",
)
pdfium_text = get_anchor_text(
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
page=2,
pdf_engine="pdfium",
)

# pdftotext_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pdftotext")
# pymupdf_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pymupdf")

print("pdftotext_text", pdftotext_score := get_document_coherency(pdftotext_text))
print("pymupdf_text", pymupdf_score := get_document_coherency(pymupdf_text))
print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text))

self.assertLess(pdftotext_score, pymupdf_score)
self.assertLess(pdfium_score, pymupdf_score)
self.assertLess(pdfium_score, pdftotext_score)

anchor_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), 2, pdf_engine="topcoherency")

self.assertEqual(anchor_text, pymupdf_text)
self.assertEqual(anchor_text, pdfium_text)

0 comments on commit 2ab7cb2

Please sign in to comment.