Skip to content

Commit

Permalink
Can spit out anchor text for a gpt engine using pypdf, showing locati…
Browse files Browse the repository at this point in the history
…ons of images and text
  • Loading branch information
jakep-allenai committed Oct 1, 2024
1 parent e42cecf commit 6ef8226
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 18 deletions.
102 changes: 98 additions & 4 deletions pdelfin/prompts/anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,22 @@

# coherency score best of these three
import subprocess
from typing import Literal
import sys
import json
from dataclasses import dataclass
from typing import Literal, List

from pypdf import PdfReader
import pypdfium2 as pdfium
import pymupdf

from pdelfin.filter.coherency import get_document_coherency

from pypdf import PdfReader
from pypdf.generic import RectangleObject
from pdelfin.prompts._adv_anchor import mult


def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pymupdf", "pypdf", "topcoherency"]) -> str:
def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pymupdf", "pypdf", "topcoherency", "pdfreport"]) -> str:
assert page > 0, "Pages are 1-indexed in pdf-land"

if pdf_engine == "pdftotext":
Expand All @@ -41,6 +47,10 @@ def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftote

# return option with the best (highest) score (higher is more likely, as these are logprobs)
return options[scores.index(max(scores))]
elif pdf_engine == "pdfreport":
return _linearize_pdf_report(_pdf_report(local_pdf_path, page))
else:
raise NotImplementedError("Unknown engine")


def _get_pdftotext(local_pdf_path: str, page: int) -> str:
Expand All @@ -66,5 +76,89 @@ def _get_pypdf_raw(local_pdf_path: str, page: int) -> str:
def _get_pdfium(local_pdf_path: str, page: int) -> str:
pdf = pdfium.PdfDocument(local_pdf_path)
textpage = pdf[page - 1].get_textpage()
return textpage.get_text_range()
return textpage.get_text_bounded()

def _transform_point(x, y, m):
x_new = m[0]*x + m[2]*y + m[4]
y_new = m[1]*x + m[3]*y + m[5]
return x_new, y_new

@dataclass
class Element:
    """Common base type for the items collected into a page report."""

@dataclass
class BoundingBox:
    """Rectangle described by four coordinates.

    Elsewhere in this file boxes are built as ``(min x, min y, max x, max y)``,
    so ``(x0, y0)`` and ``(x1, y1)`` are opposite corners.
    """

    x0: float
    y0: float
    x1: float
    y1: float

    @classmethod
    def from_rectangle(cls, rect: "RectangleObject") -> "BoundingBox":
        """Build a box from the first four entries of ``rect``.

        Args:
            rect: Any indexable of at least four numbers, such as a pypdf
                ``RectangleObject``.

        Returns:
            A new instance of ``cls`` (so subclasses get their own type) whose
            coordinates are plain ``float`` values.
        """
        # Coerce so the stored values match the declared field type even when
        # the source holds float subclasses or ints.
        return cls(float(rect[0]), float(rect[1]), float(rect[2]), float(rect[3]))


@dataclass
class TextElement(Element):
    """A piece of extracted text together with the point it was reported at."""

    # The text string handed to the text visitor.
    text: str
    # Position of the text; `_pdf_report` fills these from entries 4 and 5
    # of the combined text/CTM matrix.
    x: float
    y: float

@dataclass
class ImageElement(Element):
    """An image drawn on the page, identified by name and bounding box."""

    # XObject name taken from the first argument of the `Do` operator.
    name: str
    # Extent of the image as computed by `_pdf_report`.
    bbox: BoundingBox

@dataclass
class PageReport:
    """Everything `_pdf_report` gathers about a single page."""

    # The page's media box, converted to a BoundingBox.
    mediabox: BoundingBox
    # Text and image elements in the order they were collected.
    elements: List[Element]

def _pdf_report(local_pdf_path: str, page: int) -> PageReport:
    """Collect the text pieces and images pypdf reports for one page.

    Args:
        local_pdf_path: Path of the PDF file to open.
        page: 1-indexed page number (index ``page - 1`` is read).

    Returns:
        A ``PageReport`` with the page's media box and the text/image
        elements in the order the pypdf visitors delivered them.
    """
    reader = PdfReader(local_pdf_path)
    # Keep the integer parameter intact; use a separate name for the page object.
    pdf_page = reader.pages[page - 1]
    resources = pdf_page.get("/Resources", {})
    xobjects = resources.get("/XObject", {})
    elements = []

    def visitor_body(text, cm, tm, font_dict, font_size):
        # NOTE(review): `mult` is a project helper; presumably it multiplies
        # the text matrix by the CTM so entries 4 and 5 are the text position
        # in user space - confirm against pdelfin.prompts._adv_anchor.
        txt2user = mult(tm, cm)
        elements.append(TextElement(text, txt2user[4], txt2user[5]))

    def visitor_op(op, args, cm, tm):
        # Only the "Do" operator (paint an XObject) is of interest here.
        if op != b"Do":
            return

        xobject_name = args[0]
        xobject = xobjects.get(xobject_name)
        if not xobject or xobject["/Subtype"] != "/Image":
            return

        # The image is placed by mapping the unit square through the CTM.
        # With rotation or skew the extreme x/y values can lie on any of the
        # four corners, so all of them must be transformed, not just (0, 0)
        # and (1, 1).
        corners = [_transform_point(u, v, cm) for u, v in ((0, 0), (1, 0), (0, 1), (1, 1))]
        xs = [cx for cx, _ in corners]
        ys = [cy for _, cy in corners]
        elements.append(ImageElement(xobject_name, BoundingBox(min(xs), min(ys), max(xs), max(ys))))

    pdf_page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op)

    return PageReport(
        mediabox=BoundingBox.from_rectangle(pdf_page.mediabox),
        elements=elements,
    )


def _linearize_pdf_report(report: PageReport) -> str:
result = ""

result += f"Page dimensions: {report.mediabox.x1:.1f}x{report.mediabox.y1:.1f}\n"

for index, element in enumerate(report.elements):
if isinstance(element, ImageElement):
result += f"[Image {element.bbox.x0:.0f}x{element.bbox.y0:.0f} to {element.bbox.x1:.0f}x{element.bbox.y1:.0f}]"
if isinstance(element, TextElement):
if len(element.text.strip()) == 0:
continue

result += f"[{element.x:.0f}x{element.y:.0f}]{element.text}"

return result
Binary file added tests/gnarly_pdfs/edgar.pdf
Binary file not shown.
33 changes: 19 additions & 14 deletions tests/test_anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,37 @@

from pypdf import PdfReader

from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text

class AnchorTest(unittest.TestCase):
def testExtractText(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf")
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
reader = PdfReader(local_pdf_path)
page = reader.pages[1]
page = reader.pages[0]

def visitor_body(text, cm, tm, font_dict, font_size):
print(repr(text))
print(repr(text), cm, tm, font_size)

def visitor_op(op, args, cm, tm):
#print(op, args, cm, tm)
pass

page.extract_text(visitor_text=visitor_body)
page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op)

def testAnchorBase(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf")

from pdelfin.prompts._adv_anchor import extract_page
reader = PdfReader(local_pdf_path)
pypage = reader.pages[1]
report = _pdf_report(local_pdf_path, 2)

def visitor_body(text, cm, tm, font_dict, font_size):
print(repr(text))
print(report)

print(get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport"))

extract_page(pypage, reader, visitor_text=visitor_body)
def testAnchorImage(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")

# report = parse_pdf(local_pdf_path)
# print(json.dumps(report, indent=1))
report = _pdf_report(local_pdf_path, 2)

# report = _pdf_report(local_pdf_path, 1)
print(report)

# print(json.dumps(report, indent=1))
print(get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport"))

0 comments on commit 6ef8226

Please sign in to comment.