Can spit out anchor text for a gpt engine using pypdf, showing locations of images and text
allenai · Oct 1, 2024 · 6ef8226 · 6ef8226
1 parent e42cecf
commit 6ef8226
Show file tree

Hide file tree

Showing 3 changed files with 117 additions and 18 deletions.
diff --git a/pdelfin/prompts/anchor.py b/pdelfin/prompts/anchor.py
@@ -9,16 +9,22 @@
 
 # coherency score best of these three
 import subprocess
-from typing import Literal
+import sys
+import json
+from dataclasses import dataclass
+from typing import Literal, List
 
-from pypdf import PdfReader
 import pypdfium2 as pdfium
 import pymupdf
 
 from pdelfin.filter.coherency import get_document_coherency
 
+from pypdf import PdfReader
+from pypdf.generic import RectangleObject
+from pdelfin.prompts._adv_anchor import mult
+
 
-def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pymupdf", "pypdf", "topcoherency"]) -> str:
+def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pymupdf", "pypdf", "topcoherency", "pdfreport"]) -> str:
     assert page > 0, "Pages are 1-indexed in pdf-land"
 
     if pdf_engine == "pdftotext":
@@ -41,6 +47,10 @@ def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftote
 
         # return option with the best (highest) score (higher is more likely, as these are logprobs)
         return options[scores.index(max(scores))]
+    elif pdf_engine == "pdfreport":
+        return _linearize_pdf_report(_pdf_report(local_pdf_path, page))
+    else:
+        raise NotImplementedError("Unknown engine")
 
 
 def _get_pdftotext(local_pdf_path: str, page: int) -> str:
@@ -66,5 +76,89 @@ def _get_pypdf_raw(local_pdf_path: str, page: int) -> str:
 def _get_pdfium(local_pdf_path: str, page: int) -> str:
     pdf = pdfium.PdfDocument(local_pdf_path)
     textpage = pdf[page - 1].get_textpage()
-    return textpage.get_text_range()
+    return textpage.get_text_bounded()
+
+def _transform_point(x, y, m):
+    x_new = m[0]*x + m[2]*y + m[4]
+    y_new = m[1]*x + m[3]*y + m[5]
+    return x_new, y_new
+
+@dataclass
+class Element:
+    pass
+
+@dataclass
+class BoundingBox:
+    x0: float
+    y0: float
+    x1: float
+    y1: float
+
+    @staticmethod
+    def from_rectangle(rect: RectangleObject) -> "BoundingBox":
+        return BoundingBox(rect[0], rect[1], rect[2], rect[3])
+
+
+@dataclass
+class TextElement(Element):
+    text: str
+    x: float
+    y: float
+
+@dataclass
+class ImageElement(Element):
+    name: str
+    bbox: BoundingBox
+
+@dataclass
+class PageReport:
+    mediabox: BoundingBox
+    elements: List[Element]
+
+def _pdf_report(local_pdf_path: str, page: int) -> PageReport:
+    reader = PdfReader(local_pdf_path)
+    page = reader.pages[page - 1]
+    resources = page.get("/Resources", {})
+    xobjects = resources.get("/XObject", {})
+    elements = []
+
+    def visitor_body(text, cm, tm, font_dict, font_size):
+        txt2user = mult(tm, cm)
+        elements.append(TextElement(text, txt2user[4], txt2user[5]))
+
+    def visitor_op(op, args, cm, tm):
+        if op == b"Do":
+            xobject_name = args[0]
+            xobject = xobjects.get(xobject_name)
+            if xobject and xobject["/Subtype"] == "/Image":
+                # Compute image bbox
+                # The image is placed according to the CTM
+                width = xobject.get("/Width")
+                height = xobject.get("/Height")
+                x0, y0 = _transform_point(0, 0, cm)
+                x1, y1 = _transform_point(1, 1, cm)
+                elements.append(ImageElement(xobject_name, BoundingBox(min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1))))
+
+    page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op)
+
+    return PageReport(
+        mediabox=BoundingBox.from_rectangle(page.mediabox),
+        elements=elements,
+    )
+
+
+def _linearize_pdf_report(report: PageReport) -> str:
+    result = ""
+
+    result += f"Page dimensions: {report.mediabox.x1:.1f}x{report.mediabox.y1:.1f}\n"
+
+    for index, element in enumerate(report.elements):
+        if isinstance(element, ImageElement):
+            result += f"[Image {element.bbox.x0:.0f}x{element.bbox.y0:.0f} to {element.bbox.x1:.0f}x{element.bbox.y1:.0f}]"
+        if isinstance(element, TextElement):
+            if len(element.text.strip()) == 0:
+                continue
+
+            result += f"[{element.x:.0f}x{element.y:.0f}]{element.text}"
 
+    return result
diff --git a/tests/gnarly_pdfs/edgar.pdf b/tests/gnarly_pdfs/edgar.pdf
diff --git a/tests/test_anchor.py b/tests/test_anchor.py
@@ -4,32 +4,37 @@
 
 from pypdf import PdfReader
 
+from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text
+
 class AnchorTest(unittest.TestCase):
     def testExtractText(self):
-        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf")
+        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
         reader = PdfReader(local_pdf_path)
-        page = reader.pages[1]
+        page = reader.pages[0]
 
         def visitor_body(text, cm, tm, font_dict, font_size):
-            print(repr(text))
+            print(repr(text), cm, tm, font_size)
+
+        def visitor_op(op, args, cm, tm):
+            #print(op, args, cm, tm)
+            pass
 
-        page.extract_text(visitor_text=visitor_body)
+        page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op)
 
     def testAnchorBase(self):
         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf")
 
-        from pdelfin.prompts._adv_anchor import extract_page
-        reader = PdfReader(local_pdf_path)
-        pypage = reader.pages[1]
+        report = _pdf_report(local_pdf_path, 2)
 
-        def visitor_body(text, cm, tm, font_dict, font_size):
-            print(repr(text))
+        print(report)
+
+        print(get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport"))
 
-        extract_page(pypage, reader, visitor_text=visitor_body)
+    def testAnchorImage(self):
+        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
 
-        # report = parse_pdf(local_pdf_path)
-        # print(json.dumps(report, indent=1))
+        report = _pdf_report(local_pdf_path, 2)
 
-        # report = _pdf_report(local_pdf_path, 1)
+        print(report)
 
-        # print(json.dumps(report, indent=1))
+        print(get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport"))