Moving a whole bunch of code over, still broken

allenai · Sep 17, 2024 · 01bc0b2 · 01bc0b2
1 parent a534a01
commit 01bc0b2
Show file tree

Hide file tree

Showing 8 changed files with 164 additions and 1 deletion.
diff --git a/pdelfin/extract_text.py b/pdelfin/extract_text.py
@@ -0,0 +1,58 @@
+import subprocess
+import pymupdf
+
+from typing import Literal
+
+
+def get_page_text(local_pdf_path: str, page_num: int, pdf_engine: Literal["pdftotext", "pymupdf", "pdfium"]="pdftotext") -> str:
+    """Extract the plain text of a single page of a PDF.
+
+    Args:
+        local_pdf_path: Path to the PDF file on local disk.
+        page_num: 1-indexed page number to extract.
+        pdf_engine: Backend to use. "pdftotext" shells out to the
+            ``pdftotext`` executable; "pymupdf" uses the PyMuPDF library.
+            "pdfium" is accepted by the signature but not implemented yet.
+
+    Returns:
+        The text of the requested page.
+
+    Raises:
+        subprocess.CalledProcessError: If ``pdftotext`` exits non-zero.
+        subprocess.TimeoutExpired: If ``pdftotext`` runs longer than 60 seconds.
+        NotImplementedError: If ``pdf_engine`` is not a supported backend.
+    """
+    if pdf_engine == "pdftotext":
+        pdftotext_result = subprocess.run(
+            [
+                "pdftotext",
+                "-f",
+                str(page_num),
+                "-l",
+                str(page_num),
+                local_pdf_path,
+                "-",
+            ],
+            timeout=60,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        # Raise explicitly rather than assert: asserts are stripped under
+        # ``python -O``, which would let a failed run return garbage text.
+        pdftotext_result.check_returncode()
+
+        return pdftotext_result.stdout.decode("utf-8")
+    elif pdf_engine == "pymupdf":
+        # Context manager closes the document (and its file handle) on exit.
+        with pymupdf.open(local_pdf_path) as pm_doc:
+            # PyMuPDF pages are 0-indexed, the public API here is 1-indexed.
+            return pm_doc[page_num - 1].get_text()
+    else:
+        raise NotImplementedError()
+
+
+def get_document_text(local_pdf_path: str, pdf_engine: Literal["pdftotext", "pymupdf", "pdfium"]="pdftotext") -> str:
+    """Extract the plain text of every page of a PDF as one string.
+
+    Args:
+        local_pdf_path: Path to the PDF file on local disk.
+        pdf_engine: Backend to use. "pdftotext" shells out to the
+            ``pdftotext`` executable; "pymupdf" uses the PyMuPDF library.
+            "pdfium" is accepted by the signature but not implemented yet.
+
+    Returns:
+        The document text. With "pymupdf", each page's text is followed by
+        a newline; with "pdftotext", the tool's stdout is returned as-is.
+
+    Raises:
+        subprocess.CalledProcessError: If ``pdftotext`` exits non-zero.
+        subprocess.TimeoutExpired: If ``pdftotext`` runs longer than 60 seconds.
+        NotImplementedError: If ``pdf_engine`` is not a supported backend.
+    """
+    if pdf_engine == "pdftotext":
+        pdftotext_result = subprocess.run(
+            [
+                "pdftotext",
+                local_pdf_path,
+                "-",
+            ],
+            timeout=60,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        # Raise explicitly rather than assert: asserts are stripped under
+        # ``python -O``, which would let a failed run return garbage text.
+        pdftotext_result.check_returncode()
+
+        return pdftotext_result.stdout.decode("utf-8")
+    elif pdf_engine == "pymupdf":
+        # Context manager closes the document (and its file handle) on exit;
+        # the join is evaluated while the document is still open.
+        with pymupdf.open(local_pdf_path) as pm_doc:
+            return "".join(page.get_text() + "\n" for page in pm_doc)
+    else:
+        raise NotImplementedError()
diff --git a/pdelfin/filter/__init__.py b/pdelfin/filter/__init__.py
@@ -0,0 +1 @@
+from .filter import PdfFilter
diff --git a/pdelfin/filter/coherency.py b/pdelfin/filter/coherency.py
@@ -0,0 +1,21 @@
+# Uses a pretrained KenLM language model, trained on high-quality DCLM-filtered web data, to help
+# identify PDFs whose content has been very poorly parsed
+import kenlm
+
+from functools import lru_cache
+from cached_path import cached_path
+
+KENLM_S3_PATH = "s3://ai2-oe-data/jakep/kenlm-dclm/5gramtok.bin"
+
+@lru_cache()
+def load_kenlm():
+    """Load the KenLM model referenced by ``KENLM_S3_PATH``.
+
+    The remote path is resolved to a local file through ``cached_path``.
+    ``lru_cache`` memoizes the result, so the model is only loaded once per
+    process and later calls return the same object.
+
+    Returns:
+        The loaded ``kenlm.Model``.
+    """
+    local_path = cached_path(KENLM_S3_PATH)
+    # cached_path returns a pathlib.Path, but kenlm.Model only accepts
+    # str/bytes paths and raises TypeError otherwise.
+    model = kenlm.Model(str(local_path))
+
+    return model
+
+
+def get_document_coherency(text: str) -> float:
+    """Score ``text`` with the shared KenLM model.
+
+    Args:
+        text: The extracted document text to evaluate.
+
+    Returns:
+        The value of ``model.score(text)`` for the model from ``load_kenlm``.
+        NOTE(review): presumably a log-probability where higher means more
+        fluent text -- confirm against the KenLM documentation.
+    """
+    return load_kenlm().score(text)
diff --git a/pdelfin/filter.py → pdelfin/filter/filter.py b/pdelfin/filter.py → pdelfin/filter/filter.py
diff --git a/pdelfin/filter/imagedetect.py b/pdelfin/filter/imagedetect.py
@@ -0,0 +1,54 @@
+import numpy as np
+
+from pypdf import PdfReader
+from pypdf.generic import ContentStream
+from pypdf.generic import NumberObject, NameObject
+
+def process_content(content_stream, resources):
+    """Sum the page area covered by image XObjects painted in a content stream.
+
+    Walks the operators of ``content_stream`` while tracking the current
+    transformation matrix (CTM) through ``q``/``Q``/``cm``. For every ``Do``
+    that paints an image XObject, the area that image occupies is added.
+
+    Args:
+        content_stream: Object exposing ``operations``, an iterable of
+            ``(operands, operator)`` pairs where ``operator`` is bytes.
+        resources: Resource dictionary used to resolve XObject names.
+
+    Returns:
+        Total painted image area in user-space units squared, as a float.
+        Images are counted once per paint (overlaps are not deduplicated),
+        and Form XObjects are not descended into.
+    """
+    total_image_area = 0.0
+    graphics_state_stack = []
+    current_matrix = np.eye(3)
+
+    for operands, operator in content_stream.operations:
+        if operator == b'q':  # Save graphics state
+            graphics_state_stack.append(current_matrix.copy())
+        elif operator == b'Q':  # Restore graphics state
+            # Malformed streams may contain an unbalanced Q; keep the current
+            # matrix instead of crashing on an empty stack.
+            if graphics_state_stack:
+                current_matrix = graphics_state_stack.pop()
+        elif operator == b'cm':  # Concatenate matrix to CTM
+            a, b, c, d, e, f = (float(value) for value in operands)
+            cm_matrix = np.array([[a, b, 0.0], [c, d, 0.0], [e, f, 1.0]])
+            # PDF premultiplies the new matrix: CTM' = cm x CTM.
+            current_matrix = np.matmul(cm_matrix, current_matrix)
+        elif operator == b'Do':  # Paint external object
+            xObjectName = operands[0]
+            if '/XObject' in resources and xObjectName in resources['/XObject']:
+                xObject = resources['/XObject'][xObjectName]
+                if xObject['/Subtype'] == '/Image':
+                    # An image is always painted into the unit square of user
+                    # space, so the area it covers is |det(CTM)| regardless of
+                    # its pixel /Width and /Height. The absolute value keeps
+                    # flipped images (negative scale) from subtracting area.
+                    total_image_area += abs(float(np.linalg.det(current_matrix)))
+    return total_image_area
+
+
+
+
+def pdf_page_image_area(reader: PdfReader, page_num: int) -> float:
+    """Return the painted image area of a page relative to its mediabox area.
+
+    Args:
+        reader: An open ``PdfReader`` for the document.
+        page_num: 1-indexed page number.
+
+    Returns:
+        ``process_content``'s total for the page divided by the mediabox
+        width times height, or NaN when the page has no content stream.
+    """
+    target_page = reader.pages[page_num - 1]
+
+    mediabox = target_page.mediabox
+    page_area = float(mediabox.width) * float(mediabox.height)
+
+    raw_contents = target_page.get_contents()
+    if raw_contents is None:
+        return float("nan")
+
+    painted_area = process_content(
+        ContentStream(raw_contents, reader),
+        target_page['/Resources'],
+    )
+    return painted_area / page_area
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,8 +19,11 @@ authors = [
 ]
 requires-python = ">=3.8"
 dependencies = [
+  "cached-path",
   "pypdf",
+  "pymupdf",
-  "lingua-language-detector"
+  "lingua-language-detector",
+  "kenlm @ https://github.com/kpu/kenlm/archive/master.zip",
 ]
 license = {file = "LICENSE"}
 

diff --git a/tests/test_coherency.py b/tests/test_coherency.py
@@ -0,0 +1,15 @@
+import os
+
+import unittest
+
+from pdelfin.filter.coherency import get_document_coherency
+from pdelfin.extract_text import get_document_text
+
+
+class TestCoherencyScores(unittest.TestCase):
+    """Prints coherency scores for two sample PDFs from ``gnarly_pdfs``."""
+
+    def testBadOcr1(self):
+        # No assertions yet: the two scores are only printed for inspection.
+        pdf_dir = os.path.join(os.path.dirname(__file__), "gnarly_pdfs")
+        good_text = get_document_text(os.path.join(pdf_dir, "instructions_and_schematics.pdf"))
+        bad_text = get_document_text(os.path.join(pdf_dir, "handwriting_bad_ocr.pdf"))
+
+        good_score = get_document_coherency(good_text)
+        print("Good", good_score)
+        bad_score = get_document_coherency(bad_text)
+        print("Bad", bad_score)
diff --git a/tests/test_filter.py b/tests/test_filter.py
@@ -2,11 +2,22 @@
 import os
 
 from pdelfin.filter import PdfFilter
+from pdelfin.filter.imagedetect import pdf_page_image_area
+
+from pypdf import PdfReader
 
 class PdfFilterTest(unittest.TestCase):
+    # Exercises PdfFilter against sample PDFs stored under tests/gnarly_pdfs.
     def setUp(self) -> None:
+        # A fresh PdfFilter is created before each test method.
         self.filter = PdfFilter()
 
     def testFormLaterPages(self):
+        # _is_form must report a truthy result for this fixture; presumably its
+        # form fields only appear after the first page (per the file name) -- confirm.
         self.assertTrue(self.filter._is_form(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "form_on_later_pages.pdf")))
-
+
+
+class ImageDetectionTest(unittest.TestCase):
+    """Prints the per-page image-area ratio for an image-heavy slideshow PDF."""
+
+    def testSlideshowMostlyImages(self):
+        # No assertions yet: the ratios are only printed for inspection.
+        fixture_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "slideshow_mostly_images.pdf")
+        self.pdf = PdfReader(fixture_path)
+
+        page_count = self.pdf.get_num_pages()
+        for zero_based in range(page_count):
+            # pdf_page_image_area takes 1-indexed page numbers.
+            area_ratio = pdf_page_image_area(self.pdf, zero_based + 1)
+            print(zero_based, area_ratio)
+