Skip to content

Commit

Permalink
Moving a whole bunch of code over, still broken
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Sep 17, 2024
1 parent a534a01 commit 01bc0b2
Show file tree
Hide file tree
Showing 8 changed files with 164 additions and 1 deletion.
58 changes: 58 additions & 0 deletions pdelfin/extract_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import subprocess
import pymupdf

from typing import Literal


def get_page_text(local_pdf_path: str, page_num: int, pdf_engine: Literal["pdftotext", "pymupdf", "pdfium"]="pdftotext") -> str:
    """Extract the text of a single page of a PDF.

    Args:
        local_pdf_path: Path to the PDF file on the local filesystem.
        page_num: 1-indexed number of the page to extract.
        pdf_engine: Extraction backend. "pdftotext" shells out to the
            ``pdftotext`` command-line tool; "pymupdf" uses the PyMuPDF
            library. "pdfium" is accepted by the type hint but is not
            implemented.

    Returns:
        The text of the requested page.

    Raises:
        ValueError: If ``page_num`` is smaller than 1.
        AssertionError: If ``pdftotext`` exits with a non-zero status.
        subprocess.TimeoutExpired: If ``pdftotext`` runs longer than 60 seconds.
        NotImplementedError: If ``pdf_engine`` is not a supported backend.
    """
    # Pages are 1-indexed. Without this guard, page 0 would become index -1
    # in the PyMuPDF branch and silently return the *last* page.
    if page_num < 1:
        raise ValueError(f"page_num is 1-indexed and must be >= 1, got {page_num}")

    if pdf_engine == "pdftotext":
        pdftotext_result = subprocess.run(
            [
                "pdftotext",
                "-f",
                str(page_num),
                "-l",
                str(page_num),
                local_pdf_path,
                "-",  # write the extracted text to stdout
            ],
            timeout=60,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Raised explicitly (instead of using an `assert` statement) so the
        # check still runs under `python -O`. The exception type is kept as
        # AssertionError so callers that already catch it are unaffected.
        if pdftotext_result.returncode != 0:
            raise AssertionError(
                f"pdftotext exited with status {pdftotext_result.returncode}: "
                f"{pdftotext_result.stderr.decode('utf-8', errors='replace')}"
            )

        return pdftotext_result.stdout.decode("utf-8")
    elif pdf_engine == "pymupdf":
        # The context manager closes the document (and its file handle)
        # once the page text has been read.
        with pymupdf.open(local_pdf_path) as pm_doc:
            return pm_doc[page_num - 1].get_text()
    else:
        raise NotImplementedError()


def get_document_text(local_pdf_path: str, pdf_engine: Literal["pdftotext", "pymupdf", "pdfium"]="pdftotext") -> str:
    """Extract the text of every page of a PDF as one string.

    Args:
        local_pdf_path: Path to the PDF file on the local filesystem.
        pdf_engine: Extraction backend. "pdftotext" shells out to the
            ``pdftotext`` command-line tool; "pymupdf" uses the PyMuPDF
            library. "pdfium" is accepted by the type hint but is not
            implemented.

    Returns:
        The text of the whole document. With the "pymupdf" backend, each
        page's text is followed by a newline.

    Raises:
        AssertionError: If ``pdftotext`` exits with a non-zero status.
        subprocess.TimeoutExpired: If ``pdftotext`` runs longer than 60 seconds.
        NotImplementedError: If ``pdf_engine`` is not a supported backend.
    """
    if pdf_engine == "pdftotext":
        pdftotext_result = subprocess.run(
            [
                "pdftotext",
                local_pdf_path,
                "-",  # write the extracted text to stdout
            ],
            timeout=60,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Raised explicitly (instead of using an `assert` statement) so the
        # check still runs under `python -O`. The exception type is kept as
        # AssertionError so callers that already catch it are unaffected.
        if pdftotext_result.returncode != 0:
            raise AssertionError(
                f"pdftotext exited with status {pdftotext_result.returncode}: "
                f"{pdftotext_result.stderr.decode('utf-8', errors='replace')}"
            )

        return pdftotext_result.stdout.decode("utf-8")
    elif pdf_engine == "pymupdf":
        # The context manager closes the document (and its file handle)
        # once all pages have been read.
        with pymupdf.open(local_pdf_path) as pm_doc:
            return "".join(page.get_text() + "\n" for page in pm_doc)
    else:
        raise NotImplementedError()
1 change: 1 addition & 0 deletions pdelfin/filter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .filter import PdfFilter
21 changes: 21 additions & 0 deletions pdelfin/filter/coherency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Uses a premade KenLM language model, trained on good DCLM-filtered web data, to help identify PDFs
# whose content has been very poorly parsed.
import kenlm

from functools import lru_cache
from cached_path import cached_path

# Location of the KenLM model binary that load_kenlm() fetches.
KENLM_S3_PATH = "s3://ai2-oe-data/jakep/kenlm-dclm/5gramtok.bin"


@lru_cache()
def load_kenlm():
    """Load the KenLM model, reusing one instance for the whole process.

    The model file is resolved through ``cached_path`` and the loaded model is
    memoized by ``lru_cache``, so repeated calls return the same object.

    Returns:
        The loaded ``kenlm.Model``.
    """
    local_path = cached_path(KENLM_S3_PATH)

    # cached_path hands back a path object, while the kenlm binding expects a
    # plain string path, so convert explicitly.
    model = kenlm.Model(str(local_path))

    return model


def get_document_coherency(text: str) -> float:
    """Score ``text`` with the cached KenLM model.

    Args:
        text: The document text to score; it is passed to the model as a
            single string.

    Returns:
        The value of ``model.score(text)``.
        NOTE(review): presumably a log-probability where higher means more
        fluent text, and not normalized for length -- confirm against the
        KenLM documentation before comparing documents of different sizes.
    """
    return load_kenlm().score(text)
File renamed without changes.
54 changes: 54 additions & 0 deletions pdelfin/filter/imagedetect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import numpy as np

from pypdf import PdfReader
from pypdf.generic import ContentStream
from pypdf.generic import NumberObject, NameObject

def process_content(content_stream, resources):
    """Sum the page area covered by image XObjects painted in a content stream.

    Walks the operations while tracking the current transformation matrix
    (CTM) through ``q``/``Q``/``cm``, and adds up the area of every image
    painted with ``Do``. An image is painted into the unit square of user
    space, so the area it covers is the absolute determinant of the CTM at the
    time of the ``Do``; the image's pixel dimensions do not affect it.

    Args:
        content_stream: Object whose ``operations`` attribute yields
            ``(operands, operator)`` pairs, with ``operator`` as bytes.
        resources: Mapping that may contain an ``'/XObject'`` mapping from
            XObject names to XObject dictionaries.

    Returns:
        Total painted image area as a float, in squared user-space units.
        Overlapping images are each counted in full.
        NOTE(review): images nested inside Form XObjects are not visited.
    """
    total_image_area = 0.0
    graphics_state_stack = []
    current_matrix = np.eye(3)

    for operands, operator in content_stream.operations:
        if operator == b'q':  # Save graphics state
            graphics_state_stack.append(current_matrix.copy())
        elif operator == b'Q':  # Restore graphics state
            # Ignore an unbalanced Q (malformed stream) instead of crashing
            # on an empty stack.
            if graphics_state_stack:
                current_matrix = graphics_state_stack.pop()
        elif operator == b'cm':  # Concatenate matrix to CTM
            # Coerce to plain floats so the matrix has a numeric dtype
            # whatever numeric object type the parser produced.
            a, b, c, d, e, f = (float(value) for value in operands)
            cm_matrix = np.array([[a, b, 0.0], [c, d, 0.0], [e, f, 1.0]])
            # The new matrix is pre-multiplied onto the CTM (CTM' = M x CTM).
            current_matrix = np.matmul(cm_matrix, current_matrix)
        elif operator == b'Do':  # Paint external object
            xobject_name = operands[0]
            if '/XObject' not in resources:
                continue
            xobjects = resources['/XObject']
            if xobject_name not in xobjects:
                continue
            xobject = xobjects[xobject_name]
            if xobject['/Subtype'] == '/Image':
                # Absolute value: a flipped image (negative scale) has a
                # negative determinant but still covers positive area.
                total_image_area += abs(float(np.linalg.det(current_matrix)))
    return total_image_area




def pdf_page_image_area(reader: PdfReader, page_num: int) -> float:
    """Return the fraction of a page's MediaBox area that is painted with images.

    Args:
        reader: Open ``PdfReader`` for the document.
        page_num: 1-indexed page number.

    Returns:
        The image area reported by ``process_content`` divided by the MediaBox
        area. ``nan`` is returned when the page has no content stream or when
        its MediaBox has zero area.
    """
    page = reader.pages[page_num - 1]

    page_width = float(page.mediabox.width)
    page_height = float(page.mediabox.height)
    page_area = page_width * page_height

    content = page.get_contents()
    # No content, or a degenerate page box: there is no meaningful ratio.
    if content is None or page_area == 0:
        return float("nan")

    content_stream = ContentStream(content, reader)
    # A page without a /Resources entry cannot reference any image XObjects,
    # so treat it as having none instead of raising KeyError.
    resources = page['/Resources'] if '/Resources' in page else {}

    image_area = process_content(content_stream, resources)

    return image_area / page_area
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@ authors = [
]
requires-python = ">=3.8"
dependencies = [
    "cached-path",
    "pypdf",
    "pymupdf",
    "lingua-language-detector",
    # kenlm is installed straight from the GitHub archive; a direct URL
    # requirement must be written in the "name @ url" form.
    "kenlm @ https://github.com/kpu/kenlm/archive/master.zip",
]
license = {file = "LICENSE"}

Expand Down
15 changes: 15 additions & 0 deletions tests/test_coherency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import os

import unittest

from pdelfin.filter.coherency import get_document_coherency
from pdelfin.extract_text import get_document_text


class TestCoherencyScores(unittest.TestCase):
    """Prints coherency scores for a well-parsed and a badly-OCRed sample PDF."""

    def testBadOcr1(self):
        # Both sample documents live in the gnarly_pdfs folder next to this test file.
        pdf_dir = os.path.join(os.path.dirname(__file__), "gnarly_pdfs")

        good_text = get_document_text(os.path.join(pdf_dir, "instructions_and_schematics.pdf"))
        bad_text = get_document_text(os.path.join(pdf_dir, "handwriting_bad_ocr.pdf"))

        # No assertion here: the scores are only printed for manual inspection.
        for label, text in (("Good", good_text), ("Bad", bad_text)):
            print(label, get_document_coherency(text))
13 changes: 12 additions & 1 deletion tests/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,22 @@
import os

from pdelfin.filter import PdfFilter
from pdelfin.filter.imagedetect import pdf_page_image_area

from pypdf import PdfReader

class PdfFilterTest(unittest.TestCase):
    """Checks PdfFilter against sample PDFs stored next to this test file."""

    def setUp(self) -> None:
        # A fresh filter instance for every test method.
        self.filter = PdfFilter()

    def testFormLaterPages(self):
        # The sample's file name indicates that its form fields are not on the first page.
        pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "form_on_later_pages.pdf")
        is_form = self.filter._is_form(pdf_path)
        self.assertTrue(is_form)



class ImageDetectionTest(unittest.TestCase):
    """Prints the per-page image coverage of an image-heavy slideshow PDF."""

    def testSlideshowMostlyImages(self):
        pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "slideshow_mostly_images.pdf")
        self.pdf = PdfReader(pdf_path)

        # pdf_page_image_area takes 1-indexed page numbers; the printed index stays 0-based.
        page_count = self.pdf.get_num_pages()
        for page_number in range(1, page_count + 1):
            print(page_number - 1, pdf_page_image_area(self.pdf, page_number))

0 comments on commit 01bc0b2

Please sign in to comment.