generated from allenai/python-package-template
-
Notifications
You must be signed in to change notification settings - Fork 397
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Moving a whole bunch of code over, still broken
- Loading branch information
1 parent
a534a01
commit 01bc0b2
Showing
8 changed files
with
164 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import subprocess | ||
import pymupdf | ||
|
||
from typing import Literal | ||
|
||
|
||
def get_page_text(local_pdf_path: str, page_num: int, pdf_engine: Literal["pdftotext", "pymupdf", "pdfium"]="pdftotext") -> str:
    """Extract the plain text of a single page of a local PDF.

    Args:
        local_pdf_path: Path to the PDF file on the local filesystem.
        page_num: 1-based page number (passed as-is to ``pdftotext -f/-l`` and
            converted to a 0-based index for PyMuPDF).
        pdf_engine: Extraction backend. ``"pdftotext"`` shells out to the
            ``pdftotext`` binary, ``"pymupdf"`` uses the PyMuPDF library.
            ``"pdfium"`` is accepted by the signature but not implemented.

    Returns:
        The extracted text of the requested page.

    Raises:
        AssertionError: If ``pdftotext`` exits with a non-zero status.
        subprocess.TimeoutExpired: If ``pdftotext`` runs longer than 60 seconds.
        NotImplementedError: If ``pdf_engine`` is not a supported backend.
    """
    if pdf_engine == "pdftotext":
        pdftotext_result = subprocess.run(
            [
                "pdftotext",
                "-f",
                str(page_num),
                "-l",
                str(page_num),
                local_pdf_path,
                "-",  # write the text to stdout instead of a file
            ],
            timeout=60,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Raised explicitly (rather than via `assert`) so the check survives
        # `python -O`; AssertionError is kept so existing callers see the same type.
        if pdftotext_result.returncode != 0:
            stderr_text = pdftotext_result.stderr.decode("utf-8", errors="replace").strip()
            raise AssertionError(
                f"pdftotext exited with code {pdftotext_result.returncode}: {stderr_text}"
            )

        return pdftotext_result.stdout.decode("utf-8")
    elif pdf_engine == "pymupdf":
        # The context manager closes the document (and its file handle) on exit.
        with pymupdf.open(local_pdf_path) as pm_doc:
            return pm_doc[page_num - 1].get_text()
    else:
        raise NotImplementedError(f"Unsupported pdf_engine: {pdf_engine!r}")
|
||
|
||
def get_document_text(local_pdf_path: str, pdf_engine: Literal["pdftotext", "pymupdf", "pdfium"]="pdftotext") -> str:
    """Extract the plain text of every page of a local PDF.

    Args:
        local_pdf_path: Path to the PDF file on the local filesystem.
        pdf_engine: Extraction backend. ``"pdftotext"`` shells out to the
            ``pdftotext`` binary, ``"pymupdf"`` uses the PyMuPDF library.
            ``"pdfium"`` is accepted by the signature but not implemented.

    Returns:
        The text of the whole document. With the PyMuPDF backend each page's
        text is followed by a newline.

    Raises:
        AssertionError: If ``pdftotext`` exits with a non-zero status.
        subprocess.TimeoutExpired: If ``pdftotext`` runs longer than 60 seconds.
        NotImplementedError: If ``pdf_engine`` is not a supported backend.
    """
    if pdf_engine == "pdftotext":
        pdftotext_result = subprocess.run(
            [
                "pdftotext",
                local_pdf_path,
                "-",  # write the text to stdout instead of a file
            ],
            timeout=60,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Raised explicitly (rather than via `assert`) so the check survives
        # `python -O`; AssertionError is kept so existing callers see the same type.
        if pdftotext_result.returncode != 0:
            stderr_text = pdftotext_result.stderr.decode("utf-8", errors="replace").strip()
            raise AssertionError(
                f"pdftotext exited with code {pdftotext_result.returncode}: {stderr_text}"
            )

        return pdftotext_result.stdout.decode("utf-8")
    elif pdf_engine == "pymupdf":
        # The context manager closes the document (and its file handle) on exit.
        with pymupdf.open(local_pdf_path) as pm_doc:
            return "".join(page.get_text() + "\n" for page in pm_doc)
    else:
        raise NotImplementedError(f"Unsupported pdf_engine: {pdf_engine!r}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .filter import PdfFilter |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Uses a premade kenLM filter trained on good DCLM filtered web data to help identify pdfs where the | ||
# content has been very poorly parsed | ||
import kenlm | ||
|
||
from functools import lru_cache | ||
from cached_path import cached_path | ||
|
||
# Remote location of the pretrained KenLM binary; resolved to a local file by
# cached_path when the model is first loaded.
KENLM_S3_PATH = "s3://ai2-oe-data/jakep/kenlm-dclm/5gramtok.bin"


@lru_cache()
def load_kenlm():
    """Load the KenLM model referenced by ``KENLM_S3_PATH``.

    The result is memoised with ``lru_cache``, so the file is resolved and the
    model constructed only on the first call; later calls return the same
    model object.

    Returns:
        The loaded ``kenlm.Model``.
    """
    local_path = cached_path(KENLM_S3_PATH)
    # cached_path hands back a path object, whereas the kenlm binding is
    # understood to accept only str/bytes paths, so convert explicitly.
    model = kenlm.Model(str(local_path))

    return model
|
||
|
||
def get_document_coherency(text: str) -> float:
    """Score ``text`` with the shared KenLM language model.

    Args:
        text: The document text to score.

    Returns:
        The value of ``model.score(text)`` for the cached model from
        ``load_kenlm()``. NOTE(review): presumably a total log-probability,
        which would not be normalised by text length -- confirm against the
        kenlm documentation before comparing documents of different sizes.
    """
    return load_kenlm().score(text)
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import numpy as np | ||
|
||
from pypdf import PdfReader | ||
from pypdf.generic import ContentStream | ||
from pypdf.generic import NumberObject, NameObject | ||
|
||
def process_content(content_stream, resources):
    """Sum the area painted by image XObjects in a page content stream.

    Walks the stream's operators while tracking the current transformation
    matrix (CTM) through ``q`` / ``Q`` / ``cm``, and for every ``Do`` that
    paints an ``/Image`` XObject adds the area that image covers.

    Args:
        content_stream: Object exposing ``operations``, an iterable of
            ``(operands, operator)`` pairs whose operators are ``bytes``.
        resources: Resource dictionary; images are looked up by name under
            its ``'/XObject'`` entry.

    Returns:
        Total painted image area as a float, in squared user-space units.
        Overlapping images are counted once per paint, clipping is ignored,
        and Form XObjects are not descended into.
    """
    total_image_area = 0.0
    graphics_state_stack = []
    current_matrix = np.eye(3)

    for operands, operator in content_stream.operations:
        if operator == b'q':  # Save graphics state
            graphics_state_stack.append(current_matrix.copy())
        elif operator == b'Q':  # Restore graphics state
            # Malformed streams can contain an unbalanced Q; keep the current
            # matrix instead of crashing on an empty stack.
            if graphics_state_stack:
                current_matrix = graphics_state_stack.pop()
        elif operator == b'cm':  # Concatenate matrix to CTM
            a, b, c, d, e, f = (float(value) for value in operands)
            cm_matrix = np.array([[a, b, 0.0], [c, d, 0.0], [e, f, 1.0]])
            # PDF pre-multiplies: new CTM = cm_matrix x old CTM. (The
            # determinant is the same in either order; this is the spec order.)
            current_matrix = np.matmul(cm_matrix, current_matrix)
        elif operator == b'Do':  # Paint external object
            xobject_name = operands[0]
            xobjects = resources['/XObject'] if '/XObject' in resources else {}
            if xobject_name in xobjects:
                xobject = xobjects[xobject_name]
                if xobject['/Subtype'] == '/Image':
                    # An image always fills the unit square of user space no
                    # matter how many samples (/Width x /Height) it has, so the
                    # painted area is |det(CTM)|. abs() handles flipped images,
                    # whose determinant is negative.
                    total_image_area += abs(float(np.linalg.det(current_matrix)))
    return total_image_area
|
||
|
||
|
||
|
||
def pdf_page_image_area(reader: PdfReader, page_num: int) -> float:
    """Return the image area of a page relative to the area of its media box.

    Args:
        reader: An open ``PdfReader`` for the document.
        page_num: 1-based page number.

    Returns:
        The image area reported by ``process_content`` divided by the media
        box area, or ``nan`` when the ratio is undefined (the page has no
        content stream, or its media box has zero area).
    """
    page = reader.pages[page_num - 1]

    page_width = float(page.mediabox.width)
    page_height = float(page.mediabox.height)
    page_area = page_width * page_height

    content = page.get_contents()
    # A degenerate media box would otherwise divide by zero below; treat it
    # like the no-content case and report the ratio as undefined.
    if content is None or page_area == 0:
        return float("nan")

    content_stream = ContentStream(content, reader)
    # A page may carry no /Resources entry; fall back to an empty mapping so
    # such pages are processed instead of raising KeyError.
    resources = page['/Resources'] if '/Resources' in page else {}

    image_area = process_content(content_stream, resources)

    return image_area / page_area
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import os | ||
|
||
import unittest | ||
|
||
from pdelfin.filter.coherency import get_document_coherency | ||
from pdelfin.extract_text import get_document_text | ||
|
||
|
||
class TestCoherencyScores(unittest.TestCase):
    """Smoke test that prints coherency scores for two sample PDFs."""

    def testBadOcr1(self):
        """Score one cleanly parsed and one badly OCR'd PDF and print both.

        The test makes no assertions; it fails only if extraction or scoring
        raises.
        """
        fixtures_dir = os.path.join(os.path.dirname(__file__), "gnarly_pdfs")
        clean_pdf = os.path.join(fixtures_dir, "instructions_and_schematics.pdf")
        garbled_pdf = os.path.join(fixtures_dir, "handwriting_bad_ocr.pdf")

        good_text = get_document_text(clean_pdf)
        bad_text = get_document_text(garbled_pdf)

        good_score = get_document_coherency(good_text)
        print("Good", good_score)
        bad_score = get_document_coherency(bad_text)
        print("Bad", bad_score)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters