Skip to content

Commit

Permalink
Unifying some of the pdf rendering stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Oct 9, 2024
1 parent dc6440d commit 400e921
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 51 deletions.
37 changes: 10 additions & 27 deletions pdelfin/eval/evalhtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,36 +12,12 @@
from PIL import Image
from tqdm import tqdm

from pdelfin.silver_data.renderpdf import render_pdf_to_base64png

# Module-level boto3 session bound to the 's2' AWS profile, and the S3 client
# created from it; both are built once at import time.
session = boto3.Session(profile_name='s2')
s3_client = session.client('s3')


def render_pdf_to_base64png(s3_path, page):
    """Download a PDF from S3, render one page, and return it as base64 text.

    NOTE: despite the function name, the returned image is re-encoded as
    WebP before being base64-encoded.

    Args:
        s3_path: Location of the PDF in the form ``s3://bucket/key``.
        page: Page number handed to pdftoppm as both ``-f`` and ``-l``.

    Returns:
        The base64 (UTF-8 str) encoding of the WebP image bytes.

    Raises:
        AssertionError: If pdftoppm exits with a non-zero return code; the
            message is pdftoppm's stderr.
        subprocess.TimeoutExpired: If pdftoppm runs longer than 120 seconds.
    """
    import contextlib
    import os

    bucket, key = s3_path.replace("s3://", "").split('/', 1)

    # delete=False because the file is written and read by name (boto3 and
    # pdftoppm); we remove it ourselves below so it does not leak.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
        pdf_path = tmp_pdf.name

    try:
        s3_client.download_file(bucket, key, pdf_path)

        # Render only the requested page; the PNG is read from stdout.
        pdftoppm_result = subprocess.run(
            ["pdftoppm",
             "-png",
             "-f", str(page),
             "-l", str(page),
             pdf_path],
            timeout=120,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    finally:
        with contextlib.suppress(FileNotFoundError):
            os.remove(pdf_path)

    # Explicit raise (not `assert`) so the check survives `python -O`; the
    # exception type is unchanged for existing callers.
    if pdftoppm_result.returncode != 0:
        raise AssertionError(pdftoppm_result.stderr)

    png_image = Image.open(io.BytesIO(pdftoppm_result.stdout))
    webp_output = io.BytesIO()
    png_image.save(webp_output, format="WEBP")

    return base64.b64encode(webp_output.getvalue()).decode("utf-8")


def process_entry(i, entry):
# Randomly decide whether to display gold on the left or right
if random.choice([True, False]):
Expand All @@ -62,9 +38,16 @@ def process_entry(i, entry):
s3_key = parsed_url.path.lstrip('/')
signed_pdf_link = s3_client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": s3_key}, ExpiresIn=604800)

with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
pdf_path = tmp_pdf.name
bucket, key = entry["s3_path"].replace("s3://", "").split('/', 1)
s3_client.download_file(bucket, key, pdf_path)

page_image_base64 = render_pdf_to_base64png(tmp_pdf.name, entry["page"], target_longest_image_dim=1024)

return {
"entry_id": i,
"page_image": render_pdf_to_base64png(entry["s3_path"], entry["page"]),
"page_image": page_image_base64,
"s3_path": entry["s3_path"],
"page": entry["page"],
"signed_pdf_link": signed_pdf_link,
Expand Down
4 changes: 4 additions & 0 deletions pdelfin/eval/runeval.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@

from .evalhtml import create_review_html

import logging

# Raise the "pypdf" logger's threshold to ERROR so its lower-severity
# messages (warnings and below) are not emitted.
logging.getLogger("pypdf").setLevel(logging.ERROR)


# Directory path: <home>/.cache/pdf_gold_data_cache
CACHE_DIR = os.path.join(Path.home(), ".cache", "pdf_gold_data_cache")

Expand Down
26 changes: 2 additions & 24 deletions pdelfin/silver_data/buildsilver.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from urllib.parse import urlparse

from pdelfin.silver_data.renderpdf import render_pdf_to_base64png
from pdelfin.prompts import build_openai_silver_data_prompt, openai_response_format_schema
from pdelfin.prompts.anchor import get_anchor_text
from pdelfin.filter import PdfFilter
Expand All @@ -22,30 +23,7 @@
pdf_filter = PdfFilter()

def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> dict:
pdf = PdfReader(local_pdf_path)
pdf_page = pdf.pages[page - 1]
longest_dim = max(pdf_page.mediabox.width, pdf_page.mediabox.height)

# Convert PDF page to PNG using pdftoppm
pdftoppm_result = subprocess.run(
[
"pdftoppm",
"-png",
"-f",
str(page),
"-l",
str(page),
"-r",
str(TARGET_IMAGE_DIM * 72 / longest_dim),
local_pdf_path,
],
timeout=120,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr
image_base64 = base64.b64encode(pdftoppm_result.stdout).decode("utf-8")

image_base64 = render_pdf_to_base64png(local_pdf_path, page, TARGET_IMAGE_DIM)
anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")

# DEBUG crappy temporary code here that does the actual api call live so I can debug it a bit
Expand Down
42 changes: 42 additions & 0 deletions pdelfin/silver_data/renderpdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import subprocess
import base64
import io
from pypdf import PdfReader

from PIL import Image


def render_pdf_to_base64png(local_pdf_path: str, page: int, target_longest_image_dim: int = 2048) -> str:
    """Render one page of a local PDF to PNG with pdftoppm and base64-encode it.

    The render resolution is chosen so that the longest side of the page's
    mediabox comes out at roughly ``target_longest_image_dim`` pixels.

    Args:
        local_pdf_path: Filesystem path of the PDF to render.
        page: 1-based page number (index ``page - 1`` in the pypdf reader,
            and passed as both ``-f`` and ``-l`` to pdftoppm).
        target_longest_image_dim: Desired pixel length of the longest side.

    Returns:
        The base64 (UTF-8 str) encoding of pdftoppm's stdout (the PNG bytes).

    Raises:
        AssertionError: If pdftoppm exits with a non-zero return code; the
            message is pdftoppm's stderr.
        subprocess.TimeoutExpired: If pdftoppm runs longer than 120 seconds.
    """
    pdf = PdfReader(local_pdf_path)
    pdf_page = pdf.pages[page - 1]
    longest_dim = max(pdf_page.mediabox.width, pdf_page.mediabox.height)

    # The mediabox is measured in PDF points (72 points per inch) and
    # pdftoppm's -r takes dots per inch, so:
    #   dpi = target_pixels / (longest_dim / 72) = target_pixels * 72 / longest_dim
    resolution = target_longest_image_dim * 72 / longest_dim

    # Convert PDF page to PNG using pdftoppm
    pdftoppm_result = subprocess.run(
        [
            "pdftoppm",
            "-png",
            "-f",
            str(page),
            "-l",
            str(page),
            "-r",
            str(resolution),
            local_pdf_path,
        ],
        timeout=120,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    # Explicit raise (not `assert`) so a failed render is still reported when
    # Python runs with -O; the exception type is unchanged for callers.
    if pdftoppm_result.returncode != 0:
        raise AssertionError(pdftoppm_result.stderr)

    return base64.b64encode(pdftoppm_result.stdout).decode("utf-8")


def render_pdf_to_base64webp(local_pdf_path: str, page: int, target_longest_image_dim: int = 1024) -> str:
    """Render one page of a local PDF and return it as base64-encoded WebP.

    The page is first rendered to PNG by ``render_pdf_to_base64png`` and then
    re-encoded to WebP with Pillow.

    Args:
        local_pdf_path: Filesystem path of the PDF to render.
        page: Page number, forwarded unchanged to ``render_pdf_to_base64png``.
        target_longest_image_dim: Target pixel length of the longest side,
            forwarded unchanged to ``render_pdf_to_base64png``.

    Returns:
        The base64 (UTF-8 str) encoding of the WebP image bytes.
    """
    base64_png = render_pdf_to_base64png(local_pdf_path, page, target_longest_image_dim)

    # The helper returns base64 *text*; decode it back to the raw PNG bytes
    # before handing it to Pillow. (Encoding the text as UTF-8 would pass the
    # base64 characters themselves, which is not a valid image stream.)
    png_bytes = base64.b64decode(base64_png)

    png_image = Image.open(io.BytesIO(png_bytes))
    webp_output = io.BytesIO()
    png_image.save(webp_output, format="WEBP")

    return base64.b64encode(webp_output.getvalue()).decode("utf-8")

0 comments on commit 400e921

Please sign in to comment.