diff --git a/pdelfin/eval/evalhtml.py b/pdelfin/eval/evalhtml.py
index b57cb3e..17cec53 100644
--- a/pdelfin/eval/evalhtml.py
+++ b/pdelfin/eval/evalhtml.py
@@ -12,36 +12,12 @@
from PIL import Image
from tqdm import tqdm
+from pdelfin.silver_data.renderpdf import render_pdf_to_base64png
+
session = boto3.Session(profile_name='s2')
s3_client = session.client('s3')
-def render_pdf_to_base64png(s3_path, page):
- with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
- pdf_path = tmp_pdf.name
- bucket, key = s3_path.replace("s3://", "").split('/', 1)
- s3_client.download_file(bucket, key, pdf_path)
-
- # Render the PDF to an image, and display it in the first position
- pdftoppm_result = subprocess.run(
- ["pdftoppm",
- "-png",
- "-f", str(page),
- "-l", str(page),
- pdf_path],
- timeout=120,
- stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr
-
- png_image = Image.open(io.BytesIO(pdftoppm_result.stdout))
- webp_output = io.BytesIO()
- png_image.save(webp_output, format="WEBP")
-
- image_base64 = base64.b64encode(webp_output.getvalue()).decode("utf-8")
-
- return image_base64
-
-
def process_entry(i, entry):
# Randomly decide whether to display gold on the left or right
if random.choice([True, False]):
@@ -62,9 +38,16 @@ def process_entry(i, entry):
s3_key = parsed_url.path.lstrip('/')
signed_pdf_link = s3_client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": s3_key}, ExpiresIn=604800)
+    # Download and render inside the with-block so the temp PDF is deleted on exit (delete=False leaked one file per entry)
+    with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp_pdf:
+        bucket, key = entry["s3_path"].replace("s3://", "").split('/', 1)
+        s3_client.download_fileobj(bucket, key, tmp_pdf)
+        tmp_pdf.flush()  # the renderer reopens the file by path, so the bytes must be on disk first
+        page_image_base64 = render_pdf_to_base64png(tmp_pdf.name, entry["page"], target_longest_image_dim=1024)
+
return {
"entry_id": i,
- "page_image": render_pdf_to_base64png(entry["s3_path"], entry["page"]),
+ "page_image": page_image_base64,
"s3_path": entry["s3_path"],
"page": entry["page"],
"signed_pdf_link": signed_pdf_link,
diff --git a/pdelfin/eval/runeval.py b/pdelfin/eval/runeval.py
index a6d07a1..d686674 100644
--- a/pdelfin/eval/runeval.py
+++ b/pdelfin/eval/runeval.py
@@ -24,6 +24,10 @@
from .evalhtml import create_review_html
+import logging
+
+logging.getLogger("pypdf").setLevel(logging.ERROR)
+
CACHE_DIR = os.path.join(Path.home(), ".cache", "pdf_gold_data_cache")
diff --git a/pdelfin/silver_data/buildsilver.py b/pdelfin/silver_data/buildsilver.py
index b386083..ba7e22c 100644
--- a/pdelfin/silver_data/buildsilver.py
+++ b/pdelfin/silver_data/buildsilver.py
@@ -12,6 +12,7 @@
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from urllib.parse import urlparse
+from pdelfin.silver_data.renderpdf import render_pdf_to_base64png
from pdelfin.prompts import build_openai_silver_data_prompt, openai_response_format_schema
from pdelfin.prompts.anchor import get_anchor_text
from pdelfin.filter import PdfFilter
@@ -22,30 +23,7 @@
pdf_filter = PdfFilter()
def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> dict:
- pdf = PdfReader(local_pdf_path)
- pdf_page = pdf.pages[page - 1]
- longest_dim = max(pdf_page.mediabox.width, pdf_page.mediabox.height)
-
- # Convert PDF page to PNG using pdftoppm
- pdftoppm_result = subprocess.run(
- [
- "pdftoppm",
- "-png",
- "-f",
- str(page),
- "-l",
- str(page),
- "-r",
- str(TARGET_IMAGE_DIM * 72 / longest_dim),
- local_pdf_path,
- ],
- timeout=120,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- )
- assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr
- image_base64 = base64.b64encode(pdftoppm_result.stdout).decode("utf-8")
-
+ image_base64 = render_pdf_to_base64png(local_pdf_path, page, TARGET_IMAGE_DIM)
anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
# DEBUG crappy temporary code here that does the actual api call live so I can debug it a bit
diff --git a/pdelfin/silver_data/renderpdf.py b/pdelfin/silver_data/renderpdf.py
new file mode 100644
index 0000000..9f4294e
--- /dev/null
+++ b/pdelfin/silver_data/renderpdf.py
@@ -0,0 +1,71 @@
+import subprocess
+import base64
+import io
+from pypdf import PdfReader
+
+from PIL import Image
+
+
+def render_pdf_to_base64png(local_pdf_path: str, page: int, target_longest_image_dim: int=2048):
+    """Render one page of a local PDF to PNG with pdftoppm and return it base64-encoded.
+
+    Args:
+        local_pdf_path: Filesystem path of the PDF to render.
+        page: 1-based page number; used as pypdf index ``page - 1`` and as pdftoppm's -f/-l.
+        target_longest_image_dim: Desired pixel length of the page's longest side.
+
+    Returns:
+        The PNG bytes pdftoppm wrote to stdout, base64-encoded as a str.
+
+    Raises:
+        AssertionError: If pdftoppm exits with a non-zero status (carries its stderr).
+        subprocess.TimeoutExpired: If pdftoppm runs longer than 120 seconds.
+    """
+    pdf = PdfReader(local_pdf_path)
+    pdf_page = pdf.pages[page - 1]
+    longest_dim = max(pdf_page.mediabox.width, pdf_page.mediabox.height)
+
+    # Convert PDF page to PNG using pdftoppm
+    pdftoppm_result = subprocess.run(
+        [
+            "pdftoppm",
+            "-png",
+            "-f",
+            str(page),
+            "-l",
+            str(page),
+            "-r",
+            # The mediabox is measured in PDF points (1/72 inch by default) and -r is DPI,
+            # so this DPI makes the longest side come out at target_longest_image_dim pixels
+            str(target_longest_image_dim * 72 / longest_dim),
+            local_pdf_path,
+        ],
+        timeout=120,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+    # Raise explicitly rather than via `assert`, which is stripped under `python -O`
+    if pdftoppm_result.returncode != 0:
+        raise AssertionError(pdftoppm_result.stderr)
+    return base64.b64encode(pdftoppm_result.stdout).decode("utf-8")
+
+
+def render_pdf_to_base64webp(local_pdf_path: str, page: int, target_longest_image_dim: int=1024):
+    """Render one page of a local PDF and return it as a base64-encoded WEBP image.
+
+    Args:
+        local_pdf_path: Filesystem path of the PDF to render.
+        page: 1-based page number, forwarded to render_pdf_to_base64png.
+        target_longest_image_dim: Desired pixel length of the page's longest side.
+
+    Returns:
+        The page image re-encoded as WEBP by PIL, base64-encoded as a str.
+    """
+    base64_png = render_pdf_to_base64png(local_pdf_path, page, target_longest_image_dim)
+
+    # Decode back to raw PNG bytes first; PIL cannot parse the base64 text itself
+    png_image = Image.open(io.BytesIO(base64.b64decode(base64_png)))
+    webp_output = io.BytesIO()
+    png_image.save(webp_output, format="WEBP")
+
+    return base64.b64encode(webp_output.getvalue()).decode("utf-8")
\ No newline at end of file