diff --git a/pdelfin/eval/evalhtml.py b/pdelfin/eval/evalhtml.py index b57cb3e..17cec53 100644 --- a/pdelfin/eval/evalhtml.py +++ b/pdelfin/eval/evalhtml.py @@ -12,36 +12,12 @@ from PIL import Image from tqdm import tqdm +from pdelfin.silver_data.renderpdf import render_pdf_to_base64png + session = boto3.Session(profile_name='s2') s3_client = session.client('s3') -def render_pdf_to_base64png(s3_path, page): - with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf: - pdf_path = tmp_pdf.name - bucket, key = s3_path.replace("s3://", "").split('/', 1) - s3_client.download_file(bucket, key, pdf_path) - - # Render the PDF to an image, and display it in the first position - pdftoppm_result = subprocess.run( - ["pdftoppm", - "-png", - "-f", str(page), - "-l", str(page), - pdf_path], - timeout=120, - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr - - png_image = Image.open(io.BytesIO(pdftoppm_result.stdout)) - webp_output = io.BytesIO() - png_image.save(webp_output, format="WEBP") - - image_base64 = base64.b64encode(webp_output.getvalue()).decode("utf-8") - - return image_base64 - - def process_entry(i, entry): # Randomly decide whether to display gold on the left or right if random.choice([True, False]): @@ -62,9 +38,16 @@ def process_entry(i, entry): s3_key = parsed_url.path.lstrip('/') signed_pdf_link = s3_client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": s3_key}, ExpiresIn=604800) + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf: + pdf_path = tmp_pdf.name + bucket, key = entry["s3_path"].replace("s3://", "").split('/', 1) + s3_client.download_file(bucket, key, pdf_path) + + page_image_base64 = render_pdf_to_base64png(tmp_pdf.name, entry["page"], target_longest_image_dim=1024) + return { "entry_id": i, - "page_image": render_pdf_to_base64png(entry["s3_path"], entry["page"]), + "page_image": page_image_base64, "s3_path": 
entry["s3_path"], "page": entry["page"], "signed_pdf_link": signed_pdf_link, diff --git a/pdelfin/eval/runeval.py b/pdelfin/eval/runeval.py index a6d07a1..d686674 100644 --- a/pdelfin/eval/runeval.py +++ b/pdelfin/eval/runeval.py @@ -24,6 +24,10 @@ from .evalhtml import create_review_html +import logging + +logging.getLogger("pypdf").setLevel(logging.ERROR) + CACHE_DIR = os.path.join(Path.home(), ".cache", "pdf_gold_data_cache") diff --git a/pdelfin/silver_data/buildsilver.py b/pdelfin/silver_data/buildsilver.py index b386083..ba7e22c 100644 --- a/pdelfin/silver_data/buildsilver.py +++ b/pdelfin/silver_data/buildsilver.py @@ -12,6 +12,7 @@ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed from urllib.parse import urlparse +from pdelfin.silver_data.renderpdf import render_pdf_to_base64png from pdelfin.prompts import build_openai_silver_data_prompt, openai_response_format_schema from pdelfin.prompts.anchor import get_anchor_text from pdelfin.filter import PdfFilter @@ -22,30 +23,7 @@ pdf_filter = PdfFilter() def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> dict: - pdf = PdfReader(local_pdf_path) - pdf_page = pdf.pages[page - 1] - longest_dim = max(pdf_page.mediabox.width, pdf_page.mediabox.height) - - # Convert PDF page to PNG using pdftoppm - pdftoppm_result = subprocess.run( - [ - "pdftoppm", - "-png", - "-f", - str(page), - "-l", - str(page), - "-r", - str(TARGET_IMAGE_DIM * 72 / longest_dim), - local_pdf_path, - ], - timeout=120, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr - image_base64 = base64.b64encode(pdftoppm_result.stdout).decode("utf-8") - + image_base64 = render_pdf_to_base64png(local_pdf_path, page, TARGET_IMAGE_DIM) anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport") # DEBUG crappy temporary code here that does the actual api call live so I can debug it a bit diff --git 
a/pdelfin/silver_data/renderpdf.py b/pdelfin/silver_data/renderpdf.py
new file mode 100644
index 0000000..9f4294e
--- /dev/null
+++ b/pdelfin/silver_data/renderpdf.py
@@ -0,0 +1,77 @@
+import subprocess
+import base64
+import io
+from pypdf import PdfReader
+
+from PIL import Image
+
+
+def render_pdf_to_base64png(local_pdf_path: str, page: int, target_longest_image_dim: int=2048):
+    """Render one page of a local PDF to a base64-encoded PNG.
+
+    The page is rasterized by the external ``pdftoppm`` tool at a resolution
+    chosen so that the longest side of the page's mediabox comes out at
+    roughly ``target_longest_image_dim`` pixels.
+
+    Args:
+        local_pdf_path: Path to the PDF file on the local filesystem.
+        page: 1-indexed page number to render.
+        target_longest_image_dim: Desired pixel length of the longest side.
+
+    Returns:
+        The PNG bytes written by ``pdftoppm``, base64-encoded as a ``str``.
+
+    Raises:
+        AssertionError: If ``pdftoppm`` exits with a non-zero return code.
+        subprocess.TimeoutExpired: If ``pdftoppm`` runs longer than 120 seconds.
+    """
+    pdf = PdfReader(local_pdf_path)
+    pdf_page = pdf.pages[page - 1]
+    longest_dim = max(pdf_page.mediabox.width, pdf_page.mediabox.height)
+
+    # Convert PDF page to PNG using pdftoppm
+    pdftoppm_result = subprocess.run(
+        [
+            "pdftoppm",
+            "-png",
+            "-f",
+            str(page),
+            "-l",
+            str(page),
+            "-r",
+            # The mediabox is in PDF units (1/72 inch by default), so this is
+            # the DPI at which the longest side spans the requested pixels.
+            str(target_longest_image_dim * 72 / longest_dim),
+            local_pdf_path,
+        ],
+        timeout=120,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+    assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr
+    return base64.b64encode(pdftoppm_result.stdout).decode("utf-8")
+
+
+def render_pdf_to_base64webp(local_pdf_path: str, page: int, target_longest_image_dim: int=1024):
+    """Render one page of a local PDF to a base64-encoded WEBP image.
+
+    Renders the page to PNG via ``render_pdf_to_base64png`` and re-encodes the
+    result as WEBP with Pillow.
+
+    Args:
+        local_pdf_path: Path to the PDF file on the local filesystem.
+        page: 1-indexed page number to render.
+        target_longest_image_dim: Desired pixel length of the longest side.
+
+    Returns:
+        The WEBP image bytes, base64-encoded as a ``str``.
+    """
+    base64_png = render_pdf_to_base64png(local_pdf_path, page, target_longest_image_dim)
+
+    # The helper returns base64 text; decode it back to raw PNG bytes before
+    # handing it to Pillow (encoding the text as UTF-8 is not a valid image).
+    png_image = Image.open(io.BytesIO(base64.b64decode(base64_png)))
+    webp_output = io.BytesIO()
+    png_image.save(webp_output, format="WEBP")
+
+    return base64.b64encode(webp_output.getvalue()).decode("utf-8")