Skip to content

Commit

Permalink
Switching buildsilver to use new anchor code
Browse files (browse the repository at this point in the history)
  • Loading branch information
jakep-allenai committed Oct 2, 2024
1 parent 0071cbd commit be00ccf
Showing 1 changed file with 3 additions and 10 deletions.
13 changes: 3 additions & 10 deletions pdelfin/silver_data/buildsilver.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from urllib.parse import urlparse

from pdelfin.prompts import build_openai_silver_data_prompt
+from pdelfin.prompts.anchor import get_anchor_text
from pdelfin.filter import PdfFilter

TARGET_IMAGE_DIM = 2048
Expand Down Expand Up @@ -45,15 +46,7 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr
image_base64 = base64.b64encode(pdftoppm_result.stdout).decode("utf-8")

-# Extract text from the PDF page using pdftotext
-pdftotext_result = subprocess.run(
-["pdftotext", "-f", str(page), "-l", str(page), local_pdf_path, "-"],
-timeout=60,
-stdout=subprocess.PIPE,
-stderr=subprocess.PIPE,
-)
-assert pdftotext_result.returncode == 0
-base_text = pdftotext_result.stdout.decode("utf-8")
+anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")

# Construct OpenAI Batch API request format
return {
Expand All @@ -66,7 +59,7 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
{
"role": "user",
"content": [
-{"type": "text", "text": build_openai_silver_data_prompt(base_text)},
+{"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
],
}
Expand Down

0 comments on commit be00ccf

Please sign in to comment.