Skip to content

Commit

Permalink
Cleaning up anchor text to deal with abnormally long lines
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Oct 9, 2024
1 parent b6b74b7 commit dc6440d
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 9 deletions.
43 changes: 34 additions & 9 deletions pdelfin/prompts/anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

# coherency score best of these three
import subprocess
import math
import re
import ftfy
from dataclasses import dataclass
from typing import Literal, List
Expand Down Expand Up @@ -219,18 +219,46 @@ def bboxes_overlap(b1: BoundingBox, b2: BoundingBox, tolerance: float) -> bool:
# Return the merged images along with other elements
return merged_images

def _cap_split_string(text: str, max_length: int) -> str:
if len(text) <= max_length:
return text

head_length = max_length // 2 - 3
tail_length = head_length

head = text[:head_length].rsplit(' ', 1)[0] or text[:head_length]
tail = text[-tail_length:].split(' ', 1)[-1] or text[-tail_length:]

return f"{head} ... {tail}"

def _cleanup_element_text(element_text: str) -> str:
    """Normalize one text element before it is written into the anchor text.

    Runs ``ftfy.fix_text`` on the input and strips surrounding whitespace,
    then backslash-escapes square brackets and rewrites newline,
    carriage-return and tab characters as their two-character escape
    sequences. The result is capped at 250 characters via
    ``_cap_split_string``.
    """
    max_element_chars = 250
    # Every key is a single character, so one translate() pass applies all
    # substitutions at once.
    escape_table = str.maketrans({
        "[": "\\[",
        "]": "\\]",
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    })

    repaired = ftfy.fix_text(element_text).strip()
    escaped = repaired.translate(escape_table)

    return _cap_split_string(escaped, max_element_chars)

def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
result = ""

result += f"Page dimensions: {report.mediabox.x1:.1f}x{report.mediabox.y1:.1f}\n"

images = _merge_image_elements(report.image_elements)

# Process image elements
image_strings = []
for element in images:
image_str = f"[Image {element.bbox.x0:.0f}x{element.bbox.y0:.0f} to {element.bbox.x1:.0f}x{element.bbox.y1:.0f}]"
image_str = f"[Image {element.bbox.x0:.0f}x{element.bbox.y0:.0f} to {element.bbox.x1:.0f}x{element.bbox.y1:.0f}]\n"
# Use element's unique identifier (e.g., id or position) for comparison
image_strings.append((element, image_str))

Expand All @@ -239,12 +267,9 @@ def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
for element in report.text_elements:
if len(element.text.strip()) == 0:
continue

element_text = ftfy.fix_text(element.text)
# Replace square brackets with escaped brackets
element_text = element_text.replace("[", "\\[").replace("]", "\\]")

text_str = f"[{element.x:.0f}x{element.y:.0f}]{element_text}"

element_text = _cleanup_element_text(element.text)
text_str = f"[{element.x:.0f}x{element.y:.0f}]{element_text}\n"
text_strings.append((element, text_str))

# Combine all elements with their positions for sorting
Expand Down
Binary file added tests/gnarly_pdfs/large_prompt_hint3.pdf
Binary file not shown.
9 changes: 9 additions & 0 deletions tests/test_anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,15 @@ def testLargePromptHint2(self):
print(len(anchor_text))
self.assertLess(len(anchor_text), 4000)

def testLargePromptHint3(self):
    """Check the anchor text built from large_prompt_hint3.pdf stays under 4000 characters.

    Calls ``get_anchor_text`` on the fixture with the arguments
    ``(path, 2, pdf_engine="pdfreport")`` and prints the text and its length
    before asserting on the length.
    """
    fixtures_dir = os.path.join(os.path.dirname(__file__), "gnarly_pdfs")
    local_pdf_path = os.path.join(fixtures_dir, "large_prompt_hint3.pdf")

    anchor_text = get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport")
    anchor_length = len(anchor_text)

    # Printed so the generated text is available for inspection in the test output.
    print(anchor_text)
    print(anchor_length)
    self.assertLess(anchor_length, 4000)

def testNewsPaperPromptHint(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "newspaper.pdf")

Expand Down

0 comments on commit dc6440d

Please sign in to comment.