Skip to content

Commit

Permalink
Adding empty anchor support
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Oct 23, 2024
1 parent f8c5aac commit 999f64d
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 0 deletions.
3 changes: 3 additions & 0 deletions pdelfin/prompts/anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,9 @@ def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
result = ""
result += f"Page dimensions: {report.mediabox.x1:.1f}x{report.mediabox.y1:.1f}\n"

if max_length < 20:
return result

images = _merge_image_elements(report.image_elements)

# Process image elements
Expand Down
22 changes: 22 additions & 0 deletions tests/test_anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,21 @@ def testTobaccoPaperMissingParagraphs(self):
print(len(anchor_text))
self.assertLess(len(anchor_text), 4000)

def testAnchorOtherLengths(self):
    """Check that anchor text stays under non-default target lengths.

    Runs the "pdfreport" engine on page 1 of the tobacco sample PDF with
    target lengths of 2000 and 6000, and asserts that each result is
    strictly shorter than the length that was requested.
    """
    pdf_dir = os.path.join(os.path.dirname(__file__), "gnarly_pdfs")
    local_pdf_path = os.path.join(pdf_dir, "tobacco_missed_tokens_pg1.pdf")

    for limit in (2000, 6000):
        anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=limit)
        anchor_length = len(anchor_text)

        # Echo the text and its length so a failing run is easy to diagnose.
        print(anchor_text)
        print(anchor_length)
        self.assertLess(anchor_length, limit)

def testFailingAnchor(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "failing_anchor_pg4.pdf")

Expand All @@ -121,6 +136,13 @@ def testFailingAnchor(self):
print(len(anchor_text))
self.assertLess(len(anchor_text), 4000)

def testEmptyAnchor(self):
    """A target length of 0 should yield only the page-dimensions line."""
    pdf_name = "tobacco_missed_tokens_pg1.pdf"
    local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", pdf_name)

    anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=0)
    expected_header = "Page dimensions: 612.0x792.0"

    # Compare after strip() so surrounding whitespace does not matter.
    self.assertEqual(anchor_text.strip(), expected_header)

class BuildSilverTest(unittest.TestCase):
def testSmallPage(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")
Expand Down

0 comments on commit 999f64d

Please sign in to comment.