Skip to content

Commit

Permalink
Fix a reliability issue
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Nov 18, 2024
1 parent 0af29f1 commit 96984fc
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 3 deletions.
11 changes: 8 additions & 3 deletions pdelfin/beakerpipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,12 @@ async def process_page(args, session: aiohttp.ClientSession, worker_id: int, pdf

try:
async with session.post(COMPLETION_URL, json=query) as response:
response.raise_for_status()
if response.status == 400:
error_text = await response.text()
raise ValueError(f"Got BadRequestError from server: {error_text}, skipping this response")
else:
response.raise_for_status()

base_response_data = await response.json()

if base_response_data["usage"]["total_tokens"] > args.model_max_context:
Expand Down Expand Up @@ -872,9 +877,9 @@ async def main():
# - Refactor the work queue into its own file so it's reusable and generic, and it makes temporary work files (prevent issue where if a work item is done, then it stalls because queue was just emptied)
# X Fix the queue release mechanism so that it just does a timeout, based on zero queue size only, so you don't block things
# - Add logging of failed pages and have the stats function read them
# - Add the page rotation check and mechanism
# X Add the page rotation check and mechanism
# - Sglang commit a fix for the context length issue
# - Get a solid benchmark on the stream vs non stream approach
# - sglang error on s3://ai2-s2-pdfs/73ee/35e7ed5c2fb113ceba652284aaa51db7c2fc.pdf-2
# - Client error on attempt 0 for s3://ai2-s2-pdfs/e13c/9e03ce463ba53bfb15b26dbfd55c0bbc5568.pdf-1: 400, message='Bad Request',
# X Client error on attempt 0 for s3://ai2-s2-pdfs/e13c/9e03ce463ba53bfb15b26dbfd55c0bbc5568.pdf-1: 400, message='Bad Request',
# - Fix loading of the model checkpoints, it's so flakey now, maybe use datasets
Binary file added tests/gnarly_pdfs/map1.pdf
Binary file not shown.
10 changes: 10 additions & 0 deletions tests/test_anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,16 @@ def testEmptyAnchor(self):

self.assertEqual(anchor_text.strip(), "Page dimensions: 612.0x792.0")

# TODO This one still fails
def testExcessiveMapAnchor(self):
    """Check that anchor text for the map1.pdf fixture stays under 4000 characters.

    Requests anchor text for page 1 of ``gnarly_pdfs/map1.pdf`` using the
    ``pdfreport`` engine with ``target_length=6000`` and asserts that the
    returned string is shorter than 4000 characters. The anchor text and
    its length are printed to aid debugging.
    """
    # Fixtures live in a directory next to this test module.
    fixture_dir = os.path.join(os.path.dirname(__file__), "gnarly_pdfs")
    pdf_path = os.path.join(fixture_dir, "map1.pdf")

    anchor = get_anchor_text(pdf_path, 1, pdf_engine="pdfreport", target_length=6000)
    anchor_length = len(anchor)

    # Emit the text and its size so a failing run shows what was produced.
    print(anchor)
    print(anchor_length)
    self.assertLess(anchor_length, 4000)

class BuildSilverTest(unittest.TestCase):
def testSmallPage(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")
Expand Down

0 comments on commit 96984fc

Please sign in to comment.