Skip to content

Commit

Permalink
Appears as if the report method works really well, might need one last step to detect rotated pages
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Oct 2, 2024
1 parent 5703a59 commit 0071cbd
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# ml stuff
wandb/

/*.html

# build artifacts

.eggs/
Expand Down
2 changes: 1 addition & 1 deletion pdelfin/prompts/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# This is the prompt we use for getting chat gpt 4o to convert documents into our silver training data
def build_openai_silver_data_prompt(base_text: str) -> str:
return (
f"Below is the image of one page of a PDF document, as well as some raw textual content that was previously extracted for it. "
f"Below is the image of one page of a PDF document, as well as some raw textual content that was previously extracted for it that includes position information for each image and block of text (The origin [0x0] of the coordinates is in the lower left corner of the image). "
f"Just return the plain text representation of this document as if you were reading it naturally.\n"
f"Turn equations into a LaTeX representation, and tables into markdown format. Remove the headers and footers, but keep references and footnotes.\n"
f"Read any natural handwriting.\n"
Expand Down
2 changes: 1 addition & 1 deletion pdelfin/silver_data/convertsilver_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):

from pdelfin.prompts.anchor import get_anchor_text

raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="topcoherency")
raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")

from pdelfin.prompts import build_openai_silver_data_prompt
obj["body"]["messages"][0]["content"][0]["text"] = build_openai_silver_data_prompt(raw_page_text)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ def testAnchorBase(self):
def testAnchorImage(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")

report = _pdf_report(local_pdf_path, 2)
report = _pdf_report(local_pdf_path, 1)

print(report)

print(get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport"))
print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))

0 comments on commit 0071cbd

Please sign in to comment.