Commit
Review page size option, fixing mkdirs in convertsilver script
jakep-allenai committed Oct 2, 2024
1 parent 276465a commit 73fb81e
Showing 3 changed files with 47 additions and 13 deletions.
14 changes: 10 additions & 4 deletions pdelfin/eval/runeval.py
@@ -180,7 +180,7 @@ def process_jsonl_file(jsonl_file, gold_data, comparer):

     return total_alignment_score, char_weighted_alignment_score, total_chars, total_pages, page_data

-def do_eval(gold_data_path: str, eval_data_path: str, review_page_name: str) -> tuple[float, list[dict]]:
+def do_eval(gold_data_path: str, eval_data_path: str, review_page_name: str, review_page_size: int) -> tuple[float, list[dict]]:
     gold_data = load_gold_data(gold_data_path)

     total_alignment_score = 0
@@ -237,10 +237,10 @@ def do_eval(gold_data_path: str, eval_data_path: str, review_page_name: str) ->

     # Select the top 20 lowest alignments
     page_eval_data.sort(key=lambda x: x["alignment"])
-    create_review_html(page_eval_data[:20], filename=review_page_name + "_worst.html")
+    create_review_html(page_eval_data[:review_page_size], filename=review_page_name + "_worst.html")

     # Select random entries to return in the page_eval_data
-    page_eval_data = random.sample(page_eval_data, 20)
+    page_eval_data = random.sample(page_eval_data, review_page_size)
     create_review_html(page_eval_data, filename=review_page_name + "_sample.html")


@@ -256,6 +256,12 @@ def do_eval(gold_data_path: str, eval_data_path: str, review_page_name: str) ->
         default="review_page",
         help="What name to give to this evaluation/comparison"
     )
+    parser.add_argument(
+        '--review_size',
+        default=20,
+        type=int,
+        help="Number of entries to show on the generated review page",
+    )
     parser.add_argument(
         'gold_data_path',
         type=str,
@@ -269,4 +275,4 @@ def do_eval(gold_data_path: str, eval_data_path: str, review_page_name: str) ->

     args = parser.parse_args()

-    result = do_eval(gold_data_path=args.gold_data_path, eval_data_path=args.eval_data_path, review_page_name=args.name)
+    result = do_eval(gold_data_path=args.gold_data_path, eval_data_path=args.eval_data_path, review_page_name=args.name, review_page_size=args.review_size)
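
For reference, a hypothetical invocation of the updated script (the gold/eval paths below are placeholders; only --name and the new --review_size flag come from this diff):

    # Build review pages from the 50 worst-aligned entries plus a 50-entry random sample
    python pdelfin/eval/runeval.py --name review_page --review_size 50 path/to/gold_data path/to/eval_data

Note that random.sample raises a ValueError when review_page_size exceeds the number of collected page entries, so large values only make sense for sufficiently large eval sets.
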
22 changes: 13 additions & 9 deletions pdelfin/prompts/anchor.py
@@ -36,17 +36,21 @@ def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftote
     elif pdf_engine == "pymupdf":
         return _get_pymupdf(local_pdf_path, page)
     elif pdf_engine == "topcoherency":
-        options = [
-            _get_pdftotext(local_pdf_path, page),
-            _get_pymupdf(local_pdf_path, page),
-            _get_pdfium(local_pdf_path, page),
-            _get_pypdf_raw(local_pdf_path, page)
-        ]
+        options = {
+            "pdftotext": _get_pdftotext(local_pdf_path, page),
+            "pymupdf": _get_pymupdf(local_pdf_path, page),
+            "pdfium": _get_pdfium(local_pdf_path, page),
+            "pypdf_raw": _get_pypdf_raw(local_pdf_path, page)
+        }

-        scores = [get_document_coherency(text) for text in options]
+        scores = {label: get_document_coherency(text) for label, text in options.items()}

-        # return option with the best (highest) score (higher is more likley, as these are logprobs)
-        return options[scores.index(max(scores))]
+        best_option_label = max(scores, key=scores.get)
+        best_option = options[best_option_label]
+
+        print(f"topcoherency chosen: {best_option_label}")
+
+        return best_option
     elif pdf_engine == "pdfreport":
         return _linearize_pdf_report(_pdf_report(local_pdf_path, page))
     else:
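
The topcoherency branch now keys each extractor's output by name, scores every candidate with get_document_coherency, and returns the text whose score is highest. A minimal standalone sketch of that selection pattern, with a toy scorer standing in for get_document_coherency (which produces logprob-style scores where higher means more coherent):

    # Sketch of the "pick the most coherent option" pattern used above.
    # `score` stands in for get_document_coherency; the highest score wins.
    def pick_most_coherent(options: dict[str, str], score) -> str:
        scores = {label: score(text) for label, text in options.items()}
        best_label = max(scores, key=scores.get)  # label of the highest-scoring text
        print(f"topcoherency chosen: {best_label}")
        return options[best_label]

    # Toy usage: a scorer that simply prefers longer extractions
    texts = {"pdftotext": "short", "pymupdf": "a much longer extraction"}
    print(pick_most_coherent(texts, len))

One practical benefit of the dict-based rewrite is that the chosen engine can be logged by name, which the old index-based lookup could not do without extra bookkeeping.
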
24 changes: 24 additions & 0 deletions pdelfin/silver_data/convertsilver_openai.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 from concurrent.futures import ProcessPoolExecutor, as_completed
 import sys
+import os
 import logging

 import smart_open
@@ -63,6 +64,25 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):

             if match:
                 raw_page_text = match.group(1).strip()
+
+                # Ok, now we want to try to see if it's better if we recalculate the anchor text
+                goldkey = obj["custom_id"]
+                s3_path = goldkey[:goldkey.rindex("-")]
+                page = int(goldkey[goldkey.rindex("-") + 1:])
+
+                # Save the pdf to a temporary cache folder
+                import os
+                local_pdf_path = "/home/ubuntu/.cache/samplepdfs/" + os.path.basename(s3_path)
+
+                if not os.path.exists(local_pdf_path):
+                    print("Loading pdf", s3_path)
+                    with smart_open.smart_open(s3_path, "rb") as fin, open(local_pdf_path, "wb") as fout:
+                        fout.write(fin.read())
+
+                from pdelfin.prompts.anchor import get_anchor_text
+
+                raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="topcoherency")
+
                 from pdelfin.prompts import build_openai_silver_data_prompt
                 obj["body"]["messages"][0]["content"][0]["text"] = build_openai_silver_data_prompt(raw_page_text)

@@ -74,6 +94,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):

         logging.info(f"Processed '{input_file}': {processed_count} records transformed, {error_count} errors.")
     except Exception as e:
+        logging.exception(e)
         logging.error(f"Failed to process file {input_file}: {e}")


@@ -191,6 +212,9 @@ def main():
     output_dir = args.output_dir.rstrip('/')
     max_jobs = args.jobs

+    if not output_dir.startswith("s3:"):
+        os.makedirs(output_dir, exist_ok=True)
+
     # List input files
     input_files = list_input_files(input_dir)

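
Two small patterns in this file are worth noting. The custom_id of each silver record encodes both the source PDF path and the page number, split on the last "-", and the new mkdirs guard creates the output directory only for local destinations, presumably because s3-style outputs are handled through smart_open and need no local directory tree. A rough standalone sketch of both (the helper names here are illustrative, not part of the script):

    import os

    def split_goldkey(goldkey: str) -> tuple[str, int]:
        # "s3://bucket/doc.pdf-3" -> ("s3://bucket/doc.pdf", 3)
        s3_path = goldkey[:goldkey.rindex("-")]
        page = int(goldkey[goldkey.rindex("-") + 1:])
        return s3_path, page

    def ensure_output_dir(output_dir: str) -> None:
        # Only local paths need a directory created up front; s3:// style
        # destinations are assumed to be written via smart_open with no mkdir.
        if not output_dir.rstrip("/").startswith("s3:"):
            os.makedirs(output_dir.rstrip("/"), exist_ok=True)

    print(split_goldkey("s3://bucket/doc.pdf-3"))  # ("s3://bucket/doc.pdf", 3)
    ensure_output_dir("local_out")                 # creates ./local_out if missing
    ensure_output_dir("s3://bucket/out")           # no-op
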
