Runeval is much improved now
jakep-allenai committed Oct 1, 2024
1 parent 8a66ece commit 9d6e2fa
Showing 2 changed files with 98 additions and 82 deletions.
83 changes: 46 additions & 37 deletions pdelfin/eval/evalhtml.py
@@ -1,3 +1,4 @@
+from concurrent.futures import ThreadPoolExecutor, as_completed
from jinja2 import Template
import random
import os
@@ -41,48 +42,56 @@ def render_pdf_to_base64png(s3_path, page):
    return image_base64


+def process_entry(i, entry):
+    # Randomly decide whether to display gold on the left or right
+    if random.choice([True, False]):
+        left_text, right_text = entry["gold_text"], entry["eval_text"]
+        left_alignment, right_alignment = entry["alignment"], entry["alignment"]
+        left_class, right_class = "gold", "eval"
+    else:
+        left_text, right_text = entry["eval_text"], entry["gold_text"]
+        left_alignment, right_alignment = entry["alignment"], entry["alignment"]
+        left_class, right_class = "eval", "gold"
+
+    # Convert newlines to <p> tags for proper formatting
+    left_text = "<p>" + left_text.replace("\n", "</p><p>") + "</p>"
+    right_text = "<p>" + right_text.replace("\n", "</p><p>") + "</p>"
+
+    parsed_url = urlparse(entry["s3_path"])
+    bucket = parsed_url.netloc
+    s3_key = parsed_url.path.lstrip('/')
+    signed_pdf_link = s3_client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": s3_key}, ExpiresIn=604800)
+
+    return {
+        "entry_id": i,
+        "page_image": render_pdf_to_base64png(entry["s3_path"], entry["page"]),
+        "s3_path": entry["s3_path"],
+        "page": entry["page"],
+        "signed_pdf_link": signed_pdf_link,
+        "left_text": left_text,
+        "right_text": right_text,
+        "left_alignment": left_alignment,
+        "right_alignment": right_alignment,
+        "left_class": left_class,
+        "right_class": right_class,
+        "gold_class": "gold" if left_class == "gold" else "eval",
+        "eval_class": "eval" if right_class == "eval" else "gold"
+    }


def create_review_html(data, filename="review_page.html"):
    # Load the Jinja2 template from the file
    with open(os.path.join(os.path.dirname(__file__), "evalhtml_template.html"), "r") as f:
        template = Template(f.read())

    entries = []
-    for i, entry in tqdm(enumerate(data)):
-        # Randomly decide whether to display gold on the left or right
-        if random.choice([True, False]):
-            left_text, right_text = entry["gold_text"], entry["eval_text"]
-            left_alignment, right_alignment = entry["alignment"], entry["alignment"]
-            left_class, right_class = "gold", "eval"
-        else:
-            left_text, right_text = entry["eval_text"], entry["gold_text"]
-            left_alignment, right_alignment = entry["alignment"], entry["alignment"]
-            left_class, right_class = "eval", "gold"
-
-        # Convert newlines to <p> tags for proper formatting
-        left_text = "<p>" + left_text.replace("\n", "</p><p>") + "</p>"
-        right_text = "<p>" + right_text.replace("\n", "</p><p>") + "</p>"
-
-        parsed_url = urlparse(entry["s3_path"])
-        bucket = parsed_url.netloc
-        s3_key = parsed_url.path.lstrip('/')
-        signed_pdf_link = s3_client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": s3_key}, ExpiresIn=604800)
-
-        # Create a dictionary for each entry
-        entries.append({
-            "entry_id": i,
-            "page_image": render_pdf_to_base64png(entry["s3_path"], entry["page"]),
-            "s3_path": entry["s3_path"],
-            "page": entry["page"],
-            "signed_pdf_link": signed_pdf_link,
-            "left_text": left_text,
-            "right_text": right_text,
-            "left_alignment": left_alignment,
-            "right_alignment": right_alignment,
-            "left_class": left_class,
-            "right_class": right_class,
-            "gold_class": "gold" if left_class == "gold" else "eval",
-            "eval_class": "eval" if right_class == "eval" else "gold"
-        })
+    with ThreadPoolExecutor() as executor:
+        # Submit tasks to the executor
+        futures = [executor.submit(process_entry, i, entry) for i, entry in enumerate(data)]
+
+        # Process the results as they are completed
+        for future in tqdm(futures):
+            entries.append(future.result())

    # Render the template with the entries
    final_html = template.render(entries=entries)
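A note on the concurrency pattern above: the new import pulls in both ThreadPoolExecutor and as_completed, but the loop iterates the futures list in submission order, which keeps entry_id order stable in the rendered page; as_completed would instead yield futures as they finish. A minimal, self-contained sketch of the difference (the work function here is a stand-in, not part of the diff):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def work(i):
        return i * i  # stand-in for process_entry

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(work, i) for i in range(5)]

        # Iterating the list preserves submission order (the pattern used above).
        ordered = [f.result() for f in futures]

        # as_completed yields each future as it finishes: results can be
        # consumed sooner, but in completion order, not submission order.
        by_completion = [f.result() for f in as_completed(futures)]

Also worth knowing: ExpiresIn=604800 on the presigned PDF links is 604,800 seconds, i.e. 7 days, the maximum lifetime allowed for SigV4-signed S3 URLs.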
97 changes: 52 additions & 45 deletions pdelfin/eval/runeval.py
@@ -10,6 +10,7 @@
import random
import zstandard
import sys
+import argparse

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
@@ -58,42 +59,24 @@ def load_gold_data(gold_data_path: str) -> dict:

    gold_data = {}

-    # List the contents of the S3 bucket
-    bucket_name, prefix = gold_data_path.replace("s3://", "").split("/", 1)
-    paginator = s3_client.get_paginator('list_objects_v2')
-    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
-
-    for page in pages:
-        for obj in page.get('Contents', []):
-            s3_key = obj['Key']
-            if s3_key.endswith('.json'):
-                local_file_path = os.path.join(CACHE_DIR, os.path.basename(s3_key))
-                etag = obj['ETag'].strip('"') # ETag is the checksum
+    gold_jsonl_files = list_jsonl_files(gold_data_path)
+
+    for path in gold_jsonl_files:
+        # Load the JSON file
+        with smart_open(path, 'r') as f:
+            for line in f:
+                data = json.loads(line)

-                # Check if the file is already cached and verify its checksum
-                if os.path.exists(local_file_path):
-                    cached_file_hash = compute_file_hash(local_file_path)
-                    if cached_file_hash != etag:
-                        raise ValueError(f"File {local_file_path} has changed on S3. Clear the cache in {CACHE_DIR} and reload.")
+                if "custom_id" in data:
+                    # This is for loading gold data that came out of openai's batch API directly
+                    custom_id = data["custom_id"]
+                    text = data["response"]["body"]["choices"][0]["message"]["content"]
                else:
-                    # Download the file from S3 if not cached
-                    download_from_s3(f"s3://{bucket_name}/{s3_key}", local_file_path)
-
-                # Load the JSON file
-                with smart_open(local_file_path, 'r') as f:
-                    for line in f:
-                        data = json.loads(line)
-
-                        if "custom_id" in data:
-                            # This is for loading gold data that came out of openai's batch API directly
-                            custom_id = data["custom_id"]
-                            text = data["response"]["body"]["choices"][0]["message"]["content"]
-                        else:
-                            # This is for loading gold data that went through the mise pdf refine pipeline
-                            custom_id = data["s3_path"] + "-" + str(data["page"])
-                            text = data["outputs"][0]["text"]
-
-                        gold_data[custom_id] = text
+                    # This is for loading gold data that went through the mise pdf refine pipeline
+                    custom_id = data["s3_path"] + "-" + str(data["page"])
+                    text = data["outputs"][0]["text"]
+
+                gold_data[custom_id] = text

    print(f"Loaded {len(gold_data):,} gold data entries for comparison")
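The rewrite above replaces the hand-rolled S3 listing and local caching with smart_open, which reads local paths and s3:// URLs through one interface. The list_jsonl_files helper it calls is not shown in this diff; a plausible sketch of what such a helper could look like (hypothetical, not the committed implementation) is:

    import os
    import boto3

    def list_jsonl_files(path: str) -> list[str]:
        # Hypothetical sketch: collect .json/.jsonl files from a local dir or an s3:// prefix
        if path.startswith("s3://"):
            bucket, _, prefix = path[len("s3://"):].partition("/")
            paginator = boto3.client("s3").get_paginator("list_objects_v2")
            return [
                f"s3://{bucket}/{obj['Key']}"
                for page in paginator.paginate(Bucket=bucket, Prefix=prefix)
                for obj in page.get("Contents", [])
                if obj["Key"].endswith((".json", ".jsonl"))
            ]
        return [
            os.path.join(root, name)
            for root, _, names in os.walk(path)
            for name in names
            if name.endswith((".json", ".jsonl"))
        ]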

@@ -197,7 +180,7 @@ def process_jsonl_file(jsonl_file, gold_data, comparer):

    return total_alignment_score, char_weighted_alignment_score, total_chars, total_pages, page_data

-def do_eval(gold_data_path: str, eval_data_path: str, ) -> tuple[float, list[dict]]:
+def do_eval(gold_data_path: str, eval_data_path: str, review_page_name: str) -> tuple[float, list[dict]]:
    gold_data = load_gold_data(gold_data_path)

    total_alignment_score = 0
@@ -238,28 +221,52 @@ def do_eval(gold_data_path: str, eval_data_path: str, ) -> tuple[float, list[dict]]:
            # if pd["alignment"] > 0.97:
            #     continue

-            if len(pd["gold_text"]) < 200 and len(pd["eval_text"]) < 200:
-                continue
+            # if len(pd["gold_text"]) < 200 and len(pd["eval_text"]) < 200:
+            #     continue

            page_eval_data.append(pd)

-    print(f"Compared {len(total_pages_compared):,} pages")
-    print(f"Total corpus alignment: {total_alignment_score:.2f}")
-    print(f"Mean alignment: {total_alignment_score / total_weight:.3f}")

+    print("...creating review page")

    # Select random entries to return in the page_eval_data
    page_eval_data = random.sample(page_eval_data, 20)
+    create_review_html(page_eval_data, filename=review_page_name + "_sample.html")

    # Select the top 20 lowest alignments
-    # page_eval_data.sort(key=lambda x: x["alignment"])
-    # page_eval_data = page_eval_data[:20]
+    page_eval_data.sort(key=lambda x: x["alignment"])
+    page_eval_data = page_eval_data[:20]

-    # Uncomment this to generate a nice review page to use with tinyhost
-    create_review_html(page_eval_data, filename="review_page.html")
+    create_review_html(page_eval_data, filename=review_page_name + "_worst.html")

+    print(f"Compared {len(total_pages_compared):,} pages")
+    print(f"Total corpus alignment: {total_alignment_score:.2f}")
+    print(f"Mean alignment: {total_alignment_score / total_weight:.3f}")

    return total_alignment_score / total_weight, page_eval_data


if __name__ == "__main__":
-    result = do_eval(gold_data_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v3_eval/",
-                     eval_data_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v3_eval/")
+    parser = argparse.ArgumentParser(
+        description="Transform JSONL files by extracting and renaming specific fields."
+    )
+    parser.add_argument(
+        '--name',
+        default="review_page",
+        help="What name to give to this evaluation/comparison"
+    )
+    parser.add_argument(
+        'gold_data_path',
+        type=str,
+        help='Path to the gold data directory containing JSONL files. Can be a local path or S3 URL. Can be openai "done" data, or birr "done" data'
+    )
+    parser.add_argument(
+        'eval_data_path',
+        type=str,
+        help='Path to the eval data directory containing JSONL files. Can be a local path or S3 URL. Can be openai "done" data, or birr "done" data'
+    )
+
+    args = parser.parse_args()
+
+    result = do_eval(gold_data_path=args.gold_data_path, eval_data_path=args.eval_data_path, review_page_name=args.name)
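One behavioral detail of the committed do_eval: random.sample(page_eval_data, 20) runs before the sort, so the "_worst" page ranks the same 20 randomly sampled entries by alignment rather than picking the 20 lowest-alignment pages overall. If the latter were intended, the two selections would be computed independently, e.g. (hypothetical variant, assuming the surrounding do_eval scope):

    # Hypothetical variant: derive both review pages from the full result set
    sample_entries = random.sample(page_eval_data, 20)
    worst_entries = sorted(page_eval_data, key=lambda x: x["alignment"])[:20]
    create_review_html(sample_entries, filename=review_page_name + "_sample.html")
    create_review_html(worst_entries, filename=review_page_name + "_worst.html")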

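With the new argparse entry point, an invocation might look like the following (bucket paths hypothetical; the module path is assumed from the file's location in the repo):

    python -m pdelfin.eval.runeval --name batch_v3 s3://my-bucket/pdfdata/gold/ s3://my-bucket/pdfdata/eval/

This would write the review pages as batch_v3_sample.html and batch_v3_worst.html, per the review_page_name handling in do_eval.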