Skip to content

Commit

Permalink
Viewer and gitignore
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Jan 29, 2025
1 parent 86267d8 commit b574766
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 52 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ s2orc_previews_3200/*
sample200_vllm/*
sample200_sglang/*
pdelfin_testset/*
localworkspace/*
/*.html
scoreelo.csv
debug.log
Expand Down
177 changes: 125 additions & 52 deletions olmocr/viewer/dolmaviewer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import argparse
import boto3
import tempfile
import glob
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
from jinja2 import Template
import smart_open
Expand All @@ -14,19 +15,29 @@
from olmocr.s3_utils import get_s3_bytes, parse_s3_path
from olmocr.data.renderpdf import render_pdf_to_base64webp

def read_jsonl(paths):
    """
    Lazily yield stripped lines from one or more JSONL files.

    Supports both local paths and s3:// URLs via smart_open. A file that
    cannot be opened or read is reported and skipped, so one bad input
    does not abort the whole run.

    Args:
        paths: Iterable of file paths (local or s3://).

    Yields:
        str: Each line of each file, stripped of surrounding whitespace.
    """
    for path in paths:
        try:
            with smart_open.smart_open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    yield line.strip()
        except Exception as e:
            # Best-effort: report the failure and continue with the
            # remaining files rather than raising.
            print(f"Error reading {path}: {e}")

def generate_presigned_url(s3_client, bucket_name, key_name):
    """
    Create a time-limited HTTPS URL for an S3 object.

    Args:
        s3_client: A boto3 S3 client.
        bucket_name: Bucket containing the object.
        key_name: Key of the object within the bucket.

    Returns:
        The presigned URL string, or None if AWS credentials are missing
        or incomplete.
    """
    try:
        response = s3_client.generate_presigned_url(
            'get_object',
            Params={'Bucket': bucket_name, 'Key': key_name},
            ExpiresIn=3600 * 24 * 7 - 100  # Just under the 1-week maximum.
        )
        return response
    except (NoCredentialsError, PartialCredentialsError) as e:
        print(f"Error generating presigned URL: {e}")
        return None

def process_document(data, s3_client, template, output_dir):
Expand All @@ -38,24 +49,34 @@ def process_document(data, s3_client, template, output_dir):
source_file = metadata.get('Source-File')

# Generate base64 image of the corresponding PDF page
local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf")
local_pdf.write(get_s3_bytes(s3_client, source_file))
local_pdf.flush()

pages = []
for span in pdf_page_numbers:
start_index, end_index, page_num = span
page_text = text[start_index:end_index]

# Detect and convert Markdown to HTML
page_text = html.escape(page_text, quote=True).replace('&lt;br&gt;', '<br>')
page_text = markdown2.markdown(page_text, extras=["tables"])

base64_image = render_pdf_to_base64webp(local_pdf.name, page_num)

pages.append({'page_num': page_num, 'text': page_text, 'image': base64_image})

local_pdf.close()
local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf", delete=False)
try:
pdf_bytes = get_s3_bytes(s3_client, source_file)
if pdf_bytes is None:
print(f"Failed to retrieve PDF from {source_file}")
return
local_pdf.write(pdf_bytes)
local_pdf.flush()

pages = []
for span in pdf_page_numbers:
start_index, end_index, page_num = span
page_text = text[start_index:end_index]

# Detect and convert Markdown to HTML
page_text = html.escape(page_text, quote=True).replace('&lt;br&gt;', '<br>')
page_text = markdown2.markdown(page_text, extras=["tables"])

base64_image = render_pdf_to_base64webp(local_pdf.name, page_num)

pages.append({'page_num': page_num, 'text': page_text, 'image': base64_image})

except Exception as e:
print(f"Error processing document ID {id_}: {e}")
return
finally:
local_pdf.close()
os.unlink(local_pdf.name)

# Generate pre-signed URL if source_file is an S3 path
s3_link = None
Expand All @@ -64,49 +85,101 @@ def process_document(data, s3_client, template, output_dir):
s3_link = generate_presigned_url(s3_client, bucket_name, key_name)

# Render the HTML using the Jinja template
html_content = template.render(id=id_, pages=pages, s3_link=s3_link)
try:
html_content = template.render(id=id_, pages=pages, s3_link=s3_link)
except Exception as e:
print(f"Error rendering HTML for document ID {id_}: {e}")
return

# Write the HTML content to a file
filename = f'{source_file.replace("s3://", "").replace("/", "_").replace(".", "_")}.html'
filepath = os.path.join(output_dir, filename)
with open(filepath, 'w', encoding='utf-8') as f:
f.write(html_content)

def main(jsonl_path, output_dir, template_path):
try:
safe_source = source_file.replace("s3://", "").replace("/", "_").replace(".", "_") if source_file else f"id_{id_}"
filename = f'{safe_source}.html'
filepath = os.path.join(output_dir, filename)
with open(filepath, 'w', encoding='utf-8') as f:
f.write(html_content)
except Exception as e:
print(f"Error writing HTML file for document ID {id_}: {e}")

def main(jsonl_paths, output_dir, template_path, s3_profile_name):
    """
    Render each document in the given JSONL files to a standalone HTML page.

    Args:
        jsonl_paths: List of JSONL paths (local paths, glob patterns, or s3://).
        output_dir: Directory where the HTML files are written (created if absent).
        template_path: Jinja2 template file, resolved relative to this module.
        s3_profile_name: AWS profile for the presigned-URL client (None = default).
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Expand glob patterns for local paths; s3:// URLs are passed through as-is.
    expanded_paths = []
    for path in jsonl_paths:
        if path.startswith('s3://'):
            expanded_paths.append(path)
        else:
            matched = glob.glob(path)
            if not matched:
                print(f"No files matched the pattern: {path}")
            expanded_paths.extend(matched)

    if not expanded_paths:
        print("No JSONL files to process.")
        return

    # Load the Jinja template (located next to this module).
    try:
        with open(os.path.join(os.path.dirname(__file__), template_path), 'r', encoding='utf-8') as template_file:
            template = Template(template_file.read())
    except Exception as e:
        print(f"Error loading template: {e}")
        return

    # Initialize S3 client for generating presigned URLs.
    try:
        workspace_session = boto3.Session(profile_name=s3_profile_name)
        s3_client = workspace_session.client("s3")
    except Exception as e:
        print(f"Error initializing S3 client: {e}")
        return

    # Fan documents out to a thread pool; the work is I/O-bound (S3 + PDF render).
    with ThreadPoolExecutor() as executor:
        futures = []
        for line in read_jsonl(expanded_paths):
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Invalid JSON line: {e}")
                continue
            futures.append(executor.submit(process_document, data, s3_client, template, output_dir))

        # process_document handles its own errors; this loop only drives the bar.
        for _ in tqdm(as_completed(futures), total=len(futures), desc="Processing documents"):
            pass  # Progress bar updates as each future completes.

    # BUG FIX: previously printed `args.output_dir`, a NameError when main()
    # is called as a library function; use the parameter instead.
    print(f"Output HTML-viewable pages to directory: {output_dir}")

if __name__ == '__main__':
    # CLI entry point: accepts one or more JSONL inputs and renders each
    # document to an HTML preview page with presigned S3 links.
    parser = argparse.ArgumentParser(
        description='Generate HTML pages from one or more JSONL files with pre-signed S3 links.'
    )
    parser.add_argument(
        'jsonl_paths',
        nargs='+',
        help='Path(s) to the JSONL file(s) (local or s3://). Supports glob patterns for local paths.'
    )
    parser.add_argument(
        '--output_dir',
        default='dolma_previews',
        help='Directory to save HTML files'
    )
    parser.add_argument(
        '--template_path',
        default='dolmaviewer_template.html',
        help='Path to the Jinja2 template file'
    )
    parser.add_argument(
        '--s3_profile',
        default=None,
        help='S3 profile to use for accessing the source documents to render them in the viewer.'
    )
    args = parser.parse_args()

    main(args.jsonl_paths, args.output_dir, args.template_path, args.s3_profile)

0 comments on commit b574766

Please sign in to comment.