diff --git a/README.md b/README.md index 002642c..2221a80 100644 --- a/README.md +++ b/README.md @@ -69,10 +69,12 @@ cat localworkspace/results/output_*.jsonl You can view your documents side-by-side with the original PDF renders using the `dolmaviewer` command. -```python - +```bash +python -m olmocr.viewer.dolmaviewer localworkspace/results/output_*.jsonl ``` +Now open `./dolma_previews/tests_gnarly_pdfs_horribleocr_pdf.html` in your favorite browser. + ### Multi-node / Cluster Usage diff --git a/olmocr/viewer/dolmaviewer.py b/olmocr/viewer/dolmaviewer.py index 96c1905..41e791e 100644 --- a/olmocr/viewer/dolmaviewer.py +++ b/olmocr/viewer/dolmaviewer.py @@ -44,12 +44,9 @@ def process_document(data, s3_client, template, output_dir): source_file = metadata.get('Source-File') # Generate base64 image of the corresponding PDF page - if source_file and source_file.startswith('s3://'): - local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf") - local_pdf.write(get_s3_bytes(s3_client, source_file)) - local_pdf.flush() - else: - raise ValueError("Expecting s3 files only") + local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf") + local_pdf.write(get_s3_bytes(s3_client, source_file)) + local_pdf.flush() pages = [] for span in pdf_page_numbers: