diff --git a/README.md b/README.md index fe68204..6974861 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,8 @@ You can also bulk convert many PDFS with a glob pattern: python -m olmocr.pipeline ./localworkspace --pdfs tests/gnarly_pdfs/*.pdf ``` +#### Viewing Results + Once that finishes, output is stored as [Dolma](https://github.com/allenai/dolma)-style JSONL inside of the `./localworkspace/results` directory. ```bash diff --git a/olmocr/viewer/dolmaviewer.py b/olmocr/viewer/dolmaviewer.py index 41e791e..9d4136f 100644 --- a/olmocr/viewer/dolmaviewer.py +++ b/olmocr/viewer/dolmaviewer.py @@ -11,7 +11,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed import markdown2 -from olmocr.s3_utils import get_s3_bytes +from olmocr.s3_utils import get_s3_bytes, parse_s3_path from olmocr.data.renderpdf import render_pdf_to_base64webp def read_jsonl(path): @@ -19,12 +19,6 @@ def read_jsonl(path): for line in f: yield line.strip() -def parse_s3_path(path): - # s3://bucket_name/key_name - path = path[5:] # Remove 's3://' - bucket_name, key_name = path.split('/', 1) - return bucket_name, key_name - def generate_presigned_url(s3_client, bucket_name, key_name): try: response = s3_client.generate_presigned_url('get_object',