isort

allenai · Jan 29, 2025 · 4a1762d · 4a1762d
1 parent 0628d31
commit 4a1762d
Show file tree

Hide file tree

Showing 45 changed files with 374 additions and 298 deletions.
diff --git a/olmocr/check.py b/olmocr/check.py
@@ -1,7 +1,7 @@
-import sys
-import subprocess
-import logging
 import importlib.util
+import logging
+import subprocess
+import sys
 
 logger = logging.getLogger(__name__)
 

diff --git a/olmocr/data/buildsilver.py b/olmocr/data/buildsilver.py
@@ -1,21 +1,25 @@
-import os
+import argparse
+import base64
 import glob
+import json
+import os
 import random
 import subprocess
-import base64
-import argparse
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
+from typing import Generator
+from urllib.parse import urlparse
+
 import boto3
-import json
 from pypdf import PdfReader
 from tqdm import tqdm
-from typing import Generator
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
-from urllib.parse import urlparse
 
 from olmocr.data.renderpdf import render_pdf_to_base64png
-from olmocr.prompts import build_openai_silver_data_prompt, openai_response_format_schema
-from olmocr.prompts.anchor import get_anchor_text
 from olmocr.filter import PdfFilter
+from olmocr.prompts import (
+    build_openai_silver_data_prompt,
+    openai_response_format_schema,
+)
+from olmocr.prompts.anchor import get_anchor_text
 
 TARGET_IMAGE_DIM = 2048
 

diff --git a/olmocr/data/buildsilverdatasummary.py b/olmocr/data/buildsilverdatasummary.py
@@ -1,15 +1,17 @@
-import os
-import csv
-import json
 import argparse
-import re
 import collections
+import csv
+import json
+import os
 import random
+import re
 import sqlite3
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from urllib.parse import urlparse
+
 from tqdm import tqdm
 
+
 def parse_pdf_hash(pretty_pdf_path: str) -> str:
     pattern = r"s3://ai2-s2-pdfs/([a-f0-9]{4})/([a-f0-9]+)\.pdf-\d+"
     match = re.match(pattern, pretty_pdf_path)

diff --git a/olmocr/data/buildtestset.py b/olmocr/data/buildtestset.py
@@ -1,14 +1,15 @@
-import os
+import argparse
+import base64
 import glob
+import os
 import random
-import argparse
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from typing import List
+from urllib.parse import urlparse
+
 import boto3
-import base64
 from pypdf import PdfReader, PdfWriter
 from tqdm import tqdm
-from concurrent.futures import ProcessPoolExecutor, as_completed
-from urllib.parse import urlparse
-from typing import List
 
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.filter import PdfFilter

diff --git a/olmocr/data/convertsilver_birr.py b/olmocr/data/convertsilver_birr.py
@@ -1,21 +1,22 @@
 import argparse
 import json
+import logging
+import os
 import re
-from pathlib import Path
-from concurrent.futures import ProcessPoolExecutor, as_completed
 import sys
-import logging
 import tempfile
-import os
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from pathlib import Path
 
-import smart_open
 import boto3
-from olmocr.prompts import build_finetuning_prompt
-from olmocr.prompts.anchor import get_anchor_text
-from olmocr.data.renderpdf import render_pdf_to_base64png
 
 # Import Plotly for plotting
 import plotly.express as px
+import smart_open
+
+from olmocr.data.renderpdf import render_pdf_to_base64png
+from olmocr.prompts import build_finetuning_prompt
+from olmocr.prompts.anchor import get_anchor_text
 
 
 def setup_logging():

diff --git a/olmocr/data/convertsilver_openai.py b/olmocr/data/convertsilver_openai.py
@@ -1,14 +1,15 @@
 import argparse
 import json
+import logging
+import os
 import re
-from pathlib import Path
-from concurrent.futures import ProcessPoolExecutor, as_completed
 import sys
-import os
-import logging
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from pathlib import Path
 
 import smart_open
 from cached_path import cached_path
+
 from olmocr.prompts import build_finetuning_prompt
 
 
@@ -73,8 +74,8 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
                         # Save the pdf to a temporary cache folder
                         local_pdf_path = cached_path(s3_path, quiet=True)
 
-                        from olmocr.prompts.anchor import get_anchor_text
                         from olmocr.data.buildsilver import build_page_query
+                        from olmocr.prompts.anchor import get_anchor_text
                         obj = build_page_query(local_pdf_path, s3_path, page)
                         # raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
 
@@ -142,9 +143,10 @@ def list_input_files(input_dir):
     """
     if is_s3_path(input_dir):
         # Use smart_open's s3 functionality to list files
-        import boto3
         import fnmatch
 
+        import boto3
+
         # Parse bucket and prefix
         bucket_name = input_dir.split('s3://')[1].split('/')[0]
         path_and_pattern = '/'.join(input_dir.split('s3://')[1].split('/')[1:])

diff --git a/olmocr/data/renderpdf.py b/olmocr/data/renderpdf.py
@@ -1,8 +1,9 @@
-import subprocess
 import base64
 import io
-from pypdf import PdfReader
+import subprocess
+
 from PIL import Image
+from pypdf import PdfReader
 
 
 def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[float, float]:

diff --git a/olmocr/data/runopenaibatch.py b/olmocr/data/runopenaibatch.py
@@ -1,15 +1,16 @@
 # Sends list of batch files to OpenAI for processing
 # However, it also waits and gets the files when they are done, saves its state, and 
 # allows you to submit more than the 100GB of file request limits that the openaiAPI has
+import argparse
+import datetime
+import json
 import os
 import time
-import json
-import datetime
-import argparse
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from enum import Enum
+
 from openai import OpenAI
 from tqdm import tqdm
-from concurrent.futures import ThreadPoolExecutor, as_completed
 
 # Set up OpenAI client (API key should be set in the environment)
 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

diff --git a/olmocr/eval/buildelo.py b/olmocr/eval/buildelo.py
@@ -1,19 +1,20 @@
 import argparse
-import boto3
 import dataclasses
+import functools
 import random
 import re
 from concurrent.futures import ProcessPoolExecutor, as_completed
-import functools
-
-from tqdm import tqdm
 from itertools import combinations
-from olmocr.s3_utils import parse_s3_path, expand_s3_glob, get_s3_bytes
+
+import boto3
+from dolma_refine.evaluate.aligners import HirschbergAligner
 from dolma_refine.evaluate.metrics import DocumentEditSimilarity
 from dolma_refine.evaluate.segmenters import SpacySegmenter
-from dolma_refine.evaluate.aligners import HirschbergAligner
+from tqdm import tqdm
 
 from olmocr.eval.evalhtml import create_review_html
+from olmocr.s3_utils import expand_s3_glob, get_s3_bytes, parse_s3_path
+
 
 @dataclasses.dataclass
 class Comparison:

diff --git a/olmocr/eval/evalhtml.py b/olmocr/eval/evalhtml.py
@@ -1,12 +1,14 @@
 import os
 import random
 import tempfile
-import boto3
 from concurrent.futures import ThreadPoolExecutor
-from jinja2 import Template
-from urllib.parse import urlparse
 from difflib import SequenceMatcher
+from urllib.parse import urlparse
+
+import boto3
+from jinja2 import Template
 from tqdm import tqdm
+
 from olmocr.data.renderpdf import render_pdf_to_base64png
 
 session = boto3.Session(profile_name='s2')

diff --git a/olmocr/eval/runeval.py b/olmocr/eval/runeval.py
@@ -3,29 +3,28 @@
 # You might need to pip install git+https://github.com/allenai/refine.git@soldni/eval-m
 # in order to use some of the existing aligner scoring that was developed as part
 # of the refiner pipeline
-import boto3
-import os
-import json
+import argparse
 import hashlib
+import json
+import logging
+import os
 import random
-import zstandard
 import sys
-import argparse
-
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
-from typing import Optional, Tuple, Dict
-from tqdm import tqdm
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
 from pathlib import Path
-from smart_open import smart_open, register_compressor
+from typing import Dict, Optional, Tuple
+
+import boto3
+import zstandard
+from dolma_refine.evaluate.aligners import HirschbergAligner
 from dolma_refine.evaluate.metrics import DocumentEditSimilarity
 from dolma_refine.evaluate.segmenters import SpacySegmenter
-from dolma_refine.evaluate.aligners import HirschbergAligner
+from smart_open import register_compressor, smart_open
+from tqdm import tqdm
 
 from .evalhtml import create_review_html
 
-import logging
-
 logging.getLogger("pypdf").setLevel(logging.ERROR)
 
 

diff --git a/olmocr/eval/scoreelo.py b/olmocr/eval/scoreelo.py
@@ -1,8 +1,10 @@
-import requests
-import re
-from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode
 import csv
+import re
 from collections import defaultdict
+from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit
+
+import requests
+
 
 def fetch_review_page_html(url):
     """

diff --git a/olmocr/filter/coherency.py b/olmocr/filter/coherency.py
@@ -7,7 +7,7 @@
 @lru_cache()
 def load_coherency_model(model_name: str = "HuggingFaceTB/SmolLM-135M"):
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16)
+    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
     model.eval()  # Set the model to evaluation mode
 
     return tokenizer, model

diff --git a/olmocr/filter/filter.py b/olmocr/filter/filter.py
@@ -124,11 +124,13 @@ def filter_out_pdf(self, local_pdf_path: str) -> bool:
 
 if __name__ == "__main__":
     import tempfile
+    from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait
+
     import boto3
-    from olmocr.s3_utils import parse_s3_path
-    from concurrent.futures import ProcessPoolExecutor, wait, FIRST_COMPLETED
     from tqdm import tqdm
 
+    from olmocr.s3_utils import parse_s3_path
+
     # Quiet logs from pypdf
     logging.getLogger("pypdf").setLevel(logging.ERROR)
 

diff --git a/olmocr/metrics.py b/olmocr/metrics.py
@@ -1,9 +1,10 @@
-import time
 import asyncio
-from collections import deque, defaultdict
+import time
+from collections import defaultdict, deque
 from dataclasses import dataclass, field
 from typing import Dict
 
+
 class MetricsKeeper:
     def __init__(self, window=60*5):
         """