Skip to content

Commit

Permalink
isort
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Jan 29, 2025
1 parent 0628d31 commit 4a1762d
Show file tree
Hide file tree
Showing 45 changed files with 374 additions and 298 deletions.
6 changes: 3 additions & 3 deletions olmocr/check.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import sys
import subprocess
import logging
import importlib.util
import logging
import subprocess
import sys

logger = logging.getLogger(__name__)

Expand Down
22 changes: 13 additions & 9 deletions olmocr/data/buildsilver.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
import os
import argparse
import base64
import glob
import json
import os
import random
import subprocess
import base64
import argparse
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from typing import Generator
from urllib.parse import urlparse

import boto3
import json
from pypdf import PdfReader
from tqdm import tqdm
from typing import Generator
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from urllib.parse import urlparse

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_openai_silver_data_prompt, openai_response_format_schema
from olmocr.prompts.anchor import get_anchor_text
from olmocr.filter import PdfFilter
from olmocr.prompts import (
build_openai_silver_data_prompt,
openai_response_format_schema,
)
from olmocr.prompts.anchor import get_anchor_text

TARGET_IMAGE_DIM = 2048

Expand Down
10 changes: 6 additions & 4 deletions olmocr/data/buildsilverdatasummary.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import os
import csv
import json
import argparse
import re
import collections
import csv
import json
import os
import random
import re
import sqlite3
from concurrent.futures import ProcessPoolExecutor, as_completed
from urllib.parse import urlparse

from tqdm import tqdm


def parse_pdf_hash(pretty_pdf_path: str) -> str:
pattern = r"s3://ai2-s2-pdfs/([a-f0-9]{4})/([a-f0-9]+)\.pdf-\d+"
match = re.match(pattern, pretty_pdf_path)
Expand Down
13 changes: 7 additions & 6 deletions olmocr/data/buildtestset.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import os
import argparse
import base64
import glob
import os
import random
import argparse
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import List
from urllib.parse import urlparse

import boto3
import base64
from pypdf import PdfReader, PdfWriter
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
from urllib.parse import urlparse
from typing import List

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.filter import PdfFilter
Expand Down
17 changes: 9 additions & 8 deletions olmocr/data/convertsilver_birr.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
import argparse
import json
import logging
import os
import re
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
import sys
import logging
import tempfile
import os
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

import smart_open
import boto3
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
from olmocr.data.renderpdf import render_pdf_to_base64png

# Import Plotly for plotting
import plotly.express as px
import smart_open

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text


def setup_logging():
Expand Down
14 changes: 8 additions & 6 deletions olmocr/data/convertsilver_openai.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import argparse
import json
import logging
import os
import re
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
import sys
import os
import logging
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

import smart_open
from cached_path import cached_path

from olmocr.prompts import build_finetuning_prompt


Expand Down Expand Up @@ -73,8 +74,8 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
# Save the pdf to a temporary cache folder
local_pdf_path = cached_path(s3_path, quiet=True)

from olmocr.prompts.anchor import get_anchor_text
from olmocr.data.buildsilver import build_page_query
from olmocr.prompts.anchor import get_anchor_text
obj = build_page_query(local_pdf_path, s3_path, page)
# raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")

Expand Down Expand Up @@ -142,9 +143,10 @@ def list_input_files(input_dir):
"""
if is_s3_path(input_dir):
# Use smart_open's s3 functionality to list files
import boto3
import fnmatch

import boto3

# Parse bucket and prefix
bucket_name = input_dir.split('s3://')[1].split('/')[0]
path_and_pattern = '/'.join(input_dir.split('s3://')[1].split('/')[1:])
Expand Down
5 changes: 3 additions & 2 deletions olmocr/data/renderpdf.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import subprocess
import base64
import io
from pypdf import PdfReader
import subprocess

from PIL import Image
from pypdf import PdfReader


def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[float, float]:
Expand Down
9 changes: 5 additions & 4 deletions olmocr/data/runopenaibatch.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
# Sends list of batch files to OpenAI for processing
# However, it also waits for the batches to finish, retrieves the result files, saves its state, and
# allows you to submit more than the 100GB file request limit that the OpenAI API has
import argparse
import datetime
import json
import os
import time
import json
import datetime
import argparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from enum import Enum

from openai import OpenAI
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set up OpenAI client (API key should be set in the environment)
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
Expand Down
13 changes: 7 additions & 6 deletions olmocr/eval/buildelo.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
import argparse
import boto3
import dataclasses
import functools
import random
import re
from concurrent.futures import ProcessPoolExecutor, as_completed
import functools

from tqdm import tqdm
from itertools import combinations
from olmocr.s3_utils import parse_s3_path, expand_s3_glob, get_s3_bytes

import boto3
from dolma_refine.evaluate.aligners import HirschbergAligner
from dolma_refine.evaluate.metrics import DocumentEditSimilarity
from dolma_refine.evaluate.segmenters import SpacySegmenter
from dolma_refine.evaluate.aligners import HirschbergAligner
from tqdm import tqdm

from olmocr.eval.evalhtml import create_review_html
from olmocr.s3_utils import expand_s3_glob, get_s3_bytes, parse_s3_path


@dataclasses.dataclass
class Comparison:
Expand Down
8 changes: 5 additions & 3 deletions olmocr/eval/evalhtml.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import os
import random
import tempfile
import boto3
from concurrent.futures import ThreadPoolExecutor
from jinja2 import Template
from urllib.parse import urlparse
from difflib import SequenceMatcher
from urllib.parse import urlparse

import boto3
from jinja2 import Template
from tqdm import tqdm

from olmocr.data.renderpdf import render_pdf_to_base64png

session = boto3.Session(profile_name='s2')
Expand Down
25 changes: 12 additions & 13 deletions olmocr/eval/runeval.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,28 @@
# You might need to pip install git+https://github.com/allenai/refine.git@soldni/eval-m
# in order to use some of the existing aligner scoring that was developed as part
# of the refiner pipeline
import boto3
import os
import json
import argparse
import hashlib
import json
import logging
import os
import random
import zstandard
import sys
import argparse

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from typing import Optional, Tuple, Dict
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from pathlib import Path
from smart_open import smart_open, register_compressor
from typing import Dict, Optional, Tuple

import boto3
import zstandard
from dolma_refine.evaluate.aligners import HirschbergAligner
from dolma_refine.evaluate.metrics import DocumentEditSimilarity
from dolma_refine.evaluate.segmenters import SpacySegmenter
from dolma_refine.evaluate.aligners import HirschbergAligner
from smart_open import register_compressor, smart_open
from tqdm import tqdm

from .evalhtml import create_review_html

import logging

logging.getLogger("pypdf").setLevel(logging.ERROR)


Expand Down
8 changes: 5 additions & 3 deletions olmocr/eval/scoreelo.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import requests
import re
from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode
import csv
import re
from collections import defaultdict
from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

import requests


def fetch_review_page_html(url):
"""
Expand Down
2 changes: 1 addition & 1 deletion olmocr/filter/coherency.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
@lru_cache()
def load_coherency_model(model_name: str = "HuggingFaceTB/SmolLM-135M"):
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
model.eval() # Set the model to evaluation mode

return tokenizer, model
Expand Down
6 changes: 4 additions & 2 deletions olmocr/filter/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,13 @@ def filter_out_pdf(self, local_pdf_path: str) -> bool:

if __name__ == "__main__":
import tempfile
from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait

import boto3
from olmocr.s3_utils import parse_s3_path
from concurrent.futures import ProcessPoolExecutor, wait, FIRST_COMPLETED
from tqdm import tqdm

from olmocr.s3_utils import parse_s3_path

# Quiet logs from pypdf
logging.getLogger("pypdf").setLevel(logging.ERROR)

Expand Down
5 changes: 3 additions & 2 deletions olmocr/metrics.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import time
import asyncio
from collections import deque, defaultdict
import time
from collections import defaultdict, deque
from dataclasses import dataclass, field
from typing import Dict


class MetricsKeeper:
def __init__(self, window=60*5):
"""
Expand Down
Loading

0 comments on commit 4a1762d

Please sign in to comment.