Skip to content

Commit

Permalink
Ruff
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Jan 29, 2025
1 parent fb40229 commit 5690377
Show file tree
Hide file tree
Showing 25 changed files with 30 additions and 109 deletions.
4 changes: 2 additions & 2 deletions olmocr/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def check_poppler_version():
if result.returncode == 0 and result.stderr.startswith("pdftoppm"):
logger.info("pdftoppm is installed and working.")
else:
logger.error(f"pdftoppm is installed but returned an error.")
logger.error("pdftoppm is installed but returned an error.")
sys.exit(1)
except FileNotFoundError:
logger.error("pdftoppm is not installed.")
Expand All @@ -22,7 +22,7 @@ def check_poppler_version():

def check_sglang_version():
if importlib.util.find_spec("sglang") is None:
logger.error(f"Please make sure sglang is installed according to the latest instructions here: https://docs.sglang.ai/start/install.html")
logger.error("Please make sure sglang is installed according to the latest instructions here: https://docs.sglang.ai/start/install.html")
logger.error("Sglang needs to be installed with a separate command in order to find all dependencies properly.")
sys.exit(1)

Expand Down
4 changes: 1 addition & 3 deletions olmocr/data/buildsilver.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import argparse
import base64
import glob
import json
import os
import random
import subprocess
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import Generator
from urllib.parse import urlparse

Expand Down
2 changes: 0 additions & 2 deletions olmocr/data/convertsilver_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import smart_open
from cached_path import cached_path

from olmocr.prompts import build_finetuning_prompt


def setup_logging():
Expand Down Expand Up @@ -66,7 +65,6 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
local_pdf_path = cached_path(s3_path, quiet=True)

from olmocr.data.buildsilver import build_page_query
from olmocr.prompts.anchor import get_anchor_text

obj = build_page_query(local_pdf_path, s3_path, page)
# raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
Expand Down
1 change: 0 additions & 1 deletion olmocr/data/renderpdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import subprocess

from PIL import Image
from pypdf import PdfReader


def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[float, float]:
Expand Down
2 changes: 0 additions & 2 deletions olmocr/data/runopenaibatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from enum import Enum

from openai import OpenAI
from tqdm import tqdm
Expand Down
2 changes: 1 addition & 1 deletion olmocr/eval/buildelo.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from tqdm import tqdm

from olmocr.eval.evalhtml import create_review_html
from olmocr.s3_utils import expand_s3_glob, get_s3_bytes, parse_s3_path
from olmocr.s3_utils import expand_s3_glob, get_s3_bytes


@dataclasses.dataclass
Expand Down
2 changes: 1 addition & 1 deletion olmocr/eval/runeval.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional, Tuple
from typing import Dict, Optional

import boto3
import zstandard
Expand Down
1 change: 0 additions & 1 deletion olmocr/filter/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from lingua import Language, LanguageDetectorBuilder
from pypdf import PdfReader
from pypdf.errors import DependencyError, PyPdfError

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
Expand Down
1 change: 0 additions & 1 deletion olmocr/metrics.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import asyncio
import time
from collections import defaultdict, deque
from dataclasses import dataclass, field
from typing import Dict


Expand Down
23 changes: 8 additions & 15 deletions olmocr/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
import random
import re
import shutil
import signal
import subprocess
import sys
import tempfile
import time
Expand All @@ -22,7 +20,6 @@
from dataclasses import dataclass
from functools import cache, partial
from io import BytesIO
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urlparse

import boto3
Expand All @@ -44,13 +41,11 @@
from olmocr.prompts import PageResponse, build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
from olmocr.s3_utils import (
download_directory,
download_zstd_csv,
expand_s3_glob,
get_s3_bytes,
get_s3_bytes_with_backoff,
parse_s3_path,
upload_zstd_csv,
)
from olmocr.version import VERSION
from olmocr.work_queue import LocalWorkQueue, S3WorkQueue, WorkQueue
Expand Down Expand Up @@ -245,7 +240,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
if base_response_data["usage"]["total_tokens"] > args.model_max_context:
local_anchor_text_len = max(1, local_anchor_text_len // 2)
logger.info(f"Reducing anchor text len to {local_anchor_text_len} for {pdf_orig_path}-{page_num}")
raise ValueError(f"Response exceeded model_max_context, cannot use this response")
raise ValueError("Response exceeded model_max_context, cannot use this response")

metrics.add_metrics(
sglang_input_tokens=base_response_data["usage"].get("prompt_tokens", 0),
Expand Down Expand Up @@ -627,8 +622,8 @@ async def sglang_server_host(args, semaphore):

if retry >= MAX_RETRIES:
logger.error(f"Ended up starting the sglang server more than {retry} times, cancelling pipeline")
logger.error(f"")
logger.error(f"Please make sure sglang is installed according to the latest instructions here: https://docs.sglang.ai/start/install.html")
logger.error("")
logger.error("Please make sure sglang is installed according to the latest instructions here: https://docs.sglang.ai/start/install.html")
sys.exit(1)


Expand Down Expand Up @@ -668,8 +663,6 @@ def submit_beaker_job(args):
from beaker import (
Beaker,
Constraints,
DataMount,
DataSource,
EnvVar,
ExperimentSpec,
ImageSource,
Expand Down Expand Up @@ -712,7 +705,7 @@ def submit_beaker_job(args):
b.secret.write(f"{owner}-AWS_CREDENTIALS_FILE", open(os.path.join(os.path.expanduser("~"), ".aws", "credentials")).read(), args.beaker_workspace)

try:
b.secret.get(f"OE_DATA_GCS_SA_KEY", args.beaker_workspace)
b.secret.get("OE_DATA_GCS_SA_KEY", args.beaker_workspace)
except SecretNotFound:
print("Input the olmo-gcs SA key if you would like to load weights from gcs (end with a double newline):")
lines = []
Expand All @@ -724,7 +717,7 @@ def submit_beaker_job(args):
lines.append(line)
gcs_sa_key = "\n".join(lines[:-1]).strip() # Remove the last empty line
if gcs_sa_key:
b.secret.write(f"OE_DATA_GCS_SA_KEY", gcs_sa_key, args.beaker_workspace)
b.secret.write("OE_DATA_GCS_SA_KEY", gcs_sa_key, args.beaker_workspace)

# Create the experiment spec
experiment_spec = ExperimentSpec(
Expand All @@ -748,7 +741,7 @@ def submit_beaker_job(args):
EnvVar(name="WEKA_ACCESS_KEY_ID", secret=f"{owner}-WEKA_ACCESS_KEY_ID"),
EnvVar(name="WEKA_SECRET_ACCESS_KEY", secret=f"{owner}-WEKA_SECRET_ACCESS_KEY"),
EnvVar(name="AWS_CREDENTIALS_FILE", secret=f"{owner}-AWS_CREDENTIALS_FILE"),
EnvVar(name="GOOGLE_APPLICATION_CREDENTIALS_FILE", secret=f"OE_DATA_GCS_SA_KEY"),
EnvVar(name="GOOGLE_APPLICATION_CREDENTIALS_FILE", secret="OE_DATA_GCS_SA_KEY"),
],
resources=TaskResources(gpu_count=1),
constraints=Constraints(cluster=args.beaker_cluster if isinstance(args.beaker_cluster, list) else [args.beaker_cluster]),
Expand Down Expand Up @@ -860,12 +853,12 @@ def process_output_file(s3_path):

skipped_paths = original_paths - all_processed_paths

print(f"\nWork Items Status:")
print("\nWork Items Status:")
print(f"Total work items: {total_items:,}")
print(f"Completed items: {completed_items:,}")
print(f"Remaining items: {total_items - completed_items:,}")

print(f"\nResults:")
print("\nResults:")
print(f"Total documents processed: {docs_total:,}")
print(f"Total documents skipped: {len(skipped_paths):,}")
print(f"Total pages on fallback: {fallback_pages_total:,}")
Expand Down
8 changes: 0 additions & 8 deletions olmocr/prompts/_adv_anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,15 @@
Any,
Callable,
Dict,
Iterable,
Iterator,
List,
Optional,
Sequence,
Set,
Tuple,
Union,
cast,
overload,
)

from pypdf._cmap import build_char_map, unknown_char_map
from pypdf.constants import AnnotationDictionaryAttributes as ADA
from pypdf.constants import ImageAttributes as IA
from pypdf.constants import PageAttributes as PG
from pypdf.constants import Resources as RES
from pypdf.generic import (
ContentStream,
DictionaryObject,
Expand Down
1 change: 0 additions & 1 deletion olmocr/prompts/anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# coherency score best of these three
import subprocess
from dataclasses import dataclass
from functools import lru_cache
from typing import List, Literal

import ftfy
Expand Down
4 changes: 1 addition & 3 deletions olmocr/s3_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import logging
import os
import posixpath
import tempfile
import time
from io import BytesIO, TextIOWrapper
from pathlib import Path
Expand All @@ -17,8 +16,7 @@
import zstandard as zstd
from boto3.s3.transfer import TransferConfig
from botocore.config import Config
from botocore.exceptions import ClientError, NoCredentialsError
from google.auth import compute_engine
from botocore.exceptions import ClientError
from google.cloud import storage
from tqdm import tqdm

Expand Down
1 change: 0 additions & 1 deletion olmocr/train/buildparquetdataset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import argparse
import logging
import os
from functools import partial

import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
Expand Down
16 changes: 2 additions & 14 deletions olmocr/train/dataloader.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,20 @@
import base64
import glob
import json
import logging
import os
import re
import tempfile
from functools import partial
from logging import Logger
from typing import Any, Dict, Optional
from typing import Optional

import boto3
import pypdf
import pypdf.errors
from datasets import (
Dataset,
DatasetDict,
Features,
Value,
concatenate_datasets,
load_dataset,
)
from filelock import FileLock

from olmocr.data.renderpdf import get_pdf_media_box_width_height
from olmocr.prompts.anchor import get_anchor_text
from olmocr.s3_utils import get_s3_bytes, parse_custom_id, parse_s3_path
from olmocr.s3_utils import parse_custom_id, parse_s3_path

from .core.config import DataConfig, SourceConfig

# Configure logging
logging.basicConfig(level=logging.INFO)
Expand Down
1 change: 0 additions & 1 deletion olmocr/train/fixqwen2vlcheckpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import concurrent.futures
import json
import os
import tempfile

import boto3
import torch
Expand Down
17 changes: 0 additions & 17 deletions olmocr/train/inference.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,18 @@
import base64
import json
import logging
import os
import time
from functools import partial
from io import BytesIO
from logging import Logger
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional

import accelerate
import torch
import torch.distributed
from PIL import Image
from tqdm import tqdm
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoProcessor,
Qwen2_5_VLForConditionalGeneration,
Qwen2VLForConditionalGeneration,
Trainer,
TrainerCallback,
TrainingArguments,
)

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts.anchor import get_anchor_text
from olmocr.prompts.prompts import (
build_finetuning_prompt,
build_openai_silver_data_prompt,
)

Expand Down
6 changes: 2 additions & 4 deletions olmocr/train/loaddataset.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoProcessor, DataCollatorForSeq2Seq
from transformers import AutoProcessor

from olmocr.train.core.cli import make_cli
from olmocr.train.core.config import TrainConfig

from .utils import TruncatingCollator, make_dataset
from .utils import make_dataset


def main():
Expand Down
3 changes: 1 addition & 2 deletions olmocr/train/molmo/config_molmo.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from typing import List

from transformers import AutoTokenizer, PretrainedConfig
from transformers import PretrainedConfig


class MolmoConfig(PretrainedConfig):
Expand Down
3 changes: 1 addition & 2 deletions olmocr/train/molmo/image_processing_molmo.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Image processor class for Molmo"""

from typing import List, Mapping, Optional, Union
from typing import List, Optional, Union

import einops
import numpy as np
Expand All @@ -13,7 +13,6 @@
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ImageInput,
is_valid_image,
)
from transformers.processing_utils import ImagesKwargs
from transformers.utils import logging
Expand Down
Loading

0 comments on commit 5690377

Please sign in to comment.