Nicer glob handling for pipeline.py
jakep-allenai committed Jan 30, 2025
1 parent 84477b5 commit d4d711d
Showing 2 changed files with 22 additions and 18 deletions.
README.md: 5 additions & 0 deletions
@@ -18,10 +18,15 @@ What is included:

### Installation

Requirements:
- Recent NVIDIA GPU (tested on RTX 4090, L40S, A100, H100)
- 30GB of free disk space

You will need to install poppler-utils and some additional fonts as a prerequisite. olmOCR uses poppler to render its PDF images.

Linux Ubuntu/Debian
```bash
sudo apt-get update
sudo apt-get install poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
```

olmocr/pipeline.py: 17 additions & 18 deletions
```diff
@@ -884,6 +884,7 @@ async def main():
     )
     parser.add_argument(
         "--pdfs",
+        nargs="*",
         help="Path to add pdfs stored in s3 to the workspace, can be a glob path s3://bucket/prefix/*.pdf or path to file containing list of pdf paths",
         default=None,
     )
```
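
The only functional change in this hunk is `nargs="*"`: argparse now collects every value after `--pdfs` into a list instead of a single string, so local globs are expanded by the shell before the pipeline ever sees them. A minimal sketch of that behavior (the argument values below are placeholders, not paths from the repository):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--pdfs", nargs="*", default=None)

# With nargs="*", shell-expanded globs arrive as separate list entries.
args = parser.parse_args(["--pdfs", "a.pdf", "b.pdf", "s3://bucket/prefix/*.pdf"])
print(args.pdfs)  # ['a.pdf', 'b.pdf', 's3://bucket/prefix/*.pdf']
```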
```diff
@@ -956,26 +957,24 @@ async def main():
 
     if args.pdfs:
         logger.info("Got --pdfs argument, going to add to the work queue")
-
-        # Expand s3 paths
-        if args.pdfs.startswith("s3://"):
-            logger.info(f"Expanding s3 glob at {args.pdfs}")
-            pdf_work_paths = expand_s3_glob(pdf_s3, args.pdfs)
-        elif any(char in args.pdfs for char in {"*", "?", "[", "]"}):
-            logger.info(f"Expanding local glob at {args.pdfs}")
-            pdf_work_paths = glob.glob(args.pdfs)
-        elif os.path.exists(args.pdfs):
-            if open(args.pdfs, "rb").read(4) == b"%PDF":
-                logger.info(f"Loading file at {args.pdfs} as PDF document")
-                pdf_work_paths = [args.pdfs]
+        pdf_work_paths = set()
+
+        for pdf_path in args.pdfs:
+            # Expand s3 paths
+            if pdf_path.startswith("s3://"):
+                logger.info(f"Expanding s3 glob at {pdf_path}")
+                pdf_work_paths |= set(expand_s3_glob(pdf_s3, pdf_path))
+            elif os.path.exists(pdf_path):
+                if open(pdf_path, "rb").read(4) == b"%PDF":
+                    logger.info(f"Loading file at {pdf_path} as PDF document")
+                    pdf_work_paths.add(pdf_path)
+                else:
+                    logger.info(f"Loading file at {args.pdfs} as list of paths")
+                    with open(args.pdfs, "r") as f:
+                        pdf_work_paths |= set(filter(None, (line.strip() for line in f)))
             else:
-                logger.info(f"Loading file at {args.pdfs} as list of paths")
-                with open(args.pdfs, "r") as f:
-                    pdf_work_paths = list(filter(None, (line.strip() for line in f)))
-        else:
-            raise ValueError("pdfs argument needs to be either a local path, an s3 path, or a glob pattern...")
+                raise ValueError("pdfs argument needs to be either a local path, an s3 path, or an s3 glob pattern...")
 
-        pdf_work_paths = set(pdf_work_paths)
         logger.info(f"Found {len(pdf_work_paths):,} total pdf paths to add")
 
     # Estimate average pages per pdf
```
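
Inside the new loop, each local path is classified by its first four bytes: a file that starts with the `%PDF` magic number is queued directly as a PDF, while any other existing file is read as a newline-separated list of paths. A small standalone sketch of that check, using hypothetical file names:

```python
def looks_like_pdf(path: str) -> bool:
    """Return True if the file starts with the %PDF magic bytes."""
    with open(path, "rb") as f:
        return f.read(4) == b"%PDF"

# Hypothetical inputs: one real PDF and one plain-text manifest of paths.
for candidate in ["paper.pdf", "pdf_list.txt"]:
    try:
        kind = "PDF document" if looks_like_pdf(candidate) else "list of PDF paths"
        print(f"{candidate}: treated as {kind}")
    except FileNotFoundError:
        print(f"{candidate}: does not exist, skipping")
```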
