Skip to content

Commit

Permalink
Gcs support better
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Nov 18, 2024
1 parent 9381bf8 commit 8c3b575
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 4 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,11 @@ options:

### Batch Inference Usage

If you want to run a fine-tuned model in order to linearize millions of PDFs, you need to use the [birrpipeline.py](https://github.com/allenai/pdelfin/blob/main/pdelfin/birrpipeline.py) script.
Below are instructions on how to use mise/birr to convert PDFs:

birrpipeline.py will take as input all of your PDFs (stored in S3), and generate the inputs needed to run those through your fine-tuned model.
After that, you will use [birr](https://github.com/allenai/mise/tree/main/birr) (part of mise) in order to run those batch inference files efficiently via VLLM.

You should expect somewhere between 1,400 to 1,800 tokens per second per H100 GPU.

```
usage: birrpipeline.py [-h] [--add_pdfs ADD_PDFS] [--target_longest_image_dim TARGET_LONGEST_IMAGE_DIM] [--target_anchor_text_len TARGET_ANCHOR_TEXT_LEN] [--workspace_profile WORKSPACE_PROFILE]
Expand Down
16 changes: 16 additions & 0 deletions pdelfin/beakerpipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,21 @@ def submit_beaker_job(args):
b.secret.write(f"{owner}-WEKA_SECRET_ACCESS_KEY", os.environ.get("WEKA_SECRET_ACCESS_KEY", ""), args.beaker_workspace)
b.secret.write(f"{owner}-AWS_CREDENTIALS_FILE", open(os.path.join(os.path.expanduser('~'), '.aws', 'credentials')).read(), args.beaker_workspace)

try:
b.secret.get(f"OE_DATA_GCS_SA_KEY", args.beaker_workspace)
raise SecretNotFound
except SecretNotFound:
print("Input the olmo-gcs SA key if you would like to load weights from gcs (end with a double newline):")
lines = []
prev_empty = False
for line in iter(input, None):
if not line and prev_empty:
break
prev_empty = not line
lines.append(line)
gcs_sa_key = "\n".join(lines[:-1]).strip() # Remove the last empty line
if gcs_sa_key:
b.secret.write(f"OE_DATA_GCS_SA_KEY", gcs_sa_key, args.beaker_workspace)

# Create the experiment spec
experiment_spec = ExperimentSpec(
Expand All @@ -554,6 +569,7 @@ def submit_beaker_job(args):
EnvVar(name="WEKA_ACCESS_KEY_ID", secret=f"{owner}-WEKA_ACCESS_KEY_ID"),
EnvVar(name="WEKA_SECRET_ACCESS_KEY", secret=f"{owner}-WEKA_SECRET_ACCESS_KEY"),
EnvVar(name="AWS_CREDENTIALS_FILE", secret=f"{owner}-AWS_CREDENTIALS_FILE"),
EnvVar(name="GOOGLE_APPLICATION_CREDENTIALS", secret=f"OE_DATA_GCS_SA_KEY"),
],
resources=TaskResources(gpu_count=1),
constraints=Constraints(cluster=args.beaker_cluster if isinstance(args.beaker_cluster, list) else [args.beaker_cluster]),
Expand Down
2 changes: 1 addition & 1 deletion pdelfin/s3_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def download_directory(model_choices: List[str], local_dir: str):
weka_choices = [path for path in model_choices if path.startswith("weka://")]

# This is so hacky, but if you are on beaker/pluto, don't use weka
if os.environ.get("BEAKER_NODE_HOSTNAME", "").lower().startswith("pluto"):
if os.environ.get("BEAKER_NODE_HOSTNAME", "").lower().startswith("pluto") or os.environ.get("BEAKER_NODE_HOSTNAME", "").lower().startswith("augusta"):
weka_choices = []

other_choices = [path for path in model_choices if not path.startswith("weka://")]
Expand Down
2 changes: 1 addition & 1 deletion pdelfin/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
_MINOR = "1"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "26"
_PATCH = "27"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""
Expand Down

0 comments on commit 8c3b575

Please sign in to comment.