diff --git a/README.md b/README.md index 6926d69..e6463d5 100644 --- a/README.md +++ b/README.md @@ -99,12 +99,11 @@ options: ### Batch Inference Usage -If you want run a fine tuned model in order to linearize millions of PDFs, you need to use the [birrpipeline.py](https://github.com/allenai/pdelfin/blob/main/pdelfin/birrpipeline.py) script. +Below are instructions to use mise/birr to convert PDFs: birrpipeline.py will take as input all of your PDFs (stored in S3), and generate the inputs needed to run those through your fine-tuned model. After that, you will use [birr](https://github.com/allenai/mise/tree/main/birr) (part of mise) in order to run those batch inference files efficiently via VLLM. -You should expect somewhere between 1,400 to 1,800 tokens per second per H100 GPU. ``` usage: birrpipeline.py [-h] [--add_pdfs ADD_PDFS] [--target_longest_image_dim TARGET_LONGEST_IMAGE_DIM] [--target_anchor_text_len TARGET_ANCHOR_TEXT_LEN] [--workspace_profile WORKSPACE_PROFILE] diff --git a/pdelfin/beakerpipeline.py b/pdelfin/beakerpipeline.py index b0ad930..e5a251d 100644 --- a/pdelfin/beakerpipeline.py +++ b/pdelfin/beakerpipeline.py @@ -531,6 +531,21 @@ def submit_beaker_job(args): b.secret.write(f"{owner}-WEKA_SECRET_ACCESS_KEY", os.environ.get("WEKA_SECRET_ACCESS_KEY", ""), args.beaker_workspace) b.secret.write(f"{owner}-AWS_CREDENTIALS_FILE", open(os.path.join(os.path.expanduser('~'), '.aws', 'credentials')).read(), args.beaker_workspace) + try: + b.secret.get(f"OE_DATA_GCS_SA_KEY", args.beaker_workspace) + raise SecretNotFound + except SecretNotFound: + print("Input the olmo-gcs SA key if you would like to load weights from gcs (end with a double newline):") + lines = [] + prev_empty = False + for line in iter(input, None): + if not line and prev_empty: + break + prev_empty = not line + lines.append(line) + gcs_sa_key = "\n".join(lines[:-1]).strip() # Remove the last empty line + if gcs_sa_key: + b.secret.write(f"OE_DATA_GCS_SA_KEY", gcs_sa_key, 
args.beaker_workspace) # Create the experiment spec experiment_spec = ExperimentSpec( @@ -554,6 +569,7 @@ def submit_beaker_job(args): EnvVar(name="WEKA_ACCESS_KEY_ID", secret=f"{owner}-WEKA_ACCESS_KEY_ID"), EnvVar(name="WEKA_SECRET_ACCESS_KEY", secret=f"{owner}-WEKA_SECRET_ACCESS_KEY"), EnvVar(name="AWS_CREDENTIALS_FILE", secret=f"{owner}-AWS_CREDENTIALS_FILE"), + EnvVar(name="GOOGLE_APPLICATION_CREDENTIALS", secret=f"OE_DATA_GCS_SA_KEY"), ], resources=TaskResources(gpu_count=1), constraints=Constraints(cluster=args.beaker_cluster if isinstance(args.beaker_cluster, list) else [args.beaker_cluster]), diff --git a/pdelfin/s3_utils.py b/pdelfin/s3_utils.py index d1c71ea..fb66666 100644 --- a/pdelfin/s3_utils.py +++ b/pdelfin/s3_utils.py @@ -180,7 +180,7 @@ def download_directory(model_choices: List[str], local_dir: str): weka_choices = [path for path in model_choices if path.startswith("weka://")] # This is so hacky, but if you are on beaker/pluto, don't use weka - if os.environ.get("BEAKER_NODE_HOSTNAME", "").lower().startswith("pluto"): + if os.environ.get("BEAKER_NODE_HOSTNAME", "").lower().startswith("pluto") or os.environ.get("BEAKER_NODE_HOSTNAME", "").lower().startswith("augusta"): weka_choices = [] other_choices = [path for path in model_choices if not path.startswith("weka://")] diff --git a/pdelfin/version.py b/pdelfin/version.py index d1e204d..8ab463f 100644 --- a/pdelfin/version.py +++ b/pdelfin/version.py @@ -2,7 +2,7 @@ _MINOR = "1" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "26" +_PATCH = "27" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = ""