Preloading the datasets directly
jakep-allenai committed Oct 10, 2024
1 parent 85f2dc6 commit a8b50ae
Showing 3 changed files with 34 additions and 2 deletions.
32 changes: 32 additions & 0 deletions pdelfin/train/loaddataset.py
@@ -0,0 +1,32 @@
from transformers import AutoProcessor

from pdelfin.train.core.cli import make_cli
from pdelfin.train.core.config import TrainConfig

from .utils import make_dataset


def main():
    # Parses the same TrainConfig as pdelfin.train.train, so both entry
    # points accept identical command-line arguments.
    train_config = make_cli(TrainConfig)  # pyright: ignore

    processor = AutoProcessor.from_pretrained(train_config.model.name_or_path)

    # Building the datasets downloads and preprocesses them, populating the
    # Hugging Face cache before any training process starts.
    train_dataset, valid_dataset = make_dataset(train_config, processor)

    print("Training dataset........")
    print(train_dataset)
    print("\n\n")

    print("Validation dataset........")
    print(valid_dataset)
    print("\n\n")

    print("Datasets loaded into the Hugging Face cache directory")


if __name__ == "__main__":
    main()
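
Because loaddataset.py parses the same TrainConfig as the trainer, it accepts exactly the arguments that pdelfin.train.train does; the scripts below simply pass the same ${EXTRA_ARGS} to both. A minimal standalone invocation (a sketch; EXTRA_ARGS stands in for whatever flags you would give the trainer):

    # Run once, on a single process, to download and preprocess the datasets
    # into the Hugging Face cache. No GPUs are touched by this step.
    python -m pdelfin.train.loaddataset ${EXTRA_ARGS}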
2 changes: 1 addition & 1 deletion scripts/qwen2vl-7b-gantry.sh
@@ -44,4 +44,4 @@ gantry run \
--env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
--shared-memory 10GiB \
--yes \
-- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --use_fsdp --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --fsdp_offload_params false --fsdp_sharding_strategy FULL_SHARD --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}"
-- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m pdelfin.train.loaddataset ${EXTRA_ARGS} && accelerate launch --use_fsdp --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --fsdp_offload_params false --fsdp_sharding_strategy FULL_SHARD --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}"
2 changes: 1 addition & 1 deletion scripts/qwen2vl-7b-lora-gantry.sh
@@ -46,4 +46,4 @@ gantry run \
--env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
--shared-memory 10GiB \
--yes \
-- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}"
-- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m pdelfin.train.loaddataset ${EXTRA_ARGS} && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}"
