Commit
Trying out non-lora training
jakep-allenai committed Oct 8, 2024
1 parent ec09408 commit fb4e585
Showing 3 changed files with 66 additions and 31 deletions.
48 changes: 18 additions & 30 deletions pdelfin/train/config/qwen2vl-7b.yaml
@@ -28,21 +28,24 @@ generate:
train_data:
  seed: 1337
  sources:
-    - name: openai_batch_data_v2
-      query_glob_path: s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl
-      response_glob_path: s3://ai2-oe-data/jakep/openai_batch_done_v2/*.json
-      backend:
-        - openai
-      size: 100_000
+    # These tend to be really big, so it's only practical to host them as parquets on weka, otherwise you may OOM or just never finish dataloading
+    - name: openai_batch_data_v5_1_train
+      parquet_path: /data/jakep/pdfdata/openai_batch_data_v5_1_parquet/*.parquet
+    - name: openai_batch_data_v5_1_train
+      parquet_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_parquet/*.parquet

valid_data:
  metric_for_best_model: openai_batch_data_v5_1_eval_loss
  sources:
-    - name: openai_batch_data_eval_mini
-      query_glob_path: s3://ai2-oe-data/jakep/openai_batch_data_eval_mini/*.jsonl
-      response_glob_path: s3://ai2-oe-data/jakep/openai_batch_done_eval_mini/*.json
-      backend:
-        - openai
-      size: 100_000
+    # These tend to be small, so you can load from s3 it's no big deal
+    - name: openai_batch_data_v5_1_eval
+      query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl
+      response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
+    - name: openai_batch_data_v5_1_iabooks_eval
+      query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_eval/*.jsonl
+      response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json



# Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
hparams:
@@ -52,30 +52,15 @@
  gradient_checkpointing: false
  clip_grad_norm: 1.0
  learning_rate: 3e-4
-  max_steps: 5000
+  max_steps: 9000
  pad_multiple_of: 16
-  log_every_steps: 50
-  eval_every_steps: 500
+  log_every_steps: 10
+  eval_every_steps: 100
  optim: adamw_torch
  lr_scheduler: cosine
  weight_decay: 0.01
  warmup_ratio: 0.03

-# From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py
-# Disable LORA for now, because we want the visual network to get trained too
-# lora:
-#   rank: 32
-#   alpha: 32
-#   dropout: 0.05
-#   task_type: causal_lm
-#   target_modules:
-#     - q_proj
-#     - k_proj
-#     - v_proj
-#     - o_proj
-#     - gate_proj
-#     - up_proj
-#     - down_proj

save:
  path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/
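The train_data change above swaps streaming JSONL from S3 for pre-converted parquet files on the weka mount, per the comment about dataloading time and memory. A minimal conversion sketch, assuming the batch files are plain JSONL and pandas with pyarrow is installed; the source directory and schema handling are illustrative, not taken from this repo:

import glob
import os

import pandas as pd  # pandas + pyarrow assumed available

# Hypothetical input directory of JSONL shards; the output directory matches the
# parquet glob used in the config above.
src_dir = "/data/jakep/pdfdata/openai_batch_data_v5_1"
dst_dir = "/data/jakep/pdfdata/openai_batch_data_v5_1_parquet"
os.makedirs(dst_dir, exist_ok=True)

for path in sorted(glob.glob(os.path.join(src_dir, "*.jsonl"))):
    # One JSON record per line; deeply nested fields may need flattening before parquet.
    df = pd.read_json(path, lines=True)
    out_name = os.path.basename(path).replace(".jsonl", ".parquet")
    df.to_parquet(os.path.join(dst_dir, out_name), index=False)  # columnar, loads much faster than JSONL
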
2 changes: 1 addition & 1 deletion scripts/qwen2vl-7b-gantry.sh
@@ -44,4 +44,4 @@ gantry run \
--env-secret WANDB_API_KEY=WANDB_API_KEY \
--shared-memory 10GiB \
--yes \
-  -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}"
+  -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --use_fsdp --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --fsdp_offload_params false --fsdp_sharding_strategy FULL_SHARD --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}"
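
The launcher change is what makes full-parameter training feasible here: the old --multi_gpu (DDP) launch keeps a complete copy of the weights, gradients, and AdamW state on every GPU, while --use_fsdp with FULL_SHARD shards all three across the 8 ranks. A back-of-envelope sketch of the difference, with rough assumed numbers (activations and framework overhead ignored):

# Assumed figures only: ~8e9 parameters for a Qwen2-VL-7B-class model, bf16 weights
# and gradients, fp32 AdamW moments. Not measured from this run.
params = 8e9
bytes_bf16, bytes_fp32 = 2, 4

per_gpu_ddp = params * (bytes_bf16          # weight replica
                        + bytes_bf16        # gradients
                        + 2 * bytes_fp32)   # AdamW exp_avg + exp_avg_sq
print(f"DDP, per GPU:        ~{per_gpu_ddp / 1e9:.0f} GB")    # ~96 GB, over an 80 GB card

per_gpu_fsdp = per_gpu_ddp / 8              # FULL_SHARD splits weights, grads, optimizer state
print(f"FSDP FULL_SHARD/GPU: ~{per_gpu_fsdp / 1e9:.0f} GB")   # ~12 GB plus activations
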
47 changes: 47 additions & 0 deletions scripts/qwen2vl-7b-lora-gantry.sh
@@ -0,0 +1,47 @@
#!/usr/bin/env bash

set -ex

# check if jq is installed
if ! command -v jq &> /dev/null
then
    echo "jq could not be found. Please install it."
    exit
fi


EXTRA_ARGS="-c pdelfin/train/config/qwen2vl-7b-lora.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""

run_name=$(basename "$0" .sh)

# --cluster 'ai2/jupiter*' \
# --cluster 'ai2/pluto*' \
# --cluster 'ai2/allennlp-cirrascale' \
# --priority high \

CLUSTER='jupiter'

gantry run \
--description "${run_name}"\
--task-name "${run_name}"\
--allow-dirty \
--host-networking \
--workspace ai2/oe-data-model-based-cleanup \
--beaker-image 'jakep/jakep-pdf-finetunev1.1' \
--venv 'base' \
--pip gantry-requirements.txt \
--priority high \
--gpus 8 \
--preemptible \
--cluster "ai2/${CLUSTER}*" \
--budget ai2/oe-data \
--weka "oe-data-default:/data" \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
--env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=WANDB_API_KEY \
--shared-memory 10GiB \
--yes \
-- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}"
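
This new script keeps the LoRA route available via a separate config, pdelfin/train/config/qwen2vl-7b-lora.yaml, which is not part of this diff. For reference, the lora: block removed from the main config above corresponds roughly to the following peft configuration; this is a sketch of the equivalent settings, not how pdelfin's trainer necessarily constructs it:

from peft import LoraConfig

# Mirrors the removed lora: block: rank/alpha 32, dropout 0.05, causal-LM task,
# adapters only on the attention and MLP projections. Since only these language-model
# projections get adapters, the vision tower stays frozen under LoRA, which is the
# stated reason the main config moved to full fine-tuning.
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)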
