forked from pbelevich/pipeline_experiments
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlaunch.sh
executable file
·44 lines (40 loc) · 1.43 KB
/
launch.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/bin/bash
#
# Submit one BERT.run_pipeline job to Slurm and then show the queue.
#
# Usage: edit the settings below (in particular `model`) and run ./launch.sh
# from the directory that should hold the logs/ folder.
#
# The job script is generated inline through an unquoted here-document, so
# every plain $var inside it is expanded NOW (at submit time), while \$var and
# \$(...) are left for the batch job to evaluate on the compute node.

# Abort on the first failing command (e.g. sbatch rejecting the job), on use of
# an unset variable, and on failures inside pipelines.
set -euo pipefail

chunks=10
ws=40
# Nodes to request: ws / 8 rounded up (the job asks for 8 GPUs per node below).
nodes=$(((ws + 7) / 8))
bs=30

# Model configuration to run. Known names: BERT1.2, GPT1NODE*, GPT175, GPT88,
# BERT88, test. This single setting drives both the job name and the
# architecture flags, so the two can no longer drift apart.
model=GPT1NODE-r

# Architecture flags for the selected model.
case "$model" in
  GPT1NODE*)
    arch_params="--nlayers=36 --emsize=6144 --nhid=24576 --nhead=16 --ep_embedding --ep_head --ep_noop"
    ;;
  GPT175)
    arch_params="--nlayers=96 --emsize=12288 --nhid=49152 --nhead=16 --ep_embedding --ep_head --ep_noop"
    ;;
  GPT88)
    arch_params="--nlayers=61 --emsize=12288 --nhid=49152 --nhead=16 --ep_embedding --ep_head --ep_noop"
    ;;
  BERT1.2)
    arch_params="--nlayers=24 --emsize=2048 --nhid=8192 --nhead=16"
    ;;
  BERT88)
    arch_params="--nlayers=1750 --emsize=2048 --nhid=8192 --nhead=16 --ep_head --ep_noop"
    ;;
  test)
    arch_params="--nlayers=4 --emsize=256 --nhid=1024 --nhead=16"
    ;;
  *)
    printf 'launch.sh: unknown model "%s"\n' "$model" >&2
    exit 1
    ;;
esac

# Flags shared by every model; forwarded verbatim to BERT.run_pipeline.
model_params="--batch_size $bs --num_chunks=$chunks --world_size=$ws --num_workers=$nodes --num_batch=20 --epochs=1"

# Slurm does not create the directory named in --output/--error; make sure it
# exists so the job's stdout/stderr are not lost.
mkdir -p logs

# In the log file names, %x is the job name and %t the task rank (sbatch
# filename patterns). MASTER_ADDR is set inside the job to the last line
# printed by a one-task "srun hostname" (stderr merged in, so anything srun
# prints before the hostname is discarded by tail).
sbatch <<EOT
#!/bin/bash
#SBATCH --job-name=pipeline-$model-bs-$bs-ws-$ws-chunks-$chunks
#SBATCH --output=logs/logs.%x.%t.out
#SBATCH --error=logs/logs.%x.%t.err
#SBATCH --gres gpu:8
#SBATCH --nodes $nodes
#SBATCH --partition=q3
#SBATCH --ntasks-per-node 1
#SBATCH --cpus-per-task 32
#SBATCH --time=1:00:00
export MASTER_ADDR=\$(srun --ntasks=1 hostname 2>&1 | tail -n1)
echo \$MASTER_ADDR
set -x
srun -u python3 -m BERT.run_pipeline $model_params $arch_params
EOT

# Show the queue so the freshly submitted job is visible.
squeue