diff --git a/examples/aishell/NST/run_nst.sh b/examples/aishell/NST/run_nst.sh
index 6676733d7..9d83fb258 100644
--- a/examples/aishell/NST/run_nst.sh
+++ b/examples/aishell/NST/run_nst.sh
@@ -23,14 +23,16 @@
 # Use this to control how many gpu you use, It's 1-gpu training if you specify
 # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch
 export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
-# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
-# communication. More details can be found in
-# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
-# export NCCL_SOCKET_IFNAME=ens4f1
-export NCCL_DEBUG=INFO
+
 stage=1 # start from 0 if you need to start from data preparation
 stop_stage=8
+# You should change the following two parameters for multiple machine training,
+# see https://pytorch.org/docs/stable/elastic/run.html
+HOST_NODE_ADDR="localhost:0"
+num_nodes=1
+
+
 # here are extra parameters used in NST
 cer_out_dir=""
 dir=""
@@ -61,15 +63,6 @@ cer_hypo_dir="wenet_cer_hypo"
 cer_label_dir="wenet_cer_label"
 pseudo_data_ratio=0.75
-# The num of machines(nodes) for multi-machine training, 1 is for one machine.
-# NFS is required if num_nodes > 1.
-
-num_nodes=1
-
-# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`.
-# You should set the node_rank=0 on the first machine, set the node_rank=1
-# on the second machine, and so on.
-node_rank=0
 dict=data/dict/lang_char.txt
 # data_type can be `raw` or `shard`. Typically, raw is used for small dataset,
@@ -119,9 +112,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
   num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
   # Use "nccl" if it works, otherwise use "gloo"
   dist_backend="gloo"
-  world_size=`expr $num_gpus \* $num_nodes`
-  echo "total gpus is: $world_size"
-
   # the global_cmvn file need to be calculated by combining both supervised/unsupervised datasets,
   # and it should be positioned at data/${train_set}/global_cmvn .
   cmvn_opts=
@@ -132,15 +122,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
   # and output dimension, and $dir/train.yaml will be used for inference
   # and export.
   echo "checkpoint is " ${checkpoint}
-  for ((i = 0; i < $num_gpus; ++i)); do
-  {
-    gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
-    echo "gpu number $i "
-    # Rank of each gpu/process used for knowing whether it is
-    # the master of a worker.
- - rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -149,15 +132,10 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts \ --pin_memory - } & - done - wait fi # In stage 2, we get the averaged final checkpoint and calculate the test and dev accuracy diff --git a/examples/aishell/paraformer/run.sh b/examples/aishell/paraformer/run.sh index b2e397eeb..41b3fd412 100644 --- a/examples/aishell/paraformer/run.sh +++ b/examples/aishell/paraformer/run.sh @@ -6,22 +6,14 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0,1,2,3" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO stage=0 # start from 0 if you need to start from data preparation stop_stage=5 -# The num of machines(nodes) for multi-machine training, 1 is for one machine. -# NFS is required if num_nodes > 1. +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`. -# You should set the node_rank=0 on the first machine, set the node_rank=1 -# on the second machine, and so on. -node_rank=0 # The aishell dataset location, please change this to your own path # make sure of using absolute path. DO-NOT-USE relative path! data=/export/data/asr-data/OpenSLR/33/ @@ -120,8 +112,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" @@ -129,13 +119,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # train.py rewrite $train_config to $dir/train.yaml with model input # and output dimension, and $dir/train.yaml will be used for inference # and export. - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python3 wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python3 wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -143,16 +128,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then --cv_data data/test/data.list \ ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 8 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/aishell/rnnt/run.sh b/examples/aishell/rnnt/run.sh index e9a064024..e15467e1b 100644 --- a/examples/aishell/rnnt/run.sh +++ b/examples/aishell/rnnt/run.sh @@ -12,14 +12,11 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" stage=0 # start from 0 if you need to start from data preparation stop_stage=5 -# The num of machines(nodes) for multi-machine training, 1 is for one machine. -# NFS is required if num_nodes > 1. +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`. -# You should set the node_rank=0 on the first machine, set the node_rank=1 -# on the second machine, and so on. -node_rank=0 # The aishell dataset location, please change this to your own path # make sure of using absolute path. DO-NOT-USE relatvie path! data=/export/data/asr-data/OpenSLR/33/ @@ -112,8 +109,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" @@ -121,13 +116,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # train.py rewrite $train_config to $dir/train.yaml with model input # and output dimension, and $dir/train.yaml will be used for inference # and export. - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -136,15 +126,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/aishell2/rnnt/run.sh b/examples/aishell2/rnnt/run.sh index 8102c1a88..64e4f6b29 100755 --- a/examples/aishell2/rnnt/run.sh +++ b/examples/aishell2/rnnt/run.sh @@ -11,14 +11,10 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" stage=0 # start from 0 if you need to start from data preparation stop_stage=5 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 # modify this to your AISHELL-2 data path # Note: the evaluation data (dev & test) is available at AISHELL. @@ -110,43 +106,26 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - #dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type raw \ - --symbol_table $dict \ - --train_data data/$train_set/data.list \ - --cv_data data/dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 4 \ - $cmvn_opts \ - 2>&1 | tee -a $dir/train.log || exit 1; - } & - done - wait + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ + --config $train_config \ + --data_type raw \ + --symbol_table $dict \ + --train_data data/$train_set/data.list \ + --cv_data data/dev/data.list \ + ${checkpoint:+--checkpoint $checkpoint} \ + --model_dir $dir \ + --ddp.init_method $init_method \ + --ddp.dist_backend $dist_backend \ + --num_workers 4 \ + $cmvn_opts \ + 2>&1 | tee -a $dir/train.log || exit 1; fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/aishell2/s0/run.sh b/examples/aishell2/s0/run.sh index 673ab3781..1ee5ce7b6 100755 --- a/examples/aishell2/s0/run.sh +++ b/examples/aishell2/s0/run.sh @@ -6,21 +6,13 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO stage=0 # start from 0 if you need to start from data preparation stop_stage=6 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 # modify this to your AISHELL-2 data path # Note: the evaluation data (dev & test) is available at AISHELL. @@ -106,41 +98,25 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type raw \ - --symbol_table $dict \ - --train_data data/$train_set/data.list \ - --cv_data data/dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 2 \ - $cmvn_opts - } & - done - wait + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ + --config $train_config \ + --data_type raw \ + --symbol_table $dict \ + --train_data data/$train_set/data.list \ + --cv_data data/dev/data.list \ + ${checkpoint:+--checkpoint $checkpoint} \ + --model_dir $dir \ + --ddp.init_method $init_method \ + --ddp.dist_backend $dist_backend \ + --num_workers 2 \ + $cmvn_opts fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/aishell4/s0/run.sh b/examples/aishell4/s0/run.sh index dc9aef9ab..9d1ebb4e7 100755 --- a/examples/aishell4/s0/run.sh +++ b/examples/aishell4/s0/run.sh @@ -6,21 +6,15 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO + stage=0 # start from 0 if you need to start from data preparation stop_stage=6 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 + num_utts_per_shard=1000 data_url=https://www.openslr.org/resources/111 data_source=/home/work_nfs5_ssd/yhliang/data/aishell4 @@ -108,24 +102,13 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ python wenet/bin/train.py --gpu $gpu_id \ --config $train_config \ --data_type shard \ @@ -135,14 +118,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts - } - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/chime4/s0/run.sh b/examples/chime4/s0/run.sh index f010265fa..7d9f045f2 100644 --- a/examples/chime4/s0/run.sh +++ b/examples/chime4/s0/run.sh @@ -69,7 +69,6 @@ fi if [ $end -ge 4 ] && [ $beg -le 4 ]; then mkdir -p $exp_dir && cp $data_dir/train/global_cmvn $exp_dir python wenet/bin/train.py \ - --gpu 0 \ --config $train_config \ --train_data $data_dir/train/data.list \ --cv_data $data_dir/dev/data.list \ diff --git a/examples/commonvoice/fr/run.sh b/examples/commonvoice/fr/run.sh index 5ca76d430..9c3ff2f9d 100644 --- a/examples/commonvoice/fr/run.sh +++ b/examples/commonvoice/fr/run.sh @@ -6,21 +6,14 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0,1,2" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO stage=0 # start from 0 if you need to start from data download stop_stage=2 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 + # data download_path=/root/autodl-tmp french_data=/root/autodl-tmp/cv-corpus-8.0-2022-01-19 @@ -133,8 +126,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" @@ -142,13 +133,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # train.py rewrite $train_config to $dir/train.yaml with model input # and output dimension, and $dir/train.yaml will be used for inference # and export. - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -158,15 +144,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/csj/s0/run.sh b/examples/csj/s0/run.sh index 39c91e410..a2f006717 100644 --- a/examples/csj/s0/run.sh +++ b/examples/csj/s0/run.sh @@ -19,6 +19,11 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" stage=1 # train -> 50 epochs stop_stage=8 # +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" +num_nodes=1 + # data #data_url=www.openslr.org/resources/12 # TODO use your own data path @@ -189,10 +194,8 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type raw \ --symbol_table $dict \ @@ -201,15 +204,10 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $num_gpus \ - --ddp.rank $i \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts \ --pin_memory - } & - done - wait fi ### test model ### diff --git a/examples/gigaspeech/s0/run.sh b/examples/gigaspeech/s0/run.sh index 4b6be6638..3eab0c482 100644 --- a/examples/gigaspeech/s0/run.sh +++ b/examples/gigaspeech/s0/run.sh @@ -10,14 +10,10 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" stage=0 # start from 0 if you need to start from data preparation stop_stage=5 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 # data # use your own data path, you can contact gigaspeech@speechcolab.orgfor getting data for data information about gigaspeech @@ -142,25 +138,14 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. 
- # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp ${feat_dir}/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type "shard" \ --symbol_table $dict \ @@ -170,14 +155,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 16 \ $cmvn_opts - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/hkust/s0/run.sh b/examples/hkust/s0/run.sh index 612bddeaf..51b48ad98 100755 --- a/examples/hkust/s0/run.sh +++ b/examples/hkust/s0/run.sh @@ -8,18 +8,10 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" stage=4 # start from 0 if you need to start from data preparation stop_stage=4 -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 nj=16 feat_dir=raw_wav @@ -144,25 +136,13 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp ${feat_dir}_${en_modeling_unit}/$train_set/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ python wenet/bin/train.py --gpu $gpu_id \ --config $train_config \ --data_type $data_type \ @@ -173,16 +153,11 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts \ --pin_memory \ --bpe_model ${bpecode} - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/librispeech/rnnt/run.sh b/examples/librispeech/rnnt/run.sh index a7a30cc29..31aab87be 100644 --- a/examples/librispeech/rnnt/run.sh +++ b/examples/librispeech/rnnt/run.sh @@ -9,6 +9,12 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" stage=-1 # start from 0 if you need to start from data preparation stop_stage=7 + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" +num_nodes=1 + # data data_url=www.openslr.org/resources/12 # data_url=https://us.openslr.org/resources/12 @@ -138,10 +144,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - python3 wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python3 wenet/bin/train.py \ --config $train_config \ --data_type raw \ --symbol_table $dict \ @@ -150,16 +154,11 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then --cv_data $wave_data/$dev_set/data.list \ ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $num_gpus \ --ddp.rank $i \ --ddp.dist_backend $dist_backend \ --num_workers 4 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/librispeech/s0/run.sh b/examples/librispeech/s0/run.sh index ede3922c1..ec9393d75 100644 --- a/examples/librispeech/s0/run.sh +++ b/examples/librispeech/s0/run.sh @@ -9,6 +9,13 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" stage=0 # start from 0 if you need to start from data preparation stop_stage=5 + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" +num_nodes=1 + + # data data_url=www.openslr.org/resources/12 # use your own data path @@ -134,10 +141,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type raw \ --symbol_table $dict \ @@ -147,15 +152,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method 
$init_method \ - --ddp.world_size $num_gpus \ - --ddp.rank $i \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/multi_cn/s0/run.sh b/examples/multi_cn/s0/run.sh index 9b1813cec..d4b166bd6 100755 --- a/examples/multi_cn/s0/run.sh +++ b/examples/multi_cn/s0/run.sh @@ -10,18 +10,10 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" stage=0 # start from 0 if you need to start from data preparation stop_stage=6 -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 # data dbase=/ssd/nfs06/di.wu/open_source @@ -250,26 +242,14 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data_${en_modeling_unit}/$train_set/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -278,16 +258,11 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 4 \ ${enable_bpe:+--bpe_model $bpecode} \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then diff --git a/examples/openasr2021/s0/run.sh b/examples/openasr2021/s0/run.sh index 7c338f13e..cd470c75d 100644 --- a/examples/openasr2021/s0/run.sh +++ b/examples/openasr2021/s0/run.sh @@ -9,6 +9,12 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" stage=0 # start from 0 if you need to start from data preparation stop_stage=5 + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" +num_nodes=1 + # data data=data data_url=www.openslr.org/resources/33 @@ -142,10 +148,8 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -157,14 +161,9 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --enc_init_mods $enc_init_mods \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $num_gpus \ - --ddp.rank $i \ --ddp.dist_backend $dist_backend \ --num_workers 6 \ $cmvn_opts - } & - done - wait fi diff --git a/examples/swbd/s0/run.sh b/examples/swbd/s0/run.sh index 9b94c8cc1..78693a7e4 100755 --- a/examples/swbd/s0/run.sh +++ b/examples/swbd/s0/run.sh @@ -6,21 +6,13 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0,1" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO stage=0 # start from 0 if you need to start from data preparation stop_stage=5 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. 
Default 0 -node_rank=0 nj=16 feat_dir=raw_wav @@ -155,25 +147,14 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo ${CUDA_VISIBLE_DEVICES} | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr ${num_gpus} \* ${num_nodes}` - echo "total gpus is: ${world_size}" cmvn_opts= ${cmvn} && cp ${feat_dir}/${train_set}/global_cmvn ${dir} ${cmvn} && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < ${num_gpus}; ++i)); do - { - gpu_id=$(echo ${CUDA_VISIBLE_DEVICES} | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=`expr ${node_rank} \* ${num_gpus} + ${i}` - python wenet/bin/train.py --gpu ${gpu_id} \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config ${train_config} \ --data_type ${data_type} \ --symbol_table ${dict} \ @@ -184,15 +165,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir ${dir} \ --ddp.init_method ${init_method} \ - --ddp.world_size ${world_size} \ - --ddp.rank ${rank} \ --ddp.dist_backend ${dist_backend} \ --num_workers 4 \ ${cmvn_opts} \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/tedlium3/s0/run.sh b/examples/tedlium3/s0/run.sh index 9697e02f1..7b4bc4958 100644 --- a/examples/tedlium3/s0/run.sh +++ b/examples/tedlium3/s0/run.sh @@ -6,22 +6,13 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO stage=0 # start from 0 if you need to start from data preparation stop_stage=5 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" +num_nodes=1 nj=16 feat_dir=raw_wav @@ -126,25 +117,14 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. 
- # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp ${feat_dir}/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -154,15 +134,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 8 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/timit/run.sh b/examples/timit/run.sh index 01a08a8ff..f4927edcf 100644 --- a/examples/timit/run.sh +++ b/examples/timit/run.sh @@ -6,21 +6,9 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO stage=0 # start from 0 if you need to start from data preparation stop_stage=4 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 + # data timit_data=/home/Liangcd/data/timit # path to save preproecssed data @@ -125,8 +113,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" @@ -134,13 +120,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # train.py rewrite $train_config to $dir/train.yaml with model input # and output dimension, and $dir/train.yaml will be used for inference # and export. - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -149,15 +130,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/vkw2021/s0/run.sh b/examples/vkw2021/s0/run.sh index 4c3cf1bcf..36ae9f0d9 100755 --- a/examples/vkw2021/s0/run.sh +++ b/examples/vkw2021/s0/run.sh @@ -10,10 +10,10 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" stage=-1 stop_stage=0 -# The num of nodes +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of current node -node_rank=0 # data data=data @@ -118,26 +118,15 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp ${data}/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=$i ###`expr $node_rank \* $num_gpus + $i` - echo "start training" - python wenet/bin/train.py --gpu $gpu_id \ + echo "start training" + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -146,15 +135,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 4 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/wenetspeech/s0/run.sh b/examples/wenetspeech/s0/run.sh index 52d288375..e4b379612 100755 --- a/examples/wenetspeech/s0/run.sh +++ b/examples/wenetspeech/s0/run.sh @@ -11,10 +11,10 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" stage=0 stop_stage=5 -# The num of nodes +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of current node -node_rank=0 # Use your own data path. You need to download the WenetSpeech dataset by yourself. 
wenetspeech_data_dir=/ssd/nfs07/binbinzhang/wenetspeech @@ -119,21 +119,14 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="nccl" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type "shard" \ --symbol_table $dict \ @@ -142,15 +135,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ $cmvn_opts \ --num_workers 8 \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/wsj/s0/run.sh b/examples/wsj/s0/run.sh index 1b240d718..e4c7a855f 100644 --- a/examples/wsj/s0/run.sh +++ b/examples/wsj/s0/run.sh @@ -6,21 +6,14 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO stage=0 # start from 0 if you need to start from data preparation stop_stage=4 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 + # data WSJ0=/home/lsq/corpus/WSJ/wsj0 WSJ1=/home/lsq/corpus/WSJ/wsj1 @@ -133,8 +126,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" @@ -142,13 +133,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # train.py rewrite $train_config to $dir/train.yaml with model input # and output dimension, and $dir/train.yaml will be used for inference # and export. 
-  for ((i = 0; i < $num_gpus; ++i)); do
-  {
-    gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
-    # Rank of each gpu/process used for knowing whether it is
-    # the master of a worker.
-    rank=`expr $node_rank \* $num_gpus + $i`
-    python wenet/bin/train.py --gpu $gpu_id \
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \
+    python wenet/bin/train.py \
       --config $train_config \
       --data_type $data_type \
       --symbol_table $dict \
@@ -157,16 +143,11 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
       ${checkpoint:+--checkpoint $checkpoint} \
       --model_dir $dir \
       --ddp.init_method $init_method \
-      --ddp.world_size $world_size \
-      --ddp.rank $rank \
       --ddp.dist_backend $dist_backend \
       --num_workers 1 \
       $cmvn_opts \
       --pin_memory \
       --non_lang_syms ${nlsyms}
-  } &
-  done
-  wait
 fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
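
Usage sketch for the new launch flow (illustrative, not part of the patch): every machine runs the same torchrun command, and the rendezvous endpoint replaces the old per-node node_rank / world_size bookkeeping, since torchrun exports RANK, LOCAL_RANK and WORLD_SIZE to each worker itself, which is why the --ddp.world_size and --ddp.rank flags are dropped throughout. The snippet below assumes two machines sharing the recipe directory over NFS, with $train_config, $dict, $train_set and $dir already set as in the aishell2/s0 recipe above; the address 10.0.0.1:29400 and the job id are placeholders, and --rdzv_backend=c10d / --rdzv_id follow the torchrun elastic docs linked in the patch so that no explicit node rank is needed.

# Run this same command on both machines (torchrun launches the script with
# its own Python interpreter, so the script path is given to it directly):
num_nodes=2
num_gpus=8                          # GPUs to use on this machine
HOST_NODE_ADDR="10.0.0.1:29400"     # first machine's address and a free port
torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
    --rdzv_id=wenet_ddp_job --rdzv_backend=c10d --rdzv_endpoint=$HOST_NODE_ADDR \
    wenet/bin/train.py \
      --config $train_config \
      --data_type raw \
      --symbol_table $dict \
      --train_data data/$train_set/data.list \
      --cv_data data/dev/data.list \
      --model_dir $dir \
      --ddp.dist_backend nccl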