diff --git a/examples/aishell/NST/run_nst.sh b/examples/aishell/NST/run_nst.sh
index 6676733d7..9d83fb258 100644
--- a/examples/aishell/NST/run_nst.sh
+++ b/examples/aishell/NST/run_nst.sh
@@ -23,14 +23,16 @@
 # Use this to control how many gpu you use, It's 1-gpu training if you specify
 # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch
 export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
-# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
-# communication. More details can be found in
-# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
-# export NCCL_SOCKET_IFNAME=ens4f1
-export NCCL_DEBUG=INFO
+
 stage=1 # start from 0 if you need to start from data preparation
 stop_stage=8
+# You should change the following two parameters for multiple machine training,
+# see https://pytorch.org/docs/stable/elastic/run.html
+HOST_NODE_ADDR="localhost:0"
+num_nodes=1
+
+
 # here are extra parameters used in NST
 cer_out_dir=""
 dir=""
@@ -61,15 +63,6 @@ cer_hypo_dir="wenet_cer_hypo"
 cer_label_dir="wenet_cer_label"
 pseudo_data_ratio=0.75
-# The num of machines(nodes) for multi-machine training, 1 is for one machine.
-# NFS is required if num_nodes > 1.
-
-num_nodes=1
-
-# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`.
-# You should set the node_rank=0 on the first machine, set the node_rank=1
-# on the second machine, and so on.
-node_rank=0
 dict=data/dict/lang_char.txt
 # data_type can be `raw` or `shard`. Typically, raw is used for small dataset,
@@ -119,9 +112,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
   num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
   # Use "nccl" if it works, otherwise use "gloo"
   dist_backend="gloo"
-  world_size=`expr $num_gpus \* $num_nodes`
-  echo "total gpus is: $world_size"
-
   # the global_cmvn file need to be calculated by combining both supervised/unsupervised datasets,
   # and it should be positioned at data/${train_set}/global_cmvn .
   cmvn_opts=
@@ -132,15 +122,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
   # and output dimension, and $dir/train.yaml will be used for inference
   # and export.
   echo "checkpoint is " ${checkpoint}
-  for ((i = 0; i < $num_gpus; ++i)); do
-  {
-    gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
-    echo "gpu number $i "
-    # Rank of each gpu/process used for knowing whether it is
-    # the master of a worker.
- - rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -149,15 +132,10 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts \ --pin_memory - } & - done - wait fi # In stage 2, we get the averaged final checkpoint and calculate the test and dev accuracy diff --git a/examples/aishell/paraformer/run.sh b/examples/aishell/paraformer/run.sh index b2e397eeb..41b3fd412 100644 --- a/examples/aishell/paraformer/run.sh +++ b/examples/aishell/paraformer/run.sh @@ -6,22 +6,14 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0,1,2,3" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO stage=0 # start from 0 if you need to start from data preparation stop_stage=5 -# The num of machines(nodes) for multi-machine training, 1 is for one machine. -# NFS is required if num_nodes > 1. +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`. -# You should set the node_rank=0 on the first machine, set the node_rank=1 -# on the second machine, and so on. -node_rank=0 # The aishell dataset location, please change this to your own path # make sure of using absolute path. DO-NOT-USE relative path! data=/export/data/asr-data/OpenSLR/33/ @@ -120,8 +112,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" @@ -129,13 +119,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # train.py rewrite $train_config to $dir/train.yaml with model input # and output dimension, and $dir/train.yaml will be used for inference # and export. - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python3 wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python3 wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -143,16 +128,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then --cv_data data/test/data.list \ ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 8 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/aishell/rnnt/run.sh b/examples/aishell/rnnt/run.sh index e9a064024..e15467e1b 100644 --- a/examples/aishell/rnnt/run.sh +++ b/examples/aishell/rnnt/run.sh @@ -12,14 +12,11 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" stage=0 # start from 0 if you need to start from data preparation stop_stage=5 -# The num of machines(nodes) for multi-machine training, 1 is for one machine. -# NFS is required if num_nodes > 1. +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`. -# You should set the node_rank=0 on the first machine, set the node_rank=1 -# on the second machine, and so on. -node_rank=0 # The aishell dataset location, please change this to your own path # make sure of using absolute path. DO-NOT-USE relatvie path! data=/export/data/asr-data/OpenSLR/33/ @@ -112,8 +109,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" @@ -121,13 +116,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # train.py rewrite $train_config to $dir/train.yaml with model input # and output dimension, and $dir/train.yaml will be used for inference # and export. - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -136,15 +126,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/aishell2/rnnt/run.sh b/examples/aishell2/rnnt/run.sh index 8102c1a88..64e4f6b29 100755 --- a/examples/aishell2/rnnt/run.sh +++ b/examples/aishell2/rnnt/run.sh @@ -11,14 +11,10 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" stage=0 # start from 0 if you need to start from data preparation stop_stage=5 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 # modify this to your AISHELL-2 data path # Note: the evaluation data (dev & test) is available at AISHELL. @@ -110,43 +106,26 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - #dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type raw \ - --symbol_table $dict \ - --train_data data/$train_set/data.list \ - --cv_data data/dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 4 \ - $cmvn_opts \ - 2>&1 | tee -a $dir/train.log || exit 1; - } & - done - wait + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ + --config $train_config \ + --data_type raw \ + --symbol_table $dict \ + --train_data data/$train_set/data.list \ + --cv_data data/dev/data.list \ + ${checkpoint:+--checkpoint $checkpoint} \ + --model_dir $dir \ + --ddp.init_method $init_method \ + --ddp.dist_backend $dist_backend \ + --num_workers 4 \ + $cmvn_opts \ + 2>&1 | tee -a $dir/train.log || exit 1; fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/aishell2/s0/run.sh b/examples/aishell2/s0/run.sh index 673ab3781..1ee5ce7b6 100755 --- a/examples/aishell2/s0/run.sh +++ b/examples/aishell2/s0/run.sh @@ -6,21 +6,13 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO stage=0 # start from 0 if you need to start from data preparation stop_stage=6 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 # modify this to your AISHELL-2 data path # Note: the evaluation data (dev & test) is available at AISHELL. @@ -106,41 +98,25 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type raw \ - --symbol_table $dict \ - --train_data data/$train_set/data.list \ - --cv_data data/dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 2 \ - $cmvn_opts - } & - done - wait + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ + --config $train_config \ + --data_type raw \ + --symbol_table $dict \ + --train_data data/$train_set/data.list \ + --cv_data data/dev/data.list \ + ${checkpoint:+--checkpoint $checkpoint} \ + --model_dir $dir \ + --ddp.init_method $init_method \ + --ddp.dist_backend $dist_backend \ + --num_workers 2 \ + $cmvn_opts fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/aishell4/s0/run.sh b/examples/aishell4/s0/run.sh index dc9aef9ab..9d1ebb4e7 100755 --- a/examples/aishell4/s0/run.sh +++ b/examples/aishell4/s0/run.sh @@ -6,21 +6,15 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO + stage=0 # start from 0 if you need to start from data preparation stop_stage=6 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 + num_utts_per_shard=1000 data_url=https://www.openslr.org/resources/111 data_source=/home/work_nfs5_ssd/yhliang/data/aishell4 @@ -108,24 +102,13 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ python wenet/bin/train.py --gpu $gpu_id \ --config $train_config \ --data_type shard \ @@ -135,14 +118,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts - } - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/chime4/s0/run.sh b/examples/chime4/s0/run.sh index f010265fa..7d9f045f2 100644 --- a/examples/chime4/s0/run.sh +++ b/examples/chime4/s0/run.sh @@ -69,7 +69,6 @@ fi if [ $end -ge 4 ] && [ $beg -le 4 ]; then mkdir -p $exp_dir && cp $data_dir/train/global_cmvn $exp_dir python wenet/bin/train.py \ - --gpu 0 \ --config $train_config \ --train_data $data_dir/train/data.list \ --cv_data $data_dir/dev/data.list \ diff --git a/examples/commonvoice/fr/run.sh b/examples/commonvoice/fr/run.sh index 5ca76d430..9c3ff2f9d 100644 --- a/examples/commonvoice/fr/run.sh +++ b/examples/commonvoice/fr/run.sh @@ -6,21 +6,14 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0,1,2" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO stage=0 # start from 0 if you need to start from data download stop_stage=2 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 + # data download_path=/root/autodl-tmp french_data=/root/autodl-tmp/cv-corpus-8.0-2022-01-19 @@ -133,8 +126,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" @@ -142,13 +133,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # train.py rewrite $train_config to $dir/train.yaml with model input # and output dimension, and $dir/train.yaml will be used for inference # and export. - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -158,15 +144,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/csj/s0/run.sh b/examples/csj/s0/run.sh index 39c91e410..a2f006717 100644 --- a/examples/csj/s0/run.sh +++ b/examples/csj/s0/run.sh @@ -19,6 +19,11 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" stage=1 # train -> 50 epochs stop_stage=8 # +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" +num_nodes=1 + # data #data_url=www.openslr.org/resources/12 # TODO use your own data path @@ -189,10 +194,8 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type raw \ --symbol_table $dict \ @@ -201,15 +204,10 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $num_gpus \ - --ddp.rank $i \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts \ --pin_memory - } & - done - wait fi ### test model ### diff --git a/examples/gigaspeech/s0/run.sh b/examples/gigaspeech/s0/run.sh index 4b6be6638..3eab0c482 100644 --- a/examples/gigaspeech/s0/run.sh +++ b/examples/gigaspeech/s0/run.sh @@ -10,14 +10,10 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" stage=0 # start from 0 if you need to start from data preparation stop_stage=5 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 # data # use your own data path, you can contact gigaspeech@speechcolab.orgfor getting data for data information about gigaspeech @@ -142,25 +138,14 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. 
- # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp ${feat_dir}/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type "shard" \ --symbol_table $dict \ @@ -170,14 +155,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 16 \ $cmvn_opts - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/hkust/s0/run.sh b/examples/hkust/s0/run.sh index 612bddeaf..51b48ad98 100755 --- a/examples/hkust/s0/run.sh +++ b/examples/hkust/s0/run.sh @@ -8,18 +8,10 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" stage=4 # start from 0 if you need to start from data preparation stop_stage=4 -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 nj=16 feat_dir=raw_wav @@ -144,25 +136,13 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp ${feat_dir}_${en_modeling_unit}/$train_set/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ python wenet/bin/train.py --gpu $gpu_id \ --config $train_config \ --data_type $data_type \ @@ -173,16 +153,11 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts \ --pin_memory \ --bpe_model ${bpecode} - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/librispeech/rnnt/run.sh b/examples/librispeech/rnnt/run.sh index a7a30cc29..31aab87be 100644 --- a/examples/librispeech/rnnt/run.sh +++ b/examples/librispeech/rnnt/run.sh @@ -9,6 +9,12 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" stage=-1 # start from 0 if you need to start from data preparation stop_stage=7 + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" +num_nodes=1 + # data data_url=www.openslr.org/resources/12 # data_url=https://us.openslr.org/resources/12 @@ -138,10 +144,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - python3 wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python3 wenet/bin/train.py \ --config $train_config \ --data_type raw \ --symbol_table $dict \ @@ -150,16 +154,11 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then --cv_data $wave_data/$dev_set/data.list \ ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $num_gpus \ --ddp.rank $i \ --ddp.dist_backend $dist_backend \ --num_workers 4 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/librispeech/s0/run.sh b/examples/librispeech/s0/run.sh index ede3922c1..ec9393d75 100644 --- a/examples/librispeech/s0/run.sh +++ b/examples/librispeech/s0/run.sh @@ -9,6 +9,13 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" stage=0 # start from 0 if you need to start from data preparation stop_stage=5 + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" +num_nodes=1 + + # data data_url=www.openslr.org/resources/12 # use your own data path @@ -134,10 +141,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type raw \ --symbol_table $dict \ @@ -147,15 +152,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method 
$init_method \ - --ddp.world_size $num_gpus \ - --ddp.rank $i \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/multi_cn/s0/run.sh b/examples/multi_cn/s0/run.sh index 9b1813cec..d4b166bd6 100755 --- a/examples/multi_cn/s0/run.sh +++ b/examples/multi_cn/s0/run.sh @@ -10,18 +10,10 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" stage=0 # start from 0 if you need to start from data preparation stop_stage=6 -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 # data dbase=/ssd/nfs06/di.wu/open_source @@ -250,26 +242,14 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data_${en_modeling_unit}/$train_set/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -278,16 +258,11 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 4 \ ${enable_bpe:+--bpe_model $bpecode} \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then diff --git a/examples/openasr2021/s0/run.sh b/examples/openasr2021/s0/run.sh index 7c338f13e..cd470c75d 100644 --- a/examples/openasr2021/s0/run.sh +++ b/examples/openasr2021/s0/run.sh @@ -9,6 +9,12 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" stage=0 # start from 0 if you need to start from data preparation stop_stage=5 + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" +num_nodes=1 + # data data=data data_url=www.openslr.org/resources/33 @@ -142,10 +148,8 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -157,14 +161,9 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --enc_init_mods $enc_init_mods \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $num_gpus \ - --ddp.rank $i \ --ddp.dist_backend $dist_backend \ --num_workers 6 \ $cmvn_opts - } & - done - wait fi diff --git a/examples/swbd/s0/run.sh b/examples/swbd/s0/run.sh index 9b94c8cc1..78693a7e4 100755 --- a/examples/swbd/s0/run.sh +++ b/examples/swbd/s0/run.sh @@ -6,21 +6,13 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0,1" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO stage=0 # start from 0 if you need to start from data preparation stop_stage=5 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. 
Default 0 -node_rank=0 nj=16 feat_dir=raw_wav @@ -155,25 +147,14 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo ${CUDA_VISIBLE_DEVICES} | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr ${num_gpus} \* ${num_nodes}` - echo "total gpus is: ${world_size}" cmvn_opts= ${cmvn} && cp ${feat_dir}/${train_set}/global_cmvn ${dir} ${cmvn} && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < ${num_gpus}; ++i)); do - { - gpu_id=$(echo ${CUDA_VISIBLE_DEVICES} | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=`expr ${node_rank} \* ${num_gpus} + ${i}` - python wenet/bin/train.py --gpu ${gpu_id} \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config ${train_config} \ --data_type ${data_type} \ --symbol_table ${dict} \ @@ -184,15 +165,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir ${dir} \ --ddp.init_method ${init_method} \ - --ddp.world_size ${world_size} \ - --ddp.rank ${rank} \ --ddp.dist_backend ${dist_backend} \ --num_workers 4 \ ${cmvn_opts} \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/tedlium3/s0/run.sh b/examples/tedlium3/s0/run.sh index 9697e02f1..7b4bc4958 100644 --- a/examples/tedlium3/s0/run.sh +++ b/examples/tedlium3/s0/run.sh @@ -6,22 +6,13 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO stage=0 # start from 0 if you need to start from data preparation stop_stage=5 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" +num_nodes=1 nj=16 feat_dir=raw_wav @@ -126,25 +117,14 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. 
- # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp ${feat_dir}/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -154,15 +134,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 8 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/timit/run.sh b/examples/timit/run.sh index 01a08a8ff..f4927edcf 100644 --- a/examples/timit/run.sh +++ b/examples/timit/run.sh @@ -6,21 +6,9 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO stage=0 # start from 0 if you need to start from data preparation stop_stage=4 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 + # data timit_data=/home/Liangcd/data/timit # path to save preproecssed data @@ -125,8 +113,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" @@ -134,13 +120,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # train.py rewrite $train_config to $dir/train.yaml with model input # and output dimension, and $dir/train.yaml will be used for inference # and export. - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -149,15 +130,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/vkw2021/s0/run.sh b/examples/vkw2021/s0/run.sh index 4c3cf1bcf..36ae9f0d9 100755 --- a/examples/vkw2021/s0/run.sh +++ b/examples/vkw2021/s0/run.sh @@ -10,10 +10,10 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" stage=-1 stop_stage=0 -# The num of nodes +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of current node -node_rank=0 # data data=data @@ -118,26 +118,15 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp ${data}/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=$i ###`expr $node_rank \* $num_gpus + $i` - echo "start training" - python wenet/bin/train.py --gpu $gpu_id \ + echo "start training" + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type $data_type \ --symbol_table $dict \ @@ -146,15 +135,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ --num_workers 4 \ $cmvn_opts \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/wenetspeech/s0/run.sh b/examples/wenetspeech/s0/run.sh index 52d288375..e4b379612 100755 --- a/examples/wenetspeech/s0/run.sh +++ b/examples/wenetspeech/s0/run.sh @@ -11,10 +11,10 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" stage=0 stop_stage=5 -# The num of nodes +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of current node -node_rank=0 # Use your own data path. You need to download the WenetSpeech dataset by yourself. 
wenetspeech_data_dir=/ssd/nfs07/binbinzhang/wenetspeech @@ -119,21 +119,14 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="nccl" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py will write $train_config to $dir/train.yaml with model input # and output dimension, train.yaml will be used for inference or model # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \ + python wenet/bin/train.py \ --config $train_config \ --data_type "shard" \ --symbol_table $dict \ @@ -142,15 +135,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ --ddp.dist_backend $dist_backend \ $cmvn_opts \ --num_workers 8 \ --pin_memory - } & - done - wait fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/wsj/s0/run.sh b/examples/wsj/s0/run.sh index 1b240d718..e4c7a855f 100644 --- a/examples/wsj/s0/run.sh +++ b/examples/wsj/s0/run.sh @@ -6,21 +6,14 @@ # Use this to control how many gpu you use, It's 1-gpu training if you specify # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch export CUDA_VISIBLE_DEVICES="0" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO stage=0 # start from 0 if you need to start from data preparation stop_stage=4 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training + +# You should change the following two parameters for multiple machine training, +# see https://pytorch.org/docs/stable/elastic/run.html +HOST_NODE_ADDR="localhost:0" num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 + # data WSJ0=/home/lsq/corpus/WSJ/wsj0 WSJ1=/home/lsq/corpus/WSJ/wsj1 @@ -133,8 +126,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" cmvn_opts= $cmvn && cp data/${train_set}/global_cmvn $dir $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" @@ -142,13 +133,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # train.py rewrite $train_config to $dir/train.yaml with model input # and output dimension, and $dir/train.yaml will be used for inference # and export. 
-  for ((i = 0; i < $num_gpus; ++i)); do
-  {
-    gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
-    # Rank of each gpu/process used for knowing whether it is
-    # the master of a worker.
-    rank=`expr $node_rank \* $num_gpus + $i`
-    python wenet/bin/train.py --gpu $gpu_id \
+  torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \
+    python wenet/bin/train.py \
       --config $train_config \
       --data_type $data_type \
       --symbol_table $dict \
@@ -157,16 +143,11 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
       ${checkpoint:+--checkpoint $checkpoint} \
       --model_dir $dir \
       --ddp.init_method $init_method \
-      --ddp.world_size $world_size \
-      --ddp.rank $rank \
       --ddp.dist_backend $dist_backend \
       --num_workers 1 \
       $cmvn_opts \
       --pin_memory \
       --non_lang_syms ${nlsyms}
-  } &
-  done
-  wait
 fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
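
Usage sketch for the new launch flow (illustrative, not part of the patch): every machine runs the same torchrun command, and the rendezvous endpoint replaces the old per-node node_rank / world_size bookkeeping, since torchrun exports RANK, LOCAL_RANK and WORLD_SIZE to each worker itself, which is why the --ddp.world_size and --ddp.rank flags are dropped throughout. The snippet below assumes two machines sharing the recipe directory over NFS, with $train_config, $dict, $train_set and $dir already set as in the aishell2/s0 recipe above; the address 10.0.0.1:29400 and the job id are placeholders, and --rdzv_backend=c10d / --rdzv_id follow the torchrun elastic docs linked in the patch so that no explicit node rank is needed.

# Run this same command on both machines (torchrun launches the script with
# its own Python interpreter, so the script path is given to it directly):
num_nodes=2
num_gpus=8                          # GPUs to use on this machine
HOST_NODE_ADDR="10.0.0.1:29400"     # first machine's address and a free port
torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
    --rdzv_id=wenet_ddp_job --rdzv_backend=c10d --rdzv_endpoint=$HOST_NODE_ADDR \
    wenet/bin/train.py \
      --config $train_config \
      --data_type raw \
      --symbol_table $dict \
      --train_data data/$train_set/data.list \
      --cv_data data/dev/data.list \
      --model_dir $dir \
      --ddp.dist_backend nccl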