[egs] Add Chime 6 baseline system (kaldi-asr#3755)

Bar-BY · Jan 21, 2020 · 9bee308 · 9bee308
1 parent 30ed947
commit 9bee308
Show file tree

Hide file tree

Showing 106 changed files with 6,805 additions and 49 deletions.
diff --git a/.gitignore b/.gitignore
@@ -77,6 +77,8 @@ GSYMS
 /egs/*/*/plp
 /egs/*/*/exp
 /egs/*/*/data
+/egs/*/*/wav
+/egs/*/*/enhan
 
 # /tools/
 /tools/pocolm/

diff --git a/egs/chime5/s5b/local/nnet3/compare_wer.sh b/egs/chime5/s5b/local/nnet3/compare_wer.sh
@@ -130,3 +130,4 @@ done
 echo
 
 echo
+
diff --git a/egs/chime5/s5b/local/nnet3/decode.sh b/egs/chime5/s5b/local/nnet3/decode.sh
@@ -35,6 +35,8 @@ post_decode_acwt=1.0 # important to change this when using chain models
 extra_left_context_initial=0
 extra_right_context_final=0
 
+graph_affix=
+
 score_opts="--min-lmwt 6 --max-lmwt 13"
 
 . ./cmd.sh
@@ -94,7 +96,7 @@ if [ $stage -le 2 ]; then
   fi
 fi
 
-decode_dir=$dir/decode_${data_set}${affix}
+decode_dir=$dir/decode${graph_affix}_${data_set}${affix}
 # generate the lattices
 if [ $stage -le 3 ]; then
   echo "Generating lattices, stage 1"

diff --git a/egs/chime5/s5b/local/run_recog.sh b/egs/chime5/s5b/local/run_recog.sh
@@ -28,8 +28,8 @@ json_dir=${chime5_corpus}/transcriptions
 audio_dir=${chime5_corpus}/audio
 
 # training and test data
-train_set=train_worn_u100k
-test_sets="eval_${enhancement}_ref"
+train_set=train_worn_simu_u400k
+test_sets="eval_${enhancement}_dereverb_ref"
 
 # This script also needs the phonetisaurus g2p, srilm, beamformit
 ./local/check_tools.sh || exit 1
@@ -38,18 +38,27 @@ if [ $stage -le 4 ]; then
   # Beamforming using reference arrays
   # enhanced WAV directory
   enhandir=enhan
+  dereverb_dir=${PWD}/wav/wpe/
   for dset in eval; do
     for mictype in u01 u02 u03 u04 u05 u06; do
-      local/run_beamformit.sh --cmd "$train_cmd" \
+      local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 120G" \
 			      ${audio_dir}/${dset} \
+			      ${dereverb_dir}/${dset} \
+			      ${mictype}
+    done
+  done
+  for dset in dev eval; do
+    for mictype in u01 u02 u03 u04 u05 u06; do
+      local/run_beamformit.sh --cmd "$train_cmd" \
+			      ${dereverb_dir}/${dset} \
 			      ${enhandir}/${dset}_${enhancement}_${mictype} \
 			      ${mictype}
     done
   done
-  
+
   for dset in eval; do
     local/prepare_data.sh --mictype ref "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \
-			  ${json_dir}/${dset} data/${dset}_${enhancement}_ref
+			  ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb_ref
   done
 fi
 
@@ -92,28 +101,13 @@ if [ $stage -le 7 ]; then
   done
 fi
 
-if [ $stage -le 17 ]; then
-  nnet3_affix=_${train_set}_cleaned
-  for datadir in ${test_sets}; do
-    utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
-  done
-  for datadir in ${test_sets}; do
-    steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \
-      --cmd "$train_cmd" data/${datadir}_hires || exit 1;
-    steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1;
-    utils/fix_data_dir.sh data/${datadir}_hires || exit 1;
-  done
-  for data in $test_sets; do
-    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \
-      data/${data}_hires exp/nnet3${nnet3_affix}/extractor \
-      exp/nnet3${nnet3_affix}/ivectors_${data}_hires
-  done
-fi
+nnet3_affix=_${train_set}_cleaned_rvb
+
+lm_suffix=
 
 if [ $stage -le 18 ]; then
   # First the options that are passed through to run_ivector_common.sh
   # (some of which are also used in this script directly).
-  lm_suffix=
 
   # The rest are configs specific to this script.  Most of the parameters
   # are just hardcoded at this level, in the commands below.
@@ -138,16 +132,14 @@ if [ $stage -le 18 ]; then
 
   for data in $test_sets; do
     (
-      steps/nnet3/decode.sh \
-          --acwt 1.0 --post-decode-acwt 10.0 \
-          --extra-left-context $chunk_left_context \
-          --extra-right-context $chunk_right_context \
-          --extra-left-context-initial 0 \
-          --extra-right-context-final 0 \
-          --frames-per-chunk $frames_per_chunk \
-          --nj 8 --cmd "$decode_cmd"  --num-threads 4 \
-          --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
-          $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1
+      local/nnet3/decode.sh --affix 2stage --pass2-decode-opts "--min-active 1000" \
+        --acwt 1.0 --post-decode-acwt 10.0 \
+        --frames-per-chunk 150 --nj $decode_nj \
+        --ivector-dir exp/nnet3${nnet3_affix} \
+        --graph-affix "${lm_suffix}" \
+        data/${data} data/lang${lm_suffix} \
+        $tree_dir/graph${lm_suffix} \
+        exp/chain${nnet3_affix}/tdnn1b_sp
     ) || touch $dir/.error &
   done
   wait
@@ -159,6 +151,6 @@ if [ $stage -le 20 ]; then
   # please specify both dev and eval set directories so that the search parameters
   # (insertion penalty and language model weight) will be tuned using the dev set
   local/score_for_submit.sh \
-      --dev exp/chain_${train_set}_cleaned/tdnn1a_sp/decode_dev_${enhancement}_ref \
-      --eval exp/chain_${train_set}_cleaned/tdnn1a_sp/decode_eval_${enhancement}_ref
+      --dev exp/chain${nnet3_affix}/tdnn1b_sp/decode${lm_suffix}_dev_${enhancement}_dereverb_ref_2stage \
+      --eval exp/chain${nnet3_affix}/tdnn1b_sp/decode${lm_suffix}_eval_${enhancement}_dereverb_ref_2stage
 fi
diff --git a/egs/chime5/s5b/local/run_wpe.sh b/egs/chime5/s5b/local/run_wpe.sh
@@ -33,7 +33,8 @@ set -o pipefail
 
 miniconda_dir=$HOME/miniconda3/
 if [ ! -d $miniconda_dir ]; then
-    echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh' and '../../../tools/extras/install_wpe.sh';"
+    echo "$miniconda_dir does not exist. Please run '$KALDI_ROOT/tools/extras/install_miniconda.sh'."
+    exit 1
 fi
 
 # check if WPE is installed

diff --git a/egs/chime6/README.txt b/egs/chime6/README.txt
@@ -0,0 +1,6 @@
+This is a Kaldi recipe for the 6th CHiME Speech Separation and Recognition Challenge (CHiME-6).
+
+See http://spandh.dcs.shef.ac.uk/chime_challenge/ for more detailed information.
+
+s5_track1 : Track 1 of the challenge (oracle segments and speaker labels are provided)
+s5_track2 : Track 2 of the challenge (only raw audio is provided)
diff --git a/egs/chime6/s5_track1/RESULTS b/egs/chime6/s5_track1/RESULTS
@@ -0,0 +1,21 @@
+
+# tri2
+%WER 88.52 [ 52121 / 58881, 2023 ins, 30285 del, 19813 sub ] exp/tri2/decode_dev_gss/wer_17_0.5
+
+# tri3
+%WER 85.72 [ 50471 / 58881, 3079 ins, 23787 del, 23605 sub ] exp/tri3/decode_dev_gss/wer_17_0.5
+
+# nnet3 tdnn+chain
+%WER 41.21 [ 24267 / 58881, 2428 ins, 7606 del, 14233 sub ] exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_worn_2stage/wer_11_0.0
+%WER 51.76 [ 30474 / 58881, 2665 ins, 11749 del, 16060 sub ] exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_gss_multiarray_2stage/wer_10_0.0
+
+# result with the challenge submission format (Nov 17, 2019)
+# after the fix of speaker ID across arrays
+==== development set ====
+session S02 room DINING: #words 8288, #errors 4459, wer 53.80 %
+session S02 room KITCHEN: #words 12696, #errors 7170, wer 56.47 %
+session S02 room LIVING: #words 15460, #errors 7388, wer 47.78 %
+session S09 room DINING: #words 5766, #errors 3100, wer 53.76 %
+session S09 room KITCHEN: #words 8911, #errors 4483, wer 50.30 %
+session S09 room LIVING: #words 7760, #errors 3874, wer 49.92 %
+overall: #words 58881, #errors 30474, wer 51.75 %
diff --git a/egs/chime6/s5_track1/cmd.sh b/egs/chime6/s5_track1/cmd.sh
@@ -0,0 +1,15 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="retry.pl queue.pl --mem 2G"
+export decode_cmd="queue.pl --mem 4G"
+
diff --git a/egs/chime6/s5_track1/conf/beamformit.cfg b/egs/chime6/s5_track1/conf/beamformit.cfg
@@ -0,0 +1,50 @@
+#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/)
+
+# scrolling size to compute the delays
+scroll_size = 250
+
+# cross correlation computation window size
+window_size = 500
+
+#number of maximum points of the cross-correlation taken into account
+nbest_amount = 4
+
+#flag whether to apply automatic noise thresholding
+do_noise_threshold = 1
+
+#Percentage of frames with lower xcorr taken as noisy
+noise_percent = 10
+
+######## acoustic modelling parameters
+
+#transition probabilities weight for multichannel decoding
+trans_weight_multi = 25
+trans_weight_nbest = 25
+
+###
+
+#flag whether to print the features after setting them, or not
+print_features = 1
+
+#flag whether to use the bad frames in the sum process
+do_avoid_bad_frames = 1
+
+#flag to use the best channel (SNR) as a reference
+#defined from command line
+do_compute_reference = 1
+
+#flag whether to use a uem file or not (process the whole file)
+do_use_uem_file = 0
+
+#flag whether to use an adaptive weights scheme or fixed weights
+do_adapt_weights = 1
+
+#flag whether to output the sph files or just run the system to create the auxiliary files
+do_write_sph_files = 1
+
+####directories where to store/retrieve info####
+#channels_file = ./cfg-files/channels
+
+#show needs to be passed as argument normally, here a default one is given just in case
+#show_id = Ttmp
+
diff --git a/egs/chime6/s5_track1/conf/mfcc.conf b/egs/chime6/s5_track1/conf/mfcc.conf
@@ -0,0 +1,2 @@
+--use-energy=false
+--sample-frequency=16000
diff --git a/egs/chime6/s5_track1/conf/mfcc_hires.conf b/egs/chime6/s5_track1/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+--use-energy=false   # use average of log energy, not energy.
+--sample-frequency=16000 
+--num-mel-bins=40
+--num-ceps=40
+--low-freq=40
+--high-freq=-400
diff --git a/egs/chime6/s5_track1/conf/online_cmvn.conf b/egs/chime6/s5_track1/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
diff --git a/egs/chime6/s5_track1/conf/queue.conf b/egs/chime6/s5_track1/conf/queue.conf
@@ -0,0 +1,10 @@
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0 -q all.q -l hostname='!b19*'
+option gpu=* -l gpu=$0 -q g.q -l hostname='!b19*'
+
diff --git a/egs/chime6/s5_track1/local/add_location_to_uttid.sh b/egs/chime6/s5_track1/local/add_location_to_uttid.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Author: Ashish Arora
+# Apache 2.0
+# Writes <perutt-in-dir>/per_utt_loc from <perutt-in-dir>/per_utt; how it is built depends on --enhancement.
+. ./cmd.sh
+. ./path.sh
+
+enhancement=gss
+. utils/parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+   echo "Wrong #arguments ($#, expected 3)"
+   echo "Usage: local/add_location_to_uttid.sh [options] <json-transcription-in-dir>"
+   echo "                        <perutt-in-dir> <uttid-location-mapping-out-file>"
+   echo "main options (for others, see top of script file)"
+   echo "  --enhancement                    # enhancement type (gss or beamformit)"
+   exit 1;
+fi
+
+jdir=$1
+puttdir=$2
+utt_loc_file=$3
+
+# Strict mode from here on; the script will exit on:
+# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline' (command tracing, -x, is not enabled).
+set -e
+set -u
+set -o pipefail
+# gss: write the get_location.py output to the mapping file, then feed it and per_utt to replace_uttid.py.
+if [[ ${enhancement} == *gss* ]]; then
+  local/get_location.py $jdir > $utt_loc_file
+  local/replace_uttid.py $utt_loc_file $puttdir/per_utt > $puttdir/per_utt_loc
+fi
+# beamformit: per_utt is copied to per_utt_loc unchanged. If neither pattern matches, nothing is written.
+if [[ ${enhancement} == *beamformit* ]]; then
+  cat $puttdir/per_utt > $puttdir/per_utt_loc
+fi
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh