forked from kaldi-asr/kaldi
Merge pull request kaldi-asr#4 from jsalt2020-asrdiar/libricss
RNNLM rescoring and multichannel recipe
Showing 27 changed files with 1,124 additions and 21 deletions.
@@ -0,0 +1,14 @@
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="retry.pl queue.pl --mem 2G"
export decode_cmd="queue.pl --mem 4G"
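For a machine without a queueing system, a minimal local-machine variant of these two exports might look like the sketch below. Note that run.pl ignores resource options such as --mem, so parallelism is capped with --max-jobs-run instead; the limit of 10 is only an illustration, not a recommendation.

export train_cmd="run.pl --max-jobs-run 10"
export decode_cmd="run.pl --max-jobs-run 10"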
@@ -0,0 +1,50 @@
# BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/)

# scrolling size to compute the delays
scroll_size = 250

# cross-correlation computation window size
window_size = 500

# maximum number of cross-correlation peaks taken into account
nbest_amount = 4

# flag whether to apply an automatic noise thresholding
do_noise_threshold = 1

# percentage of frames with the lowest xcorr that are treated as noisy
noise_percent = 10

######## acoustic modelling parameters

# transition probabilities weight for multichannel decoding
trans_weight_multi = 25
trans_weight_nbest = 25

###

# flag whether to print the features after setting them, or not
print_features = 1

# flag whether to use the bad frames in the sum process
do_avoid_bad_frames = 1

# flag to use the best channel (SNR) as a reference
# defined from command line
do_compute_reference = 1

# flag whether to use a UEM file or not (otherwise the whole file is processed)
do_use_uem_file = 0

# flag whether to use an adaptive weights scheme or fixed weights
do_adapt_weights = 1

# flag whether to output the sph files, or just run the system to create the auxiliary files
do_write_sph_files = 1

#### directories where to store/retrieve info ####
#channels_file = ./cfg-files/channels

# the show normally needs to be passed as an argument; a default is given here just in case
#show_id = Ttmp
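The commented-out channels_file above points to a file listing the input channels per show. As a rough sketch, assuming the conventional one-show-per-line format (the show id and wav names below are hypothetical, not taken from the actual data):

# channels file: <show_id> <channel1.wav> <channel2.wav> ...
session0 session0_CH1.wav session0_CH2.wav session0_CH3.wav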
@@ -0,0 +1,2 @@
--use-energy=false
--sample-frequency=16000
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated), which is why
# we prefer this method.
--use-energy=false       # use average of log energy, not energy.
--sample-frequency=16000
--num-mel-bins=40
--num-ceps=40            # keep all cepstra (same dimension as the mel bins)
--low-freq=40            # low cutoff frequency for mel bins
--high-freq=-400         # high cutoff, 400 Hz below the Nyquist frequency (7600 Hz at 16 kHz)
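These features are extracted with the standard Kaldi feature scripts pointed at this config. A minimal sketch, assuming a data directory name of the form used by this recipe (the exact name is illustrative):

steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \
  --cmd "$train_cmd" data/dev_dereverb_beamformit_hires
steps/compute_cmvn_stats.sh data/dev_dereverb_beamformit_hires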
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
@@ -0,0 +1 @@
../../callhome_diarization/v1/diarization
@@ -0,0 +1 @@
../s5_mono/local
@@ -0,0 +1,9 @@
export KALDI_ROOT=`pwd`/../../..
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
export PATH=$PWD/dscore:$PATH
export PYTHONPATH="${PYTHONPATH}:$PWD/dscore"
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C
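Note that this path.sh expects the dscore toolkit (used for diarization scoring) to be checked out inside the recipe directory. Assuming the commonly used dscore repository is the one intended, that would be something like:

git clone https://github.com/nryant/dscore.git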
@@ -0,0 +1 @@
../../../scripts/rnnlm/
@@ -0,0 +1,105 @@
#!/usr/bin/env bash
#
# LibriCSS multi-channel baseline recipe.
#
# Copyright 2020 Johns Hopkins University (Author: Desh Raj)
# Apache 2.0

# Begin configuration section.
nj=50
decode_nj=20
stage=0

# Different stages
data_prep_stage=0
asr_stage=1
diarizer_stage=0
rnnlm_stage=0
decode_stage=0
rnnlm_rescore=true

enhancement=beamformit
wpe=true

use_oracle_segments=true

# End configuration section
. ./utils/parse_options.sh

. ./cmd.sh
. ./path.sh

dereverb=
$wpe && dereverb=_dereverb

test_sets="dev${dereverb}_${enhancement} eval${dereverb}_${enhancement}"

set -e # exit on error

# please change the paths accordingly
libricss_corpus=/export/corpora/LibriCSS
librispeech_corpus=/export/corpora/LibriSpeech/

##########################################################################
# We first prepare the LibriCSS data (7ch) in the Kaldi data
# format. We use session 0 for dev and the others for eval. We also
# apply online multichannel WPE for dereverberation and then combine
# all channels using beamforming.
##########################################################################
if [ $stage -le 0 ]; then
  local/data_prep_7ch.sh --stage $data_prep_stage --wpe $wpe \
    --enhancement $enhancement $libricss_corpus
fi

#########################################################################
# ASR MODEL TRAINING
# In this stage, we prepare the Librispeech data and train our ASR model.
# This part is taken from the librispeech recipe, with the parts related
# to decoding removed. We use the 100h clean subset to train most of the
# GMM models, except the SAT model, which is trained on the 460h clean
# subset. The nnet is trained on the full 960h (clean + other).
# To avoid training the whole ASR system from scratch, you can download
# the chain model using:
# wget http://kaldi-asr.org/models/13/0013_librispeech_s5.tar.gz
# Once it is downloaded, extract using: tar -xvzf 0013_librispeech_s5.tar.gz
# and copy the contents of the exp/ directory to your exp/.
#########################################################################
if [ $stage -le 1 ]; then
  local/train_asr.sh --stage $asr_stage --nj $nj $librispeech_corpus
fi

##########################################################################
# DIARIZATION MODEL TRAINING
# You can also download a pretrained diarization model using:
# wget http://kaldi-asr.org/models/12/0012_diarization_v1.tar.gz
# Once it is downloaded, extract using: tar -xvzf 0012_diarization_v1.tar.gz
# and copy the contents of the exp/ directory to your exp/.
##########################################################################
if [ $stage -le 2 ]; then
  local/train_diarizer.sh --stage $diarizer_stage \
    --data-dir data/train_other_500 \
    --model-dir exp/xvector_nnet_1a
fi

##########################################################################
# RNNLM TRAINING
# We train a TDNN-LSTM based LM that will be used for rescoring the
# decoded lattices.
##########################################################################
if [ $stage -le 3 ]; then
  local/rnnlm/train.sh --stage $rnnlm_stage
fi

##########################################################################
# DECODING: We assume that we are just given the raw recordings (approx 10
# mins each), without segments or speaker information, so we have to run
# the whole pipeline, i.e., SAD -> Diarization -> ASR. This is done in the
# local/decode.sh script.
##########################################################################
if [ $stage -le 4 ]; then
  local/decode.sh --stage $decode_stage \
    --test-sets "$test_sets" \
    --rnnlm-rescore $rnnlm_rescore \
    --use-oracle-segments $use_oracle_segments
fi

exit 0;
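With the corpus paths at the top adjusted, the recipe can be run end-to-end or resumed from a given stage using the configuration variables above, for example:

./run.sh
# or, to skip straight to decoding with already-trained models:
./run.sh --stage 4 --use-oracle-segments false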
@@ -0,0 +1 @@
../../sre08/v1/sid
@@ -0,0 +1 @@
../../wsj/s5/steps/
@@ -0,0 +1 @@
../../wsj/s5/utils/
@@ -0,0 +1,130 @@
#!/usr/bin/env bash
#
# Copyright 2020 Johns Hopkins University (Author: Desh Raj)
# Apache 2.0

# Begin configuration section.
enhancement=beamformit
wpe=true
stage=0
# End configuration section

. ./utils/parse_options.sh  # accept options

. ./path.sh

echo >&2 "$0" "$@"
if [ $# -ne 1 ]; then
  echo >&2 "$0: Error: wrong number of arguments"
  echo -e >&2 "Usage:\n  $0 [opts] <corpus-dir>"
  echo -e >&2 "eg:\n  $0 /export/corpora/LibriCSS"
  exit 1
fi

corpus_dir=$1

dereverb=
$wpe && dereverb=_dereverb

set -e -o pipefail

if [ $stage -le 0 ]; then
  # If the data is not already present, then download and unzip it.
  if [ ! -d $corpus_dir/for_release ]; then
    echo "Downloading and unpacking LibriCSS data."
    CWD=`pwd`
    mkdir -p $corpus_dir

    cd $corpus_dir

    # Download the data. If the data has already been downloaded, this
    # does nothing. (See wget -c.)
    wget -c --load-cookies /tmp/cookies.txt \
      "https://docs.google.com/uc?export=download&confirm=$(wget --quiet \
      --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate \
      'https://docs.google.com/uc?export=download&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l' \
      -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l" \
      -O for_release.zip && rm -rf /tmp/cookies.txt

    # unzip (skip files that are already extracted)
    unzip -n for_release.zip

    # segmentation
    cd for_release
    python3 segment_libricss.py -data_path .

    cd $CWD
  fi
fi

if [ $stage -le 1 ]; then
  # Process the downloaded data directory to get data in Kaldi format. Here we
  # keep all channels.
  mkdir -p data/local/data/
  local/prepare_data.py --srcpath $corpus_dir/for_release --tgtpath data/local/data
fi

if [ $stage -le 2 ] && $wpe; then
  # Perform online multichannel WPE
  local/run_wpe.sh --cmd "$train_cmd --mem 60G" \
    data/local/data

  # Change the paths in wav.scp to point to the dereverberated files
  mv data/local/data/wav.scp data/local/data/wav.scp.bak
  sed 's/wavs/wavs_dereverb/g' data/local/data/wav.scp.bak > data/local/data/wav.scp
fi
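To make the path rewrite concrete, a wav.scp entry changes as follows (the recording id and path below are illustrative, not taken from the actual data):

# before:  session0_CH1  /path/to/data/local/data/wavs/session0_CH1.wav
# after:   session0_CH1  /path/to/data/local/data/wavs_dereverb/session0_CH1.wav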
|
||
if [ $stage -le 3 ]; then | ||
if [ $enhancement == "gss" ]; then | ||
echo "$0: GSS not implemented yet" | ||
elif [ $enhancement == "beamformit" ]; then | ||
local/run_beamformit.sh --cmd "$train_cmd" \ | ||
data/local/data/wavs \ | ||
data/local/data_beamformit/wavs | ||
|
||
# Also create other files. Note that we still name the beamformed file as CH0 | ||
# only for consistency in naming (for scoring purposes) | ||
for file in wav.scp utt2spk text segments; do | ||
cat data/local/data/$file | sed 's/_CH[0-9]/_CH0/g' | sort -u > data/local/data_beamformit/$file | ||
done | ||
sed -i 's/data\/local\/data\/wavs_dereverb/data\/local\/data_beamformit\/wavs/g' data/local/data_beamformit/wav.scp | ||
|
||
else | ||
echo "$0: Enhancement type $enhancement not found" | ||
exit 1 | ||
fi | ||
fi | ||
|
||
if [ $stage -le 4 ]; then | ||
# Create dev and eval splits based on sessions. In total we have 10 sessions (session0 to | ||
# session9) of approximately 1 hour each. In the below strings, separate each session by | ||
# '\|' to perform grep at once. | ||
dev_sessions="session0" | ||
eval_sessions="session1\|session2\|session3\|session4\|session5\|session6\|session7\|session8\|session9" | ||
|
||
mkdir -p data/dev${dereverb}_${enhancement} | ||
for file in wav.scp utt2spk text segments; do | ||
grep $dev_sessions data/local/data_${enhancement}/"$file" | sort > data/dev${dereverb}_${enhancement}/"$file" | ||
done | ||
|
||
mkdir -p data/eval${dereverb}_${enhancement} | ||
for file in wav.scp utt2spk text segments; do | ||
grep $eval_sessions data/local/data_${enhancement}/"$file" | sort > data/eval${dereverb}_${enhancement}/"$file" | ||
done | ||
fi | ||
|
||
if [ $stage -le 5 ]; then | ||
# Move the utt2spk, segments, and text file to .bak so that they are only used | ||
# in the last scoring stage. We also prepare a dummy utt2spk and spk2utt for | ||
# these. | ||
for dataset in dev eval; do | ||
datadir=${dataset}${dereverb}_${enhancement} | ||
for file in text utt2spk segments; do | ||
mv data/$datadir/$file data/$datadir/$file.bak | ||
done | ||
|
||
awk '{print $1, $1}' data/$datadir/wav.scp > data/$datadir/utt2spk | ||
utils/utt2spk_to_spk2utt.pl data/$datadir/utt2spk > data/$datadir/spk2utt | ||
done | ||
fi |
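The effect of the dummy mapping is that each whole recording is treated as its own "speaker" until the scoring stage. For instance, assuming a wav.scp entry keyed by a recording id like session0_CH0 (illustrative), the generated files would contain:

# utt2spk:  session0_CH0 session0_CH0
# spk2utt:  session0_CH0 session0_CH0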