do fixes according to code review

kaldi-asr · Sep 23, 2018 · 771a556 · 771a556
1 parent 61d9560
commit 771a556
Show file tree

Hide file tree

Showing 6 changed files with 23 additions and 128 deletions.
diff --git a/egs/multi_en/s5/local/g2p/apply_g2p.sh b/egs/multi_en/s5/local/g2p/apply_g2p.sh
diff --git a/egs/multi_en/s5/local/g2p/train_g2p.sh b/egs/multi_en/s5/local/g2p/train_g2p.sh
diff --git a/egs/multi_en/s5/run.sh b/egs/multi_en/s5/run.sh
@@ -115,13 +115,13 @@ if [ $stage -le 4 ]; then
   rm $dict_dir/lexiconp.txt 2>/dev/null || true
   cp data/local/dict_combined/{extra_questions,nonsilence_phones,silence_phones,optional_silence}.txt $dict_dir
 
-  # awk command from http://stackoverflow.com/questions/2626274/print-all-but-the-first-three-columns
   echo 'Gathering missing words...'
 
   lexicon=data/local/dict_combined/lexicon.txt
   g2p_tmp_dir=data/local/g2p_phonetisarus
   mkdir -p $g2p_tmp_dir
 
+  # awk command from http://stackoverflow.com/questions/2626274/print-all-but-the-first-three-columns
   cat data/*/train/text | \
     local/count_oovs.pl $lexicon | \
     awk '{if (NF > 3 ) {for(i=4; i<NF; i++) printf "%s ",$i; print $NF;}}' | \
@@ -134,11 +134,9 @@ if [ $stage -le 4 ]; then
 
   expanded_lexicon=$dict_dir/lexicon.txt
   echo "Adding new pronunciations to get expanded lexicon $expanded_lexicon"
-  cat "$lexicon" $g2p_tmp_dir/missing_lexicon.txt | sort | uniq > $expanded_lexicon
+  cat <(cut -f 1,3 $g2p_tmp_dir/missing_lexicon.txt) $lexicon | sort | uniq > $expanded_lexicon
 fi
 
-exit 0
-
 # We'll do multiple iterations of pron/sil-prob estimation, so the dict/lang
 # dirs are named ${dict/lang_root}_${dict_affix}, where dict_affix
 # is "nosp" or the name of the acoustic model we use to estimate pron/sil-probs.

diff --git a/egs/wsj/s5/steps/dict/apply_g2p_phonetisaurus.sh b/egs/wsj/s5/steps/dict/apply_g2p_phonetisaurus.sh
@@ -10,9 +10,8 @@
 
 # Begin configuration section.  
 stage=0
-nbest=1    # Generate up to N variants
-pmass=     # Generate so many variants to produce 90 % of the prob mass
-model=
+nbest=          # Generate up to $nbest variants
+pmass=          # Generate as many variants as needed to cover $pmass (within [0, 1], e.g. 0.9) of the prob mass
 # End configuration section.
 
 echo "$0 $@"  # Print the command line for logging
@@ -25,31 +24,38 @@ set -e
 
 if [ $# != 3 ]; then
   echo "Usage: $0 [options] <g2p-model> <word-list> <lexicon-out>"
-  echo "... where <g2p-model> is the trained g2p model"
-  echo "          <word-list> is a list of words whose pronunciation is to be generated"
-  echo "          <lexicon-out> output lexicon, whose format is ...."  # TODO
-  echo "e.g.: $0 exp/g2p/model.fst exp/g2p/oov_words.txt exp/g2p/model.fst data/local/dict_nosp/lexicon.txt"
+  echo "... where <g2p-model> is the trained g2p model."
+  echo "          <word-list> is a list of words whose pronunciation is to be generated."
+  echo "          <lexicon-out> output lexicon, whose format is <word>\t<prob>\t<pronunciation> for each line."
+  echo "e.g.: $0 exp/g2p/model.fst exp/g2p/oov_words.txt data/local/dict_nosp/lexicon.txt"
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --nbest <int>    # Maximum number of hypotheses to produce. By default, nbest=1."
-  echo "  --pmass <float>  # Select the maximum number of hypotheses summing to pmass total mass"
-  echo "                   # for a word. By default, pmass is disabled."
+  echo "  --pmass <float>  # Select the maximum number of hypotheses summing to a total mass of pmass amount, within [0, 1], for a word."
+  echo "  --nbest <int> --pmass <float>  # When specified together, we generate the intersection of these two options."
   exit 1;
 fi
 
 model=$1
 word_list=$2
 out_lexicon=$3
 
+[ ! -z $nbest ] && [[ ! $nbest =~ ^[0-9]+$ ]] && echo "$0: nbest should be a positive integer." && exit 1
+[ ! -z $pmass ] && ! { [[ $pmass =~ ^[0-9]+\.?[0-9]*$ ]] && [ $(bc <<< "$pmass >= 0") -eq 1 -a $(bc <<< "$pmass <= 1") -eq 1 ]; } \
+  && echo "$0: pmass should be within [0, 1]." && exit 1
+[ -z $pmass ] && [ -z $nbest ] && nbest=1
 
 if [ -z $pmass ]; then
-  echo "Synthesizing pronunciations for words in $word_list based on nbest = $nbest"
-  option="--nbest $nbest"
+  echo "Synthesizing pronunciations for words in $word_list based on nbest=$nbest"
+  options="--nbest $nbest --pmass 1.0"
+elif [ -z $nbest ]; then
+  echo "Synthesizing pronunciations for words in $word_list based on pmass=$pmass"
+  options="--pmass $pmass --nbest 20"
 else
-  echo "Synthesizing pronunciations for words in $word_list based on pmass = $pmass"
-  option="--pmass $pmass"
+  echo "Synthesizing pronunciations for words in $word_list based on nbest=$nbest and pmass=$pmass"
+  options="--pmass $pmass --nbest $nbest"
 fi
-phonetisaurus-apply $option --model $model --thresh 5 --accumulate --word_list $word_list > $out_lexicon
+phonetisaurus-apply $options --model $model --thresh 5 --accumulate --verbose --prob --word_list $word_list 1>$out_lexicon
 
 echo "Finished. Synthesized lexicon for new words is in $out_lexicon"
 

diff --git a/egs/wsj/s5/steps/dict/train_g2p_phonetisaurus.sh b/egs/wsj/s5/steps/dict/train_g2p_phonetisaurus.sh
@@ -72,7 +72,7 @@ fi
 
 if [ $stage -le 2 ]; then
   # Convert aligned lexicon to arpa using make_kn_lm.py, a re-implementation of srilm's ngram-count functionality.
-  ./steps/dict/make_kn_lm.py -ngram-order 7 -text ${wdir}/aligned_lexicon.corpus -lm ${wdir}/aligned_lexicon.arpa
+  ./utils/lang/make_kn_lm.py -ngram-order 7 -text ${wdir}/aligned_lexicon.corpus -lm ${wdir}/aligned_lexicon.arpa
 fi
 
 if [ $stage -le 3 ]; then

diff --git a/egs/wsj/s5/steps/dict/make_kn_lm.py → egs/wsj/s5/utils/lang/make_kn_lm.py b/egs/wsj/s5/steps/dict/make_kn_lm.py → egs/wsj/s5/utils/lang/make_kn_lm.py