kaldi-asr · danpovey · Oct 9, 2018 · Sep 21, 2018 · Sep 21, 2018 · Sep 23, 2018
diff --git a/egs/multi_en/s5/local/g2p/apply_g2p.sh b/egs/multi_en/s5/local/g2p/apply_g2p.sh
diff --git a/egs/multi_en/s5/local/g2p/train_g2p.sh b/egs/multi_en/s5/local/g2p/train_g2p.sh
diff --git a/egs/multi_en/s5/run.sh b/egs/multi_en/s5/run.sh
@@ -58,8 +58,8 @@ if [ $stage -le 1 ]; then
   # We prepare the basic dictionary in data/local/dict_combined.
   local/prepare_dict.sh $swbd $tedlium2
   (
-   local/g2p/train_g2p.sh --stage 0 --silence-phones \
-     "data/local/dict_combined/silence_phones.txt" data/local/dict_combined exp/g2p || touch exp/g2p/.error
+   steps/dict/train_g2p_phonetisaurus.sh --stage 0 --silence-phones \
+     "data/local/dict_combined/silence_phones.txt" data/local/dict_combined/lexicon.txt exp/g2p || touch exp/g2p/.error
   ) &
 fi
 
@@ -114,8 +114,27 @@ if [ $stage -le 4 ]; then
   mkdir -p $dict_dir
   rm $dict_dir/lexiconp.txt 2>/dev/null || true
   cp data/local/dict_combined/{extra_questions,nonsilence_phones,silence_phones,optional_silence}.txt $dict_dir
-  local/g2p/apply_g2p.sh --var-counts 1 exp/g2p/model.fst data/local/g2p_phonetisarus \
-    data/local/dict_combined/lexicon.txt $dict_dir/lexicon.txt || exit 1;
+
+  echo 'Gathering missing words...'
+
+  lexicon=data/local/dict_combined/lexicon.txt
+  g2p_tmp_dir=data/local/g2p_phonetisarus
+  mkdir -p $g2p_tmp_dir
+
+  # awk command from http://stackoverflow.com/questions/2626274/print-all-but-the-first-three-columns
+  cat data/*/train/text | \
+    local/count_oovs.pl $lexicon | \
+    awk '{if (NF > 3 ) {for(i=4; i<NF; i++) printf "%s ",$i; print $NF;}}' | \
+    perl -ape 's/\s/\n/g;' | \
+    sort | uniq > $g2p_tmp_dir/missing.txt
+  cat $g2p_tmp_dir/missing.txt | \
+    grep "^[a-z]*$"  > $g2p_tmp_dir/missing_onlywords.txt
+
+  steps/dict/apply_g2p_phonetisaurus.sh --nbest 1 exp/g2p/model.fst $g2p_tmp_dir/missing_onlywords.txt $g2p_tmp_dir/missing_lexicon.txt || exit 1;
+
+  expanded_lexicon=$dict_dir/lexicon.txt
+  echo "Adding new pronunciations to get expanded lexicon $expanded_lexicon"
+  cat <(cut -f 1,3 $g2p_tmp_dir/missing_lexicon.txt) $lexicon | sort | uniq > $expanded_lexicon
 fi
 
 # We'll do multiple iterations of pron/sil-prob estimation. So the structure of

diff --git a/egs/wsj/s5/steps/dict/apply_g2p_phonetisaurus.sh b/egs/wsj/s5/steps/dict/apply_g2p_phonetisaurus.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+# Copyright 2014  Johns Hopkins University (Author: Yenda Trmal)
+# Copyright 2016  Xiaohui Zhang
+#           2018  Ruizhe Huang
+# Apache 2.0
+
+# This script applies a trained Phonetisaurus G2P model to
+# synthesize pronunciations for missing words (i.e., words in
+# transcripts but not the lexicon), and outputs a lexicon for those words only.
+
+# Begin configuration section.  
+stage=0
+nbest=          # Generate up to $nbest variants
+pmass=          # Generate so many variants to produce $pmass amount, like 90%, of the prob mass
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. utils/parse_options.sh || exit 1;
+
+set -u
+set -e
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 [options] <g2p-model> <word-list> <lexicon-out>"
+  echo "... where <g2p-model> is the trained g2p model."
+  echo "          <word-list> is a list of words whose pronunciation is to be generated."
+  echo "          <lexicon-out> output lexicon, whose format is <word>\t<prob>\t<pronunciation> for each line."
+  echo "e.g.: $0 exp/g2p/model.fst exp/g2p/oov_words.txt data/local/dict_nosp/lexicon.txt"
+  echo ""
+  echo "main options (for others, see top of script file)"
+  echo "  --nbest <int>    # Maximum number of hypotheses to produce. By default, nbest=1."
+  echo "  --pmass <float>  # Select the maximum number of hypotheses summing to a total mass of pmass amount, within [0, 1], for a word."
+  echo "  --nbest <int> --pmass <float>  # When specified together, we generate the intersection of these two options."
+  exit 1;
+fi
+
+model=$1
+word_list=$2
+out_lexicon=$3
+
+# Validate options; expansions are quoted so odd values (spaces, globs) cannot break the tests and skip validation.
+[ -n "$nbest" ] && [[ ! "$nbest" =~ ^0*[1-9][0-9]*$ ]] && echo "$0: nbest should be a positive integer." && exit 1
+[ -n "$pmass" ] && ! { [[ "$pmass" =~ ^[0-9]+\.?[0-9]*$ ]] && [ "$(bc <<< "$pmass >= 0")" -eq 1 ] && [ "$(bc <<< "$pmass <= 1")" -eq 1 ]; } && echo "$0: pmass should be within [0, 1]." && exit 1
+[ -z "$pmass" ] && [ -z "$nbest" ] && nbest=1  # neither option given: fall back to the single best pronunciation
+
+if [ -z "$pmass" ]; then
+  echo "Synthesizing pronunciations for words in $word_list based on nbest=$nbest"
+  options=(--nbest "$nbest" --pmass 1.0)       # pmass fixed at 1.0 when only nbest is requested
+elif [ -z "$nbest" ]; then
+  echo "Synthesizing pronunciations for words in $word_list based on pmass=$pmass"
+  options=(--pmass "$pmass" --nbest 20)        # nbest fixed at 20 when only pmass is requested
+else
+  echo "Synthesizing pronunciations for words in $word_list based on nbest=$nbest and pmass=$pmass"
+  options=(--pmass "$pmass" --nbest "$nbest")  # both given: both limits are passed through
+fi
+phonetisaurus-apply "${options[@]}" --model "$model" --thresh 5 --accumulate --verbose --prob --word_list "$word_list" 1>"$out_lexicon"
+
+echo "Finished. Synthesized lexicon for new words is in $out_lexicon"
+
+exit 0
diff --git a/egs/wsj/s5/steps/dict/train_g2p_phonetisaurus.sh b/egs/wsj/s5/steps/dict/train_g2p_phonetisaurus.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# Copyright 2017  Intellisist, Inc. (Author: Navneeth K)
+#           2017  Xiaohui Zhang
+#           2018  Ruizhe Huang
+# Apache License 2.0
+
+# This script trains a g2p model using Phonetisaurus.
+
+stage=0
+encoding='utf-8'
+only_words=true
+silence_phones=
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. utils/parse_options.sh || exit 1;
+
+set -u
+set -e
+
+if [ $# != 2 ]; then  # exactly <lexicon-in> and <work-dir> must remain after option parsing
+  echo "Usage: $0 [options] <lexicon-in> <work-dir>"
+  echo "    where <lexicon-in> is the training lexicon (one pronunciation per "
+  echo "    word per line, with lines like 'hello h uh l ow') and"
+  echo "    <work-dir> is directory where the models will be stored"
+  echo "e.g.: $0 --silence-phones data/local/dict/silence_phones.txt data/local/dict/lexicon.txt exp/g2p/"
+  echo ""
+  echo "main options (for others, see top of script file)"
+  echo "  --stage <n>                  (default: 0)        # Skip steps before stage n (0: lexicon prep, 1: align, 2: LM, 3: FST)."
+  echo "  --silence-phones <silphones-list>                # e.g. data/local/dict/silence_phones.txt."
+  echo "                                                   # A list of silence phones, one or more per line"
+  echo "                                                   # Relates to  --only-words option"
+  echo "  --only-words (true|false)    (default: true)     # If true, exclude silence words, i.e."
+  echo "                                                   # words whose whole pronunciation is a single silence phone."
+  exit 1;
+fi
+
+lexicon=$1
+wdir=$2
+
+[ ! -f "$lexicon" ] && echo "Cannot find $lexicon" && exit 1  # non-zero status so callers see the failure
+
+# Probe for uconv inside the 'if' condition: a failing lookup there does not trip 'set -e', so the hint below is printed.
+if ! command -v uconv >/dev/null 2>&1; then
+  echo "uconv was not found. You must install the icu4c package."
+  exit 1;
+fi
+
+mkdir -p $wdir
+
+
+# For input lexicon, remove pronunciations containing non-utf-8-encodable characters,
+# and optionally remove words that are mapped to a single silence phone from the lexicon.
+if [ $stage -le 0 ]; then
+  if $only_words && [ ! -z "$silence_phones" ]; then
+    awk 'NR==FNR{a[$1] = 1; next} {s=$2;for(i=3;i<=NF;i++) s=s" "$i; if(!(s in a)) print $1" "s}' \
+      $silence_phones $lexicon | \
+      awk '{printf("%s\t",$1); for (i=2;i<NF;i++){printf("%s ",$i);} printf("%s\n",$NF);}' | \
+      uconv -f utf-8  -t utf-8 -x Any-NFC - | awk 'NF > 0'> $wdir/lexicon_tab_separated.txt
+  else
+    awk '{printf("%s\t",$1); for (i=2;i<NF;i++){printf("%s ",$i);} printf("%s\n",$NF);}' $lexicon | \
+      uconv -f utf-8  -t utf-8 -x Any-NFC - | awk 'NF > 0'> $wdir/lexicon_tab_separated.txt
+  fi
+fi
+
+if [ $stage -le 1 ]; then
+  # Align lexicon stage. Lexicon is assumed to have first column tab separated
+  phonetisaurus-align --input=$wdir/lexicon_tab_separated.txt --ofile=${wdir}/aligned_lexicon.corpus || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+  # Convert aligned lexicon to arpa using make_kn_lm.py, a re-implementation of srilm's ngram-count functionality.
+  ./utils/lang/make_kn_lm.py -ngram-order 7 -text ${wdir}/aligned_lexicon.corpus -lm ${wdir}/aligned_lexicon.arpa
+fi
+
+if [ $stage -le 3 ]; then
+  # Convert the arpa file to FST.
+  phonetisaurus-arpa2wfst --lm=${wdir}/aligned_lexicon.arpa --ofile=${wdir}/model.fst
+fi
+