Skip to content

Commit

Permalink
do fixes according to code review
Browse files Browse the repository at this point in the history
  • Loading branch information
huangruizhe committed Sep 23, 2018
1 parent 61d9560 commit 771a556
Show file tree
Hide file tree
Showing 6 changed files with 23 additions and 128 deletions.
42 changes: 0 additions & 42 deletions egs/multi_en/s5/local/g2p/apply_g2p.sh

This file was deleted.

67 changes: 0 additions & 67 deletions egs/multi_en/s5/local/g2p/train_g2p.sh

This file was deleted.

6 changes: 2 additions & 4 deletions egs/multi_en/s5/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,13 @@ if [ $stage -le 4 ]; then
rm $dict_dir/lexiconp.txt 2>/dev/null || true
cp data/local/dict_combined/{extra_questions,nonsilence_phones,silence_phones,optional_silence}.txt $dict_dir

# awk command from http://stackoverflow.com/questions/2626274/print-all-but-the-first-three-columns
echo 'Gathering missing words...'

lexicon=data/local/dict_combined/lexicon.txt
g2p_tmp_dir=data/local/g2p_phonetisarus
mkdir -p $g2p_tmp_dir

# awk command from http://stackoverflow.com/questions/2626274/print-all-but-the-first-three-columns
cat data/*/train/text | \
local/count_oovs.pl $lexicon | \
awk '{if (NF > 3 ) {for(i=4; i<NF; i++) printf "%s ",$i; print $NF;}}' | \
Expand All @@ -134,11 +134,9 @@ if [ $stage -le 4 ]; then

expanded_lexicon=$dict_dir/lexicon.txt
echo "Adding new pronunciations to get expanded lexicon $expanded_lexicon"
cat "$lexicon" $g2p_tmp_dir/missing_lexicon.txt | sort | uniq > $expanded_lexicon
cat <(cut -f 1,3 $g2p_tmp_dir/missing_lexicon.txt) $lexicon | sort | uniq > $expanded_lexicon
fi

exit 0

# We'll do multiple iterations of pron/sil-prob estimation, so the dict/lang dirs
# are named ${dict/lang_root}_${dict_affix}, where dict_affix
# is "nosp" or the name of the acoustic model we use to estimate pron/sil-probs.
Expand Down
34 changes: 20 additions & 14 deletions egs/wsj/s5/steps/dict/apply_g2p_phonetisaurus.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@

# Begin configuration section.
stage=0
nbest=1 # Generate up to N variants
pmass= # Generate so many variants to produce 90 % of the prob mass
model=
nbest= # Generate up to $nbest variants
pmass= # Generate as many variants as needed to cover $pmass (e.g. 0.9 for 90%) of the prob mass
# End configuration section.

echo "$0 $@" # Print the command line for logging
Expand All @@ -25,31 +24,38 @@ set -e

if [ $# != 3 ]; then
echo "Usage: $0 [options] <g2p-model> <word-list> <lexicon-out>"
echo "... where <g2p-model> is the trained g2p model"
echo " <word-list> is a list of words whose pronunciation is to be generated"
echo " <lexicon-out> output lexicon, whose format is ...." # TODO
echo "e.g.: $0 exp/g2p/model.fst exp/g2p/oov_words.txt exp/g2p/model.fst data/local/dict_nosp/lexicon.txt"
echo "... where <g2p-model> is the trained g2p model."
echo " <word-list> is a list of words whose pronunciation is to be generated."
echo " <lexicon-out> output lexicon, whose format is <word>\t<prob>\t<pronunciation> for each line."
echo "e.g.: $0 exp/g2p/model.fst exp/g2p/oov_words.txt data/local/dict_nosp/lexicon.txt"
echo ""
echo "main options (for others, see top of script file)"
echo " --nbest <int> # Maximum number of hypotheses to produce. By default, nbest=1."
echo " --pmass <float> # Select the maximum number of hypotheses summing to pmass total mass"
echo " # for a word. By default, pmass is disabled."
echo " --pmass <float> # Select the maximum number of hypotheses summing to a total mass of pmass amount, within [0, 1], for a word."
echo " --nbest <int> --pmass <float> # When specified together, we generate the intersection of these two options."
exit 1;
fi

model=$1
word_list=$2
out_lexicon=$3

[ ! -z $nbest ] && [[ ! $nbest =~ ^[0-9]+$ ]] && echo "$0: nbest should be a positive integer." && exit 1
[ ! -z $pmass ] && ! { [[ $pmass =~ ^[0-9]+\.?[0-9]*$ ]] && [ $(bc <<< "$pmass >= 0") -eq 1 -a $(bc <<< "$pmass <= 1") -eq 1 ]; } \
&& echo "$0: pmass should be within [0, 1]." && exit 1
[ -z $pmass ] && [ -z $nbest ] && nbest=1

if [ -z $pmass ]; then
echo "Synthesizing pronunciations for words in $word_list based on nbest = $nbest"
option="--nbest $nbest"
echo "Synthesizing pronunciations for words in $word_list based on nbest=$nbest"
options="--nbest $nbest --pmass 1.0"
elif [ -z $nbest ]; then
echo "Synthesizing pronunciations for words in $word_list based on pmass=$pmass"
options="--pmass $pmass --nbest 20"
else
echo "Synthesizing pronunciations for words in $word_list based on pmass = $pmass"
option="--pmass $pmass"
echo "Synthesizing pronunciations for words in $word_list based on nbest=$nbest and pmass=$pmass"
options="--pmass $pmass --nbest $nbest"
fi
phonetisaurus-apply $option --model $model --thresh 5 --accumulate --word_list $word_list > $out_lexicon
phonetisaurus-apply $options --model $model --thresh 5 --accumulate --verbose --prob --word_list $word_list 1>$out_lexicon

echo "Finished. Synthesized lexicon for new words is in $out_lexicon"

Expand Down
2 changes: 1 addition & 1 deletion egs/wsj/s5/steps/dict/train_g2p_phonetisaurus.sh
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ fi

if [ $stage -le 2 ]; then
# Convert aligned lexicon to arpa using make_kn_lm.py, a re-implementation of srilm's ngram-count functionality.
./steps/dict/make_kn_lm.py -ngram-order 7 -text ${wdir}/aligned_lexicon.corpus -lm ${wdir}/aligned_lexicon.arpa
./utils/lang/make_kn_lm.py -ngram-order 7 -text ${wdir}/aligned_lexicon.corpus -lm ${wdir}/aligned_lexicon.arpa
fi

if [ $stage -le 3 ]; then
Expand Down
File renamed without changes.

0 comments on commit 771a556

Please sign in to comment.