[scripts,egs] Added phonetisaurus-based g2p scripts (#2730)
Phonetisaurus is much faster to train than Sequitur.
huangruizhe authored and danpovey committed Oct 9, 2018
1 parent 50411bd commit 735e2a5
Showing 6 changed files with 590 additions and 113 deletions.
42 changes: 0 additions & 42 deletions egs/multi_en/s5/local/g2p/apply_g2p.sh

This file was deleted.

67 changes: 0 additions & 67 deletions egs/multi_en/s5/local/g2p/train_g2p.sh

This file was deleted.

28 changes: 24 additions & 4 deletions egs/multi_en/s5/run.sh
@@ -58,8 +58,8 @@ if [ $stage -le 1 ]; then
# We prepare the basic dictionary in data/local/dict_combined.
local/prepare_dict.sh $swbd $tedlium2
(
local/g2p/train_g2p.sh --stage 0 --silence-phones \
"data/local/dict_combined/silence_phones.txt" data/local/dict_combined exp/g2p || touch exp/g2p/.error
steps/dict/train_g2p_phonetisaurus.sh --stage 0 --silence-phones \
"data/local/dict_combined/silence_phones.txt" data/local/dict_combined/lexicon.txt exp/g2p || touch exp/g2p/.error
) &
fi

@@ -114,8 +114,28 @@ if [ $stage -le 4 ]; then
mkdir -p $dict_dir
rm $dict_dir/lexiconp.txt 2>/dev/null || true
cp data/local/dict_combined/{extra_questions,nonsilence_phones,silence_phones,optional_silence}.txt $dict_dir
local/g2p/apply_g2p.sh --var-counts 1 exp/g2p/model.fst data/local/g2p_phonetisarus \
data/local/dict_combined/lexicon.txt $dict_dir/lexicon.txt || exit 1;

echo 'Gathering missing words...'

lexicon=data/local/dict_combined/lexicon.txt
g2p_tmp_dir=data/local/g2p_phonetisarus
mkdir -p $g2p_tmp_dir

# awk command from http://stackoverflow.com/questions/2626274/print-all-but-the-first-three-columns
cat data/*/train/text | \
local/count_oovs.pl $lexicon | \
awk '{if (NF > 3 ) {for(i=4; i<NF; i++) printf "%s ",$i; print $NF;}}' | \
perl -ape 's/\s/\n/g;' | \
sort | uniq > $g2p_tmp_dir/missing.txt
cat $g2p_tmp_dir/missing.txt | \
grep "^[a-z]*$" > $g2p_tmp_dir/missing_onlywords.txt

steps/dict/apply_g2p_phonetisaurus.sh --nbest 1 $g2p_tmp_dir/missing_onlywords.txt exp/g2p exp/g2p/oov_lex || exit 1;
cp exp/g2p/oov_lex/lexicon.lex $g2p_tmp_dir/missing_lexicon.txt

extended_lexicon=$dict_dir/lexicon.txt
echo "Adding new pronunciations to get extended lexicon $extended_lexicon"
cat <(cut -f 1,3 $g2p_tmp_dir/missing_lexicon.txt) $lexicon | sort | uniq > $extended_lexicon
fi

# We'll do multiple iterations of pron/sil-prob estimation. So the structure of
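The run.sh changes above follow a general pattern: train a Phonetisaurus G2P on the existing lexicon, synthesize pronunciations for the OOV words found in the training transcripts, and merge them back into the dictionary. Below is a minimal sketch of that pattern, assuming a standard Kaldi egs directory with data/local/dict/lexicon.txt and data/train/text; the work-directory names and the plain-shell OOV filter are illustrative stand-ins for the recipe's local/count_oovs.pl pipeline.

lexicon=data/local/dict/lexicon.txt   # assumed input lexicon
g2p_dir=exp/g2p
oov_dir=data/local/g2p_oov            # hypothetical work directory

# Train the G2P model (produces $g2p_dir/model.fst).
steps/dict/train_g2p_phonetisaurus.sh --silence-phones \
  data/local/dict/silence_phones.txt $lexicon $g2p_dir

# Collect word types from the transcripts that are missing from the lexicon.
mkdir -p $oov_dir
cut -d' ' -f2- data/train/text | tr ' ' '\n' | grep -v '^$' | sort -u | \
  awk 'NR==FNR {seen[$1]=1; next} !($1 in seen)' $lexicon - \
  > $oov_dir/oov_words.txt

# Synthesize pronunciations and merge them into an extended lexicon;
# lexicon.lex lines are <word>\t<prob>\t<pronunciation>, so drop the prob column.
steps/dict/apply_g2p_phonetisaurus.sh --nbest 1 \
  $oov_dir/oov_words.txt $g2p_dir $g2p_dir/oov_lex
cat <(cut -f 1,3 $g2p_dir/oov_lex/lexicon.lex) $lexicon | \
  sort -u > data/local/dict/lexicon_extended.txt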
99 changes: 99 additions & 0 deletions egs/wsj/s5/steps/dict/apply_g2p_phonetisaurus.sh
@@ -0,0 +1,99 @@
#!/bin/bash
# Copyright 2014 Johns Hopkins University (Author: Yenda Trmal)
# Copyright 2016 Xiaohui Zhang
# 2018 Ruizhe Huang
# Apache 2.0

# This script applies a trained Phonetisaurus G2P model to
# synthesize pronunciations for missing words (i.e., words in
# the transcripts but not in the lexicon), and outputs the expanded lexicon.
# The user can specify either the nbest or the pmass option
# to determine the number of output pronunciation variants,
# or use both together to get the intersection of the two criteria.

# Begin configuration section.
stage=0
nbest= # Generate up to N (e.g. N=3) pronunciation variants for each word
# (The maximum size of the nbest list, not considering pruning and taking the prob-mass yet).
thresh=5 # Pruning threshold for the n-best list, in (0, 99], which is a -log-probability value.
# A large threshold makes the nbest list shorter, and less likely to hit the max size.
# This value corresponds to the weight_threshold in shortest-path.h of openfst.
pmass= # Select the top variants from the pruned nbest list,
# summing up to this total prob-mass for a word.
# On the "boundary", it's greedy by design, e.g. if pmass = 0.8,
# and we have prob(pron_1) = 0.5, and prob(pron_2) = 0.4, then we get both.
# End configuration section.

echo "$0 $@" # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. utils/parse_options.sh || exit 1;

set -u
set -e

if [ $# != 3 ]; then
echo "Usage: $0 [options] <word-list> <g2p-model-dir> <output-dir>"
echo "... where <word-list> is a list of words whose pronunciation is to be generated."
echo " <g2p-model-dir> is a directory used as a target during training of G2P"
echo " <output-dir> is the directory where the output lexicon should be stored."
echo " The format of the output lexicon output-dir/lexicon.lex is"
echo " <word>\t<prob>\t<pronunciation> per line."
echo "e.g.: $0 --nbest 1 exp/g2p/oov_words.txt exp/g2p exp/g2p/oov_lex"
echo ""
echo "main options (for others, see top of script file)"
echo " --nbest <int> # Generate upto N pronunciation variants for each word."
echo " --pmass <float> # Select the top variants from the pruned nbest list,"
echo " # summing up to this total prob-mass, within [0, 1], for a word."
echo " --thresh <int> # Pruning threshold for n-best."
exit 1;
fi

wordlist=$1
modeldir=$2
outdir=$3

model=$modeldir/model.fst
output_lex=$outdir/lexicon.lex
mkdir -p $outdir

[ ! -f ${model:-} ] && echo "$0: File $model not found in the directory $modeldir." && exit 1
[ ! -f $wordlist ] && echo "$0: File $wordlist not found!" && exit 1
[ -z $pmass ] && [ -z $nbest ] && echo "$0: nbest and/or pmass should be specified." && exit 1;
if ! phonetisaurus=`which phonetisaurus-apply` ; then
echo "Phonetisarus was not found !"
echo "Go to $KALDI_ROOT/tools and execute extras/install_phonetisaurus.sh"
exit 1
fi

cp $wordlist $outdir/wordlist.txt

# Three modes: 1) nbest only, 2) pmass only, 3) nbest + pmass.
nbest=${nbest:-20} # if nbest is not specified, default it to 20; the script always passes both options to phonetisaurus-apply
pmass=${pmass:-1.0} # if pmass is not specified, default it to 1.0 (i.e. keep all prob-mass)

[[ ! $nbest =~ ^[1-9][0-9]*$ ]] && echo "$0: nbest should be a positive integer." && exit 1;

echo "Applying the G2P model to wordlist $wordlist"
phonetisaurus-apply --pmass $pmass --nbest $nbest --thresh $thresh \
--word_list $wordlist --model $model \
--accumulate --verbose --prob \
1>$output_lex

echo "Completed. Synthesized lexicon for new words is in $output_lex"

# Some words might have been removed or skipped during the process,
# let's check it and warn the user if so...
nlex=`cut -f 1 $output_lex | sort -u | wc -l`
nwlist=`cut -f 1 $wordlist | sort -u | wc -l`
if [ $nlex -ne $nwlist ] ; then
failed_wordlist=$outdir/lexicon.failed
echo "WARNING: Unable to generate pronunciation for all words. ";
echo "WARINNG: Wordlist: $nwlist words"
echo "WARNING: Lexicon : $nlex words"
comm -13 <(cut -f 1 $output_lex | sort -u ) \
<(cut -f 1 $wordlist | sort -u ) \
>$failed_wordlist && echo "WARNING: The list of failed words is in $failed_wordlist"
fi
exit 0
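For reference, the three selection modes described in the header comments could be exercised as follows; the word list and output directory names are illustrative.

# nbest only: up to 3 pronunciation variants per word, ranked by model probability.
steps/dict/apply_g2p_phonetisaurus.sh --nbest 3 \
  exp/g2p/oov_words.txt exp/g2p exp/g2p/oov_lex_nbest

# pmass only: keep the top variants whose probabilities sum to 0.9
# (greedy on the boundary, as noted above).
steps/dict/apply_g2p_phonetisaurus.sh --pmass 0.9 \
  exp/g2p/oov_words.txt exp/g2p exp/g2p/oov_lex_pmass

# Both: the intersection of the two criteria, i.e. at most 3 variants
# covering at most 0.9 of the prob-mass per word.
steps/dict/apply_g2p_phonetisaurus.sh --nbest 3 --pmass 0.9 \
  exp/g2p/oov_words.txt exp/g2p exp/g2p/oov_lex_both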

88 changes: 88 additions & 0 deletions egs/wsj/s5/steps/dict/train_g2p_phonetisaurus.sh
@@ -0,0 +1,88 @@
#!/bin/bash

# Copyright 2017 Intellisist, Inc. (Author: Navneeth K)
# 2017 Xiaohui Zhang
# 2018 Ruizhe Huang
# Apache License 2.0

# This script trains a G2P model using Phonetisaurus.

stage=0
encoding='utf-8'
only_words=true
silence_phones=

echo "$0 $@" # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. utils/parse_options.sh || exit 1;

set -u
set -e

if [ $# != 2 ]; then
echo "Usage: $0 [options] <lexicon-in> <work-dir>"
echo " where <lexicon-in> is the training lexicon (one pronunciation per "
echo " word per line, with lines like 'hello h uh l ow') and"
echo " <work-dir> is directory where the models will be stored"
echo "e.g.: $0 --silence-phones data/local/dict/silence_phones.txt data/local/dict/lexicon.txt exp/g2p/"
echo ""
echo "main options (for others, see top of script file)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --silence-phones <silphones-list> # e.g. data/local/dict/silence_phones.txt."
echo " # A list of silence phones, one or more per line"
echo " # Relates to --only-words option"
echo " --only-words (true|false) (default: true) # If true, exclude silence words, i.e."
echo " # words with one or multiple phones which are all silence."
exit 1;
fi

lexicon=$1
wdir=$2

[ ! -f $lexicon ] && echo "Cannot find $lexicon" && exit 1

isuconv=`which uconv`
if [ -z $isuconv ]; then
echo "uconv was not found. You must install the icu4c package."
exit 1;
fi

if ! phonetisaurus=`which phonetisaurus-apply` ; then
echo "Phonetisarus was not found !"
echo "Go to $KALDI_ROOT/tools and execute extras/install_phonetisaurus.sh"
exit 1
fi

mkdir -p $wdir


# For input lexicon, remove pronunciations containing non-utf-8-encodable characters,
# and optionally remove words that are mapped to a single silence phone from the lexicon.
if [ $stage -le 0 ]; then
if $only_words && [ ! -z "$silence_phones" ]; then
awk 'NR==FNR{a[$1] = 1; next} {s=$2;for(i=3;i<=NF;i++) s=s" "$i; if(!(s in a)) print $1" "s}' \
$silence_phones $lexicon | \
awk '{printf("%s\t",$1); for (i=2;i<NF;i++){printf("%s ",$i);} printf("%s\n",$NF);}' | \
uconv -f "$encoding" -t "$encoding" -x Any-NFC - | awk 'NF > 0'> $wdir/lexicon_tab_separated.txt
else
awk '{printf("%s\t",$1); for (i=2;i<NF;i++){printf("%s ",$i);} printf("%s\n",$NF);}' $lexicon | \
uconv -f "$encoding" -t "$encoding" -x Any-NFC - | awk 'NF > 0'> $wdir/lexicon_tab_separated.txt
fi
fi

if [ $stage -le 1 ]; then
# Alignment stage. The lexicon is assumed to have a tab-separated first column.
phonetisaurus-align --input=$wdir/lexicon_tab_separated.txt --ofile=${wdir}/aligned_lexicon.corpus || exit 1;
fi

if [ $stage -le 2 ]; then
# Convert the aligned lexicon to an ARPA LM using make_kn_lm.py, a re-implementation of SRILM's ngram-count functionality.
./utils/lang/make_kn_lm.py -ngram-order 7 -text ${wdir}/aligned_lexicon.corpus -lm ${wdir}/aligned_lexicon.arpa
fi

if [ $stage -le 3 ]; then
# Convert the ARPA LM to an FST.
phonetisaurus-arpa2wfst --lm=${wdir}/aligned_lexicon.arpa --ofile=${wdir}/model.fst
fi
