Skip to content

Commit

Permalink
[egs] Fix encoding issues in Chinese ASR recipe (#3430) (#3434)
Browse files Browse the repository at this point in the history
  • Loading branch information
boystray authored and danpovey committed Jun 29, 2019
1 parent f5d34d7 commit 8c0277e
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 0 deletions.
2 changes: 2 additions & 0 deletions egs/aidatatang_200zh/s5/local/create_oov_char_lexicon.pl
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,14 @@
use utf8;
# Load the pronunciation table from the dictionary file named by the first
# command-line argument.  Every line is split on whitespace; the first field
# becomes the key and the second field is stored as its value in %prons.
my %prons;
# Three-argument open: characters in the file name can no longer be taken
# for an open mode, and the UTF-8 decoding layer is applied at open time.
open(my $dict_fh, "<:encoding(utf8)", $ARGV[0]) || die("Can't open dict ".$ARGV[0]."\n");
# Read one line at a time instead of first building a list of every line.
while (my $dict_line = <$dict_fh>) {
  chomp $dict_line;
  # @A stays the same package variable the script used before, in case
  # statements further down the file read it.
  @A = split(" ", $dict_line);
  $prons{$A[0]} = $A[1];
}
close $dict_fh;

open(WORDS, $ARGV[1]) || die("Can't open oov word list ".$ARGV[1]."\n");
binmode(WORDS,":encoding(utf8)");
while (<WORDS>) {
chomp;
print $_;
Expand Down
4 changes: 4 additions & 0 deletions egs/aidatatang_200zh/s5/local/prepare_dict.sh
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ wc -l $dict_dir/lexicon-ch/lexicon-ch-iv.txt
cat $dict_dir/cedict/ch-dict.txt |\
perl -e '
use utf8;
binmode(STDIN,":encoding(utf8)");
binmode(STDOUT,":encoding(utf8)");
while (<STDIN>) {
@A = split(" ", $_);
$word_len = length($A[0]);
Expand All @@ -190,6 +192,8 @@ cat $dict_dir/cedict/ch-dict.txt |\
cat $dict_dir/cedict/ch-dict-1.txt | awk '{print $1}' |\
perl -e '
use utf8;
binmode(STDIN,":encoding(utf8)");
binmode(STDOUT,":encoding(utf8)");
while (<STDIN>) {
@A = split(" ", $_);
@chars = split("", $A[0]);
Expand Down

0 comments on commit 8c0277e

Please sign in to comment.