Skip to content

Commit

Permalink
[scripts] handle frame_shift and utt2num_frames in utils/ (kaldi-asr#3323
Browse files Browse the repository at this point in the history
)

subset_data_dir.sh has been refactored thoroughly so that its
logic can be followed more easily. It has been well tested and
dogfooded.

All changes here are necessary to subset, combine and verify
utt2num_frames, and copy frame_shift to new directories where
necessary.
  • Loading branch information
kkm (aka Kirill Katsnelson) authored and danpovey committed Jun 19, 2019
1 parent 9ae4a5c commit 74ebdee
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 124 deletions.
20 changes: 19 additions & 1 deletion egs/wsj/s5/utils/combine_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,20 @@ for dir in $*; do
fi
done

# Check that the frame_shift files are compatible wherever they are present
# together with features: every source directory that has both feats.scp and
# frame_shift must have a byte-identical frame_shift (compared with cmp),
# otherwise the script prints an error and exits with status 1.
# Afterwards, dir_with_frame_shift holds the last source directory that had
# both files, or is empty if no directory did.
dir_with_frame_shift=
# "$@" keeps each directory argument as one word (no splitting or globbing).
for dir in "$@"; do
  if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then
    if [[ $dir_with_frame_shift ]] &&
       ! cmp -s "$dir_with_frame_shift/frame_shift" "$dir/frame_shift"; then
      echo "$0:error: different frame_shift in directories $dir and" \
           "$dir_with_frame_shift. Cannot combine features."
      exit 1;
    fi
    dir_with_frame_shift=$dir
  fi
done

# With respect to the utt2uniq file, the script behaves differently than for other files:
# it is not compulsory for it to exist in the src directories, but if it exists in
# even one it should exist in all. We will create the files where necessary
Expand Down Expand Up @@ -94,7 +108,7 @@ else
echo "$0 [info]: not combining segments as it does not exist"
fi

for file in utt2spk utt2lang utt2dur reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
exists_somewhere=false
absent_somewhere=false
for d in $*; do
Expand All @@ -121,6 +135,10 @@ done

# Regenerate spk2utt in the destination from the combined utt2spk.
utils/utt2spk_to_spk2utt.pl <"$dest/utt2spk" >"$dest/spk2utt"

# If dir_with_frame_shift is non-empty, copy its frame_shift into the
# destination; abort if the copy fails rather than silently leaving the
# destination without a frame_shift.
if [[ $dir_with_frame_shift ]]; then
  cp "$dir_with_frame_shift/frame_shift" "$dest" || exit 1
fi

# $skip_fix is executed as a command (presumably it holds "true" or "false");
# when it fails, run fix_data_dir.sh on the destination.
if ! $skip_fix ; then
  utils/fix_data_dir.sh "$dest" || exit 1;
fi
Expand Down
7 changes: 5 additions & 2 deletions egs/wsj/s5/utils/copy_data_dir.sh
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ fi
# For each per-utterance table present in the source directory, rewrite its
# first field through $destdir/utt_map (apply_map.pl -f 1) and write the
# result under the same name in the destination directory.
for utt_file in utt2dur utt2num_frames; do
  if [[ -f $srcdir/$utt_file ]]; then
    utils/apply_map.pl -f 1 "$destdir/utt_map" \
      <"$srcdir/$utt_file" >"$destdir/$utt_file"
  fi
done
if [ -f $srcdir/reco2dur ]; then
if [ -f $srcdir/segments ]; then
cp $srcdir/reco2dur $destdir/reco2dur
Expand All @@ -116,7 +119,7 @@ fi
# cmvn.scp is rewritten through $destdir/spk_map (first field only), unlike
# the per-utterance tables, which go through utt_map.
if [[ -f $srcdir/cmvn.scp ]]; then
  utils/apply_map.pl -f 1 "$destdir/spk_map" <"$srcdir/cmvn.scp" >"$destdir/cmvn.scp"
fi
for f in stm glm ctm; do
for f in frame_shift stm glm ctm; do
if [ -f $srcdir/$f ]; then
cp $srcdir/$f $destdir
fi
Expand All @@ -126,7 +129,7 @@ rm $destdir/spk_map $destdir/utt_map

echo "$0: copied data from $srcdir to $destdir"

for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel stm glm ctm; do
for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do
if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then
echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to"
echo " ... $destdir/.backup/$f"
Expand Down
226 changes: 109 additions & 117 deletions egs/wsj/s5/utils/subset_data_dir.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,167 +34,159 @@

shortest=false
perspk=false
first_opt=""
speakers=false
spk_list_specified=false
utt_list_specified=false

if [ "$1" == "--per-spk" ]; then
perspk=true;
shift;
elif [ "$1" == "--shortest" ]; then
shortest=true;
shift;
elif [ "$1" == "--first" ]; then
first_opt="--first";
shift;
elif [ "$1" == "--speakers" ]; then
speakers=true
shift;
elif [ "$1" == "--last" ]; then
first_opt="--last";
shift;
elif [ "$1" == "--spk-list" ]; then
spk_list_specified=true
shift;
elif [ "$1" == "--utt-list" ]; then
utt_list_specified=true
shift;
fi




if [ $# != 3 ]; then
echo "Usage: "
first_opt=
spk_list=
utt_list=

expect_args=3
case $1 in
--first|--last) first_opt=$1; shift ;;
--per-spk) perspk=true; shift ;;
--shortest) shortest=true; shift ;;
--speakers) speakers=true; shift ;;
--spk-list) shift; spk_list=$1; shift; expect_args=2 ;;
--utt-list) shift; utt_list=$1; shift; expect_args=2 ;;
--*) echo "$0: invalid option '$1'"; exit 1
esac

if [ $# != $expect_args ]; then
echo "Usage:"
echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>"
echo " subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>"
echo " subset_data_dir.sh [--utt-list <utterance-list-file>] <srcdir> <destdir>"
echo " subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>"
echo "By default, randomly selects <num-utt> utterances from the data directory."
echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances"
echo "With --per-spk, selects <num-utt> utterances per speaker, if available."
echo "With --first, selects the first <num-utt> utterances"
echo "With --last, selects the last <num-utt> utterances"
echo "With --shortest, selects the shortest <num-utt> utterances."
echo "With --spk-list, reads the speakers to keep from <speaker-list-file>"
echo "With --utt-list, reads the utterances to keep from <utt-list-file>"
exit 1;
fi

if $spk_list_specified; then
spk_list=$1
srcdir=$2
destdir=$3
elif $utt_list_specified; then
utt_list=$1
srcdir=$2
destdir=$3
srcdir=$1
if [[ $spk_list || $utt_list ]]; then
numutt=
destdir=$2
else
srcdir=$1
numutt=$2
destdir=$3
fi


export LC_ALL=C

if [ ! -f $srcdir/utt2spk ]; then
echo "subset_data_dir.sh: no such file $srcdir/utt2spk"
exit 1;
echo "$0: no such file $srcdir/utt2spk"
exit 1
fi

function do_filtering {
# assumes the utt2spk and spk2utt files already exist.
[ -f $srcdir/feats.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp
[ -f $srcdir/vad.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp
[ -f $srcdir/utt2lang ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang
[ -f $srcdir/utt2dur ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur
[ -f $srcdir/utt2num_frames ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames
[ -f $srcdir/utt2uniq ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq
[ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp
[ -f $srcdir/spk2warp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp
[ -f $srcdir/utt2warp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp
[ -f $srcdir/text ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text
[ -f $srcdir/spk2gender ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender
[ -f $srcdir/cmvn.scp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp
if [ -f $srcdir/segments ]; then
utils/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments
awk '{print $2;}' $destdir/segments | sort | uniq > $destdir/reco # recordings.
# The next line would override the command above for wav.scp, which would be incorrect.
[ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp
[ -f $srcdir/reco2file_and_channel ] && \
utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
[ -f $srcdir/reco2dur ] && \
utils/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur

# Filter the STM file for proper sclite scoring
# Copy over the comments from STM file
[ -f $srcdir/stm ] && grep "^;;" $srcdir/stm > $destdir/stm
[ -f $srcdir/stm ] && utils/filter_scp.pl $destdir/reco < $srcdir/stm >> $destdir/stm

rm $destdir/reco
else
awk '{print $1;}' $destdir/wav.scp | sort | uniq > $destdir/reco
[ -f $srcdir/reco2file_and_channel ] && \
utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
[ -f $srcdir/reco2dur ] && \
utils/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur

rm $destdir/reco
fi
srcutts=`cat $srcdir/utt2spk | wc -l`
destutts=`cat $destdir/utt2spk | wc -l`
echo "$0: reducing #utt from $srcutts to $destutts"
}
if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then
echo "$0: cannot subset to more utterances than you originally had."
exit 1
fi

if $shortest && [ ! -f $srcdir/feats.scp ]; then
echo "$0: you selected --shortest but no feats.scp exist."
exit 1
fi

mkdir -p $destdir || exit 1

if $spk_list_specified; then
mkdir -p $destdir
if [[ $spk_list ]]; then
utils/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1;
utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1;
do_filtering; # bash function.
exit 0;
elif $utt_list_specified; then
mkdir -p $destdir
elif [[ $utt_list ]]; then
utils/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1;
utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1;
do_filtering; # bash function.
exit 0;
elif $speakers; then
mkdir -p $destdir
utils/shuffle_list.pl < $srcdir/spk2utt | awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | \
utils/shuffle_list.pl < $srcdir/spk2utt |
awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' |
sort > $destdir/spk2utt
utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
do_filtering; # bash function.
exit 0;
elif $perspk; then
mkdir -p $destdir
awk '{ n='$numutt'; printf("%s ",$1); skip=1; while(n*(skip+1) <= NF-1) { skip++; }
awk '{ n='$numutt'; printf("%s ",$1);
skip=1; while(n*(skip+1) <= NF-1) { skip++; }
for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); }
printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt
utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
do_filtering; # bash function.
exit 0;
else
if [ $numutt -gt `cat $srcdir/utt2spk | wc -l` ]; then
echo "subset_data_dir.sh: cannot subset to more utterances than you originally had."
exit 1;
fi
mkdir -p $destdir || exit 1;

## scripting note: $shortest evaluates to true or false
## so this becomes the command true or false.
if $shortest; then
# select the n shortest utterances.
# Select $numutt shortest utterances.
. ./path.sh
[ ! -f $srcdir/feats.scp ] && echo "$0: you selected --shortest but no feats.scp exist." && exit 1;
feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1;
sort -n -k2 $destdir/tmp.len | awk '{print $1}' | head -$numutt >$destdir/tmp.uttlist
sort -n -k2 $destdir/tmp.len |
awk '{print $1}' |
head -$numutt >$destdir/tmp.uttlist
utils/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk
rm $destdir/tmp.uttlist $destdir/tmp.len
else
# Select $numutt random utterances.
utils/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1;
fi
utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt
do_filtering;
exit 0;
fi

# Perform filtering. utt2spk and spk2utt files already exist by this point.
# Every table that exists in $srcdir is filtered with utils/filter_scp.pl
# against the appropriate id list and written to $destdir under the same name.

# Filter by utterance: the id list is the new utt2spk.
for f in feats.scp vad.scp utt2lang utt2dur utt2num_frames utt2uniq wav.scp \
         utt2warp text; do
  if [[ -f $srcdir/$f ]]; then
    utils/filter_scp.pl "$destdir/utt2spk" <"$srcdir/$f" >"$destdir/$f"
  fi
done

# Filter by speaker: the id list is the new spk2utt.
for f in spk2warp spk2gender cmvn.scp; do
  if [[ -f $srcdir/$f ]]; then
    utils/filter_scp.pl "$destdir/spk2utt" <"$srcdir/$f" >"$destdir/$f"
  fi
done

# Filter by recording-id. $destdir/reco is a temporary list of recording-ids.
if [[ -f $srcdir/segments ]]; then
  utils/filter_scp.pl "$destdir/utt2spk" <"$srcdir/segments" >"$destdir/segments"
  # Recording-ids are the second field of segments.
  awk '{print $2}' "$destdir/segments" | sort -u >"$destdir/reco"
  # With segments, wav.scp must be filtered by recording-id; this replaces the
  # wav.scp produced by the by-utterance loop above, which would be incorrect.
  if [[ -f $srcdir/wav.scp ]]; then
    utils/filter_scp.pl "$destdir/reco" <"$srcdir/wav.scp" >"$destdir/wav.scp"
  fi
elif [[ -f $destdir/wav.scp ]]; then
  # No segments; recording-ids are the first field of wav.scp.
  awk '{print $1}' "$destdir/wav.scp" | sort -u >"$destdir/reco"
else
  # Neither segments nor wav.scp: use an empty recording list instead of
  # running awk on a missing file (which only produced an error message).
  : >"$destdir/reco"
fi

for f in reco2file_and_channel reco2dur; do
  if [[ -f $srcdir/$f ]]; then
    utils/filter_scp.pl "$destdir/reco" <"$srcdir/$f" >"$destdir/$f"
  fi
done

# Filter the STM file for proper sclite scoring.
# The ';;' comment lines of the STM file are copied over first, followed by
# the entries selected by recording-id.
if [[ -f $srcdir/stm ]]; then
  (grep "^;;" "$srcdir/stm"
   utils/filter_scp.pl "$destdir/reco" "$srcdir/stm") >"$destdir/stm"
fi

rm "$destdir/reco"

# Copy frame_shift if present.
if [[ -f $srcdir/frame_shift ]]; then
  cp "$srcdir/frame_shift" "$destdir"
fi

srcutts=$(wc -l <"$srcdir/utt2spk")
destutts=$(wc -l <"$destdir/utt2spk")
echo "$0: reducing #utt from $srcutts to $destutts"
exit 0
20 changes: 16 additions & 4 deletions egs/wsj/s5/utils/validate_data_dir.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,7 @@ function check_sorted_and_uniq {
}

function partial_diff {
diff $1 $2 | head -n 6
echo "..."
diff $1 $2 | tail -n 6
diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6)
n1=`cat $1 | wc -l`
n2=`cat $2 | wc -l`
echo "[Lengths are $1=$n1 versus $2=$n2]"
Expand Down Expand Up @@ -341,9 +339,23 @@ if [ -f $data/utt2dur ]; then
exit 1;
fi
cat $data/utt2dur | \
awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1
awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1
fi

# Validate utt2num_frames, if present:
#  - run check_sorted_and_uniq on it (helper defined elsewhere in this file);
#  - its first column must be identical to $tmpdir/utts (per the error message
#    below, the utterance-ids taken from utt2spk);
#  - every line must have exactly two fields, the second a positive integer.
if [ -f "$data/utt2num_frames" ]; then
  check_sorted_and_uniq "$data/utt2num_frames"
  awk '{print $1}' "$data/utt2num_frames" > "$tmpdir/utts.utt2num_frames"
  if ! cmp -s "$tmpdir/utts" "$tmpdir/utts.utt2num_frames"; then
    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file"
    echo "$0: differ, partial diff is:"
    partial_diff "$tmpdir/utts" "$tmpdir/utts.utt2num_frames"
    exit 1
  fi
  # $2 != int($2) rejects non-integer and non-numeric frame counts.
  awk '{
    if (NF != 2 || !($2 > 0) || $2 != int($2)) {
      print "Bad line utt2num_frames:" NR ":" $0
      exit 1 } }' "$data/utt2num_frames" || exit 1
fi

if [ -f $data/reco2dur ]; then
check_sorted_and_uniq $data/reco2dur
Expand Down

0 comments on commit 74ebdee

Please sign in to comment.