kaldi-asr · danpovey · Nov 13, 2018 · Nov 5, 2018 · Nov 6, 2018 · Nov 6, 2018
diff --git a/egs/callhome_diarization/v1/diarization/cluster.sh b/egs/callhome_diarization/v1/diarization/cluster.sh
@@ -14,6 +14,7 @@ stage=0
 nj=10
 cleanup=true
 threshold=0.5
+rttm_channel=0
 read_costs=false
 reco2num_spk=
 # End configuration section.
@@ -35,6 +36,8 @@ if [ $# != 2 ]; then
   echo "  --threshold <threshold|0>                        # Cluster stopping criterion. Clusters with scores greater"
   echo "                                                   # than this value will be merged until all clusters"
   echo "                                                   # exceed this value."
+  echo "  --rttm-channel <rttm-channel|0>                  # The value passed into the RTTM channel field. Only affects"
+  echo "                                                   # the format of the RTTM file."
   echo "  --read-costs <read-costs|false>                  # If true, interpret input scores as costs, i.e. similarity"
   echo "                                                   # is indicated by smaller values. If enabled, clusters will"
   echo "                                                   # be merged until all cluster scores are less than the"
@@ -86,7 +89,7 @@ fi
 
 if [ $stage -le 2 ]; then
   echo "$0: computing RTTM"
-  diarization/make_rttm.py $srcdir/segments $dir/labels $dir/rttm || exit 1;
+  diarization/make_rttm.py --rttm-channel $rttm_channel $srcdir/segments $dir/labels $dir/rttm || exit 1;
 fi
 
 if $cleanup ; then

diff --git a/egs/callhome_diarization/v1/diarization/extract_ivectors.sh b/egs/callhome_diarization/v1/diarization/extract_ivectors.sh
@@ -29,6 +29,10 @@ min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
 posterior_scale=1.0 # This scale helps to control for successve features being highly
                     # correlated.  E.g. try 0.1 or 0.3.
 apply_cmn=true # If true, apply sliding window cepstral mean normalization
+apply_deltas=true # If true, copy the delta options from the i-vector extractor directory.
+                  # If false, we won't add deltas in this step. For speaker diarization,
+                  # we sometimes need to write features to disk that already have various
+                  # post-processing applied so adding deltas is no longer needed in this stage.
 # End configuration section.
 
 echo "$0 $@"  # Print the command line for logging
@@ -57,6 +61,12 @@ if [ $# != 3 ]; then
   echo "  --min-post <min-post|0.025>                      # Pruning threshold for posteriors"
   echo "  --apply-cmn <true,false|true>                    # if true, apply sliding window cepstral mean"
   echo "                                                   # normalization to features"
+  echo "  --apply-deltas <true,false|true>                 # If true, copy the delta options from the i-vector"
+  echo "                                                   # extractor directory. If false, we won't add deltas"
+  echo "                                                   # in this step. For speaker diarization, we sometimes"
+  echo "                                                   # need to write features to disk that already have"
+  echo "                                                   # various post-processing applied so adding deltas is"
+  echo "                                                   # no longer needed in this stage."
   exit 1;
 fi
 
@@ -95,7 +105,11 @@ mkdir -p $dir/log
 sub_sdata=$sub_data/split$nj;
 utils/split_data.sh $sub_data $nj || exit 1;
 
-delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
+if $apply_deltas; then
+  delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
+else
+  delta_opts="--delta-order=0"
+fi
 
 ## Set up features.
 if $apply_cmn; then

diff --git a/egs/callhome_diarization/v1/diarization/make_rttm.py b/egs/callhome_diarization/v1/diarization/make_rttm.py
@@ -51,6 +51,9 @@ def get_args():
                       help="Input labels file")
   parser.add_argument("rttm_file", type=str,
                       help="Output RTTM file")
+  parser.add_argument("--rttm-channel", type=int, default=0,
+                      help="The value passed into the RTTM channel field. \
+                      Only affects the format of the RTTM file.")
 
   args = parser.parse_args()
   return args
@@ -120,8 +123,8 @@ def main():
       reco = segs[0]
       for i in range(1, len(segs)):
         start, end, label = segs[i].strip().split(',')
-        print("SPEAKER {0} 0 {1:7.3f} {2:7.3f} <NA> <NA> {3} <NA> <NA>".format(
-          reco, float(start), float(end)-float(start), label), file=rttm_writer)
+        print("SPEAKER {0} {1} {2:7.3f} {3:7.3f} <NA> <NA> {4} <NA> <NA>".format(
+          reco, args.rttm_channel, float(start), float(end)-float(start), label), file=rttm_writer)
 
 if __name__ == '__main__':
   main()
diff --git a/egs/dihard_2018/README.txt b/egs/dihard_2018/README.txt
@@ -0,0 +1,14 @@
+
+ This is a Kaldi recipe for The First DIHARD Speech Diarization Challenge.
+ DIHARD is a new annual challenge focusing on “hard” diarization; that is,
+ speech diarization for challenging corpora where there is an expectation that
+ the current state-of-the-art will fare poorly, including, but not limited
+ to: clinical interviews, extended child language acquisition recordings,
+ YouTube videos and “speech in the wild” (e.g., recordings in restaurants).
+ See https://coml.lscp.ens.fr/dihard/index.html for details.
+
+ The subdirectories "v1" and so on are different speaker diarization
+ recipes. The recipe in v1 demonstrates a standard approach using a
+ full-covariance GMM-UBM, i-vectors, PLDA scoring and agglomerative
+ hierarchical clustering. The example in v2 demonstrates DNN speaker 
+ embeddings, PLDA scoring and agglomerative hierarchical clustering.
diff --git a/egs/dihard_2018/v1/README.txt b/egs/dihard_2018/v1/README.txt
@@ -0,0 +1,13 @@
+ This is the speaker diarization recipe for The First DIHARD Speech
+ Diarization Challenge (DIHARD 2018). There are two tracks in the DIHARD 2018
+ competition: one uses oracle SAD (track1) and the other requires that SAD
+ be performed from scratch (track2). This recipe is for track1.
+
+ The recipe is closely based on the following paper:
+ http://www.danielpovey.com/files/2018_interspeech_dihard.pdf but doesn't
+ include the VB refinement. The system mainly consists of a full-covariance
+ GMM-UBM, an i-vector extractor (T-matrix), PLDA scoring and agglomerative
+ hierarchical clustering. The VoxCeleb datasets are used for training the
+ i-vector extractor and PLDA. The development set of the DIHARD 2018
+ competition is used as a validation set to tune parameters. The system is
+ tested on the DIHARD 2018 evaluation set.
diff --git a/egs/dihard_2018/v1/cmd.sh b/egs/dihard_2018/v1/cmd.sh
@@ -0,0 +1,15 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="queue.pl"
+
+
diff --git a/egs/dihard_2018/v1/conf/mfcc.conf b/egs/dihard_2018/v1/conf/mfcc.conf
@@ -0,0 +1,7 @@
+--sample-frequency=16000
+--frame-length=25 # the default is 25
+--low-freq=20 # the default.
+--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case).
+--num-mel-bins=30
+--num-ceps=24
+--snip-edges=false
diff --git a/egs/dihard_2018/v1/conf/vad.conf b/egs/dihard_2018/v1/conf/vad.conf
@@ -0,0 +1,2 @@
+--vad-energy-threshold=5.5
+--vad-energy-mean-scale=0.5
diff --git a/egs/dihard_2018/v1/diarization b/egs/dihard_2018/v1/diarization
@@ -0,0 +1 @@
+../../callhome_diarization/v1/diarization
diff --git a/egs/dihard_2018/v1/local/make_dihard_2018_dev.py b/egs/dihard_2018/v1/local/make_dihard_2018_dev.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+
+# This script is called by local/make_dihard_2018_dev.sh, and it creates the
+# necessary files for DIHARD 2018 development directory.
+
+import sys, os
+
+def prepare_dihard_2018_dev(src_dir, data_dir):
+    wavscp_fi = open(data_dir + "/wav.scp" , 'w')
+    utt2spk_fi = open(data_dir + "/utt2spk" , 'w')
+    segments_fi = open(data_dir + "/segments" , 'w')
+    rttm_fi = open(data_dir + "/rttm" , 'w')
+    reco2num_spk_fi = open(data_dir + "/reco2num_spk" , 'w')
+
+    for subdir, dirs, files in os.walk(src_dir):
+        for file in files:
+            filename = os.path.join(subdir, file)
+            if filename.endswith(".lab"):
+                utt = os.path.basename(filename).split(".")[0]
+                lines = open(filename, 'r').readlines()
+                segment_id = 0
+                for line in lines:
+                    start, end, speech = line.split()
+                    segment_id_str = "{}_{}".format(utt, str(segment_id).zfill(4))
+                    segments_str = "{} {} {} {}\n".format(segment_id_str, utt, start, end)
+                    utt2spk_str = "{} {}\n".format(segment_id_str, utt)
+                    segments_fi.write(segments_str)
+                    utt2spk_fi.write(utt2spk_str)
+                    segment_id += 1
+                wav_str = "{} sox -t flac {}/data/flac/{}.flac -t wav -r 16k "\
+                        "-b 16 - channels 1 |\n".format(utt, src_dir, utt)
+                wavscp_fi.write(wav_str)
+                with open("{}/data/rttm/{}.rttm".format(src_dir, utt), 'r') as fh:
+                    rttm_str = fh.read()
+                rttm_fi.write(rttm_str)
+                with open("{}/data/rttm/{}.rttm".format(src_dir, utt), 'r') as fh:
+                    rttm_list = fh.readlines()
+                spk_list = map(lambda x: (x.split())[7], rttm_list) 
+                num_spk = len(set(spk_list))
+                reco2num_spk_fi.write("{} {}\n".format(utt, num_spk))
+    wavscp_fi.close()
+    utt2spk_fi.close()
+    segments_fi.close()
+    rttm_fi.close()
+    reco2num_spk_fi.close()
+    return 0
+
+def main():
+    src_dir = sys.argv[1]
+    data_dir = sys.argv[2]
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+    prepare_dihard_2018_dev(src_dir, data_dir)
+    return 0
+
+if __name__=="__main__":
+    main()
diff --git a/egs/dihard_2018/v1/local/make_dihard_2018_dev.sh b/egs/dihard_2018/v1/local/make_dihard_2018_dev.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Copyright 2018   Zili Huang
+# Apache 2.0.
+#
+# This script, called by ../run.sh, creates the DIHARD 2018 development data directory.
+
+if [ $# != 2 ]; then
+  echo "Usage: $0 <path-to-dihard_2018_dev> <path-to-output>"
+  echo " e.g.: $0 /export/corpora/LDC/LDC2018E31 data/dihard_2018_dev"
+  exit 1
+fi
+
+path_to_dihard_2018_dev=$1
+data_dir=$2
+
+echo "Preparing ${data_dir}..."
+local/make_dihard_2018_dev.py "${path_to_dihard_2018_dev}" "${data_dir}" || exit 1
+
+# Stable-sort rttm by recording id (field 2) and reco2num_spk by its key (field 1).
+sort -k 2,2 -s "${data_dir}/rttm" > "${data_dir}/rttm_tmp" || exit 1
+mv "${data_dir}/rttm_tmp" "${data_dir}/rttm"
+sort -k 1,1 -s "${data_dir}/reco2num_spk" > "${data_dir}/reco2num_spk_tmp" || exit 1
+mv "${data_dir}/reco2num_spk_tmp" "${data_dir}/reco2num_spk"
+utils/fix_data_dir.sh "${data_dir}"
diff --git a/egs/dihard_2018/v1/local/make_dihard_2018_eval.py b/egs/dihard_2018/v1/local/make_dihard_2018_eval.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+
+# This script is called by local/make_dihard_2018_eval.sh, and it creates the
+# necessary files for DIHARD 2018 evaluation directory.
+
+import sys, os
+
+def prepare_dihard_2018_eval(src_dir, data_dir):
+    wavscp_fi = open(data_dir + "/wav.scp" , 'w')
+    utt2spk_fi = open(data_dir + "/utt2spk" , 'w')
+    segments_fi = open(data_dir + "/segments" , 'w')
+    rttm_fi = open(data_dir + "/rttm" , 'w')
+    reco2num_spk_fi = open(data_dir + "/reco2num_spk" , 'w')
+
+    for subdir, dirs, files in os.walk(src_dir):
+        for file in files:
+            filename = os.path.join(subdir, file)
+            if filename.endswith(".lab"):
+                utt = os.path.basename(filename).split(".")[0]
+                lines = open(filename, 'r').readlines()
+                segment_id = 0
+                for line in lines:
+                    start, end, speech = line.split()
+                    segment_id_str = "{}_{}".format(utt, str(segment_id).zfill(4))
+                    segments_str = "{} {} {} {}\n".format(segment_id_str, utt, start, end)
+                    utt2spk_str = "{} {}\n".format(segment_id_str, utt)
+                    segments_fi.write(segments_str)
+                    utt2spk_fi.write(utt2spk_str)
+                    segment_id += 1
+                wav_str = "{} sox -t flac {}/data/flac/{}.flac -t wav -r 16k "\
+                        "-b 16 - channels 1 |\n".format(utt, src_dir, utt)
+                wavscp_fi.write(wav_str)
+                with open("{}/data/rttm/{}.rttm".format(src_dir, utt), 'r') as fh:
+                    rttm_str = fh.read()
+                rttm_fi.write(rttm_str)
+                with open("{}/data/rttm/{}.rttm".format(src_dir, utt), 'r') as fh:
+                    rttm_list = fh.readlines()
+                spk_list = map(lambda x: (x.split())[7], rttm_list) 
+                num_spk = len(set(spk_list))
+                reco2num_spk_fi.write("{} {}\n".format(utt, num_spk))
+    wavscp_fi.close()
+    utt2spk_fi.close()
+    segments_fi.close()
+    rttm_fi.close()
+    reco2num_spk_fi.close()
+    return 0
+
+def main():
+    src_dir = sys.argv[1]
+    data_dir = sys.argv[2]
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+    prepare_dihard_2018_eval(src_dir, data_dir)
+    return 0
+
+if __name__=="__main__":
+    main()
diff --git a/egs/dihard_2018/v1/local/make_dihard_2018_eval.sh b/egs/dihard_2018/v1/local/make_dihard_2018_eval.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Copyright 2018   Zili Huang
+# Apache 2.0.
+#
+# This script, called by ../run.sh, creates the DIHARD 2018 evaluation directory.
+
+if [ $# != 2 ]; then
+  echo "Usage: $0 <path-to-dihard_2018_eval> <path-to-output>"
+  echo " e.g.: $0 /export/corpora/LDC/LDC2018E32v1.1 data/dihard_2018_eval"
+  exit 1
+fi
+
+path_to_dihard_2018_eval=$1
+data_dir=$2
+
+echo "Preparing ${data_dir}..."
+local/make_dihard_2018_eval.py "${path_to_dihard_2018_eval}" "${data_dir}" || exit 1
+
+# Stable-sort rttm by recording id (field 2) and reco2num_spk by its key (field 1).
+sort -k 2,2 -s "${data_dir}/rttm" > "${data_dir}/rttm_tmp" || exit 1
+mv "${data_dir}/rttm_tmp" "${data_dir}/rttm"
+sort -k 1,1 -s "${data_dir}/reco2num_spk" > "${data_dir}/reco2num_spk_tmp" || exit 1
+mv "${data_dir}/reco2num_spk_tmp" "${data_dir}/reco2num_spk"
+utils/fix_data_dir.sh "${data_dir}"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		--vad-energy-threshold=5.5
		--vad-energy-mean-scale=0.5