Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(WIP) Adding DIHARD 2018 recipe. #2822

Merged
merged 9 commits into from
Nov 13, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion egs/callhome_diarization/v1/diarization/cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ stage=0
nj=10
cleanup=true
threshold=0.5
rttm_channel=0
read_costs=false
reco2num_spk=
# End configuration section.
Expand All @@ -35,6 +36,8 @@ if [ $# != 2 ]; then
echo " --threshold <threshold|0> # Cluster stopping criterion. Clusters with scores greater"
echo " # than this value will be merged until all clusters"
echo " # exceed this value."
echo " --rttm-channel <rttm-channel|0> # The value passed into the RTTM channel field. Only affects"
echo " # the format of the RTTM file."
echo " --read-costs <read-costs|false> # If true, interpret input scores as costs, i.e. similarity"
echo " # is indicated by smaller values. If enabled, clusters will"
echo " # be merged until all cluster scores are less than the"
Expand Down Expand Up @@ -86,7 +89,7 @@ fi

if [ $stage -le 2 ]; then
echo "$0: computing RTTM"
diarization/make_rttm.py $srcdir/segments $dir/labels $dir/rttm || exit 1;
diarization/make_rttm.py --rttm-channel $rttm_channel $srcdir/segments $dir/labels $dir/rttm || exit 1;
fi

if $cleanup ; then
Expand Down
16 changes: 15 additions & 1 deletion egs/callhome_diarization/v1/diarization/extract_ivectors.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
posterior_scale=1.0 # This scale helps to control for successive features being highly
# correlated. E.g. try 0.1 or 0.3.
apply_cmn=true # If true, apply sliding window cepstral mean normalization
apply_deltas=true # If true, copy the delta options from the i-vector extractor directory.
# If false, we won't add deltas in this step. For speaker diarization,
# we sometimes need to write features to disk that already have various
# post-processing applied so adding deltas is no longer needed in this stage.
# End configuration section.

echo "$0 $@" # Print the command line for logging
Expand Down Expand Up @@ -57,6 +61,12 @@ if [ $# != 3 ]; then
echo " --min-post <min-post|0.025> # Pruning threshold for posteriors"
echo " --apply-cmn <true,false|true> # if true, apply sliding window cepstral mean"
echo " # normalization to features"
echo " --apply-deltas <true,false|true> # If true, copy the delta options from the i-vector"
echo " # extractor directory. If false, we won't add deltas"
echo " # in this step. For speaker diarization, we sometimes"
echo " # need to write features to disk that already have"
echo " # various post-processing applied so adding deltas is"
echo " # no longer needed in this stage."
exit 1;
fi

Expand Down Expand Up @@ -95,7 +105,11 @@ mkdir -p $dir/log
sub_sdata=$sub_data/split$nj;
utils/split_data.sh $sub_data $nj || exit 1;

delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
if $apply_deltas; then
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
else
delta_opts="--delta-order=0"
fi

## Set up features.
if $apply_cmn; then
Expand Down
7 changes: 5 additions & 2 deletions egs/callhome_diarization/v1/diarization/make_rttm.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ def get_args():
help="Input labels file")
parser.add_argument("rttm_file", type=str,
help="Output RTTM file")
parser.add_argument("--rttm-channel", type=int, default=0,
help="The value passed into the RTTM channel field. \
Only affects the format of the RTTM file.")

args = parser.parse_args()
return args
Expand Down Expand Up @@ -120,8 +123,8 @@ def main():
reco = segs[0]
for i in range(1, len(segs)):
start, end, label = segs[i].strip().split(',')
print("SPEAKER {0} 0 {1:7.3f} {2:7.3f} <NA> <NA> {3} <NA> <NA>".format(
reco, float(start), float(end)-float(start), label), file=rttm_writer)
print("SPEAKER {0} {1} {2:7.3f} {3:7.3f} <NA> <NA> {4} <NA> <NA>".format(
reco, args.rttm_channel, float(start), float(end)-float(start), label), file=rttm_writer)

if __name__ == '__main__':
main()
14 changes: 14 additions & 0 deletions egs/dihard_2018/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

This is a Kaldi recipe for The First DIHARD Speech Diarization Challenge.
DIHARD is a new annual challenge focusing on “hard” diarization; that is,
speech diarization for challenging corpora where there is an expectation that
the current state-of-the-art will fare poorly, including, but not limited
to: clinical interviews, extended child language acquisition recordings,
YouTube videos and “speech in the wild” (e.g., recordings in restaurants).
See https://coml.lscp.ens.fr/dihard/index.html for details.

The subdirectories "v1" and so on are different speaker diarization
recipes. The recipe in v1 demonstrates a standard approach using a
full-covariance GMM-UBM, i-vectors, PLDA scoring and agglomerative
hierarchical clustering. The example in v2 demonstrates DNN speaker
embeddings, PLDA scoring and agglomerative hierarchical clustering.
13 changes: 13 additions & 0 deletions egs/dihard_2018/v1/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
This recipe is the speaker diarization recipe for The First DIHARD Speech
Diarization Challenge (DIHARD 2018). There are two tracks in the DIHARD 2018
competition: one uses oracle SAD (track1) and the other requires that SAD
be performed from scratch (track2). This script is for track1.

The recipe is closely based on the following paper:
http://www.danielpovey.com/files/2018_interspeech_dihard.pdf but doesn't
contain the VB refinement. The whole system mainly contains full-covariance
GMM-UBM, i-vector extractor (T-matrix), PLDA scoring and agglomerative
hierarchical clustering. The VoxCeleb datasets are used for training i-vectors
and PLDA. The development set of the DIHARD 2018 competition is used as
validation set to tune parameters. The system is tested on the DIHARD 2018
evaluation set.
15 changes: 15 additions & 0 deletions egs/dihard_2018/v1/cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="queue.pl"


7 changes: 7 additions & 0 deletions egs/dihard_2018/v1/conf/mfcc.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
--sample-frequency=16000
--frame-length=25 # the default is 25
--low-freq=20 # the default.
--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case).
--num-mel-bins=30
--num-ceps=24
--snip-edges=false
2 changes: 2 additions & 0 deletions egs/dihard_2018/v1/conf/vad.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
--vad-energy-threshold=5.5
--vad-energy-mean-scale=0.5
1 change: 1 addition & 0 deletions egs/dihard_2018/v1/diarization
57 changes: 57 additions & 0 deletions egs/dihard_2018/v1/local/make_dihard_2018_dev.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env python3

# This script is called by local/make_dihard_2018_dev.sh, and it creates the
# necessary files for DIHARD 2018 development directory.

import sys, os

def prepare_dihard_2018_dev(src_dir, data_dir):
    """Write the data-directory files for the DIHARD 2018 development set.

    Walks ``src_dir`` recursively looking for ``*.lab`` files. The recording
    id is the file's base name up to its first ".". For every such file the
    following is appended to files inside ``data_dir``:

    * ``segments`` / ``utt2spk``: one line per non-blank line of the .lab
      file. Segment ids are ``<reco>_<4-digit index>`` and the "speaker" in
      utt2spk is the recording id itself.
    * ``wav.scp``: a sox pipeline reading ``<src_dir>/data/flac/<reco>.flac``
      and emitting 16 kHz, 16-bit, single-channel wav on stdout.
    * ``rttm``: the contents of ``<src_dir>/data/rttm/<reco>.rttm``.
    * ``reco2num_spk``: the number of distinct values found in the 8th
      whitespace-separated field of that RTTM file.

    Args:
        src_dir: Root of the DIHARD 2018 development corpus.
        data_dir: Existing output directory; the five files above are
            created (or truncated) inside it.

    Returns:
        0 on completion.

    Raises:
        ValueError: If a non-blank .lab line does not have exactly 3 fields.
        IndexError: If a non-blank RTTM line has fewer than 8 fields.
        IOError/OSError: If an expected input file is missing or the output
            files cannot be written.
    """
    # Context managers guarantee that every output file is flushed and closed
    # even if a malformed input file raises half-way through.
    with open(os.path.join(data_dir, "wav.scp"), 'w') as wavscp_fi, \
            open(os.path.join(data_dir, "utt2spk"), 'w') as utt2spk_fi, \
            open(os.path.join(data_dir, "segments"), 'w') as segments_fi, \
            open(os.path.join(data_dir, "rttm"), 'w') as rttm_fi, \
            open(os.path.join(data_dir, "reco2num_spk"), 'w') as reco2num_spk_fi:
        for subdir, _dirs, files in os.walk(src_dir):
            for name in files:
                if not name.endswith(".lab"):
                    continue
                utt = name.split(".")[0]

                with open(os.path.join(subdir, name), 'r') as lab_fh:
                    lab_lines = lab_fh.readlines()
                segment_id = 0
                for line in lab_lines:
                    fields = line.split()
                    if not fields:
                        # Tolerate blank lines (e.g. a trailing empty line).
                        continue
                    # Exactly three fields are required; the third one (the
                    # label) is not used.
                    start, end, _label = fields
                    segment_id_str = "{}_{}".format(utt, str(segment_id).zfill(4))
                    segments_fi.write("{} {} {} {}\n".format(
                        segment_id_str, utt, start, end))
                    utt2spk_fi.write("{} {}\n".format(segment_id_str, utt))
                    segment_id += 1

                wav_str = "{} sox -t flac {}/data/flac/{}.flac -t wav -r 16k "\
                          "-b 16 - channels 1 |\n".format(utt, src_dir, utt)
                wavscp_fi.write(wav_str)

                # Read the reference RTTM once and use it both for the merged
                # rttm file and for the speaker count.
                rttm_path = os.path.join(src_dir, "data", "rttm", utt + ".rttm")
                with open(rttm_path, 'r') as rttm_fh:
                    rttm_str = rttm_fh.read()
                if rttm_str and not rttm_str.endswith("\n"):
                    # Without this the last line of this recording would be
                    # glued to the first line of the next one.
                    rttm_str += "\n"
                rttm_fi.write(rttm_str)

                speakers = set()
                for rttm_line in rttm_str.split("\n"):
                    rttm_fields = rttm_line.split()
                    if rttm_fields:
                        speakers.add(rttm_fields[7])
                reco2num_spk_fi.write("{} {}\n".format(utt, len(speakers)))
    return 0

def main():
    """Command-line entry point: <path-to-dihard_2018_dev> <path-to-output>.

    Reads the corpus root and the output data directory from ``sys.argv``,
    creates the output directory if needed and writes the data files into it.

    Returns:
        0 on completion.

    Raises:
        SystemExit: With a usage message if fewer than two arguments are given.
    """
    if len(sys.argv) < 3:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("Usage: {} <path-to-dihard_2018_dev> <path-to-output>".format(
            sys.argv[0]))
    src_dir = sys.argv[1]
    data_dir = sys.argv[2]
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(data_dir, exist_ok=True)
    prepare_dihard_2018_dev(src_dir, data_dir)
    return 0

if __name__=="__main__":
main()
22 changes: 22 additions & 0 deletions egs/dihard_2018/v1/local/make_dihard_2018_dev.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Copyright 2018 Zili Huang
# Apache 2.0.
#
# This script, called by ../run.sh, creates the DIHARD 2018 development data directory.

if [ $# != 2 ]; then
  echo "Usage: $0 <path-to-dihard_2018_dev> <path-to-output>"
  echo " e.g.: $0 /export/corpora/LDC/LDC2018E31 data/dihard_2018_dev"
  # Stop here: without both arguments the commands below would run on empty paths.
  exit 1
fi

path_to_dihard_2018_dev=$1
data_dir=$2

echo "Preparing ${data_dir}..."
# Writes wav.scp, utt2spk, segments, rttm and reco2num_spk into ${data_dir}.
local/make_dihard_2018_dev.py "${path_to_dihard_2018_dev}" "${data_dir}" || exit 1

# Stable-sort the merged RTTM by its second field and reco2num_spk by its
# first field, going through a temporary file so the input is not truncated
# while it is still being read.
sort -k 2,2 -s "${data_dir}/rttm" > "${data_dir}/rttm_tmp" || exit 1
mv "${data_dir}/rttm_tmp" "${data_dir}/rttm"
sort -k 1,1 -s "${data_dir}/reco2num_spk" > "${data_dir}/reco2num_spk_tmp" || exit 1
mv "${data_dir}/reco2num_spk_tmp" "${data_dir}/reco2num_spk"
utils/fix_data_dir.sh "${data_dir}"
57 changes: 57 additions & 0 deletions egs/dihard_2018/v1/local/make_dihard_2018_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env python3

# This script is called by local/make_dihard_2018_eval.sh, and it creates the
# necessary files for DIHARD 2018 evaluation directory.

import sys, os

def prepare_dihard_2018_eval(src_dir, data_dir):
    """Write the data-directory files for the DIHARD 2018 evaluation set.

    Walks ``src_dir`` recursively looking for ``*.lab`` files. The recording
    id is the file's base name up to its first ".". For every such file the
    following is appended to files inside ``data_dir``:

    * ``segments`` / ``utt2spk``: one line per non-blank line of the .lab
      file. Segment ids are ``<reco>_<4-digit index>`` and the "speaker" in
      utt2spk is the recording id itself.
    * ``wav.scp``: a sox pipeline reading ``<src_dir>/data/flac/<reco>.flac``
      and emitting 16 kHz, 16-bit, single-channel wav on stdout.
    * ``rttm``: the contents of ``<src_dir>/data/rttm/<reco>.rttm``.
    * ``reco2num_spk``: the number of distinct values found in the 8th
      whitespace-separated field of that RTTM file.

    Args:
        src_dir: Root of the DIHARD 2018 evaluation corpus.
        data_dir: Existing output directory; the five files above are
            created (or truncated) inside it.

    Returns:
        0 on completion.

    Raises:
        ValueError: If a non-blank .lab line does not have exactly 3 fields.
        IndexError: If a non-blank RTTM line has fewer than 8 fields.
        IOError/OSError: If an expected input file is missing or the output
            files cannot be written.
    """
    # Context managers guarantee that every output file is flushed and closed
    # even if a malformed input file raises half-way through.
    with open(os.path.join(data_dir, "wav.scp"), 'w') as wavscp_fi, \
            open(os.path.join(data_dir, "utt2spk"), 'w') as utt2spk_fi, \
            open(os.path.join(data_dir, "segments"), 'w') as segments_fi, \
            open(os.path.join(data_dir, "rttm"), 'w') as rttm_fi, \
            open(os.path.join(data_dir, "reco2num_spk"), 'w') as reco2num_spk_fi:
        for subdir, _dirs, files in os.walk(src_dir):
            for name in files:
                if not name.endswith(".lab"):
                    continue
                utt = name.split(".")[0]

                with open(os.path.join(subdir, name), 'r') as lab_fh:
                    lab_lines = lab_fh.readlines()
                segment_id = 0
                for line in lab_lines:
                    fields = line.split()
                    if not fields:
                        # Tolerate blank lines (e.g. a trailing empty line).
                        continue
                    # Exactly three fields are required; the third one (the
                    # label) is not used.
                    start, end, _label = fields
                    segment_id_str = "{}_{}".format(utt, str(segment_id).zfill(4))
                    segments_fi.write("{} {} {} {}\n".format(
                        segment_id_str, utt, start, end))
                    utt2spk_fi.write("{} {}\n".format(segment_id_str, utt))
                    segment_id += 1

                wav_str = "{} sox -t flac {}/data/flac/{}.flac -t wav -r 16k "\
                          "-b 16 - channels 1 |\n".format(utt, src_dir, utt)
                wavscp_fi.write(wav_str)

                # Read the reference RTTM once and use it both for the merged
                # rttm file and for the speaker count.
                rttm_path = os.path.join(src_dir, "data", "rttm", utt + ".rttm")
                with open(rttm_path, 'r') as rttm_fh:
                    rttm_str = rttm_fh.read()
                if rttm_str and not rttm_str.endswith("\n"):
                    # Without this the last line of this recording would be
                    # glued to the first line of the next one.
                    rttm_str += "\n"
                rttm_fi.write(rttm_str)

                speakers = set()
                for rttm_line in rttm_str.split("\n"):
                    rttm_fields = rttm_line.split()
                    if rttm_fields:
                        speakers.add(rttm_fields[7])
                reco2num_spk_fi.write("{} {}\n".format(utt, len(speakers)))
    return 0

def main():
    """Command-line entry point: <path-to-dihard_2018_eval> <path-to-output>.

    Reads the corpus root and the output data directory from ``sys.argv``,
    creates the output directory if needed and writes the data files into it.

    Returns:
        0 on completion.

    Raises:
        SystemExit: With a usage message if fewer than two arguments are given.
    """
    if len(sys.argv) < 3:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("Usage: {} <path-to-dihard_2018_eval> <path-to-output>".format(
            sys.argv[0]))
    src_dir = sys.argv[1]
    data_dir = sys.argv[2]
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(data_dir, exist_ok=True)
    prepare_dihard_2018_eval(src_dir, data_dir)
    return 0

if __name__=="__main__":
main()
22 changes: 22 additions & 0 deletions egs/dihard_2018/v1/local/make_dihard_2018_eval.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Copyright 2018 Zili Huang
# Apache 2.0.
#
# This script, called by ../run.sh, creates the DIHARD 2018 evaluation directory.

if [ $# != 2 ]; then
  echo "Usage: $0 <path-to-dihard_2018_eval> <path-to-output>"
  echo " e.g.: $0 /export/corpora/LDC/LDC2018E32v1.1 data/dihard_2018_eval"
  # Stop here: without both arguments the commands below would run on empty paths.
  exit 1
fi

path_to_dihard_2018_eval=$1
data_dir=$2

echo "Preparing ${data_dir}..."
# Writes wav.scp, utt2spk, segments, rttm and reco2num_spk into ${data_dir}.
local/make_dihard_2018_eval.py "${path_to_dihard_2018_eval}" "${data_dir}" || exit 1

# Stable-sort the merged RTTM by its second field and reco2num_spk by its
# first field, going through a temporary file so the input is not truncated
# while it is still being read.
sort -k 2,2 -s "${data_dir}/rttm" > "${data_dir}/rttm_tmp" || exit 1
mv "${data_dir}/rttm_tmp" "${data_dir}/rttm"
sort -k 1,1 -s "${data_dir}/reco2num_spk" > "${data_dir}/reco2num_spk_tmp" || exit 1
mv "${data_dir}/reco2num_spk_tmp" "${data_dir}/reco2num_spk"
utils/fix_data_dir.sh "${data_dir}"
Loading