-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathcreate_phases.py
125 lines (95 loc) · 5.28 KB
/
create_phases.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import os
from scipy.io.wavfile import read
import subprocess as sp
import numpy as np
import argparse
import random
# Required directories
dir_text = os.path.expanduser('~/Downloads/voxceleb_data/voxceleb1_txt')
dir_audio = os.path.expanduser('~/Downloads/voxceleb_data/voxceleb1_audio')
root_path = os.path.expanduser('~/Downloads/voxceleb')
development_path = os.path.expanduser('~/Downloads/voxceleb_data/voxceleb1_development.txt')
enrollment_path = os.path.expanduser('~/Downloads/voxceleb_data/voxceleb1_enrollment.txt')
evaluation_path = os.path.expanduser('~/Downloads/voxceleb_data/voxceleb1_evaluation.txt')
parser = argparse.ArgumentParser(description='Extracting the utterances of POIs from OXFORD VOXCELEB dataset')
parser.add_argument('--root_path', default=root_path, help='The directory which the whole data files are available')
parser.add_argument('--dir_audio', default=dir_audio, help='The directory which the output audio files will be stored')
parser.add_argument('--dir_text', default=dir_text, help='The directory which the annotations exist')
parser.add_argument('--development_path', default=development_path, help='The list of files for developments phase')
parser.add_argument('--enrollment_path', default=enrollment_path, help='The list of files for enrollment phase')
parser.add_argument('--evaluation_path', default=evaluation_path, help='The list of files for evaluation phase')
args = parser.parse_args()
def extract():
file_development = open(args.development_path, 'w')
file_enrollment = open(args.enrollment_path, 'w')
file_evaluation = open(args.evaluation_path, 'w')
# This loop just check for file compatibility in which all files should be of type .wav!
# It's not a requirement but it's just for convenience.
for root, dirs, files in os.walk(args.dir_audio, topdown=False):
# walk through the directory
for name in files:
# Check all files to be of format .txt
# Get each file path
file_path = os.path.join(root, name)
assert os.path.splitext(file_path)[1] == '.wav', "file extension is not .txt: %s" % file_path
# Get the list of subfolders (IDs) & check for compatibility
list_text_IDs = [os.path.basename(x[0]) for x in os.walk(args.dir_text)]
list_text_IDs = list_text_IDs[1:]
list_audio_IDs = [os.path.basename(x[0]) for x in os.walk(args.dir_audio)]
list_audio_IDs = list_audio_IDs[1:]
assert len(list(set(list_audio_IDs) - set(
list_text_IDs))) == 0, "Please check for extra redundant created folders (possibly empty folders)"
# For development phase we follow the same procedure used in the paper of "VoxCeleb: a large-scale speaker identification dataset"
# All POIs that their name starts with 'E' will be excluded from development
list_IDs_development = []
list_IDs_enrollmentandevaluation = []
for item in list_audio_IDs:
if item.startswith('E'):
list_IDs_enrollmentandevaluation.append(item)
else:
list_IDs_development.append(item)
print("Number of POIs for enrollment & evaluation: %d \nNumber of POIs for development: %d" % (
len(list_IDs_enrollmentandevaluation), len(list_IDs_development)))
#########################
#### Pair Generation ####
#########################
for i,ID in enumerate(list_IDs_development):
###########################
#### Development Phase ####
###########################
# Get the full path to the folder of POI
POI_files_path = os.path.join(args.dir_audio, ID)
# Extract all the files and choose a portion of them at random.
files = os.listdir(POI_files_path)
files_development = random.sample(files, int(len(files) / 4.0))
for file in files_development:
# Create the text and the associated label for generating the text file for the genuine training pair.
string_of_file = str(i) + ' ' + os.path.join(ID, file)
# Write to file
file_development.write(string_of_file + '\n')
print("Writing genuine pair : %s" % string_of_file)
###################################
#### Genuine impostor creation ####
###################################
for i, ID in enumerate(list_IDs_enrollmentandevaluation):
# Get the full path to the folder of POI
POI_files_path = os.path.join(args.dir_audio, ID)
# Extract all the files.
files = os.listdir(POI_files_path)
# Create enrollment and evaluation.
files_enrollment = random.sample(files, int(len(files)/2.0))
files_evaluation = [x for x in files if x not in files_enrollment]
for file in files_enrollment:
# Create the text and the associated label.
string_of_file = str(i) + ' ' + os.path.join(ID, file)
# Write to file
file_enrollment.write(string_of_file + '\n')
print("Writing enrollment files : %s" % string_of_file)
for file in files_evaluation:
# Create the text and the associated label.
string_of_file = str(i) + ' ' + os.path.join(ID, file)
# Write to file
file_evaluation.write(string_of_file + '\n')
print("Writing evaluation files : %s" % string_of_file)
if __name__ == '__main__':
extract()