create_subsets_listening_test.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This script creates 4 subsets of 32 samples from the material in
CHiME-5/eval/listening_test. Constraints are taken into account to
obtain subsets that match with the distribution of CHiME-5/eval/listening_test
in terms of session, location, and gender. This results in balanced subsets.
From Léonie: Les locuteurs sont accessibles dans "words", le nombre de personnes qui parlent
en même temps dans "num_spk" et le lieu dans "location". J'ai aussi ajouté les
temps où x personnes parlent simultanément (0_spk_time, 1_spk_time, etc.) et
le sexe des locuteurs (F, M ou FM).
"""
SEED = 0
import os
import json
import glob
import pandas as pd
import numpy as np
np.random.seed(SEED)
import random
random.seed(SEED)
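# Reproducibility note: `random.shuffle` below uses the seeded `random`
# module, and pandas' `.sample` is additionally passed random_state=SEED.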
import matplotlib.pyplot as plt
from collections import Counter
#%% parameters
# We remove some samples for various reasons: they do not contain enough
# audible speech or enough words, they contain almost only laughter, the
# microphone is being manipulated, or the speaker is chewing heavily.
df_annot = pd.read_csv('metadata/samples_manual_annotations.csv')
# keep only the basename of each wav file path
df_annot['wavfile'] = df_annot['wavfile'].apply(os.path.basename)
samples_to_ignore = list(df_annot[(df_annot['rating']=='C') | (df_annot['rating']=='D')]['wavfile'])
# number of subsets
num_subsets = 4
# target number of samples (elements) in each subset
############################################################
### SETTING 33 BUT THE RESULT WILL BE 32
# The handling of the sampling proportion for each condition
# should be improved to address this issue.
target_numel_per_subset = 33
#############################################################
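# (The likely mechanism: each condition's quota is int(np.rint(target *
# proportion)), and quotas are decremented when a condition runs short of
# samples, so the per-subset totals come out at 32 rather than 33.)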
VERBOSE = True
#%% dataframe with all samples
# path to the json files for the listening test
json_path = '/data/recherche/python/UDASE-CHiME2023/unlabeled_data/json_files/eval'
json_file_list = glob.glob(os.path.join(json_path, '*listening_test.json'))
json_file_list.sort()
list_columns_df = ['wavfile', 'duration', 'duration_0_spk',
                   'duration_1_spk', 'duration_2_spk',
                   'duration_3_spk', 'n_spk', 'speakers', 'sex',
                   'session', 'location', 'transcription']
# dataframe
df = pd.DataFrame(columns=list_columns_df)
for file in json_file_list:

    head, tail = os.path.split(file)
    session = tail[:3]  # session ID
    ref_spk = tail[4:7]  # reference speaker
    num_active_spk = tail[8:-5]  # subfolder where the audio is saved (0, 1, 2, 3 or listening_test)
    split = os.path.basename(head)  # train, dev or eval
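    # Note: the slicing above assumes file names of the form
    # 'SXX_PXX_<subfolder>.json' (e.g. 'S01_P01_listening_test.json');
    # this layout is inferred from the indices used, not checked here.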
    with open(file) as f:
        data = json.load(f)
    for mix in data:

        wavfile = session + '_' + ref_spk + '_' + mix['mix'] + '_output.wav'

        if wavfile in samples_to_ignore:
            continue

        duration = '{:.2f}'.format(float(mix['duration']))
        duration_0_spk = '{:.2f}'.format(float(mix['0_spk_time']))
        duration_1_spk = '{:.2f}'.format(float(mix['1_spk_time']))
        duration_2_spk = '{:.2f}'.format(float(mix['2_spk_time']))
        duration_3_spk = '{:.2f}'.format(float(mix['3_spk_time']))
        n_spk = int(mix['num_spk'])
        transcription = mix['words']
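        # Speaker IDs are assumed to appear in the transcription as
        # 5-character tags such as '(P01)'; the 'PXX' part is collected.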
        speakers = []
        for w in transcription.split(' '):
            if len(w) == 5 and w[0] == '(' and w[1] == 'P' and w[-1] == ')':
                speakers.append(w[1:4])
        speakers = sorted(set(speakers))
        speakers_list = ' '.join(speakers)
        sex = mix['sex']
        location = mix['location']

        row = [wavfile, duration, duration_0_spk, duration_1_spk,
               duration_2_spk, duration_3_spk, n_spk, speakers_list, sex,
               session, location, transcription]
        df.loc[len(df)] = row
df = df.sort_values(by=['wavfile'])
df.to_csv('metadata/samples_all.csv')
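# Note: to_csv also writes the pandas index as an unnamed first column; this
# is assumed to be the expected format for downstream readers of the CSV.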
#%% split dataset into balanced subsets
# list of effects (sessions, locations, sex)
list_session = list(df['session'].unique())
list_sex = list(df['sex'].unique())
list_location = list(df['location'].unique())
# distribution of the effects in the original complete dataset and in the subsets
distribution_labels = []
distributions = {'original': [],
                 'subset_1': [],
                 'subset_2': [],
                 'subset_3': [],
                 'subset_4': []}
# list of dataframe for each subset
df_subsets = []
for i in range(num_subsets):
    df_subsets.append(pd.DataFrame(columns=list_columns_df))
# total number of samples in the original dataset
total_numel = len(df)
for session in list_session:
    for location in list_location:
        for sex in list_sex:

            # label for this combination of effects
            label = session + '_' + location + '_' + sex
            distribution_labels.append(label)

            # select the samples in the original dataset that correspond
            # to this combination of effects
            sub_df = df[(df['session']==session) &
                        (df['location']==location) &
                        (df['sex']==sex)]

            # total number of samples for this combination of effects in
            # the original dataset
            numel = len(sub_df)
            distributions['original'].append(round(numel/total_numel*100))

            # proportion of samples in the original dataset for this
            # combination of effects
            proportion = numel/total_numel

            # number of samples to draw for each subset
            numel_to_sample = int(np.rint(target_numel_per_subset*proportion))

            # if the total number of samples (over all subsets) would exceed
            # the number of samples available in the original dataset,
            # decrement the per-subset quota
            if len(sub_df) < numel_to_sample*num_subsets and numel_to_sample > 1:
                numel_to_sample -= 1

            # if we have enough samples in the original dataset
            if len(sub_df) >= numel_to_sample*num_subsets:
                # list of indices from 0 to num_subsets-1, shuffled
                subset_indices = list(range(num_subsets))
                random.shuffle(subset_indices)
                # equivalent to picking a subset at random
                for i in subset_indices:
                    # draw numel_to_sample samples that fit this combination
                    # of effects from the remaining pool
                    df_subset = sub_df.sample(numel_to_sample, random_state=SEED)
                    # add these samples to the subset
                    df_subsets[i] = pd.concat([df_subsets[i], df_subset])
                    # remove the samples from the pool of available samples
                    sub_df = sub_df.drop(df_subset.index)
                    if VERBOSE:
                        print(len(sub_df))
            else:
                print('not enough samples for ' + label +
                      ': required %d, available %d'
                      % (numel_to_sample*num_subsets, len(sub_df)))

            if VERBOSE:
                print(label)
                print('... all: %d' % numel)

            for i, df_subset in enumerate(df_subsets):
                numel_subset = len(df_subset[(df_subset['session']==session) &
                                             (df_subset['location']==location) &
                                             (df_subset['sex']==sex)])
                distributions['subset_'+str(i+1)].append(numel_subset)
                if VERBOSE:
                    print('... subset %d: %d' % (i+1, numel_subset))
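# quick sanity check (illustrative): each subset should now contain 32
# samples; see the note above on target_numel_per_subset
if VERBOSE:
    for i, df_subset in enumerate(df_subsets):
        print('subset %d: %d samples' % (i+1, len(df_subset)))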
for i, df_subset in enumerate(df_subsets):
    distributions['subset_'+str(i+1)] = [round(x/len(df_subset)*100)
                                         for x in distributions['subset_'+str(i+1)]]
    df_subset.to_csv('metadata/samples_subset_'+str(i+1)+'.csv')
#%% check dataframes are different
for i in range(len(df_subsets)):
    for j in range(len(df_subsets)):
        if j == i:
            continue
        assert not np.any(df_subsets[i].isin(df_subsets[j]))
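# Note: DataFrame.isin with a DataFrame argument compares values aligned on
# both index and columns, so with disjoint indices this assertion is a weak
# check; the wavfile-based comparison below is the decisive disjointness test.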
def common_member(a, b):
    # return True if the two iterables share at least one element
    a_set = set(a)
    b_set = set(b)
    return bool(a_set & b_set)
stimuli_subsets = []
for i, df_subset in enumerate(df_subsets):
    stimuli_subset = sorted(df_subset['wavfile'].unique())
    stimuli_subsets.append(stimuli_subset)
for i in range(len(stimuli_subsets)):
    for j in range(len(stimuli_subsets)):
        if j == i:
            continue
        assert not common_member(stimuli_subsets[i], stimuli_subsets[j])
#%% plot distributions
plt.close('all')
fig, axs = plt.subplots(5, 1)
for i, label in enumerate(list(distributions.keys())):
    axs[i].bar(distribution_labels, distributions[label])
    axs[i].set_title(label)
    axs[i].set_ylabel('prop. (%)')
    if i < len(list(distributions.keys())) - 1:
        axs[i].set_xticks([])
plt.xticks(rotation=45)
fig.set_figheight(7)
fig.set_figwidth(10)
plt.tight_layout()
plt.savefig('data_splitting.png')
#%% count conditions
data_labels = list(distributions.keys())
data = [df] + df_subsets
for i, label in enumerate(data_labels):
    print(label + ' - %d samples' % len(data[i]))
    c_session = Counter(data[i].session)
    c_location = Counter(data[i].location)
    c_sex = Counter(data[i].sex)
    cs = [c_session, c_location, c_sex]
    for c in cs:
        print('...', end='')
        print(dict(c))