#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This script generates the config file for each of the 32 subjects using the
subsets created from create_subset_listening_test.py.
"""
SEED = 0
import numpy as np
np.random.seed(SEED)
import pandas as pd
import os
from glob import glob
import random
random.seed(SEED)
import json
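
# note: the random module is seeded above so that random.shuffle below is
# reproducible; numpy is seeded too, although df.sample is later called with
# an explicit random_state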
# folder containing the data
data_dir = 'data'
# list of conditions
conditions = [os.path.basename(x) for x in glob(data_dir + '/C*', recursive=True)]
conditions.sort()
# unprocessed condition
unprocessed_condition = 'C0'
# reference condition
ref_condition = 'ref'
# number of listening sessions
n_sessions = 4
# output directory
config_dir = 'config_files'
if not os.path.isdir(config_dir):
    os.makedirs(config_dir)
#%% create a list that contains the information for each subject
# number of subsets of the complete listening test dataset
n_subsets = 4
# number of subjects associated with each subset
n_subjects_per_subset = 8
# total number of subjects
n_subjects = n_subsets*n_subjects_per_subset
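# i.e. 4 subsets x 8 subjects per subset = 32 subjects in total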
# create a list where each element is a dataframe containing the stimuli
# for a given subset
df_subsets = []
for i in range(n_subsets):
    df_subset = pd.read_csv('metadata/samples_subset_' + str(i+1) + '.csv')
    df_subsets.append(df_subset)
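
# (these CSV files are assumed to have been generated beforehand by
# create_subset_listening_test.py)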
# create a list where each element is a dictionary corresponding to one
# subject. The dictionary contains:
#   subject_id: the id of the subject (integer)
#   subset_id: the id of the subset containing the stimuli for this subject (integer)
#   df_stimuli: the dataframe containing the list of stimuli for subset subset_id
#   df_subject_exp: the dataframe containing the configuration file for the
#       listening experiment of this subject. Each row is a stimulus, with the
#       following columns:
#           subset: the subset id
#           session: the listening session index
#           file: the stimulus path (wav file)
#           first_scale: the first rating scale
subject_data_list = []
# associate each subset with a panel of subjects:
# subjects 1 to 8 --> subset 1, subjects 9 to 16 --> subset 2, etc.
for i in range(n_subjects):
    subset_id = i//n_subjects_per_subset + 1
    subject_data = {'subject_id': i+1,
                    'subset_id': subset_id,
                    'df_stimuli': df_subsets[subset_id-1],
                    'df_subject_exp': pd.DataFrame(columns=['subset', 'session', 'file', 'first_scale'])}
    subject_data_list.append(subject_data)
#%% Fill in df_subject_exp for each subject

for subject_ind in range(n_subjects):

    # dataframe containing the configuration file for the listening
    # experiment of the current subject
    df_subject_exp = subject_data_list[subject_ind]['df_subject_exp']

    # dataframe containing the list of stimuli for the current subset
    df_stimuli = subject_data_list[subject_ind]['df_stimuli']

    # list of stimuli (wav files)
    stimuli = list(df_stimuli['wavfile'])

    # # shuffle
    # random.shuffle(stimuli)

    # loop over stimuli
    for stimulus in stimuli:
        # loop over conditions
        for cond in conditions:
            # path to the stimulus for the current condition
            file = os.path.join(data_dir, cond, stimulus)
            # sanity check: all stimuli are expected to be named *_output.wav
            assert stimulus.split('_')[-1] == 'output.wav'
            # add row to dataframe; session and first_scale are not filled in
            # for the moment ('x' is a placeholder)
            df_subject_exp.loc[len(df_subject_exp)] = [subject_data_list[subject_ind]['subset_id'], 'x', file, 'x']
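
    # at this point, df_subject_exp contains len(stimuli) * len(conditions)
    # rows, one per (stimulus, condition) pair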

    # shuffle rows
    df_subject_exp = df_subject_exp.sample(frac=1, random_state=subject_ind, ignore_index=True)

    # fill in session and first_scale columns
    n_samples_per_session = len(df_subject_exp)//n_sessions
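    # note: this assumes len(df_subject_exp) is a multiple of n_sessions;
    # otherwise the last few rows would be assigned to an extra session
    # n_sessions + 1 below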
    for index, row in df_subject_exp.iterrows():
        # session index
        session_ind = index//n_samples_per_session + 1
        # balance the presentation order of the SIG and BAK scales
        if (row.subset == 1 or row.subset == 2) and (session_ind == 1 or session_ind == 2):
            first_scale = 'SIG'
        elif (row.subset == 1 or row.subset == 2) and (session_ind == 3 or session_ind == 4):
            first_scale = 'BAK'
        elif (row.subset == 3 or row.subset == 4) and (session_ind == 1 or session_ind == 2):
            first_scale = 'BAK'
        elif (row.subset == 3 or row.subset == 4) and (session_ind == 3 or session_ind == 4):
            first_scale = 'SIG'
        df_subject_exp.at[index, 'session'] = session_ind
        df_subject_exp.at[index, 'first_scale'] = first_scale
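
    # resulting balancing scheme:
    #   subsets 1-2: SIG rated first in sessions 1-2, BAK first in sessions 3-4
    #   subsets 3-4: BAK rated first in sessions 1-2, SIG first in sessions 3-4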

    #%% dataframe for the reference conditions

    subset = df_subject_exp.subset.iloc[0]
    session = 0

    # init dataframe
    df_subject_ref = pd.DataFrame(columns=['subset', 'session', 'file', 'first_scale'])

    # list of wav files for the reference conditions
    ref_file_list = glob(os.path.join(data_dir, ref_condition, '*.wav'))
    ref_file_list.sort()

    # shuffle file list
    random.shuffle(ref_file_list)

    # assign first scale
    if (subset == 1 or subset == 2):
        first_scale = 'SIG'
    else:
        first_scale = 'BAK'

    for i, file in enumerate(ref_file_list):
        # switch the first scale halfway through the list to balance SIG and BAK
        if i == len(ref_file_list)//2:
            if first_scale == 'SIG':
                first_scale = 'BAK'
            else:
                first_scale = 'SIG'
        df_subject_ref.loc[len(df_subject_ref)] = [subset, session, file, first_scale]
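
    # note: reference items are labelled session 0, which distinguishes them
    # from the four experimental sessions filled in above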

    #%% complete dataframe with all rating scales

    # concatenate reference and experimental conditions
    df_subject = pd.concat([df_subject_ref, df_subject_exp])

    # init complete dataframe
    df_subject_all_scales = pd.DataFrame(columns=['subset', 'session', 'file', 'scale'])

    # fill in dataframe: each stimulus yields three rows, one per rating
    # scale, with SIG and BAK ordered according to first_scale
    for index, row in df_subject.iterrows():
        if row.first_scale == 'SIG':
            scale_list = ['SIG', 'BAK', 'OVRL']
        else:
            scale_list = ['BAK', 'SIG', 'OVRL']
        [subset, session, file, _] = row
        for scale in scale_list:
            new_row = [subset, session, file, scale]
            df_subject_all_scales.loc[len(df_subject_all_scales)] = new_row
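
    # df_subject_all_scales now has 3 * len(df_subject) rows: one SIG, one BAK
    # and one OVRL rating per stimulus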

    #%% save to csv and json

    output_dir = os.path.join(config_dir, 'csv')
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    df_subject_all_scales.to_csv(os.path.join(output_dir, 'subject_' + str(subject_ind+1) + '.csv'))

    output_dir = os.path.join(config_dir, 'json')
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    with open(os.path.join(output_dir, 'subject_' + str(subject_ind+1) + '.json'), 'w') as f:
        json.dump(df_subject_all_scales.to_dict('records'), f, indent=1)
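
    # each JSON record then has the form (illustrative values only):
    # {"subset": 1, "session": 0, "file": "data/ref/example.wav", "scale": "SIG"}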

#%%
# """Add a variable with the id of the subject at the beginning of the JSON.
# This was requested by Matthieu and is used to read the file in JavaScript."""
# input_path = os.path.join(config_dir, 'json')
# output_path = os.path.join(config_dir, 'modified_json')
# if not os.path.isdir(output_path):
#     os.mkdir(output_path)
# file_list = glob(os.path.join(input_path, '*.json'))
# file_list.sort()
# for file in file_list:
#     basename = os.path.basename(file)
#     with open(file, 'r') as f:
#         lines = f.readlines()
#     # lines[0] = basename[:-5] + ' = ' + lines[0]
#     lines[0] = 'all_trials' + ' = ' + lines[0]
#     lines[-1] = lines[-1] + ';'
#     new_file = os.path.join(output_path, basename)
#     with open(new_file, 'w') as f:
#         f.writelines(lines)