evaluate_models.py
"""
Script to evaluate the models existing on the './models' folder
with the model.wv.evaluate_word_pairs('./data/wordsim353.en.ca.txt').
After evaluating the models, some plots will be generated and saved to the './results/plots' directory.
"""
import os
import pandas as pd
from gensim.models import Word2Vec, FastText
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from functions import calculate_full_dataset_size
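# Expected on-disk layout (inferred from the paths used throughout this script):
#   ./models/word2vec/*.model, ./models/fasttext/*.model           -> trained models
#   ./data/wordsim353.en.ca.txt, ./data/wordsim353.en.ca_caps.txt  -> word-pair test sets
#   ./results/, ./results/plots/                                   -> output locations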
################################################################################
# Function to evaluate the models
def evaluate_models(save: bool=True) -> pd.DataFrame:
"""
Evaluate the models existing on the './models' folder with the model.wv.evaluate_word_pairs('./data/wordsim353.en.ca.txt').
Parameters
----------
save : bool
Whether to save the results to a CSV file. The default is True.
Returns
-------
results_df : pd.DataFrame
The results of the evaluation in a DataFrame sorted by the 'Avg. Statistic' column.
"""
    # Get the list of files ending with '.model' in the './models' folder
    w2v_path_list = [os.path.join('./models/word2vec/', f) for f in os.listdir('./models/word2vec/') if f.endswith('.model')]
    ft_path_list = [os.path.join('./models/fasttext/', f) for f in os.listdir('./models/fasttext/') if f.endswith('.model')]
    num_w2v_models = len(w2v_path_list)
    num_ft_models = len(ft_path_list)
    num_total_models = num_w2v_models + num_ft_models
    print(f"Found {num_total_models} models (W2V: {num_w2v_models}, FT: {num_ft_models})")
    # Paths to the word-similarity test sets (the '_caps' variant preserves capitalization)
    test_path = './data/wordsim353.en.ca.txt'
    caps_test_path = './data/wordsim353.en.ca_caps.txt'
    # Store the results in a dictionary, keyed by model path
    results_dict: dict[str, dict[str, str | float | None]] = {
        model_path: {
            'Model': os.path.basename(model_path),
            'Avg. Statistic': None,
            'Pearson': None,
            'Pearson p-value': None,
            'Spearman': None,
            'Spearman p-value': None,
            'OOV Perc.': None
        } for model_path in w2v_path_list + ft_path_list
    }
    counter = 0
    # Word2Vec and FastText expose the same evaluation API, so both families are
    # handled by the same loop. Models trained on capitalized text ('caps' in the
    # filename) are evaluated with the capitalization-preserving test set.
    for model_class, path_list in ((Word2Vec, w2v_path_list), (FastText, ft_path_list)):
        for model_path in path_list:
            eval_path = caps_test_path if 'caps' in model_path else test_path
            model = model_class.load(model_path)
            # evaluate_word_pairs returns ((pearson, p-value), (spearman, p-value), OOV ratio)
            (pearson, p_pearson), (spearman, p_spearman), oov = model.wv.evaluate_word_pairs(eval_path)
            avg_statistic = (pearson + spearman) / 2
            results_dict[model_path] = {
                'Model': os.path.basename(model_path),
                'Avg. Statistic': avg_statistic,
                'Pearson': pearson,
                'Pearson p-value': p_pearson,
                'Spearman': spearman,
                'Spearman p-value': p_spearman,
                'OOV Perc.': oov
            }
            counter += 1
            print(f"Evaluated {counter}/{num_total_models} models with '{eval_path}'.", end='\r')
    print()
    # Build the results DataFrame (one row per model)
    results_df = pd.DataFrame(results_dict).T
    # Cast the metric columns to float (they come out as object dtype from the dict of dicts)
    metric_cols = ['Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.', 'Pearson p-value', 'Spearman p-value']
    results_df[metric_cols] = results_df[metric_cols].astype(float)
    # Reorder the columns and sort the values by the 'Avg. Statistic' column
    results_df = results_df[['Model', 'Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.', 'Pearson p-value', 'Spearman p-value']]
    results_df = results_df.sort_values(by='Avg. Statistic', ascending=False)
    # Save the results to a CSV file
    if save:
        os.makedirs('./results', exist_ok=True)
        results_df.to_csv('./results/evaluation_results.csv', index=False)
        print("Results saved to './results/evaluation_results.csv'.")
    return results_df
def filter_results(df: pd.DataFrame, neg_keyword: str|None=None, pos_keyword: str|None=None) -> pd.DataFrame:
"""
Filter the evaluation results by the negative and positive keywords.
Parameters
----------
df : pd.DataFrame
The DataFrame with the evaluation results to filter.
neg_keyword : str
The negative keyword to filter the results by. All models containing this keyword will be removed.
pos_keyword : str
The positive keyword to filter the results by. All models not containing this keyword will be removed.
Only one of the keywords should be used at a time.
Returns
-------
results_filtered_df : pd.DataFrame
The filtered DataFrame with the evaluation results.
"""
    assert (neg_keyword is None) != (pos_keyword is None), "Exactly one of the keywords must be given."
    df = df.copy()
    # Test against None (not truthiness) so an empty-string keyword cannot leave the result unbound
    if pos_keyword is not None:
        results_filtered_df = df[df['Model'].str.contains(pos_keyword)].copy()
    else:
        results_filtered_df = df[~df['Model'].str.contains(neg_keyword)].copy()
    return results_filtered_df
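# Illustrative usage of filter_results (keyword values taken from the filters
# applied in the main script below):
#   results_df = filter_results(results_df, neg_keyword='caps')   # drop case-sensitive models
#   results_df = filter_results(results_df, pos_keyword='win10')  # keep only 'win10' models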
################################################################################
# Functions to generate the plots
# Calculate the size of the full dataset in MB
print("Calculating the size of the full dataset in MB to adjust the scale of the plots...")
FULL_DATA_SIZE_MB = int(round(calculate_full_dataset_size(), 0))
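# FULL_DATA_SIZE_MB stands in for the 'max' suffix in model names, so that models
# trained on the full dataset get a numeric position on the size plots' x-axis.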
def generate_model_barplot(results_df: pd.DataFrame, metric: str, family: str='all') -> None:
"""
Generate a barplot of the models evaluated for the given metric.
Parameters
----------
results_df : pd.DataFrame
The DataFrame containing the results of the evaluation.
metric : str
The metric to generate the barplot for. It can be one of ['Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.'].
family : str, optional
The family of models to generate the barplot for. It can be one of ['all', 'w2v', 'ft']. The default is 'all'.
"""
    assert metric in ['Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.'], f"Invalid metric '{metric}'. Must be one of ['Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.']."
    assert family in ['all', 'w2v', 'ft'], f"Invalid family '{family}'. Must be one of ['all', 'w2v', 'ft']."
    # Filter the dataframe by the family of models
    if family == 'w2v':
        results_filtered_df = results_df[results_df['Model'].str.startswith('w2v')].copy()
    elif family == 'ft':
        results_filtered_df = results_df[results_df['Model'].str.startswith('ft')].copy()
    else:
        results_filtered_df = results_df.copy()
    # Sort the values by the metric (descending order, except for 'OOV Perc.', where lower is better)
    results_filtered_df = results_filtered_df.sort_values(by=metric, ascending=(metric == 'OOV Perc.'))
    # Remove the '.model' extension from the model names
    results_filtered_df['Model'] = results_filtered_df['Model'].apply(lambda x: x.removesuffix('.model'))
    # Generate the barplot
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Model', y=metric, data=results_filtered_df)
    plt.xticks(rotation=45, ha='right')
    plt.xlabel('Model')
    plt.ylabel(metric)
    plt.title(f'{metric} for each model ({family})')
    plt.tight_layout()
    plt.savefig(f'./results/plots/bp_model_{family}_{metric.lower().replace(".", "").replace(" ", "_")}.png')
    plt.close()
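# Illustrative call (metric and family values as validated by the asserts above):
#   generate_model_barplot(results_df, 'Spearman', family='w2v')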
def generate_tokenizer_barplot(results_df: pd.DataFrame, metric: str, family: str) -> None:
"""
Generate a barplot of the average of the given metric for each tokenizer.
Parameters
----------
results_df : pd.DataFrame
The DataFrame containing the results of the evaluation.
metric : str
The metric to generate the barplot for. It can be one of ['Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.'].
family : str
The family of models to generate the barplot for. It can be one of ['all', 'w2v', 'ft'].
"""
    assert metric in ['Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.'], f"Invalid metric '{metric}'. Must be one of ['Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.']."
    assert family in ['all', 'w2v', 'ft'], f"Invalid family '{family}'. Must be one of ['all', 'w2v', 'ft']."
    # Filter the dataframe by the family of models
    if family == 'w2v':
        results_filtered_df = results_df[results_df['Model'].str.startswith('w2v')].copy()
    elif family == 'ft':
        results_filtered_df = results_df[results_df['Model'].str.startswith('ft')].copy()
    else:
        results_filtered_df = results_df.copy()
    # Extract the tokenizer ('spacy' or 'nltk') from the model name
    results_filtered_df['Tokenizer'] = results_filtered_df['Model'].apply(lambda x: 'spacy' if 'spacy' in x else ('nltk' if 'nltk' in x else None))
    # Group by the tokenizer and calculate the average of the metric, ignoring the other columns
    average_tokenizer_df = results_filtered_df[['Tokenizer', metric]].groupby('Tokenizer').mean().reset_index()
    # Generate the barplot
    plt.figure(figsize=(8, 6))
    sns.barplot(x='Tokenizer', y=metric, data=average_tokenizer_df)
    plt.xticks(rotation=45, ha='right')
    plt.xlabel('Tokenizer')
    plt.ylabel(metric)
    plt.title(f'{metric} for each tokenizer ({family})')
    plt.tight_layout()
    plt.savefig(f'./results/plots/bp_tokenizer_{family}_{metric.lower().replace(".", "").replace(" ", "_")}.png')
    plt.close()
def generate_size_plot(results_df: pd.DataFrame, metric: str, family: str) -> None:
"""
Generate a plot with the evolution of the metric for each dataset size.
Parameters
----------
results_df : pd.DataFrame
The DataFrame with the evaluation results of the models.
metric : str
The metric to generate the barplot for. It can be one of ['Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.'].
family : str
The family of models to generate the barplot for. It can be one of ['all', 'w2v', 'ft'].
"""
    assert metric in ['Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.'], f"Invalid metric '{metric}'. Must be one of ['Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.']."
    assert family in ['all', 'w2v', 'ft'], f"Invalid family '{family}'. Must be one of ['all', 'w2v', 'ft']."
    # Filter the dataframe by the family of models
    if family == 'w2v':
        results_filtered_df = results_df[results_df['Model'].str.startswith('w2v')].copy()
    elif family == 'ft':
        results_filtered_df = results_df[results_df['Model'].str.startswith('ft')].copy()
    else:
        results_filtered_df = results_df.copy()
    # Extract the training-data size (in MB) from the model name; 'max' means the full dataset
    results_filtered_df['Size'] = results_filtered_df['Model'].apply(lambda x: x.split('_')[-1].split('.')[0].removesuffix('mb'))
    results_filtered_df['Size'] = results_filtered_df['Size'].apply(lambda x: int(x) if x != 'max' else FULL_DATA_SIZE_MB)
    # Drop the size suffix from the model name so that runs of the same model group into one line
    results_filtered_df['Model'] = results_filtered_df['Model'].apply(lambda x: '_'.join(x.split('_')[:-1]))
    # Define the ticks and labels for the x-axis
    ticks = results_filtered_df['Size'].unique()
    ticks_labels = []
    for s in ticks:
        if s == FULL_DATA_SIZE_MB:
            ticks_labels.append(f'{s}MB (max)')
        else:
            ticks_labels.append(f'{s}MB')
    # Plot the evolution of the metric over dataset size, one line per model
    plt.figure(figsize=(12, 8))
    sns.lineplot(x='Size', y=metric, hue='Model', data=results_filtered_df, marker='o')
    plt.xticks(ticks=ticks, labels=ticks_labels, rotation=45, ha='right')
    plt.xlabel('Size (MB)')
    plt.ylabel(metric)
    plt.title(f'{metric} for each model and dataset size ({family})')
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), title='Model')
    plt.tight_layout()
    plt.savefig(f'./results/plots/lp_size_{family}_{metric.lower().replace(".", "").replace(" ", "_")}.png')
    plt.close()
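# Illustration of the size parsing above, using a hypothetical model filename:
#   'w2v_sg_nltk_win10_1000mb.model' -> Size = 1000, Model = 'w2v_sg_nltk_win10'
#   'w2v_sg_nltk_win10_max.model'    -> Size = FULL_DATA_SIZE_MB (full dataset)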
def generate_vectorizer_barplot(results_df: pd.DataFrame, metric: str, compare: tuple[str, str]=('w2v_sg', 'w2v_cbow')) -> None:
"""
Generate a barplot of the average of the given metric for each vectorizer in the comparison (max. 2).
Parameters
----------
results_df : pd.DataFrame
The DataFrame containing the results of the evaluation.
metric : str
The metric to generate the barplot for. It can be one of ['Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.'].
compare : list[str]|tuple[str]
The vectorizers to compare. Can be one of ['w2v_sg', 'w2v_cbow', 'ft_sg', 'ft_cbow']. The default is ['w2v', 'ft'].
"""
    assert metric in ['Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.'], f"Invalid metric '{metric}'. Must be one of ['Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.']."
    # Keep only the models belonging to one of the two compared vectorizers
    # (.copy() avoids pandas' SettingWithCopyWarning on the column assignment below)
    results_filtered_df = results_df[results_df['Model'].str.startswith(compare[0]) | results_df['Model'].str.startswith(compare[1])].copy()
    results_filtered_df['Vectorizer'] = results_filtered_df['Model'].apply(lambda x: compare[0] if x.startswith(compare[0]) else (compare[1] if x.startswith(compare[1]) else None))
    # Group by the vectorizer and calculate the average of the metric, ignoring the other columns
    average_vectorizer_df = results_filtered_df[['Vectorizer', metric]].groupby('Vectorizer').mean().reset_index()
    # Generate the barplot
    plt.figure(figsize=(8, 6))
    sns.barplot(x='Vectorizer', y=metric, data=average_vectorizer_df)
    plt.xticks(rotation=45, ha='right')
    plt.xlabel('Vectorizer')
    plt.ylabel(metric)
    plt.title(f'{metric} for each vectorizer ({compare[0]} vs {compare[1]})')
    plt.tight_layout()
    plt.savefig(f'./results/plots/bp_vectorizer_{compare[0]}_vs_{compare[1]}_{metric.lower().replace(".", "").replace(" ", "_")}.png')
    plt.close()
################################################################################
# Main script
if __name__ == '__main__':
    # Evaluate the models
    results_df = evaluate_models(save=True)
    hide_win10 = False
    hide_caps = True
    hide_cbow = True
    # Filter the results by the negative keywords
    if hide_win10:
        results_df = filter_results(results_df, neg_keyword='win10')
    if hide_caps:
        results_df = filter_results(results_df, neg_keyword='caps')
    if hide_cbow:
        results_df = filter_results(results_df, neg_keyword='cbow')
    # Keep only the models matching both positive keywords ('1000mb' and 'win10')
    results_df = filter_results(results_df, pos_keyword='1000mb')
    results_df = filter_results(results_df, pos_keyword='win10')
    # Save the filtered results to a CSV file
    results_df.to_csv('./results/evaluation_results_filtered.csv', index=False)
    # Empty the 'plots' directory before generating the plots (create it first if missing)
    os.makedirs('./results/plots/', exist_ok=True)
    for file in os.listdir('./results/plots/'):
        os.remove(os.path.join('./results/plots/', file))
    # Define the metrics and the model families present in the (filtered) results
    metrics = ['Avg. Statistic', 'Pearson', 'Spearman', 'OOV Perc.']
    different_vectorizers = results_df['Model'].apply(lambda x: '_'.join(x.split('_')[:2])).unique()
    available_families = []
    bool_w2v, bool_ft = False, False
    if any(v.startswith('w2v') for v in different_vectorizers):
        available_families.append('w2v')
        bool_w2v = True
    if any(v.startswith('ft') for v in different_vectorizers):
        available_families.append('ft')
        bool_ft = True
    if bool_w2v and bool_ft:
        available_families.append('all')
    # Generate the plots
    count_plots = 0
    for metric in metrics:
        for family in available_families:
            generate_model_barplot(results_df, metric, family)
            generate_tokenizer_barplot(results_df, metric, family)
            generate_size_plot(results_df, metric, family)
            count_plots += 3
        if len(different_vectorizers) > 1:
            for compare in combinations(different_vectorizers, 2):
                generate_vectorizer_barplot(results_df, metric, compare)
                count_plots += 1
    print(f"{count_plots} plots generated from the results. Saved to the './results/plots/' directory.")