-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcombineCreateFinalEvalData.py
129 lines (94 loc) · 4.52 KB
/
combineCreateFinalEvalData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
## Standard-library imports.
import os
import csv
import sys
import random
## HuggingFace datasets import — NOTE(review): appears unused in this file; verify before removing.
from datasets import load_dataset
from collections import defaultdict
## Make the shared project helpers importable from the sibling Utils/ directory.
sys.path.append( '../Utils/' )
from utils import tokenise_idiom as idiom_tokeniser
from utils import match_idioms
from utils import _load_csv as load_csv
## Fixed seed so the shuffle in create_eval_data is reproducible across runs.
random.seed( 42 )
def _get_sents( location ) :
    """Read a comma-separated file of sentence pairs and return two parallel
    lists: the 'correct' sentences and the 'incorrect' sentences, in file order."""
    headers, rows = load_csv( location, delimiter="," )
    ## Resolve the column positions once instead of per row.
    correct_col   = headers.index( 'correct' )
    incorrect_col = headers.index( 'incorrect' )
    correct_sents   = [ row[ correct_col ]   for row in rows ]
    incorrect_sents = [ row[ incorrect_col ] for row in rows ]
    return correct_sents, incorrect_sents
def _get_sts_data( language, dataset ) :
    """Load the pre-built STS rows for a language/split from EvalSTSData/.

    The first line returned by load_csv is real data here (the file has no
    header row), so it is re-attached to the front of the result."""
    file_name = language.lower() + '_' + dataset.lower() + '_Eval_STS_Data.csv'
    location  = os.path.join( 'EvalSTSData', file_name )
    first_line, sts_rows = load_csv( location, "," )
    return [ first_line ] + sts_rows
def create_eval_data( sent_location, similatiries, is_not_idiom_info, out_location, tokenise_idiom, select_tokenise, sts_data_to_add, dataset=None ) :
    """Combine idiom sentence pairs, their similarity scores, and (optionally)
    generic STS data into one shuffled evaluation CSV.

    Parameters:
      sent_location     : CSV with columns sent_idiom, sent_idiom_tokenised, sent_other, sim.
      similatiries      : CSV of model-computed similarities (no header row), one per sentence pair.
      is_not_idiom_info : TSV of classifier output with a 'prediction' column (0 == idiomatic).
      out_location      : directory in which '<dataset>_final_eval_data.csv' is written.
      tokenise_idiom    : if True, replace sentences with their idiom-tokenised variants.
      select_tokenise   : if True (and tokenise_idiom), tokenise only rows the classifier marked idiomatic.
      sts_data_to_add   : extra [score, sentence1, sentence2] rows appended before shuffling.
      dataset           : split name used in the output file name. BUGFIX: the
                          original read the global ``dataset`` left over from the
                          __main__ loop; when not given, it is now derived from
                          the input file name ('<split>_rawSimData.csv').

    Raises AssertionError if the three input files disagree in length, if a
    gold similarity is not 1.0, or if a sentence pair is identical.
    """
    if dataset is None :
        dataset = os.path.basename( sent_location ).split( '_' )[0]
    sent_header, sents = load_csv( sent_location, "," )
    row_0, sims_data = load_csv( similatiries, "," )
    not_idiom_header, not_idiom_data = load_csv( is_not_idiom_info, "\t" )
    ## The similarities file has no header, so the first line is data.
    sims_data = [ row_0 ] + sims_data
    assert len( sents ) == len( sims_data ) == len( not_idiom_data )
    out_header = [ [ 'score', 'sentence1', 'sentence2' ] ]
    out_data = list()
    ## Hoist column lookups out of the loop.
    sent_col      = sent_header.index( 'sent_idiom' )
    tokenised_col = sent_header.index( 'sent_idiom_tokenised' )
    other_col     = sent_header.index( 'sent_other' )
    sim_col       = sent_header.index( 'sim' )
    pred_col      = not_idiom_header.index( 'prediction' )
    for index in range( len( sims_data ) ) :
        this_sentence           = sents[ index ][ sent_col ]
        this_sentence_tokenised = sents[ index ][ tokenised_col ]
        this_other              = sents[ index ][ other_col ]
        this_sim                = sents[ index ][ sim_col ]
        if tokenise_idiom :
            if select_tokenise :
                ## Based on model prediction: tokenise only when the classifier
                ## predicted the idiomatic class (label 0).
                this_pred = not_idiom_data[ index ][ pred_col ]
                if int( this_pred ) == 0 : ## 0 is idiomatic
                    this_sentence = this_sentence_tokenised
            else :
                ## Always tokenise.
                this_sentence = this_sentence_tokenised
        if this_sim == 'None' :
            ## No gold similarity: fall back to the model-computed one.
            this_sim = float( sims_data[ index ][0] )
        else :
            this_sim = float( this_sim )
            ## Gold similarities are only ever 1.0 (identical-meaning pairs).
            assert this_sim == 1.0
        assert this_sentence != this_other
        out_data.append( [ this_sim, this_sentence, this_other ] )
    out_data += sts_data_to_add
    random.shuffle( out_data )
    out_data = out_header + out_data
    outfile = os.path.join( out_location, dataset + '_final_eval_data.csv' )
    ## newline='' is required by the csv module so it controls line endings
    ## itself (otherwise blank rows appear on Windows).
    with open( outfile, 'w', newline='' ) as csvfile :
        writer = csv.writer( csvfile )
        writer.writerows( out_data )
    print( "Wrote STS evaluation data to {} of size {}: ".format( outfile, len( out_data ) ) )
    return
if __name__ == '__main__' :
    ## Entry point: expects the language code ('EN' or 'PT') as the only argument,
    ## then builds three variants of the final evaluation data (untokenised,
    ## all-tokenised, selectively-tokenised) for both the dev and test splits.
    if len( sys.argv ) < 2 :
        print( "Require language as param" )
        sys.exit()
    language = sys.argv[1].upper()
    assert language in [ 'EN', 'PT' ]
    ## Legacy - should always be "True"
    include_sts = True
    print( "Language: {}".format( language ), flush=True )
    for out_location, tokenise_idiom, select_tokenise, create_folder in [
            ( 'evalData'               , False, False, False ),
            ( 'evalDataAllTokenised'   , True , False, True  ),
            ( 'evalDataSelectTokenised', True , True , True  ),
    ] :
        if create_folder :
            ## exist_ok=True so the script can be re-run without manually
            ## deleting the output directories first; print after the call
            ## so the message is only shown once the directory exists.
            os.makedirs( out_location, exist_ok=True )
            print( "Created {}".format( out_location ) )
        for dataset in [ 'dev', 'test' ] :
            params = {
                'sent_location'     : 'evalData/' + dataset + '_rawSimData.csv'                 ,
                'similatiries'      : 'evalData/' + dataset + '_similatiries.csv'               ,
                'is_not_idiom_info' : 'evalData/predict_' + dataset + '/predict_results_None.txt',
                'out_location'      : out_location   ,
                'tokenise_idiom'    : tokenise_idiom ,
                'select_tokenise'   : select_tokenise,
            }
            ## Only load the STS data when it will actually be added.
            if include_sts :
                params[ 'sts_data_to_add' ] = _get_sts_data( language, dataset )
            else :
                params[ 'sts_data_to_add' ] = list()
            create_eval_data( **params )