utils.py
# -*- coding: utf-8 -*-
# Utility functions for the word segmentation pipeline (Python 2):
# word-level evaluation, writing out segmented text, and corpus preprocessing.
import cPickle as pickle
import itertools
import codecs
import re

def evaluate_word_PRF(y_pred, y):
    """Word-level precision/recall/F1 for B/M/E/S-style tag sequences."""
    y_pred = list(itertools.chain.from_iterable(y_pred))
    y = list(itertools.chain.from_iterable(y))
    assert len(y_pred) == len(y)
    cor_num = 0
    # Tags 2 and 3 mark word-final positions, so counting them counts words.
    yp_wordnum = y_pred.count(2) + y_pred.count(3)
    yt_wordnum = y.count(2) + y.count(3)
    start = 0
    for i in xrange(len(y)):
        if y[i] == 2 or y[i] == 3:
            # A gold word spans [start, i]; it counts as correct only if every
            # predicted tag inside that span matches the gold tag.
            flag = True
            for j in xrange(start, i + 1):
                if y[j] != y_pred[j]:
                    flag = False
            if flag:
                cor_num += 1
            start = i + 1
    P = cor_num / (float(yp_wordnum) + 1e-12)
    R = cor_num / float(yt_wordnum)
    F = 2 * P * R / (P + R + 1e-12)
    return P, R, F
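
# Illustrative sanity check (not part of the original file). Assuming the tag
# convention implied by get_seg_file below (0 = begin, 1 = middle, 2 = end,
# 3 = single), a gold sentence tagged [0, 1, 2, 3] contains two words; a
# prediction [0, 2, 3, 3] only recovers the final single-character word:
#
#   >>> evaluate_word_PRF([[0, 2, 3, 3]], [[0, 1, 2, 3]])
#   # P = 1/3, R = 1/2, F = 0.4 (up to the 1e-12 smoothing terms)
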
def get_seg_file(X, pred_Y, target, model_name):
    """Decode predicted tag sequences back into space-separated text."""
    id2char = pickle.load(open("data/%s_id2char.p" % (target, ), "rb"))
    seg_f = codecs.open("output/%s_test_%s.txt" % (target, model_name), "w", 'utf-8')
    for i in range(len(X)):
        assert len(X[i]) == len(pred_Y[i])
        line = []
        for j in range(len(X[i])):
            if pred_Y[i][j] == 0:    # word-initial character: open a new word
                line.append(" ")
                line.append(id2char[X[i][j]])
            elif pred_Y[i][j] == 1:  # word-internal character
                line.append(id2char[X[i][j]])
            elif pred_Y[i][j] == 2:  # word-final character: close the word
                line.append(id2char[X[i][j]])
                line.append(" ")
            elif pred_Y[i][j] == 3:  # single-character word
                line.append(" ")
                line.append(id2char[X[i][j]])
                line.append(" ")
        seg_f.write("".join(line) + "\n")
    seg_f.close()
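
# Illustrative decoding (not part of the original file), assuming a toy
# id2char mapping {1: u'A', 2: u'B', 3: u'C'}: the character ids [1, 2, 3]
# tagged [0, 2, 3] produce the line u" AB  C " before the trailing newline,
# i.e. the two-character word "AB" followed by the single-character word "C"
# (with the leading/doubled spaces the tag rules above insert).
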
def strQ2B(ustring):
    """Convert full-width (quanjiao) characters to half-width (banjiao)."""
    rstring = ""
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:  # ideographic space U+3000 -> ASCII space
            inside_code = 32
        elif inside_code >= 65281 and inside_code <= 65374:
            # full-width ASCII variants U+FF01..U+FF5E -> their ASCII forms
            inside_code -= 65248
        rstring += unichr(inside_code)
    return rstring
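
# Illustrative conversion (not part of the original file): the full-width
# string u'\uff21\uff11\u3000\uff0c' ("A", "1", ideographic space, comma)
# comes back as the half-width u'A1 ,'.
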
def preprocess(in_filename, out_filename):
    """Normalize raw text: full-width -> half-width, then collapse runs of
    Latin letters, digits and related symbols into the placeholder 'X'."""
    f_in = codecs.open(in_filename, 'r', 'utf-8')
    f_out = codecs.open(out_filename, 'w', 'utf-8')
    # rNUM = u'(-|\+)?\d+((\.|·)\d+)?%?'
    # rENG = u'[A-Za-z_.]+'
    rENGNUM = u'[A-Za-z_.\d%#+@-]+'
    for text in f_in.readlines():
        text = strQ2B(text)
        # text = re.sub(rNUM, u'0', text)
        # text = re.sub(rENG, u'X', text)
        text = re.sub(rENGNUM, u'X', text)
        f_out.write(text)
    f_in.close()
    f_out.close()
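
# Illustrative normalization (not part of the original file): a line such as
# u'GDP增长7.5%' becomes u'X增长X', since the Latin run "GDP" and the numeric
# run "7.5%" each match rENGNUM and are replaced by a single 'X'.
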
def preprocess_datasets():
    """Run preprocess() over every raw corpus file used in the experiments."""
    datasets = ["pd/pd_dict.txt",
                "pd/pd_train.txt",
                "ctb/ctb_dict.txt",
                "ctb/ctb_train.txt",
                "zx/zx_test.txt",
                "zx/zx_ul_train.txt",
                "zx/zx_pl_train.txt",
                "zx/zx_valid.txt",
                "zx/zx_dict.txt",
                "com/com_test.txt",
                "com/com_ul_train.txt",
                "com/com_pl_train.txt",
                "com/com_pl_train_new.txt",
                "com/com_valid.txt",
                "lit/lit_test.txt",
                "lit/lit_ul_train.txt",
                "lit/lit_pl_train.txt",
                "lit/lit_pl_train_new.txt",
                "lit/lit_valid.txt",
                "fin/fin_test.txt",
                "fin/fin_ul_train.txt",
                "fin/fin_pl_train.txt",
                "fin/fin_pl_train_new.txt",
                "med/med_test.txt",
                "med/med_ul_train.txt",
                "med/med_pl_train.txt",
                "med/med_pl_train_new.txt"]
    for d in datasets:
        print d
        preprocess("dataset/raw_data/" + d, "dataset/preprocess_data/" + d)
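
# Minimal entry point (not part of the original file), assuming the
# dataset/raw_data/ and dataset/preprocess_data/ directory trees already
# exist with the sub-folders listed above:
if __name__ == "__main__":
    preprocess_datasets()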