data.py
import torch
from torch.autograd import Variable
import pickle
import random
import os

USE_CUDA = torch.cuda.is_available()

# Flatten a list of lists into a single list, e.g. [[1, 2], [3]] -> [1, 2, 3].
flatten = lambda l: [item for sublist in l for item in sublist]


def prepare_sequence(seq, to_ix):
    """Map a sequence of tokens to a LongTensor of indices, falling back to
    the <UNK> index for out-of-vocabulary tokens."""
    idxs = list(map(lambda w: to_ix[w] if w in to_ix else to_ix["<UNK>"], seq))
    tensor = Variable(torch.LongTensor(idxs))
    return tensor.cuda() if USE_CUDA else tensor
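
# A minimal usage sketch for prepare_sequence (the tiny vocabulary below is
# hypothetical, not taken from any dataset): 'tomorrow' is out of vocabulary,
# so it maps to the <UNK> index, yielding the indices [2, 3, 1].
#
#     demo_vocab = {'<PAD>': 0, '<UNK>': 1, 'list': 2, 'flights': 3}
#     demo_tensor = prepare_sequence(['list', 'flights', 'tomorrow'], demo_vocab)
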
def preprocessing(file_path, length):
    """Read an ATIS IOB file (e.g. atis-2.train.w-intent.iob), build index
    dictionaries, pad every example to a fixed length, and cache the result
    as a pickle under ./data/."""
    processed_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/")
    print("processed_data_path : %s" % processed_path)

    # Reuse the cached pickle if the data has already been preprocessed.
    if os.path.exists(os.path.join(processed_path, "processed_train_data.pkl")):
        with open(os.path.join(processed_path, "processed_train_data.pkl"), "rb") as f:
            train_data, word2index, tag2index, intent2index = pickle.load(f)
        return train_data, word2index, tag2index, intent2index

    if not os.path.exists(processed_path):
        os.makedirs(processed_path)

    try:
        with open(file_path, "r") as f:
            train = f.readlines()
        print("Successfully loaded data. # of examples : %d" % len(train))
    except IOError:
        print("No such file!")
        return None, None, None, None

    try:
        # Each line looks like "BOS w1 ... wn EOS\tO t1 ... tn intent".
        # Split it into (words, slot tags, intent), dropping the BOS/EOS
        # markers from the words and the tag aligned with BOS.
        train = [t[:-1] for t in train]
        train = [[t.split("\t")[0].split(" "),
                  t.split("\t")[1].split(" ")[:-1],
                  t.split("\t")[1].split(" ")[-1]] for t in train]
        train = [[t[0][1:-1], t[1][1:], t[2]] for t in train]

        seq_in, seq_out, intent = list(zip(*train))
        vocab = set(flatten(seq_in))
        slot_tag = set(flatten(seq_out))
        intent_tag = set(intent)
        print("# of vocab : {vocab}, # of slot_tag : {slot_tag}, # of intent_tag : {intent_tag}"
              .format(vocab=len(vocab), slot_tag=len(slot_tag), intent_tag=len(intent_tag)))
    except Exception:
        return None, None, None, None
    # Pad or truncate every example to the fixed length. Input word sequences
    # get an <EOS> token before padding; slot-tag sequences are padded with
    # <PAD> only, and truncated sequences of either kind end in <EOS>.
    sin = []
    sout = []
    for i in range(len(seq_in)):
        temp = seq_in[i]
        if len(temp) < length:
            temp.append('<EOS>')
            while len(temp) < length:
                temp.append('<PAD>')
        else:
            temp = temp[:length]
            temp[-1] = '<EOS>'
        sin.append(temp)

        temp = seq_out[i]
        if len(temp) < length:
            while len(temp) < length:
                temp.append('<PAD>')
        else:
            temp = temp[:length]
            temp[-1] = '<EOS>'
        sout.append(temp)
    # Build the index dictionaries, reserving slots for the special tokens.
    word2index = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
    for token in vocab:
        if token not in word2index:
            word2index[token] = len(word2index)

    tag2index = {'<PAD>': 0}
    for tag in slot_tag:
        if tag not in tag2index:
            tag2index[tag] = len(tag2index)

    intent2index = {}
    for ii in intent_tag:
        if ii not in intent2index:
            intent2index[ii] = len(intent2index)
    # Convert every example into index tensors of shape (1, length) plus a
    # single-element intent tensor, then cache everything for later runs.
    train = list(zip(sin, sout, intent))
    train_data = []
    for tr in train:
        temp = prepare_sequence(tr[0], word2index).view(1, -1)
        temp2 = prepare_sequence(tr[1], tag2index).view(1, -1)
        temp3 = Variable(torch.LongTensor([intent2index[tr[2]]]))
        if USE_CUDA:
            temp3 = temp3.cuda()
        train_data.append((temp, temp2, temp3))

    with open(os.path.join(processed_path, "processed_train_data.pkl"), "wb") as f:
        pickle.dump((train_data, word2index, tag2index, intent2index), f)
    print("Preprocessing complete!")
    return train_data, word2index, tag2index, intent2index
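
# A usage sketch (the file path below is an assumption about where the ATIS
# IOB file lives, and 60 is an arbitrary fixed length). Each returned example
# is a (word indices, slot-tag indices, intent index) tensor triple:
#
#     train_data, word2index, tag2index, intent2index = \
#         preprocessing("data/atis-2.train.w-intent.iob", 60)
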

def getBatch(batch_size, train_data):
    """Shuffle train_data in place and yield consecutive fixed-size batches.
    Any trailing partial batch smaller than batch_size is dropped."""
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex <= len(train_data):
        batch = train_data[sindex:eindex]
        sindex = eindex
        eindex = eindex + batch_size
        yield batch
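
# A usage sketch, assuming train_data comes from preprocessing() above; each
# batch is a list of batch_size (words, tags, intent) tensor triples that can
# be unpacked column-wise with zip:
#
#     for batch in getBatch(32, train_data):
#         inputs, targets, intents = list(zip(*batch))
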

def load_dictionary():
    """Load the cached index dictionaries produced by preprocessing()."""
    processed_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/")
    pkl_path = os.path.join(processed_path, "processed_train_data.pkl")
    if os.path.exists(pkl_path):
        with open(pkl_path, "rb") as f:
            _, word2index, tag2index, intent2index = pickle.load(f)
        return word2index, tag2index, intent2index
    else:
        print("Please preprocess the data first")
        return None, None, None
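
# A guarded smoke test (a sketch; the path is an assumption about where the
# ATIS IOB file lives, and both 60 and 32 are arbitrary choices):
if __name__ == "__main__":
    train_data, word2index, tag2index, intent2index = preprocessing(
        "data/atis-2.train.w-intent.iob", 60)
    if train_data is not None:
        for batch in getBatch(32, train_data):
            print("first batch size:", len(batch))
            break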