preprocess.py
# Preprocessing utilities for compound-protein interaction records: build character
# vocabularies, shuffle the raw CSV files and pack compound/protein strings into
# fixed-length integer sequences.
import io
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras import preprocessing


def preprocess(filenames):
    # Character-level tokenizers for the compound and protein strings;
    # filters='' keeps every character and lower=False preserves case.
    tokenizer_c = preprocessing.text.Tokenizer(filters='', lower=False)
    tokenizer_p = preprocessing.text.Tokenizer(filters='', lower=False)
    for filename in filenames:
        with io.open('./datasets/raw/' + filename) as file:
            cpis = file.read().split('\n')
        # Each record is "compound,protein,interaction"; fit the tokenizers on the
        # space-separated characters of the first two columns, framed by markers.
        tokenizer_c.fit_on_texts(['<start> ' + ' '.join(cpi.split(',')[0]) + ' <end>' for cpi in cpis])
        tokenizer_p.fit_on_texts(['<start> ' + ' '.join(cpi.split(',')[1]) + ' <end>' for cpi in cpis])
        # Shuffle the records and write them back without a trailing newline.
        np.random.shuffle(cpis)
        with io.open('./datasets/shuffle/' + filename, 'w') as file:
            for i in range(len(cpis)):
                file.write(cpis[i])
                if i < len(cpis) - 1:
                    file.write('\n')
    # Persist the learned character-to-index vocabularies.
    with io.open('./vocabularies/compound.json', 'w') as file:
        file.write(json.dumps(tokenizer_c.word_index, indent=4))
    with io.open('./vocabularies/protein.json', 'w') as file:
        file.write(json.dumps(tokenizer_p.word_index, indent=4))
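
# Example (the filename is a hypothetical placeholder for whatever CSVs live under
# ./datasets/raw/):
#
#     preprocess(['human.csv'])
#
# writes a shuffled copy to ./datasets/shuffle/human.csv and the vocabularies to
# ./vocabularies/compound.json and ./vocabularies/protein.json.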


def make_seq(texts, max_len, vocab, vocab_size, word_len):
    # Convert each text into a sequence of integer tokens, where every token packs
    # word_len consecutive characters (plus the <start>/<end> markers) as a
    # base-(len(vocab) + 1) number, reduced modulo vocab_size.
    batch_size = len(texts)
    seq_len = 0
    for text in texts:
        if len(text) > seq_len:
            seq_len = len(text)
    # Longest text plus the two markers, rounded up to a whole number of
    # word_len-sized tokens and capped at max_len.
    seq_len = (seq_len + word_len + 1) // word_len
    if seq_len > max_len:
        seq_len = max_len
    seq = np.zeros((batch_size, seq_len), np.int64)
    for i in range(batch_size):
        for j in range(seq.shape[1]):
            for k in range(j * word_len, (j + 1) * word_len):
                seq[i][j] *= len(vocab) + 1
                if k == 0:
                    seq[i][j] += vocab['<start>']
                elif k <= len(texts[i]) and texts[i][k - 1] in vocab:
                    seq[i][j] += vocab[texts[i][k - 1]]
                elif k == len(texts[i]) + 1:
                    seq[i][j] += vocab['<end>']
            # Map the packed value into the embedding range and shift by 1 so that
            # 0 is reserved for padding tokens past the end of the text.
            seq[i][j] %= vocab_size
            if j * word_len <= len(texts[i]) + 1:
                seq[i][j] += 1
    return tf.cast(seq, tf.int64)
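
# Worked illustration of the packing above (the vocabulary, word_len and vocab_size
# here are made-up values, not taken from the repository): with
# vocab = {'<start>': 1, 'C': 2, 'O': 3, '<end>': 4}, word_len = 3, vocab_size = 1000,
# the text 'CO' yields two tokens:
#   token 0 packs <start>, 'C', 'O':          ((1 * 5 + 2) * 5 + 3) % 1000 + 1 = 39
#   token 1 packs <end> and two empty slots:  ((4 * 5 + 0) * 5 + 0) % 1000 + 1 = 101
# so make_seq(['CO'], 10, vocab, 1000, 3) returns [[39, 101]].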


def make_batch(file, batch_size, max_len_c, max_len_p, vocab_c, vocab_p, vocab_size_c, vocab_size_p, word_len_c,
               word_len_p):
    # Read up to batch_size "compound,protein,interaction" lines from an already
    # opened (shuffled) dataset file and return the packed compound sequences,
    # protein sequences and interaction labels as int64 tensors.
    cps = []
    pts = []
    data_i = np.zeros(batch_size)
    for i in range(batch_size):
        cpi = file.readline()
        if len(cpi) == 0:
            break
        cp, pt, itr = cpi.split(',')
        cps.append(cp)
        pts.append(pt)
        data_i[i] = int(itr)
    return make_seq(cps, max_len_c, vocab_c, vocab_size_c, word_len_c), make_seq(pts, max_len_p, vocab_p, vocab_size_p,
                                                                                 word_len_p), tf.cast(data_i, tf.int64)
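

if __name__ == '__main__':
    # Minimal end-to-end sketch. The filename and all numeric parameters below
    # (batch size, max lengths, vocab sizes, word lengths) are illustrative
    # assumptions, not values defined elsewhere in this repository.
    preprocess(['human.csv'])
    with io.open('./vocabularies/compound.json') as f:
        vocab_c = json.load(f)
    with io.open('./vocabularies/protein.json') as f:
        vocab_p = json.load(f)
    with io.open('./datasets/shuffle/human.csv') as f:
        seq_c, seq_p, labels = make_batch(f, 32, 64, 512, vocab_c, vocab_p,
                                          4096, 8192, 2, 3)
    print(seq_c.shape, seq_p.shape, labels.shape)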