tokenizer.py
import collections


class Tokenizer:
    def __init__(self, vocab_list):
        self.unk = '<UNK>'
        self.pad = '<PAD>'
        # Maps token (str) -> index (int), built once from the vocab list.
        self.vocab = self.load_vocab(vocab_list)

    def load_vocab(self, vocab_list):
        '''
        Build the vocabulary mapping, reserving ids 0 and 1 for the
        special <PAD> and <UNK> tokens.

        return:
            vocab: OrderedDict mapping token (str) to index (int)
        '''
        vocab = collections.OrderedDict()
        vocab[self.pad] = 0
        vocab[self.unk] = 1
        index = 2
        for token in vocab_list:
            token = token.strip()
            vocab[token] = index
            index += 1
        return vocab

    def token_to_id(self, token):
        # Fall back to the <UNK> id for out-of-vocabulary tokens.
        idx = self.vocab.get(token)
        if idx is not None:
            return idx
        return self.vocab[self.unk]

    def tokens_to_id(self, tokens):
        ids_list = list(map(self.token_to_id, tokens))
        return ids_list