Merge pull request #8 from LiyuanLucasLiu/predict
add predict function
frankxu2004 authored Sep 19, 2017
2 parents 1f5ea83 + 9691675 commit e887f04
Showing 4 changed files with 402 additions and 6 deletions.
39 changes: 39 additions & 0 deletions README.md
@@ -152,6 +152,7 @@ When models are only trained on the CoNLL 2000 Chunking dataset, the results are

## Pretrained Model

### Evaluation
We have released pre-trained models for these three tasks. The checkpoint files can be downloaded at:


@@ -164,6 +165,44 @@ Also, ```eval_wc.py``` is provided to load and run these checkpoints. Its usage
```
python eval_wc.py --load_arg checkpoint/ner/ner_4_cwlm_lstm_crf.json --load_check_point checkpoint/ner_ner_4_cwlm_lstm_crf.model --gpu 0 --dev_file ./data/ner/testa.txt --test_file ./data/ner/testb.txt
```
### Prediction

To annotate raw text, ```seq_wc.py``` is provided. Its usage can be accessed by the command ```python seq_wc.py -h```, and a running command example is provided below:
```
python seq_wc.py --load_arg checkpoint/ner/ner_4_cwlm_lstm_crf.json --load_check_point checkpoint/ner_ner_4_cwlm_lstm_crf.model --gpu 0 --input_file ./data/ner2003/test.txt --output_file output.txt
```

The input format is similar to CoNLL, but each line is required to contain only one field, the token. For example, an input file could be:

```
-DOCSTART-
But
China
saw
their
luck
desert
them
in
the
second
match
of
the
group
,
crashing
to
a
surprise
2-0
defeat
to
newcomers
Uzbekistan
.
```
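
With the predictor's default ```label_seq``` setting (see ```decode_l``` in ```model/predictor.py``` below), the output file pairs each token with its predicted label, one pair per line, and separates sentences with blank lines. The labels in the following snippet are purely illustrative; the actual tag set depends on the checkpoint you load:

```
But O
China S-LOC
saw O
their O
...
Uzbekistan S-LOC
. O
```

Setting ```label_seq``` to ```False``` selects ```decode_s``` instead, which renders recognized chunks inline, e.g. ```<LOC> China </LOC> saw their luck ...```.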

## Reference

271 changes: 271 additions & 0 deletions model/predictor.py
@@ -0,0 +1,271 @@
"""
.. module:: predictor
:synopsis: prediction method (for un-annotated text)
.. moduleauthor:: Liyuan Liu
"""

import torch
import torch.autograd as autograd
import numpy as np
import itertools

from model.crf import CRFDecode_vb
from model.utils import *

class predict:
    """Base class for prediction, providing the decoding routines and batched output of annotated text

    args:
        if_cuda: if use cuda to speed up
        l_map: dictionary for labels
        label_seq: type of decode function, set `True` to couple each label with its token, or set `False` to insert inline chunk tags into the text
        batch_size: size of batch in decoding
    """

    def __init__(self, if_cuda, l_map, label_seq = True, batch_size = 50):
        self.if_cuda = if_cuda
        self.l_map = l_map
        self.r_l_map = revlut(l_map)
        self.batch_size = batch_size
        if label_seq:
            self.decode_str = self.decode_l
        else:
            self.decode_str = self.decode_s

    def decode_l(self, feature, label):
        """
        decode a sentence coupled with label
        args:
            feature (list): words list
            label (list): label list
        """
        return '\n'.join(map(lambda t: t[0] + ' ' + self.r_l_map[t[1]], zip(feature, label)))

    def decode_s(self, feature, label):
        """
        decode a sentence into an inline chunk format, e.g. "<LOC> China </LOC> saw ..."
        args:
            feature (list): words list
            label (list): label list
        """
        chunks = ""
        current = None  # type of the chunk currently open, if any

        for f, y in zip(feature, label):
            label = self.r_l_map[y]

            if label.startswith('B-'):

                if current is not None:
                    chunks += "</"+current+"> "
                current = label[2:]
                chunks += "<"+current+"> " + f + " "

            elif label.startswith('S-'):

                if current is not None:
                    chunks += " </"+current+"> "
                current = label[2:]
                chunks += "<"+current+"> " + f + " </"+current+"> "
                current = None

            elif label.startswith('I-'):

                if current is not None:
                    base = label[2:]
                    if base == current:
                        chunks += f+" "
                    else:
                        chunks += "</"+current+"> <"+base+"> " + f + " "
                        current = base
                else:
                    current = label[2:]
                    chunks += "<"+current+"> " + f + " "

            elif label.startswith('E-'):

                if current is not None:
                    base = label[2:]
                    if base == current:
                        chunks += f + " </"+base+"> "
                        current = None
                    else:
                        chunks += "</"+current+"> <"+base+"> " + f + " </"+base+"> "
                        current = None

                else:
                    current = label[2:]
                    chunks += "<"+current+"> " + f + " </"+current+"> "
                    current = None

            else:
                if current is not None:
                    chunks += "</"+current+"> "
                chunks += f+" "
                current = None

        if current is not None:
            chunks += "</"+current+"> "

        return chunks

    def output_batch(self, ner_model, features, fout):
        """
        decode the whole corpus in the specific format by calling apply_model to fit specific models
        args:
            ner_model: sequence labeling model
            feature (list): list of words list
            fout: output file
        """
        f_len = len(features)

        for ind in range(0, f_len, self.batch_size):
            eind = min(f_len, ind + self.batch_size)
            labels = self.apply_model(ner_model, features[ind: eind])
            labels = torch.unbind(labels, 1)

            for ind2 in range(ind, eind):
                f = features[ind2]
                l = labels[ind2 - ind][0: len(f)]
                fout.write(self.decode_str(features[ind2], l) + '\n\n')

    def apply_model(self, ner_model, features):
        """
        template function for apply_model
        args:
            ner_model: sequence labeling model
            feature (list): list of words list
        """
        return None

class predict_w(predict):
    """prediction class for word-level model (LSTM-CRF)

    args:
        if_cuda: if use cuda to speed up
        f_map: dictionary for words
        l_map: dictionary for labels
        pad_word: word padding
        pad_label: label padding
        start_label: start label
        label_seq: type of decode function, set `True` to couple each label with its token, or set `False` to insert inline chunk tags into the text
        batch_size: size of batch in decoding
        caseless: caseless or not
    """

    def __init__(self, if_cuda, f_map, l_map, pad_word, pad_label, start_label, label_seq = True, batch_size = 50, caseless=True):
        predict.__init__(self, if_cuda, l_map, label_seq, batch_size)
        self.decoder = CRFDecode_vb(len(l_map), start_label, pad_label)
        self.pad_word = pad_word
        self.f_map = f_map
        self.l_map = l_map
        self.caseless = caseless

    def apply_model(self, ner_model, features):
        """
        apply_model function for LSTM-CRF
        args:
            ner_model: sequence labeling model
            feature (list): list of words list
        """
        if self.caseless:
            features = list(map(lambda t: list(map(lambda x: x.lower(), t)), features))
        features = encode_safe(features, self.f_map, self.f_map['<unk>'])
        f_len = max(map(lambda t: len(t) + 1, features))

        masks = torch.ByteTensor(list(map(lambda t: [1] * (len(t) + 1) + [0] * (f_len - len(t) - 1), features)))
        word_features = torch.LongTensor(list(map(lambda t: t + [self.pad_word] * (f_len - len(t)), features)))

        if self.if_cuda:
            fea_v = autograd.Variable(word_features.transpose(0, 1)).cuda()
            mask_v = masks.transpose(0, 1).cuda()
        else:
            fea_v = autograd.Variable(word_features.transpose(0, 1))
            mask_v = masks.transpose(0, 1).contiguous()

        scores, _ = ner_model(fea_v)
        decoded = self.decoder.decode(scores.data, mask_v)

        return decoded

class predict_wc(predict):
    """prediction class for LM-LSTM-CRF

    args:
        if_cuda: if use cuda to speed up
        f_map: dictionary for words
        c_map: dictionary for chars
        l_map: dictionary for labels
        pad_word: word padding
        pad_char: char padding
        pad_label: label padding
        start_label: start label
        label_seq: type of decode function, set `True` to couple each label with its token, or set `False` to insert inline chunk tags into the text
        batch_size: size of batch in decoding
        caseless: caseless or not
    """

    def __init__(self, if_cuda, f_map, c_map, l_map, pad_word, pad_char, pad_label, start_label, label_seq = True, batch_size = 50, caseless=True):
        predict.__init__(self, if_cuda, l_map, label_seq, batch_size)
        self.decoder = CRFDecode_vb(len(l_map), start_label, pad_label)
        self.pad_word = pad_word
        self.pad_char = pad_char
        self.f_map = f_map
        self.c_map = c_map
        self.l_map = l_map
        self.caseless = caseless

    def apply_model(self, ner_model, features):
        """
        apply_model function for LM-LSTM-CRF
        args:
            ner_model: sequence labeling model
            feature (list): list of words list
        """
        char_features = encode2char_safe(features, self.c_map)

        if self.caseless:
            word_features = encode_safe(list(map(lambda t: list(map(lambda x: x.lower(), t)), features)), self.f_map, self.f_map['<unk>'])
        else:
            word_features = encode_safe(features, self.f_map, self.f_map['<unk>'])

        fea_len = [list( map( lambda t: len(t) + 1, f) ) for f in char_features]
        forw_features = concatChar(char_features, self.c_map)

        word_len = max(map(lambda t: len(t) + 1, word_features))
        char_len = max(map(lambda t: len(t[0]) + word_len - len(t[1]), zip(forw_features, word_features)))
        forw_t = list( map( lambda t: t + [self.pad_char] * ( char_len - len(t) ), forw_features ) )
        back_t = torch.LongTensor( list( map( lambda t: t[::-1], forw_t ) ) )
        forw_t = torch.LongTensor( forw_t )
        forw_p = torch.LongTensor( list( map( lambda t: list(itertools.accumulate( t + [1] * (word_len - len(t) ) ) ), fea_len) ) )
        back_p = torch.LongTensor( list( map( lambda t: [char_len - 1] + [ char_len - 1 - tup for tup in t[:-1] ], forw_p) ) )

        masks = torch.ByteTensor(list(map(lambda t: [1] * (len(t) + 1) + [0] * (word_len - len(t) - 1), word_features)))
        word_t = torch.LongTensor(list(map(lambda t: t + [self.pad_word] * (word_len - len(t)), word_features)))

        if self.if_cuda:
            f_f = autograd.Variable(forw_t.transpose(0, 1)).cuda()
            f_p = autograd.Variable(forw_p.transpose(0, 1)).cuda()
            b_f = autograd.Variable(back_t.transpose(0, 1)).cuda()
            b_p = autograd.Variable(back_p.transpose(0, 1)).cuda()
            w_f = autograd.Variable(word_t.transpose(0, 1)).cuda()
            mask_v = masks.transpose(0, 1).cuda()
        else:
            f_f = autograd.Variable(forw_t.transpose(0, 1))
            f_p = autograd.Variable(forw_p.transpose(0, 1))
            b_f = autograd.Variable(back_t.transpose(0, 1))
            b_p = autograd.Variable(back_p.transpose(0, 1))
            w_f = autograd.Variable(word_t.transpose(0, 1))
            mask_v = masks.transpose(0, 1)

        scores = ner_model(f_f, f_p, b_f, b_p, w_f)
        decoded = self.decoder.decode(scores.data, mask_v)

        return decoded
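
For orientation, here is a minimal sketch of how the new predictor API can be driven end to end. It assumes the network (```ner_model```) and its vocabularies (```f_map```, ```c_map```, ```l_map```) have already been restored from a checkpoint, as ```seq_wc.py``` does; the file names and the specific map keys used for padding are illustrative placeholders, not part of this commit.

```
import torch
from model.predictor import predict_wc
from model.utils import read_features

# ner_model, f_map, c_map and l_map are assumed to have been restored from a
# checkpoint already (see seq_wc.py / eval_wc.py for the actual loading code).
ner_model.eval()

with open('input.txt', 'r') as fin:
    features = read_features(fin.readlines())   # list of token lists, one per sentence

# The padding / start indices below are illustrative; use the entries stored
# alongside the checkpoint you actually load.
predictor = predict_wc(if_cuda=torch.cuda.is_available(),
                       f_map=f_map, c_map=c_map, l_map=l_map,
                       pad_word=f_map['<eof>'], pad_char=c_map['\n'],
                       pad_label=l_map['<pad>'], start_label=l_map['<start>'],
                       label_seq=True, batch_size=50, caseless=True)

with open('output.txt', 'w') as fout:
    predictor.output_batch(ner_model, features, fout)
```
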
27 changes: 21 additions & 6 deletions model/utils.py
@@ -239,6 +239,23 @@ def read_corpus(lines):

    return features, labels

def read_features(lines):
    """
    convert un-annotated corpus into features
    """
    features = list()
    tmp_fl = list()
    for line in lines:
        if not (line.isspace() or (len(line) > 10 and line[0:10] == '-DOCSTART-')):
            line = line.rstrip()
            tmp_fl.append(line)
        elif len(tmp_fl) > 0:
            features.append(tmp_fl)
            tmp_fl = list()
    if len(tmp_fl) > 0:
        features.append(tmp_fl)

    return features

def shrink_embedding(feature_map, word_dict, word_embedding, caseless):
"""
@@ -516,10 +533,9 @@ def construct_bucket_vb_wc(word_features, forw_features, fea_len, input_labels,

        padded_feature = f_f + [pad_char_feature] * (buckets_len[idx] - len(f_f)) # pad feature with <'\n'>, at least one

        padded_feature_len = f_l + [1] * (thresholds[idx] - len(f_l)) # pad feature length with <'\n'>, at least one
        padded_feature_len_cum = [tup for tup in itertools.accumulate(
            padded_feature_len)] # start from 0, but the first is ' ', so the position need not to be -1
        buckets[idx][0].append(padded_feature)#char
        padded_feature_len = f_l + [1] * (thresholds[idx] - len(f_l)) # pad feature length with <'\n'>, at least one
        padded_feature_len_cum = list(itertools.accumulate(padded_feature_len)) # start from 0, but the first is ' ', so the position need not to be -1
        buckets[idx][0].append(padded_feature) # char
        buckets[idx][1].append(padded_feature_len_cum)
        buckets[idx][2].append(padded_feature[::-1])
        buckets[idx][3].append([buckets_len[idx] - 1] + [buckets_len[idx] - 1 - tup for tup in padded_feature_len_cum[:-1]])
@@ -569,8 +585,7 @@ def construct_bucket_gd(input_features, input_labels, thresholds, pad_feature, p
        buckets[idx][0].append(feature + [pad_feature] * (thresholds[idx] - cur_len))
        buckets[idx][1].append(label[1:] + [pad_label] * (thresholds[idx] - cur_len))
        buckets[idx][2].append(label + [pad_label] * (thresholds[idx] - cur_len_1))
    bucket_dataset = [CRFDataset(torch.LongTensor(bucket[0]), torch.LongTensor(bucket[1]), torch.LongTensor(bucket[2]))
                      for bucket in buckets]
    bucket_dataset = [CRFDataset(torch.LongTensor(bucket[0]), torch.LongTensor(bucket[1]), torch.LongTensor(bucket[2])) for bucket in buckets]
    return bucket_dataset
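
As a quick illustration of the new ```read_features``` helper added above: blank lines and ```-DOCSTART-``` markers act as sentence boundaries, and every other line contributes one token. The tokens here are made up for the example.

```
from model.utils import read_features

lines = ["-DOCSTART-\n", "\n",
         "EU\n", "rejects\n", "German\n", "call\n", ".\n", "\n",
         "But\n", "China\n", "saw\n"]
print(read_features(lines))
# [['EU', 'rejects', 'German', 'call', '.'], ['But', 'China', 'saw']]
```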

