Commit

predict for raw text
trieuhl committed Aug 12, 2020
1 parent 005976a commit a9a6c5b
Showing 10 changed files with 468 additions and 105 deletions.
27 changes: 25 additions & 2 deletions README.md
@@ -30,7 +30,7 @@ sh download_corpora.sh
2. Preprocess data
- Tokenize texts and prepare data for prediction
```bash
sh preprocess.sh
sh preprocess.sh bionlp
```

3. Download models
@@ -46,7 +46,7 @@ sh download_model.sh
sh generate-config.sh [gpu]
```

### Predict
### Predict (with the BioNLP shared task data sets)

1. For development and test sets (given gold entities)
- CG task: [task] = cg
@@ -80,6 +80,29 @@ sh run.sh [task] eval test gold
- [Our trained models](https://b2share.eudat.eu/records/80d2de0c57d64419b722dc1afa375f28)
- [Our scores](https://b2share.eudat.eu/api/files/3cf6c1f4-5eed-4ee3-99c5-d99f5f011be3/scores.tar.gz)

### Predict (with raw text)

1. Prepare raw text input in the following path
```bash
data/raw-text/[task]/PMID-*.txt
```
- E.g., [task] = cg: the output is predicted with our trained cg model (see the sketch below)
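- For example, a minimal sketch (hypothetical PMID and file content) of placing one raw-text input file for the cg task:
```python
# Hypothetical example: create a single raw-text input file for the cg task.
from pathlib import Path

raw_dir = Path("data/raw-text/cg")
raw_dir.mkdir(parents=True, exist_ok=True)
(raw_dir / "PMID-12345678.txt").write_text(
    "BRCA1 positively regulates apoptosis in breast cancer cells.\n"
)
```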

2. Preprocess raw text

- Tokenize raw text and prepare data for prediction
```bash
sh preprocess.sh raw
```

- The processed input is stored in the following path
```bash
data/processed-raw-text/[task]/
```

3. Predict


## Acknowledgements
This work is based on results obtained from a project commissioned by the New Energy and Industrial Technology Development Organization (NEDO).
This work is also supported by PRISM (Public/Private R&D Investment Strategic Expansion PrograM).
2 changes: 2 additions & 0 deletions configs/default.yaml
@@ -13,6 +13,8 @@ test_data: ..
# eval
ev_eval_script_path: eval/scripts/eval-ev-cg.py
ev_eval_entities: []
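# raw-text prediction flags (meanings inferred from this commit):
# - raw_text: presumably True when the input is raw text with no gold annotations
#   (see "Predict (with raw text)" in README.md)
# - ner_predict_all: when True, write_ev_2file in eval/evalEV.py also writes the
#   predicted entities to .a1 files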
raw_text: False
ner_predict_all: False

# output
result_dir: ..
100 changes: 85 additions & 15 deletions eval/evalEV.py
@@ -292,25 +292,90 @@ def convert_evid_to_number(str_evid):
return int(evid[0] + evid[1])


def mapping_entity_id(en_preds_):
eid = 1
enid_mapping = collections.OrderedDict()
en_preds_out_ = []

# create mapping for entity id first
for en_pred in en_preds_:

# id
en_id = en_pred[0]

if en_id.startswith('TR'):
continue

elif en_id.startswith('T'):
enid_mapping[en_id] = 'T' + str(eid)
eid += 1
en_preds_out_.append(en_pred)

# create mapping for trigger id
for en_pred in en_preds_:
# id
en_id = en_pred[0]

if en_id.startswith('TR'):
enid_mapping[en_id] = 'T' + str(eid)
eid += 1
en_preds_out_.append(en_pred)

return enid_mapping, en_preds_out_
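# Usage sketch (hypothetical tuples; each prediction follows the
# (id, type, (start, end), text) layout that write_ev_2file consumes below):
#
#   preds = [('T1', 'gene', (0, 5), 'BRCA1'),
#            ('TR3', 'pos_reg', (10, 19), 'activates'),
#            ('T5', 'cell', (25, 29), 'cell')]
#   enid_mapping, en_preds_out_ = mapping_entity_id(preds)
#   # enid_mapping  -> OrderedDict([('T1', 'T1'), ('T5', 'T2'), ('TR3', 'T3')])
#   # en_preds_out_ -> the two entities first, then the trigger
#
# Entity ids are renumbered first and keep the 'T' prefix; trigger ids ('TR...')
# are then folded into the same 'T' sequence.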


# write events to file
def write_ev_2file(pred_output, result_dir, params):
dir2wr = result_dir + 'ev-last/ev-ann/'
rev_type_map = params['mappings']['rev_type_map']

# entity id mapping
feid_mapping = collections.OrderedDict()

if not os.path.exists(dir2wr):
os.makedirs(dir2wr)
else:
os.system('rm ' + dir2wr + '*.a2')
os.system('rm ' + dir2wr + '*.a1')

# write events and triggers (and entities, if both entities and triggers are predicted)
for fid, preds in pred_output.items():
triggers = preds[0]
en_preds_ = preds[0]
events = preds[1]

enid_mapping, en_preds_out_ = mapping_entity_id(en_preds_)
# store for this document
feid_mapping[fid] = enid_mapping

# write a1 file for entities if both entities and triggers are predicted
if params['ner_predict_all']:
with open(dir2wr + fid + '.a1', 'w') as o1file:
for e_pred in en_preds_:
e_id = e_pred[0]

if e_id.startswith('TR'):
continue

# only write entity to a1
elif e_id.startswith('T'):
e_id = enid_mapping[e_id]

output = ''.join(
[e_id, '\t', rev_type_map[e_pred[1]], ' ', str(e_pred[2][0]), ' ', str(e_pred[2][1]), '\t',
e_pred[3], '\n'])
o1file.write(output)

# write event and trigger to a2
with open(dir2wr + fid + '.a2', 'w') as o2file:

for trigger in triggers:
o2file.write(trigger[0].replace('TR', 'T') + '\t' + rev_type_map[trigger[1]] + ' ' +
str(trigger[2][0]) + ' ' + str(trigger[2][1]) + '\t' + trigger[3] + '\n')
for e_pred in en_preds_out_:
e_id = e_pred[0]
e_id = enid_mapping[e_id]

output = ''.join(
[e_id, '\t', rev_type_map[e_pred[1]], ' ', str(e_pred[2][0]), ' ', str(e_pred[2][1]), '\t',
e_pred[3], '\n'])
o2file.write(output)

# count event id
f_evid = 0
@@ -334,7 +399,8 @@ def write_ev_2file(pred_output, result_dir, params):
evid_out = f_evid
f_evid_map[evid] = evid_out

idTR = event_[1][0].replace('TR', 'T')
trid = event_[1][0]
trid = enid_mapping[trid]
typeEV = rev_type_map[event_[1][1]]
args_data = event_[2]
mod_pred = event_[3]
@@ -351,27 +417,30 @@ def write_ev_2file(pred_output, result_dir, params):
nest_evid = convert_evid_to_number(argIdE)
if nest_evid in f_evid_map:
nest_evid_out = f_evid_map[nest_evid]
idT = 'E' + str(nest_evid_out)
eid = 'E' + str(nest_evid_out)
else:
print('ERROR: NESTED EVENT BUT MISSING EVENT ARGUMENT.')

# entity argument
else:
a2data = arg_[1]
idT = a2data[0].replace('TR', 'T')
eid = a2data[0]
eid = enid_mapping[eid]

if len(args_output) > 0:
args_output += ' '

args_output += typeR + ':' + idT
args_output += typeR + ':' + eid

# if it has arguments
if len(args_output) > 0:
o2file.write('E' + str(evid_out) + '\t' + typeEV + ':' + idTR + ' ' + args_output + '\n')
output = ''.join(['E', str(evid_out), '\t', typeEV, ':', trid, ' ', args_output, '\n'])
o2file.write(output)

# no argument
else:
o2file.write('E' + str(evid_out) + '\t' + typeEV + ':' + idTR + '\n')
output = ''.join(['E', str(evid_out), '\t', typeEV, ':', trid, '\n'])
o2file.write(output)

# check and store modality
if mod_pred > 1:
@@ -383,14 +452,15 @@ def write_ev_2file(pred_output, result_dir, params):
for mod_id, mod_data in enumerate(mod_list):
mod_type = mod_data[0]
evid_out = mod_data[1]
o2file.write('M' + str(mod_id + 1) + '\t' + mod_type + ' ' + 'E' + str(evid_out) + '\n')
output = ''.join(['M', str(mod_id + 1), '\t', mod_type, ' ', 'E', str(evid_out), '\n'])
o2file.write(output)

return
return feid_mapping
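# Output sketch (hypothetical values; \t marks a tab). write_ev_2file emits
# brat-style standoff files per document:
#   <fid>.a1 (only when ner_predict_all): predicted entities, e.g.
#       T1\tGene_or_gene_product 0 5\tBRCA1
#   <fid>.a2: triggers/entities renumbered via enid_mapping, then events and
#   modality, e.g.
#       T3\tPositive_regulation 10 19\tactivates
#       E1\tPositive_regulation:T3 Theme:T1
#       M1\tSpeculation E1
# The returned feid_mapping keeps each document's id mapping.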


# generate event output and evaluation
def evaluate_ev(fids, all_ent_preds, all_words, all_offsets, all_span_terms, all_span_indices, all_sub_to_words,
all_ev_preds, params, result_dir):
def write_events(fids, all_ent_preds, all_words, all_offsets, all_span_terms, all_span_indices, all_sub_to_words,
all_ev_preds, params, result_dir):
# generate predicted entities
pred_ents = generate_entities(fids=fids,
all_e_preds=all_ent_preds,
@@ -410,6 +480,6 @@ def evaluate_ev(fids, all_ent_preds, all_words, all_offsets, all_span_terms, all
preds_output = generate_ev_output(pred_ents, pred_evs, params)

# write output to file
_ = write_ev_2file(preds_output, result_dir, params)
write_ev_2file(preds_output, result_dir, params)

return