Commit

predict for raw text
trieuhl committed Aug 12, 2020
1 parent 005976a commit a9a6c5b
Showing 10 changed files with 468 additions and 105 deletions.
27 changes: 25 additions & 2 deletions README.md
@@ -30,7 +30,7 @@ sh download_corpora.sh
2. Preprocess data
- Tokenize texts and prepare data for prediction
```bash
sh preprocess.sh
sh preprocess.sh bionlp
```

3. Download models
@@ -46,7 +46,7 @@ sh download_model.sh
sh generate-config.sh [gpu]
```

### Predict
### Predict (with the BioNLP shared task data sets)

1. For development and test sets (given gold entities)
- CG task: [task] = cg
@@ -80,6 +80,29 @@ sh run.sh [task] eval test gold
- [Our trained models](https://b2share.eudat.eu/records/80d2de0c57d64419b722dc1afa375f28)
- [Our scores](https://b2share.eudat.eu/api/files/3cf6c1f4-5eed-4ee3-99c5-d99f5f011be3/scores.tar.gz)

### Predict (with raw text)

1. Prepare raw text input in the following path
```bash
data/raw-text/[task]/PMID-*.txt
```
- E.g., [task] = cg: the output is predicted with our trained cg model (see the sketch below)
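- For example, a minimal sketch (hypothetical PMID and file content) of placing one raw-text input file for the cg task:
```python
# Hypothetical example: create a single raw-text input file for the cg task.
from pathlib import Path

raw_dir = Path("data/raw-text/cg")
raw_dir.mkdir(parents=True, exist_ok=True)
(raw_dir / "PMID-12345678.txt").write_text(
    "BRCA1 positively regulates apoptosis in breast cancer cells.\n"
)
```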

2. Preprocess raw text

- Tokenize raw text and prepare data for prediction
```bash
sh preprocess.sh raw
```

- The processed input is stored in the following path
```bash
data/processed-raw-text/[task]/
```

3. Predict


## Acknowledgements
This work is based on results obtained from a project commissioned by the New Energy and Industrial Technology Development Organization (NEDO).
This work is also supported by PRISM (Public/Private R&D Investment Strategic Expansion PrograM).
2 changes: 2 additions & 0 deletions configs/default.yaml
@@ -13,6 +13,8 @@ test_data: ..
# eval
ev_eval_script_path: eval/scripts/eval-ev-cg.py
ev_eval_entities: []
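# raw-text prediction flags (meanings inferred from this commit):
# - raw_text: presumably True when the input is raw text with no gold annotations
#   (see "Predict (with raw text)" in README.md)
# - ner_predict_all: when True, write_ev_2file in eval/evalEV.py also writes the
#   predicted entities to .a1 files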
raw_text: False
ner_predict_all: False

# output
result_dir: ..
100 changes: 85 additions & 15 deletions eval/evalEV.py
@@ -292,25 +292,90 @@ def convert_evid_to_number(str_evid):
return int(evid[0] + evid[1])


def mapping_entity_id(en_preds_):
eid = 1
enid_mapping = collections.OrderedDict()
en_preds_out_ = []

# create mapping for entity id first
for en_pred in en_preds_:

# id
en_id = en_pred[0]

if en_id.startswith('TR'):
continue

elif en_id.startswith('T'):
enid_mapping[en_id] = 'T' + str(eid)
eid += 1
en_preds_out_.append(en_pred)

# create mapping for trigger id
for en_pred in en_preds_:
# id
en_id = en_pred[0]

if en_id.startswith('TR'):
enid_mapping[en_id] = 'T' + str(eid)
eid += 1
en_preds_out_.append(en_pred)

return enid_mapping, en_preds_out_
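# Usage sketch (hypothetical tuples; each prediction follows the
# (id, type, (start, end), text) layout that write_ev_2file consumes below):
#
#   preds = [('T1', 'gene', (0, 5), 'BRCA1'),
#            ('TR3', 'pos_reg', (10, 19), 'activates'),
#            ('T5', 'cell', (25, 29), 'cell')]
#   enid_mapping, en_preds_out_ = mapping_entity_id(preds)
#   # enid_mapping  -> OrderedDict([('T1', 'T1'), ('T5', 'T2'), ('TR3', 'T3')])
#   # en_preds_out_ -> the two entities first, then the trigger
#
# Entity ids are renumbered first and keep the 'T' prefix; trigger ids ('TR...')
# are then folded into the same 'T' sequence.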


# write events to file
def write_ev_2file(pred_output, result_dir, params):
dir2wr = result_dir + 'ev-last/ev-ann/'
rev_type_map = params['mappings']['rev_type_map']

# entity id mapping
feid_mapping = collections.OrderedDict()

if not os.path.exists(dir2wr):
os.makedirs(dir2wr)
else:
os.system('rm ' + dir2wr + '*.a2')
os.system('rm ' + dir2wr + '*.a1')

# write events and triggers (and entities, if both entities and triggers are predicted)
for fid, preds in pred_output.items():
triggers = preds[0]
en_preds_ = preds[0]
events = preds[1]

enid_mapping, en_preds_out_ = mapping_entity_id(en_preds_)
# store for this document
feid_mapping[fid] = enid_mapping

# write a1 file for entities if both entities and triggers are predicted
if params['ner_predict_all']:
with open(dir2wr + fid + '.a1', 'w') as o1file:
for e_pred in en_preds_:
e_id = e_pred[0]

if e_id.startswith('TR'):
continue

# only write entity to a1
elif e_id.startswith('T'):
e_id = enid_mapping[e_id]

output = ''.join(
[e_id, '\t', rev_type_map[e_pred[1]], ' ', str(e_pred[2][0]), ' ', str(e_pred[2][1]), '\t',
e_pred[3], '\n'])
o1file.write(output)

# write event and trigger to a2
with open(dir2wr + fid + '.a2', 'w') as o2file:

for trigger in triggers:
o2file.write(trigger[0].replace('TR', 'T') + '\t' + rev_type_map[trigger[1]] + ' ' +
str(trigger[2][0]) + ' ' + str(trigger[2][1]) + '\t' + trigger[3] + '\n')
for e_pred in en_preds_out_:
e_id = e_pred[0]
e_id = enid_mapping[e_id]

output = ''.join(
[e_id, '\t', rev_type_map[e_pred[1]], ' ', str(e_pred[2][0]), ' ', str(e_pred[2][1]), '\t',
e_pred[3], '\n'])
o2file.write(output)

# count event id
f_evid = 0
@@ -334,7 +399,8 @@ def write_ev_2file(pred_output, result_dir, params):
evid_out = f_evid
f_evid_map[evid] = evid_out

idTR = event_[1][0].replace('TR', 'T')
trid = event_[1][0]
trid = enid_mapping[trid]
typeEV = rev_type_map[event_[1][1]]
args_data = event_[2]
mod_pred = event_[3]
@@ -351,27 +417,30 @@ def write_ev_2file(pred_output, result_dir, params):
nest_evid = convert_evid_to_number(argIdE)
if nest_evid in f_evid_map:
nest_evid_out = f_evid_map[nest_evid]
idT = 'E' + str(nest_evid_out)
eid = 'E' + str(nest_evid_out)
else:
print('ERROR: NESTED EVENT BUT MISSING EVENT ARGUMENT.')

# entity argument
else:
a2data = arg_[1]
idT = a2data[0].replace('TR', 'T')
eid = a2data[0]
eid = enid_mapping[eid]

if len(args_output) > 0:
args_output += ' '

args_output += typeR + ':' + idT
args_output += typeR + ':' + eid

# if it has arguments
if len(args_output) > 0:
o2file.write('E' + str(evid_out) + '\t' + typeEV + ':' + idTR + ' ' + args_output + '\n')
output = ''.join(['E', str(evid_out), '\t', typeEV, ':', trid, ' ', args_output, '\n'])
o2file.write(output)

# no argument
else:
o2file.write('E' + str(evid_out) + '\t' + typeEV + ':' + idTR + '\n')
output = ''.join(['E', str(evid_out), '\t', typeEV, ':', trid, '\n'])
o2file.write(output)

# check and store modality
if mod_pred > 1:
@@ -383,14 +452,15 @@ def write_ev_2file(pred_output, result_dir, params):
for mod_id, mod_data in enumerate(mod_list):
mod_type = mod_data[0]
evid_out = mod_data[1]
o2file.write('M' + str(mod_id + 1) + '\t' + mod_type + ' ' + 'E' + str(evid_out) + '\n')
output = ''.join(['M', str(mod_id + 1), '\t', mod_type, ' ', 'E', str(evid_out), '\n'])
o2file.write(output)

return
return feid_mapping
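# Output sketch (hypothetical values; \t marks a tab). write_ev_2file emits
# brat-style standoff files per document:
#   <fid>.a1 (only when ner_predict_all): predicted entities, e.g.
#       T1\tGene_or_gene_product 0 5\tBRCA1
#   <fid>.a2: triggers/entities renumbered via enid_mapping, then events and
#   modality, e.g.
#       T3\tPositive_regulation 10 19\tactivates
#       E1\tPositive_regulation:T3 Theme:T1
#       M1\tSpeculation E1
# The returned feid_mapping keeps each document's id mapping.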


# generate event output and evaluation
def evaluate_ev(fids, all_ent_preds, all_words, all_offsets, all_span_terms, all_span_indices, all_sub_to_words,
all_ev_preds, params, result_dir):
def write_events(fids, all_ent_preds, all_words, all_offsets, all_span_terms, all_span_indices, all_sub_to_words,
all_ev_preds, params, result_dir):
# generate predicted entities
pred_ents = generate_entities(fids=fids,
all_e_preds=all_ent_preds,
@@ -410,6 +480,6 @@ def evaluate_ev(fids, all_ent_preds, all_words, all_offsets, all_span_terms, all
preds_output = generate_ev_output(pred_ents, pred_evs, params)

# write output to file
_ = write_ev_2file(preds_output, result_dir, params)
write_ev_2file(preds_output, result_dir, params)

return