text_generator.py
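"""
Preprocess emrQA-style relation QA pairs into parallel plain-text files.

Reads ../emrQG/relations.json, splits the clinical notes 80/10/10 into
train/dev/test, samples a fraction of the QA pairs from each split, and writes
whitespace-tokenized answers (*.src.pre), questions (*.tgt.pre), answer
entities (*.entity.pre), and source indices (*.idx.pre) under
../emrQG/relation_5/.
"""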
import json
import random

import scispacy
import spacy
from scispacy.umls_linking import UmlsEntityLinker

# Load the large scispaCy model and attach the UMLS entity linker
# (spaCy 2.x-style pipeline API). Only the tokenizer is used below, so the
# full pipeline is never run on the text.
nlp = spacy.load("en_core_sci_lg")
linker = UmlsEntityLinker()
nlp.add_pipe(linker)
tokenizer = nlp.Defaults.create_tokenizer(nlp)

# Load the emrQA relation data.
with open('../emrQG/relations.json') as json_file:  # Modify this line to change your input directory
    data = json.load(json_file)
# Process text to add whitespace between word tokens and punctuation tokens.
def process_text(text):
    temp = []
    doc = tokenizer(text)  # Use spaCy to do the tokenization
    temp.extend(tok.text for tok in doc)
    return " ".join(temp)
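# A minimal illustration (the input sentence is hypothetical and the exact
# splits depend on the spaCy tokenizer):
#   process_text("Patient denies chest pain, SOB.")
#   -> "Patient denies chest pain , SOB ."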
# Answer generator
def generator_answer(ls, index, out_dir):
    with open(out_dir, "w") as out:
        for num in ls:
            i, j, k, l = index[num]
            temp = process_text(data['data'][i]['paragraphs'][j]['qas'][k]['answers'][l]['text'].strip())
            out.write(temp + '\n')
# Question generator
def generator_question(ls, index, out_dir):
    with open(out_dir, "w") as out:
        for num in ls:
            i, j, k, _ = index[num]
            temp = process_text(data['data'][i]['paragraphs'][j]['qas'][k]['question'].strip())
            out.write(temp + '\n')
# Entity generator
def generator_entity(ls, index, out_dir):
    with open(out_dir, "w") as out:
        for num in ls:
            i, j, k, l = index[num]
            temp = process_text(data['data'][i]['paragraphs'][j]['qas'][k]['answers'][l]['entity'])
            if temp == '':
                temp = '<UNDEFINED>'
            out.write(temp + '\n')
# Question type generator (defined for completeness; not called in main() below)
def generator_question_type(ls, index, out_dir):
    with open(out_dir, "w") as out:
        for num in ls:
            i, j, k, _ = index[num]
            temp = process_text(data['data'][i]['paragraphs'][j]['qas'][k]['question_type'])
            out.write(temp + '\n')
# Index (idx) generator
def generator_idx(ls, index, out_dir):
    with open(out_dir, "w") as out:
        for num in ls:
            i, j, k, l = index[num]
            temp = str(i) + " " + str(j) + " " + str(k) + " " + str(l)
            out.write(temp + '\n')
# Enumerate every QA pair under the given clinical notes, optionally shuffle them,
# and keep the top "percent" of pairs. Returns the selected flat ids and an index
# mapping each flat id to its [i, j, k, l] position in data['data'].
def shuffler_selctor(ls, index_note, percent, shuffle=False):
    total = 0
    index = dict()
    for num in ls:
        i, j = index_note[num]
        for k in range(len(data['data'][i]['paragraphs'][j]['qas'])):
            for l in range(len(data['data'][i]['paragraphs'][j]['qas'][k]['answers'])):
                index[total] = [i, j, k, l]
                total += 1
    temp = list(range(total))
    random.seed(1)
    if shuffle:
        random.shuffle(temp)
    return (temp[:int(total * percent)], index)
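# For example, shuffler_selctor(ls_note_train, index_note, 0.05) keeps roughly 5%
# of the QA pairs found under the training notes and returns their flat ids plus
# a lookup from each id back to its [i, j, k, l] position in data['data'].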
# Main function
def main(percent=0.05):
    index_note = dict()
    dataset = 'relation_5'
    # Count and index the clinical notes (paragraphs) in the dataset.
    note_total = 0
    for i in range(len(data['data'])):
        for j in range(len(data['data'][i]['paragraphs'])):
            index_note[note_total] = [i, j]
            note_total += 1
    # Shuffle the notes and split them 80/10/10 into train/dev/test, so that
    # QA pairs from the same note end up in a single split.
    ls_note = list(range(note_total))
    random.seed(1)
    ls_note = random.sample(ls_note, len(ls_note))
    ls_note_train = ls_note[:int(note_total * 0.8)]
    ls_note_dev = ls_note[int(note_total * 0.8):int(note_total * 0.9)]
    ls_note_test = ls_note[int(note_total * 0.9):]
    ls_train, index_train = shuffler_selctor(ls_note_train, index_note, percent)
    ls_dev, index_dev = shuffler_selctor(ls_note_dev, index_note, percent)
    ls_test, index_test = shuffler_selctor(ls_note_test, index_note, percent)
    # Generate src files (answers)
    generator_answer(ls_train, index_train, out_dir="../emrQG/" + dataset + "/train.src.pre")
    generator_answer(ls_dev, index_dev, out_dir="../emrQG/" + dataset + "/dev.src.pre")
    generator_answer(ls_test, index_test, out_dir="../emrQG/" + dataset + "/test.src.pre")
    # Generate tgt files (questions)
    generator_question(ls_train, index_train, out_dir="../emrQG/" + dataset + "/train.tgt.pre")
    generator_question(ls_dev, index_dev, out_dir="../emrQG/" + dataset + "/dev.tgt.pre")
    generator_question(ls_test, index_test, out_dir="../emrQG/" + dataset + "/test.tgt.pre")
    # Generate entity files
    generator_entity(ls_train, index_train, out_dir="../emrQG/" + dataset + "/train.entity.pre")
    generator_entity(ls_dev, index_dev, out_dir="../emrQG/" + dataset + "/dev.entity.pre")
    generator_entity(ls_test, index_test, out_dir="../emrQG/" + dataset + "/test.entity.pre")
    # Generate idx files
    generator_idx(ls_train, index_train, out_dir="../emrQG/" + dataset + "/train.idx.pre")
    generator_idx(ls_dev, index_dev, out_dir="../emrQG/" + dataset + "/dev.idx.pre")
    generator_idx(ls_test, index_test, out_dir="../emrQG/" + dataset + "/test.idx.pre")


if __name__ == "__main__":
    main()
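# Example usage (assumes relations.json is in ../emrQG/ and the output directory
# ../emrQG/relation_5/ already exists -- the script does not create it):
#   python text_generator.py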