-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathemr_preprocessor.py
48 lines (38 loc) · 1.86 KB
/
emr_preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from typing import List, Union, Dict, Tuple
from finding_extractor import FindingExtractor
class EMRPreprocessor(object):
    """Turn extracted findings into label-prefixed text and per-record states.

    The class wraps a finding extractor object. From that object it uses
    ``extract(text_l, mode)`` and ``recognizer.label2token`` (a mapping from
    a label to the token string emitted in front of each term).
    """

    def __init__(self, finding_extractor: "FindingExtractor"):
        """Store the extractor used by :meth:`preprocess`.

        Args:
            finding_extractor: Object providing ``extract(text_l, mode)`` and
                ``recognizer.label2token``.
        """
        self.finding_extractor = finding_extractor

    def preprocess(self, text_l: List[str], mode: str) -> List[str]:
        """Extract findings from each text and render them as labelled text.

        Args:
            text_l: Input texts, forwarded unchanged to the extractor.
            mode: Forwarded unchanged to ``finding_extractor.extract``; its
                accepted values are defined by the extractor, not here.

        Returns:
            One string per entry returned by the extractor, in the format
            produced by :meth:`add_label_before_every_term`.
        """
        terms_l, pols_l = self.finding_extractor.extract(text_l, mode)
        return self.add_label_before_every_term(terms_l, pols_l)

    def add_label_before_every_term(
        self, terms_l: List[List[str]], labels_l: List[List[int]]
    ) -> List[str]:
        """Render each record as ``"<token> <term> <token> <term> ..."``.

        Args:
            terms_l: One list of terms per record.
            labels_l: One list of labels per record, parallel to ``terms_l``.

        Returns:
            One space-joined string per record; a record without terms
            yields an empty string.

        Raises:
            AssertionError: If the outer lists, or any pair of inner lists,
                differ in length (not checked when Python runs with ``-O``).
            KeyError: If a label is missing from ``label2token`` (when that
                mapping is a plain ``dict``).
        """
        assert len(terms_l) == len(labels_l), \
            "terms_l and labels_l must have the same length"
        # Hoisted: the mapping is the same for every record and every term.
        label2token = self.finding_extractor.recognizer.label2token
        text_l = []
        for terms, labels in zip(terms_l, labels_l):
            assert len(terms) == len(labels), \
                "each terms list must match its labels list in length"
            # The label token comes first, immediately followed by its term.
            pieces = [
                piece
                for term, label in zip(terms, labels)
                for piece in (label2token[label], term)
            ]
            text_l.append(' '.join(pieces))
        return text_l

    @staticmethod
    def build_patient_states(
        terms_l: List[List[str]],
        pols_l: List[List[int]],
        return_type: str = "dict",
    ) -> Union[List[Dict[Tuple[str, int], int]], List[List[Tuple[str, int]]]]:
        """Build one state per record from parallel term / polarity lists.

        Args:
            terms_l: One list of terms per record.
            pols_l: One list of polarity labels per record, parallel to
                ``terms_l``.
            return_type: ``"dict"`` to get, per record, a dict mapping each
                ``(term, pol)`` pair to its number of occurrences;
                ``"tuple"`` to get, per record, the list of ``(term, pol)``
                pairs in input order (duplicates kept).

        Returns:
            A list with one state per record, shaped as selected by
            ``return_type``.

        Raises:
            AssertionError: If the outer lists, or any pair of inner lists,
                differ in length (not checked when Python runs with ``-O``).
            ValueError: If ``return_type`` is neither ``"dict"`` nor
                ``"tuple"``.
        """
        assert len(terms_l) == len(pols_l), \
            "terms_l and pols_l must have the same length"
        if return_type not in ("dict", "tuple"):
            raise ValueError("return_type should be 'dict' or 'tuple'")

        states = []
        for terms, pols in zip(terms_l, pols_l):
            assert len(terms) == len(pols), \
                "each terms list must match its pols list in length"
            pairs = list(zip(terms, pols))
            if return_type == "dict":
                # Count occurrences; keys keep first-seen order.
                counts = {}
                for pair in pairs:
                    counts[pair] = counts.get(pair, 0) + 1
                states.append(counts)
            else:
                states.append(pairs)
        return states