-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsrindicator.py
96 lines (82 loc) · 3.9 KB
/
srindicator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import xml.etree.ElementTree as ET
from spacy.tokens import Token
class SRIndicator:
    """
    One indicator entry parsed from an ``<SRIndicator>`` XML element.

    Sample XML record:
    <SRIndicator string="abate" gapType="none" type="l" verified="true">
        <Lexeme lemma="abate" pos="VB"/>
        <SemInfo category="affects" cue="" inverse="false" negated="false"/>
        <SemInfo category="disrupts" cue="" inverse="false" negated="false"/>
    </SRIndicator>

    Attributes (XML attribute values are stored as the raw strings found in
    the file, e.g. ``verified`` is ``"true"``, not ``True``):
        string: value of the ``string`` attribute.
        gap_type: value of the ``gapType`` attribute.
        type: value of the ``type`` attribute.
        verified: value of the ``verified`` attribute.
        lexeme: list of ``{'lemma': ..., 'pos': ...}`` dicts, in document order.
        lexeme_type: ``'single'`` (exactly one direct ``<Lexeme>`` child),
            ``'multiword'`` (several direct ``<Lexeme>`` children),
            ``'gapped'`` (lexemes taken from ``<GappedLexeme>/<Part>/<Lexeme>``),
            or ``None`` when the record has neither form.
        seminfo: list of ``SemInfo`` objects, one per ``<SemInfo>`` child.
    """

    def __init__(self, srindicator_xml):
        """
        Build the indicator from an ElementTree ``<SRIndicator>`` element.

        Raises:
            KeyError: if a required XML attribute is missing.
        """
        attributes = srindicator_xml.attrib
        self.string = attributes['string']
        self.gap_type = attributes['gapType']
        self.type = attributes['type']
        self.verified = attributes['verified']

        lexeme_elements = srindicator_xml.findall('Lexeme')
        if len(lexeme_elements) == 1:
            self.lexeme_type = 'single'
        elif lexeme_elements:
            self.lexeme_type = 'multiword'
        else:
            gapped_lexeme = srindicator_xml.find('GappedLexeme')
            if gapped_lexeme is not None:
                lexeme_elements = gapped_lexeme.findall('Part/Lexeme')
                self.lexeme_type = 'gapped'
            else:
                # Previously a blocking debug ``input()`` call that also left
                # ``lexeme_type`` undefined; report the record and carry on.
                import warnings
                warnings.warn(
                    'SRIndicator %r has no Lexeme or GappedLexeme element'
                    % self.string
                )
                self.lexeme_type = None

        self.lexeme = [
            {'lemma': element.attrib['lemma'], 'pos': element.attrib['pos']}
            for element in lexeme_elements
        ]

        # findall (not find): every <SemInfo> child becomes one SemInfo object.
        self.seminfo = [
            SemInfo(seminfo_xml)
            for seminfo_xml in srindicator_xml.findall('SemInfo')
        ]


class SemInfo:
    """
    Semantic information attached to an indicator.

    Sample XML record:
    <SemInfo category="affects" cue="" inverse="false" negated="false"/>

    All four attributes are stored as the raw strings found in the XML.
    """

    def __init__(self, seminfo_xml):
        """
        Build the object from an ElementTree ``<SemInfo>`` element.

        Raises:
            KeyError: if a required XML attribute is missing.
        """
        attributes = seminfo_xml.attrib
        self.category = attributes['category']
        self.cue = attributes['cue']
        self.inverse = attributes['inverse']
        self.negated = attributes['negated']
def parse_semrules_file(filename):
    """
    Parse a semantic-rules XML file into indicator objects and a lemma index.

    Args:
        filename: file name or file object accepted by ``ET.parse``.

    Returns:
        A ``(srindicators_list, srindicator_lemmas)`` tuple:
        ``srindicators_list`` holds one ``SRIndicator`` per ``<SRIndicator>``
        child of the root element, in document order;
        ``srindicator_lemmas`` maps lexeme type -> first lemma of the
        indicator -> list of positions of matching indicators in
        ``srindicators_list``.
    """
    root = ET.parse(filename).getroot()
    srindicators_list = []
    srindicator_lemmas = {}
    for position, srindicator_xml in enumerate(root.findall('SRIndicator')):
        srindicator = SRIndicator(srindicator_xml)
        # Always keep the indicator so list positions stay aligned with the
        # order of <SRIndicator> elements in the file.
        srindicators_list.append(srindicator)

        # An indicator without a recognised lexeme form cannot be looked up
        # by lemma; skip indexing it instead of failing on lexeme[0].
        lexeme_type = getattr(srindicator, 'lexeme_type', None)
        if lexeme_type is None or not srindicator.lexeme:
            continue

        first_lemma = srindicator.lexeme[0]['lemma']
        lemmas_of_type = srindicator_lemmas.setdefault(lexeme_type, {})
        lemmas_of_type.setdefault(first_lemma, []).append(position)
    return srindicators_list, srindicator_lemmas
def annotate_indicators(spacy_sentence, srindicators_list, srindicator_lemmas):
    """
    Flag the tokens of a sentence whose lemma is a single-word indicator.

    Args:
        spacy_sentence: iterable of spaCy tokens (each providing ``lemma_``,
            ``i`` and the ``_`` extension namespace).
        srindicators_list: indicator objects as returned by
            ``parse_semrules_file``; currently unused, kept for interface
            compatibility.
        srindicator_lemmas: lexeme type -> lemma -> list of indicator
            positions, as returned by ``parse_semrules_file``. Lexeme types
            that are absent are treated as having no indicators.

    Returns:
        List of ``token.i`` values of the tokens matched as single-word
        indicators, in sentence order. Each such token also gets
        ``token._.is_indicator = True``.

    Note:
        Lemmas of 'gapped' and 'multiword' indicators are looked up in the
        same priority order as before, but matching their remaining parts is
        not implemented yet, so they never produce a result.
    """
    indicators = []
    for lexeme_type in ('gapped', 'multiword', 'single'):
        lemmas_of_type = srindicator_lemmas.get(lexeme_type)
        if not lemmas_of_type:
            continue
        if lexeme_type != 'single':
            # TODO: match the following parts of gapped / multiword
            # indicators; the earlier placeholder loops had no effect.
            continue

        # Register the custom attribute once, defaulting to False so that
        # only tokens explicitly marked below are reported as indicators.
        Token.set_extension('is_indicator', default=False, force=True)
        for token in spacy_sentence:
            if token.lemma_ in lemmas_of_type:
                token._.is_indicator = True
                indicators.append(token.i)
    return indicators