-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfrom_csv_to_lmf.py
executable file
·118 lines (101 loc) · 3.53 KB
/
from_csv_to_lmf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python
import sys
from lxml import etree
# Single-letter part-of-speech tags and the names map_pos expands them to.
_POS_NAMES = {'n': 'noun', 'a': 'adj', 'r': 'adv', 'v': 'verb'}


def map_pos(old_pos):
    """Expand a single-letter part-of-speech tag to its long name.

    'n', 'a', 'r' and 'v' become 'noun', 'adj', 'adv' and 'verb'.
    Any other value is returned unchanged.
    """
    return _POS_NAMES.get(old_pos, old_pos)
def _print_usage(program, reason):
    """Write ``reason`` followed by the command-line help to stderr."""
    help_lines = [
        reason,
        'Example usage: cat my_lexicon.utf8.csv | ' + program + ' <language> [-verbose]',
        '\t<language> value written to the "language" attribute of the Lexicon element',
        '\t-verbose option to generate the lexicon with all the lemmas for one synset',
        '\tInput file format: synset;pos;polarity;value;lemma1,lemma2,lemma3;freq OR',
        '\tInput file format: synset;pos;polarity;value;lemma1,lemma2,lemma3;freq;sentiment_modifier (intensifier,shifter...)',
        '\nIn case you ran the propagate_wn.py with the output lemmas option the CSV file',
        'can contain 2 types of lines:',
        '\tInput file format: unknown;pos;polarity;value;lemma;freq OR',
        '\tInput file format: unknown;pos;polarity;value;lemma;freq;modifier',
        '',
        '',
    ]
    sys.stderr.write('\n'.join(help_lines) + '\n')


def _parse_args(arguments):
    """Return ``(lang, verbose)`` parsed from the arguments after the program name.

    The first argument that is not ``-verbose`` is the language; ``-verbose``
    may appear anywhere. Any further argument is reported on stderr and
    ignored. ``lang`` is None when no language was given.
    """
    lang = None
    verbose = False
    for argument in arguments:
        if argument == '-verbose':
            verbose = True
        elif lang is None:
            lang = argument
        else:
            sys.stderr.write('Option ' + argument + ' not recognized.\n')
    return lang, verbose


def _parse_line(line):
    """Split one CSV line into its fields.

    Returns ``(synset, pos, polarity, value, lemmas, freq, modifier)``, or
    None when the line does not have 6 or 7 ';'-separated fields.
    ``value`` is a float, or -1 when the field is not a number; ``lemmas`` is
    the ','-separated list of the fifth field; ``modifier`` is None for
    6-field lines.
    """
    # Python 2 stdin and the Python 3 binary buffer both yield bytes.
    if isinstance(line, bytes):
        line = line.decode('utf-8')
    tokens = line.strip().split(';')
    if len(tokens) not in (6, 7):
        return None
    try:
        value = float(tokens[3])
    except ValueError:
        value = -1
    modifier = tokens[6] if len(tokens) == 7 else None
    return (tokens[0], tokens[1], tokens[2], value,
            tokens[4].split(','), tokens[5], modifier)


def _build_entry(entry_id, lemma, synset, pos, polarity, score, method, modifier):
    """Build the LexicalEntry element (Lemma + Sense) for one lemma."""
    attributes = {'id': 'id_' + str(entry_id), 'partOfSpeech': map_pos(pos)}
    if modifier is not None:
        attributes['type'] = modifier
    entry = etree.Element('LexicalEntry', attrib=attributes)
    etree.SubElement(entry, 'Lemma', attrib={'writtenForm': lemma})

    sense = etree.SubElement(entry, 'Sense')
    etree.SubElement(sense, 'Confidence', attrib={'score': score, 'method': method})
    refs = etree.SubElement(sense, 'MonolingualExternalRefs')
    if synset != 'unknown':
        # synset is already text; wrapping it in str() would fail on
        # non-ASCII identifiers under Python 2.
        etree.SubElement(refs, 'MonolingualExternalRef',
                         attrib={'externalReference': synset})
    if modifier is None:
        etree.SubElement(sense, 'Sentiment', attrib={'polarity': polarity})
    else:
        # Entries carrying a modifier get an empty Sentiment element.
        etree.SubElement(sense, 'Sentiment')
    etree.SubElement(sense, 'Domain')
    return entry


def _main(argv):
    """Convert the CSV lexicon on stdin to an XML lexicon on stdout.

    ``argv`` is the full argument vector (program name first). Returns the
    process exit status: 0 on success, -1 when stdin is a terminal or no
    language argument was given.
    """
    if sys.stdin.isatty():
        _print_usage(argv[0], 'Input stream required.')
        return -1
    lang, verbose = _parse_args(argv[1:])
    if lang is None:
        _print_usage(argv[0], 'Language argument required.')
        return -1

    root = etree.Element('LexicalResource')
    global_info = etree.SubElement(root, 'GlobalInformation')
    global_info.set('label', 'Created with the standard propagation algorithm')
    lexicon = etree.SubElement(root, 'Lexicon')
    lexicon.set('languageCoding', 'UTF-8')
    lexicon.set('label', 'sentiment')
    lexicon.set('language', lang)

    entry_count = 0
    # Read bytes where a binary buffer exists (Python 3) so that decoding is
    # always done explicitly as UTF-8 in _parse_line.
    for line in getattr(sys.stdin, 'buffer', sys.stdin):
        record = _parse_line(line)
        if record is None:
            sys.stderr.write('Skipped line because has not the correct format\n')
            continue
        synset, pos, polarity, value, lemmas, freq, modifier = record

        # Only lines without a modifier and with freq == '-1' keep their own
        # value, labelled 'automatic'; every other line gets score '1' and
        # method 'manual'.
        automatic = modifier is None and freq == '-1'
        method = 'automatic' if automatic else 'manual'
        score = str(value) if automatic else '1'

        # Without -verbose only the first lemma of the line is emitted.
        for lemma in (lemmas if verbose else lemmas[:1]):
            lexicon.append(_build_entry(entry_count, lemma, synset, pos,
                                        polarity, score, method, modifier))
            entry_count += 1

    # The serialized tree is bytes, so write to the binary buffer when
    # sys.stdout has one.
    output = getattr(sys.stdout, 'buffer', sys.stdout)
    etree.ElementTree(root).write(output, encoding='UTF-8',
                                  pretty_print=True, xml_declaration=True)
    return 0


if __name__ == '__main__':
    sys.exit(_main(sys.argv))