sentimentTagger.py
#! /usr/bin/python
"""
Implements a sentiment analyzer. The analyzer trains on annotated tweets.
A few different methods are implemented to identify the sentiment of a new
tweet.
1) naive Bayes
"""
from collections import defaultdict
#from decision import *
from random import *
from checkTags import checkListTags
import sys
#some global values
TRAINFILEA = 'trainA.txt'
TRAINFILEB = 'trainB.txt'
TOKENIZED = 'tokenizedTrainB.txt'
POLARA = 4 #index of the polarity label in task A
POLARB = 3 #index of the polarity label in task B
TWEETA = 5 #index of the tweet text in task A
TWEETB = 4 #index of the tweet text in task B
"""
builds a list of records, one per line, from the tab-separated data in fileName
"""
def extractData( fileName ):
    #split each tab-separated line of the data file into a list of fields
f = open( fileName, 'r')
#lines = filter(lambda x: "Not Available" not in x, f.readlines())
trainData = map(lambda x: x.strip('\n').split('\t'), f.readlines())
f.close()
return trainData
"""
returns a list of all the n-grams in the text
"""
def ngramList( text, n ):
ngrams = []
words = text.split()
for i in xrange(0,len(words) - n + 1):
ngrams.append(" ".join(words[i:i+n]))
return ngrams
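#illustrative example (hypothetical input):
#  ngramList("not a good movie", 2) -> ["not a", "a good", "good movie"]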
"""
returns a list of all the n-grams of sizes 1 through k in the text
"""
def allGrams( text, k):
ngrams = []
for i in xrange(1,k+1):
ngrams.extend(ngramList( text, i ))
return ngrams
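#illustrative example (hypothetical input):
#  allGrams("good movie", 2) -> ["good", "movie", "good movie"]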
"""
Returns a dict of per-sentiment counts for each feature, and a dict of total counts per sentiment, for task A
"""
def featureCountsA( data, n ):
featCounts = {}
senseTotals = {}
for line in data:
start = int(line[2])
end = int(line[3]) + 1
text = " ".join( line[TWEETA].split()[start:end] )
#for feat in ngramList( text, n ):
for feat in allGrams( text, n ): #+1 counts for words in range
if feat in featCounts:
featCounts[feat][line[POLARA]] = featCounts[feat].get(line[POLARA],0)+1
senseTotals[line[POLARA]] = senseTotals.get(line[POLARA], 0) + 1
else:
featCounts[feat] = { line[POLARA] : 1 }
senseTotals[line[POLARA]] = senseTotals.get(line[POLARA], 0) + 1
return featCounts, senseTotals
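#illustrative (hypothetical) shapes of the returned values:
#  featCounts  = { "love" : { "positive" : 12, "negative" : 1 }, ... }
#  senseTotals = { "positive" : 850, "negative" : 430, ... }
#the actual polarity strings depend on the annotations in the training file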
"""
Returns a dict of per-sentiment counts for each feature, and a dict of total counts per sentiment, for task B
"""
def featureCountsB( data, n ):
featCounts = {}
senseTotals = {}
for line in data:
#for feat in ngramList( line[TWEETB], n ):
for feat in allGrams( line[TWEETB], n ): #+1 counts for words in range
if feat in featCounts:
featCounts[feat][line[POLARB]] = featCounts[feat].get(line[POLARB],0)+1
senseTotals[line[POLARB]] = senseTotals.get(line[POLARB], 0) + 1
else:
featCounts[feat] = { line[POLARB] : 1 }
senseTotals[line[POLARB]] = senseTotals.get(line[POLARB], 0) + 1
return featCounts, senseTotals
"""
Returns the list of sentiment labels and a dict of probabilities: P(f_i | s) keyed
by (f_i, s) tuples, with the prior P(s) of each sentiment keyed by s alone
"""
def buildFeaturesProb( data, taskA, n ):
probDict = {}
if taskA == True:
featCounts, senseTotals = featureCountsA( data, n )
else:
featCounts, senseTotals = featureCountsB( data, n )
for feat in featCounts.keys():
for sentim in featCounts[feat].keys():
#for sentim in senseTotals.keys():
v = len(featCounts[feat].keys())
#print len(featCounts[feat].keys())
prob = ( featCounts[feat].get( sentim, 0) + 1 ) / \
float( senseTotals[sentim] + 1 )
probDict[(feat, sentim)] = prob
total = sum(senseTotals.values())
for sentim in senseTotals.keys():
probDict[sentim] = senseTotals[sentim]/float( total )
return senseTotals.keys(), probDict
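#smoothing used above (add-one on the counts, +1 in the denominator):
#  P(f | s) ~ ( count(f, s) + 1 ) / ( count(s) + 1 )
#  P(s)     =   count(s) / total count over all sentiments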
"""
Returns the best guess for the "correct" sense using the naive bayes algo
"""
def naiveProb( features, probDict, sentiments ):
probList = []
for sense in sentiments:
#p = 10000.0 * probDict[sense]
""
if probDict[sense] > .5:
p = 1.0
else:
p = 2.0 #*probDict[sense] #multiplying prob by P(s)
""
for feat in features:
p *= probDict.get( (feat, sense), .5)
probList.append( (p, sense) )
    probList.sort() #sorts ascending on the first tuple element, the score
    return probList[-1] #the highest-scoring (score, sentiment) pair
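#decision rule approximated above: score(s) = base(s) * product of P(f | s) over the
#features of the tweet, with unseen (feature, sentiment) pairs falling back to .5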
"""
Returns the best guess for each tweet segment
"""
def naiveBayes( train, test, n, task = True ):
    sentim, probDict = buildFeaturesProb( train, task, n ) #same call for task A and task B
tags = []
for line in test:
if task == True:
start = int(line[2])
end = int(line[3]) + 1
text = " ".join( line[TWEETA].split()[start:end] )
#tag = naiveProb ( ngramList(text, n), probDict, sentim )
tag = naiveProb( allGrams(text, n), probDict, sentim )
else:
#tag = naiveProb( ngramList(line[TWEETB], n), probDict, sentim )
tag = naiveProb( allGrams(line[TWEETB], n), probDict, sentim )
tags.append(tag)
return tags
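#each returned tag is a (score, sentiment) tuple as produced by naiveProb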
"""
splits the train data into 1/testFrac equal partitions, holds out partition n
as the test set, and trains on the rest
"""
def crossValid( testFrac, trainData, n = 1):
lines = len(trainData)
end = int(lines*testFrac)*n
start = int(lines*testFrac)*(n-1)
train = trainData[0:start]+trainData[end:lines]
test = trainData[start:end]
return train, test
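#illustrative example (hypothetical call):
#  crossValid(.2, data, 3) holds out the third fifth of data as the test set
#  and returns the remaining four fifths as the training set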
def main():
task = True
#task = False
#fileName = TOKENIZED
fileName = TRAINFILEA
if len(sys.argv) > 1:
task = sys.argv[1]
if task.lower() == "a":
task = True
fileName = TRAINFILEA
else:
task = False
fileName = TRAINFILEB
trainFile = extractData( fileName )
trainData, testData = crossValid(.2, trainFile, 3)
tags = naiveBayes( trainData, testData, 2, task ) #list of the guesses
checkListTags(tags, testData, task) #prints out accuracy and other stats
if __name__=='__main__':
main()
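#usage sketch (assuming trainA.txt / trainB.txt sit in the working directory):
#  python sentimentTagger.py a   #task A: polarity of the marked span in each tweet
#  python sentimentTagger.py b   #task B: polarity of the whole tweet
#with no argument the script defaults to task A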