-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathkey_feature.py
executable file
·157 lines (130 loc) · 5.89 KB
/
key_feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import numpy as np
import datetime
import statsmodels.api as sm
def get_features(file_name, app):
with open(file_name, 'r', encoding='utf-8') as file:
for line in file.readlines():
line = line.strip().split(',')
if line[0] == app:
return line[1:]
def get_annotated_reviews(file_name):
with open(file_name, 'r', encoding='utf-8') as file:
return file.readlines()
class KeyFeature:
def __init__(self, features, reviews, day=180, time_unit=1):
self.feature_index = 0
self.review_content_index = 1
self.date_index = 2
self.rating_index = 3
self.label_index = 4
self.split_symbol = '-*-'
self.date_str = '%Y-%m-%d %H:%M'
self.features = features # a list of features
self.reviews = reviews # a list of annotated feature_review pairs
self.day = day
self.time_unit = time_unit
# process annotated review file
self.all_feature_reviews = self.get_feature_review_list(self.features, self.reviews)
def get_feature_review_list(self, features, reviews):
"""
Determine the set of features that is mentioned in each review. The result will be used as the
input of key feature identification.
:return: a list of data instances [0, review_text, review_date, review_rating, feature_set]
"""
# assign an unique ID for each feature (start from 0)
feature_dict = {}
for i, feature in enumerate(features):
feature_dict[feature] = i
# determine the set of features that is mentioned in each review
final_list = []
for i in range(0, len(reviews), len(features)):
review = reviews[i].strip().split(self.split_symbol)
feature_label = []
for j in range(len(features)):
review = reviews[i + j].strip().split(self.split_symbol)
if review[self.label_index] == '1': # this is a match
f = str(feature_dict[review[self.feature_index]])
feature_label.append(f)
feature_label_str = ','.join(feature_label) if len(feature_label) > 0 else '-1'
final_list.append([0, review[self.review_content_index], review[self.date_index],
review[self.rating_index], feature_label_str])
return final_list
def key_feature_identification(self, current_date):
"""
Identify key features based on both positive and negative reviews, and return the combination of their results.
The reviews posted in the previous [self.day] days will be used to perform analysis.
"""
# filter reviews based on date
bad_reviews = []
good_reviews = []
start_date = current_date - datetime.timedelta(days=self.day)
for review in self.all_feature_reviews:
date = review[self.date_index]
date = datetime.datetime.strptime(date, self.date_str)
if start_date <= date < current_date:
if review[self.rating_index] in ['1', '2']: # negative reviews
bad_reviews.append(review)
elif review[self.rating_index] in ['4', '5']: # positive reviews
good_reviews.append(review)
positive_key_features, num1, num_sum1 = self.get_key_feature(current_date, good_reviews)
negative_key_features, num2, num_sum2 = self.get_key_feature(current_date, bad_reviews)
# combine key features of positive and negative reviews
positive_key_features.sort(reverse=True)
negative_key_features.sort(reverse=True)
key_feature_dict = {}
for x in positive_key_features + negative_key_features:
if x[1] in key_feature_dict:
key_feature_dict[x[1]] += float(x[0])
else:
key_feature_dict[x[1]] = float(x[0])
all_key_features = []
for key in key_feature_dict:
all_key_features.append([key_feature_dict[key], key])
all_key_features.sort(reverse=True)
return positive_key_features, negative_key_features, all_key_features
def get_key_feature(self, current_date, feature_reviews_used):
"""
Identify the set of key features based on the specific feature_review data.
"""
start_date = current_date - datetime.timedelta(days=self.day)
time_unit_numbers = int(self.day / self.time_unit)
# calculate the number of reviews in each time unit
num = np.zeros((len(self.features), time_unit_numbers), dtype=np.int32)
num_sum = np.zeros(time_unit_numbers, dtype=np.int32)
for review in feature_reviews_used:
date = datetime.datetime.strptime(review[self.date_index], self.date_str)
index = int((date - start_date).days / self.time_unit)
assert 0 <= index < self.day
num_sum[index] += 1
# for each feature mentioned
for s in review[self.label_index].split(','):
assert s != ''
if s != '-1':
num[int(s), index] += 1
# remove features that are not mentioned in any review
feature_index = []
for k in range(len(self.features)):
if np.sum(num[k]) != 0:
feature_index.append(k)
num1 = num[feature_index, :]
feature_str = np.array(self.features)[feature_index]
# regression analysis
x = np.column_stack(num1) # rows = day, columns = number of features
x2 = sm.add_constant(x, prepend=True)
y = num_sum
est = sm.OLS(y, x2).fit()
sig_features_index = []
for i, p in enumerate(est.pvalues):
if i > 0 and p < 0.05:
sig_features_index.append(i - 1)
sig_feature_str = feature_str[sig_features_index]
sig_feature_coef = []
# adjust the coefficients
if len(sig_feature_str) > 0:
num2 = num[sig_features_index, :]
x = np.column_stack(num2)
x2 = sm.add_constant(x, prepend=True)
est_new = sm.OLS(y, x2).fit()
for i in range(1, len(est_new.params)):
sig_feature_coef.append(est_new.params[i])
return [[x, y] for x, y in zip(sig_feature_coef, sig_feature_str)], num, num_sum