-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
134 lines (109 loc) · 5.4 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# Use python 3 or above
import re # Regular Expressions
import numpy as np # Numpy
from sklearn.naive_bayes import BernoulliNB
# Words that are never added to the vocabulary.
# Punctuation (including apostrophes) is stripped before lookup, which is why
# contractions appear here without it ("hes", "theyll", ...).
_STOP_WORDS = frozenset([
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being",
    "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have",
    "having", "he", "hed", "hell", "hes", "her", "here", "heres", "hers", "herself", "him", "himself", "his", "how", "hows", "i", "id", "ill", "im", "ive",
    "if", "in", "into", "is", "it", "its", "itself", "lets", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
    "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "shed", "shell", "shes", "should", "so",
    "some", "such", "than", "that", "thats", "the", "their", "theirs", "them", "themselves", "then", "there", "theres", "these", "they", "theyd", "theyll", "theyre",
    "theyve", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "wed", "well", "were", "weve", "what", "whats", "when",
    "whens", "where", "wheres", "which", "while", "who", "whos", "whom", "why", "whys", "with", "would", "you", "youd", "youll", "youre", "youve",
    "your", "yours", "yourself", "yourselves", "need", "needed", "can", "u", "every", "rather", "gonna", "m", "tap", "fish", "gives", "nice", "hoping",
])


def extract_words_and_add_to_dict(filenamelist):
    """Extract words from the given files and assign each a unique id.

    Every line is treated as ``<length> <rating> <review text>``. The two
    leading fields are discarded, everything except letters and whitespace
    is removed from the review text, and each remaining lower-cased word
    that is not a stop word receives an id.

    Arguments:
        filenamelist: Iterable of paths to text files (read as ISO-8859-1).

    Returns:
        Dictionary with key = word, value = id. Ids are consecutive integers
        starting at 0, assigned in order of first appearance.
    """
    non_alpha = re.compile(r'[^a-zA-Z\s]')
    # Dictionary containing word to id mapping
    word_id = {}
    for filename in filenamelist:
        with open(filename, encoding="ISO-8859-1") as f:
            lines = f.read().splitlines()
        for line in lines:
            # Drop length and rating BEFORE stripping non-letters: a purely
            # numeric length field would otherwise vanish during stripping
            # and the first real review word would be dropped in its place.
            fields = line.split(None, 2)
            if len(fields) < 3:
                # No review text on this line
                continue
            text = non_alpha.sub("", fields[2])
            for word in text.lower().split():
                if word not in _STOP_WORDS:
                    # Next free id for a new word; no-op for a known word
                    word_id.setdefault(word, len(word_id))
    return word_id
def get_data(filenamelist, word_id):
    """Read review files and encode them as a 2D numpy array.

    Every line is treated as ``<length> <rating> <review text>`` with the
    fields separated by single spaces. The review text is split into
    fragments at '.', '?' and ','; each fragment that contains at least one
    known word becomes one row.

    Row layout (all entries are 0 or 1):
        columns 0-5: rating indicator. Column k (1-5) is set when the rating
                     field ends with "one".."five"; column 0 is set when the
                     rating is not recognised.
        columns 6+:  column ``6 + word_id[w]`` is set when word ``w`` occurs
                     in the fragment.

    Arguments:
        filenamelist: Iterable of paths to text files (read as ISO-8859-1).
        word_id: Dictionary containing word -> id mapping.

    Returns:
        Integer numpy array of shape (n_rows, 6 + len(word_id)). When no
        fragment qualifies the shape is (0, 6 + len(word_id)).
    """
    # Number of leading columns reserved for the rating
    rating_bits = 6
    cols = rating_bits + len(word_id)
    # Checked in this order against the end of the rating field
    rating_suffixes = (("one", 1), ("two", 2), ("three", 3), ("four", 4), ("five", 5))
    separators = re.compile(r'[\.\?]')
    leading_non_alpha = re.compile(r'[^a-zA-Z]')

    # List representation of given data
    data_list = []
    for filename in filenamelist:
        with open(filename, encoding="ISO-8859-1") as f:
            lines = f.read().splitlines()
        for line in lines:
            fields = line.lower().split(" ", 2)
            if len(fields) < 3:
                # Missing rating or review text: nothing to encode
                continue
            rating_str, text = fields[1], fields[2]

            rating = 0
            for suffix, value in rating_suffixes:
                if rating_str.endswith(suffix):
                    rating = value
                    break

            # Treat '.', '?' and ',' alike as fragment separators
            for review in separators.sub(",", text).split(","):
                if review == "":
                    continue
                # Skip fragments whose FIRST character is not a letter.
                # NOTE(review): this also skips fragments that begin with a
                # space (e.g. the text following ", ") - confirm intended.
                if leading_non_alpha.match(review) is not None:
                    continue
                instance = np.zeros((cols, ), dtype=int)
                instance[rating] = 1
                for word in review.split():
                    attr_idx = word_id.get(word)
                    if attr_idx is not None:
                        instance[rating_bits + attr_idx] = 1
                # Keep the row only if at least one known word was found
                if np.count_nonzero(instance[rating_bits:]) != 0:
                    data_list.append(instance)

    # reshape keeps the result 2D even when no rows were collected
    return np.array(data_list, dtype=int).reshape(-1, cols)
def get_reverse_mapping(mapping):
    """Return a new dict that maps each value of `mapping` back to its key.

    If several keys share the same value, the key iterated last wins.
    """
    inverted = {}
    for original_key, original_value in mapping.items():
        inverted[original_value] = original_key
    return inverted