PreProcess.py
import copy
class PreProcess:
"""
Class for pre-processing the review before passing it for analyzing
"""
    def __init__(self, data, params=None):
        # Deep-copy the input so the caller's data is never mutated in place
        self.data = copy.deepcopy(data)
        self.params = params if params is not None else {}
    def tokenize(self):
        from nltk import word_tokenize
        # d[1] holds the raw review text; replace it with a list of tokens
        for d in self.data:
            d[1] = word_tokenize(d[1])
        return self.data
    def remove_stopwords(self):
        from nltk.corpus import stopwords
        import re
        stop = set(stopwords.words("english"))
        for d in self.data:
            temp = []
            for w in d[1]:
                # keep the token only if it is not a stop word (the NLTK list is
                # lowercase) and not purely punctuation
                if w.lower() not in stop and not re.match(r"[^a-zA-Z\d\s]+", w):
                    temp.append(w)
            d[1] = temp
        return self.data
    def stemmingPS(self):
        from nltk.stem import PorterStemmer
        ps = PorterStemmer()
        # Stem every token in place with the Porter stemmer
        for i in range(len(self.data)):
            for j in range(len(self.data[i][1])):
                self.data[i][1][j] = ps.stem(self.data[i][1][j])
        return self.data
    def stemmingLS(self):
        from nltk.stem import LancasterStemmer
        ls = LancasterStemmer()
        # Stem every token in place with the Lancaster stemmer, mirroring stemmingPS/stemmingSB
        for i in range(len(self.data)):
            for j in range(len(self.data[i][1])):
                self.data[i][1][j] = ls.stem(self.data[i][1][j])
        return self.data
    def stemmingSB(self):
        from nltk.stem import SnowballStemmer
        sb = SnowballStemmer("english")
        # Stem every token in place with the English Snowball stemmer
        for i in range(len(self.data)):
            for j in range(len(self.data[i][1])):
                self.data[i][1][j] = sb.stem(self.data[i][1][j])
        return self.data
    def get_pos(self, word):
        from collections import Counter
        from nltk.corpus import wordnet  # maps words to WordNet synsets with their parts of speech
        w_synsets = wordnet.synsets(word)
        # Count how often the word appears as a noun, verb, adjective, or adverb
        pos_counts = Counter()
        pos_counts["n"] = len([item for item in w_synsets if item.pos() == "n"])
        pos_counts["v"] = len([item for item in w_synsets if item.pos() == "v"])
        pos_counts["a"] = len([item for item in w_synsets if item.pos() == "a"])
        pos_counts["r"] = len([item for item in w_synsets if item.pos() == "r"])
        # most_common(1)[0] is the (POS, count) pair with the highest count; return its POS tag
        return pos_counts.most_common(1)[0][0]
    def lemmatize(self):
        from nltk.stem import WordNetLemmatizer  # lemmatizes each word based on its part of speech
        wnl = WordNetLemmatizer()
        for i in range(len(self.data)):
            for j in range(len(self.data[i][1])):
                self.data[i][1][j] = wnl.lemmatize(self.data[i][1][j], pos=self.get_pos(self.data[i][1][j]))
        return self.data
    def clean(self, params):
        # Always tokenize first, then apply the requested cleaning steps in order
        params = ['tokenize'] + list(params)
        for p in params:
            clean_call = getattr(self, p, None)
            if callable(clean_call):
                clean_call()
            else:
                raise Exception(str(p) + ' is not an available function')
        return self.data
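
# A minimal usage sketch (not part of the original module): it assumes each item in
# `data` is a mutable pair whose second element is the raw review text, e.g.
# [label, text], and that the required NLTK resources (e.g. punkt, stopwords,
# wordnet) have already been downloaded.
if __name__ == "__main__":
    sample = [
        ["pos", "The battery life is great and the screen looks amazing."],
        ["neg", "Stopped working after two weeks, very disappointing."],
    ]
    pre = PreProcess(sample)
    # clean() always tokenizes first; the remaining steps run in the given order
    cleaned = pre.clean(["remove_stopwords", "lemmatize"])
    print(cleaned)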