# classify.py
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from extract import Extractor
from databaseInteractor import DatabaseInteractor
#from postal.parser import parse_address
def make_Dictionary(train_dir):
    """Build a vocabulary of the 20 most common words in the training mails."""
    emails = [os.path.join(train_dir, f) for f in os.listdir(train_dir)]
    all_words = []
    for mail in emails:
        with open(mail) as m:
            for i, line in enumerate(m):
                if i == 0:  # the mail body sits on the first line of each file
                    all_words += line.split()
    dictionary = Counter(all_words)
    # Materialise the keys before deleting, so the dict is not mutated while
    # its key view is being iterated (an error in Python 3).
    for item in list(dictionary.keys()):
        # Drop non-alphabetic tokens and very short words (3 characters or fewer).
        if not item.isalpha() or len(item) <= 3:
            del dictionary[item]
    dictionary = dictionary.most_common(20)
    return dictionary
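# Example of the structure make_Dictionary returns (counts are hypothetical):
# [('flight', 42), ('hotel', 31), ('booking', 27), ...]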
def extract_features(mail_dir):
    """Return a (num_mails, 20) matrix of per-mail counts for the dictionary words."""
    files = [os.path.join(mail_dir, fi) for fi in os.listdir(mail_dir)]
    features_matrix = np.zeros((len(files), 20))
    # Map each dictionary word to its column; words outside the dictionary are
    # skipped instead of falling back to column 0 as in the original loop.
    word_index = {d[0]: i for i, d in enumerate(dictionary)}
    for docID, fil in enumerate(files):
        with open(fil) as fi:
            for i, line in enumerate(fi):
                if i == 0:
                    words = line.split()
                    for word in words:
                        if word in word_index:
                            features_matrix[docID, word_index[word]] = words.count(word)
    return features_matrix
# Create a dictionary of words with their frequencies
train_dir = 'travel-nontravel/train-mails'
dictionary = make_Dictionary(train_dir)
print(dictionary)
# Prepare a feature vector and label for each training mail
train_labels = np.zeros(160)
# The second half of the 160 training mails are the travel class; the original
# slice [80:159] missed the last mail, since slice ends are exclusive.
train_labels[80:160] = 1
train_matrix = extract_features(train_dir)
# Train an SVM and a Naive Bayes classifier
model1 = LinearSVC()
model2 = MultinomialNB()
model1.fit(train_matrix, train_labels)
model2.fit(train_matrix, train_labels)
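# Optional sanity check (a minimal sketch, not part of the original script):
# 5-fold cross-validation on the training set gives a rough estimate of how
# well each model generalises before the held-out mails are touched.
# from sklearn.model_selection import cross_val_score
# print(cross_val_score(LinearSVC(), train_matrix, train_labels, cv=5).mean())
# print(cross_val_score(MultinomialNB(), train_matrix, train_labels, cv=5).mean())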
# Test both classifiers on the unseen mails
test_dir = 'travel-nontravel/test-mails'
test_matrix = extract_features(test_dir)
test_labels = np.zeros(60)
# The second half of the 60 test mails are the travel class; the original
# slice [29:59] was off by one at both ends.
test_labels[30:60] = 1
result1 = model1.predict(test_matrix)
result2 = model2.predict(test_matrix)
print(confusion_matrix(test_labels, result1))
print(confusion_matrix(test_labels, result2))
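# Optional: report overall accuracy alongside the confusion matrices
# (a small addition; accuracy_score is a standard sklearn.metrics helper).
# from sklearn.metrics import accuracy_score
# print('SVM accuracy:', accuracy_score(test_labels, result1))
# print('NB accuracy:', accuracy_score(test_labels, result2))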
def extract_features_for_single_doc(doc_path):
    """Return a (1, 20) count matrix for one document, using the same dictionary.

    Unlike extract_features, this reads the whole file rather than only its
    first line.
    """
    features_matrix = np.zeros((1, 20), dtype=int)  # np.int is removed in recent NumPy
    with open(doc_path) as f:
        contents = f.read()
    words = contents.split()
    word_index = {d[0]: i for i, d in enumerate(dictionary)}
    for word in words:
        if word in word_index:
            features_matrix[0, word_index[word]] = words.count(word)
    return features_matrix
test_doc = 'travel-nontravel/tr2.txt'
doc_matrix = extract_features_for_single_doc(test_doc)
extractor = Extractor()
result3 = model1.predict(doc_matrix)
if result3[0] == 0:
    print("non travel")
else:
    print("travel")
print(str(result3) + "\n")
if result3[0] == 1:
    # Pull the travel details out of the mail and store them
    extractor.setPath(test_doc)
    user_name = extractor.findUserName()  # email id
    date = extractor.findDate()
    time = extractor.findTime()
    address = extractor.findAddress()
    print(date)
    print(time)
    print(address)
    db = DatabaseInteractor()
    db.insert(user_name, date, time, address)
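# A defensive variant of the insert (a sketch, not in the original): skip the
# write when the extractor finds nothing, so empty rows never reach the database.
# if date and time and address:
#     db.insert(user_name, date, time, address)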