-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpredictor.py
57 lines (43 loc) · 2.46 KB
/
predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import sqlalchemy
from sqlalchemy import create_engine
from mbti_no_pos import map_strings
class Predictor:
    """Guess a four-letter MBTI type string from a piece of text.

    The text is reduced to a word list by ``map_strings``; the per-letter
    counters stored for those words in the ``vocabulary`` table are summed
    and, for every letter pair (I/E, N/S, F/T, P/J), the share of the first
    letter is compared with the baseline share kept in ``v_probs``.
    """

    # NOTE(review): credentials are hard-coded here; pass ``db_string`` to
    # ``__init__`` to override them.  The ``postgresql://`` scheme is used
    # because SQLAlchemy 1.4+ no longer accepts the ``postgres://`` alias.
    db_string = 'postgresql://secretuser:secretpassword@localhost:5432/mbti'
    # Here are probabilities of people in each of 4 different categories
    # (not read anywhere in this class).
    p_probs = {'I': 0.769567, 'E': 0.230433, 'N': 0.862017, 'S': 0.137983, 'F': 0.541095, 'T': 0.458905, 'P': 0.60415, 'J': 0.39585}
    # Probabilities of values in each of 4 categories; used as the decision
    # thresholds when the type string is built.
    v_probs = {'I': 0.771512, 'E': 0.228488, 'N': 0.895135, 'S': 0.104865, 'F': 0.542246, 'T': 0.457754, 'P': 0.601053, 'J': 0.398947}
    # Letter pairs in output order; the first letter of each pair is the one
    # whose share is computed and compared with ``v_probs``.
    _AXES = (('I', 'E'), ('N', 'S'), ('F', 'T'), ('P', 'J'))

    def __init__(self, db_string=None):
        """Create the database engine.

        Args:
            db_string: optional SQLAlchemy URL; defaults to the class-level
                ``db_string`` when omitted.
        """
        self.db = create_engine(db_string or self.db_string)

    def _type_from_counts(self, i, e, n, s, f, t, p, j):
        """Turn the eight summed letter counters into a type string."""
        letters = []
        counts = ((i, e), (n, s), (f, t), (p, j))
        for (first, second), (first_count, second_count) in zip(self._AXES, counts):
            # +1 in the denominator avoids division by zero when both
            # counters are 0.
            share = first_count / (first_count + second_count + 1)
            letters.append(first if share >= self.v_probs[first] else second)
        return ''.join(letters)

    def run(self, text):
        """Predict the type string for ``text``.

        Args:
            text: raw input text, passed as-is to ``map_strings``.

        Returns:
            A four-character string such as ``"INFP"``, or ``None`` when
            ``map_strings`` reports zero words or none of the words match a
            row in ``vocabulary``.
        """
        words, length = map_strings(text)
        # no valid words were found
        if length == 0:
            return None

        sql_template = 'SELECT SUM("I") AS I, SUM("E") AS E, SUM("N") AS N, SUM("S") AS S, SUM("T") AS T, SUM("F") AS F, SUM("J") AS J, SUM("P") AS P FROM vocabulary WHERE word = ANY( :user_vocab )'
        # An explicit connection works on SQLAlchemy 1.x and 2.x and is
        # released even if fetching the row fails.
        with self.db.connect() as connection:
            row = connection.execute(sqlalchemy.text(sql_template), {'user_vocab': words}).first()

        # SUM() over zero matching rows yields NULL, i.e. completely
        # unusable text was provided.
        if row is None or row[0] is None:
            return None

        # Unpack in the same order as the SELECT list: I, E, N, S, T, F, J, P.
        i, e, n, s, t, f, j, p = (int(value) for value in row)

        # The training set mixes people of both letters of every pair, while
        # the tested person belongs to one group only, so the share computed
        # from their text is compared with the baseline share for the whole
        # group: the first letter of a pair is chosen when the share reaches
        # the baseline, the second letter otherwise.
        # NOTE(review): the original comment said the group with the
        # *smaller* probability should be chosen; confirm which is intended.
        return self._type_from_counts(i, e, n, s, f, t, p, j)