# Course URL:
# https://deeplearningcourses.com/c/natural-language-processing-with-deep-learning-in-python
# https://udemy.com/natural-language-processing-with-deep-learning-in-python
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future

import numpy as np
import os
import sys
sys.path.append(os.path.abspath('..'))
from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx


def get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=1):
    # structure of the bigram probability matrix:
    # (last word, current word) --> probability
    # counts start at `smoothing` in every cell (add-1 smoothing when
    # smoothing=1), so no bigram ever has zero probability
    # note: we never count transitions *out of* the END token, so that row
    # just keeps its uniform smoothed value
    bigram_probs = np.ones((V, V)) * smoothing
    for sentence in sentences:
        for i in range(len(sentence)):
            if i == 0:
                # beginning word: count START -> first word
                bigram_probs[start_idx, sentence[i]] += 1
            else:
                # middle word: count previous word -> current word
                bigram_probs[sentence[i-1], sentence[i]] += 1

            # if we're at the final word, we update the bigram for
            # last -> current AND current -> END token
            if i == len(sentence) - 1:
                # final word: count last word -> END
                bigram_probs[sentence[i], end_idx] += 1

    # normalize the counts along the rows to get probabilities
    bigram_probs /= bigram_probs.sum(axis=1, keepdims=True)
    return bigram_probs
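

# quick self-check on a toy corpus -- a sketch with made-up indexes (not the
# Brown data): V=4, start_idx=0, end_idx=1, one indexed sentence [2, 3].
# row 2 starts at [1, 1, 1, 1] from add-1 smoothing and the bigram (2, 3)
# adds one count, giving [1, 1, 1, 2], so p(3 | 2) = (1 + 1) / (4 + 1) = 0.4
_toy_probs = get_bigram_probs([[2, 3]], V=4, start_idx=0, end_idx=1, smoothing=1)
assert np.isclose(_toy_probs[2, 3], 0.4)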


if __name__ == '__main__':
    # load in the data
    # note: sentences are already converted to sequences of word indexes
    # note: you can limit the vocab size if you run out of memory
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(10000)
    # sentences, word2idx = get_sentences_with_word2idx()

    # vocab size
    V = len(word2idx)
    print("Vocab size:", V)

    # we also treat the beginning and end of a sentence as bigrams:
    # START -> first word
    # last word -> END
    start_idx = word2idx['START']
    end_idx = word2idx['END']

    # a matrix where:
    # row = last word
    # col = current word
    # value at [row, col] = p(current word | last word)
    bigram_probs = get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=0.1)
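
    # sanity check: after normalization, each row of bigram_probs should be
    # a proper conditional distribution and sum to 1
    assert np.allclose(bigram_probs.sum(axis=1), 1)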

    # a function to calculate the normalized log-probability score
    # of a sentence
    def get_score(sentence):
        score = 0
        for i in range(len(sentence)):
            if i == 0:
                # beginning word
                score += np.log(bigram_probs[start_idx, sentence[i]])
            else:
                # middle word
                score += np.log(bigram_probs[sentence[i-1], sentence[i]])

        # final word (outside the loop, so it's added exactly once)
        score += np.log(bigram_probs[sentence[-1], end_idx])

        # normalize by the number of bigrams so scores are comparable
        # across sentence lengths
        return score / (len(sentence) + 1)
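
    # equivalently: for a sentence w_1..w_n, get_score returns
    #   (1/(n+1)) * [ log p(w_1|START) + sum_{i=2..n} log p(w_i|w_{i-1})
    #                 + log p(END|w_n) ]
    # i.e. the average log-probability per bigram, so longer sentences
    # aren't penalized just for having more log terms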

    # map word indexes back to real words
    idx2word = {v: k for k, v in word2idx.items()}

    def get_words(sentence):
        return ' '.join(idx2word[i] for i in sentence)

    # when we sample a fake sentence, we want to ensure we never sample
    # the START token or the END token
    sample_probs = np.ones(V)
    sample_probs[start_idx] = 0
    sample_probs[end_idx] = 0
    sample_probs /= sample_probs.sum()
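    # note: fake sentences are i.i.d. uniform draws over the real words,
    # so we expect them to score well below real sentences of the same length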

    # test our model on real and fake sentences
    while True:
        # real sentence: pick one at random from the corpus
        real_idx = np.random.choice(len(sentences))
        real = sentences[real_idx]

        # fake sentence: same length, words drawn from sample_probs
        fake = np.random.choice(V, size=len(real), p=sample_probs)

        print("REAL:", get_words(real), "SCORE:", get_score(real))
        print("FAKE:", get_words(fake), "SCORE:", get_score(fake))

        # input your own sentence
        custom = input("Enter your own sentence:\n")
        custom = custom.lower().split()

        # check that the sentence is non-empty and that every token exists
        # in word2idx (otherwise, we can't score it)
        bad_sentence = len(custom) == 0
        for token in custom:
            if token not in word2idx:
                bad_sentence = True

        if bad_sentence:
            print("Sorry, your sentence was empty or contained words that are not in the vocabulary")
        else:
            # convert the sentence into a list of indexes
            custom = [word2idx[token] for token in custom]
            print("SCORE:", get_score(custom))

        cont = input("Continue? [Y/n]")
        if cont and cont.lower() == 'n':
            break