# Course URL:
# https://deeplearningcourses.com/c/natural-language-processing-with-deep-learning-in-python
# https://udemy.com/natural-language-processing-with-deep-learning-in-python
# from __future__ import print_function, division
# from future.utils import iteritems
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future
import numpy as np
import matplotlib.pyplot as plt
import random
from datetime import datetime
import os
import sys
sys.path.append(os.path.abspath('..'))
from rnn_class.util import get_wikipedia_data
from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx
from markov import get_bigram_probs
if __name__ == '__main__':
    # load in the data
    # note: sentences are already converted to sequences of word indexes
    # note: you can limit the vocab size if you run out of memory
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000)
    # sentences, word2idx = get_sentences_with_word2idx()

    # vocab size
    V = len(word2idx)
    print("Vocab size:", V)

    # we will also treat beginning of sentence and end of sentence as bigrams
    # START -> first word
    # last word -> END
    start_idx = word2idx['START']
    end_idx = word2idx['END']

    # a matrix where:
    # row = last word
    # col = current word
    # value at [row, col] = p(current word | last word)
    bigram_probs = get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=0.1)
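
    # For reference, a rough sketch of what a count-based estimate with add-k
    # smoothing looks like; the actual markov.get_bigram_probs may differ in
    # details, and this local helper is illustrative only and is not used below
    def bigram_probs_sketch(sents, vocab_size, start, end, smoothing=0.1):
        counts = np.ones((vocab_size, vocab_size)) * smoothing  # add-k smoothing
        for s in sents:
            s = [start] + s + [end]
            for prev, curr in zip(s[:-1], s[1:]):
                counts[prev, curr] += 1
        # normalize each row so that row i is p(current word | last word = i)
        return counts / counts.sum(axis=1, keepdims=True)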

    # train a logistic model
    # W = np.random.randn(V, V) / np.sqrt(V)

    # train a shallow neural network model
    D = 100
    W1 = np.random.randn(V, D) / np.sqrt(V)
    W2 = np.random.randn(D, V) / np.sqrt(D)
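
    # the model trained below is p(next word | x) = softmax(tanh(x.dot(W1)).dot(W2)),
    # where x is a one-hot row vector; the commented-out W above corresponds to the
    # plain logistic (no hidden layer) version, p(next word | x) = softmax(x.dot(W))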

    losses = []
    epochs = 1
    lr = 1e-1

    def softmax(a):
        a = a - a.max()
        exp_a = np.exp(a)
        return exp_a / exp_a.sum(axis=1, keepdims=True)
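
    # note: subtracting the (global) max before exponentiating only guards against
    # overflow; the shift cancels after the row-wise normalization, so each row of
    # the output is still a proper probability distribution,
    # e.g. softmax(np.array([[1., 2., 3.]])) is approximately [[0.09, 0.24, 0.67]]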

    # what is the loss if we set W = log(bigram_probs)?
    W_bigram = np.log(bigram_probs)
    bigram_losses = []
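
    # aside: softmax(log p) = p / sum(p) row-wise, and each row of bigram_probs
    # already sums to 1, so softmax(W_bigram) reproduces the bigram distribution
    # exactly; its cross-entropy serves as the baseline the trained model is
    # compared against below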

    t0 = datetime.now()
    for epoch in range(epochs):
        # shuffle sentences at each epoch
        random.shuffle(sentences)

        j = 0  # keep track of iterations
        for sentence in sentences:
            # convert sentence into one-hot encoded inputs and targets
            sentence = [start_idx] + sentence + [end_idx]
            n = len(sentence)
            inputs = np.zeros((n - 1, V))
            targets = np.zeros((n - 1, V))
            inputs[np.arange(n - 1), sentence[:n-1]] = 1
            targets[np.arange(n - 1), sentence[1:]] = 1
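
            # inputs[t] is the one-hot vector for word t of the padded sentence and
            # targets[t] is the one-hot vector for word t+1, so each row pair is one
            # (last word -> current word) training example; both arrays are (n-1, V)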

            # forward pass
            hidden = np.tanh(inputs.dot(W1))
            # get output predictions
            predictions = softmax(hidden.dot(W2))

            # do a gradient descent step
            # (backpropagate through W2 before overwriting it with its update)
            doutput = predictions - targets
            dhidden = doutput.dot(W2.T) * (1 - hidden * hidden)
            W2 = W2 - lr * hidden.T.dot(doutput)
            W1 = W1 - lr * inputs.T.dot(dhidden)
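
            # why the updates above are the gradients: for softmax outputs with a
            # cross-entropy loss, dL/d(logits) = predictions - targets; chaining
            # through tanh uses tanh'(a) = 1 - tanh(a)^2 = 1 - hidden * hidden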

            # keep track of the loss
            loss = -np.sum(targets * np.log(predictions)) / (n - 1)
            losses.append(loss)
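
            # this is the average negative log-likelihood per predicted word;
            # exp(loss) is the per-word perplexity of the model on this sentence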

            # keep track of the bigram loss
            # only do it for the first epoch to avoid redundancy
            if epoch == 0:
                bigram_predictions = softmax(inputs.dot(W_bigram))
                bigram_loss = -np.sum(targets * np.log(bigram_predictions)) / (n - 1)
                bigram_losses.append(bigram_loss)

            if j % 10 == 0:
                print("epoch:", epoch, "sentence: %s/%s" % (j, len(sentences)), "loss:", loss)
            j += 1
print("Elapsed time training:", datetime.now() - t0)
plt.plot(losses)
# plot a horizontal line for the bigram loss
avg_bigram_loss = np.mean(bigram_losses)
print("avg_bigram_loss:", avg_bigram_loss)
plt.axhline(y=avg_bigram_loss, color='r', linestyle='-')

    # plot smoothed losses to reduce variability
    def smoothed_loss(x, decay=0.99):
        y = np.zeros(len(x))
        last = 0
        for t in range(len(x)):
            z = decay * last + (1 - decay) * x[t]
            y[t] = z / (1 - decay ** (t + 1))
            last = z
        return y
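
    # the (1 - decay ** (t + 1)) factor is the usual bias correction for an
    # exponential moving average started at zero (the same correction used by the
    # Adam optimizer), so early values of the smoothed curve are not biased low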

    plt.plot(smoothed_loss(losses))
    plt.show()

    # plot the model's word-to-word distribution and the bigram probs side-by-side
    # (with one-hot inputs, the trained model's predictions are softmax(tanh(W1).dot(W2)))
    plt.subplot(1, 2, 1)
    plt.title("Neural Network Model")
    plt.imshow(softmax(np.tanh(W1).dot(W2)))
    plt.subplot(1, 2, 2)
    plt.title("Bigram Probs")
    plt.imshow(bigram_probs)
    plt.show()