-
Notifications
You must be signed in to change notification settings - Fork 4.6k
/
Copy pathmultiple_regression.py
252 lines (188 loc) · 10.9 KB
/
multiple_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
from typing import List
inputs: List[List[float]] = [[1.,49,4,0],[1,41,9,0],[1,40,8,0],[1,25,6,0],[1,21,1,0],[1,21,0,0],[1,19,3,0],[1,19,0,0],[1,18,9,0],[1,18,8,0],[1,16,4,0],[1,15,3,0],[1,15,0,0],[1,15,2,0],[1,15,7,0],[1,14,0,0],[1,14,1,0],[1,13,1,0],[1,13,7,0],[1,13,4,0],[1,13,2,0],[1,12,5,0],[1,12,0,0],[1,11,9,0],[1,10,9,0],[1,10,1,0],[1,10,1,0],[1,10,7,0],[1,10,9,0],[1,10,1,0],[1,10,6,0],[1,10,6,0],[1,10,8,0],[1,10,10,0],[1,10,6,0],[1,10,0,0],[1,10,5,0],[1,10,3,0],[1,10,4,0],[1,9,9,0],[1,9,9,0],[1,9,0,0],[1,9,0,0],[1,9,6,0],[1,9,10,0],[1,9,8,0],[1,9,5,0],[1,9,2,0],[1,9,9,0],[1,9,10,0],[1,9,7,0],[1,9,2,0],[1,9,0,0],[1,9,4,0],[1,9,6,0],[1,9,4,0],[1,9,7,0],[1,8,3,0],[1,8,2,0],[1,8,4,0],[1,8,9,0],[1,8,2,0],[1,8,3,0],[1,8,5,0],[1,8,8,0],[1,8,0,0],[1,8,9,0],[1,8,10,0],[1,8,5,0],[1,8,5,0],[1,7,5,0],[1,7,5,0],[1,7,0,0],[1,7,2,0],[1,7,8,0],[1,7,10,0],[1,7,5,0],[1,7,3,0],[1,7,3,0],[1,7,6,0],[1,7,7,0],[1,7,7,0],[1,7,9,0],[1,7,3,0],[1,7,8,0],[1,6,4,0],[1,6,6,0],[1,6,4,0],[1,6,9,0],[1,6,0,0],[1,6,1,0],[1,6,4,0],[1,6,1,0],[1,6,0,0],[1,6,7,0],[1,6,0,0],[1,6,8,0],[1,6,4,0],[1,6,2,1],[1,6,1,1],[1,6,3,1],[1,6,6,1],[1,6,4,1],[1,6,4,1],[1,6,1,1],[1,6,3,1],[1,6,4,1],[1,5,1,1],[1,5,9,1],[1,5,4,1],[1,5,6,1],[1,5,4,1],[1,5,4,1],[1,5,10,1],[1,5,5,1],[1,5,2,1],[1,5,4,1],[1,5,4,1],[1,5,9,1],[1,5,3,1],[1,5,10,1],[1,5,2,1],[1,5,2,1],[1,5,9,1],[1,4,8,1],[1,4,6,1],[1,4,0,1],[1,4,10,1],[1,4,5,1],[1,4,10,1],[1,4,9,1],[1,4,1,1],[1,4,4,1],[1,4,4,1],[1,4,0,1],[1,4,3,1],[1,4,1,1],[1,4,3,1],[1,4,2,1],[1,4,4,1],[1,4,4,1],[1,4,8,1],[1,4,2,1],[1,4,4,1],[1,3,2,1],[1,3,6,1],[1,3,4,1],[1,3,7,1],[1,3,4,1],[1,3,1,1],[1,3,10,1],[1,3,3,1],[1,3,4,1],[1,3,7,1],[1,3,5,1],[1,3,6,1],[1,3,1,1],[1,3,6,1],[1,3,10,1],[1,3,2,1],[1,3,4,1],[1,3,2,1],[1,3,1,1],[1,3,5,1],[1,2,4,1],[1,2,2,1],[1,2,8,1],[1,2,3,1],[1,2,1,1],[1,2,9,1],[1,2,10,1],[1,2,9,1],[1,2,4,1],[1,2,5,1],[1,2,0,1],[1,2,9,1],[1,2,9,1],[1,2,0,1],[1,2,1,1],[1,2,1,1],[1,2,4,1],[1,1,0,1],[1,1,2,1],[1,1,2,1],[1,1,5,1],[1,1,3,1],[1,1,10,1],[1,1,6,1],[1,1,0,1],[1,1,8,1],[1,1,6,1],[1,1,4,1],[1,1,9,1],[1,1,9,1],[1,1,4,1],[1,1,2,1],[1,1,9,1],[1,1,0,1],[1,1,8,1],[1,1,6,1],[1,1,1,1],[1,1,1,1],[1,1,5,1]]
from scratch.linear_algebra import dot, Vector
def predict(x: Vector, beta: Vector) -> float:
"""assumes that the first element of x is 1"""
return dot(x, beta)
[1, # constant term
49, # number of friends
4, # work hours per day
0] # doesn't have PhD
from typing import List
def error(x: Vector, y: float, beta: Vector) -> float:
return predict(x, beta) - y
def squared_error(x: Vector, y: float, beta: Vector) -> float:
return error(x, y, beta) ** 2
x = [1, 2, 3]
y = 30
beta = [4, 4, 4] # so prediction = 4 + 8 + 12 = 24
assert error(x, y, beta) == -6
assert squared_error(x, y, beta) == 36
def sqerror_gradient(x: Vector, y: float, beta: Vector) -> Vector:
err = error(x, y, beta)
return [2 * err * x_i for x_i in x]
assert sqerror_gradient(x, y, beta) == [-12, -24, -36]
import random
import tqdm
from scratch.linear_algebra import vector_mean
from scratch.gradient_descent import gradient_step
def least_squares_fit(xs: List[Vector],
ys: List[float],
learning_rate: float = 0.001,
num_steps: int = 1000,
batch_size: int = 1) -> Vector:
"""
Find the beta that minimizes the sum of squared errors
assuming the model y = dot(x, beta).
"""
# Start with a random guess
guess = [random.random() for _ in xs[0]]
for _ in tqdm.trange(num_steps, desc="least squares fit"):
for start in range(0, len(xs), batch_size):
batch_xs = xs[start:start+batch_size]
batch_ys = ys[start:start+batch_size]
gradient = vector_mean([sqerror_gradient(x, y, guess)
for x, y in zip(batch_xs, batch_ys)])
guess = gradient_step(guess, gradient, -learning_rate)
return guess
from scratch.simple_linear_regression import total_sum_of_squares
def multiple_r_squared(xs: List[Vector], ys: Vector, beta: Vector) -> float:
sum_of_squared_errors = sum(error(x, y, beta) ** 2
for x, y in zip(xs, ys))
return 1.0 - sum_of_squared_errors / total_sum_of_squares(ys)
from typing import TypeVar, Callable
X = TypeVar('X') # Generic type for data
Stat = TypeVar('Stat') # Generic type for "statistic"
def bootstrap_sample(data: List[X]) -> List[X]:
"""randomly samples len(data) elements with replacement"""
return [random.choice(data) for _ in data]
def bootstrap_statistic(data: List[X],
stats_fn: Callable[[List[X]], Stat],
num_samples: int) -> List[Stat]:
"""evaluates stats_fn on num_samples bootstrap samples from data"""
return [stats_fn(bootstrap_sample(data)) for _ in range(num_samples)]
# 101 points all very close to 100
close_to_100 = [99.5 + random.random() for _ in range(101)]
# 101 points, 50 of them near 0, 50 of them near 200
far_from_100 = ([99.5 + random.random()] +
[random.random() for _ in range(50)] +
[200 + random.random() for _ in range(50)])
from scratch.statistics import median, standard_deviation
medians_close = bootstrap_statistic(close_to_100, median, 100)
medians_far = bootstrap_statistic(far_from_100, median, 100)
assert standard_deviation(medians_close) < 1
assert standard_deviation(medians_far) > 90
from scratch.probability import normal_cdf
def p_value(beta_hat_j: float, sigma_hat_j: float) -> float:
if beta_hat_j > 0:
# if the coefficient is positive, we need to compute twice the
# probability of seeing an even *larger* value
return 2 * (1 - normal_cdf(beta_hat_j / sigma_hat_j))
else:
# otherwise twice the probability of seeing a *smaller* value
return 2 * normal_cdf(beta_hat_j / sigma_hat_j)
assert p_value(30.58, 1.27) < 0.001 # constant term
assert p_value(0.972, 0.103) < 0.001 # num_friends
assert p_value(-1.865, 0.155) < 0.001 # work_hours
assert p_value(0.923, 1.249) > 0.4 # phd
# alpha is a *hyperparameter* controlling how harsh the penalty is
# sometimes it's called "lambda" but that already means something in Python
def ridge_penalty(beta: Vector, alpha: float) -> float:
return alpha * dot(beta[1:], beta[1:])
def squared_error_ridge(x: Vector,
y: float,
beta: Vector,
alpha: float) -> float:
"""estimate error plus ridge penalty on beta"""
return error(x, y, beta) ** 2 + ridge_penalty(beta, alpha)
from scratch.linear_algebra import add
def ridge_penalty_gradient(beta: Vector, alpha: float) -> Vector:
"""gradient of just the ridge penalty"""
return [0.] + [2 * alpha * beta_j for beta_j in beta[1:]]
def sqerror_ridge_gradient(x: Vector,
y: float,
beta: Vector,
alpha: float) -> Vector:
"""
the gradient corresponding to the ith squared error term
including the ridge penalty
"""
return add(sqerror_gradient(x, y, beta),
ridge_penalty_gradient(beta, alpha))
from scratch.statistics import daily_minutes_good
from scratch.gradient_descent import gradient_step
learning_rate = 0.001
def least_squares_fit_ridge(xs: List[Vector],
ys: List[float],
alpha: float,
learning_rate: float,
num_steps: int,
batch_size: int = 1) -> Vector:
# Start guess with mean
guess = [random.random() for _ in xs[0]]
for i in range(num_steps):
for start in range(0, len(xs), batch_size):
batch_xs = xs[start:start+batch_size]
batch_ys = ys[start:start+batch_size]
gradient = vector_mean([sqerror_ridge_gradient(x, y, guess, alpha)
for x, y in zip(batch_xs, batch_ys)])
guess = gradient_step(guess, gradient, -learning_rate)
return guess
def lasso_penalty(beta, alpha):
return alpha * sum(abs(beta_i) for beta_i in beta[1:])
def main():
from scratch.statistics import daily_minutes_good
from scratch.gradient_descent import gradient_step
random.seed(0)
# I used trial and error to choose niters and step_size.
# This will run for a while.
learning_rate = 0.001
beta = least_squares_fit(inputs, daily_minutes_good, learning_rate, 5000, 25)
assert 30.50 < beta[0] < 30.70 # constant
assert 0.96 < beta[1] < 1.00 # num friends
assert -1.89 < beta[2] < -1.85 # work hours per day
assert 0.91 < beta[3] < 0.94 # has PhD
assert 0.67 < multiple_r_squared(inputs, daily_minutes_good, beta) < 0.68
from typing import Tuple
import datetime
def estimate_sample_beta(pairs: List[Tuple[Vector, float]]):
x_sample = [x for x, _ in pairs]
y_sample = [y for _, y in pairs]
beta = least_squares_fit(x_sample, y_sample, learning_rate, 5000, 25)
print("bootstrap sample", beta)
return beta
random.seed(0) # so that you get the same results as me
# This will take a couple of minutes!
bootstrap_betas = bootstrap_statistic(list(zip(inputs, daily_minutes_good)),
estimate_sample_beta,
100)
bootstrap_standard_errors = [
standard_deviation([beta[i] for beta in bootstrap_betas])
for i in range(4)]
print(bootstrap_standard_errors)
# [1.272, # constant term, actual error = 1.19
# 0.103, # num_friends, actual error = 0.080
# 0.155, # work_hours, actual error = 0.127
# 1.249] # phd, actual error = 0.998
random.seed(0)
beta_0 = least_squares_fit_ridge(inputs, daily_minutes_good, 0.0, # alpha
learning_rate, 5000, 25)
# [30.51, 0.97, -1.85, 0.91]
assert 5 < dot(beta_0[1:], beta_0[1:]) < 6
assert 0.67 < multiple_r_squared(inputs, daily_minutes_good, beta_0) < 0.69
beta_0_1 = least_squares_fit_ridge(inputs, daily_minutes_good, 0.1, # alpha
learning_rate, 5000, 25)
# [30.8, 0.95, -1.83, 0.54]
assert 4 < dot(beta_0_1[1:], beta_0_1[1:]) < 5
assert 0.67 < multiple_r_squared(inputs, daily_minutes_good, beta_0_1) < 0.69
beta_1 = least_squares_fit_ridge(inputs, daily_minutes_good, 1, # alpha
learning_rate, 5000, 25)
# [30.6, 0.90, -1.68, 0.10]
assert 3 < dot(beta_1[1:], beta_1[1:]) < 4
assert 0.67 < multiple_r_squared(inputs, daily_minutes_good, beta_1) < 0.69
beta_10 = least_squares_fit_ridge(inputs, daily_minutes_good,10, # alpha
learning_rate, 5000, 25)
# [28.3, 0.67, -0.90, -0.01]
assert 1 < dot(beta_10[1:], beta_10[1:]) < 2
assert 0.5 < multiple_r_squared(inputs, daily_minutes_good, beta_10) < 0.6
if __name__ == "__main__": main()