# Value_Function_Approximation.py
import numpy as np

from TD_Learning import TDLearning


class FunctionApproximation(TDLearning):
    def __init__(self, landa, N0, gamma, feature_space_size, score_upper_bound):
        super(FunctionApproximation, self).__init__(landa, N0, gamma, score_upper_bound)
        self.feature_space_size = feature_space_size
        self.theta = np.zeros(feature_space_size)
        # Number of overlapping intervals used to coarse-code the player's score.
        self.limit = int(self.score_upper_bound / 3) - 1
    # Returns the binary feature vector of a state-action pair. The dealer's and
    # the player's scores are coarse-coded with overlapping intervals, and the
    # action selects which half of the feature vector is active.
    def feature_vector(self, state, action):
        dealer_feature_vector = [3 * i < state[1] < 5 + 3 * i for i in range(3)]
        player_feature_vector = [3 * i < state[0] < 7 + 3 * i for i in range(self.limit)]
        half_feature_vector = np.outer(dealer_feature_vector, player_feature_vector)
        half_feature_vector = half_feature_vector.flatten()
        features = np.hstack(((1 - action) * half_feature_vector, action * half_feature_vector))
        return features
    # Resets the value estimates, the visit counts and the weight vector.
    def sarsa_lambda_general_approximation_initialize(self):
        self.state_action_value_estimation = np.zeros((self.score_upper_bound, 10, 2))
        self.state_action_visit_count = np.zeros(self.state_action_value_shape)
        self.state_visit_count = np.zeros(self.state_value_shape)
        self.theta = np.zeros(self.feature_space_size)
    # The sarsa(lambda) algorithm for any differentiable function approximator:
    # approximation_function(state, action) returns the estimated value and
    # gradient_function(state, action) its gradient with respect to theta.
    def sarsa_lambda_general_approximation(self, episodes, landa, gradient_function, approximation_function, theta):
        for i in range(episodes):
            self.eligibility_trace = np.zeros(self.feature_space_size)
            current_state = self.Environment.first_step()
            epsilon = self.epsilon_t(count=self.state_visit_count[self.coord(current_state)])
            current_action = self.epsilon_greedy(state=current_state, epsilon=epsilon)
            current_state_action = self.to_state_action(action=current_action, state=current_state)
            self.state_action_visit_count[self.coord_3d(current_state_action)] += 1
            self.state_visit_count[self.coord(current_state)] += 1
            new_state, reward, is_terminal = self.Environment.step(do_hit=current_action,
                                                                   scores=current_state)
            while is_terminal == 0:
                # Choose the next action epsilon-greedily and compute the TD error.
                epsilon = self.epsilon_t(count=self.state_visit_count[self.coord(new_state)])
                new_action = self.epsilon_greedy(state=new_state, epsilon=epsilon)
                delta = reward + self.gamma * approximation_function(state=new_state, action=new_action)\
                    - approximation_function(state=current_state, action=current_action)
                # Accumulate the eligibility trace and update the weights.
                gradient_value = gradient_function(state=current_state, action=current_action)
                self.eligibility_trace = self.gamma * landa * self.eligibility_trace + gradient_value
                alpha = self.alpha_t(current_state_action=current_state_action)
                theta += delta * np.multiply(alpha, self.eligibility_trace)
                current_state = new_state.copy()
                current_action = new_action
                current_state_action = self.to_state_action(action=current_action, state=current_state)
                self.state_action_visit_count[self.coord_3d(current_state_action)] += 1
                self.state_visit_count[self.coord(current_state)] += 1
                new_state, reward, is_terminal = self.Environment.step(do_hit=current_action,
                                                                       scores=current_state)
            # Final update: the value of the terminal state is zero.
            delta = reward - approximation_function(state=current_state, action=current_action)
            gradient_value = gradient_function(state=current_state, action=current_action)
            self.eligibility_trace = self.gamma * landa * self.eligibility_trace + gradient_value
            alpha = self.alpha_t(current_state_action=current_state_action)
            theta += delta * np.multiply(alpha, self.eligibility_trace)
        return theta
    # Evaluates the approximator on every state-action pair and returns the
    # full table of estimates.
    def estimate_state_action_value(self, approximation_function):
        player_states = np.arange(1, self.score_upper_bound + 1)
        dealer_states = np.arange(1, 11)
        all_actions = [0, 1]
        return np.asarray([[[approximation_function(state=[player_state, dealer_state], action=action)
                             for action in all_actions]
                            for dealer_state in dealer_states]
                           for player_state in player_states])
    # Linear approximator: Q(s, a) = phi(s, a) . theta
    def linear_approximation(self, state, action):
        return np.dot(self.feature_vector(state=state, action=action), self.theta)

    # Quadratic approximator: Q(s, a) = (phi(s, a) ** 2) . theta, element-wise square.
    def quadratic_approximation(self, state, action):
        feature_vector = self.feature_vector(state=state, action=action)
        return np.dot(self.theta, np.multiply(feature_vector, feature_vector))

    # Gradient of the linear approximator with respect to theta.
    def linear_gradient(self, state, action):
        return self.feature_vector(state=state, action=action)

    # Gradient of the quadratic approximator with respect to theta.
    def quadratic_gradient(self, state, action):
        feature_vector = self.feature_vector(state=state, action=action)
        return np.multiply(feature_vector, feature_vector)
    # Runs sarsa(lambda) with the given approximator and returns the learned
    # state values, greedy decisions and state-action values.
    def learn_sarsa_landa_general_approximation(self, episodes, landa, gradient_function, approximation_function):
        self.sarsa_lambda_general_approximation_initialize()
        self.theta = self.sarsa_lambda_general_approximation(episodes=episodes,
                                                             landa=landa,
                                                             gradient_function=gradient_function,
                                                             approximation_function=approximation_function,
                                                             theta=self.theta)
        self.state_action_value_estimation = self.estimate_state_action_value(
            approximation_function=approximation_function)
        state_value_estimation = self.to_value_function(state_value_function=self.state_action_value_estimation)
        output = {'state_value': state_value_estimation,
                  'decision': self.state_action_value_estimation.argmax(axis=2),
                  'state_action_value': self.state_action_value_estimation
                  }
        return output
    # Trains in chunks of measure_step episodes, recording the RMSE of the
    # current estimate against a Monte Carlo reference at the start of each chunk.
    def rmse_general_approximation(self, landa, measure_step, episodes, state_action_value_mc,
                                   gradient_function, approximation_function):
        self.sarsa_lambda_general_approximation_initialize()
        steps_number = episodes // measure_step
        rmse_array = []
        for i in range(steps_number):
            rmse = self.rmse(self.state_action_value_estimation, state_action_value_mc)
            rmse_array.append(rmse)
            self.theta = self.sarsa_lambda_general_approximation(episodes=measure_step,
                                                                 landa=landa,
                                                                 gradient_function=gradient_function,
                                                                 approximation_function=approximation_function,
                                                                 theta=self.theta)
            self.state_action_value_estimation = self.estimate_state_action_value(
                approximation_function=approximation_function)
        return rmse_array
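

# A minimal usage sketch, not part of the original module: it assumes the
# TD_Learning base class and its Environment implement an Easy21-style game
# with a score upper bound of 21, and the hyper-parameter values below
# (N0 = 100, gamma = 1, lambda = 0.5, 10000 episodes) are illustrative only.
# With score_upper_bound = 21 the coarse coding above gives
# 2 actions * 3 dealer intervals * 6 player intervals = 36 features.
if __name__ == '__main__':
    agent = FunctionApproximation(landa=0.5, N0=100, gamma=1,
                                  feature_space_size=36, score_upper_bound=21)
    result = agent.learn_sarsa_landa_general_approximation(
        episodes=10000,
        landa=0.5,
        gradient_function=agent.linear_gradient,
        approximation_function=agent.linear_approximation)
    # result contains 'state_value', 'decision' and 'state_action_value' arrays.
    print(result['decision'])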