-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathAlgorithms.py
77 lines (57 loc) · 2.7 KB
/
Algorithms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from Game import Game
import numpy as np
class Algorithms:
    """Shared tables and helper routines for tabular learning on ``Game``.

    Every table is indexed ``[player - 1, dealer - 1]`` (state tables) or
    ``[player - 1, dealer - 1, action]`` (state-action tables): the rows
    represent the player state and the columns the dealer's state. States
    are 1-based when passed in; the ``coord*`` helpers convert them to
    0-based array indices.

    Attributes:
        Environment: ``Game`` instance built with ``score_upper_bound``.
        policy: per-state probability of choosing action 1 (the original
            comment calls this the probability of "hit"), initialised
            uniformly at random in [0, 1).
        value_function, eligibility_trace, state_visit_count: zero-filled
            tables of shape ``state_value_shape``.
        state_action_visit_count, state_action_total_return,
        state_action_value_estimation: zero-filled tables of shape
            ``state_action_value_shape``.
        epsilon_list: initially empty list (presumably a log of the
            exploration rates used; it is not filled in by this class).
    """

    # Size of the dealer axis (columns) of every table.
    N_DEALER_STATES = 10
    # Size of the action axis of the state-action tables.
    N_ACTIONS = 2

    def __init__(self, N0, gamma, score_upper_bound):
        """Allocate the learning tables and create the environment.

        Args:
            N0: constant of the exploration schedule, see ``epsilon_t``.
            gamma: stored as ``self.gamma`` (presumably the discount
                factor; it is not used inside this class).
            score_upper_bound: number of player states (rows); also
                forwarded to ``Game``.
        """
        self.Environment = Game(score_upper_bound=score_upper_bound)
        self.state_value_shape = (score_upper_bound, self.N_DEALER_STATES)
        self.state_action_value_shape = (
            score_upper_bound,
            self.N_DEALER_STATES,
            self.N_ACTIONS,
        )
        # The policy maps each state to the probability of action 1.
        self.policy = np.random.rand(*self.state_value_shape)
        self.value_function = np.zeros(self.state_value_shape)
        self.eligibility_trace = np.zeros(self.state_value_shape)
        self.state_visit_count = np.zeros(self.state_value_shape)
        self.state_action_visit_count = np.zeros(self.state_action_value_shape)
        self.state_action_total_return = np.zeros(self.state_action_value_shape)
        self.state_action_value_estimation = np.zeros(self.state_action_value_shape)
        self.epsilon_list = []
        self.score_upper_bound = score_upper_bound
        self.N0 = N0
        self.gamma = gamma

    @staticmethod
    def coord(vector):
        """Return the 0-based ``(row, col)`` index of a 1-based state."""
        return int(vector[0]) - 1, int(vector[1]) - 1

    @staticmethod
    def coord_3d(vector):
        """Return the ``(row, col, action)`` index of a state-action vector.

        The two state components are shifted to 0-based; the action
        component is used as is.
        """
        return int(vector[0]) - 1, int(vector[1]) - 1, int(vector[2])

    @staticmethod
    def coord_3d_2(state, action):
        """Same as ``coord_3d`` but with state and action given separately."""
        return int(state[0]) - 1, int(state[1]) - 1, int(action)

    def random_policy(self, state, policy):
        """Sample an action (0 or 1) from a stochastic policy table.

        Args:
            state: 1-based ``(player, dealer)`` state.
            policy: table whose entry for ``state`` is the probability of
                returning 1.
        """
        return int(np.random.binomial(1, policy[self.coord(state)]))

    def epsilon_greedy(self, state, epsilon):
        """Choose an action from the current state-action value estimates.

        A uniformly random action is returned when the exploration draw
        succeeds or when both actions are valued equally; otherwise the
        action with the larger estimate is returned.

        Args:
            state: 1-based ``(player, dealer)`` state.
            epsilon: exploration parameter.

        Returns:
            The chosen action as an ``int`` (0 or 1).
        """
        # NOTE(review): exploration fires with probability epsilon / 2, and
        # the random draw below is itself 50/50, so the non-greedy action is
        # taken with probability epsilon / 4. Kept as in the original;
        # confirm whether plain `epsilon` was intended.
        explore = np.random.binomial(1, epsilon / 2)
        row, col = self.coord(state)
        choices_values = self.state_action_value_estimation[row, col, :]
        # argmax always resolves ties in favour of action 0, which would
        # bias the agent whenever the two estimates coincide (not only at
        # the all-zero initialisation), so every tie is broken at random.
        if explore or choices_values[0] == choices_values[1]:
            return int(np.random.binomial(1, 1 / 2))
        return int(choices_values.argmax())

    @staticmethod
    def to_value_function(state_value_function):
        """Collapse a state-action table to a state table via max over actions."""
        # TODO Check if it really works
        return state_value_function.max(axis=2)

    def epsilon_t(self, count):
        """Return the exploration rate ``N0 / (N0 + count)``."""
        return self.N0 / (self.N0 + count)

    def alpha_t(self, current_state_action):
        """Return the step size ``1 / (visits + 1)`` for a state-action vector.

        ``visits`` is read from ``state_action_visit_count``.
        """
        visits = self.state_action_visit_count[self.coord_3d(current_state_action)]
        return 1 / (visits + 1)

    @staticmethod
    def to_state_action(state, action):
        """Return a new array holding ``state`` followed by ``action``.

        ``state`` itself is left unmodified (``np.append`` always builds a
        new array).
        """
        return np.append(state, action)

    @staticmethod
    def rmse(array1, array2):
        """Return the root-mean-square error between two equally shaped arrays."""
        difference = np.asarray(array1) - np.asarray(array2)
        return np.sqrt(np.mean(np.square(difference)))