MonteCarloAgent.py
import numpy as np
from easy21 import easy21
import csv
import pickle


class player:
    """Monte Carlo control agent for the Easy21 card game."""

    def __init__(self, N_0):
        self.N_0 = N_0          # exploration constant for the epsilon schedule
        self.actions = [0, 1]   # the two actions exposed by the easy21 environment
        # Q[dealer, player, action] holds action-value estimates; NSA counts
        # state-action visits. Index 0 is unused so dealer sums (1-10) and
        # player sums (1-21) index the arrays directly.
        self.Q = np.zeros((11, 22, len(self.actions)))
        self.NSA = np.zeros((11, 22, len(self.actions)))
        self.game = easy21()

    def NS(self, state):
        # Number of visits to this state, summed over actions.
        (d, p) = state
        return np.sum(self.NSA[d, p])

    def epsilon(self, state):
        # Decaying exploration rate: epsilon_t = N_0 / (N_0 + N(s_t)).
        return self.N_0 / (self.N_0 + self.NS(state))

    def stepSize(self, SA):
        # Step size alpha_t = 1 / N(s_t, a_t).
        return 1 / self.NSA[SA]

    def chooseAction(self, state):
        # Epsilon-greedy action selection over the current Q estimates.
        if np.random.uniform(0, 1) < self.epsilon(state):
            action = np.random.choice(self.actions)
        else:
            (d, p) = state
            action = np.argmax([self.Q[d, p, a] for a in self.actions])
        return action

    def OptimalValueFunction(self):
        # V*(s) = max_a Q(s, a) for every valid (dealer, player) state.
        v = {}
        for i in range(1, 11):
            for j in range(1, 22):
                v[i, j] = max(self.Q[i, j, 0], self.Q[i, j, 1])
        return v

    def updateQ(self, record):
        # Undiscounted return of the episode. In Easy21 only the terminal step
        # carries a reward, so the same G is the return from every state visited.
        G = sum(exp[-1] for exp in record)
        for d, p, a, r in record:
            self.Q[d, p, a] += self.stepSize((d, p, a)) * (G - self.Q[d, p, a])

    def playMonteCarlo(self, NumEpisodes):
        currentEp = 1
        terminated = False
        episodeRecord = []
        wins = 0
        meanReturn = 0
        while currentEp <= NumEpisodes:
            dealer, player = self.game.startGame()
            # Play one full episode, recording every (dealer, player, action, reward).
            while not terminated:
                action = self.chooseAction((dealer, player))
                self.NSA[dealer, player, action] += 1
                player_new, dealer_new, reward, terminated = self.game.step(player, dealer, action)
                episodeRecord.append([dealer, player, action, reward])
                dealer, player = dealer_new, player_new
            # Incremental mean of the terminal return across episodes.
            meanReturn = meanReturn + 1 / currentEp * (reward - meanReturn)
            if reward == 1:
                wins += 1
            if currentEp % 10000 == 0:
                print("Episode %i, Mean-Return %.3f, Wins %.3f" %
                      (currentEp, meanReturn, wins / currentEp))
            self.updateQ(episodeRecord)
            currentEp += 1
            terminated = False
            episodeRecord = []

    def outputValueCSV(self, v):
        # Export the value function as (dealer, player, value) rows.
        with open('valueFunction.csv', mode='w', newline='') as csv_file:
            value_writer = csv.writer(
                csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            value_writer.writerow(['dealer', 'player', 'value'])
            for i in range(1, 11):
                for j in range(1, 22):
                    value_writer.writerow([i, j, v[i, j]])
if __name__ == "__main__":
    p = player(100)
    p.playMonteCarlo(2000000)
    v = p.OptimalValueFunction()
    # Persist the learned Q values and export V*(s) for plotting.
    with open('Q.mc', 'wb') as Qfile:
        pickle.dump(p.Q, Qfile)
    p.outputValueCSV(v)
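

# --- Illustrative usage sketch, not part of the original agent ---
# A minimal example of how the artifacts written above ('Q.mc' and
# 'valueFunction.csv') might be loaded back for inspection or plotting.
# The helper name `load_results` is an assumption introduced here; the
# original script does not define it or call it anywhere.
def load_results(q_path='Q.mc', csv_path='valueFunction.csv'):
    # Load the pickled Q array and the exported (dealer, player, value) rows.
    with open(q_path, 'rb') as f:
        Q = pickle.load(f)
    rows = []
    with open(csv_path, newline='') as f:
        for row in csv.DictReader(f):
            rows.append((int(row['dealer']), int(row['player']), float(row['value'])))
    return Q, rows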