# tf2_policy_gradient_training.py
# If you have more than one GPU, set CUDA_VISIBLE_DEVICES to '0' or '1' to
# pin this process to a specific GPU.
# import os
# os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import gym
import numpy as np
from reinforce_tf2 import Agent
# from utils import plotLearning

if __name__ == '__main__':
    agent = Agent(alpha=0.0005, gamma=0.99, n_actions=4)
    env = gym.make('LunarLander-v2')
    score_history = []
    num_episodes = 2000
    # TOPIC: (DRL) Training a DRL Network Based on the Monte-Carlo Method
    # To get a better understanding, look at the schematics demonstrated in LINK-14
    #
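    # In REINFORCE, the Monte-Carlo flavor of policy-gradient training, the
    # agent plays a complete episode and only then updates the policy, using
    # the discounted return from every step,
    #     G_t = r_{t+1} + gamma * r_{t+2} + gamma^2 * r_{t+3} + ...
    # as the weight on that step's log-probability gradient. The loop below
    # follows that pattern: store transitions until done, then call learn().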
    for i in range(num_episodes):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            # NOTE: the 4-tuple return matches gym < 0.26; newer gym/gymnasium
            # versions return (obs, reward, terminated, truncated, info).
            observation_, reward, done, info = env.step(action)
            agent.store_transition(observation, action, reward)
            observation = observation_
            score += reward
        score_history.append(score)

        # Monte-Carlo update: learn only once the full episode has finished.
        agent.learn()

        avg_score = np.mean(score_history[-100:])
        print('episode: ', i, 'score: %.1f' % score,
              'average score %.1f' % avg_score)

    filename = 'lunar-lander.png'
    # plotLearning(score_history, filename=filename, window=100)
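

# ----------------------------------------------------------------------------
# The Agent imported from reinforce_tf2 is not shown in this file. As a
# reference, here is a minimal sketch of a REINFORCE (Monte-Carlo policy
# gradient) agent in TensorFlow 2 exposing the same interface (choose_action,
# store_transition, learn). The class name SketchAgent, the layer sizes, and
# the network layout are illustrative assumptions, not the actual
# reinforce_tf2 implementation; in practice this would live in its own module
# such as reinforce_tf2.py.
# ----------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras


class SketchAgent:
    def __init__(self, alpha=0.0005, gamma=0.99, n_actions=4):
        self.gamma = gamma
        self.n_actions = n_actions
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        # Policy network: state in, softmax action probabilities out.
        # (Hidden-layer sizes are an assumption.)
        self.policy = keras.Sequential([
            keras.layers.Dense(256, activation='relu'),
            keras.layers.Dense(256, activation='relu'),
            keras.layers.Dense(n_actions, activation='softmax'),
        ])
        self.optimizer = keras.optimizers.Adam(learning_rate=alpha)

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        probs = self.policy(state)[0].numpy()
        # Sample from the categorical distribution defined by the policy.
        return int(np.random.choice(self.n_actions, p=probs))

    def store_transition(self, observation, action, reward):
        self.state_memory.append(observation)
        self.action_memory.append(action)
        self.reward_memory.append(reward)

    def learn(self):
        states = tf.convert_to_tensor(self.state_memory, dtype=tf.float32)
        actions = np.array(self.action_memory, dtype=np.int32)
        rewards = np.array(self.reward_memory, dtype=np.float32)

        # Monte-Carlo returns, computed backwards over the finished episode:
        #     G_t = r_{t+1} + gamma * G_{t+1}
        returns = np.zeros_like(rewards)
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + self.gamma * running
            returns[t] = running
        returns = tf.convert_to_tensor(returns)

        with tf.GradientTape() as tape:
            probs = self.policy(states)
            # Pick out pi(a_t | s_t) for the actions actually taken.
            idx = tf.stack(
                [tf.range(len(actions), dtype=tf.int32), actions], axis=1)
            log_probs = tf.math.log(tf.gather_nd(probs, idx) + 1e-8)
            # Minimizing -sum(G_t * log pi) is gradient ascent on the
            # policy-gradient objective.
            loss = -tf.reduce_sum(returns * log_probs)
        grads = tape.gradient(loss, self.policy.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.policy.trainable_variables))

        # Clear the episode buffers for the next rollout.
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []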