ppo.py
# Proximal Policy Optimization algorithm
# implemented by Qin Yongliang
# rewritten with openai/baselines as a reference
# this is intended to be run on py3.5
# terminology
# ob, obs, input, observation, state, s1 : observation in RL
# ac, action, a1, output : action in RL
# vp, vf, val_pred, value_prediction : value function predicted by critic
# reward, return, ret, r1, tdlamret : (per step / expected future) reward in RL
# adv, advantage, adv_target, atarg : advantage (estimated / target) in RL
import tensorflow as tf
import numpy as np
from canton import *
import gym
# low-passed gaussian noise to help with exploration.
# from gaussian import lowpassgaussian as lpgs
# To improve our policy via PPO, we must be able to parametrize it as a probability
# distribution and sample from it. A typical choice for continuous-control problems is the
# diagonal Gaussian distribution: simpler (and thus less expressive) than a full
# multivariate Gaussian, but usually good enough. (This file ends up using a per-dimension
# categorical discretization instead; see MultiCategoricalContinuous below.)
# Some knowledge of probability is required to read and understand the following code.
tsq = lambda x:tf.square(x)
tsum = lambda x:tf.reduce_sum(x)
tmean = lambda x:tf.reduce_mean(x)
tsumlast = lambda x:tf.reduce_sum(x, axis=[-1])
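# Shape convention for the helpers above: per-dimension log-probabilities have shape
# [batch, dims]; tsumlast reduces over the last axis to give one joint log-prob per sample.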
class Dist(Can):
    def logp(self, a):
        return - self.neglogp(a)
# Bernoulli layer
class Bernoulli(Dist):
    def __call__(self, x):
        self.logits = x
        self.probs = Act('sigmoid')(self.logits)
        self.mean = self.probs
        return self.sample()
    def sample(self):
        return tf.to_float(tf.random_uniform(tf.shape(self.probs)) < self.probs)
    def neglogp(self, a): # actions are binary: each component is either 0 or 1
        # a.shape [num, dims]
        # def loge(i):
        #     eps = 1e-13
        #     return tf.log(i+eps)
        # logp_each = loge(self.probs) * a + loge(1.-self.probs)*(1.-a)
        # logp_total = tsumlast(logp_each)
        # return logp_total # shape [num]
        return tsumlast(tf.nn.sigmoid_cross_entropy_with_logits(
            logits=self.logits, labels=tf.to_float(a)))
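# Sanity note on the Bernoulli neglogp above: sigmoid_cross_entropy_with_logits(l, a)
# equals -( a*log(sigmoid(l)) + (1-a)*log(1-sigmoid(l)) ), i.e. the per-dimension
# negative Bernoulli log-likelihood; summing over the last axis gives the joint
# negative log-probability of the whole action vector (dimensions treated as independent).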
class Categorical(Dist):
    def __call__(self, x):
        self.logits = x # [batch, categories]
        return self.sample()
    def sample(self):
        self.mean = tf.argmax(self.logits, axis=-1) # [batch, ]
        u = tf.random_uniform(tf.shape(self.logits))
        # returns a number indicating the sampled category
        return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
    def neglogp(self, x):
        # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
        # Note: we can't use sparse_softmax_cross_entropy_with_logits because
        # the implementation does not allow second-order derivatives...
        one_hot_actions = tf.one_hot(x, tf.shape(self.logits)[1])
        return tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits,
            labels=one_hot_actions)
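# Note on sampling above: argmax(logits - log(-log(u))) with u ~ Uniform(0,1) is the
# Gumbel-max trick; it draws exact samples from the categorical distribution
# softmax(logits) without computing the softmax explicitly.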
class MultiCategorical(Dist):
    def __init__(self, dims, cats):
        super().__init__()
        self.dims = dims # how many categoricals
        self.cats = cats # how many categories within one categorical
        self.categoricals = [Categorical() for _ in range(dims)]
    def __call__(self, x): # [batch, dims*cats]
        self.multilogits = x
        listlogits = tf.split(self.multilogits, num_or_size_splits=self.dims, axis=1)
        for c, l in zip(self.categoricals, listlogits):
            c(l) # feed every categorical
        return self.sample()
    def sample(self):
        self.mean = tf.cast(tf.stack([p.mean for p in self.categoricals], axis=-1), tf.int32)
        return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32)
    def neglogp(self, x): # [batch, dims]
        return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=1))])
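# The action dimensions are treated as independent, so the joint negative log-probability
# is simply the sum of the per-dimension categorical neglogp terms (hence tf.add_n above).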
class MultiCategoricalContinuous(MultiCategorical):
    def __init__(self, *args):
        super().__init__(*args)
        self.scaler = 1/(self.cats-1)
    def sample(self):
        ret = super().sample()
        ret = tf.to_float(ret) * self.scaler
        self.mean = tf.to_float(self.mean) * self.scaler
        return ret
    def neglogp(self, x):
        x = tf.to_int32(tf.floor(x/self.scaler + 1e-8)) # back to integer bin indices
        return super().neglogp(x)
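# Example of the discretization above (with cats=10): bin indices 0..9 map to actions
# 0, 1/9, 2/9, ..., 1, so sampled actions lie on an evenly spaced grid in [0, 1];
# neglogp() inverts the mapping by dividing by scaler and flooring (the 1e-8 guards
# against floating-point values like 2.9999997 landing in the wrong bin).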
# a simple MLP policy.
class Policy(Can):
    def __init__(self, ob_space, ac_space):
        super().__init__()
        # 1. assume the action space is a flat continuous vector
        assert len(ac_space.shape) == 1
        self.ac_dims = ac_dims = ac_space.shape[0]
        self.ob_dims = ob_dims = ob_space.shape[0]
        # 2. build our action network
        rect = Act('tanh')
        # apparently John doesn't give a fuck about ReLUs. Change the rectifiers as you wish.
        rect = Act('lrelu', alpha=0.2)
        magic = 1/(0.5+0.5*0.2) # stddev factor for lrelu(0.2)
        c = Can()
        c.add(Dense(ob_dims, 128, stddev=magic))
        c.add(rect)
        c.add(Dense(128, 64, stddev=magic))
        c.add(rect)
        c.add(Dense(64, 64, stddev=magic))
        c.add(rect)
        c.add(Dense(64, ac_dims*10, stddev=1))
        # self.dist = c.add(Bernoulli())
        self.dist = c.add(MultiCategoricalContinuous(ac_dims, 10))
        c.chain()
        self.actor = self.add(c)
        # 3. build our value network
        c = Can()
        c.add(Dense(ob_dims, 128, stddev=magic))
        c.add(rect)
        c.add(Dense(128, 64, stddev=magic))
        c.add(rect)
        c.add(Dense(64, 64, stddev=magic))
        c.add(rect)
        c.add(Dense(64, 1, stddev=1))
        c.chain()
        self.critic = self.add(c)
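# Rough shape summary of the Policy above (with the default 10 bins per action dimension):
#   actor : obs [batch, ob_dims] -> 128 -> 64 -> 64 -> logits [batch, ac_dims*10]
#           -> MultiCategoricalContinuous sample in [0, 1]^ac_dims
#   critic: obs [batch, ob_dims] -> 128 -> 64 -> 64 -> value [batch, 1]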
# don't discard trajectories after one iteration;
# keep them around in the buffer to increase sample efficiency.
class traj_buffer:
    def __init__(self, length):
        self.maxlen = length
        self.buf = []
    def push(self, collected):
        # collected is a tuple of (s1, a1, ...)
        self.buf.append(collected)
        while len(self.buf) > self.maxlen:
            self.buf.pop(0) # remove the oldest entry
    def get_all(self):
        collected = [[] for i in range(len(self.buf[0]))]
        for c in self.buf:
            for i in range(len(c)):
                collected[i] += c[i]
        return collected
    def get_all_raw(self):
        return list(self.buf)
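# Usage sketch (hypothetical values): with buffer_length=3, pushing one tuple of lists
# ([s...], [a...], [r...], [d...]) per iteration keeps only the 3 most recent iterations;
# get_all() then concatenates them element-wise into one long list per field.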
# our PPO agent.
class ppo_agent:
    def __init__(
        self, ob_space, ac_space,
        horizon=2048,
        gamma=0.99, lam=0.95,
        train_epochs=10, batch_size=64,
        buffer_length=10,
        policy=None,
        lr=None
    ):
        if lr is not None:
            self.lr = lr
        if policy is None:
            print('no policy designated, use default Policy')
            policy = Policy
        self.current_policy = policy(ob_space, ac_space)
        self.old_policy = policy(ob_space, ac_space)
        self.current_policy.actor.summary()
        self.current_policy.critic.summary()
        self.gamma, self.lam, self.horizon = gamma, lam, horizon
        self.train_epochs, self.batch_size = train_epochs, batch_size
        self.traj_buffer = traj_buffer(buffer_length)
        self.act, self.predict_value, self.train_for_one_step, self.assign_old_eq_new = self.build_functions()
        low, high = ac_space.low, ac_space.high
        self.action_bias = (high + low)/2.
        self.action_multiplier = high - self.action_bias
        # limit actions into the range specified by the environment.
        def action_limiter(action): # assume input mean 0 std 1
            return np.tanh(action) * self.action_multiplier + self.action_bias
        def action_limiter(action): # assume input uniform in [0,1] (this definition shadows the one above)
            return (action * 2 - 1) * self.action_multiplier + self.action_bias
        self.action_limiter = action_limiter
        # logging of episodic reward.
        from plotter import interprocess_plotter as plotter
        self.plotter = plotter(2)
        # logging of actions. comment out if you don't have opencv
        if not hasattr(self, 'wavegraph'):
            from winfrey import wavegraph
            # num_waves = self.outputdims*2+1
            num_waves = self.current_policy.ac_dims*2+1
            def rn():
                r = np.random.uniform()
                return 0.3+r*0.4
            colors = []
            for i in range(num_waves-1):
                color = [rn(), rn(), rn()]
                colors.append(color)
            colors.append([0.2, 0.5, 0.9])
            self.wavegraph = wavegraph(num_waves, 'ac_mean/ac_sto/vf', np.array(colors))
        def loggraph(waves):
            wg = self.wavegraph
            wg.one(waves.reshape((-1,)))
        self.loggraph = loggraph
    # build the graph and training ops with tensorflow.
    def build_functions(self):
        # the 'lrmult' parameter is not implemented.
        # improve policy w.r.t. old_policy
        policy, old_policy = self.current_policy, self.old_policy
        # input placeholders
        states, actions = ph([policy.ob_dims]), ph([policy.ac_dims]) # you know these two
        adv_target = ph([1]) # target advantage function, estimated
        ret = ph([1]) # empirical return, estimated
        # feed observations through the networks
        # policy_mean, policy_std = policy.actor(states)
        policy_sample = policy.actor(states)
        policy_val_pred = policy.critic(states)
        # old_policy_mean, old_policy_std = old_policy.actor(states)
        old_policy_sample = old_policy.actor(states)
        old_policy_val_pred = old_policy.critic(states)
        # ratio = P_now(state, action) / P_old(state, action)
        # states were fed above, so we only pass in actions here
        # # IMPORTANT: add epsilon to guarantee numerical stability
        # eps = 1e-8
        # pn, po = policy.dg.p(actions)+eps, old_policy.dg.p(actions)+eps
        # ratio = pn/po
        # ratio = tf.reduce_prod(ratio, axis=1) # temp
        logpn, logpo = policy.dist.logp(actions), old_policy.dist.logp(actions)
        ratio = tf.exp(logpn - logpo)
        # surr1 -> policy gradient
        surr1 = ratio * adv_target
        # surr2 -> clipped policy deviation
        clip_param = 0.2 # the magical epsilon in the paper
        surr2 = tf.clip_by_value(ratio, 1.0-clip_param, 1.0+clip_param) * adv_target
        # together they form the L^CLIP loss of the PPO paper.
        policy_surrogate = - tf.reduce_mean(tf.minimum(surr1, surr2))
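        # In symbols (a sketch of what the three lines above compute):
        #   r_t = pi_new(a_t|s_t) / pi_old(a_t|s_t) = exp(logpn - logpo)
        #   L^CLIP = E_t[ min( r_t * A_t, clip(r_t, 1-eps, 1+eps) * A_t ) ]
        # policy_surrogate is -L^CLIP, so minimizing it maximizes the clipped objective.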
        # how far is our critic's prediction from the estimated return?
        value_prediction = policy_val_pred
        value_loss = tf.reduce_mean(tsq(value_prediction - ret))
        # optimizers
        opt = tf.train.AdamOptimizer(1e-4)
        opt_a = tf.train.AdamOptimizer(3e-4 if not hasattr(self, 'lr') else self.lr)
        opt_c = tf.train.AdamOptimizer(1e-3)
        # sum of the two losses, as used in the original implementation
        total_loss = policy_surrogate + value_loss
        combined_trainstep = opt.minimize(total_loss, var_list=policy.get_weights())
        # if you want different learning rates for actor and critic, go with the following
        actor_trainstep = opt_a.minimize(policy_surrogate, var_list=policy.actor.get_weights())
        critic_trainstep = opt_c.minimize(value_loss, var_list=policy.critic.get_weights())
        # # gradient clipping test
        # grads_vars = opt_a.compute_gradients(policy_surrogate, policy.actor.get_weights())
        # capped = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in grads_vars]
        # actor_trainstep = opt_a.apply_gradients(capped)
        # # weight decay if needed
        # decay_factor = 1e-5
        # decay_step = [tf.assign(w, w * (1-decay_factor)) for w in policy.actor.get_only_weights()]
        # raise if any array contains inf/nan (cheap sanity check on inputs)
        def insane(k):
            for x in k:
                s = np.sum(x)
                if np.isnan(s) or np.isinf(s):
                    print(k)
                    raise Exception('inf/nan')
        # 1. build our action sampler: given an observation, generate an action
        def act(state, stochastic=True):
            # assume state is an ndarray of shape [dims]
            state = state.view()
            state.shape = (1,) + state.shape
            insane(state)
            res = get_session().run([
                # policy_mean,
                # policy_std,
                # policy_out,
                policy.dist.mean,
                policy_sample,
                value_prediction,
            ], feed_dict={states: state})
            pm, ps, vp = res
            # po, vp = res
            # [batch, dims] [batch, dims] [batch, 1]
            pm, ps, vp = pm[0], ps[0], vp[0, 0]
            return pm, ps, vp
            # po, vp = po[0], vp[0,0]
            # return po, vp
        # 2. value prediction
        def predict_value(_states):
            # assume _states is an ndarray of shape [batch, dims]
            res = get_session().run([value_prediction], feed_dict={states: _states})
            return res[0]
        # 3. trainer. update the current policy given processed trajectories.
        def train_for_one_step(_states, _actions, _adv_target, _ret):
            # insane([_states, _actions, _adv_target, _ret])
            res = get_session().run(
                [ # perform training and collect losses in one go
                    policy_surrogate, value_loss,
                    # ratio, logpn, logpo,
                    actor_trainstep, critic_trainstep,
                    # combined_trainstep,
                    # decay_step,
                ],
                feed_dict = {
                    states: _states, actions: _actions,
                    adv_target: _adv_target, ret: _ret,
                }
            )
            # insane([(x if x is not None else 0) for x in res])
            # res[0] is ploss, res[1] is val_loss
            return res
        # 4. assigner. overwrite old_policy's weights with the current policy's weights.
        assign_ops = [tf.assign(o, n) for o, n in zip(old_policy.get_weights(), policy.get_weights())]
        print('total of {} weights to assign from new to old'.format(len(assign_ops)))
        def assign_old_eq_new():
            get_session().run([assign_ops])
        return act, predict_value, train_for_one_step, assign_old_eq_new
    # run a bunch of episodes with current_policy on env and collect some trajectories.
    def collect_trajectories(self, env):
        policy = self.current_policy
        # minimum number of steps we are going to collect
        horizon = self.horizon
        print('collecting trajectory...')
        # things we have to collect
        s1 = [] # observations before action
        a1 = [] # actions taken
        r1 = [] # rewards received
        _done = [] # whether the episode is done after a1
        # counters
        ep = 0
        steps = 0
        sum_reward = 0
        while 1:
            episode_total_reward = 0
            episode_length = 0
            # from gaussian import lowpassuniform
            # noises = [lowpassuniform() for _ in range(self.current_policy.ac_dims)]
            # def noise_sample():
            #     return np.array([n.sample() for n in noises])
            #     # return np.random.uniform(size=(self.current_policy.ac_dims,))
            # initial observation
            ob = env.reset()
            while 1:
                # sample an action from the current policy
                mean, sto, val_pred = self.act(ob)
                # policy_out, val_pred = self.act(ob)
                # sto_action = 1.0*(policy_out > noise_sample())
                sto_action = sto
                # sto_action = noise_sample() * std + mean
                # sto_limited = self.action_limiter(sto_action)
                mean_limited, sto_limited = self.action_limiter(mean), self.action_limiter(sto_action)
                # logging actions. comment out if you don't have opencv
                if True:
                    # mean_limited = self.action_limiter(mean)
                    disp_mean = mean*10. + np.arange(policy.ac_dims)*12 + 30
                    disp_sto = sto*10. - np.flipud(np.arange(policy.ac_dims))*12 - 30
                    self.loggraph(np.hstack([disp_mean, disp_sto, val_pred]))
                # step the environment with the action and obtain a reward
                new_ob, reward, done, info = env.step(sto_limited)
                # if steps%100==0:
                #     env.render()
                # append data into the collection
                s1.append(ob)
                a1.append(sto_action)
                r1.append(reward)
                _done.append(1 if done else 0)
                ob = new_ob # the new observation becomes the current one
                # counting
                episode_total_reward += reward
                episode_length += 1
                steps += 1
                # if the episode is done, either naturally or forcibly (length cap)
                if done or episode_length >= 1600:
                    _done[-1] = 1
                    print('episode {} done in {} steps, total reward:{}'.format(
                        ep+1, episode_length, episode_total_reward,
                    ))
                    self.plotter.pushys([ep, episode_total_reward])
                    break
            sum_reward += episode_total_reward
            print('{}/{} steps collected in {} episode(s)'.format(steps, horizon, ep+1), end='\r')
            if steps >= horizon:
                break
            else:
                ep += 1
        print('mean reward per episode:{}'.format(sum_reward/(ep+1)))
        return s1, a1, r1, _done
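    # Note on the returned data: s1, a1, r1 and _done are parallel Python lists with one
    # entry per environment step; collection only stops at an episode boundary, so the last
    # entry of _done is always 1 once at least `horizon` steps have been gathered.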
    # estimate the target value (which the critic is fit towards) via TD(lambda),
    # and the advantage via GAE(lambda), from the collected trajectories.
    def append_vtarg_and_adv(self, collected):
        # you know what these mean, don't you?
        gamma = self.gamma # 0.99
        lam = self.lam # 0.95
        [s1, a1, r1, done] = collected
        vp1 = self.predict_value(np.array(s1))
        T = len(a1) # [s1] might be longer than the others by one,
        # since to predict vp1 for step T+1 we might append the last s2 into [s1]
        advantage = [None]*T
        last_adv = 0
        for t in reversed(range(T)): # steps T-1, T-2, ..., 0
            # delta = (reward now) + (predicted future at t+1) - (predicted future at t)
            delta = r1[t] + (0 if done[t] else gamma * vp1[t+1]) - vp1[t]
            advantage[t] = delta + gamma * lam * (1-done[t]) * last_adv
            last_adv = advantage[t]
        if len(s1) > len(a1):
            s1 = list(s1) # create a new list without interfering with the original
            s1.pop() # remove the last element
        tdlamret = [a+v for a, v in zip(advantage, vp1)]
        return [s1, a1, r1, done, advantage, tdlamret]
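    # The recursion above is standard GAE(lambda):
    #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    #   A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
    # and the TD(lambda) return used as the critic target is tdlamret_t = A_t + V(s_t).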
    # perform one policy iteration
    def iterate_once(self, env):
        # 0. assign new weights to the old policy
        self.assign_old_eq_new()
        # 1. collect trajectories w/ the current policy
        collected = self.collect_trajectories(env)
        # 2. push the trajectories into the buffer
        self.traj_buffer.push(collected)
        # 3. load historic trajectories from the buffer
        collected = self.traj_buffer.get_all()
        # 4. estimate advantage and TD(lambda) return
        collected = self.append_vtarg_and_adv(collected)
        # 5. processing, numpyization
        collected = self.usual_data_processing(collected)
        # s1, a1, r1, done, advantage, tdlamret = self.usual_data_processing(collected)
        # 6. train for some epochs
        self.usual_feed_training(collected)
        print('iteration done.')
    # data processing moved here
    def usual_data_processing(self, collected):
        s1, a1, r1, done, advantage, tdlamret = collected
        l = len(s1)
        # all inputs must be of the same length
        assert l==len(a1) and l==len(r1) and l==len(done) and l==len(advantage) and l==len(tdlamret)
        # 5. data processing
        # shuffling
        indices = np.arange(len(a1))
        np.random.shuffle(indices)
        # numpyization
        s1, a1, advantage, tdlamret = [
            np.take(np.array(k).astype('float32'), indices, axis=0)
            for k in [s1, a1, advantage, tdlamret]
        ]
        # expand dimensions for minibatch training
        for nd in [s1, a1, advantage, tdlamret]:
            if nd.ndim == 1:
                nd.shape += (1,)
        # standardize/normalize the advantages
        advantage = (advantage - advantage.mean())/(advantage.std()+1e-4)
        return s1, a1, r1, done, advantage, tdlamret
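    # Normalizing advantages to roughly zero mean / unit std (as above) keeps the scale of
    # the surrogate-loss gradient comparable across iterations, so a fixed Adam learning
    # rate behaves more consistently.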
    # feed-train moved here
    def usual_feed_training(self, collected):
        s1, a1, r1, done, advantage, tdlamret = collected
        # 6. train for some epochs
        train_epochs = self.train_epochs
        batch_size = self.batch_size
        data_length = len(s1)
        import time
        lasttimestamp = time.time()
        print('training network on {} datapoints for {} epochs'.format(data_length, train_epochs))
        for e in range(train_epochs):
            for j in range(0, data_length-batch_size+1, batch_size):
                # ignore the tail that doesn't fill a full minibatch
                res = self.train_for_one_step(
                    s1[j:j+batch_size],
                    a1[j:j+batch_size],
                    advantage[j:j+batch_size],
                    tdlamret[j:j+batch_size],
                )
                ploss, vloss = res[0], res[1]
                if time.time() - lasttimestamp > 0.2:
                    lasttimestamp = time.time()
                    print(' '*30, 'epoch: {}/{} ploss: {:6.4f} vloss: {:6.4f}'.format(
                        e+1, train_epochs, ploss, vloss), end='\r')
if __name__ == '__main__':
    # get swingy (or pick any other continuous-control env; the last assignment wins)
    envname = 'Pendulum-v0'
    envname = 'BipedalWalker-v2'
    env = gym.make(envname)
    agent = ppo_agent(
        env.observation_space, env.action_space,
        horizon=1024, # minimum number of steps to collect before a policy update
        gamma=0.99, # discount factor for reward
        lam=0.95, # smoothing factor for advantage estimation
        train_epochs=10, # how many epochs over the data for one update
        batch_size=64, # batch size for training
        buffer_length=16, # how many iterations of trajectories to keep around for training. Set to 1 for 'original' PPO (higher variance).
    )
    get_session().run(gvi()) # init global variables for TF
    def r(iters=2):
        print('start running')
        for i in range(iters):
            print('optimization iteration {}/{}'.format(i+1, iters))
            agent.iterate_once(env)
    r(1000)
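# Running this file directly (something like `python3 ppo.py`) assumes the canton library,
# gym, and the local plotter/winfrey helper modules (plus OpenCV for the action graph) are
# importable; otherwise comment out the corresponding logging code above.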