Merge pull request #10 from praveen-palanisamy/master
Added opensim-rl environment and a sample configuration and options for a continuous DQN agent to learn in that environment
jingweiz authored Nov 20, 2017
2 parents 7c03adf + cd3b9cf commit a990c7d
Showing 5 changed files with 118 additions and 9 deletions.
1 change: 1 addition & 0 deletions README.md
100644 → 100755
@@ -108,6 +108,7 @@ NOTE: we follow the exact code structure as [pytorch-dnc](https://github.com/jin
- [Visdom](https://github.com/facebookresearch/visdom)
- [OpenAI Gym >=v0.9.0 (for lower versions, just change to the available game IDs, e.g. change PongDeterministic-v4 to PongDeterministic-v3)](https://github.com/openai/gym)
- [mujoco-py (Optional: for training continuous version of a3c)](https://github.com/openai/mujoco-py)
- [opensim-rl (Optional: for training in the OpenSim-RL environment)](https://github.com/stanfordnmbl/osim-rl)
*******
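To sanity-check an osim-rl setup before wiring it into this repo, a minimal sketch like the following can be used (not part of this commit; it assumes osim-rl v1.x, whose RunEnv exposes a gym-style interface with an 18-dim continuous action space):

```python
# Minimal smoke test for an osim-rl install (assumes osim-rl v1.x).
from osim.env import RunEnv

env = RunEnv(visualize=False)      # visualize=True opens the OpenSim viewer
observation = env.reset()          # low-dimensional body-state vector
total_reward = 0.0
for _ in range(200):
    action = env.action_space.sample()                   # random muscle activations in [0, 1]
    observation, reward, done, info = env.step(action)
    total_reward += reward
    if done:
        break
print("episode return:", total_reward)
```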


4 changes: 2 additions & 2 deletions core/agents/dqn.py
100644 → 100755
@@ -150,7 +150,7 @@ def _epsilon_greedy(self, q_values_ts):
self.eps = self.eps_eval
# choose action
if np.random.uniform() < self.eps: # then we choose a random action
action = random.randrange(self.action_dim)
action = np.random.rand(self.action_dim).tolist()
else: # then we choose the greedy action
if self.use_cuda:
action = np.argmax(q_values_ts.cpu().numpy())
@@ -164,7 +164,7 @@ def _forward(self, observation):
state_ts = torch.from_numpy(np.array(state)).unsqueeze(0).type(self.dtype)
q_values_ts = self.model(Variable(state_ts, volatile=True)).data # NOTE: only doing inference here, so volatile=True
if self.training and self.step < self.learn_start: # then we don't do any learning, just accumulate experiences into replay memory
action = random.randrange(self.action_dim) # thus we only randomly sample actions here, since the model hasn't been updated at all till now
action = np.random.rand(self.action_dim).tolist() # thus we only randomly sample actions here, since the model hasn't been updated at all till now
else:
action = self._epsilon_greedy(q_values_ts)

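The dqn.py change above swaps the random-exploration branch from a single discrete action index to a full continuous action vector, which is what the RunEnv wrapper added below expects. An illustrative comparison (not part of the diff; action_dim = 18 follows the wrapper's docstring):

```python
import random
import numpy as np

action_dim = 18  # RunEnv's continuous action space: one activation per muscle

# Old behaviour: one discrete action index in {0, ..., action_dim - 1}
discrete_action = random.randrange(action_dim)            # e.g. 7

# New behaviour: a full action vector, each component uniform in [0, 1),
# matching what RunEnv.step() expects from a continuous agent
continuous_action = np.random.rand(action_dim).tolist()   # e.g. [0.42, 0.91, ...]
```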
75 changes: 75 additions & 0 deletions opensim.py
@@ -0,0 +1,75 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from copy import deepcopy
from gym.spaces.box import Box
import inspect

from utils.helpers import Experience # NOTE: here state0 is always "None"
from utils.helpers import preprocessAtari, rgb2gray, rgb2y, scale
from core.env import Env

class OpenSim(Env):  # low dimensional observations
    """ Class to set up the OpenSim-RL environment (https://github.com/stanfordnmbl/osim-rl), where the agent has to learn to run! Continuous (18-dim) action space."""
    def __init__(self, args, env_ind=0):
        super(OpenSim, self).__init__(args, env_ind)

        assert self.env_type == "opensim"
        try: from osim.env import RunEnv
        except ImportError as e: self.logger.warning("WARNING: opensim not found")

        self.env = RunEnv(visualize=True)
        #self.env.seed(self.seed) # NOTE: so each env would be different

        # action space setup
        self.actions = range(self.action_dim)
        self.logger.warning("Action Space: %s", self.env.action_space)

        # state space setup
        self.logger.warning("State Space: %s", self.state_shape)

        # continuous space
        #if args.agent_type == "a3c":
        self.enable_continuous = True #args.enable_continuous

    def _preprocessState(self, state):  # NOTE: here no preprocessing is needed
        return state

    @property
    def action_dim(self):
        return self.env.action_space.shape[0]

    @property
    def state_shape(self):
        return self.env.observation_space.shape[0]

    def render(self):
        #if self.mode == 2:
        #    frame = self.env.render(mode='rgb_array')
        #    frame_name = self.img_dir + "frame_%04d.jpg" % self.frame_ind
        #    self.imsave(frame_name, frame)
        #    self.logger.warning("Saved Frame @ Step: " + str(self.frame_ind) + " To: " + frame_name)
        #    self.frame_ind += 1
        #    return frame
        #else:
        #    return self.env.render()
        return

    def visual(self):
        pass

    def sample_random_action(self):
        return self.env.action_space.sample()

    def reset(self):
        self._reset_experience()
        self.exp_state1 = self.env.reset()
        return self._get_experience()

    def step(self, action):
        self.exp_action = action
        if self.enable_continuous:
            self.exp_state1, self.exp_reward, self.exp_terminal1, _ = self.env.step(self.exp_action)
        return self._get_experience()
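Not part of the diff: a hypothetical driver loop for the wrapper above, assuming `args` is the options object that `core.env.Env` expects (`env_type="opensim"`, logger, seed, ...) and that the `Experience` tuple exposes a `terminal1` field; the import path follows the factory entry below.

```python
from core.envs.opensim import OpenSim

def random_rollout(args, n_steps=100):
    """Roll out random muscle activations through the OpenSim wrapper (sketch)."""
    env = OpenSim(args)
    experience = env.reset()                  # Experience with state1 = initial observation
    for _ in range(n_steps):
        action = env.sample_random_action()   # random 18-dim activation vector
        experience = env.step(action)
        if experience.terminal1:              # assumes Experience exposes terminal1
            break
    return experience
```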
5 changes: 4 additions & 1 deletion utils/factory.py
100644 → 100755
@@ -6,10 +6,12 @@
from core.envs.atari_ram import AtariRamEnv
from core.envs.atari import AtariEnv
from core.envs.lab import LabEnv
from core.envs.opensim import OpenSim
EnvDict = {"gym": GymEnv, # classic control games from openai w/ low-level input
"atari-ram": AtariRamEnv, # atari integrations from openai, with low-level input
"atari": AtariEnv, # atari integrations from openai, with pixel-level input
"lab": LabEnv}
"lab": LabEnv,
"opensim": OpenSim}

from core.models.empty import EmptyModel
from core.models.dqn_mlp import DQNMlpModel
@@ -20,6 +22,7 @@
from core.models.acer_cnn_dis import ACERCnnDisModel
ModelDict = {"empty": EmptyModel, # contains nothing, only should be used w/ EmptyAgent
"dqn-mlp": DQNMlpModel, # for dqn low-level input
"dqn-mlp-con": DQNMlpModel, # for dqn low-level input
"dqn-cnn": DQNCnnModel, # for dqn pixel-level input
"a3c-mlp-con": A3CMlpConModel, # for a3c low-level input (NOTE: continuous must end in "-con")
"a3c-cnn-dis": A3CCnnDisModel, # for a3c pixel-level input
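For orientation (an illustrative sketch, not code from this commit): the two dictionaries above map the string keys chosen in utils/options.py to prototypes, so registering "opensim" and "dqn-mlp-con" is what makes the new configuration below resolvable. Roughly:

```python
# Hypothetical illustration of how the factory dictionaries are consumed;
# the actual wiring lives in the training entry point (main.py).
from utils.factory import EnvDict, ModelDict

env_type, model_type = "opensim", "dqn-mlp-con"   # from the new CONFIGS entry below
env_prototype = EnvDict[env_type]                 # -> core.envs.opensim.OpenSim
model_prototype = ModelDict[model_type]           # -> core.models.dqn_mlp.DQNMlpModel
print(env_prototype.__name__, model_prototype.__name__)
```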
42 changes: 36 additions & 6 deletions utils/options.py
100644 → 100755
@@ -22,19 +22,20 @@
[ "dqn", "atari", "BreakoutDeterministic-v4", "dqn-cnn", "sequential"], # 4
[ "a3c", "atari", "PongDeterministic-v4", "a3c-cnn-dis", "none" ], # 5
[ "a3c", "gym", "InvertedPendulum-v1", "a3c-mlp-con", "none" ], # 6
[ "acer", "gym", "MountainCar-v0", "acer-mlp-dis", "episodic" ] # 7 # NOTE: acer under testing
[ "acer", "gym", "MountainCar-v0", "acer-mlp-dis", "episodic" ], # 7 # NOTE: acer under testing
[ "dqn", "opensim", "opensim", "dqn-mlp-con", "sequential"] # 8
]
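For reference (an illustrative sketch, not part of the diff; it assumes the table above is the CONFIGS list indexed by `self.config`): the new row 8 unpacks as agent, environment, game, model, and memory type.

```python
# Hypothetical reading of the new configuration row (index 8):
agent_type, env_type, game, model_type, memory_type = CONFIGS[8]
# -> ("dqn", "opensim", "opensim", "dqn-mlp-con", "sequential")
```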

class Params(object): # NOTE: shared across all modules
def __init__(self):
self.verbose = 0 # 0(warning) | 1(info) | 2(debug)

# training signature
self.machine = "aisdaim" # "machine_id"
self.timestamp = "17082400" # "yymmdd##"
self.machine = "hpc011" # "machine_id"
self.timestamp = "1" # "yymmdd##"
# training configuration
self.mode = 1 # 1(train) | 2(test model_file)
self.config = 7
self.config = 8

self.seed = 123
self.render = False # whether render the window from the original envs or not
@@ -53,7 +54,7 @@ def __init__(self):
self.hidden_dim = 16
else:
self.hist_len = 4
self.hidden_dim = 256
self.hidden_dim = 512#256

self.use_cuda = torch.cuda.is_available()
self.dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
@@ -139,8 +140,12 @@ def __init__(self):
self.wid_state = 80
self.preprocess_mode = 3 # 0(nothing) | 1(rgb2gray) | 2(rgb2y) | 3(crop&resize depth)
self.img_encoding_type = "passthrough"

elif self.env_type == "opensim":
pass

else:
assert False, "env_type must be: gym | atari-ram | atari | lab"
assert False, "env_type must be: gym | atari-ram | atari | lab | opensim"

class ModelParams(Params): # settings for network architecture
def __init__(self):
@@ -228,6 +233,31 @@ def __init__(self):
self.action_repetition = 4
self.memory_interval = 1
self.train_interval = 4
elif self.agent_type == "dqn" and self.env_type == "opensim":
self.steps = 50000000 # max #iterations
self.early_stop = None # max #steps per episode
self.gamma = 0.99
self.clip_grad = 40.#np.inf
self.lr = 0.00025
self.lr_decay = False
self.weight_decay = 0.
self.eval_freq = 250000#12500 # NOTE: here means every this many steps
self.eval_steps = 125000#2500
self.prog_freq = 10000#self.eval_freq
self.test_nepisodes = 1

self.learn_start = 50000 # start update params after this many steps
self.batch_size = 32
self.valid_size = 500
self.eps_start = 1
self.eps_end = 0.1
self.eps_eval = 0.#0.05
self.eps_decay = 1000000
self.target_model_update = 10000
self.action_repetition = 4
self.memory_interval = 1
self.train_interval = 4

elif self.agent_type == "a3c":
self.steps = 20000000 # max #iterations
self.early_stop = None # max #steps per episode
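One way to read the exploration settings in the new opensim DQN block (a sketch; the authoritative schedule is implemented in core/agents/dqn.py and may differ in detail): epsilon stays at eps_start while experience is being collected, is annealed linearly to eps_end over eps_decay steps once learning starts, and is pinned to eps_eval at evaluation time.

```python
def epsilon_at(step, training=True, eps_start=1.0, eps_end=0.1,
               eps_decay=1000000, learn_start=50000, eps_eval=0.0):
    """Sketch of the linear epsilon schedule implied by the options above."""
    if not training:
        return eps_eval
    if step < learn_start:                 # pure random experience collection
        return eps_start
    frac = min(1.0, (step - learn_start) / float(eps_decay))
    return eps_start + frac * (eps_end - eps_start)

# epsilon_at(50000) == 1.0, epsilon_at(550000) == 0.55, epsilon_at(2000000) == 0.1
```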
