Commit

Update hyperparams
amacati committed Jan 26, 2025
1 parent d0852f1 commit f72cc2b
Showing 2 changed files with 35 additions and 42 deletions.
55 changes: 24 additions & 31 deletions tutorials/ppo/sweep.py
@@ -4,66 +4,59 @@
from train import train_ppo

sweep_config = {
-    "method": "bayes",
+    "method": "random",
    "metric": {"name": "eval/mean_rewards", "goal": "maximize"},
    "parameters": {
-        "n_envs": {"values": [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]},
-        "n_train_samples": {"values": [int(2**n) for n in range(15, 20)]},
-        "learning_rate": {
-            "distribution": "log_uniform",
-            "min": -10, # e⁻¹⁰ ~= 5e-5
-            "max": -5, # e⁻⁵ ~= 6e-3
-        },
-        "n_minibatches": {"values": [8, 16, 32, 64, 128]},
-        "n_epochs": {"values": [5, 10, 15]},
-        "clip_coef": {"distribution": "uniform", "min": 0.1, "max": 0.3},
-        "ent_coef": {"distribution": "uniform", "min": 0.0, "max": 0.25},
-        "vf_coef": {"distribution": "uniform", "min": 0.4, "max": 0.6},
-        "gamma": {"distribution": "uniform", "min": 0.9, "max": 0.999},
-        "gae_lambda": {"distribution": "uniform", "min": 0.5, "max": 0.99},
-        "max_grad_norm": {"distribution": "uniform", "min": 0.2, "max": 5.0},
+        "learning_rate": {"distribution": "uniform", "min": 1e-4, "max": 5e-3},
+        "clip_coef": {"distribution": "uniform", "min": 0.2, "max": 0.3},
+        "ent_coef": {"distribution": "uniform", "min": 0.0, "max": 0.05},
+        "gamma": {"distribution": "uniform", "min": 0.8, "max": 0.99},
+        "gae_lambda": {"distribution": "uniform", "min": 0.9, "max": 0.99},
+        "max_grad_norm": {"distribution": "uniform", "min": 1.0, "max": 5.0},
    },
}
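Aside: the dropped log_uniform bounds are natural-log exponents, as the inline comments note, while the replacement learning_rate range is sampled linearly. A quick, purely illustrative check of the two ranges (not code from the commit):

import math

# Old sweep: learning_rate ~ exp(Uniform(-10, -5))
print(math.exp(-10))  # ≈ 4.5e-05, i.e. the "e⁻¹⁰ ~= 5e-5" comment
print(math.exp(-5))   # ≈ 6.7e-03, i.e. the "e⁻⁵ ~= 6e-3" comment

# New sweep: learning_rate ~ Uniform(1e-4, 5e-3) — a slightly narrower range
# that, being sampled linearly, favors larger learning rates than log-uniform.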

config = ConfigDict(
    {
-        "n_envs": 32,
+        "n_envs": 1024,
        "device": "cuda",
-        "total_timesteps": 2_000_000,
-        "learning_rate": 3e-4,
-        "n_steps": 2048, # Number of steps per environment per policy rollout
-        "gamma": 0.99, # Discount factor
+        "total_timesteps": 1_000_000,
+        "learning_rate": 1.5e-3,
+        "n_steps": 16, # Number of steps per environment per policy rollout
+        "gamma": 0.90, # Discount factor
        "gae_lambda": 0.95, # Lambda for general advantage estimation
-        "n_minibatches": 32, # Number of mini-batches
-        "n_epochs": 10,
+        "n_minibatches": 16, # Number of mini-batches
+        "n_epochs": 15,
        "norm_adv": True,
-        "clip_coef": 0.2,
+        "clip_coef": 0.25,
        "clip_vloss": True,
-        "ent_coef": 0.0,
+        "ent_coef": 0.01,
        "vf_coef": 0.5,
-        "max_grad_norm": 0.5,
+        "max_grad_norm": 5.0,
        "target_kl": None,
        "seed": 0,
        "n_eval_envs": 64,
        "n_eval_steps": 1_000,
        "save_model": False,
-        "eval_interval": 40_000,
+        "eval_interval": 999_000,
    }
)
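For orientation, the rollout sizes implied by the new defaults, assuming the usual PPO batching where one rollout collects n_envs * n_steps samples and splits them into n_minibatches (a back-of-the-envelope sketch, not repository code):

n_envs, n_steps, n_minibatches = 1024, 16, 16

batch_size = n_envs * n_steps                  # 16_384 samples per rollout
minibatch_size = batch_size // n_minibatches   # 1_024 samples per gradient step
print(batch_size, minibatch_size)              # 16384 1024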


-def main(n_runs: int | None = None):
+def main(n_runs: int | None = None, sweep: str | None = None):
    with open("wandb_api_key.secret", "r") as f:
        wandb_api_key = f.read().lstrip("\n").rstrip("\n")
    wandb.login(key=wandb_api_key)
+    project = "crazyflow-ppo-x"

-    sweep_id = wandb.sweep(sweep_config, project="crazyflow-ppo")
+    if sweep is None:
+        sweep = wandb.sweep(sweep_config, project=project)

    wandb.agent(
-        sweep_id,
+        sweep,
        lambda: train_ppo(config.copy_and_resolve_references(), True),
        count=n_runs,
-        project="crazyflow-ppo",
+        project=project,
    )
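With the added sweep argument, a worker can either create a new sweep or attach to an existing one; hypothetical usage (the sweep ID string is a placeholder):

# Create a new sweep on the "crazyflow-ppo-x" project and run 20 trials:
main(n_runs=20)

# Attach another worker to an already-created sweep (placeholder ID):
main(n_runs=20, sweep="abc123xy")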


22 changes: 11 additions & 11 deletions tutorials/ppo/train.py
@@ -160,7 +160,7 @@ def train_ppo(config: ConfigDict, wandb_log: bool = False):
wandb_api_key = f.read().lstrip("\n").rstrip("\n")

wandb.login(key=wandb_api_key)
wandb.init(project="crazyflow-ppo", config=None)
wandb.init(project="crazyflow-ppo-x", config=None)
config.update(wandb.config)
if config.get("n_train_samples"):
config.n_steps = config.n_train_samples // config.n_envs
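When a sweep run supplies n_train_samples, the per-environment rollout length is derived from it rather than taken from the configured n_steps; a quick illustration of that arithmetic with the new n_envs default (the sample counts are only examples):

n_envs = 1024
for n_train_samples in (2**15, 2**17, 2**19):
    print(n_train_samples, "->", n_train_samples // n_envs)  # 32, 128, 512 steps per env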
@@ -336,7 +336,7 @@ def train_ppo(config: ConfigDict, wandb_log: bool = False):
explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

# Evaluate the agent
-if global_step - last_eval > config.eval_interval:
+if global_step - last_eval >= config.eval_interval:
sync_envs(train_envs, eval_envs)
eval_rewards, eval_steps = evaluate_agent(
eval_envs,
@@ -375,33 +375,33 @@ def train_ppo(config: ConfigDict, wandb_log: bool = False):
    if config.save_model:
        save_model(agent, optimizer, train_envs, Path(__file__).parent / "ppo_checkpoint.pt")

-    plot_results(train_rewards_hist, train_rewards_steps, eval_rewards_hist, eval_rewards_steps)
+    # plot_results(train_rewards_hist, train_rewards_steps, eval_rewards_hist, eval_rewards_steps)


if __name__ == "__main__":
    config = ConfigDict(
        {
-            "n_envs": 32,
+            "n_envs": 1024,
            "device": "cuda",
-            "total_timesteps": 2_000_000,
-            "learning_rate": 5e-3,
-            "n_steps": 1024, # Number of steps per environment per policy rollout
+            "total_timesteps": 1_000_000,
+            "learning_rate": 1.5e-3,
+            "n_steps": 16, # Number of steps per environment per policy rollout
            "gamma": 0.90, # Discount factor
-            "gae_lambda": 0.90, # Lambda for general advantage estimation
-            "n_minibatches": 8, # Number of mini-batches
+            "gae_lambda": 0.95, # Lambda for general advantage estimation
+            "n_minibatches": 16, # Number of mini-batches
            "n_epochs": 15,
            "norm_adv": True,
            "clip_coef": 0.25,
            "clip_vloss": True,
-            "ent_coef": 0.0,
+            "ent_coef": 0.01,
            "vf_coef": 0.5,
            "max_grad_norm": 5.0,
            "target_kl": None,
            "seed": 0,
            "n_eval_envs": 64,
            "n_eval_steps": 1_000,
            "save_model": False,
-            "eval_interval": 40_000,
+            "eval_interval": 999_000,
        }
    )

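Rough training-loop arithmetic implied by the new defaults, assuming global_step advances by n_envs * n_steps per iteration (a sketch under that assumption, not repository code):

n_envs, n_steps, total_timesteps = 1024, 16, 1_000_000

samples_per_iter = n_envs * n_steps              # 16_384
n_iters = total_timesteps // samples_per_iter    # 61 full iterations
updates_per_iter = 15 * 16                       # n_epochs * n_minibatches = 240 gradient steps
print(n_iters, updates_per_iter)

# With eval_interval = 999_000 and the new ">=" check, evaluation fires roughly
# once, near the very end of training.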
