Commit

Fix use of dones buffer
amacati committed Jan 24, 2025
1 parent eb27b6f commit 9349ff2
Showing 4 changed files with 9 additions and 5 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -13,4 +13,5 @@ build
 !/.devcontainer/devcontainer.linux.json
 !/.devcontainer/devcontainer.wsl2.json
 !/.vscode/launch.json
-**/*.pt
+**/*.pt
+tutorials/ppo/wandb
1 change: 0 additions & 1 deletion crazyflow/gymnasium_envs/crazyflow.py
@@ -11,7 +11,6 @@
 from gymnasium.vector import VectorEnv, VectorWrapper
 from gymnasium.vector.utils import batch_space
 from jax import Array
-from numpy.typing import NDArray
 
 from crazyflow.control.control import MAX_THRUST, MIN_THRUST, Control
 from crazyflow.sim import Sim
2 changes: 1 addition & 1 deletion tutorials/ppo/sweep.py
@@ -29,7 +29,7 @@
 {
     "n_envs": 32,
     "device": "cuda",
-    "total_timesteps": 4_000_000,
+    "total_timesteps": 2_000_000,
     "learning_rate": 3e-4,
     "n_steps": 2048,  # Number of steps per environment per policy rollout
     "gamma": 0.99,  # Discount factor
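With these values, the halved budget amounts to roughly 30 policy updates, assuming the common PPO bookkeeping of one update per n_envs * n_steps rollout (a CleanRL-style convention, not something stated in this diff):

# Back-of-the-envelope check; values taken from the sweep config above.
n_envs = 32
n_steps = 2048
total_timesteps = 2_000_000

rollout_size = n_envs * n_steps                # 65,536 transitions per rollout
num_updates = total_timesteps // rollout_size  # 30 rollout/update cycles
print(rollout_size, num_updates)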
8 changes: 6 additions & 2 deletions tutorials/ppo/train.py
@@ -189,7 +189,9 @@ def train_ppo(config: ConfigDict, wandb_log: bool = False):
     ).to(config.device)
     logprobs_buffer = torch.zeros((config.n_steps, config.n_envs)).to(config.device)
     rewards_buffer = torch.zeros((config.n_steps, config.n_envs)).to(config.device)
+    # TODO: Remove dones buffer
     dones_buffer = torch.zeros((config.n_steps, config.n_envs)).to(config.device)
+    terminated_buffer = torch.zeros((config.n_steps, config.n_envs)).to(config.device)
     values_buffer = torch.zeros((config.n_steps, config.n_envs)).to(config.device)
 
     # Stats tracking setup
@@ -229,6 +231,7 @@ def train_ppo(config: ConfigDict, wandb_log: bool = False):
         mask = active & ~autoreset
         obs_buffer[steps[mask], mask] = obs[mask]
         dones_buffer[steps[mask], mask] = done[mask].float()
+        terminated_buffer[steps[mask], mask] = terminated[mask].float()
         values_buffer[steps[mask], mask] = value[mask].squeeze()
         actions_buffer[steps[mask], mask] = action[mask]
         logprobs_buffer[steps[mask], mask] = logprob[mask].squeeze()
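The new terminated_buffer records only true terminations. Under the Gymnasium step API, terminated and truncated are returned separately and done is their union; the following minimal sketch shows where the flag comes from (a generic placeholder env and variable names, not this repository's rollout loop):

import gymnasium as gym
import numpy as np

envs = gym.make_vec("CartPole-v1", num_envs=4)  # placeholder env, not the crazyflow env
obs, info = envs.reset()
obs, reward, terminated, truncated, info = envs.step(envs.action_space.sample())

done = np.logical_or(terminated, truncated)  # episode ended for any reason
# Only `terminated` means the final state has no future value; `truncated`
# episodes were cut off (e.g. by a time limit) and can still bootstrap.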
@@ -246,10 +249,11 @@ def train_ppo(config: ConfigDict, wandb_log: bool = False):
         lastgaelam = 0
         for t in reversed(range(config.n_steps)):
             if t == config.n_steps - 1:
-                nextnonterminal = 1.0 - dones_buffer[t]  # TODO: Replace with terminated buffer
+                # TODO: Check that terminated is correct instead of dones
+                nextnonterminal = 1.0 - terminated_buffer[t]
                 nextvalues = next_value
             else:
-                nextnonterminal = 1.0 - dones_buffer[t + 1]
+                nextnonterminal = 1.0 - terminated_buffer[t + 1]
                 nextvalues = values_buffer[t + 1]
             delta = (
                 rewards_buffer[t]
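The change above swaps the bootstrap mask from done (terminated or truncated) to terminated only, which is what the GAE recursion wants: truncation ends an episode for bookkeeping reasons, not because the final state has no future return. Below is a self-contained sketch of that recursion using the buffer names from the diff, but otherwise not this repository's code; like the TODO above, it leaves open how to obtain a bootstrap value at truncated steps:

import torch

def compute_gae(rewards, values, terminated, next_value, gamma=0.99, gae_lambda=0.95):
    """GAE over (n_steps, n_envs) buffers, masking bootstraps with `terminated` only."""
    n_steps = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    lastgaelam = torch.zeros_like(next_value)
    for t in reversed(range(n_steps)):
        if t == n_steps - 1:
            nextnonterminal = 1.0 - terminated[t]
            nextvalues = next_value
        else:
            nextnonterminal = 1.0 - terminated[t + 1]
            nextvalues = values[t + 1]
        delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
        lastgaelam = delta + gamma * gae_lambda * nextnonterminal * lastgaelam
        advantages[t] = lastgaelam
    returns = advantages + values
    return advantages, returns

With the shapes allocated earlier in this diff, a call would look like compute_gae(rewards_buffer, values_buffer, terminated_buffer, next_value).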
