Feature/recurrent and multiple trainer MAPPO #326

Merged
merged 72 commits into develop from feature/recurrent-mappo on Apr 21, 2022
Changes from 20 commits
Commits
72 commits
a535e3d
Add rough recurrent code for MAPPO.
DriesSmit Sep 27, 2021
0c6405b
Save progress.
DriesSmit Sep 27, 2021
71f4dc6
Save recurrent PPO progress.
DriesSmit Sep 27, 2021
cc42e98
Recurrent PPO is running.
DriesSmit Sep 28, 2021
1ae02b5
Small fixes.
DriesSmit Sep 28, 2021
ac46176
Recurrent MAPPO trains on the debugging environment!
DriesSmit Sep 29, 2021
1e4aaae
Small fix.
DriesSmit Sep 29, 2021
fa2b43c
Save changes.
DriesSmit Sep 30, 2021
a2bbc86
Add code to MAD4PG.
DriesSmit Oct 3, 2021
345fe1b
Ready to run 2 vs 2 xray_attention.
DriesSmit Oct 4, 2021
a25e977
Decrease queue size.
DriesSmit Oct 5, 2021
3c8a32a
Merge develop.
DriesSmit Oct 29, 2021
792ea3c
PPO seems to be training and running.
DriesSmit Oct 29, 2021
f1006c1
Add multiple trainers PPO example.
DriesSmit Oct 29, 2021
cbf9cda
Merge branch 'develop' into feature/recurrent-mappo
DriesSmit Nov 19, 2021
5b44c5b
Merge develop.
DriesSmit Dec 3, 2021
c33e32c
Fix PPO example.
DriesSmit Dec 3, 2021
ef77441
Fix embed_spec bug.
DriesSmit Dec 3, 2021
fb1c43c
Fix mypy issues.
DriesSmit Dec 3, 2021
58b68f9
Fix mypy issue.
DriesSmit Dec 3, 2021
153e0e5
Merge branch 'develop' into feature/recurrent-mappo
KaleabTessera Dec 13, 2021
e4a343e
Address some of the PR comments.
DriesSmit Dec 14, 2021
d7f8ba5
Add termination condition to MA-PPO.
DriesSmit Dec 14, 2021
05c1650
Merge branch 'develop' into feature/recurrent-mappo
DriesSmit Dec 14, 2021
473d40d
Merge branch 'develop' into feature/recurrent-mappo
DriesSmit Jan 6, 2022
f709231
Merge branch 'develop' into feature/recurrent-mappo
KaleabTessera Jan 6, 2022
aef20d2
Address PR comments.
DriesSmit Jan 12, 2022
92abb30
Add the capability for MAPPO to use continuous action spaces.
DriesSmit Jan 13, 2022
7797e91
Merge branch 'develop' into feature/recurrent-mappo
arnupretorius Jan 14, 2022
eba009c
Merge branch 'develop' into feature/recurrent-mappo
arnupretorius Jan 14, 2022
c89ce9f
fix: Small fixes.
DriesSmit Mar 9, 2022
276933c
fix: mypy.
DriesSmit Mar 9, 2022
ad2f300
Merge branch 'develop' into feature/recurrent-mappo
DriesSmit Mar 10, 2022
c5ffe70
Merge branch 'develop' into feature/recurrent-mappo
DriesSmit Mar 23, 2022
1967146
fix: Test small fix.
DriesSmit Mar 23, 2022
1fcaf0c
fix: Change writer back.
DriesSmit Mar 23, 2022
75ba274
fix: Change back.
DriesSmit Mar 23, 2022
21b376b
fix: Distributional head inside networks.
DriesSmit Mar 23, 2022
232a540
fix: Merge dev.
DriesSmit Mar 24, 2022
315a151
fix: Change static unroll function to a manual unroll function.
DriesSmit Mar 24, 2022
c0e3127
fix: Change setting in PPO sequence adder. Remove custom adder code.
DriesSmit Mar 25, 2022
9217172
fix: Small comment updates.
DriesSmit Mar 25, 2022
66757a2
bugfix: architecture type typo fix
RuanJohn Mar 28, 2022
b8ccfe0
fix: Baseline cost default.
DriesSmit Mar 28, 2022
acf592f
Merge branch 'feature/recurrent-mappo' of github.com:instadeepai/Mava…
DriesSmit Mar 28, 2022
f0186b6
fix: Update default sequence length.
DriesSmit Mar 28, 2022
091d73a
Fix entropy term in trainer for continuous action space environments.
DriesSmit Mar 29, 2022
7ce71c7
fix: Small fix to entropy loss.
DriesSmit Mar 29, 2022
64cb86a
fix: Small fix to variable spelling.
DriesSmit Mar 29, 2022
d797241
fix: Fix continuous action space PPO by creating custom clipped Gauss…
DriesSmit Mar 31, 2022
b34c707
fix: Small fixes.
DriesSmit Mar 31, 2022
ec31d19
fix: Replace clip to spec with tanh to spec.
DriesSmit Mar 31, 2022
527d8df
fix: Remove comment.
DriesSmit Mar 31, 2022
40a430e
fix: Remove comment.
DriesSmit Mar 31, 2022
86147a2
feature: Update Gaussian head's settings control the possible distrib…
DriesSmit Apr 1, 2022
91fc6a0
feature: Small update.
DriesSmit Apr 1, 2022
0adc026
Merge branch 'develop' into feature/recurrent-mappo
DriesSmit Apr 4, 2022
df45468
fix: Remove redundant statement.
DriesSmit Apr 12, 2022
49f8534
fix: Remove redundant statement.
DriesSmit Apr 12, 2022
34c538c
feat: Small improvement.
DriesSmit Apr 12, 2022
7d411dc
fix: PPO training for networks with Categorical heads.
DriesSmit Apr 13, 2022
25f3ea1
fix: Small fix to dataset shuffler.
DriesSmit Apr 13, 2022
dbb5797
fix: Remove print statement.
DriesSmit Apr 13, 2022
a257819
Small fixes to trainer variable client and Hyperparameter settings.
DriesSmit Apr 13, 2022
8e25670
feat: added multiple network fix
sash-a Apr 13, 2022
a88de00
feat: Small updates to hyperparameters. Moving system closer to devel…
DriesSmit Apr 13, 2022
072771b
merge: Merge changes.
DriesSmit Apr 13, 2022
2851235
fix: Small training fixes.
DriesSmit Apr 14, 2022
1674154
fix: Big bugfix in MAPPO trainer code setup.
DriesSmit Apr 14, 2022
b8d2738
Remove variable update inside mappo trainer _step code.
DriesSmit Apr 19, 2022
7aac022
Merge branch 'develop' into feature/recurrent-mappo
DriesSmit Apr 20, 2022
60eb054
fix: Address PR comments.
DriesSmit Apr 21, 2022
2 changes: 1 addition & 1 deletion docs/images/focus_fire.html
@@ -428,4 +428,4 @@
 MTAw
 ">
 Your browser does not support the video tag.
-</video>
+</video>
2 changes: 1 addition & 1 deletion docs/images/runaway.html
@@ -1272,4 +1272,4 @@
 dAAAACWpdG9vAAAAHWRhdGEAAAABAAAAAExhdmY1OC4yOS4xMDA=
 ">
 Your browser does not support the video tag.
-</video>
+</video>
@@ -0,0 +1,112 @@
# python3
# Copyright 2021 InstaDeep Ltd. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Example running feedforward mappo on debug MPE environments.
NB: Using multiple trainers with non-shared weights is still in its
experimental phase of development. This feature will become faster and
more stable in future Mava updates."""

import functools
from datetime import datetime
from typing import Any

import launchpad as lp
import sonnet as snt
from absl import app, flags

from mava.systems.tf import mappo
from mava.systems.tf.mappo import make_default_networks
from mava.utils import enums, lp_utils
from mava.utils.environments import debugging_utils
from mava.utils.loggers import logger_utils

FLAGS = flags.FLAGS
flags.DEFINE_string(
    "env_name",
    "simple_spread",
    "Debugging environment name (str).",
)
flags.DEFINE_string(
    "action_space",
    "discrete",
    "Environment action space type (str).",
)
flags.DEFINE_string(
    "mava_id",
    str(datetime.now()),
    "Experiment identifier that can be used to continue experiments.",
)
flags.DEFINE_string("base_dir", "~/mava/", "Base dir to store experiments.")


def main(_: Any) -> None:

    # Environment.
    environment_factory = functools.partial(
        debugging_utils.make_environment,
        env_name=FLAGS.env_name,
        action_space=FLAGS.action_space,
    )

    # Networks.
    network_factory = lp_utils.partial_kwargs(make_default_networks)

    # Checkpointer appends "Checkpoints" to checkpoint_dir.
    checkpoint_dir = f"{FLAGS.base_dir}/{FLAGS.mava_id}"

    # Log every [log_every] seconds.
    log_every = 10
    logger_factory = functools.partial(
        logger_utils.make_logger,
        directory=FLAGS.base_dir,
        to_terminal=True,
        to_tensorboard=True,
        time_stamp=FLAGS.mava_id,
        time_delta=log_every,
    )

    # Distributed program.
    # NB: Using multiple trainers with non-shared weights is still in its
    # experimental phase of development. This feature will become faster and
    # more stable in future Mava updates.
    program = mappo.MAPPO(
        environment_factory=environment_factory,
        network_factory=network_factory,
        logger_factory=logger_factory,
        num_executors=2,
        shared_weights=False,
        trainer_networks=enums.Trainer.one_trainer_per_network,
        network_sampling_setup=enums.NetworkSampler.fixed_agent_networks,
        policy_optimizer=snt.optimizers.Adam(learning_rate=1e-4),
        critic_optimizer=snt.optimizers.Adam(learning_rate=1e-4),
        checkpoint_subpath=checkpoint_dir,
        max_gradient_norm=40.0,
    ).build()

    # Ensure only the trainer runs on GPU, while other processes run on CPU.
    local_resources = lp_utils.to_device(
        program_nodes=program.groups.keys(), nodes_on_gpu=["trainer"]
    )

    # Launch.
    lp.launch(
        program,
        lp.LaunchType.LOCAL_MULTI_PROCESSING,
        terminal="current_terminal",
        local_resources=local_resources,
    )


if __name__ == "__main__":
app.run(main)
121 changes: 121 additions & 0 deletions examples/debugging/simple_spread/recurrent/state_based/run_mappo.py
@@ -0,0 +1,121 @@
# python3
# Copyright 2021 InstaDeep Ltd. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Example running MAPPO on debug MPE environments."""

import functools
from datetime import datetime
from typing import Any

import launchpad as lp
import sonnet as snt
from absl import app, flags

from mava.components.tf import architectures
from mava.systems.tf import mappo
from mava.utils import lp_utils
from mava.utils.enums import ArchitectureType
from mava.utils.environments import debugging_utils
from mava.utils.loggers import logger_utils

FLAGS = flags.FLAGS
flags.DEFINE_string(
    "env_name",
    "simple_spread",
    "Debugging environment name (str).",
)
flags.DEFINE_string(
    "action_space",
    "discrete",
    "Environment action space type (str).",
)

flags.DEFINE_string(
    "mava_id",
    str(datetime.now()),
    "Experiment identifier that can be used to continue experiments.",
)
flags.DEFINE_string("base_dir", "~/mava", "Base dir to store experiments.")


def main(_: Any) -> None:

    recurrent_test = False
    recurrent_ppo = True

    # Environment.
    environment_factory = functools.partial(
        debugging_utils.make_environment,
        env_name=FLAGS.env_name,
        action_space=FLAGS.action_space,
        return_state_info=True,
        recurrent_test=recurrent_test,
    )

    # Networks.
    network_factory = lp_utils.partial_kwargs(
        mappo.make_default_networks,
        architecture_type=ArchitectureType.recurrent
        if recurrent_ppo
        else ArchitectureType.feedforward,
    )

    # Checkpointer appends "Checkpoints" to checkpoint_dir.
    checkpoint_dir = f"{FLAGS.base_dir}/{FLAGS.mava_id}"

    # Log every [log_every] seconds.
    log_every = 10
    logger_factory = functools.partial(
        logger_utils.make_logger,
        directory=FLAGS.base_dir,
        to_terminal=True,
        to_tensorboard=True,
        time_stamp=FLAGS.mava_id,
        time_delta=log_every,
    )

    # Distributed program.
    program = mappo.MAPPO(
        environment_factory=environment_factory,
        network_factory=network_factory,
        logger_factory=logger_factory,
        num_executors=1,
        policy_optimizer=snt.optimizers.Adam(learning_rate=1e-4),
        critic_optimizer=snt.optimizers.Adam(learning_rate=1e-4),
        checkpoint_subpath=checkpoint_dir,
        max_gradient_norm=40.0,
        executor_fn=mappo.MAPPORecurrentExecutor
        if recurrent_ppo
        else mappo.MAPPOFeedForwardExecutor,
        architecture=architectures.StateBasedValueActorCritic,
        trainer_fn=mappo.StateBasedMAPPOTrainer,
    ).build()

    # Ensure only the trainer runs on GPU, while other processes run on CPU.
    local_resources = lp_utils.to_device(
        program_nodes=program.groups.keys(), nodes_on_gpu=["trainer"]
    )

    # Launch.
    lp.launch(
        program,
        lp.LaunchType.LOCAL_MULTI_PROCESSING,
        terminal="current_terminal",
        local_resources=local_resources,
    )


if __name__ == "__main__":
app.run(main)
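Commit 315a151 in this PR swaps a static unroll for a manual unroll when running recurrent networks. The pattern itself is simple: step the recurrent core through the time axis in an explicit loop, threading the state along. A minimal, self-contained sketch of that pattern, using plain-Python stand-ins rather than Mava's actual networks:

```python
# Manual-unroll sketch: the core here is a toy running-sum "RNN", not
# Mava's real recurrent network; names are illustrative only.
from typing import Callable, List, Tuple


def manual_unroll(
    core: Callable[[float, float], Tuple[float, float]],
    inputs: List[float],
    initial_state: float,
) -> Tuple[List[float], float]:
    """Apply `core` once per timestep, threading the state through."""
    state = initial_state
    outputs = []
    for x in inputs:  # one explicit step per timestep
        out, state = core(x, state)
        outputs.append(out)
    return outputs, state


def sum_core(x: float, state: float) -> Tuple[float, float]:
    # Toy recurrent core: output and next state are both the running sum.
    new_state = state + x
    return new_state, new_state


outputs, final_state = manual_unroll(sum_core, [1.0, 2.0, 3.0], 0.0)
print(outputs, final_state)  # [1.0, 3.0, 6.0] 6.0
```

The explicit loop trades a little speed for easier debugging and for cores whose step logic is awkward to express in a static unroll.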
1 change: 1 addition & 0 deletions mava/components/tf/architectures/__init__.py
@@ -41,4 +41,5 @@
     StateBasedQValueActorCritic,
     StateBasedQValueCritic,
     StateBasedQValueSingleActionCritic,
+    StateBasedValueActorCritic,
 )
8 changes: 5 additions & 3 deletions mava/components/tf/architectures/centralised.py
@@ -99,9 +99,10 @@ def _get_critic_specs(

         for agent_type, agents in agents_by_type.items():
             agent_key = agents[0]
-            critic_obs_shape = list(copy.copy(self._embed_specs[agent_key].shape))
+            net_key = self._agent_net_keys[agent_key]
+            critic_obs_shape = list(copy.copy(self._embed_specs[net_key].shape))
             critic_obs_shape.insert(0, len(agents))
-            obs_specs_per_type[agent_type] = tf.TensorSpec(
+            obs_specs_per_type[net_key] = tf.TensorSpec(
                 shape=critic_obs_shape,
                 dtype=tf.dtypes.float32,
             )
@@ -143,11 +144,12 @@ def _get_critic_specs(

         for agent_type, agents in agents_by_type.items():
             agent_key = agents[0]
+            net_key = self._agent_net_keys[agent_key]

             # TODO (dries): Add a check to see if all
             # self._embed_specs[agent_key].shape are of the same shape

-            critic_obs_shape = list(copy.copy(self._embed_specs[agent_key].shape))
+            critic_obs_shape = list(copy.copy(self._embed_specs[net_key].shape))
             critic_obs_shape.insert(0, len(agents))
             obs_specs_per_type[agent_type] = tf.TensorSpec(
                 shape=critic_obs_shape,
                 dtype=tf.dtypes.float32,
             )
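The centralised-critic hunks above change the key used for the embedding-spec lookup from the agent key to the agent's network key, then stack one embedding per agent into the critic's observation shape. A minimal sketch of the fixed logic in plain Python (no TensorFlow; `embed_spec_shapes` and friends are illustrative stand-ins for Mava's internals):

```python
# Per-network embedding shapes, agent-to-network mapping, and agents
# grouped by type -- stand-ins for self._embed_specs, self._agent_net_keys,
# and agents_by_type in the diff above.
embed_spec_shapes = {"network_0": (64,)}
agent_net_keys = {"agent_0": "network_0", "agent_1": "network_0"}
agents_by_type = {"agent": ["agent_0", "agent_1"]}

obs_specs_per_type = {}
for agent_type, agents in agents_by_type.items():
    agent_key = agents[0]
    # The fix: look up the spec by the agent's *network* key, not the agent key.
    net_key = agent_net_keys[agent_key]
    critic_obs_shape = list(embed_spec_shapes[net_key])
    critic_obs_shape.insert(0, len(agents))  # stack one embedding per agent
    obs_specs_per_type[net_key] = tuple(critic_obs_shape)

print(obs_specs_per_type)  # {'network_0': (2, 64)}
```

Keying by network key matters once weights are shared or multiple trainers are used, because several agents then map onto one network entry.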
11 changes: 4 additions & 7 deletions mava/components/tf/architectures/decentralised.py
@@ -157,7 +157,7 @@ def create_actor_variables(self) -> Dict[str, Dict[str, snt.Module]]:
             emb_spec = tf2_utils.create_variables(
                 self._observation_networks[agent_net_key], [obs_spec]
             )
-            self._embed_specs[agent_key] = emb_spec
+            self._embed_specs[agent_net_key] = emb_spec

             # Create variables.
             tf2_utils.create_variables(self._policy_networks[agent_net_key], [emb_spec])
@@ -267,7 +267,7 @@ def create_actor_variables(self) -> Dict[str, Dict[str, snt.Module]]:
             emb_spec = tf2_utils.create_variables(
                 self._observation_networks[net_key], [obs_spec]
             )
-            self._embed_specs[agent_key] = emb_spec
+            self._embed_specs[net_key] = emb_spec

             # Create variables.
             tf2_utils.create_variables(self._policy_networks[net_key], [emb_spec])
@@ -279,7 +279,6 @@ def create_actor_variables(self) -> Dict[str, Dict[str, snt.Module]]:
             tf2_utils.create_variables(
                 self._target_observation_networks[net_key], [obs_spec]
             )
-
         actor_networks: Dict[str, Dict[str, snt.Module]] = {
             "policies": self._policy_networks,
             "observations": self._observation_networks,
@@ -295,10 +294,8 @@ def create_critic_variables(self) -> Dict[str, Dict[str, snt.Module]]:

         # create critics
         for net_key in self._net_keys:
-            agent_key = self._net_spec_keys[net_key]
-
             # get specs
-            emb_spec = embed_specs[agent_key]
+            emb_spec = embed_specs[net_key]

             # Create variables.
             tf2_utils.create_variables(self._critic_networks[net_key], [emb_spec])
@@ -372,7 +369,7 @@ def create_critic_variables(self) -> Dict[str, Dict[str, snt.Module]]:
             agent_key = self._net_spec_keys[net_key]

             # get specs
-            emb_spec = embed_specs[agent_key]
+            emb_spec = embed_specs[net_key]
             act_spec = act_specs[agent_key]

             # Create variables.
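The decentralised-architecture hunks apply the same rekeying: embedding specs are stored and fetched by network key instead of agent key. A simplified sketch of why the old keying breaks — with shared weights, several agents map to one network, so a critic loop that iterates network keys never finds specs stored under agent keys. All names below are stand-ins for Mava's internals, and the failure mode is deliberately reduced to a dict lookup:

```python
# Two agents sharing one network, as under shared_weights=True.
agent_net_keys = {"agent_0": "network_0", "agent_1": "network_0"}

# Buggy version: the actor loop keys specs by agent key.
buggy_specs = {}
for agent_key, net_key in agent_net_keys.items():
    buggy_specs[agent_key] = (64,)
# The critic loop iterates network keys, so its lookup misses.
assert "network_0" not in buggy_specs

# Fixed version (as in the diff): key specs by network key.
fixed_specs = {}
for agent_key, net_key in agent_net_keys.items():
    fixed_specs[net_key] = (64,)

emb_spec = fixed_specs["network_0"]  # critic loop finds its spec
print(emb_spec)  # (64,)
```

In the real code the lookup went through `self._net_spec_keys`, which papered over the mismatch in simple setups but broke once multiple trainers and non-shared weights were introduced.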