upload model
mprhode committed Jan 31, 2022
1 parent 6af4514 commit eacd30a
Showing 14 changed files with 1,484 additions and 22 deletions.
160 changes: 160 additions & 0 deletions DQN/DQNAgent.py
@@ -0,0 +1,160 @@
# check out https://github.com/philtabor/Deep-Q-Learning-Paper-To-Code/tree/master/DDQN
# I didn't make many changes; I simply ensured it fits with the CybORG BaseAgent interface

import inspect

from CybORG import CybORG
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.Wrappers.ChallengeWrapper import ChallengeWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper

import torch as T
import numpy as np
from DQN.DeepQNetwork import DeepQNetwork, DeepRNNNetwork
from DQN.ReplayBuffer import ReplayBuffer

class DQNAgent(BaseAgent):
    def __init__(self, gamma=0.9, epsilon=0, lr=0.1, n_actions=41, input_dims=(52,),
                 mem_size=1000, batch_size=32, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo='DDQN', env_name='Scenario1b', chkpt_dir='chkpt', load=False):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    # if epsilon=0 the agent just uses the model (acts greedily)
    def get_action(self, observation, action_space, ignore_epsilon=False):
        if ignore_epsilon or (np.random.random() > self.epsilon):
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.replace_target_cnt is not None and \
                self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def train(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()
        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)
        # double DQN update: select next actions with q_eval, evaluate them with q_next
        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_)
        q_eval = self.q_eval.forward(states_)
        max_actions = T.argmax(q_eval, dim=1)
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next[indices, max_actions]
        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()

    def end_episode(self):
        pass

    def set_initial_values(self, action_space, observation):
        pass

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()


class RNNDQNAgent(DQNAgent):
    def __init__(self, gamma=0.99, epsilon=1, lr=0.0001, n_actions=10, input_dims=(10,), lookback_steps=7,
                 mem_size=1000, batch_size=64, eps_min=0.01, eps_dec=5e-7, hid_size=64,
                 replace=1000, algo=None, env_name=None, chkpt_dir='chkpt', load=False,
                 env=None):
        self.lookback_steps = lookback_steps

        super(RNNDQNAgent, self).__init__(gamma=gamma, epsilon=epsilon, lr=lr, n_actions=n_actions,
                                          input_dims=input_dims,
                                          mem_size=mem_size, batch_size=batch_size, eps_min=eps_min, eps_dec=eps_dec,
                                          replace=replace, algo=algo, env_name=env_name, chkpt_dir=chkpt_dir)

        # the replay buffer stores a window of the last `lookback_steps` observations per transition
        self.memory = ReplayBuffer(mem_size, (self.lookback_steps, input_dims[0]), n_actions)

        self.q_eval = DeepRNNNetwork(self.lr, self.n_actions,
                                     input_dims=self.input_dims,
                                     name=self.env_name + '_' + self.algo + '_q_eval',
                                     chkpt_dir=self.chkpt_dir, hid_size=hid_size)
        self.q_next = DeepRNNNetwork(self.lr, self.n_actions,
                                     input_dims=self.input_dims,
                                     name=self.env_name + '_' + self.algo + '_q_next',
                                     chkpt_dir=self.chkpt_dir, hid_size=hid_size)

        self.observation_buffer = np.zeros((self.lookback_steps, self.input_dims[0]))

        if load:
            self.load_models()

    def get_action(self, observation, action_space):
        # a single observation is appended to the rolling window; a full window replaces it
        if observation.shape != self.observation_buffer.shape:
            self.observation_buffer[:-1] = self.observation_buffer[1:]
            self.observation_buffer[-1] = observation
        else:
            self.observation_buffer = observation

        if np.random.random() > self.epsilon:
            state = T.tensor([self.observation_buffer], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action
69 changes: 69 additions & 0 deletions DQN/DeepQNetwork.py
@@ -0,0 +1,69 @@
# check out https://github.com/philtabor/Deep-Q-Learning-Paper-To-Code/tree/master/DDQN
# The only changes I made concern the network architecture (no CNN here)

import os

import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = T.device("cuda") if T.cuda.is_available() else T.device("cpu")


class DeepQNetwork(nn.Module):
    def __init__(self, lr, n_actions, name, input_dims, chkpt_dir):
        super(DeepQNetwork, self).__init__()
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir, name)

        # you may want to play around with this and forward()
        self.fc1 = nn.Linear(input_dims[0], 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, n_actions)

        self.optimizer = optim.RMSprop(self.parameters(), lr=lr)

        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    # you may want to play around with this
    def forward(self, state):
        flat1 = F.relu(self.fc1(state))
        flat2 = F.relu(self.fc2(flat1))
        actions = self.fc3(flat2)
        return actions

    def save_checkpoint(self):
        print('... saving checkpoint ...')
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        print('... loading checkpoint ...')
        self.load_state_dict(T.load(self.checkpoint_file))


class DeepRNNNetwork(DeepQNetwork):
    def __init__(self, lr, n_actions, name, input_dims, chkpt_dir, hid_size=64):
        super(DeepRNNNetwork, self).__init__(lr, n_actions, name, input_dims, chkpt_dir)

        self.n_layers = 2
        self.hidden_dim = hid_size
        # a 2-layer GRU over the observation window, followed by a linear layer mapping
        # the final hidden state to Q-values
        self.gru = nn.GRU(input_dims[0], hidden_size=self.hidden_dim, num_layers=2, batch_first=True, device=device)
        self.fc3 = nn.Linear(self.hidden_dim, n_actions, device=device)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, state, hidden=None):
        if hidden is None:
            hidden = self.init_hidden(state.shape[0])
        out, h1 = self.gru(state, hidden)
        # use only the output at the last time step
        actions = self.fc3(F.relu(out[:, -1]))
        return actions

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device)
        return hidden

43 changes: 43 additions & 0 deletions DQN/ReplayBuffer.py
@@ -0,0 +1,43 @@
# from https://github.com/philtabor/Deep-Q-Learning-Paper-To-Code/tree/master/DDQN

import numpy as np

class ReplayBuffer(object):
    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape),
                                     dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape),
                                         dtype=np.float32)

        self.action_memory = np.zeros(self.mem_size, dtype=np.int64)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # np.bool is deprecated in recent NumPy releases; np.bool_ behaves identically here
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        terminal = self.terminal_memory[batch]

        return states, actions, rewards, states_, terminal

    def get_last_rewards(self, batch_size):
        rewards = self.reward_memory[-1 * batch_size:]
        return rewards


Binary file added DQN/__pycache__/DQNAgent.cpython-38.pyc
Binary file added DQN/__pycache__/DeepQNetwork.cpython-38.pyc
Binary file added DQN/__pycache__/ReplayBuffer.cpython-38.pyc
57 changes: 35 additions & 22 deletions README.md
@@ -1,42 +1,55 @@
# Cage-submission

Because the red agents' behaviours do not change mid-episode and are predictable, we thought that fingerprinting the agent we are facing and then assigning it to a trained model made the most sense. If multiple red agents could exist in the environment in parallel, if the red agents could change behaviour mid-episode, or if noise were added (Green Agent), then we would have applied hierarchical RL or utilised an RNN (which we expect to do in the second version of the challenge).
In addition, because the action space is small (the blue agent cannot perform multiple actions at once, e.g. restore several hosts in one step), we felt that reinforcement learning was appropriate; in reality, however, the action spaces for the defender (and attacker) would be too large for our approach.
This approach rotated the red agents (Sleep, Meander and B_line) between training epochs of a recurrent-neural-network double DQN.
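The snippet below is a minimal sketch of this rotation, not the actual train.py: it reuses RNNDQNAgent from this commit, assumes the usual CybORG 1.2 classes and scenario path (B_lineAgent, RedMeanderAgent, SleepAgent, ChallengeWrapper), and uses illustrative episode/step counts, so the real training script may differ.

    import inspect

    import numpy as np

    from CybORG import CybORG
    from CybORG.Agents import B_lineAgent, RedMeanderAgent, SleepAgent
    from CybORG.Agents.Wrappers.ChallengeWrapper import ChallengeWrapper

    from DQN.DQNAgent import RNNDQNAgent

    # scenario file shipped with CybORG 1.2 (same path pattern as the challenge evaluation script)
    path = str(inspect.getfile(CybORG))[:-10] + '/Shared/Scenarios/Scenario1b.yaml'

    # n_actions/input_dims are assumed to match the wrapped environment's flat spaces
    agent = RNNDQNAgent(n_actions=41, input_dims=(52,), lookback_steps=16,
                        algo='RNNDDQN', env_name='Scenario1b')

    red_agents = [SleepAgent, B_lineAgent, RedMeanderAgent]
    for episode in range(1000):
        red_agent = red_agents[episode % len(red_agents)]      # rotate the red agent each episode
        cyborg = CybORG(path, 'sim', agents={'Red': red_agent})
        env = ChallengeWrapper(env=cyborg, agent_name='Blue')
        obs = env.reset()
        agent.observation_buffer[:] = 0                        # clear the RNN observation window
        for step in range(100):
            action = agent.get_action(obs, env.get_action_space('Blue'))  # also updates the window
            state = agent.observation_buffer.copy()
            obs, reward, done, info = env.step(action)
            next_state = np.vstack([state[1:], obs])           # window after seeing the next observation
            agent.store_transition(state, action, reward, next_state, done)
            agent.train()
        agent.end_episode()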

As a result, we trained two models using DDQN: one for B_line and one for Meander. We also experimented with regular Q-learning for B_line after reducing the action and observation spaces; this was successful, but it is not included in this submission because it does not add any value. It was nevertheless interesting for inspecting the largest and smallest Q-values to confirm our suspicions.
An RNN was chosen to give the model some memory of the red agent's past actions and thereby help distinguish the agents - we found that RNNs with a memory of 16 or 32 steps tended to outperform those with a memory of only 8 steps.

Finally, it should be noted that we have not considered the Misinform action because it was not in the initial release. This made sense as the Green Agent does not feature in the evaluation.
We used a random search to find the best-performing hyperparameter configuration.
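A minimal sketch of such a random search is shown below; train_and_score is a hypothetical helper standing in for a full training-and-evaluation run, and the value grid is illustrative rather than the exact space we searched.

    import random

    def train_and_score(config):
        """Hypothetical helper: train an RNN-DDQN with `config` (e.g. using the rotation
        loop sketched above) and return its mean evaluation reward."""
        raise NotImplementedError

    # illustrative value grid; each trial samples one value per hyperparameter
    search_space = {
        'lookback_steps': [8, 16, 32],
        'hid_size': [32, 64, 128],
        'gamma': [0.5, 0.9, 0.99],
        'lr': [1e-3, 1e-4, 1e-5],
        'eps_dec': [5e-6, 5e-7],
        'batch_size': [32, 64],
    }

    best_score, best_config = float('-inf'), None
    for trial in range(20):                                    # number of trials is illustrative
        config = {name: random.choice(values) for name, values in search_space.items()}
        score = train_and_score(config)
        if score > best_score:
            best_score, best_config = score, config
    print(best_config, best_score)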

# Agents

We built three agents:
1. A Sleep blue agent
2. A DDQN blue agent
3. A Main blue agent which fingerprints the red agents and assigns a blue agent
One limitation of this approach is that it has only been exposed to, and tested against, the same kinds of red agents, so we do not know how well its behaviour will generalise.

The agents can be found in the Agents folder.
This is our version 2 submission for CybORG v1.2 (version 1 was for CybORG v1.1).

# DDQN
# RNN-DDQN

The DDQN implementation was taken from the following GitHub page https://github.com/philtabor/Deep-Q-Learning-Paper-To-Code/tree/master/DDQN where it has not been modified except for the model architecture (we opted for an MLP instead of a CNN). The architecture is as follows for both models:

    self.fc1 = nn.Linear(input_dims[0], 64)
    self.fc2 = nn.Linear(64, 64)
    self.fc3 = nn.Linear(64, n_actions)

where fc1 and fc2 have ReLU activations. For this submission the architecture was further modified to use Gated Recurrent Unit (GRU) layers, allowing the model to 'remember' previous activity.

We trained two models: one for B_line and one for Meander. These are stored in the Models folder.
The train.py and utils.py files are included in the root directory for completeness but are not called in the evaluation.

The hyperparameters chosen for the best-performing model were as follows (the sketch after this list shows how they map onto the RNNDQNAgent constructor):

depth: 2 layers \
neurons in hidden layers: 64 \
number of previous steps to consider: 16 \
gamma (discount factor for future rewards): 0.5 \
epsilon (chance of picking a random action at start of training): 0.5 \
epsilon decrease rate: 5e-06 \
minimum epsilon: 0.1 \
learning rate: 0.0001 \
memory size: 5000 \
replace memory frequency (episodes): 500 \
length of episodes: 100 \
number of episodes: 1000 \
batch size: 32
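
For reference, the sketch below maps these values onto the RNNDQNAgent constructor in DQN/DQNAgent.py; n_actions and input_dims are assumed to match the wrapped Scenario1b environment, and the episode length and episode count are training-loop settings rather than constructor arguments.

    from DQN.DQNAgent import RNNDQNAgent

    agent = RNNDQNAgent(
        gamma=0.5,           # discount factor for future rewards
        epsilon=0.5,         # chance of a random action at the start of training
        eps_dec=5e-6,        # epsilon decrease rate
        eps_min=0.1,         # minimum epsilon
        lr=0.0001,           # learning rate
        mem_size=5000,       # replay memory size
        replace=500,         # in the code: how often the target network is replaced
        batch_size=32,
        lookback_steps=16,   # number of previous steps to consider
        hid_size=64,         # neurons in hidden layers (depth is fixed at 2 GRU layers in DeepRNNNetwork)
        n_actions=41, input_dims=(52,),   # assumed to match the wrapped environment
        algo='RNNDDQN', env_name='Scenario1b',
    )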

# Evaluation

The Evaluation folder contains the evaluation.py file and an .md file discussing our approach's strengths and weaknesses.
Evaluation can be triggered by running evaluation/evaluation.py.

# Agents

The code implementing the agent can be found in evaluation/MainAgent.py

# Wrapper

The wrapper used was the CybORG ChallengeWrapper.
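Below is a minimal sketch of constructing it, mirroring the imports already used in DQN/DQNAgent.py; the scenario path and agent_name follow the usual CybORG 1.2 pattern and should be checked against evaluation/evaluation.py.

    import inspect

    from CybORG import CybORG
    from CybORG.Agents.Wrappers.ChallengeWrapper import ChallengeWrapper

    path = str(inspect.getfile(CybORG))[:-10] + '/Shared/Scenarios/Scenario1b.yaml'
    cyborg = CybORG(path, 'sim')
    env = ChallengeWrapper(env=cyborg, agent_name='Blue')  # flat observation vector and enumerated Blue actions
    obs = env.reset()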

# Dependencies

CybORG (challenge) version 1.2 \
pandas==1.3.4 (for training only) \
numpy==1.21.4 \
torch==1.10.0

# Thank you
