upload model
mprhode committed Jan 31, 2022
1 parent 6af4514 commit eacd30a
Showing 14 changed files with 1,484 additions and 22 deletions.
160 changes: 160 additions & 0 deletions DQN/DQNAgent.py
@@ -0,0 +1,160 @@
# check out https://github.com/philtabor/Deep-Q-Learning-Paper-To-Code/tree/master/DDQN
# I didn't make many changes; I simply ensured it fits with the CybORG BaseAgent interface

import inspect

from CybORG import CybORG
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.Wrappers.ChallengeWrapper import ChallengeWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper

import torch as T
import numpy as np
from DQN.DeepQNetwork import DeepQNetwork, DeepRNNNetwork
from DQN.ReplayBuffer import ReplayBuffer

class DQNAgent(BaseAgent):
    def __init__(self, gamma=0.9, epsilon=0, lr=0.1, n_actions=41, input_dims=(52,),
                 mem_size=1000, batch_size=32, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo='DDQN', env_name='Scenario1b', chkpt_dir='chkpt', load=False):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    # if epsilon=0 the agent just uses the model (acts greedily)
    def get_action(self, observation, action_space, ignore_epsilon=False):
        if ignore_epsilon or (np.random.random() > self.epsilon):
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.replace_target_cnt is not None and \
                self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def train(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()
        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)
        # double DQN update: select next actions with q_eval, evaluate them with q_next
        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_)
        q_eval = self.q_eval.forward(states_)
        max_actions = T.argmax(q_eval, dim=1)
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next[indices, max_actions]
        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()

    def end_episode(self):
        pass

    def set_initial_values(self, action_space, observation):
        pass

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()


class RNNDQNAgent(DQNAgent):
    def __init__(self, gamma=0.99, epsilon=1, lr=0.0001, n_actions=10, input_dims=(10,), lookback_steps=7,
                 mem_size=1000, batch_size=64, eps_min=0.01, eps_dec=5e-7, hid_size=64,
                 replace=1000, algo=None, env_name=None, chkpt_dir='chkpt', load=False,
                 env=None):
        self.lookback_steps = lookback_steps

        super(RNNDQNAgent, self).__init__(gamma=gamma, epsilon=epsilon, lr=lr, n_actions=n_actions,
                                          input_dims=input_dims,
                                          mem_size=mem_size, batch_size=batch_size, eps_min=eps_min, eps_dec=eps_dec,
                                          replace=replace, algo=algo, env_name=env_name, chkpt_dir=chkpt_dir)

        # the replay buffer stores a window of the last `lookback_steps` observations per transition
        self.memory = ReplayBuffer(mem_size, (self.lookback_steps, input_dims[0]), n_actions)

        self.q_eval = DeepRNNNetwork(self.lr, self.n_actions,
                                     input_dims=self.input_dims,
                                     name=self.env_name + '_' + self.algo + '_q_eval',
                                     chkpt_dir=self.chkpt_dir, hid_size=hid_size)
        self.q_next = DeepRNNNetwork(self.lr, self.n_actions,
                                     input_dims=self.input_dims,
                                     name=self.env_name + '_' + self.algo + '_q_next',
                                     chkpt_dir=self.chkpt_dir, hid_size=hid_size)

        self.observation_buffer = np.zeros((self.lookback_steps, self.input_dims[0]))

        if load:
            self.load_models()

    def get_action(self, observation, action_space):
        # a single observation is appended to the rolling window; a full window replaces it
        if observation.shape != self.observation_buffer.shape:
            self.observation_buffer[:-1] = self.observation_buffer[1:]
            self.observation_buffer[-1] = observation
        else:
            self.observation_buffer = observation

        if np.random.random() > self.epsilon:
            state = T.tensor([self.observation_buffer], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action
69 changes: 69 additions & 0 deletions DQN/DeepQNetwork.py
@@ -0,0 +1,69 @@
# check out https://github.com/philtabor/Deep-Q-Learning-Paper-To-Code/tree/master/DDQN
# The only changes I made concern the network architecture (no CNN here)

import os

import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = T.device("cuda") if T.cuda.is_available() else T.device("cpu")


class DeepQNetwork(nn.Module):
    def __init__(self, lr, n_actions, name, input_dims, chkpt_dir):
        super(DeepQNetwork, self).__init__()
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir, name)

        # you may want to play around with this and forward()
        self.fc1 = nn.Linear(input_dims[0], 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, n_actions)

        self.optimizer = optim.RMSprop(self.parameters(), lr=lr)

        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    # you may want to play around with this
    def forward(self, state):
        flat1 = F.relu(self.fc1(state))
        flat2 = F.relu(self.fc2(flat1))
        actions = self.fc3(flat2)
        return actions

    def save_checkpoint(self):
        print('... saving checkpoint ...')
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        print('... loading checkpoint ...')
        self.load_state_dict(T.load(self.checkpoint_file))


class DeepRNNNetwork(DeepQNetwork):
    def __init__(self, lr, n_actions, name, input_dims, chkpt_dir, hid_size=64):
        super(DeepRNNNetwork, self).__init__(lr, n_actions, name, input_dims, chkpt_dir)

        self.n_layers = 2
        self.hidden_dim = hid_size
        # a 2-layer GRU over the observation window, followed by a linear layer mapping
        # the final hidden state to Q-values
        self.gru = nn.GRU(input_dims[0], hidden_size=self.hidden_dim, num_layers=2, batch_first=True, device=device)
        self.fc3 = nn.Linear(self.hidden_dim, n_actions, device=device)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, state, hidden=None):
        if hidden is None:
            hidden = self.init_hidden(state.shape[0])
        out, h1 = self.gru(state, hidden)
        # use only the output at the last time step
        actions = self.fc3(F.relu(out[:, -1]))
        return actions

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device)
        return hidden

43 changes: 43 additions & 0 deletions DQN/ReplayBuffer.py
@@ -0,0 +1,43 @@
# from https://github.com/philtabor/Deep-Q-Learning-Paper-To-Code/tree/master/DDQN

import numpy as np

class ReplayBuffer(object):
    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape),
                                     dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape),
                                         dtype=np.float32)

        self.action_memory = np.zeros(self.mem_size, dtype=np.int64)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # np.bool is deprecated in recent NumPy releases; np.bool_ behaves identically here
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        terminal = self.terminal_memory[batch]

        return states, actions, rewards, states_, terminal

    def get_last_rewards(self, batch_size):
        rewards = self.reward_memory[-1 * batch_size:]
        return rewards


Binary file added DQN/__pycache__/DQNAgent.cpython-38.pyc
Binary file added DQN/__pycache__/DeepQNetwork.cpython-38.pyc
Binary file added DQN/__pycache__/ReplayBuffer.cpython-38.pyc
57 changes: 35 additions & 22 deletions README.md
@@ -1,42 +1,55 @@
# Cage-submission

Because the red agents' behaviours do not change mid-episode and are predictable, we thought that fingerprinting the agent we are facing and then assigning it to a trained model made the most sense. If multiple red agents could exist in the environment in parallel, if the red agents could change behaviour mid-episode, or if noise were added (Green Agent), then we would have applied hierarchical RL or utilised an RNN (which we expect to do in the second version of the challenge).
In addition, because the action space is small (the blue agent cannot perform multiple actions at once, e.g. restore several hosts in one step), we felt that reinforcement learning was appropriate; in reality, however, the action spaces for the defender (and attacker) would be too large for our approach.
This approach rotated the red agents (Sleep, Meander and B_line) between training epochs of a recurrent-neural-network double DQN.
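The snippet below is a minimal sketch of this rotation, not the actual train.py: it reuses RNNDQNAgent from this commit, assumes the usual CybORG 1.2 classes and scenario path (B_lineAgent, RedMeanderAgent, SleepAgent, ChallengeWrapper), and uses illustrative episode/step counts, so the real training script may differ.

    import inspect

    import numpy as np

    from CybORG import CybORG
    from CybORG.Agents import B_lineAgent, RedMeanderAgent, SleepAgent
    from CybORG.Agents.Wrappers.ChallengeWrapper import ChallengeWrapper

    from DQN.DQNAgent import RNNDQNAgent

    # scenario file shipped with CybORG 1.2 (same path pattern as the challenge evaluation script)
    path = str(inspect.getfile(CybORG))[:-10] + '/Shared/Scenarios/Scenario1b.yaml'

    # n_actions/input_dims are assumed to match the wrapped environment's flat spaces
    agent = RNNDQNAgent(n_actions=41, input_dims=(52,), lookback_steps=16,
                        algo='RNNDDQN', env_name='Scenario1b')

    red_agents = [SleepAgent, B_lineAgent, RedMeanderAgent]
    for episode in range(1000):
        red_agent = red_agents[episode % len(red_agents)]      # rotate the red agent each episode
        cyborg = CybORG(path, 'sim', agents={'Red': red_agent})
        env = ChallengeWrapper(env=cyborg, agent_name='Blue')
        obs = env.reset()
        agent.observation_buffer[:] = 0                        # clear the RNN observation window
        for step in range(100):
            action = agent.get_action(obs, env.get_action_space('Blue'))  # also updates the window
            state = agent.observation_buffer.copy()
            obs, reward, done, info = env.step(action)
            next_state = np.vstack([state[1:], obs])           # window after seeing the next observation
            agent.store_transition(state, action, reward, next_state, done)
            agent.train()
        agent.end_episode()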

As a result, we trained two models using DDQN: one for B_line and one for Meander. We also experimented with regular Q-learning for B_line after reducing the action and observation spaces; this was successful, but it is not included in this submission because it does not add any value. It was nevertheless interesting for inspecting the largest and smallest Q-values to confirm our suspicions.
An RNN was chosen to give the model some memory of the red agent's past actions and thereby help distinguish the agents - we found that RNNs with a memory of 16 or 32 steps tended to outperform those with a memory of only 8 steps.

Finally, it should be noted that we have not considered the Misinform action because it was not in the initial release. This made sense as the Green Agent does not feature in the evaluation.
We used a random search to find the best-performing hyperparameter configuration.
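A minimal sketch of such a random search is shown below; train_and_score is a hypothetical helper standing in for a full training-and-evaluation run, and the value grid is illustrative rather than the exact space we searched.

    import random

    def train_and_score(config):
        """Hypothetical helper: train an RNN-DDQN with `config` (e.g. using the rotation
        loop sketched above) and return its mean evaluation reward."""
        raise NotImplementedError

    # illustrative value grid; each trial samples one value per hyperparameter
    search_space = {
        'lookback_steps': [8, 16, 32],
        'hid_size': [32, 64, 128],
        'gamma': [0.5, 0.9, 0.99],
        'lr': [1e-3, 1e-4, 1e-5],
        'eps_dec': [5e-6, 5e-7],
        'batch_size': [32, 64],
    }

    best_score, best_config = float('-inf'), None
    for trial in range(20):                                    # number of trials is illustrative
        config = {name: random.choice(values) for name, values in search_space.items()}
        score = train_and_score(config)
        if score > best_score:
            best_score, best_config = score, config
    print(best_config, best_score)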

# Agents

We built three agents:
1. A Sleep blue agent
2. A DDQN blue agent
3. A Main blue agent which fingerprints the red agents and assigns a blue agent
One limitation of this approach is that it has only been exposed to, and tested against, the same kinds of red agents, so we do not know how well its behaviour will generalise.

The agents can be found in the Agents folder.
This is our version 2 submission for CybORG v1.2 (version 1 was for CybORG v1.1).

# DDQN
# RNN-DDQN

The DDQN implementation was taken from the following GitHub page https://github.com/philtabor/Deep-Q-Learning-Paper-To-Code/tree/master/DDQN where it has not been modified except for the model architecture (we opted for an MLP instead of a CNN). The architecture is as follows for both models:

    self.fc1 = nn.Linear(input_dims[0], 64)
    self.fc2 = nn.Linear(64, 64)
    self.fc3 = nn.Linear(64, n_actions)

where fc1 and fc2 have ReLU activations. For this submission the architecture was further modified to use Gated Recurrent Unit (GRU) layers, allowing the model to 'remember' previous activity.

We trained two models: one for B_line and one for Meander. These are stored in the Models folder.
The train.py and utils.py files are included in the root directory for completeness but are not called in the evaluation.

The hyperparameters chosen for the best-performing model were as follows (the sketch after this list shows how they map onto the RNNDQNAgent constructor):

depth: 2 layers \
neurons in hidden layers: 64 \
number of previous steps to consider: 16 \
gamma (discount factor for future rewards): 0.5 \
epsilon (chance of picking a random action at start of training): 0.5 \
epsilon decrease rate: 5e-06 \
minimum epsilon: 0.1 \
learning rate: 0.0001 \
memory size: 5000 \
replace memory frequency (episodes): 500 \
length of episodes: 100 \
number of episodes: 1000 \
batch size: 32
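
For reference, the sketch below maps these values onto the RNNDQNAgent constructor in DQN/DQNAgent.py; n_actions and input_dims are assumed to match the wrapped Scenario1b environment, and the episode length and episode count are training-loop settings rather than constructor arguments.

    from DQN.DQNAgent import RNNDQNAgent

    agent = RNNDQNAgent(
        gamma=0.5,           # discount factor for future rewards
        epsilon=0.5,         # chance of a random action at the start of training
        eps_dec=5e-6,        # epsilon decrease rate
        eps_min=0.1,         # minimum epsilon
        lr=0.0001,           # learning rate
        mem_size=5000,       # replay memory size
        replace=500,         # in the code: how often the target network is replaced
        batch_size=32,
        lookback_steps=16,   # number of previous steps to consider
        hid_size=64,         # neurons in hidden layers (depth is fixed at 2 GRU layers in DeepRNNNetwork)
        n_actions=41, input_dims=(52,),   # assumed to match the wrapped environment
        algo='RNNDDQN', env_name='Scenario1b',
    )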

# Evaluation

The Evaluation folder contains the evaluation.py file and an .md file discussing our approach's strengths and weaknesses.
Evaluation can be triggered by running evaluation/evaluation.py.

# Agents

The code implementing the agent can be found in evaluation/MainAgent.py

# Wrapper

The wrapper used was the CybORG ChallengeWrapper.
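Below is a minimal sketch of constructing it, mirroring the imports already used in DQN/DQNAgent.py; the scenario path and agent_name follow the usual CybORG 1.2 pattern and should be checked against evaluation/evaluation.py.

    import inspect

    from CybORG import CybORG
    from CybORG.Agents.Wrappers.ChallengeWrapper import ChallengeWrapper

    path = str(inspect.getfile(CybORG))[:-10] + '/Shared/Scenarios/Scenario1b.yaml'
    cyborg = CybORG(path, 'sim')
    env = ChallengeWrapper(env=cyborg, agent_name='Blue')  # flat observation vector and enumerated Blue actions
    obs = env.reset()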

# Dependencies

CybORG (challenge) version 1.2 \
pandas==1.3.4 (for training only) \
numpy==1.21.4 \
torch==1.10.0

# Thank you
