mnist_impurity_ensemble
Liphos committed Sep 12, 2022
1 parent c3697be commit 8fcb27e
Showing 4 changed files with 203 additions and 89 deletions.
27 changes: 27 additions & 0 deletions dataset.py
@@ -1,13 +1,16 @@
import numpy as np
from datasets import load_dataset
from keras.datasets import mnist
from binreader import open_binary_file
from pathlib import Path
from typing import Dict

def import_dataset(name:str, split:float=0.2, shuffle=True, extra_args:Dict[str, bool]={}):
    datasets = {"minds14": import_minds_hugging_face,
                "trend": import_data_TREND,
                "mnist": import_mnist,
                }

    if name in datasets:
        return datasets[name](split, shuffle, extra_args)
    else:
@@ -21,6 +24,29 @@ def import_minds_hugging_face(split:float, shuffle:bool, extra_args:Dict[str, bool]):

    return (minds["train"], minds["test"])

def import_mnist(split:float, shuffle:bool, extra_args:Dict[str, bool]):
    print(Warning("Split is not supported for MNIST yet"))
    (data_train, labels_train), (data_test, labels_test) = mnist.load_data()

    max_classes = 10
    if "max_classes" in extra_args:
        max_classes = extra_args["max_classes"]

    impurity = 0
    if "impurity" in extra_args:
        impurity = extra_args["impurity"]

    # Flip each training label (1 - label) with probability `impurity`; only meaningful for the binary case (max_classes == 2)
    indicies_impure = np.where(np.random.rand(len(labels_train))<impurity)[0]
    labels_train[indicies_impure] = 1 - labels_train[indicies_impure]

    indicies_train = np.where(labels_train<max_classes)[0]
    data_train, labels_train = np.expand_dims(data_train[indicies_train], axis=1), np.expand_dims(labels_train[indicies_train], axis=-1)

    indicies_test = np.where(labels_test<max_classes)[0]
    data_test, labels_test = np.expand_dims(data_test[indicies_test], axis=1), np.expand_dims(labels_test[indicies_test], axis=-1)

    return (data_train, labels_train), (data_test, labels_test)

def import_data_TREND(split:float, shuffle:bool, extra_args:Dict[str, bool]):
    #Data for signal analysis
    if "use_fourier_transform" in extra_args:
@@ -46,6 +72,7 @@ def import_data_TREND(split:float, shuffle:bool, extra_args:Dict[str, bool]):
    np.random.shuffle(indicies)
    data_anthropique = data_anthropique[indicies]

    use_fourier_transform = False
    if use_fourier_transform:
        data_selected_fft = np.fft.fft(data_selected)
        data_anthropique_fft = np.fft.fft(data_anthropique)
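For reference, a minimal usage sketch of the new MNIST path (not part of the commit, assuming the repository layout above); note the impurity flip computes 1 - label, so it is only meaningful once max_classes restricts the task to digits 0 and 1:

# Usage sketch (not part of the commit): binary MNIST with roughly 20% of the
# training labels flipped, mirroring config["dataset"]["extra_args"] in main.py.
from dataset import import_dataset

(data_train, labels_train), (data_test, labels_test) = import_dataset(
    "mnist",
    split=0.2,  # currently ignored for MNIST
    shuffle=True,
    extra_args={"max_classes": 2, "impurity": 0.2},
)
print(data_train.shape, labels_train.shape)  # (N, 1, 28, 28) and (N, 1)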
156 changes: 87 additions & 69 deletions main.py
@@ -1,4 +1,4 @@

from functools import partial
import numpy as np
import matplotlib.pyplot as plt
import os
@@ -11,66 +11,26 @@
from torch.utils.tensorboard import SummaryWriter

from dataset import import_dataset
from model import SimpleModel
from utils import create_batch_tensorboard
import xgboost as xgb

dataset_name = "trend"
# Get cpu or gpu device for training.

device = "cuda" if torch.cuda.is_available() else "cpu"
#device = "cpu"

print(f"Using {device} device")

#Define writer and prepare the tensorboard
try:
    os.makedirs("./Models/"+ dataset_name)
except FileExistsError:
    print("The directory already exists")
except:
    print("Unknown exception")


comment = "xgb_model"
tensorboard_log_dir = "./Models/"+ dataset_name + "/" + comment + "-" + datetime.now().strftime("%Y-%m-%d-%H-%M-%S") # + str(len(os.listdir("./Models/"+ dataset_name))) To count the experiments


writer = SummaryWriter(log_dir=tensorboard_log_dir)
create_batch_tensorboard(tensorboard_log_dir)
from model import SimpleSignalModel, SimpleImageModel
from utils import create_batch_tensorboard, logical_and_array


#Create model for training
model = SimpleModel().to(device)
config = {
    "dataset": {"name": "trend",
                "extra_args": {"max_classes": 2, "impurity": 0.2}},
    "optimizer": {"name": "Adam", "lr": 1e-3, "weight_decay": 1e-4},
    "training": {"num_epochs": 8, "batch_size": 10, "cross_training": 3},
    "model": SimpleSignalModel,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "comment": "ensemble_impurity"
}

#Define loss function and optimizer
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=1)
batch_size = 10

#Import dataset
dataset_args = {"use_fourier_transform":True}
data_train, data_test = import_dataset(dataset_name, split=0.2, shuffle=True, extra_args=dataset_args)

learning_rate_range = np.arange(0.01, 1, 0.05)
for lr in learning_rate_range:
    xgb_classifier = xgb.XGBClassifier(eta=lr)
    xgb_classifier.fit(data_train[0][:, 2, :], data_train[1])
    print("lr", lr, xgb_classifier.score(data_train[0][:, 2, :], data_train[1]))
    print(xgb_classifier.score(data_test[0][:, 2, :], data_test[1]))

print(data_train[0].shape, data_train[1].shape)
for k in range(1):
    print(data_train[1][k])
    #plt.plot([i for i in range(len(data_train[0][k]))], data_train[0][k])
    #plt.show()

def train_epoch(epoch:int, data, data_labels, is_testing:bool=False):
def train_epoch(model:torch.nn.Module, training_iter:int, epoch:int, data:np.ndarray, data_labels:np.ndarray, optimizer, lr_scheduler, criterion:torch.nn, writer:SummaryWriter, is_testing:bool=False):
    size = len(data)

    batch_size = config["training"]["batch_size"]
    #We shuffle the dataset
    indicies = np.arange(len(data))
    indicies = np.arange(size)
    np.random.shuffle(indicies)
    data, data_labels = data[indicies], data_labels[indicies]

@@ -82,11 +42,11 @@ def train_epoch(epoch:int, data, data_labels, is_testing:bool=False):
    mean_loss = 0
    mean_accuracy = 0
    mean_counter = 0
    for i in range(int(len(data)/batch_size)+1):
    for i in range(int(size/batch_size)+1):
        inputs, labels = data[i*batch_size: np.minimum((i+1)*batch_size, size)], data_labels[i*batch_size: np.minimum((i+1)*batch_size, size)] #Slice out the current mini-batch
        # Every data instance is an input + label pair
        inputs = torch.as_tensor(inputs, dtype=torch.float32, device=device)
        labels = torch.as_tensor(labels, dtype=torch.float32, device=device)
        inputs = torch.as_tensor(inputs, dtype=torch.float32, device=config["device"])
        labels = torch.as_tensor(labels, dtype=torch.float32, device=config["device"])
        # Zero your gradients for every batch!
        optimizer.zero_grad()
        if is_testing:
@@ -105,10 +65,10 @@ def train_epoch(epoch:int, data, data_labels, is_testing:bool=False):

        sig_mask = labels==1
        key = "test" if is_testing else "train"
        writer.add_scalar("Loss_"+ key, loss, (int(len(data)/batch_size) + 1 ) * epoch + i)
        writer.add_scalar("Metrics_"+ key +"/Accuracy", accuracy, (int(len(data)/batch_size) + 1 ) * epoch + i)
        writer.add_scalar("Metrics_"+ key +"/TPR", torch.mean(torch.where(torch.round(outputs[sig_mask])==1, 1., 0.)), (int(len(data)/batch_size) + 1 ) * epoch + i)
        writer.add_scalar("Metrics_"+ key +"/TNR", torch.mean(torch.where(torch.round(outputs[~sig_mask])==0, 1., 0.)), (int(len(data)/batch_size) + 1 ) * epoch + i)
        writer.add_scalar("Loss_"+ key, loss, (int(size/batch_size) + 1 ) * epoch + i, new_style=True if i==0 else False)
        writer.add_scalar("Metrics_"+ key +"/Accuracy", accuracy, (int(size/batch_size) + 1 ) * epoch + i, new_style=True if i==0 else False)
        writer.add_scalar("Metrics_"+ key +"/TPR", torch.mean(torch.where(torch.round(outputs[sig_mask])==1, 1., 0.)), (int(size/batch_size) + 1 ) * epoch + i, new_style=True if i==0 else False)
        writer.add_scalar("Metrics_"+ key +"/TNR", torch.mean(torch.where(torch.round(outputs[~sig_mask])==0, 1., 0.)), (int(size/batch_size) + 1 ) * epoch + i, new_style=True if i==0 else False)

        mean_loss = (mean_loss * mean_counter + loss )/(mean_counter + 1)
        mean_accuracy = (mean_accuracy * mean_counter + accuracy )/(mean_counter + 1)
@@ -128,16 +88,74 @@ def train_epoch(epoch:int, data, data_labels, is_testing:bool=False):
            mean_loss = 0
            mean_accuracy = 0
            mean_counter = 0


print("Using " + config["device"] + " device")

#Import dataset
(data_train, labels_train), (data_test, labels_test) = import_dataset(config["dataset"]["name"], split=0.2, shuffle=True, extra_args=config["dataset"]["extra_args"])

print(data_train.shape, labels_train.shape)

#Define writer and prepare the tensorboard
try:
os.makedirs("./Models/"+ config["dataset"]["name"])
except FileExistsError:
print("The Directory already exits")
except:
print("Unknown exception")


tensorboard_log_dir = "./Models/"+ config["dataset"]["name"] + "/" + config["comment"] + "-" + datetime.now().strftime("%Y-%m-%d-%H-%M-%S") # + str(len(os.listdir("./Models/"+ dataset_name))) To count the experiments



#training
nb_epoch = 40
for i in range(nb_epoch):
    print(f"epoch: {i}, lr: {lr_scheduler.get_last_lr()}")
    train_epoch(i, data_train[0], data_train[1])
    train_epoch(i, data_test[0], data_test[1], is_testing=True)
    if i % 5 == 0:
        torch.save(model.state_dict(), tensorboard_log_dir + "/checkpoint" + str(i) + ".pth")

cross_training = config["training"]["cross_training"]
models = []
for training_iter in range(cross_training):
    if cross_training != 1:
        data_train_split = data_train[int((len(data_train)/cross_training) * training_iter):int((len(data_train)/cross_training) * (training_iter + 1))]
    else:
        data_train_split = data_train
    labels_train_split = labels_train[int((len(labels_train)/cross_training) * training_iter):int((len(labels_train)/cross_training) * (training_iter + 1))]

    #Create model for training
    model = config["model"]().to(config["device"])

    #Define loss function and optimizer
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config["optimizer"]["lr"], weight_decay=config["optimizer"]["weight_decay"])
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=1)

    writer = SummaryWriter(log_dir=tensorboard_log_dir + "/" + str(training_iter))
    create_batch_tensorboard(tensorboard_log_dir)

    train_epoch_initializer = partial(train_epoch, model=model, training_iter=training_iter, optimizer=optimizer, lr_scheduler=lr_scheduler, criterion=criterion, writer=writer)

    #training
    for epoch in range(config["training"]["num_epochs"]):
        print(f"training_iter: [{training_iter+1}/{cross_training}], epoch: {epoch}, lr: {lr_scheduler.get_last_lr()}")
        train_epoch_initializer(epoch=epoch, data=data_train, data_labels=labels_train)
        train_epoch_initializer(epoch=epoch, data=data_test, data_labels=labels_test, is_testing=True)
        if epoch % 5 == 0:
            torch.save(model.state_dict(), tensorboard_log_dir + "/checkpoint" + str(epoch) + "_" + str(training_iter) + ".pth")

    models.append(model)


#Testing model
indicies_impure = np.where(np.random.rand(len(labels_test))<0.2)[0]
labels_test_impure = np.copy(labels_test)
labels_test_impure[indicies_impure] = 1 - labels_test[indicies_impure]

new_labels = torch.as_tensor(labels_test_impure, dtype=torch.float32, device=config["device"])
new_labels[logical_and_array([torch.where(model(torch.as_tensor(data_test, dtype=torch.float32, device=config["device"]))>=0.7, True, False)])] = 1
new_labels[logical_and_array([torch.where(model(torch.as_tensor(data_test, dtype=torch.float32, device=config["device"]))<=1-0.7, True, False)])] = 0
print(np.where(new_labels.cpu().numpy()[:,0]==labels_test_impure[:,0])[0].shape)
print(np.where(new_labels.cpu().numpy()[:,0]==labels_test[:,0])[0].shape)
print(new_labels)

writer.flush()
writer.close()
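The testing block above relabels a deliberately corrupted copy of the test labels wherever the last trained model is confident (outputs >= 0.7 forced to 1, outputs <= 0.3 forced to 0). A standalone sketch of that rule extended to every model in models — an assumption about the intended ensemble use, since the committed code wraps a single model's mask in logical_and_array; relabel_with_ensemble is a hypothetical name, and the sketch assumes the names defined in main.py above:

# Sketch (not part of the commit): consensus relabeling across the
# cross-trained models, with the same 0.7 confidence threshold.
def relabel_with_ensemble(models, data, labels, threshold=0.7):
    x = torch.as_tensor(data, dtype=torch.float32, device=config["device"])
    with torch.no_grad():
        preds = [m.eval()(x) for m in models]  # each prediction is (N, 1) in [0, 1]
    # Relabel only where every model is confident, in the same direction.
    high = logical_and_array([(p >= threshold).cpu().numpy() for p in preds])
    low = logical_and_array([(p <= 1 - threshold).cpu().numpy() for p in preds])
    relabeled = np.copy(labels)
    relabeled[high] = 1
    relabeled[low] = 0
    return relabeled

#new_labels_ensemble = relabel_with_ensemble(models, data_test, labels_test_impure)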
88 changes: 69 additions & 19 deletions model.py
@@ -1,23 +1,17 @@
import torch.nn as F
import torch

class SimpleModel(F.Module):
class SimpleSignalModel(F.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        super(SimpleSignalModel, self).__init__()
        self.layers = []

        self.conv1_sig = F.Conv1d(1, 64, kernel_size=15, padding=7)
        self.layers.append(self.conv1_sig)
        self.conv1 = F.Conv1d(1, 128, kernel_size=15, padding=7)
        self.layers.append(self.conv1)

        self.conv1_fft = F.Conv1d(2, 64, kernel_size=15, padding=7)
        self.layers.append(self.conv1_fft)
        self.batch_norm1 = F.BatchNorm1d(128)
        self.layers.append(self.batch_norm1)

        self.batch_norm1_sig = F.BatchNorm1d(64)
        self.layers.append(self.batch_norm1_sig)

        self.batch_norm1_fft = F.BatchNorm1d(64)
        self.layers.append(self.batch_norm1_fft)

        self.conv2 = F.Conv1d(128, 128, kernel_size=7, padding=3)
        self.layers.append(self.conv2)

@@ -54,13 +48,8 @@ def __init__(self):
        self.sigmoid = F.Sigmoid()

    def forward(self, x):
        x_sig = self.maxpool(self.conv1_sig(x[:, :1, :]))
        x_sig = self.dropout(self.activation(self.batch_norm1_sig(x_sig)))

        x_fft = self.maxpool(self.conv1_fft(x[:, 1:, :]))
        x_fft = self.dropout(self.activation(self.batch_norm1_fft(x_fft)))

        x = torch.cat([x_sig, x_fft], axis=1)
        x = self.maxpool(self.conv1(x))
        x = self.dropout(self.activation(self.batch_norm1(x)))

        x = self.dropout(self.activation(self.activation(self.batch_norm2(self.conv2(x))) + x))
        x = self.maxpool(x)
@@ -88,3 +77,64 @@ def save_txt(self, filename:str):
                f.write(str(layer._get_name()) + "\n")
        f.close()

class SimpleImageModel(F.Module):
    def __init__(self):
        super(SimpleImageModel, self).__init__()
        self.layers = []

        self.conv1 = F.Conv2d(1, 8, kernel_size=5, stride=2)
        self.layers.append(self.conv1)

        self.batch_norm1 = F.BatchNorm2d(8)
        self.layers.append(self.batch_norm1)

        self.conv2 = F.Conv2d(8, 16, kernel_size=3)
        self.layers.append(self.conv2)

        self.batch_norm2 = F.BatchNorm2d(16)
        self.layers.append(self.batch_norm2)

        self.conv3 = F.Conv2d(16, 32, kernel_size=3)
        self.layers.append(self.conv3)

        self.batch_norm3 = F.BatchNorm2d(32)
        self.layers.append(self.batch_norm3)

        self.dense1 = F.Linear(32*64, 512)
        self.layers.append(self.dense1)
        self.dense2 = F.Linear(512, 1)
        self.layers.append(self.dense2)

        self.dropout = F.Dropout(0)
        self.layers.append(self.dropout)

        self.activation = F.ReLU()
        self.layers.append(self.activation)

        self.flatten = F.Flatten()
        self.sigmoid = F.Sigmoid()

    def forward(self, x):
        x = self.conv1(x)
        x = self.dropout(self.activation(self.batch_norm1(x)))

        x = self.dropout(self.activation(self.batch_norm2(self.conv2(x))))

        x = self.dropout(self.activation(self.batch_norm3(self.conv3(x))))

        x = self.flatten(x)
        x = self.activation(self.dense1(x))
        x = self.sigmoid(self.dense2(x))

        return x

    def save_txt(self, filename:str):
        """Save the layers in a txt
        Args:
            filename (str): path to the txt file
        """
        with open(filename, 'w') as f:
            for layer in self.layers:
                f.write(str(layer._get_name()) + "\n")
        f.close()
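A quick shape check (not part of the commit) for SimpleImageModel: with 28x28 MNIST inputs and no padding, the spatial size goes 28 -> 12 after conv1 (kernel 5, stride 2), 10 after conv2, and 8 after conv3, so the flattened feature vector has 32 * 8 * 8 = 2048 = 32*64 entries, matching dense1:

# Shape check (not part of the commit), assuming the class definitions above.
import torch
from model import SimpleImageModel

m = SimpleImageModel().eval()  # eval() so BatchNorm uses running statistics
x = torch.zeros(4, 1, 28, 28)  # a dummy batch of four MNIST-sized images
print(m(x).shape)              # expected: torch.Size([4, 1])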
21 changes: 20 additions & 1 deletion utils.py
@@ -1,6 +1,7 @@
from pathlib import Path
import os
import numpy as np
from typing import List

def create_batch_tensorboard(dir:str):
    """Create the batch file to launch the TensorBoard
@@ -14,4 +15,22 @@ def create_batch_tensorboard(dir:str):
f.write(f"cd {os.getcwd()}\n")
f.write(f'start "" http://localhost:{port}/#scalars\n')
f.write("tensorboard --logdir " + dir + " --port " + str(port))
f.close()
f.close()

def logical_and_array(arrays:List[np.ndarray]) -> np.ndarray:
    """Return logical and for a list of arrays.
    Args:
        arrays (List[np.ndarray]): The list of arrays
    Returns:
        (np.ndarray): The logical and of the list of arrays
    """
    if len(arrays) == 1:
        return arrays[0]

    result = arrays[0]

    for array in arrays[1:]:
        result = np.logical_and(result, array)

    return result
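A small usage sketch (not part of the commit) of the new helper:

# Usage sketch (not part of the commit): element-wise AND over several masks.
import numpy as np
from utils import logical_and_array

a = np.array([True, True, False])
b = np.array([True, False, False])
print(logical_and_array([a, b]))  # [ True False False]
print(logical_and_array([a]))     # a single array is returned unchanged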
