From da61463db98a5cdc60c4c3edf1ccb8adc6b3a069 Mon Sep 17 00:00:00 2001 From: Kilian Lieret Date: Wed, 27 Jul 2022 15:28:02 -0400 Subject: [PATCH] Update code formatting with modified blacken-docs --- _episodes/06-Data_Discussion.md | 69 +++++---- _episodes/07-Data_Preprocessing.md | 51 ++++--- _episodes/09-Model_Training.md | 225 ++++++++++++++++++----------- _episodes/10-Overfitting_Check.md | 5 +- _episodes/11-Model_Comparison.md | 69 ++++++--- _episodes/12-Experimental_Data.md | 38 +++-- _episodes/13-TensorFlow.md | 44 ++++-- _episodes/14-ttZ.md | 100 +++++++++---- 8 files changed, 390 insertions(+), 211 deletions(-) diff --git a/_episodes/06-Data_Discussion.md b/_episodes/06-Data_Discussion.md index 9dcf296..9520656 100644 --- a/_episodes/06-Data_Discussion.md +++ b/_episodes/06-Data_Discussion.md @@ -20,9 +20,9 @@ keypoints: Here we will import all the required libraries for the rest of the tutorial. All scikit-learn and PyTorch functions will be imported later on when they are required. ~~~ -import pandas as pd # to store data as dataframe -import numpy as np # for numerical calculations such as histogramming -import matplotlib.pyplot as plt # for plotting +import pandas as pd # to store data as dataframe +import numpy as np # for numerical calculations such as histogramming +import matplotlib.pyplot as plt # for plotting ~~~ {: .language-python} @@ -36,9 +36,10 @@ np.__version__ Let's set the random seed that we'll be using. This reduces the randomness when you re-run the notebook ~~~ -seed_value = 420 # 42 is the answer to life, the universe and everything -from numpy.random import seed # import the function to set the random seed in NumPy -seed(seed_value) # set the seed value for random numbers in NumPy +seed_value = 420 # 42 is the answer to life, the universe and everything +from numpy.random import seed # import the function to set the random seed in NumPy + +seed(seed_value) # set the seed value for random numbers in NumPy ~~~ {: .language-python} @@ -51,7 +52,7 @@ The dataset we will use in this tutorial is simulated ATLAS data. Each event cor # In this notebook we only process the main signal ggH125_ZZ4lep and the main background llll, # for illustration purposes. # You can add other backgrounds after if you wish. -samples = ['llll','ggH125_ZZ4lep'] +samples = ["llll", "ggH125_ZZ4lep"] ~~~ {: .language-python} @@ -62,11 +63,11 @@ Here we will format the dataset $$(x_i, y_i)$$ so we can explore! First, we need ~~~ # get data from files -DataFrames = {} # define empty dictionary to hold dataframes -for s in samples: # loop over samples - DataFrames[s] = pd.read_csv('/kaggle/input/4lepton/'+s+".csv") # read .csv file +DataFrames = {} # define empty dictionary to hold dataframes +for s in samples: # loop over samples + DataFrames[s] = pd.read_csv("/kaggle/input/4lepton/" + s + ".csv") # read .csv file -DataFrames['ggH125_ZZ4lep'] # print signal data to take a look +DataFrames["ggH125_ZZ4lep"] # print signal data to take a look ~~~ {: .language-python} @@ -74,14 +75,16 @@ Before diving into machine learning, think about whether there are any things yo ~~~ # cut on lepton type -def cut_lep_type(lep_type_0,lep_type_1,lep_type_2,lep_type_3): -# first lepton is [0], 2nd lepton is [1] etc -# for an electron lep_type is 11 -# for a muon lep_type is 13 -# only want to keep events where one of eeee, mumumumu, eemumu +def cut_lep_type(lep_type_0, lep_type_1, lep_type_2, lep_type_3): + # first lepton is [0], 2nd lepton is [1] etc + # for an electron lep_type is 11 + # for a muon lep_type is 13 + # only want to keep events where one of eeee, mumumumu, eemumu sum_lep_type = lep_type_0 + lep_type_1 + lep_type_2 + lep_type_3 - if sum_lep_type==44 or sum_lep_type==48 or sum_lep_type==52: return True - else: return False + if sum_lep_type == 44 or sum_lep_type == 48 or sum_lep_type == 52: + return True + else: + return False ~~~ {: .language-python} @@ -91,11 +94,15 @@ We then need to apply this function on our DataFrames. # apply cut on lepton type for s in samples: # cut on lepton type using the function cut_lep_type defined above - DataFrames[s] = DataFrames[s][ np.vectorize(cut_lep_type)(DataFrames[s].lep_type_0, - DataFrames[s].lep_type_1, - DataFrames[s].lep_type_2, - DataFrames[s].lep_type_3) ] -DataFrames['ggH125_ZZ4lep'] # print signal data to take a look + DataFrames[s] = DataFrames[s][ + np.vectorize(cut_lep_type)( + DataFrames[s].lep_type_0, + DataFrames[s].lep_type_1, + DataFrames[s].lep_type_2, + DataFrames[s].lep_type_3, + ) + ] +DataFrames["ggH125_ZZ4lep"] # print signal data to take a look ~~~ {: .language-python} @@ -131,12 +138,12 @@ DataFrames['ggH125_ZZ4lep'] # print signal data to take a look In any analysis searching for signal one wants to optimise the use of various input variables. Often, this optimisation will be to find the best signal to background ratio. Here we define histograms for the variables that we'll look to optimise. ~~~ -lep_pt_2 = { # dictionary containing plotting parameters for the lep_pt_2 histogram +lep_pt_2 = { # dictionary containing plotting parameters for the lep_pt_2 histogram # change plotting parameters - 'bin_width':1, # width of each histogram bin - 'num_bins':13, # number of histogram bins - 'xrange_min':7, # minimum on x-axis - 'xlabel':r'$lep\_pt$[2] [GeV]', # x-axis label + "bin_width": 1, # width of each histogram bin + "num_bins": 13, # number of histogram bins + "xrange_min": 7, # minimum on x-axis + "xlabel": r"$lep\_pt$[2] [GeV]", # x-axis label } ~~~ {: .language-python} @@ -162,7 +169,10 @@ lep_pt_2 = { # dictionary containing plotting parameters for the lep_pt_2 histog Now we define a dictionary for the histograms we want to plot. ~~~ -SoverB_hist_dict = {'lep_pt_2':lep_pt_2,'lep_pt_1':lep_pt_1} # add a histogram here if you want it plotted +SoverB_hist_dict = { + "lep_pt_2": lep_pt_2, + "lep_pt_1": lep_pt_1, +} # add a histogram here if you want it plotted ~~~ {: .language-python} @@ -172,6 +182,7 @@ We're not doing any machine learning just yet! We're looking at the variables we ~~~ from my_functions import plot_SoverB + plot_SoverB(DataFrames, SoverB_hist_dict) ~~~ {: .language-python} diff --git a/_episodes/07-Data_Preprocessing.md b/_episodes/07-Data_Preprocessing.md index 5884ea0..81254a5 100644 --- a/_episodes/07-Data_Preprocessing.md +++ b/_episodes/07-Data_Preprocessing.md @@ -20,7 +20,7 @@ keypoints: It's almost time to build a machine learning model! First we choose the variables to use in our machine learning model. ~~~ -ML_inputs = ['lep_pt_1','lep_pt_2'] # list of features for ML model +ML_inputs = ["lep_pt_1", "lep_pt_2"] # list of features for ML model ~~~ {: .language-python} @@ -34,20 +34,32 @@ ML_inputs = ['lep_pt_1','lep_pt_2'] # list of features for ML model # containing all the data and one array of categories # of length n_samples -all_MC = [] # define empty list that will contain all features for the MC -for s in samples: # loop over the different samples - if s!='data': # only MC should pass this - all_MC.append(DataFrames[s][ML_inputs]) # append the MC dataframe to the list containing all MC features -X = np.concatenate(all_MC) # concatenate the list of MC dataframes into a single 2D array of features, called X - -all_y = [] # define empty list that will contain labels whether an event in signal or background -for s in samples: # loop over the different samples - if s!='data': # only MC should pass this - if 'H125' in s: # only signal MC should pass this - all_y.append(np.ones(DataFrames[s].shape[0])) # signal events are labelled with 1 - else: # only background MC should pass this - all_y.append(np.zeros(DataFrames[s].shape[0])) # background events are labelled 0 -y = np.concatenate(all_y) # concatenate the list of labels into a single 1D array of labels, called y +all_MC = [] # define empty list that will contain all features for the MC +for s in samples: # loop over the different samples + if s != "data": # only MC should pass this + all_MC.append( + DataFrames[s][ML_inputs] + ) # append the MC dataframe to the list containing all MC features +X = np.concatenate( + all_MC +) # concatenate the list of MC dataframes into a single 2D array of features, called X + +all_y = ( + [] +) # define empty list that will contain labels whether an event in signal or background +for s in samples: # loop over the different samples + if s != "data": # only MC should pass this + if "H125" in s: # only signal MC should pass this + all_y.append( + np.ones(DataFrames[s].shape[0]) + ) # signal events are labelled with 1 + else: # only background MC should pass this + all_y.append( + np.zeros(DataFrames[s].shape[0]) + ) # background events are labelled 0 +y = np.concatenate( + all_y +) # concatenate the list of labels into a single 1D array of labels, called y ~~~ {: .language-python} @@ -69,9 +81,9 @@ Now we separate our data into a training and test set. from sklearn.model_selection import train_test_split # make train and test sets -X_train,X_test, y_train,y_test = train_test_split(X, y, - test_size=0.33, - random_state=seed_value ) # set the random seed for reproducibility +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.33, random_state=seed_value +) # set the random seed for reproducibility ~~~ {: .language-python} @@ -79,7 +91,8 @@ Machine learning models may have difficulty converging before the maximum number ~~~ from sklearn.preprocessing import StandardScaler -scaler = StandardScaler() # initialise StandardScaler + +scaler = StandardScaler() # initialise StandardScaler # Fit only to the training data scaler.fit(X_train) diff --git a/_episodes/09-Model_Training.md b/_episodes/09-Model_Training.md index 40ae719..c52bef3 100644 --- a/_episodes/09-Model_Training.md +++ b/_episodes/09-Model_Training.md @@ -46,9 +46,11 @@ In the previous page we created a training and test dataset. Lets use these data from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score -RF_clf = RandomForestClassifier(criterion='gini', max_depth=8, n_estimators=30, random_state=seed_value) # initialise your random forest classifier -RF_clf.fit(X_train_scaled, y_train) # fit to the training data -y_pred_RF = RF_clf.predict(X_test_scaled) # make predictions on the test data +RF_clf = RandomForestClassifier( + criterion="gini", max_depth=8, n_estimators=30, random_state=seed_value +) # initialise your random forest classifier +RF_clf.fit(X_train_scaled, y_train) # fit to the training data +y_pred_RF = RF_clf.predict(X_test_scaled) # make predictions on the test data # See how well the classifier does print(accuracy_score(y_test, y_pred_RF)) @@ -67,49 +69,65 @@ A neural network is a black-box model with many hyperparameters. The mathematica First let's import the bits we need to build a neural network in PyTorch. ~~~ -import torch # import PyTorch -import torch.nn as nn # import PyTorch neural network -import torch.nn.functional as F # import PyTorch neural network functional -from torch.autograd import Variable # create variable from tensor -import torch.utils.data as Data # create data from tensors +import torch # import PyTorch +import torch.nn as nn # import PyTorch neural network +import torch.nn.functional as F # import PyTorch neural network functional +from torch.autograd import Variable # create variable from tensor +import torch.utils.data as Data # create data from tensors ~~~ {: .language-python} Next we make variables for various PyTorch neural network hyper-parameters: ~~~ -epochs = 10 # number of training epochs -batch_size = 32 # number of samples per batch -input_size = len(ML_inputs) # The number of features -num_classes = 2 # The number of output classes. In this case: [signal, background] -hidden_size = 5 # The number of nodes at the hidden layer -learning_rate = 0.001 # The speed of convergence -verbose = True # flag for printing out stats at each epoch -torch.manual_seed(seed_value) # set random seed for PyTorch +epochs = 10 # number of training epochs +batch_size = 32 # number of samples per batch +input_size = len(ML_inputs) # The number of features +num_classes = 2 # The number of output classes. In this case: [signal, background] +hidden_size = 5 # The number of nodes at the hidden layer +learning_rate = 0.001 # The speed of convergence +verbose = True # flag for printing out stats at each epoch +torch.manual_seed(seed_value) # set random seed for PyTorch ~~~ {: .language-python} Now we create tensors, variables, datasets and loaders to build our neural network in PyTorch. We need to keep some events for validation. Validation sets are used to select and tune the final neural network model. Here we're making use of the PyTorch `DataLoader` functionality. This is going to be useful later when we want to load data during our training loop. ~~~ -X_train_tensor = torch.as_tensor(X_train_scaled, dtype=torch.float) # make tensor from X_train_scaled -y_train_tensor = torch.as_tensor(y_train, dtype=torch.long) # make tensor from y_train - -X_train_var, y_train_var = Variable(X_train_tensor), Variable(y_train_tensor) # make variables from tensors - -X_valid_var, y_valid_var = X_train_var[:100], y_train_var[:100] # get first 100 events for validation -X_train_nn_var, y_train_nn_var = X_train_var[100:], y_train_var[100:] # get remaining events for training - -train_data = Data.TensorDataset(X_train_nn_var, y_train_nn_var) # create training dataset -valid_data = Data.TensorDataset(X_valid_var, y_valid_var) # create validation dataset - -train_loader = Data.DataLoader(dataset=train_data, # PyTorch Dataset - batch_size=batch_size, # how many samples per batch to load - shuffle=True) # data reshuffled at every epoch - -valid_loader = Data.DataLoader(dataset=valid_data, # PyTorch Dataset - batch_size=batch_size, # how many samples per batch to load - shuffle=True) # data reshuffled at every epoch +X_train_tensor = torch.as_tensor( + X_train_scaled, dtype=torch.float +) # make tensor from X_train_scaled +y_train_tensor = torch.as_tensor(y_train, dtype=torch.long) # make tensor from y_train + +X_train_var, y_train_var = Variable(X_train_tensor), Variable( + y_train_tensor +) # make variables from tensors + +X_valid_var, y_valid_var = ( + X_train_var[:100], + y_train_var[:100], +) # get first 100 events for validation +X_train_nn_var, y_train_nn_var = ( + X_train_var[100:], + y_train_var[100:], +) # get remaining events for training + +train_data = Data.TensorDataset( + X_train_nn_var, y_train_nn_var +) # create training dataset +valid_data = Data.TensorDataset(X_valid_var, y_valid_var) # create validation dataset + +train_loader = Data.DataLoader( + dataset=train_data, # PyTorch Dataset + batch_size=batch_size, # how many samples per batch to load + shuffle=True, +) # data reshuffled at every epoch + +valid_loader = Data.DataLoader( + dataset=valid_data, # PyTorch Dataset + batch_size=batch_size, # how many samples per batch to load + shuffle=True, +) # data reshuffled at every epoch ~~~ {: .language-python} @@ -117,20 +135,20 @@ valid_loader = Data.DataLoader(dataset=valid_data, # PyTorch Dataset Here we define the neural network that we'll be using. This is a simple fully-connected neural network, otherwise known as a *multi-layer perceptron* (MLP). It has two hidden layers, both with the same number of neurons (`hidden_dim`). The order of the layers for a forward pass through the network is specified in the `forward` function. You can see that each fully-connected layer is followed by a [ReLU activation function](https://ml-cheatsheet.readthedocs.io/en/latest/activation_functions.html#relu). The function then returns an unnormalised vector of outputs (`x`; also referred to as *logits*) and a vector of normalised "probabilities" for `x`, calculated using the [SoftMax function](https://ml-cheatsheet.readthedocs.io/en/latest/activation_functions.html#softmax). ~~~ -class Classifier_MLP(nn.Module): # define Multi-Layer Perceptron - def __init__(self, in_dim, hidden_dim, out_dim): # initialise - super().__init__() # lets you avoid referring to the base class explicitly +class Classifier_MLP(nn.Module): # define Multi-Layer Perceptron + def __init__(self, in_dim, hidden_dim, out_dim): # initialise + super().__init__() # lets you avoid referring to the base class explicitly - self.h1 = nn.Linear(in_dim, hidden_dim) # hidden layer 1 - self.out = nn.Linear(hidden_dim, out_dim) # output layer - self.out_dim = out_dim # output layer dimension + self.h1 = nn.Linear(in_dim, hidden_dim) # hidden layer 1 + self.out = nn.Linear(hidden_dim, out_dim) # output layer + self.out_dim = out_dim # output layer dimension - def forward(self, x): # order of the layers + def forward(self, x): # order of the layers - x = F.relu(self.h1(x)) # relu activation function for hidden layer - x = self.out(x) # no activation function for output layer + x = F.relu(self.h1(x)) # relu activation function for hidden layer + x = self.out(x) # no activation function for output layer - return x, F.softmax(x, dim=1) # SoftMax function + return x, F.softmax(x, dim=1) # SoftMax function ~~~ {: .language-python} @@ -139,8 +157,12 @@ Next we need to specify that we're using the `Classifier_MLP` model that we spec We also specify which optimizer we'll use to train our network. Here I've implemented a classic [Stochastic Gradient Descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) (SGD) optimiser, but there are [a wide range of optimizers available in the PyTorch library](https://pytorch.org/docs/stable/optim.html#algorithms). For most recent applications the [Adam](https://arxiv.org/abs/1412.6980) optimizer is used. ~~~ -NN_clf = Classifier_MLP(in_dim=input_size, hidden_dim=hidden_size, out_dim=num_classes) # call Classifier_MLP class -optimizer = torch.optim.SGD(NN_clf.parameters(), lr=learning_rate) # optimize model parameters +NN_clf = Classifier_MLP( + in_dim=input_size, hidden_dim=hidden_size, out_dim=num_classes +) # call Classifier_MLP class +optimizer = torch.optim.SGD( + NN_clf.parameters(), lr=learning_rate +) # optimize model parameters ~~~ {: .language-python} @@ -151,69 +173,100 @@ The `train_loader` that we specified earlier using the PyTorch `DataLoader` brea PyTorch models (`nn.Module`) can be set into either training or evaluation mode. For the loop we've defined here this setting does not make any difference as we do not use any layers that perform differently during evaluation (e.g. dropout, batch normalisation, etc. ) However, it's included here for completeness. ~~~ -_results = [] # define empty list for epoch, train_loss, valid_loss, accuracy +_results = [] # define empty list for epoch, train_loss, valid_loss, accuracy for epoch in range(epochs): # loop over the dataset multiple times # training loop for this epoch - NN_clf.train() # set the model into training mode + NN_clf.train() # set the model into training mode - train_loss = 0. # start training loss counter at 0 - for batch, (x_train_batch, y_train_batch) in enumerate(train_loader): # loop over train_loader + train_loss = 0.0 # start training loss counter at 0 + for batch, (x_train_batch, y_train_batch) in enumerate( + train_loader + ): # loop over train_loader - NN_clf.zero_grad() # set the gradients to zero before backpropragation because PyTorch accumulates the gradients - out, prob = NN_clf(x_train_batch) # get output and probability on this training batch - loss = F.cross_entropy(out, y_train_batch) # calculate loss as cross entropy + NN_clf.zero_grad() # set the gradients to zero before backpropragation because PyTorch accumulates the gradients + out, prob = NN_clf( + x_train_batch + ) # get output and probability on this training batch + loss = F.cross_entropy(out, y_train_batch) # calculate loss as cross entropy - loss.backward() # compute dloss/dx - optimizer.step() # updates the parameters + loss.backward() # compute dloss/dx + optimizer.step() # updates the parameters - train_loss += loss.item() * x_train_batch.size(0) # add to counter for training loss + train_loss += loss.item() * x_train_batch.size( + 0 + ) # add to counter for training loss - train_loss /= len(train_loader.dataset) # divide train loss by length of train_loader + train_loss /= len( + train_loader.dataset + ) # divide train loss by length of train_loader - if verbose: # if verbose flag set to True - print('Epoch: {}, Train Loss: {:4f}'.format(epoch, train_loss)) + if verbose: # if verbose flag set to True + print("Epoch: {}, Train Loss: {:4f}".format(epoch, train_loss)) # validation loop for this epoch: - NN_clf.eval() # set the model into evaluation mode + NN_clf.eval() # set the model into evaluation mode with torch.no_grad(): # turn off the gradient calculations - correct = 0; valid_loss = 0 # start counters for number of correct and validation loss - for i, (x_valid_batch, y_valid_batch) in enumerate(valid_loader): # loop over validation loader - - out, prob = NN_clf(x_valid_batch) # get output and probability on this validation batch - loss = F.cross_entropy(out, y_valid_batch) # compute loss as cross entropy - - valid_loss += loss.item() * x_valid_batch.size(0) # add to counter for validation loss - - preds = prob.argmax(dim=1, keepdim=True) # get predictions - correct += preds.eq(y_valid_batch.view_as(preds)).sum().item() # count number of correct - - valid_loss /= len(valid_loader.dataset) # divide validation loss by length of validation dataset - accuracy = correct / len(valid_loader.dataset) # calculate accuracy as number of correct divided by total - - if verbose: # if verbose flag set to True - print('Validation Loss: {:4f}, Validation Accuracy: {:4f}'.format(valid_loss, accuracy)) + correct = 0 + valid_loss = 0 # start counters for number of correct and validation loss + for i, (x_valid_batch, y_valid_batch) in enumerate( + valid_loader + ): # loop over validation loader + + out, prob = NN_clf( + x_valid_batch + ) # get output and probability on this validation batch + loss = F.cross_entropy(out, y_valid_batch) # compute loss as cross entropy + + valid_loss += loss.item() * x_valid_batch.size( + 0 + ) # add to counter for validation loss + + preds = prob.argmax(dim=1, keepdim=True) # get predictions + correct += ( + preds.eq(y_valid_batch.view_as(preds)).sum().item() + ) # count number of correct + + valid_loss /= len( + valid_loader.dataset + ) # divide validation loss by length of validation dataset + accuracy = correct / len( + valid_loader.dataset + ) # calculate accuracy as number of correct divided by total + + if verbose: # if verbose flag set to True + print( + "Validation Loss: {:4f}, Validation Accuracy: {:4f}".format( + valid_loss, accuracy + ) + ) # create output row: _results.append([epoch, train_loss, valid_loss, accuracy]) -results = np.array(_results) # make array of results -print('Finished Training') -print("Final validation error: ",100.*(1 - accuracy),"%") +results = np.array(_results) # make array of results +print("Finished Training") +print("Final validation error: ", 100.0 * (1 - accuracy), "%") ~~~ {: .language-python} The predicted y values for the neural network, `y_pred_NN` can be obtained like: ~~~ -X_test_tensor = torch.as_tensor(X_test_scaled, dtype=torch.float) # make tensor from X_test_scaled -y_test_tensor = torch.as_tensor(y_test, dtype=torch.long) # make tensor from y_test - -X_test_var, y_test_var = Variable(X_test_tensor), Variable(y_test_tensor) # make variables from tensors - -out, prob = NN_clf(X_test_var) # get output and probabilities from X_test -y_pred_NN = prob.cpu().detach().numpy().argmax(axis=1) # get signal/background predictions +X_test_tensor = torch.as_tensor( + X_test_scaled, dtype=torch.float +) # make tensor from X_test_scaled +y_test_tensor = torch.as_tensor(y_test, dtype=torch.long) # make tensor from y_test + +X_test_var, y_test_var = Variable(X_test_tensor), Variable( + y_test_tensor +) # make variables from tensors + +out, prob = NN_clf(X_test_var) # get output and probabilities from X_test +y_pred_NN = ( + prob.cpu().detach().numpy().argmax(axis=1) +) # get signal/background predictions ~~~ {: .language-python} diff --git a/_episodes/10-Overfitting_Check.md b/_episodes/10-Overfitting_Check.md index 4ddd2a5..8c432de 100644 --- a/_episodes/10-Overfitting_Check.md +++ b/_episodes/10-Overfitting_Check.md @@ -24,7 +24,10 @@ The code to plot the overfitting check is a bit long, so once again you can see ~~~ from my_functions import compare_train_test -compare_train_test(RF_clf, X_train_scaled, y_train, X_test_scaled, y_test, 'Random Forest output') + +compare_train_test( + RF_clf, X_train_scaled, y_train, X_test_scaled, y_test, "Random Forest output" +) ~~~ {: .language-python} diff --git a/_episodes/11-Model_Comparison.md b/_episodes/11-Model_Comparison.md index adefe85..ecab06d 100644 --- a/_episodes/11-Model_Comparison.md +++ b/_episodes/11-Model_Comparison.md @@ -62,9 +62,9 @@ By default, the threshold was set to 50% in computing the accuracy score when we ~~~ from sklearn.metrics import classification_report, roc_auc_score + # Random Forest Report -print (classification_report(y_test, y_pred_RF, - target_names=["background", "signal"])) +print(classification_report(y_test, y_pred_RF, target_names=["background", "signal"])) ~~~ {: .language-python} @@ -86,14 +86,18 @@ Out of the box, the random forest performs slightly better than the neural netwo Let's get the decisions of the random forest classifier. ~~~ -decisions_rf = RF_clf.predict_proba(X_test_scaled)[:,1] # get the decisions of the random forest +decisions_rf = RF_clf.predict_proba(X_test_scaled)[ + :, 1 +] # get the decisions of the random forest ~~~ {: .language-python} The decisions of the neural network classifier, `decisions_nn`, can be obtained like: ~~~ -decisions_nn = NN_clf(X_test_var)[1][:,1].cpu().detach().numpy() # get the decisions of the neural network +decisions_nn = ( + NN_clf(X_test_var)[1][:, 1].cpu().detach().numpy() +) # get the decisions of the neural network ~~~ {: .language-python} @@ -101,12 +105,20 @@ decisions_nn = NN_clf(X_test_var)[1][:,1].cpu().detach().numpy() # get the decis The Receiver Operating Characteristic (ROC) curve is a plot of the recall (or true positive rate) vs. the false positive rate: the ratio of negative instances incorrectly classified as positive. A classifier may classify many instances as positive (i.e. has a low tolerance for classifying something as positive), but in such an example it will probably also incorrectly classify many negative instances as positive as well. The false positive rate is plotted on the x-axis of the ROC curve and the true positive rate on the y-axis; the threshold is varied to give a parameteric curve. A random classifier results in a line. Before we look at the ROC curve, let's examine the following plot ~~~ -plt.hist(decisions_rf[y_test==0], histtype='step', bins=50, label='Background Events') # plot background -plt.hist(decisions_rf[y_test==1], histtype='step', bins=50, linestyle='dashed', label='Signal Events') # plot signal -plt.xlabel('Threshold') # x-axis label -plt.ylabel('Number of Events') # y-axis label -plt.semilogy() # make the y-axis semi-log -plt.legend() # draw the legend +plt.hist( + decisions_rf[y_test == 0], histtype="step", bins=50, label="Background Events" +) # plot background +plt.hist( + decisions_rf[y_test == 1], + histtype="step", + bins=50, + linestyle="dashed", + label="Signal Events", +) # plot signal +plt.xlabel("Threshold") # x-axis label +plt.ylabel("Number of Events") # y-axis label +plt.semilogy() # make the y-axis semi-log +plt.legend() # draw the legend ~~~ {: .language-python} @@ -116,7 +128,10 @@ Suppose we move the threshold from 0 to 1 in steps of 0.01. In doing so, we will ~~~ from sklearn.metrics import roc_curve -fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, decisions_rf) # get FPRs, TPRs and thresholds for random forest + +fpr_rf, tpr_rf, thresholds_rf = roc_curve( + y_test, decisions_rf +) # get FPRs, TPRs and thresholds for random forest ~~~ {: .language-python} @@ -134,13 +149,17 @@ fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, decisions_rf) # get FPRs, TPRs Now we plot the ROC curve: ~~~ -plt.plot(fpr_rf, tpr_rf, label='Random Forest') # plot random forest ROC -plt.plot(fpr_nn, tpr_nn, linestyle='dashed', label='Neural Network') # plot neural network ROC -plt.plot([0, 1], [0, 1], linestyle='dotted', color='grey', label='Luck') # plot diagonal line to indicate luck -plt.xlabel('False Positive Rate') # x-axis label -plt.ylabel('True Positive Rate') # y-axis label -plt.grid() # add a grid to the plot -plt.legend() # add a legend +plt.plot(fpr_rf, tpr_rf, label="Random Forest") # plot random forest ROC +plt.plot( + fpr_nn, tpr_nn, linestyle="dashed", label="Neural Network" +) # plot neural network ROC +plt.plot( + [0, 1], [0, 1], linestyle="dotted", color="grey", label="Luck" +) # plot diagonal line to indicate luck +plt.xlabel("False Positive Rate") # x-axis label +plt.ylabel("True Positive Rate") # y-axis label +plt.grid() # add a grid to the plot +plt.legend() # add a legend ~~~ {: .language-python} @@ -188,12 +207,14 @@ Other values for $$b_r$$ would also be possible. Once you've plotted AMS for the Then plot: ~~~ -plt.plot(thresholds_rf, ams_rf, label='Random Forest') # plot random forest AMS -plt.plot(thresholds_nn, ams_nn, linestyle='dashed', label='Neural Network') # plot neural network AMS -plt.xlabel('Threshold') # x-axis label -plt.ylabel('AMS') # y-axis label -plt.title('AMS with $b_r=0.001$') # add plot title -plt.legend() # add legend +plt.plot(thresholds_rf, ams_rf, label="Random Forest") # plot random forest AMS +plt.plot( + thresholds_nn, ams_nn, linestyle="dashed", label="Neural Network" +) # plot neural network AMS +plt.xlabel("Threshold") # x-axis label +plt.ylabel("AMS") # y-axis label +plt.title("AMS with $b_r=0.001$") # add plot title +plt.legend() # add legend ~~~ {: .language-python} diff --git a/_episodes/12-Experimental_Data.md b/_episodes/12-Experimental_Data.md index 5ab9a2d..a0e1b5c 100644 --- a/_episodes/12-Experimental_Data.md +++ b/_episodes/12-Experimental_Data.md @@ -84,16 +84,30 @@ We first need to get the real experimental data. Now we can overlay the real experimental data on the simulated data. ~~~ -labels = ['background','signal'] # labels for simulated data -thresholds = [] # define list to hold random forest classifier probability predictions for each sample -for s in samples: # loop over samples - thresholds.append(RF_clf.predict_proba(scaler.transform(DataFrames[s][ML_inputs]))[:,1]) # predict probabilities for each sample -plt.hist(thresholds, bins=np.arange(0, 0.8, 0.1), density=True, stacked=True, label=labels) # plot simulated data -data_hist = np.histogram(RF_clf.predict_proba(X_data_scaled)[:,1], bins=np.arange(0, 0.8, 0.1), density=True)[0] # histogram the experimental data -scale = sum(RF_clf.predict_proba(X_data_scaled)[:,1]) / sum(data_hist) # get scale imposed by density=True -data_err = np.sqrt(data_hist * scale) / scale # get error on experimental data -plt.errorbar(x=np.arange(0.05, 0.75, 0.1), y=data_hist, yerr=data_err, label='Data') # plot the experimental data errorbars -plt.xlabel('Threshold') +labels = ["background", "signal"] # labels for simulated data +thresholds = ( + [] +) # define list to hold random forest classifier probability predictions for each sample +for s in samples: # loop over samples + thresholds.append( + RF_clf.predict_proba(scaler.transform(DataFrames[s][ML_inputs]))[:, 1] + ) # predict probabilities for each sample +plt.hist( + thresholds, bins=np.arange(0, 0.8, 0.1), density=True, stacked=True, label=labels +) # plot simulated data +data_hist = np.histogram( + RF_clf.predict_proba(X_data_scaled)[:, 1], bins=np.arange(0, 0.8, 0.1), density=True +)[ + 0 +] # histogram the experimental data +scale = sum(RF_clf.predict_proba(X_data_scaled)[:, 1]) / sum( + data_hist +) # get scale imposed by density=True +data_err = np.sqrt(data_hist * scale) / scale # get error on experimental data +plt.errorbar( + x=np.arange(0.05, 0.75, 0.1), y=data_hist, yerr=data_err, label="Data" +) # plot the experimental data errorbars +plt.xlabel("Threshold") plt.legend() ~~~ {: .language-python} @@ -105,14 +119,14 @@ Within errors, the real experimental data errorbars agree with the simulated dat How many signal events is the random forest classifier predicting? ~~~ -print(np.count_nonzero(y_data_RF==1)) # signal +print(np.count_nonzero(y_data_RF == 1)) # signal ~~~ {: .language-python} What about background? ~~~ -print(np.count_nonzero(y_data_RF==0)) # background +print(np.count_nonzero(y_data_RF == 0)) # background ~~~ {: .language-python} diff --git a/_episodes/13-TensorFlow.md b/_episodes/13-TensorFlow.md index 57ff34b..3165481 100644 --- a/_episodes/13-TensorFlow.md +++ b/_episodes/13-TensorFlow.md @@ -26,9 +26,10 @@ TensorFlow is also interoperable with other packages discussed so far, datatypes Here we will import all the required TensorFlow libraries for the rest of the tutorial. ~~~ -from tensorflow import keras # tensorflow wrapper -from tensorflow.random import set_seed # import set_seed function for TensorFlow -set_seed(seed_value) # set TensorFlow random seed +from tensorflow import keras # tensorflow wrapper +from tensorflow.random import set_seed # import set_seed function for TensorFlow + +set_seed(seed_value) # set TensorFlow random seed ~~~ {: .language-python} @@ -39,15 +40,25 @@ From the previous pages, we already have our data in NumPy arrays, like `X`, tha To use a neural network with TensorFlow, we modularize its construction using a function. We will later pass this function into a Keras wrapper. ~~~ -def build_model(n_hidden=1, n_neurons=5, learning_rate=1e-3): # function to build a neural network model +def build_model( + n_hidden=1, n_neurons=5, learning_rate=1e-3 +): # function to build a neural network model # Build - model = keras.models.Sequential() # initialise the model - for layer in range(n_hidden): # loop over hidden layers - model.add(keras.layers.Dense(n_neurons, activation="relu")) # add layer to your model - model.add(keras.layers.Dense(2, activation='softmax')) # add output layer + model = keras.models.Sequential() # initialise the model + for layer in range(n_hidden): # loop over hidden layers + model.add( + keras.layers.Dense(n_neurons, activation="relu") + ) # add layer to your model + model.add(keras.layers.Dense(2, activation="softmax")) # add output layer # Compile - optimizer = keras.optimizers.SGD(learning_rate=learning_rate) # define the optimizer - model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy']) # compile your model + optimizer = keras.optimizers.SGD( + learning_rate=learning_rate + ) # define the optimizer + model.compile( + loss="sparse_categorical_crossentropy", + optimizer=optimizer, + metrics=["accuracy"], + ) # compile your model return model ~~~ {: .language-python} @@ -57,7 +68,10 @@ For now, ignore all the complicated hyperparameters, but note that the loss used We need to keep some events for validation. Validation sets are used to select and tune the final neural network model. ~~~ -X_valid_scaled, X_train_nn_scaled = X_train_scaled[:100], X_train_scaled[100:] # first 100 events for validation +X_valid_scaled, X_train_nn_scaled = ( + X_train_scaled[:100], + X_train_scaled[100:], +) # first 100 events for validation ~~~ {: .language-python} @@ -75,8 +89,12 @@ X_valid_scaled, X_train_nn_scaled = X_train_scaled[:100], X_train_scaled[100:] # With these parameters, the network can be trained as follows: ~~~ -tf_clf = keras.wrappers.scikit_learn.KerasClassifier(build_model) # call the build_model function defined earlier -tf_clf.fit(X_train_nn_scaled, y_train_nn, validation_data=(X_valid_scaled, y_valid)) # fit your neural network +tf_clf = keras.wrappers.scikit_learn.KerasClassifier( + build_model +) # call the build_model function defined earlier +tf_clf.fit( + X_train_nn_scaled, y_train_nn, validation_data=(X_valid_scaled, y_valid) +) # fit your neural network ~~~ {: .language-python} diff --git a/_episodes/14-ttZ.md b/_episodes/14-ttZ.md index 1bf0768..f62c7a4 100644 --- a/_episodes/14-ttZ.md +++ b/_episodes/14-ttZ.md @@ -229,38 +229,84 @@ Let's make a plot where we directly compare real, experimental data with all sim ~~~ # dictionary to hold colors for each sample -colors_dict = {'Other':'#79b278', 'Z0HF':'#ce0000', 'Z1HF':'#ffcccc', 'Z2HF':'#ff6666', 'ttbar':'#f8f8f8', 'ttZ':'#00ccfd'} - -mc_stat_err_squared = np.zeros(10) # define array to hold the MC statistical uncertainties, 1 zero for each bin -for s in samples_ttZ: # loop over samples - X_s = DataFrames_ttZ[s][ML_inputs_ttZ] # get ML_inputs_ttZ columns from DataFrame for this sample - predicted_prob = RF_clf_ttZ.predict_proba(X_s) # get probability predictions - predicted_prob_signal = predicted_prob[:,1] # 2nd column of predicted_prob corresponds to probability of being signal - thresholds_ttZ.append(predicted_prob_signal) # append predict probabilities for each sample - weights_ttZ.append(DataFrames_ttZ[s]['totalWeight']) # append weights for each sample - weights_squared,_ = np.histogram(predicted_prob_signal, bins=np.arange(0, 1.1, 0.1), - weights=DataFrames_ttZ[s]['totalWeight']**2) # square the totalWeights - mc_stat_err_squared = np.add(mc_stat_err_squared,weights_squared) # add weights_squared for s - colors_ttZ.append( colors_dict[s] ) # append colors for each sample -mc_stat_err = np.sqrt( mc_stat_err_squared ) # statistical error on the MC bars +colors_dict = { + "Other": "#79b278", + "Z0HF": "#ce0000", + "Z1HF": "#ffcccc", + "Z2HF": "#ff6666", + "ttbar": "#f8f8f8", + "ttZ": "#00ccfd", +} + +mc_stat_err_squared = np.zeros( + 10 +) # define array to hold the MC statistical uncertainties, 1 zero for each bin +for s in samples_ttZ: # loop over samples + X_s = DataFrames_ttZ[s][ + ML_inputs_ttZ + ] # get ML_inputs_ttZ columns from DataFrame for this sample + predicted_prob = RF_clf_ttZ.predict_proba(X_s) # get probability predictions + predicted_prob_signal = predicted_prob[ + :, 1 + ] # 2nd column of predicted_prob corresponds to probability of being signal + thresholds_ttZ.append( + predicted_prob_signal + ) # append predict probabilities for each sample + weights_ttZ.append( + DataFrames_ttZ[s]["totalWeight"] + ) # append weights for each sample + weights_squared, _ = np.histogram( + predicted_prob_signal, + bins=np.arange(0, 1.1, 0.1), + weights=DataFrames_ttZ[s]["totalWeight"] ** 2, + ) # square the totalWeights + mc_stat_err_squared = np.add( + mc_stat_err_squared, weights_squared + ) # add weights_squared for s + colors_ttZ.append(colors_dict[s]) # append colors for each sample +mc_stat_err = np.sqrt(mc_stat_err_squared) # statistical error on the MC bars # plot simulated data -mc_heights = plt.hist(thresholds_ttZ, bins=np.arange(0, 1.1, 0.1), weights=weights_ttZ, stacked=True, label=samples_ttZ, - color=colors_ttZ) +mc_heights = plt.hist( + thresholds_ttZ, + bins=np.arange(0, 1.1, 0.1), + weights=weights_ttZ, + stacked=True, + label=samples_ttZ, + color=colors_ttZ, +) -mc_tot = mc_heights[0][-1] # stacked simulated data y-axis value +mc_tot = mc_heights[0][-1] # stacked simulated data y-axis value # plot the statistical uncertainty -plt.bar(np.arange(0.05, 1.05, 0.1), # x - 2*mc_stat_err, # heights - bottom=mc_tot-mc_stat_err, color='none', hatch="////", width=0.1, label='Stat. Unc.' ) - -predicted_prob_data = RF_clf_ttZ.predict_proba(X_data_ttZ) # get probability predictions on data -predicted_prob_data_signal = predicted_prob_data[:, 1] # 2nd column corresponds to probability of being signal -data_hist_ttZ,_ = np.histogram(predicted_prob_data_signal, bins=np.arange(0, 1.1, 0.1)) # histogram the experimental data -data_err_ttZ = np.sqrt(data_hist_ttZ) # get error on experimental data -plt.errorbar(x=np.arange(0.05, 1.05, 0.1), y=data_hist_ttZ, yerr=data_err_ttZ, label='Data', fmt='ko') # plot the experimental data -plt.xlabel('Threshold') +plt.bar( + np.arange(0.05, 1.05, 0.1), # x + 2 * mc_stat_err, # heights + bottom=mc_tot - mc_stat_err, + color="none", + hatch="////", + width=0.1, + label="Stat. Unc.", +) + +predicted_prob_data = RF_clf_ttZ.predict_proba( + X_data_ttZ +) # get probability predictions on data +predicted_prob_data_signal = predicted_prob_data[ + :, 1 +] # 2nd column corresponds to probability of being signal +data_hist_ttZ, _ = np.histogram( + predicted_prob_data_signal, bins=np.arange(0, 1.1, 0.1) +) # histogram the experimental data +data_err_ttZ = np.sqrt(data_hist_ttZ) # get error on experimental data +plt.errorbar( + x=np.arange(0.05, 1.05, 0.1), + y=data_hist_ttZ, + yerr=data_err_ttZ, + label="Data", + fmt="ko", +) # plot the experimental data +plt.xlabel("Threshold") plt.legend() ~~~ {: .language-python}