From 42081f0f2591688c67b74747f4e4a115f81eb5a6 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Mon, 9 Dec 2024 11:47:57 +0400 Subject: [PATCH 01/20] Updated vulnscan.py Improved model and vectorizer loading with thread locking and file scanning functionality Made `is_sensitive` have a reason for logging sensitive file Improved the threading process and logging --- CODE/vulnscan.py | 132 ++++++++++++++++++++--------------------------- 1 file changed, 55 insertions(+), 77 deletions(-) diff --git a/CODE/vulnscan.py b/CODE/vulnscan.py index 6d9ec78b..e9cf5fea 100644 --- a/CODE/vulnscan.py +++ b/CODE/vulnscan.py @@ -6,20 +6,23 @@ import warnings import joblib +import numpy as np import torch from safetensors import safe_open from sklearn.feature_extraction.text import TfidfVectorizer -from tqdm import tqdm # Set up logging from logicytics import Log, DEBUG -# Use v3 models on this! Especially NN models - if __name__ == "__main__": - log = Log( - {"log_level": DEBUG} - ) + log = Log({"log_level": DEBUG}) + +log.info("Locking threads - Model and Vectorizer") +model_lock = threading.Lock() +vectorizer_lock = threading.Lock() + +model_to_use = None +vectorizer_to_use = None def load_model(model_path_to_load: str) -> safe_open | torch.nn.Module: @@ -42,12 +45,28 @@ def load_model(model_path_to_load: str) -> safe_open | torch.nn.Module: elif model_path_to_load.endswith('.pth'): with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=FutureWarning) - return torch.load(model_path_to_load) + return torch.load(model_path_to_load, weights_only=False) else: raise ValueError("Unsupported model file format. Use .pkl, .safetensors, or .pth") -def is_sensitive(model: torch.nn.Module, vectorizer: TfidfVectorizer, file_content: str) -> tuple[bool, float]: +def scan_path(model_path: str, scan_paths: str, vectorizer_path: str): + global model_to_use, vectorizer_to_use + try: + with model_lock: + if model_to_use is None: + log.info(f"Loading model from {model_path}") + model_to_use = load_model(model_path) + with vectorizer_lock: + if vectorizer_to_use is None: + log.info(f"Loading vectorizer from {vectorizer_path}") + vectorizer_to_use = joblib.load(vectorizer_path) + vulnscan(model_to_use, scan_paths, vectorizer_to_use) + except Exception as e: + log.error(f"Error scanning path {scan_paths}: {e}") + + +def is_sensitive(model: torch.nn.Module, vectorizer: TfidfVectorizer, file_content: str) -> tuple[bool, float, str]: """ Determine if the file content is sensitive using the provided model and vectorizer. @@ -57,7 +76,7 @@ def is_sensitive(model: torch.nn.Module, vectorizer: TfidfVectorizer, file_conte file_content (str): Content of the file to be analyzed. Returns: - tuple: (True if the content is sensitive, False otherwise, prediction probability). + tuple: (True if the content is sensitive, False otherwise, prediction probability, reason). 
""" if isinstance(model, torch.nn.Module): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -68,15 +87,19 @@ def is_sensitive(model: torch.nn.Module, vectorizer: TfidfVectorizer, file_conte features_tensor = torch.tensor(features.toarray(), dtype=torch.float32).to(device) prediction = model(features_tensor) probability = torch.softmax(prediction, dim=1).max().item() - return prediction.argmax(dim=1).item() == 1, probability + top_features = np.argsort(features.toarray()[0])[-5:] + reason = ", ".join([vectorizer.get_feature_names_out()[i] for i in top_features]) + return prediction.argmax(dim=1).item() == 1, probability, reason else: features = vectorizer.transform([file_content]) prediction = model.predict_proba(features) probability = prediction.max() - return model.predict(features)[0] == 1, probability + top_features = np.argsort(features.toarray()[0])[-5:] + reason = ", ".join([vectorizer.get_feature_names_out()[i] for i in top_features]) + return model.predict(features)[0] == 1, probability, reason -def scan_file(model: torch.nn.Module, vectorizer: TfidfVectorizer, file_path: str) -> tuple[bool, float]: +def scan_file(model: torch.nn.Module, vectorizer: TfidfVectorizer, file_path: str) -> tuple[bool, float, str]: """ Scan a single file to determine if it contains sensitive content. @@ -99,83 +122,38 @@ def scan_file(model: torch.nn.Module, vectorizer: TfidfVectorizer, file_path: st return is_sensitive(model, vectorizer, content) -def scan_directory(model: torch.nn.Module, vectorizer, dir_path: str) -> dict[str, tuple[bool, float]]: - """ - Scan all files in a directory to determine if they contain sensitive content. - - Args: - model: Machine learning model. - vectorizer: Vectorizer to transform file content. - dir_path (str): Path to the directory to be scanned. - - Returns: - dict: Dictionary with file paths as keys and (sensitivity, prediction probability) as values. - """ - results = {} - for roots, _, files_dir in os.walk(dir_path): - for file in tqdm(files_dir, desc="Scanning files", unit="file", leave=True): - file_path = os.path.join(roots, file) - if file.endswith(('.zip', '.rar', '.7z', '.tar', '.gz', '.tar.gz')): - continue - results[file_path] = scan_file(model, vectorizer, file_path) - - return results - - -def main(MODELS_PATH: str, SCAN_PATH: str, VECTORIZER_PATH: str): - """ - Main function to load the model and vectorizer, and scan the specified path. - Saves the paths of sensitive files to a file named "Sensitive_File_Paths.txt". - - Args: - MODELS_PATH (str): Path to the model file. - SCAN_PATH (str): Path to the file or directory to be scanned. - VECTORIZER_PATH (str): Path to the vectorizer file. - """ - log.info(f"Loading model from {MODELS_PATH}") - model = load_model(MODELS_PATH) - log.info(f"Loading vectorizer from {VECTORIZER_PATH}") - vectorizer = joblib.load(VECTORIZER_PATH) # Adjust as needed +def vulnscan(model, SCAN_PATH, vectorizer): log.info(f"Scanning {SCAN_PATH}") - if os.path.isfile(SCAN_PATH): - result, probability = scan_file(model, vectorizer, SCAN_PATH) - log.info(f"File {SCAN_PATH} is {'sensitive' if result else 'not sensitive'} with probability {probability:.2f}") - with open("Sensitive_File_Paths.txt", "w") as sensitive_file: + result, probability, reason = scan_file(model, vectorizer, SCAN_PATH) + if result: + log.info(f"File {SCAN_PATH} is sensitive with probability {probability:.2f}. 
Reason: {reason}") + if not os.path.exists("Sensitive_File_Paths.txt"): + with open("Sensitive_File_Paths.txt", "w") as sensitive_file: + sensitive_file.write(f"{SCAN_PATH}\n\n") + with open("Sensitive_File_Paths.txt", "a") as sensitive_file: sensitive_file.write(f"{SCAN_PATH}\n") - elif os.path.isdir(SCAN_PATH): - results = scan_directory(model, vectorizer, SCAN_PATH) - with open("Sensitive_File_Paths.txt", "w") as sensitive_file: - for file_path, (is_sensitive_main, probability) in results.items(): - log.info(f"File {file_path} is {'sensitive' if is_sensitive_main else 'not sensitive'} with probability {probability:.2f}") - if is_sensitive_main: - sensitive_file.write(f"{file_path}\n") - else: - log.error("Invalid path provided. Please provide a valid file or directory path.") - exit(1) -def scan_path(model_path: str, scan_paths: str, vectorizer_path: str): - """ - Scan the specified path using the provided model and vectorizer. - - Args: - model_path (str): Path to the model file. - scan_paths (str): Path to the file or directory to be scanned. - vectorizer_path (str): Path to the vectorizer file. - """ - main(model_path, scan_paths, vectorizer_path) - - -log.warning("Starting scan - This may take hours!!") +# Start scanning +log.info("Getting paths to scan - This may take some time!!") threads = [] -paths = [ +paths = [] +base_paths = [ "C:\\Users\\", "C:\\Windows\\Logs", "C:\\Program Files", "C:\\Program Files (x86)" ] +for base_path in base_paths: + for root, dirs, files_main in os.walk(base_path): + for file_main in files_main: + paths.append(os.path.join(root, file_main)) + +# Start scanning +log.warning("Starting scan - This may take hours and consume memory!!") + for path in paths: thread = threading.Thread(target=scan_path, args=("VulnScan/Model SenseMini .3n3.pth", path, "VulnScan/Vectorizer .3n3.pkl")) From 41044eba0613a4a489cd162deff80feebedbd537 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Mon, 9 Dec 2024 14:03:58 +0400 Subject: [PATCH 02/20] Added _study_network.py to tools Add summary and visualization functions for neural network model - Implement `summary` function to generate a detailed summary of the model - Implement `visualize_model` function to create a directed graph of the model's layers and weights - Save model summary and visualization to 'Vectorizer features' directory - Add progress tracking and file handling for vectorizer features --- .gitignore | 1 + .idea/Logicytics.iml | 1 + CODE/VulnScan/tools/_study_network.py | 228 ++++++++++++++++++ CODE/VulnScan/tools/_test_gpu_acceleration.py | 3 +- CODE/logicytics/FileManagement.py | 4 +- requirements.txt | 26 +- 6 files changed, 249 insertions(+), 14 deletions(-) create mode 100644 CODE/VulnScan/tools/_study_network.py diff --git a/.gitignore b/.gitignore index bca2a5a3..848eb65c 100644 --- a/.gitignore +++ b/.gitignore @@ -319,3 +319,4 @@ $RECYCLE.BIN/ *.pyc /CODE/SysInternal_Suite/.sys.ignore /ACCESS/ +/CODE/VulnScan/tools/Vectorizer features/ diff --git a/.idea/Logicytics.iml b/.idea/Logicytics.iml index e33fd634..deff06b9 100644 --- a/.idea/Logicytics.iml +++ b/.idea/Logicytics.iml @@ -16,6 +16,7 @@ + diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py new file mode 100644 index 00000000..f74f10d6 --- /dev/null +++ b/CODE/VulnScan/tools/_study_network.py @@ -0,0 +1,228 @@ +import os.path +from collections import OrderedDict +from os import mkdir + +import joblib +import matplotlib.pyplot as plt +import networkx as nx +import numpy as np +import seaborn as sns 
+import torch +import torch.nn as nn +from torchviz import make_dot +from tqdm import tqdm + + +def summary(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): + def register_hook(module): + + def hook(modules, inputs, output): + class_name = str(modules.__class__).split(".")[-1].split("'")[0] + module_idx = len(summaries) + + m_key = "%s-%i" % (class_name, module_idx + 1) + summaries[m_key] = OrderedDict() + summaries[m_key]["input_shape"] = list(inputs[0].size()) + summaries[m_key]["input_shape"][0] = batch_size + if isinstance(output, (list, tuple)): + summaries[m_key]["output_shape"] = [ + [-1] + list(o.size())[1:] for o in output + ] + else: + summaries[m_key]["output_shape"] = list(output.size()) + summaries[m_key]["output_shape"][0] = batch_size + + params = 0 + if hasattr(modules, "weight") and hasattr(modules.weight, "size"): + params += torch.prod(torch.LongTensor(list(modules.weight.size()))) + summaries[m_key]["trainable"] = modules.weight.requires_grad + if hasattr(modules, "bias") and hasattr(modules.bias, "size"): + params += torch.prod(torch.LongTensor(list(modules.bias.size()))) + summaries[m_key]["nb_params"] = params + + if ( + not isinstance(module, nn.Sequential) + and not isinstance(module, nn.ModuleList) + and not (module == model_to_use) + ): + hooks.append(module.register_forward_hook(hook)) + + device_to_use = device_to_use.lower() + assert device_to_use in [ + "cuda", + "cpu", + ], "Input device is not valid, please specify 'cuda' or 'cpu'" + + if device_to_use == "cuda" and torch.cuda.is_available(): + dtype = torch.cuda.FloatTensor + else: + dtype = torch.FloatTensor + + # multiple inputs to the network + if isinstance(input_size, tuple): + input_size = [input_size] + + # batch_size of 2 for batch norm + x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size] + # print(type(x[0])) + + # create properties + summaries = OrderedDict() + hooks = [] + + # register hook + model_to_use.apply(register_hook) + + # make a forward pass + # print(x.shape) + model_to_use(*x) + + # remove these hooks + for h in hooks: + h.remove() + + with open('Vectorizer features/Model Summary.txt', 'w') as vf_ms: + vf_ms.write("----------------------------------------------------------------") + line_new = "{:>20} {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #") + vf_ms.write(line_new) + vf_ms.write("================================================================") + total_params = 0 + total_output = 0 + trainable_params = 0 + for layer in summaries: + # input_shape, output_shape, trainable, nb_params + line_new = "{:>20} {:>25} {:>15}".format( + layer, + str(summaries[layer]["output_shape"]), + "{0:,}".format(summaries[layer]["nb_params"]), + ) + total_params += summaries[layer]["nb_params"] + total_output += np.prod(summaries[layer]["output_shape"]) + if "trainable" in summaries[layer]: + if summaries[layer]["trainable"]: + trainable_params += summaries[layer]["nb_params"] + vf_ms.write(line_new) + + # assume 4 bytes/number (float on cuda). + total_input_size = abs(np.prod(input_size) * batch_size * 4. / (1024 ** 2.)) + total_output_size = abs(2. * total_output * 4. / (1024 ** 2.)) # x2 for gradients + total_params_size = abs(total_params.numpy() * 4. 
/ (1024 ** 2.)) + total_size = total_params_size + total_output_size + total_input_size + + vf_ms.write("================================================================") + vf_ms.write("Total params: {0:,}".format(total_params)) + vf_ms.write("Trainable params: {0:,}".format(trainable_params)) + vf_ms.write("Non-trainable params: {0:,}".format(total_params - trainable_params)) + vf_ms.write("----------------------------------------------------------------") + vf_ms.write("Input size (MB): %0.2f" % total_input_size) + vf_ms.write("Forward/backward pass size (MB): %0.2f" % total_output_size) + vf_ms.write("Params size (MB): %0.2f" % total_params_size) + vf_ms.write("Estimated Total Size (MB): %0.2f" % total_size) + vf_ms.write("----------------------------------------------------------------") + # return summary + + +def visualize_model(): + # Create a directed graph + G = nx.DiGraph() + + def add_edges_bulk(layer_names, weight_matrices): + """Efficiently add edges to the graph with progress tracking.""" + threshold = 0.1 # Adjust this threshold as needed + significant_weights = np.abs(weight_matrices) > threshold + rows, cols = np.where(significant_weights) + weights = weight_matrices[rows, cols] + + # Use tqdm for progress tracking + edge_count = len(rows) + with tqdm(total=edge_count, desc=f"Processing {layer_names}", unit="edges") as pbar: + for row, col, weight in zip(rows, cols, weights): + in_node = f"{layer_names}_in_{col}" + out_node = f"{layer_names}_out_{row}" + G.add_edge(in_node, out_node, weight=weight) + pbar.update(1) + + # Process parameters + for name, param in model.named_parameters(): + if 'weight' in name: + layer_name = name.split('.')[0] + weight_matrix = param.data.cpu().numpy() + + # Add edges with progress bar + add_edges_bulk(layer_name, weight_matrix) + + # Draw the graph + print("Writing the graph to a file...") + nx.write_gexf(G, "Vectorizer features/Neural Network Nodes Graph.gexf") + + +# TODO - Add more print statements to indicate the progress of the script +if __name__ == '__main__': + print("Visualizing the model and vectorizer features...") + print("This may take a while, please wait.") + + if not os.path.exists('Vectorizer features'): + mkdir('Vectorizer features') + + # Load the vectorizer + vectorizer_path = '../Vectorizer .3n3.pkl' + vectorizer = joblib.load(vectorizer_path) + + # Inspect the vectorizer + feature_names = vectorizer.get_feature_names_out() + with open('Vectorizer features/Vectorizer features', 'w') as f: + f.write(f"Number of features: {len(feature_names)}\n\n") + f.write('\n'.join(feature_names)) + + # Visualize the top 90 features + top_n = 90 + sorted_indices = vectorizer.idf_.argsort()[:top_n] + top_features = [feature_names[i] for i in sorted_indices] + top_idf_scores = vectorizer.idf_[sorted_indices] + + plt.figure(figsize=(20, 12)) # Increase the figure size + sns.barplot(x=top_idf_scores, y=top_features) + plt.title('Top 90 Features by IDF Score') + plt.xlabel('IDF Score') + plt.ylabel('Feature') + + # Save the plot as a vector graphic + plt.savefig('Vectorizer features/Top_90_Features.svg', format='svg') + + plt.show() + + # Load the model + model_path = '../Model SenseMini .3n3.pth' + model = torch.load(model_path, weights_only=False) + + # Save the model summary + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + summary(model, input_size=(1, vectorizer.vocabulary_.__len__())) + + # Save the model's state dictionary + with open('Vectorizer features/Model state dictionary.txt', 'w') 
as f: + f.write("Model's state dictionary:\n\n") + for param_tensor in model.state_dict(): + f.write(f"\n{param_tensor}\t{model.state_dict()[param_tensor].size()}") + + # Create a dummy input tensor with the appropriate size + dummy_input = torch.randn(1, vectorizer.vocabulary_.__len__()).to(device) + + # Generate the visualization + model_viz = make_dot(model(dummy_input), params=dict(model.named_parameters())) + + # Save the visualization to a file + model_viz.format = 'png' + model_viz.render(filename='Vectorizer features/Model Visualization', format='png') + + # Removing the temporary files as they are no longer needed, we saved them to the desired location + if os.path.exists("Digraph.gv"): + os.remove("Digraph.gv") + if os.path.exists("Digraph.gv.png"): + os.remove("Digraph.gv.png") + + # Visualize the model + visualize_model() + + print("Model visualization and summary have been saved to the 'Vectorizer features' directory.") diff --git a/CODE/VulnScan/tools/_test_gpu_acceleration.py b/CODE/VulnScan/tools/_test_gpu_acceleration.py index 0b82f523..e45d05c8 100644 --- a/CODE/VulnScan/tools/_test_gpu_acceleration.py +++ b/CODE/VulnScan/tools/_test_gpu_acceleration.py @@ -21,4 +21,5 @@ def check_gpu(): print(f"Error initializing CUDA: {err}") -check_gpu() +if __name__ == '__main__': + check_gpu() diff --git a/CODE/logicytics/FileManagement.py b/CODE/logicytics/FileManagement.py index 188b1341..07f9fc3c 100644 --- a/CODE/logicytics/FileManagement.py +++ b/CODE/logicytics/FileManagement.py @@ -107,7 +107,9 @@ def __get_files_to_zip(path: str) -> list: list: A list of file and directory names to be zipped. """ excluded_extensions = (".py", ".exe", ".bat", ".ps1", ".pkl", ".pth") - excluded_prefixes = ("config.ini", "SysInternal_Suite", "__pycache__", "logicytics", "VulnScan") + excluded_prefixes = ("config.ini", "SysInternal_Suite", + "__pycache__", "logicytics", "VulnScan", + "Vectorizer features") return [ f for f in os.listdir(path) diff --git a/requirements.txt b/requirements.txt index a67b234f..a22c3358 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,22 @@ +configobj~=5.0.9 +joblib~=1.3.2 +matplotlib~=3.8.4 +torch~=2.5.1+cu124 +xgboost~=2.1.3 +scikit-learn~=1.5.2 +Faker~=30.3.0 +numpy~=1.26.4 +transformers~=4.38.2 requests~=2.32.3 psutil~=6.1.0 -colorlog~=6.9.0 DateTime~=5.5 pathlib~=1.0.1 +colorlog~=6.9.0 +safetensors~=0.4.5 prettytable~=3.12.0 -scikit-learn~=1.5.2 -joblib~=1.3.2 -matplotlib~=3.8.4 -numpy~=1.26.4 -Faker~=30.3.0 -transformers~=4.38.2 -xgboost~=2.1.3 -torch~=2.5.1+cu124 pandas~=2.2.2 networkx~=3.2.1 scapy~=2.5.0 -safetensors~=0.4.2 -tqdm~=4.66.6 -configobj~=5.0.9 \ No newline at end of file +seaborn~=0.13.2 +torchviz~=0.0.3 +tqdm~=4.66.6 \ No newline at end of file From d3566a8268b50421d1d9cf4caef117786efe9710 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Mon, 9 Dec 2024 14:22:54 +0400 Subject: [PATCH 03/20] Fixed minor bug --- CODE/VulnScan/tools/_study_network.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index f74f10d6..5f2084a6 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -122,7 +122,7 @@ def hook(modules, inputs, output): # return summary -def visualize_model(): +def visualize_model(models, output_file="model_graph.gexf"): # Create a directed graph G = nx.DiGraph() @@ -142,8 +142,8 @@ def add_edges_bulk(layer_names, weight_matrices): G.add_edge(in_node, out_node, 
weight=weight) pbar.update(1) - # Process parameters - for name, param in model.named_parameters(): + # Process model parameters + for name, param in models.named_parameters(): if 'weight' in name: layer_name = name.split('.')[0] weight_matrix = param.data.cpu().numpy() @@ -151,9 +151,9 @@ def add_edges_bulk(layer_names, weight_matrices): # Add edges with progress bar add_edges_bulk(layer_name, weight_matrix) - # Draw the graph - print("Writing the graph to a file...") - nx.write_gexf(G, "Vectorizer features/Neural Network Nodes Graph.gexf") + # Save the graph to a GEXF file + nx.write_gexf(G, output_file) + print(f"Graph saved to {output_file}") # TODO - Add more print statements to indicate the progress of the script @@ -223,6 +223,6 @@ def add_edges_bulk(layer_names, weight_matrices): os.remove("Digraph.gv.png") # Visualize the model - visualize_model() + visualize_model(model, output_file='Vectorizer features/NN.gexf') print("Model visualization and summary have been saved to the 'Vectorizer features' directory.") From 62ad70a070123dcdcb5be338a802cb63a1aaf9d9 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Mon, 9 Dec 2024 14:22:54 +0400 Subject: [PATCH 04/20] Fixed minor bug --- CODE/VulnScan/tools/_study_network.py | 59 +++++++++++++++++++-------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index f74f10d6..34099caa 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -3,14 +3,14 @@ from os import mkdir import joblib -import matplotlib.pyplot as plt -import networkx as nx -import numpy as np import seaborn as sns import torch import torch.nn as nn from torchviz import make_dot +import networkx as nx +import numpy as np from tqdm import tqdm +import matplotlib.pyplot as plt def summary(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): @@ -122,13 +122,13 @@ def hook(modules, inputs, output): # return summary -def visualize_model(): - # Create a directed graph - G = nx.DiGraph() +def visualize_model(models, output_dir="model_graphs", visualize_separately=True): + # Create a directed graph for the whole model + os.makedirs(output_dir, exist_ok=True) # Ensure the output directory exists - def add_edges_bulk(layer_names, weight_matrices): + def add_edges_bulk(layer_names, weight_matrices, G): """Efficiently add edges to the graph with progress tracking.""" - threshold = 0.1 # Adjust this threshold as needed + threshold = 1 # Adjust this threshold as needed significant_weights = np.abs(weight_matrices) > threshold rows, cols = np.where(significant_weights) weights = weight_matrices[rows, cols] @@ -142,18 +142,45 @@ def add_edges_bulk(layer_names, weight_matrices): G.add_edge(in_node, out_node, weight=weight) pbar.update(1) - # Process parameters - for name, param in model.named_parameters(): + # Process model parameters and create graphs for each layer + layer_graphs = {} + + for name, param in models.named_parameters(): if 'weight' in name: layer_name = name.split('.')[0] weight_matrix = param.data.cpu().numpy() - # Add edges with progress bar - add_edges_bulk(layer_name, weight_matrix) + # Create a new graph for the current layer and add edges + layer_G = nx.DiGraph() + add_edges_bulk(layer_name, weight_matrix, layer_G) + + # Store the graph for the layer + layer_graphs[layer_name] = layer_G + + # Save the layer graph to a separate file + layer_output_file = os.path.join(output_dir, f"{layer_name}_graph.gexf") + 
nx.write_gexf(layer_G, layer_output_file) + print(f"Layer graph saved to {layer_output_file}") + + if visualize_separately: + # Visualize each graph separately + for layer_name, layer_G in layer_graphs.items(): + plt.figure(figsize=(8, 8)) + pos = nx.spring_layout(layer_G, seed=42) # Layout for better visualization + nx.draw(layer_G, pos, with_labels=True, node_size=50, node_color="skyblue", font_size=8, font_color="black", + alpha=0.6) + plt.title(f"Visualization for {layer_name}") + plt.show() + + else: + # Combine all layer graphs into one and visualize + combined_graph = nx.DiGraph() + + for layer_name, layer_G in layer_graphs.items(): + combined_graph.add_nodes_from(layer_G.nodes()) + combined_graph.add_edges_from(layer_G.edges()) - # Draw the graph - print("Writing the graph to a file...") - nx.write_gexf(G, "Vectorizer features/Neural Network Nodes Graph.gexf") + print("Visualization complete.") # TODO - Add more print statements to indicate the progress of the script @@ -223,6 +250,6 @@ def add_edges_bulk(layer_names, weight_matrices): os.remove("Digraph.gv.png") # Visualize the model - visualize_model() + visualize_model(model) print("Model visualization and summary have been saved to the 'Vectorizer features' directory.") From 8205e5b74f270c1b522ecbacf2a7dd14f86bb8ec Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Tue, 10 Dec 2024 09:24:44 +0400 Subject: [PATCH 05/20] Fixing more bugs Saving files now is neater --- .gitignore | 2 +- .idea/Logicytics.iml | 1 + CODE/VulnScan/tools/_study_network.py | 143 +++++++++----------------- requirements.txt | 3 +- 4 files changed, 54 insertions(+), 95 deletions(-) diff --git a/.gitignore b/.gitignore index 848eb65c..add49568 100644 --- a/.gitignore +++ b/.gitignore @@ -319,4 +319,4 @@ $RECYCLE.BIN/ *.pyc /CODE/SysInternal_Suite/.sys.ignore /ACCESS/ -/CODE/VulnScan/tools/Vectorizer features/ +/CODE/VulnScan/tools/NN features/ diff --git a/.idea/Logicytics.iml b/.idea/Logicytics.iml index deff06b9..9d371a5c 100644 --- a/.idea/Logicytics.iml +++ b/.idea/Logicytics.iml @@ -17,6 +17,7 @@ + diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index 34099caa..c74546c7 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -3,17 +3,16 @@ from os import mkdir import joblib +import matplotlib.pyplot as plt +import networkx as nx +import numpy as np import seaborn as sns import torch import torch.nn as nn from torchviz import make_dot -import networkx as nx -import numpy as np -from tqdm import tqdm -import matplotlib.pyplot as plt -def summary(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): +def save_graph(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): def register_hook(module): def hook(modules, inputs, output): @@ -81,11 +80,11 @@ def hook(modules, inputs, output): for h in hooks: h.remove() - with open('Vectorizer features/Model Summary.txt', 'w') as vf_ms: - vf_ms.write("----------------------------------------------------------------") + with open('NN features/Model Summary.txt', 'w') as vf_ms: + vf_ms.write("----------------------------------------------------------------\n") line_new = "{:>20} {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #") - vf_ms.write(line_new) - vf_ms.write("================================================================") + vf_ms.write(f"{line_new}\n") + vf_ms.write("================================================================\n") total_params = 0 total_output = 0 trainable_params = 0 
@@ -101,7 +100,7 @@ def hook(modules, inputs, output): if "trainable" in summaries[layer]: if summaries[layer]["trainable"]: trainable_params += summaries[layer]["nb_params"] - vf_ms.write(line_new) + vf_ms.write(f"{line_new}\n") # assume 4 bytes/number (float on cuda). total_input_size = abs(np.prod(input_size) * batch_size * 4. / (1024 ** 2.)) @@ -109,87 +108,47 @@ def hook(modules, inputs, output): total_params_size = abs(total_params.numpy() * 4. / (1024 ** 2.)) total_size = total_params_size + total_output_size + total_input_size - vf_ms.write("================================================================") - vf_ms.write("Total params: {0:,}".format(total_params)) - vf_ms.write("Trainable params: {0:,}".format(trainable_params)) - vf_ms.write("Non-trainable params: {0:,}".format(total_params - trainable_params)) - vf_ms.write("----------------------------------------------------------------") - vf_ms.write("Input size (MB): %0.2f" % total_input_size) - vf_ms.write("Forward/backward pass size (MB): %0.2f" % total_output_size) - vf_ms.write("Params size (MB): %0.2f" % total_params_size) - vf_ms.write("Estimated Total Size (MB): %0.2f" % total_size) - vf_ms.write("----------------------------------------------------------------") - # return summary - - -def visualize_model(models, output_dir="model_graphs", visualize_separately=True): - # Create a directed graph for the whole model - os.makedirs(output_dir, exist_ok=True) # Ensure the output directory exists - - def add_edges_bulk(layer_names, weight_matrices, G): - """Efficiently add edges to the graph with progress tracking.""" - threshold = 1 # Adjust this threshold as needed - significant_weights = np.abs(weight_matrices) > threshold - rows, cols = np.where(significant_weights) - weights = weight_matrices[rows, cols] - - # Use tqdm for progress tracking - edge_count = len(rows) - with tqdm(total=edge_count, desc=f"Processing {layer_names}", unit="edges") as pbar: - for row, col, weight in zip(rows, cols, weights): - in_node = f"{layer_names}_in_{col}" - out_node = f"{layer_names}_out_{row}" - G.add_edge(in_node, out_node, weight=weight) - pbar.update(1) - - # Process model parameters and create graphs for each layer - layer_graphs = {} - - for name, param in models.named_parameters(): - if 'weight' in name: - layer_name = name.split('.')[0] - weight_matrix = param.data.cpu().numpy() - - # Create a new graph for the current layer and add edges - layer_G = nx.DiGraph() - add_edges_bulk(layer_name, weight_matrix, layer_G) - - # Store the graph for the layer - layer_graphs[layer_name] = layer_G - - # Save the layer graph to a separate file - layer_output_file = os.path.join(output_dir, f"{layer_name}_graph.gexf") - nx.write_gexf(layer_G, layer_output_file) - print(f"Layer graph saved to {layer_output_file}") - - if visualize_separately: - # Visualize each graph separately - for layer_name, layer_G in layer_graphs.items(): - plt.figure(figsize=(8, 8)) - pos = nx.spring_layout(layer_G, seed=42) # Layout for better visualization - nx.draw(layer_G, pos, with_labels=True, node_size=50, node_color="skyblue", font_size=8, font_color="black", - alpha=0.6) - plt.title(f"Visualization for {layer_name}") - plt.show() + vf_ms.write("\n================================================================") + vf_ms.write("\nTotal params: {0:,}".format(total_params)) + vf_ms.write("\nTrainable params: {0:,}".format(trainable_params)) + vf_ms.write("\nNon-trainable params: {0:,}".format(total_params - trainable_params)) + 
vf_ms.write("\n----------------------------------------------------------------") + vf_ms.write("\nInput size (MB): %0.2f" % total_input_size) + vf_ms.write("\nForward/backward pass size (MB): %0.2f" % total_output_size) + vf_ms.write("\nParams size (MB): %0.2f" % total_params_size) + vf_ms.write("\nEstimated Total Size (MB): %0.2f" % total_size) + vf_ms.write("\n----------------------------------------------------------------\n") - else: - # Combine all layer graphs into one and visualize - combined_graph = nx.DiGraph() - for layer_name, layer_G in layer_graphs.items(): - combined_graph.add_nodes_from(layer_G.nodes()) - combined_graph.add_edges_from(layer_G.edges()) +def visualize_model(models, output_dir="NN features"): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # Create a directed graph + G = nx.DiGraph() + + # Add nodes and edges to the graph + for model_i in models: + for names, param in model_i.named_parameters(): + G.add_node(names, size=param.numel()) + if param.requires_grad: + G.add_edge(names, f"{names}_grad") + + # Define the output file path + output_file = os.path.join(output_dir, "model.graphml") + + # Write the graph to a GraphML file + nx.write_graphml(G, output_file) - print("Visualization complete.") + print(f"Model visualization saved as {output_file}") -# TODO - Add more print statements to indicate the progress of the script if __name__ == '__main__': print("Visualizing the model and vectorizer features...") print("This may take a while, please wait.") - if not os.path.exists('Vectorizer features'): - mkdir('Vectorizer features') + if not os.path.exists('NN features'): + mkdir('NN features') # Load the vectorizer vectorizer_path = '../Vectorizer .3n3.pkl' @@ -197,7 +156,7 @@ def add_edges_bulk(layer_names, weight_matrices, G): # Inspect the vectorizer feature_names = vectorizer.get_feature_names_out() - with open('Vectorizer features/Vectorizer features', 'w') as f: + with open('NN features/Vectorizer features.txt', 'w') as f: f.write(f"Number of features: {len(feature_names)}\n\n") f.write('\n'.join(feature_names)) @@ -214,7 +173,7 @@ def add_edges_bulk(layer_names, weight_matrices, G): plt.ylabel('Feature') # Save the plot as a vector graphic - plt.savefig('Vectorizer features/Top_90_Features.svg', format='svg') + plt.savefig('NN features/Top_90_Features.svg', format='svg') plt.show() @@ -225,10 +184,10 @@ def add_edges_bulk(layer_names, weight_matrices, G): # Save the model summary device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) - summary(model, input_size=(1, vectorizer.vocabulary_.__len__())) + save_graph(model, input_size=(1, vectorizer.vocabulary_.__len__())) # Save the model's state dictionary - with open('Vectorizer features/Model state dictionary.txt', 'w') as f: + with open('NN features/Model state dictionary.txt', 'w') as f: f.write("Model's state dictionary:\n\n") for param_tensor in model.state_dict(): f.write(f"\n{param_tensor}\t{model.state_dict()[param_tensor].size()}") @@ -237,19 +196,17 @@ def add_edges_bulk(layer_names, weight_matrices, G): dummy_input = torch.randn(1, vectorizer.vocabulary_.__len__()).to(device) # Generate the visualization - model_viz = make_dot(model(dummy_input), params=dict(model.named_parameters())) + model_viz = make_dot(model(dummy_input), params=dict(model.named_parameters()), show_attrs=True, show_saved=True) # Save the visualization to a file model_viz.format = 'png' - model_viz.render(filename='Vectorizer features/Model Visualization', format='png') + 
model_viz.render(filename='NN features/Model Visualization', format='png') # Removing the temporary files as they are no longer needed, we saved them to the desired location - if os.path.exists("Digraph.gv"): - os.remove("Digraph.gv") - if os.path.exists("Digraph.gv.png"): - os.remove("Digraph.gv.png") + if os.path.exists("NN features/Model Visualization"): + os.remove("NN features/Model Visualization") # Visualize the model visualize_model(model) - print("Model visualization and summary have been saved to the 'Vectorizer features' directory.") + print("Model visualization and summary have been saved to the 'NN features' directory.") diff --git a/requirements.txt b/requirements.txt index a22c3358..05ea87ba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,5 @@ networkx~=3.2.1 scapy~=2.5.0 seaborn~=0.13.2 torchviz~=0.0.3 -tqdm~=4.66.6 \ No newline at end of file +torchvision~=0.20.1+cu124 +torchcam~=0.4.0 \ No newline at end of file From 891bfad97048b8e9680ba96beac6c7c6995981f5 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Tue, 10 Dec 2024 14:25:03 +0400 Subject: [PATCH 06/20] Added _plot.py Now added plot.py which shows a heatmap in bargraph form of the model and best 1000 features, as well as a .html file with a 3D plot graph of losses Fixed minor bug in _study_network.py where I returned old save_graph() function which now makes the gephi file have proper node counts --- CODE/VulnScan/tools/_plot.py | 140 ++++++++++++++++++++++++++ CODE/VulnScan/tools/_study_network.py | 48 +++++---- requirements.txt | 4 +- 3 files changed, 172 insertions(+), 20 deletions(-) create mode 100644 CODE/VulnScan/tools/_plot.py diff --git a/CODE/VulnScan/tools/_plot.py b/CODE/VulnScan/tools/_plot.py new file mode 100644 index 00000000..757a0b25 --- /dev/null +++ b/CODE/VulnScan/tools/_plot.py @@ -0,0 +1,140 @@ +import joblib +import matplotlib.pyplot as plt +import numpy as np +import plotly.graph_objects as go +import seaborn as sns +import torch +from sklearn.feature_extraction.text import TfidfTransformer +from torch.utils.data import DataLoader + + +# Visualize feature importance (dummy example for visualization) and save as SVG + show the plot +def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): + # Limit the number of tokens to visualize + TOKENS = TOKENS[:1000] + FEATURE_IMPORTANCE = FEATURE_IMPORTANCE[:1000] + + plt.figure(figsize=(len(TOKENS) * 0.5, 6)) + sns.barplot(x=TOKENS, y=FEATURE_IMPORTANCE, palette="coolwarm", hue=TOKENS, legend=False) + plt.title("Feature Importance") + plt.xlabel("Tokens") + plt.ylabel("Importance") + plt.xticks(rotation=45) + plt.savefig(FILENAME, format="svg") + plt.show() # Show the plot interactively + plt.close() # Close the plot to release memory + + +# Function to visualize the loss landscape as an interactive 3D object +def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON=0.01, FILENAME="Plot.html"): + MODEL.eval() # Set model to evaluation mode + param = next(MODEL.parameters()) # Use the first parameter for landscape perturbations + param_flat = param.view(-1) + + # Define perturbation directions u and v + u = torch.randn_like(param_flat).view(param.shape).to(param.device) + v = torch.randn_like(param_flat).view(param.shape).to(param.device) + + # Normalize perturbations + u = EPSILON * u / torch.norm(u) + v = EPSILON * v / torch.norm(v) + + # Create grid + x = np.linspace(-1, 1, GRID_SIZE) + y = np.linspace(-1, 1, GRID_SIZE) + loss_values = np.zeros((GRID_SIZE, GRID_SIZE)) + + # 
Iterate through the grid to compute losses + for i, dx in enumerate(x): + for j, dy in enumerate(y): + param.data += dx * u + dy * v # Apply perturbation + loss = 0 + + # Compute loss for all batches in data loader + for batch in DATA_LOADER: + inputs, targets = batch + inputs = inputs.to(param.device) + targets = targets.to(param.device) + outputs = MODEL(inputs) + loss += CRITERION(outputs, targets).item() + + loss_values[i, j] = loss # Store the loss + param.data -= dx * u + dy * v # Revert perturbation + + # Create a meshgrid for plotting + X, Y = np.meshgrid(x, y) + + # Plot the 3D surface using Plotly + fig = go.Figure(data=[go.Surface(z=loss_values, x=X, y=Y, colorscale="Viridis")]) + fig.update_layout( + title="Loss Landscape (Interactive 3D)", + scene=dict( + xaxis_title="Perturbation in u", + yaxis_title="Perturbation in v", + zaxis_title="Loss", + ), + ) + + # Save as an interactive HTML file + fig.write_html(FILENAME) + print(f"3D loss landscape saved as {FILENAME}") + + +# Example of DataLoader for loss landscape (dummy dataset for visualization) +class DummyDataset(torch.utils.data.Dataset): + def __init__(self, num_samples=100): + self.num_samples = num_samples + self.data = torch.randn(num_samples, 10000) # Increased number of features + self.labels = torch.randint(0, 2, (num_samples,)) # Binary labels + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + return self.data[idx], self.labels[idx] + + +if __name__ == "__main__": + # Check if GPU is available + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Using device: {device}") + + # Load vectorizer (change the path to your vectorizer .pkl file) + vectorizer_path = "../Vectorizer .3n3.pkl" + model_path = "../Model SenseMini .3n3.pth" + + # Load vectorizer + print(f"Loading vectorizer from: {vectorizer_path}") + with open(vectorizer_path, "rb") as f: + vectorizer = joblib.load(f) + + # Load model and move to the appropriate device (GPU/CPU) + print(f"Loading model from: {model_path}") + model = torch.load(model_path, weights_only=False) + model.to(device) # Move model to GPU or CPU + print(model) + + # Instantiate dummy data loader + print("Creating dummy data loader...") + dummy_data_loader = DataLoader(DummyDataset(), batch_size=32) + + # Define loss criterion + print("Defining loss criterion...") + criterion: torch.nn = torch.nn.CrossEntropyLoss() + + # Visualizations + print("Creating visualizations...") + tokens: TfidfTransformer = vectorizer.get_feature_names_out() + + # Feature importance (dummy data) + NUMBER_OF_FEATURES: int = -1 # Number of features to visualize, -1 for all + # Max number of features to visualize is 3000 due to image constraints + print(f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES])+1} tokens...") + feature_importance = np.random.rand(len(tokens[:NUMBER_OF_FEATURES])) # Example random importance + visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, FILENAME="NN features/feature_importance.svg") + + # Loss landscape + print("Visualizing loss landscape - This may take a while...") + plot_loss_landscape_3d(model, dummy_data_loader, criterion, FILENAME="NN features/loss_landscape_3d.html") + + print("Completed.") diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index c74546c7..905ff532 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -10,9 +10,10 @@ import torch import torch.nn 
as nn from torchviz import make_dot +from tqdm import tqdm -def save_graph(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): +def save_data(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): def register_hook(module): def hook(modules, inputs, output): @@ -120,27 +121,38 @@ def hook(modules, inputs, output): vf_ms.write("\n----------------------------------------------------------------\n") -def visualize_model(models, output_dir="NN features"): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - +def save_graph(): # Create a directed graph G = nx.DiGraph() - # Add nodes and edges to the graph - for model_i in models: - for names, param in model_i.named_parameters(): - G.add_node(names, size=param.numel()) - if param.requires_grad: - G.add_edge(names, f"{names}_grad") + def add_edges_bulk(layer_names, weight_matrices): + """Efficiently add edges to the graph with progress tracking.""" + threshold = 0.1 # Adjust this threshold as needed + significant_weights = np.abs(weight_matrices) > threshold + rows, cols = np.where(significant_weights) + weights = weight_matrices[rows, cols] + + # Use tqdm for progress tracking + edge_count = len(rows) + with tqdm(total=edge_count, desc=f"Processing {layer_names}", unit="edges") as pbar: + for row, col, weight in zip(rows, cols, weights): + in_node = f"{layer_names}_in_{col}" + out_node = f"{layer_names}_out_{row}" + G.add_edge(in_node, out_node, weight=weight) + pbar.update(1) - # Define the output file path - output_file = os.path.join(output_dir, "model.graphml") + # Process parameters + for name, param in model.named_parameters(): + if 'weight' in name: + layer_name = name.split('.')[0] + weight_matrix = param.data.cpu().numpy() - # Write the graph to a GraphML file - nx.write_graphml(G, output_file) + # Add edges with progress bar + add_edges_bulk(layer_name, weight_matrix) - print(f"Model visualization saved as {output_file}") + # Draw the graph + print("Writing the graph to a file...") + nx.write_gexf(G, "NN features/Neural Network Nodes Graph.gexf") if __name__ == '__main__': @@ -184,7 +196,7 @@ def visualize_model(models, output_dir="NN features"): # Save the model summary device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) - save_graph(model, input_size=(1, vectorizer.vocabulary_.__len__())) + save_data(model, input_size=(1, vectorizer.vocabulary_.__len__())) # Save the model's state dictionary with open('NN features/Model state dictionary.txt', 'w') as f: @@ -207,6 +219,6 @@ def visualize_model(models, output_dir="NN features"): os.remove("NN features/Model Visualization") # Visualize the model - visualize_model(model) + save_graph() print("Model visualization and summary have been saved to the 'NN features' directory.") diff --git a/requirements.txt b/requirements.txt index 05ea87ba..f186ddf6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,5 +19,5 @@ networkx~=3.2.1 scapy~=2.5.0 seaborn~=0.13.2 torchviz~=0.0.3 -torchvision~=0.20.1+cu124 -torchcam~=0.4.0 \ No newline at end of file +plotly~=5.24.1 +tqdm~=4.66.6 \ No newline at end of file From 7e88e1a21059c549190c610acff0365601ce7c95 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Tue, 10 Dec 2024 14:47:02 +0400 Subject: [PATCH 07/20] Fixed minor bugs Added checks if directories and files existed before write/appending to them --- CODE/VulnScan/tools/_plot.py | 9 ++++++++- CODE/VulnScan/tools/_study_network.py | 6 +++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git 
a/CODE/VulnScan/tools/_plot.py b/CODE/VulnScan/tools/_plot.py
index 757a0b25..247519f9 100644
--- a/CODE/VulnScan/tools/_plot.py
+++ b/CODE/VulnScan/tools/_plot.py
@@ -1,3 +1,5 @@
+import os
+
 import joblib
 import matplotlib.pyplot as plt
 import numpy as np
@@ -96,6 +98,9 @@ def __getitem__(self, idx):
 
 if __name__ == "__main__":
     # Check if GPU is available
+    if not os.path.exists('NN features'):
+        os.mkdir('NN features')
+
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"Using device: {device}")
 
@@ -112,7 +117,9 @@ def __getitem__(self, idx):
     print(f"Loading model from: {model_path}")
     model = torch.load(model_path, weights_only=False)
     model.to(device)  # Move model to GPU or CPU
-    print(model)
+    mode = "a" if os.path.exists("NN features/Model Summary.txt") else "w"
+    with open("NN features/Model Summary.txt", mode) as f:
+        f.write(str(model))
 
     # Instantiate dummy data loader
     print("Creating dummy data loader...")
diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py
index 905ff532..38305e01 100644
--- a/CODE/VulnScan/tools/_study_network.py
+++ b/CODE/VulnScan/tools/_study_network.py
@@ -64,7 +64,6 @@ def hook(modules, inputs, output):
 
     # batch_size of 2 for batch norm
     x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size]
-    # print(type(x[0]))
 
     # create properties
     summaries = OrderedDict()
@@ -74,14 +73,15 @@ def hook(modules, inputs, output):
     model_to_use.apply(register_hook)
 
     # make a forward pass
-    # print(x.shape)
     model_to_use(*x)
 
     # remove these hooks
     for h in hooks:
         h.remove()
 
-    with open('NN features/Model Summary.txt', 'w') as vf_ms:
+    # Save the summary
+    mode = "a" if os.path.exists("NN features/Model Summary.txt") else "w"
+    with open('NN features/Model Summary.txt', mode) as vf_ms:
         vf_ms.write("----------------------------------------------------------------\n")
         line_new = "{:>20} {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #")
         vf_ms.write(f"{line_new}\n")

From 7cb6df99035ec3700a6693fed91253589460627f Mon Sep 17 00:00:00 2001
From: DefinetlyNotAI
Date: Tue, 10 Dec 2024 21:44:12 +0400
Subject: [PATCH 08/20] Added many study features

Merged _plot.py into _study_network.py and added activation, weight
distribution, and t-SNE plots. Fixed some bugs, made sure all sample data
is either genuine or clearly synthetic, and modified config.ini to allow
paths to be set there.
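
A minimal sketch of how the new config-driven paths might be read (this
patch imports ConfigParser into _study_network.py, but the config.ini
diff is truncated below, so the section and key names here are
assumptions; the fallbacks mirror the hard-coded paths used earlier):

    from configparser import ConfigParser

    config = ConfigParser()
    config.read('config.ini')

    # Hypothetical section/key names - substitute whatever config.ini
    # actually defines for the VulnScan study tools.
    model_path = config.get('VulnScan.study Settings', 'model_path',
                            fallback='../Model SenseMini .3n3.pth')
    vectorizer_path = config.get('VulnScan.study Settings', 'vectorizer_path',
                                 fallback='../Vectorizer .3n3.pkl')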
--- .idea/Logicytics.iml | 1 + CODE/VulnScan/tools/_plot.py | 147 ---------- CODE/VulnScan/tools/_study_network.py | 407 ++++++++++++++++++++++++-- CODE/VulnScan/v3/_train.py | 1 + CODE/config.ini | 58 ++-- 5 files changed, 414 insertions(+), 200 deletions(-) delete mode 100644 CODE/VulnScan/tools/_plot.py diff --git a/.idea/Logicytics.iml b/.idea/Logicytics.iml index 9d371a5c..235b40bc 100644 --- a/.idea/Logicytics.iml +++ b/.idea/Logicytics.iml @@ -35,6 +35,7 @@ diff --git a/CODE/VulnScan/tools/_plot.py b/CODE/VulnScan/tools/_plot.py deleted file mode 100644 index 247519f9..00000000 --- a/CODE/VulnScan/tools/_plot.py +++ /dev/null @@ -1,147 +0,0 @@ -import os - -import joblib -import matplotlib.pyplot as plt -import numpy as np -import plotly.graph_objects as go -import seaborn as sns -import torch -from sklearn.feature_extraction.text import TfidfTransformer -from torch.utils.data import DataLoader - - -# Visualize feature importance (dummy example for visualization) and save as SVG + show the plot -def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): - # Limit the number of tokens to visualize - TOKENS = TOKENS[:1000] - FEATURE_IMPORTANCE = FEATURE_IMPORTANCE[:1000] - - plt.figure(figsize=(len(TOKENS) * 0.5, 6)) - sns.barplot(x=TOKENS, y=FEATURE_IMPORTANCE, palette="coolwarm", hue=TOKENS, legend=False) - plt.title("Feature Importance") - plt.xlabel("Tokens") - plt.ylabel("Importance") - plt.xticks(rotation=45) - plt.savefig(FILENAME, format="svg") - plt.show() # Show the plot interactively - plt.close() # Close the plot to release memory - - -# Function to visualize the loss landscape as an interactive 3D object -def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON=0.01, FILENAME="Plot.html"): - MODEL.eval() # Set model to evaluation mode - param = next(MODEL.parameters()) # Use the first parameter for landscape perturbations - param_flat = param.view(-1) - - # Define perturbation directions u and v - u = torch.randn_like(param_flat).view(param.shape).to(param.device) - v = torch.randn_like(param_flat).view(param.shape).to(param.device) - - # Normalize perturbations - u = EPSILON * u / torch.norm(u) - v = EPSILON * v / torch.norm(v) - - # Create grid - x = np.linspace(-1, 1, GRID_SIZE) - y = np.linspace(-1, 1, GRID_SIZE) - loss_values = np.zeros((GRID_SIZE, GRID_SIZE)) - - # Iterate through the grid to compute losses - for i, dx in enumerate(x): - for j, dy in enumerate(y): - param.data += dx * u + dy * v # Apply perturbation - loss = 0 - - # Compute loss for all batches in data loader - for batch in DATA_LOADER: - inputs, targets = batch - inputs = inputs.to(param.device) - targets = targets.to(param.device) - outputs = MODEL(inputs) - loss += CRITERION(outputs, targets).item() - - loss_values[i, j] = loss # Store the loss - param.data -= dx * u + dy * v # Revert perturbation - - # Create a meshgrid for plotting - X, Y = np.meshgrid(x, y) - - # Plot the 3D surface using Plotly - fig = go.Figure(data=[go.Surface(z=loss_values, x=X, y=Y, colorscale="Viridis")]) - fig.update_layout( - title="Loss Landscape (Interactive 3D)", - scene=dict( - xaxis_title="Perturbation in u", - yaxis_title="Perturbation in v", - zaxis_title="Loss", - ), - ) - - # Save as an interactive HTML file - fig.write_html(FILENAME) - print(f"3D loss landscape saved as {FILENAME}") - - -# Example of DataLoader for loss landscape (dummy dataset for visualization) -class DummyDataset(torch.utils.data.Dataset): - def __init__(self, num_samples=100): - 
self.num_samples = num_samples - self.data = torch.randn(num_samples, 10000) # Increased number of features - self.labels = torch.randint(0, 2, (num_samples,)) # Binary labels - - def __len__(self): - return self.num_samples - - def __getitem__(self, idx): - return self.data[idx], self.labels[idx] - - -if __name__ == "__main__": - # Check if GPU is available - if not os.path.exists('NN features'): - os.mkdir('NN features') - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print(f"Using device: {device}") - - # Load vectorizer (change the path to your vectorizer .pkl file) - vectorizer_path = "../Vectorizer .3n3.pkl" - model_path = "../Model SenseMini .3n3.pth" - - # Load vectorizer - print(f"Loading vectorizer from: {vectorizer_path}") - with open(vectorizer_path, "rb") as f: - vectorizer = joblib.load(f) - - # Load model and move to the appropriate device (GPU/CPU) - print(f"Loading model from: {model_path}") - model = torch.load(model_path, weights_only=False) - model.to(device) # Move model to GPU or CPU - mode = "a" if os.path.exists("NN features/Model Summary.txt") else "w" - with open("NN features/Model Summary.txt", mode) as f: - f.write(str(model)) - - # Instantiate dummy data loader - print("Creating dummy data loader...") - dummy_data_loader = DataLoader(DummyDataset(), batch_size=32) - - # Define loss criterion - print("Defining loss criterion...") - criterion: torch.nn = torch.nn.CrossEntropyLoss() - - # Visualizations - print("Creating visualizations...") - tokens: TfidfTransformer = vectorizer.get_feature_names_out() - - # Feature importance (dummy data) - NUMBER_OF_FEATURES: int = -1 # Number of features to visualize, -1 for all - # Max number of features to visualize is 3000 due to image constraints - print(f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES])+1} tokens...") - feature_importance = np.random.rand(len(tokens[:NUMBER_OF_FEATURES])) # Example random importance - visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, FILENAME="NN features/feature_importance.svg") - - # Loss landscape - print("Visualizing loss landscape - This may take a while...") - plot_loss_landscape_3d(model, dummy_data_loader, criterion, FILENAME="NN features/loss_landscape_3d.html") - - print("Completed.") diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index 38305e01..1b04e0ef 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -1,18 +1,300 @@ +import os import os.path +import random from collections import OrderedDict +from configparser import ConfigParser from os import mkdir import joblib import matplotlib.pyplot as plt import networkx as nx import numpy as np +import plotly.graph_objects as go import seaborn as sns import torch import torch.nn as nn +from faker import Faker +from sklearn.manifold import TSNE +from torch.utils.data import DataLoader, TensorDataset from torchviz import make_dot from tqdm import tqdm +# Example of DataLoader for loss landscape (dummy dataset for visualization) +class DummyDataset(torch.utils.data.Dataset): + def __init__(self, num_samples=100, input_dim=10000): + self.num_samples = num_samples + self.input_dim = input_dim + self.data = [] + self.labels = [] + faker = Faker() + for _ in range(num_samples): + if random.random() < 0.05: # 5% chance to include sensitive data + self.data.append(f"Name: {faker.name()}, SSN: {faker.ssn()}, Address: {faker.address()}") + 
self.labels.append(1) # Label as sensitive + else: + self.data.append(faker.text(max_nb_chars=100)) # Non-sensitive data + self.labels.append(0) # Label as non-sensitive + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + data = self.data[idx] + label = self.labels[idx] + # Convert data to tensor of ASCII values and pad to input_dim + data_tensor = torch.tensor([ord(c) for c in data], dtype=torch.float32) + if len(data_tensor) < self.input_dim: + padding = torch.zeros(self.input_dim - len(data_tensor)) + data_tensor = torch.cat((data_tensor, padding)) + else: + data_tensor = data_tensor[:self.input_dim] + label_tensor = torch.tensor(label, dtype=torch.long) + return data_tensor, label_tensor + + +def load_data(text_data, vectorizer_to_load): + # Vectorize the text data + X = vectorizer_to_load.transform(text_data) + # Create a dummy label for visualization (replace with real labels if available) + y = np.zeros(len(text_data)) + # Convert to torch tensors + X_tensor = torch.tensor(X.toarray(), dtype=torch.float32) + y_tensor = torch.tensor(y, dtype=torch.long) + dataset = TensorDataset(X_tensor, y_tensor) + return DataLoader(dataset, batch_size=32, shuffle=True) + + +def visualize_weight_distribution(model_to_load): + # Access weights of the first layer + weights = model_to_load[0].weight.detach().cpu().numpy() # Move tensor to CPU before conversion to numpy + plt.hist(weights.flatten(), bins=50) + plt.title("Weight Distribution - First Layer") + plt.xlabel("Weight Value") + plt.ylabel("Frequency") + plt.savefig("NN features/Weight Distribution.png") + plt.close() + + +def visualize_activations(model_to_load, input_tensor): + # Check the device of the model + device_va = next(model_to_load.parameters()).device + + # Move the input tensor to the same device as the model + input_tensor = input_tensor.to(device_va) + + activations = [] + + # noinspection PyUnusedLocal + def hook_fn(module, inputx, output): + # Hook function to extract intermediate layer activations + activations.append(output) + + model_to_load[0].register_forward_hook(hook_fn) # Register hook on first layer + + # Perform a forward pass + _ = model_to_load(input_tensor) + activation = activations[0].detach().cpu().numpy() # Move activations to CPU + + # Plot activations as a bar chart + plt.figure(figsize=(10, 6)) + plt.bar(range(len(activation[0])), activation[0]) + plt.title("Activation Values - First Layer") + plt.xlabel("Neuron Index") + plt.ylabel("Activation Value") + plt.savefig("NN features/Visualize Activation.png") + plt.close() + + +def visualize_tsne(model_to_load, dataloader): + # Get the device of the model + device_va = next(model_to_load.parameters()).device + + model_to_load.eval() # Set the model to evaluation mode + + features = [] + labels = [] + + with torch.no_grad(): + for data, target in dataloader: + # Move data and target to the same device as the model + data, target = data.to(device_va), target.to(device_va) + + # Extract features (output of the model) + output = model_to_load(data) + features.append(output.cpu().numpy()) # Move output to CPU for concatenation + labels.append(target.cpu().numpy()) # Move target to CPU for concatenation + + # Stack all batches + features = np.vstack(features) + labels = np.hstack(labels) + + # Determine suitable perplexity + num_samples = features.shape[0] + perplexity = min(30, num_samples - 1) # Ensure perplexity < num_samples + + # Apply t-SNE + tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity) + reduced_features = 
tsne.fit_transform(features) + + # Plot the t-SNE results + plt.figure(figsize=(10, 8)) + scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=labels, cmap='viridis', alpha=0.7) + plt.colorbar(scatter, label="Class") + plt.title("t-SNE Visualization of Features") + plt.xlabel("t-SNE Dimension 1") + plt.ylabel("t-SNE Dimension 2") + plt.savefig("NN features/Visualize t-SNE.png") + plt.close() + + +# Main function to run all visualizations +def plot_many_graphs(): + print("Starting synthetic data generation...") + # Load data + faker = Faker() + + # Generate sensitive examples + sensitive_data = [ + f"Name: {faker.name()}, SSN: {faker.ssn()}, Address: {faker.address()}", + f"Credit Card: {faker.credit_card_number()}, Expiry: {faker.credit_card_expire()}, CVV: {faker.credit_card_security_code()}", + f"Patient: {faker.name()}, Condition: {faker.text(max_nb_chars=20)}", + f"Password: {faker.password()}", + f"Email: {faker.email()}", + f"Phone: {faker.phone_number()}", + f"Medical Record: {faker.md5()}", + f"Username: {faker.user_name()}", + f"IP: {faker.ipv4()}", + ] + + # Generate non-sensitive examples + non_sensitive_data = [ + faker.text(max_nb_chars=50) for _ in range(50000) + ] + + data_text = non_sensitive_data + (sensitive_data * 15) + random.shuffle(data_text) + print("Loaded data for visualization.") + dataloader = load_data(data_text, vectorizer) + + # Visualizations + print("Creating visualizations...") + visualize_weight_distribution(model) + + # For activations, use a sample from the dataloader + print("Creating activation visualizations...") + sample_input = next(iter(dataloader))[0] + visualize_activations(model, sample_input) + + print("Creating t-SNE visualization - May take a long time...") + visualize_tsne(model, dataloader) + + print("Completed.") + + +# Visualize feature importance (dummy example for visualization) and save as SVG + show the plot +def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): + # Limit the number of tokens to visualize + TOKENS = TOKENS[:1000] + FEATURE_IMPORTANCE = FEATURE_IMPORTANCE[:1000] + + plt.figure(figsize=(len(TOKENS) * 0.5, 6)) + sns.barplot(x=TOKENS, y=FEATURE_IMPORTANCE, palette="coolwarm", hue=TOKENS, legend=False) + plt.title("Feature Importance") + plt.xlabel("Tokens") + plt.ylabel("Importance") + plt.xticks(rotation=45) + plt.savefig(FILENAME, format="svg") + plt.close() # Close the plot to release memory + + +# Function to visualize the loss landscape as an interactive 3D object +def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON=0.01, FILENAME="Plot.html"): + MODEL.eval() # Set model to evaluation mode + param = next(MODEL.parameters()) # Use the first parameter for landscape perturbations + param_flat = param.view(-1) + + # Define perturbation directions u and v + u = torch.randn_like(param_flat).view(param.shape).to(param.device) + v = torch.randn_like(param_flat).view(param.shape).to(param.device) + + # Normalize perturbations + u = EPSILON * u / torch.norm(u) + v = EPSILON * v / torch.norm(v) + + # Create grid + x = np.linspace(-1, 1, GRID_SIZE) + y = np.linspace(-1, 1, GRID_SIZE) + loss_values = np.zeros((GRID_SIZE, GRID_SIZE)) + + # Iterate through the grid to compute losses + for i, dx in enumerate(x): + print(f"Computing loss for row {i+1}/{GRID_SIZE}...") + for j, dy in enumerate(y): + print(f" Computing loss for column {j+1}/{GRID_SIZE}...") + param.data += dx * u + dy * v # Apply perturbation + loss = 0 + + # Compute loss for all batches in 
data loader + for batch in DATA_LOADER: + print(f" Computing loss for batch: {batch}...") + inputs, targets = batch + inputs = inputs.to(param.device) + targets = targets.to(param.device) + outputs = MODEL(inputs) + loss += CRITERION(outputs, targets).item() + + loss_values[i, j] = loss # Store the loss + param.data -= dx * u + dy * v # Revert perturbation + + # Create a meshgrid for plotting + X, Y = np.meshgrid(x, y) + + # Plot the 3D surface using Plotly + fig = go.Figure(data=[go.Surface(z=loss_values, x=X, y=Y, colorscale="Viridis")]) + fig.update_layout( + title="Loss Landscape (Interactive 3D)", + scene=dict( + xaxis_title="Perturbation in u", + yaxis_title="Perturbation in v", + zaxis_title="Loss", + ), + ) + + # Save as an interactive HTML file + fig.write_html(FILENAME) + print(f"3D loss landscape saved as {FILENAME}") + + +def main_plot(): + # Instantiate data loader + print("Creating dummy data loader...") + dummy_data_loader = DataLoader(DummyDataset(), batch_size=32) + + # Define loss criterion + print("Defining loss criterion...") + criterion = torch.nn.CrossEntropyLoss() + + # Visualizations + print("Creating visualizations...") + tokens = vectorizer.get_feature_names_out() + + # Feature importance + # Max number of features to visualize is 3000 due to image constraints + print(f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES])+1} tokens...") + feature_importance = np.random.rand(len(tokens[:NUMBER_OF_FEATURES])) # Example random importance + visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, FILENAME="NN features/feature_importance.svg") + + # Loss landscape + print("Visualizing loss landscape - This may take a while...") + plot_loss_landscape_3d(model, dummy_data_loader, criterion, FILENAME="NN features/loss_landscape_3d.html") + + # Set model to evaluation mode, and plot many graphs + print("Setting model to evaluation mode...") + model.eval() # Set the model to evaluation mode + plot_many_graphs() + + def save_data(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): def register_hook(module): @@ -155,25 +437,25 @@ def add_edges_bulk(layer_names, weight_matrices): nx.write_gexf(G, "NN features/Neural Network Nodes Graph.gexf") -if __name__ == '__main__': +def setup_environment(): print("Visualizing the model and vectorizer features...") print("This may take a while, please wait.") if not os.path.exists('NN features'): mkdir('NN features') - # Load the vectorizer - vectorizer_path = '../Vectorizer .3n3.pkl' - vectorizer = joblib.load(vectorizer_path) - # Inspect the vectorizer - feature_names = vectorizer.get_feature_names_out() - with open('NN features/Vectorizer features.txt', 'w') as f: - f.write(f"Number of features: {len(feature_names)}\n\n") - f.write('\n'.join(feature_names)) +def load_vectorizer(): + vectorizer_load = joblib.load(vectorizer_path) + feature_names = vectorizer_load.get_feature_names_out() + with open('NN features/Vectorizer features.txt', 'w') as file: + file.write(f"Number of features: {len(feature_names)}\n\n") + file.write('\n'.join(feature_names)) + return vectorizer_load + - # Visualize the top 90 features - top_n = 90 +def visualize_top_features(top_n=90): + feature_names = vectorizer.get_feature_names_out() sorted_indices = vectorizer.idf_.argsort()[:top_n] top_features = [feature_names[i] for i in sorted_indices] top_idf_scores = vectorizer.idf_[sorted_indices] @@ -186,39 +468,102 @@ def add_edges_bulk(layer_names, weight_matrices): # Save the plot as a vector 
graphic plt.savefig('NN features/Top_90_Features.svg', format='svg') - plt.show() - # Load the model - model_path = '../Model SenseMini .3n3.pth' - model = torch.load(model_path, weights_only=False) - # Save the model summary - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - model.to(device) - save_data(model, input_size=(1, vectorizer.vocabulary_.__len__())) +def load_model(): + device_load = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model_load = torch.load(model_path, weights_only=False) + model_load.to(device_load) + return model_load, device_load + - # Save the model's state dictionary - with open('NN features/Model state dictionary.txt', 'w') as f: - f.write("Model's state dictionary:\n\n") +def save_model_state_dict(): + with open('NN features/Model state dictionary.txt', 'w') as file: + file.write("Model's state dictionary:\n\n") for param_tensor in model.state_dict(): - f.write(f"\n{param_tensor}\t{model.state_dict()[param_tensor].size()}") + file.write(f"\n{param_tensor}\t{model.state_dict()[param_tensor].size()}") - # Create a dummy input tensor with the appropriate size - dummy_input = torch.randn(1, vectorizer.vocabulary_.__len__()).to(device) - # Generate the visualization +def generate_model_visualization(): + dummy_input = torch.randn(1, vectorizer.vocabulary_.__len__()).to(device) model_viz = make_dot(model(dummy_input), params=dict(model.named_parameters()), show_attrs=True, show_saved=True) - - # Save the visualization to a file model_viz.format = 'png' model_viz.render(filename='NN features/Model Visualization', format='png') - # Removing the temporary files as they are no longer needed, we saved them to the desired location + +def cleanup_temp_files(): if os.path.exists("NN features/Model Visualization"): os.remove("NN features/Model Visualization") - # Visualize the model - save_graph() +def model_summary(): + mode = "a" if os.path.exists("NN features/Model Summary.txt") else "w" + with open("NN features/Model Summary.txt", mode) as file: + file.write(str(model)) + + +if __name__ == '__main__': + # Print the welcome message + print("===========================================================================================") + print("= This script will visualize the features of the model and vectorizer. =") + print("= Please ensure that the model and vectorizer files are present in the specified paths. =") + print("= The visualization will be saved in the 'NN features' directory. =") + print("= This script will take a while to run, please be patient. =") + print("===========================================================================================") + + # Read the config file + print("\n\nReading config file and setting up...") + config = ConfigParser() + config.read('../../config.ini') + + setup_environment() + + # Load the paths from the config file + vectorizer_path = config.get('VulnScan.study Settings', 'vectorizer_path') + model_path = config.get('VulnScan.study Settings', 'model_path') + NUMBER_OF_FEATURES = config.get('VulnScan.study Settings', 'number_of_features') + + # Check if the paths exist + if not os.path.exists(vectorizer_path): + print(f"Vectorizer file not found. Please double check the path {vectorizer_path}.") + exit(1) + if not os.path.exists(model_path): + print(f"Model file not found. 
Please double check the path {model_path}.") + exit(1) + + # Load the vectorizer and model + vectorizer = load_vectorizer() + visualize_top_features() + model, device = load_model() + # Save the model summary, state dictionary, and visualization + save_data(model, input_size=(1, vectorizer.vocabulary_.__len__())) + save_model_state_dict() + generate_model_visualization() + cleanup_temp_files() + save_graph() print("Model visualization and summary have been saved to the 'NN features' directory.") + + # Check if GPU is available + if not os.path.exists('NN features'): + os.mkdir('NN features') + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Using device: {device}") + + # Load vectorizer (change the path to your vectorizer .pkl file) + vectorizer_path = "../Vectorizer .3n3.pkl" + model_path = "../Model SenseMini .3n3.pth" + + # Load vectorizer + print(f"Reloading vectorizer from: {vectorizer_path}") + with open(vectorizer_path, "rb") as f: + vectorizer = joblib.load(f) + + # Load model and move to the appropriate device (GPU/CPU) + print(f"Reloading model from: {model_path}") + model = torch.load(model_path, weights_only=False) + model.to(device) # Move model to GPU or CPU + + model_summary() + main_plot() diff --git a/CODE/VulnScan/v3/_train.py b/CODE/VulnScan/v3/_train.py index c9fa7ee7..1600fee6 100644 --- a/CODE/VulnScan/v3/_train.py +++ b/CODE/VulnScan/v3/_train.py @@ -178,6 +178,7 @@ def select_model_from_traditional(model_name: str, logger.error(f"Invalid model name: {model_name}") exit(1) + def train_traditional_model(model_name: str, epochs: int, save_model_path: str): diff --git a/CODE/config.ini b/CODE/config.ini index f24190ce..cbc5986f 100644 --- a/CODE/config.ini +++ b/CODE/config.ini @@ -27,29 +27,10 @@ timeout = 10 ################################################### -[VulnScan.train Settings] -# The following settings are for the Train module for training models -# NeuralNetwork seems to be the best choice for this task -# Options: "NeuralNetwork", "LogReg", -# "RandomForest", "ExtraTrees", "GBM", -# "XGBoost", "DecisionTree", "NaiveBayes" -model_name = NeuralNetwork -# General Training Parameters -epochs = 10 -batch_size = 32 -learning_rate = 0.001 -use_cuda = true - -# Paths to train and save data -train_data_path = C:\Users\Hp\Desktop\Model Tests\Model Data\GeneratedData -# If all models are to be trained, this is the path to save all models, -# and will be appended with the model codename and follow naming convention -save_model_path = C:\Users\Hp\Desktop\Model Tests\Model SenseMini - [VulnScan.generate Settings] # The following settings are for the Generate module for fake training data extensions = .txt, .log, .md, .csv, .json, .xml, .html, .yaml, .ini, .pdf, .docx, .xlsx, .pptx -save_path = C:\Users\Hp\Desktop\Model Tests\Generated Data +save_path = PATH # Options include: # 'Sense' - Generates 50k files, each 25KB in size. # 'SenseNano' - Generates 5 files, each 5KB in size. 
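[Editor's sketch] The 'Sense' and 'SenseNano' presets described above map directly to a file count and a per-file size. A minimal, hypothetical sketch of how the Generate module could realize them with Faker — the PRESETS table and generate_fake_files helper are illustrations based on the comments above, not the module's actual API:

import os
from faker import Faker

# Hypothetical preset table taken from the config comments:
# 'Sense' -> 50k files of 25KB each, 'SenseNano' -> 5 files of 5KB each.
PRESETS = {"Sense": (50_000, 25 * 1024), "SenseNano": (5, 5 * 1024)}

def generate_fake_files(save_path: str, preset: str = "SenseNano", extension: str = ".txt"):
    faker = Faker()
    num_files, max_size = PRESETS[preset]
    os.makedirs(save_path, exist_ok=True)
    for i in range(num_files):
        content = ""
        # Append random text until the file reaches the preset size
        while len(content.encode("utf-8")) < max_size:
            content += faker.text(max_nb_chars=200) + "\n"
        with open(os.path.join(save_path, f"file_{i}{extension}"), "w", encoding="utf-8") as f:
            f.write(content)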
@@ -79,11 +60,44 @@ partial_sensitive_chance = 0.2
 # Use the vectorizer supplied for any v3 model on SenseMini
 # The path to the data to vectorize, either a file or a directory
-data_path = C:\Users\Hp\Desktop\Model Tests\Model Data\GeneratedData
+data_path = PATH
 # The path to save the vectorized data - It will automatically be appended '\Vectorizer.pkl'
 # Make sure the path is a directory, and it exists
-output_path = C:\Users\Hp\Desktop\Model Tests\Model Sense - Vectorizer
+output_path = PATH
 # Vectorizer to use, options include:
 # tfidf or count - The code for the training only supports tfidf - we advise to use tfidf
 vectorizer_type = tfidf
+
+[VulnScan.train Settings]
+# The following settings are for the Train module for training models
+# NeuralNetwork seems to be the best choice for this task
+# Options: "NeuralNetwork", "LogReg",
+# "RandomForest", "ExtraTrees", "GBM",
+# "XGBoost", "DecisionTree", "NaiveBayes"
+model_name = NeuralNetwork
+# General Training Parameters
+epochs = 10
+batch_size = 32
+learning_rate = 0.001
+use_cuda = true
+
+# Paths to train and save data
+train_data_path = PATH
+# If all models are to be trained, this is the path to save all models;
+# it will be appended with the model codename, following the naming convention
+save_model_path = PATH
+
+[VulnScan.study Settings]
+# Here are the basics of the study module
+# It generates graphs and data that may help in understanding the model
+# Everything is available online, pre-studied, so running it is not necessary,
+# but it is useful for understanding the model locally
+# All files will be saved here; this cannot be changed. PATH is "NN features/"
+
+# These are the paths to the model and the vectorizer
+model_path = PATH
+vectorizer_path = PATH
+# Number of features to visualise in the SVG bar graph; the maximum is 3000 due to limitations
+# Setting -1 will visualise the first 3000 features. The bar is a color-gradient heatmap.
+number_of_features = -1

From 4587dc87ff76f309b4491a4b110a5094ef0a723b Mon Sep 17 00:00:00 2001
From: DefinetlyNotAI
Date: Tue, 10 Dec 2024 21:44:12 +0400
Subject: [PATCH 09/20] Added many study features

Also merged _plot.py into _study_network.py; added the activation,
weight-distribution, and t-SNE plots; fixed some bugs; made sure all data is
either genuine or synthetic; and modified config.ini so the model and
vectorizer paths can be set there.
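[Editor's sketch] The DummyDataset added below encodes each synthetic string as a fixed-length tensor of character code points, zero-padded or truncated to input_dim. A minimal sketch of that encoding, using a hypothetical encode() helper and a toy input_dim of 4 for brevity:

import torch

def encode(text: str, input_dim: int) -> torch.Tensor:
    # Map each character to its code point, as in DummyDataset.__getitem__
    t = torch.tensor([ord(c) for c in text], dtype=torch.float32)
    if len(t) < input_dim:
        # Zero-pad short strings up to input_dim
        t = torch.cat((t, torch.zeros(input_dim - len(t))))
    return t[:input_dim]  # truncate longer strings

print(encode("Hi", 4))  # tensor([ 72., 105.,   0.,   0.])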
--- .idea/Logicytics.iml | 1 + CODE/VulnScan/tools/_plot.py | 147 ---------- CODE/VulnScan/tools/_study_network.py | 407 ++++++++++++++++++++++++-- CODE/VulnScan/v3/_train.py | 1 + CODE/config.ini | 58 ++-- 5 files changed, 414 insertions(+), 200 deletions(-) delete mode 100644 CODE/VulnScan/tools/_plot.py diff --git a/.idea/Logicytics.iml b/.idea/Logicytics.iml index 9d371a5c..235b40bc 100644 --- a/.idea/Logicytics.iml +++ b/.idea/Logicytics.iml @@ -35,6 +35,7 @@ diff --git a/CODE/VulnScan/tools/_plot.py b/CODE/VulnScan/tools/_plot.py deleted file mode 100644 index 247519f9..00000000 --- a/CODE/VulnScan/tools/_plot.py +++ /dev/null @@ -1,147 +0,0 @@ -import os - -import joblib -import matplotlib.pyplot as plt -import numpy as np -import plotly.graph_objects as go -import seaborn as sns -import torch -from sklearn.feature_extraction.text import TfidfTransformer -from torch.utils.data import DataLoader - - -# Visualize feature importance (dummy example for visualization) and save as SVG + show the plot -def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): - # Limit the number of tokens to visualize - TOKENS = TOKENS[:1000] - FEATURE_IMPORTANCE = FEATURE_IMPORTANCE[:1000] - - plt.figure(figsize=(len(TOKENS) * 0.5, 6)) - sns.barplot(x=TOKENS, y=FEATURE_IMPORTANCE, palette="coolwarm", hue=TOKENS, legend=False) - plt.title("Feature Importance") - plt.xlabel("Tokens") - plt.ylabel("Importance") - plt.xticks(rotation=45) - plt.savefig(FILENAME, format="svg") - plt.show() # Show the plot interactively - plt.close() # Close the plot to release memory - - -# Function to visualize the loss landscape as an interactive 3D object -def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON=0.01, FILENAME="Plot.html"): - MODEL.eval() # Set model to evaluation mode - param = next(MODEL.parameters()) # Use the first parameter for landscape perturbations - param_flat = param.view(-1) - - # Define perturbation directions u and v - u = torch.randn_like(param_flat).view(param.shape).to(param.device) - v = torch.randn_like(param_flat).view(param.shape).to(param.device) - - # Normalize perturbations - u = EPSILON * u / torch.norm(u) - v = EPSILON * v / torch.norm(v) - - # Create grid - x = np.linspace(-1, 1, GRID_SIZE) - y = np.linspace(-1, 1, GRID_SIZE) - loss_values = np.zeros((GRID_SIZE, GRID_SIZE)) - - # Iterate through the grid to compute losses - for i, dx in enumerate(x): - for j, dy in enumerate(y): - param.data += dx * u + dy * v # Apply perturbation - loss = 0 - - # Compute loss for all batches in data loader - for batch in DATA_LOADER: - inputs, targets = batch - inputs = inputs.to(param.device) - targets = targets.to(param.device) - outputs = MODEL(inputs) - loss += CRITERION(outputs, targets).item() - - loss_values[i, j] = loss # Store the loss - param.data -= dx * u + dy * v # Revert perturbation - - # Create a meshgrid for plotting - X, Y = np.meshgrid(x, y) - - # Plot the 3D surface using Plotly - fig = go.Figure(data=[go.Surface(z=loss_values, x=X, y=Y, colorscale="Viridis")]) - fig.update_layout( - title="Loss Landscape (Interactive 3D)", - scene=dict( - xaxis_title="Perturbation in u", - yaxis_title="Perturbation in v", - zaxis_title="Loss", - ), - ) - - # Save as an interactive HTML file - fig.write_html(FILENAME) - print(f"3D loss landscape saved as {FILENAME}") - - -# Example of DataLoader for loss landscape (dummy dataset for visualization) -class DummyDataset(torch.utils.data.Dataset): - def __init__(self, num_samples=100): - 
self.num_samples = num_samples - self.data = torch.randn(num_samples, 10000) # Increased number of features - self.labels = torch.randint(0, 2, (num_samples,)) # Binary labels - - def __len__(self): - return self.num_samples - - def __getitem__(self, idx): - return self.data[idx], self.labels[idx] - - -if __name__ == "__main__": - # Check if GPU is available - if not os.path.exists('NN features'): - os.mkdir('NN features') - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print(f"Using device: {device}") - - # Load vectorizer (change the path to your vectorizer .pkl file) - vectorizer_path = "../Vectorizer .3n3.pkl" - model_path = "../Model SenseMini .3n3.pth" - - # Load vectorizer - print(f"Loading vectorizer from: {vectorizer_path}") - with open(vectorizer_path, "rb") as f: - vectorizer = joblib.load(f) - - # Load model and move to the appropriate device (GPU/CPU) - print(f"Loading model from: {model_path}") - model = torch.load(model_path, weights_only=False) - model.to(device) # Move model to GPU or CPU - mode = "a" if os.path.exists("NN features/Model Summary.txt") else "w" - with open("NN features/Model Summary.txt", mode) as f: - f.write(str(model)) - - # Instantiate dummy data loader - print("Creating dummy data loader...") - dummy_data_loader = DataLoader(DummyDataset(), batch_size=32) - - # Define loss criterion - print("Defining loss criterion...") - criterion: torch.nn = torch.nn.CrossEntropyLoss() - - # Visualizations - print("Creating visualizations...") - tokens: TfidfTransformer = vectorizer.get_feature_names_out() - - # Feature importance (dummy data) - NUMBER_OF_FEATURES: int = -1 # Number of features to visualize, -1 for all - # Max number of features to visualize is 3000 due to image constraints - print(f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES])+1} tokens...") - feature_importance = np.random.rand(len(tokens[:NUMBER_OF_FEATURES])) # Example random importance - visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, FILENAME="NN features/feature_importance.svg") - - # Loss landscape - print("Visualizing loss landscape - This may take a while...") - plot_loss_landscape_3d(model, dummy_data_loader, criterion, FILENAME="NN features/loss_landscape_3d.html") - - print("Completed.") diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index 38305e01..1b04e0ef 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -1,18 +1,300 @@ +import os import os.path +import random from collections import OrderedDict +from configparser import ConfigParser from os import mkdir import joblib import matplotlib.pyplot as plt import networkx as nx import numpy as np +import plotly.graph_objects as go import seaborn as sns import torch import torch.nn as nn +from faker import Faker +from sklearn.manifold import TSNE +from torch.utils.data import DataLoader, TensorDataset from torchviz import make_dot from tqdm import tqdm +# Example of DataLoader for loss landscape (dummy dataset for visualization) +class DummyDataset(torch.utils.data.Dataset): + def __init__(self, num_samples=100, input_dim=10000): + self.num_samples = num_samples + self.input_dim = input_dim + self.data = [] + self.labels = [] + faker = Faker() + for _ in range(num_samples): + if random.random() < 0.05: # 5% chance to include sensitive data + self.data.append(f"Name: {faker.name()}, SSN: {faker.ssn()}, Address: {faker.address()}") + 
self.labels.append(1) # Label as sensitive + else: + self.data.append(faker.text(max_nb_chars=100)) # Non-sensitive data + self.labels.append(0) # Label as non-sensitive + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + data = self.data[idx] + label = self.labels[idx] + # Convert data to tensor of ASCII values and pad to input_dim + data_tensor = torch.tensor([ord(c) for c in data], dtype=torch.float32) + if len(data_tensor) < self.input_dim: + padding = torch.zeros(self.input_dim - len(data_tensor)) + data_tensor = torch.cat((data_tensor, padding)) + else: + data_tensor = data_tensor[:self.input_dim] + label_tensor = torch.tensor(label, dtype=torch.long) + return data_tensor, label_tensor + + +def load_data(text_data, vectorizer_to_load): + # Vectorize the text data + X = vectorizer_to_load.transform(text_data) + # Create a dummy label for visualization (replace with real labels if available) + y = np.zeros(len(text_data)) + # Convert to torch tensors + X_tensor = torch.tensor(X.toarray(), dtype=torch.float32) + y_tensor = torch.tensor(y, dtype=torch.long) + dataset = TensorDataset(X_tensor, y_tensor) + return DataLoader(dataset, batch_size=32, shuffle=True) + + +def visualize_weight_distribution(model_to_load): + # Access weights of the first layer + weights = model_to_load[0].weight.detach().cpu().numpy() # Move tensor to CPU before conversion to numpy + plt.hist(weights.flatten(), bins=50) + plt.title("Weight Distribution - First Layer") + plt.xlabel("Weight Value") + plt.ylabel("Frequency") + plt.savefig("NN features/Weight Distribution.png") + plt.close() + + +def visualize_activations(model_to_load, input_tensor): + # Check the device of the model + device_va = next(model_to_load.parameters()).device + + # Move the input tensor to the same device as the model + input_tensor = input_tensor.to(device_va) + + activations = [] + + # noinspection PyUnusedLocal + def hook_fn(module, inputx, output): + # Hook function to extract intermediate layer activations + activations.append(output) + + model_to_load[0].register_forward_hook(hook_fn) # Register hook on first layer + + # Perform a forward pass + _ = model_to_load(input_tensor) + activation = activations[0].detach().cpu().numpy() # Move activations to CPU + + # Plot activations as a bar chart + plt.figure(figsize=(10, 6)) + plt.bar(range(len(activation[0])), activation[0]) + plt.title("Activation Values - First Layer") + plt.xlabel("Neuron Index") + plt.ylabel("Activation Value") + plt.savefig("NN features/Visualize Activation.png") + plt.close() + + +def visualize_tsne(model_to_load, dataloader): + # Get the device of the model + device_va = next(model_to_load.parameters()).device + + model_to_load.eval() # Set the model to evaluation mode + + features = [] + labels = [] + + with torch.no_grad(): + for data, target in dataloader: + # Move data and target to the same device as the model + data, target = data.to(device_va), target.to(device_va) + + # Extract features (output of the model) + output = model_to_load(data) + features.append(output.cpu().numpy()) # Move output to CPU for concatenation + labels.append(target.cpu().numpy()) # Move target to CPU for concatenation + + # Stack all batches + features = np.vstack(features) + labels = np.hstack(labels) + + # Determine suitable perplexity + num_samples = features.shape[0] + perplexity = min(30, num_samples - 1) # Ensure perplexity < num_samples + + # Apply t-SNE + tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity) + reduced_features = 
tsne.fit_transform(features) + + # Plot the t-SNE results + plt.figure(figsize=(10, 8)) + scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=labels, cmap='viridis', alpha=0.7) + plt.colorbar(scatter, label="Class") + plt.title("t-SNE Visualization of Features") + plt.xlabel("t-SNE Dimension 1") + plt.ylabel("t-SNE Dimension 2") + plt.savefig("NN features/Visualize t-SNE.png") + plt.close() + + +# Main function to run all visualizations +def plot_many_graphs(): + print("Starting synthetic data generation...") + # Load data + faker = Faker() + + # Generate sensitive examples + sensitive_data = [ + f"Name: {faker.name()}, SSN: {faker.ssn()}, Address: {faker.address()}", + f"Credit Card: {faker.credit_card_number()}, Expiry: {faker.credit_card_expire()}, CVV: {faker.credit_card_security_code()}", + f"Patient: {faker.name()}, Condition: {faker.text(max_nb_chars=20)}", + f"Password: {faker.password()}", + f"Email: {faker.email()}", + f"Phone: {faker.phone_number()}", + f"Medical Record: {faker.md5()}", + f"Username: {faker.user_name()}", + f"IP: {faker.ipv4()}", + ] + + # Generate non-sensitive examples + non_sensitive_data = [ + faker.text(max_nb_chars=50) for _ in range(50000) + ] + + data_text = non_sensitive_data + (sensitive_data * 15) + random.shuffle(data_text) + print("Loaded data for visualization.") + dataloader = load_data(data_text, vectorizer) + + # Visualizations + print("Creating visualizations...") + visualize_weight_distribution(model) + + # For activations, use a sample from the dataloader + print("Creating activation visualizations...") + sample_input = next(iter(dataloader))[0] + visualize_activations(model, sample_input) + + print("Creating t-SNE visualization - May take a long time...") + visualize_tsne(model, dataloader) + + print("Completed.") + + +# Visualize feature importance (dummy example for visualization) and save as SVG + show the plot +def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): + # Limit the number of tokens to visualize + TOKENS = TOKENS[:1000] + FEATURE_IMPORTANCE = FEATURE_IMPORTANCE[:1000] + + plt.figure(figsize=(len(TOKENS) * 0.5, 6)) + sns.barplot(x=TOKENS, y=FEATURE_IMPORTANCE, palette="coolwarm", hue=TOKENS, legend=False) + plt.title("Feature Importance") + plt.xlabel("Tokens") + plt.ylabel("Importance") + plt.xticks(rotation=45) + plt.savefig(FILENAME, format="svg") + plt.close() # Close the plot to release memory + + +# Function to visualize the loss landscape as an interactive 3D object +def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON=0.01, FILENAME="Plot.html"): + MODEL.eval() # Set model to evaluation mode + param = next(MODEL.parameters()) # Use the first parameter for landscape perturbations + param_flat = param.view(-1) + + # Define perturbation directions u and v + u = torch.randn_like(param_flat).view(param.shape).to(param.device) + v = torch.randn_like(param_flat).view(param.shape).to(param.device) + + # Normalize perturbations + u = EPSILON * u / torch.norm(u) + v = EPSILON * v / torch.norm(v) + + # Create grid + x = np.linspace(-1, 1, GRID_SIZE) + y = np.linspace(-1, 1, GRID_SIZE) + loss_values = np.zeros((GRID_SIZE, GRID_SIZE)) + + # Iterate through the grid to compute losses + for i, dx in enumerate(x): + print(f"Computing loss for row {i+1}/{GRID_SIZE}...") + for j, dy in enumerate(y): + print(f" Computing loss for column {j+1}/{GRID_SIZE}...") + param.data += dx * u + dy * v # Apply perturbation + loss = 0 + + # Compute loss for all batches in 
data loader + for batch in DATA_LOADER: + print(f" Computing loss for batch: {batch}...") + inputs, targets = batch + inputs = inputs.to(param.device) + targets = targets.to(param.device) + outputs = MODEL(inputs) + loss += CRITERION(outputs, targets).item() + + loss_values[i, j] = loss # Store the loss + param.data -= dx * u + dy * v # Revert perturbation + + # Create a meshgrid for plotting + X, Y = np.meshgrid(x, y) + + # Plot the 3D surface using Plotly + fig = go.Figure(data=[go.Surface(z=loss_values, x=X, y=Y, colorscale="Viridis")]) + fig.update_layout( + title="Loss Landscape (Interactive 3D)", + scene=dict( + xaxis_title="Perturbation in u", + yaxis_title="Perturbation in v", + zaxis_title="Loss", + ), + ) + + # Save as an interactive HTML file + fig.write_html(FILENAME) + print(f"3D loss landscape saved as {FILENAME}") + + +def main_plot(): + # Instantiate data loader + print("Creating dummy data loader...") + dummy_data_loader = DataLoader(DummyDataset(), batch_size=32) + + # Define loss criterion + print("Defining loss criterion...") + criterion = torch.nn.CrossEntropyLoss() + + # Visualizations + print("Creating visualizations...") + tokens = vectorizer.get_feature_names_out() + + # Feature importance + # Max number of features to visualize is 3000 due to image constraints + print(f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES])+1} tokens...") + feature_importance = np.random.rand(len(tokens[:NUMBER_OF_FEATURES])) # Example random importance + visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, FILENAME="NN features/feature_importance.svg") + + # Loss landscape + print("Visualizing loss landscape - This may take a while...") + plot_loss_landscape_3d(model, dummy_data_loader, criterion, FILENAME="NN features/loss_landscape_3d.html") + + # Set model to evaluation mode, and plot many graphs + print("Setting model to evaluation mode...") + model.eval() # Set the model to evaluation mode + plot_many_graphs() + + def save_data(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): def register_hook(module): @@ -155,25 +437,25 @@ def add_edges_bulk(layer_names, weight_matrices): nx.write_gexf(G, "NN features/Neural Network Nodes Graph.gexf") -if __name__ == '__main__': +def setup_environment(): print("Visualizing the model and vectorizer features...") print("This may take a while, please wait.") if not os.path.exists('NN features'): mkdir('NN features') - # Load the vectorizer - vectorizer_path = '../Vectorizer .3n3.pkl' - vectorizer = joblib.load(vectorizer_path) - # Inspect the vectorizer - feature_names = vectorizer.get_feature_names_out() - with open('NN features/Vectorizer features.txt', 'w') as f: - f.write(f"Number of features: {len(feature_names)}\n\n") - f.write('\n'.join(feature_names)) +def load_vectorizer(): + vectorizer_load = joblib.load(vectorizer_path) + feature_names = vectorizer_load.get_feature_names_out() + with open('NN features/Vectorizer features.txt', 'w') as file: + file.write(f"Number of features: {len(feature_names)}\n\n") + file.write('\n'.join(feature_names)) + return vectorizer_load + - # Visualize the top 90 features - top_n = 90 +def visualize_top_features(top_n=90): + feature_names = vectorizer.get_feature_names_out() sorted_indices = vectorizer.idf_.argsort()[:top_n] top_features = [feature_names[i] for i in sorted_indices] top_idf_scores = vectorizer.idf_[sorted_indices] @@ -186,39 +468,102 @@ def add_edges_bulk(layer_names, weight_matrices): # Save the plot as a vector 
graphic plt.savefig('NN features/Top_90_Features.svg', format='svg') - plt.show() - # Load the model - model_path = '../Model SenseMini .3n3.pth' - model = torch.load(model_path, weights_only=False) - # Save the model summary - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - model.to(device) - save_data(model, input_size=(1, vectorizer.vocabulary_.__len__())) +def load_model(): + device_load = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model_load = torch.load(model_path, weights_only=False) + model_load.to(device_load) + return model_load, device_load + - # Save the model's state dictionary - with open('NN features/Model state dictionary.txt', 'w') as f: - f.write("Model's state dictionary:\n\n") +def save_model_state_dict(): + with open('NN features/Model state dictionary.txt', 'w') as file: + file.write("Model's state dictionary:\n\n") for param_tensor in model.state_dict(): - f.write(f"\n{param_tensor}\t{model.state_dict()[param_tensor].size()}") + file.write(f"\n{param_tensor}\t{model.state_dict()[param_tensor].size()}") - # Create a dummy input tensor with the appropriate size - dummy_input = torch.randn(1, vectorizer.vocabulary_.__len__()).to(device) - # Generate the visualization +def generate_model_visualization(): + dummy_input = torch.randn(1, vectorizer.vocabulary_.__len__()).to(device) model_viz = make_dot(model(dummy_input), params=dict(model.named_parameters()), show_attrs=True, show_saved=True) - - # Save the visualization to a file model_viz.format = 'png' model_viz.render(filename='NN features/Model Visualization', format='png') - # Removing the temporary files as they are no longer needed, we saved them to the desired location + +def cleanup_temp_files(): if os.path.exists("NN features/Model Visualization"): os.remove("NN features/Model Visualization") - # Visualize the model - save_graph() +def model_summary(): + mode = "a" if os.path.exists("NN features/Model Summary.txt") else "w" + with open("NN features/Model Summary.txt", mode) as file: + file.write(str(model)) + + +if __name__ == '__main__': + # Print the welcome message + print("===========================================================================================") + print("= This script will visualize the features of the model and vectorizer. =") + print("= Please ensure that the model and vectorizer files are present in the specified paths. =") + print("= The visualization will be saved in the 'NN features' directory. =") + print("= This script will take a while to run, please be patient. =") + print("===========================================================================================") + + # Read the config file + print("\n\nReading config file and setting up...") + config = ConfigParser() + config.read('../../config.ini') + + setup_environment() + + # Load the paths from the config file + vectorizer_path = config.get('VulnScan.study Settings', 'vectorizer_path') + model_path = config.get('VulnScan.study Settings', 'model_path') + NUMBER_OF_FEATURES = config.get('VulnScan.study Settings', 'number_of_features') + + # Check if the paths exist + if not os.path.exists(vectorizer_path): + print(f"Vectorizer file not found. Please double check the path {vectorizer_path}.") + exit(1) + if not os.path.exists(model_path): + print(f"Model file not found. 
Please double check the path {model_path}.") + exit(1) + + # Load the vectorizer and model + vectorizer = load_vectorizer() + visualize_top_features() + model, device = load_model() + # Save the model summary, state dictionary, and visualization + save_data(model, input_size=(1, vectorizer.vocabulary_.__len__())) + save_model_state_dict() + generate_model_visualization() + cleanup_temp_files() + save_graph() print("Model visualization and summary have been saved to the 'NN features' directory.") + + # Check if GPU is available + if not os.path.exists('NN features'): + os.mkdir('NN features') + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Using device: {device}") + + # Load vectorizer (change the path to your vectorizer .pkl file) + vectorizer_path = "../Vectorizer .3n3.pkl" + model_path = "../Model SenseMini .3n3.pth" + + # Load vectorizer + print(f"Reloading vectorizer from: {vectorizer_path}") + with open(vectorizer_path, "rb") as f: + vectorizer = joblib.load(f) + + # Load model and move to the appropriate device (GPU/CPU) + print(f"Reloading model from: {model_path}") + model = torch.load(model_path, weights_only=False) + model.to(device) # Move model to GPU or CPU + + model_summary() + main_plot() diff --git a/CODE/VulnScan/v3/_train.py b/CODE/VulnScan/v3/_train.py index c9fa7ee7..1600fee6 100644 --- a/CODE/VulnScan/v3/_train.py +++ b/CODE/VulnScan/v3/_train.py @@ -178,6 +178,7 @@ def select_model_from_traditional(model_name: str, logger.error(f"Invalid model name: {model_name}") exit(1) + def train_traditional_model(model_name: str, epochs: int, save_model_path: str): diff --git a/CODE/config.ini b/CODE/config.ini index f24190ce..24130933 100644 --- a/CODE/config.ini +++ b/CODE/config.ini @@ -27,29 +27,10 @@ timeout = 10 ################################################### -[VulnScan.train Settings] -# The following settings are for the Train module for training models -# NeuralNetwork seems to be the best choice for this task -# Options: "NeuralNetwork", "LogReg", -# "RandomForest", "ExtraTrees", "GBM", -# "XGBoost", "DecisionTree", "NaiveBayes" -model_name = NeuralNetwork -# General Training Parameters -epochs = 10 -batch_size = 32 -learning_rate = 0.001 -use_cuda = true - -# Paths to train and save data -train_data_path = C:\Users\Hp\Desktop\Model Tests\Model Data\GeneratedData -# If all models are to be trained, this is the path to save all models, -# and will be appended with the model codename and follow naming convention -save_model_path = C:\Users\Hp\Desktop\Model Tests\Model SenseMini - [VulnScan.generate Settings] # The following settings are for the Generate module for fake training data extensions = .txt, .log, .md, .csv, .json, .xml, .html, .yaml, .ini, .pdf, .docx, .xlsx, .pptx -save_path = C:\Users\Hp\Desktop\Model Tests\Generated Data +save_path = PATH # Options include: # 'Sense' - Generates 50k files, each 25KB in size. # 'SenseNano' - Generates 5 files, each 5KB in size. 
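[Editor's sketch] The plot_loss_landscape_3d function added above records the loss on a grid of weight perturbations, i.e. L(dx, dy) = loss(theta + dx*u + dy*v) for two normalized random directions u and v. A condensed sketch of that idea, with a toy linear model, random data, and a 3x3 grid standing in for the real model, dataset, and 200x200 grid:

import torch

model = torch.nn.Linear(4, 2)                      # toy stand-in for the real model
criterion = torch.nn.CrossEntropyLoss()
inputs = torch.randn(8, 4)                         # toy batch
targets = torch.randint(0, 2, (8,))

param = next(model.parameters())                   # perturb the first parameter tensor
u = torch.randn_like(param)
v = torch.randn_like(param)
u = 0.01 * u / torch.norm(u)                       # normalize to EPSILON = 0.01
v = 0.01 * v / torch.norm(v)

with torch.no_grad():
    for dx in (-1.0, 0.0, 1.0):
        for dy in (-1.0, 0.0, 1.0):
            param.data += dx * u + dy * v          # apply perturbation
            loss = criterion(model(inputs), targets).item()
            param.data -= dx * u + dy * v          # revert perturbation
            print(f"L({dx:+.0f}, {dy:+.0f}) = {loss:.4f}")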
@@ -79,11 +60,44 @@ partial_sensitive_chance = 0.2
 # Use the vectorizer supplied for any v3 model on SenseMini
 # The path to the data to vectorize, either a file or a directory
-data_path = C:\Users\Hp\Desktop\Model Tests\Model Data\GeneratedData
+data_path = PATH
 # The path to save the vectorized data - It will automatically be appended '\Vectorizer.pkl'
 # Make sure the path is a directory, and it exists
-output_path = C:\Users\Hp\Desktop\Model Tests\Model Sense - Vectorizer
+output_path = PATH
 # Vectorizer to use, options include:
 # tfidf or count - The code for the training only supports tfidf - we advise to use tfidf
 vectorizer_type = tfidf
+
+[VulnScan.train Settings]
+# The following settings are for the Train module for training models
+# NeuralNetwork seems to be the best choice for this task
+# Options: "NeuralNetwork", "LogReg",
+# "RandomForest", "ExtraTrees", "GBM",
+# "XGBoost", "DecisionTree", "NaiveBayes"
+model_name = NeuralNetwork
+# General Training Parameters
+epochs = 10
+batch_size = 32
+learning_rate = 0.001
+use_cuda = true
+
+# Paths to train and save data
+train_data_path = PATH
+# If all models are to be trained, this is the path to save all models;
+# it will be appended with the model codename, following the naming convention
+save_model_path = PATH
+
+[VulnScan.study Settings]
+# Here are the basics of the study module
+# It generates graphs and data that may help in understanding the model
+# Everything is available online, pre-studied, so running it is not necessary,
+# but it is useful for understanding the model locally
+# All files will be saved here; this cannot be changed. PATH is "NN features/"
+
+# These are the paths to the model and the vectorizer
+model_path = ../Model SenseMini .3n3.pth
+vectorizer_path = ../Vectorizer .3n3.pkl
+# Number of features to visualise in the SVG bar graph; the maximum is 3000 due to limitations
+# Setting -1 will visualise the first 3000 features. The bar is a color-gradient heatmap.
+number_of_features = -1 From d19a6b209eb73516da951e33d509228a029038cb Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Tue, 10 Dec 2024 22:01:07 +0400 Subject: [PATCH 10/20] Fixed some bugs --- CODE/VulnScan/tools/_study_network.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index 1b04e0ef..dedf7bd4 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -123,7 +123,7 @@ def visualize_tsne(model_to_load, dataloader): # Extract features (output of the model) output = model_to_load(data) features.append(output.cpu().numpy()) # Move output to CPU for concatenation - labels.append(target.cpu().numpy()) # Move target to CPU for concatenation + labels.append(target.cpu().numpy()) # Move target to CPU for concatenation # Stack all batches features = np.vstack(features) @@ -192,7 +192,7 @@ def plot_many_graphs(): print("Completed.") -# Visualize feature importance (dummy example for visualization) and save as SVG + show the plot +# Visualize feature importance (dummy example for visualization) and save as SVG def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): # Limit the number of tokens to visualize TOKENS = TOKENS[:1000] @@ -229,15 +229,14 @@ def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON # Iterate through the grid to compute losses for i, dx in enumerate(x): - print(f"Computing loss for row {i+1}/{GRID_SIZE}...") + print(f"Computing loss for row {i + 1}/{GRID_SIZE}...") for j, dy in enumerate(y): - print(f" Computing loss for column {j+1}/{GRID_SIZE}...") + print(f" Computing loss for column {j + 1}/{GRID_SIZE}...") param.data += dx * u + dy * v # Apply perturbation loss = 0 # Compute loss for all batches in data loader for batch in DATA_LOADER: - print(f" Computing loss for batch: {batch}...") inputs, targets = batch inputs = inputs.to(param.device) targets = targets.to(param.device) @@ -281,9 +280,11 @@ def main_plot(): # Feature importance # Max number of features to visualize is 3000 due to image constraints - print(f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES])+1} tokens...") + print( + f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES]) + 1} tokens...") feature_importance = np.random.rand(len(tokens[:NUMBER_OF_FEATURES])) # Example random importance - visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, FILENAME="NN features/feature_importance.svg") + visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, + FILENAME="NN features/feature_importance.svg") # Loss landscape print("Visualizing loss landscape - This may take a while...") @@ -468,7 +469,7 @@ def visualize_top_features(top_n=90): # Save the plot as a vector graphic plt.savefig('NN features/Top_90_Features.svg', format='svg') - plt.show() + plt.close() def load_model(): @@ -522,7 +523,7 @@ def model_summary(): # Load the paths from the config file vectorizer_path = config.get('VulnScan.study Settings', 'vectorizer_path') model_path = config.get('VulnScan.study Settings', 'model_path') - NUMBER_OF_FEATURES = config.get('VulnScan.study Settings', 'number_of_features') + NUMBER_OF_FEATURES = int(config.get('VulnScan.study Settings', 'number_of_features')) # Check if the paths exist if not os.path.exists(vectorizer_path): From 6dd419cede8ac1b29f1b64bb9161db5371b6667c 
Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Tue, 10 Dec 2024 22:01:07 +0400 Subject: [PATCH 11/20] Fixed some bugs --- CODE/VulnScan/tools/_study_network.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index 1b04e0ef..323db56f 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -20,6 +20,10 @@ from tqdm import tqdm +# TODO Add docstring, and hint-type +# TODO Do v3.1 plans +# ZIP the file and attach somewhere (Data) + # Example of DataLoader for loss landscape (dummy dataset for visualization) class DummyDataset(torch.utils.data.Dataset): def __init__(self, num_samples=100, input_dim=10000): @@ -123,7 +127,7 @@ def visualize_tsne(model_to_load, dataloader): # Extract features (output of the model) output = model_to_load(data) features.append(output.cpu().numpy()) # Move output to CPU for concatenation - labels.append(target.cpu().numpy()) # Move target to CPU for concatenation + labels.append(target.cpu().numpy()) # Move target to CPU for concatenation # Stack all batches features = np.vstack(features) @@ -192,7 +196,7 @@ def plot_many_graphs(): print("Completed.") -# Visualize feature importance (dummy example for visualization) and save as SVG + show the plot +# Visualize feature importance (dummy example for visualization) and save as SVG def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): # Limit the number of tokens to visualize TOKENS = TOKENS[:1000] @@ -229,15 +233,14 @@ def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON # Iterate through the grid to compute losses for i, dx in enumerate(x): - print(f"Computing loss for row {i+1}/{GRID_SIZE}...") + print(f"Computing loss for row {i + 1}/{GRID_SIZE}...") for j, dy in enumerate(y): - print(f" Computing loss for column {j+1}/{GRID_SIZE}...") + print(f" Computing loss for column {j + 1}/{GRID_SIZE}...") param.data += dx * u + dy * v # Apply perturbation loss = 0 # Compute loss for all batches in data loader for batch in DATA_LOADER: - print(f" Computing loss for batch: {batch}...") inputs, targets = batch inputs = inputs.to(param.device) targets = targets.to(param.device) @@ -281,9 +284,11 @@ def main_plot(): # Feature importance # Max number of features to visualize is 3000 due to image constraints - print(f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES])+1} tokens...") + print( + f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES]) + 1} tokens...") feature_importance = np.random.rand(len(tokens[:NUMBER_OF_FEATURES])) # Example random importance - visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, FILENAME="NN features/feature_importance.svg") + visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, + FILENAME="NN features/feature_importance.svg") # Loss landscape print("Visualizing loss landscape - This may take a while...") @@ -468,7 +473,7 @@ def visualize_top_features(top_n=90): # Save the plot as a vector graphic plt.savefig('NN features/Top_90_Features.svg', format='svg') - plt.show() + plt.close() def load_model(): @@ -522,7 +527,7 @@ def model_summary(): # Load the paths from the config file vectorizer_path = config.get('VulnScan.study Settings', 'vectorizer_path') model_path = config.get('VulnScan.study Settings', 'model_path') - NUMBER_OF_FEATURES = 
config.get('VulnScan.study Settings', 'number_of_features') + NUMBER_OF_FEATURES = int(config.get('VulnScan.study Settings', 'number_of_features')) # Check if the paths exist if not os.path.exists(vectorizer_path): From d91ce486063ec99d06b74f32e70844b538ffab59 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 10:55:53 +0400 Subject: [PATCH 12/20] Added docstrings, and hint-types --- CODE/VulnScan/tools/_study_network.py | 96 ++++++-- CODE/VulnScan/tools/_vectorizer.py | 29 +++ CODE/VulnScan/v2-deprecated/_generate_data.py | 40 ++- CODE/VulnScan/v2-deprecated/_train.py | 131 +++++++++- CODE/VulnScan/v3/_generate_data.py | 233 ++++++++++-------- CODE/VulnScan/v3/_train.py | 93 +++---- 6 files changed, 440 insertions(+), 182 deletions(-) diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index 323db56f..c52ba622 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -1,9 +1,12 @@ +from __future__ import annotations + import os import os.path import random from collections import OrderedDict from configparser import ConfigParser from os import mkdir +from typing import Any import joblib import matplotlib.pyplot as plt @@ -14,23 +17,44 @@ import torch import torch.nn as nn from faker import Faker +from numpy import ndarray, dtype +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn.manifold import TSNE +from torch import device from torch.utils.data import DataLoader, TensorDataset from torchviz import make_dot from tqdm import tqdm -# TODO Add docstring, and hint-type # TODO Do v3.1 plans -# ZIP the file and attach somewhere (Data) +# Raise an ImportError to make the file unimportable +# raise ImportError("This file cannot be imported") + # Example of DataLoader for loss landscape (dummy dataset for visualization) class DummyDataset(torch.utils.data.Dataset): - def __init__(self, num_samples=100, input_dim=10000): + """ + A dummy dataset for generating synthetic data for visualization purposes. + + Attributes: + num_samples (int): Number of samples in the dataset. + input_dim (int): Dimension of the input data. + data (list): List of generated data samples. + labels (list): List of labels corresponding to the data samples. + """ + + def __init__(self, num_samples: int = 100, input_dim: int = 10000): + """ + Initializes the DummyDataset with the specified number of samples and input dimension. + + Args: + num_samples (int): Number of samples to generate. + input_dim (int): Dimension of the input data. + """ self.num_samples = num_samples self.input_dim = input_dim - self.data = [] - self.labels = [] + self.data: list[str] = [] + self.labels: list[int] = [] faker = Faker() for _ in range(num_samples): if random.random() < 0.05: # 5% chance to include sensitive data @@ -40,10 +64,25 @@ def __init__(self, num_samples=100, input_dim=10000): self.data.append(faker.text(max_nb_chars=100)) # Non-sensitive data self.labels.append(0) # Label as non-sensitive - def __len__(self): + def __len__(self) -> int: + """ + Returns the number of samples in the dataset. + + Returns: + int: Number of samples in the dataset. + """ return self.num_samples - def __getitem__(self, idx): + def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]: + """ + Retrieves the data and label at the specified index. + + Args: + idx (int): Index of the data and label to retrieve. + + Returns: + tuple: A tuple containing the data tensor and label tensor. 
+ """ data = self.data[idx] label = self.labels[idx] # Convert data to tensor of ASCII values and pad to input_dim @@ -57,7 +96,17 @@ def __getitem__(self, idx): return data_tensor, label_tensor -def load_data(text_data, vectorizer_to_load): +def load_data(text_data: list[str], vectorizer_to_load: TfidfVectorizer | CountVectorizer) -> DataLoader: + """ + Vectorizes the text data and creates a DataLoader for it. + + Args: + text_data (list of str): The text data to be vectorized. + vectorizer_to_load: The vectorizer to use for transforming the text data. + + Returns: + DataLoader: A DataLoader containing the vectorized text data and dummy labels. + """ # Vectorize the text data X = vectorizer_to_load.transform(text_data) # Create a dummy label for visualization (replace with real labels if available) @@ -69,7 +118,7 @@ def load_data(text_data, vectorizer_to_load): return DataLoader(dataset, batch_size=32, shuffle=True) -def visualize_weight_distribution(model_to_load): +def visualize_weight_distribution(model_to_load: torch.nn.Module): # Access weights of the first layer weights = model_to_load[0].weight.detach().cpu().numpy() # Move tensor to CPU before conversion to numpy plt.hist(weights.flatten(), bins=50) @@ -80,7 +129,7 @@ def visualize_weight_distribution(model_to_load): plt.close() -def visualize_activations(model_to_load, input_tensor): +def visualize_activations(model_to_load: torch.nn.Module, input_tensor: torch.Tensor): # Check the device of the model device_va = next(model_to_load.parameters()).device @@ -110,7 +159,7 @@ def hook_fn(module, inputx, output): plt.close() -def visualize_tsne(model_to_load, dataloader): +def visualize_tsne(model_to_load: torch.nn.Module, dataloader: DataLoader): # Get the device of the model device_va = next(model_to_load.parameters()).device @@ -197,7 +246,8 @@ def plot_many_graphs(): # Visualize feature importance (dummy example for visualization) and save as SVG -def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): +def visualize_feature_importance(TOKENS: list[str], FEATURE_IMPORTANCE: float | ndarray[Any, dtype[np.floating]], + FILENAME: str = "Plot.svg"): # Limit the number of tokens to visualize TOKENS = TOKENS[:1000] FEATURE_IMPORTANCE = FEATURE_IMPORTANCE[:1000] @@ -213,7 +263,8 @@ def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg" # Function to visualize the loss landscape as an interactive 3D object -def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON=0.01, FILENAME="Plot.html"): +def plot_loss_landscape_3d(MODEL: torch.nn.Module, DATA_LOADER: DataLoader, CRITERION: torch.nn.Module, + GRID_SIZE: int = 200, EPSILON: float = 0.01, FILENAME: str = "Plot.html"): MODEL.eval() # Set model to evaluation mode param = next(MODEL.parameters()) # Use the first parameter for landscape perturbations param_flat = param.view(-1) @@ -300,10 +351,11 @@ def main_plot(): plot_many_graphs() -def save_data(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): - def register_hook(module): +def save_data(model_to_use: torch.nn.Module, input_size: tuple[int, Any] | int, batch_size: int = -1, + device_to_use: str = "cuda"): + def register_hook(module: torch.nn.Module): - def hook(modules, inputs, output): + def hook(modules: torch.nn.Module, inputs: (torch.nn.Module, tuple[torch.Tensor]), output: torch.Tensor): class_name = str(modules.__class__).split(".")[-1].split("'")[0] module_idx = len(summaries) @@ -341,16 +393,16 @@ def hook(modules, inputs, output): ], 
"Input device is not valid, please specify 'cuda' or 'cpu'" if device_to_use == "cuda" and torch.cuda.is_available(): - dtype = torch.cuda.FloatTensor + dtype_to_use = torch.cuda.FloatTensor else: - dtype = torch.FloatTensor + dtype_to_use = torch.FloatTensor # multiple inputs to the network if isinstance(input_size, tuple): input_size = [input_size] # batch_size of 2 for batch norm - x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size] + x = [torch.rand(2, *in_size).type(dtype_to_use) for in_size in input_size] # create properties summaries = OrderedDict() @@ -412,7 +464,7 @@ def save_graph(): # Create a directed graph G = nx.DiGraph() - def add_edges_bulk(layer_names, weight_matrices): + def add_edges_bulk(layer_names: str, weight_matrices: np.ndarray[np.float32]): """Efficiently add edges to the graph with progress tracking.""" threshold = 0.1 # Adjust this threshold as needed significant_weights = np.abs(weight_matrices) > threshold @@ -459,7 +511,7 @@ def load_vectorizer(): return vectorizer_load -def visualize_top_features(top_n=90): +def visualize_top_features(top_n: int = 90): feature_names = vectorizer.get_feature_names_out() sorted_indices = vectorizer.idf_.argsort()[:top_n] top_features = [feature_names[i] for i in sorted_indices] @@ -476,7 +528,7 @@ def visualize_top_features(top_n=90): plt.close() -def load_model(): +def load_model() -> tuple[Any, device]: device_load = torch.device("cuda" if torch.cuda.is_available() else "cpu") model_load = torch.load(model_path, weights_only=False) model_load.to(device_load) diff --git a/CODE/VulnScan/tools/_vectorizer.py b/CODE/VulnScan/tools/_vectorizer.py index 1ad7da8b..63577fa8 100644 --- a/CODE/VulnScan/tools/_vectorizer.py +++ b/CODE/VulnScan/tools/_vectorizer.py @@ -9,6 +9,15 @@ def load_data(data_paths: str | os.PathLike) -> list[str]: + """ + Load data from the specified path(s). + + Args: + data_paths (str | os.PathLike): Path to a directory or a file containing data. + + Returns: + list[str]: List of strings, each representing the content of a file. + """ data = [] if os.path.isdir(data_paths): for root, _, files in os.walk(data_paths): @@ -24,6 +33,18 @@ def load_data(data_paths: str | os.PathLike) -> list[str]: def choose_vectorizer(vectorizer_types: str) -> TfidfVectorizer | CountVectorizer: + """ + Choose and return a vectorizer based on the specified type. + + Args: + vectorizer_types (str): Type of vectorizer to use ('tfidf' or 'count'). + + Returns: + TfidfVectorizer | CountVectorizer: The chosen vectorizer. + + Raises: + ValueError: If an unsupported vectorizer type is specified. + """ print("Vectorizer Type: ", vectorizer_types) print("Vectorizing Data...") if vectorizer_types == 'tfidf': @@ -34,6 +55,14 @@ def choose_vectorizer(vectorizer_types: str) -> TfidfVectorizer | CountVectorize def main(data_paths: str, vectorizer_types: str, output_paths: str): + """ + Main function to load data, choose a vectorizer, fit the vectorizer to the data, and save the vectorizer. + + Args: + data_paths (str): Path to the data. + vectorizer_types (str): Type of vectorizer to use ('tfidf' or 'count'). + output_paths (str): Path to save the fitted vectorizer. 
+ """ data = load_data(data_paths) vectorizer = choose_vectorizer(vectorizer_types) vectorizer.fit(data) diff --git a/CODE/VulnScan/v2-deprecated/_generate_data.py b/CODE/VulnScan/v2-deprecated/_generate_data.py index 59925242..78722f47 100644 --- a/CODE/VulnScan/v2-deprecated/_generate_data.py +++ b/CODE/VulnScan/v2-deprecated/_generate_data.py @@ -9,9 +9,15 @@ fake = Faker() -# Function to generate a sensitive file with real sensitive information -@deprecated(reason="This function is only used for generating sensitive data for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.3.0") +@deprecated(reason="This function is only used for generating sensitive data for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.4.0") def create_sensitive_file(file_path: str, max_size: int): + """ + Generate a sensitive file with real sensitive information. + + Args: + file_path (str): The path where the file will be saved. + max_size (int): The maximum size of the file in bytes. + """ content = "" # Generate sensitive data using Faker content += f"Name: {fake.name()}\n" @@ -30,9 +36,15 @@ def create_sensitive_file(file_path: str, max_size: int): f.write(content) -# Function to generate a normal file with non-sensitive data -@deprecated(reason="This function is only used for generating normal data for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.3.0") +@deprecated(reason="This function is only used for generating normal data for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.4.0") def create_normal_file(file_path: str, max_size: int): + """ + Generate a normal file with non-sensitive data. + + Args: + file_path (str): The path where the file will be saved. + max_size (int): The maximum size of the file in bytes. + """ content = "" # Add random text while len(content.encode('utf-8')) < max_size: @@ -42,9 +54,15 @@ def create_normal_file(file_path: str, max_size: int): f.write(content) -# Function to generate a mix file with both normal and sensitive data -@deprecated(reason="This function is only used for generating mixed data for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.3.0") +@deprecated(reason="This function is only used for generating mixed data for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.4.0") def create_mix_file(file_path: str, max_size: int): + """ + Generate a mix file with both normal and sensitive data. + + Args: + file_path (str): The path where the file will be saved. + max_size (int): The maximum size of the file in bytes. 
+ """ content = "" # Add a mix of normal and sensitive data while len(content.encode('utf-8')) < max_size: @@ -59,9 +77,15 @@ def create_mix_file(file_path: str, max_size: int): f.write(content) -# Function to create random files (Normal, Mix, Sensitive) -@deprecated(reason="This function is only used for generating random files for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.3.0") +@deprecated(reason="This function is only used for generating random files for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.4.0") def create_random_files(directories: str, num_file: int = 100): + """ + Create random files (Normal, Mix, Sensitive). + + Args: + directories (str): The directory where the files will be saved. + num_file (int): The number of files to generate. + """ os.makedirs(directories, exist_ok=True) for i in range(num_file): diff --git a/CODE/VulnScan/v2-deprecated/_train.py b/CODE/VulnScan/v2-deprecated/_train.py index 4cfa6247..f25d4152 100644 --- a/CODE/VulnScan/v2-deprecated/_train.py +++ b/CODE/VulnScan/v2-deprecated/_train.py @@ -2,7 +2,6 @@ import logging import os -from os import mkdir import joblib import matplotlib.pyplot as plt @@ -20,6 +19,7 @@ from sklearn.svm import SVC from torch.utils.data import DataLoader, TensorDataset from transformers import BertTokenizer, BertForSequenceClassification + from logicytics import deprecated # Configure logging @@ -42,7 +42,15 @@ @deprecated(reason="This function is used to load data from a directory. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") def load_data(data_dir: str) -> tuple[list[str], np.ndarray]: - """Loads text data and labels from the directory.""" + """ + Loads text data and labels from the directory. + + Args: + data_dir (str): The directory containing the data files. + + Returns: + tuple[list[str], np.ndarray]: A tuple containing the list of texts and the corresponding labels. + """ texts, labels = [], [] for file_name in os.listdir(data_dir): with open(os.path.join(data_dir, file_name), "r", encoding="utf-8") as f: @@ -55,7 +63,16 @@ def load_data(data_dir: str) -> tuple[list[str], np.ndarray]: @deprecated(reason="This function is used to evaluate a model. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[float, float, float, float, float]: - """Evaluates the model using standard metrics.""" + """ + Evaluates the model using standard metrics. + + Args: + y_true (np.ndarray): The true labels. + y_pred (np.ndarray): The predicted labels. + + Returns: + tuple[float, float, float, float, float]: A tuple containing accuracy, precision, recall, F1 score, and ROC-AUC score. + """ accuracy = accuracy_score(y_true, y_pred) precision = precision_score(y_true, y_pred, zero_division=1) recall = recall_score(y_true, y_pred, zero_division=1) @@ -73,6 +90,13 @@ def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[float, float @deprecated(reason="This function is used to save progress graphs. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") def save_progress_graph(accuracies: list[float], filename: str = "training_progress.png"): + """ + Saves a graph of training progress. + + Args: + accuracies (list[float]): List of accuracies for each epoch. 
+ filename (str): The filename to save the graph as. + """ plt.figure(figsize=(8, 6)) plt.plot(range(1, len(accuracies) + 1), accuracies, marker='o', label="Training Accuracy") plt.xlabel("Epochs") @@ -87,7 +111,16 @@ def save_progress_graph(accuracies: list[float], filename: str = "training_progr @deprecated(reason="This function is used to train xgboost. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") def train_xgboost(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, SAVE_DIR: str): - """Trains a Gradient Boosting Classifier (XGBoost) with GPU.""" + """ + Trains a Gradient Boosting Classifier (XGBoost) with GPU. + + Args: + X_train (np.ndarray): Training data features. + X_test (np.ndarray): Testing data features. + y_train (np.ndarray): Training data labels. + y_test (np.ndarray): Testing data labels. + SAVE_DIR (str): Directory to save the trained model. + """ logging.info("Enabling GPU acceleration...") model = xgb.XGBClassifier(tree_method='hist', device=DEVICE) # Enable GPU acceleration logging.info("GPU acceleration enabled.") @@ -104,7 +137,21 @@ def train_xgboost(X_train: np.ndarray, X_test: np.ndarray, def train_bert(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, MAX_LEN: int, LEARNING_RATE: float, BATCH_SIZE: int, EPOCHS: int, SAVE_DIR: str, MODEL_PATH: str): - """Trains a BERT model with GPU support.""" + """ + Trains a BERT model with GPU support. + + Args: + X_train (np.ndarray): Training data features. + X_test (np.ndarray): Testing data features. + y_train (np.ndarray): Training data labels. + y_test (np.ndarray): Testing data labels. + MAX_LEN (int): Maximum length of the sequences. + LEARNING_RATE (float): Learning rate for the optimizer. + BATCH_SIZE (int): Batch size for training. + EPOCHS (int): Number of epochs for training. + SAVE_DIR (str): Directory to save the trained model. + MODEL_PATH (str): Path to the pre-trained BERT model. + """ logging.info("Loading BERT tokenizer...") tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME) train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=MAX_LEN, return_tensors="pt") @@ -154,14 +201,34 @@ def train_bert(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, class LSTMModel(nn.Module): + @deprecated(reason="This class is used to define an LSTM model. Its for training v2 models, which is now deprecated, use _train.py v3 instead.", removal_version="3.3.0") def __init__(self, vocab_size: int, embedding_dim: int = 128, hidden_dim: int = 128, output_dim: int = 1): + """ + Initializes the LSTM model. + + Args: + vocab_size (int): Size of the vocabulary. + embedding_dim (int): Dimension of the embedding layer. + hidden_dim (int): Dimension of the hidden layer. + output_dim (int): Dimension of the output layer. + """ super(LSTMModel, self).__init__() self.embedding = nn.Embedding(vocab_size, embedding_dim) self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True) self.fc = nn.Linear(hidden_dim * 2, output_dim) # Bidirectional, so multiply by 2 self.sigmoid = nn.Sigmoid() + @deprecated(reason="This class is used to define an LSTM model. Its for training v2 models, which is now deprecated, use _train.py v3 instead.", removal_version="3.3.0") def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Defines the forward pass of the LSTM model. + + Args: + x (torch.Tensor): Input tensor. + + Returns: + torch.Tensor: Output tensor. 
+ """ x = self.embedding(x) lstm_out, _ = self.lstm(x) x = self.fc(lstm_out[:, -1, :]) @@ -173,7 +240,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def train_lstm(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, MAX_FEATURES: int, LEARNING_RATE: float, BATCH_SIZE: int, EPOCHS: int, SAVE_DIR: str): - """Trains an LSTM model using PyTorch with GPU support.""" + """ + Trains an LSTM model using PyTorch with GPU support. + + Args: + X_train (np.ndarray): Training data features. + X_test (np.ndarray): Testing data features. + y_train (np.ndarray): Training data labels. + y_test (np.ndarray): Testing data labels. + MAX_FEATURES (int): Maximum number of features for the vectorizer. + LEARNING_RATE (float): Learning rate for the optimizer. + BATCH_SIZE (int): Batch size for training. + EPOCHS (int): Number of epochs for training. + SAVE_DIR (str): Directory to save the trained model. + """ logging.info("Training LSTM...") logging.info("Vectorizing text data...") vectorizer = TfidfVectorizer(max_features=MAX_FEATURES) @@ -236,6 +316,18 @@ def train_lstm(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, def train_nn_svm(MODEL: str, EPOCHS: int, SAVE_DIR: str, MAX_FEATURES: int, TEST_SIZE: float | int, MAX_ITER: int, RANDOM_STATE: int): + """ + Trains a Neural Network or SVM model with hyperparameter tuning. + + Args: + MODEL (str): The type of model to train ('svm' or 'nn'). + EPOCHS (int): Number of epochs for training. + SAVE_DIR (str): Directory to save the trained model. + MAX_FEATURES (int): Maximum number of features for the vectorizer. + TEST_SIZE (float | int): Proportion of the dataset to include in the test split. + MAX_ITER (int): Maximum number of iterations for the model. + RANDOM_STATE (int): Random state for reproducibility. + """ if MODEL not in ["svm", "nn"]: logging.error(f"Invalid model type: {MODEL}. Please choose 'svm' or 'nn'.") return @@ -325,6 +417,21 @@ def train_nn_svm(MODEL: str, EPOCHS: int, SAVE_DIR: str, def train_model_blx(MODEL_TYPE: str, SAVE_DIR: str, EPOCHS: int, BATCH_SIZE: int, LEARNING_RATE: float, MAX_FEATURES: int, MAX_LEN: int, TEST_SIZE: float | int, RANDOM_STATE: int, MODEL_PATH_BERT: str = None): + """ + Sets up and trains a model based on the specified type. + + Args: + MODEL_TYPE (str): The type of model to train ('xgboost', 'bert', 'lstm'). + SAVE_DIR (str): Directory to save the trained model. + EPOCHS (int): Number of epochs for training. + BATCH_SIZE (int): Batch size for training. + LEARNING_RATE (float): Learning rate for the optimizer. + MAX_FEATURES (int): Maximum number of features for the vectorizer. + MAX_LEN (int): Maximum length of the sequences (for BERT). + TEST_SIZE (float | int): Proportion of the dataset to include in the test split. + RANDOM_STATE (int): Random state for reproducibility. + MODEL_PATH_BERT (str, optional): Path to the pre-trained BERT model. + """ # Create save directory if it doesn't exist os.makedirs(SAVE_DIR, exist_ok=True) @@ -358,6 +465,16 @@ def train_model_blx(MODEL_TYPE: str, SAVE_DIR: str, EPOCHS: int, BATCH_SIZE: int @deprecated(reason="This function is used to train RandomForest. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") def train_rfc(SAVE_DIR: str, EPOCHS: int, TEST_SIZE: float | int, N_ESTIMATORS: int, RANDOM_STATE: int): + """ + Trains a Random Forest Classifier. + + Args: + SAVE_DIR (str): Directory to save the trained model. + EPOCHS (int): Number of epochs for training. 
+ TEST_SIZE (float | int): Proportion of the dataset to include in the test split. + N_ESTIMATORS (int): Number of trees in the forest. + RANDOM_STATE (int): Random state for reproducibility. + """ logging.info("Training model...") # Load data @@ -391,7 +508,7 @@ def train_rfc(SAVE_DIR: str, EPOCHS: int, TEST_SIZE: float | int, # Save progress plot if not os.path.exists(SAVE_DIR): - mkdir(SAVE_DIR) + os.mkdir(SAVE_DIR) save_progress_graph(accuracies, filename=os.path.join(SAVE_DIR, "training_progress.png")) # Save model checkpoint diff --git a/CODE/VulnScan/v3/_generate_data.py b/CODE/VulnScan/v3/_generate_data.py index 0b28b6db..161ee97f 100644 --- a/CODE/VulnScan/v3/_generate_data.py +++ b/CODE/VulnScan/v3/_generate_data.py @@ -1,98 +1,86 @@ +from __future__ import annotations + import os import random import string import configparser from faker import Faker -# Initialize Faker -fake = Faker() - -# Read configuration -config = configparser.ConfigParser() -config.read('../../config.ini') - -# Load configuration values -config = config['VulnScan.generate Settings'] -EXTENSIONS_ALLOWED = config.get('extensions', '.txt').split(',') -SAVE_PATH = config.get('save_path', '.') -CODE_NAME = config.get('code_name', 'Sense') -SIZE_VARIATION = float(config.get('size_variation', '0.1')) - -# Ensure the save directory exists -os.makedirs(SAVE_PATH, exist_ok=True) - -# Set default file size and number of files -DEFAULT_FILE_NUM = 10000 -DEFAULT_MIN_FILE_SIZE = 10 * 1024 # 10 KB -DEFAULT_MAX_FILE_SIZE = 10 * 1024 # 10 KB - -# File configuration based on CODE_NAME -if CODE_NAME == 'Sense': - FILE_NUM = DEFAULT_FILE_NUM * 5 - MIN_FILE_SIZE = DEFAULT_MIN_FILE_SIZE * 5 - MAX_FILE_SIZE = DEFAULT_MAX_FILE_SIZE * 5 -elif CODE_NAME == 'SenseNano': - FILE_NUM = 5 - MIN_FILE_SIZE = int(DEFAULT_MIN_FILE_SIZE * 0.5) - MAX_FILE_SIZE = int(DEFAULT_MAX_FILE_SIZE * 0.5) -elif CODE_NAME == 'SenseMacro': - FILE_NUM = DEFAULT_FILE_NUM * 100 - MIN_FILE_SIZE = DEFAULT_MIN_FILE_SIZE - MAX_FILE_SIZE = DEFAULT_MAX_FILE_SIZE -elif CODE_NAME == 'SenseMini': - FILE_NUM = DEFAULT_FILE_NUM - MIN_FILE_SIZE = DEFAULT_MIN_FILE_SIZE - MAX_FILE_SIZE = DEFAULT_MAX_FILE_SIZE -else: # Custom configuration - MIN_FILE_SIZE = int(config['min_file_size'].replace('KB', '')) * 1024 - MAX_FILE_SIZE = int(config['max_file_size'].replace('KB', '')) * 1024 - FILE_NUM = DEFAULT_FILE_NUM - -print(f"Generating {FILE_NUM} files with sizes between {MIN_FILE_SIZE} and {MAX_FILE_SIZE} bytes") - - -# Function to generate random file names -def generate_random_filename(extensions, suffix_x): + +def generate_random_filename(extensions: str, suffix_x: str = '') -> str: + """ + Generate a random filename with the given extension and optional suffix. + + Args: + extensions (str): The file extension. + suffix_x (str, optional): An optional suffix to add to the filename. + + Returns: + str: The generated random filename. 
+ """ return ''.join(random.choices(string.ascii_letters + string.digits, k=10)) + suffix_x + extensions -# Function to generate content based on file extension -def generate_content_for_extension(extensions, size): - # Define sensitive data generators - sensitive_data_generators = { - '.txt': lambda: random.choice([ - fake.credit_card_number(), - fake.ssn(), - fake.password(), - fake.email(), - fake.phone_number(), - fake.iban(), - ]), - '.json': lambda: { - 'credit_card': fake.credit_card_number(), - 'email': fake.email(), - 'phone': fake.phone_number(), - 'password': fake.password(), - 'iban': fake.iban(), - }, - '.csv': lambda: ",".join([ - fake.credit_card_number(), - fake.email(), - fake.phone_number(), - ]), - '.xml': lambda: f"{random.choice([fake.credit_card_number(), fake.iban(), fake.password()])}", - '.log': lambda: f"{fake.date_time()} - Sensitive Data: {random.choice([fake.email(), fake.password(), fake.ipv4_private()])}", - 'default': lambda: fake.text(max_nb_chars=50) - } - - # Define sensitivity chances +def generate_content_for_extension(extensions: str, size: int | float) -> tuple[str, str]: + """ + Generate content based on the file extension and size. + + Args: + extensions (str): The file extension. + size (int | float): The size of the content to generate. + + Returns: + tuple[str, str]: The generated content and a suffix indicating the sensitivity level. + """ full_sensitive_chance = float(config.get('full_sensitive_chance', '0.1')) partial_sensitive_chance = float(config.get('partial_sensitive_chance', '0.3')) - def generate_sensitive_data(): + def generate_sensitive_data() -> str: + """ + Generate sensitive data based on the file extension. + + Returns: + str: The generated sensitive data. + """ + sensitive_data_generators = { + '.txt': lambda: random.choice([ + fake.credit_card_number(), + fake.ssn(), + fake.password(), + fake.email(), + fake.phone_number(), + fake.iban(), + ]), + '.json': lambda: { + 'credit_card': fake.credit_card_number(), + 'email': fake.email(), + 'phone': fake.phone_number(), + 'password': fake.password(), + 'iban': fake.iban(), + }, + '.csv': lambda: ",".join([ + fake.credit_card_number(), + fake.email(), + fake.phone_number(), + ]), + '.xml': lambda: f"{random.choice([fake.credit_card_number(), fake.iban(), fake.password()])}", + '.log': lambda: f"{fake.date_time()} - Sensitive Data: {random.choice([fake.email(), fake.password(), fake.ipv4_private()])}", + 'default': lambda: fake.text(max_nb_chars=50) + } + return sensitive_data_generators.get(extensions, sensitive_data_generators['default'])() - def generate_regular_content(extension_grc, sizes): + def generate_regular_content(extension_grc: str, sizes: int | float) -> str: + """ + Generate regular content based on the file extension and size. + + Args: + extension_grc (str): The file extension. + sizes (int | float): The size of the content to generate. + + Returns: + str: The generated regular content. 
+ """ if extension_grc == '.txt': content_grc = fake.text(max_nb_chars=sizes) elif extension_grc == '.json': @@ -111,12 +99,10 @@ def generate_regular_content(extension_grc, sizes): elif extension_grc == '.log': content_grc = "\n".join([f"{fake.date_time()} - {fake.text(50)}" for _ in range(sizes // 100)]) else: - # Default to plain text for unknown extensions content_grc = fake.text(max_nb_chars=sizes) return content_grc if random.random() < full_sensitive_chance: - # Generate fully sensitive content if extensions == '.json': contents = str([generate_sensitive_data() for _ in range(size // 500)]) elif extensions in ['.txt', '.log', '.xml']: @@ -127,12 +113,10 @@ def generate_regular_content(extension_grc, sizes): contents = "\n".join([generate_sensitive_data() for _ in range(size // 500)]) return contents, '-sensitive' else: - # Generate regular content with optional partial sensitivity regular_content = generate_regular_content(extensions, size) if random.random() < partial_sensitive_chance: - sensitive_data_count = max(1, size // 500) # Embed some sensitive data + sensitive_data_count = max(1, size // 500) sensitive_data = [generate_sensitive_data() for _ in range(sensitive_data_count)] - # Blend sensitive data into the regular content regular_content_lines = regular_content.split("\n") for _ in range(sensitive_data_count): insert_position = random.randint(0, len(regular_content_lines) - 1) @@ -144,8 +128,16 @@ def generate_regular_content(extension_grc, sizes): return contents, '-none' -# Function to generate file content -def generate_file_content(extensions): +def generate_file_content(extensions: str) -> tuple[str, str]: + """ + Generate file content based on the file extension. + + Args: + extensions (str): The file extension. + + Returns: + tuple[str, str]: The generated content and a suffix indicating the sensitivity level. + """ size = random.randint(MIN_FILE_SIZE, MAX_FILE_SIZE) if SIZE_VARIATION != 0: variation_choice = random.choice([1, 2, 3, 4]) @@ -161,14 +153,57 @@ def generate_file_content(extensions): return generate_content_for_extension(extensions, size) -# Generate files -for i in range(FILE_NUM): - print(f"Generating file {i + 1}/{FILE_NUM}") - extension = random.choice(EXTENSIONS_ALLOWED).strip() - content, suffix = generate_file_content(extension) - filename = generate_random_filename(extension, suffix) - filepath = os.path.join(SAVE_PATH, filename) - with open(filepath, 'w', encoding='utf-8') as f: - f.write(content) - -print(f"Generated {FILE_NUM} files in {SAVE_PATH}") +if __name__ == "__name__": + """ + Main function to generate files based on the configuration. 
+ """ + fake = Faker() + + config = configparser.ConfigParser() + config.read('../../config.ini') + + config = config['VulnScan.generate Settings'] + EXTENSIONS_ALLOWED = config.get('extensions', '.txt').split(',') + SAVE_PATH = config.get('save_path', '.') + CODE_NAME = config.get('code_name', 'Sense') + SIZE_VARIATION = float(config.get('size_variation', '0.1')) + + os.makedirs(SAVE_PATH, exist_ok=True) + + DEFAULT_FILE_NUM = 10000 + DEFAULT_MIN_FILE_SIZE = 10 * 1024 + DEFAULT_MAX_FILE_SIZE = 10 * 1024 + + if CODE_NAME == 'Sense': + FILE_NUM = DEFAULT_FILE_NUM * 5 + MIN_FILE_SIZE = DEFAULT_MIN_FILE_SIZE * 5 + MAX_FILE_SIZE = DEFAULT_MAX_FILE_SIZE * 5 + elif CODE_NAME == 'SenseNano': + FILE_NUM = 5 + MIN_FILE_SIZE = int(DEFAULT_MIN_FILE_SIZE * 0.5) + MAX_FILE_SIZE = int(DEFAULT_MAX_FILE_SIZE * 0.5) + elif CODE_NAME == 'SenseMacro': + FILE_NUM = DEFAULT_FILE_NUM * 100 + MIN_FILE_SIZE = DEFAULT_MIN_FILE_SIZE + MAX_FILE_SIZE = DEFAULT_MAX_FILE_SIZE + elif CODE_NAME == 'SenseMini': + FILE_NUM = DEFAULT_FILE_NUM + MIN_FILE_SIZE = DEFAULT_MIN_FILE_SIZE + MAX_FILE_SIZE = DEFAULT_MAX_FILE_SIZE + else: + MIN_FILE_SIZE = int(config['min_file_size'].replace('KB', '')) * 1024 + MAX_FILE_SIZE = int(config['max_file_size'].replace('KB', '')) * 1024 + FILE_NUM = DEFAULT_FILE_NUM + + print(f"Generating {FILE_NUM} files with sizes between {MIN_FILE_SIZE} and {MAX_FILE_SIZE} bytes") + + for i in range(FILE_NUM): + print(f"Generating file {i + 1}/{FILE_NUM}") + extension = random.choice(EXTENSIONS_ALLOWED).strip() + content, suffix = generate_file_content(extension) + filename = generate_random_filename(extension, suffix) + filepath = os.path.join(SAVE_PATH, filename) + with open(filepath, 'w', encoding='utf-8') as f: + f.write(content) + + print(f"Generated {FILE_NUM} files in {SAVE_PATH}") diff --git a/CODE/VulnScan/v3/_train.py b/CODE/VulnScan/v3/_train.py index 1600fee6..53c3d0a3 100644 --- a/CODE/VulnScan/v3/_train.py +++ b/CODE/VulnScan/v3/_train.py @@ -344,49 +344,50 @@ def train_model( train_traditional_model(model_name, epochs, save_model_path) -# Config file reading and setting constants -logger.info("Reading config file") -config = ConfigParser() -config.read('../../config.ini') -MODEL_NAME = config.get('VulnScan.train Settings', 'model_name') -TRAINING_PATH = config.get('VulnScan.train Settings', 'train_data_path') -EPOCHS = int(config.get('VulnScan.train Settings', 'epochs')) -BATCH_SIZE = int(config.get('VulnScan.train Settings', 'batch_size')) -LEARN_RATE = float(config.get('VulnScan.train Settings', 'learning_rate')) -CUDA = config.getboolean('VulnScan.train Settings', 'use_cuda') -SAVE_PATH = config.get('VulnScan.train Settings', 'save_model_path') - -# Load Data -logger.info(f"Loading data from {TRAINING_PATH}") -texts, labels = [], [] -for filename in os.listdir(TRAINING_PATH): - with open(os.path.join(config.get('VulnScan.train Settings', 'train_data_path'), filename), 'r', - encoding='utf-8') as file: - texts.append(file.read()) - labels.append(1 if '-sensitive' in filename else 0) - logger.debug(f"Loaded data from {filename} with label {labels[-1]}") - -# Split Data -logger.info("Splitting data into training and validation sets") -X_train, X_val, y_train, y_val = train_test_split(texts, - labels, - test_size=0.2, - random_state=42) - -# Train Model -try: - train_model(model_name=MODEL_NAME, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LEARN_RATE, - save_model_path=SAVE_PATH, - use_cuda=CUDA) -except FileNotFoundError as e: - logger.error(f"File Not Found Error in training 
model: {e}") - exit(1) -except AttributeError as e: - logger.error(f"Attribute Error in training model: {e}") - exit(1) -except Exception as e: - logger.error(f"Error in training model: {e}") - exit(1) +if __name__ == "__main__": + # Config file reading and setting constants + logger.info("Reading config file") + config = ConfigParser() + config.read('../../config.ini') + MODEL_NAME = config.get('VulnScan.train Settings', 'model_name') + TRAINING_PATH = config.get('VulnScan.train Settings', 'train_data_path') + EPOCHS = int(config.get('VulnScan.train Settings', 'epochs')) + BATCH_SIZE = int(config.get('VulnScan.train Settings', 'batch_size')) + LEARN_RATE = float(config.get('VulnScan.train Settings', 'learning_rate')) + CUDA = config.getboolean('VulnScan.train Settings', 'use_cuda') + SAVE_PATH = config.get('VulnScan.train Settings', 'save_model_path') + + # Load Data + logger.info(f"Loading data from {TRAINING_PATH}") + texts, labels = [], [] + for filename in os.listdir(TRAINING_PATH): + with open(os.path.join(config.get('VulnScan.train Settings', 'train_data_path'), filename), 'r', + encoding='utf-8') as file: + texts.append(file.read()) + labels.append(1 if '-sensitive' in filename else 0) + logger.debug(f"Loaded data from {filename} with label {labels[-1]}") + + # Split Data + logger.info("Splitting data into training and validation sets") + X_train, X_val, y_train, y_val = train_test_split(texts, + labels, + test_size=0.2, + random_state=42) + + # Train Model + try: + train_model(model_name=MODEL_NAME, + epochs=EPOCHS, + batch_size=BATCH_SIZE, + learning_rate=LEARN_RATE, + save_model_path=SAVE_PATH, + use_cuda=CUDA) + except FileNotFoundError as e: + logger.error(f"File Not Found Error in training model: {e}") + exit(1) + except AttributeError as e: + logger.error(f"Attribute Error in training model: {e}") + exit(1) + except Exception as e: + logger.error(f"Error in training model: {e}") + exit(1) From f409b27dcd43d39a693584cc7e8ce53513018aea Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 10:59:10 +0400 Subject: [PATCH 13/20] Made file non-importable --- CODE/VulnScan/tools/_study_network.py | 7 ++----- CODE/VulnScan/tools/_test_gpu_acceleration.py | 2 ++ CODE/VulnScan/tools/_vectorizer.py | 2 ++ CODE/VulnScan/v2-deprecated/_generate_data.py | 5 ++++- CODE/VulnScan/v2-deprecated/_train.py | 2 ++ CODE/VulnScan/v3/_generate_data.py | 2 ++ CODE/VulnScan/v3/_train.py | 2 ++ 7 files changed, 16 insertions(+), 6 deletions(-) diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index c52ba622..69f823ab 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -26,11 +26,6 @@ from tqdm import tqdm -# TODO Do v3.1 plans -# Raise an ImportError to make the file unimportable -# raise ImportError("This file cannot be imported") - - # Example of DataLoader for loss landscape (dummy dataset for visualization) class DummyDataset(torch.utils.data.Dataset): """ @@ -624,3 +619,5 @@ def model_summary(): model_summary() main_plot() +else: + raise ImportError("This file cannot be imported") diff --git a/CODE/VulnScan/tools/_test_gpu_acceleration.py b/CODE/VulnScan/tools/_test_gpu_acceleration.py index e45d05c8..a7e47134 100644 --- a/CODE/VulnScan/tools/_test_gpu_acceleration.py +++ b/CODE/VulnScan/tools/_test_gpu_acceleration.py @@ -23,3 +23,5 @@ def check_gpu(): if __name__ == '__main__': check_gpu() +else: + raise ImportError("This file cannot be imported") diff --git a/CODE/VulnScan/tools/_vectorizer.py 
b/CODE/VulnScan/tools/_vectorizer.py index 63577fa8..5d316de9 100644 --- a/CODE/VulnScan/tools/_vectorizer.py +++ b/CODE/VulnScan/tools/_vectorizer.py @@ -80,3 +80,5 @@ def main(data_paths: str, vectorizer_types: str, output_paths: str): if not os.path.exists(output_path): os.makedirs(output_path) main(data_path, vectorizer_type, output_path) +else: + raise ImportError("This file cannot be imported") diff --git a/CODE/VulnScan/v2-deprecated/_generate_data.py b/CODE/VulnScan/v2-deprecated/_generate_data.py index 78722f47..62fc91e5 100644 --- a/CODE/VulnScan/v2-deprecated/_generate_data.py +++ b/CODE/VulnScan/v2-deprecated/_generate_data.py @@ -103,4 +103,7 @@ def create_random_files(directories: str, num_file: int = 100): print(f"Created {file_type} file: {file_name}") -create_random_files(SAVE_DIRECTORY, num_file=1000000) +if __name__ == "__main__": + create_random_files(SAVE_DIRECTORY, num_file=1000000) +else: + raise ImportError("This file cannot be imported") diff --git a/CODE/VulnScan/v2-deprecated/_train.py b/CODE/VulnScan/v2-deprecated/_train.py index f25d4152..9fc51517 100644 --- a/CODE/VulnScan/v2-deprecated/_train.py +++ b/CODE/VulnScan/v2-deprecated/_train.py @@ -544,3 +544,5 @@ def train_rfc(SAVE_DIR: str, EPOCHS: int, TEST_SIZE: float | int, train_model_blx(MODEL_TYPE="bert", SAVE_DIR=r"C:\Users\Hp\Desktop\Model Tests\Model Sense .2b1", EPOCHS=5, BATCH_SIZE=8, LEARNING_RATE=5e-5, MAX_FEATURES=5000, MAX_LEN=128, TEST_SIZE=0.2, RANDOM_STATE=42, MODEL_PATH_BERT="../bert-base-uncased-model") +else: + raise ImportError("This file cannot be imported") diff --git a/CODE/VulnScan/v3/_generate_data.py b/CODE/VulnScan/v3/_generate_data.py index 161ee97f..e1f0d0c8 100644 --- a/CODE/VulnScan/v3/_generate_data.py +++ b/CODE/VulnScan/v3/_generate_data.py @@ -207,3 +207,5 @@ def generate_file_content(extensions: str) -> tuple[str, str]: f.write(content) print(f"Generated {FILE_NUM} files in {SAVE_PATH}") +else: + raise ImportError("This file cannot be imported") diff --git a/CODE/VulnScan/v3/_train.py b/CODE/VulnScan/v3/_train.py index 53c3d0a3..586bd5fb 100644 --- a/CODE/VulnScan/v3/_train.py +++ b/CODE/VulnScan/v3/_train.py @@ -391,3 +391,5 @@ def train_model( except Exception as e: logger.error(f"Error in training model: {e}") exit(1) +else: + raise ImportError("This file cannot be imported") From ae35a70384fd8fa4f6c93609b2f16bfac8a2c826 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 11:14:56 +0400 Subject: [PATCH 14/20] Documentation update --- CODE/VulnScan/Documentation.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/CODE/VulnScan/Documentation.md b/CODE/VulnScan/Documentation.md index 4b750fda..7b0f5dc9 100644 --- a/CODE/VulnScan/Documentation.md +++ b/CODE/VulnScan/Documentation.md @@ -107,3 +107,32 @@ VulnScan is designed to detect sensitive data across various file formats. It of - **Progress Tracking**: Visualizes accuracy and loss per epoch with graphs. - **Error Handling**: Logs errors for missing files, attribute issues, or unexpected conditions. - **Extensibility**: Supports plug-and-play integration for new algorithms or datasets. + + +# More files + +There is a repository that archived all the data used to make the model, +as well as previously trained models for you to test out +(loading scripts and vectorizers are not included). + +The repository is located [here](https://github.com/DefinetlyNotAI/VulnScan_TrainingData). 
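+
+Since the loading scripts are not included, here is a minimal loading sketch (assuming a fully pickled `.pth` model and a joblib-serialized vectorizer, which is what `vulnscan.py` expects; the file names below are placeholders for whichever archived model you download):
+
+```python
+import joblib
+import torch
+
+# Placeholder paths - substitute the archived model/vectorizer you downloaded
+model = torch.load("Model.pth", weights_only=False)  # full pickled module, not just a state dict
+vectorizer = joblib.load("Vectorizer.pkl")
+
+model.eval()
+features = vectorizer.transform(["text to classify"])
+features_tensor = torch.tensor(features.toarray(), dtype=torch.float32)
+with torch.no_grad():
+    probability = torch.softmax(model(features_tensor), dim=1).max().item()
+print(f"Sensitivity probability: {probability:.2f}")
+```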
+
+The repository contains the following directories:
+- `Training Data`: Contains the data used to train the models. It is organized by file size and file count, except for the test sets, which are explicitly labelled as text.
+- `Archived Models`: Contains the previously trained models, organized by model type and then version.
+- `NN features`: Contains information about the model `.3n3` and the vectorizer used. The information includes:
+  - `Documentation_Study_Network.md`: A markdown file that contains more info.
+  - `Neural Network Nodes Graph.gexf`: A Gephi file that contains the model nodes and edges.
+  - `Nodes and edges (GEPHI).csv`: A CSV file that contains the model nodes and edges.
+  - `Statistics`: Directories made by Gephi, containing the statistics of the model nodes and edges.
+  - `Feature_Importance.svg`: An SVG file that contains the feature importance of the model.
+  - `Loss_Landscape_3D.html`: An HTML file that contains the 3D loss landscape of the model.
+  - `Model Accuracy Over Epochs.png` and `Model Loss Over Epochs.png`: PNG files that contain the model accuracy and loss over epochs.
+  - `Model state dictionary.txt`: A text file that contains the model state dictionary.
+  - `Model Summary.txt`: A text file that contains the model summary.
+  - `Model Visualization.png`: A PNG file that contains the model visualization.
+  - `Top_90_Features.svg`: An SVG file that contains the top 90 features of the model.
+  - `Vectorizer features.txt`: A text file that contains the vectorizer features.
+  - `Visualize Activation.png`: A PNG file that contains the visualization of the model activation.
+  - `Visualize t-SNE.png`: A PNG file that contains the visualization of the model t-SNE.
+  - `Weight Distribution.png`: A PNG file that contains the weight distribution of the model.

From 2843e884a36133a62b525b67465e1c3b11374b8b Mon Sep 17 00:00:00 2001
From: DefinetlyNotAI
Date: Wed, 11 Dec 2024 11:21:34 +0400
Subject: [PATCH 15/20] PLANS.md update

1. Changed the deprecation versions
2. Removed an old plan made in v3.0.0 or v3.1.0

---
 CODE/VulnScan/v2-deprecated/_train.py | 22 +++++++++++-----------
 PLANS.md                              | 20 +++++++++++---------
 2 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/CODE/VulnScan/v2-deprecated/_train.py b/CODE/VulnScan/v2-deprecated/_train.py
index 9fc51517..7940b1c3 100644
--- a/CODE/VulnScan/v2-deprecated/_train.py
+++ b/CODE/VulnScan/v2-deprecated/_train.py
@@ -40,7 +40,7 @@
 # ---------------------------------------
 
-@deprecated(reason="This function is used to load data from a directory. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0")
+@deprecated(reason="This function is used to load data from a directory. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0")
 def load_data(data_dir: str) -> tuple[list[str], np.ndarray]:
     """
     Loads text data and labels from the directory.
@@ -61,7 +61,7 @@ def load_data(data_dir: str) -> tuple[list[str], np.ndarray]:
     return texts, np.array(labels)
 
-@deprecated(reason="This function is used to evaluate a model. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0")
+@deprecated(reason="This function is used to evaluate a model. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0")
 def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[float, float, float, float, float]:
     """
     Evaluates the model using standard metrics.
@@ -88,7 +88,7 @@ def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[float, float # --------------------------------------- -@deprecated(reason="This function is used to save progress graphs. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") +@deprecated(reason="This function is used to save progress graphs. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0") def save_progress_graph(accuracies: list[float], filename: str = "training_progress.png"): """ Saves a graph of training progress. @@ -108,7 +108,7 @@ def save_progress_graph(accuracies: list[float], filename: str = "training_progr plt.close() -@deprecated(reason="This function is used to train xgboost. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") +@deprecated(reason="This function is used to train xgboost. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0") def train_xgboost(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, SAVE_DIR: str): """ @@ -133,7 +133,7 @@ def train_xgboost(X_train: np.ndarray, X_test: np.ndarray, logging.info("Model saved as xgboost_model.pkl") -@deprecated(reason="This function is used to train bert. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") +@deprecated(reason="This function is used to train bert. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0") def train_bert(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, MAX_LEN: int, LEARNING_RATE: float, BATCH_SIZE: int, EPOCHS: int, SAVE_DIR: str, MODEL_PATH: str): @@ -201,7 +201,7 @@ def train_bert(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, class LSTMModel(nn.Module): - @deprecated(reason="This class is used to define an LSTM model. Its for training v2 models, which is now deprecated, use _train.py v3 instead.", removal_version="3.3.0") + @deprecated(reason="This class is used to define an LSTM model. Its for training v2 models, which is now deprecated, use _train.py v3 instead.", removal_version="3.2.0") def __init__(self, vocab_size: int, embedding_dim: int = 128, hidden_dim: int = 128, output_dim: int = 1): """ Initializes the LSTM model. @@ -218,7 +218,7 @@ def __init__(self, vocab_size: int, embedding_dim: int = 128, hidden_dim: int = self.fc = nn.Linear(hidden_dim * 2, output_dim) # Bidirectional, so multiply by 2 self.sigmoid = nn.Sigmoid() - @deprecated(reason="This class is used to define an LSTM model. Its for training v2 models, which is now deprecated, use _train.py v3 instead.", removal_version="3.3.0") + @deprecated(reason="This class is used to define an LSTM model. Its for training v2 models, which is now deprecated, use _train.py v3 instead.", removal_version="3.2.0") def forward(self, x: torch.Tensor) -> torch.Tensor: """ Defines the forward pass of the LSTM model. @@ -236,7 +236,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -@deprecated(reason="This function is used to train lstm. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") +@deprecated(reason="This function is used to train lstm. 
Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0") def train_lstm(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, MAX_FEATURES: int, LEARNING_RATE: float, BATCH_SIZE: int, EPOCHS: int, SAVE_DIR: str): @@ -312,7 +312,7 @@ def train_lstm(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, # --------------------------------------- # noinspection DuplicatedCode -@deprecated(reason="This function is used to train NeuralNetworks/SVM. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") +@deprecated(reason="This function is used to train NeuralNetworks/SVM. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0") def train_nn_svm(MODEL: str, EPOCHS: int, SAVE_DIR: str, MAX_FEATURES: int, TEST_SIZE: float | int, MAX_ITER: int, RANDOM_STATE: int): @@ -413,7 +413,7 @@ def train_nn_svm(MODEL: str, EPOCHS: int, SAVE_DIR: str, logging.info("Training complete.") -@deprecated(reason="This function is used setup training. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") +@deprecated(reason="This function is used setup training. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0") def train_model_blx(MODEL_TYPE: str, SAVE_DIR: str, EPOCHS: int, BATCH_SIZE: int, LEARNING_RATE: float, MAX_FEATURES: int, MAX_LEN: int, TEST_SIZE: float | int, RANDOM_STATE: int, MODEL_PATH_BERT: str = None): @@ -462,7 +462,7 @@ def train_model_blx(MODEL_TYPE: str, SAVE_DIR: str, EPOCHS: int, BATCH_SIZE: int # noinspection DuplicatedCode -@deprecated(reason="This function is used to train RandomForest. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") +@deprecated(reason="This function is used to train RandomForest. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0") def train_rfc(SAVE_DIR: str, EPOCHS: int, TEST_SIZE: float | int, N_ESTIMATORS: int, RANDOM_STATE: int): """ diff --git a/PLANS.md b/PLANS.md index 39cc6f5d..e567ff5c 100644 --- a/PLANS.md +++ b/PLANS.md @@ -5,12 +5,14 @@ > - ❌ = Might be done, Not sure yet > - ✅ = Will be done, 100% sure -| Task | Version | Might or Will be done? | -|---------------------------------------------------------------------------------------------------------------------------------|---------|------------------------| -| Add a tool to capture and analyse memory dumps, which can help in forensic investigations. | v3.1.0 | ❌ | -| Add a tool to capture and analyse network traffic, which can help in forensic investigations. | v3.1.0 | ❌ | -| Remove EXTRA dir, and zip features with custom proper features from Logicytics, as well as remove EXTRA wrapper | v3.2.0 | ❌ | -| Implement a parser for Windows Prefetch files, Shellbags, Jump Lists, LNK files to extract data | v3.3a.0 | ✅ | -| Implement a parser for Windows UserAssist registry key, SRUM database to extract data. | v3.3b.0 | ✅ | -| Implement a parser for Windows Volume Shadow Copy, LSA Secrets, Syscache, Shimcache, Amcache Event Tracing logs to extract data | v3.3c.0 | ✅ | -| Implement the 2 missing flags | v3.4.0 | ✅ | +| Task | Version | Might or Will be done? 
| +|---------------------------------------------------------------------------------------------------------------------------------|----------------|------------------------| +| Add a tool to capture and analyse memory dumps, which can help in forensic investigations. | v3.1.0 | ❌ | +| Remove EXTRA dir, and zip features with custom proper features from Logicytics, as well as remove EXTRA wrapper | v3.2.0 | ❌ | +| Remove deprecated feature: `_train.py` | v3.2.0 | ❌ | +| Implement a parser for Windows Prefetch files, Shellbags, Jump Lists, LNK files to extract data | snapshot-3.3.a | ✅ | +| Implement a parser for Windows UserAssist registry key, SRUM database to extract data. | snapshot-3.3.b | ✅ | +| Implement a parser for Windows Volume Shadow Copy, LSA Secrets, Syscache, Shimcache, Amcache Event Tracing logs to extract data | snapshot-3.3.c | ✅ | +| Implement the 2 missing flags | v3.4.0 | ✅ | +| Remove deprecated feature: `_generate_data.py` | v3.4.0 | ✅ | +| Move VulnScan tools and v3 module to separate repository, keep only the model and vectorizer | v3.5.0 | ✅ | From 3d2a82c5eb2aa5162b18dcd0bb89da51edcf7903 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 13:18:23 +0400 Subject: [PATCH 16/20] Made dump_memory.py Fixed minor bug, and added dump_memory.py to Logicytics.py, also made dump_memory.py which generates around 3 files with data from the system's RAM, 1 is in HEX aka unreadable --- CODE/Logicytics.py | 6 +- CODE/dump_memory.py | 163 ++++++++++++++++++++++++++++++++++++++++++++ PLANS.md | 1 - 3 files changed, 166 insertions(+), 4 deletions(-) create mode 100644 CODE/dump_memory.py diff --git a/CODE/Logicytics.py b/CODE/Logicytics.py index a5f5583a..db37e416 100644 --- a/CODE/Logicytics.py +++ b/CODE/Logicytics.py @@ -16,11 +16,12 @@ # Initialization FileManagement.mkdir() log = Log({"log_level": DEBUG, "delete_log": DELETE_LOGS}) +ACTION = None +SUB_ACTION = None class Health: @staticmethod - @log.function def backup(directory: str, name: str): """ Creates a backup of a specified directory by zipping its contents and moving it to a designated backup location. @@ -47,7 +48,6 @@ def backup(directory: str, name: str): shutil.move(f"{name}.zip", "../ACCESS/BACKUP") @staticmethod - @log.function def update() -> tuple[str, str]: """ Updates the repository by pulling the latest changes from the remote repository. 
@@ -327,7 +327,7 @@ def zip_generated_files():
     """Zips generated files based on the action."""
 
     def zip_and_log(directory, name):
-        zip_values = FileManagement.Zip.and_hash(directory, name, ACTION)
+        zip_values = FileManagement.Zip.and_hash(directory, name, ACTION if not None else "ERROR_NULL_ACTION_VALUE")
         if isinstance(zip_values, str):
             log.error(zip_values)
         else:
diff --git a/CODE/dump_memory.py b/CODE/dump_memory.py
new file mode 100644
index 00000000..3df3b21e
--- /dev/null
+++ b/CODE/dump_memory.py
@@ -0,0 +1,163 @@
+import datetime
+import platform
+import ctypes
+import os
+import psutil
+from logicytics import Log, DEBUG
+
+if __name__ == "__main__":
+    log = Log({"log_level": DEBUG})
+
+
+# Function to save RAM content snapshot to a file
+def dump_ram_content():
+    try:
+        # Output file for the RAM snapshot (the timestamp is embedded in the file contents)
+        dump_file = "Ram_Snapshot.txt"
+
+        # Gather memory statistics using psutil
+        memory_info = psutil.virtual_memory()
+        swap_info = psutil.swap_memory()
+
+        # Get system-specific details
+        system_info = (
+            "System Information:\n"
+            "===================================\n"
+            f"OS: {platform.system()} {platform.release()}\n"
+            f"Architecture: {platform.architecture()[0]}\n"
+            f"Processor: {platform.processor()}\n"
+            f"Machine: {platform.machine()}\n\n"
+        )
+
+        # Prepare content to dump
+        dump_content = (
+            f"RAM Snapshot - {datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}\n"
+            "===================================\n"
+            f"{system_info}"
+            f"Total Memory: {memory_info.total / (1024 ** 3):.2f} GB\n"
+            f"Available Memory: {memory_info.available / (1024 ** 3):.2f} GB\n"
+            f"Used Memory: {memory_info.used / (1024 ** 3):.2f} GB\n"
+            f"Memory Usage: {memory_info.percent}%\n\n"
+            f"Swap Total: {swap_info.total / (1024 ** 3):.2f} GB\n"
+            f"Swap Used: {swap_info.used / (1024 ** 3):.2f} GB\n"
+            f"Swap Free: {swap_info.free / (1024 ** 3):.2f} GB\n"
+            f"Swap Usage: {swap_info.percent}%\n"
+        )
+
+        # Write the content to the file
+        with open(dump_file, "w", encoding="utf-8") as file:
+            file.write(dump_content)
+
+        log.info(f"RAM snapshot saved to: {dump_file}")
+
+    except Exception as e:
+        log.error(f"Error capturing RAM snapshot: {e}")
+
+
+# Define structures for SystemInfo
+class SystemInfo(ctypes.Structure):
+    _fields_ = [
+        ("wProcessorArchitecture", ctypes.c_ushort),
+        ("wReserved", ctypes.c_ushort),
+        ("dwPageSize", ctypes.c_ulong),
+        ("lpMinimumApplicationAddress", ctypes.c_void_p),
+        ("lpMaximumApplicationAddress", ctypes.c_void_p),
+        ("dwActiveProcessorMask", ctypes.POINTER(ctypes.c_ulong)),
+        ("dwNumberOfProcessors", ctypes.c_ulong),
+        ("dwProcessorType", ctypes.c_ulong),
+        ("dwAllocationGranularity", ctypes.c_ulong),
+        ("wProcessorLevel", ctypes.c_ushort),
+        ("wProcessorRevision", ctypes.c_ushort),
+    ]
+
+
+# Define BasicMemInfo
+class BasicMemInfo(ctypes.Structure):
+    _fields_ = [
+        ("BaseAddress", ctypes.c_void_p),
+        ("AllocationBase", ctypes.c_void_p),
+        ("AllocationProtect", ctypes.c_ulong),
+        ("RegionSize", ctypes.c_size_t),
+        ("State", ctypes.c_ulong),
+        ("Protect", ctypes.c_ulong),
+        ("Type", ctypes.c_ulong),
+    ]
+
+
+def get_system_info() -> SystemInfo:
+    system_info = SystemInfo()
+    ctypes.windll.kernel32.GetSystemInfo(ctypes.byref(system_info))
+    return system_info
+
+
+def read_memory():
+    # Open current process with permissions
+    process = ctypes.windll.kernel32.OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, False, os.getpid())
+    if not process:
+        log.error("Unable to open process for reading.")
+        return
+
+    # Get system info
+    system_info = get_system_info()
+    min_address = system_info.lpMinimumApplicationAddress
+    max_address = system_info.lpMaximumApplicationAddress
+    with open("SystemRam_Info.txt", "w") as sys_file:
+        sys_file.write(f"System Information:\n")
+        sys_file.write("===================================\n")
+        sys_file.write(f"Minimum Address: {min_address}\n")
+        sys_file.write(f"Maximum Address: {max_address}\n")
+        sys_file.write(f"Allocation Granularity: {system_info.dwAllocationGranularity}\n")
+        sys_file.write(f"Processor Architecture: {system_info.wProcessorArchitecture}\n")
+        sys_file.write(f"Number of Processors: {system_info.dwNumberOfProcessors}\n")
+        sys_file.write(f"Processor Type: {system_info.dwProcessorType}\n")
+        sys_file.write(f"Processor Level: {system_info.wProcessorLevel}\n")
+        sys_file.write(f"Processor Revision: {system_info.wProcessorRevision}\n")
+        sys_file.write(f"Page Size: {system_info.dwPageSize}\n")
+        sys_file.write(f"Active Processor Mask: {system_info.dwActiveProcessorMask.contents}\n")
+        sys_file.write(f"Reserved: {system_info.wReserved}\n")
+        sys_file.write("===================================\n")
+        sys_file.write(f"Raw SystemInfo: {system_info}\n")
+        sys_file.write("===================================\n")
+    log.debug(f"Memory Range: {min_address:#x} - {max_address:#x}")
+
+    # Iterate through memory pages
+    memory_info = BasicMemInfo()
+    address = min_address
+    with open("Ram_Dump.txt", "w") as dump_file:
+        while address < max_address:
+            result = ctypes.windll.kernel32.VirtualQueryEx(
+                process, ctypes.c_void_p(address), ctypes.byref(memory_info), ctypes.sizeof(memory_info)
+            )
+            if not result:
+                break
+
+            # Check if the memory is committed and readable
+            if memory_info.State == MEM_COMMIT and memory_info.Protect == PAGE_READWRITE:
+                buffer = ctypes.create_string_buffer(memory_info.RegionSize)
+                bytes_read = ctypes.c_size_t()
+                ctypes.windll.kernel32.ReadProcessMemory(
+                    process,
+                    ctypes.c_void_p(memory_info.BaseAddress),
+                    buffer,
+                    memory_info.RegionSize,
+                    ctypes.byref(bytes_read),
+                )
+                dump_file.write(str(buffer.raw[: bytes_read.value]))
+
+            address += memory_info.RegionSize
+
+    # Close the process handle
+    ctypes.windll.kernel32.CloseHandle(process)
+    log.info("Memory dump complete. Saved to 'Ram_Dump.txt'.")
+    log.warning("Encoding is in HEX")
+
+
+if __name__ == "__main__":
+    # Constants
+    PROCESS_QUERY_INFORMATION = 0x0400
+    PROCESS_VM_READ = 0x0010
+    MEM_COMMIT = 0x1000
+    PAGE_READWRITE = 0x04
+
+    dump_ram_content()
+    read_memory()
diff --git a/PLANS.md b/PLANS.md
index e567ff5c..cf9a1dbb 100644
--- a/PLANS.md
+++ b/PLANS.md
@@ -7,7 +7,6 @@
 | Task                                                                                                                              | Version        | Might or Will be done? |
 |---------------------------------------------------------------------------------------------------------------------------------|----------------|------------------------|
-| Add a tool to capture and analyse memory dumps, which can help in forensic investigations.
| v3.1.0 | ❌ | | Remove EXTRA dir, and zip features with custom proper features from Logicytics, as well as remove EXTRA wrapper | v3.2.0 | ❌ | | Remove deprecated feature: `_train.py` | v3.2.0 | ❌ | | Implement a parser for Windows Prefetch files, Shellbags, Jump Lists, LNK files to extract data | snapshot-3.3.a | ✅ | From e79086ccee5d3f8c0bd5ee6584c05ff959efb3b2 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 14:05:36 +0400 Subject: [PATCH 17/20] CodeRabbit Suggestions --- CODE/Logicytics.py | 10 +++++++--- CODE/VulnScan/v3/_generate_data.py | 20 ++++++++++++++++---- CODE/VulnScan/v3/_train.py | 29 +++++++++++++++++++++++++++++ CODE/dump_memory.py | 2 +- 4 files changed, 53 insertions(+), 8 deletions(-) diff --git a/CODE/Logicytics.py b/CODE/Logicytics.py index db37e416..381becb6 100644 --- a/CODE/Logicytics.py +++ b/CODE/Logicytics.py @@ -325,9 +325,13 @@ def threaded_execution(execution_list_thread, index_thread): def zip_generated_files(): """Zips generated files based on the action.""" - - def zip_and_log(directory, name): - zip_values = FileManagement.Zip.and_hash(directory, name, ACTION if not None else "ERROR_NULL_ACTION_VALUE") + def zip_and_log(directory: str, name: str): + log.debug(f"Zipping directory '{directory}' with name '{name}' under action '{ACTION}'") + zip_values = FileManagement.Zip.and_hash( + directory, + name, + ACTION if ACTION is not None else f"ERROR_NO_ACTION_SPECIFIED_{datetime.now().isoformat()}" + ) if isinstance(zip_values, str): log.error(zip_values) else: diff --git a/CODE/VulnScan/v3/_generate_data.py b/CODE/VulnScan/v3/_generate_data.py index e1f0d0c8..70f642c9 100644 --- a/CODE/VulnScan/v3/_generate_data.py +++ b/CODE/VulnScan/v3/_generate_data.py @@ -4,9 +4,19 @@ import random import string import configparser +from Logicytics import Log, DEBUG from faker import Faker +logger = Log( + {"log_level": DEBUG, + "filename": "../../../ACCESS/LOGS/VulnScan_Train.log", + "colorlog_fmt_parameters": + "%(log_color)s%(levelname)-8s%(reset)s %(yellow)s%(asctime)s %(blue)s%(message)s", + } +) + + def generate_random_filename(extensions: str, suffix_x: str = '') -> str: """ Generate a random filename with the given extension and optional suffix. 
@@ -149,7 +159,7 @@ def generate_file_content(extensions: str) -> tuple[str, str]: size = abs(int(size + (size / SIZE_VARIATION))) elif variation_choice == 4: size = abs(int(size - (size / SIZE_VARIATION))) - print(f"Generating {extensions} content of size {size} bytes") + logger.debug(f"Generating {extensions} content of size {size} bytes") return generate_content_for_extension(extensions, size) @@ -183,6 +193,8 @@ def generate_file_content(extensions: str) -> tuple[str, str]: MIN_FILE_SIZE = int(DEFAULT_MIN_FILE_SIZE * 0.5) MAX_FILE_SIZE = int(DEFAULT_MAX_FILE_SIZE * 0.5) elif CODE_NAME == 'SenseMacro': + logger.warning("Generating 100 times more files and 100 times larger files") + logger.warning("This is being deprecated in version 3.2.0") FILE_NUM = DEFAULT_FILE_NUM * 100 MIN_FILE_SIZE = DEFAULT_MIN_FILE_SIZE MAX_FILE_SIZE = DEFAULT_MAX_FILE_SIZE @@ -195,10 +207,10 @@ def generate_file_content(extensions: str) -> tuple[str, str]: MAX_FILE_SIZE = int(config['max_file_size'].replace('KB', '')) * 1024 FILE_NUM = DEFAULT_FILE_NUM - print(f"Generating {FILE_NUM} files with sizes between {MIN_FILE_SIZE} and {MAX_FILE_SIZE} bytes") + logger.info(f"Generating {FILE_NUM} files with sizes between {MIN_FILE_SIZE} and {MAX_FILE_SIZE} bytes") for i in range(FILE_NUM): - print(f"Generating file {i + 1}/{FILE_NUM}") + logger.debug(f"Generating file {i + 1}/{FILE_NUM}") extension = random.choice(EXTENSIONS_ALLOWED).strip() content, suffix = generate_file_content(extension) filename = generate_random_filename(extension, suffix) @@ -206,6 +218,6 @@ def generate_file_content(extensions: str) -> tuple[str, str]: with open(filepath, 'w', encoding='utf-8') as f: f.write(content) - print(f"Generated {FILE_NUM} files in {SAVE_PATH}") + logger.info(f"Generated {FILE_NUM} files in {SAVE_PATH}") else: raise ImportError("This file cannot be imported") diff --git a/CODE/VulnScan/v3/_train.py b/CODE/VulnScan/v3/_train.py index 586bd5fb..55fe845f 100644 --- a/CODE/VulnScan/v3/_train.py +++ b/CODE/VulnScan/v3/_train.py @@ -344,11 +344,38 @@ def train_model( train_traditional_model(model_name, epochs, save_model_path) +def validate_data(): + """ + Validates the data by checking if the variables are of the correct type. 
+ """ + if not isinstance(EPOCHS, int) and EPOCHS > 0: + logger.error("EPOCHS must be an integer") + exit(1) + if not isinstance(BATCH_SIZE, int) and BATCH_SIZE > 0: + logger.error("BATCH_SIZE must be an integer") + exit(1) + if not isinstance(LEARN_RATE, float) and 0 < LEARN_RATE < 1: + logger.error("LEARN_RATE must be a float") + exit(1) + if not isinstance(CUDA, bool): + logger.error("CUDA must be a boolean") + exit(1) + allowed_models = ["NeuralNetwork", "LogReg", + "RandomForest", "ExtraTrees", "GBM", + "XGBoost", "DecisionTree", "NaiveBayes"] + if MODEL_NAME not in allowed_models: + logger.error('MODEL_NAME must be one of the following: ' + '"NeuralNetwork", "LogReg", "RandomForest", ' + '"ExtraTrees", "GBM","XGBoost", "DecisionTree", "NaiveBayes"') + exit(1) + + if __name__ == "__main__": # Config file reading and setting constants logger.info("Reading config file") config = ConfigParser() config.read('../../config.ini') + MODEL_NAME = config.get('VulnScan.train Settings', 'model_name') TRAINING_PATH = config.get('VulnScan.train Settings', 'train_data_path') EPOCHS = int(config.get('VulnScan.train Settings', 'epochs')) @@ -357,6 +384,8 @@ def train_model( CUDA = config.getboolean('VulnScan.train Settings', 'use_cuda') SAVE_PATH = config.get('VulnScan.train Settings', 'save_model_path') + validate_data() + # Load Data logger.info(f"Loading data from {TRAINING_PATH}") texts, labels = [], [] diff --git a/CODE/dump_memory.py b/CODE/dump_memory.py index 3df3b21e..db2ab7db 100644 --- a/CODE/dump_memory.py +++ b/CODE/dump_memory.py @@ -102,7 +102,7 @@ def read_memory(): min_address = system_info.lpMinimumApplicationAddress max_address = system_info.lpMaximumApplicationAddress with open("SystemRam_Info.txt", "w") as sys_file: - sys_file.write(f"System Information:\n") + sys_file.write("System Information:\n") sys_file.write("===================================\n") sys_file.write(f"Minimum Address: {min_address}\n") sys_file.write(f"Maximum Address: {max_address}\n") From 6ab61b51e613bd5ad7c1ab47f327f77c014c5298 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 14:19:46 +0400 Subject: [PATCH 18/20] Did --dev and fixed bug with it Keeps mistaking files added and removed due to trailing `"` --- CODE/_dev.py | 8 ++++---- CODE/config.ini | 4 ++-- README.md | 41 +++++++++++++++++++++-------------------- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/CODE/_dev.py b/CODE/_dev.py index 47687b6f..18755ea1 100644 --- a/CODE/_dev.py +++ b/CODE/_dev.py @@ -66,7 +66,7 @@ def dev_checks() -> None: Performs a series of checks to ensure that the developer has followed the required guidelines and best practices. Returns: bool: True if all checks pass, otherwise False. 
- """ + """ # Create the necessary directories if they do not exist FileManagement.mkdir() @@ -88,9 +88,9 @@ def dev_checks() -> None: # Get the list of code files in the current directory files = Get.list_of_code_files(".") - added_files = [f for f in files if f not in CURRENT_FILES] - removed_files = [f for f in CURRENT_FILES if f not in files] - normal_files = [f for f in files if f in CURRENT_FILES] + added_files = [f.replace('"', '') for f in files if f not in CURRENT_FILES] + removed_files = [f.replace('"', '') for f in CURRENT_FILES if f not in files] + normal_files = [f.replace('"', '') for f in files if f in CURRENT_FILES] # Print the list of added, removed, and normal files in color print("\n".join([f"\033[92m+ {file}\033[0m" for file in added_files])) # Green + diff --git a/CODE/config.ini b/CODE/config.ini index 24130933..be985f35 100644 --- a/CODE/config.ini +++ b/CODE/config.ini @@ -9,8 +9,8 @@ delete_old_logs = false [System Settings] # Do not play with these settings unless you know what you are doing -version = 3.0.0 -files = "browser_miner.ps1, cmd_commands.py, dir_list.py, event_log.py, Logicytics.py, log_miner.py, media_backup.py, netadapter.ps1, packet_sniffer.py, property_scraper.ps1, registry.py, sensitive_data_miner.py, ssh_miner.py, sys_internal.py, tasklist.py, tree.ps1, vulnscan.py, wifi_stealer.py, window_feature_miner.ps1, wmic.py, _debug.py, _dev.py, _extra.py, logicytics\Checks.py, logicytics\Execute.py, logicytics\FileManagement.py, logicytics\Flag.py, logicytics\Get.py, logicytics\Logger.py, logicytics\__init__.py, VulnScan\tools\_test_gpu_acceleration.py, VulnScan\tools\_vectorizer.py, VulnScan\v2-deprecated\_generate_data.py, VulnScan\v2-deprecated\_train.py, VulnScan\v3\_generate_data.py, VulnScan\v3\_train.py" +version = 3.1.0 +files = "browser_miner.ps1, cmd_commands.py, dir_list.py, dump_memory.py, event_log.py, Logicytics.py, log_miner.py, media_backup.py, netadapter.ps1, packet_sniffer.py, property_scraper.ps1, registry.py, sensitive_data_miner.py, ssh_miner.py, sys_internal.py, tasklist.py, tree.ps1, vulnscan.py, wifi_stealer.py, window_feature_miner.ps1, wmic.py, _debug.py, _dev.py, _extra.py, logicytics\Checks.py, logicytics\Execute.py, logicytics\FileManagement.py, logicytics\Flag.py, logicytics\Get.py, logicytics\Logger.py, logicytics\__init__.py, VulnScan\tools\_study_network.py, VulnScan\tools\_test_gpu_acceleration.py, VulnScan\tools\_vectorizer.py, VulnScan\v2-deprecated\_generate_data.py, VulnScan\v2-deprecated\_train.py, VulnScan\v3\_generate_data.py, VulnScan\v3\_train.py" ################################################### # The following settings are for specific modules # diff --git a/README.md b/README.md index d1597acc..a25fa039 100644 --- a/README.md +++ b/README.md @@ -282,26 +282,27 @@ Here are some of the data points that Logicytics extracts: > [!TIP] > You can check out future plans [here](PLANS.md), you can contribute these plans if you have no idea's on what to contribute! 
-| File Name | About | Important Note | -|--------------------------|------------------------------------------------------------------------------------------------------------------------|---------------------------------| -| browser_miner.ps1 | Mines all data related to browsers | Would love to be updated | -| cmd_commands.py | Gets data from driverquery, sysinfo, gpresult and more | | -| log_miner.py | Gets all logs from the Windows device | | -| media_backup.py | Gets all media of the device in a neat folder | Would love to be updated | -| netadapter.ps1 | Runs Get-NetAdapter Command with many flags | | -| property_scraper.ps1 | Gets all the windows properties | | -| registry.py | Backups the registry | | -| sensitive_data_miner.py | Copies all files that can be considered sensitive in a neat folder, , very slow and clunky - useful for depth scanning | | -| ssh_miner.py | Gets as much ssh private data as possible | | -| sys_internal.py | Attempts to use the Sys_Internal Suite from microsoft | | -| tasklist.py | Gets all running tasks, PID and info/data | | -| tree.ps1 | Runs and logs the tree.ps1 command, very slow and clunky - useful for depth scanning | | -| window_feature_miner.ps1 | Logs all the windows features enabled | | -| wmic.py | Logs and runs many wmic commands to gain sensitive data and information | | -| wifi_stealer.py | Gets the SSID and Password of all saved Wi-Fi | | -| dir_list.py | Produces a txt on every single file on the device, very slow and clunky - useful for depth scanning | | -| event_logs.py | Produces a multiple txt files in a folder on many event logs (Security, Applications and System) | | -| vulnscan.py | Uses AI/ML to detect sensitive files, and log their paths | In beta! | +| File Name | About | Important Note | +|--------------------------|------------------------------------------------------------------------------------------------------------------------|--------------------------| +| browser_miner.ps1 | Mines all data related to browsers | | +| cmd_commands.py | Gets data from driverquery, sysinfo, gpresult and more | | +| log_miner.py | Gets all logs from the Windows device | | +| media_backup.py | Gets all media of the device in a neat folder | Would love to be updated | +| netadapter.ps1 | Runs Get-NetAdapter Command with many flags | | +| property_scraper.ps1 | Gets all the windows properties | | +| registry.py | Backups the registry | | +| sensitive_data_miner.py | Copies all files that can be considered sensitive in a neat folder, , very slow and clunky - useful for depth scanning | | +| ssh_miner.py | Gets as much ssh private data as possible | | +| sys_internal.py | Attempts to use the Sys_Internal Suite from microsoft | | +| tasklist.py | Gets all running tasks, PID and info/data | | +| tree.ps1 | Runs and logs the tree.ps1 command, very slow and clunky - useful for depth scanning | | +| window_feature_miner.ps1 | Logs all the windows features enabled | | +| wmic.py | Logs and runs many wmic commands to gain sensitive data and information | | +| wifi_stealer.py | Gets the SSID and Password of all saved Wi-Fi | | +| dir_list.py | Produces a txt on every single file on the device, very slow and clunky - useful for depth scanning | | +| event_logs.py | Produces a multiple txt files in a folder on many event logs (Security, Applications and System) | | +| vulnscan.py | Uses AI/ML to detect sensitive files, and log their paths | In beta! 
| +| dump_memory.py | Dumps some memory as well as log some RAM details | | This is not an exhaustive list, but it should give you a good idea of what data Logicytics is capable of extracting. From 85e633b26afc10bb3c08b4e0e6d6064ea61f25cd Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 14:25:14 +0400 Subject: [PATCH 19/20] Fixed minor bugs CodeRabbit Suggestions --- CODE/VulnScan/v3/_generate_data.py | 2 +- CODE/VulnScan/v3/_train.py | 11 +++++++++++ CODE/config.ini | 4 ++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/CODE/VulnScan/v3/_generate_data.py b/CODE/VulnScan/v3/_generate_data.py index 70f642c9..bfce5446 100644 --- a/CODE/VulnScan/v3/_generate_data.py +++ b/CODE/VulnScan/v3/_generate_data.py @@ -163,7 +163,7 @@ def generate_file_content(extensions: str) -> tuple[str, str]: return generate_content_for_extension(extensions, size) -if __name__ == "__name__": +if __name__ == "__main__": """ Main function to generate files based on the configuration. """ diff --git a/CODE/VulnScan/v3/_train.py b/CODE/VulnScan/v3/_train.py index 55fe845f..485e3d0e 100644 --- a/CODE/VulnScan/v3/_train.py +++ b/CODE/VulnScan/v3/_train.py @@ -411,6 +411,17 @@ def validate_data(): learning_rate=LEARN_RATE, save_model_path=SAVE_PATH, use_cuda=CUDA) + except RuntimeError as e: + if "CUDA" in str(e): + logger.error(f"GPU error: {e}. Falling back to CPU...") + train_model(model_name=MODEL_NAME, + epochs=EPOCHS, + batch_size=BATCH_SIZE, + learning_rate=LEARN_RATE, + save_model_path=SAVE_PATH, + use_cuda=False) + else: + raise except FileNotFoundError as e: logger.error(f"File Not Found Error in training model: {e}") exit(1) diff --git a/CODE/config.ini b/CODE/config.ini index be985f35..65a2e6db 100644 --- a/CODE/config.ini +++ b/CODE/config.ini @@ -96,8 +96,8 @@ save_model_path = PATH # All files be saved here, and can't be changed, PATH is "NN features/" # This is the path to the model, and the vectorizer -model_path = ../Model SenseMini .3n3.pth -vectorizer_path = ../Vectorizer .3n3.pkl +model_path = PATH +vectorizer_path = PATH # Number of features to visualise in the SVG Bar graph, maximum is 3000 due to limitations # Placing -1 will visualise first 3000 features. Bar will be a color gradient heatmap. number_of_features = -1 From 62da7858e50084c73e26ffa6dbf9432bb534bbc3 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 14:46:18 +0400 Subject: [PATCH 20/20] Basic final fixes Minor bug fixes, grammatical fixes and code formatting --- CODE/VulnScan/tools/_study_network.py | 3 +- CODE/VulnScan/tools/_test_gpu_acceleration.py | 3 +- CODE/VulnScan/tools/_vectorizer.py | 3 +- CODE/VulnScan/v2-deprecated/_generate_data.py | 3 +- CODE/VulnScan/v2-deprecated/_train.py | 3 +- CODE/VulnScan/v3/_generate_data.py | 3 +- CODE/VulnScan/v3/_train.py | 39 ++++++---- CODE/dump_memory.py | 73 ++++++++++++++++--- README.md | 42 +++++------ 9 files changed, 120 insertions(+), 52 deletions(-) diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index 69f823ab..907c8576 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -620,4 +620,5 @@ def model_summary(): model_summary() main_plot() else: - raise ImportError("This file cannot be imported") + raise ImportError("This training script is meant to be run directly " + "and cannot be imported. 
Please execute it as a standalone script.") diff --git a/CODE/VulnScan/tools/_test_gpu_acceleration.py b/CODE/VulnScan/tools/_test_gpu_acceleration.py index a7e47134..86397e70 100644 --- a/CODE/VulnScan/tools/_test_gpu_acceleration.py +++ b/CODE/VulnScan/tools/_test_gpu_acceleration.py @@ -24,4 +24,5 @@ def check_gpu(): if __name__ == '__main__': check_gpu() else: - raise ImportError("This file cannot be imported") + raise ImportError("This training script is meant to be run directly " + "and cannot be imported. Please execute it as a standalone script.") diff --git a/CODE/VulnScan/tools/_vectorizer.py b/CODE/VulnScan/tools/_vectorizer.py index 5d316de9..25e57272 100644 --- a/CODE/VulnScan/tools/_vectorizer.py +++ b/CODE/VulnScan/tools/_vectorizer.py @@ -81,4 +81,5 @@ def main(data_paths: str, vectorizer_types: str, output_paths: str): os.makedirs(output_path) main(data_path, vectorizer_type, output_path) else: - raise ImportError("This file cannot be imported") + raise ImportError("This training script is meant to be run directly " + "and cannot be imported. Please execute it as a standalone script.") diff --git a/CODE/VulnScan/v2-deprecated/_generate_data.py b/CODE/VulnScan/v2-deprecated/_generate_data.py index 62fc91e5..778c1c26 100644 --- a/CODE/VulnScan/v2-deprecated/_generate_data.py +++ b/CODE/VulnScan/v2-deprecated/_generate_data.py @@ -106,4 +106,5 @@ def create_random_files(directories: str, num_file: int = 100): if __name__ == "__main__": create_random_files(SAVE_DIRECTORY, num_file=1000000) else: - raise ImportError("This file cannot be imported") + raise ImportError("This training script is meant to be run directly " + "and cannot be imported. Please execute it as a standalone script.") diff --git a/CODE/VulnScan/v2-deprecated/_train.py b/CODE/VulnScan/v2-deprecated/_train.py index 7940b1c3..5daa78fd 100644 --- a/CODE/VulnScan/v2-deprecated/_train.py +++ b/CODE/VulnScan/v2-deprecated/_train.py @@ -545,4 +545,5 @@ def train_rfc(SAVE_DIR: str, EPOCHS: int, TEST_SIZE: float | int, BATCH_SIZE=8, LEARNING_RATE=5e-5, MAX_FEATURES=5000, MAX_LEN=128, TEST_SIZE=0.2, RANDOM_STATE=42, MODEL_PATH_BERT="../bert-base-uncased-model") else: - raise ImportError("This file cannot be imported") + raise ImportError("This training script is meant to be run directly " + "and cannot be imported. Please execute it as a standalone script.") diff --git a/CODE/VulnScan/v3/_generate_data.py b/CODE/VulnScan/v3/_generate_data.py index bfce5446..0bc8dd3b 100644 --- a/CODE/VulnScan/v3/_generate_data.py +++ b/CODE/VulnScan/v3/_generate_data.py @@ -220,4 +220,5 @@ def generate_file_content(extensions: str) -> tuple[str, str]: logger.info(f"Generated {FILE_NUM} files in {SAVE_PATH}") else: - raise ImportError("This file cannot be imported") + raise ImportError("This training script is meant to be run directly " + "and cannot be imported. Please execute it as a standalone script.") diff --git a/CODE/VulnScan/v3/_train.py b/CODE/VulnScan/v3/_train.py index 485e3d0e..f9bfb2a4 100644 --- a/CODE/VulnScan/v3/_train.py +++ b/CODE/VulnScan/v3/_train.py @@ -348,25 +348,28 @@ def validate_data(): """ Validates the data by checking if the variables are of the correct type. 
""" - if not isinstance(EPOCHS, int) and EPOCHS > 0: - logger.error("EPOCHS must be an integer") + if not isinstance(EPOCHS, int) or EPOCHS <= 0: + logger.error("EPOCHS must be a positive integer") exit(1) - if not isinstance(BATCH_SIZE, int) and BATCH_SIZE > 0: - logger.error("BATCH_SIZE must be an integer") + if not isinstance(BATCH_SIZE, int) or BATCH_SIZE <= 0: + logger.error("BATCH_SIZE must be a positive integer") exit(1) - if not isinstance(LEARN_RATE, float) and 0 < LEARN_RATE < 1: - logger.error("LEARN_RATE must be a float") + if not isinstance(LEARN_RATE, float) or not (0 < LEARN_RATE < 1): + logger.error("LEARN_RATE must be a float between 0 and 1") exit(1) if not isinstance(CUDA, bool): logger.error("CUDA must be a boolean") exit(1) - allowed_models = ["NeuralNetwork", "LogReg", - "RandomForest", "ExtraTrees", "GBM", - "XGBoost", "DecisionTree", "NaiveBayes"] + + allowed_models = ["NeuralNetwork", "LogReg", "RandomForest", "ExtraTrees", "GBM", "XGBoost", "DecisionTree", "NaiveBayes"] if MODEL_NAME not in allowed_models: - logger.error('MODEL_NAME must be one of the following: ' - '"NeuralNetwork", "LogReg", "RandomForest", ' - '"ExtraTrees", "GBM","XGBoost", "DecisionTree", "NaiveBayes"') + logger.error(f"MODEL_NAME must be one of: {', '.join(allowed_models)}") + exit(1) + if not os.path.exists(TRAINING_PATH): + logger.error(f"Training data path {TRAINING_PATH} does not exist") + exit(1) + if not os.path.exists(os.path.dirname(SAVE_PATH)): + logger.error(f"Save model path {SAVE_PATH} does not exist") exit(1) @@ -421,15 +424,19 @@ def validate_data(): save_model_path=SAVE_PATH, use_cuda=False) else: - raise + logger.error(f"Runtime Error in training model: {e}") + exit(1) except FileNotFoundError as e: - logger.error(f"File Not Found Error in training model: {e}") + logger.error(f"Training data or model files not found: {e}." + f" Please check if all required files exist.") exit(1) except AttributeError as e: - logger.error(f"Attribute Error in training model: {e}") + logger.error(f"Invalid model configuration or missing attributes: {e}." + f" Please verify model settings.") exit(1) except Exception as e: logger.error(f"Error in training model: {e}") exit(1) else: - raise ImportError("This file cannot be imported") + raise ImportError("This training script is meant to be run directly " + "and cannot be imported. Please execute it as a standalone script.") diff --git a/CODE/dump_memory.py b/CODE/dump_memory.py index db2ab7db..927c40dd 100644 --- a/CODE/dump_memory.py +++ b/CODE/dump_memory.py @@ -7,13 +7,25 @@ if __name__ == "__main__": log = Log({"log_level": DEBUG}) + # Constants + PROCESS_QUERY_INFORMATION = 0x0400 + PROCESS_VM_READ = 0x0010 + MEM_COMMIT = 0x1000 + PAGE_READWRITE = 0x04 # Function to save RAM content snapshot to a file +@log.function def dump_ram_content(): + """ + Capture the current state of the system's RAM and write it to a file. + + This function gathers memory statistics, system-specific details, and writes + the information to a file named 'Ram_Snapshot.txt'. + """ try: # Generate a timestamp for the file - dump_file = f"Ram_Snapshot.txt" + dump_file = "Ram_Snapshot.txt" # Gather memory statistics using psutil memory_info = psutil.virtual_memory() @@ -56,6 +68,23 @@ def dump_ram_content(): # Define structures for SystemInfo class SystemInfo(ctypes.Structure): + # noinspection PyUnresolvedReferences + """ + A ctypes Structure to hold system information. + + Attributes: + wProcessorArchitecture (ctypes.c_ushort): Processor architecture. 
+        wReserved (ctypes.c_ushort): Reserved.
+        dwPageSize (ctypes.c_ulong): Page size.
+        lpMinimumApplicationAddress (ctypes.c_void_p): Minimum application address.
+        lpMaximumApplicationAddress (ctypes.c_void_p): Maximum application address.
+        dwActiveProcessorMask (ctypes.POINTER(ctypes.c_ulong)): Active processor mask.
+        dwNumberOfProcessors (ctypes.c_ulong): Number of processors.
+        dwProcessorType (ctypes.c_ulong): Processor type.
+        dwAllocationGranularity (ctypes.c_ulong): Allocation granularity.
+        wProcessorLevel (ctypes.c_ushort): Processor level.
+        wProcessorRevision (ctypes.c_ushort): Processor revision.
+    """
     _fields_ = [
         ("wProcessorArchitecture", ctypes.c_ushort),
         ("wReserved", ctypes.c_ushort),
@@ -73,6 +102,19 @@ class SystemInfo(ctypes.Structure):

 # Define BasicMemInfo
 class BasicMemInfo(ctypes.Structure):
+    # noinspection PyUnresolvedReferences
+    """
+    A ctypes Structure to hold basic memory information.
+
+    Attributes:
+        BaseAddress (ctypes.c_void_p): Base address.
+        AllocationBase (ctypes.c_void_p): Allocation base.
+        AllocationProtect (ctypes.c_ulong): Allocation protection.
+        RegionSize (ctypes.c_size_t): Region size.
+        State (ctypes.c_ulong): State.
+        Protect (ctypes.c_ulong): Protection.
+        Type (ctypes.c_ulong): Type.
+    """
     _fields_ = [
         ("BaseAddress", ctypes.c_void_p),
         ("AllocationBase", ctypes.c_void_p),
@@ -84,13 +126,28 @@ class BasicMemInfo(ctypes.Structure):
     ]


+@log.function
 def get_system_info() -> SystemInfo:
+    """
+    Retrieve and return system information using the `GetSystemInfo` function from the Windows API.
+
+    Returns:
+        SystemInfo: A `SystemInfo` structure containing details about the system's architecture,
+                    processor, memory, and other attributes.
+    """
     system_info = SystemInfo()
     ctypes.windll.kernel32.GetSystemInfo(ctypes.byref(system_info))
     return system_info


+@log.function
 def read_memory():
+    """
+    Read the memory of the current process and write the content to a file.
+
+    This function opens the current process with the necessary permissions,
+    retrieves system information, and iterates through memory pages to read their contents.
+    """
     # Open current process with permissions
     process = ctypes.windll.kernel32.OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, False, os.getpid())
     if not process:
@@ -153,11 +210,9 @@ def read_memory():


 if __name__ == "__main__":
-    # Constants
-    PROCESS_QUERY_INFORMATION = 0x0400
-    PROCESS_VM_READ = 0x0010
-    MEM_COMMIT = 0x1000
-    PAGE_READWRITE = 0x04
-
-    dump_ram_content()
-    read_memory()
+    try:
+        log.info("Starting memory dump process...")
+        dump_ram_content()
+        read_memory()
+    except Exception as err:
+        log.error(f"Error during memory dump: {err}")
diff --git a/README.md b/README.md
index a25fa039..9d6f495e 100644
--- a/README.md
+++ b/README.md
@@ -282,27 +282,27 @@ Here are some of the data points that Logicytics extracts:

 > [!TIP]
 > You can check out future plans [here](PLANS.md), you can contribute these plans if you have no idea's on what to contribute!
-| File Name                | About                                                                                                                    | Important Note           |
-|--------------------------|--------------------------------------------------------------------------------------------------------------------------|--------------------------|
-| browser_miner.ps1        | Mines all data related to browsers                                                                                       |                          |
-| cmd_commands.py          | Gets data from driverquery, sysinfo, gpresult and more                                                                   |                          |
-| log_miner.py             | Gets all logs from the Windows device                                                                                    |                          |
-| media_backup.py          | Gets all media of the device in a neat folder                                                                            | Would love to be updated |
-| netadapter.ps1           | Runs Get-NetAdapter Command with many flags                                                                              |                          |
-| property_scraper.ps1     | Gets all the windows properties                                                                                          |                          |
-| registry.py              | Backups the registry                                                                                                     |                          |
-| sensitive_data_miner.py  | Copies all files that can be considered sensitive in a neat folder, , very slow and clunky - useful for depth scanning   |                          |
-| ssh_miner.py             | Gets as much ssh private data as possible                                                                                |                          |
-| sys_internal.py          | Attempts to use the Sys_Internal Suite from microsoft                                                                    |                          |
-| tasklist.py              | Gets all running tasks, PID and info/data                                                                                |                          |
-| tree.ps1                 | Runs and logs the tree.ps1 command, very slow and clunky - useful for depth scanning                                     |                          |
-| window_feature_miner.ps1 | Logs all the windows features enabled                                                                                    |                          |
-| wmic.py                  | Logs and runs many wmic commands to gain sensitive data and information                                                  |                          |
-| wifi_stealer.py          | Gets the SSID and Password of all saved Wi-Fi                                                                            |                          |
-| dir_list.py              | Produces a txt on every single file on the device, very slow and clunky - useful for depth scanning                      |                          |
-| event_logs.py            | Produces a multiple txt files in a folder on many event logs (Security, Applications and System)                         |                          |
-| vulnscan.py              | Uses AI/ML to detect sensitive files, and log their paths                                                                | In beta!                 |
-| dump_memory.py           | Dumps some memory as well as log some RAM details                                                                        |                          |
+| File Name                | About                                                                                                                  | Important Note           |
+|--------------------------|------------------------------------------------------------------------------------------------------------------------|--------------------------|
+| browser_miner.ps1        | Mines all data related to browsers                                                                                     |                          |
+| cmd_commands.py          | Gets data from driverquery, sysinfo, gpresult and more                                                                 |                          |
+| log_miner.py             | Gets all logs from the Windows device                                                                                  |                          |
+| media_backup.py          | Gets all media of the device in a neat folder                                                                          | Would love to be updated |
+| netadapter.ps1           | Runs Get-NetAdapter Command with many flags                                                                            |                          |
+| property_scraper.ps1     | Gets all the Windows properties                                                                                        |                          |
+| registry.py              | Backs up the registry                                                                                                  |                          |
+| sensitive_data_miner.py  | Copies all files that can be considered sensitive in a neat folder, very slow and clunky - useful for depth scanning   |                          |
+| ssh_miner.py             | Gets as much SSH private data as possible                                                                              |                          |
+| sys_internal.py          | Attempts to use the Sys_Internal Suite from Microsoft                                                                  |                          |
+| tasklist.py              | Gets all running tasks, PID and info/data                                                                              |                          |
+| tree.ps1                 | Runs and logs the tree.ps1 command, very slow and clunky - useful for depth scanning                                   |                          |
+| window_feature_miner.ps1 | Logs all the Windows features enabled                                                                                  |                          |
+| wmic.py                  | Logs and runs many wmic commands to gain sensitive data and information                                                |                          |
+| wifi_stealer.py          | Gets the SSID and Password of all saved Wi-Fi                                                                          |                          |
+| dir_list.py              | Produces a txt on every single file on the device, very slow and clunky - useful for depth scanning                    |                          |
+| event_logs.py            | Produces multiple txt files in a folder on many event logs (Security, Applications and System)                         |                          |
+| vulnscan.py              | Uses AI/ML to detect sensitive files, and logs their paths                                                             | In beta!                 |
+| dump_memory.py           | Dumps some memory as well as logs some RAM details                                                                     |                          |

 This is not an exhaustive list, but it should give you a good idea of what data Logicytics is capable of extracting.