From 42081f0f2591688c67b74747f4e4a115f81eb5a6 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Mon, 9 Dec 2024 11:47:57 +0400 Subject: [PATCH 01/20] Updated vulnscan.py Improved model and vectorizer loading with thread locking and file scanning functionality Made `is_sensitive` have a reason for logging sensitive file Improved the threading process and logging --- CODE/vulnscan.py | 132 ++++++++++++++++++++--------------------------- 1 file changed, 55 insertions(+), 77 deletions(-) diff --git a/CODE/vulnscan.py b/CODE/vulnscan.py index 6d9ec78b..e9cf5fea 100644 --- a/CODE/vulnscan.py +++ b/CODE/vulnscan.py @@ -6,20 +6,23 @@ import warnings import joblib +import numpy as np import torch from safetensors import safe_open from sklearn.feature_extraction.text import TfidfVectorizer -from tqdm import tqdm # Set up logging from logicytics import Log, DEBUG -# Use v3 models on this! Especially NN models - if __name__ == "__main__": - log = Log( - {"log_level": DEBUG} - ) + log = Log({"log_level": DEBUG}) + +log.info("Locking threads - Model and Vectorizer") +model_lock = threading.Lock() +vectorizer_lock = threading.Lock() + +model_to_use = None +vectorizer_to_use = None def load_model(model_path_to_load: str) -> safe_open | torch.nn.Module: @@ -42,12 +45,28 @@ def load_model(model_path_to_load: str) -> safe_open | torch.nn.Module: elif model_path_to_load.endswith('.pth'): with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=FutureWarning) - return torch.load(model_path_to_load) + return torch.load(model_path_to_load, weights_only=False) else: raise ValueError("Unsupported model file format. Use .pkl, .safetensors, or .pth") -def is_sensitive(model: torch.nn.Module, vectorizer: TfidfVectorizer, file_content: str) -> tuple[bool, float]: +def scan_path(model_path: str, scan_paths: str, vectorizer_path: str): + global model_to_use, vectorizer_to_use + try: + with model_lock: + if model_to_use is None: + log.info(f"Loading model from {model_path}") + model_to_use = load_model(model_path) + with vectorizer_lock: + if vectorizer_to_use is None: + log.info(f"Loading vectorizer from {vectorizer_path}") + vectorizer_to_use = joblib.load(vectorizer_path) + vulnscan(model_to_use, scan_paths, vectorizer_to_use) + except Exception as e: + log.error(f"Error scanning path {scan_paths}: {e}") + + +def is_sensitive(model: torch.nn.Module, vectorizer: TfidfVectorizer, file_content: str) -> tuple[bool, float, str]: """ Determine if the file content is sensitive using the provided model and vectorizer. @@ -57,7 +76,7 @@ def is_sensitive(model: torch.nn.Module, vectorizer: TfidfVectorizer, file_conte file_content (str): Content of the file to be analyzed. Returns: - tuple: (True if the content is sensitive, False otherwise, prediction probability). + tuple: (True if the content is sensitive, False otherwise, prediction probability, reason). 
""" if isinstance(model, torch.nn.Module): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -68,15 +87,19 @@ def is_sensitive(model: torch.nn.Module, vectorizer: TfidfVectorizer, file_conte features_tensor = torch.tensor(features.toarray(), dtype=torch.float32).to(device) prediction = model(features_tensor) probability = torch.softmax(prediction, dim=1).max().item() - return prediction.argmax(dim=1).item() == 1, probability + top_features = np.argsort(features.toarray()[0])[-5:] + reason = ", ".join([vectorizer.get_feature_names_out()[i] for i in top_features]) + return prediction.argmax(dim=1).item() == 1, probability, reason else: features = vectorizer.transform([file_content]) prediction = model.predict_proba(features) probability = prediction.max() - return model.predict(features)[0] == 1, probability + top_features = np.argsort(features.toarray()[0])[-5:] + reason = ", ".join([vectorizer.get_feature_names_out()[i] for i in top_features]) + return model.predict(features)[0] == 1, probability, reason -def scan_file(model: torch.nn.Module, vectorizer: TfidfVectorizer, file_path: str) -> tuple[bool, float]: +def scan_file(model: torch.nn.Module, vectorizer: TfidfVectorizer, file_path: str) -> tuple[bool, float, str]: """ Scan a single file to determine if it contains sensitive content. @@ -99,83 +122,38 @@ def scan_file(model: torch.nn.Module, vectorizer: TfidfVectorizer, file_path: st return is_sensitive(model, vectorizer, content) -def scan_directory(model: torch.nn.Module, vectorizer, dir_path: str) -> dict[str, tuple[bool, float]]: - """ - Scan all files in a directory to determine if they contain sensitive content. - - Args: - model: Machine learning model. - vectorizer: Vectorizer to transform file content. - dir_path (str): Path to the directory to be scanned. - - Returns: - dict: Dictionary with file paths as keys and (sensitivity, prediction probability) as values. - """ - results = {} - for roots, _, files_dir in os.walk(dir_path): - for file in tqdm(files_dir, desc="Scanning files", unit="file", leave=True): - file_path = os.path.join(roots, file) - if file.endswith(('.zip', '.rar', '.7z', '.tar', '.gz', '.tar.gz')): - continue - results[file_path] = scan_file(model, vectorizer, file_path) - - return results - - -def main(MODELS_PATH: str, SCAN_PATH: str, VECTORIZER_PATH: str): - """ - Main function to load the model and vectorizer, and scan the specified path. - Saves the paths of sensitive files to a file named "Sensitive_File_Paths.txt". - - Args: - MODELS_PATH (str): Path to the model file. - SCAN_PATH (str): Path to the file or directory to be scanned. - VECTORIZER_PATH (str): Path to the vectorizer file. - """ - log.info(f"Loading model from {MODELS_PATH}") - model = load_model(MODELS_PATH) - log.info(f"Loading vectorizer from {VECTORIZER_PATH}") - vectorizer = joblib.load(VECTORIZER_PATH) # Adjust as needed +def vulnscan(model, SCAN_PATH, vectorizer): log.info(f"Scanning {SCAN_PATH}") - if os.path.isfile(SCAN_PATH): - result, probability = scan_file(model, vectorizer, SCAN_PATH) - log.info(f"File {SCAN_PATH} is {'sensitive' if result else 'not sensitive'} with probability {probability:.2f}") - with open("Sensitive_File_Paths.txt", "w") as sensitive_file: + result, probability, reason = scan_file(model, vectorizer, SCAN_PATH) + if result: + log.info(f"File {SCAN_PATH} is sensitive with probability {probability:.2f}. 
Reason: {reason}") + if not os.path.exists("Sensitive_File_Paths.txt"): + with open("Sensitive_File_Paths.txt", "w") as sensitive_file: + sensitive_file.write(f"{SCAN_PATH}\n\n") + with open("Sensitive_File_Paths.txt", "a") as sensitive_file: sensitive_file.write(f"{SCAN_PATH}\n") - elif os.path.isdir(SCAN_PATH): - results = scan_directory(model, vectorizer, SCAN_PATH) - with open("Sensitive_File_Paths.txt", "w") as sensitive_file: - for file_path, (is_sensitive_main, probability) in results.items(): - log.info(f"File {file_path} is {'sensitive' if is_sensitive_main else 'not sensitive'} with probability {probability:.2f}") - if is_sensitive_main: - sensitive_file.write(f"{file_path}\n") - else: - log.error("Invalid path provided. Please provide a valid file or directory path.") - exit(1) -def scan_path(model_path: str, scan_paths: str, vectorizer_path: str): - """ - Scan the specified path using the provided model and vectorizer. - - Args: - model_path (str): Path to the model file. - scan_paths (str): Path to the file or directory to be scanned. - vectorizer_path (str): Path to the vectorizer file. - """ - main(model_path, scan_paths, vectorizer_path) - - -log.warning("Starting scan - This may take hours!!") +# Start scanning +log.info("Getting paths to scan - This may take some time!!") threads = [] -paths = [ +paths = [] +base_paths = [ "C:\\Users\\", "C:\\Windows\\Logs", "C:\\Program Files", "C:\\Program Files (x86)" ] +for base_path in base_paths: + for root, dirs, files_main in os.walk(base_path): + for file_main in files_main: + paths.append(os.path.join(root, file_main)) + +# Start scanning +log.warning("Starting scan - This may take hours and consume memory!!") + for path in paths: thread = threading.Thread(target=scan_path, args=("VulnScan/Model SenseMini .3n3.pth", path, "VulnScan/Vectorizer .3n3.pkl")) From 41044eba0613a4a489cd162deff80feebedbd537 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Mon, 9 Dec 2024 14:03:58 +0400 Subject: [PATCH 02/20] Added _study_network.py to tools Add summary and visualization functions for neural network model - Implement `summary` function to generate a detailed summary of the model - Implement `visualize_model` function to create a directed graph of the model's layers and weights - Save model summary and visualization to 'Vectorizer features' directory - Add progress tracking and file handling for vectorizer features --- .gitignore | 1 + .idea/Logicytics.iml | 1 + CODE/VulnScan/tools/_study_network.py | 228 ++++++++++++++++++ CODE/VulnScan/tools/_test_gpu_acceleration.py | 3 +- CODE/logicytics/FileManagement.py | 4 +- requirements.txt | 26 +- 6 files changed, 249 insertions(+), 14 deletions(-) create mode 100644 CODE/VulnScan/tools/_study_network.py diff --git a/.gitignore b/.gitignore index bca2a5a3..848eb65c 100644 --- a/.gitignore +++ b/.gitignore @@ -319,3 +319,4 @@ $RECYCLE.BIN/ *.pyc /CODE/SysInternal_Suite/.sys.ignore /ACCESS/ +/CODE/VulnScan/tools/Vectorizer features/ diff --git a/.idea/Logicytics.iml b/.idea/Logicytics.iml index e33fd634..deff06b9 100644 --- a/.idea/Logicytics.iml +++ b/.idea/Logicytics.iml @@ -16,6 +16,7 @@ + diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py new file mode 100644 index 00000000..f74f10d6 --- /dev/null +++ b/CODE/VulnScan/tools/_study_network.py @@ -0,0 +1,228 @@ +import os.path +from collections import OrderedDict +from os import mkdir + +import joblib +import matplotlib.pyplot as plt +import networkx as nx +import numpy as np +import seaborn as sns 
+import torch +import torch.nn as nn +from torchviz import make_dot +from tqdm import tqdm + + +def summary(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): + def register_hook(module): + + def hook(modules, inputs, output): + class_name = str(modules.__class__).split(".")[-1].split("'")[0] + module_idx = len(summaries) + + m_key = "%s-%i" % (class_name, module_idx + 1) + summaries[m_key] = OrderedDict() + summaries[m_key]["input_shape"] = list(inputs[0].size()) + summaries[m_key]["input_shape"][0] = batch_size + if isinstance(output, (list, tuple)): + summaries[m_key]["output_shape"] = [ + [-1] + list(o.size())[1:] for o in output + ] + else: + summaries[m_key]["output_shape"] = list(output.size()) + summaries[m_key]["output_shape"][0] = batch_size + + params = 0 + if hasattr(modules, "weight") and hasattr(modules.weight, "size"): + params += torch.prod(torch.LongTensor(list(modules.weight.size()))) + summaries[m_key]["trainable"] = modules.weight.requires_grad + if hasattr(modules, "bias") and hasattr(modules.bias, "size"): + params += torch.prod(torch.LongTensor(list(modules.bias.size()))) + summaries[m_key]["nb_params"] = params + + if ( + not isinstance(module, nn.Sequential) + and not isinstance(module, nn.ModuleList) + and not (module == model_to_use) + ): + hooks.append(module.register_forward_hook(hook)) + + device_to_use = device_to_use.lower() + assert device_to_use in [ + "cuda", + "cpu", + ], "Input device is not valid, please specify 'cuda' or 'cpu'" + + if device_to_use == "cuda" and torch.cuda.is_available(): + dtype = torch.cuda.FloatTensor + else: + dtype = torch.FloatTensor + + # multiple inputs to the network + if isinstance(input_size, tuple): + input_size = [input_size] + + # batch_size of 2 for batch norm + x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size] + # print(type(x[0])) + + # create properties + summaries = OrderedDict() + hooks = [] + + # register hook + model_to_use.apply(register_hook) + + # make a forward pass + # print(x.shape) + model_to_use(*x) + + # remove these hooks + for h in hooks: + h.remove() + + with open('Vectorizer features/Model Summary.txt', 'w') as vf_ms: + vf_ms.write("----------------------------------------------------------------") + line_new = "{:>20} {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #") + vf_ms.write(line_new) + vf_ms.write("================================================================") + total_params = 0 + total_output = 0 + trainable_params = 0 + for layer in summaries: + # input_shape, output_shape, trainable, nb_params + line_new = "{:>20} {:>25} {:>15}".format( + layer, + str(summaries[layer]["output_shape"]), + "{0:,}".format(summaries[layer]["nb_params"]), + ) + total_params += summaries[layer]["nb_params"] + total_output += np.prod(summaries[layer]["output_shape"]) + if "trainable" in summaries[layer]: + if summaries[layer]["trainable"]: + trainable_params += summaries[layer]["nb_params"] + vf_ms.write(line_new) + + # assume 4 bytes/number (float on cuda). + total_input_size = abs(np.prod(input_size) * batch_size * 4. / (1024 ** 2.)) + total_output_size = abs(2. * total_output * 4. / (1024 ** 2.)) # x2 for gradients + total_params_size = abs(total_params.numpy() * 4. 
/ (1024 ** 2.)) + total_size = total_params_size + total_output_size + total_input_size + + vf_ms.write("================================================================") + vf_ms.write("Total params: {0:,}".format(total_params)) + vf_ms.write("Trainable params: {0:,}".format(trainable_params)) + vf_ms.write("Non-trainable params: {0:,}".format(total_params - trainable_params)) + vf_ms.write("----------------------------------------------------------------") + vf_ms.write("Input size (MB): %0.2f" % total_input_size) + vf_ms.write("Forward/backward pass size (MB): %0.2f" % total_output_size) + vf_ms.write("Params size (MB): %0.2f" % total_params_size) + vf_ms.write("Estimated Total Size (MB): %0.2f" % total_size) + vf_ms.write("----------------------------------------------------------------") + # return summary + + +def visualize_model(): + # Create a directed graph + G = nx.DiGraph() + + def add_edges_bulk(layer_names, weight_matrices): + """Efficiently add edges to the graph with progress tracking.""" + threshold = 0.1 # Adjust this threshold as needed + significant_weights = np.abs(weight_matrices) > threshold + rows, cols = np.where(significant_weights) + weights = weight_matrices[rows, cols] + + # Use tqdm for progress tracking + edge_count = len(rows) + with tqdm(total=edge_count, desc=f"Processing {layer_names}", unit="edges") as pbar: + for row, col, weight in zip(rows, cols, weights): + in_node = f"{layer_names}_in_{col}" + out_node = f"{layer_names}_out_{row}" + G.add_edge(in_node, out_node, weight=weight) + pbar.update(1) + + # Process parameters + for name, param in model.named_parameters(): + if 'weight' in name: + layer_name = name.split('.')[0] + weight_matrix = param.data.cpu().numpy() + + # Add edges with progress bar + add_edges_bulk(layer_name, weight_matrix) + + # Draw the graph + print("Writing the graph to a file...") + nx.write_gexf(G, "Vectorizer features/Neural Network Nodes Graph.gexf") + + +# TODO - Add more print statements to indicate the progress of the script +if __name__ == '__main__': + print("Visualizing the model and vectorizer features...") + print("This may take a while, please wait.") + + if not os.path.exists('Vectorizer features'): + mkdir('Vectorizer features') + + # Load the vectorizer + vectorizer_path = '../Vectorizer .3n3.pkl' + vectorizer = joblib.load(vectorizer_path) + + # Inspect the vectorizer + feature_names = vectorizer.get_feature_names_out() + with open('Vectorizer features/Vectorizer features', 'w') as f: + f.write(f"Number of features: {len(feature_names)}\n\n") + f.write('\n'.join(feature_names)) + + # Visualize the top 90 features + top_n = 90 + sorted_indices = vectorizer.idf_.argsort()[:top_n] + top_features = [feature_names[i] for i in sorted_indices] + top_idf_scores = vectorizer.idf_[sorted_indices] + + plt.figure(figsize=(20, 12)) # Increase the figure size + sns.barplot(x=top_idf_scores, y=top_features) + plt.title('Top 90 Features by IDF Score') + plt.xlabel('IDF Score') + plt.ylabel('Feature') + + # Save the plot as a vector graphic + plt.savefig('Vectorizer features/Top_90_Features.svg', format='svg') + + plt.show() + + # Load the model + model_path = '../Model SenseMini .3n3.pth' + model = torch.load(model_path, weights_only=False) + + # Save the model summary + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + summary(model, input_size=(1, vectorizer.vocabulary_.__len__())) + + # Save the model's state dictionary + with open('Vectorizer features/Model state dictionary.txt', 'w') 
as f: + f.write("Model's state dictionary:\n\n") + for param_tensor in model.state_dict(): + f.write(f"\n{param_tensor}\t{model.state_dict()[param_tensor].size()}") + + # Create a dummy input tensor with the appropriate size + dummy_input = torch.randn(1, vectorizer.vocabulary_.__len__()).to(device) + + # Generate the visualization + model_viz = make_dot(model(dummy_input), params=dict(model.named_parameters())) + + # Save the visualization to a file + model_viz.format = 'png' + model_viz.render(filename='Vectorizer features/Model Visualization', format='png') + + # Removing the temporary files as they are no longer needed, we saved them to the desired location + if os.path.exists("Digraph.gv"): + os.remove("Digraph.gv") + if os.path.exists("Digraph.gv.png"): + os.remove("Digraph.gv.png") + + # Visualize the model + visualize_model() + + print("Model visualization and summary have been saved to the 'Vectorizer features' directory.") diff --git a/CODE/VulnScan/tools/_test_gpu_acceleration.py b/CODE/VulnScan/tools/_test_gpu_acceleration.py index 0b82f523..e45d05c8 100644 --- a/CODE/VulnScan/tools/_test_gpu_acceleration.py +++ b/CODE/VulnScan/tools/_test_gpu_acceleration.py @@ -21,4 +21,5 @@ def check_gpu(): print(f"Error initializing CUDA: {err}") -check_gpu() +if __name__ == '__main__': + check_gpu() diff --git a/CODE/logicytics/FileManagement.py b/CODE/logicytics/FileManagement.py index 188b1341..07f9fc3c 100644 --- a/CODE/logicytics/FileManagement.py +++ b/CODE/logicytics/FileManagement.py @@ -107,7 +107,9 @@ def __get_files_to_zip(path: str) -> list: list: A list of file and directory names to be zipped. """ excluded_extensions = (".py", ".exe", ".bat", ".ps1", ".pkl", ".pth") - excluded_prefixes = ("config.ini", "SysInternal_Suite", "__pycache__", "logicytics", "VulnScan") + excluded_prefixes = ("config.ini", "SysInternal_Suite", + "__pycache__", "logicytics", "VulnScan", + "Vectorizer features") return [ f for f in os.listdir(path) diff --git a/requirements.txt b/requirements.txt index a67b234f..a22c3358 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,22 @@ +configobj~=5.0.9 +joblib~=1.3.2 +matplotlib~=3.8.4 +torch~=2.5.1+cu124 +xgboost~=2.1.3 +scikit-learn~=1.5.2 +Faker~=30.3.0 +numpy~=1.26.4 +transformers~=4.38.2 requests~=2.32.3 psutil~=6.1.0 -colorlog~=6.9.0 DateTime~=5.5 pathlib~=1.0.1 +colorlog~=6.9.0 +safetensors~=0.4.5 prettytable~=3.12.0 -scikit-learn~=1.5.2 -joblib~=1.3.2 -matplotlib~=3.8.4 -numpy~=1.26.4 -Faker~=30.3.0 -transformers~=4.38.2 -xgboost~=2.1.3 -torch~=2.5.1+cu124 pandas~=2.2.2 networkx~=3.2.1 scapy~=2.5.0 -safetensors~=0.4.2 -tqdm~=4.66.6 -configobj~=5.0.9 \ No newline at end of file +seaborn~=0.13.2 +torchviz~=0.0.3 +tqdm~=4.66.6 \ No newline at end of file From d3566a8268b50421d1d9cf4caef117786efe9710 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Mon, 9 Dec 2024 14:22:54 +0400 Subject: [PATCH 03/20] Fixed minor bug --- CODE/VulnScan/tools/_study_network.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index f74f10d6..5f2084a6 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -122,7 +122,7 @@ def hook(modules, inputs, output): # return summary -def visualize_model(): +def visualize_model(models, output_file="model_graph.gexf"): # Create a directed graph G = nx.DiGraph() @@ -142,8 +142,8 @@ def add_edges_bulk(layer_names, weight_matrices): G.add_edge(in_node, out_node, 
weight=weight) pbar.update(1) - # Process parameters - for name, param in model.named_parameters(): + # Process model parameters + for name, param in models.named_parameters(): if 'weight' in name: layer_name = name.split('.')[0] weight_matrix = param.data.cpu().numpy() @@ -151,9 +151,9 @@ def add_edges_bulk(layer_names, weight_matrices): # Add edges with progress bar add_edges_bulk(layer_name, weight_matrix) - # Draw the graph - print("Writing the graph to a file...") - nx.write_gexf(G, "Vectorizer features/Neural Network Nodes Graph.gexf") + # Save the graph to a GEXF file + nx.write_gexf(G, output_file) + print(f"Graph saved to {output_file}") # TODO - Add more print statements to indicate the progress of the script @@ -223,6 +223,6 @@ def add_edges_bulk(layer_names, weight_matrices): os.remove("Digraph.gv.png") # Visualize the model - visualize_model() + visualize_model(model, output_file='Vectorizer features/NN.gexf') print("Model visualization and summary have been saved to the 'Vectorizer features' directory.") From 62ad70a070123dcdcb5be338a802cb63a1aaf9d9 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Mon, 9 Dec 2024 14:22:54 +0400 Subject: [PATCH 04/20] Fixed minor bug --- CODE/VulnScan/tools/_study_network.py | 59 +++++++++++++++++++-------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index f74f10d6..34099caa 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -3,14 +3,14 @@ from os import mkdir import joblib -import matplotlib.pyplot as plt -import networkx as nx -import numpy as np import seaborn as sns import torch import torch.nn as nn from torchviz import make_dot +import networkx as nx +import numpy as np from tqdm import tqdm +import matplotlib.pyplot as plt def summary(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): @@ -122,13 +122,13 @@ def hook(modules, inputs, output): # return summary -def visualize_model(): - # Create a directed graph - G = nx.DiGraph() +def visualize_model(models, output_dir="model_graphs", visualize_separately=True): + # Create a directed graph for the whole model + os.makedirs(output_dir, exist_ok=True) # Ensure the output directory exists - def add_edges_bulk(layer_names, weight_matrices): + def add_edges_bulk(layer_names, weight_matrices, G): """Efficiently add edges to the graph with progress tracking.""" - threshold = 0.1 # Adjust this threshold as needed + threshold = 1 # Adjust this threshold as needed significant_weights = np.abs(weight_matrices) > threshold rows, cols = np.where(significant_weights) weights = weight_matrices[rows, cols] @@ -142,18 +142,45 @@ def add_edges_bulk(layer_names, weight_matrices): G.add_edge(in_node, out_node, weight=weight) pbar.update(1) - # Process parameters - for name, param in model.named_parameters(): + # Process model parameters and create graphs for each layer + layer_graphs = {} + + for name, param in models.named_parameters(): if 'weight' in name: layer_name = name.split('.')[0] weight_matrix = param.data.cpu().numpy() - # Add edges with progress bar - add_edges_bulk(layer_name, weight_matrix) + # Create a new graph for the current layer and add edges + layer_G = nx.DiGraph() + add_edges_bulk(layer_name, weight_matrix, layer_G) + + # Store the graph for the layer + layer_graphs[layer_name] = layer_G + + # Save the layer graph to a separate file + layer_output_file = os.path.join(output_dir, f"{layer_name}_graph.gexf") + 
nx.write_gexf(layer_G, layer_output_file) + print(f"Layer graph saved to {layer_output_file}") + + if visualize_separately: + # Visualize each graph separately + for layer_name, layer_G in layer_graphs.items(): + plt.figure(figsize=(8, 8)) + pos = nx.spring_layout(layer_G, seed=42) # Layout for better visualization + nx.draw(layer_G, pos, with_labels=True, node_size=50, node_color="skyblue", font_size=8, font_color="black", + alpha=0.6) + plt.title(f"Visualization for {layer_name}") + plt.show() + + else: + # Combine all layer graphs into one and visualize + combined_graph = nx.DiGraph() + + for layer_name, layer_G in layer_graphs.items(): + combined_graph.add_nodes_from(layer_G.nodes()) + combined_graph.add_edges_from(layer_G.edges()) - # Draw the graph - print("Writing the graph to a file...") - nx.write_gexf(G, "Vectorizer features/Neural Network Nodes Graph.gexf") + print("Visualization complete.") # TODO - Add more print statements to indicate the progress of the script @@ -223,6 +250,6 @@ def add_edges_bulk(layer_names, weight_matrices): os.remove("Digraph.gv.png") # Visualize the model - visualize_model() + visualize_model(model) print("Model visualization and summary have been saved to the 'Vectorizer features' directory.") From 8205e5b74f270c1b522ecbacf2a7dd14f86bb8ec Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Tue, 10 Dec 2024 09:24:44 +0400 Subject: [PATCH 05/20] Fixing more bugs Saving files now is neater --- .gitignore | 2 +- .idea/Logicytics.iml | 1 + CODE/VulnScan/tools/_study_network.py | 143 +++++++++----------------- requirements.txt | 3 +- 4 files changed, 54 insertions(+), 95 deletions(-) diff --git a/.gitignore b/.gitignore index 848eb65c..add49568 100644 --- a/.gitignore +++ b/.gitignore @@ -319,4 +319,4 @@ $RECYCLE.BIN/ *.pyc /CODE/SysInternal_Suite/.sys.ignore /ACCESS/ -/CODE/VulnScan/tools/Vectorizer features/ +/CODE/VulnScan/tools/NN features/ diff --git a/.idea/Logicytics.iml b/.idea/Logicytics.iml index deff06b9..9d371a5c 100644 --- a/.idea/Logicytics.iml +++ b/.idea/Logicytics.iml @@ -17,6 +17,7 @@ + diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index 34099caa..c74546c7 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -3,17 +3,16 @@ from os import mkdir import joblib +import matplotlib.pyplot as plt +import networkx as nx +import numpy as np import seaborn as sns import torch import torch.nn as nn from torchviz import make_dot -import networkx as nx -import numpy as np -from tqdm import tqdm -import matplotlib.pyplot as plt -def summary(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): +def save_graph(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): def register_hook(module): def hook(modules, inputs, output): @@ -81,11 +80,11 @@ def hook(modules, inputs, output): for h in hooks: h.remove() - with open('Vectorizer features/Model Summary.txt', 'w') as vf_ms: - vf_ms.write("----------------------------------------------------------------") + with open('NN features/Model Summary.txt', 'w') as vf_ms: + vf_ms.write("----------------------------------------------------------------\n") line_new = "{:>20} {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #") - vf_ms.write(line_new) - vf_ms.write("================================================================") + vf_ms.write(f"{line_new}\n") + vf_ms.write("================================================================\n") total_params = 0 total_output = 0 trainable_params = 0 
@@ -101,7 +100,7 @@ def hook(modules, inputs, output): if "trainable" in summaries[layer]: if summaries[layer]["trainable"]: trainable_params += summaries[layer]["nb_params"] - vf_ms.write(line_new) + vf_ms.write(f"{line_new}\n") # assume 4 bytes/number (float on cuda). total_input_size = abs(np.prod(input_size) * batch_size * 4. / (1024 ** 2.)) @@ -109,87 +108,47 @@ def hook(modules, inputs, output): total_params_size = abs(total_params.numpy() * 4. / (1024 ** 2.)) total_size = total_params_size + total_output_size + total_input_size - vf_ms.write("================================================================") - vf_ms.write("Total params: {0:,}".format(total_params)) - vf_ms.write("Trainable params: {0:,}".format(trainable_params)) - vf_ms.write("Non-trainable params: {0:,}".format(total_params - trainable_params)) - vf_ms.write("----------------------------------------------------------------") - vf_ms.write("Input size (MB): %0.2f" % total_input_size) - vf_ms.write("Forward/backward pass size (MB): %0.2f" % total_output_size) - vf_ms.write("Params size (MB): %0.2f" % total_params_size) - vf_ms.write("Estimated Total Size (MB): %0.2f" % total_size) - vf_ms.write("----------------------------------------------------------------") - # return summary - - -def visualize_model(models, output_dir="model_graphs", visualize_separately=True): - # Create a directed graph for the whole model - os.makedirs(output_dir, exist_ok=True) # Ensure the output directory exists - - def add_edges_bulk(layer_names, weight_matrices, G): - """Efficiently add edges to the graph with progress tracking.""" - threshold = 1 # Adjust this threshold as needed - significant_weights = np.abs(weight_matrices) > threshold - rows, cols = np.where(significant_weights) - weights = weight_matrices[rows, cols] - - # Use tqdm for progress tracking - edge_count = len(rows) - with tqdm(total=edge_count, desc=f"Processing {layer_names}", unit="edges") as pbar: - for row, col, weight in zip(rows, cols, weights): - in_node = f"{layer_names}_in_{col}" - out_node = f"{layer_names}_out_{row}" - G.add_edge(in_node, out_node, weight=weight) - pbar.update(1) - - # Process model parameters and create graphs for each layer - layer_graphs = {} - - for name, param in models.named_parameters(): - if 'weight' in name: - layer_name = name.split('.')[0] - weight_matrix = param.data.cpu().numpy() - - # Create a new graph for the current layer and add edges - layer_G = nx.DiGraph() - add_edges_bulk(layer_name, weight_matrix, layer_G) - - # Store the graph for the layer - layer_graphs[layer_name] = layer_G - - # Save the layer graph to a separate file - layer_output_file = os.path.join(output_dir, f"{layer_name}_graph.gexf") - nx.write_gexf(layer_G, layer_output_file) - print(f"Layer graph saved to {layer_output_file}") - - if visualize_separately: - # Visualize each graph separately - for layer_name, layer_G in layer_graphs.items(): - plt.figure(figsize=(8, 8)) - pos = nx.spring_layout(layer_G, seed=42) # Layout for better visualization - nx.draw(layer_G, pos, with_labels=True, node_size=50, node_color="skyblue", font_size=8, font_color="black", - alpha=0.6) - plt.title(f"Visualization for {layer_name}") - plt.show() + vf_ms.write("\n================================================================") + vf_ms.write("\nTotal params: {0:,}".format(total_params)) + vf_ms.write("\nTrainable params: {0:,}".format(trainable_params)) + vf_ms.write("\nNon-trainable params: {0:,}".format(total_params - trainable_params)) + 
vf_ms.write("\n----------------------------------------------------------------") + vf_ms.write("\nInput size (MB): %0.2f" % total_input_size) + vf_ms.write("\nForward/backward pass size (MB): %0.2f" % total_output_size) + vf_ms.write("\nParams size (MB): %0.2f" % total_params_size) + vf_ms.write("\nEstimated Total Size (MB): %0.2f" % total_size) + vf_ms.write("\n----------------------------------------------------------------\n") - else: - # Combine all layer graphs into one and visualize - combined_graph = nx.DiGraph() - for layer_name, layer_G in layer_graphs.items(): - combined_graph.add_nodes_from(layer_G.nodes()) - combined_graph.add_edges_from(layer_G.edges()) +def visualize_model(models, output_dir="NN features"): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # Create a directed graph + G = nx.DiGraph() + + # Add nodes and edges to the graph + for model_i in models: + for names, param in model_i.named_parameters(): + G.add_node(names, size=param.numel()) + if param.requires_grad: + G.add_edge(names, f"{names}_grad") + + # Define the output file path + output_file = os.path.join(output_dir, "model.graphml") + + # Write the graph to a GraphML file + nx.write_graphml(G, output_file) - print("Visualization complete.") + print(f"Model visualization saved as {output_file}") -# TODO - Add more print statements to indicate the progress of the script if __name__ == '__main__': print("Visualizing the model and vectorizer features...") print("This may take a while, please wait.") - if not os.path.exists('Vectorizer features'): - mkdir('Vectorizer features') + if not os.path.exists('NN features'): + mkdir('NN features') # Load the vectorizer vectorizer_path = '../Vectorizer .3n3.pkl' @@ -197,7 +156,7 @@ def add_edges_bulk(layer_names, weight_matrices, G): # Inspect the vectorizer feature_names = vectorizer.get_feature_names_out() - with open('Vectorizer features/Vectorizer features', 'w') as f: + with open('NN features/Vectorizer features.txt', 'w') as f: f.write(f"Number of features: {len(feature_names)}\n\n") f.write('\n'.join(feature_names)) @@ -214,7 +173,7 @@ def add_edges_bulk(layer_names, weight_matrices, G): plt.ylabel('Feature') # Save the plot as a vector graphic - plt.savefig('Vectorizer features/Top_90_Features.svg', format='svg') + plt.savefig('NN features/Top_90_Features.svg', format='svg') plt.show() @@ -225,10 +184,10 @@ def add_edges_bulk(layer_names, weight_matrices, G): # Save the model summary device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) - summary(model, input_size=(1, vectorizer.vocabulary_.__len__())) + save_graph(model, input_size=(1, vectorizer.vocabulary_.__len__())) # Save the model's state dictionary - with open('Vectorizer features/Model state dictionary.txt', 'w') as f: + with open('NN features/Model state dictionary.txt', 'w') as f: f.write("Model's state dictionary:\n\n") for param_tensor in model.state_dict(): f.write(f"\n{param_tensor}\t{model.state_dict()[param_tensor].size()}") @@ -237,19 +196,17 @@ def add_edges_bulk(layer_names, weight_matrices, G): dummy_input = torch.randn(1, vectorizer.vocabulary_.__len__()).to(device) # Generate the visualization - model_viz = make_dot(model(dummy_input), params=dict(model.named_parameters())) + model_viz = make_dot(model(dummy_input), params=dict(model.named_parameters()), show_attrs=True, show_saved=True) # Save the visualization to a file model_viz.format = 'png' - model_viz.render(filename='Vectorizer features/Model Visualization', format='png') + 
model_viz.render(filename='NN features/Model Visualization', format='png') # Removing the temporary files as they are no longer needed, we saved them to the desired location - if os.path.exists("Digraph.gv"): - os.remove("Digraph.gv") - if os.path.exists("Digraph.gv.png"): - os.remove("Digraph.gv.png") + if os.path.exists("NN features/Model Visualization"): + os.remove("NN features/Model Visualization") # Visualize the model visualize_model(model) - print("Model visualization and summary have been saved to the 'Vectorizer features' directory.") + print("Model visualization and summary have been saved to the 'NN features' directory.") diff --git a/requirements.txt b/requirements.txt index a22c3358..05ea87ba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,5 @@ networkx~=3.2.1 scapy~=2.5.0 seaborn~=0.13.2 torchviz~=0.0.3 -tqdm~=4.66.6 \ No newline at end of file +torchvision~=0.20.1+cu124 +torchcam~=0.4.0 \ No newline at end of file From 891bfad97048b8e9680ba96beac6c7c6995981f5 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Tue, 10 Dec 2024 14:25:03 +0400 Subject: [PATCH 06/20] Added _plot.py Now added plot.py which shows a heatmap in bargraph form of the model and best 1000 features, as well as a .html file with a 3D plot graph of losses Fixed minor bug in _study_network.py where I returned old save_graph() function which now makes the gephi file have proper node counts --- CODE/VulnScan/tools/_plot.py | 140 ++++++++++++++++++++++++++ CODE/VulnScan/tools/_study_network.py | 48 +++++---- requirements.txt | 4 +- 3 files changed, 172 insertions(+), 20 deletions(-) create mode 100644 CODE/VulnScan/tools/_plot.py diff --git a/CODE/VulnScan/tools/_plot.py b/CODE/VulnScan/tools/_plot.py new file mode 100644 index 00000000..757a0b25 --- /dev/null +++ b/CODE/VulnScan/tools/_plot.py @@ -0,0 +1,140 @@ +import joblib +import matplotlib.pyplot as plt +import numpy as np +import plotly.graph_objects as go +import seaborn as sns +import torch +from sklearn.feature_extraction.text import TfidfTransformer +from torch.utils.data import DataLoader + + +# Visualize feature importance (dummy example for visualization) and save as SVG + show the plot +def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): + # Limit the number of tokens to visualize + TOKENS = TOKENS[:1000] + FEATURE_IMPORTANCE = FEATURE_IMPORTANCE[:1000] + + plt.figure(figsize=(len(TOKENS) * 0.5, 6)) + sns.barplot(x=TOKENS, y=FEATURE_IMPORTANCE, palette="coolwarm", hue=TOKENS, legend=False) + plt.title("Feature Importance") + plt.xlabel("Tokens") + plt.ylabel("Importance") + plt.xticks(rotation=45) + plt.savefig(FILENAME, format="svg") + plt.show() # Show the plot interactively + plt.close() # Close the plot to release memory + + +# Function to visualize the loss landscape as an interactive 3D object +def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON=0.01, FILENAME="Plot.html"): + MODEL.eval() # Set model to evaluation mode + param = next(MODEL.parameters()) # Use the first parameter for landscape perturbations + param_flat = param.view(-1) + + # Define perturbation directions u and v + u = torch.randn_like(param_flat).view(param.shape).to(param.device) + v = torch.randn_like(param_flat).view(param.shape).to(param.device) + + # Normalize perturbations + u = EPSILON * u / torch.norm(u) + v = EPSILON * v / torch.norm(v) + + # Create grid + x = np.linspace(-1, 1, GRID_SIZE) + y = np.linspace(-1, 1, GRID_SIZE) + loss_values = np.zeros((GRID_SIZE, GRID_SIZE)) + + # 
Iterate through the grid to compute losses + for i, dx in enumerate(x): + for j, dy in enumerate(y): + param.data += dx * u + dy * v # Apply perturbation + loss = 0 + + # Compute loss for all batches in data loader + for batch in DATA_LOADER: + inputs, targets = batch + inputs = inputs.to(param.device) + targets = targets.to(param.device) + outputs = MODEL(inputs) + loss += CRITERION(outputs, targets).item() + + loss_values[i, j] = loss # Store the loss + param.data -= dx * u + dy * v # Revert perturbation + + # Create a meshgrid for plotting + X, Y = np.meshgrid(x, y) + + # Plot the 3D surface using Plotly + fig = go.Figure(data=[go.Surface(z=loss_values, x=X, y=Y, colorscale="Viridis")]) + fig.update_layout( + title="Loss Landscape (Interactive 3D)", + scene=dict( + xaxis_title="Perturbation in u", + yaxis_title="Perturbation in v", + zaxis_title="Loss", + ), + ) + + # Save as an interactive HTML file + fig.write_html(FILENAME) + print(f"3D loss landscape saved as {FILENAME}") + + +# Example of DataLoader for loss landscape (dummy dataset for visualization) +class DummyDataset(torch.utils.data.Dataset): + def __init__(self, num_samples=100): + self.num_samples = num_samples + self.data = torch.randn(num_samples, 10000) # Increased number of features + self.labels = torch.randint(0, 2, (num_samples,)) # Binary labels + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + return self.data[idx], self.labels[idx] + + +if __name__ == "__main__": + # Check if GPU is available + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Using device: {device}") + + # Load vectorizer (change the path to your vectorizer .pkl file) + vectorizer_path = "../Vectorizer .3n3.pkl" + model_path = "../Model SenseMini .3n3.pth" + + # Load vectorizer + print(f"Loading vectorizer from: {vectorizer_path}") + with open(vectorizer_path, "rb") as f: + vectorizer = joblib.load(f) + + # Load model and move to the appropriate device (GPU/CPU) + print(f"Loading model from: {model_path}") + model = torch.load(model_path, weights_only=False) + model.to(device) # Move model to GPU or CPU + print(model) + + # Instantiate dummy data loader + print("Creating dummy data loader...") + dummy_data_loader = DataLoader(DummyDataset(), batch_size=32) + + # Define loss criterion + print("Defining loss criterion...") + criterion: torch.nn = torch.nn.CrossEntropyLoss() + + # Visualizations + print("Creating visualizations...") + tokens: TfidfTransformer = vectorizer.get_feature_names_out() + + # Feature importance (dummy data) + NUMBER_OF_FEATURES: int = -1 # Number of features to visualize, -1 for all + # Max number of features to visualize is 3000 due to image constraints + print(f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES])+1} tokens...") + feature_importance = np.random.rand(len(tokens[:NUMBER_OF_FEATURES])) # Example random importance + visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, FILENAME="NN features/feature_importance.svg") + + # Loss landscape + print("Visualizing loss landscape - This may take a while...") + plot_loss_landscape_3d(model, dummy_data_loader, criterion, FILENAME="NN features/loss_landscape_3d.html") + + print("Completed.") diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index c74546c7..905ff532 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -10,9 +10,10 @@ import torch import torch.nn 
as nn from torchviz import make_dot +from tqdm import tqdm -def save_graph(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): +def save_data(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): def register_hook(module): def hook(modules, inputs, output): @@ -120,27 +121,38 @@ def hook(modules, inputs, output): vf_ms.write("\n----------------------------------------------------------------\n") -def visualize_model(models, output_dir="NN features"): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - +def save_graph(): # Create a directed graph G = nx.DiGraph() - # Add nodes and edges to the graph - for model_i in models: - for names, param in model_i.named_parameters(): - G.add_node(names, size=param.numel()) - if param.requires_grad: - G.add_edge(names, f"{names}_grad") + def add_edges_bulk(layer_names, weight_matrices): + """Efficiently add edges to the graph with progress tracking.""" + threshold = 0.1 # Adjust this threshold as needed + significant_weights = np.abs(weight_matrices) > threshold + rows, cols = np.where(significant_weights) + weights = weight_matrices[rows, cols] + + # Use tqdm for progress tracking + edge_count = len(rows) + with tqdm(total=edge_count, desc=f"Processing {layer_names}", unit="edges") as pbar: + for row, col, weight in zip(rows, cols, weights): + in_node = f"{layer_names}_in_{col}" + out_node = f"{layer_names}_out_{row}" + G.add_edge(in_node, out_node, weight=weight) + pbar.update(1) - # Define the output file path - output_file = os.path.join(output_dir, "model.graphml") + # Process parameters + for name, param in model.named_parameters(): + if 'weight' in name: + layer_name = name.split('.')[0] + weight_matrix = param.data.cpu().numpy() - # Write the graph to a GraphML file - nx.write_graphml(G, output_file) + # Add edges with progress bar + add_edges_bulk(layer_name, weight_matrix) - print(f"Model visualization saved as {output_file}") + # Draw the graph + print("Writing the graph to a file...") + nx.write_gexf(G, "NN features/Neural Network Nodes Graph.gexf") if __name__ == '__main__': @@ -184,7 +196,7 @@ def visualize_model(models, output_dir="NN features"): # Save the model summary device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) - save_graph(model, input_size=(1, vectorizer.vocabulary_.__len__())) + save_data(model, input_size=(1, vectorizer.vocabulary_.__len__())) # Save the model's state dictionary with open('NN features/Model state dictionary.txt', 'w') as f: @@ -207,6 +219,6 @@ def visualize_model(models, output_dir="NN features"): os.remove("NN features/Model Visualization") # Visualize the model - visualize_model(model) + save_graph() print("Model visualization and summary have been saved to the 'NN features' directory.") diff --git a/requirements.txt b/requirements.txt index 05ea87ba..f186ddf6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,5 +19,5 @@ networkx~=3.2.1 scapy~=2.5.0 seaborn~=0.13.2 torchviz~=0.0.3 -torchvision~=0.20.1+cu124 -torchcam~=0.4.0 \ No newline at end of file +plotly~=5.24.1 +tqdm~=4.66.6 \ No newline at end of file From 7e88e1a21059c549190c610acff0365601ce7c95 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Tue, 10 Dec 2024 14:47:02 +0400 Subject: [PATCH 07/20] Fixed minor bugs Added checks if directories and files existed before write/appending to them --- CODE/VulnScan/tools/_plot.py | 9 ++++++++- CODE/VulnScan/tools/_study_network.py | 6 +++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git 
a/CODE/VulnScan/tools/_plot.py b/CODE/VulnScan/tools/_plot.py
index 757a0b25..247519f9 100644
--- a/CODE/VulnScan/tools/_plot.py
+++ b/CODE/VulnScan/tools/_plot.py
@@ -1,3 +1,5 @@
+import os
+
 import joblib
 import matplotlib.pyplot as plt
 import numpy as np
@@ -96,6 +98,9 @@ def __getitem__(self, idx):
 
 if __name__ == "__main__":
     # Check if GPU is available
+    if not os.path.exists('NN features'):
+        os.mkdir('NN features')
+
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"Using device: {device}")
 
@@ -112,7 +117,9 @@ def __getitem__(self, idx):
     print(f"Loading model from: {model_path}")
     model = torch.load(model_path, weights_only=False)
     model.to(device)  # Move model to GPU or CPU
-    print(model)
+    mode = "a" if os.path.exists("NN features/Model Summary.txt") else "w"
+    with open("NN features/Model Summary.txt", mode) as f:
+        f.write(str(model))
 
     # Instantiate dummy data loader
     print("Creating dummy data loader...")
diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py
index 905ff532..38305e01 100644
--- a/CODE/VulnScan/tools/_study_network.py
+++ b/CODE/VulnScan/tools/_study_network.py
@@ -64,7 +64,6 @@ def hook(modules, inputs, output):
 
     # batch_size of 2 for batch norm
     x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size]
-    # print(type(x[0]))
 
     # create properties
     summaries = OrderedDict()
@@ -74,14 +73,15 @@ def hook(modules, inputs, output):
     model_to_use.apply(register_hook)
 
     # make a forward pass
-    # print(x.shape)
     model_to_use(*x)
 
     # remove these hooks
     for h in hooks:
         h.remove()
 
-    with open('NN features/Model Summary.txt', 'w') as vf_ms:
+    # Save the summary
+    mode = "a" if os.path.exists("NN features/Model Summary.txt") else "w"
+    with open('NN features/Model Summary.txt', mode) as vf_ms:
         vf_ms.write("----------------------------------------------------------------\n")
         line_new = "{:>20} {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #")
         vf_ms.write(f"{line_new}\n")

From 7cb6df99035ec3700a6693fed91253589460627f Mon Sep 17 00:00:00 2001
From: DefinetlyNotAI
Date: Tue, 10 Dec 2024 21:44:12 +0400
Subject: [PATCH 08/20] Added many study features

Merged _plot.py into _study_network.py and added activation, weight
distribution, and t-SNE plots. Fixed some bugs, made sure all sample data
is either genuine or clearly synthetic, and modified config.ini to allow
paths to be set there.
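
A minimal sketch of how the new config-driven paths might be read (this
patch imports ConfigParser into _study_network.py, but the config.ini
diff is truncated below, so the section and key names here are
assumptions; the fallbacks mirror the hard-coded paths used earlier):

    from configparser import ConfigParser

    config = ConfigParser()
    config.read('config.ini')

    # Hypothetical section/key names - substitute whatever config.ini
    # actually defines for the VulnScan study tools.
    model_path = config.get('VulnScan.study Settings', 'model_path',
                            fallback='../Model SenseMini .3n3.pth')
    vectorizer_path = config.get('VulnScan.study Settings', 'vectorizer_path',
                                 fallback='../Vectorizer .3n3.pkl')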
--- .idea/Logicytics.iml | 1 + CODE/VulnScan/tools/_plot.py | 147 ---------- CODE/VulnScan/tools/_study_network.py | 407 ++++++++++++++++++++++++-- CODE/VulnScan/v3/_train.py | 1 + CODE/config.ini | 58 ++-- 5 files changed, 414 insertions(+), 200 deletions(-) delete mode 100644 CODE/VulnScan/tools/_plot.py diff --git a/.idea/Logicytics.iml b/.idea/Logicytics.iml index 9d371a5c..235b40bc 100644 --- a/.idea/Logicytics.iml +++ b/.idea/Logicytics.iml @@ -35,6 +35,7 @@ diff --git a/CODE/VulnScan/tools/_plot.py b/CODE/VulnScan/tools/_plot.py deleted file mode 100644 index 247519f9..00000000 --- a/CODE/VulnScan/tools/_plot.py +++ /dev/null @@ -1,147 +0,0 @@ -import os - -import joblib -import matplotlib.pyplot as plt -import numpy as np -import plotly.graph_objects as go -import seaborn as sns -import torch -from sklearn.feature_extraction.text import TfidfTransformer -from torch.utils.data import DataLoader - - -# Visualize feature importance (dummy example for visualization) and save as SVG + show the plot -def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): - # Limit the number of tokens to visualize - TOKENS = TOKENS[:1000] - FEATURE_IMPORTANCE = FEATURE_IMPORTANCE[:1000] - - plt.figure(figsize=(len(TOKENS) * 0.5, 6)) - sns.barplot(x=TOKENS, y=FEATURE_IMPORTANCE, palette="coolwarm", hue=TOKENS, legend=False) - plt.title("Feature Importance") - plt.xlabel("Tokens") - plt.ylabel("Importance") - plt.xticks(rotation=45) - plt.savefig(FILENAME, format="svg") - plt.show() # Show the plot interactively - plt.close() # Close the plot to release memory - - -# Function to visualize the loss landscape as an interactive 3D object -def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON=0.01, FILENAME="Plot.html"): - MODEL.eval() # Set model to evaluation mode - param = next(MODEL.parameters()) # Use the first parameter for landscape perturbations - param_flat = param.view(-1) - - # Define perturbation directions u and v - u = torch.randn_like(param_flat).view(param.shape).to(param.device) - v = torch.randn_like(param_flat).view(param.shape).to(param.device) - - # Normalize perturbations - u = EPSILON * u / torch.norm(u) - v = EPSILON * v / torch.norm(v) - - # Create grid - x = np.linspace(-1, 1, GRID_SIZE) - y = np.linspace(-1, 1, GRID_SIZE) - loss_values = np.zeros((GRID_SIZE, GRID_SIZE)) - - # Iterate through the grid to compute losses - for i, dx in enumerate(x): - for j, dy in enumerate(y): - param.data += dx * u + dy * v # Apply perturbation - loss = 0 - - # Compute loss for all batches in data loader - for batch in DATA_LOADER: - inputs, targets = batch - inputs = inputs.to(param.device) - targets = targets.to(param.device) - outputs = MODEL(inputs) - loss += CRITERION(outputs, targets).item() - - loss_values[i, j] = loss # Store the loss - param.data -= dx * u + dy * v # Revert perturbation - - # Create a meshgrid for plotting - X, Y = np.meshgrid(x, y) - - # Plot the 3D surface using Plotly - fig = go.Figure(data=[go.Surface(z=loss_values, x=X, y=Y, colorscale="Viridis")]) - fig.update_layout( - title="Loss Landscape (Interactive 3D)", - scene=dict( - xaxis_title="Perturbation in u", - yaxis_title="Perturbation in v", - zaxis_title="Loss", - ), - ) - - # Save as an interactive HTML file - fig.write_html(FILENAME) - print(f"3D loss landscape saved as {FILENAME}") - - -# Example of DataLoader for loss landscape (dummy dataset for visualization) -class DummyDataset(torch.utils.data.Dataset): - def __init__(self, num_samples=100): - 
self.num_samples = num_samples - self.data = torch.randn(num_samples, 10000) # Increased number of features - self.labels = torch.randint(0, 2, (num_samples,)) # Binary labels - - def __len__(self): - return self.num_samples - - def __getitem__(self, idx): - return self.data[idx], self.labels[idx] - - -if __name__ == "__main__": - # Check if GPU is available - if not os.path.exists('NN features'): - os.mkdir('NN features') - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print(f"Using device: {device}") - - # Load vectorizer (change the path to your vectorizer .pkl file) - vectorizer_path = "../Vectorizer .3n3.pkl" - model_path = "../Model SenseMini .3n3.pth" - - # Load vectorizer - print(f"Loading vectorizer from: {vectorizer_path}") - with open(vectorizer_path, "rb") as f: - vectorizer = joblib.load(f) - - # Load model and move to the appropriate device (GPU/CPU) - print(f"Loading model from: {model_path}") - model = torch.load(model_path, weights_only=False) - model.to(device) # Move model to GPU or CPU - mode = "a" if os.path.exists("NN features/Model Summary.txt") else "w" - with open("NN features/Model Summary.txt", mode) as f: - f.write(str(model)) - - # Instantiate dummy data loader - print("Creating dummy data loader...") - dummy_data_loader = DataLoader(DummyDataset(), batch_size=32) - - # Define loss criterion - print("Defining loss criterion...") - criterion: torch.nn = torch.nn.CrossEntropyLoss() - - # Visualizations - print("Creating visualizations...") - tokens: TfidfTransformer = vectorizer.get_feature_names_out() - - # Feature importance (dummy data) - NUMBER_OF_FEATURES: int = -1 # Number of features to visualize, -1 for all - # Max number of features to visualize is 3000 due to image constraints - print(f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES])+1} tokens...") - feature_importance = np.random.rand(len(tokens[:NUMBER_OF_FEATURES])) # Example random importance - visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, FILENAME="NN features/feature_importance.svg") - - # Loss landscape - print("Visualizing loss landscape - This may take a while...") - plot_loss_landscape_3d(model, dummy_data_loader, criterion, FILENAME="NN features/loss_landscape_3d.html") - - print("Completed.") diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index 38305e01..1b04e0ef 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -1,18 +1,300 @@ +import os import os.path +import random from collections import OrderedDict +from configparser import ConfigParser from os import mkdir import joblib import matplotlib.pyplot as plt import networkx as nx import numpy as np +import plotly.graph_objects as go import seaborn as sns import torch import torch.nn as nn +from faker import Faker +from sklearn.manifold import TSNE +from torch.utils.data import DataLoader, TensorDataset from torchviz import make_dot from tqdm import tqdm +# Example of DataLoader for loss landscape (dummy dataset for visualization) +class DummyDataset(torch.utils.data.Dataset): + def __init__(self, num_samples=100, input_dim=10000): + self.num_samples = num_samples + self.input_dim = input_dim + self.data = [] + self.labels = [] + faker = Faker() + for _ in range(num_samples): + if random.random() < 0.05: # 5% chance to include sensitive data + self.data.append(f"Name: {faker.name()}, SSN: {faker.ssn()}, Address: {faker.address()}") + 
self.labels.append(1) # Label as sensitive + else: + self.data.append(faker.text(max_nb_chars=100)) # Non-sensitive data + self.labels.append(0) # Label as non-sensitive + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + data = self.data[idx] + label = self.labels[idx] + # Convert data to tensor of ASCII values and pad to input_dim + data_tensor = torch.tensor([ord(c) for c in data], dtype=torch.float32) + if len(data_tensor) < self.input_dim: + padding = torch.zeros(self.input_dim - len(data_tensor)) + data_tensor = torch.cat((data_tensor, padding)) + else: + data_tensor = data_tensor[:self.input_dim] + label_tensor = torch.tensor(label, dtype=torch.long) + return data_tensor, label_tensor + + +def load_data(text_data, vectorizer_to_load): + # Vectorize the text data + X = vectorizer_to_load.transform(text_data) + # Create a dummy label for visualization (replace with real labels if available) + y = np.zeros(len(text_data)) + # Convert to torch tensors + X_tensor = torch.tensor(X.toarray(), dtype=torch.float32) + y_tensor = torch.tensor(y, dtype=torch.long) + dataset = TensorDataset(X_tensor, y_tensor) + return DataLoader(dataset, batch_size=32, shuffle=True) + + +def visualize_weight_distribution(model_to_load): + # Access weights of the first layer + weights = model_to_load[0].weight.detach().cpu().numpy() # Move tensor to CPU before conversion to numpy + plt.hist(weights.flatten(), bins=50) + plt.title("Weight Distribution - First Layer") + plt.xlabel("Weight Value") + plt.ylabel("Frequency") + plt.savefig("NN features/Weight Distribution.png") + plt.close() + + +def visualize_activations(model_to_load, input_tensor): + # Check the device of the model + device_va = next(model_to_load.parameters()).device + + # Move the input tensor to the same device as the model + input_tensor = input_tensor.to(device_va) + + activations = [] + + # noinspection PyUnusedLocal + def hook_fn(module, inputx, output): + # Hook function to extract intermediate layer activations + activations.append(output) + + model_to_load[0].register_forward_hook(hook_fn) # Register hook on first layer + + # Perform a forward pass + _ = model_to_load(input_tensor) + activation = activations[0].detach().cpu().numpy() # Move activations to CPU + + # Plot activations as a bar chart + plt.figure(figsize=(10, 6)) + plt.bar(range(len(activation[0])), activation[0]) + plt.title("Activation Values - First Layer") + plt.xlabel("Neuron Index") + plt.ylabel("Activation Value") + plt.savefig("NN features/Visualize Activation.png") + plt.close() + + +def visualize_tsne(model_to_load, dataloader): + # Get the device of the model + device_va = next(model_to_load.parameters()).device + + model_to_load.eval() # Set the model to evaluation mode + + features = [] + labels = [] + + with torch.no_grad(): + for data, target in dataloader: + # Move data and target to the same device as the model + data, target = data.to(device_va), target.to(device_va) + + # Extract features (output of the model) + output = model_to_load(data) + features.append(output.cpu().numpy()) # Move output to CPU for concatenation + labels.append(target.cpu().numpy()) # Move target to CPU for concatenation + + # Stack all batches + features = np.vstack(features) + labels = np.hstack(labels) + + # Determine suitable perplexity + num_samples = features.shape[0] + perplexity = min(30, num_samples - 1) # Ensure perplexity < num_samples + + # Apply t-SNE + tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity) + reduced_features = 
tsne.fit_transform(features) + + # Plot the t-SNE results + plt.figure(figsize=(10, 8)) + scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=labels, cmap='viridis', alpha=0.7) + plt.colorbar(scatter, label="Class") + plt.title("t-SNE Visualization of Features") + plt.xlabel("t-SNE Dimension 1") + plt.ylabel("t-SNE Dimension 2") + plt.savefig("NN features/Visualize t-SNE.png") + plt.close() + + +# Main function to run all visualizations +def plot_many_graphs(): + print("Starting synthetic data generation...") + # Load data + faker = Faker() + + # Generate sensitive examples + sensitive_data = [ + f"Name: {faker.name()}, SSN: {faker.ssn()}, Address: {faker.address()}", + f"Credit Card: {faker.credit_card_number()}, Expiry: {faker.credit_card_expire()}, CVV: {faker.credit_card_security_code()}", + f"Patient: {faker.name()}, Condition: {faker.text(max_nb_chars=20)}", + f"Password: {faker.password()}", + f"Email: {faker.email()}", + f"Phone: {faker.phone_number()}", + f"Medical Record: {faker.md5()}", + f"Username: {faker.user_name()}", + f"IP: {faker.ipv4()}", + ] + + # Generate non-sensitive examples + non_sensitive_data = [ + faker.text(max_nb_chars=50) for _ in range(50000) + ] + + data_text = non_sensitive_data + (sensitive_data * 15) + random.shuffle(data_text) + print("Loaded data for visualization.") + dataloader = load_data(data_text, vectorizer) + + # Visualizations + print("Creating visualizations...") + visualize_weight_distribution(model) + + # For activations, use a sample from the dataloader + print("Creating activation visualizations...") + sample_input = next(iter(dataloader))[0] + visualize_activations(model, sample_input) + + print("Creating t-SNE visualization - May take a long time...") + visualize_tsne(model, dataloader) + + print("Completed.") + + +# Visualize feature importance (dummy example for visualization) and save as SVG + show the plot +def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): + # Limit the number of tokens to visualize + TOKENS = TOKENS[:1000] + FEATURE_IMPORTANCE = FEATURE_IMPORTANCE[:1000] + + plt.figure(figsize=(len(TOKENS) * 0.5, 6)) + sns.barplot(x=TOKENS, y=FEATURE_IMPORTANCE, palette="coolwarm", hue=TOKENS, legend=False) + plt.title("Feature Importance") + plt.xlabel("Tokens") + plt.ylabel("Importance") + plt.xticks(rotation=45) + plt.savefig(FILENAME, format="svg") + plt.close() # Close the plot to release memory + + +# Function to visualize the loss landscape as an interactive 3D object +def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON=0.01, FILENAME="Plot.html"): + MODEL.eval() # Set model to evaluation mode + param = next(MODEL.parameters()) # Use the first parameter for landscape perturbations + param_flat = param.view(-1) + + # Define perturbation directions u and v + u = torch.randn_like(param_flat).view(param.shape).to(param.device) + v = torch.randn_like(param_flat).view(param.shape).to(param.device) + + # Normalize perturbations + u = EPSILON * u / torch.norm(u) + v = EPSILON * v / torch.norm(v) + + # Create grid + x = np.linspace(-1, 1, GRID_SIZE) + y = np.linspace(-1, 1, GRID_SIZE) + loss_values = np.zeros((GRID_SIZE, GRID_SIZE)) + + # Iterate through the grid to compute losses + for i, dx in enumerate(x): + print(f"Computing loss for row {i+1}/{GRID_SIZE}...") + for j, dy in enumerate(y): + print(f" Computing loss for column {j+1}/{GRID_SIZE}...") + param.data += dx * u + dy * v # Apply perturbation + loss = 0 + + # Compute loss for all batches in 
data loader + for batch in DATA_LOADER: + print(f" Computing loss for batch: {batch}...") + inputs, targets = batch + inputs = inputs.to(param.device) + targets = targets.to(param.device) + outputs = MODEL(inputs) + loss += CRITERION(outputs, targets).item() + + loss_values[i, j] = loss # Store the loss + param.data -= dx * u + dy * v # Revert perturbation + + # Create a meshgrid for plotting + X, Y = np.meshgrid(x, y) + + # Plot the 3D surface using Plotly + fig = go.Figure(data=[go.Surface(z=loss_values, x=X, y=Y, colorscale="Viridis")]) + fig.update_layout( + title="Loss Landscape (Interactive 3D)", + scene=dict( + xaxis_title="Perturbation in u", + yaxis_title="Perturbation in v", + zaxis_title="Loss", + ), + ) + + # Save as an interactive HTML file + fig.write_html(FILENAME) + print(f"3D loss landscape saved as {FILENAME}") + + +def main_plot(): + # Instantiate data loader + print("Creating dummy data loader...") + dummy_data_loader = DataLoader(DummyDataset(), batch_size=32) + + # Define loss criterion + print("Defining loss criterion...") + criterion = torch.nn.CrossEntropyLoss() + + # Visualizations + print("Creating visualizations...") + tokens = vectorizer.get_feature_names_out() + + # Feature importance + # Max number of features to visualize is 3000 due to image constraints + print(f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES])+1} tokens...") + feature_importance = np.random.rand(len(tokens[:NUMBER_OF_FEATURES])) # Example random importance + visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, FILENAME="NN features/feature_importance.svg") + + # Loss landscape + print("Visualizing loss landscape - This may take a while...") + plot_loss_landscape_3d(model, dummy_data_loader, criterion, FILENAME="NN features/loss_landscape_3d.html") + + # Set model to evaluation mode, and plot many graphs + print("Setting model to evaluation mode...") + model.eval() # Set the model to evaluation mode + plot_many_graphs() + + def save_data(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): def register_hook(module): @@ -155,25 +437,25 @@ def add_edges_bulk(layer_names, weight_matrices): nx.write_gexf(G, "NN features/Neural Network Nodes Graph.gexf") -if __name__ == '__main__': +def setup_environment(): print("Visualizing the model and vectorizer features...") print("This may take a while, please wait.") if not os.path.exists('NN features'): mkdir('NN features') - # Load the vectorizer - vectorizer_path = '../Vectorizer .3n3.pkl' - vectorizer = joblib.load(vectorizer_path) - # Inspect the vectorizer - feature_names = vectorizer.get_feature_names_out() - with open('NN features/Vectorizer features.txt', 'w') as f: - f.write(f"Number of features: {len(feature_names)}\n\n") - f.write('\n'.join(feature_names)) +def load_vectorizer(): + vectorizer_load = joblib.load(vectorizer_path) + feature_names = vectorizer_load.get_feature_names_out() + with open('NN features/Vectorizer features.txt', 'w') as file: + file.write(f"Number of features: {len(feature_names)}\n\n") + file.write('\n'.join(feature_names)) + return vectorizer_load + - # Visualize the top 90 features - top_n = 90 +def visualize_top_features(top_n=90): + feature_names = vectorizer.get_feature_names_out() sorted_indices = vectorizer.idf_.argsort()[:top_n] top_features = [feature_names[i] for i in sorted_indices] top_idf_scores = vectorizer.idf_[sorted_indices] @@ -186,39 +468,102 @@ def add_edges_bulk(layer_names, weight_matrices): # Save the plot as a vector 
graphic plt.savefig('NN features/Top_90_Features.svg', format='svg') - plt.show() - # Load the model - model_path = '../Model SenseMini .3n3.pth' - model = torch.load(model_path, weights_only=False) - # Save the model summary - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - model.to(device) - save_data(model, input_size=(1, vectorizer.vocabulary_.__len__())) +def load_model(): + device_load = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model_load = torch.load(model_path, weights_only=False) + model_load.to(device_load) + return model_load, device_load + - # Save the model's state dictionary - with open('NN features/Model state dictionary.txt', 'w') as f: - f.write("Model's state dictionary:\n\n") +def save_model_state_dict(): + with open('NN features/Model state dictionary.txt', 'w') as file: + file.write("Model's state dictionary:\n\n") for param_tensor in model.state_dict(): - f.write(f"\n{param_tensor}\t{model.state_dict()[param_tensor].size()}") + file.write(f"\n{param_tensor}\t{model.state_dict()[param_tensor].size()}") - # Create a dummy input tensor with the appropriate size - dummy_input = torch.randn(1, vectorizer.vocabulary_.__len__()).to(device) - # Generate the visualization +def generate_model_visualization(): + dummy_input = torch.randn(1, vectorizer.vocabulary_.__len__()).to(device) model_viz = make_dot(model(dummy_input), params=dict(model.named_parameters()), show_attrs=True, show_saved=True) - - # Save the visualization to a file model_viz.format = 'png' model_viz.render(filename='NN features/Model Visualization', format='png') - # Removing the temporary files as they are no longer needed, we saved them to the desired location + +def cleanup_temp_files(): if os.path.exists("NN features/Model Visualization"): os.remove("NN features/Model Visualization") - # Visualize the model - save_graph() +def model_summary(): + mode = "a" if os.path.exists("NN features/Model Summary.txt") else "w" + with open("NN features/Model Summary.txt", mode) as file: + file.write(str(model)) + + +if __name__ == '__main__': + # Print the welcome message + print("===========================================================================================") + print("= This script will visualize the features of the model and vectorizer. =") + print("= Please ensure that the model and vectorizer files are present in the specified paths. =") + print("= The visualization will be saved in the 'NN features' directory. =") + print("= This script will take a while to run, please be patient. =") + print("===========================================================================================") + + # Read the config file + print("\n\nReading config file and setting up...") + config = ConfigParser() + config.read('../../config.ini') + + setup_environment() + + # Load the paths from the config file + vectorizer_path = config.get('VulnScan.study Settings', 'vectorizer_path') + model_path = config.get('VulnScan.study Settings', 'model_path') + NUMBER_OF_FEATURES = config.get('VulnScan.study Settings', 'number_of_features') + + # Check if the paths exist + if not os.path.exists(vectorizer_path): + print(f"Vectorizer file not found. Please double check the path {vectorizer_path}.") + exit(1) + if not os.path.exists(model_path): + print(f"Model file not found. 
Please double check the path {model_path}.") + exit(1) + + # Load the vectorizer and model + vectorizer = load_vectorizer() + visualize_top_features() + model, device = load_model() + # Save the model summary, state dictionary, and visualization + save_data(model, input_size=(1, vectorizer.vocabulary_.__len__())) + save_model_state_dict() + generate_model_visualization() + cleanup_temp_files() + save_graph() print("Model visualization and summary have been saved to the 'NN features' directory.") + + # Check if GPU is available + if not os.path.exists('NN features'): + os.mkdir('NN features') + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Using device: {device}") + + # Load vectorizer (change the path to your vectorizer .pkl file) + vectorizer_path = "../Vectorizer .3n3.pkl" + model_path = "../Model SenseMini .3n3.pth" + + # Load vectorizer + print(f"Reloading vectorizer from: {vectorizer_path}") + with open(vectorizer_path, "rb") as f: + vectorizer = joblib.load(f) + + # Load model and move to the appropriate device (GPU/CPU) + print(f"Reloading model from: {model_path}") + model = torch.load(model_path, weights_only=False) + model.to(device) # Move model to GPU or CPU + + model_summary() + main_plot() diff --git a/CODE/VulnScan/v3/_train.py b/CODE/VulnScan/v3/_train.py index c9fa7ee7..1600fee6 100644 --- a/CODE/VulnScan/v3/_train.py +++ b/CODE/VulnScan/v3/_train.py @@ -178,6 +178,7 @@ def select_model_from_traditional(model_name: str, logger.error(f"Invalid model name: {model_name}") exit(1) + def train_traditional_model(model_name: str, epochs: int, save_model_path: str): diff --git a/CODE/config.ini b/CODE/config.ini index f24190ce..cbc5986f 100644 --- a/CODE/config.ini +++ b/CODE/config.ini @@ -27,29 +27,10 @@ timeout = 10 ################################################### -[VulnScan.train Settings] -# The following settings are for the Train module for training models -# NeuralNetwork seems to be the best choice for this task -# Options: "NeuralNetwork", "LogReg", -# "RandomForest", "ExtraTrees", "GBM", -# "XGBoost", "DecisionTree", "NaiveBayes" -model_name = NeuralNetwork -# General Training Parameters -epochs = 10 -batch_size = 32 -learning_rate = 0.001 -use_cuda = true - -# Paths to train and save data -train_data_path = C:\Users\Hp\Desktop\Model Tests\Model Data\GeneratedData -# If all models are to be trained, this is the path to save all models, -# and will be appended with the model codename and follow naming convention -save_model_path = C:\Users\Hp\Desktop\Model Tests\Model SenseMini - [VulnScan.generate Settings] # The following settings are for the Generate module for fake training data extensions = .txt, .log, .md, .csv, .json, .xml, .html, .yaml, .ini, .pdf, .docx, .xlsx, .pptx -save_path = C:\Users\Hp\Desktop\Model Tests\Generated Data +save_path = PATH # Options include: # 'Sense' - Generates 50k files, each 25KB in size. # 'SenseNano' - Generates 5 files, each 5KB in size. 
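[Editor's sketch] The 'Sense' and 'SenseNano' presets described above map directly to a file count and a per-file size. A minimal, hypothetical sketch of how the Generate module could realize them with Faker — the PRESETS table and generate_fake_files helper are illustrations based on the comments above, not the module's actual API:

import os
from faker import Faker

# Hypothetical preset table taken from the config comments:
# 'Sense' -> 50k files of 25KB each, 'SenseNano' -> 5 files of 5KB each.
PRESETS = {"Sense": (50_000, 25 * 1024), "SenseNano": (5, 5 * 1024)}

def generate_fake_files(save_path: str, preset: str = "SenseNano", extension: str = ".txt"):
    faker = Faker()
    num_files, max_size = PRESETS[preset]
    os.makedirs(save_path, exist_ok=True)
    for i in range(num_files):
        content = ""
        # Append random text until the file reaches the preset size
        while len(content.encode("utf-8")) < max_size:
            content += faker.text(max_nb_chars=200) + "\n"
        with open(os.path.join(save_path, f"file_{i}{extension}"), "w", encoding="utf-8") as f:
            f.write(content)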
@@ -79,11 +60,44 @@ partial_sensitive_chance = 0.2
 # Use the vectorizer supplied for any v3 model on SenseMini
 # The path to the data to vectorize, either a file or a directory
-data_path = C:\Users\Hp\Desktop\Model Tests\Model Data\GeneratedData
+data_path = PATH
 # The path to save the vectorized data - It will automatically be appended '\Vectorizer.pkl'
 # Make sure the path is a directory, and it exists
-output_path = C:\Users\Hp\Desktop\Model Tests\Model Sense - Vectorizer
+output_path = PATH
 # Vectorizer to use, options include:
 # tfidf or count - The code for the training only supports tfidf - we advise to use tfidf
 vectorizer_type = tfidf
+
+[VulnScan.train Settings]
+# The following settings are for the Train module for training models
+# NeuralNetwork seems to be the best choice for this task
+# Options: "NeuralNetwork", "LogReg",
+# "RandomForest", "ExtraTrees", "GBM",
+# "XGBoost", "DecisionTree", "NaiveBayes"
+model_name = NeuralNetwork
+# General Training Parameters
+epochs = 10
+batch_size = 32
+learning_rate = 0.001
+use_cuda = true
+
+# Paths to train and save data
+train_data_path = PATH
+# If all models are to be trained, this is the path to save all models;
+# it will be appended with the model codename, following the naming convention
+save_model_path = PATH
+
+[VulnScan.study Settings]
+# Here are the basics of the study module
+# It generates graphs and data that may help in understanding the model
+# Everything is available online, pre-studied, so running it is not necessary,
+# but it is useful for understanding the model locally
+# All files will be saved here; this cannot be changed. PATH is "NN features/"
+
+# These are the paths to the model and the vectorizer
+model_path = PATH
+vectorizer_path = PATH
+# Number of features to visualise in the SVG bar graph; the maximum is 3000 due to limitations
+# Setting -1 will visualise the first 3000 features. The bar is a color-gradient heatmap.
+number_of_features = -1

From 4587dc87ff76f309b4491a4b110a5094ef0a723b Mon Sep 17 00:00:00 2001
From: DefinetlyNotAI
Date: Tue, 10 Dec 2024 21:44:12 +0400
Subject: [PATCH 09/20] Added many study features

Also merged _plot.py into _study_network.py; added the activation,
weight-distribution, and t-SNE plots; fixed some bugs; made sure all data is
either genuine or synthetic; and modified config.ini so the model and
vectorizer paths can be set there.
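[Editor's sketch] The DummyDataset added below encodes each synthetic string as a fixed-length tensor of character code points, zero-padded or truncated to input_dim. A minimal sketch of that encoding, using a hypothetical encode() helper and a toy input_dim of 4 for brevity:

import torch

def encode(text: str, input_dim: int) -> torch.Tensor:
    # Map each character to its code point, as in DummyDataset.__getitem__
    t = torch.tensor([ord(c) for c in text], dtype=torch.float32)
    if len(t) < input_dim:
        # Zero-pad short strings up to input_dim
        t = torch.cat((t, torch.zeros(input_dim - len(t))))
    return t[:input_dim]  # truncate longer strings

print(encode("Hi", 4))  # tensor([ 72., 105.,   0.,   0.])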
--- .idea/Logicytics.iml | 1 + CODE/VulnScan/tools/_plot.py | 147 ---------- CODE/VulnScan/tools/_study_network.py | 407 ++++++++++++++++++++++++-- CODE/VulnScan/v3/_train.py | 1 + CODE/config.ini | 58 ++-- 5 files changed, 414 insertions(+), 200 deletions(-) delete mode 100644 CODE/VulnScan/tools/_plot.py diff --git a/.idea/Logicytics.iml b/.idea/Logicytics.iml index 9d371a5c..235b40bc 100644 --- a/.idea/Logicytics.iml +++ b/.idea/Logicytics.iml @@ -35,6 +35,7 @@ diff --git a/CODE/VulnScan/tools/_plot.py b/CODE/VulnScan/tools/_plot.py deleted file mode 100644 index 247519f9..00000000 --- a/CODE/VulnScan/tools/_plot.py +++ /dev/null @@ -1,147 +0,0 @@ -import os - -import joblib -import matplotlib.pyplot as plt -import numpy as np -import plotly.graph_objects as go -import seaborn as sns -import torch -from sklearn.feature_extraction.text import TfidfTransformer -from torch.utils.data import DataLoader - - -# Visualize feature importance (dummy example for visualization) and save as SVG + show the plot -def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): - # Limit the number of tokens to visualize - TOKENS = TOKENS[:1000] - FEATURE_IMPORTANCE = FEATURE_IMPORTANCE[:1000] - - plt.figure(figsize=(len(TOKENS) * 0.5, 6)) - sns.barplot(x=TOKENS, y=FEATURE_IMPORTANCE, palette="coolwarm", hue=TOKENS, legend=False) - plt.title("Feature Importance") - plt.xlabel("Tokens") - plt.ylabel("Importance") - plt.xticks(rotation=45) - plt.savefig(FILENAME, format="svg") - plt.show() # Show the plot interactively - plt.close() # Close the plot to release memory - - -# Function to visualize the loss landscape as an interactive 3D object -def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON=0.01, FILENAME="Plot.html"): - MODEL.eval() # Set model to evaluation mode - param = next(MODEL.parameters()) # Use the first parameter for landscape perturbations - param_flat = param.view(-1) - - # Define perturbation directions u and v - u = torch.randn_like(param_flat).view(param.shape).to(param.device) - v = torch.randn_like(param_flat).view(param.shape).to(param.device) - - # Normalize perturbations - u = EPSILON * u / torch.norm(u) - v = EPSILON * v / torch.norm(v) - - # Create grid - x = np.linspace(-1, 1, GRID_SIZE) - y = np.linspace(-1, 1, GRID_SIZE) - loss_values = np.zeros((GRID_SIZE, GRID_SIZE)) - - # Iterate through the grid to compute losses - for i, dx in enumerate(x): - for j, dy in enumerate(y): - param.data += dx * u + dy * v # Apply perturbation - loss = 0 - - # Compute loss for all batches in data loader - for batch in DATA_LOADER: - inputs, targets = batch - inputs = inputs.to(param.device) - targets = targets.to(param.device) - outputs = MODEL(inputs) - loss += CRITERION(outputs, targets).item() - - loss_values[i, j] = loss # Store the loss - param.data -= dx * u + dy * v # Revert perturbation - - # Create a meshgrid for plotting - X, Y = np.meshgrid(x, y) - - # Plot the 3D surface using Plotly - fig = go.Figure(data=[go.Surface(z=loss_values, x=X, y=Y, colorscale="Viridis")]) - fig.update_layout( - title="Loss Landscape (Interactive 3D)", - scene=dict( - xaxis_title="Perturbation in u", - yaxis_title="Perturbation in v", - zaxis_title="Loss", - ), - ) - - # Save as an interactive HTML file - fig.write_html(FILENAME) - print(f"3D loss landscape saved as {FILENAME}") - - -# Example of DataLoader for loss landscape (dummy dataset for visualization) -class DummyDataset(torch.utils.data.Dataset): - def __init__(self, num_samples=100): - 
self.num_samples = num_samples - self.data = torch.randn(num_samples, 10000) # Increased number of features - self.labels = torch.randint(0, 2, (num_samples,)) # Binary labels - - def __len__(self): - return self.num_samples - - def __getitem__(self, idx): - return self.data[idx], self.labels[idx] - - -if __name__ == "__main__": - # Check if GPU is available - if not os.path.exists('NN features'): - os.mkdir('NN features') - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print(f"Using device: {device}") - - # Load vectorizer (change the path to your vectorizer .pkl file) - vectorizer_path = "../Vectorizer .3n3.pkl" - model_path = "../Model SenseMini .3n3.pth" - - # Load vectorizer - print(f"Loading vectorizer from: {vectorizer_path}") - with open(vectorizer_path, "rb") as f: - vectorizer = joblib.load(f) - - # Load model and move to the appropriate device (GPU/CPU) - print(f"Loading model from: {model_path}") - model = torch.load(model_path, weights_only=False) - model.to(device) # Move model to GPU or CPU - mode = "a" if os.path.exists("NN features/Model Summary.txt") else "w" - with open("NN features/Model Summary.txt", mode) as f: - f.write(str(model)) - - # Instantiate dummy data loader - print("Creating dummy data loader...") - dummy_data_loader = DataLoader(DummyDataset(), batch_size=32) - - # Define loss criterion - print("Defining loss criterion...") - criterion: torch.nn = torch.nn.CrossEntropyLoss() - - # Visualizations - print("Creating visualizations...") - tokens: TfidfTransformer = vectorizer.get_feature_names_out() - - # Feature importance (dummy data) - NUMBER_OF_FEATURES: int = -1 # Number of features to visualize, -1 for all - # Max number of features to visualize is 3000 due to image constraints - print(f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES])+1} tokens...") - feature_importance = np.random.rand(len(tokens[:NUMBER_OF_FEATURES])) # Example random importance - visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, FILENAME="NN features/feature_importance.svg") - - # Loss landscape - print("Visualizing loss landscape - This may take a while...") - plot_loss_landscape_3d(model, dummy_data_loader, criterion, FILENAME="NN features/loss_landscape_3d.html") - - print("Completed.") diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index 38305e01..1b04e0ef 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -1,18 +1,300 @@ +import os import os.path +import random from collections import OrderedDict +from configparser import ConfigParser from os import mkdir import joblib import matplotlib.pyplot as plt import networkx as nx import numpy as np +import plotly.graph_objects as go import seaborn as sns import torch import torch.nn as nn +from faker import Faker +from sklearn.manifold import TSNE +from torch.utils.data import DataLoader, TensorDataset from torchviz import make_dot from tqdm import tqdm +# Example of DataLoader for loss landscape (dummy dataset for visualization) +class DummyDataset(torch.utils.data.Dataset): + def __init__(self, num_samples=100, input_dim=10000): + self.num_samples = num_samples + self.input_dim = input_dim + self.data = [] + self.labels = [] + faker = Faker() + for _ in range(num_samples): + if random.random() < 0.05: # 5% chance to include sensitive data + self.data.append(f"Name: {faker.name()}, SSN: {faker.ssn()}, Address: {faker.address()}") + 
self.labels.append(1) # Label as sensitive + else: + self.data.append(faker.text(max_nb_chars=100)) # Non-sensitive data + self.labels.append(0) # Label as non-sensitive + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + data = self.data[idx] + label = self.labels[idx] + # Convert data to tensor of ASCII values and pad to input_dim + data_tensor = torch.tensor([ord(c) for c in data], dtype=torch.float32) + if len(data_tensor) < self.input_dim: + padding = torch.zeros(self.input_dim - len(data_tensor)) + data_tensor = torch.cat((data_tensor, padding)) + else: + data_tensor = data_tensor[:self.input_dim] + label_tensor = torch.tensor(label, dtype=torch.long) + return data_tensor, label_tensor + + +def load_data(text_data, vectorizer_to_load): + # Vectorize the text data + X = vectorizer_to_load.transform(text_data) + # Create a dummy label for visualization (replace with real labels if available) + y = np.zeros(len(text_data)) + # Convert to torch tensors + X_tensor = torch.tensor(X.toarray(), dtype=torch.float32) + y_tensor = torch.tensor(y, dtype=torch.long) + dataset = TensorDataset(X_tensor, y_tensor) + return DataLoader(dataset, batch_size=32, shuffle=True) + + +def visualize_weight_distribution(model_to_load): + # Access weights of the first layer + weights = model_to_load[0].weight.detach().cpu().numpy() # Move tensor to CPU before conversion to numpy + plt.hist(weights.flatten(), bins=50) + plt.title("Weight Distribution - First Layer") + plt.xlabel("Weight Value") + plt.ylabel("Frequency") + plt.savefig("NN features/Weight Distribution.png") + plt.close() + + +def visualize_activations(model_to_load, input_tensor): + # Check the device of the model + device_va = next(model_to_load.parameters()).device + + # Move the input tensor to the same device as the model + input_tensor = input_tensor.to(device_va) + + activations = [] + + # noinspection PyUnusedLocal + def hook_fn(module, inputx, output): + # Hook function to extract intermediate layer activations + activations.append(output) + + model_to_load[0].register_forward_hook(hook_fn) # Register hook on first layer + + # Perform a forward pass + _ = model_to_load(input_tensor) + activation = activations[0].detach().cpu().numpy() # Move activations to CPU + + # Plot activations as a bar chart + plt.figure(figsize=(10, 6)) + plt.bar(range(len(activation[0])), activation[0]) + plt.title("Activation Values - First Layer") + plt.xlabel("Neuron Index") + plt.ylabel("Activation Value") + plt.savefig("NN features/Visualize Activation.png") + plt.close() + + +def visualize_tsne(model_to_load, dataloader): + # Get the device of the model + device_va = next(model_to_load.parameters()).device + + model_to_load.eval() # Set the model to evaluation mode + + features = [] + labels = [] + + with torch.no_grad(): + for data, target in dataloader: + # Move data and target to the same device as the model + data, target = data.to(device_va), target.to(device_va) + + # Extract features (output of the model) + output = model_to_load(data) + features.append(output.cpu().numpy()) # Move output to CPU for concatenation + labels.append(target.cpu().numpy()) # Move target to CPU for concatenation + + # Stack all batches + features = np.vstack(features) + labels = np.hstack(labels) + + # Determine suitable perplexity + num_samples = features.shape[0] + perplexity = min(30, num_samples - 1) # Ensure perplexity < num_samples + + # Apply t-SNE + tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity) + reduced_features = 
tsne.fit_transform(features) + + # Plot the t-SNE results + plt.figure(figsize=(10, 8)) + scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=labels, cmap='viridis', alpha=0.7) + plt.colorbar(scatter, label="Class") + plt.title("t-SNE Visualization of Features") + plt.xlabel("t-SNE Dimension 1") + plt.ylabel("t-SNE Dimension 2") + plt.savefig("NN features/Visualize t-SNE.png") + plt.close() + + +# Main function to run all visualizations +def plot_many_graphs(): + print("Starting synthetic data generation...") + # Load data + faker = Faker() + + # Generate sensitive examples + sensitive_data = [ + f"Name: {faker.name()}, SSN: {faker.ssn()}, Address: {faker.address()}", + f"Credit Card: {faker.credit_card_number()}, Expiry: {faker.credit_card_expire()}, CVV: {faker.credit_card_security_code()}", + f"Patient: {faker.name()}, Condition: {faker.text(max_nb_chars=20)}", + f"Password: {faker.password()}", + f"Email: {faker.email()}", + f"Phone: {faker.phone_number()}", + f"Medical Record: {faker.md5()}", + f"Username: {faker.user_name()}", + f"IP: {faker.ipv4()}", + ] + + # Generate non-sensitive examples + non_sensitive_data = [ + faker.text(max_nb_chars=50) for _ in range(50000) + ] + + data_text = non_sensitive_data + (sensitive_data * 15) + random.shuffle(data_text) + print("Loaded data for visualization.") + dataloader = load_data(data_text, vectorizer) + + # Visualizations + print("Creating visualizations...") + visualize_weight_distribution(model) + + # For activations, use a sample from the dataloader + print("Creating activation visualizations...") + sample_input = next(iter(dataloader))[0] + visualize_activations(model, sample_input) + + print("Creating t-SNE visualization - May take a long time...") + visualize_tsne(model, dataloader) + + print("Completed.") + + +# Visualize feature importance (dummy example for visualization) and save as SVG + show the plot +def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): + # Limit the number of tokens to visualize + TOKENS = TOKENS[:1000] + FEATURE_IMPORTANCE = FEATURE_IMPORTANCE[:1000] + + plt.figure(figsize=(len(TOKENS) * 0.5, 6)) + sns.barplot(x=TOKENS, y=FEATURE_IMPORTANCE, palette="coolwarm", hue=TOKENS, legend=False) + plt.title("Feature Importance") + plt.xlabel("Tokens") + plt.ylabel("Importance") + plt.xticks(rotation=45) + plt.savefig(FILENAME, format="svg") + plt.close() # Close the plot to release memory + + +# Function to visualize the loss landscape as an interactive 3D object +def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON=0.01, FILENAME="Plot.html"): + MODEL.eval() # Set model to evaluation mode + param = next(MODEL.parameters()) # Use the first parameter for landscape perturbations + param_flat = param.view(-1) + + # Define perturbation directions u and v + u = torch.randn_like(param_flat).view(param.shape).to(param.device) + v = torch.randn_like(param_flat).view(param.shape).to(param.device) + + # Normalize perturbations + u = EPSILON * u / torch.norm(u) + v = EPSILON * v / torch.norm(v) + + # Create grid + x = np.linspace(-1, 1, GRID_SIZE) + y = np.linspace(-1, 1, GRID_SIZE) + loss_values = np.zeros((GRID_SIZE, GRID_SIZE)) + + # Iterate through the grid to compute losses + for i, dx in enumerate(x): + print(f"Computing loss for row {i+1}/{GRID_SIZE}...") + for j, dy in enumerate(y): + print(f" Computing loss for column {j+1}/{GRID_SIZE}...") + param.data += dx * u + dy * v # Apply perturbation + loss = 0 + + # Compute loss for all batches in 
data loader + for batch in DATA_LOADER: + print(f" Computing loss for batch: {batch}...") + inputs, targets = batch + inputs = inputs.to(param.device) + targets = targets.to(param.device) + outputs = MODEL(inputs) + loss += CRITERION(outputs, targets).item() + + loss_values[i, j] = loss # Store the loss + param.data -= dx * u + dy * v # Revert perturbation + + # Create a meshgrid for plotting + X, Y = np.meshgrid(x, y) + + # Plot the 3D surface using Plotly + fig = go.Figure(data=[go.Surface(z=loss_values, x=X, y=Y, colorscale="Viridis")]) + fig.update_layout( + title="Loss Landscape (Interactive 3D)", + scene=dict( + xaxis_title="Perturbation in u", + yaxis_title="Perturbation in v", + zaxis_title="Loss", + ), + ) + + # Save as an interactive HTML file + fig.write_html(FILENAME) + print(f"3D loss landscape saved as {FILENAME}") + + +def main_plot(): + # Instantiate data loader + print("Creating dummy data loader...") + dummy_data_loader = DataLoader(DummyDataset(), batch_size=32) + + # Define loss criterion + print("Defining loss criterion...") + criterion = torch.nn.CrossEntropyLoss() + + # Visualizations + print("Creating visualizations...") + tokens = vectorizer.get_feature_names_out() + + # Feature importance + # Max number of features to visualize is 3000 due to image constraints + print(f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES])+1} tokens...") + feature_importance = np.random.rand(len(tokens[:NUMBER_OF_FEATURES])) # Example random importance + visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, FILENAME="NN features/feature_importance.svg") + + # Loss landscape + print("Visualizing loss landscape - This may take a while...") + plot_loss_landscape_3d(model, dummy_data_loader, criterion, FILENAME="NN features/loss_landscape_3d.html") + + # Set model to evaluation mode, and plot many graphs + print("Setting model to evaluation mode...") + model.eval() # Set the model to evaluation mode + plot_many_graphs() + + def save_data(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): def register_hook(module): @@ -155,25 +437,25 @@ def add_edges_bulk(layer_names, weight_matrices): nx.write_gexf(G, "NN features/Neural Network Nodes Graph.gexf") -if __name__ == '__main__': +def setup_environment(): print("Visualizing the model and vectorizer features...") print("This may take a while, please wait.") if not os.path.exists('NN features'): mkdir('NN features') - # Load the vectorizer - vectorizer_path = '../Vectorizer .3n3.pkl' - vectorizer = joblib.load(vectorizer_path) - # Inspect the vectorizer - feature_names = vectorizer.get_feature_names_out() - with open('NN features/Vectorizer features.txt', 'w') as f: - f.write(f"Number of features: {len(feature_names)}\n\n") - f.write('\n'.join(feature_names)) +def load_vectorizer(): + vectorizer_load = joblib.load(vectorizer_path) + feature_names = vectorizer_load.get_feature_names_out() + with open('NN features/Vectorizer features.txt', 'w') as file: + file.write(f"Number of features: {len(feature_names)}\n\n") + file.write('\n'.join(feature_names)) + return vectorizer_load + - # Visualize the top 90 features - top_n = 90 +def visualize_top_features(top_n=90): + feature_names = vectorizer.get_feature_names_out() sorted_indices = vectorizer.idf_.argsort()[:top_n] top_features = [feature_names[i] for i in sorted_indices] top_idf_scores = vectorizer.idf_[sorted_indices] @@ -186,39 +468,102 @@ def add_edges_bulk(layer_names, weight_matrices): # Save the plot as a vector 
graphic plt.savefig('NN features/Top_90_Features.svg', format='svg') - plt.show() - # Load the model - model_path = '../Model SenseMini .3n3.pth' - model = torch.load(model_path, weights_only=False) - # Save the model summary - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - model.to(device) - save_data(model, input_size=(1, vectorizer.vocabulary_.__len__())) +def load_model(): + device_load = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model_load = torch.load(model_path, weights_only=False) + model_load.to(device_load) + return model_load, device_load + - # Save the model's state dictionary - with open('NN features/Model state dictionary.txt', 'w') as f: - f.write("Model's state dictionary:\n\n") +def save_model_state_dict(): + with open('NN features/Model state dictionary.txt', 'w') as file: + file.write("Model's state dictionary:\n\n") for param_tensor in model.state_dict(): - f.write(f"\n{param_tensor}\t{model.state_dict()[param_tensor].size()}") + file.write(f"\n{param_tensor}\t{model.state_dict()[param_tensor].size()}") - # Create a dummy input tensor with the appropriate size - dummy_input = torch.randn(1, vectorizer.vocabulary_.__len__()).to(device) - # Generate the visualization +def generate_model_visualization(): + dummy_input = torch.randn(1, vectorizer.vocabulary_.__len__()).to(device) model_viz = make_dot(model(dummy_input), params=dict(model.named_parameters()), show_attrs=True, show_saved=True) - - # Save the visualization to a file model_viz.format = 'png' model_viz.render(filename='NN features/Model Visualization', format='png') - # Removing the temporary files as they are no longer needed, we saved them to the desired location + +def cleanup_temp_files(): if os.path.exists("NN features/Model Visualization"): os.remove("NN features/Model Visualization") - # Visualize the model - save_graph() +def model_summary(): + mode = "a" if os.path.exists("NN features/Model Summary.txt") else "w" + with open("NN features/Model Summary.txt", mode) as file: + file.write(str(model)) + + +if __name__ == '__main__': + # Print the welcome message + print("===========================================================================================") + print("= This script will visualize the features of the model and vectorizer. =") + print("= Please ensure that the model and vectorizer files are present in the specified paths. =") + print("= The visualization will be saved in the 'NN features' directory. =") + print("= This script will take a while to run, please be patient. =") + print("===========================================================================================") + + # Read the config file + print("\n\nReading config file and setting up...") + config = ConfigParser() + config.read('../../config.ini') + + setup_environment() + + # Load the paths from the config file + vectorizer_path = config.get('VulnScan.study Settings', 'vectorizer_path') + model_path = config.get('VulnScan.study Settings', 'model_path') + NUMBER_OF_FEATURES = config.get('VulnScan.study Settings', 'number_of_features') + + # Check if the paths exist + if not os.path.exists(vectorizer_path): + print(f"Vectorizer file not found. Please double check the path {vectorizer_path}.") + exit(1) + if not os.path.exists(model_path): + print(f"Model file not found. 
Please double check the path {model_path}.") + exit(1) + + # Load the vectorizer and model + vectorizer = load_vectorizer() + visualize_top_features() + model, device = load_model() + # Save the model summary, state dictionary, and visualization + save_data(model, input_size=(1, vectorizer.vocabulary_.__len__())) + save_model_state_dict() + generate_model_visualization() + cleanup_temp_files() + save_graph() print("Model visualization and summary have been saved to the 'NN features' directory.") + + # Check if GPU is available + if not os.path.exists('NN features'): + os.mkdir('NN features') + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Using device: {device}") + + # Load vectorizer (change the path to your vectorizer .pkl file) + vectorizer_path = "../Vectorizer .3n3.pkl" + model_path = "../Model SenseMini .3n3.pth" + + # Load vectorizer + print(f"Reloading vectorizer from: {vectorizer_path}") + with open(vectorizer_path, "rb") as f: + vectorizer = joblib.load(f) + + # Load model and move to the appropriate device (GPU/CPU) + print(f"Reloading model from: {model_path}") + model = torch.load(model_path, weights_only=False) + model.to(device) # Move model to GPU or CPU + + model_summary() + main_plot() diff --git a/CODE/VulnScan/v3/_train.py b/CODE/VulnScan/v3/_train.py index c9fa7ee7..1600fee6 100644 --- a/CODE/VulnScan/v3/_train.py +++ b/CODE/VulnScan/v3/_train.py @@ -178,6 +178,7 @@ def select_model_from_traditional(model_name: str, logger.error(f"Invalid model name: {model_name}") exit(1) + def train_traditional_model(model_name: str, epochs: int, save_model_path: str): diff --git a/CODE/config.ini b/CODE/config.ini index f24190ce..24130933 100644 --- a/CODE/config.ini +++ b/CODE/config.ini @@ -27,29 +27,10 @@ timeout = 10 ################################################### -[VulnScan.train Settings] -# The following settings are for the Train module for training models -# NeuralNetwork seems to be the best choice for this task -# Options: "NeuralNetwork", "LogReg", -# "RandomForest", "ExtraTrees", "GBM", -# "XGBoost", "DecisionTree", "NaiveBayes" -model_name = NeuralNetwork -# General Training Parameters -epochs = 10 -batch_size = 32 -learning_rate = 0.001 -use_cuda = true - -# Paths to train and save data -train_data_path = C:\Users\Hp\Desktop\Model Tests\Model Data\GeneratedData -# If all models are to be trained, this is the path to save all models, -# and will be appended with the model codename and follow naming convention -save_model_path = C:\Users\Hp\Desktop\Model Tests\Model SenseMini - [VulnScan.generate Settings] # The following settings are for the Generate module for fake training data extensions = .txt, .log, .md, .csv, .json, .xml, .html, .yaml, .ini, .pdf, .docx, .xlsx, .pptx -save_path = C:\Users\Hp\Desktop\Model Tests\Generated Data +save_path = PATH # Options include: # 'Sense' - Generates 50k files, each 25KB in size. # 'SenseNano' - Generates 5 files, each 5KB in size. 
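[Editor's sketch] The plot_loss_landscape_3d function added above records the loss on a grid of weight perturbations, i.e. L(dx, dy) = loss(theta + dx*u + dy*v) for two normalized random directions u and v. A condensed sketch of that idea, with a toy linear model, random data, and a 3x3 grid standing in for the real model, dataset, and 200x200 grid:

import torch

model = torch.nn.Linear(4, 2)                      # toy stand-in for the real model
criterion = torch.nn.CrossEntropyLoss()
inputs = torch.randn(8, 4)                         # toy batch
targets = torch.randint(0, 2, (8,))

param = next(model.parameters())                   # perturb the first parameter tensor
u = torch.randn_like(param)
v = torch.randn_like(param)
u = 0.01 * u / torch.norm(u)                       # normalize to EPSILON = 0.01
v = 0.01 * v / torch.norm(v)

with torch.no_grad():
    for dx in (-1.0, 0.0, 1.0):
        for dy in (-1.0, 0.0, 1.0):
            param.data += dx * u + dy * v          # apply perturbation
            loss = criterion(model(inputs), targets).item()
            param.data -= dx * u + dy * v          # revert perturbation
            print(f"L({dx:+.0f}, {dy:+.0f}) = {loss:.4f}")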
@@ -79,11 +60,44 @@ partial_sensitive_chance = 0.2
 # Use the vectorizer supplied for any v3 model on SenseMini
 # The path to the data to vectorize, either a file or a directory
-data_path = C:\Users\Hp\Desktop\Model Tests\Model Data\GeneratedData
+data_path = PATH
 # The path to save the vectorized data - It will automatically be appended '\Vectorizer.pkl'
 # Make sure the path is a directory, and it exists
-output_path = C:\Users\Hp\Desktop\Model Tests\Model Sense - Vectorizer
+output_path = PATH
 # Vectorizer to use, options include:
 # tfidf or count - The code for the training only supports tfidf - we advise to use tfidf
 vectorizer_type = tfidf
+
+[VulnScan.train Settings]
+# The following settings are for the Train module for training models
+# NeuralNetwork seems to be the best choice for this task
+# Options: "NeuralNetwork", "LogReg",
+# "RandomForest", "ExtraTrees", "GBM",
+# "XGBoost", "DecisionTree", "NaiveBayes"
+model_name = NeuralNetwork
+# General Training Parameters
+epochs = 10
+batch_size = 32
+learning_rate = 0.001
+use_cuda = true
+
+# Paths to train and save data
+train_data_path = PATH
+# If all models are to be trained, this is the path to save all models;
+# it will be appended with the model codename, following the naming convention
+save_model_path = PATH
+
+[VulnScan.study Settings]
+# Here are the basics of the study module
+# It generates graphs and data that may help in understanding the model
+# Everything is available online, pre-studied, so running it is not necessary,
+# but it is useful for understanding the model locally
+# All files will be saved here; this cannot be changed. PATH is "NN features/"
+
+# These are the paths to the model and the vectorizer
+model_path = ../Model SenseMini .3n3.pth
+vectorizer_path = ../Vectorizer .3n3.pkl
+# Number of features to visualise in the SVG bar graph; the maximum is 3000 due to limitations
+# Setting -1 will visualise the first 3000 features. The bar is a color-gradient heatmap.
+number_of_features = -1 From d19a6b209eb73516da951e33d509228a029038cb Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Tue, 10 Dec 2024 22:01:07 +0400 Subject: [PATCH 10/20] Fixed some bugs --- CODE/VulnScan/tools/_study_network.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index 1b04e0ef..dedf7bd4 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -123,7 +123,7 @@ def visualize_tsne(model_to_load, dataloader): # Extract features (output of the model) output = model_to_load(data) features.append(output.cpu().numpy()) # Move output to CPU for concatenation - labels.append(target.cpu().numpy()) # Move target to CPU for concatenation + labels.append(target.cpu().numpy()) # Move target to CPU for concatenation # Stack all batches features = np.vstack(features) @@ -192,7 +192,7 @@ def plot_many_graphs(): print("Completed.") -# Visualize feature importance (dummy example for visualization) and save as SVG + show the plot +# Visualize feature importance (dummy example for visualization) and save as SVG def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): # Limit the number of tokens to visualize TOKENS = TOKENS[:1000] @@ -229,15 +229,14 @@ def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON # Iterate through the grid to compute losses for i, dx in enumerate(x): - print(f"Computing loss for row {i+1}/{GRID_SIZE}...") + print(f"Computing loss for row {i + 1}/{GRID_SIZE}...") for j, dy in enumerate(y): - print(f" Computing loss for column {j+1}/{GRID_SIZE}...") + print(f" Computing loss for column {j + 1}/{GRID_SIZE}...") param.data += dx * u + dy * v # Apply perturbation loss = 0 # Compute loss for all batches in data loader for batch in DATA_LOADER: - print(f" Computing loss for batch: {batch}...") inputs, targets = batch inputs = inputs.to(param.device) targets = targets.to(param.device) @@ -281,9 +280,11 @@ def main_plot(): # Feature importance # Max number of features to visualize is 3000 due to image constraints - print(f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES])+1} tokens...") + print( + f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES]) + 1} tokens...") feature_importance = np.random.rand(len(tokens[:NUMBER_OF_FEATURES])) # Example random importance - visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, FILENAME="NN features/feature_importance.svg") + visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, + FILENAME="NN features/feature_importance.svg") # Loss landscape print("Visualizing loss landscape - This may take a while...") @@ -468,7 +469,7 @@ def visualize_top_features(top_n=90): # Save the plot as a vector graphic plt.savefig('NN features/Top_90_Features.svg', format='svg') - plt.show() + plt.close() def load_model(): @@ -522,7 +523,7 @@ def model_summary(): # Load the paths from the config file vectorizer_path = config.get('VulnScan.study Settings', 'vectorizer_path') model_path = config.get('VulnScan.study Settings', 'model_path') - NUMBER_OF_FEATURES = config.get('VulnScan.study Settings', 'number_of_features') + NUMBER_OF_FEATURES = int(config.get('VulnScan.study Settings', 'number_of_features')) # Check if the paths exist if not os.path.exists(vectorizer_path): From 6dd419cede8ac1b29f1b64bb9161db5371b6667c 
Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Tue, 10 Dec 2024 22:01:07 +0400 Subject: [PATCH 11/20] Fixed some bugs --- CODE/VulnScan/tools/_study_network.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index 1b04e0ef..323db56f 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -20,6 +20,10 @@ from tqdm import tqdm +# TODO Add docstring, and hint-type +# TODO Do v3.1 plans +# ZIP the file and attach somewhere (Data) + # Example of DataLoader for loss landscape (dummy dataset for visualization) class DummyDataset(torch.utils.data.Dataset): def __init__(self, num_samples=100, input_dim=10000): @@ -123,7 +127,7 @@ def visualize_tsne(model_to_load, dataloader): # Extract features (output of the model) output = model_to_load(data) features.append(output.cpu().numpy()) # Move output to CPU for concatenation - labels.append(target.cpu().numpy()) # Move target to CPU for concatenation + labels.append(target.cpu().numpy()) # Move target to CPU for concatenation # Stack all batches features = np.vstack(features) @@ -192,7 +196,7 @@ def plot_many_graphs(): print("Completed.") -# Visualize feature importance (dummy example for visualization) and save as SVG + show the plot +# Visualize feature importance (dummy example for visualization) and save as SVG def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): # Limit the number of tokens to visualize TOKENS = TOKENS[:1000] @@ -229,15 +233,14 @@ def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON # Iterate through the grid to compute losses for i, dx in enumerate(x): - print(f"Computing loss for row {i+1}/{GRID_SIZE}...") + print(f"Computing loss for row {i + 1}/{GRID_SIZE}...") for j, dy in enumerate(y): - print(f" Computing loss for column {j+1}/{GRID_SIZE}...") + print(f" Computing loss for column {j + 1}/{GRID_SIZE}...") param.data += dx * u + dy * v # Apply perturbation loss = 0 # Compute loss for all batches in data loader for batch in DATA_LOADER: - print(f" Computing loss for batch: {batch}...") inputs, targets = batch inputs = inputs.to(param.device) targets = targets.to(param.device) @@ -281,9 +284,11 @@ def main_plot(): # Feature importance # Max number of features to visualize is 3000 due to image constraints - print(f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES])+1} tokens...") + print( + f"Visualizing feature importance - This may take a while for {len(tokens[:NUMBER_OF_FEATURES]) + 1} tokens...") feature_importance = np.random.rand(len(tokens[:NUMBER_OF_FEATURES])) # Example random importance - visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, FILENAME="NN features/feature_importance.svg") + visualize_feature_importance(tokens[:NUMBER_OF_FEATURES], feature_importance, + FILENAME="NN features/feature_importance.svg") # Loss landscape print("Visualizing loss landscape - This may take a while...") @@ -468,7 +473,7 @@ def visualize_top_features(top_n=90): # Save the plot as a vector graphic plt.savefig('NN features/Top_90_Features.svg', format='svg') - plt.show() + plt.close() def load_model(): @@ -522,7 +527,7 @@ def model_summary(): # Load the paths from the config file vectorizer_path = config.get('VulnScan.study Settings', 'vectorizer_path') model_path = config.get('VulnScan.study Settings', 'model_path') - NUMBER_OF_FEATURES = 
config.get('VulnScan.study Settings', 'number_of_features') + NUMBER_OF_FEATURES = int(config.get('VulnScan.study Settings', 'number_of_features')) # Check if the paths exist if not os.path.exists(vectorizer_path): From d91ce486063ec99d06b74f32e70844b538ffab59 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 10:55:53 +0400 Subject: [PATCH 12/20] Added docstrings, and hint-types --- CODE/VulnScan/tools/_study_network.py | 96 ++++++-- CODE/VulnScan/tools/_vectorizer.py | 29 +++ CODE/VulnScan/v2-deprecated/_generate_data.py | 40 ++- CODE/VulnScan/v2-deprecated/_train.py | 131 +++++++++- CODE/VulnScan/v3/_generate_data.py | 233 ++++++++++-------- CODE/VulnScan/v3/_train.py | 93 +++---- 6 files changed, 440 insertions(+), 182 deletions(-) diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index 323db56f..c52ba622 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -1,9 +1,12 @@ +from __future__ import annotations + import os import os.path import random from collections import OrderedDict from configparser import ConfigParser from os import mkdir +from typing import Any import joblib import matplotlib.pyplot as plt @@ -14,23 +17,44 @@ import torch import torch.nn as nn from faker import Faker +from numpy import ndarray, dtype +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn.manifold import TSNE +from torch import device from torch.utils.data import DataLoader, TensorDataset from torchviz import make_dot from tqdm import tqdm -# TODO Add docstring, and hint-type # TODO Do v3.1 plans -# ZIP the file and attach somewhere (Data) +# Raise an ImportError to make the file unimportable +# raise ImportError("This file cannot be imported") + # Example of DataLoader for loss landscape (dummy dataset for visualization) class DummyDataset(torch.utils.data.Dataset): - def __init__(self, num_samples=100, input_dim=10000): + """ + A dummy dataset for generating synthetic data for visualization purposes. + + Attributes: + num_samples (int): Number of samples in the dataset. + input_dim (int): Dimension of the input data. + data (list): List of generated data samples. + labels (list): List of labels corresponding to the data samples. + """ + + def __init__(self, num_samples: int = 100, input_dim: int = 10000): + """ + Initializes the DummyDataset with the specified number of samples and input dimension. + + Args: + num_samples (int): Number of samples to generate. + input_dim (int): Dimension of the input data. + """ self.num_samples = num_samples self.input_dim = input_dim - self.data = [] - self.labels = [] + self.data: list[str] = [] + self.labels: list[int] = [] faker = Faker() for _ in range(num_samples): if random.random() < 0.05: # 5% chance to include sensitive data @@ -40,10 +64,25 @@ def __init__(self, num_samples=100, input_dim=10000): self.data.append(faker.text(max_nb_chars=100)) # Non-sensitive data self.labels.append(0) # Label as non-sensitive - def __len__(self): + def __len__(self) -> int: + """ + Returns the number of samples in the dataset. + + Returns: + int: Number of samples in the dataset. + """ return self.num_samples - def __getitem__(self, idx): + def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]: + """ + Retrieves the data and label at the specified index. + + Args: + idx (int): Index of the data and label to retrieve. + + Returns: + tuple: A tuple containing the data tensor and label tensor. 
+ """ data = self.data[idx] label = self.labels[idx] # Convert data to tensor of ASCII values and pad to input_dim @@ -57,7 +96,17 @@ def __getitem__(self, idx): return data_tensor, label_tensor -def load_data(text_data, vectorizer_to_load): +def load_data(text_data: list[str], vectorizer_to_load: TfidfVectorizer | CountVectorizer) -> DataLoader: + """ + Vectorizes the text data and creates a DataLoader for it. + + Args: + text_data (list of str): The text data to be vectorized. + vectorizer_to_load: The vectorizer to use for transforming the text data. + + Returns: + DataLoader: A DataLoader containing the vectorized text data and dummy labels. + """ # Vectorize the text data X = vectorizer_to_load.transform(text_data) # Create a dummy label for visualization (replace with real labels if available) @@ -69,7 +118,7 @@ def load_data(text_data, vectorizer_to_load): return DataLoader(dataset, batch_size=32, shuffle=True) -def visualize_weight_distribution(model_to_load): +def visualize_weight_distribution(model_to_load: torch.nn.Module): # Access weights of the first layer weights = model_to_load[0].weight.detach().cpu().numpy() # Move tensor to CPU before conversion to numpy plt.hist(weights.flatten(), bins=50) @@ -80,7 +129,7 @@ def visualize_weight_distribution(model_to_load): plt.close() -def visualize_activations(model_to_load, input_tensor): +def visualize_activations(model_to_load: torch.nn.Module, input_tensor: torch.Tensor): # Check the device of the model device_va = next(model_to_load.parameters()).device @@ -110,7 +159,7 @@ def hook_fn(module, inputx, output): plt.close() -def visualize_tsne(model_to_load, dataloader): +def visualize_tsne(model_to_load: torch.nn.Module, dataloader: DataLoader): # Get the device of the model device_va = next(model_to_load.parameters()).device @@ -197,7 +246,8 @@ def plot_many_graphs(): # Visualize feature importance (dummy example for visualization) and save as SVG -def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg"): +def visualize_feature_importance(TOKENS: list[str], FEATURE_IMPORTANCE: float | ndarray[Any, dtype[np.floating]], + FILENAME: str = "Plot.svg"): # Limit the number of tokens to visualize TOKENS = TOKENS[:1000] FEATURE_IMPORTANCE = FEATURE_IMPORTANCE[:1000] @@ -213,7 +263,8 @@ def visualize_feature_importance(TOKENS, FEATURE_IMPORTANCE, FILENAME="Plot.svg" # Function to visualize the loss landscape as an interactive 3D object -def plot_loss_landscape_3d(MODEL, DATA_LOADER, CRITERION, GRID_SIZE=200, EPSILON=0.01, FILENAME="Plot.html"): +def plot_loss_landscape_3d(MODEL: torch.nn.Module, DATA_LOADER: DataLoader, CRITERION: torch.nn.Module, + GRID_SIZE: int = 200, EPSILON: float = 0.01, FILENAME: str = "Plot.html"): MODEL.eval() # Set model to evaluation mode param = next(MODEL.parameters()) # Use the first parameter for landscape perturbations param_flat = param.view(-1) @@ -300,10 +351,11 @@ def main_plot(): plot_many_graphs() -def save_data(model_to_use, input_size, batch_size=-1, device_to_use="cuda"): - def register_hook(module): +def save_data(model_to_use: torch.nn.Module, input_size: tuple[int, Any] | int, batch_size: int = -1, + device_to_use: str = "cuda"): + def register_hook(module: torch.nn.Module): - def hook(modules, inputs, output): + def hook(modules: torch.nn.Module, inputs: (torch.nn.Module, tuple[torch.Tensor]), output: torch.Tensor): class_name = str(modules.__class__).split(".")[-1].split("'")[0] module_idx = len(summaries) @@ -341,16 +393,16 @@ def hook(modules, inputs, output): ], 
"Input device is not valid, please specify 'cuda' or 'cpu'" if device_to_use == "cuda" and torch.cuda.is_available(): - dtype = torch.cuda.FloatTensor + dtype_to_use = torch.cuda.FloatTensor else: - dtype = torch.FloatTensor + dtype_to_use = torch.FloatTensor # multiple inputs to the network if isinstance(input_size, tuple): input_size = [input_size] # batch_size of 2 for batch norm - x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size] + x = [torch.rand(2, *in_size).type(dtype_to_use) for in_size in input_size] # create properties summaries = OrderedDict() @@ -412,7 +464,7 @@ def save_graph(): # Create a directed graph G = nx.DiGraph() - def add_edges_bulk(layer_names, weight_matrices): + def add_edges_bulk(layer_names: str, weight_matrices: np.ndarray[np.float32]): """Efficiently add edges to the graph with progress tracking.""" threshold = 0.1 # Adjust this threshold as needed significant_weights = np.abs(weight_matrices) > threshold @@ -459,7 +511,7 @@ def load_vectorizer(): return vectorizer_load -def visualize_top_features(top_n=90): +def visualize_top_features(top_n: int = 90): feature_names = vectorizer.get_feature_names_out() sorted_indices = vectorizer.idf_.argsort()[:top_n] top_features = [feature_names[i] for i in sorted_indices] @@ -476,7 +528,7 @@ def visualize_top_features(top_n=90): plt.close() -def load_model(): +def load_model() -> tuple[Any, device]: device_load = torch.device("cuda" if torch.cuda.is_available() else "cpu") model_load = torch.load(model_path, weights_only=False) model_load.to(device_load) diff --git a/CODE/VulnScan/tools/_vectorizer.py b/CODE/VulnScan/tools/_vectorizer.py index 1ad7da8b..63577fa8 100644 --- a/CODE/VulnScan/tools/_vectorizer.py +++ b/CODE/VulnScan/tools/_vectorizer.py @@ -9,6 +9,15 @@ def load_data(data_paths: str | os.PathLike) -> list[str]: + """ + Load data from the specified path(s). + + Args: + data_paths (str | os.PathLike): Path to a directory or a file containing data. + + Returns: + list[str]: List of strings, each representing the content of a file. + """ data = [] if os.path.isdir(data_paths): for root, _, files in os.walk(data_paths): @@ -24,6 +33,18 @@ def load_data(data_paths: str | os.PathLike) -> list[str]: def choose_vectorizer(vectorizer_types: str) -> TfidfVectorizer | CountVectorizer: + """ + Choose and return a vectorizer based on the specified type. + + Args: + vectorizer_types (str): Type of vectorizer to use ('tfidf' or 'count'). + + Returns: + TfidfVectorizer | CountVectorizer: The chosen vectorizer. + + Raises: + ValueError: If an unsupported vectorizer type is specified. + """ print("Vectorizer Type: ", vectorizer_types) print("Vectorizing Data...") if vectorizer_types == 'tfidf': @@ -34,6 +55,14 @@ def choose_vectorizer(vectorizer_types: str) -> TfidfVectorizer | CountVectorize def main(data_paths: str, vectorizer_types: str, output_paths: str): + """ + Main function to load data, choose a vectorizer, fit the vectorizer to the data, and save the vectorizer. + + Args: + data_paths (str): Path to the data. + vectorizer_types (str): Type of vectorizer to use ('tfidf' or 'count'). + output_paths (str): Path to save the fitted vectorizer. 
+ """ data = load_data(data_paths) vectorizer = choose_vectorizer(vectorizer_types) vectorizer.fit(data) diff --git a/CODE/VulnScan/v2-deprecated/_generate_data.py b/CODE/VulnScan/v2-deprecated/_generate_data.py index 59925242..78722f47 100644 --- a/CODE/VulnScan/v2-deprecated/_generate_data.py +++ b/CODE/VulnScan/v2-deprecated/_generate_data.py @@ -9,9 +9,15 @@ fake = Faker() -# Function to generate a sensitive file with real sensitive information -@deprecated(reason="This function is only used for generating sensitive data for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.3.0") +@deprecated(reason="This function is only used for generating sensitive data for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.4.0") def create_sensitive_file(file_path: str, max_size: int): + """ + Generate a sensitive file with real sensitive information. + + Args: + file_path (str): The path where the file will be saved. + max_size (int): The maximum size of the file in bytes. + """ content = "" # Generate sensitive data using Faker content += f"Name: {fake.name()}\n" @@ -30,9 +36,15 @@ def create_sensitive_file(file_path: str, max_size: int): f.write(content) -# Function to generate a normal file with non-sensitive data -@deprecated(reason="This function is only used for generating normal data for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.3.0") +@deprecated(reason="This function is only used for generating normal data for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.4.0") def create_normal_file(file_path: str, max_size: int): + """ + Generate a normal file with non-sensitive data. + + Args: + file_path (str): The path where the file will be saved. + max_size (int): The maximum size of the file in bytes. + """ content = "" # Add random text while len(content.encode('utf-8')) < max_size: @@ -42,9 +54,15 @@ def create_normal_file(file_path: str, max_size: int): f.write(content) -# Function to generate a mix file with both normal and sensitive data -@deprecated(reason="This function is only used for generating mixed data for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.3.0") +@deprecated(reason="This function is only used for generating mixed data for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.4.0") def create_mix_file(file_path: str, max_size: int): + """ + Generate a mix file with both normal and sensitive data. + + Args: + file_path (str): The path where the file will be saved. + max_size (int): The maximum size of the file in bytes. 
+ """ content = "" # Add a mix of normal and sensitive data while len(content.encode('utf-8')) < max_size: @@ -59,9 +77,15 @@ def create_mix_file(file_path: str, max_size: int): f.write(content) -# Function to create random files (Normal, Mix, Sensitive) -@deprecated(reason="This function is only used for generating random files for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.3.0") +@deprecated(reason="This function is only used for generating random files for testing purposes for v2 trainers, v2 trainers are deprecated now, use v3 trainers.", removal_version="3.4.0") def create_random_files(directories: str, num_file: int = 100): + """ + Create random files (Normal, Mix, Sensitive). + + Args: + directories (str): The directory where the files will be saved. + num_file (int): The number of files to generate. + """ os.makedirs(directories, exist_ok=True) for i in range(num_file): diff --git a/CODE/VulnScan/v2-deprecated/_train.py b/CODE/VulnScan/v2-deprecated/_train.py index 4cfa6247..f25d4152 100644 --- a/CODE/VulnScan/v2-deprecated/_train.py +++ b/CODE/VulnScan/v2-deprecated/_train.py @@ -2,7 +2,6 @@ import logging import os -from os import mkdir import joblib import matplotlib.pyplot as plt @@ -20,6 +19,7 @@ from sklearn.svm import SVC from torch.utils.data import DataLoader, TensorDataset from transformers import BertTokenizer, BertForSequenceClassification + from logicytics import deprecated # Configure logging @@ -42,7 +42,15 @@ @deprecated(reason="This function is used to load data from a directory. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") def load_data(data_dir: str) -> tuple[list[str], np.ndarray]: - """Loads text data and labels from the directory.""" + """ + Loads text data and labels from the directory. + + Args: + data_dir (str): The directory containing the data files. + + Returns: + tuple[list[str], np.ndarray]: A tuple containing the list of texts and the corresponding labels. + """ texts, labels = [], [] for file_name in os.listdir(data_dir): with open(os.path.join(data_dir, file_name), "r", encoding="utf-8") as f: @@ -55,7 +63,16 @@ def load_data(data_dir: str) -> tuple[list[str], np.ndarray]: @deprecated(reason="This function is used to evaluate a model. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[float, float, float, float, float]: - """Evaluates the model using standard metrics.""" + """ + Evaluates the model using standard metrics. + + Args: + y_true (np.ndarray): The true labels. + y_pred (np.ndarray): The predicted labels. + + Returns: + tuple[float, float, float, float, float]: A tuple containing accuracy, precision, recall, F1 score, and ROC-AUC score. + """ accuracy = accuracy_score(y_true, y_pred) precision = precision_score(y_true, y_pred, zero_division=1) recall = recall_score(y_true, y_pred, zero_division=1) @@ -73,6 +90,13 @@ def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[float, float @deprecated(reason="This function is used to save progress graphs. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") def save_progress_graph(accuracies: list[float], filename: str = "training_progress.png"): + """ + Saves a graph of training progress. + + Args: + accuracies (list[float]): List of accuracies for each epoch. 
+ filename (str): The filename to save the graph as. + """ plt.figure(figsize=(8, 6)) plt.plot(range(1, len(accuracies) + 1), accuracies, marker='o', label="Training Accuracy") plt.xlabel("Epochs") @@ -87,7 +111,16 @@ def save_progress_graph(accuracies: list[float], filename: str = "training_progr @deprecated(reason="This function is used to train xgboost. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") def train_xgboost(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, SAVE_DIR: str): - """Trains a Gradient Boosting Classifier (XGBoost) with GPU.""" + """ + Trains a Gradient Boosting Classifier (XGBoost) with GPU. + + Args: + X_train (np.ndarray): Training data features. + X_test (np.ndarray): Testing data features. + y_train (np.ndarray): Training data labels. + y_test (np.ndarray): Testing data labels. + SAVE_DIR (str): Directory to save the trained model. + """ logging.info("Enabling GPU acceleration...") model = xgb.XGBClassifier(tree_method='hist', device=DEVICE) # Enable GPU acceleration logging.info("GPU acceleration enabled.") @@ -104,7 +137,21 @@ def train_xgboost(X_train: np.ndarray, X_test: np.ndarray, def train_bert(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, MAX_LEN: int, LEARNING_RATE: float, BATCH_SIZE: int, EPOCHS: int, SAVE_DIR: str, MODEL_PATH: str): - """Trains a BERT model with GPU support.""" + """ + Trains a BERT model with GPU support. + + Args: + X_train (np.ndarray): Training data features. + X_test (np.ndarray): Testing data features. + y_train (np.ndarray): Training data labels. + y_test (np.ndarray): Testing data labels. + MAX_LEN (int): Maximum length of the sequences. + LEARNING_RATE (float): Learning rate for the optimizer. + BATCH_SIZE (int): Batch size for training. + EPOCHS (int): Number of epochs for training. + SAVE_DIR (str): Directory to save the trained model. + MODEL_PATH (str): Path to the pre-trained BERT model. + """ logging.info("Loading BERT tokenizer...") tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME) train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=MAX_LEN, return_tensors="pt") @@ -154,14 +201,34 @@ def train_bert(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, class LSTMModel(nn.Module): + @deprecated(reason="This class is used to define an LSTM model. Its for training v2 models, which is now deprecated, use _train.py v3 instead.", removal_version="3.3.0") def __init__(self, vocab_size: int, embedding_dim: int = 128, hidden_dim: int = 128, output_dim: int = 1): + """ + Initializes the LSTM model. + + Args: + vocab_size (int): Size of the vocabulary. + embedding_dim (int): Dimension of the embedding layer. + hidden_dim (int): Dimension of the hidden layer. + output_dim (int): Dimension of the output layer. + """ super(LSTMModel, self).__init__() self.embedding = nn.Embedding(vocab_size, embedding_dim) self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True) self.fc = nn.Linear(hidden_dim * 2, output_dim) # Bidirectional, so multiply by 2 self.sigmoid = nn.Sigmoid() + @deprecated(reason="This class is used to define an LSTM model. Its for training v2 models, which is now deprecated, use _train.py v3 instead.", removal_version="3.3.0") def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Defines the forward pass of the LSTM model. + + Args: + x (torch.Tensor): Input tensor. + + Returns: + torch.Tensor: Output tensor. 
+ """ x = self.embedding(x) lstm_out, _ = self.lstm(x) x = self.fc(lstm_out[:, -1, :]) @@ -173,7 +240,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def train_lstm(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, MAX_FEATURES: int, LEARNING_RATE: float, BATCH_SIZE: int, EPOCHS: int, SAVE_DIR: str): - """Trains an LSTM model using PyTorch with GPU support.""" + """ + Trains an LSTM model using PyTorch with GPU support. + + Args: + X_train (np.ndarray): Training data features. + X_test (np.ndarray): Testing data features. + y_train (np.ndarray): Training data labels. + y_test (np.ndarray): Testing data labels. + MAX_FEATURES (int): Maximum number of features for the vectorizer. + LEARNING_RATE (float): Learning rate for the optimizer. + BATCH_SIZE (int): Batch size for training. + EPOCHS (int): Number of epochs for training. + SAVE_DIR (str): Directory to save the trained model. + """ logging.info("Training LSTM...") logging.info("Vectorizing text data...") vectorizer = TfidfVectorizer(max_features=MAX_FEATURES) @@ -236,6 +316,18 @@ def train_lstm(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, def train_nn_svm(MODEL: str, EPOCHS: int, SAVE_DIR: str, MAX_FEATURES: int, TEST_SIZE: float | int, MAX_ITER: int, RANDOM_STATE: int): + """ + Trains a Neural Network or SVM model with hyperparameter tuning. + + Args: + MODEL (str): The type of model to train ('svm' or 'nn'). + EPOCHS (int): Number of epochs for training. + SAVE_DIR (str): Directory to save the trained model. + MAX_FEATURES (int): Maximum number of features for the vectorizer. + TEST_SIZE (float | int): Proportion of the dataset to include in the test split. + MAX_ITER (int): Maximum number of iterations for the model. + RANDOM_STATE (int): Random state for reproducibility. + """ if MODEL not in ["svm", "nn"]: logging.error(f"Invalid model type: {MODEL}. Please choose 'svm' or 'nn'.") return @@ -325,6 +417,21 @@ def train_nn_svm(MODEL: str, EPOCHS: int, SAVE_DIR: str, def train_model_blx(MODEL_TYPE: str, SAVE_DIR: str, EPOCHS: int, BATCH_SIZE: int, LEARNING_RATE: float, MAX_FEATURES: int, MAX_LEN: int, TEST_SIZE: float | int, RANDOM_STATE: int, MODEL_PATH_BERT: str = None): + """ + Sets up and trains a model based on the specified type. + + Args: + MODEL_TYPE (str): The type of model to train ('xgboost', 'bert', 'lstm'). + SAVE_DIR (str): Directory to save the trained model. + EPOCHS (int): Number of epochs for training. + BATCH_SIZE (int): Batch size for training. + LEARNING_RATE (float): Learning rate for the optimizer. + MAX_FEATURES (int): Maximum number of features for the vectorizer. + MAX_LEN (int): Maximum length of the sequences (for BERT). + TEST_SIZE (float | int): Proportion of the dataset to include in the test split. + RANDOM_STATE (int): Random state for reproducibility. + MODEL_PATH_BERT (str, optional): Path to the pre-trained BERT model. + """ # Create save directory if it doesn't exist os.makedirs(SAVE_DIR, exist_ok=True) @@ -358,6 +465,16 @@ def train_model_blx(MODEL_TYPE: str, SAVE_DIR: str, EPOCHS: int, BATCH_SIZE: int @deprecated(reason="This function is used to train RandomForest. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") def train_rfc(SAVE_DIR: str, EPOCHS: int, TEST_SIZE: float | int, N_ESTIMATORS: int, RANDOM_STATE: int): + """ + Trains a Random Forest Classifier. + + Args: + SAVE_DIR (str): Directory to save the trained model. + EPOCHS (int): Number of epochs for training. 
+ TEST_SIZE (float | int): Proportion of the dataset to include in the test split. + N_ESTIMATORS (int): Number of trees in the forest. + RANDOM_STATE (int): Random state for reproducibility. + """ logging.info("Training model...") # Load data @@ -391,7 +508,7 @@ def train_rfc(SAVE_DIR: str, EPOCHS: int, TEST_SIZE: float | int, # Save progress plot if not os.path.exists(SAVE_DIR): - mkdir(SAVE_DIR) + os.mkdir(SAVE_DIR) save_progress_graph(accuracies, filename=os.path.join(SAVE_DIR, "training_progress.png")) # Save model checkpoint diff --git a/CODE/VulnScan/v3/_generate_data.py b/CODE/VulnScan/v3/_generate_data.py index 0b28b6db..161ee97f 100644 --- a/CODE/VulnScan/v3/_generate_data.py +++ b/CODE/VulnScan/v3/_generate_data.py @@ -1,98 +1,86 @@ +from __future__ import annotations + import os import random import string import configparser from faker import Faker -# Initialize Faker -fake = Faker() - -# Read configuration -config = configparser.ConfigParser() -config.read('../../config.ini') - -# Load configuration values -config = config['VulnScan.generate Settings'] -EXTENSIONS_ALLOWED = config.get('extensions', '.txt').split(',') -SAVE_PATH = config.get('save_path', '.') -CODE_NAME = config.get('code_name', 'Sense') -SIZE_VARIATION = float(config.get('size_variation', '0.1')) - -# Ensure the save directory exists -os.makedirs(SAVE_PATH, exist_ok=True) - -# Set default file size and number of files -DEFAULT_FILE_NUM = 10000 -DEFAULT_MIN_FILE_SIZE = 10 * 1024 # 10 KB -DEFAULT_MAX_FILE_SIZE = 10 * 1024 # 10 KB - -# File configuration based on CODE_NAME -if CODE_NAME == 'Sense': - FILE_NUM = DEFAULT_FILE_NUM * 5 - MIN_FILE_SIZE = DEFAULT_MIN_FILE_SIZE * 5 - MAX_FILE_SIZE = DEFAULT_MAX_FILE_SIZE * 5 -elif CODE_NAME == 'SenseNano': - FILE_NUM = 5 - MIN_FILE_SIZE = int(DEFAULT_MIN_FILE_SIZE * 0.5) - MAX_FILE_SIZE = int(DEFAULT_MAX_FILE_SIZE * 0.5) -elif CODE_NAME == 'SenseMacro': - FILE_NUM = DEFAULT_FILE_NUM * 100 - MIN_FILE_SIZE = DEFAULT_MIN_FILE_SIZE - MAX_FILE_SIZE = DEFAULT_MAX_FILE_SIZE -elif CODE_NAME == 'SenseMini': - FILE_NUM = DEFAULT_FILE_NUM - MIN_FILE_SIZE = DEFAULT_MIN_FILE_SIZE - MAX_FILE_SIZE = DEFAULT_MAX_FILE_SIZE -else: # Custom configuration - MIN_FILE_SIZE = int(config['min_file_size'].replace('KB', '')) * 1024 - MAX_FILE_SIZE = int(config['max_file_size'].replace('KB', '')) * 1024 - FILE_NUM = DEFAULT_FILE_NUM - -print(f"Generating {FILE_NUM} files with sizes between {MIN_FILE_SIZE} and {MAX_FILE_SIZE} bytes") - - -# Function to generate random file names -def generate_random_filename(extensions, suffix_x): + +def generate_random_filename(extensions: str, suffix_x: str = '') -> str: + """ + Generate a random filename with the given extension and optional suffix. + + Args: + extensions (str): The file extension. + suffix_x (str, optional): An optional suffix to add to the filename. + + Returns: + str: The generated random filename. 
+ """ return ''.join(random.choices(string.ascii_letters + string.digits, k=10)) + suffix_x + extensions -# Function to generate content based on file extension -def generate_content_for_extension(extensions, size): - # Define sensitive data generators - sensitive_data_generators = { - '.txt': lambda: random.choice([ - fake.credit_card_number(), - fake.ssn(), - fake.password(), - fake.email(), - fake.phone_number(), - fake.iban(), - ]), - '.json': lambda: { - 'credit_card': fake.credit_card_number(), - 'email': fake.email(), - 'phone': fake.phone_number(), - 'password': fake.password(), - 'iban': fake.iban(), - }, - '.csv': lambda: ",".join([ - fake.credit_card_number(), - fake.email(), - fake.phone_number(), - ]), - '.xml': lambda: f"{random.choice([fake.credit_card_number(), fake.iban(), fake.password()])}", - '.log': lambda: f"{fake.date_time()} - Sensitive Data: {random.choice([fake.email(), fake.password(), fake.ipv4_private()])}", - 'default': lambda: fake.text(max_nb_chars=50) - } - - # Define sensitivity chances +def generate_content_for_extension(extensions: str, size: int | float) -> tuple[str, str]: + """ + Generate content based on the file extension and size. + + Args: + extensions (str): The file extension. + size (int | float): The size of the content to generate. + + Returns: + tuple[str, str]: The generated content and a suffix indicating the sensitivity level. + """ full_sensitive_chance = float(config.get('full_sensitive_chance', '0.1')) partial_sensitive_chance = float(config.get('partial_sensitive_chance', '0.3')) - def generate_sensitive_data(): + def generate_sensitive_data() -> str: + """ + Generate sensitive data based on the file extension. + + Returns: + str: The generated sensitive data. + """ + sensitive_data_generators = { + '.txt': lambda: random.choice([ + fake.credit_card_number(), + fake.ssn(), + fake.password(), + fake.email(), + fake.phone_number(), + fake.iban(), + ]), + '.json': lambda: { + 'credit_card': fake.credit_card_number(), + 'email': fake.email(), + 'phone': fake.phone_number(), + 'password': fake.password(), + 'iban': fake.iban(), + }, + '.csv': lambda: ",".join([ + fake.credit_card_number(), + fake.email(), + fake.phone_number(), + ]), + '.xml': lambda: f"{random.choice([fake.credit_card_number(), fake.iban(), fake.password()])}", + '.log': lambda: f"{fake.date_time()} - Sensitive Data: {random.choice([fake.email(), fake.password(), fake.ipv4_private()])}", + 'default': lambda: fake.text(max_nb_chars=50) + } + return sensitive_data_generators.get(extensions, sensitive_data_generators['default'])() - def generate_regular_content(extension_grc, sizes): + def generate_regular_content(extension_grc: str, sizes: int | float) -> str: + """ + Generate regular content based on the file extension and size. + + Args: + extension_grc (str): The file extension. + sizes (int | float): The size of the content to generate. + + Returns: + str: The generated regular content. 
+ """ if extension_grc == '.txt': content_grc = fake.text(max_nb_chars=sizes) elif extension_grc == '.json': @@ -111,12 +99,10 @@ def generate_regular_content(extension_grc, sizes): elif extension_grc == '.log': content_grc = "\n".join([f"{fake.date_time()} - {fake.text(50)}" for _ in range(sizes // 100)]) else: - # Default to plain text for unknown extensions content_grc = fake.text(max_nb_chars=sizes) return content_grc if random.random() < full_sensitive_chance: - # Generate fully sensitive content if extensions == '.json': contents = str([generate_sensitive_data() for _ in range(size // 500)]) elif extensions in ['.txt', '.log', '.xml']: @@ -127,12 +113,10 @@ def generate_regular_content(extension_grc, sizes): contents = "\n".join([generate_sensitive_data() for _ in range(size // 500)]) return contents, '-sensitive' else: - # Generate regular content with optional partial sensitivity regular_content = generate_regular_content(extensions, size) if random.random() < partial_sensitive_chance: - sensitive_data_count = max(1, size // 500) # Embed some sensitive data + sensitive_data_count = max(1, size // 500) sensitive_data = [generate_sensitive_data() for _ in range(sensitive_data_count)] - # Blend sensitive data into the regular content regular_content_lines = regular_content.split("\n") for _ in range(sensitive_data_count): insert_position = random.randint(0, len(regular_content_lines) - 1) @@ -144,8 +128,16 @@ def generate_regular_content(extension_grc, sizes): return contents, '-none' -# Function to generate file content -def generate_file_content(extensions): +def generate_file_content(extensions: str) -> tuple[str, str]: + """ + Generate file content based on the file extension. + + Args: + extensions (str): The file extension. + + Returns: + tuple[str, str]: The generated content and a suffix indicating the sensitivity level. + """ size = random.randint(MIN_FILE_SIZE, MAX_FILE_SIZE) if SIZE_VARIATION != 0: variation_choice = random.choice([1, 2, 3, 4]) @@ -161,14 +153,57 @@ def generate_file_content(extensions): return generate_content_for_extension(extensions, size) -# Generate files -for i in range(FILE_NUM): - print(f"Generating file {i + 1}/{FILE_NUM}") - extension = random.choice(EXTENSIONS_ALLOWED).strip() - content, suffix = generate_file_content(extension) - filename = generate_random_filename(extension, suffix) - filepath = os.path.join(SAVE_PATH, filename) - with open(filepath, 'w', encoding='utf-8') as f: - f.write(content) - -print(f"Generated {FILE_NUM} files in {SAVE_PATH}") +if __name__ == "__name__": + """ + Main function to generate files based on the configuration. 
+ """ + fake = Faker() + + config = configparser.ConfigParser() + config.read('../../config.ini') + + config = config['VulnScan.generate Settings'] + EXTENSIONS_ALLOWED = config.get('extensions', '.txt').split(',') + SAVE_PATH = config.get('save_path', '.') + CODE_NAME = config.get('code_name', 'Sense') + SIZE_VARIATION = float(config.get('size_variation', '0.1')) + + os.makedirs(SAVE_PATH, exist_ok=True) + + DEFAULT_FILE_NUM = 10000 + DEFAULT_MIN_FILE_SIZE = 10 * 1024 + DEFAULT_MAX_FILE_SIZE = 10 * 1024 + + if CODE_NAME == 'Sense': + FILE_NUM = DEFAULT_FILE_NUM * 5 + MIN_FILE_SIZE = DEFAULT_MIN_FILE_SIZE * 5 + MAX_FILE_SIZE = DEFAULT_MAX_FILE_SIZE * 5 + elif CODE_NAME == 'SenseNano': + FILE_NUM = 5 + MIN_FILE_SIZE = int(DEFAULT_MIN_FILE_SIZE * 0.5) + MAX_FILE_SIZE = int(DEFAULT_MAX_FILE_SIZE * 0.5) + elif CODE_NAME == 'SenseMacro': + FILE_NUM = DEFAULT_FILE_NUM * 100 + MIN_FILE_SIZE = DEFAULT_MIN_FILE_SIZE + MAX_FILE_SIZE = DEFAULT_MAX_FILE_SIZE + elif CODE_NAME == 'SenseMini': + FILE_NUM = DEFAULT_FILE_NUM + MIN_FILE_SIZE = DEFAULT_MIN_FILE_SIZE + MAX_FILE_SIZE = DEFAULT_MAX_FILE_SIZE + else: + MIN_FILE_SIZE = int(config['min_file_size'].replace('KB', '')) * 1024 + MAX_FILE_SIZE = int(config['max_file_size'].replace('KB', '')) * 1024 + FILE_NUM = DEFAULT_FILE_NUM + + print(f"Generating {FILE_NUM} files with sizes between {MIN_FILE_SIZE} and {MAX_FILE_SIZE} bytes") + + for i in range(FILE_NUM): + print(f"Generating file {i + 1}/{FILE_NUM}") + extension = random.choice(EXTENSIONS_ALLOWED).strip() + content, suffix = generate_file_content(extension) + filename = generate_random_filename(extension, suffix) + filepath = os.path.join(SAVE_PATH, filename) + with open(filepath, 'w', encoding='utf-8') as f: + f.write(content) + + print(f"Generated {FILE_NUM} files in {SAVE_PATH}") diff --git a/CODE/VulnScan/v3/_train.py b/CODE/VulnScan/v3/_train.py index 1600fee6..53c3d0a3 100644 --- a/CODE/VulnScan/v3/_train.py +++ b/CODE/VulnScan/v3/_train.py @@ -344,49 +344,50 @@ def train_model( train_traditional_model(model_name, epochs, save_model_path) -# Config file reading and setting constants -logger.info("Reading config file") -config = ConfigParser() -config.read('../../config.ini') -MODEL_NAME = config.get('VulnScan.train Settings', 'model_name') -TRAINING_PATH = config.get('VulnScan.train Settings', 'train_data_path') -EPOCHS = int(config.get('VulnScan.train Settings', 'epochs')) -BATCH_SIZE = int(config.get('VulnScan.train Settings', 'batch_size')) -LEARN_RATE = float(config.get('VulnScan.train Settings', 'learning_rate')) -CUDA = config.getboolean('VulnScan.train Settings', 'use_cuda') -SAVE_PATH = config.get('VulnScan.train Settings', 'save_model_path') - -# Load Data -logger.info(f"Loading data from {TRAINING_PATH}") -texts, labels = [], [] -for filename in os.listdir(TRAINING_PATH): - with open(os.path.join(config.get('VulnScan.train Settings', 'train_data_path'), filename), 'r', - encoding='utf-8') as file: - texts.append(file.read()) - labels.append(1 if '-sensitive' in filename else 0) - logger.debug(f"Loaded data from {filename} with label {labels[-1]}") - -# Split Data -logger.info("Splitting data into training and validation sets") -X_train, X_val, y_train, y_val = train_test_split(texts, - labels, - test_size=0.2, - random_state=42) - -# Train Model -try: - train_model(model_name=MODEL_NAME, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LEARN_RATE, - save_model_path=SAVE_PATH, - use_cuda=CUDA) -except FileNotFoundError as e: - logger.error(f"File Not Found Error in training 
model: {e}") - exit(1) -except AttributeError as e: - logger.error(f"Attribute Error in training model: {e}") - exit(1) -except Exception as e: - logger.error(f"Error in training model: {e}") - exit(1) +if __name__ == "__main__": + # Config file reading and setting constants + logger.info("Reading config file") + config = ConfigParser() + config.read('../../config.ini') + MODEL_NAME = config.get('VulnScan.train Settings', 'model_name') + TRAINING_PATH = config.get('VulnScan.train Settings', 'train_data_path') + EPOCHS = int(config.get('VulnScan.train Settings', 'epochs')) + BATCH_SIZE = int(config.get('VulnScan.train Settings', 'batch_size')) + LEARN_RATE = float(config.get('VulnScan.train Settings', 'learning_rate')) + CUDA = config.getboolean('VulnScan.train Settings', 'use_cuda') + SAVE_PATH = config.get('VulnScan.train Settings', 'save_model_path') + + # Load Data + logger.info(f"Loading data from {TRAINING_PATH}") + texts, labels = [], [] + for filename in os.listdir(TRAINING_PATH): + with open(os.path.join(config.get('VulnScan.train Settings', 'train_data_path'), filename), 'r', + encoding='utf-8') as file: + texts.append(file.read()) + labels.append(1 if '-sensitive' in filename else 0) + logger.debug(f"Loaded data from {filename} with label {labels[-1]}") + + # Split Data + logger.info("Splitting data into training and validation sets") + X_train, X_val, y_train, y_val = train_test_split(texts, + labels, + test_size=0.2, + random_state=42) + + # Train Model + try: + train_model(model_name=MODEL_NAME, + epochs=EPOCHS, + batch_size=BATCH_SIZE, + learning_rate=LEARN_RATE, + save_model_path=SAVE_PATH, + use_cuda=CUDA) + except FileNotFoundError as e: + logger.error(f"File Not Found Error in training model: {e}") + exit(1) + except AttributeError as e: + logger.error(f"Attribute Error in training model: {e}") + exit(1) + except Exception as e: + logger.error(f"Error in training model: {e}") + exit(1) From f409b27dcd43d39a693584cc7e8ce53513018aea Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 10:59:10 +0400 Subject: [PATCH 13/20] Made file non-importable --- CODE/VulnScan/tools/_study_network.py | 7 ++----- CODE/VulnScan/tools/_test_gpu_acceleration.py | 2 ++ CODE/VulnScan/tools/_vectorizer.py | 2 ++ CODE/VulnScan/v2-deprecated/_generate_data.py | 5 ++++- CODE/VulnScan/v2-deprecated/_train.py | 2 ++ CODE/VulnScan/v3/_generate_data.py | 2 ++ CODE/VulnScan/v3/_train.py | 2 ++ 7 files changed, 16 insertions(+), 6 deletions(-) diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index c52ba622..69f823ab 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -26,11 +26,6 @@ from tqdm import tqdm -# TODO Do v3.1 plans -# Raise an ImportError to make the file unimportable -# raise ImportError("This file cannot be imported") - - # Example of DataLoader for loss landscape (dummy dataset for visualization) class DummyDataset(torch.utils.data.Dataset): """ @@ -624,3 +619,5 @@ def model_summary(): model_summary() main_plot() +else: + raise ImportError("This file cannot be imported") diff --git a/CODE/VulnScan/tools/_test_gpu_acceleration.py b/CODE/VulnScan/tools/_test_gpu_acceleration.py index e45d05c8..a7e47134 100644 --- a/CODE/VulnScan/tools/_test_gpu_acceleration.py +++ b/CODE/VulnScan/tools/_test_gpu_acceleration.py @@ -23,3 +23,5 @@ def check_gpu(): if __name__ == '__main__': check_gpu() +else: + raise ImportError("This file cannot be imported") diff --git a/CODE/VulnScan/tools/_vectorizer.py 
b/CODE/VulnScan/tools/_vectorizer.py index 63577fa8..5d316de9 100644 --- a/CODE/VulnScan/tools/_vectorizer.py +++ b/CODE/VulnScan/tools/_vectorizer.py @@ -80,3 +80,5 @@ def main(data_paths: str, vectorizer_types: str, output_paths: str): if not os.path.exists(output_path): os.makedirs(output_path) main(data_path, vectorizer_type, output_path) +else: + raise ImportError("This file cannot be imported") diff --git a/CODE/VulnScan/v2-deprecated/_generate_data.py b/CODE/VulnScan/v2-deprecated/_generate_data.py index 78722f47..62fc91e5 100644 --- a/CODE/VulnScan/v2-deprecated/_generate_data.py +++ b/CODE/VulnScan/v2-deprecated/_generate_data.py @@ -103,4 +103,7 @@ def create_random_files(directories: str, num_file: int = 100): print(f"Created {file_type} file: {file_name}") -create_random_files(SAVE_DIRECTORY, num_file=1000000) +if __name__ == "__main__": + create_random_files(SAVE_DIRECTORY, num_file=1000000) +else: + raise ImportError("This file cannot be imported") diff --git a/CODE/VulnScan/v2-deprecated/_train.py b/CODE/VulnScan/v2-deprecated/_train.py index f25d4152..9fc51517 100644 --- a/CODE/VulnScan/v2-deprecated/_train.py +++ b/CODE/VulnScan/v2-deprecated/_train.py @@ -544,3 +544,5 @@ def train_rfc(SAVE_DIR: str, EPOCHS: int, TEST_SIZE: float | int, train_model_blx(MODEL_TYPE="bert", SAVE_DIR=r"C:\Users\Hp\Desktop\Model Tests\Model Sense .2b1", EPOCHS=5, BATCH_SIZE=8, LEARNING_RATE=5e-5, MAX_FEATURES=5000, MAX_LEN=128, TEST_SIZE=0.2, RANDOM_STATE=42, MODEL_PATH_BERT="../bert-base-uncased-model") +else: + raise ImportError("This file cannot be imported") diff --git a/CODE/VulnScan/v3/_generate_data.py b/CODE/VulnScan/v3/_generate_data.py index 161ee97f..e1f0d0c8 100644 --- a/CODE/VulnScan/v3/_generate_data.py +++ b/CODE/VulnScan/v3/_generate_data.py @@ -207,3 +207,5 @@ def generate_file_content(extensions: str) -> tuple[str, str]: f.write(content) print(f"Generated {FILE_NUM} files in {SAVE_PATH}") +else: + raise ImportError("This file cannot be imported") diff --git a/CODE/VulnScan/v3/_train.py b/CODE/VulnScan/v3/_train.py index 53c3d0a3..586bd5fb 100644 --- a/CODE/VulnScan/v3/_train.py +++ b/CODE/VulnScan/v3/_train.py @@ -391,3 +391,5 @@ def train_model( except Exception as e: logger.error(f"Error in training model: {e}") exit(1) +else: + raise ImportError("This file cannot be imported") From ae35a70384fd8fa4f6c93609b2f16bfac8a2c826 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 11:14:56 +0400 Subject: [PATCH 14/20] Documentation update --- CODE/VulnScan/Documentation.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/CODE/VulnScan/Documentation.md b/CODE/VulnScan/Documentation.md index 4b750fda..7b0f5dc9 100644 --- a/CODE/VulnScan/Documentation.md +++ b/CODE/VulnScan/Documentation.md @@ -107,3 +107,32 @@ VulnScan is designed to detect sensitive data across various file formats. It of - **Progress Tracking**: Visualizes accuracy and loss per epoch with graphs. - **Error Handling**: Logs errors for missing files, attribute issues, or unexpected conditions. - **Extensibility**: Supports plug-and-play integration for new algorithms or datasets. + + +# More files + +There is a repository that archived all the data used to make the model, +as well as previously trained models for you to test out +(loading scripts and vectorizers are not included). + +The repository is located [here](https://github.com/DefinetlyNotAI/VulnScan_TrainingData). 
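+
+Since the loading scripts are not included, here is a minimal loading sketch (assuming a fully pickled `.pth` model and a joblib-serialized vectorizer, which is what `vulnscan.py` expects; the file names below are placeholders for whichever archived model you download):
+
+```python
+import joblib
+import torch
+
+# Placeholder paths - substitute the archived model/vectorizer you downloaded
+model = torch.load("Model.pth", weights_only=False)  # full pickled module, not just a state dict
+vectorizer = joblib.load("Vectorizer.pkl")
+
+model.eval()
+features = vectorizer.transform(["text to classify"])
+features_tensor = torch.tensor(features.toarray(), dtype=torch.float32)
+with torch.no_grad():
+    probability = torch.softmax(model(features_tensor), dim=1).max().item()
+print(f"Sensitivity probability: {probability:.2f}")
+```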
+
+The repository contains the following directories:
+- `Training Data`: Contains the data used to train the models. It is organized by file size and file count, except for the test sets, which are explicitly labelled as text.
+- `Archived Models`: Contains the previously trained models, organized by model type and then version.
+- `NN features`: Contains information about the model `.3n3` and the vectorizer used. The information includes:
+  - `Documentation_Study_Network.md`: A markdown file that contains more info.
+  - `Neural Network Nodes Graph.gexf`: A Gephi file that contains the model nodes and edges.
+  - `Nodes and edges (GEPHI).csv`: A CSV file that contains the model nodes and edges.
+  - `Statistics`: Directories made by Gephi, containing the statistics of the model nodes and edges.
+  - `Feature_Importance.svg`: An SVG file that contains the feature importance of the model.
+  - `Loss_Landscape_3D.html`: An HTML file that contains the 3D loss landscape of the model.
+  - `Model Accuracy Over Epochs.png` and `Model Loss Over Epochs.png`: PNG files that contain the model accuracy and loss over epochs.
+  - `Model state dictionary.txt`: A text file that contains the model state dictionary.
+  - `Model Summary.txt`: A text file that contains the model summary.
+  - `Model Visualization.png`: A PNG file that contains the model visualization.
+  - `Top_90_Features.svg`: An SVG file that contains the top 90 features of the model.
+  - `Vectorizer features.txt`: A text file that contains the vectorizer features.
+  - `Visualize Activation.png`: A PNG file that contains the visualization of the model activation.
+  - `Visualize t-SNE.png`: A PNG file that contains the visualization of the model t-SNE.
+  - `Weight Distribution.png`: A PNG file that contains the weight distribution of the model.

From 2843e884a36133a62b525b67465e1c3b11374b8b Mon Sep 17 00:00:00 2001
From: DefinetlyNotAI
Date: Wed, 11 Dec 2024 11:21:34 +0400
Subject: [PATCH 15/20] PLANS.md update

1. Changed the deprecation versions
2. Removed an old plan made in v3.0.0 or v3.1.0

---
 CODE/VulnScan/v2-deprecated/_train.py | 22 +++++++++++-----------
 PLANS.md                              | 20 +++++++++++---------
 2 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/CODE/VulnScan/v2-deprecated/_train.py b/CODE/VulnScan/v2-deprecated/_train.py
index 9fc51517..7940b1c3 100644
--- a/CODE/VulnScan/v2-deprecated/_train.py
+++ b/CODE/VulnScan/v2-deprecated/_train.py
@@ -40,7 +40,7 @@
 # ---------------------------------------
 
-@deprecated(reason="This function is used to load data from a directory. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0")
+@deprecated(reason="This function is used to load data from a directory. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0")
 def load_data(data_dir: str) -> tuple[list[str], np.ndarray]:
     """
     Loads text data and labels from the directory.
@@ -61,7 +61,7 @@ def load_data(data_dir: str) -> tuple[list[str], np.ndarray]:
     return texts, np.array(labels)
 
-@deprecated(reason="This function is used to evaluate a model. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0")
+@deprecated(reason="This function is used to evaluate a model. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0")
 def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[float, float, float, float, float]:
     """
     Evaluates the model using standard metrics.
@@ -88,7 +88,7 @@ def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[float, float # --------------------------------------- -@deprecated(reason="This function is used to save progress graphs. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") +@deprecated(reason="This function is used to save progress graphs. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0") def save_progress_graph(accuracies: list[float], filename: str = "training_progress.png"): """ Saves a graph of training progress. @@ -108,7 +108,7 @@ def save_progress_graph(accuracies: list[float], filename: str = "training_progr plt.close() -@deprecated(reason="This function is used to train xgboost. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") +@deprecated(reason="This function is used to train xgboost. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0") def train_xgboost(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, SAVE_DIR: str): """ @@ -133,7 +133,7 @@ def train_xgboost(X_train: np.ndarray, X_test: np.ndarray, logging.info("Model saved as xgboost_model.pkl") -@deprecated(reason="This function is used to train bert. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") +@deprecated(reason="This function is used to train bert. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0") def train_bert(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, MAX_LEN: int, LEARNING_RATE: float, BATCH_SIZE: int, EPOCHS: int, SAVE_DIR: str, MODEL_PATH: str): @@ -201,7 +201,7 @@ def train_bert(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, class LSTMModel(nn.Module): - @deprecated(reason="This class is used to define an LSTM model. Its for training v2 models, which is now deprecated, use _train.py v3 instead.", removal_version="3.3.0") + @deprecated(reason="This class is used to define an LSTM model. Its for training v2 models, which is now deprecated, use _train.py v3 instead.", removal_version="3.2.0") def __init__(self, vocab_size: int, embedding_dim: int = 128, hidden_dim: int = 128, output_dim: int = 1): """ Initializes the LSTM model. @@ -218,7 +218,7 @@ def __init__(self, vocab_size: int, embedding_dim: int = 128, hidden_dim: int = self.fc = nn.Linear(hidden_dim * 2, output_dim) # Bidirectional, so multiply by 2 self.sigmoid = nn.Sigmoid() - @deprecated(reason="This class is used to define an LSTM model. Its for training v2 models, which is now deprecated, use _train.py v3 instead.", removal_version="3.3.0") + @deprecated(reason="This class is used to define an LSTM model. Its for training v2 models, which is now deprecated, use _train.py v3 instead.", removal_version="3.2.0") def forward(self, x: torch.Tensor) -> torch.Tensor: """ Defines the forward pass of the LSTM model. @@ -236,7 +236,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -@deprecated(reason="This function is used to train lstm. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") +@deprecated(reason="This function is used to train lstm. 
Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0") def train_lstm(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, MAX_FEATURES: int, LEARNING_RATE: float, BATCH_SIZE: int, EPOCHS: int, SAVE_DIR: str): @@ -312,7 +312,7 @@ def train_lstm(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, # --------------------------------------- # noinspection DuplicatedCode -@deprecated(reason="This function is used to train NeuralNetworks/SVM. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") +@deprecated(reason="This function is used to train NeuralNetworks/SVM. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0") def train_nn_svm(MODEL: str, EPOCHS: int, SAVE_DIR: str, MAX_FEATURES: int, TEST_SIZE: float | int, MAX_ITER: int, RANDOM_STATE: int): @@ -413,7 +413,7 @@ def train_nn_svm(MODEL: str, EPOCHS: int, SAVE_DIR: str, logging.info("Training complete.") -@deprecated(reason="This function is used setup training. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") +@deprecated(reason="This function is used setup training. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0") def train_model_blx(MODEL_TYPE: str, SAVE_DIR: str, EPOCHS: int, BATCH_SIZE: int, LEARNING_RATE: float, MAX_FEATURES: int, MAX_LEN: int, TEST_SIZE: float | int, RANDOM_STATE: int, MODEL_PATH_BERT: str = None): @@ -462,7 +462,7 @@ def train_model_blx(MODEL_TYPE: str, SAVE_DIR: str, EPOCHS: int, BATCH_SIZE: int # noinspection DuplicatedCode -@deprecated(reason="This function is used to train RandomForest. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.3.0") +@deprecated(reason="This function is used to train RandomForest. Its for training v2 models, which is now deprecated, use train.py v3 instead.", removal_version="3.2.0") def train_rfc(SAVE_DIR: str, EPOCHS: int, TEST_SIZE: float | int, N_ESTIMATORS: int, RANDOM_STATE: int): """ diff --git a/PLANS.md b/PLANS.md index 39cc6f5d..e567ff5c 100644 --- a/PLANS.md +++ b/PLANS.md @@ -5,12 +5,14 @@ > - ❌ = Might be done, Not sure yet > - ✅ = Will be done, 100% sure -| Task | Version | Might or Will be done? | -|---------------------------------------------------------------------------------------------------------------------------------|---------|------------------------| -| Add a tool to capture and analyse memory dumps, which can help in forensic investigations. | v3.1.0 | ❌ | -| Add a tool to capture and analyse network traffic, which can help in forensic investigations. | v3.1.0 | ❌ | -| Remove EXTRA dir, and zip features with custom proper features from Logicytics, as well as remove EXTRA wrapper | v3.2.0 | ❌ | -| Implement a parser for Windows Prefetch files, Shellbags, Jump Lists, LNK files to extract data | v3.3a.0 | ✅ | -| Implement a parser for Windows UserAssist registry key, SRUM database to extract data. | v3.3b.0 | ✅ | -| Implement a parser for Windows Volume Shadow Copy, LSA Secrets, Syscache, Shimcache, Amcache Event Tracing logs to extract data | v3.3c.0 | ✅ | -| Implement the 2 missing flags | v3.4.0 | ✅ | +| Task | Version | Might or Will be done? 
| +|---------------------------------------------------------------------------------------------------------------------------------|----------------|------------------------| +| Add a tool to capture and analyse memory dumps, which can help in forensic investigations. | v3.1.0 | ❌ | +| Remove EXTRA dir, and zip features with custom proper features from Logicytics, as well as remove EXTRA wrapper | v3.2.0 | ❌ | +| Remove deprecated feature: `_train.py` | v3.2.0 | ❌ | +| Implement a parser for Windows Prefetch files, Shellbags, Jump Lists, LNK files to extract data | snapshot-3.3.a | ✅ | +| Implement a parser for Windows UserAssist registry key, SRUM database to extract data. | snapshot-3.3.b | ✅ | +| Implement a parser for Windows Volume Shadow Copy, LSA Secrets, Syscache, Shimcache, Amcache Event Tracing logs to extract data | snapshot-3.3.c | ✅ | +| Implement the 2 missing flags | v3.4.0 | ✅ | +| Remove deprecated feature: `_generate_data.py` | v3.4.0 | ✅ | +| Move VulnScan tools and v3 module to separate repository, keep only the model and vectorizer | v3.5.0 | ✅ | From 3d2a82c5eb2aa5162b18dcd0bb89da51edcf7903 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 13:18:23 +0400 Subject: [PATCH 16/20] Made dump_memory.py Fixed minor bug, and added dump_memory.py to Logicytics.py, also made dump_memory.py which generates around 3 files with data from the system's RAM, 1 is in HEX aka unreadable --- CODE/Logicytics.py | 6 +- CODE/dump_memory.py | 163 ++++++++++++++++++++++++++++++++++++++++++++ PLANS.md | 1 - 3 files changed, 166 insertions(+), 4 deletions(-) create mode 100644 CODE/dump_memory.py diff --git a/CODE/Logicytics.py b/CODE/Logicytics.py index a5f5583a..db37e416 100644 --- a/CODE/Logicytics.py +++ b/CODE/Logicytics.py @@ -16,11 +16,12 @@ # Initialization FileManagement.mkdir() log = Log({"log_level": DEBUG, "delete_log": DELETE_LOGS}) +ACTION = None +SUB_ACTION = None class Health: @staticmethod - @log.function def backup(directory: str, name: str): """ Creates a backup of a specified directory by zipping its contents and moving it to a designated backup location. @@ -47,7 +48,6 @@ def backup(directory: str, name: str): shutil.move(f"{name}.zip", "../ACCESS/BACKUP") @staticmethod - @log.function def update() -> tuple[str, str]: """ Updates the repository by pulling the latest changes from the remote repository. 
@@ -327,7 +327,7 @@ def zip_generated_files():
     """Zips generated files based on the action."""
 
     def zip_and_log(directory, name):
-        zip_values = FileManagement.Zip.and_hash(directory, name, ACTION)
+        zip_values = FileManagement.Zip.and_hash(directory, name, ACTION if not None else "ERROR_NULL_ACTION_VALUE")
         if isinstance(zip_values, str):
             log.error(zip_values)
         else:
diff --git a/CODE/dump_memory.py b/CODE/dump_memory.py
new file mode 100644
index 00000000..3df3b21e
--- /dev/null
+++ b/CODE/dump_memory.py
@@ -0,0 +1,163 @@
+import datetime
+import platform
+import ctypes
+import os
+import psutil
+from logicytics import Log, DEBUG
+
+if __name__ == "__main__":
+    log = Log({"log_level": DEBUG})
+
+
+# Function to save RAM content snapshot to a file
+def dump_ram_content():
+    try:
+        # Output file for the RAM snapshot (the timestamp is embedded in the file contents)
+        dump_file = "Ram_Snapshot.txt"
+
+        # Gather memory statistics using psutil
+        memory_info = psutil.virtual_memory()
+        swap_info = psutil.swap_memory()
+
+        # Get system-specific details
+        system_info = (
+            "System Information:\n"
+            "===================================\n"
+            f"OS: {platform.system()} {platform.release()}\n"
+            f"Architecture: {platform.architecture()[0]}\n"
+            f"Processor: {platform.processor()}\n"
+            f"Machine: {platform.machine()}\n\n"
+        )
+
+        # Prepare content to dump
+        dump_content = (
+            f"RAM Snapshot - {datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}\n"
+            "===================================\n"
+            f"{system_info}"
+            f"Total Memory: {memory_info.total / (1024 ** 3):.2f} GB\n"
+            f"Available Memory: {memory_info.available / (1024 ** 3):.2f} GB\n"
+            f"Used Memory: {memory_info.used / (1024 ** 3):.2f} GB\n"
+            f"Memory Usage: {memory_info.percent}%\n\n"
+            f"Swap Total: {swap_info.total / (1024 ** 3):.2f} GB\n"
+            f"Swap Used: {swap_info.used / (1024 ** 3):.2f} GB\n"
+            f"Swap Free: {swap_info.free / (1024 ** 3):.2f} GB\n"
+            f"Swap Usage: {swap_info.percent}%\n"
+        )
+
+        # Write the content to the file
+        with open(dump_file, "w", encoding="utf-8") as file:
+            file.write(dump_content)
+
+        log.info(f"RAM snapshot saved to: {dump_file}")
+
+    except Exception as e:
+        log.error(f"Error capturing RAM snapshot: {e}")
+
+
+# Define structures for SystemInfo
+class SystemInfo(ctypes.Structure):
+    _fields_ = [
+        ("wProcessorArchitecture", ctypes.c_ushort),
+        ("wReserved", ctypes.c_ushort),
+        ("dwPageSize", ctypes.c_ulong),
+        ("lpMinimumApplicationAddress", ctypes.c_void_p),
+        ("lpMaximumApplicationAddress", ctypes.c_void_p),
+        ("dwActiveProcessorMask", ctypes.POINTER(ctypes.c_ulong)),
+        ("dwNumberOfProcessors", ctypes.c_ulong),
+        ("dwProcessorType", ctypes.c_ulong),
+        ("dwAllocationGranularity", ctypes.c_ulong),
+        ("wProcessorLevel", ctypes.c_ushort),
+        ("wProcessorRevision", ctypes.c_ushort),
+    ]
+
+
+# Define BasicMemInfo
+class BasicMemInfo(ctypes.Structure):
+    _fields_ = [
+        ("BaseAddress", ctypes.c_void_p),
+        ("AllocationBase", ctypes.c_void_p),
+        ("AllocationProtect", ctypes.c_ulong),
+        ("RegionSize", ctypes.c_size_t),
+        ("State", ctypes.c_ulong),
+        ("Protect", ctypes.c_ulong),
+        ("Type", ctypes.c_ulong),
+    ]
+
+
+def get_system_info() -> SystemInfo:
+    system_info = SystemInfo()
+    ctypes.windll.kernel32.GetSystemInfo(ctypes.byref(system_info))
+    return system_info
+
+
+def read_memory():
+    # Open current process with permissions
+    process = ctypes.windll.kernel32.OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, False, os.getpid())
+    if not process:
+        log.error("Unable to open process for reading.")
+        return
+
+    # Get system info
+    system_info = get_system_info()
+    min_address = system_info.lpMinimumApplicationAddress
+    max_address = system_info.lpMaximumApplicationAddress
+    with open("SystemRam_Info.txt", "w") as sys_file:
+        sys_file.write(f"System Information:\n")
+        sys_file.write("===================================\n")
+        sys_file.write(f"Minimum Address: {min_address}\n")
+        sys_file.write(f"Maximum Address: {max_address}\n")
+        sys_file.write(f"Allocation Granularity: {system_info.dwAllocationGranularity}\n")
+        sys_file.write(f"Processor Architecture: {system_info.wProcessorArchitecture}\n")
+        sys_file.write(f"Number of Processors: {system_info.dwNumberOfProcessors}\n")
+        sys_file.write(f"Processor Type: {system_info.dwProcessorType}\n")
+        sys_file.write(f"Processor Level: {system_info.wProcessorLevel}\n")
+        sys_file.write(f"Processor Revision: {system_info.wProcessorRevision}\n")
+        sys_file.write(f"Page Size: {system_info.dwPageSize}\n")
+        sys_file.write(f"Active Processor Mask: {system_info.dwActiveProcessorMask.contents}\n")
+        sys_file.write(f"Reserved: {system_info.wReserved}\n")
+        sys_file.write("===================================\n")
+        sys_file.write(f"Raw SystemInfo: {system_info}\n")
+        sys_file.write("===================================\n")
+    log.debug(f"Memory Range: {min_address:#x} - {max_address:#x}")
+
+    # Iterate through memory pages
+    memory_info = BasicMemInfo()
+    address = min_address
+    with open("Ram_Dump.txt", "w") as dump_file:
+        while address < max_address:
+            result = ctypes.windll.kernel32.VirtualQueryEx(
+                process, ctypes.c_void_p(address), ctypes.byref(memory_info), ctypes.sizeof(memory_info)
+            )
+            if not result:
+                break
+
+            # Check if the memory is committed and readable
+            if memory_info.State == MEM_COMMIT and memory_info.Protect == PAGE_READWRITE:
+                buffer = ctypes.create_string_buffer(memory_info.RegionSize)
+                bytes_read = ctypes.c_size_t()
+                ctypes.windll.kernel32.ReadProcessMemory(
+                    process,
+                    ctypes.c_void_p(memory_info.BaseAddress),
+                    buffer,
+                    memory_info.RegionSize,
+                    ctypes.byref(bytes_read),
+                )
+                dump_file.write(str(buffer.raw[: bytes_read.value]))
+
+            address += memory_info.RegionSize
+
+    # Close the process handle
+    ctypes.windll.kernel32.CloseHandle(process)
+    log.info("Memory dump complete. Saved to 'Ram_Dump.txt'.")
+    log.warning("Encoding is in HEX")
+
+
+if __name__ == "__main__":
+    # Constants
+    PROCESS_QUERY_INFORMATION = 0x0400
+    PROCESS_VM_READ = 0x0010
+    MEM_COMMIT = 0x1000
+    PAGE_READWRITE = 0x04
+
+    dump_ram_content()
+    read_memory()
diff --git a/PLANS.md b/PLANS.md
index e567ff5c..cf9a1dbb 100644
--- a/PLANS.md
+++ b/PLANS.md
@@ -7,7 +7,6 @@
 | Task                                                                                                                              | Version        | Might or Will be done? |
 |---------------------------------------------------------------------------------------------------------------------------------|----------------|------------------------|
-| Add a tool to capture and analyse memory dumps, which can help in forensic investigations.
| v3.1.0 | ❌ | | Remove EXTRA dir, and zip features with custom proper features from Logicytics, as well as remove EXTRA wrapper | v3.2.0 | ❌ | | Remove deprecated feature: `_train.py` | v3.2.0 | ❌ | | Implement a parser for Windows Prefetch files, Shellbags, Jump Lists, LNK files to extract data | snapshot-3.3.a | ✅ | From e79086ccee5d3f8c0bd5ee6584c05ff959efb3b2 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 14:05:36 +0400 Subject: [PATCH 17/20] CodeRabbit Suggestions --- CODE/Logicytics.py | 10 +++++++--- CODE/VulnScan/v3/_generate_data.py | 20 ++++++++++++++++---- CODE/VulnScan/v3/_train.py | 29 +++++++++++++++++++++++++++++ CODE/dump_memory.py | 2 +- 4 files changed, 53 insertions(+), 8 deletions(-) diff --git a/CODE/Logicytics.py b/CODE/Logicytics.py index db37e416..381becb6 100644 --- a/CODE/Logicytics.py +++ b/CODE/Logicytics.py @@ -325,9 +325,13 @@ def threaded_execution(execution_list_thread, index_thread): def zip_generated_files(): """Zips generated files based on the action.""" - - def zip_and_log(directory, name): - zip_values = FileManagement.Zip.and_hash(directory, name, ACTION if not None else "ERROR_NULL_ACTION_VALUE") + def zip_and_log(directory: str, name: str): + log.debug(f"Zipping directory '{directory}' with name '{name}' under action '{ACTION}'") + zip_values = FileManagement.Zip.and_hash( + directory, + name, + ACTION if ACTION is not None else f"ERROR_NO_ACTION_SPECIFIED_{datetime.now().isoformat()}" + ) if isinstance(zip_values, str): log.error(zip_values) else: diff --git a/CODE/VulnScan/v3/_generate_data.py b/CODE/VulnScan/v3/_generate_data.py index e1f0d0c8..70f642c9 100644 --- a/CODE/VulnScan/v3/_generate_data.py +++ b/CODE/VulnScan/v3/_generate_data.py @@ -4,9 +4,19 @@ import random import string import configparser +from Logicytics import Log, DEBUG from faker import Faker +logger = Log( + {"log_level": DEBUG, + "filename": "../../../ACCESS/LOGS/VulnScan_Train.log", + "colorlog_fmt_parameters": + "%(log_color)s%(levelname)-8s%(reset)s %(yellow)s%(asctime)s %(blue)s%(message)s", + } +) + + def generate_random_filename(extensions: str, suffix_x: str = '') -> str: """ Generate a random filename with the given extension and optional suffix. 
@@ -149,7 +159,7 @@ def generate_file_content(extensions: str) -> tuple[str, str]: size = abs(int(size + (size / SIZE_VARIATION))) elif variation_choice == 4: size = abs(int(size - (size / SIZE_VARIATION))) - print(f"Generating {extensions} content of size {size} bytes") + logger.debug(f"Generating {extensions} content of size {size} bytes") return generate_content_for_extension(extensions, size) @@ -183,6 +193,8 @@ def generate_file_content(extensions: str) -> tuple[str, str]: MIN_FILE_SIZE = int(DEFAULT_MIN_FILE_SIZE * 0.5) MAX_FILE_SIZE = int(DEFAULT_MAX_FILE_SIZE * 0.5) elif CODE_NAME == 'SenseMacro': + logger.warning("Generating 100 times more files and 100 times larger files") + logger.warning("This is being deprecated in version 3.2.0") FILE_NUM = DEFAULT_FILE_NUM * 100 MIN_FILE_SIZE = DEFAULT_MIN_FILE_SIZE MAX_FILE_SIZE = DEFAULT_MAX_FILE_SIZE @@ -195,10 +207,10 @@ def generate_file_content(extensions: str) -> tuple[str, str]: MAX_FILE_SIZE = int(config['max_file_size'].replace('KB', '')) * 1024 FILE_NUM = DEFAULT_FILE_NUM - print(f"Generating {FILE_NUM} files with sizes between {MIN_FILE_SIZE} and {MAX_FILE_SIZE} bytes") + logger.info(f"Generating {FILE_NUM} files with sizes between {MIN_FILE_SIZE} and {MAX_FILE_SIZE} bytes") for i in range(FILE_NUM): - print(f"Generating file {i + 1}/{FILE_NUM}") + logger.debug(f"Generating file {i + 1}/{FILE_NUM}") extension = random.choice(EXTENSIONS_ALLOWED).strip() content, suffix = generate_file_content(extension) filename = generate_random_filename(extension, suffix) @@ -206,6 +218,6 @@ def generate_file_content(extensions: str) -> tuple[str, str]: with open(filepath, 'w', encoding='utf-8') as f: f.write(content) - print(f"Generated {FILE_NUM} files in {SAVE_PATH}") + logger.info(f"Generated {FILE_NUM} files in {SAVE_PATH}") else: raise ImportError("This file cannot be imported") diff --git a/CODE/VulnScan/v3/_train.py b/CODE/VulnScan/v3/_train.py index 586bd5fb..55fe845f 100644 --- a/CODE/VulnScan/v3/_train.py +++ b/CODE/VulnScan/v3/_train.py @@ -344,11 +344,38 @@ def train_model( train_traditional_model(model_name, epochs, save_model_path) +def validate_data(): + """ + Validates the data by checking if the variables are of the correct type. 
+ """ + if not isinstance(EPOCHS, int) and EPOCHS > 0: + logger.error("EPOCHS must be an integer") + exit(1) + if not isinstance(BATCH_SIZE, int) and BATCH_SIZE > 0: + logger.error("BATCH_SIZE must be an integer") + exit(1) + if not isinstance(LEARN_RATE, float) and 0 < LEARN_RATE < 1: + logger.error("LEARN_RATE must be a float") + exit(1) + if not isinstance(CUDA, bool): + logger.error("CUDA must be a boolean") + exit(1) + allowed_models = ["NeuralNetwork", "LogReg", + "RandomForest", "ExtraTrees", "GBM", + "XGBoost", "DecisionTree", "NaiveBayes"] + if MODEL_NAME not in allowed_models: + logger.error('MODEL_NAME must be one of the following: ' + '"NeuralNetwork", "LogReg", "RandomForest", ' + '"ExtraTrees", "GBM","XGBoost", "DecisionTree", "NaiveBayes"') + exit(1) + + if __name__ == "__main__": # Config file reading and setting constants logger.info("Reading config file") config = ConfigParser() config.read('../../config.ini') + MODEL_NAME = config.get('VulnScan.train Settings', 'model_name') TRAINING_PATH = config.get('VulnScan.train Settings', 'train_data_path') EPOCHS = int(config.get('VulnScan.train Settings', 'epochs')) @@ -357,6 +384,8 @@ def train_model( CUDA = config.getboolean('VulnScan.train Settings', 'use_cuda') SAVE_PATH = config.get('VulnScan.train Settings', 'save_model_path') + validate_data() + # Load Data logger.info(f"Loading data from {TRAINING_PATH}") texts, labels = [], [] diff --git a/CODE/dump_memory.py b/CODE/dump_memory.py index 3df3b21e..db2ab7db 100644 --- a/CODE/dump_memory.py +++ b/CODE/dump_memory.py @@ -102,7 +102,7 @@ def read_memory(): min_address = system_info.lpMinimumApplicationAddress max_address = system_info.lpMaximumApplicationAddress with open("SystemRam_Info.txt", "w") as sys_file: - sys_file.write(f"System Information:\n") + sys_file.write("System Information:\n") sys_file.write("===================================\n") sys_file.write(f"Minimum Address: {min_address}\n") sys_file.write(f"Maximum Address: {max_address}\n") From 6ab61b51e613bd5ad7c1ab47f327f77c014c5298 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 14:19:46 +0400 Subject: [PATCH 18/20] Did --dev and fixed bug with it Keeps mistaking files added and removed due to trailing `"` --- CODE/_dev.py | 8 ++++---- CODE/config.ini | 4 ++-- README.md | 41 +++++++++++++++++++++-------------------- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/CODE/_dev.py b/CODE/_dev.py index 47687b6f..18755ea1 100644 --- a/CODE/_dev.py +++ b/CODE/_dev.py @@ -66,7 +66,7 @@ def dev_checks() -> None: Performs a series of checks to ensure that the developer has followed the required guidelines and best practices. Returns: bool: True if all checks pass, otherwise False. 
- """ + """ # Create the necessary directories if they do not exist FileManagement.mkdir() @@ -88,9 +88,9 @@ def dev_checks() -> None: # Get the list of code files in the current directory files = Get.list_of_code_files(".") - added_files = [f for f in files if f not in CURRENT_FILES] - removed_files = [f for f in CURRENT_FILES if f not in files] - normal_files = [f for f in files if f in CURRENT_FILES] + added_files = [f.replace('"', '') for f in files if f not in CURRENT_FILES] + removed_files = [f.replace('"', '') for f in CURRENT_FILES if f not in files] + normal_files = [f.replace('"', '') for f in files if f in CURRENT_FILES] # Print the list of added, removed, and normal files in color print("\n".join([f"\033[92m+ {file}\033[0m" for file in added_files])) # Green + diff --git a/CODE/config.ini b/CODE/config.ini index 24130933..be985f35 100644 --- a/CODE/config.ini +++ b/CODE/config.ini @@ -9,8 +9,8 @@ delete_old_logs = false [System Settings] # Do not play with these settings unless you know what you are doing -version = 3.0.0 -files = "browser_miner.ps1, cmd_commands.py, dir_list.py, event_log.py, Logicytics.py, log_miner.py, media_backup.py, netadapter.ps1, packet_sniffer.py, property_scraper.ps1, registry.py, sensitive_data_miner.py, ssh_miner.py, sys_internal.py, tasklist.py, tree.ps1, vulnscan.py, wifi_stealer.py, window_feature_miner.ps1, wmic.py, _debug.py, _dev.py, _extra.py, logicytics\Checks.py, logicytics\Execute.py, logicytics\FileManagement.py, logicytics\Flag.py, logicytics\Get.py, logicytics\Logger.py, logicytics\__init__.py, VulnScan\tools\_test_gpu_acceleration.py, VulnScan\tools\_vectorizer.py, VulnScan\v2-deprecated\_generate_data.py, VulnScan\v2-deprecated\_train.py, VulnScan\v3\_generate_data.py, VulnScan\v3\_train.py" +version = 3.1.0 +files = "browser_miner.ps1, cmd_commands.py, dir_list.py, dump_memory.py, event_log.py, Logicytics.py, log_miner.py, media_backup.py, netadapter.ps1, packet_sniffer.py, property_scraper.ps1, registry.py, sensitive_data_miner.py, ssh_miner.py, sys_internal.py, tasklist.py, tree.ps1, vulnscan.py, wifi_stealer.py, window_feature_miner.ps1, wmic.py, _debug.py, _dev.py, _extra.py, logicytics\Checks.py, logicytics\Execute.py, logicytics\FileManagement.py, logicytics\Flag.py, logicytics\Get.py, logicytics\Logger.py, logicytics\__init__.py, VulnScan\tools\_study_network.py, VulnScan\tools\_test_gpu_acceleration.py, VulnScan\tools\_vectorizer.py, VulnScan\v2-deprecated\_generate_data.py, VulnScan\v2-deprecated\_train.py, VulnScan\v3\_generate_data.py, VulnScan\v3\_train.py" ################################################### # The following settings are for specific modules # diff --git a/README.md b/README.md index d1597acc..a25fa039 100644 --- a/README.md +++ b/README.md @@ -282,26 +282,27 @@ Here are some of the data points that Logicytics extracts: > [!TIP] > You can check out future plans [here](PLANS.md), you can contribute these plans if you have no idea's on what to contribute! 
-| File Name | About | Important Note | -|--------------------------|------------------------------------------------------------------------------------------------------------------------|---------------------------------| -| browser_miner.ps1 | Mines all data related to browsers | Would love to be updated | -| cmd_commands.py | Gets data from driverquery, sysinfo, gpresult and more | | -| log_miner.py | Gets all logs from the Windows device | | -| media_backup.py | Gets all media of the device in a neat folder | Would love to be updated | -| netadapter.ps1 | Runs Get-NetAdapter Command with many flags | | -| property_scraper.ps1 | Gets all the windows properties | | -| registry.py | Backups the registry | | -| sensitive_data_miner.py | Copies all files that can be considered sensitive in a neat folder, , very slow and clunky - useful for depth scanning | | -| ssh_miner.py | Gets as much ssh private data as possible | | -| sys_internal.py | Attempts to use the Sys_Internal Suite from microsoft | | -| tasklist.py | Gets all running tasks, PID and info/data | | -| tree.ps1 | Runs and logs the tree.ps1 command, very slow and clunky - useful for depth scanning | | -| window_feature_miner.ps1 | Logs all the windows features enabled | | -| wmic.py | Logs and runs many wmic commands to gain sensitive data and information | | -| wifi_stealer.py | Gets the SSID and Password of all saved Wi-Fi | | -| dir_list.py | Produces a txt on every single file on the device, very slow and clunky - useful for depth scanning | | -| event_logs.py | Produces a multiple txt files in a folder on many event logs (Security, Applications and System) | | -| vulnscan.py | Uses AI/ML to detect sensitive files, and log their paths | In beta! | +| File Name | About | Important Note | +|--------------------------|------------------------------------------------------------------------------------------------------------------------|--------------------------| +| browser_miner.ps1 | Mines all data related to browsers | | +| cmd_commands.py | Gets data from driverquery, sysinfo, gpresult and more | | +| log_miner.py | Gets all logs from the Windows device | | +| media_backup.py | Gets all media of the device in a neat folder | Would love to be updated | +| netadapter.ps1 | Runs Get-NetAdapter Command with many flags | | +| property_scraper.ps1 | Gets all the windows properties | | +| registry.py | Backups the registry | | +| sensitive_data_miner.py | Copies all files that can be considered sensitive in a neat folder, , very slow and clunky - useful for depth scanning | | +| ssh_miner.py | Gets as much ssh private data as possible | | +| sys_internal.py | Attempts to use the Sys_Internal Suite from microsoft | | +| tasklist.py | Gets all running tasks, PID and info/data | | +| tree.ps1 | Runs and logs the tree.ps1 command, very slow and clunky - useful for depth scanning | | +| window_feature_miner.ps1 | Logs all the windows features enabled | | +| wmic.py | Logs and runs many wmic commands to gain sensitive data and information | | +| wifi_stealer.py | Gets the SSID and Password of all saved Wi-Fi | | +| dir_list.py | Produces a txt on every single file on the device, very slow and clunky - useful for depth scanning | | +| event_logs.py | Produces a multiple txt files in a folder on many event logs (Security, Applications and System) | | +| vulnscan.py | Uses AI/ML to detect sensitive files, and log their paths | In beta! 
| +| dump_memory.py | Dumps some memory as well as log some RAM details | | This is not an exhaustive list, but it should give you a good idea of what data Logicytics is capable of extracting. From 85e633b26afc10bb3c08b4e0e6d6064ea61f25cd Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 14:25:14 +0400 Subject: [PATCH 19/20] Fixed minor bugs CodeRabbit Suggestions --- CODE/VulnScan/v3/_generate_data.py | 2 +- CODE/VulnScan/v3/_train.py | 11 +++++++++++ CODE/config.ini | 4 ++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/CODE/VulnScan/v3/_generate_data.py b/CODE/VulnScan/v3/_generate_data.py index 70f642c9..bfce5446 100644 --- a/CODE/VulnScan/v3/_generate_data.py +++ b/CODE/VulnScan/v3/_generate_data.py @@ -163,7 +163,7 @@ def generate_file_content(extensions: str) -> tuple[str, str]: return generate_content_for_extension(extensions, size) -if __name__ == "__name__": +if __name__ == "__main__": """ Main function to generate files based on the configuration. """ diff --git a/CODE/VulnScan/v3/_train.py b/CODE/VulnScan/v3/_train.py index 55fe845f..485e3d0e 100644 --- a/CODE/VulnScan/v3/_train.py +++ b/CODE/VulnScan/v3/_train.py @@ -411,6 +411,17 @@ def validate_data(): learning_rate=LEARN_RATE, save_model_path=SAVE_PATH, use_cuda=CUDA) + except RuntimeError as e: + if "CUDA" in str(e): + logger.error(f"GPU error: {e}. Falling back to CPU...") + train_model(model_name=MODEL_NAME, + epochs=EPOCHS, + batch_size=BATCH_SIZE, + learning_rate=LEARN_RATE, + save_model_path=SAVE_PATH, + use_cuda=False) + else: + raise except FileNotFoundError as e: logger.error(f"File Not Found Error in training model: {e}") exit(1) diff --git a/CODE/config.ini b/CODE/config.ini index be985f35..65a2e6db 100644 --- a/CODE/config.ini +++ b/CODE/config.ini @@ -96,8 +96,8 @@ save_model_path = PATH # All files be saved here, and can't be changed, PATH is "NN features/" # This is the path to the model, and the vectorizer -model_path = ../Model SenseMini .3n3.pth -vectorizer_path = ../Vectorizer .3n3.pkl +model_path = PATH +vectorizer_path = PATH # Number of features to visualise in the SVG Bar graph, maximum is 3000 due to limitations # Placing -1 will visualise first 3000 features. Bar will be a color gradient heatmap. number_of_features = -1 From 62da7858e50084c73e26ffa6dbf9432bb534bbc3 Mon Sep 17 00:00:00 2001 From: DefinetlyNotAI Date: Wed, 11 Dec 2024 14:46:18 +0400 Subject: [PATCH 20/20] Basic final fixes Minor bug fixes, grammatical fixes and code formatting --- CODE/VulnScan/tools/_study_network.py | 3 +- CODE/VulnScan/tools/_test_gpu_acceleration.py | 3 +- CODE/VulnScan/tools/_vectorizer.py | 3 +- CODE/VulnScan/v2-deprecated/_generate_data.py | 3 +- CODE/VulnScan/v2-deprecated/_train.py | 3 +- CODE/VulnScan/v3/_generate_data.py | 3 +- CODE/VulnScan/v3/_train.py | 39 ++++++---- CODE/dump_memory.py | 73 ++++++++++++++++--- README.md | 42 +++++------ 9 files changed, 120 insertions(+), 52 deletions(-) diff --git a/CODE/VulnScan/tools/_study_network.py b/CODE/VulnScan/tools/_study_network.py index 69f823ab..907c8576 100644 --- a/CODE/VulnScan/tools/_study_network.py +++ b/CODE/VulnScan/tools/_study_network.py @@ -620,4 +620,5 @@ def model_summary(): model_summary() main_plot() else: - raise ImportError("This file cannot be imported") + raise ImportError("This training script is meant to be run directly " + "and cannot be imported. 
Please execute it as a standalone script.") diff --git a/CODE/VulnScan/tools/_test_gpu_acceleration.py b/CODE/VulnScan/tools/_test_gpu_acceleration.py index a7e47134..86397e70 100644 --- a/CODE/VulnScan/tools/_test_gpu_acceleration.py +++ b/CODE/VulnScan/tools/_test_gpu_acceleration.py @@ -24,4 +24,5 @@ def check_gpu(): if __name__ == '__main__': check_gpu() else: - raise ImportError("This file cannot be imported") + raise ImportError("This training script is meant to be run directly " + "and cannot be imported. Please execute it as a standalone script.") diff --git a/CODE/VulnScan/tools/_vectorizer.py b/CODE/VulnScan/tools/_vectorizer.py index 5d316de9..25e57272 100644 --- a/CODE/VulnScan/tools/_vectorizer.py +++ b/CODE/VulnScan/tools/_vectorizer.py @@ -81,4 +81,5 @@ def main(data_paths: str, vectorizer_types: str, output_paths: str): os.makedirs(output_path) main(data_path, vectorizer_type, output_path) else: - raise ImportError("This file cannot be imported") + raise ImportError("This training script is meant to be run directly " + "and cannot be imported. Please execute it as a standalone script.") diff --git a/CODE/VulnScan/v2-deprecated/_generate_data.py b/CODE/VulnScan/v2-deprecated/_generate_data.py index 62fc91e5..778c1c26 100644 --- a/CODE/VulnScan/v2-deprecated/_generate_data.py +++ b/CODE/VulnScan/v2-deprecated/_generate_data.py @@ -106,4 +106,5 @@ def create_random_files(directories: str, num_file: int = 100): if __name__ == "__main__": create_random_files(SAVE_DIRECTORY, num_file=1000000) else: - raise ImportError("This file cannot be imported") + raise ImportError("This training script is meant to be run directly " + "and cannot be imported. Please execute it as a standalone script.") diff --git a/CODE/VulnScan/v2-deprecated/_train.py b/CODE/VulnScan/v2-deprecated/_train.py index 7940b1c3..5daa78fd 100644 --- a/CODE/VulnScan/v2-deprecated/_train.py +++ b/CODE/VulnScan/v2-deprecated/_train.py @@ -545,4 +545,5 @@ def train_rfc(SAVE_DIR: str, EPOCHS: int, TEST_SIZE: float | int, BATCH_SIZE=8, LEARNING_RATE=5e-5, MAX_FEATURES=5000, MAX_LEN=128, TEST_SIZE=0.2, RANDOM_STATE=42, MODEL_PATH_BERT="../bert-base-uncased-model") else: - raise ImportError("This file cannot be imported") + raise ImportError("This training script is meant to be run directly " + "and cannot be imported. Please execute it as a standalone script.") diff --git a/CODE/VulnScan/v3/_generate_data.py b/CODE/VulnScan/v3/_generate_data.py index bfce5446..0bc8dd3b 100644 --- a/CODE/VulnScan/v3/_generate_data.py +++ b/CODE/VulnScan/v3/_generate_data.py @@ -220,4 +220,5 @@ def generate_file_content(extensions: str) -> tuple[str, str]: logger.info(f"Generated {FILE_NUM} files in {SAVE_PATH}") else: - raise ImportError("This file cannot be imported") + raise ImportError("This training script is meant to be run directly " + "and cannot be imported. Please execute it as a standalone script.") diff --git a/CODE/VulnScan/v3/_train.py b/CODE/VulnScan/v3/_train.py index 485e3d0e..f9bfb2a4 100644 --- a/CODE/VulnScan/v3/_train.py +++ b/CODE/VulnScan/v3/_train.py @@ -348,25 +348,28 @@ def validate_data(): """ Validates the data by checking if the variables are of the correct type. 
""" - if not isinstance(EPOCHS, int) and EPOCHS > 0: - logger.error("EPOCHS must be an integer") + if not isinstance(EPOCHS, int) or EPOCHS <= 0: + logger.error("EPOCHS must be a positive integer") exit(1) - if not isinstance(BATCH_SIZE, int) and BATCH_SIZE > 0: - logger.error("BATCH_SIZE must be an integer") + if not isinstance(BATCH_SIZE, int) or BATCH_SIZE <= 0: + logger.error("BATCH_SIZE must be a positive integer") exit(1) - if not isinstance(LEARN_RATE, float) and 0 < LEARN_RATE < 1: - logger.error("LEARN_RATE must be a float") + if not isinstance(LEARN_RATE, float) or not (0 < LEARN_RATE < 1): + logger.error("LEARN_RATE must be a float between 0 and 1") exit(1) if not isinstance(CUDA, bool): logger.error("CUDA must be a boolean") exit(1) - allowed_models = ["NeuralNetwork", "LogReg", - "RandomForest", "ExtraTrees", "GBM", - "XGBoost", "DecisionTree", "NaiveBayes"] + + allowed_models = ["NeuralNetwork", "LogReg", "RandomForest", "ExtraTrees", "GBM", "XGBoost", "DecisionTree", "NaiveBayes"] if MODEL_NAME not in allowed_models: - logger.error('MODEL_NAME must be one of the following: ' - '"NeuralNetwork", "LogReg", "RandomForest", ' - '"ExtraTrees", "GBM","XGBoost", "DecisionTree", "NaiveBayes"') + logger.error(f"MODEL_NAME must be one of: {', '.join(allowed_models)}") + exit(1) + if not os.path.exists(TRAINING_PATH): + logger.error(f"Training data path {TRAINING_PATH} does not exist") + exit(1) + if not os.path.exists(os.path.dirname(SAVE_PATH)): + logger.error(f"Save model path {SAVE_PATH} does not exist") exit(1) @@ -421,15 +424,19 @@ def validate_data(): save_model_path=SAVE_PATH, use_cuda=False) else: - raise + logger.error(f"Runtime Error in training model: {e}") + exit(1) except FileNotFoundError as e: - logger.error(f"File Not Found Error in training model: {e}") + logger.error(f"Training data or model files not found: {e}." + f" Please check if all required files exist.") exit(1) except AttributeError as e: - logger.error(f"Attribute Error in training model: {e}") + logger.error(f"Invalid model configuration or missing attributes: {e}." + f" Please verify model settings.") exit(1) except Exception as e: logger.error(f"Error in training model: {e}") exit(1) else: - raise ImportError("This file cannot be imported") + raise ImportError("This training script is meant to be run directly " + "and cannot be imported. Please execute it as a standalone script.") diff --git a/CODE/dump_memory.py b/CODE/dump_memory.py index db2ab7db..927c40dd 100644 --- a/CODE/dump_memory.py +++ b/CODE/dump_memory.py @@ -7,13 +7,25 @@ if __name__ == "__main__": log = Log({"log_level": DEBUG}) + # Constants + PROCESS_QUERY_INFORMATION = 0x0400 + PROCESS_VM_READ = 0x0010 + MEM_COMMIT = 0x1000 + PAGE_READWRITE = 0x04 # Function to save RAM content snapshot to a file +@log.function def dump_ram_content(): + """ + Capture the current state of the system's RAM and write it to a file. + + This function gathers memory statistics, system-specific details, and writes + the information to a file named 'Ram_Snapshot.txt'. + """ try: # Generate a timestamp for the file - dump_file = f"Ram_Snapshot.txt" + dump_file = "Ram_Snapshot.txt" # Gather memory statistics using psutil memory_info = psutil.virtual_memory() @@ -56,6 +68,23 @@ def dump_ram_content(): # Define structures for SystemInfo class SystemInfo(ctypes.Structure): + # noinspection PyUnresolvedReferences + """ + A ctypes Structure to hold system information. + + Attributes: + wProcessorArchitecture (ctypes.c_ushort): Processor architecture. 
+        wReserved (ctypes.c_ushort): Reserved.
+        dwPageSize (ctypes.c_ulong): Page size.
+        lpMinimumApplicationAddress (ctypes.c_void_p): Minimum application address.
+        lpMaximumApplicationAddress (ctypes.c_void_p): Maximum application address.
+        dwActiveProcessorMask (ctypes.POINTER(ctypes.c_ulong)): Active processor mask.
+        dwNumberOfProcessors (ctypes.c_ulong): Number of processors.
+        dwProcessorType (ctypes.c_ulong): Processor type.
+        dwAllocationGranularity (ctypes.c_ulong): Allocation granularity.
+        wProcessorLevel (ctypes.c_ushort): Processor level.
+        wProcessorRevision (ctypes.c_ushort): Processor revision.
+    """
     _fields_ = [
         ("wProcessorArchitecture", ctypes.c_ushort),
         ("wReserved", ctypes.c_ushort),
@@ -73,6 +102,19 @@ class SystemInfo(ctypes.Structure):

 # Define BasicMemInfo
 class BasicMemInfo(ctypes.Structure):
+    # noinspection PyUnresolvedReferences
+    """
+    A ctypes Structure to hold basic memory information.
+
+    Attributes:
+        BaseAddress (ctypes.c_void_p): Base address.
+        AllocationBase (ctypes.c_void_p): Allocation base.
+        AllocationProtect (ctypes.c_ulong): Allocation protection.
+        RegionSize (ctypes.c_size_t): Region size.
+        State (ctypes.c_ulong): State.
+        Protect (ctypes.c_ulong): Protection.
+        Type (ctypes.c_ulong): Type.
+    """
     _fields_ = [
         ("BaseAddress", ctypes.c_void_p),
         ("AllocationBase", ctypes.c_void_p),
@@ -84,13 +126,28 @@ class BasicMemInfo(ctypes.Structure):
     ]


+@log.function
 def get_system_info() -> SystemInfo:
+    """
+    Retrieve and return system information using the `GetSystemInfo` function from the Windows API.
+
+    Returns:
+        SystemInfo: A `SystemInfo` structure containing details about the system's architecture,
+                    processor, memory, and other attributes.
+    """
     system_info = SystemInfo()
     ctypes.windll.kernel32.GetSystemInfo(ctypes.byref(system_info))
     return system_info


+@log.function
 def read_memory():
+    """
+    Read the memory of the current process and write the content to a file.
+
+    This function opens the current process with the necessary permissions,
+    retrieves system information, and iterates through memory pages to read their contents.
+    """
     # Open current process with permissions
     process = ctypes.windll.kernel32.OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, False, os.getpid())
     if not process:
@@ -153,11 +210,9 @@ def read_memory():


 if __name__ == "__main__":
-    # Constants
-    PROCESS_QUERY_INFORMATION = 0x0400
-    PROCESS_VM_READ = 0x0010
-    MEM_COMMIT = 0x1000
-    PAGE_READWRITE = 0x04
-
-    dump_ram_content()
-    read_memory()
+    try:
+        log.info("Starting memory dump process...")
+        dump_ram_content()
+        read_memory()
+    except Exception as err:
+        log.error(f"Error during memory dump: {err}")
diff --git a/README.md b/README.md
index a25fa039..9d6f495e 100644
--- a/README.md
+++ b/README.md
@@ -282,27 +282,27 @@ Here are some of the data points that Logicytics extracts:

 > [!TIP]
 > You can check out future plans [here](PLANS.md), you can contribute these plans if you have no idea's on what to contribute!
-| File Name                | About                                                                                                                    | Important Note           |
-|--------------------------|--------------------------------------------------------------------------------------------------------------------------|--------------------------|
-| browser_miner.ps1        | Mines all data related to browsers                                                                                       |                          |
-| cmd_commands.py          | Gets data from driverquery, sysinfo, gpresult and more                                                                   |                          |
-| log_miner.py             | Gets all logs from the Windows device                                                                                    |                          |
-| media_backup.py          | Gets all media of the device in a neat folder                                                                            | Would love to be updated |
-| netadapter.ps1           | Runs Get-NetAdapter Command with many flags                                                                              |                          |
-| property_scraper.ps1     | Gets all the windows properties                                                                                          |                          |
-| registry.py              | Backups the registry                                                                                                     |                          |
-| sensitive_data_miner.py  | Copies all files that can be considered sensitive in a neat folder, , very slow and clunky - useful for depth scanning   |                          |
-| ssh_miner.py             | Gets as much ssh private data as possible                                                                                |                          |
-| sys_internal.py          | Attempts to use the Sys_Internal Suite from microsoft                                                                    |                          |
-| tasklist.py              | Gets all running tasks, PID and info/data                                                                                |                          |
-| tree.ps1                 | Runs and logs the tree.ps1 command, very slow and clunky - useful for depth scanning                                     |                          |
-| window_feature_miner.ps1 | Logs all the windows features enabled                                                                                    |                          |
-| wmic.py                  | Logs and runs many wmic commands to gain sensitive data and information                                                  |                          |
-| wifi_stealer.py          | Gets the SSID and Password of all saved Wi-Fi                                                                            |                          |
-| dir_list.py              | Produces a txt on every single file on the device, very slow and clunky - useful for depth scanning                      |                          |
-| event_logs.py            | Produces a multiple txt files in a folder on many event logs (Security, Applications and System)                         |                          |
-| vulnscan.py              | Uses AI/ML to detect sensitive files, and log their paths                                                                | In beta!                 |
-| dump_memory.py           | Dumps some memory as well as log some RAM details                                                                        |                          |
+| File Name                | About                                                                                                                  | Important Note           |
+|--------------------------|------------------------------------------------------------------------------------------------------------------------|--------------------------|
+| browser_miner.ps1        | Mines all data related to browsers                                                                                     |                          |
+| cmd_commands.py          | Gets data from driverquery, sysinfo, gpresult and more                                                                 |                          |
+| log_miner.py             | Gets all logs from the Windows device                                                                                  |                          |
+| media_backup.py          | Gets all media of the device in a neat folder                                                                          | Would love to be updated |
+| netadapter.ps1           | Runs Get-NetAdapter Command with many flags                                                                            |                          |
+| property_scraper.ps1     | Gets all the Windows properties                                                                                        |                          |
+| registry.py              | Backs up the registry                                                                                                  |                          |
+| sensitive_data_miner.py  | Copies all files that can be considered sensitive in a neat folder, very slow and clunky - useful for depth scanning   |                          |
+| ssh_miner.py             | Gets as much SSH private data as possible                                                                              |                          |
+| sys_internal.py          | Attempts to use the Sys_Internal Suite from Microsoft                                                                  |                          |
+| tasklist.py              | Gets all running tasks, PID and info/data                                                                              |                          |
+| tree.ps1                 | Runs and logs the tree.ps1 command, very slow and clunky - useful for depth scanning                                   |                          |
+| window_feature_miner.ps1 | Logs all the Windows features enabled                                                                                  |                          |
+| wmic.py                  | Logs and runs many wmic commands to gain sensitive data and information                                                |                          |
+| wifi_stealer.py          | Gets the SSID and Password of all saved Wi-Fi                                                                          |                          |
+| dir_list.py              | Produces a txt on every single file on the device, very slow and clunky - useful for depth scanning                    |                          |
+| event_logs.py            | Produces multiple txt files in a folder on many event logs (Security, Applications and System)                         |                          |
+| vulnscan.py              | Uses AI/ML to detect sensitive files, and logs their paths                                                             | In beta!                 |
+| dump_memory.py           | Dumps some memory as well as logs some RAM details                                                                     |                          |

 This is not an exhaustive list, but it should give you a good idea of what data Logicytics is capable of extracting.