# opcode_sequence_preprocessing.py

import os
from typing import Tuple
from tqdm import tqdm
from utils import utils
import numpy as np


def generate_features_and_labels(path_to_set: str, label_map: dict[str, int]) -> Tuple[np.ndarray, np.ndarray]:
    """
    Generate a feature matrix and label array from a dataset organized by class folders.

    Each class named in ``label_map`` is looked up as a subfolder of ``path_to_set``.
    Every entry in that subfolder is read as one comma-separated opcode sequence.
    Sequences shorter than the longest one are right-padded with the padding token
    ``"??"`` so that all rows have the same length.

    Class folders named in ``label_map`` that do not exist under ``path_to_set`` are
    skipped. Files inside a class folder are processed in sorted name order so that
    the row order of the result is reproducible.

    Args:
        path_to_set (str): Path to the dataset folder where each subfolder represents a class.
        label_map (dict[str, int]): A mapping of class (folder) names to integer labels.

    Returns:
        Tuple[np.ndarray, np.ndarray]:
            - Feature matrix (2D array), where each row is a padded opcode sequence.
            - Label array (1D array), containing the label of each sequence.
            When no sequence is found the arrays have shapes ``(0, 0)`` and ``(0,)``.

    Raises:
        FileNotFoundError: If ``path_to_set`` is not an existing directory.
        ValueError: If a file contains no opcodes (empty or whitespace only).
    """
    # A missing dataset root is never intentional; fail loudly instead of
    # silently returning empty arrays.
    if not os.path.isdir(path_to_set):
        raise FileNotFoundError(f"Dataset directory not found: {path_to_set}")

    features = []
    labels = []
    padding_token = "??"

    # First pass: read every sequence and remember its label.
    for class_name, label in label_map.items():
        class_path = os.path.join(path_to_set, class_name)
        if not os.path.isdir(class_path):
            # Classes without a folder in this set are skipped.
            continue

        # os.listdir returns entries in arbitrary order; sort for reproducibility.
        for file_name in tqdm(sorted(os.listdir(class_path)), desc=f"Processing {class_name}"):
            file_path = os.path.join(class_path, file_name)
            # Explicit encoding keeps parsing independent of the platform locale.
            with open(file_path, 'r', encoding="utf-8") as f:
                content = f.read().strip()

            # "".split(',') would yield [''], i.e. a bogus one-opcode sequence.
            if not content:
                raise ValueError(f"No opcodes found in file: {file_path}")

            features.append(content.split(','))
            labels.append(label)

    # Keep the documented dimensionality even when nothing was loaded
    # (np.array([]) would be a 1-D float array).
    if not features:
        return np.empty((0, 0), dtype=str), np.empty((0,), dtype=int)

    # Second pass: pad sequences to the maximum length.
    max_length = max(len(sequence) for sequence in features)
    for sequence in features:
        sequence.extend([padding_token] * (max_length - len(sequence)))

    return np.array(features), np.array(labels)

def main():
    """
    Script entry point: prepare opcode/index lookup tables for the dataset.

    Reads the dataset location from the project configuration
    (``config["data"]["dataset_opcode_codesection"]``), asks ``utils`` for the
    set of opcodes found there, and derives a forward and an inverse mapping
    from it. Nothing is returned; both mappings are local to this function.
    """
    # Step 0: Resolve where the opcode text dataset lives.
    settings = utils.load_config()
    dataset_dir = settings["data"]["dataset_opcode_codesection"]

    # Step 1: Collect the opcodes that occur anywhere in the dataset.
    unique_opcodes = utils.extract_unique_opcodes_from_dataset(dataset_dir)

    # Step 2: Forward mapping (opcode -> index). The True flag presumably asks
    # the helper to include the padding opcode -- confirm against utils.
    forward_mapping = utils.build_global_opcode_dictionary(unique_opcodes, True)

    # Inverse mapping (index -> opcode), built by swapping each pair.
    inverse_mapping = dict(
        (index, opcode) for opcode, index in forward_mapping.items()
    )
    

if __name__ == "__main__":
    main()