# data_split.py
import numpy as np
import random


def balance_data(df_ID_0, df_ID_1, final_data):
    """
    Input:
    - df_ID_0: dataframe with columns ["ID", "OBJECT_TYPE"], only rows with OBJECT_TYPE == 0
    - df_ID_1: dataframe with columns ["ID", "OBJECT_TYPE"], only rows with OBJECT_TYPE == 1
    - final_data: difference (diff), search (srch) and template (temp) stamps
      concatenated horizontally for the same ID, shape (count of IDs, rows, cols*3)
    Returns:
    - indexes: indexes of the images that together make a balanced data set
    - equal_type_data: the rows of final_data that make up the balanced data set
    """
    # -- take the size of the smaller class as the base length for reals and bogus
    len_each_set = min(len(df_ID_0), len(df_ID_1))
    # -- shift 10 images from the smaller class to the larger one so the two sets
    # -- can be told apart later; clamp the shift so sample() never asks a class
    # -- for more rows than it actually has
    offset = min(10, len_each_set, max(len(df_ID_0), len(df_ID_1)) - len_each_set)
    if len_each_set != 0:
        if len(df_ID_0) <= len(df_ID_1):
            # -- randomly draw the rows classified as 0
            index_data_ID0 = df_ID_0.sample(len_each_set - offset, random_state=2).sort_index()
            # -- randomly draw the rows classified as 1
            index_data_ID1 = df_ID_1.sample(len_each_set + offset, random_state=2).sort_index()
        else:
            index_data_ID0 = df_ID_0.sample(len_each_set + offset, random_state=2).sort_index()
            index_data_ID1 = df_ID_1.sample(len_each_set - offset, random_state=2).sort_index()
    if len(df_ID_0) == 0:
        # -- no type-0 rows at all: keep every bogus row
        index_data_ID1 = df_ID_1.sort_index()
        index_data_ID0 = df_ID_0
        print("Only bogus")
    if len(df_ID_1) == 0:
        # -- no type-1 rows at all: keep every real row
        index_data_ID0 = df_ID_0.sort_index()
        index_data_ID1 = df_ID_1
        print("Only reals")
    # -- convert the pandas indexes to numpy arrays to iterate over them
    index_ID0 = index_data_ID0.index.to_numpy()
    index_ID1 = index_data_ID1.index.to_numpy()
    # -- concatenate both index arrays and sort them
    indexes = sorted(np.concatenate((index_ID0, index_ID1)))
    equal_type_data = np.array([final_data[i] for i in indexes])
    return indexes, equal_type_data

def split_data(equal_type_data, final_data, indexes):
    """
    Input:
    - equal_type_data: data from final_data that makes up the balanced data set
      (already balanced by balance_data)
    - final_data: difference (diff), search (srch) and template (temp) stamps
      concatenated horizontally for the same ID, shape (count of IDs, rows, cols*3)
    - indexes: indexes of the images that together make the balanced data set
    Returns:
    - train: array with the training data set -> 75%
    - test: array with the test data set -> 25%
    - random_index: random positions drawn across the whole set, so the split
      does not take data from only one of the folders
    """
    # -- 75% for training; the remaining 25% is left for testing
    train_len = int(equal_type_data.shape[0] * 0.75)
    # -- draw random positions, then map them back to the actual data indexes
    random.seed(4)
    random_index = random.sample(range(equal_type_data.shape[0]), train_len)
    train_indexes = [indexes[i] for i in sorted(random_index)]
    train_index_set = set(train_indexes)  # precompute for fast membership tests
    train = np.array([final_data[i] for i in train_indexes])
    test = np.array([final_data[i] for i in indexes if i not in train_index_set])
    return train, test, random_index
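

# -- A minimal usage sketch under synthetic inputs: the catalog `df` and the
# -- stamp cube `final_data` below are hypothetical stand-ins for the real
# -- files this module is normally fed, chosen only to exercise the pipeline.
if __name__ == "__main__":
    import pandas as pd

    # -- hypothetical catalog: 50 type-0 and 80 type-1 detections
    df = pd.DataFrame({"ID": range(130), "OBJECT_TYPE": [0] * 50 + [1] * 80})
    df_ID_0 = df[df["OBJECT_TYPE"] == 0]
    df_ID_1 = df[df["OBJECT_TYPE"] == 1]
    # -- hypothetical stamp cube with shape (count of IDs, rows, cols*3)
    final_data = np.random.rand(130, 21, 63)

    indexes, equal_type_data = balance_data(df_ID_0, df_ID_1, final_data)
    train, test, random_index = split_data(equal_type_data, final_data, indexes)
    print(train.shape, test.shape)  # roughly a 75% / 25% split of the balanced set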