# data_split.py
import numpy as np
import random


def balance_data(df_ID_0, df_ID_1, final_data):
    """
    Input:
    - df_ID_0: dataframe with columns ["ID", "OBJECT_TYPE"], only rows with OBJECT_TYPE == 0
    - df_ID_1: dataframe with columns ["ID", "OBJECT_TYPE"], only rows with OBJECT_TYPE == 1
    - final_data: difference (diff), search (srch) and template (temp) stamps
      concatenated horizontally for the same ID, shape (count of IDs, rows, cols*3)
    Returns:
    - indexes: indexes of the images that together make a balanced data set
    - equal_type_data: the rows of final_data that make up the balanced data set
    """
    # -- take the size of the smaller class as the base length for reals and bogus
    len_each_set = min(len(df_ID_0), len(df_ID_1))
    # -- shift 10 images from the smaller class to the larger one so the two sets
    # -- can be told apart later; clamp the shift so sample() never asks a class
    # -- for more rows than it actually has
    offset = min(10, len_each_set, max(len(df_ID_0), len(df_ID_1)) - len_each_set)
    if len_each_set != 0:
        if len(df_ID_0) <= len(df_ID_1):
            # -- randomly draw the rows classified as 0
            index_data_ID0 = df_ID_0.sample(len_each_set - offset, random_state=2).sort_index()
            # -- randomly draw the rows classified as 1
            index_data_ID1 = df_ID_1.sample(len_each_set + offset, random_state=2).sort_index()
        else:
            index_data_ID0 = df_ID_0.sample(len_each_set + offset, random_state=2).sort_index()
            index_data_ID1 = df_ID_1.sample(len_each_set - offset, random_state=2).sort_index()
    if len(df_ID_0) == 0:
        # -- no type-0 rows at all: keep every bogus row
        index_data_ID1 = df_ID_1.sort_index()
        index_data_ID0 = df_ID_0
        print("Only bogus")
    if len(df_ID_1) == 0:
        # -- no type-1 rows at all: keep every real row
        index_data_ID0 = df_ID_0.sort_index()
        index_data_ID1 = df_ID_1
        print("Only reals")
    # -- convert the pandas indexes to numpy arrays to iterate over them
    index_ID0 = index_data_ID0.index.to_numpy()
    index_ID1 = index_data_ID1.index.to_numpy()
    # -- concatenate both index arrays and sort them
    indexes = sorted(np.concatenate((index_ID0, index_ID1)))
    equal_type_data = np.array([final_data[i] for i in indexes])
    return indexes, equal_type_data

def split_data(equal_type_data, final_data, indexes):
    """
    Input:
    - equal_type_data: data from final_data that makes up the balanced data set
      (already balanced by balance_data)
    - final_data: difference (diff), search (srch) and template (temp) stamps
      concatenated horizontally for the same ID, shape (count of IDs, rows, cols*3)
    - indexes: indexes of the images that together make the balanced data set
    Returns:
    - train: array with the training data set -> 75%
    - test: array with the test data set -> 25%
    - random_index: random positions drawn across the whole set, so the split
      does not take data from only one of the folders
    """
    # -- 75% for training; the remaining 25% is left for testing
    train_len = int(equal_type_data.shape[0] * 0.75)
    # -- draw random positions, then map them back to the actual data indexes
    random.seed(4)
    random_index = random.sample(range(equal_type_data.shape[0]), train_len)
    train_indexes = [indexes[i] for i in sorted(random_index)]
    train_index_set = set(train_indexes)  # precompute for fast membership tests
    train = np.array([final_data[i] for i in train_indexes])
    test = np.array([final_data[i] for i in indexes if i not in train_index_set])
    return train, test, random_index
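

# -- A minimal usage sketch under synthetic inputs: the catalog `df` and the
# -- stamp cube `final_data` below are hypothetical stand-ins for the real
# -- files this module is normally fed, chosen only to exercise the pipeline.
if __name__ == "__main__":
    import pandas as pd

    # -- hypothetical catalog: 50 type-0 and 80 type-1 detections
    df = pd.DataFrame({"ID": range(130), "OBJECT_TYPE": [0] * 50 + [1] * 80})
    df_ID_0 = df[df["OBJECT_TYPE"] == 0]
    df_ID_1 = df[df["OBJECT_TYPE"] == 1]
    # -- hypothetical stamp cube with shape (count of IDs, rows, cols*3)
    final_data = np.random.rand(130, 21, 63)

    indexes, equal_type_data = balance_data(df_ID_0, df_ID_1, final_data)
    train, test, random_index = split_data(equal_type_data, final_data, indexes)
    print(train.shape, test.shape)  # roughly a 75% / 25% split of the balanced set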