-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdata_preprocessing.py
43 lines (34 loc) · 1.28 KB
/
data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import numpy as np
import pandas as pd
import os
def normalize(dataset_train_features, dataset_test_features):
    """Min-max scale the train and test features using one shared range.

    The minimum and maximum are taken over every value of both inputs
    combined (a single global range, not per feature or per channel), so
    the two outputs are scaled consistently with each other.

    Args:
        dataset_train_features: Array-like training features.
        dataset_test_features: Array-like test features. Must be
            concatenable with the training features along axis 0.

    Returns:
        A ``(train, test)`` tuple holding the rescaled arrays, each with
        the same shape as the corresponding input. If every value in both
        inputs is identical, both outputs are all zeros.
    """
    combined = np.concatenate(
        (dataset_train_features, dataset_test_features), axis=0
    )
    min_value = np.amin(combined)
    max_value = np.amax(combined)

    value_range = max_value - min_value
    if value_range == 0:
        # Constant data: dividing by a zero range would turn every output
        # into NaN. Dividing by one instead maps all values to zero.
        value_range = 1

    scaled_train = (dataset_train_features - min_value) / value_range
    scaled_test = (dataset_test_features - min_value) / value_range
    return scaled_train, scaled_test
def get_dataset_in_np(path, labels_available=False, image_shape=(75, 75)):
    """Load a two-band image dataset from a JSON file into NumPy arrays.

    The JSON is read with ``pandas.read_json`` and must provide ``band_1``
    and ``band_2`` columns, each holding one flattened image per record.
    When ``labels_available`` is true, an ``is_iceberg`` column holding the
    integer class index (0 or 1) is required as well.

    Args:
        path: Location of the JSON file, passed to ``pandas.read_json``.
        labels_available: If true, also build and return one-hot labels.
        image_shape: ``(height, width)`` of every band image. The default
            matches the 75x75 size this loader was written for.

    Returns:
        ``features`` of shape ``(n_samples, height, width, 2)`` where the
        last axis is ``(band_1, band_2)``. If ``labels_available`` is
        true, returns ``(features, labels)`` instead, with ``labels`` a
        float array of shape ``(n_samples, 2)`` that is one-hot encoded.

    Raises:
        ValueError: If a band cannot be reshaped to ``image_shape``.
    """
    data = pd.read_json(path)

    # Stack the two bands along a new LAST axis so that
    # features[n, row, col] == (band_1 pixel, band_2 pixel). Reshaping a
    # (2, H, W) array to (H, W, 2) would not move the axes; it would only
    # reinterpret the flat buffer and scramble the pixels.
    dataset_features = np.array([
        np.stack(
            [np.reshape(band_1, image_shape), np.reshape(band_2, image_shape)],
            axis=-1,
        )
        for band_1, band_2 in zip(data.band_1.values, data.band_2.values)
    ])

    if not labels_available:
        return dataset_features

    class_indices = data.is_iceberg.values
    # One-hot encode: put a 1 in the column named by each class index.
    dataset_labels = np.zeros((class_indices.shape[0], 2))
    dataset_labels[np.arange(class_indices.shape[0]), class_indices] = 1
    return dataset_features, dataset_labels