-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocessdata.py
92 lines (78 loc) · 2.75 KB
/
processdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import numpy as np
import pandas as pd
import math
import importcsv as ic
from sklearn.model_selection import train_test_split
def cleanData(data):
    """Report missing values, then drop sparse columns and incomplete rows.

    Prints the per-column NaN count (useful for judging whether imputation
    is worthwhile), removes the 'ca', 'thal' and 'slope' columns, and then
    discards every row that still contains a NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the 'ca', 'thal' and 'slope' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the argument itself is not modified.
    """
    missing_per_column = data.isna().sum()
    print('number of nans\n', missing_per_column)

    trimmed = data.drop(columns=['ca', 'thal', 'slope'])
    return trimmed.dropna()
def callMode(df):
    """Return the most frequent value of *df*.

    When several values tie, the first entry of ``df.mode()`` is returned.
    If there is no mode at all (the input is empty or holds only NaN),
    NaN is returned instead of raising ``KeyError``, matching what
    ``callMean`` and ``callMedian`` yield for such input.

    Parameters
    ----------
    df : pandas.Series
        Column to summarise.
    """
    modes = df.mode()
    if modes.empty:
        # No non-missing values to vote on: report "no fill value".
        return np.nan
    return modes[0]
def callMean(df):
    """Return the mean of *df*, as computed by its ``mean()`` method."""
    average = df.mean()
    return average
def callMedian(df):
    """Return the median of *df*, as computed by its ``median()`` method."""
    middle = df.median()
    return middle
def callNone(df):
    """Return the placeholder -1; the argument is ignored.

    Exists so that the 'none' fill method has a callable with the same
    signature as the other fill helpers.
    """
    placeholder = -1
    return placeholder
def fillData(data_dict, fill_method = 'none', exclude_col = True):
    """Clean every data set in *data_dict* and stack them into one frame.

    Parameters
    ----------
    data_dict : dict
        Mapping whose values are pandas DataFrames; the keys are not used.
        The input frames are never modified.
    fill_method : {'mode', 'mean', 'median', 'none'}
        'mode' / 'mean' / 'median' replace the NaNs of every column except
        the last one with that statistic, computed per data set and per
        column. The last column is left untouched
        (NOTE(review): presumably the target label -- confirm).
        'none' performs no imputation and instead drops every row that
        contains a NaN in any column.
    exclude_col : bool
        When true, the 'ca', 'thal' and 'slope' columns are removed from
        each data set before cleaning.

    Returns
    -------
    pandas.DataFrame
        The cleaned data sets stacked row-wise. As soon as a second frame
        is stacked onto a non-empty result, the index is renumbered from 0.

    Raises
    ------
    ValueError
        If *fill_method* is not one of the supported names.
    """
    # Allows main code to choose from the four methods of cleaning data
    fillers = {
        'mode': callMode,
        'mean': callMean,
        'median': callMedian,
        'none': callNone,
    }
    if fill_method not in fillers:
        raise ValueError(
            "fill_method must be one of %s, got %r"
            % (sorted(fillers), fill_method)
        )
    compute_fill = fillers[fill_method]

    combined = pd.DataFrame()
    for frame in data_dict.values():
        if exclude_col:
            # drop() already returns a new frame.
            frame = frame.drop(['ca', 'thal', 'slope'], axis=1)
        else:
            # Work on a copy so the caller's data is not imputed in place.
            frame = frame.copy()

        if fill_method == 'none':
            # No imputation requested: discard incomplete rows once.
            frame = frame.dropna()
        else:
            # Impute each column except the last. Plain assignment is used
            # because fillna(inplace=True) on a column selection does not
            # update the frame under pandas Copy-on-Write.
            for column in frame.columns[:-1]:
                fill_value = compute_fill(frame[column])
                frame[column] = frame[column].fillna(fill_value)

        # Combine all data sets into one common frame for analysis.
        # pd.concat replaces DataFrame.append, removed in pandas 2.0.
        if combined.empty:
            combined = frame
        else:
            combined = pd.concat([combined, frame], ignore_index=True)
    return combined
# Builds the test/training arrays from a cleaned data set.
def createTrainingSet(data):
    """Split *data* into stratified test and training arrays.

    Every column except the last is treated as a feature. The last column
    is binarised with ``isPositive`` (1 for values > 0, otherwise 0) and
    used as the label. ``3 * n_rows // 10`` rows go to the test set; the
    split is stratified on the binary label and uses ``random_state=42``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``.values`` array holds features followed by the label.

    Returns
    -------
    tuple
        ``(testX, testY, trainX, trainY)`` -- note that the test arrays
        come first.
    """
    matrix = data.values
    features = matrix[:, :-1]
    raw_labels = matrix[:, -1]
    labels = np.array([isPositive(value) for value in raw_labels])

    test_count = 3 * features.shape[0] // 10
    trainX, testX, trainY, testY = train_test_split(
        features,
        labels,
        test_size=test_count,
        random_state=42,
        stratify=labels,
    )
    return testX, testY, trainX, trainY
def isPositive(x):
    """Return 1 when *x* is strictly greater than zero, otherwise 0."""
    return 1 if x > 0 else 0
if __name__ == '__main__':
    # Script entry point: load the data sets through importcsv, then
    # combine them using median imputation.
    datadict = ic.separateImport()
    data = fillData(data_dict=datadict, fill_method='median')