#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 16 11:51:16 2018
@author: dawnstear
"""
import os
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # uncomment to silence TF's C++ log output
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20
from sklearn.utils import shuffle
from matplotlib import pyplot
tf.logging.set_verbosity(tf.logging.DEBUG)
# In order of ascending severity, they are DEBUG, INFO, WARN, ERROR, and FATAL
# `data` is assumed to be a pandas DataFrame loaded upstream: one row per cell,
# one column per gene, plus 'Labels' and 'TYPE' annotation columns.
cellcount, genecount = np.shape(data)
data_shuffled = shuffle(data)
X_ = data_shuffled.drop(['Labels', 'TYPE'], axis=1)  # drop both annotation columns (two separate assignments would keep 'Labels')
y_ = data_shuffled['Labels']  # X_ and y_ are used because X and y name the TF placeholders below
X_train, X_test, y_train, y_test = train_test_split(X_.values, y_.values, test_size=0.2)
# pass X_.values so the split yields numpy.ndarrays, not DataFrames
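# sparse_softmax_cross_entropy_with_logits (used below) expects integer class ids
# in [0, num_classes). A minimal sketch with sklearn's LabelEncoder, assuming the
# 'Labels' column may hold cell-type strings (the encoder is not in the original):
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)  # e.g. 'T-cell' -> 0, 'B-cell' -> 1, ...
y_test = label_encoder.transform(y_test)        # reuse the same mapping on the test split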
# Define number of neurons per layer and batch size
batch_size = 30
num_input = X_train.shape[1]  # number of gene (feature) columns; gene order must be preserved when feeding data
n_hidden_1 = 512
n_hidden_2 = 256
n_hidden_3 = 64
n_hidden_4 = 256  # defined but unused: the network below has three hidden layers
n_hidden_5 = 128  # defined but unused
num_classes = len(np.unique(y_))  # number of distinct cell types (labels) in the dataset
# Create placeholders for X and y. The batch dimension is None (not -1) so that
# any batch size can be fed: training minibatches or the full test set.
X = tf.placeholder(tf.float32, shape=(None, num_input), name="X")
y = tf.placeholder(tf.int64, shape=(None,), name="y")
# Dropout should only fire during training, so expose a boolean switch that
# defaults to False (inference mode).
training = tf.placeholder_with_default(False, shape=(), name="training")
# Build architecture
with tf.name_scope("dnn"):
    # reuse=True removed: it only applies when variables of the same name already exist
    layer1 = tf.layers.dense(inputs=X, units=n_hidden_1, activation=tf.nn.elu, name='Layer_1')
    dropout1 = tf.layers.dropout(inputs=layer1, rate=0.3, training=training, name='Dropout_1')
    layer2 = tf.layers.dense(inputs=dropout1, units=n_hidden_2, activation=tf.nn.elu, name='Layer_2')
    dropout2 = tf.layers.dropout(inputs=layer2, rate=0.3, training=training, name='Dropout_2')
    layer3 = tf.layers.dense(inputs=dropout2, units=n_hidden_3, activation=tf.nn.elu, name='Layer_3')
    # Output raw logits: sparse_softmax_cross_entropy_with_logits applies the
    # softmax itself, so no activation on the last layer.
    logits = tf.layers.dense(inputs=layer3, units=num_classes, activation=None, name='Output')
    #logits_tensor = tf.Print(logits, [logits])
######### Cost fn = softmax xentropy
with tf.name_scope("loss"):
    # labels come from the y placeholder (integer class ids)
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
######### Optimizer = SGD
lr = 0.01
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(lr)
    training_op = optimizer.minimize(loss)
# Evaluate: is the true class the top-scoring logit?
with tf.name_scope('eval'):
    correct = tf.nn.in_top_k(logits, y, 1)  # in_top_k(predictions, targets, k)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
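# Equivalent check without in_top_k (not in the original): compare the argmax of
# the logits directly against the integer labels.
# correct = tf.equal(tf.argmax(logits, axis=1), y)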
# *------End of Construction phase -------*
init = tf.global_variables_initializer()
saver = tf.train.Saver()
# FileWriter for TensorBoard visualization; os.path.join could build the log path
writer = tf.summary.FileWriter("/Users/dawnstear/desktop/tmp")
# *-------- Set some hyperparameters & initialize metric vectors --------*
n_epochs = 28
epochvec = range(1,n_epochs+1)
accTrain = []
accVal = []
def next_batch(num, data, labels):  # adapted from Stack Overflow
    '''Return a total of `num` random samples and labels.'''
    idx = np.arange(0, len(data))
    np.random.shuffle(idx)
    idx = idx[:num]
    data_shuffle = [data[i] for i in idx]
    labels_shuffle = [labels[i] for i in idx]
    return np.asarray(data_shuffle), np.asarray(labels_shuffle)
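# next_batch() draws each minibatch independently, so within one epoch some cells
# may be sampled repeatedly and others skipped. A minimal alternative (not in the
# original) that visits every sample at most once per epoch:
def epoch_batches(batch_size, data, labels):
    '''Yield (data, labels) minibatches covering each sample at most once.'''
    idx = np.random.permutation(len(data))
    for start in range(0, len(data) - batch_size + 1, batch_size):
        batch_idx = idx[start:start + batch_size]
        yield data[batch_idx], labels[batch_idx]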
'''
TODO:
- check that labels were applied correctly
- next-batch method: OK
- use a COO sparse DataFrame/matrix
- softmax cross-entropy needs integer labels, so encode labels as 0..num_classes-1
  (see the LabelEncoder sketch above)
- change the shuffle method
- save the trained model with date and time in the title (see the sketch below)
- implement TensorBoard
- implement multithreading for the cluster
- implement a pie chart of cell types (see the sketch at the end) and a Venn diagram
- plot accuracy as a function of n_epochs or n_genes
- k-fold and randomize, label-encode... Keras
'''
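# A minimal sketch for the "save with date and time" TODO item; the checkpoint
# path is illustrative, not from the original script.
from datetime import datetime
run_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
checkpoint_path = "/Users/dawnstear/desktop/tmp/cellpred_" + run_stamp + ".ckpt"
# Inside the session, after training: saver.save(sess, checkpoint_path)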
# *--------------- Run Session -------------------*
with tf.Session() as sess:
    init.run()  # equivalent to sess.run(init)
    print('Just initialized')
    for epoch in range(n_epochs):
        print('Starting Epoch %d' % epoch)
        for iteration in range(len(X_train) // batch_size):
            X_batch, y_batch = next_batch(batch_size, X_train, y_train)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch, training: True})
        # evaluate once per epoch; dropout stays off (training defaults to False)
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_val = accuracy.eval(feed_dict={X: X_test, y: y_test})
        accTrain = np.append(accTrain, acc_train)
        accVal = np.append(accVal, acc_val)
        print('Epoch:%d - Train Acc: %f - Validation Acc: %f' % (epoch, acc_train, acc_val))
    # no explicit sess.close() needed: the with-block closes the session on exit
writer.add_graph(sess.graph)  # the graph object is still available after the session closes
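# To inspect the saved graph (TensorBoard TODO item), run from a shell:
#   tensorboard --logdir /Users/dawnstear/desktop/tmp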
# --------------- Plot Metrics -------------------
fig, ax = pyplot.subplots()
ax.plot(epochvec, accTrain, label='train')
ax.plot(epochvec, accVal, label='validation')
ax.set(xlabel='Epoch', ylabel='Accuracy',
       title='Accuracy Over Training Phase')  # TODO: include time, epochs and batch size in the title
ax.legend()
ax.grid()
fig.savefig("/Users/dawnstear/desktop/tmp/accuracy.png")  # savefig needs a file name (added here), not a bare directory
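# A minimal sketch for the pie-chart TODO item: distribution of cell types in the
# dataset. Assumes y_ still holds the label column; not part of the original script.
fig2, ax2 = pyplot.subplots()
cell_types, counts = np.unique(y_, return_counts=True)
ax2.pie(counts, labels=cell_types, autopct='%1.1f%%')
ax2.set(title='Cell Type Distribution')
fig2.savefig("/Users/dawnstear/desktop/tmp/cell_type_pie.png")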
#tf.trainable_variables(scope=None)