# Compatibility imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import argparse
import math
import random
import os
# uncomment this line to suppress Tensorflow warnings
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import numpy as np
from six.moves import xrange as range
from utils import *
from time import gmtime, strftime
from config import Config
class SeparationModel():
    """
    Implements a (bi-directional) GRU recurrent neural network trained with a
    squared-error loss, optionally weighted by the inverse absolute threshold
    of hearing. Given an input spectrogram, the network predicts target
    spectra for audio source separation.
    """
    def add_placeholders(self):
        """Generates placeholder variables to represent the input tensors.

        These placeholders are used as inputs by the rest of the model building and
        will be fed data during training. Note that when "None" is in a placeholder's
        shape, it's flexible (so we can use different batch sizes without rebuilding
        the model).

        Adds the following nodes to the computational graph:

        inputs_placeholder:  Input placeholder tensor of shape
                             (None, None, num_final_features), type tf.float32.
        targets_placeholder: Target placeholder tensor of shape
                             (None, None, output_size), type tf.float32.

        Inputs are of shape [batch_size, max_timesteps, num_final_features], but we
        allow flexible batch_size and max_timesteps (hence the leading [None, None]
        in the shape definitions).
        """
        self.inputs_placeholder = tf.placeholder(tf.float32, shape=(None, None, Config.num_final_features), name='inputs')
        self.targets_placeholder = tf.placeholder(tf.float32, shape=(None, None, Config.output_size), name='targets')
    def create_feed_dict(self, inputs_batch, targets_batch):
        """Creates the feed_dict for the separation model.

        A feed_dict takes the form of:
        feed_dict = {
            <placeholder>: <tensor of values to be passed for placeholder>,
            ....
        }

        The keys of the feed_dict are the placeholder tensors created in
        add_placeholders.

        Args:
            inputs_batch:  A batch of input data.
            targets_batch: A batch of target data.
        Returns:
            feed_dict: The feed dictionary mapping from placeholders to values.
        """
feed_dict = {
self.inputs_placeholder: inputs_batch,
self.targets_placeholder: targets_batch,
}
return feed_dict
    def add_prediction_op(self):
        """Applies a (multi-layer) bi-directional GRU RNN over the input data.

        - Builds forward and backward GRU cells with Config.output_size hidden
          units each, stacked Config.num_layers deep when num_layers > 1.
        - Rolls over inputs_placeholder with tf.nn.bidirectional_dynamic_rnn,
          producing a pair of Tensors of shape [batch_s, max_timesteps, output_size].
        - Keeps the forward direction's outputs as the model's prediction. Because
          the cells' hidden size already equals the target feature size, no extra
          affine projection is applied.

        tf.contrib.rnn.GRUCell, tf.contrib.rnn.MultiRNNCell and
        tf.nn.bidirectional_dynamic_rnn are the relevant ops.
        """
        def make_cell():
            # GRUCell infers its input size from its inputs at graph build time;
            # the deprecated input_size argument has been dropped. num_units equals
            # the target feature size, so no extra affine projection is needed.
            return tf.contrib.rnn.GRUCell(Config.output_size)

        if Config.num_layers > 1:
            # multi layer: stack num_layers GRU cells in each direction
            cell = tf.contrib.rnn.MultiRNNCell(
                [make_cell() for _ in range(Config.num_layers)], state_is_tuple=False)
            cell_bw = tf.contrib.rnn.MultiRNNCell(
                [make_cell() for _ in range(Config.num_layers)], state_is_tuple=False)
        else:
            cell = make_cell()
            cell_bw = make_cell()

        # output is a (forward, backward) pair of [batch, time, output_size] tensors
        output, state = tf.nn.bidirectional_dynamic_rnn(
            cell, cell_bw, self.inputs_placeholder, dtype=tf.float32)
        # keep only the forward direction's outputs as the prediction
        output = output[0]
        self.output = output
    def add_loss_op(self, freq_weighted):
        """Adds the (optionally frequency-weighted) l2 reconstruction loss.

        When freq_weighted is true, each frequency bin's error is weighted by the
        inverse of Terhardt's approximation of the absolute threshold of hearing
        (ATH), so perceptually important bins contribute more to the loss.
        """
        l2_cost = 0.0  # no weight regularization is currently applied
        weighted_differences = self.output - self.targets_placeholder
        num_freq_bins = Config.num_final_features
        # Bin center frequencies, linearly spaced over [0, 22050] Hz.
        frequencies = np.array([2.0 * 180 * i / (num_freq_bins - 1) * 22050 / 360 for i in range(num_freq_bins)])
        # The ATH formula diverges at 0 Hz, so the 0th bin is evaluated at half of
        # the first bin's frequency instead.
        frequencies[0] = 2.0 * 180 / (num_freq_bins - 1) / 2 * 22050 / 360
        # Terhardt's ATH approximation (in dB SPL), as a function of frequency in kHz.
        ath_val = 3.64 * np.power(1000 / frequencies, 0.8) - 6.5 * np.exp(-0.6 * np.power(frequencies / 1000 - 3.3, 2)) + np.power(0.1, 3) * np.power(frequencies / 1000, 4)
        ath_shifted = (1 - np.amin(ath_val)) + ath_val  # shift all ATH values so that the min is 1
        # Tile to cover both halves of the output (this assumes
        # Config.output_size == 2 * num_freq_bins).
        weights = np.tile(1 / ath_shifted, 2)
        if freq_weighted:
            weighted_differences = weights * weighted_differences
        else:
            # Use a constant weight, proportional to the RMS of the ATH weights,
            # so the unweighted loss is on a comparable scale.
            normalized = np.full(weights.shape, np.sqrt(np.sum(np.power(weights, 2)) / num_freq_bins))
            weighted_differences = normalized * weighted_differences
        # l2 norm of the weighted error over the whole batch
        squared_error = tf.norm(weighted_differences, ord=2)
        self.loss = Config.l2_lambda * l2_cost + squared_error
        tf.summary.scalar("squared_error", squared_error)
        tf.summary.scalar("loss", self.loss)
    def add_training_op(self):
        """Sets up the training Ops.

        Creates an optimizer and applies the gradients to all trainable variables. The
        Op returned by this function is what must be passed to the `sess.run()` call to
        cause the model to train. See
        https://www.tensorflow.org/versions/r0.7/api_docs/python/train.html#Optimizer
        for more information.

        Uses tf.train.AdamOptimizer, calling optimizer.minimize() on self.loss.
        """
        self.optimizer = tf.train.AdamOptimizer(learning_rate=Config.lr).minimize(self.loss)
def add_summary_op(self):
self.merged_summary_op = tf.summary.merge_all()
# This actually builds the computational graph
def build(self, freq_weighted):
self.add_placeholders()
self.add_prediction_op()
self.add_loss_op(freq_weighted)
self.add_training_op()
self.add_summary_op()
def train_on_batch(self, session, train_inputs_batch, train_targets_batch, train=True):
feed = self.create_feed_dict(train_inputs_batch, train_targets_batch)
output, batch_cost, summary = session.run([self.output, self.loss, self.merged_summary_op], feed)
if math.isnan(batch_cost): # basically all examples in this batch have been skipped
return 0
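        # Run the optimizer in a second pass so that NaN batches (handled above)
        # never produce a parameter update.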
if train:
_ = session.run([self.optimizer], feed)
return output, batch_cost, summary
    def print_results(self, session, train_inputs_batch, train_targets_batch):
        train_feed = self.create_feed_dict(train_inputs_batch, train_targets_batch)
        # self.decoded_sequence does not exist on this model (it was left over from
        # a CTC template), so compare against the network output directly.
        train_first_batch_preds = session.run(self.output, feed_dict=train_feed)
        compare_predicted_to_true(train_first_batch_preds, train_targets_batch)
def __init__(self, freq_weighted=None):
self.build(freq_weighted)
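

# A minimal usage sketch, not part of the original pipeline: builds the graph and
# runs a single training step on random data. The batch size and timestep count
# below are hypothetical; the real training loop presumably lives in a separate
# driver script.
if __name__ == '__main__':
    model = SeparationModel(freq_weighted=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        inputs = np.random.randn(4, 100, Config.num_final_features).astype(np.float32)
        targets = np.random.randn(4, 100, Config.output_size).astype(np.float32)
        result = model.train_on_batch(sess, inputs, targets, train=True)
        if result != 0:  # train_on_batch returns 0 when the batch cost is NaN
            output, batch_cost, summary = result
            print('batch cost: %f' % batch_cost)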