glorot wlrscale
MatthieuCourbariaux committed Sep 18, 2015
1 parent 8ae3bb9 commit fcdf8b9
Showing 3 changed files with 157 additions and 84 deletions.
105 changes: 65 additions & 40 deletions binary_connect.py
@@ -38,42 +38,49 @@ def compute_grads(loss,network):

for layer in layers:

params = layer.get_params(trainable=True)

for param in params:
if param.name == "W":
# print(param.name)
grads.append(theano.grad(loss, wrt=layer.Wb))
else:
# print("here")
grads.append(theano.grad(loss, wrt=param))
params = layer.get_params(binary=True)
if params:
# print(params[0].name)
grads.append(theano.grad(loss, wrt=layer.Wb))

return grads

def weights_clipping(updates,network):
def clipping_scaling(updates,network):

layers = lasagne.layers.get_all_layers(network)
updates = OrderedDict(updates)

for layer in layers:

params = layer.get_params(trainable=True)

params = layer.get_params(binary=True)
for param in params:
if param.name == "W":
# print("K")
updates[param] = T.clip(updates[param], -layer.H, layer.H)
print("W_LR_scale = "+str(layer.W_LR_scale))
print("H = "+str(layer.H))
updates[param] = param + layer.W_LR_scale*(updates[param] - param)
updates[param] = T.clip(updates[param], -layer.H,layer.H)

return updates
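The new clipping_scaling pass takes the gradients computed through the binarized weights Wb (see compute_grads above), rescales each binary layer's proposed update by that layer's W_LR_scale, and then clips the resulting real-valued weight to [-H, H]. The following is a minimal NumPy sketch of one such step, not the repository's Theano graph; all concrete numbers (H, W_LR_scale, LR, gradient) are made up for illustration.

import numpy as np

W = np.float32(0.02)           # real-valued weight stored by the layer
H = np.float32(0.0357)         # clipping bound (roughly a Glorot-style value)
W_LR_scale = np.float32(28.0)  # learning-rate scale, 1/H with the "Glorot" option
LR = np.float32(0.003)
grad_wrt_Wb = np.float32(0.5)  # gradient taken through the binarized weight Wb

proposed = W - LR * grad_wrt_Wb            # step proposed by the optimizer
scaled = W + W_LR_scale * (proposed - W)   # rescale the step, as in clipping_scaling
W_new = np.clip(scaled, -H, H)             # keep the real-valued weight in [-H, H]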

# def weights_clipping(updates, H):
# def clipping(updates,network):

# layers = lasagne.layers.get_all_layers(network)
# updates = OrderedDict(updates)

# for layer in layers:

# params = layer.get_params(binary=True)
# for param in params:
# updates[param] = T.clip(updates[param], -layer.H, layer.H)

# return updates

# def clipping(updates, H):

# params = updates.keys()
# updates = OrderedDict(updates)

# for param in params:
# if param.name == "W":
# updates[param] = T.clip(updates[param], -H, H)
# for param in params:
# updates[param] = T.clip(updates[param], -H, H)

# return updates

@@ -111,21 +118,29 @@ def binarization(W,H,binary=True,deterministic=False,stochastic=False,srng=None)
class DenseLayer(lasagne.layers.DenseLayer):

def __init__(self, incoming, num_units,
# binary = True, stochastic = True, H=1., **kwargs):
binary = True, stochastic = True, **kwargs):
binary = True, stochastic = True, H=1.,W_LR_scale="Glorot", **kwargs):
# binary = True, stochastic = True, **kwargs):

self.binary = binary
self.stochastic = stochastic

# self.H = H
num_inputs = int(np.prod(incoming.output_shape[1:]))
self.H = np.float32(np.sqrt(1.5/ (num_inputs + num_units)))
# print("H = "+str(self.H))

self.H = H
if H == "Glorot":
num_inputs = int(np.prod(incoming.output_shape[1:]))
self.H = np.float32(np.sqrt(1.5/ (num_inputs + num_units)))
# print("H = "+str(self.H))

self.W_LR_scale = W_LR_scale
if W_LR_scale == "Glorot":
num_inputs = int(np.prod(incoming.output_shape[1:]))
self.W_LR_scale = np.float32(1./np.sqrt(1.5/ (num_inputs + num_units)))

self._srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))

if self.binary:
super(DenseLayer, self).__init__(incoming, num_units, W=lasagne.init.Uniform((-self.H,self.H)), **kwargs)
super(DenseLayer, self).__init__(incoming, num_units, W=lasagne.init.Uniform((-self.H,self.H)), **kwargs)
# add the binary tag to weights
self.params[self.W]=set(['binary'])

else:
super(DenseLayer, self).__init__(incoming, num_units, **kwargs)
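With H="Glorot" and W_LR_scale="Glorot", both constants come from the same fan-in/fan-out expression, so W_LR_scale is simply 1/H: the real-valued weights live in a small interval, and their updates are amplified back to roughly unit scale. A minimal sketch of the computation performed in __init__ above, with hypothetical layer sizes (784 inputs and 1024 units are illustrative only):

import numpy as np

num_inputs, num_units = 784, 1024                                      # hypothetical sizes
H = np.float32(np.sqrt(1.5 / (num_inputs + num_units)))                # ~0.0288
W_LR_scale = np.float32(1. / np.sqrt(1.5 / (num_inputs + num_units)))  # ~34.7, i.e. 1/H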
@@ -145,23 +160,32 @@ def get_output_for(self, input, deterministic=False, **kwargs):
class Conv2DLayer(lasagne.layers.Conv2DLayer):

def __init__(self, incoming, num_filters, filter_size,
binary = True, stochastic = True, **kwargs):
binary = True, stochastic = True, H=1.,W_LR_scale="Glorot", **kwargs):
# binary = True, stochastic = True, **kwargs):

self.binary = binary
self.stochastic = stochastic

num_inputs = int(np.prod(filter_size)*incoming.output_shape[1])
# theoretically, I should divide num_units by the pool_shape
num_units = int(np.prod(filter_size)*num_filters)
self.H = np.float32(np.sqrt(1.5 / (num_inputs + num_units)))
# print("H = "+str(self.H))
# self.H = .05


self.H = H
if H == "Glorot":
num_inputs = int(np.prod(filter_size)*incoming.output_shape[1])
num_units = int(np.prod(filter_size)*num_filters) # theoretically, I should divide num_units by the pool_shape
self.H = np.float32(np.sqrt(1.5 / (num_inputs + num_units)))
# print("H = "+str(self.H))

self.W_LR_scale = W_LR_scale
if W_LR_scale == "Glorot":
num_inputs = int(np.prod(filter_size)*incoming.output_shape[1])
num_units = int(np.prod(filter_size)*num_filters) # theoretically, I should divide num_units by the pool_shape
self.W_LR_scale = np.float32(1./np.sqrt(1.5 / (num_inputs + num_units)))
# print("W_LR_scale = "+str(self.W_LR_scale))

self._srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))

if self.binary:
super(Conv2DLayer, self).__init__(incoming, num_filters, filter_size, W=lasagne.init.Uniform((-self.H,self.H)), **kwargs)

# add the binary tag to weights
self.params[self.W]=set(['binary'])
else:
super(Conv2DLayer, self).__init__(incoming, num_filters, filter_size, **kwargs)
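For the convolutional layer, the fan-in and fan-out are computed from the filter size, the number of input channels, and the number of filters. As a sketch, consider the first 128-filter 3x3 convolution of the cifar10.py script below, assuming it sees the raw 3-channel CIFAR-10 input (an assumption; the surrounding architecture is collapsed in this diff):

import numpy as np

filter_size, input_channels, num_filters = (3, 3), 3, 128
num_inputs = int(np.prod(filter_size) * input_channels)                # 27
num_units = int(np.prod(filter_size) * num_filters)                    # 1152
H = np.float32(np.sqrt(1.5 / (num_inputs + num_units)))                # ~0.036
W_LR_scale = np.float32(1. / np.sqrt(1.5 / (num_inputs + num_units)))  # ~28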

@@ -242,7 +266,6 @@ def val_epoch(X,y):

train_loss = train_epoch(X_train,y_train,LR)
X_train,y_train = shuffle(X_train,y_train)
LR *= LR_decay

val_err, val_loss = val_epoch(X_val,y_val)

@@ -253,7 +276,7 @@ def val_epoch(X,y):
best_epoch = epoch+1

test_err, test_loss = val_epoch(X_test,y_test)

epoch_duration = time.time() - start_time

# Then we print the results for this epoch:
@@ -265,4 +288,6 @@ def val_epoch(X,y):
print(" best epoch: "+str(best_epoch))
print(" best validation error rate: "+str(best_val_err)+"%")
print(" test loss: "+str(test_loss))
print(" test error rate: "+str(test_err)+"%")
print(" test error rate: "+str(test_err)+"%")

LR *= LR_decay
78 changes: 55 additions & 23 deletions cifar10.py
@@ -42,6 +42,8 @@
# from pylearn2.datasets.svhn import SVHN
from pylearn2.utils import serial

from collections import OrderedDict

if __name__ == "__main__":

# BN parameters
@@ -56,7 +58,7 @@
print("epsilon = "+str(epsilon))

# Training parameters
num_epochs = 300
num_epochs = 500
print("num_epochs = "+str(num_epochs))

# BinaryConnect
@@ -67,23 +69,39 @@
# H = (1./(1<<4))/10
# H = 1./(1<<4)
# H = .316
# H = 1.
# H = "Glorot"
H = 1.
print("H = "+str(H))
# W_size = .025
# W_LR_scale = 1./(W_size**2)
# W_LR_scale = 1./W_size
# W_LR_scale = 1.
W_LR_scale = "Glorot"
print("W_LR_scale = "+str(W_LR_scale))

# LR decay
LR_start = 3.
# LR_start = .03 # tuned for adam
# LR_start = 1. # same error with 3. (with nesterov_momentum)
LR_start = 0.003
print("LR_start = "+str(LR_start))
LR_fin = .01
# LR_fin = .00003 # tuned for adam
# LR_fin = .01 # never improves below .015 (with nesterov_momentum)
LR_fin = 0.000003
print("LR_fin = "+str(LR_fin))
LR_decay = (LR_fin/LR_start)**(1./num_epochs)
print("LR_decay = "+str(LR_decay))
# BTW, LR decay is good for the BN moving average...
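For reference, with the values above the per-epoch multiplier works out to roughly:

LR_decay = (0.000003 / 0.003) ** (1. / 500)  # = 0.001 ** 0.002, about 0.9863

so the learning rate shrinks by about 1.4% per epoch and reaches LR_fin after the 500 epochs; the decay itself is applied once per epoch at the end of the training loop in binary_connect.py (see the LR *= LR_decay move in that file's diff above).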

train_set_size = 45000
print("train_set_size = "+str(train_set_size))

print('Loading CIFAR-10 dataset...')

preprocessor = serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/preprocessor.pkl")
train_set = ZCA_Dataset(
preprocessed_dataset=serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"),
preprocessor = preprocessor,
start=0, stop = 45000)
start=0, stop = train_set_size)
valid_set = ZCA_Dataset(
preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"),
preprocessor = preprocessor,
@@ -130,7 +148,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
num_filters=128,
filter_size=(3, 3),
nonlinearity=lasagne.nonlinearities.identity)
@@ -145,7 +164,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
num_filters=128,
filter_size=(3, 3),
nonlinearity=lasagne.nonlinearities.identity)
@@ -164,7 +184,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
num_filters=256,
filter_size=(2, 2),
nonlinearity=lasagne.nonlinearities.identity)
@@ -179,7 +200,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
num_filters=256,
filter_size=(2, 2),
nonlinearity=lasagne.nonlinearities.identity)
@@ -198,7 +220,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
num_filters=512,
filter_size=(2, 2),
nonlinearity=lasagne.nonlinearities.identity)
@@ -213,7 +236,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
num_filters=512,
filter_size=(2, 2),
nonlinearity=lasagne.nonlinearities.identity)
@@ -231,7 +255,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
num_filters=1024,
filter_size=(2, 2),
nonlinearity=lasagne.nonlinearities.identity)
@@ -248,7 +273,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
nonlinearity=lasagne.nonlinearities.identity,
num_units=1024)

@@ -262,7 +288,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
nonlinearity=lasagne.nonlinearities.identity,
num_units=10)

@@ -277,20 +304,25 @@
# squared hinge loss
loss = T.mean(T.sqr(T.maximum(0.,1.-target*train_output)))

params = lasagne.layers.get_all_params(cnn, trainable=True)

if binary:
grads = binary_connect.compute_grads(loss,cnn)
# updates = lasagne.updates.adam(loss_or_grads=grads, params=params, learning_rate=LR)
updates = lasagne.updates.sgd(loss_or_grads=grads, params=params, learning_rate=LR)
# updates = binary_connect.weights_clipping(updates,H)
updates = binary_connect.weights_clipping(updates,cnn)

# W updates
W = lasagne.layers.get_all_params(cnn, binary=True)
W_grads = binary_connect.compute_grads(loss,cnn)
updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
updates = binary_connect.clipping_scaling(updates,cnn)
# updates = binary_connect.clipping(updates,H)
# using 2H instead of H with stochastic yields about 20% relative worse results

# other parameters updates
params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False)
updates = OrderedDict(updates.items() + lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR).items())
# updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)

else:
params = lasagne.layers.get_all_params(cnn, trainable=True)
updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)
# updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)
updates = lasagne.updates.sgd(loss_or_grads=loss, params=params, learning_rate=LR)
# updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)
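In the binary branch above, the adam updates for the binarized weights and the adam updates for the remaining trainable parameters are merged into a single OrderedDict. Note that updates.items() + ...items() relies on Python 2, where .items() returns concatenable lists; under Python 3 the items() views cannot be added. A minimal sketch of an equivalent merge, using hypothetical stand-in dictionaries for the two sets of updates built above:

from collections import OrderedDict

# hypothetical stand-ins for the two update dictionaries
updates = OrderedDict([('W_conv1', 0.1)])
other_updates = OrderedDict([('beta_bn1', 0.2)])

# Python 3 compatible merge: convert the items() views to lists before concatenating
merged = OrderedDict(list(updates.items()) + list(other_updates.items()))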

test_output = lasagne.layers.get_output(cnn, deterministic=True)
test_loss = T.mean(T.sqr(T.maximum(0.,1.-target*test_output)))