glorot wlrscale
MatthieuCourbariaux committed Sep 18, 2015
1 parent 8ae3bb9 commit fcdf8b9
Showing 3 changed files with 157 additions and 84 deletions.
105 changes: 65 additions & 40 deletions binary_connect.py
@@ -38,42 +38,49 @@ def compute_grads(loss,network):

for layer in layers:

params = layer.get_params(trainable=True)

for param in params:
if param.name == "W":
# print(param.name)
grads.append(theano.grad(loss, wrt=layer.Wb))
else:
# print("here")
grads.append(theano.grad(loss, wrt=param))
params = layer.get_params(binary=True)
if params:
# print(params[0].name)
grads.append(theano.grad(loss, wrt=layer.Wb))

return grads

def weights_clipping(updates,network):
def clipping_scaling(updates,network):

layers = lasagne.layers.get_all_layers(network)
updates = OrderedDict(updates)

for layer in layers:

params = layer.get_params(trainable=True)

params = layer.get_params(binary=True)
for param in params:
if param.name == "W":
# print("K")
updates[param] = T.clip(updates[param], -layer.H, layer.H)
print("W_LR_scale = "+str(layer.W_LR_scale))
print("H = "+str(layer.H))
updates[param] = param + layer.W_LR_scale*(updates[param] - param)
updates[param] = T.clip(updates[param], -layer.H,layer.H)

return updates
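The new clipping_scaling pass takes the gradients computed through the binarized weights Wb (see compute_grads above), rescales each binary layer's proposed update by that layer's W_LR_scale, and then clips the resulting real-valued weight to [-H, H]. The following is a minimal NumPy sketch of one such step, not the repository's Theano graph; all concrete numbers (H, W_LR_scale, LR, gradient) are made up for illustration.

import numpy as np

W = np.float32(0.02)           # real-valued weight stored by the layer
H = np.float32(0.0357)         # clipping bound (roughly a Glorot-style value)
W_LR_scale = np.float32(28.0)  # learning-rate scale, 1/H with the "Glorot" option
LR = np.float32(0.003)
grad_wrt_Wb = np.float32(0.5)  # gradient taken through the binarized weight Wb

proposed = W - LR * grad_wrt_Wb            # step proposed by the optimizer
scaled = W + W_LR_scale * (proposed - W)   # rescale the step, as in clipping_scaling
W_new = np.clip(scaled, -H, H)             # keep the real-valued weight in [-H, H]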

# def weights_clipping(updates, H):
# def clipping(updates,network):

# layers = lasagne.layers.get_all_layers(network)
# updates = OrderedDict(updates)

# for layer in layers:

# params = layer.get_params(binary=True)
# for param in params:
# updates[param] = T.clip(updates[param], -layer.H, layer.H)

# return updates

# def clipping(updates, H):

# params = updates.keys()
# updates = OrderedDict(updates)

# for param in params:
# if param.name == "W":
# updates[param] = T.clip(updates[param], -H, H)
# for param in params:
# updates[param] = T.clip(updates[param], -H, H)

# return updates

@@ -111,21 +118,29 @@ def binarization(W,H,binary=True,deterministic=False,stochastic=False,srng=None)
class DenseLayer(lasagne.layers.DenseLayer):

def __init__(self, incoming, num_units,
# binary = True, stochastic = True, H=1., **kwargs):
binary = True, stochastic = True, **kwargs):
binary = True, stochastic = True, H=1.,W_LR_scale="Glorot", **kwargs):
# binary = True, stochastic = True, **kwargs):

self.binary = binary
self.stochastic = stochastic

# self.H = H
num_inputs = int(np.prod(incoming.output_shape[1:]))
self.H = np.float32(np.sqrt(1.5/ (num_inputs + num_units)))
# print("H = "+str(self.H))

self.H = H
if H == "Glorot":
num_inputs = int(np.prod(incoming.output_shape[1:]))
self.H = np.float32(np.sqrt(1.5/ (num_inputs + num_units)))
# print("H = "+str(self.H))

self.W_LR_scale = W_LR_scale
if W_LR_scale == "Glorot":
num_inputs = int(np.prod(incoming.output_shape[1:]))
self.W_LR_scale = np.float32(1./np.sqrt(1.5/ (num_inputs + num_units)))

self._srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))

if self.binary:
super(DenseLayer, self).__init__(incoming, num_units, W=lasagne.init.Uniform((-self.H,self.H)), **kwargs)
super(DenseLayer, self).__init__(incoming, num_units, W=lasagne.init.Uniform((-self.H,self.H)), **kwargs)
# add the binary tag to weights
self.params[self.W]=set(['binary'])

else:
super(DenseLayer, self).__init__(incoming, num_units, **kwargs)
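With H="Glorot" and W_LR_scale="Glorot", both constants come from the same fan-in/fan-out expression, so W_LR_scale is simply 1/H: the real-valued weights live in a small interval, and their updates are amplified back to roughly unit scale. A minimal sketch of the computation performed in __init__ above, with hypothetical layer sizes (784 inputs and 1024 units are illustrative only):

import numpy as np

num_inputs, num_units = 784, 1024                                      # hypothetical sizes
H = np.float32(np.sqrt(1.5 / (num_inputs + num_units)))                # ~0.0288
W_LR_scale = np.float32(1. / np.sqrt(1.5 / (num_inputs + num_units)))  # ~34.7, i.e. 1/H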
@@ -145,23 +160,32 @@ def get_output_for(self, input, deterministic=False, **kwargs):
class Conv2DLayer(lasagne.layers.Conv2DLayer):

def __init__(self, incoming, num_filters, filter_size,
binary = True, stochastic = True, **kwargs):
binary = True, stochastic = True, H=1.,W_LR_scale="Glorot", **kwargs):
# binary = True, stochastic = True, **kwargs):

self.binary = binary
self.stochastic = stochastic

num_inputs = int(np.prod(filter_size)*incoming.output_shape[1])
# theoretically, I should divide num_units by the pool_shape
num_units = int(np.prod(filter_size)*num_filters)
self.H = np.float32(np.sqrt(1.5 / (num_inputs + num_units)))
# print("H = "+str(self.H))
# self.H = .05


self.H = H
if H == "Glorot":
num_inputs = int(np.prod(filter_size)*incoming.output_shape[1])
num_units = int(np.prod(filter_size)*num_filters) # theoretically, I should divide num_units by the pool_shape
self.H = np.float32(np.sqrt(1.5 / (num_inputs + num_units)))
# print("H = "+str(self.H))

self.W_LR_scale = W_LR_scale
if W_LR_scale == "Glorot":
num_inputs = int(np.prod(filter_size)*incoming.output_shape[1])
num_units = int(np.prod(filter_size)*num_filters) # theoretically, I should divide num_units by the pool_shape
self.W_LR_scale = np.float32(1./np.sqrt(1.5 / (num_inputs + num_units)))
# print("W_LR_scale = "+str(self.W_LR_scale))

self._srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))

if self.binary:
super(Conv2DLayer, self).__init__(incoming, num_filters, filter_size, W=lasagne.init.Uniform((-self.H,self.H)), **kwargs)

# add the binary tag to weights
self.params[self.W]=set(['binary'])
else:
super(Conv2DLayer, self).__init__(incoming, num_filters, filter_size, **kwargs)
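For the convolutional layer, the fan-in and fan-out are computed from the filter size, the number of input channels, and the number of filters. As a sketch, consider the first 128-filter 3x3 convolution of the cifar10.py script below, assuming it sees the raw 3-channel CIFAR-10 input (an assumption; the surrounding architecture is collapsed in this diff):

import numpy as np

filter_size, input_channels, num_filters = (3, 3), 3, 128
num_inputs = int(np.prod(filter_size) * input_channels)                # 27
num_units = int(np.prod(filter_size) * num_filters)                    # 1152
H = np.float32(np.sqrt(1.5 / (num_inputs + num_units)))                # ~0.036
W_LR_scale = np.float32(1. / np.sqrt(1.5 / (num_inputs + num_units)))  # ~28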

@@ -242,7 +266,6 @@ def val_epoch(X,y):

train_loss = train_epoch(X_train,y_train,LR)
X_train,y_train = shuffle(X_train,y_train)
LR *= LR_decay

val_err, val_loss = val_epoch(X_val,y_val)

@@ -253,7 +276,7 @@ def val_epoch(X,y):
best_epoch = epoch+1

test_err, test_loss = val_epoch(X_test,y_test)

epoch_duration = time.time() - start_time

# Then we print the results for this epoch:
@@ -265,4 +288,6 @@ def val_epoch(X,y):
print(" best epoch: "+str(best_epoch))
print(" best validation error rate: "+str(best_val_err)+"%")
print(" test loss: "+str(test_loss))
print(" test error rate: "+str(test_err)+"%")
print(" test error rate: "+str(test_err)+"%")

LR *= LR_decay
78 changes: 55 additions & 23 deletions cifar10.py
@@ -42,6 +42,8 @@
# from pylearn2.datasets.svhn import SVHN
from pylearn2.utils import serial

from collections import OrderedDict

if __name__ == "__main__":

# BN parameters
@@ -56,7 +58,7 @@
print("epsilon = "+str(epsilon))

# Training parameters
num_epochs = 300
num_epochs = 500
print("num_epochs = "+str(num_epochs))

# BinaryConnect
@@ -67,23 +69,39 @@
# H = (1./(1<<4))/10
# H = 1./(1<<4)
# H = .316
# H = 1.
# H = "Glorot"
H = 1.
print("H = "+str(H))
# W_size = .025
# W_LR_scale = 1./(W_size**2)
# W_LR_scale = 1./W_size
# W_LR_scale = 1.
W_LR_scale = "Glorot"
print("W_LR_scale = "+str(W_LR_scale))

# LR decay
LR_start = 3.
# LR_start = .03 # tuned for adam
# LR_start = 1. # same error with 3. (with nesterov_momentum)
LR_start = 0.003
print("LR_start = "+str(LR_start))
LR_fin = .01
# LR_fin = .00003 # tuned for adam
# LR_fin = .01 # never improves below .015 (with nesterov_momentum)
LR_fin = 0.000003
print("LR_fin = "+str(LR_fin))
LR_decay = (LR_fin/LR_start)**(1./num_epochs)
print("LR_decay = "+str(LR_decay))
# BTW, LR decay is good for the BN moving average...
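For reference, with the values above the per-epoch multiplier works out to roughly:

LR_decay = (0.000003 / 0.003) ** (1. / 500)  # = 0.001 ** 0.002, about 0.9863

so the learning rate shrinks by about 1.4% per epoch and reaches LR_fin after the 500 epochs; the decay itself is applied once per epoch at the end of the training loop in binary_connect.py (see the LR *= LR_decay move in that file's diff above).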

train_set_size = 45000
print("train_set_size = "+str(train_set_size))

print('Loading CIFAR-10 dataset...')

preprocessor = serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/preprocessor.pkl")
train_set = ZCA_Dataset(
preprocessed_dataset=serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"),
preprocessor = preprocessor,
start=0, stop = 45000)
start=0, stop = train_set_size)
valid_set = ZCA_Dataset(
preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"),
preprocessor = preprocessor,
@@ -130,7 +148,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
num_filters=128,
filter_size=(3, 3),
nonlinearity=lasagne.nonlinearities.identity)
@@ -145,7 +164,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
num_filters=128,
filter_size=(3, 3),
nonlinearity=lasagne.nonlinearities.identity)
@@ -164,7 +184,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
num_filters=256,
filter_size=(2, 2),
nonlinearity=lasagne.nonlinearities.identity)
@@ -179,7 +200,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
num_filters=256,
filter_size=(2, 2),
nonlinearity=lasagne.nonlinearities.identity)
@@ -198,7 +220,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
num_filters=512,
filter_size=(2, 2),
nonlinearity=lasagne.nonlinearities.identity)
@@ -213,7 +236,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
num_filters=512,
filter_size=(2, 2),
nonlinearity=lasagne.nonlinearities.identity)
@@ -231,7 +255,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
num_filters=1024,
filter_size=(2, 2),
nonlinearity=lasagne.nonlinearities.identity)
@@ -248,7 +273,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
nonlinearity=lasagne.nonlinearities.identity,
num_units=1024)

@@ -262,7 +288,8 @@
cnn,
binary=binary,
stochastic=stochastic,
# H=H,
H=H,
W_LR_scale=W_LR_scale,
nonlinearity=lasagne.nonlinearities.identity,
num_units=10)

@@ -277,20 +304,25 @@
# squared hinge loss
loss = T.mean(T.sqr(T.maximum(0.,1.-target*train_output)))

params = lasagne.layers.get_all_params(cnn, trainable=True)

if binary:
grads = binary_connect.compute_grads(loss,cnn)
# updates = lasagne.updates.adam(loss_or_grads=grads, params=params, learning_rate=LR)
updates = lasagne.updates.sgd(loss_or_grads=grads, params=params, learning_rate=LR)
# updates = binary_connect.weights_clipping(updates,H)
updates = binary_connect.weights_clipping(updates,cnn)

# W updates
W = lasagne.layers.get_all_params(cnn, binary=True)
W_grads = binary_connect.compute_grads(loss,cnn)
updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
updates = binary_connect.clipping_scaling(updates,cnn)
# updates = binary_connect.clipping(updates,H)
# using 2H instead of H with stochastic yields about 20% relative worse results

# other parameters updates
params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False)
updates = OrderedDict(updates.items() + lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR).items())
# updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)

else:
params = lasagne.layers.get_all_params(cnn, trainable=True)
updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)
# updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)
updates = lasagne.updates.sgd(loss_or_grads=loss, params=params, learning_rate=LR)
# updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)
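In the binary branch above, the adam updates for the binarized weights and the adam updates for the remaining trainable parameters are merged into a single OrderedDict. Note that updates.items() + ...items() relies on Python 2, where .items() returns concatenable lists; under Python 3 the items() views cannot be added. A minimal sketch of an equivalent merge, using hypothetical stand-in dictionaries for the two sets of updates built above:

from collections import OrderedDict

# hypothetical stand-ins for the two update dictionaries
updates = OrderedDict([('W_conv1', 0.1)])
other_updates = OrderedDict([('beta_bn1', 0.2)])

# Python 3 compatible merge: convert the items() views to lists before concatenating
merged = OrderedDict(list(updates.items()) + list(other_updates.items()))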

test_output = lasagne.layers.get_output(cnn, deterministic=True)
test_loss = T.mean(T.sqr(T.maximum(0.,1.-target*test_output)))