serag shaker

Reputation: 11

TensorFlow Vanishing Gradients

I'm trying to replicate the Recurrent Spatial Transformer Network implemented here (https://github.com/skaae/recurrent-spatial-transformer-code), but the loss doesn't decrease at all.

The configuration of the network is as follows:

1 - ReLU activations.

2 - Xavier initialization for the weights, zero initialization for the biases.

3 - The cost function is softmax_cross_entropy_with_logits.

4 - The optimizer is RMSProp (I tried epsilon values of 1e-6 and 1e-10).

5 - Gradient clipping by value.

So what should I try next?
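
For reference, this is roughly how the clipping and a gradient-norm check are wired up (a condensed sketch of the relevant parts of the full code below; opt, cross_entropy, sess, x, y and the batch arrays are all defined there):

# Condensed from the full code below: clip each gradient element to [-1, 1]
# before applying the update.
gvs = opt.compute_gradients(cross_entropy)
capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
optimizer = opt.apply_gradients(capped_gvs)

# Diagnostic: print the unclipped gradient norm of every variable to see
# whether the gradients are actually vanishing.
grad_norms = [tf.sqrt(tf.reduce_sum(tf.square(grad))) for grad, _ in gvs]
norm_values = sess.run(grad_norms, feed_dict={x: batch_xs, y: batch_ys})
for (_, var), norm in zip(gvs, norm_values):
    print(var.name, norm)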

Below is the detailed code:

import tensorflow as tf
from spatial_transformer import transformer
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
from tf_utils import weight_variable, bias_variable, dense_to_one_hot

# %% load data
mnist_cluttered = np.load('data/mnist_sequence3_sample_8distortions_9x9.npz')

X_train = mnist_cluttered['X_train']
y_train = mnist_cluttered['y_train']
X_valid = mnist_cluttered['X_valid']
y_valid = mnist_cluttered['y_valid']
X_test = mnist_cluttered['X_test']
y_test = mnist_cluttered['y_test']

y_train = np.reshape(y_train,[y_train.size,1])
y_valid = np.reshape(y_valid,[y_valid.size,1])
y_test = np.reshape(y_test,[y_test.size,1])

# % turn from dense to one hot representation
Y_train = dense_to_one_hot(y_train, n_classes=10)
Y_valid = dense_to_one_hot(y_valid, n_classes=10)
Y_test = dense_to_one_hot(y_test, n_classes=10)


Y_train = np.reshape(Y_train,[y_train.size//3,3,10])
Y_valid = np.reshape(Y_valid,[y_valid.size//3,3,10])
Y_test = np.reshape(Y_test,[y_test.size//3,3,10])

# %% Placeholders for 100x100 resolution
x = tf.placeholder(tf.float32, [None, 10000])
y = tf.placeholder(tf.float32, [None,3, 10])


x_tensor = tf.reshape(x, [-1, 100, 100, 1])

y_tensor = tf.reshape(y,[-1 ,10])

#%% localizaton network

keep_prob = tf.placeholder(tf.float32)

l_pool0_loc = tf.nn.max_pool(x_tensor,ksize=[1,2,2,1],strides=[1,2,2,1],padding='VALID')

W_conv0_loc = weight_variable([3,3,1,20],'W_conv0_loc')

b_conv0_loc = bias_variable([20],'b_conv0_loc')

l_conv0_loc = tf.nn.relu(tf.nn.conv2d(l_pool0_loc,W_conv0_loc,strides=[1,1,1,1],padding='VALID')+b_conv0_loc)

l_pool1_loc = tf.nn.max_pool(l_conv0_loc,ksize=[1,2,2,1],strides =[1,2,2,1],padding='VALID')

W_conv1_loc = weight_variable([3,3,20,20],'W_conv1_loc')   

b_conv1_loc = bias_variable([20],'b_conv1_loc')

l_conv1_loc =  tf.nn.relu(tf.nn.conv2d(l_pool1_loc,W_conv1_loc,strides=[1,1,1,1],padding='VALID')+b_conv1_loc)

l_pool2_loc = tf.nn.max_pool(l_conv1_loc,ksize=[1,2,2,1],strides =[1,2,2,1],padding='VALID')

W_conv2_loc = weight_variable([3,3,20,20],'W_conv2_loc')

b_conv2_loc = bias_variable([20],'b_conv2_loc')

l_conv2_loc = tf.nn.relu(tf.nn.conv2d(l_pool2_loc,W_conv2_loc,strides=[1,1,1,1],padding='VALID')+b_conv2_loc )

l_conv2_loc = tf.reshape(l_conv2_loc,[-1 ,9*9*20 ])

# Replicate input for Gated Recurrent Unit
l_conv2_loc = tf.tile(l_conv2_loc,[1,3])

l_conv2_loc = tf.split(1,3,l_conv2_loc)

# Gated Recurrent Unit

gru_cell = rnn_cell.GRUCell(num_units=256)

output, state = rnn.rnn(gru_cell,inputs=l_conv2_loc,dtype=tf.float32)

output = tf.reshape(output,[-1,256])

initial = tf.zeros([256,6]) 


W_fc1_loc = tf.Variable(initial_value=initial,name='W_fc1_loc')

# Use identity transformation as starting point
initial = np.array([[1., 0, 0], [0, 1., 0]])
initial = initial.astype('float32')
initial = initial.flatten()
b_fc1_loc = tf.Variable(initial_value=initial,name='b_fc1_loc')


l_fc1_loc = tf.add(tf.matmul(output,W_fc1_loc), b_fc1_loc)


# %% We'll create a spatial transformer module to identify discriminative patches

downsample = 3

out_size = (100//downsample, 100//downsample)


l_transform = transformer(tf.tile(x_tensor,[3,1,1,1]), l_fc1_loc, out_size)

# %% Classification Network


W_conv0_out = weight_variable([3,3,1,32],'W_conv0_out')                   

b_conv0_out = bias_variable([32],'b_conv0_out')

l_conv0_out = tf.nn.relu(tf.nn.conv2d(l_transform,W_conv0_out,strides=[1,1,1,1],padding='VALID')+b_conv0_out)

l_pool1_out = tf.nn.max_pool(l_conv0_out,ksize=[1,2,2,1], strides=[1,2,2,1],padding='VALID')

#l_drp1_out = tf.nn.dropout(l_pool1_out,keep_prob)

W_conv1_out = weight_variable([3,3,32,32],'W_conv1_out')  

b_conv1_out = bias_variable([32],'b_conv1_out')

l_conv1_out = tf.nn.relu(tf.nn.conv2d(l_pool1_out,W_conv1_out,strides=[1,1,1,1],padding='VALID')+b_conv1_out)

l_pool2_out = tf.nn.max_pool(l_conv1_out,ksize=[1,2,2,1], strides=[1,2,2,1],padding='VALID')

#l_drp2_out = tf.nn.dropout(l_pool2_out,keep_prob)

W_conv2_out = weight_variable([3,3,32,32],'W_conv2_out')     

b_conv2_out = bias_variable([32],'b_conv2_out')

l_conv2_out = tf.nn.relu(tf.nn.conv2d(l_pool2_out,W_conv2_out,strides=[1,1,1,1],padding='VALID')+b_conv2_out)



# %% We'll now reshape so we can connect to a fully-connected layer:
l_conv2_out_flat = tf.reshape(l_conv2_out, [-1, 4*4*32])

# %% Create a fully-connected layer:
n_fc = 400

W_fc1 = tf.get_variable('W_fc1',shape=[4*4*32,n_fc],initializer=tf.contrib.layers.xavier_initializer())

#W_fc1 = weight_variable([4*4*32,n_fc],'W_fc1')

b_fc1=bias_variable([n_fc],'b_fc1')


h_fc1 = tf.nn.relu(tf.add(tf.matmul(l_conv2_out_flat, W_fc1) , b_fc1))

# %% And finally our softmax layer:

W_fc2 = tf.get_variable('W_fc2',shape=[n_fc, 10],initializer=tf.contrib.layers.xavier_initializer())

#W_fc2 = weight_variable([n_fc,10],'W_fc2')

b_fc2=bias_variable([10],'b_fc2')

y_logits = tf.add(tf.matmul(h_fc1, W_fc2) , b_fc2)



# %% Monitor accuracy



correct_prediction = tf.equal(tf.argmax(y_logits, 1), tf.argmax(y_tensor, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))


# %% Define loss/eval/training functions
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(y_logits,y_tensor))

opt = tf.train.RMSPropOptimizer(0.0005,epsilon=1e-6)

#opt = tf.train.AdagradOptimizer(0.01)
#optimizer = opt.minimize(cross_entropy)




gvs = opt.compute_gradients(cross_entropy)

capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]

optimizer = opt.apply_gradients(capped_gvs )




# %% We'll now train in minibatches and report accuracy, loss:

num_batches = 600
n_epochs = 300
batch_size = 100


with tf.Session( ) as sess:

     sess.run(tf.initialize_all_variables())

     for epoch_i in range(n_epochs):

         # print('epoch: ' + str(epoch_i))
         shuffle = np.random.permutation(X_train.shape[0])
         avg_cost = 0.
         for iter_i in range(num_batches - 1):
             idx = shuffle[iter_i*batch_size:(iter_i+1)*batch_size]
             batch_xs = X_train[idx]
             batch_ys = Y_train[idx]


             _,c=sess.run([optimizer,cross_entropy], feed_dict={x: batch_xs, y: batch_ys})

             avg_cost += c / num_batches
             print('iter: ' + str(iter_i) +' >> ' +' MiniBatch Cost: ' +str(c)) 

     #   gr_print= sess.run([grads for grads,_  in gvs], feed_dict={x : batch_xs, y : batch_ys}) 
     #   print ('iter: '+str(iter_i))
     #   for t in gr_print:
      #      print np.linalg.norm(t)



     # Save the model while the session is still open.
     saver = tf.train.Saver()
     saver.save(sess, "save/my-model")


Upvotes: 1

Views: 1389

Answers (1)

Shamane Siriwardhana

Reputation: 4201

You can use dropout; it is often very helpful. If you are using an LSTM or RNN, you can add dropout very easily:

def create_rnn_cell(self):
    encoDecoCell = tf.contrib.rnn.BasicLSTMCell(  # Or GRUCell, LSTMCell(args.hiddenSize)
        self.args.hiddenSize,
    )
    if not self.args.test:  # TODO: Should use a placeholder instead
        encoDecoCell = tf.contrib.rnn.DropoutWrapper(  # apply dropout to the cell
            encoDecoCell,
            input_keep_prob=1.0,
            output_keep_prob=self.args.dropout
        )
    return encoDecoCell
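
Applied to the GRU cell in the question, a minimal sketch might look like the following (it reuses the existing keep_prob placeholder and assumes your TensorFlow version exposes rnn_cell.DropoutWrapper; feed e.g. keep_prob=0.5 while training and 1.0 when evaluating):

gru_cell = rnn_cell.GRUCell(num_units=256)
# Wrap the cell so dropout is applied to its outputs; keep_prob is the
# placeholder already defined in the question's code.
gru_cell = rnn_cell.DropoutWrapper(gru_cell,
                                   input_keep_prob=1.0,
                                   output_keep_prob=keep_prob)
output, state = rnn.rnn(gru_cell, inputs=l_conv2_loc, dtype=tf.float32)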

Batch normalization is also effective. I haven't seen many batch-norm examples implemented in TF specifically for RNN modules, but here is a good example to learn about batch norm:

Batch Normalization in tensorflow

Also read this article on applying batch normalization to RNNs:

http://olavnymoen.com/2016/07/07/rnn-batch-normalization
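
As a rough sketch (not taken from either link), batch normalization can be inserted before a ReLU with tf.contrib.layers.batch_norm; the is_training placeholder is new here, and l_conv2_out_flat / W_fc1 are simply reused from the question's code for illustration:

# Sketch: batch-normalize the fully connected pre-activation before the ReLU.
# Feed is_training=True for training batches and False for evaluation so the
# moving averages are used at test time.
is_training = tf.placeholder(tf.bool)

pre_activation = tf.matmul(l_conv2_out_flat, W_fc1)
h_fc1 = tf.nn.relu(tf.contrib.layers.batch_norm(pre_activation,
                                                center=True, scale=True,
                                                is_training=is_training,
                                                updates_collections=None))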

Upvotes: 1
