serag shaker

Reputation: 11

TensorFlow Vanishing Gradients

I'm trying to replicate the Recurrent Spatial Transformer Network implemented here (https://github.com/skaae/recurrent-spatial-transformer-code), but the loss doesn't decrease at all.

The configuration of the network is as follows:

1 - ReLU activations.

2 - Xavier initialization for the weights, zero initialization for the biases.

3 - The cost function is softmax_cross_entropy_with_logits.

4 - The optimizer is RMSProp (I tried epsilon values of 1e-6 and 1e-10).

5 - Gradient clipping by value.

So what should I try next?
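
For reference, this is roughly how the clipping and a gradient-norm check are wired up (a condensed sketch of the relevant parts of the full code below; opt, cross_entropy, sess, x, y and the batch arrays are all defined there):

# Condensed from the full code below: clip each gradient element to [-1, 1]
# before applying the update.
gvs = opt.compute_gradients(cross_entropy)
capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
optimizer = opt.apply_gradients(capped_gvs)

# Diagnostic: print the unclipped gradient norm of every variable to see
# whether the gradients are actually vanishing.
grad_norms = [tf.sqrt(tf.reduce_sum(tf.square(grad))) for grad, _ in gvs]
norm_values = sess.run(grad_norms, feed_dict={x: batch_xs, y: batch_ys})
for (_, var), norm in zip(gvs, norm_values):
    print(var.name, norm)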

Below is the detailed code:

import tensorflow as tf
from spatial_transformer import transformer
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
from tf_utils import weight_variable, bias_variable, dense_to_one_hot

# %% load data
mnist_cluttered = np.load('data/mnist_sequence3_sample_8distortions_9x9.npz')

X_train = mnist_cluttered['X_train']
y_train = mnist_cluttered['y_train']
X_valid = mnist_cluttered['X_valid']
y_valid = mnist_cluttered['y_valid']
X_test = mnist_cluttered['X_test']
y_test = mnist_cluttered['y_test']

y_train = np.reshape(y_train,[y_train.size,1])
y_valid = np.reshape(y_valid,[y_valid.size,1])
y_test = np.reshape(y_test,[y_test.size,1])

# % turn from dense to one hot representation
Y_train = dense_to_one_hot(y_train, n_classes=10)
Y_valid = dense_to_one_hot(y_valid, n_classes=10)
Y_test = dense_to_one_hot(y_test, n_classes=10)


Y_train = np.reshape(Y_train,[y_train.size//3,3,10])
Y_valid = np.reshape(Y_valid,[y_valid.size//3,3,10])
Y_test = np.reshape(Y_test,[y_test.size//3,3,10])

# %% Placeholders for 100x100 resolution
x = tf.placeholder(tf.float32, [None, 10000])
y = tf.placeholder(tf.float32, [None,3, 10])


x_tensor = tf.reshape(x, [-1, 100, 100, 1])

y_tensor = tf.reshape(y,[-1 ,10])

#%% localizaton network

keep_prob = tf.placeholder(tf.float32)

l_pool0_loc = tf.nn.max_pool(x_tensor,ksize=[1,2,2,1],strides=[1,2,2,1],padding='VALID')

W_conv0_loc = weight_variable([3,3,1,20],'W_conv0_loc')

b_conv0_loc = bias_variable([20],'b_conv0_loc')

l_conv0_loc = tf.nn.relu(tf.nn.conv2d(l_pool0_loc,W_conv0_loc,strides=[1,1,1,1],padding='VALID')+b_conv0_loc)

l_pool1_loc = tf.nn.max_pool(l_conv0_loc,ksize=[1,2,2,1],strides =[1,2,2,1],padding='VALID')

W_conv1_loc = weight_variable([3,3,20,20],'W_conv1_loc')   

b_conv1_loc = bias_variable([20],'b_conv1_loc')

l_conv1_loc =  tf.nn.relu(tf.nn.conv2d(l_pool1_loc,W_conv1_loc,strides=[1,1,1,1],padding='VALID')+b_conv1_loc)

l_pool2_loc = tf.nn.max_pool(l_conv1_loc,ksize=[1,2,2,1],strides =[1,2,2,1],padding='VALID')

W_conv2_loc = weight_variable([3,3,20,20],'W_conv2_loc')

b_conv2_loc = bias_variable([20],'b_conv2_loc')

l_conv2_loc = tf.nn.relu(tf.nn.conv2d(l_pool2_loc,W_conv2_loc,strides=[1,1,1,1],padding='VALID')+b_conv2_loc )

l_conv2_loc = tf.reshape(l_conv2_loc,[-1 ,9*9*20 ])

# Replicate input for Gated Recurrent Unit
l_conv2_loc = tf.tile(l_conv2_loc,[1,3])

l_conv2_loc = tf.split(1,3,l_conv2_loc)

# Gated Recurrent Unit

gru_cell = rnn_cell.GRUCell(num_units=256)

output, state = rnn.rnn(gru_cell,inputs=l_conv2_loc,dtype=tf.float32)

output = tf.reshape(output,[-1,256])

initial = tf.zeros([256,6]) 


W_fc1_loc = tf.Variable(initial_value=initial,name='W_fc1_loc')

# Use identity transformation as starting point
initial = np.array([[1., 0, 0], [0, 1., 0]])
initial = initial.astype('float32')
initial = initial.flatten()
b_fc1_loc = tf.Variable(initial_value=initial,name='b_fc1_loc')


l_fc1_loc = tf.add(tf.matmul(output,W_fc1_loc), b_fc1_loc)


# %% We'll create a spatial transformer module to identify discriminative patches

downsample = 3

out_size = (100//downsample, 100//downsample)


l_transform = transformer(tf.tile(x_tensor,[3,1,1,1]), l_fc1_loc, out_size)

# %% Classification Network


W_conv0_out = weight_variable([3,3,1,32],'W_conv0_out')                   

b_conv0_out = bias_variable([32],'b_conv0_out')

l_conv0_out = tf.nn.relu(tf.nn.conv2d(l_transform,W_conv0_out,strides=[1,1,1,1],padding='VALID')+b_conv0_out)

l_pool1_out = tf.nn.max_pool(l_conv0_out,ksize=[1,2,2,1], strides=[1,2,2,1],padding='VALID')

#l_drp1_out = tf.nn.dropout(l_pool1_out,keep_prob)

W_conv1_out = weight_variable([3,3,32,32],'W_conv1_out')  

b_conv1_out = bias_variable([32],'b_conv1_out')

l_conv1_out = tf.nn.relu(tf.nn.conv2d(l_pool1_out,W_conv1_out,strides=[1,1,1,1],padding='VALID')+b_conv1_out)

l_pool2_out = tf.nn.max_pool(l_conv1_out,ksize=[1,2,2,1], strides=[1,2,2,1],padding='VALID')

#l_drp2_out = tf.nn.dropout(l_pool2_out,keep_prob)

W_conv2_out = weight_variable([3,3,32,32],'W_conv2_out')     

b_conv2_out = bias_variable([32],'b_conv2_out')

l_conv2_out = tf.nn.relu(tf.nn.conv2d(l_pool2_out,W_conv2_out,strides=[1,1,1,1],padding='VALID')+b_conv2_out)



# %% We'll now reshape so we can connect to a fully-connected layer:
l_conv2_out_flat = tf.reshape(l_conv2_out, [-1, 4*4*32])

# %% Create a fully-connected layer:
n_fc = 400

W_fc1 = tf.get_variable('W_fc1',shape=[4*4*32,n_fc],initializer=tf.contrib.layers.xavier_initializer())

#W_fc1 = weight_variable([4*4*32,n_fc],'W_fc1')

b_fc1=bias_variable([n_fc],'b_fc1')


h_fc1 = tf.nn.relu(tf.add(tf.matmul(l_conv2_out_flat, W_fc1) , b_fc1))

# %% And finally our softmax layer:

W_fc2 = tf.get_variable('W_fc2',shape=[n_fc, 10],initializer=tf.contrib.layers.xavier_initializer())

#W_fc2 = weight_variable([n_fc,10],'W_fc2')

b_fc2=bias_variable([10],'b_fc2')

y_logits = tf.add(tf.matmul(h_fc1, W_fc2) , b_fc2)



# %% Monitor accuracy



correct_prediction = tf.equal(tf.argmax(y_logits, 1), tf.argmax(y_tensor, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))


# %% Define loss/eval/training functions
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(y_logits,y_tensor))

opt = tf.train.RMSPropOptimizer(0.0005,epsilon=1e-6)

#opt = tf.train.AdagradOptimizer(0.01)
#optimizer = opt.minimize(cross_entropy)




gvs = opt.compute_gradients(cross_entropy)

capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]

optimizer = opt.apply_gradients(capped_gvs )




# %% We'll now train in minibatches and report accuracy, loss:

num_batches = 600
n_epochs = 300
batch_size = 100


with tf.Session( ) as sess:

     sess.run(tf.initialize_all_variables())

     for epoch_i in range(n_epochs):

         # print('epoch: ' + str(epoch_i))
         shuffle = np.random.permutation(X_train.shape[0])
         avg_cost = 0.
         for iter_i in range(num_batches - 1):
             idx = shuffle[iter_i*batch_size:(iter_i+1)*batch_size]
             batch_xs = X_train[idx]
             batch_ys = Y_train[idx]


             _,c=sess.run([optimizer,cross_entropy], feed_dict={x: batch_xs, y: batch_ys})

             avg_cost += c / num_batches
             print('iter: ' + str(iter_i) +' >> ' +' MiniBatch Cost: ' +str(c)) 

     #   gr_print= sess.run([grads for grads,_  in gvs], feed_dict={x : batch_xs, y : batch_ys}) 
     #   print ('iter: '+str(iter_i))
     #   for t in gr_print:
      #      print np.linalg.norm(t)



     # Save the model while the session is still open.
     saver = tf.train.Saver()
     saver.save(sess, "save/my-model")


Upvotes: 1

Views: 1389

Answers (1)

Shamane Siriwardhana

Reputation: 4201

You can use dropout; it is often very helpful. If you are using an LSTM or RNN, you can add dropout very easily:

def create_rnn_cell(self):
    encoDecoCell = tf.contrib.rnn.BasicLSTMCell(  # Or GRUCell, LSTMCell(args.hiddenSize)
        self.args.hiddenSize,
    )
    if not self.args.test:  # TODO: Should use a placeholder instead
        encoDecoCell = tf.contrib.rnn.DropoutWrapper(  # apply dropout to the cell
            encoDecoCell,
            input_keep_prob=1.0,
            output_keep_prob=self.args.dropout
        )
    return encoDecoCell
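
Applied to the GRU cell in the question, a minimal sketch might look like the following (it reuses the existing keep_prob placeholder and assumes your TensorFlow version exposes rnn_cell.DropoutWrapper; feed e.g. keep_prob=0.5 while training and 1.0 when evaluating):

gru_cell = rnn_cell.GRUCell(num_units=256)
# Wrap the cell so dropout is applied to its outputs; keep_prob is the
# placeholder already defined in the question's code.
gru_cell = rnn_cell.DropoutWrapper(gru_cell,
                                   input_keep_prob=1.0,
                                   output_keep_prob=keep_prob)
output, state = rnn.rnn(gru_cell, inputs=l_conv2_loc, dtype=tf.float32)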

Batch normalization is also effective. I haven't seen many batch-norm examples implemented in TF specifically for RNN modules, but here is a good example to learn about batch norm:

Batch Normalization in tensorflow

Also read this article on applying batch normalization to RNNs:

http://olavnymoen.com/2016/07/07/rnn-batch-normalization
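
As a rough sketch (not taken from either link), batch normalization can be inserted before a ReLU with tf.contrib.layers.batch_norm; the is_training placeholder is new here, and l_conv2_out_flat / W_fc1 are simply reused from the question's code for illustration:

# Sketch: batch-normalize the fully connected pre-activation before the ReLU.
# Feed is_training=True for training batches and False for evaluation so the
# moving averages are used at test time.
is_training = tf.placeholder(tf.bool)

pre_activation = tf.matmul(l_conv2_out_flat, W_fc1)
h_fc1 = tf.nn.relu(tf.contrib.layers.batch_norm(pre_activation,
                                                center=True, scale=True,
                                                is_training=is_training,
                                                updates_collections=None))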

Upvotes: 1
