Reputation: 11
I'm trying to replicate the Recurrent Spatial Transformer Network implemented here (https://github.com/skaae/recurrent-spatial-transformer-code), but the loss doesn't decrease at all.
The configuration of the network is as follows:
1 - ReLU activations.
2 - Xavier initialization for weights, zero initialization for biases (see the sketch after this list).
3 - cost function is softmax_cross_entropy_with_logits.
4 - optimizer is RMSProp (I tried epsilon values of 1e-6 and 1e-10).
5 - gradient clipping by value.
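For reference, here is a minimal sketch of what the weight_variable / bias_variable helpers imported from tf_utils are assumed to do (Xavier initialization for weights, zeros for biases); the actual tf_utils implementation may differ:
import tensorflow as tf

def weight_variable(shape, name):
    # assumed helper: Xavier/Glorot-initialized weight matrix or conv filter
    return tf.get_variable(name, shape=shape,
                           initializer=tf.contrib.layers.xavier_initializer())

def bias_variable(shape, name):
    # assumed helper: zero-initialized bias vector
    return tf.get_variable(name, shape=shape,
                           initializer=tf.constant_initializer(0.0))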
So what should I try next?
Below is the detailed code:
import tensorflow as tf
from spatial_transformer import transformer
from tensorflow.python.ops import rnn,rnn_cell
import numpy as np
from tf_utils import weight_variable, bias_variable, dense_to_one_hot
# %% load data
mnist_cluttered = np.load('data/mnist_sequence3_sample_8distortions_9x9.npz')
X_train = mnist_cluttered['X_train']
y_train = mnist_cluttered['y_train']
X_valid = mnist_cluttered['X_valid']
y_valid = mnist_cluttered['y_valid']
X_test = mnist_cluttered['X_test']
y_test = mnist_cluttered['y_test']
y_train = np.reshape(y_train,[y_train.size,1])
y_valid = np.reshape(y_valid,[y_valid.size,1])
y_test = np.reshape(y_test,[y_test.size,1])
# % turn from dense to one hot representation
Y_train = dense_to_one_hot(y_train, n_classes=10)
Y_valid = dense_to_one_hot(y_valid, n_classes=10)
Y_test = dense_to_one_hot(y_test, n_classes=10)
Y_train = np.reshape(Y_train,[y_train.size/3,3,10])
Y_valid = np.reshape(Y_valid,[y_valid.size/3,3,10])
Y_test = np.reshape(Y_test,[y_test.size/3,3,10])
# %% Placeholders for 100x100 resolution
x = tf.placeholder(tf.float32, [None, 10000])
y = tf.placeholder(tf.float32, [None,3, 10])
x_tensor = tf.reshape(x, [-1, 100, 100, 1])
y_tensor = tf.reshape(y,[-1 ,10])
#%% localizaton network
keep_prob = tf.placeholder(tf.float32)
l_pool0_loc = tf.nn.max_pool(x_tensor,ksize=[1,2,2,1],strides=[1,2,2,1],padding='VALID')
W_conv0_loc = weight_variable([3,3,1,20],'W_conv0_loc')
b_conv0_loc = bias_variable([20],'b_conv0_loc')
l_conv0_loc = tf.nn.relu(tf.nn.conv2d(l_pool0_loc,W_conv0_loc,strides=[1,1,1,1],padding='VALID')+b_conv0_loc)
l_pool1_loc = tf.nn.max_pool(l_conv0_loc,ksize=[1,2,2,1],strides=[1,2,2,1],padding='VALID')
W_conv1_loc = weight_variable([3,3,20,20],'W_conv1_loc')
b_conv1_loc = bias_variable([20],'b_conv1_loc')
l_conv1_loc = tf.nn.relu(tf.nn.conv2d(l_pool1_loc,W_conv1_loc,strides=[1,1,1,1],padding='VALID')+b_conv1_loc)
l_pool2_loc = tf.nn.max_pool(l_conv1_loc,ksize=[1,2,2,1],strides=[1,2,2,1],padding='VALID')
W_conv2_loc = weight_variable([3,3,20,20],'W_conv2_loc')
b_conv2_loc = bias_variable([20],'b_conv2_loc')
l_conv2_loc = tf.nn.relu(tf.nn.conv2d(l_pool2_loc,W_conv2_loc,strides=[1,1,1,1],padding='VALID')+b_conv2_loc)
l_conv2_loc = tf.reshape(l_conv2_loc,[-1 ,9*9*20 ])
# Replicate input for Gated Recurrent Unit
l_conv2_loc = tf.tile(l_conv2_loc,[1,3])
l_conv2_loc = tf.split(1,3,l_conv2_loc)
# Gated Recurrent Unit
gru_cell = rnn_cell.GRUCell(num_units=256)
output, state = rnn.rnn(gru_cell,inputs=l_conv2_loc,dtype=tf.float32)
output = tf.reshape(output,[-1,256])
initial = tf.zeros([256,6])
W_fc1_loc = tf.Variable(initial_value=initial,name='W_fc1_loc')
# Use identity transformation as starting point
initial = np.array([[1., 0, 0], [0, 1., 0]])
initial = initial.astype('float32')
initial = initial.flatten()
b_fc1_loc = tf.Variable(initial_value=initial,name='b_fc1_loc')
l_fc1_loc = tf.add(tf.matmul(output,W_fc1_loc), b_fc1_loc)
# %% We'll create a spatial transformer module to identify discriminative patches
downsample = 3
out_size = (100/downsample, 100/downsample)
l_transform = transformer(tf.tile(x_tensor,[3,1,1,1]), l_fc1_loc, out_size)
# %% Classification Network
W_conv0_out = weight_variable([3,3,1,32],'W_conv0_out')
b_conv0_out = bias_variable([32],'b_conv0_out')
l_conv0_out = tf.nn.relu(tf.nn.conv2d(l_transform,W_conv0_out,strides=[1,1,1,1],padding='VALID')+b_conv0_out)
l_pool1_out = tf.nn.max_pool(l_conv0_out,ksize=[1,2,2,1], strides=[1,2,2,1],padding='VALID')
#l_drp1_out = tf.nn.dropout(l_pool1_out,keep_prob)
W_conv1_out = weight_variable([3,3,32,32],'W_conv1_out')
b_conv1_out = bias_variable([32],'b_conv1_out')
l_conv1_out = tf.nn.relu(tf.nn.conv2d(l_pool1_out,W_conv1_out,strides=[1,1,1,1],padding='VALID')+b_conv1_out)
l_pool2_out = tf.nn.max_pool(l_conv1_out,ksize=[1,2,2,1], strides=[1,2,2,1],padding='VALID')
#l_drp2_out = tf.nn.dropout(l_pool2_out,keep_prob)
W_conv2_out = weight_variable([3,3,32,32],'W_conv2_out')
b_conv2_out = bias_variable([32],'b_conv2_out')
l_conv2_out = tf.nn.relu(tf.nn.conv2d(l_pool2_out,W_conv2_out,strides=[1,1,1,1],padding='VALID')+b_conv2_out)
# %% We'll now reshape so we can connect to a fully-connected layer:
l_conv2_out_flat = tf.reshape(l_conv2_out, [-1, 4*4*32])
# %% Create a fully-connected layer:
n_fc = 400
W_fc1 = tf.get_variable('W_fc1',shape=[4*4*32,n_fc],initializer=tf.contrib.layers.xavier_initializer())
#W_fc1 = weight_variable([4*4*32,n_fc],'W_fc1')
b_fc1=bias_variable([n_fc],'b_fc1')
h_fc1 = tf.nn.relu(tf.add(tf.matmul(l_conv2_out_flat, W_fc1) , b_fc1))
# %% And finally our softmax layer:
W_fc2 = tf.get_variable('W_fc2',shape=[n_fc, 10],initializer=tf.contrib.layers.xavier_initializer())
#W_fc2 = weight_variable([n_fc,10],'W_fc2')
b_fc2=bias_variable([10],'b_fc2')
y_logits = tf.add(tf.matmul(h_fc1, W_fc2) , b_fc2)
# %% Monitor accuracy
correct_prediction = tf.equal(tf.argmax(y_logits, 1), tf.argmax(y_tensor, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))
# %% Define loss/eval/training functions
cross_entropy = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(y_logits,y_tensor))
opt = tf.train.RMSPropOptimizer(0.0005,epsilon=1e-6)
#opt = tf.train.AdagradOptimizer(0.01)
#optimizer = opt.minimize(cross_entropy)
gvs = opt.compute_gradients(cross_entropy)
capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
optimizer = opt.apply_gradients(capped_gvs)
# %% We'll now train in minibatches and report accuracy, loss:
num_batches = 600
n_epochs = 300
batch_size = 100
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    for epoch_i in range(n_epochs):
        # print('epoch: ' + str(epoch_i))
        shuffle = np.random.permutation(X_train.shape[0])
        avg_cost = 0.
        for iter_i in range(num_batches - 1):
            idx = shuffle[iter_i*batch_size:(iter_i+1)*batch_size]
            batch_xs = X_train[idx]
            batch_ys = Y_train[idx]
            _, c = sess.run([optimizer, cross_entropy], feed_dict={x: batch_xs, y: batch_ys})
            avg_cost += c / num_batches
            print('iter: ' + str(iter_i) + ' >> ' + ' MiniBatch Cost: ' + str(c))
            # gr_print = sess.run([grads for grads, _ in gvs], feed_dict={x: batch_xs, y: batch_ys})
            # print('iter: ' + str(iter_i))
            # for t in gr_print:
            #     print(np.linalg.norm(t))
    saver = tf.train.Saver()
    saver.save(sess, "save/my-model")
Upvotes: 1
Views: 1389
Reputation: 4201
Well, you can use dropout; it will be very useful. If you are using an LSTM or other RNN cell, you can implement dropout very easily:
def create_rnn_cell():
    encoDecoCell = tf.contrib.rnn.BasicLSTMCell(  # or GRUCell, LSTMCell(args.hiddenSize)
        self.args.hiddenSize,
    )
    if not self.args.test:  # TODO: should use a placeholder instead
        encoDecoCell = tf.contrib.rnn.DropoutWrapper(  # apply dropout
            encoDecoCell,
            input_keep_prob=1.0,
            output_keep_prob=self.args.dropout
        )
    return encoDecoCell
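In your network you could apply the same idea by wrapping the GRU cell. A minimal sketch, assuming the TF 1.x tf.contrib.rnn API (the older tensorflow.python.ops.rnn_cell module you import exposes an equivalent DropoutWrapper):
keep_prob = tf.placeholder(tf.float32)   # you already define this placeholder
gru_cell = tf.contrib.rnn.GRUCell(num_units=256)
gru_cell = tf.contrib.rnn.DropoutWrapper(gru_cell,
                                         input_keep_prob=1.0,
                                         output_keep_prob=keep_prob)
# feed keep_prob < 1.0 during training and keep_prob = 1.0 at evaluation time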
Batch normalization is also effective. I haven't seen many batch-norm examples implemented in TensorFlow specifically for RNN modules, but here is a good example to learn about batch norm:
Batch Normalization in tensorflow
Also read this article, which shows how to apply batch normalization to an RNN:
http://olavnymoen.com/2016/07/07/rnn-batch-normalization
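As a rough illustration only (assuming the TF 1.x tf.layers API, which may differ from the version you are running), batch normalization on the fully connected layer of your classification network could look like this; note that the moving-average update ops have to run together with the train op:
is_training = tf.placeholder(tf.bool)  # hypothetical flag: batch statistics in training, moving averages at test time
h_fc1_bn = tf.layers.batch_normalization(tf.matmul(l_conv2_out_flat, W_fc1),
                                         training=is_training)  # BN adds its own beta, so b_fc1 is not needed here
h_fc1 = tf.nn.relu(h_fc1_bn)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optimizer = opt.apply_gradients(capped_gvs)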
Upvotes: 1