Reputation: 71
In the code below I'm using 10 conv layers and then an LSTM to compute the output.
If I use 1 conv layer followed by an LSTM, it works fine. But as soon as I add more conv layers (10 in the code below), the loss becomes huge and the accuracy starts to decrease. I've applied batch norm after each conv layer to make sure the gradients don't vanish. To check that the network can overfit, I train it on just 5 to 10 examples, but instead of overfitting it gives me huge losses; if I reduce the number of conv layers it works fine, and if I add more examples the loss decreases up to a certain point and then stops. What's the bug here?
EDIT: Here is the reproducible code if you want to try - link
import tensorflow as tf

# time_steps, embedding, _units and num_classes are assumed to be defined in the full script (see link above).
# X holds a batch of sequences, Y the integer class labels,
# A is the batch-norm is_training flag and B the dropout keep probability.
X = tf.placeholder(tf.float32, [None, time_steps, embedding])
Y = tf.placeholder(tf.int32, [None])
A = tf.placeholder(tf.bool)
B = tf.placeholder(tf.float32)

# Add a channel dimension: [batch, time_steps, embedding, 1]
x = tf.expand_dims(X, 3)
# conv layer 1: the filter spans the full embedding dimension, 1 -> 64 channels
filter_shape = [1, embedding, 1, 64]
conv_weights = tf.get_variable("conv_weights1", filter_shape, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases = tf.Variable(tf.constant(0.1, shape=[64]))
conv = tf.nn.conv2d(x, conv_weights, strides=[1, 1, 1, 1], padding="VALID")
normalize = tf.nn.elu(conv + conv_biases)   # ELU activation, then batch norm
tf_normalize = tf.contrib.layers.batch_norm(inputs=normalize, is_training=A)
outputs_fed_lstm = tf_normalize

# conv layer 2: 1x1 filters, 64 -> 64 channels
filter_shape2 = [1, 1, 64, 64]
conv_weights2 = tf.get_variable("conv_weights2", filter_shape2, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases2 = tf.Variable(tf.constant(0.1, shape=[64]))
conv2 = tf.nn.conv2d(outputs_fed_lstm, conv_weights2, strides=[1, 1, 1, 1], padding="VALID")
normalize2 = tf.nn.elu(conv2 + conv_biases2)
tf_normalize2 = tf.contrib.layers.batch_norm(inputs=normalize2, is_training=A)
outputs_fed_lstm2 = tf_normalize2

# conv layer 3: 1x1 filters, 64 -> 64 channels
filter_shape3 = [1, 1, 64, 64]
conv_weights3 = tf.get_variable("conv_weights3", filter_shape3, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases3 = tf.Variable(tf.constant(0.1, shape=[64]))
conv3 = tf.nn.conv2d(outputs_fed_lstm2, conv_weights3, strides=[1, 1, 1, 1], padding="VALID")
normalize3 = tf.nn.elu(conv3 + conv_biases3)
tf_normalize3 = tf.contrib.layers.batch_norm(inputs=normalize3, is_training=A)
outputs_fed_lstm3 = tf_normalize3

# conv layer 4: 1x1 filters, 64 -> 128 channels
filter_shape4 = [1, 1, 64, 128]
conv_weights4 = tf.get_variable("conv_weights4", filter_shape4, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases4 = tf.Variable(tf.constant(0.1, shape=[128]))
conv4 = tf.nn.conv2d(outputs_fed_lstm3, conv_weights4, strides=[1, 1, 1, 1], padding="VALID")
normalize4 = tf.nn.elu(conv4 + conv_biases4)
tf_normalize4 = tf.contrib.layers.batch_norm(inputs=normalize4, is_training=A)
outputs_fed_lstm4 = tf_normalize4

# conv layer 5: 1x1 filters, 128 -> 128 channels
filter_shape5 = [1, 1, 128, 128]
conv_weights5 = tf.get_variable("conv_weights5", filter_shape5, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases5 = tf.Variable(tf.constant(0.1, shape=[128]))
conv5 = tf.nn.conv2d(outputs_fed_lstm4, conv_weights5, strides=[1, 1, 1, 1], padding="VALID")
normalize5 = tf.nn.elu(conv5 + conv_biases5)
tf_normalize5 = tf.contrib.layers.batch_norm(inputs=normalize5, is_training=A)
outputs_fed_lstm5 = tf_normalize5

# conv layer 6: 1x1 filters, 128 -> 128 channels
filter_shape6 = [1, 1, 128, 128]
conv_weights6 = tf.get_variable("conv_weights6", filter_shape6, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases6 = tf.Variable(tf.constant(0.1, shape=[128]))
conv6 = tf.nn.conv2d(outputs_fed_lstm5, conv_weights6, strides=[1, 1, 1, 1], padding="VALID")
normalize6 = tf.nn.elu(conv6 + conv_biases6)
tf_normalize6 = tf.contrib.layers.batch_norm(inputs=normalize6, is_training=A)
outputs_fed_lstm6 = tf_normalize6

# conv layer 7: 1x1 filters, 128 -> 256 channels
filter_shape7 = [1, 1, 128, 256]
conv_weights7 = tf.get_variable("conv_weights7", filter_shape7, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases7 = tf.Variable(tf.constant(0.1, shape=[256]))
conv7 = tf.nn.conv2d(outputs_fed_lstm6, conv_weights7, strides=[1, 1, 1, 1], padding="VALID")
normalize7 = tf.nn.elu(conv7 + conv_biases7)
tf_normalize7 = tf.contrib.layers.batch_norm(inputs=normalize7, is_training=A)
outputs_fed_lstm7 = tf_normalize7

# conv layer 8: 1x1 filters, 256 -> 256 channels
filter_shape8 = [1, 1, 256, 256]
conv_weights8 = tf.get_variable("conv_weights8", filter_shape8, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases8 = tf.Variable(tf.constant(0.1, shape=[256]))
conv8 = tf.nn.conv2d(outputs_fed_lstm7, conv_weights8, strides=[1, 1, 1, 1], padding="VALID")
normalize8 = tf.nn.elu(conv8 + conv_biases8)
tf_normalize8 = tf.contrib.layers.batch_norm(inputs=normalize8, is_training=A)
outputs_fed_lstm8 = tf_normalize8

# conv layer 9: 1x1 filters, 256 -> 256 channels
filter_shape9 = [1, 1, 256, 256]
conv_weights9 = tf.get_variable("conv_weights9", filter_shape9, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases9 = tf.Variable(tf.constant(0.1, shape=[256]))
conv9 = tf.nn.conv2d(outputs_fed_lstm8, conv_weights9, strides=[1, 1, 1, 1], padding="VALID")
normalize9 = tf.nn.elu(conv9 + conv_biases9)
tf_normalize9 = tf.contrib.layers.batch_norm(inputs=normalize9, is_training=A)
outputs_fed_lstm9 = tf_normalize9

# conv layer 10: 1x1 filters, 256 -> 512 channels
filter_shape0 = [1, 1, 256, 512]
conv_weights0 = tf.get_variable("conv_weights0", filter_shape0, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases0 = tf.Variable(tf.constant(0.1, shape=[512]))
conv0 = tf.nn.conv2d(outputs_fed_lstm9, conv_weights0, strides=[1, 1, 1, 1], padding="VALID")
normalize0 = tf.nn.elu(conv0 + conv_biases0)
tf_normalize0 = tf.contrib.layers.batch_norm(inputs=normalize0, is_training=A)
outputs_fed_lstm0 = tf_normalize0
# Dropout on the final conv output, then reshape for the LSTM.
outputs_fed_lstm10 = tf.nn.dropout(x=outputs_fed_lstm0, keep_prob=B)
x = tf.squeeze(outputs_fed_lstm10, [2])    # [batch, time_steps, 512]
x = tf.transpose(x, [1, 0, 2])             # [time_steps, batch, 512]
x = tf.reshape(x, [-1, 512])
x = tf.split(0, time_steps, x)             # list of time_steps tensors, each [batch, 512]

lstm = tf.nn.rnn_cell.LSTMCell(num_units=_units, state_is_tuple=True)
# multi_lstm = tf.nn.rnn_cell.MultiRNNCell([lstm] * lstm_layers, state_is_tuple=True)
outputs, state = tf.nn.rnn(lstm, x, dtype=tf.float32)

# Classify from the last LSTM output.
weights = tf.Variable(tf.random_normal([_units, num_classes]))
biases = tf.Variable(tf.random_normal([num_classes]))
logits = tf.matmul(outputs[-1], weights) + biases
c_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, Y)
loss = tf.reduce_mean(c_loss)

# Adam with an exponentially decaying learning rate.
global_step = tf.Variable(0, name="global_step", trainable=False)
decayed_learning_rate = tf.train.exponential_decay(learning_rate=0.01, global_step=global_step, decay_steps=300, decay_rate=0.96, staircase=True)
optimizer = tf.train.AdamOptimizer(learning_rate=decayed_learning_rate)
# grads_and_vars = optimizer.compute_gradients(loss, [conv_weights0])
minimize_loss = optimizer.minimize(loss, global_step=global_step)

correct_predict = tf.nn.in_top_k(logits, Y, 1)
accuracy = tf.reduce_mean(tf.cast(correct_predict, tf.float32))
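For reference, the overfit check described above can be run with a minimal loop like the one below. This is only a sketch, not part of the original script: X_small and Y_small are hypothetical NumPy arrays holding the 5 to 10 examples, shaped [num_examples, time_steps, embedding] and [num_examples].

# Sketch of the overfit sanity check: train repeatedly on the same handful of
# examples and watch whether the loss drops towards zero.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # tf.initialize_all_variables() on older TF releases
    for step in range(500):
        _, step_loss, step_acc = sess.run(
            [minimize_loss, loss, accuracy],
            feed_dict={X: X_small, Y: Y_small, A: True, B: 1.0})  # keep_prob = 1.0: no dropout for this check
        if step % 50 == 0:
            print(step, step_loss, step_acc)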
Upvotes: 0
Views: 1195
Reputation: 71
I've figured out why this is occurring: when the number of filters per layer is chosen arbitrarily, e.g. 56, 86 and then 496, this sort of problem tends to occur; no matter how many layers you add, the result is a huge loss and very low accuracy. The solution is to follow a regular pattern such as 64, 128, 256, 512.
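For what it's worth, here is a minimal sketch (not the original code) of building the same conv stack from such a channel schedule; it reuses the placeholders X and A and the time_steps/embedding sizes from the question.

# Channel widths follow the regular 64 -> 128 -> 256 -> 512 progression
# instead of arbitrary sizes such as 56, 86, 496.
channels = [64, 64, 64, 128, 128, 128, 256, 256, 256, 512]
out = tf.expand_dims(X, 3)      # [batch, time_steps, embedding, 1]
in_ch, width = 1, embedding     # the first filter spans the whole embedding dimension
for i, out_ch in enumerate(channels):
    w = tf.get_variable("conv_w%d" % i, [1, width, in_ch, out_ch], tf.float32,
                        tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
    b = tf.Variable(tf.constant(0.1, shape=[out_ch]))
    conv = tf.nn.conv2d(out, w, strides=[1, 1, 1, 1], padding="VALID")
    out = tf.contrib.layers.batch_norm(inputs=tf.nn.elu(conv + b), is_training=A)
    in_ch, width = out_ch, 1    # later filters are 1x1
# `out` now has shape [batch, time_steps, 1, 512] and can be fed to the LSTM exactly as in the question.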
Upvotes: 1