Reputation: 71
In the code below I'm using 10 conv layers and then an LSTM to compute the output.
If I use 1 conv layer followed by an LSTM, it works fine. But as soon as I add more conv layers (10 in the code below), the loss becomes huge and the accuracy starts to decrease. I've applied batch norm after each conv layer to make sure the gradients don't vanish. To check that the network can overfit, I train it on just 5 to 10 examples, but instead of overfitting it gives me huge losses; if I reduce the number of conv layers it works fine, and if I add more examples the loss decreases up to a certain point and then stops. What's the bug here?
EDIT: Here is the reproducible code if you want to try - link
import tensorflow as tf

# time_steps, embedding, _units and num_classes are assumed to be defined in the full script (see link above).
# X holds a batch of sequences, Y the integer class labels,
# A is the batch-norm is_training flag and B the dropout keep probability.
X = tf.placeholder(tf.float32, [None, time_steps, embedding])
Y = tf.placeholder(tf.int32, [None])
A = tf.placeholder(tf.bool)
B = tf.placeholder(tf.float32)

# Add a channel dimension: [batch, time_steps, embedding, 1]
x = tf.expand_dims(X, 3)
# conv layer 1: the filter spans the full embedding dimension, 1 -> 64 channels
filter_shape = [1, embedding, 1, 64]
conv_weights = tf.get_variable("conv_weights1", filter_shape, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases = tf.Variable(tf.constant(0.1, shape=[64]))
conv = tf.nn.conv2d(x, conv_weights, strides=[1, 1, 1, 1], padding="VALID")
normalize = tf.nn.elu(conv + conv_biases)   # ELU activation, then batch norm
tf_normalize = tf.contrib.layers.batch_norm(inputs=normalize, is_training=A)
outputs_fed_lstm = tf_normalize

# conv layer 2: 1x1 filters, 64 -> 64 channels
filter_shape2 = [1, 1, 64, 64]
conv_weights2 = tf.get_variable("conv_weights2", filter_shape2, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases2 = tf.Variable(tf.constant(0.1, shape=[64]))
conv2 = tf.nn.conv2d(outputs_fed_lstm, conv_weights2, strides=[1, 1, 1, 1], padding="VALID")
normalize2 = tf.nn.elu(conv2 + conv_biases2)
tf_normalize2 = tf.contrib.layers.batch_norm(inputs=normalize2, is_training=A)
outputs_fed_lstm2 = tf_normalize2

# conv layer 3: 1x1 filters, 64 -> 64 channels
filter_shape3 = [1, 1, 64, 64]
conv_weights3 = tf.get_variable("conv_weights3", filter_shape3, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases3 = tf.Variable(tf.constant(0.1, shape=[64]))
conv3 = tf.nn.conv2d(outputs_fed_lstm2, conv_weights3, strides=[1, 1, 1, 1], padding="VALID")
normalize3 = tf.nn.elu(conv3 + conv_biases3)
tf_normalize3 = tf.contrib.layers.batch_norm(inputs=normalize3, is_training=A)
outputs_fed_lstm3 = tf_normalize3

# conv layer 4: 1x1 filters, 64 -> 128 channels
filter_shape4 = [1, 1, 64, 128]
conv_weights4 = tf.get_variable("conv_weights4", filter_shape4, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases4 = tf.Variable(tf.constant(0.1, shape=[128]))
conv4 = tf.nn.conv2d(outputs_fed_lstm3, conv_weights4, strides=[1, 1, 1, 1], padding="VALID")
normalize4 = tf.nn.elu(conv4 + conv_biases4)
tf_normalize4 = tf.contrib.layers.batch_norm(inputs=normalize4, is_training=A)
outputs_fed_lstm4 = tf_normalize4

# conv layer 5: 1x1 filters, 128 -> 128 channels
filter_shape5 = [1, 1, 128, 128]
conv_weights5 = tf.get_variable("conv_weights5", filter_shape5, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases5 = tf.Variable(tf.constant(0.1, shape=[128]))
conv5 = tf.nn.conv2d(outputs_fed_lstm4, conv_weights5, strides=[1, 1, 1, 1], padding="VALID")
normalize5 = tf.nn.elu(conv5 + conv_biases5)
tf_normalize5 = tf.contrib.layers.batch_norm(inputs=normalize5, is_training=A)
outputs_fed_lstm5 = tf_normalize5

# conv layer 6: 1x1 filters, 128 -> 128 channels
filter_shape6 = [1, 1, 128, 128]
conv_weights6 = tf.get_variable("conv_weights6", filter_shape6, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases6 = tf.Variable(tf.constant(0.1, shape=[128]))
conv6 = tf.nn.conv2d(outputs_fed_lstm5, conv_weights6, strides=[1, 1, 1, 1], padding="VALID")
normalize6 = tf.nn.elu(conv6 + conv_biases6)
tf_normalize6 = tf.contrib.layers.batch_norm(inputs=normalize6, is_training=A)
outputs_fed_lstm6 = tf_normalize6

# conv layer 7: 1x1 filters, 128 -> 256 channels
filter_shape7 = [1, 1, 128, 256]
conv_weights7 = tf.get_variable("conv_weights7", filter_shape7, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases7 = tf.Variable(tf.constant(0.1, shape=[256]))
conv7 = tf.nn.conv2d(outputs_fed_lstm6, conv_weights7, strides=[1, 1, 1, 1], padding="VALID")
normalize7 = tf.nn.elu(conv7 + conv_biases7)
tf_normalize7 = tf.contrib.layers.batch_norm(inputs=normalize7, is_training=A)
outputs_fed_lstm7 = tf_normalize7

# conv layer 8: 1x1 filters, 256 -> 256 channels
filter_shape8 = [1, 1, 256, 256]
conv_weights8 = tf.get_variable("conv_weights8", filter_shape8, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases8 = tf.Variable(tf.constant(0.1, shape=[256]))
conv8 = tf.nn.conv2d(outputs_fed_lstm7, conv_weights8, strides=[1, 1, 1, 1], padding="VALID")
normalize8 = tf.nn.elu(conv8 + conv_biases8)
tf_normalize8 = tf.contrib.layers.batch_norm(inputs=normalize8, is_training=A)
outputs_fed_lstm8 = tf_normalize8

# conv layer 9: 1x1 filters, 256 -> 256 channels
filter_shape9 = [1, 1, 256, 256]
conv_weights9 = tf.get_variable("conv_weights9", filter_shape9, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases9 = tf.Variable(tf.constant(0.1, shape=[256]))
conv9 = tf.nn.conv2d(outputs_fed_lstm8, conv_weights9, strides=[1, 1, 1, 1], padding="VALID")
normalize9 = tf.nn.elu(conv9 + conv_biases9)
tf_normalize9 = tf.contrib.layers.batch_norm(inputs=normalize9, is_training=A)
outputs_fed_lstm9 = tf_normalize9

# conv layer 10: 1x1 filters, 256 -> 512 channels
filter_shape0 = [1, 1, 256, 512]
conv_weights0 = tf.get_variable("conv_weights0", filter_shape0, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases0 = tf.Variable(tf.constant(0.1, shape=[512]))
conv0 = tf.nn.conv2d(outputs_fed_lstm9, conv_weights0, strides=[1, 1, 1, 1], padding="VALID")
normalize0 = tf.nn.elu(conv0 + conv_biases0)
tf_normalize0 = tf.contrib.layers.batch_norm(inputs=normalize0, is_training=A)
outputs_fed_lstm0 = tf_normalize0
# Dropout on the final conv output, then reshape for the LSTM.
outputs_fed_lstm10 = tf.nn.dropout(x=outputs_fed_lstm0, keep_prob=B)
x = tf.squeeze(outputs_fed_lstm10, [2])    # [batch, time_steps, 512]
x = tf.transpose(x, [1, 0, 2])             # [time_steps, batch, 512]
x = tf.reshape(x, [-1, 512])
x = tf.split(0, time_steps, x)             # list of time_steps tensors, each [batch, 512]

lstm = tf.nn.rnn_cell.LSTMCell(num_units=_units, state_is_tuple=True)
# multi_lstm = tf.nn.rnn_cell.MultiRNNCell([lstm] * lstm_layers, state_is_tuple=True)
outputs, state = tf.nn.rnn(lstm, x, dtype=tf.float32)

# Classify from the last LSTM output.
weights = tf.Variable(tf.random_normal([_units, num_classes]))
biases = tf.Variable(tf.random_normal([num_classes]))
logits = tf.matmul(outputs[-1], weights) + biases
c_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, Y)
loss = tf.reduce_mean(c_loss)

# Adam with an exponentially decaying learning rate.
global_step = tf.Variable(0, name="global_step", trainable=False)
decayed_learning_rate = tf.train.exponential_decay(learning_rate=0.01, global_step=global_step, decay_steps=300, decay_rate=0.96, staircase=True)
optimizer = tf.train.AdamOptimizer(learning_rate=decayed_learning_rate)
# grads_and_vars = optimizer.compute_gradients(loss, [conv_weights0])
minimize_loss = optimizer.minimize(loss, global_step=global_step)

correct_predict = tf.nn.in_top_k(logits, Y, 1)
accuracy = tf.reduce_mean(tf.cast(correct_predict, tf.float32))
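For reference, the overfit check described above can be run with a minimal loop like the one below. This is only a sketch, not part of the original script: X_small and Y_small are hypothetical NumPy arrays holding the 5 to 10 examples, shaped [num_examples, time_steps, embedding] and [num_examples].

# Sketch of the overfit sanity check: train repeatedly on the same handful of
# examples and watch whether the loss drops towards zero.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # tf.initialize_all_variables() on older TF releases
    for step in range(500):
        _, step_loss, step_acc = sess.run(
            [minimize_loss, loss, accuracy],
            feed_dict={X: X_small, Y: Y_small, A: True, B: 1.0})  # keep_prob = 1.0: no dropout for this check
        if step % 50 == 0:
            print(step, step_loss, step_acc)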
Upvotes: 0
Views: 1195
Reputation: 71
I've figured out why this is occurring: when the number of filters per layer is chosen arbitrarily, e.g. 56, 86 and then 496, this sort of problem tends to occur; no matter how many layers you add, the result is a huge loss and very low accuracy. The solution is to follow a regular pattern such as 64, 128, 256, 512.
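For what it's worth, here is a minimal sketch (not the original code) of building the same conv stack from such a channel schedule; it reuses the placeholders X and A and the time_steps/embedding sizes from the question.

# Channel widths follow the regular 64 -> 128 -> 256 -> 512 progression
# instead of arbitrary sizes such as 56, 86, 496.
channels = [64, 64, 64, 128, 128, 128, 256, 256, 256, 512]
out = tf.expand_dims(X, 3)      # [batch, time_steps, embedding, 1]
in_ch, width = 1, embedding     # the first filter spans the whole embedding dimension
for i, out_ch in enumerate(channels):
    w = tf.get_variable("conv_w%d" % i, [1, width, in_ch, out_ch], tf.float32,
                        tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
    b = tf.Variable(tf.constant(0.1, shape=[out_ch]))
    conv = tf.nn.conv2d(out, w, strides=[1, 1, 1, 1], padding="VALID")
    out = tf.contrib.layers.batch_norm(inputs=tf.nn.elu(conv + b), is_training=A)
    in_ch, width = out_ch, 1    # later filters are 1x1
# `out` now has shape [batch, time_steps, 1, 512] and can be fed to the LSTM exactly as in the question.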
Upvotes: 1