Different loss values with same data, same initial state, same recurrent neural network

Question

I am writing a recurrent neural network (specifically, a ConvLSTM). Recently, I have noticed an interesting inconsistency that I cannot quite figure out. I have written this neural network from scratch using numpy (technically cupy for gpu) and a few Chainer lines (specifically for their F.convolution_2D function).

When running this same network twice, for the first 4 or so training examples, the losses are EXACTLY the same. However, around the 5th training example, the losses start to fluctuate in their value.

I have ensured that each time I am running this network, they are reading from the same initial state text file (and thus have the same initial weights and biases). I have also ensured that the data they are inputting are exactly the same.

Is there some inconsistency with Numpy that is the root of this problem? The only thing I can think that is different around the 4th training example is the first usage of gradient clipping. Is there some problem with numpy's linalg function? Is there some rounding error I am not familiar with? I have scanned through my code and there is no instance of utilizing random numbers.

I have added my backpropagation function below:

def bptt(x2, y2, iteration):
x = cp.asarray(x2)
y = cp.asarray(y2)

global connected_weights
global main_kernel
global bias_i
global bias_f
global bias_c
global bias_o
global bias_y
global learning_rate

# Perform forward prop
prediction, pre_sigmoid_prediction, hidden_prediction, i, f, a, c, o, h = forward_prop(x)
loss = calculate_loss(prediction, y)
print("LOSS BEFORE: ")
print(loss)
# Calculate loss with respect to final layer
dLdy_2 = loss_derivative(prediction, y)
# Calculate loss with respect to pre sigmoid layer
dLdy_1 = cp.multiply(sigmoid_derivative(pre_sigmoid_prediction), dLdy_2)

# Calculate loss with respect to last layer of lstm
dLdh = cp.zeros([T + 1, channels_hidden, M, N])
dLdh[T - 1] = cp.reshape(cp.matmul(cp.transpose(connected_weights), dLdy_1.reshape(1, M * N)), (channels_hidden, M, N)) # reshape dLdh to the appropriate size
dLdw_0 = cp.matmul(dLdy_1.reshape(1, M*N), hidden_prediction.transpose(1,0))
# Calculate loss with respect to bias y
dLdb_y = dLdy_1

#--------------------fully connected------------------
bias_y = bias_y - learning_rate*dLdb_y
connected_weights = connected_weights - learning_rate*dLdw_0

# Initialize corresponding matrices
dLdo = cp.zeros([T, channels_hidden, M, N])
dLdc = cp.zeros([T + 1, channels_hidden, M, N])
dLda = cp.zeros([T, channels_hidden, M, N])
dLdf = cp.zeros([T, channels_hidden, M, N])
dLdi = cp.zeros([T, channels_hidden, M, N])
dLdI = cp.zeros([T, channels_hidden+ channels_img, M, N])
dLdW = cp.zeros([4*channels_hidden, channels_img + channels_hidden, kernel_dimension, kernel_dimension])


# Initialize other stuff
dLdo_hat = cp.zeros([T, channels_hidden, M, N])
dLda_hat = cp.zeros([T, channels_hidden, M, N])
dLdf_hat = cp.zeros([T, channels_hidden, M, N])
dLdi_hat = cp.zeros([T, channels_hidden, M, N])

# initialize biases
dLdb_c = cp.empty([channels_hidden, M, N])
dLdb_i = cp.empty([channels_hidden, M, N])
dLdb_f = cp.empty([channels_hidden, M, N])
dLdb_o = cp.empty([channels_hidden, M, N])

for t in cp.arange(T - 1, -1, -1):
    dLdo[t] = cp.multiply(dLdh[t], tanh(c[t]))
    dLdc[t] += cp.multiply(cp.multiply(dLdh[t], o[t]), (cp.ones((channels_hidden, M, N)) - cp.multiply(tanh(c[t]), tanh(c[t]))))
    dLdi[t] = cp.multiply(dLdc[t], a[t])
    dLda[t] = cp.multiply(dLdc[t], i[t])
    dLdf[t] = cp.multiply(dLdc[t], c[t - 1])
    dLdc[t - 1] = cp.multiply(dLdc[t], f[t])

    dLda_hat[t] = cp.multiply(dLda[t], (cp.ones((channels_hidden, M, N)) - cp.multiply(a[t], a[t])))
    dLdi_hat[t] = cp.multiply(cp.multiply(dLdi[t], i[t]), cp.ones((channels_hidden, M, N)) - i[t])
    dLdf_hat[t] = cp.multiply(cp.multiply(dLdf[t], f[t]), cp.ones((channels_hidden, M, N)) - f[t])
    dLdo_hat[t] = cp.multiply(cp.multiply(dLdo[t], o[t]), cp.ones((channels_hidden, M, N)) - o[t])

    dLdb_c += dLda_hat[t]
    dLdb_i += dLdi_hat[t]
    dLdb_f += dLdf_hat[t]
    dLdb_o += dLdo_hat[t]

    # CONCATENATE Z IN THE RIGHT ORDER SAME ORDER AS THE WEIGHTS
    dLdz_hat = cp.concatenate((dLdi_hat[t], dLdf_hat[t], dLda_hat[t], dLdo_hat[t]), axis = 0) 
    #determine convolution derivatives
    #here we will use the fact that in z = w * I, dLdW = dLdz * I
    temporary = cp.concatenate((x[t], h[t - 1]), axis=0).reshape(channels_hidden + channels_img, 1, M, N)
    dLdI[t] = cp.asarray(F.convolution_2d(dLdz_hat.reshape(1, 4*channels_hidden, M, N), main_kernel.transpose(1, 0, 2, 3), b=None, pad=1)[0].data) # reshape into flipped kernel dimensions
    dLdW += cp.asarray((F.convolution_2d(temporary, dLdz_hat.reshape(4*channels_hidden, 1, M, N), b=None, pad=1).data).transpose(1,0,2,3)) #reshape into kernel dimensions
    #gradient clipping
    if cp.amax(dLdW) > 1 or cp.amin(dLdW) < -1:
        dLdW = dLdW/cp.linalg.norm(dLdW)
    if cp.amax(dLdb_c) > 1 or cp.amin(dLdb_c) < -1:
        dLdb_c = dLdb_c/cp.linalg.norm(dLdb_c)
    if cp.amax(dLdb_i) > 1 or cp.amin(dLdb_i) < -1:
        dLdb_i = dLdb_i/cp.linalg.norm(dLdb_i)
    if cp.amax(dLdb_f) > 1 or cp.amin(dLdb_f) < -1:
        dLdb_f = dLdb_f/cp.linalg.norm(dLdb_f)
    if cp.amax(dLdb_o) > 1 or cp.amin(dLdb_o) < -1:
        dLdb_o = dLdb_o/cp.linalg.norm(dLdb_o)
    if cp.amax(dLdw_0) > 1 or cp.amin(dLdw_0) < -1:
        dLdw_0 = dLdw_0/cp.linalg.norm(dLdw_0)
    if cp.amax(dLdb_y) > 1 or cp.amin(dLdb_y) < -1:
        dLdb_y = dLdb_y/cp.linalg.norm(dLdb_y)

    print("dLdW on step: " + str(t) + " is this: " + str(dLdW[0][0][0][0]))
    #print("dLdw_0")
    #print("dLdW")
    #print(dLdW)
    #print(str(cp.amax(dLdw_0)) + " : " + str(cp.amin(dLdw_0)))
    #print("dLdW")
    #print(str(cp.amax(dLdW)) + " : " + str(cp.amin(dLdW)))
    #print("dLdb_c")
    #print(str(cp.amax(dLdb_c)) + " : " + str(cp.amin(dLdb_c)))

    dLdh[t-1] = dLdI[t][channels_img: channels_img+channels_hidden] 
    #.reshape(4*channels_hidden, channels_hidden+channels_img, kernel_dimension, kernel_dimension)
    #update weights with convolution derivatives

#----------------------------adam optimizer code-----------------------------------
#---------------------update main kernel---------
main_kernel = main_kernel - learning_rate*dLdW
#--------------------update bias c-----------------------
bias_c = bias_c - learning_rate*dLdb_c
#--------------------update bias i-----------------------
bias_i = bias_i - learning_rate*dLdb_i
#--------------------update bias f-----------------------
bias_f = bias_f - learning_rate*dLdb_f
#--------------------update bias c-----------------------
bias_o = bias_o - learning_rate*dLdb_o

prediction2, pre_sigmoid_prediction2, hidden_prediction2, i2, f2, a2, c2, o2, h2 = forward_prop(x)

print("dLdW is: " + str(dLdW[0][0][0][0]))       
loss2 = calculate_loss(prediction2, y)
print("LOSS AFTER: ")
print(loss2)


print("backpropagation complete")

Rehaan Ahmad · Accepted Answer

Wow, that took some time.

If you look at the back propagation code, look closely at these lines:

dLdb_c = cp.empty([channels_hidden, M, N])
dLdb_i = cp.empty([channels_hidden, M, N])
dLdb_f = cp.empty([channels_hidden, M, N])
dLdb_o = cp.empty([channels_hidden, M, N])

However, notice how the code proceeds to use the += operator on these empty arrays. Simply change the arrays to cp.zeros, and the code gives consistent loss.

Different loss values with same data, same initial state, same recurrent neural network

Answers (1)

Related Questions