Mark .Z
Mark .Z

Reputation: 11

tensorflow rnn nan error

I want to train an RNN model to connect an article and an image. The input and the output are two arrays.

I define the parameters of RNN as follow:

learning_rate = 0.001
training_iters = 100000
batch_size = 128
display_step = 10

# Network Parameters
n_input = 128 
n_steps = 168 # timesteps
n_hidden = 512 # hidden layer num of features
output = 200 

the image is 128*168 and the article is 200

cost = tf.reduce_mean(pow(pred-y,2)/2) 
#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

For the end result, I want to train a network to transform an image to an article. However, when I try to train the model, the cost is returned as NaN.

Here is the code:

# coding=utf-8
from __future__ import print_function
from tensorflow.contrib import rnn
import scipy.io as scio
import tensorflow as tf
import numpy as np
import os
TextPath = 'F://matlab_code//readtxt//ImageTextVector.mat';
ImageDirPath = 'F://matlab_code//CVPR10-LLC//features//1';
Text = scio.loadmat(TextPath)

learning_rate = 0.001
training_iters = 100000
batch_size = 128
display_step = 10

# Network Parameters
n_input = 128 # 
n_steps = 168 # timesteps
n_hidden = 512 # hidden layer num of features
output = 200 # 

x = tf.placeholder("float", [None, n_steps, n_input])
y = tf.placeholder("float", [None, output])

weights = {
    'out': tf.Variable(tf.random_normal([n_hidden, output]))
}
biases = {
    'out': tf.Variable(tf.random_normal([output]))
}

def RNN(x, weights, biases):

    lstm_cell = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)

    outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)

    return tf.matmul(outputs[-1], weights['out']) + biases['out']

pred = RNN(x, weights, biases)

# Define loss and optimizer
cost = tf.reduce_mean(pow(pred-y,2)/2) 
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

init = tf.global_variables_initializer()

train_count=0;
with tf.Session() as sess:
    sess.run(init)
    step = 0
    while step* batch_size < training_iters:
        iter = step*batch_size
        batch_x = []
        batch_y = []
        while iter < (step+1)*batch_size:
            ImagePath = ImageDirPath + '//' + Text['X'][train_count][0][0] +'.mat'
            if os.path.exists(ImagePath):
                batch_xx=[]
                batch_yy=[]
                Image = scio.loadmat(ImagePath)
                i=0
                while i<21504 :
                    batch_xx.append(Image['fea'][i][0])
                    i=i+1
                batch_yy = Text['X'][train_count][1][0]
                batch_xx = np.array(batch_xx)
                batch_x=np.hstack((batch_x,batch_xx))
                batch_y=np.hstack((batch_y,batch_yy))
                iter = iter+1
            train_count=train_count+1
        batch_x = batch_x.reshape((batch_size,n_steps, n_input))
        batch_y = batch_y.reshape((batch_size,output))
        # Run optimization op (backprop)
        sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})

        if step % display_step == 0:
            # Calculate batch loss
            loss = sess.run(cost, feed_dict={x: batch_x, y: batch_y})
            print("Iter " + str(step* batch_size) + ", Minibatch Loss= " + \
                 "{:.6f}".format(loss) )
        step += 1
    print("Optimization Finished!")

Upvotes: 1

Views: 588

Answers (1)

shizi
shizi

Reputation: 85

when you pass tensor including nan values to lstm, the value in the cell of lstm's will be "forced" to nan because the numerical operation between number and nan. Check whether your data have nan value or just use numpy.nan_to_num to fill your nan data.

Upvotes: 1

Related Questions