Frank.Fan

Reputation: 939

TensorFlow: why does it say CUDA_ERROR_OUT_OF_MEMORY and fail to train?

I am new to TensorFlow, and today when I ran my training code I got an error and training failed. Here are the details: while training, it prints CUDA_ERROR_OUT_OF_MEMORY yet still runs a few steps (first screenshot of the log was attached here).

Then it fails with an error (second screenshot was attached here).

In the end the run ends up in this state (third screenshot was attached here).

My code is here:

# coding=utf-8
from color_1 import read_and_decode, get_batch
import LeNet_5
import os
import tensorflow as tf

batch_size = 16
TRAIN_STEPS = 10000
crop_size = 224
REGULARIZATION_RATE = 0.0001

def train(batch_x, batch_y):
    image_holder = tf.placeholder(tf.float32, [batch_size, 224, 224, 3], name='x-input')
    label_holder = tf.placeholder(tf.float32, [batch_size], name='y-input')
    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    # Pass True here: the original passed the enclosing function `train`,
    # which happens to be truthy but is almost certainly not what was meant.
    y = LeNet_5.inference(image_holder, True, regularizer)
    global_step = tf.Variable(0, trainable=False)
    def loss(logits, labels):
        labels = tf.cast(labels, tf.int64)
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels, name='cross_entropy_per_example')
        cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
        tf.add_to_collection('losses', cross_entropy_mean)
        return tf.add_n(tf.get_collection('losses'), name='total_loss')
    loss = loss(y, label_holder)
    # Pass global_step so it is actually incremented on every training step.
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step=global_step)
    tf.add_to_collection('train_op', train_op)

    saver = tf.train.Saver(max_to_keep=3)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        for i in range(TRAIN_STEPS):
            image_batch, label_batch = sess.run([batch_x, batch_y])
            _, loss_value, step = sess.run(
                [train_op, loss, global_step],
                feed_dict={image_holder: image_batch, label_holder: label_batch})
            if i % 100 == 0:
                print('After %d steps, loss on training batch is: %g' % (step, loss_value))
        coord.request_stop()
        coord.join(threads)

def main(argv=None):
    image, label = read_and_decode('train_day_night.tfrecords')
    batch_image, batch_label = get_batch(image, label, batch_size, crop_size)
    train(batch_image, batch_label)

if __name__ == '__main__':
    tf.app.run()

The LeNet_5 module that I use is here:

# -*- coding: utf-8 -*-
import tensorflow as tf

def inference(input_tensor, train, regularizer):
    with tf.variable_scope('layer1-conv1'):
        conv1_weights = tf.get_variable(
            "weight", [5, 5, 3, 32],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        # Biases are conventionally initialized to a constant,
        # not a truncated normal distribution.
        conv1_biases = tf.get_variable(
            "bias", [32], initializer=tf.constant_initializer(0.0))
        conv1 = tf.nn.conv2d(input_tensor, conv1_weights,
                             strides=[1, 1, 1, 1], padding='SAME')
        relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_biases))

    with tf.variable_scope('layer2-pool1'):
        pool1 = tf.nn.max_pool(relu1, ksize=[1, 2, 2, 1],
                               strides=[1, 2, 2, 1], padding='SAME')

    with tf.variable_scope('layer3-conv2'):
        conv2_weights = tf.get_variable(
            "weight", [5, 5, 32, 64],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        conv2_biases = tf.get_variable(
            "bias", [64], initializer=tf.constant_initializer(0.0))
        conv2 = tf.nn.conv2d(pool1, conv2_weights,
                             strides=[1, 1, 1, 1], padding='SAME')
        relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_biases))

    with tf.variable_scope('layer4-pool2'):
        pool2 = tf.nn.max_pool(relu2, ksize=[1, 2, 2, 1],
                               strides=[1, 2, 2, 1], padding='SAME')

    # Flatten the pooled feature map for the fully connected layers.
    pool_shape = pool2.get_shape().as_list()
    nodes = pool_shape[1] * pool_shape[2] * pool_shape[3]
    reshaped = tf.reshape(pool2, [pool_shape[0], nodes])

    with tf.variable_scope('layer5-fc1'):
        fc1_weights = tf.get_variable(
            "weight", [nodes, 512],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        if regularizer is not None:
            tf.add_to_collection('losses', regularizer(fc1_weights))
        fc1_biases = tf.get_variable(
            "biases", [512], initializer=tf.constant_initializer(0.1))
        fc1 = tf.nn.relu(tf.matmul(reshaped, fc1_weights) + fc1_biases)
        if train:
            fc1 = tf.nn.dropout(fc1, 0.5)

    with tf.variable_scope('layer6-fc2'):
        fc2_weights = tf.get_variable(
            "weight", [512, 2],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        if regularizer is not None:
            tf.add_to_collection('losses', regularizer(fc2_weights))
        fc2_biases = tf.get_variable(
            "biases", [2], initializer=tf.constant_initializer(0.1))
        logit = tf.matmul(fc1, fc2_weights) + fc2_biases

    return logit

The size of each image in the .tfrecords file is 224*224*3. I really don't know why this happens or how to solve it. Can you help me? Thank you very much! If you need any more information, tell me.

Upvotes: 1

Views: 142

Answers (1)

Vijay Mariappan

Reputation: 17191

It means that you are running out of GPU memory. You may want to move your input pipeline (read_and_decode() and get_batch()) onto the CPU instead. You can achieve that by building those ops inside a with tf.device('/cpu:0'): block.
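Here is a minimal sketch of what that looks like in your main(), assuming read_and_decode() and get_batch() are the helpers from your question; only the device scope is new:

# Build the input pipeline under a CPU device scope so the queue runners,
# decoding, and batching keep their buffers in host memory, leaving GPU
# memory free for the model itself.
def main(argv=None):
    with tf.device('/cpu:0'):
        image, label = read_and_decode('train_day_night.tfrecords')
        batch_image, batch_label = get_batch(image, label, batch_size, crop_size)
    train(batch_image, batch_label)  # the model ops are still placed on the GPU

If the model still does not fit after that, reducing batch_size is the usual next step.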

This recommendation is detailed in the TensorFlow Performance Guide.

Upvotes: 1
