Reputation: 1
I am trying to build a Gaussian RBM model with TensorFlow, but the program uses too much memory.
gaussian_rbm.py
import tensorflow as tf
import math
import input_data
import numpy as np


def sample_prob(probs):
    return tf.nn.relu(
        tf.sign(
            probs - tf.random_uniform(tf.shape(probs))))


class RBM(object):
    """ represents a sigmoidal rbm """

    def __init__(self, name, input_size, output_size, gaussian_std_val=0.1):
        with tf.name_scope("rbm_" + name):
            self.weights = tf.Variable(
                tf.truncated_normal([input_size, output_size],
                                    stddev=1.0 / math.sqrt(float(input_size))),
                name="weights")
            self.v_bias = tf.Variable(tf.zeros([input_size]), name="v_bias")
            self.h_bias = tf.Variable(tf.zeros([output_size]), name="h_bias")
            self.input = tf.placeholder("float", shape=[None, 784])

            # Gaussian
            def_a = 1 / (np.sqrt(2) * gaussian_std_val)
            def_a = tf.constant(def_a, dtype=tf.float32)
            self.a = tf.Variable(tf.ones(shape=[input_size]) * def_a,
                                 name="a")

    def propup(self, visible):
        """ P(h|v) """
        return tf.nn.sigmoid(tf.matmul(visible, self.weights) + self.h_bias)

    def propdown(self, hidden):
        """ P(v|h) """
        # return tf.nn.sigmoid(tf.matmul(hidden, tf.transpose(self.weights)) + self.v_bias)
        return (tf.matmul(hidden, tf.transpose(self.weights)) + self.v_bias) / (2 * (self.a * self.a))

    def sample_h_given_v(self, v_sample):
        """ Generate a sample from the hidden layer """
        return sample_prob(self.propup(v_sample))

    def sample_v_given_h(self, h_sample):
        """ Generate a sample from the visible layer """
        return self.sample_gaussian(self.propdown(h_sample))

    def gibbs_hvh(self, h0_sample):
        """ A gibbs step starting from the hidden layer """
        v_sample = self.sample_v_given_h(h0_sample)
        h_sample = self.sample_h_given_v(v_sample)
        return [v_sample, h_sample]

    def gibbs_vhv(self, v0_sample):
        """ A gibbs step starting from the visible layer """
        h_sample = self.sample_h_given_v(v0_sample)
        v_sample = self.sample_v_given_h(h_sample)
        return [h_sample, v_sample]

    def sample_gaussian(self, mean_field):
        return tf.random_normal(shape=tf.shape(mean_field),
                                mean=mean_field,
                                stddev=1.0 / (np.sqrt(2) * self.a))

    def cd1(self, learning_rate=0.1):
        """ One step of contrastive divergence, with Rao-Blackwellization """
        h_start = self.sample_h_given_v(self.input)
        v_end = self.sample_v_given_h(h_start)
        h_end = self.sample_h_given_v(v_end)
        w_positive_grad = tf.matmul(tf.transpose(self.input), h_start)
        w_negative_grad = tf.matmul(tf.transpose(v_end), h_end)
        update_w = self.weights + (learning_rate * (w_positive_grad - w_negative_grad) / tf.to_float(tf.shape(self.input)[0]))
        update_vb = self.v_bias + (learning_rate * tf.reduce_mean(self.input - v_end, 0))
        update_hb = self.h_bias + (learning_rate * tf.reduce_mean(h_start - h_end, 0))
        return [update_w, update_vb, update_hb]

    def cal_err(self):
        err = self.input - self.gibbs_vhv(self.input)[1]
        return tf.reduce_mean(err * err)
test_mnist.py
import tensorflow as tf
import input_data
from gaussian_rbm import RBM

mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
trX, trY, teX, teY = mnist.train.images, mnist.train.labels, mnist.test.images, mnist.test.labels

rbm_modle = RBM(name="gaussian_rbm", input_size=784, output_size=1000)

sess = tf.Session()
init_op = tf.initialize_all_variables()
sess.run(init_op)

for i in range(100):
    print "step: %s" % i
    for start, end in zip(range(0, len(trX), 128), range(128, len(trX), 128)):
        rbm_modle.weights, rbm_modle.v_bias, rbm_modle.h_bias = \
            sess.run(rbm_modle.cd1(), feed_dict={rbm_modle.input: trX[start:end]})
        if start % 1280 == 0:
            print sess.run(rbm_modle.cal_err(), feed_dict={rbm_modle.input: teX})
The output is:
run test_mnist.py
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:900] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties:
name: GeForce GTX 560
major: 2 minor: 1
memoryClockRate (GHz) 1.62
pciBusID 0000:01:00.0
Total memory: 1018.69MiB
Free memory: 916.73MiB
I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0
I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y
I tensorflow/core/common_runtime/gpu/gpu_device.cc:684] Ignoring gpu device (device: 0, name: GeForce GTX 560, pci bus id: 0000:01:00.0) with Cuda compute capability 2.1. The minimum required Cuda capability is 3.5.
step: 0
0.0911714 0.0781856 0.0773076 0.0770751 0.0776582 0.0764748 0.0755164 0.0741131 0.0726497 0.0712237 0.0701839 0.0686315 0.0664856 0.0658309 0.0646239 0.0626652 0.0616178 0.0610061 0.0598332 0.0588843 0.0587477 0.0572056 0.0561556 0.0554848
Killed
Is there some way to monitor the memory? Can someone help me?
Upvotes: 0
Views: 4146
Reputation: 1691
There may be a problem with the training loop that causes your computer to run out of memory.
For each iteration of your loop, you are calling:
sess.run(rbm_modle.cd1(), feed_dict={rbm_modle.input : trX[start : end]})
Inside this rbm_modle.cd1() function you are creating several new operations, such as tf.matmul(), so every time you call rbm_modle.cd1() you create new operations, which causes the memory usage to grow after each iteration.
You should define all the operations once, before the loop, and then during training run them with sess.run() without creating new ones.
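A minimal sketch of that restructuring, reusing the names from the question (rbm_modle, trX, teX, sess) and assuming the CD-1 results are written back into the variables with tf.assign rather than by overwriting the Python attributes:

# Build the graph once, outside the training loop.
update_w, update_vb, update_hb = rbm_modle.cd1()
train_op = [tf.assign(rbm_modle.weights, update_w),
            tf.assign(rbm_modle.v_bias, update_vb),
            tf.assign(rbm_modle.h_bias, update_hb)]
err_op = rbm_modle.cal_err()

sess.run(tf.initialize_all_variables())

for i in range(100):
    for start, end in zip(range(0, len(trX), 128), range(128, len(trX), 128)):
        # Only data is fed here; no new operations are added to the graph.
        sess.run(train_op, feed_dict={rbm_modle.input: trX[start:end]})
    print sess.run(err_op, feed_dict={rbm_modle.input: teX})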
Upvotes: 2
Reputation: 113
Make sure there are no memory leaks by making your graph read-only before training with
tf.get_default_graph().finalize()
TensorFlow will then raise an exception each time you try to add a new node.
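For example, a short sketch assuming the whole graph is built up front:

# ... build the model and all training / evaluation ops here ...
tf.get_default_graph().finalize()  # the graph is now read-only

# From this point on, anything that adds a node (for example calling
# rbm_modle.cd1() inside the training loop) raises a RuntimeError
# instead of silently growing the graph.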
Upvotes: 0
Reputation: 47
The answer above seems correct (insufficient compute capability to run the latest versions of CUDA / TensorFlow).
However, the minimum requirement seems to be compute capability 3.0, as my GTX 770M is able to run TensorFlow 1.0 / CUDA 8.0 (see below).
Alternatively, try recompiling TensorFlow from source and include the 2.0 compute target during configuration (it proposes 3.5-5.5 by default).
Have a good day!!
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 375.51 Driver Version: 375.51 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GTX 770M Off | 0000:01:00.0 N/A | N/A |
|100% 48C P0 N/A / N/A | 2819MiB / 3017MiB | N/A Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 Not Supported |
+-----------------------------------------------------------------------------+
Upvotes: 0
Reputation: 2827
You can monitor GPU memory with the command nvidia-smi.
It looks like your GPU does not support the later versions of CUDA required to run TensorFlow. You can check the list of CUDA-Enabled GeForce Products.
From your output it looks like TensorFlow is smart enough not to use the GPU, so either your model / batch size is too big for your RAM or you have a memory leak.
Try running the session with log_device_placement=True to see what TensorFlow is doing step by step, while running top to monitor memory:
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
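For example, a rough sketch that expands the line above, reusing init_op, rbm_modle and teX from the question (TensorFlow then logs the device each op is placed on while top or nvidia-smi runs in another terminal):

err_op = rbm_modle.cal_err()  # build the op once, outside the session

with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    sess.run(init_op)
    # each sess.run() now reports whether its ops landed on the CPU or the GPU
    print sess.run(err_op, feed_dict={rbm_modle.input: teX})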
Upvotes: 5