Reputation: 305
I'm trying to build a simple Recurrent Neural Network model using TensorFlow on Mac OS X. It is just a toy model, and the input data is under 3 MB, so it should not consume much memory. However, when I run the model, memory usage increases significantly with every training batch and exceeds 10 GB after only two iterations; I couldn't run it any further.
Here's the whole code.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
from pympler import summary


class RNN():
    """The RNN model."""

    #@profile
    def inference(self):
        """calculate outputs and loss for a single batch"""
        total_loss = 0.0
        outputs = []
        for i in range(self.batch_size):
            state = self.init_state
            outputs.append([])
            loss = 0.0
            for j in range(self.num_steps):
                state, output = self.next_state(self.x[i,j,:], state)
                outputs[i].append(output)
                loss += tf.square(self.y[i,j,:]-output)
            total_loss += loss
        return outputs, total_loss / (self.batch_size*self.num_steps)

    def __init__(self, is_training, config):
        self.sess = sess = tf.Session()
        self.prev_see = prev_see = config.prev_see
        self.num_steps = num_steps = config.num_steps
        #maybe "self.num_hidden =" part could be removed
        self.num_hidden = num_hidden = config.num_hidden
        self.batch_size = config.batch_size
        self.epoch = config.epoch
        self.learning_rate = config.learning_rate
        self.summaries_dir = config.summaries_dir

        with tf.name_scope('placeholders'):
            self.x = tf.placeholder(tf.float32, [None,num_steps,config.prev_see],
                                    name='input-x')
            self.y = tf.placeholder(tf.float32, [None,num_steps,1], name='input-y')
            default_init_state = tf.zeros([num_hidden])
            self.init_state = tf.placeholder_with_default(default_init_state, [num_hidden],
                                                          name='state_placeholder')

        def weight_variable(self, shape):
            """Create a weight variable with appropriate initialization."""
            initial = tf.truncated_normal(shape, stddev=0.1)
            return tf.Variable(initial)

        def bias_variable(self, shape):
            """Create a bias variable with appropriate initialization."""
            initial = tf.constant(0.1, shape=shape)
            return tf.Variable(initial)

        def variable_summaries(self, var, name):
            """Attach a lot of summaries to a Tensor."""
            with tf.name_scope('summaries'):
                mean = tf.reduce_mean(var)
                tf.scalar_summary('mean/'+name, mean)
                with tf.name_scope('stddev'):
                    stddev = tf.sqrt(tf.reduce_sum(tf.square(var-mean)))
                tf.scalar_summary('stddev/'+name, stddev)
                tf.scalar_summary('max/'+name, tf.reduce_max(var))
                tf.scalar_summary('min/'+name, tf.reduce_min(var))
                tf.histogram_summary(name, var)

        #declare weight variables as property
        layer_name = 'rnn_layer'
        with tf.name_scope(layer_name):
            with tf.name_scope('U'):
                self.U = U = weight_variable(self, [prev_see,num_hidden])
                variable_summaries(self, U, layer_name+'/U')
            with tf.name_scope('W'):
                self.W = W = weight_variable(self, [num_hidden,num_hidden])
                variable_summaries(self, W, layer_name+'/W')
            with tf.name_scope('b_W'):
                self.b_W = b_W = bias_variable(self, [num_hidden])
                variable_summaries(self, b_W, layer_name+'/b_W')
            with tf.name_scope('V'):
                self.V = V = weight_variable(self, [num_hidden,1])
                variable_summaries(self, V, layer_name+'/V')
            with tf.name_scope('b_V'):
                self.b_V = b_V = bias_variable(self, [1])
                variable_summaries(self, b_V, layer_name+'/b_V')
        self.merged = tf.merge_all_summaries()
        self.train_writer = tf.train.SummaryWriter(config.summaries_dir, sess.graph)
        tf.initialize_all_variables().run(session=sess)
        _, self.loss = self.inference()

    def next_state(self, x, s_prev):
        """calculate next state and output"""
        x = tf.reshape(x, [1,-1])
        s_prev = tf.reshape(s_prev, [1,-1])
        s_next = tf.tanh(tf.matmul(x,self.U)+tf.matmul(s_prev,self.W)+self.b_W)
        output = tf.matmul(s_next,self.V)+self.b_V
        return s_next, output

    #@profile
    def batch_train(self, feed_dict):
        """train the network for a single batch"""
        loss = self.loss
        train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss)
        summary, loss_value, _ = self.sess.run([self.merged, loss, train_step],
                                               feed_dict=feed_dict)
        #self.train_writer.add_summary(summary)
        print(loss_value)


class TrainConfig():
    """Train Config."""
    total_steps = 245
    test_ratio = 0.3
    prev_see = 100
    num_steps = int(round((total_steps-prev_see)*(1-test_ratio)))
    num_hidden = 10
    batch_size = 5
    epoch = 3
    learning_rate = 0.1
    summaries_dir = '/Users/Kyungsu/StockPrediction/log'


class DebugConfig():
    """For debugging memory leak."""
    total_steps = 100
    test_ratio = 0.3
    prev_see = 100
    num_steps = 10
    num_hidden = 10
    batch_size = 5
    epoch = 2
    learning_rate = 0.1
    summaries_dir = '/Users/Kyungsu/StockPrediction/log'


#@profile
def run_epoch(m, x_data, y_data):
    num_batch = ((len(x_data)-1) // m.batch_size)+1
    #num_batch = 100
    for i in range(num_batch):
        x_batch = x_data[i*m.batch_size:(i+1)*m.batch_size,:,:]
        y_batch = y_data[i*m.batch_size:(i+1)*m.batch_size,:,:]
        feed_dict = {m.x:x_batch, m.y:y_batch}
        print("%dth/%dbatches"%(i+1,num_batch))
        m.batch_train(feed_dict)


def process_data(data, config):
    data_size = len(data)
    prev_see = config.prev_see
    num_steps = config.num_steps
    x = np.zeros((data_size,num_steps,prev_see))
    y = np.zeros((data_size,num_steps,1))
    for i in range(data_size):
        for j in range(num_steps-prev_see):
            x[i,j,:] = data[i,i:i+prev_see]
            y[i,j,0] = data[i,i+prev_see]
    return x, y


#@profile
def main():
    train_config = TrainConfig()
    debug_config = DebugConfig()
    data = np.load('processed_data.npy')
    x, y = process_data(data, train_config)
    rnn_model = RNN(True, train_config)
    #training phase
    for i in range(rnn_model.epoch):
        print("%dth epoch"%(i+1))
        run_epoch(rnn_model, x, y)

main()
And the following is the result of memory_profiler. The weird thing is that most of the memory is allocated inside the for loops (see lines 163 and 135 in the output below), which I take to mean that memory is leaking.
Line #    Mem usage    Increment   Line Contents
================================================
    11   53.062 MiB    0.000 MiB   @profile
    12                             def __init__(self, is_training, config):
    13   53.875 MiB    0.812 MiB       self.sess = sess = tf.Session()
    14
    15   53.875 MiB    0.000 MiB       self.prev_see = prev_see = config.prev_see
    16   53.875 MiB    0.000 MiB       self.num_steps = num_steps = config.num_steps
    17                                 #maybe "self.num_hidden =" part could be removed
    18   53.875 MiB    0.000 MiB       self.num_hidden = num_hidden = config.num_hidden
    19   53.875 MiB    0.000 MiB       self.batch_size = config.batch_size
    20   53.875 MiB    0.000 MiB       self.epoch = config.epoch
    21   53.875 MiB    0.000 MiB       self.learning_rate = config.learning_rate
    22   53.875 MiB    0.000 MiB       self.summaries_dir = config.summaries_dir
    23
    24   53.875 MiB    0.000 MiB       with tf.name_scope('input'):
    25   53.875 MiB    0.000 MiB           self.x = tf.placeholder(tf.float32,[None,num_steps,config.prev_see],
    26   53.957 MiB    0.082 MiB                                   name='input-x')
    27   53.973 MiB    0.016 MiB           self.y = tf.placeholder(tf.float32, [None,num_steps,1],name='input-y')
    28
    29   55.316 MiB    1.344 MiB       def weight_variable(self,shape):
    30                                     """Create a weight variable with appropriate initialization."""
    31   55.371 MiB    0.055 MiB           initial = tf.truncated_normal(shape,stddev=0.1)
    32   55.414 MiB    0.043 MiB           return tf.Variable(initial)
    33
    34   55.707 MiB    0.293 MiB       def bias_variable(self,shape):
    35                                     """Create a bias variable with appropriate initialization."""
    36   55.727 MiB    0.020 MiB           initial = tf.constant(0.1,shape=shape)
    37   55.754 MiB    0.027 MiB           return tf.Variable(initial)
    38
    39   55.754 MiB    0.000 MiB       def variable_summaries(self,var,name):
    40                                     """Attach a lot of summaries to a Tensor."""
    41   55.754 MiB    0.000 MiB           with tf.name_scope('summaries'):
    42   55.801 MiB    0.047 MiB               mean = tf.reduce_mean(var)
    43   55.824 MiB    0.023 MiB               tf.scalar_summary('mean/'+name,mean)
    44   55.824 MiB    0.000 MiB               with tf.name_scope('stddev'):
    45   55.883 MiB    0.059 MiB                   stddev = tf.sqrt(tf.reduce_sum(tf.square(var-mean)))
    46   55.906 MiB    0.023 MiB               tf.scalar_summary('stddev/'+name,stddev)
    47   55.969 MiB    0.062 MiB               tf.scalar_summary('max/'+name, tf.reduce_max(var))
    48   56.027 MiB    0.059 MiB               tf.scalar_summary('min/'+name, tf.reduce_min(var))
    49   56.055 MiB    0.027 MiB               tf.histogram_summary(name, var)
    50
    51                                 #declare weight variables as property
    52   53.973 MiB   -2.082 MiB       layer_name = 'rnn_layer'
    53   53.973 MiB    0.000 MiB       with tf.name_scope(layer_name):
    54   53.973 MiB    0.000 MiB           with tf.name_scope('U'):
    55   54.230 MiB    0.258 MiB               self.U = U = weight_variable(self,[prev_see,num_hidden])
    56   54.598 MiB    0.367 MiB               variable_summaries(self,U,layer_name+'/U')
    57   54.598 MiB    0.000 MiB           with tf.name_scope('W'):
    58   54.691 MiB    0.094 MiB               self.W = W = weight_variable(self,[num_hidden,num_hidden])
    59   54.961 MiB    0.270 MiB               variable_summaries(self,W,layer_name+'/W')
    60   54.961 MiB    0.000 MiB           with tf.name_scope('b_W'):
    61   55.012 MiB    0.051 MiB               self.b_W = b_W = bias_variable(self,[num_hidden])
    62   55.316 MiB    0.305 MiB               variable_summaries(self,b_W,layer_name+'/b_W')
    63   55.316 MiB    0.000 MiB           with tf.name_scope('V'):
    64   55.414 MiB    0.098 MiB               self.V = V = weight_variable(self,[num_hidden,1])
    65   55.707 MiB    0.293 MiB               variable_summaries(self,V,layer_name+'/V')
    66   55.707 MiB    0.000 MiB           with tf.name_scope('b_V'):
    67   55.754 MiB    0.047 MiB               self.b_V = b_V = bias_variable(self,[1])
    68   56.055 MiB    0.301 MiB               variable_summaries(self,b_V,layer_name+'/b_V')
    69   56.055 MiB    0.000 MiB       self.merged = tf.merge_all_summaries()
    70   60.348 MiB    4.293 MiB       self.train_writer = tf.train.SummaryWriter(config.summaries_dir,sess.graph)
    71   62.496 MiB    2.148 MiB       tf.initialize_all_variables().run(session=sess)
Filename: rnn.py

Line #    Mem usage    Increment   Line Contents
================================================
    82 3013.336 MiB    0.000 MiB   @profile
    83                             def inference(self):
    84                                 """calculate outputs and loss for a single batch"""
    85 3013.336 MiB    0.000 MiB       total_loss = 0.0
    86 3013.336 MiB    0.000 MiB       outputs = []
    87 3022.352 MiB    9.016 MiB       for i in range(self.batch_size):
    88 3020.441 MiB   -1.910 MiB           state = tf.zeros([self.num_hidden])
    89 3020.441 MiB    0.000 MiB           outputs.append([])
    90 3020.441 MiB    0.000 MiB           loss = 0.0
    91 3022.348 MiB    1.906 MiB           for j in range(self.num_steps):
    92 3022.285 MiB   -0.062 MiB               state, output = self.next_state(self.x[i,j,:],state)
    93 3022.285 MiB    0.000 MiB               outputs[i].append(output)
    94 3022.348 MiB    0.062 MiB               loss += tf.square(self.y[i,j,:]-output)
    95 3022.352 MiB    0.004 MiB           total_loss+=loss
    96 3022.371 MiB    0.020 MiB       return outputs, total_loss / (self.batch_size*self.num_steps)
Filename: rnn.py

Line #    Mem usage    Increment   Line Contents
================================================
    97 3013.336 MiB    0.000 MiB   @profile
    98                             def batch_train(self,feed_dict):
    99                                 """train the network for a single batch"""
   100 3022.371 MiB    9.035 MiB       _, loss = self.inference()
   101 3051.781 MiB   29.410 MiB       train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss)
   102 3149.891 MiB   98.109 MiB       summary,loss_value, _ = self.sess.run([self.merged,loss, train_step],feed_dict=feed_dict)
   103                                 #self.train_writer.add_summary(summary)
   104 3149.891 MiB    0.000 MiB       print(loss_value)
Filename: rnn.py

Line #    Mem usage    Increment   Line Contents
================================================
   131 1582.758 MiB    0.000 MiB   @profile
   132                             def run_epoch(m,x_data,y_data):
   133 1582.758 MiB    0.000 MiB       num_batch = ((len(x_data)-1) // m.batch_size)+1
   134                                 #num_batch = 100
   135 3149.895 MiB 1567.137 MiB       for i in range(num_batch):
   136 3013.336 MiB -136.559 MiB           x_batch = x_data[i*m.batch_size:(i+1)*m.batch_size,:,:]
   137 3013.336 MiB    0.000 MiB           y_batch = y_data[i*m.batch_size:(i+1)*m.batch_size,:,:]
   138 3013.336 MiB    0.000 MiB           feed_dict = {m.x:x_batch,m.y:y_batch}
   139 3013.336 MiB    0.000 MiB           print("%dth/%dbatches"%(i+1,num_batch))
   140 3149.891 MiB  136.555 MiB           m.batch_train(feed_dict)
Filename: rnn.py

Line #    Mem usage    Increment   Line Contents
================================================
   154   52.914 MiB    0.000 MiB   @profile
   155                             def main():
   156   52.914 MiB    0.000 MiB       train_config = TrainConfig()
   157   52.914 MiB    0.000 MiB       debug_config = DebugConfig()
   158   53.059 MiB    0.145 MiB       data = np.load('processed_data.npy')
   159   53.062 MiB    0.004 MiB       x,y = process_data(data,debug_config)
   160   62.496 MiB    9.434 MiB       rnn_model = RNN(True,debug_config)
   161
   162                                 #training phase
   163 3149.898 MiB 3087.402 MiB       for i in range(rnn_model.epoch):
   164 1582.758 MiB -1567.141 MiB          print("%dth epoch"%(i+1))
   165 3149.898 MiB 1567.141 MiB           run_epoch(rnn_model,x,y)
This problem did not occur when I tried the simple MNIST model from the TensorFlow tutorial, so it must be related to the RNN model. I could also reproduce the problem on Ubuntu 14.04, so I don't think it is caused by anything OS X-specific.
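One way to check whether the graph itself is growing (a minimal sketch; it assumes everything is built into the default graph, as in the code above):

import tensorflow as tf

# Number of ops in the default graph; this should stay constant once the
# model has been constructed. If it climbs on every batch, the graph is
# being extended at training time rather than Python objects leaking.
print(len(tf.get_default_graph().get_operations()))

# Alternatively, freeze the graph after building the model; any later
# attempt to add an op then raises an error at the offending line:
tf.get_default_graph().finalize()

Thank you for reading.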
Upvotes: 3
Views: 1158
Reputation: 929
I think the problem is that this line
train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss)
occurs in your batch_train function, so on every iteration a new GradientDescentOptimizer is created. Each call to minimize() also adds a fresh set of gradient ops to the graph, so the graph, and the memory it holds, grows with every batch. Try moving this line into your model's __init__, just after you define the loss, and refer to self.train_step in your batch_train function instead.
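A minimal sketch of that change against the posted code (it reuses the self.loss that __init__ already computes; running the variable initializer after minimize() is my addition, to stay safe if an optimizer ever creates variables of its own):

# At the end of RNN.__init__ -- every op, including the optimizer, is
# created exactly once:
        _, self.loss = self.inference()
        self.train_step = tf.train.GradientDescentOptimizer(
            self.learning_rate).minimize(self.loss)
        tf.initialize_all_variables().run(session=sess)

# batch_train then only runs existing ops and adds nothing to the graph:
    def batch_train(self, feed_dict):
        """train the network for a single batch"""
        summary, loss_value, _ = self.sess.run(
            [self.merged, self.loss, self.train_step], feed_dict=feed_dict)
        print(loss_value)

This is the usual TensorFlow pattern: construct the graph once up front, then only feed and run it inside the training loop.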
Upvotes: 4