Reputation: 1143
After a restart and without touching anything else, I open up the Jupyter Notebook and attempt to run the cells that get the GPU to start training.
But in my Terminal I get this message and in the Notebook I get the long error below. (I'm using Ubuntu 16.04, Keras with Tensorflow backend).
87] Found device 1 with properties:
name: GeForce GTX 1080 Ti
major: 6 minor: 1 memoryClockRate (GHz) 1.582
pciBusID 0000:25:00.0
Total memory: 10.91GiB
Free memory: 396.44MiB
Code of the notebook here https://github.com/fastai/courses/blob/master/deeplearning1/nbs/lesson1.ipynb
In cell [5] I've lowered the batch size to 10 and then tried 5. I also set no_of_epochs=5. In addition to restarts I've also tried looking for any command that gets the system to drop any processes the GPU might be using, but it doesn't appear to be using any.
cell [7] is the cell that gives all the errors below when it is run.
This is the full error under the cell [7] that is trying to utilize the GPU.
---------------------------------------------------------------------------
ResourceExhaustedError Traceback (most recent call last)
<ipython-input-7-2b6861506a11> in <module>()
----> 1 vgg = Vgg16()
2 # Grab a few images at a time for training and validation.
3 # NB: They must be in subdirectories named based on their category
4 batches = vgg.get_batches(path+'train', batch_size=batch_size)
5 val_batches = vgg.get_batches(path+'valid', batch_size=batch_size*2)
/home/eagle/fastai/courses-master/deeplearning1/nbs/vgg16.pyc in __init__(self)
45 def __init__(self):
46 self.FILE_PATH = 'http://files.fast.ai/models/'
---> 47 self.create()
48 self.get_classes()
49
/home/eagle/fastai/courses-master/deeplearning1/nbs/vgg16.pyc in create(self)
137
138 fname = 'vgg16.h5'
--> 139 model.load_weights(get_file(fname, self.FILE_PATH+fname, cache_subdir='models'))
140
141
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/engine/topology.pyc in load_weights(self, filepath, by_name)
2706 self.load_weights_from_hdf5_group_by_name(f)
2707 else:
-> 2708 self.load_weights_from_hdf5_group(f)
2709
2710 if hasattr(f, 'close'):
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/engine/topology.pyc in load_weights_from_hdf5_group(self, f)
2792 weight_values[0] = w
2793 weight_value_tuples += zip(symbolic_weights, weight_values)
-> 2794 K.batch_set_value(weight_value_tuples)
2795
2796 def load_weights_from_hdf5_group_by_name(self, f):
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/backend/tensorflow_backend.pyc in batch_set_value(tuples)
1879 assign_ops.append(assign_op)
1880 feed_dict[assign_placeholder] = value
-> 1881 get_session().run(assign_ops, feed_dict=feed_dict)
1882
1883
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/backend/tensorflow_backend.pyc in get_session()
123 session = _SESSION
124 if not _MANUAL_VAR_INIT:
--> 125 _initialize_variables()
126 return session
127
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/backend/tensorflow_backend.pyc in _initialize_variables()
280 sess = get_session()
281 if hasattr(tf, 'variables_initializer'):
--> 282 sess.run(tf.variables_initializer(uninitialized_variables))
283 else:
284 sess.run(tf.initialize_variables(uninitialized_variables))
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in run(self, fetches, feed_dict, options, run_metadata)
776 try:
777 result = self._run(None, fetches, feed_dict, options_ptr,
--> 778 run_metadata_ptr)
779 if run_metadata:
780 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in _run(self, handle, fetches, feed_dict, options, run_metadata)
980 if final_fetches or final_targets:
981 results = self._do_run(handle, final_targets, final_fetches,
--> 982 feed_dict_string, options, run_metadata)
983 else:
984 results = []
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1030 if handle is None:
1031 return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1032 target_list, options, run_metadata)
1033 else:
1034 return self._do_call(_prun_fn, self._session, handle, feed_dict,
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in _do_call(self, fn, *args)
1050 except KeyError:
1051 pass
-> 1052 raise type(e)(node_def, op, message)
1053
1054 def _extend_graph(self):
ResourceExhaustedError: OOM when allocating tensor with shape[25088,4096]
[[Node: random_uniform_13/RandomUniform = RandomUniform[T=DT_INT32, dtype=DT_FLOAT, seed=87654321, seed2=755436606, _device="/job:localhost/replica:0/task:0/gpu:0"](random_uniform_13/shape)]]
Caused by op u'random_uniform_13/RandomUniform', defined at:
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
app.launch_new_instance()
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 478, in start
self.io_loop.start()
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 177, in start
super(ZMQIOLoop, self).start()
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tornado/ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
self._handle_recv()
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 281, in dispatcher
return self.dispatch_shell(stream, msg)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 232, in dispatch_shell
handler(stream, idents, msg)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 397, in execute_request
user_expressions, allow_stdin)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2718, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2822, in run_ast_nodes
if self.run_code(code, result):
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2882, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-7-2b6861506a11>", line 1, in <module>
vgg = Vgg16()
File "vgg16.py", line 47, in __init__
self.create()
File "vgg16.py", line 134, in create
self.FCBlock()
File "vgg16.py", line 113, in FCBlock
model.add(Dense(4096, activation='relu'))
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/models.py", line 332, in add
output_tensor = layer(self.outputs[0])
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/engine/topology.py", line 546, in __call__
self.build(input_shapes[0])
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/layers/core.py", line 798, in build
constraint=self.W_constraint)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/engine/topology.py", line 418, in add_weight
weight = initializer(shape, name=name)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/initializations.py", line 66, in glorot_uniform
return uniform(shape, s, name=name)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/initializations.py", line 33, in uniform
return K.random_uniform_variable(shape, -scale, scale, name=name)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/backend/tensorflow_backend.py", line 634, in random_uniform_variable
low, high, dtype=tf_dtype, seed=seed)(shape)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/ops/init_ops.py", line 189, in __call__
dtype, seed=self.seed)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/ops/random_ops.py", line 236, in random_uniform
shape, dtype, seed=seed1, seed2=seed2)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/ops/gen_random_ops.py", line 263, in _random_uniform
seed=seed, seed2=seed2, name=name)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
op_def=op_def)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
self._traceback = _extract_stack()
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[25088,4096]
[[Node: random_uniform_13/RandomUniform = RandomUniform[T=DT_INT32, dtype=DT_FLOAT, seed=87654321, seed2=755436606, _device="/job:localhost/replica:0/task:0/gpu:0"](random_uniform_13/shape)]]
Upvotes: 1
Views: 303
Reputation: 1143
After
nvidia-smi
The last line will show the process with a 'pid' number
Enter with the 'pid' number last
with the following command (your four digit number at the end will be your own)
sudo kill -9 3096
Upvotes: 1