Reputation: 85
I am using Microsoft Azure to train a CNN (Convolutional Neural Network) to recognize 11 classes of food using 16k images. The Virtual Machine I'm using is a "STANDARD_NC24_PROMO" with the following specs: 24 vCPUs, 4 GPUs, 224 GB memory, 1440 GB storage.
The problem is that even a simple run of the program fails with the following ResourceExhaustedError (out of memory on the GPU):
2-conv-256-nodes-0-dense-1576530179
Train on 10636 samples, validate on 2660 samples
Epoch 1/10
32/10636 [..............................] - ETA: 57:51
---------------------------------------------------------------------------
ResourceExhaustedError Traceback (most recent call last)
<ipython-input-10-ee913a07a18b> in <module>
86 model.compile(loss="sparse_categorical_crossentropy",optimizer="adam",metrics=["accuracy"])
87 ### TRAIN
---> 88 model.fit(train_images, train_labels,validation_split=0.20, epochs=10,use_multiprocessing=True)
89
90 loss, acc = model.evaluate(test_images, test_labels, verbose = 0)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
726 max_queue_size=max_queue_size,
727 workers=workers,
--> 728 use_multiprocessing=use_multiprocessing)
729
730 def evaluate(self,
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)
322 mode=ModeKeys.TRAIN,
323 training_context=training_context,
--> 324 total_epochs=epochs)
325 cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
326
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
121 step=step, mode=mode, size=current_batch_size) as batch_logs:
122 try:
--> 123 batch_outs = execution_function(iterator)
124 except (StopIteration, errors.OutOfRangeError):
125 # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in execution_function(input_fn)
84 # `numpy` translates Tensors to values in Eager mode.
85 return nest.map_structure(_non_none_constant_value,
---> 86 distributed_function(input_fn))
87
88 return execution_function
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/def_function.py in __call__(self, *args, **kwds)
455
456 tracing_count = self._get_tracing_count()
--> 457 result = self._call(*args, **kwds)
458 if tracing_count == self._get_tracing_count():
459 self._call_counter.called_without_tracing()
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/def_function.py in _call(self, *args, **kwds)
518 # Lifting succeeded, so variables are initialized and we can run the
519 # stateless function.
--> 520 return self._stateless_fn(*args, **kwds)
521 else:
522 canon_args, canon_kwds = \
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in __call__(self, *args, **kwargs)
1821 """Calls a graph function specialized to the inputs."""
1822 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 1823 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
1824
1825 @property
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in _filtered_call(self, args, kwargs)
1139 if isinstance(t, (ops.Tensor,
1140 resource_variable_ops.BaseResourceVariable))),
-> 1141 self.captured_inputs)
1142
1143 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1222 if executing_eagerly:
1223 flat_outputs = forward_function.call(
-> 1224 ctx, args, cancellation_manager=cancellation_manager)
1225 else:
1226 gradient_name = self._delayed_rewrite_functions.register()
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in call(self, ctx, args, cancellation_manager)
509 inputs=args,
510 attrs=("executor_type", executor_type, "config_proto", config),
--> 511 ctx=ctx)
512 else:
513 outputs = execute.execute_with_cancellation(
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
65 else:
66 message = e.message
---> 67 six.raise_from(core._status_to_exception(e.code, message), None)
68 except TypeError as e:
69 keras_symbolic_tensors = [
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/six.py in raise_from(value, from_value)
ResourceExhaustedError: OOM when allocating tensor with shape[32,256,98,98] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[node sequential_7/conv2d_14/Conv2D (defined at /anaconda/envs/azureml_py36/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1751) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[Op:__inference_distributed_function_7727]
Function call stack:
distributed_function
I will attach below the bit of code that does the training:
# Hyper-parameter sweep: build, train and evaluate one CNN for every
# combination of (dense_layer, layer_size, conv_layer) and remember the
# configuration that reaches the best test accuracy.
#
# Names read here but defined elsewhere in the script: dense_layers,
# layer_sizes, conv_layers, IMG_SIZE, train_images, train_labels,
# test_images, test_labels, maxacc, maxdict, `time` and the Keras classes
# (Sequential, Conv2D, Activation, MaxPooling2D, Dropout, Flatten, Dense).
from tensorflow.keras import backend as keras_backend

for dense_layer in dense_layers:
    for layer_size in layer_sizes:
        for conv_layer in conv_layers:
            NAME = "{}-conv-{}-nodes-{}-dense-{}".format(
                conv_layer, layer_size, dense_layer, int(time.time()))
            print(NAME)

            # Every iteration creates a brand-new model. Without this call
            # the state of all previously built models stays registered in
            # the Keras session and memory use grows with each iteration.
            keras_backend.clear_session()

            model = Sequential()

            # First convolutional block; it fixes the input shape
            # (square, single-channel images of side IMG_SIZE).
            model.add(Conv2D(layer_size, (3, 3),
                             input_shape=(IMG_SIZE, IMG_SIZE, 1)))
            model.add(Activation("relu"))
            model.add(MaxPooling2D(pool_size=(2, 2)))
            model.add(Dropout(0.5))

            # Remaining convolutional blocks (conv_layer blocks in total).
            for _ in range(conv_layer - 1):
                model.add(Conv2D(layer_size, (3, 3)))
                model.add(Activation("relu"))
                model.add(MaxPooling2D(pool_size=(2, 2)))
                model.add(Dropout(0.5))

            model.add(Flatten())

            # Optional hidden dense layers.
            for _ in range(dense_layer):
                model.add(Dense(layer_size))
                model.add(Activation("relu"))

            # The output layer with 11 neurons
            model.add(Dense(11))
            model.add(Activation("softmax"))

            ### COMPILE MODEL
            model.compile(loss="sparse_categorical_crossentropy",
                          optimizer="adam",
                          metrics=["accuracy"])

            ### TRAIN
            # 20% of the training data is held out for validation.
            model.fit(train_images, train_labels,
                      validation_split=0.20, epochs=10)

            loss, acc = model.evaluate(test_images, test_labels, verbose=0)
            print(acc * 100)

            # Track the best configuration seen so far (accuracy in percent).
            if maxacc < acc * 100:
                maxacc = acc * 100
                maxname = NAME
                maxdict[maxacc] = maxname

# Report the best configuration found across the whole sweep.
print("\n\n", maxacc, " ", maxname)
My laptop, which is nowhere near as powerful, has no problem executing this, yet running it on Azure gives me that error. The values of the iteration variables don't matter: I get the error no matter what they are.
Any help would be greatly appreciated, thank you for your time!
I would like to add that the program does not work even with this small number of layers:
# Smallest sweep configuration that still reproduces the error: a single
# model with one convolutional block, 32 filters and no hidden dense layer.
dense_layers = [0]  # number of hidden Dense layers per model
layer_sizes = [32]  # filters per Conv2D layer / units per hidden Dense layer
conv_layers = [1]  # number of convolutional blocks per model
Upvotes: 1
Views: 164
Reputation: 610
Unfortunately, I have never used Azure to train any kind of network. But I would try:
A lot of optimization happens under the hood, which may be why it works locally but behaves slightly differently on multi-GPU machines.
Upvotes: 1