Reputation: 813
I am trying to build a Siamese neural network with a triplet loss function using TensorFlow. This is how it looks:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Lambda
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend as K

def build_network(input_shape, embeddingsize):
    network = Sequential()
    network.add(Conv2D(128, (7,7), activation='relu',
                       input_shape=input_shape,
                       kernel_initializer='he_uniform',
                       kernel_regularizer=l2(2e-4)))
    network.add(MaxPooling2D())
    network.add(Conv2D(128, (3,3), activation='relu',
                       kernel_initializer='he_uniform',
                       kernel_regularizer=l2(2e-4)))
    network.add(MaxPooling2D())
    network.add(Conv2D(256, (3,3), activation='relu',
                       kernel_initializer='he_uniform',
                       kernel_regularizer=l2(2e-4)))
    network.add(Flatten())
    network.add(Dense(4096, activation='relu',
                      kernel_regularizer=l2(1e-3),
                      kernel_initializer='he_uniform'))
    network.add(Dense(embeddingsize, activation=None,
                      kernel_regularizer=l2(1e-3),
                      kernel_initializer='he_uniform'))
    # Force the encoding to live on the d-dimensional hypersphere
    network.add(Lambda(lambda x: K.l2_normalize(x, axis=-1)))
    return network
When I try to initialize the model with this code:
emb_dim = 64
embedding_model = build_network(X_train[1].shape, emb_dim)
embedding_model.summary()
I get this error:
ResourceExhaustedError Traceback (most recent call last)
<ipython-input-22-9a90ee998c2d> in <module>
1 emb_dim = 64
2
----> 3 embedding_model = build_network(X_train[1].shape, emb_dim)
4
5 # embedding_model = Sequential([
<ipython-input-19-f51afd4ad3e5> in build_network(input_shape, embeddingsize)
21 network.add(Dense(4096, activation='relu',
22 kernel_regularizer=l2(1e-3),
---> 23 kernel_initializer='he_uniform'))
24
25
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\training\tracking\base.py in _method_wrapper(self, *args, **kwargs)
455 self._self_setattr_tracking = False # pylint: disable=protected-access
456 try:
--> 457 result = method(self, *args, **kwargs)
458 finally:
459 self._self_setattr_tracking = previous_value # pylint: disable=protected-access
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py in add(self, layer)
201 # If the model is being built continuously on top of an input layer:
202 # refresh its output.
--> 203 output_tensor = layer(self.outputs[0])
204 if len(nest.flatten(output_tensor)) != 1:
205 raise TypeError('All layers in a Sequential model '
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\keras\engine\base_layer.py in __call__(self, inputs, *args, **kwargs)
746 # Build layer if applicable (if the `build` method has been
747 # overridden).
--> 748 self._maybe_build(inputs)
749 cast_inputs = self._maybe_cast_inputs(inputs)
750
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\keras\engine\base_layer.py in _maybe_build(self, inputs)
2114 # operations.
2115 with tf_utils.maybe_init_scope(self):
-> 2116 self.build(input_shapes)
2117 # We must set self.built since user defined build functions are not
2118 # constrained to set self.built.
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\keras\layers\core.py in build(self, input_shape)
1111 constraint=self.kernel_constraint,
1112 dtype=self.dtype,
-> 1113 trainable=True)
1114 if self.use_bias:
1115 self.bias = self.add_weight(
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\keras\engine\base_layer.py in add_weight(self, name, shape, dtype, initializer, regularizer, trainable, constraint, partitioner, use_resource, synchronization, aggregation, **kwargs)
444 synchronization=synchronization,
445 aggregation=aggregation,
--> 446 caching_device=caching_device)
447 backend.track_variable(variable)
448
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\training\tracking\base.py in _add_variable_with_custom_getter(self, name, shape, dtype, initializer, getter, overwrite, **kwargs_for_getter)
742 dtype=dtype,
743 initializer=initializer,
--> 744 **kwargs_for_getter)
745
746 # If we set an initializer and the variable processed it, tracking will not
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\keras\engine\base_layer_utils.py in make_variable(name, shape, dtype, initializer, trainable, caching_device, validate_shape, constraint, use_resource, collections, synchronization, aggregation, partitioner)
140 synchronization=synchronization,
141 aggregation=aggregation,
--> 142 shape=variable_shape if variable_shape else None)
143
144
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\ops\variables.py in __call__(cls, *args, **kwargs)
256 def __call__(cls, *args, **kwargs):
257 if cls is VariableV1:
--> 258 return cls._variable_v1_call(*args, **kwargs)
259 elif cls is Variable:
260 return cls._variable_v2_call(*args, **kwargs)
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\ops\variables.py in _variable_v1_call(cls, initial_value, trainable, collections, validate_shape, caching_device, name, variable_def, dtype, expected_shape, import_scope, constraint, use_resource, synchronization, aggregation, shape)
217 synchronization=synchronization,
218 aggregation=aggregation,
--> 219 shape=shape)
220
221 def _variable_v2_call(cls,
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\ops\variables.py in <lambda>(**kwargs)
195 shape=None):
196 """Call on Variable class. Useful to force the signature."""
--> 197 previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
198 for _, getter in ops.get_default_graph()._variable_creator_stack: # pylint: disable=protected-access
199 previous_getter = _make_getter(getter, previous_getter)
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\ops\variable_scope.py in default_variable_creator(next_creator, **kwargs)
2594 synchronization=synchronization,
2595 aggregation=aggregation,
-> 2596 shape=shape)
2597 else:
2598 return variables.RefVariable(
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\ops\variables.py in __call__(cls, *args, **kwargs)
260 return cls._variable_v2_call(*args, **kwargs)
261 else:
--> 262 return super(VariableMetaclass, cls).__call__(*args, **kwargs)
263
264
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\ops\resource_variable_ops.py in __init__(self, initial_value, trainable, collections, validate_shape, caching_device, name, dtype, variable_def, import_scope, constraint, distribute_strategy, synchronization, aggregation, shape)
1409 aggregation=aggregation,
1410 shape=shape,
-> 1411 distribute_strategy=distribute_strategy)
1412
1413 def _init_from_args(self,
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\ops\resource_variable_ops.py in _init_from_args(self, initial_value, trainable, collections, caching_device, name, dtype, constraint, synchronization, aggregation, distribute_strategy, shape)
1540 with ops.name_scope("Initializer"), device_context_manager(None):
1541 initial_value = ops.convert_to_tensor(
-> 1542 initial_value() if init_from_fn else initial_value,
1543 name="initial_value", dtype=dtype)
1544 if shape is not None:
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\keras\engine\base_layer_utils.py in <lambda>()
120 (type(init_ops.Initializer), type(init_ops_v2.Initializer))):
121 initializer = initializer()
--> 122 init_val = lambda: initializer(shape, dtype=dtype)
123 variable_dtype = dtype.base_dtype
124 if use_resource is None:
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\ops\init_ops_v2.py in __call__(self, shape, dtype)
423 else:
424 limit = math.sqrt(3.0 * scale)
--> 425 return self._random_generator.random_uniform(shape, -limit, limit, dtype)
426
427 def get_config(self):
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\ops\init_ops_v2.py in random_uniform(self, shape, minval, maxval, dtype)
786 op = random_ops.random_uniform
787 return op(
--> 788 shape=shape, minval=minval, maxval=maxval, dtype=dtype, seed=self.seed)
789
790 def truncated_normal(self, shape, mean, stddev, dtype):
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\ops\random_ops.py in random_uniform(shape, minval, maxval, dtype, seed, name)
271 else:
272 rnd = gen_random_ops.random_uniform(shape, dtype, seed=seed1, seed2=seed2)
--> 273 result = math_ops.add(rnd * (maxval - minval), minval, name=name)
274 # TODO(b/132092188): C++ shape inference inside functional ops does not
275 # cross FuncGraph boundaries since that information is only available in
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\ops\gen_math_ops.py in add(x, y, name)
341 raise
342 except _core._NotOkStatusException as e:
--> 343 _ops.raise_from_not_ok_status(e, name)
344 # Add nodes to the TensorFlow graph.
345 try:
~\.conda\envs\py36\lib\site-packages\tensorflow_core\python\framework\ops.py in raise_from_not_ok_status(e, name)
6604 message = e.message + (" name: " + name if name is not None else "")
6605 # pylint: disable=protected-access
-> 6606 six.raise_from(core._status_to_exception(e.code, message), None)
6607 # pylint: enable=protected-access
6608
~\.conda\envs\py36\lib\site-packages\six.py in raise_from(value, from_value)
ResourceExhaustedError: OOM when allocating tensor with shape[278784,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Add] name: dense/kernel/Initializer/random_uniform/
I am using a Microsoft Azure virtual machine with an NVIDIA K80 GPU. One GPU is available, with 12 GB of memory. I checked nvidia-smi and it seems the model is taking all of the memory:
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 426.00 Driver Version: 426.00 CUDA Version: 10.1 |
|-------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla K80 TCC | 00000001:00:00.0 Off | 0 |
| N/A 54C P0 55W / 149W | 10889MiB / 11448MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 6620 C ...cbbivmadmin\.conda\envs\py36\python.exe 10766MiB |
+-----------------------------------------------------------------------------+
When I tried to load the same model on another machine with only a CPU, it worked:
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d (Conv2D) (None, 144, 144, 128) 18944
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 72, 72, 128) 0
_________________________________________________________________
conv2d_1 (Conv2D) (None, 70, 70, 128) 147584
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 35, 35, 128) 0
_________________________________________________________________
conv2d_2 (Conv2D) (None, 33, 33, 256) 295168
_________________________________________________________________
flatten (Flatten) (None, 278784) 0
_________________________________________________________________
dense (Dense) (None, 4096) 1141903360
_________________________________________________________________
dense_1 (Dense) (None, 64) 262208
_________________________________________________________________
lambda (Lambda) (None, 64) 0
=================================================================
Total params: 1,142,627,264
Trainable params: 1,142,627,264
Non-trainable params: 0
Moreover, I am not sure why the model is loaded into GPU memory by default, taking all of it.
Upvotes: 0
Views: 969
Reputation: 19123
You seem to have the wrong size specified somewhere:
OOM when allocating tensor with shape[278784,4096] and type float
                                      ^^^^^^
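That shape is not arbitrary: your own model summary shows Flatten producing 33 · 33 · 256 = 278,784 features, so the following Dense(4096) needs a 278,784 × 4096 kernel plus 4,096 biases — the 1,141,903,360 parameters listed for the dense layer, which is over 4 GB in float32 before any gradients or optimizer state are counted.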
Make sure you're using the correct sizes when defining the layers in your model.
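One way to bring that down — a sketch of one option, not the only fix — is to pool the final feature map before the embedding head instead of flattening it. GlobalAveragePooling2D is my substitution here, and dropping the 4096-unit layer is likewise a design choice I'm assuming is acceptable; adding another MaxPooling2D before Flatten, or simply using a much smaller Dense, would also cut the parameter count:

from tensorflow.keras import Sequential
from tensorflow.keras.layers import (Conv2D, MaxPooling2D,
                                     GlobalAveragePooling2D, Dense, Lambda)
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend as K

def build_network(input_shape, embeddingsize):
    network = Sequential()
    network.add(Conv2D(128, (7,7), activation='relu', input_shape=input_shape,
                       kernel_initializer='he_uniform', kernel_regularizer=l2(2e-4)))
    network.add(MaxPooling2D())
    network.add(Conv2D(128, (3,3), activation='relu',
                       kernel_initializer='he_uniform', kernel_regularizer=l2(2e-4)))
    network.add(MaxPooling2D())
    network.add(Conv2D(256, (3,3), activation='relu',
                       kernel_initializer='he_uniform', kernel_regularizer=l2(2e-4)))
    # Average each 33x33 feature map down to one value: the embedding head
    # now sees 256 inputs instead of 278,784.
    network.add(GlobalAveragePooling2D())
    network.add(Dense(embeddingsize, activation=None,
                      kernel_regularizer=l2(1e-3), kernel_initializer='he_uniform'))
    # Same unit-norm constraint as the original model
    network.add(Lambda(lambda x: K.l2_normalize(x, axis=-1)))
    return network

With that change the whole network is under half a million parameters and fits comfortably on a 12 GB K80.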
I checked nvidia-smi and it seems the model is taking all of the memory
Unless told otherwise, TensorFlow preallocates almost all of the GPU memory and runs its own memory-allocation strategy inside that pool, so nvidia-smi will always report the GPU's memory as nearly fully utilized.
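If you want to opt out of that preallocation (note this will not make an oversized model fit), TF 2.x lets you turn on on-demand growth. A minimal sketch, which must run before anything touches the GPU:

import tensorflow as tf

# Grow the GPU allocation on demand instead of grabbing ~all memory up front.
# Must be called before any op or model is placed on the GPU.
for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)

nvidia-smi will then show the process's actual usage, but the billion-parameter Dense kernel can still exhaust the K80 once gradients and optimizer state are allocated, so fixing the layer sizes is the real cure.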
Upvotes: 1