Reputation: 959
I am using a pre-trained model to train an image classifier. Below Code is running fine on CPU and single unit GPU (i.e. when #GPU=1)
class Metrics(tf.keras.callbacks.Callback):
def __init__(self, train_tf_data, val_tf_data, CLASSES, logs={}, **kwargs):
super().__init__(**kwargs)
# self.keras_metric = tf.keras.metrics.Mean("val_f1_after_epoch")
self.train_tf_data = train_tf_data
self.val_tf_data = val_tf_data
# self.model = model
self.CLASSES = CLASSES
def on_epoch_end(self, epoch, logs={}):
# self.keras_metric.reset_state()
# for train data
self.train_reports = test_model(model=self.model, data=self.train_tf_data, CLASSES=self.CLASSES)
self.train_f1_after_epoch = self.train_reports['f1_score']
self.train_recall_after_epoch = self.train_reports['recall']
self.train_prec_after_epoch = self.train_reports['precision']
# for val data
self.val_reports = test_model(model=self.model, data=self.val_tf_data, CLASSES=self.CLASSES)
self.val_f1_after_epoch = self.val_reports['f1_score']
self.val_recall_after_epoch = self.val_reports['recall']
self.val_prec_after_epoch = self.val_reports['precision']
# saving train results to log dir
logs["f1_after_epoch"]=self.train_f1_after_epoch
logs['precision_after_epoch'] = self.train_prec_after_epoch
logs['recall_after_epoch'] = self.train_recall_after_epoch
# saving val results to log dir
logs['val_f1_after_epoch'] = self.val_f1_after_epoch
logs['val_precision_after_epoch'] = self.val_prec_after_epoch
logs['val_recall_after_epoch'] = self.val_recall_after_epoch
# self.keras_metric.update_state(self.val_f1_after_epoch)
print('reports_after_epoch', self.train_reports)
print('val_reports_after_epoch', self.val_reports)
with strategy.scope():
pretrained_model = tf.keras.applications.MobileNetV2(
weights='imagenet',
include_top=False,
input_shape=[*IMAGE_SIZE, IMG_CHANNELS])
pretrained_model.trainable = True #fine tuning
q_aware_pretrained_model = tf.keras.models.clone_model(pretrained_model,
clone_function=apply_quantization_to_dense,)
base_model = tf.keras.Sequential([
tf.keras.layers.Lambda(# Convert image from int[0, 255] to the format expect by this base_model
lambda data:tf.keras.applications.mobilenet.preprocess_input(
tf.cast(data, tf.float32)), input_shape=[*IMAGE_SIZE, 3]),
q_aware_pretrained_model,
tf.keras.layers.GlobalAveragePooling2D()])
base_model.layers[1]._name = 'custom_mnet_trainable'
base_model.add(tf.keras.layers.Dense(64, name='object_dense',kernel_regularizer=tf.keras.regularizers.l2(l2=0.1)))
base_model.add(tf.keras.layers.BatchNormalization(scale=False, center = False))
base_model.add(tf.keras.layers.Activation('relu', name='relu_dense_64'))
base_model.add(tf.keras.layers.Dropout(rate=0.5, name='dropout_dense_64'))
base_model.add(tf.keras.layers.Dense(32, name='object_dense_2',kernel_regularizer=tf.keras.regularizers.l2(l2=0.1)))
base_model.add(tf.keras.layers.BatchNormalization(scale=False, center = False))
base_model.add(tf.keras.layers.Activation('relu', name='relu_dense_32'))
base_model.add(tf.keras.layers.Dropout(rate=0.4, name='dropout_dense_32'))
base_model.add(tf.keras.layers.Dense(16, name='object_dense_16', kernel_regularizer=tf.keras.regularizers.l2(l2=0.1)))
base_model.add(tf.keras.layers.Dense(len(CLASS_NAMES), activation='softmax', name='object_prob'))
m1 = tf.keras.metrics.CategoricalAccuracy()
m2 = tf.keras.metrics.Recall()
m3 = tf.keras.metrics.Precision()
m4 = Metrics(train_tf_data=train_data, val_tf_data=test_data, CLASSES=CLASS_NAMES)
optimizers = [
tfa.optimizers.AdamW(learning_rate=lr * .001 , weight_decay=wd),
tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd)
]
optimizers_and_layers = [(optimizers[0], base_model.layers[0]), (optimizers[1], base_model.layers[1:])]
optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)
annotated_model = tf.keras.models.clone_model(
base_model,
clone_function=apply_quantization_to_dense,
)
model = tfmot.quantization.keras.quantize_apply(annotated_model)
model.compile(
optimizer= optimizer, loss=tfa.losses.SigmoidFocalCrossEntropy(reduction=tf.keras.losses.Reduction.AUTO),
metrics=[m1, m2, m3],
)
tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)
checkpoint_name = os.getcwd() + os.sep + CUSTOM_MODEL_PATH + os.sep + "training_chkpts/cp-{epoch:04d}-{val_f1_after_epoch:.2f}.ckpt"
checkpoint_dir_path = os.getcwd() + os.sep + CUSTOM_MODEL_PATH + os.sep+ "training_chkpts"
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_name,
monitor = 'val_f1_after_epoch',
save_best_only=True,
save_weights_only=True,
mode='max',
save_freq='epoch',
verbose=1)
checkpoint_cb._supports_tf_logs = False
current_dir = os.getcwd()
history = model.fit(train_data, validation_data=test_data,
epochs=N_EPOCHS,
callbacks=[m4, checkpoint_cb, tensorboard_cb])
But If I use a system when the number of GPU > 1 then it is throwing the below error.
Epoch 1/2 6/Unknown - 44s 150ms/step - loss: 19.2255 - categorical_accuracy: 0.0625 - recall: 0.0000e+00 - precision: 0.0000e+00
/bwz_venv/lib/python3.8/site-packages/keras/engine/functional.py:1410: CustomMaskWarning: Custom mask layers require a config and must override get_config. When loading, the custom mask layer must be passed to the custom_objects argument. layer_config = serialize_layer_fn(layer) 288/Unknown - 84s 141ms/step - loss: 13.7873 - categorical_accuracy: 0.1788 - recall: 0.0080 - precision: 0.77082021-12-30 15:08:31.404434: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at transpose_op.cc:142 : INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4
Traceback (most recent call last): File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.8/runpy.py", line 87, in _run_code exec(code, run_globals) File "/ssd/custom_mnet_v2.py", line 536, in history = model.fit(train_data, validation_data=test_data, File "bwz_venv/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler raise e.with_traceback(filtered_tb) from None File "/bwz_venv/lib/python3.8/site-packages/tensorflow/python/eager/execute.py", line 58, in quick_execute tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name, tensorflow.python.framework.errors_impl.InvalidArgumentError: 3 root error(s) found.
(0) INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4 [[{{node gradient_tape/replica_1/sequential/custom_mnet_trainable/Conv1/Conv2D/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}}]] [[div_no_nan_3/ReadVariableOp/_558]]
(1) INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4 [[{{node gradient_tape/replica_1/sequential/custom_mnet_trainable/Conv1/Conv2D/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}}]] [[assert_less_equal/Assert/AssertGuard/else/_4049/assert_less_equal/Assert/AssertGuard/Assert/data_4/_546]]
(2) INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4 [[{{node gradient_tape/replica_1/sequential/custom_mnet_trainable/Conv1/Conv2D/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}}]] 0 successful operations. 0 derived errors ignored. [Op:__inference_train_function_1079980]
Function call stack: train_function -> train_function -> train_function
Few things that I have already tested
Here is the link to the Google Colab Notebook to reproduce the error(please set #GPU>1)
Upvotes: 5
Views: 806
Reputation: 1941
I'm 70% confident this is caused by your lambda layer. I think that when you run multi-GPU, your model needs to be serializable so that it can be placed on each GPU. I think lambda layers cannot be serialized.
See this note:
WARNING: tf.keras.layers.Lambda layers have (de)serialization limitations!
here https://keras.io/api/layers/core_layers/lambda/.
Trying rewriting your lambda layers as a real custom layer that supports serialization, ie it implements get_config(). So instead of
lambda data:tf.keras.applications.mobilenet.preprocess_input(
tf.cast(data, tf.float32)),
write a proper custom layer as
class Prep(tf.keras.layers.Layer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def get_config(self):
return super().get_config()
def call(self, inputs):
return tf.keras.applications.mobilenet.preprocess_input(
tf.cast(inputs, tf.float32))
then add that new Prep layer into your sequential model. LMK if that works. If not I'll delete this response.
Upvotes: 3