learner

Reputation: 959

INVALID_ARGUMENT: transpose expects a vector of size 0 (when using more than one GPU)

I am using a pre-trained model to train an image classifier. The code below runs fine on CPU and on a single GPU (i.e. when #GPU = 1).

class Metrics(tf.keras.callbacks.Callback):
    def __init__(self, train_tf_data, val_tf_data, CLASSES, logs={}, **kwargs):
        super().__init__(**kwargs)
        # self.keras_metric = tf.keras.metrics.Mean("val_f1_after_epoch")
        self.train_tf_data = train_tf_data
        self.val_tf_data = val_tf_data
        # self.model = model
        self.CLASSES = CLASSES

    def on_epoch_end(self, epoch, logs={}):
        # self.keras_metric.reset_state()
        # for train data
        self.train_reports = test_model(model=self.model, data=self.train_tf_data, CLASSES=self.CLASSES)
        self.train_f1_after_epoch = self.train_reports['f1_score']
        self.train_recall_after_epoch = self.train_reports['recall']
        self.train_prec_after_epoch = self.train_reports['precision']

        # for val data
        self.val_reports = test_model(model=self.model, data=self.val_tf_data, CLASSES=self.CLASSES)
        self.val_f1_after_epoch = self.val_reports['f1_score']
        self.val_recall_after_epoch = self.val_reports['recall']
        self.val_prec_after_epoch = self.val_reports['precision']

        # saving train results to log dir
        logs["f1_after_epoch"]=self.train_f1_after_epoch
        logs['precision_after_epoch'] = self.train_prec_after_epoch
        logs['recall_after_epoch'] = self.train_recall_after_epoch
        
        # saving val results to log dir
        logs['val_f1_after_epoch'] = self.val_f1_after_epoch
        logs['val_precision_after_epoch'] = self.val_prec_after_epoch
        logs['val_recall_after_epoch'] = self.val_recall_after_epoch
        # self.keras_metric.update_state(self.val_f1_after_epoch)

        print('reports_after_epoch', self.train_reports)
        print('val_reports_after_epoch', self.val_reports)
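For reference, test_model is not shown above; it is a helper that evaluates the model on a dataset and returns a dict with 'f1_score', 'precision', and 'recall' keys. A minimal sketch of a helper with the same interface (simplified; the real implementation is omitted here):

import numpy as np
from sklearn.metrics import precision_recall_fscore_support

def test_model(model, data, CLASSES):
    # Sketch only: collect predictions over the tf.data dataset and
    # compute macro-averaged precision/recall/F1 with scikit-learn.
    # CLASSES is unused in this sketch but kept to match the call sites above.
    y_true, y_pred = [], []
    for images, labels in data:
        probs = model.predict(images, verbose=0)
        y_pred.extend(np.argmax(probs, axis=-1))
        y_true.extend(np.argmax(labels.numpy(), axis=-1))  # assumes one-hot labels
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0)
    return {'f1_score': f1, 'precision': precision, 'recall': recall}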
        



with strategy.scope():
    pretrained_model = tf.keras.applications.MobileNetV2(
                                                    weights='imagenet',
                                                    include_top=False,
                                                    input_shape=[*IMAGE_SIZE, IMG_CHANNELS])
    pretrained_model.trainable = True #fine tuning
    q_aware_pretrained_model = tf.keras.models.clone_model(
        pretrained_model, clone_function=apply_quantization_to_dense)
    base_model = tf.keras.Sequential([
        # Convert images from int [0, 255] to the format expected by the base model
        tf.keras.layers.Lambda(
            lambda data: tf.keras.applications.mobilenet.preprocess_input(
                tf.cast(data, tf.float32)),
            input_shape=[*IMAGE_SIZE, 3]),
        q_aware_pretrained_model,
        tf.keras.layers.GlobalAveragePooling2D()])
    base_model.layers[1]._name = 'custom_mnet_trainable'
    base_model.add(tf.keras.layers.Dense(64, name='object_dense', kernel_regularizer=tf.keras.regularizers.l2(l2=0.1)))
    base_model.add(tf.keras.layers.BatchNormalization(scale=False, center=False))
    base_model.add(tf.keras.layers.Activation('relu', name='relu_dense_64'))
    base_model.add(tf.keras.layers.Dropout(rate=0.5, name='dropout_dense_64'))
    base_model.add(tf.keras.layers.Dense(32, name='object_dense_2', kernel_regularizer=tf.keras.regularizers.l2(l2=0.1)))
    base_model.add(tf.keras.layers.BatchNormalization(scale=False, center=False))
    base_model.add(tf.keras.layers.Activation('relu', name='relu_dense_32'))
    base_model.add(tf.keras.layers.Dropout(rate=0.4, name='dropout_dense_32'))
    base_model.add(tf.keras.layers.Dense(16, name='object_dense_16', kernel_regularizer=tf.keras.regularizers.l2(l2=0.1)))
    base_model.add(tf.keras.layers.Dense(len(CLASS_NAMES), activation='softmax', name='object_prob'))
    m1 = tf.keras.metrics.CategoricalAccuracy()
    m2 = tf.keras.metrics.Recall()
    m3 = tf.keras.metrics.Precision()

    m4 = Metrics(train_tf_data=train_data, val_tf_data=test_data, CLASSES=CLASS_NAMES)


    optimizers = [
        tfa.optimizers.AdamW(learning_rate=lr * 0.001, weight_decay=wd),
        tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd)
    ]

    optimizers_and_layers = [(optimizers[0], base_model.layers[0]), (optimizers[1], base_model.layers[1:])]

    optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)

    annotated_model = tf.keras.models.clone_model(
        base_model,
        clone_function=apply_quantization_to_dense,
    )


    model = tfmot.quantization.keras.quantize_apply(annotated_model)
    model.compile(
        optimizer=optimizer,
        loss=tfa.losses.SigmoidFocalCrossEntropy(reduction=tf.keras.losses.Reduction.AUTO),
        metrics=[m1, m2, m3],
    )

tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)

checkpoint_name = os.path.join(os.getcwd(), CUSTOM_MODEL_PATH, "training_chkpts", "cp-{epoch:04d}-{val_f1_after_epoch:.2f}.ckpt")
checkpoint_dir_path = os.path.join(os.getcwd(), CUSTOM_MODEL_PATH, "training_chkpts")
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_name,
                                                   monitor='val_f1_after_epoch',
                                                   save_best_only=True,
                                                   save_weights_only=True,
                                                   mode='max',
                                                   save_freq='epoch',
                                                   verbose=1)

# Make ModelCheckpoint read the same (numpy) logs dict that the custom
# Metrics callback mutates, so it can see val_f1_after_epoch.
checkpoint_cb._supports_tf_logs = False
current_dir = os.getcwd()
history = model.fit(train_data, validation_data=test_data, 
                    epochs=N_EPOCHS,
                    callbacks=[m4, checkpoint_cb, tensorboard_cb])

But if I use a system with more than one GPU, it throws the error below.

Epoch 1/2
6/Unknown - 44s 150ms/step - loss: 19.2255 - categorical_accuracy: 0.0625 - recall: 0.0000e+00 - precision: 0.0000e+00

/bwz_venv/lib/python3.8/site-packages/keras/engine/functional.py:1410: CustomMaskWarning: Custom mask layers require a config and must override get_config. When loading, the custom mask layer must be passed to the custom_objects argument.
  layer_config = serialize_layer_fn(layer)
288/Unknown - 84s 141ms/step - loss: 13.7873 - categorical_accuracy: 0.1788 - recall: 0.0080 - precision: 0.7708
2021-12-30 15:08:31.404434: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at transpose_op.cc:142 : INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4

Traceback (most recent call last):
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/ssd/custom_mnet_v2.py", line 536, in <module>
    history = model.fit(train_data, validation_data=test_data,
  File "bwz_venv/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/bwz_venv/lib/python3.8/site-packages/tensorflow/python/eager/execute.py", line 58, in quick_execute
    tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InvalidArgumentError: 3 root error(s) found.

(0) INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4 [[{{node gradient_tape/replica_1/sequential/custom_mnet_trainable/Conv1/Conv2D/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}}]] [[div_no_nan_3/ReadVariableOp/_558]]

(1) INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4 [[{{node gradient_tape/replica_1/sequential/custom_mnet_trainable/Conv1/Conv2D/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}}]] [[assert_less_equal/Assert/AssertGuard/else/_4049/assert_less_equal/Assert/AssertGuard/Assert/data_4/_546]]

(2) INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4 [[{{node gradient_tape/replica_1/sequential/custom_mnet_trainable/Conv1/Conv2D/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}}]]
0 successful operations. 0 derived errors ignored. [Op:__inference_train_function_1079980]

Function call stack: train_function -> train_function -> train_function

A few things that I have already tested:

  1. Tried out different metrics (categorical_accuracy) to check whether the issue is related to the custom monitoring metrics or not.
  2. Ran the code in CPU and single-GPU environments, where it works perfectly fine.

Here is the link to the Google Colab notebook to reproduce the error (please set #GPU > 1).

Upvotes: 5

Views: 806

Answers (1)

Yaoshiang

Reputation: 1941

I'm 70% confident this is caused by your Lambda layer. I think that when you run multi-GPU, your model needs to be serializable so that it can be replicated onto each GPU, and I don't think Lambda layers can be serialized reliably.

See this note:

WARNING: tf.keras.layers.Lambda layers have (de)serialization limitations!

here https://keras.io/api/layers/core_layers/lambda/.
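One quick way to test this theory (a rough sketch; I haven't run it against your exact setup): round-trip your model through its JSON config and see whether it survives.

import tensorflow as tf

# A model that serializes cleanly should survive a to_json() /
# model_from_json() round trip without errors or warnings.
json_str = base_model.to_json()
restored = tf.keras.models.model_from_json(json_str)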

Try rewriting your Lambda layer as a real custom layer that supports serialization, i.e. one that implements get_config(). So instead of

lambda data: tf.keras.applications.mobilenet.preprocess_input(
    tf.cast(data, tf.float32)),

write a proper custom layer as

class Prep(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__(**kwargs)

  def get_config(self):
    # No extra constructor args, so the base config is enough
    # to make this layer serializable.
    return super().get_config()

  def call(self, inputs):
    return tf.keras.applications.mobilenet.preprocess_input(
        tf.cast(inputs, tf.float32))

then add that new Prep layer into your Sequential model. In your code, that replacement would look roughly like this (a sketch, reusing the variable names from your question):
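base_model = tf.keras.Sequential([
    Prep(input_shape=[*IMAGE_SIZE, 3]),  # replaces the Lambda layer
    q_aware_pretrained_model,
    tf.keras.layers.GlobalAveragePooling2D()])

LMK if that works. If not, I'll delete this response.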

Upvotes: 3
