CNN Model learning improperly returning extrema values

Question

I've been working on a binary object detection CNN model using transfer learning with Keras' built-in ResNet50 model. However, after training it multiple times over 100 epochs, it returns coordinates of 0, 0, 0, 1, which translates to Xmin = 0, Ymin = 0, Xmax = 0 and Ymax = 300. The input images it receives are of shape (batch size, 300, 300, 3); the bounding-box labels are of shape (batch size, 4) and are normalized between 0 and 1; and the categorical labels are of shape (batch size, 2) and are one-hot encoded. Below is the model architecture.

from tensorflow.keras import layers
from tensorflow.keras import models
from keras.applications.resnet50 import ResNet50 


# Backbone: ResNet50 with ImageNet weights and without its classification
# head, built for 300x300 RGB inputs.
# NOTE(review): nothing in this snippet freezes the backbone, so all ResNet50
# weights are trained together with the new head -- confirm that is intended.
# NOTE(review): the ImageNet weights presumably expect inputs run through the
# matching `preprocess_input`; how X is scaled is not shown here -- verify.
res = ResNet50(weights ='imagenet', include_top = False,  
               input_shape =(300, 300, 3))  
x = res.output  
# 2x2 max pooling shrinks the backbone feature map before it is flattened.
x = layers.MaxPooling2D((2, 2))(x)



# Flatten and Fully connected layers

x = layers.BatchNormalization()(x)  

x = layers.Flatten()(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(512, activation='relu')(x)
x = layers.BatchNormalization()(x)  
x = layers.Dropout(0.50)(x)
# NOTE(review): no activation is passed here, so this layer is linear and its
# output feeds both heads directly -- confirm that is intended.
x = layers.Dense(512)(x)


# Output layers with proper names
# Both heads branch off the same 512-unit layer. The sigmoid keeps each of the
# four box values inside (0, 1); the softmax yields a 2-way class distribution.
bbox_output = layers.Dense(4, activation='sigmoid', name='bbox_output')(x)  # Bounding box output
class_output = layers.Dense(2, activation='softmax', name='class_output')(x)  # Class output

# Define the model
# The order of `outputs` (box head first, class head second) is the order in
# which list-style targets must be supplied.
model = models.Model(inputs = res.input, outputs=[bbox_output, class_output])

# Each head gets its own loss and metric, keyed by the output layer's name:
# the custom `diou_loss` for the box head and categorical cross-entropy for
# the one-hot class head. The optimizer is given by name only, so no custom
# hyperparameters are set here.
model.compile(
    optimizer='Adam',
    loss={'bbox_output': diou_loss, 'class_output': 'categorical_crossentropy'},
    metrics={'bbox_output': 'MSE', 'class_output': 'accuracy'})

# NOTE(review): `keras` is not among the imports visible in this snippet --
# confirm it is in scope where this runs.
callbacks = [
    keras.callbacks.EarlyStopping(
        # Stop training when the validation loss of the bounding-box head
        # (not the total `val_loss`) is no longer improving
        monitor='val_bbox_output_loss',
        # "no longer improving" being defined as "improving by less than 1e-2"
        min_delta=1e-2,
        # "no longer improving" being further defined as "for 20 epochs in a row"
        patience=20,
        verbose=1,
    )]

# Train for at most 100 epochs in batches of 50; the early-stopping callback
# may end the run sooner. Targets are passed as a list in the same order as
# the model's outputs: box labels first, class labels second. The held-out
# set supplies the `val_*` quantities that the callback monitors.
history = model.fit(X,
    [bbox_labels,class_labels], 
    epochs=100,batch_size=50, 
    verbose=1,
    validation_data=(X_test,[bbox_labels_test,class_labels_test]),
    callbacks=callbacks)

Below is the DIOU loss function:

def diou_loss(y_true, y_pred, epsilon=1e-7):
    """Distance-IoU (DIoU) loss for axis-aligned bounding boxes.

    Both inputs store one box per entry of the last axis as
    ``(x_min, y_min, x_max, y_max)``; only indices 0-3 of that axis are read.
    No image size is involved: the two boxes merely have to share the same
    coordinate scale (e.g. both normalized to [0, 1]).

    Boxes are assumed to be well formed (``x_min <= x_max`` and
    ``y_min <= y_max``). A box with swapped corners is given zero area and
    zero intersection, so its IoU is 0 and only the centre-distance term
    still contributes a gradient.

    Args:
        y_true: Ground-truth boxes, shape ``(..., 4)``.
        y_pred: Predicted boxes, same shape as ``y_true``.
        epsilon: Lower bound applied to both denominators (the union area and
            the squared diagonal of the enclosing box) to avoid dividing by
            zero.

    Returns:
        A tensor with one loss value per box (the last axis is reduced away):
        ``1 - IoU + d^2 / c^2``, where ``d`` is the distance between the box
        centres and ``c`` the diagonal of the smallest enclosing box.
    """
    # Align dtypes so that, for instance, float64 NumPy labels can be combined
    # with float32 predictions when the function is called directly.
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(y_true, y_pred.dtype)

    true_x_min, true_y_min = y_true[..., 0], y_true[..., 1]
    true_x_max, true_y_max = y_true[..., 2], y_true[..., 3]
    pred_x_min, pred_y_min = y_pred[..., 0], y_pred[..., 1]
    pred_x_max, pred_y_max = y_pred[..., 2], y_pred[..., 3]

    # Intersection rectangle; clamping each side at zero makes the area vanish
    # for boxes that do not overlap.
    inter_width = tf.maximum(
        0.0, tf.minimum(true_x_max, pred_x_max) - tf.maximum(true_x_min, pred_x_min))
    inter_height = tf.maximum(
        0.0, tf.minimum(true_y_max, pred_y_max) - tf.maximum(true_y_min, pred_y_min))
    inter_area = inter_width * inter_height

    # Individual areas are clamped as well, so a malformed box counts as empty
    # instead of producing a negative area.
    true_area = (tf.maximum(0.0, true_x_max - true_x_min)
                 * tf.maximum(0.0, true_y_max - true_y_min))
    pred_area = (tf.maximum(0.0, pred_x_max - pred_x_min)
                 * tf.maximum(0.0, pred_y_max - pred_y_min))
    union_area = true_area + pred_area - inter_area
    iou = inter_area / tf.maximum(union_area, epsilon)

    # Squared Euclidean distance between the two box centres.
    true_center_x = (true_x_min + true_x_max) / 2.0
    true_center_y = (true_y_min + true_y_max) / 2.0
    pred_center_x = (pred_x_min + pred_x_max) / 2.0
    pred_center_y = (pred_y_min + pred_y_max) / 2.0
    center_distance = ((true_center_x - pred_center_x) ** 2
                       + (true_center_y - pred_center_y) ** 2)

    # Squared diagonal of the smallest box enclosing both boxes.
    enclosing_width = tf.maximum(true_x_max, pred_x_max) - tf.minimum(true_x_min, pred_x_min)
    enclosing_height = tf.maximum(true_y_max, pred_y_max) - tf.minimum(true_y_min, pred_y_min)
    enclosing_diagonal = enclosing_width ** 2 + enclosing_height ** 2

    # The caller-supplied epsilon guards this denominator too, exactly as it
    # guards the union area above.
    return 1.0 - iou + center_distance / tf.maximum(enclosing_diagonal, epsilon)

I tried switching the loss function to MSE, but the problem persists, so I don't think the loss function is the cause. When I print bbox_labels, I get an array in which each row looks something like this:

[0.74666667 0.32333333 0.88333333 0.69 ]

CNN Model learning improperly returning extrema values

Answers (0)

Related Questions