Correct loss function for bboxes in a detector model

Question

I am trying to understand the training process of an anchor-based detector model. Unfortunately, I am having trouble with the loss function. I have built the model with classification and regression heads, and both the model and the anchor generation work well (I checked). My main problem is the loss function: I have tried several variations of different loss functions, but without success. A draft of the loss class is listed below:

class Losses(nn.Module):
    """Combined classification, regression and GIoU loss for a box detector.

    For every image the boxes are matched against the ground-truth boxes by
    IoU. Matched boxes receive their IoU as a soft classification target, a
    corner-distance value as the regression quantity, and ``1 - GIoU`` as an
    additional overlap penalty.
    """

    def __init__(self, cls_loss=None, reg_loss=None, threshold=0.1, alpha=1.0, beta=1.0):
        """
        Args:
            cls_loss: classification criterion called as ``cls_loss(predictions, targets)``.
                Defaults to a fresh ``nn.BCEWithLogitsLoss()``.
            reg_loss: regression criterion called as ``reg_loss(distance, zeros)``.
                Defaults to a fresh ``nn.SmoothL1Loss()``.
            threshold: minimum IoU for a box to be matched to a ground-truth box.
            alpha: weight of the classification term in the total loss.
            beta: weight of the regression term in the total loss.
        """
        super().__init__()
        self.threshold = threshold
        # Build the defaults per instance instead of once at definition time.
        self.cls_loss = nn.BCEWithLogitsLoss() if cls_loss is None else cls_loss
        self.reg_loss = nn.SmoothL1Loss() if reg_loss is None else reg_loss
        self.alpha = alpha
        self.beta = beta

    def forward(self, bboxes, pred_offsets, predictions, groundtruth):
        """Compute the total loss and its three components.

        Args:
            bboxes: bounding boxes, shape [bs, X, 4].
            pred_offsets: offset values, shape [bs, X, 4].
                NOTE(review): this argument is not used anywhere in the
                computation, so no gradient reaches it through this loss.
            predictions: prediction of target object in bbox, shape [bs, X].
            groundtruth: real boxes, shape [bs, y, 4], laid out as [xc, yc, w, h].

        Returns:
            Tuple ``(total_loss, cls_loss, reg_loss, giou_loss)``.
        """
        assert bboxes.shape[0] == groundtruth.shape[0], "Mismatch in batch sizes"

        distance, probs, giou_losses = [], [], []
        for image_boxes, image_gt in zip(bboxes, groundtruth):
            dist, prob, giou = self._overlap(image_boxes, image_gt)
            distance.append(dist)
            probs.append(prob)
            giou_losses.append(giou)

        distance = torch.stack(distance)
        probs = torch.stack(probs)
        giou_loss = torch.stack(giou_losses).mean()

        # Classification loss: IoU-derived soft targets against the raw predictions.
        cls_loss = self.cls_loss(predictions, probs)
        # Regression loss: drive the corner distance towards zero. The target
        # has exactly the shape of ``distance`` so nothing relies on broadcasting.
        reg_loss = self.reg_loss(distance, torch.zeros_like(distance))
        # Weighted total loss (the GIoU term carries an implicit weight of 1).
        total_loss = self.alpha * cls_loss + self.beta * reg_loss + giou_loss
        return total_loss, cls_loss, reg_loss, giou_loss

    def _overlap(self, bboxes, groundtruth):
        """Build the per-box targets for a single image.

        Args:
            bboxes: boxes of one image in center format; converted with
                ``to_corners_batch``.
            groundtruth: ground-truth rows of the same image; the first four
                values of each row are converted with ``to_corners``. Rows
                whose first four values are all zero are skipped.

        Returns:
            Tuple ``(distance, probs, giou_loss)``, each of length ``len(bboxes)``.
            Boxes without a match keep distance 0, GIoU loss 0 and get the
            classification target 0.01.
        """
        num_boxes = len(bboxes)
        device = bboxes.device
        distance = torch.zeros(num_boxes, device=device)
        probs = torch.zeros(num_boxes, device=device)
        giou_loss = torch.zeros(num_boxes, device=device)
        # Best IoU seen so far per box; -1 so that any IoU >= threshold wins first.
        best_iou = torch.full((num_boxes,), -1.0, device=device)

        # The box corners do not depend on the ground truth: compute them once.
        bx1, by1, bx2, by2 = to_corners_batch(bboxes)
        box_corners = torch.stack([bx1, by1, bx2, by2], dim=-1)

        for gt in groundtruth:
            if not torch.any(gt[:4] != 0):
                continue  # all-zero row
            x1, y1, x2, y2 = to_corners(gt[:4])
            iou, dist = self._intersect(x1, y1, x2, y2, bx1, by1, bx2, by2)
            # torch.stack keeps dtype and device of the ground-truth values.
            giou = self.generalized_iou(box_corners, torch.stack([x1, y1, x2, y2]))
            # Only overwrite a box's targets when this ground truth overlaps it
            # better than every previous one; otherwise the last ground truth
            # above the threshold would win regardless of overlap quality.
            mask = (iou >= self.threshold) & (iou > best_iou)
            best_iou[mask] = iou.detach()[mask]
            probs[mask] = iou[mask]  # Use IoU as probabilities
            distance[mask] = dist[mask]
            giou_loss[mask] = (1 - giou[mask])  # GIoU values

        probs[probs == 0] = 0.01  # small noise
        return distance, probs, giou_loss

    @staticmethod
    def generalized_iou(bboxes1, bboxes2):
        """Generalized IoU between many boxes and one reference box.

        Args:
            bboxes1: corner-format boxes indexed as ``[:, 0..3]`` = x1, y1, x2, y2.
            bboxes2: one corner-format box indexed as ``[0..3]`` = x1, y1, x2, y2.

        Returns:
            One GIoU value per row of ``bboxes1``: the IoU minus the fraction of
            the smallest enclosing box that is not covered by the union.
        """
        # Smallest box enclosing both boxes.
        x1_min = torch.min(bboxes1[:, 0], bboxes2[0])
        y1_min = torch.min(bboxes1[:, 1], bboxes2[1])
        x2_max = torch.max(bboxes1[:, 2], bboxes2[2])
        y2_max = torch.max(bboxes1[:, 3], bboxes2[3])

        # Intersection rectangle.
        inter_x1 = torch.max(bboxes1[:, 0], bboxes2[0])
        inter_y1 = torch.max(bboxes1[:, 1], bboxes2[1])
        inter_x2 = torch.min(bboxes1[:, 2], bboxes2[2])
        inter_y2 = torch.min(bboxes1[:, 3], bboxes2[3])

        # Clamp at zero so disjoint boxes give an empty intersection.
        inter_area = torch.clamp(inter_x2 - inter_x1, min=0) * torch.clamp(inter_y2 - inter_y1, min=0)
        bbox1_area = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
        bbox2_area = (bboxes2[2] - bboxes2[0]) * (bboxes2[3] - bboxes2[1])

        union_area = bbox1_area + bbox2_area - inter_area
        iou = inter_area / (union_area + 1e-6)  # epsilon guards against 0/0

        # Enclosing box area
        enclosing_area = (x2_max - x1_min) * (y2_max - y1_min)
        giou = iou - (enclosing_area - union_area) / (enclosing_area + 1e-6)
        return giou

    def _intersect(self, x1, y1, x2, y2, bx1, by1, bx2, by2):
        """IoU and corner distance between one box and a set of boxes.

        Args:
            x1, y1, x2, y2: corners of the single (ground-truth) box.
            bx1, by1, bx2, by2: corners of the boxes compared against it.

        Returns:
            Tuple ``(iou, dist)`` where ``dist`` is the sum of the Euclidean
            distances between the two top-left corners and between the two
            bottom-right corners.
        """
        x_inter_min = torch.max(x1, bx1)
        y_inter_min = torch.max(y1, by1)
        x_inter_max = torch.min(x2, bx2)
        y_inter_max = torch.min(y2, by2)

        # Clamp at zero so disjoint boxes give an empty intersection.
        inter_width = torch.clamp(x_inter_max - x_inter_min, min=0)
        inter_height = torch.clamp(y_inter_max - y_inter_min, min=0)
        intersection_area = inter_width * inter_height

        union_area = ((x2 - x1) * (y2 - y1)) + ((bx2 - bx1) * (by2 - by1)) - intersection_area
        iou = intersection_area / (union_area + 1e-6)  # epsilon guards against 0/0

        # NOTE(review): the derivative of sqrt is unbounded at 0, so a box whose
        # corner coincides exactly with the ground truth may produce NaN
        # gradients if the box coordinates require grad - confirm in training.
        dist = torch.sqrt((bx2 - x2) ** 2 + (by2 - y2) ** 2) + torch.sqrt((bx1 - x1) ** 2 + (by1 - y1) ** 2)
        return iou, dist


def to_corners_batch(batch):
    """Convert center-format boxes to corner format.

    The last axis of ``batch`` is read as (center x, center y, width, height).

    Returns:
        Tuple ``(x1, y1, x2, y2)``: the left, top, right and bottom
        coordinates, each with the shape of ``batch`` minus its last axis.
    """
    center_x = batch[..., 0]
    center_y = batch[..., 1]
    half_width = batch[..., 2] / 2
    half_height = batch[..., 3] / 2

    left, right = center_x - half_width, center_x + half_width
    top, bottom = center_y - half_height, center_y + half_height
    return left, top, right, bottom


def to_corners(box):
    """Convert a single center-format box to corner format.

    ``box`` is indexed as (center x, center y, width, height).

    Returns:
        Tuple ``(x1, y1, x2, y2)`` with the left, top, right and bottom
        coordinates of the box.
    """
    center_x, center_y = box[0], box[1]
    half_width = box[2] / 2
    half_height = box[3] / 2
    return (
        center_x - half_width,
        center_y - half_height,
        center_x + half_width,
        center_y + half_height,
    )

What is wrong? The model itself is correct: it uses a MobileNet feature backbone connected through adapters to two heads, which produce the outputs for the regression and classification parts.

Addition: I modified the loss function, but it still doesn't work. The current version is:

 def forward(self, anchors, preds, gt_bboxes, offset):
        """Compute the detection loss (classification + weighted regression).

        Args:
            anchors: anchor boxes; ``anchors[0]`` is indexed with the per-image
                positive mask, so the same anchors are used for every image.
            preds: dict with ``'probs'`` (classification outputs, indexed
                ``[image, box]``) and ``'bboxes'`` (predicted boxes, indexed
                ``[image, box]``).
            gt_bboxes: ground-truth boxes per image; all-zero rows are dropped.
            offset: not used by this method.

        Returns:
            ``cls_loss / total + self.lambda_reg * reg_loss``.
        """
        probs = preds['probs']
        bboxes = preds['bboxes']
        device = anchors.device
        # The batch size was never defined; ``probs`` is indexed per image below.
        bs = probs.shape[0]

        cls_pos_targets = torch.zeros_like(probs, device=device)
        cls_neg_targets = torch.zeros_like(probs, device=device)
        reg_targets = torch.zeros_like(bboxes, device=device)

        iou_loss = 0  # diagnostic only, not part of the returned loss
        reg_loss = 0

        for b in range(bs):
            gt = gt_bboxes[b]
            gt = gt[~torch.all(gt == 0, dim=-1)]
            if len(gt) == 0:
                # NOTE(review): images without ground truth contribute no
                # negative targets either - confirm this is intended.
                continue  # if we do not have groundtruth

            # NOTE(review): matching uses the *predicted* boxes, so the targets
            # move with the predictions; anchor-based matching is the usual
            # alternative - confirm which one is intended.
            ious = self.compute_iou(bboxes[b], gt)  # calc IoU between bboxes and groundtruth

            max_ious, gt_indices = ious.max(dim=1)

            pos_mask = max_ious >= self.pos_trh
            neg_mask = max_ious < self.neg_trh
            # update pos and neg targets
            cls_pos_targets[b, pos_mask] = 1
            cls_neg_targets[b, neg_mask] = 1

            num_pos = int(pos_mask.sum())
            if num_pos == 0:
                # No positives: skipping avoids the 0/0 = NaN in the IoU term;
                # the regression term would be zero for this image anyway.
                continue
            iou_loss += 1 - max_ious[pos_mask].sum() / num_pos

            # regression targets (use only positive anchors, boxes and groundtruth)
            pos_anchors = anchors[0][pos_mask]
            pos_gt = gt[gt_indices[pos_mask]]
            # required offset between the anchors and their groundtruth
            reg_targets[b, pos_mask] = self.box_to_deltas(pos_anchors, pos_gt)
            # predicted offset between the anchors and the predicted boxes
            pred_delta = self.box_to_deltas(pos_anchors, bboxes[b, pos_mask])
            reg_loss += F.smooth_l1_loss(pred_delta, reg_targets[b, pos_mask], reduction="sum") / num_pos

        # Classification loss.
        # Clamp so a batch without any labelled box does not divide by zero.
        total = (cls_pos_targets.sum() + cls_neg_targets.sum()).clamp(min=1)
        neg_count = cls_neg_targets.sum().clamp(min=1)
        # NOTE(review): the positive term is a plain sum while the negative term
        # is divided by neg_count, and the sum of both is divided by ``total``
        # again below. This double normalisation may make the classification
        # gradient very small compared with the regression term - verify
        # against the intended focal-loss normalisation.
        cls_pos_loss = focal(probs, cls_pos_targets).sum()
        cls_neg_loss = focal(-probs, cls_neg_targets).sum() / neg_count
        cls_loss = cls_pos_loss + cls_neg_loss

        # Total loss
        total_loss = cls_loss / total + self.lambda_reg * reg_loss
        # print(f'classification loss {cls_loss/total} and regression loss {reg_loss}, iou loss = {iou_loss}')
        return total_loss

What do I see? The classification loss is almost unchanged, while the regression loss decreases as expected. At the same time, the IoU loss decreases a bit. I think the problem is in the classification part. I printed the probability logits, and they look unchanged over the course of training, even though the gradients change normally between the layers.

Correct loss function for bboxes in a detector model

Answers (0)

Related Questions