I try to clarify the learning process of the detector model with anchors. Unfortunately, I have some trouble with the loss function. I have built the model with the classification and regression heads, and the model and anchor generation work well (I checked). I have the main problem with the loss function. I definitely used different variations of different functions, but I was unsuccessful. The draft of the loss class is listed below
class Losses(nn.Module):
def __init__(self, cls_loss=nn.BCEWithLogistLoss(), reg_loss=nn.SmoothL1Loss(), threshold=0.1, alpha=1.0, beta=1.0):
self.threshold = threshold
self.cls_loss = cls_loss #
self.reg_loss = reg_loss
self.alpha = alpha
self.beta = beta
def forward(self, bboxes, pred_offsets, predictions, groundtruth):
bboxes - bounding boxes, shape [bs, X, 4]
pred_offsets - offset values, shape [bs, X, 4],
predictions - prediction of target object in bbox, shape [bs, X],
groundtruth - real boxes, shape [bs, y, 4]. The structure is [xc, yc, w, h]
assert bboxes.shape[0] == groundtruth.shape[0], "Mismatch in batch sizes"
distance, probs, giou_losses = [], [], []
for i in range(bboxes.shape[0]):
dist, prob, giou = self._overlap(bboxes[i], groundtruth[i])
distance = torch.stack(distance)
probs = torch.stack(probs)
giou_loss = torch.stack(giou_losses).mean()
# Classification loss
cls_loss = self.cls_loss(predictions, probs)
# Regression loss
reg_loss = self.reg_loss(distance, torch.zeros_like(predictions[0]))
# Weighted total loss
total_loss = self.alpha * cls_loss + self.beta * reg_loss + giou_loss
return total_loss, cls_loss, reg_loss, giou_loss
def _overlap(self, bboxes, groundtruth):
distance = torch.zeros(len(bboxes)).to(bboxes.device)
probs = torch.zeros(len(bboxes)).to(bboxes.device)
giou_loss = torch.zeros(len(bboxes)).to(bboxes.device)
for gt in groundtruth:
if torch.any(gt[:4] != 0):
x1, y1, x2, y2 = to_corners(gt[:4])
bx1, by1, bx2, by2 = to_corners_batch(bboxes)
iou, dist = self._intersect(x1, y1, x2, y2, bx1, by1, bx2, by2)
giou = self.generalized_iou(
torch.stack([bx1, by1, bx2, by2], dim=-1),
torch.tensor([x1, y1, x2, y2], device=bboxes.device)
mask = (iou >= self.threshold)
probs[mask] = iou[mask] # Use IoU as probabilities
distance[mask] = dist[mask]
giou_loss[mask] = (1 - giou[mask]) # GIoU values
probs[probs == 0] = 0.01 # small noise
return distance, probs, giou_loss
def generalized_iou(bboxes1, bboxes2):
x1_min = torch.min(bboxes1[:, 0], bboxes2[0])
y1_min = torch.min(bboxes1[:, 1], bboxes2[1])
x2_max = torch.max(bboxes1[:, 2], bboxes2[2])
y2_max = torch.max(bboxes1[:, 3], bboxes2[3])
inter_x1 = torch.max(bboxes1[:, 0], bboxes2[0])
inter_y1 = torch.max(bboxes1[:, 1], bboxes2[1])
inter_x2 = torch.min(bboxes1[:, 2], bboxes2[2])
inter_y2 = torch.min(bboxes1[:, 3], bboxes2[3])
inter_area = torch.clamp(inter_x2 - inter_x1, min=0) * torch.clamp(inter_y2 - inter_y1, min=0)
bbox1_area = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
bbox2_area = (bboxes2[2] - bboxes2[0]) * (bboxes2[3] - bboxes2[1])
union_area = bbox1_area + bbox2_area - inter_area
iou = inter_area / (union_area + 1e-6)
# Enclosing box area
enclosing_area = (x2_max - x1_min) * (y2_max - y1_min)
giou = iou - (enclosing_area - union_area) / (enclosing_area + 1e-6)
return giou
def _intersect(self, x1, y1, x2, y2, bx1, by1, bx2, by2):
x_inter_min = torch.max(x1, bx1)
y_inter_min = torch.max(y1, by1)
x_inter_max = torch.min(x2, bx2)
y_inter_max = torch.min(y2, by2)
inter_width = torch.clamp(x_inter_max - x_inter_min, min=0)
inter_height = torch.clamp(y_inter_max - y_inter_min, min=0)
intersection_area = inter_width * inter_height
union_area = ((x2 - x1) * (y2 - y1)) + ((bx2 - bx1) * (by2 - by1)) - intersection_area
iou = intersection_area / (union_area + 1e-6)
dist = torch.sqrt((bx2 - x2) ** 2 + (by2 - y2) ** 2) + torch.sqrt((bx1 - x1) ** 2 + (by1 - y1) ** 2)
return iou, dist
def to_corners_batch(batch):
cx, cy, w, h = batch[..., 0], batch[..., 1], batch[..., 2], batch[..., 3]
bx1 = cx - w / 2
by1 = cy - h / 2
bx2 = cx + w / 2
by2 = cy + h / 2
return bx1, by1, bx2, by2
def to_corners(box):
cx, cy, w, h = box[0], box[1], box[2], box[3]
bx1 = cx - w / 2
by1 = cy - h / 2
bx2 = cx + w / 2
by2 = cy + h / 2
return bx1, by1, bx2, by2
What is wrong? The model is correct. It uses a feature backbone from the mobilenet that is connected to two heads with adapters to get the correct outputs for the regression and classification parts.
Addition. I modified the loss function but it doesn't work. The current version is:
def forward(self, anchors, preds, gt_bboxes, offset):
probs = preds['probs']
bboxes = preds['bboxes']
device = anchors.device
cls_pos_targets = torch.zeros_like(probs, device=device)
cls_neg_targets = torch.zeros_like(probs, device=device)
reg_targets = torch.zeros_like(bboxes, device=device)
reg_mask = torch.zeros_like(probs, device=device)
iou_loss = 0
reg_loss = 0
cls_loss = 0
for b in range(bs):
gt = gt_bboxes[b]
gt = gt[~torch.all(gt == 0, dim=-1)]
if len(gt) == 0:
continue # if we do not have groundtruth
ious = self.compute_iou(bboxes[b], gt) # calc IoU between bboxes and groundtruth
max_ious, gt_indices = ious.max(dim=1)
# Positive boxes
pos_mask = max_ious >= self.pos_trh
iou_loss += (1 -max_ious[pos_mask].sum()/pos_mask.sum())
neg_mask = max_ious < self.neg_trh
#update pos and neg targets
cls_pos_targets[b, pos_mask] = 1
cls_neg_targets[b, neg_mask] = 1
# regression targets (use only positive anchors, boxes and groundtruth)
pos_anchors = anchors[0][pos_mask]
pos_gt = gt[gt_indices[pos_mask]]
reg_targets[b, pos_mask] = self.box_to_deltas(pos_anchors, pos_gt) # calculate the required offset between the anchors and their groundtruth
pred_delta = self.box_to_deltas(pos_anchors,bboxes[b, pos_mask]) # predicte offset between the anchors and bboxes
reg_loss += F.smooth_l1_loss(pred_delta, reg_targets[b, pos_mask], reduction="sum")/max(1, pos_mask.sum()) # calculate regression loss
# classification losses. Most probably the error is here
total = cls_pos_targets.sum() + cls_neg_targets.sum()
pos_count = cls_pos_targets.sum().clamp(min=1)
neg_count = cls_neg_targets.sum().clamp(min=1)
total_count = pos_count + neg_count
cls_pos_loss = focal(probs, cls_pos_targets).sum()
cls_neg_loss = focal(-probs, cls_neg_targets).sum()/neg_count
cls_loss = cls_pos_loss + cls_neg_loss
# Total loss
total_loss = cls_loss/total + self.lambda_reg * reg_loss
# print(f'classification loss {cls_loss/total} and regression loss {reg_loss}, iou loss = {iou_loss}')
return total_loss
What do I see? The classification loss is almost unchangable while the regression loss decreases as expected. At the same time I see that the iou loss is decreased a bit. I think the problem is in the classification. I outputed the prob logits, and they look unchanged over the train while the gradient is normaly change between the layers.
