Reputation: 41
number of train data: 346
number of test data: 69
Epoch: [0] [0/346] eta: 0:35:20 lr: 0.000019 loss: -312.6024 (-312.6024) loss_classifier: 1.5789 (1.5789) loss_box_reg: 0.1299 (0.1299) loss_mask: -314.3485 (-314.3485) loss_objectness: 0.0266 (0.0266) loss_rpn_box_reg: 0.0106 (0.0106) time: 6.1275 data: 0.1599 max mem: 0
Loss is nan, stopping training
{'loss_classifier': tensor(nan, grad_fn=<...>), 'loss_box_reg': tensor(nan, grad_fn=<...>), 'loss_mask': tensor(nan, grad_fn=<...>), 'loss_objectness': tensor(nan, grad_fn=<...>), 'loss_rpn_box_reg': tensor(nan, grad_fn=<...>)}
An exception has occurred, use %tb to see the full traceback.
SystemExit: 1
And this is the dataset code:
import os

import cv2
import numpy as np
import torch
from PIL import Image

class maskrcnn_Dataset(torch.utils.data.Dataset):
    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        # load all image files, sorting them to ensure that they are aligned
        self.imgs = list(sorted(os.listdir(os.path.join(root, "images"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "masks"))))

    def __getitem__(self, idx):
        # load the image and convert it from BGR (OpenCV default) to RGB
        img_path = os.path.join(self.root, "images", self.imgs[idx])
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # the masks are not converted to RGB, because each value corresponds
        # to a different instance, with 0 being background; every image has a
        # folder of per-instance mask files named after its basename
        x = self.imgs[idx].split('.')
        source_mask = os.path.join(self.root, "masks", x[0])
        mask_for_all = []
        boxes = []
        xx = trier(os.listdir(source_mask))  # trier() is a sorting helper defined elsewhere
        for file_name in xx:
            mask = np.array(Image.open(os.path.join(source_mask, file_name)))
            mask_for_all.append(mask)
            # instance ids are the unique non-zero values of this mask file
            obj_ids = np.unique(mask)[1:]
            # split the value-encoded mask into a set of binary masks
            masks = mask == obj_ids[:, None, None]
            # compute the bounding box of each instance
            for i in range(len(obj_ids)):
                pos = np.where(masks[i])
                xmin = np.min(pos[1])
                xmax = np.max(pos[1])
                ymin = np.min(pos[0])
                ymax = np.max(pos[0])
                boxes.append([xmin, ymin, xmax, ymax])
        num_objs = len(boxes)
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # labels come from lists built elsewhere, indexed by image file name
        if self.root.find("train") != -1:
            labels = class_ids_train[class_ids_train_names.index(self.imgs[idx])]
        else:
            labels = class_ids_val[class_ids_val_names.index(self.imgs[idx])]
        labels = torch.as_tensor(labels, dtype=torch.int64)
        masks = torch.as_tensor(mask_for_all, dtype=torch.uint8)
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target

    def __len__(self):
        return len(self.imgs)
Upvotes: 2
Views: 903
Reputation: 1
I read your log: "loss_mask" is the first loss that goes wrong. It is already -314.3485 at step 0, before anything becomes NaN, and the mask loss is a binary cross-entropy, which can never be negative when the target masks only contain values in [0, 1]. So I guess there is a problem in how the masks are built, and I suggest you check the value range of the masks you put into target["masks"] (PNG masks are often stored as 0/255 instead of 0/1).
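For example, a quick check you could run inside __getitem__, as a minimal sketch reusing the mask_for_all list from your code (the 0/255 explanation is my assumption, not something your log proves):

    # Sketch: Mask R-CNN's mask loss expects binary target masks (0 background, 1 object).
    # Masks stored as 0/255 make loss_mask go negative and then NaN.
    for m in mask_for_all:
        if m.max() > 1:
            print("non-binary mask values:", np.unique(m))
    # If they really are 0/255, binarize them before building the tensor
    # (assumption: any non-zero pixel belongs to the object):
    mask_for_all = [(m > 0).astype(np.uint8) for m in mask_for_all]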
Upvotes: 0
Reputation: 11
There can be two issues:
1. Check the box coordinates: every [xmin, ymin, xmax, ymax] must describe a valid box, i.e. xmax > xmin and ymax > ymin with non-negative values. A degenerate box makes the regression losses blow up.
2. Make sure the number of masks is the same as the number of boxes; torchvision expects exactly one mask per box. (See the sketch below for both checks.)
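A minimal sketch of both checks, placed at the end of your __getitem__ right after boxes and mask_for_all are built (names taken from the question's code):

    # 1. Degenerate boxes (xmax <= xmin or ymax <= ymin) make the box losses unstable.
    for xmin, ymin, xmax, ymax in boxes.tolist():
        if xmax <= xmin or ymax <= ymin or xmin < 0 or ymin < 0:
            print("invalid box:", [xmin, ymin, xmax, ymax])
    # 2. One mask per box: in your code, boxes collects every object across all
    # mask files, while mask_for_all holds one entry per file, so they can diverge.
    if len(mask_for_all) != len(boxes):
        print(len(mask_for_all), "masks but", len(boxes), "boxes")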
Upvotes: 1