Reputation: 39
I am trying to train a Mask R-CNN model on a custom dataset using PyTorch, but I am getting very low accuracy at the end of training, which makes me wonder if there is a step I skipped.
Here is the folder structure:
📦paddy images
┣ 📂paddy with pests
┃ ┣ 📜00000000.jpg
┃ ┣ 📜00000001.jpg
┃ ┣ 📜00000002.jpg
┃ ┣ 📜00000004.jpg
┗ 📂paddy without pests
┃ ┣ 📜00000000.jpg
┃ ┣ 📜00000001.jpg
┃ ┣ 📜00000002.jpg
The aim is to create masks around pests detected in the images.
I took the images, annotated them in Roboflow as an instance segmentation project, trained it, and downloaded the dataset in COCO segmentation format.
Here is the file structure of the dataset:
- test
    - _annotations.coco.json
    - 00000013_jpg.rf.ce10adfab7483328487f1a31f4419922.jpg
    - 00000015_jpg.rf.7e8bbc14ef9132320d28b840abb86270.jpg
    - 00000020_jpg.rf.5c1b21acf36426b0ab747bc62f74e9b5.jpg
    - ...
- train
    - _annotations.coco.json
    - 0000001_jpg.rf.ce10adfab7483328487f1a31f4419922.jpg
    - 0000002_jpg.rf.7e8bbc14ef9132320d28b840abb86270.jpg
    - 0000003_jpg.rf.5c1b21acf36426b0ab747bc62f74e9b5.jpg
    - ...
- valid
    - _annotations.coco.json
    - 0000004_jpg.rf.ce10adfab7483328487f1a31f4419922.jpg
    - 0000008_jpg.rf.7e8bbc14ef9132320d28b840abb86270.jpg
    - 00000012_jpg.rf.5c1b21acf36426b0ab747bc62f74e9b5.jpg
    - ...
- README.dataset.txt
- README.roboflow.txt
I downloaded the dataset into Colab using the following code:
from roboflow import Roboflow
rf = Roboflow(api_key="chQQqFnE0E********")
project = rf.workspace("paddy-pest-detection2").project("paddy-pest-detection3")
version = project.version(2)
dataset = version.download("coco-segmentation")
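For reference, a quick sanity check of the annotation file can be run right after the download (a minimal sketch; it only assumes the download above succeeded):

from pycocotools.coco import COCO
import os

coco_check = COCO(os.path.join(dataset.location, "train", "_annotations.coco.json"))
print(len(coco_check.imgs), "train images")
print(coco_check.cats)  # category ids and names as exported by Roboflow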
Here is the overall code:
import os
import numpy as np
import pandas as pd
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor, AnchorGenerator
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision import transforms as T
from PIL import Image
from pycocotools.coco import COCO
import pycocotools.mask as mask_utils
from torch.utils.data import WeightedRandomSampler
from tqdm import tqdm
from pycocotools.cocoeval import COCOeval
import utils
root_path = dataset.location
root_path

# Paths to the dataset splits (each split contains a COCO annotation file)
TRAIN_DIR = os.path.join(root_path, 'train')
VALID_DIR = os.path.join(root_path, 'valid')
TEST_DIR = os.path.join(root_path, 'test')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Custom Dataset Class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        self.coco = COCO(os.path.join(self.root, '_annotations.coco.json'))
        self.ids = list(sorted(self.coco.imgs.keys()))

    def __getitem__(self, idx):
        img_id = self.ids[idx]
        img_data = self.coco.loadImgs(img_id)[0]
        path = os.path.join(self.root, img_data['file_name'])
        img = Image.open(path).convert("RGB")
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        anns = self.coco.loadAnns(ann_ids)
        boxes, masks, labels = [], [], []
        for ann in anns:
            bbox = ann['bbox']
            # COCO boxes are [x, y, w, h]; torchvision expects [x1, y1, x2, y2]
            boxes.append([bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]])
            labels.append(ann['category_id'])
            # Convert the polygon segmentation to a binary mask via RLE
            rles = mask_utils.frPyObjects(ann['segmentation'], img_data['height'], img_data['width'])
            masks.append(mask_utils.decode(rles).squeeze())
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        masks = torch.as_tensor(np.stack(masks), dtype=torch.uint8)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": torch.tensor([img_id]),
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
            "iscrowd": torch.zeros((len(anns),), dtype=torch.int64)
        }
        if self.transforms:
            img = self.transforms(img)
        return img, target

    def __len__(self):
        return len(self.ids)
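# Sanity check on one raw sample (an added debugging sketch; it assumes the class above
# works and that the first training image has at least one annotation)
_img, _target = CustomDataset(TRAIN_DIR)[0]
print(_target["boxes"].shape, _target["masks"].shape, _target["labels"])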
# Define transformations
# def get_transform(train):
#     transforms = [T.ToTensor()]
#     if train:
#         transforms.append(T.RandomHorizontalFlip(0.5))
#     return T.Compose(transforms)
def get_transform(train):
    transforms = [T.ToTensor()]
    if train:
        # Random horizontal flip
        transforms.append(T.RandomHorizontalFlip(0.5))
        # Random vertical flip
        transforms.append(T.RandomVerticalFlip(0.5))
        # Random rotation within ±15 degrees
        transforms.append(T.RandomRotation(degrees=15))
        # Color jitter
        transforms.append(T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1))
    return T.Compose(transforms)
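# Quick check (added sketch) that the augmentation pipeline runs end-to-end and returns
# a tensor; the filename is taken from the train listing above
_sample = Image.open(os.path.join(TRAIN_DIR, "0000001_jpg.rf.ce10adfab7483328487f1a31f4419922.jpg")).convert("RGB")
print(get_transform(train=True)(_sample).shape)  # expected: torch.Size([3, H, W])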
# Initialize datasets and data loaders
dataset_train = CustomDataset(TRAIN_DIR, get_transform(train=True))
dataset_valid = CustomDataset(VALID_DIR, get_transform(train=False))

# Dataset Balancing with Weighted Sampling
# label_counts = [0] * len(dataset_train.coco.cats)
# for idx in range(len(dataset_train)):
#     _, target = dataset_train[idx]
#     for label in target["labels"]:
#         label_counts[label] += 1
# class_weights = [1 / count if count > 0 else 0 for count in label_counts]
# sample_weights = []
# for idx in range(len(dataset_train)):
#     _, target = dataset_train[idx]
#     sample_weights.append(np.mean([class_weights[label] for label in target["labels"]]))

data_loader_train = torch.utils.data.DataLoader(dataset_train, batch_size=4, shuffle=True, num_workers=2, collate_fn=lambda x: tuple(zip(*x)))
data_loader_valid = torch.utils.data.DataLoader(dataset_valid, batch_size=2, shuffle=False, num_workers=2, collate_fn=lambda x: tuple(zip(*x)))
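# Peek at one batch to confirm the collate_fn output (a small added check, not part of training)
_images, _targets = next(iter(data_loader_train))
print(len(_images), _images[0].shape, _targets[0]["boxes"].shape)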
# Load pre-trained Mask R-CNN model
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
num_classes = len(dataset_train.coco.cats) + 1 # Including background class
# Modify the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
# Modify the mask predictor
in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, 256, num_classes)
# Customize anchor sizes and aspect ratios
anchor_generator = AnchorGenerator(
    sizes=((32,), (64,), (128,), (256,), (512,)),  # One size per feature map
    aspect_ratios=((0.5, 1.0, 2.0),) * 5           # Same ratios for all feature maps
)
model.rpn.anchor_generator = anchor_generator
model.to(device)
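# Sanity check (added sketch): print the class count the heads were rebuilt with,
# together with the raw Roboflow category table, so the id mapping can be verified by eye
print("num_classes =", num_classes)
print(dataset_train.coco.cats)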
# optimizer = torch.optim.Adam([p for p in model.parameters() if p.requires_grad], lr=0.0005)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
# Training loop
num_epochs = 100
#gradient_clip_value = 5.0 # Clip gradients to this value
gradient_clip_value = 1.0 # Clip gradients to this value
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    for batch_idx, (images, targets) in enumerate(data_loader_train):
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        epoch_loss += losses.item()
        optimizer.zero_grad()
        losses.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip_value)
        optimizer.step()
        if batch_idx % 10 == 0:
            print(f"Batch {batch_idx + 1}/{len(data_loader_train)} | Loss: {losses.item():.4f}")
    lr_scheduler.step()
    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(data_loader_train):.4f}")
# Evaluation
model.eval()
coco_gt = dataset_valid.coco
coco_results = []
img_id = 0  # Important to keep track of the image id
with torch.no_grad():
    for images, targets in tqdm(data_loader_valid, desc="Evaluating"):
        images = list(image.to(device) for image in images)
        outputs = model(images)
        for i, output in enumerate(outputs):
            image_id = dataset_valid.ids[img_id]
            img_id += 1
            boxes = output['boxes'].cpu().numpy()
            scores = output['scores'].cpu().numpy()
            labels = output['labels'].cpu().numpy()
            masks = output['masks'].cpu().numpy()
            # Threshold the masks to convert probabilities to binary values
            masks = (masks > 0.5).astype(np.uint8)  # Key change: thresholding and type conversion
            # Filter out low-confidence detections (optional)
            threshold = 0.5
            high_indices = np.where(scores > threshold)[0]
            boxes = boxes[high_indices]
            scores = scores[high_indices]
            labels = labels[high_indices]
            masks = masks[high_indices]
            for j in range(len(boxes)):
                # Convert mask to RLE format (with explicit squeeze and fortranarray)
                mask = np.squeeze(masks[j], axis=0)  # Squeeze the first dimension
                fortran_mask = np.asfortranarray(mask)
                rle = mask_utils.encode(fortran_mask)
                coco_results.append({
                    'image_id': image_id,
                    'category_id': int(labels[j]),
                    'bbox': boxes[j].tolist(),
                    'score': float(scores[j]),
                    'segmentation': rle
                })
coco_results
# Evaluate using COCOeval
coco_dt = coco_gt.loadRes(coco_results)
coco_eval = COCOeval(coco_gt, coco_dt, 'segm')
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
print("mAP (mask): ", coco_eval.stats[0])
print("AP @ IoU=0.5 (mask): ", coco_eval.stats[1])
print("AP @ IoU=0.75 (mask): ", coco_eval.stats[2])
print("AR @ max detections=100 (mask): ", coco_eval.stats[8])
After training completed, I got the following results:
Loading and preparing results...
DONE (t=0.01s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *segm*
DONE (t=0.04s).
Accumulating evaluation results...
DONE (t=0.01s).
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.000
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.000
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.000
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
mAP (mask): 0.0
AP @ IoU=0.5 (mask): 0.0
AP @ IoU=0.75 (mask): 0.0
AR @ max detections=100 (mask): 0.0
Is there a way to modify the code to help improve performance?
In Roboflow, after training, I got an accuracy of at least 70%. Is there a way I can also train my model to produce accuracy similar to that of Roboflow?
Upvotes: 0
Views: 36