Mohamadali Mahmoodpour
Mohamadali Mahmoodpour

Reputation: 126

Nearly Constant training and validation accuracy

I’m new to pytorch and my problem may be a little naive I’m training a pretrained VGG16 network on my dataset which it’s size is near 33000 images in 8 classes with labels [1,2,…,8] and my classes are imbalanced. my problem is that during training, validation and training accuracy is low and doesn’t increase, is there any problem in my code? if not, what do you suggest to improve training? '''

import torch
import time
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
from torch.optim import Adam
import cv2
import torchvision.models as models
from classify_dataset import Classification_dataset
from torchvision import transforms

transform = transforms.Compose([transforms.Resize((224,224)),
                                    transforms.RandomHorizontalFlip(p=0.5),
                                    transforms.RandomVerticalFlip(p=0.5),
                                    transforms.RandomRotation(degrees=45),
                                    transforms.ToTensor(),
                                    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
                                    ])
dataset = Classification_dataset(root_dir=r'//home/arisa/Desktop/Hamid/IQA/Hamid_Dataset',
                                     csv_file=r'/home/arisa/Desktop/Hamid/IQA/new_label.csv',transform=transform)


target = dataset.labels - 1

train_indices, test_indices = train_test_split(np.arange(target.shape[0]), stratify=target)
test_dataset = torch.utils.data.Subset(dataset, indices=test_indices)
train_dataset = torch.utils.data.Subset(dataset, indices=train_indices)

class_sample_count = np.array([len(np.where(target[train_indices] == t)[0]) for t in np.unique(target)])
weight = 1. / class_sample_count
samples_weight = np.array([weight[t] for t in target[train_indices]])
samples_weight = torch.from_numpy(samples_weight)
samples_weight = samples_weight.double()
  
sampler = torch.utils.data.WeightedRandomSampler(samples_weight, len(samples_weight), replacement = True)


train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=64,
                                           sampler=sampler)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                           batch_size=64,
                                           shuffle=False)
for param in model.parameters():
     param.requires_grad = False

num_ftrs = model.classifier[0].in_features
model.classifier = nn.Linear(num_ftrs,8)    


optimizer = Adam(model.parameters(), lr = 0.0001 )
criterion = nn.CrossEntropyLoss()
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.01)

path = '/home/arisa/Desktop/Hamid/IQA/'
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
def train_model(model, train_loader,valid_loader, optimizer, criterion, scheduler=None, num_epochs=10 ):
        
    
        min_valid_loss = np.inf
        model.train()
        start = time.time()
        TrainLoss = []
        model = model.to(device)
        for epoch in range(num_epochs):
            total = 0
            correct = 0 
            train_loss = 0
            #lr_scheduler.step()
            print('Epoch {}/{}'.format(epoch+1, num_epochs))
            print('-' * 10)

            train_loss = 0.0
            for x,y in train_loader:
                x = x.to(device)
                #print(y.shape)
                y = y.view(y.shape[0],).to(device)
                y = y.to(device)
                y -= 1
                out = model(x)

                loss = criterion(out, y)
                optimizer.zero_grad()
                loss.backward()
                
                TrainLoss.append(loss.item()* y.shape[0])
                train_loss += loss.item() * y.shape[0]
                _,predicted = torch.max(out.data,1)
                total += y.size(0)
                correct += (predicted == y).sum().item()
                optimizer.step()
                lr_scheduler.step()
            accuracy = 100*correct/total
            valid_loss = 0.0
            val_loss = []
            model.eval()
            val_correct = 0
            val_total = 0

            with torch.no_grad():
                
                for x_val, y_val in test_loader:
                    x_val = x_val.to(device)
                    y_val = y_val.view(y_val.shape[0],).to(device)
                    y_val -= 1
                    target = model(x_val)
                    loss = criterion(target, y_val)
                    valid_loss += loss.item() * y_val.shape[0]

                    _,predicted = torch.max(target.data,1)
                    val_total += y_val.size(0)
                    val_correct += (predicted == y_val).sum().item()



                    val_loss.append(loss.item()* y_val.shape[0])
                val_acc = 100*val_correct / val_total




                print(f'Epoch {epoch + 1} \t\t Training Loss: {train_loss / len(train_loader)} \t\t Validation Loss: {valid_loss / len(test_loader)} \t\t Train Acc:{accuracy} \t\t Validation Acc:{val_acc}')
                if min_valid_loss > (valid_loss / len(test_loader)):
                    print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss / len(test_loader):.6f}) \t Saving The Model')
                    min_valid_loss = valid_loss / len(test_loader)
                    state = {'state_dict': model.state_dict(),'optimizer': optimizer.state_dict(),}
                    torch.save(state,'/home/arisa/Desktop/Hamid/IQA/checkpoint.t7')

        end = time.time()
        print('TRAIN TIME:')
        print('%.2gs'%(end-start))

    train_model(model=model, train_loader=train_loader, optimizer=optimizer, criterion=criterion, valid_loader= test_loader,num_epochs=500  )

Thanks in advance here is the result of 15 epoch

Epoch 1/500
----------
Epoch 1          Training Loss: 205.63448420514916       Validation Loss: 233.89266112356475         Train Acc:39.36360386127994         Validation Acc:24.142040038131555
Epoch 2/500
----------
Epoch 2          Training Loss: 199.05699240435197       Validation Loss: 235.08799531243065         Train Acc:41.90998291820601         Validation Acc:24.27311725452812
Epoch 3/500
----------
Epoch 3          Training Loss: 199.15626737127448       Validation Loss: 236.00033430619672         Train Acc:41.1035633416756          Validation Acc:23.677311725452814
Epoch 4/500
----------
Epoch 4          Training Loss: 199.02581041173886       Validation Loss: 233.60767459869385         Train Acc:41.86628530568466         Validation Acc:24.606768350810295
Epoch 5/500
----------
Epoch 5          Training Loss: 198.61493769454472       Validation Loss: 233.7503859202067          Train Acc:41.53656695665991         Validation Acc:25.0
Epoch 6/500
----------
Epoch 6          Training Loss: 198.71323942956585       Validation Loss: 234.17176149830675         Train Acc:41.639852222619474        Validation Acc:25.369399428026693
Epoch 7/500
----------
Epoch 7          Training Loss: 199.9395153770592        Validation Loss: 234.1744423635078          Train Acc:40.98041552456998         Validation Acc:24.84509056244042
Epoch 8/500
----------
Epoch 8          Training Loss: 199.3533399020355        Validation Loss: 235.4645173188412          Train Acc:41.26643626107337         Validation Acc:24.165872259294567
Epoch 9/500
----------
Epoch 9          Training Loss: 199.6451746921249        Validation Loss: 233.33387595956975         Train Acc:40.96452548365312         Validation Acc:24.59485224022879
Epoch 10/500
----------
Epoch 10         Training Loss: 197.9305159737011        Validation Loss: 233.76405122063377         Train Acc:41.8782028363723          Validation Acc:24.6186844613918
Epoch 11/500
----------
Epoch 11         Training Loss: 199.33247244055502       Validation Loss: 234.41085289463854         Train Acc:41.59218209986891         Validation Acc:25.119161105815063
Epoch 12/500
----------
Epoch 12         Training Loss: 199.87399289874256       Validation Loss: 234.23621463775635         Train Acc:41.028085647320545        Validation Acc:24.49952335557674
Epoch 13/500
----------
Epoch 13         Training Loss: 198.85540591944292       Validation Loss: 234.33149099349976         Train Acc:41.206848607635166        Validation Acc:24.857006673021925
Epoch 14/500
----------
Epoch 14         Training Loss: 199.92641723337513       Validation Loss: 233.37722391070741         Train Acc:41.15520597465539         Validation Acc:24.988083889418494
Epoch 15/500
----------
Epoch 15         Training Loss: 197.82172771698328       Validation Loss: 234.4943131533536          Train Acc:41.69943987605768         Validation Acc:24.380362249761678

Upvotes: 2

Views: 765

Answers (2)

Mohamadali Mahmoodpour
Mohamadali Mahmoodpour

Reputation: 126

my problem was in model.train(). This phrase should be inside the training loop. but in my case I put it outside the training loop and when it comes to model.eval(), model maintained in this mode

Upvotes: 1

Proko
Proko

Reputation: 2021

You freezed your model through

for param in model.parameters():
     param.requires_grad = False

which basically says "do not calculate any gradient for any weight" which is equivalent of not updating weights - hence no optimization

Upvotes: 1

Related Questions