Deffo

Reputation: 207

RuntimeError: Expected 4-dimensional input for 4-dimensional weight [64, 512, 3, 3], but got 2-dimensional input of size [32, 2048] instead

I want to train a classifier based on a pretrained network with PyTorch. I need to take a pretrained model (I tried ResNet50), add some layers at the end (this is required by the project specifications) and train only the layers I added. I tried this:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision import models
from guitar_dataset import GuitarDataset
from tqdm import tqdm


device = ("cuda" if torch.cuda.is_available() else "cpu")

transformations = transforms.Compose([
    transforms.Resize((200, 200))
])

num_epochs = 10
learning_rate = 0.001
train_CNN = False
batch_size = 32
shuffle = True
pin_memory = True
num_workers = 1

dataset = GuitarDataset("../chords_data/cropped/train", transform=transformations)
train_set, validation_set = torch.utils.data.random_split(dataset, [int(0.8 * len(dataset)),
                                                                    len(dataset) - int(0.8 * len(dataset))])
train_loader = DataLoader(dataset=train_set, shuffle=shuffle, batch_size=batch_size, num_workers=num_workers,
                          pin_memory=pin_memory)
validation_loader = DataLoader(dataset=validation_set, shuffle=shuffle, batch_size=batch_size, num_workers=num_workers,
                               pin_memory=pin_memory)

testset = GuitarDataset("../chords_data/cropped/test", transform=transformations)
test_loader = DataLoader(dataset=testset, shuffle=shuffle, batch_size=batch_size, num_workers=num_workers,
                         pin_memory=pin_memory)

model = models.resnet50(pretrained=True)
for param in model.parameters():
    param.requires_grad = False
model.fc = nn.Sequential(
            nn.Conv2d(512, 64, (3, 3)),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 64, (3, 3)),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.5),
            nn.Flatten(),
            nn.Linear(147456, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, 7)
        )
model.to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.fc.parameters(), lr=learning_rate)


PATH = "./saved_models/mynet.pth"



def check_accuracy(loader, model):
    if loader == train_loader:
        print("Checking accuracy on training data")
    else:
        print("Checking accuracy on validation data")

    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device)
            y = y.to(device=device)

            scores = model(x)
            # predictions = torch.tensor([1.0 if i >= 0.5 else 0.0 for i in scores]).to(device)
            predictions = scores.argmax(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
            print(
                f"Got {num_correct} / {num_samples} with accuracy {float(num_correct) / float(num_samples) * 100:.2f}"
            )
    return f"{float(num_correct) / float(num_samples) * 100:.2f}"


def train():
    model.train()
    for epoch in range(num_epochs + 1):
        loop = tqdm(train_loader, total=len(train_loader), leave=True)
        # if epoch % 2 == 0:
        loop.set_postfix(val_acc=check_accuracy(validation_loader, model))
        if epoch == num_epochs:
            break
        for imgs, labels in loop:
            labels = torch.nn.functional.one_hot(labels, num_classes=7).float()
            imgs = imgs.to(device)
            labels = labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loop.set_description(f"Epoch [{epoch + 1}/{num_epochs}]")
            loop.set_postfix(loss=loss.item())

    torch.save(model.state_dict(), PATH)


def test():
    model.load_state_dict(torch.load(PATH))

    correct = 0
    total = 0
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for data in test_loader:
            images, labels = data
            # calculate outputs by running images through the network
            outputs = model(images)
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print('Accuracy of the network on the test images: %d %%' % (
            100 * correct / total))


if __name__ == "__main__":
    print(f"Working on {data_type}")
    train()
    test()

but I get the error in the title as soon as I start the training phase. Shouldn't the downloaded model be ready-to-use?

Full stack trace:

Traceback (most recent call last):
  File "/home/deffo/Documents/Unimore/Magistrale/Computer Vision and Cognitive Systems/Guitar_Fingering_&_Chords_Recognition/ChordsClassification/train_ResNetChord.py", line 139, in <module>
    train()
  File "/home/deffo/Documents/Unimore/Magistrale/Computer Vision and Cognitive Systems/Guitar_Fingering_&_Chords_Recognition/ChordsClassification/train_ResNetChord.py", line 99, in train
    loop.set_postfix(val_acc=check_accuracy(validation_loader, model))
  File "/home/deffo/Documents/Unimore/Magistrale/Computer Vision and Cognitive Systems/Guitar_Fingering_&_Chords_Recognition/ChordsClassification/train_ResNetChord.py", line 83, in check_accuracy
    scores = model(x)
  File "/home/deffo/anaconda3/envs/ComputerVision/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/deffo/anaconda3/envs/ComputerVision/lib/python3.8/site-packages/torchvision/models/resnet.py", line 249, in forward
    return self._forward_impl(x)
  File "/home/deffo/anaconda3/envs/ComputerVision/lib/python3.8/site-packages/torchvision/models/resnet.py", line 244, in _forward_impl
    x = self.fc(x)
  File "/home/deffo/anaconda3/envs/ComputerVision/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/deffo/anaconda3/envs/ComputerVision/lib/python3.8/site-packages/torch/nn/modules/container.py", line 119, in forward
    input = module(input)
  File "/home/deffo/anaconda3/envs/ComputerVision/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/deffo/anaconda3/envs/ComputerVision/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 399, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "/home/deffo/anaconda3/envs/ComputerVision/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 395, in _conv_forward
    return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Expected 4-dimensional input for 4-dimensional weight [64, 512, 3, 3], but got 2-dimensional input of size [32, 2048] instead

Upvotes: 0

Views: 852

Answers (2)

ayandas

Reputation: 2278

Your network design is wrong.

You are not supposed to add convolutional layers at the end of ResNet50's feature extractor. Use linear layers instead:

model.fc = nn.Sequential(
            # It has to start from 2048
            nn.Linear(2048, 1024), nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, 256), nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 7)  # 7 is the number of classes
        )

The model.fc has to start from 2048 units because that is what ResNet50's feature extractor produces.

The error is basically saying that the model expected a 4D input (because of the Conv2d layer at the beginning of your model.fc) but received a 2D tensor of shape (batch_size, 2048) from the feature extractor.
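
As a quick sanity check, here is a minimal sketch (the 200x200 input size and batch size 32 are taken from the question) showing that the corrected head accepts the flat 2048-dimensional features and produces 7 class scores:

import torch
import torch.nn as nn
from torchvision import models

model = models.resnet50(pretrained=True)
for param in model.parameters():
    param.requires_grad = False  # freeze the backbone; only the new head will train

model.fc = nn.Sequential(
    nn.Linear(2048, 1024), nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(1024, 256), nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(256, 7),
)

with torch.no_grad():
    out = model(torch.randn(32, 3, 200, 200))  # dummy batch shaped like the DataLoader's
print(out.shape)  # torch.Size([32, 7])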

Upvotes: 1

Ivan

Reputation: 40728

You can't replace resnet50's fc with a convolutional network. ResNet's feature extractor is a CNN whose output is a flat 2048-long tensor; as such, the layers that follow it should be fully connected.
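
To see where that flat tensor comes from, here is a minimal sketch: replacing fc with nn.Identity exposes exactly what the feature extractor hands to the classifier head (torchvision's resnet applies avgpool and flatten before fc):

import torch
import torch.nn as nn
from torchvision import models

model = models.resnet50(pretrained=True)
model.fc = nn.Identity()  # pass the extractor's output through unchanged
with torch.no_grad():
    features = model(torch.randn(1, 3, 200, 200))
print(features.shape)  # torch.Size([1, 2048]) -- already flat, so only Linear layers fit here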

Upvotes: 1
