I'm building a DCGAN, and I am having a problem with the shape of the output, it is not matching the shape of the labels when I try calculating the BCELoss.
To generate the discriminator output, do I have to use convolutions all the way down or can I add a Linear layer at some point to match the shape I want?
I mean, do I have to reduce the shape by adding more convolutional layers or can I add a fully connected one? I thought it should have a fully connected layer, but on every tutorial I checked the discriminator had no fully connected layer.
import random
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as torch_dataset
import torchvision.transforms as transforms
import torchvision.utils as vutils
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML
seed = 1
print("Random Seed: ", seed)
images_folder_path = "./spectrograms/"
batch_size = 1
image_size = 256
n_channels = 1
z_vector = 100
n_features_generator = 32
n_features_discriminator = 32
num_epochs = 5
lr = 0.0002
beta1 = 0.5
dataset = torch_dataset.ImageFolder(
root=images_folder_path, transform=transforms.Compose(
transforms.Normalize(0.5, 0.5)
dataloader =, batch_size=batch_size, shuffle=True, num_workers=0)
device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
def weights_init(m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
nn.init.normal_(, 0.0, 0.02)
elif classname.find('BatchNorm') != -1:
nn.init.normal_(, 1.0, 0.02)
nn.init.constant_(, 0)
class Generator(nn.Module):
def __init__(self):
super(Generator, self).__init__()
self.main = nn.Sequential(
nn.ConvTranspose2d(z_vector, n_features_generator * 8, 4, 1, bias=False),
nn.BatchNorm2d(n_features_generator * 8),
nn.ConvTranspose2d(n_features_generator * 8, n_features_generator * 4, 4, 2, 1, bias=False),
nn.BatchNorm2d(n_features_generator * 4),
nn.ConvTranspose2d(n_features_generator * 4, n_features_generator * 2, 4, 2, 1, bias=False),
nn.BatchNorm2d(n_features_generator * 2),
nn.ConvTranspose2d(n_features_generator * 2, n_features_generator, 4, 2, 1, bias=False),
nn.ConvTranspose2d(n_features_generator, n_channels, 4, 2, 1, bias=False),
def forward(self, inputs):
return self.main(inputs)
# Convolutional Layer Output Shape = [(W−K+2P)/S]+1
# W is the input volume
# K is the Kernel size
# P is the padding
# S is the stride
class Discriminator(nn.Module):
def __init__(self):
super(Discriminator, self).__init__()
self.main = nn.Sequential(
nn.Conv2d(n_channels, n_features_discriminator, 4, 2, 1, bias=False),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv2d(n_features_discriminator, n_features_discriminator * 2, 4, 2, 1, bias=False),
nn.BatchNorm2d(n_features_discriminator * 2),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv2d(n_features_discriminator * 2, n_features_discriminator * 4, 4, 2, 1, bias=False),
nn.BatchNorm2d(n_features_discriminator * 4),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv2d(n_features_discriminator * 4, n_features_discriminator * 8, 4, 2, 1, bias=False),
nn.BatchNorm2d(n_features_discriminator * 8),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv2d(n_features_discriminator * 8, 1, 4, 1, bias=False),
def forward(self, inputs):
return self.main(inputs)
netG = Generator().to(device)
if device.type == 'cuda':
netG = nn.DataParallel(netG)
netD = Discriminator().to(device)
if device.type == 'cuda':
netD = nn.DataParallel(netD)
criterion = nn.BCEWithLogitsLoss()
fixed_noise = torch.randn(64, z_vector, 1, 1, device=device)
real_label = 1.
fake_label = 0.
optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(beta1, 0.999))
img_list = []
G_losses = []
D_losses = []
iters = 0
print("Starting Training Loop...")
for epoch in range(num_epochs):
for i, data in enumerate(dataloader, 0):
real_cpu = data[0].to(device)
b_size = real_cpu.size(0)
label = torch.full((b_size,), real_label, dtype=torch.float, device=device)
output = netD(real_cpu)
output = output.view(-1)
errD_real = criterion(output, label)
D_x = output.mean().item()
noise = torch.randn(b_size, z_vector, 1, 1, device=device)
fake = netG(noise)
output = netD(fake.detach()).view(-1)
errD_fake = criterion(output, label)
D_G_z1 = output.mean().item()
errD = errD_real + errD_fake
output = netD(fake).view(-1)
errG = criterion(output, label)
D_G_z2 = output.mean().item()
if i % 50 == 0:
print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
% (epoch, num_epochs, i, len(dataloader),
errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))
if (iters % 500 == 0) or ((epoch == num_epochs-1) and (i == len(dataloader)-1)):
with torch.no_grad():
fake = netG(fixed_noise).detach().cpu()
img_list.append(vutils.make_grid(fake, padding=2, normalize=True))
iters += 1
The error I'm getting:
Traceback (most recent call last):
File "G:/Pastas Estruturadas/Conhecimento/CEFET/IA/SpectroGAN/", line 140, in <module>
errD_real = criterion(output, label)
File "C:\Users\Ramon\anaconda3\envs\vision\lib\site-packages\torch\nn\modules\", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\Users\Ramon\anaconda3\envs\vision\lib\site-packages\torch\nn\modules\", line 631, in forward
File "C:\Users\Ramon\anaconda3\envs\vision\lib\site-packages\torch\nn\", line 2538, in binary_cross_entropy_with_logits
raise ValueError("Target size ({}) must be the same as input size ({})".format(target.size(), input.size()))
ValueError: Target size (torch.Size([1])) must be the same as input size (torch.Size([169]))
The shape of output: torch.Size([1, 1, 13, 13])
, and shape of label: torch.Size([1])
The DCGAN described a concrete architecture where Conv layers were used for the downsampling of the feature maps. If you carefully design your Conv layers, you can do without a Linear layer but that does not mean that it will not work when you use a Linear layer to downsample (especially as the very last layer). The DCGAN paper just found out it worked better to use Conv layers instead of Linear to downsample.
If you want to maintain this architecture, you can change the kernel size or padding or stride to give you exactly a single value in the last layer. Refer to the Pytorch documentation on Conv layers to see what the output size should be, given an input size
