Reputation: 53
I am trying to implement image classification with a Bayesian CNN using dropout.
I have defined two classes:
When I run the program I notice that the train/test accuracy stays stable and does not increase, and I don't see what the problem is.
I don't know whether it is because of the convolution and pooling layer parameters or something else. Any ideas, please?
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5, padding=2)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5, padding=2)
        self.fc1 = nn.Linear(16 * 8 * 8, 1024)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 192 * 8 * 8)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
# Lenet with MCDO
class Net_MCDO(nn.Module):
    def __init__(self):
        super(Net_MCDO, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5, padding=2)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 192, 5, padding=2)
        self.fc1 = nn.Linear(16 * 8 * 8, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, x):
        x = self.pool(self.dropout(self.conv1(x)))
        x = self.pool(self.dropout(self.conv2(x)))
        x = x.view(-1, 192 * 8 * 8)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(self.dropout(x)))
        x = F.softmax(self.fc3(self.dropout(x)), dim=1)
        return x
net=Net()
mcdo=Net_MCDO()
CE = nn.CrossEntropyLoss()
learning_rate=0.001
optimizer=optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)
epoch_num = 30
train_accuracies=np.zeros(epoch_num)
test_accuracies=np.zeros(epoch_num)
for epoch in range(epoch_num):
    average_loss = 0.0
    total = 0
    success = 0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data
        inputs, labels = Variable(inputs), Variable(labels)
        optimizer.zero_grad()
        outputs = mcdo(inputs)
        loss = CE(outputs, labels)
        loss.backward()
        optimizer.step()
        average_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        success += (predicted == labels.data).sum()
    train_accuracy = 100.0 * success / total

    success = 0
    total = 0
    for (inputs, labels) in testloader:
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        success += (predicted == labels.data).sum()
    test_accuracy = 100.0 * success / total

    print(u"epoch{}, average_loss{}, train_accuracy{}, test_accuracy{}".format(
        epoch,
        average_loss / n_batches,
        train_accuracy,
        100 * success / total
    ))

    # save
    train_accuracies[epoch] = train_accuracy
    test_accuracies[epoch] = 100.0 * success / total
plt.plot(np.arange(1, epoch_num+1), train_accuracies)
plt.plot(np.arange(1, epoch_num+1), test_accuracies)
plt.show()
Upvotes: 1
Views: 95
Reputation: 1974
PyTorch merges the softmax (as a log-softmax) into nn.CrossEntropyLoss for numerical stability and better training, so you should remove the softmax layer from your model (check the documentation here: https://pytorch.org/docs/stable/nn.html#crossentropyloss). Keeping the softmax layer in your model will lead to slower training and possibly worse metrics, because you are squashing the gradient twice, so the weight updates become much less significant.
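As a quick standalone check (not from the original post), you can verify that nn.CrossEntropyLoss already contains the log-softmax, i.e. it gives the same value as applying log_softmax yourself and then the NLL loss:

import torch
import torch.nn.functional as F

# toy batch: 4 samples, 10 classes, raw logits (no softmax applied)
logits = torch.randn(4, 10)
labels = torch.randint(0, 10, (4,))

ce = F.cross_entropy(logits, labels)                    # expects raw logits
nll = F.nll_loss(F.log_softmax(logits, dim=1), labels)  # manual log-softmax + NLL

print(torch.allclose(ce, nll))  # True: the (log-)softmax is already inside the loss

So the model should just return raw logits and let the loss handle the normalization.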
Change your code to:
class Net_MCDO(nn.Module):
    def __init__(self):
        super(Net_MCDO, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5, padding=2)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 192, 5, padding=2)
        self.fc1 = nn.Linear(16 * 8 * 8, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, x):
        x = self.pool(F.relu(self.dropout(self.conv1(x))))  # recommended to add the ReLU
        x = self.pool(F.relu(self.dropout(self.conv2(x))))  # recommended to add the ReLU
        x = x.view(-1, 192 * 8 * 8)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(self.dropout(x)))
        x = self.fc3(self.dropout(x))  # no activation function needed for the last layer
        return x
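If you still want actual probabilities at prediction time (for example to inspect the model's confidence), here is a small sketch of applying the softmax outside the model, reusing the mcdo and inputs names and the imports from the question:

mcdo.eval()  # turn dropout off for a deterministic prediction
with torch.no_grad():
    logits = mcdo(inputs)
    probs = F.softmax(logits, dim=1)     # probabilities, only for reporting
    _, predicted = torch.max(logits, 1)  # the argmax is the same on logits or probabilities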
Furthermore, I would recommend using an activation function such as ReLU() after every conv or linear layer. Otherwise you are just performing a stack of linear operations that a single layer could learn on its own.
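As a standalone illustration of that point (the 120/84/10 sizes are just borrowed from the fc2/fc3 above), two stacked Linear layers with nothing nonlinear in between collapse into a single linear map:

import torch
import torch.nn as nn

torch.manual_seed(0)
fc2 = nn.Linear(120, 84, bias=False)
fc3 = nn.Linear(84, 10, bias=False)

# one layer whose weight is the product of the two weight matrices
merged = nn.Linear(120, 10, bias=False)
with torch.no_grad():
    merged.weight.copy_(fc3.weight @ fc2.weight)

x = torch.randn(5, 120)
print(torch.allclose(fc3(fc2(x)), merged(x), atol=1e-6))  # True: no extra expressive power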
I hope that helps =)
Upvotes: 0
Reputation: 3496
Yes, you are right: don't use dropout (nor batch norm in training mode) when testing. But you don't have to create two different models for that; you can switch a single model between train mode and eval mode. Just create one model, for example net:
# when training
outputs = net.train()(inputs)
# when testing:
outputs = net.eval()(inputs)
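For intuition, here is a tiny standalone check of what those two modes change for a dropout layer:

import torch
import torch.nn as nn

drop = nn.Dropout(p=0.5)
x = torch.ones(1, 8)

drop.train()
print(drop(x))  # roughly half the entries zeroed, the survivors scaled by 1/(1-p) = 2

drop.eval()
print(drop(x))  # identity: dropout is a no-op in eval mode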
But you shouldn't really use dropout with conv layers anyway, just on the dense layers at the end; that could be the reason why it is not improving. Also, your architecture is quite small. How big are your images? If they are larger than 32x32 you can try adding one more layer. You can also start with a learning rate of about 0.001 and then divide it by two every time the accuracy doesn't improve for a few epochs (a quick sketch of that is below). Hope this will help you :)
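One possible way to do that halving is ReduceLROnPlateau; this is only a sketch, assuming the optimizer setup, epoch_num and test_accuracy names from the question's script:

from torch.optim.lr_scheduler import ReduceLROnPlateau

optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
# halve the learning rate when the monitored metric has not improved for `patience` epochs
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)

for epoch in range(epoch_num):
    # ... training and evaluation as in the question's loop ...
    scheduler.step(test_accuracy)  # pass the metric you want to monitor (here the test accuracy)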
Edit: I just saw that you are missing the ReLU activations in the second model (the one with dropout); that should cause problems.
Upvotes: 1