Reputation: 1
I'm trying to use the GPU to train a ResNet architecture on the CIFAR10 dataset. Here's my code for the ResNet:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResNetBlock(nn.Module):
    def __init__(self, in_planes, planes, stride=1):
        super(ResNetBlock, self).__init__()
        self.stride = stride
        self.in_planes = in_planes
        self.planes = planes
        if stride != 1:
            self.fx = nn.Sequential(nn.Conv2d(in_planes, planes, 3, stride=2, padding=1),
                                    nn.ReLU(),
                                    nn.Conv2d(planes, planes, 3, padding=1))
        else:
            self.fx = nn.Sequential(nn.Conv2d(planes, planes, 3, padding=1),
                                    nn.ReLU(),
                                    nn.Conv2d(planes, planes, 3, padding=1))

    def forward(self, x):
        if self.stride == 1:
            fx = self.fx(x)
            id = nn.Sequential()
            out = fx + id(x)
            relu = nn.ReLU()
            return relu(out)
        else:
            fx = self.fx(x)
            id = nn.Conv2d(self.in_planes, self.planes, 2, stride=2)
            out = fx + id(x)
            relu = nn.ReLU()
            return relu(out)

class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10, num_filters=16, input_dim=3):
        super(ResNet, self).__init__()
        self.in_planes = num_filters
        self.conv1 = nn.Conv2d(input_dim, num_filters, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(num_filters)
        layers = []
        plane = num_filters
        for nb in num_blocks:
            layer = self._make_layer(block, plane, nb, 2)
            layers.append(layer)
            plane *= 2
        self.layers = nn.Sequential(*layers)
        self.linear = nn.Linear(2304, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        layers = []
        block1 = ResNetBlock(planes, 2 * planes, stride=2)
        planes *= 2
        layers.append(block1)
        for i in range(1, num_blocks):
            block = ResNetBlock(planes, planes, stride=1)
            layers.append(block)
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layers(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

# (1 + 2*(1 + 1) + 2*(1 + 1) + 2*(1 + 1) + 2*(1 + 1)) + 1 = 18
def ResNet18():
    return ResNet(ResNetBlock, [2, 2, 2, 2])
Then I train the network using the GPU:

net = ResNet18()
net = net.to('cuda')
train(net, torch.optim.Adam(net.parameters(), lr=0.001), trainloader, criterion, n_ep=3)

And I get the error:
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same
which is annoying, because the weights should be on CUDA as well thanks to the net.to('cuda') call.
With another network the train function works fine, so the problem must come from the classes defined above.
Also, next(net.parameters()).is_cuda returns True.
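For reference, here's a quick check (not part of my original code) that lists the device of every registered parameter; everything .parameters() can see does come back as CUDA:

for name, p in net.named_parameters():
    print(name, p.device)  # every registered parameter prints cuda:0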
Update: Here's my training function.
def train(net, optimizer, trainload, criterion, n_ep=10, cuda=True):
    if cuda:
        net = net.to('cuda')
    for epoch in range(n_ep):
        for data in trainload:
            inputs, labels = data
            if cuda:
                inputs = inputs.type(torch.cuda.FloatTensor)
                labels = labels.type(torch.cuda.LongTensor)
            optimizer.zero_grad()
            print(next(net.parameters()).is_cuda)
            ## this actually prints "True" !
            outputs = net.forward(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
    return net
The thing is, this training function works well with another type of net. For example, I used this one (AlexNet):
class AlexNet(nn.Module):
    def __init__(self, num_classes=1000):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(nn.Conv2d(3, 64, 11), nn.ReLU(), nn.MaxPool2d(2, stride=2),
                                      nn.Conv2d(64, 192, 5), nn.ReLU(), nn.MaxPool2d(2, stride=2),
                                      nn.Conv2d(192, 384, 3), nn.ReLU(),
                                      nn.Conv2d(384, 256, 3), nn.ReLU(),
                                      nn.Conv2d(256, 256, 3), nn.ReLU())
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), 256 * 6 * 6)
        x = self.classifier(x)
        return x
and with this one GPU training works fine.
There's something else I don't understand. I tried to train a network that I moved to the GPU (using .cuda()) with training data that I deliberately did not move to the GPU. This time I get the opposite error: the weight type is torch.cuda.FloatTensor and the data type isn't.
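A minimal sketch (not my actual code) that reproduces this second, expected error with a single layer:

import torch
import torch.nn as nn

conv = nn.Conv2d(3, 8, 3).to('cuda')  # weights on the GPU
x = torch.randn(1, 3, 32, 32)         # input deliberately left on the CPU
out = conv(x)  # RuntimeError: input type and weight type should be the same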
EDIT: I thought it had to do with using nn.ModuleList instead of regular Python lists. However, I tried that and it did not fix the issue.
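For reference, the registration difference I was testing looks like this (a minimal sketch with made-up layers, not my actual network):

import torch.nn as nn

class PlainList(nn.Module):
    def __init__(self):
        super(PlainList, self).__init__()
        self.layers = [nn.Linear(4, 4)]  # plain Python list: the Linear is NOT registered

class WithModuleList(nn.Module):
    def __init__(self):
        super(WithModuleList, self).__init__()
        self.layers = nn.ModuleList([nn.Linear(4, 4)])  # registered as a submodule

print(len(list(PlainList().parameters())))       # 0 -- invisible to .parameters() and .to()
print(len(list(WithModuleList().parameters())))  # 2 -- weight and bias are found

Since my blocks were already wrapped in nn.Sequential, this was not the problem.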
Upvotes: 0
Views: 1432
Reputation: 1
OK, I finally got it.
I was defining some nn.Module objects inside the forward function of the ResNetBlock class (the downsampling nn.Conv2d in particular). Those are constructed fresh, on the CPU, at every forward pass, so net.to('cuda') never sees them: PyTorch only registers submodules that are assigned as attributes in __init__. That also explains why next(net.parameters()).is_cuda returned True — the conv created in forward was never registered as a parameter at all. I changed my implementation to define those objects in __init__, and it worked.
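Here is roughly what the fixed block looks like — a sketch of my change, assuming a PyTorch version that has nn.Identity (an empty nn.Sequential() works the same way):

import torch.nn as nn
import torch.nn.functional as F

class ResNetBlock(nn.Module):
    def __init__(self, in_planes, planes, stride=1):
        super(ResNetBlock, self).__init__()
        self.fx = nn.Sequential(nn.Conv2d(in_planes, planes, 3, stride=stride, padding=1),
                                nn.ReLU(),
                                nn.Conv2d(planes, planes, 3, padding=1))
        if stride != 1:
            # the downsampling shortcut now lives in __init__, so .to('cuda') moves it
            self.shortcut = nn.Conv2d(in_planes, planes, 2, stride=2)
        else:
            self.shortcut = nn.Identity()

    def forward(self, x):
        # no nn.Module is constructed here anymore
        return F.relu(self.fx(x) + self.shortcut(x))

Since the shortcut is now a registered submodule, net.to('cuda') moves its weights along with everything else, and the optimizer actually trains it instead of it being re-randomized at every forward pass.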
Thank you for your help :)
Upvotes: 0
Reputation: 331
We would need a snippet of your training loop to better determine the error.
I am assuming that somewhere in that loop you have some lines of code which do the following:
for data, label in CifarDataLoader:
    data, label = data.to('cuda'), label.to('cuda')
My first guess would be to add a line just before the for loop:

resnet = resnet.to('cuda')

Let me know if this works; if not, I would need more of your code to find the error.
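Putting the two suggestions together, a minimal version of the loop would look like this (CifarDataLoader, criterion and optimizer are placeholder names):

resnet = resnet.to('cuda')  # move the model once, before the loop
for data, label in CifarDataLoader:
    data, label = data.to('cuda'), label.to('cuda')  # move each batch
    optimizer.zero_grad()
    loss = criterion(resnet(data), label)
    loss.backward()
    optimizer.step()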
Upvotes: 1