Reputation: 56
I used Colab Pro to train on CIFAR-10, but it is very slow: it takes more than twice as long as running on the CPU, both on Colab Pro and on my own PC. The data is stored on Google Drive, and I am training with PyTorch. Could someone help me?
I also tried setting the accelerator in Colab to 'None', and even that is faster than 'GPU'. It is very strange.
Here is my code:
import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms

label_names = ["airplane", "automobile", "bird",
               "cat", "deer", "dog", "frog", "horse", "ship", "truck"]
label_dict = {}
for idx, name in enumerate(label_names):
    label_dict[name] = idx

def default_loader(path):
    return Image.open(path).convert("RGB")

train_transform = transforms.Compose(
    [
        transforms.RandomResizedCrop((28, 28)),  # originally 32x32
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.RandomRotation(90),
        transforms.RandomGrayscale(0.1),
        transforms.ColorJitter(0.3, 0.3, 0.3, 0.3),
        transforms.ToTensor()
    ]
)
# Data reading and augmentation
class MyDataset(Dataset):
    def __init__(self, im_list, transform=None, loader=default_loader):
        super(MyDataset, self).__init__()
        imgs = []
        for im_item in im_list:
            # the parent folder name is the class label
            im_label_name = im_item.split('/')[-2]
            imgs.append([im_item, label_dict[im_label_name]])
        self.imgs = imgs
        self.transform = transform
        self.loader = loader

    def __getitem__(self, index):
        im_path, im_label = self.imgs[index]
        im_data = self.loader(im_path)
        if self.transform is not None:
            im_data = self.transform(im_data)
        return im_label, im_data

    def __len__(self):
        return len(self.imgs)
im_train_list = glob.glob("./drive/MyDrive/cifar-10-batches-py/train/*/*png")
im_test_list = glob.glob("./drive/MyDrive/cifar-10-batches-py/test/*/*png")
# Datasets and data loaders
train_dataset = MyDataset(im_list=im_train_list, transform=train_transform)
test_dataset = MyDataset(im_list=im_test_list, transform=transforms.ToTensor())
train_data_loader = DataLoader(dataset=train_dataset,
                               batch_size=128, shuffle=True, num_workers=4)
test_data_loader = DataLoader(dataset=test_dataset,
                              batch_size=128, shuffle=False, num_workers=4)
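Because every __getitem__ call opens one PNG file from Google Drive, it is worth timing the input pipeline on its own before blaming the GPU. A minimal sketch (my own check, not part of the original post) that measures a few batches from the loader without running the model:

import time

n_batches = 20
start = time.time()
for i, (labels, inputs) in enumerate(train_data_loader):
    # just fetch the batch; no model work, so this measures pure data loading
    if i + 1 == n_batches:
        break
print("approx. seconds per batch:", (time.time() - start) / n_batches)

If this per-batch time dominates the per-batch GPU compute time, the slowdown is the Drive I/O, not the accelerator.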
# VGGnet
class VGGnet(nn.Module):
    def __init__(self):
        super(VGGnet, self).__init__()
        # input: 3 * 28 * 28
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        # -> 14 * 14
        self.maxpooling1 = nn.MaxPool2d(kernel_size=2, stride=2)
        # the number of channels is usually doubled after downsampling
        self.conv2_1 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU()
        )
        self.conv2_2 = nn.Sequential(
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU()
        )
        # -> 7 * 7
        self.maxpooling2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv3_1 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU()
        )
        self.conv3_2 = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU()
        )
        # -> 4 * 4
        self.maxpooling3 = nn.MaxPool2d(kernel_size=2, stride=2, padding=1)
        self.conv4_1 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU()
        )
        self.conv4_2 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU()
        )
        # -> 2 * 2
        self.maxpooling4 = nn.MaxPool2d(kernel_size=2, stride=2)
        # fully connected head (a single FC layer)
        self.fc = nn.Linear(512 * 4, 10)

    def forward(self, x):
        batchsize = x.size(0)
        out = self.conv1(x)
        out = self.maxpooling1(out)
        out = self.conv2_1(out)
        out = self.conv2_2(out)
        out = self.maxpooling2(out)
        out = self.conv3_1(out)
        out = self.conv3_2(out)
        out = self.maxpooling3(out)
        out = self.conv4_1(out)
        out = self.conv4_2(out)
        out = self.maxpooling4(out)
        out = out.view(batchsize, -1)
        out = self.fc(out)
        # note: nn.CrossEntropyLoss (used below) already applies log-softmax internally
        out = F.log_softmax(out, dim=1)
        return out
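As a quick sanity check of the flattened feature size that feeds nn.Linear(512 * 4, 10) (for a 28x28 input the last pooling output is 512 channels * 2 * 2 = 2048 = 512 * 4), a dummy forward pass can be run on the CPU. This is only an illustrative sketch, not part of the original code:

# illustrative shape check with a random batch of two 28x28 images
dummy = torch.randn(2, 3, 28, 28)
print(VGGnet()(dummy).shape)  # expected: torch.Size([2, 10])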
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epoch_num = 10
lr = 0.01
net = VGGnet().to(device)
# loss
loss_func = nn.CrossEntropyLoss()
# optimizer
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
# learning-rate scheduler (assign it so it can be stepped in the training loop)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)
%%time
for epoch in range(epoch_num):
    print(epoch)
    net.train()
    for i, data in enumerate(train_data_loader):
        labels, inputs = data
        labels, inputs = labels.to(device), inputs.to(device)
        outputs = net(inputs)
        loss = loss_func(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # print("loss is {}".format(loss.item()))
        _, pred = torch.max(outputs.data, dim=1)
        correct = pred.eq(labels.data).sum()
        batch_size = inputs.size(0)
        print("step = {}; loss = {}; mini-batch correct = {}"
              .format(i, loss.item(), 100. * correct / batch_size))
    scheduler.step()  # advance the StepLR schedule once per epoch
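The test_data_loader defined above is never used in this snippet; if it helps, a minimal evaluation sketch (my addition, keeping the same label-first batch ordering as the training loop) could look like this:

# evaluation sketch (not in the original post): accuracy on the test set
net.eval()
correct, total = 0, 0
with torch.no_grad():
    for labels, inputs in test_data_loader:
        labels, inputs = labels.to(device), inputs.to(device)
        outputs = net(inputs)
        _, pred = torch.max(outputs, dim=1)
        correct += pred.eq(labels).sum().item()
        total += labels.size(0)
print("test accuracy = {:.2f}%".format(100. * correct / total))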
Upvotes: 2
Views: 2974
Reputation: 56
I found the reason! When I copy the data to local storage instead of reading it from Google Drive, it is very, very fast! (Although I still don't understand why the CPU difference is not big (Colab "None" vs. my own PC's CPU).)
!mkdir train_local
!wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
!tar zxvf cifar-10-python.tar.gz -C train_local
!rm cifar-10-python.tar.gz
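If you want to keep using the PNG folders that are already on Drive, an alternative with the same effect is to copy them once to the Colab VM's local disk and point the glob paths at the copy. A sketch assuming the same train/test folder layout as above (the cifar10_local directory name is just an example):

# one-time copy from Drive to the VM's local disk (creates ./cifar10_local)
!cp -r ./drive/MyDrive/cifar-10-batches-py ./cifar10_local
im_train_list = glob.glob("./cifar10_local/train/*/*png")
im_test_list = glob.glob("./cifar10_local/test/*/*png")

Copying tens of thousands of small files from Drive is itself slow, but it is a one-time cost per session, after which every epoch reads from fast local storage.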
Upvotes: 1