Why pytorch training on CUDA works much slower than in CPU?

Question

I guess i have made something in folowing simple neural network with PyTorch, because this runs much slower with CUDA then in CPU, can you find the mistake pls. The using function like

    def backward(ctx, input):

        return backward_sigm(ctx, input)

seems have no real impact on preformance

import torch
import torch.nn as nn
import torch.nn.functional as f


dname = 'cuda:0'
dname = 'cpu'




device = torch.device(dname)


print(torch.version.cuda)

def forward_sigm(ctx, input):

    sigm = 1 / (1 + torch.exp(-input))

    ctx.save_for_backward(sigm)

    return sigm

def forward_step(ctx, input):

    return  torch.tensor(input > 0.5, dtype = torch.float32, device = device)


def backward_sigm(ctx, grad_output):

    sigm, = ctx.saved_tensors

    return grad_output * sigm * (1-sigm)


def backward_step(ctx, grad_output):

    return grad_output




class StepAF(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        return forward_sigm(ctx, input)


    @staticmethod
    def backward(ctx, input):

        return backward_sigm(ctx, input)
    #else return grad_output



class StepNN(torch.nn.Module):

    def __init__(self, input_size, hidden_size, output_size):
        super(StepNN, self).__init__()
        self.linear1 = torch.nn.Linear(input_size, hidden_size)
        #self.linear1.cuda()
        self.linear2 = torch.nn.Linear(hidden_size, output_size)
        #self.linear2.cuda()

        #self.StepAF = StepAF.apply



    def forward(self,x):

        h_line_1 = self.linear1(x)

        h_thrash_1 = StepAF.apply(h_line_1)

        h_line_2 = self.linear2(h_thrash_1)

        output = StepAF.apply(h_line_2)

        return output


inputs = torch.tensor( [[1,0,1,0],[1,0,0,1],[0,1,0,1],[0,1,1,0],[1,0,0,0],[0,0,0,1],[1,1,0,1],[0,1,0,0],], dtype = torch.float32, device = device)

expected = torch.tensor( [[1,0,0],[1,0,0],[0,1,0],[0,1,0],[1,0,0],[0,0,1],[0,1,0],[0,0,1],], dtype = torch.float32, device = device)


nn = StepNN(4,8,3)


#print(*(x for x in nn.parameters()))

criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(nn.parameters(), lr=1e-3)

steps = 50000

print_steps = steps // 20

good_loss = 1e-5

for t in range(steps):

    output = nn(inputs)
    loss = criterion(output, expected)



    if t % print_steps == 0:
        print('step ',t, ', loss :' , loss.item())

    if loss < good_loss:
        print('step ',t, ', loss :' , loss.item())
        break

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()



test = torch.tensor( [[0,1,0,1],[0,1,1,0],[1,0,1,0],[1,1,0,1],], dtype = torch.float32, device=device)


print(nn(test))

Why pytorch training on CUDA works much slower than in CPU?

Answers (1)

Related Questions