thehumaneraser

Reputation: 620

CUDA error: device-side assert triggered - only after a certain number of batches

I am trying to train a neural network on a dataset. It runs on a Google Cloud virtual machine with a Tesla V100 GPU. However, before I can finish training a single epoch, I get the error message "CUDA error: device-side assert triggered". I think the problem may be in my data, but I have no idea where, and I'm not sure what the problem is exactly (I tested the same code with a different dataset and it ran fine).

The odd thing is that the network actually runs for some time before triggering the error. I had it print every time it finished a batch: sometimes it finishes 60+ batches, sometimes 80+, and I've even seen it finish as many as 140 (given the size of my data and my batches, there are 200 batches per epoch). No matter how many it finishes, it eventually triggers this error and has never completed an epoch.

I tried setting CUDA_LAUNCH_BLOCKING = 1 and did not get a more informative error message. I of course made sure the neural network has the right number of input and output parameters (this must be the case, since it works for the first however-many batches). I also standardized the inputs: some were very large and some were close to zero, so I normalized them all to the range [-1, 1]. The network should certainly be able to handle that, but the problem persists.
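
For what it's worth, a common cause of this exact assert with a cross-entropy loss is a target label outside [0, num_classes). Below is a minimal sketch of that check (not part of my training code), using train_label from the formatting code further down and the network's 7 output classes:

# Label-range check (sketch, not part of my original code): CrossEntropyLoss
# asserts on the GPU when a target falls outside [0, num_classes);
# the network below has 7 outputs
lo, hi = train_label.min().item(), train_label.max().item()
assert 0 <= lo and hi < 7, f"labels span [{lo}, {hi}], outside the valid [0, 7)"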

Here is my training loop, which WORKS with a different dataset. It is always the line "loss.backward()" that eventually triggers the error message.

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set in the environment before CUDA initializes

start = time.time()
for epoch in range(1,6):

    # Decrease learning rate at epoch 3 and 5
    if epoch == 3 or epoch == 5:
        lr = lr/3

    # Setup optimizer
    optimizer = optim.SGD(net.parameters(), lr=lr)

    # Initialize stats to zeros to track network's progress
    running_loss = 0
    running_error = 0
    num_batches = 0

    # Shuffle indices to train randomly
    shuffled_indices = torch.randperm(50000)

    for count in range(0, 50000, bs):

        # Clear gradient before each iteration
        optimizer.zero_grad()

        # Setup indices for minibatch
        if (count + bs > 50000):
            indices_list = shuffled_indices[count : ].tolist() + shuffled_indices[ : (count + bs) - 50000].tolist()
            indices = torch.tensor(indices_list, dtype=torch.long)  # indices must be long for tensor indexing
        else:
            indices = shuffled_indices[count : count + bs]

        # Create minibatch
        minibatch_data = train_data[indices]
        minibatch_label = train_label[indices]

        # Send minibatch to gpu for training
        minibatch_data = minibatch_data.to(device)
        minibatch_label = minibatch_label.to(device)
        # Standardize entries with mean and std
        inputs = ((minibatch_data - mean) / std).view(bs, 33)

        # Begin tracking changes
        inputs.requires_grad_()

        # Forward inputs through the network
        scores = net(inputs)

        print(scores[:2])
        print(minibatch_label)

        # Compute loss
        loss = criterion(scores, minibatch_label)

        # Backpropagate through the network
        loss.backward()

        # Do one step of stochastic gradient descent
        optimizer.step()

        # Update summary statistics
        with torch.no_grad():
            num_batches += 1
            error = get_error(scores, minibatch_label)
            running_error += error
            running_loss += loss.item()

        print("success: ", num_batches)    

    # At the end of each epoch, compute and print summary statistics
    total_error = running_error / num_batches
    avg_loss = running_loss / num_batches
    print('Epoch: ', epoch)
    print('Time: ', time.time() - start, '\t Loss: ', avg_loss, '\t Error (%): ', total_error * 100)
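
Since CUDA launches are asynchronous, the loss.backward() line that reports the error is not necessarily the operation that caused it. One way to get an ordinary Python traceback is to push the same batches through the model on the CPU. A minimal sketch, not part of my actual script, reusing net, criterion, bs, and the tensors defined in this question:

# Debugging sketch (not my original code): on the CPU the failure is
# synchronous, so the offending op raises a readable Python error
# (e.g. "IndexError: Target 7 is out of bounds" for a bad label)
# instead of an opaque device-side assert.
net_cpu = net.cpu()
for count in range(0, 50000, bs):
    net_cpu.zero_grad()
    batch = train_data[count:count + bs].view(bs, 33)  # unstandardized is fine for this test
    labels = train_label[count:count + bs]
    loss = criterion(net_cpu(batch), labels)
    loss.backward()
print("all batches ran cleanly on the CPU")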

Here is my dataset formatting:

train_list_updated = []
train_label_list = []
for entry in train_list[1:]:  # skip the first (header) row
    # Map string fields to numeric codes
    entry[0] = string_to_int(entry[0])
    entry[1] = handedness[entry[1]]
    entry[2] = string_to_int(entry[2])
    entry[3] = handedness[entry[3]]
    entry[4] = string_to_int(entry[4])
    entry[5] = string_to_int(entry[5])
    entry[6] = string_to_int(entry[6])
    entry[17] = entry[17].replace(':','')  # remove ':' so the field parses as a number
    entry[-3] = pitch_types[entry[-3]]
    entry[-2] = pitch_outcomes[entry[-2]]  # this becomes the label
    train_label_list.append(entry[-2])
    # Drop the label and two other unused trailing fields from the inputs
    del entry[-1]
    del entry[-1]
    del entry[-3]
    train_list_updated.append(entry)

# Coerce every field to float; anything unparseable becomes 0.0
final_train_list = []
for entry in train_list_updated:
    for index in range(len(entry)):
        try:
            entry[index] = float(entry[index])
        except (TypeError, ValueError):
            entry[index] = 0.
    final_train_list.append(entry)

# Do the same for the test data
test_list_updated = []
for entry in test_list[1:]:
    entry[0] = string_to_int(entry[0])
    entry[1] = handedness[entry[1]]
    entry[2] = string_to_int(entry[2])
    entry[3] = handedness[entry[3]]
    entry[4] = string_to_int(entry[4])
    entry[5] = string_to_int(entry[5])
    entry[6] = string_to_int(entry[6])
    entry[17] = entry[17].replace(':','')
    entry[-3] = pitch_types[entry[-3]]
    del entry[-1]
    del entry[-1]
    del entry[-3]
    test_list_updated.append(entry)

final_test_list = []
for entry in test_list_updated:
    for index in range(len(entry)):
        try:
            entry[index] = float(entry[index])
        except (TypeError, ValueError):
            entry[index] = 0.
    final_test_list.append(entry)

# Create tensors of test and train data
train_data = torch.tensor(final_train_list)
train_label = torch.tensor(train_label_list)
test_data = torch.tensor(final_test_list)

And normalizing:

# Row index of each column's max and min
max_indices = torch.argmax(train_data, dim = 0)
min_indices = torch.argmin(train_data, dim = 0)

max_values = []
min_values = []
for i in range(33):
    max_idx = max_indices[i].item()
    min_idx = min_indices[i].item()
    max_val = train_data[max_idx][i]
    min_val = train_data[min_idx][i]
    max_values.append(max_val)
    min_values.append(min_val)

max_values = torch.Tensor(max_values)
min_values = torch.Tensor(min_values)
ranges = max_values - min_values

# Broadcast the per-column minima and ranges to the full data shape
min_values = min_values.view(1, 33)
min_values = torch.repeat_interleave(min_values, 582205, dim = 0)
ranges = ranges.view(1, 33)
ranges = torch.repeat_interleave(ranges, 582205, dim = 0)

# Scale each column linearly to [-1, 1]
train_data = train_data - min_values
train_data = 2 * (train_data / ranges)
train_data = train_data - 1
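
For reference, the same [-1, 1] scaling can be written with per-column torch.min/torch.max and broadcasting, which avoids materializing 582205 repeated rows and makes it easy to guard against a constant column, whose zero range would otherwise produce inf or NaN values. A sketch, not the code I actually ran:

# Equivalent normalization sketch (not my original code)
max_values = train_data.max(dim=0).values            # per-column max, shape (33,)
min_values = train_data.min(dim=0).values            # per-column min, shape (33,)
ranges = (max_values - min_values).clamp(min=1e-8)   # guard against constant columns
train_data = 2 * (train_data - min_values) / ranges - 1   # broadcasts over all rows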

And here's my net (most of it is commented out, since I thought maybe there was an issue with the gradient zeroing or something; a five-layer neural network should definitely not cause a problem, though):

"""
DEFINING A NEURAL NETWORK
"""

# Define a fifteen-layer network (currently truncated to five active layers)
class fifteen_layer_net(nn.Module):
    def __init__(self):
        super().__init__()

        self.linear1 = nn.Linear(33, 200)
        self.linear2 = nn.Linear(200, 250)
        self.linear3 = nn.Linear(250, 300)
        self.linear4 = nn.Linear(300, 350)
        self.linear5 = nn.Linear(350, 7)
#         self.linear6 = nn.Linear(400, 450)
#         self.linear7 = nn.Linear(450, 500)
#         self.linear8 = nn.Linear(500, 450)
#         self.linear9 = nn.Linear(450, 400)
#         self.linear10 = nn.Linear(400, 350)
#         self.linear11 = nn.Linear(350, 300)
#         self.linear12 = nn.Linear(300, 250)
#         self.linear13 = nn.Linear(250, 200)
#         self.linear14 = nn.Linear(200, 150)
#         self.linear15 = nn.Linear(150, 7)

    def forward(self, x):
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = F.relu(x)
        x = self.linear3(x)
        x = F.relu(x)
        x = self.linear4(x)
        x = F.relu(x)
        scores = self.linear5(x)
#         x = F.relu(x)
#         x = self.linear6(x)
#         x = F.relu(x)
#         x = self.linear7(x)
#         x = F.relu(x)
#         x = self.linear8(x)
#         x = F.relu(x)
#         x = self.linear9(x)
#         x = F.relu(x)
#         x = self.linear10(x)
#         x = F.relu(x)
#         x = self.linear11(x)
#         x = F.relu(x)
#         x = self.linear12(x)
#         x = F.relu(x)
#         x = self.linear13(x)
#         x = F.relu(x)
#         x = self.linear14(x)
#         x = F.relu(x)
#         scores = self.linear15(x)

        return scores

The network should output scores, compute a loss with a cross-entropy criterion, and then take one step of stochastic gradient descent. This works for a while and then mysteriously breaks, and I have no idea why.
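
For completeness, the setup the description implies looks roughly like this (a sketch reconstructed from the text, not copied from my script; lr is an assumed value, and bs = 250 follows from 50000 samples / 200 batches):

# Implied setup (sketch, not my actual file)
device = torch.device("cuda")
net = fifteen_layer_net().to(device)
criterion = nn.CrossEntropyLoss()
lr = 0.01   # assumed starting value
bs = 250    # 50000 / 250 = 200 batches per epoch, matching the counts above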

Any help is greatly appreciated.

Thanks in advance.

Upvotes: 0

Views: 1828

Answers (1)

Aaditya Ura

Reputation: 12689

I was also facing the same issue. You can try a few things:

  1. Make sure there are no NaN or inf values in your dataset.
  2. Choose your batch size so that (number of samples) % (batch size) == 0 (both checks are sketched below).
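
A minimal sketch of both checks, using train_data, train_label, and bs as named in the question:

import torch

# 1. Scan the inputs for NaN / inf values
bad = torch.isnan(train_data) | torch.isinf(train_data)
print("bad entries:", bad.sum().item())

# 2. Confirm the batch size divides the sample count, so no partial
#    batch is ever formed
n_samples = 50000  # the loop in the question iterates over 50000 shuffled samples
assert n_samples % bs == 0, f"{n_samples} % {bs} != 0"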

Upvotes: 0
