Reputation: 620
I am trying to put a dataset through a neural network. It is running on a Google Cloud virtual machine using a Tesla V100 GPU. However, before I can finish training a single epoch, I get an error message: "Cuda error: device side assert triggered". I think the problem may be in my data, but I have no idea where and I'm not sure what the problem is exactly (but I tested the code with a different dataset and it ran fine).
The thing that is odd is that the network actually runs for some time before triggering the error. I had it print every time it finished a batch and sometimes it finishes 60+ batches, sometimes 80+, I've even gotten it to finish as many as 140 batches (given the size of my data and my batches, there are 200 batches in each epoch). No matter how many it finishes, it eventually triggers this error and has not completed an epoch.
I tried setting CUDA_LAUNCH_BLOCKING = 1 and did not get any better error message. I of course made sure the neural network has the right number of input and output parameters (this is given because it works for the first however many batches). I also standardized the inputs. Some were really large and some were closes to zero so I normalized them to all fall in the range [-1,1]. Certainly the network should be able to handle that but it still causes a problem.
Here is my training loop which WORKS with a different data set. It is always the line "loss.backward()" that eventually triggers the error message.
CUDA_LAUNCH_BLOCKING = 1
start = time.time()
for epoch in range(1,6):
# Decrease learning rate at epoch 3 and 5
if epoch == 3 or epoch == 5:
lr = lr/3
# Setup optimizer
optimizer = optim.SGD(net.parameters(), lr=lr)
# Initialize stats to zeros to track network's progress
running_loss = 0
running_error = 0
num_batches = 0
# Shuffle indices to train randomly
shuffled_indices = torch.randperm(50000)
for count in range(0, 50000, bs):
# Clear gradient before each iteration
optimizer.zero_grad()
# Setup indices for minibatch
if (count + bs > 50000):
indices_list = shuffled_indices[count : ].tolist() + shuffled_indices[ : (count + bs) - 50000].tolist()
indices = torch.Tensor(indices_list)
else:
indices = shuffled_indices[count : count + bs]
# Create minibatch
minibatch_data = train_data[indices]
minibatch_label = train_label[indices]
# Send minibatch to gpu for training
minibatch_data = minibatch_data.to(device)
minibatch_label = minibatch_label.to(device)
temp = minibatch_data - mean
# Standardize entries with mean and std
inputs = ((minibatch_data - mean) / std).view(bs, 33)
# Begin tracking changes
inputs.requires_grad_()
# Forward inputs through the network
scores = net(inputs)
print(scores[:2])
print(minibatch_label)
# Compute loss
loss = criterion(scores, minibatch_label)
# Back propogate neural network
loss.backward()
# Do one step of stochastic gradient descent
optimizer.step()
# Update summary statistics
with torch.no_grad():
num_batches += 1
error = get_error(scores, minibatch_label)
running_error += error
running_loss += loss.item()
print("success: ", num_batches)
# At the end of each epoch, compute and print summary statistics
total_error = running_error / num_batches
avg_loss = running_loss / num_batches
print('Epoch: ', epoch)
print('Time: ', time.time(), '\t Loss: ', avg_loss, '\t Error (%): ', total_error * 100)
Here is my dataset formatting and normalizing:
train_list_updated = []
train_label_list = []
for entry in train_list[1:]:
entry[0] = string_to_int(entry[0])
entry[1] = handedness[entry[1]]
entry[2] = string_to_int(entry[2])
entry[3] = handedness[entry[3]]
entry[4] = string_to_int(entry[4])
entry[5] = string_to_int(entry[5])
entry[6] = string_to_int(entry[6])
entry[17] = entry[17].replace(':','')
entry[-3] = pitch_types[entry[-3]]
entry[-2] = pitch_outcomes[entry[-2]]
train_label_list.append(entry[-2])
del entry[-1]
del entry[-1]
del entry[-3]
train_list_updated.append(entry)
final_train_list = []
for entry in train_list_updated:
for index in range(len(entry)):
try:
entry[index] = float(entry[index])
except:
entry[index] = 0.
final_train_list.append(entry)
# Do the same for the test data
test_list_updated = []
for entry in test_list[1:]:
entry[0] = string_to_int(entry[0])
entry[1] = handedness[entry[1]]
entry[2] = string_to_int(entry[2])
entry[3] = handedness[entry[3]]
entry[4] = string_to_int(entry[4])
entry[5] = string_to_int(entry[5])
entry[6] = string_to_int(entry[6])
entry[17] = entry[17].replace(':','')
entry[-3] = pitch_types[entry[-3]]
del entry[-1]
del entry[-1]
del entry[-3]
test_list_updated.append(entry)
final_test_list = []
for entry in test_list_updated:
for index in range(len(entry)):
try:
entry[index] = float(entry[index])
except:
entry[index] = 0.
final_test_list.append(entry)
# Create tensors of test and train data
train_data = torch.tensor(final_train_list)
train_label = torch.tensor(train_label_list)
test_data = torch.tensor(final_test_list)
And normalizing:
max_indices = torch.argmax(train_data, dim = 0)
min_indices = torch.argmin(train_data, dim = 0)
max_values = []
min_values = []
for i in range(33):
max_idx = max_indices[i].item()
min_idx = min_indices[i].item()
max_val = train_data[max_idx][i]
min_val = train_data[min_idx][i]
max_values.append(max_val)
min_values.append(min_val)
max_values = torch.Tensor(max_values)
min_values = torch.Tensor(min_values)
ranges = max_values - min_values
min_values = min_values.view(1, 33)
min_values = torch.repeat_interleave(min_values, 582205, dim = 0)
ranges = ranges.view(1, 33)
ranges = torch.repeat_interleave(ranges, 582205, dim = 0)
train_data = train_data - min_values
train_data = 2 * (train_data / ranges)
train_data = train_data - 1
And here's my net (a lot is commented out since I thought maybe there was an issue with the gradient zeroing or something. A five layer neural network should definitely not cause a problem though):
"""
DEFINING A NEURAL NETWORK
"""
# Define a fifteen layer artificial neural network
class fifteen_layer_net(nn.Module):
def __init__(self):
super().__init__()
self.linear1 = nn.Linear(33, 200)
self.linear2 = nn.Linear(200, 250)
self.linear3 = nn.Linear(250, 300)
self.linear4 = nn.Linear(300, 350)
self.linear5 = nn.Linear(350, 7)
# self.linear6 = nn.Linear(400, 450)
# self.linear7 = nn.Linear(450, 500)
# self.linear8 = nn.Linear(500, 450)
# self.linear9 = nn.Linear(450, 400)
# self.linear10 = nn.Linear(400, 350)
# self.linear11 = nn.Linear(350, 300)
# self.linear12 = nn.Linear(300, 250)
# self.linear13 = nn.Linear(250, 200)
# self.linear14 = nn.Linear(200, 150)
# self.linear15 = nn.Linear(150, 7)
def forward(self, x):
x = self.linear1(x)
x = F.relu(x)
x = self.linear2(x)
x = F.relu(x)
x = self.linear3(x)
x = F.relu(x)
x = self.linear4(x)
x = F.relu(x)
scores = self.linear5(x)
# x = F.relu(x)
# x = self.linear6(x)
# x = F.relu(x)
# x = self.linear7(x)
# x = F.relu(x)
# x = self.linear8(x)
# x = F.relu(x)
# x = self.linear9(x)
# x = F.relu(x)
# x = self.linear10(x)
# x = F.relu(x)
# x = self.linear11(x)
# x = F.relu(x)
# x = self.linear12(x)
# x = F.relu(x)
# x = self.linear13(x)
# x = F.relu(x)
# x = self.linear14(x)
# x = F.relu(x)
# scores = self.linear15(x)
return scores
Network should output scores, compute a loss using cross entropy loss criterion, and then do one step of stochastic gradient descent. This works for awhile and then mysteriously breaks. I have no idea why.
Any help is greatly appreciated.
Thanks in advance.
Upvotes: 0
Views: 1828
Reputation: 12689
I was also facing same issue, You can try few things :
NaN
, and inf
values in your dataset.samples % batchsize = 0
Upvotes: 0