Reputation: 11
I have a few questions:
I think torch.compile() is supposed to improve performance, but in my code, using it takes almost 2 to 30% more time than not using it. Can you tell me what the problem in my code is?
When I use torch.compile in my BERT pre-training model, the following warnings occur and it slows down by dozens of times. Can you tell me what causes them?
torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64) function: 'forward' (/home/mj/.../bert.py:287) reasons: ___check_obj_id(self, 139626116174448) to diagnose recompilation issues, see https://pytorch.org/docs/master/dynamo/troubleshooting.html.
torch._inductor.utils: [WARNING] using triton random, expect difference from eager
<- Error List
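From the first warning, I understand the recompilation cache limit can be raised before compiling, roughly like the sketch below, but I am not sure this is the right fix (256 is an arbitrary value I picked):

# Sketch: raising the dynamo recompilation cache limit before compiling.
# 64 is the default mentioned in the warning; 256 is an arbitrary value.
import torch._dynamo
torch._dynamo.config.cache_size_limit = 256
model = torch.compile(model)  # 'model' stands for the BERT model here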
import torch
import torchvision
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import random
import time
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(1234)
np.random.seed(1234)
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.sequential = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(stride=2, kernel_size=3),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(stride=2, kernel_size=3),
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=2),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(8),
            nn.Flatten(),
            nn.Linear(in_features=128*8*8, out_features=10))

    def forward(self, x):
        out = self.sequential(x)
        return out
# model1 = NeuralNetwork()
# model2 = NeuralNetwork()
model1 = torchvision.models.resnet18()
model2 = torchvision.models.resnet18()
model1.to(device)
model2.to(device)
# compile both models (one intended for training, one intended for testing)
model1 = torch.compile(model1)
model2 = torch.compile(model2)
def cifar10_cnn():
    epochs = 5
    batch_size = 64
    report_period = 100
    tr_count = 0
    te_count = 0
    data_root = "/data/"
    torch.set_float32_matmul_precision('high')
    tr_dset = datasets.CIFAR10(root=data_root, train=True, download=True, transform=transforms.ToTensor())
    te_dset = datasets.CIFAR10(root=data_root, train=False, download=True, transform=transforms.ToTensor())
    tr_loader = DataLoader(tr_dset, batch_size=batch_size, shuffle=True)
    te_loader = DataLoader(te_dset, batch_size=batch_size, shuffle=False)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model1.parameters(), lr=1e-3)
    start_time = time.time()
    for i in range(epochs):
        print(f"\nEpoch {i + 1}/{epochs}\n------------------------------")
        train(tr_loader, model1, loss_fn, optimizer, report_period, start_time)
        print(f"\nTest started with {len(te_loader)} data:")
        test(te_loader, model2, loss_fn, start_time)
def train(dataloader, model1, loss_fn, optimizer, report_period, start_time):
    running_loss = 0.0
    train_loss = 0.0
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        pred = model1(X)
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        train_loss = running_loss / len(dataloader)
        if batch % report_period == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
            print(f"train_loss: {train_loss}")
def test(dataloader, model2, loss_fn, start_time):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model2.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model1(X)  # note: this calls the global model1, not the model2 argument
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
if __name__ == "__main__":
    start = time.time()
    cifar10_cnn()
    print("Done!\n")
    print(f"running time: {time.time()-start}")
First, I tried compiling a single model and running both train and test with it, but I got an error related to differentiation (gradients), so I separated the compiled model used for training from the one used for testing (see the sketch after these notes for the single-model pattern I was attempting).
Second, I tried the model provided by torchvision instead, because I wanted to check whether my own implementation was the problem, but it is equally slow.
Third, I tried a bigger model, because I thought the slowdown might be caused by compilation overhead dominating on a small model, but the result is the same.
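For reference, the single-model pattern I originally wanted (before splitting into model1/model2) was one compiled model switched between train and eval mode, roughly like this sketch with dummy data. I believe switching modes can itself trigger a recompilation, but I am not sure whether that matters here.

# Sketch of the single-model pattern: one compiled model used for both a
# training step and an eval step (dummy data, just to show the structure).
import torch
import torchvision
from torch import nn

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = torchvision.models.resnet18().to(device)
model = torch.compile(model)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

X = torch.randn(8, 3, 224, 224, device=device)   # dummy batch
y = torch.randint(0, 1000, (8,), device=device)   # dummy labels

model.train()                      # training step
loss = loss_fn(model(X), y)
optimizer.zero_grad()
loss.backward()
optimizer.step()

model.eval()                       # evaluation step on the same compiled model
with torch.no_grad():              # no gradients needed here
    pred = model(X)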
Upvotes: 1
Views: 834