motor_junkie
motor_junkie

Reputation: 31

Process stuck when training on multiple nodes using PyTorch DistributedDataParallel

I am trying to run the script mnist-distributed.py from the tutorial "Distributed data parallel training in Pytorch". I have also pasted the same code here. (I have replaced my actual MASTER_ADDR with a.b.c.d for posting here.)

import os
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist

class ConvNet(nn.Module):
    """Small CNN: two conv stages followed by a single linear classifier.

    Each stage is Conv2d(5x5, stride 1, padding 2) -> BatchNorm2d -> ReLU ->
    MaxPool2d(2x2, stride 2), so only the pooling halves the spatial size.
    The classifier consumes 7*7*32 flattened features, which corresponds to
    a 28x28 single-channel input (28 -> 14 -> 7 after the two poolings).

    Args:
        num_classes: Number of output logits produced by the final layer.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Stages are created in the same order as they are applied, so the
        # parameter initialisation order (and RNG consumption) is stable.
        self.layer1 = self._make_stage(1, 16)
        self.layer2 = self._make_stage(16, 32)
        self.fc = nn.Linear(7 * 7 * 32, num_classes)

    @staticmethod
    def _make_stage(in_channels, out_channels):
        """Build one conv -> batch-norm -> ReLU -> max-pool stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels,
                      kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

    def forward(self, x):
        """Return the class logits for the input batch ``x``."""
        features = self.layer2(self.layer1(x))
        # Flatten everything except the batch dimension for the linear layer.
        flat = features.reshape(features.size(0), -1)
        return self.fc(flat)

def main():
    """Parse the command line, set the rendezvous address and spawn workers.

    Reads ``--nodes``, ``--gpus``, ``--nr`` and ``--epochs``, derives
    ``args.world_size`` as ``gpus * nodes``, exports MASTER_ADDR/MASTER_PORT
    for the ``env://`` init method and starts ``args.gpus`` processes that
    each run ``train(local_index, args)``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-n', '--nodes', type=int, default=1, metavar='N')
    cli.add_argument('-g', '--gpus', type=int, default=1,
                     help='number of gpus per node')
    cli.add_argument('-nr', '--nr', type=int, default=0,
                     help='ranking within the nodes')
    cli.add_argument('--epochs', type=int, default=2, metavar='N',
                     help='number of total epochs to run')
    args = cli.parse_args()

    # Total number of processes taking part in the job, across all nodes.
    args.world_size = args.gpus * args.nodes

    # Rendezvous endpoint read by init_process_group(init_method='env://').
    os.environ.update({
        'MASTER_ADDR': 'a.b.c.d',
        'MASTER_PORT': '8890',
    })

    # One worker process per local GPU; mp.spawn passes the process index
    # as the first argument of train().
    mp.spawn(train, nprocs=args.gpus, args=(args,))

def train(gpu, args):
    """Train ConvNet on MNIST as one worker of a distributed NCCL job.

    Args:
        gpu: Index of this process on the local node, as supplied by
            ``mp.spawn``; also used as the CUDA device index.
        args: Parsed command-line namespace. Reads ``nr`` (node index),
            ``gpus`` (processes per node), ``world_size`` and ``epochs``.
    """
    # Global rank: nodes are numbered by args.nr, GPUs within a node by gpu.
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(
        backend='nccl',
        init_method='env://',  # reads MASTER_ADDR / MASTER_PORT from os.environ
        world_size=args.world_size,
        rank=rank
    )

    # Same seed in every process so all replicas start from equal weights.
    torch.manual_seed(0)
    model = ConvNet()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    batch_size = 100
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)

    # Wrap the model
    model = nn.parallel.DistributedDataParallel(model,
                                                device_ids=[gpu])

    # Data loading code
    train_dataset = torchvision.datasets.MNIST(
        root='./data',
        train=True,
        transform=transforms.ToTensor(),
        download=True
    )
    # Each rank iterates over its own disjoint shard of the dataset.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=args.world_size,
        rank=rank
    )

    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=False,  # ordering is delegated to the sampler
        num_workers=0,
        pin_memory=True,
        sampler=train_sampler)

    total_step = len(train_loader)
    for epoch in range(args.epochs):
        # Without set_epoch() the sampler reuses the same permutation every
        # epoch; it must be called before the loader iterator is created.
        train_sampler.set_epoch(epoch)
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Only the first local process on each node reports progress.
            if (i + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1,
                    args.epochs,
                    i + 1,
                    total_step,
                    loss.item())
                )

    # Release the communicator resources once training has finished.
    dist.destroy_process_group()

# Run the launcher only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

There are 2 nodes with 2 GPUs each. I run this command from the terminal of the master node:

python mnist-distributed.py -n 2 -g 2 -nr 0

and then this one from the terminal of the other node:

python mnist-distributed.py -n 2 -g 2 -nr 1

But then my process gets stuck with no output on either terminal.

Running the same code on a single node using the following command works perfectly fine-

python mnist-distributed.py -n 1 -g 2 -nr 0

Upvotes: 3

Views: 2864

Answers (1)

Frank
Frank

Reputation: 1249

I ran into a similar problem, and it was solved by editing the GRUB configuration:

sudo vi /etc/default/grub

Edit it:

#GRUB_CMDLINE_LINUX=""                           <----- Original commented
GRUB_CMDLINE_LINUX="iommu=soft"           <------ Change
sudo update-grub

Reboot to see the change.

Ref: https://github.com/pytorch/pytorch/issues/1637#issuecomment-338268158

Upvotes: 1

Related Questions