我正在尝试从 Distributed data parallel training in Pytorch 运行脚本 mnist-distributed.py
.我也在这里粘贴了相同的代码。 (我已将我的实际 MASTER_ADDR
替换为 a.b.c.d
以便在此处发布)。
import os
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist
class ConvNet(nn.Module):
def __init__(self, num_classes=10):
super(ConvNet, self).__init__()
self.layer1 = nn.Sequential(
nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
nn.BatchNorm2d(16),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2))
self.layer2 = nn.Sequential(
nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2))
self.fc = nn.Linear(7*7*32, num_classes)
def forward(self, x):
out = self.layer1(x)
out = self.layer2(out)
out = out.reshape(out.size(0), -1)
out = self.fc(out)
return out
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
parser.add_argument('-g', '--gpus', default=1, type=int,
help='number of gpus per node')
parser.add_argument('-nr', '--nr', default=0, type=int,
help='ranking within the nodes')
parser.add_argument('--epochs', default=2, type=int, metavar='N',
help='number of total epochs to run')
args = parser.parse_args()
args.world_size = args.gpus * args.nodes
os.environ['MASTER_ADDR'] = 'a.b.c.d'
os.environ['MASTER_PORT'] = '8890'
mp.spawn(train, nprocs=args.gpus, args=(args,))
def train(gpu, args):
rank = args.nr * args.gpus + gpu
dist.init_process_group(
backend='nccl',
init_method='env://',
world_size=args.world_size,
rank=rank
)
torch.manual_seed(0)
model = ConvNet()
torch.cuda.set_device(gpu)
model.cuda(gpu)
batch_size = 100
# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss().cuda(gpu)
optimizer = torch.optim.SGD(model.parameters(), 1e-4)
# Wrap the model
model = nn.parallel.DistributedDataParallel(model,
device_ids=[gpu])
# Data loading code
train_dataset = torchvision.datasets.MNIST(
root='./data',
train=True,
transform=transforms.ToTensor(),
download=True
)
train_sampler = torch.utils.data.distributed.DistributedSampler(
train_dataset,
num_replicas=args.world_size,
rank=rank
)
train_loader = torch.utils.data.DataLoader(
dataset=train_dataset,
batch_size=batch_size,
shuffle=False,
num_workers=0,
pin_memory=True,
sampler=train_sampler)
total_step = len(train_loader)
for epoch in range(args.epochs):
for i, (images, labels) in enumerate(train_loader):
images = images.cuda(non_blocking=True)
labels = labels.cuda(non_blocking=True)
# Forward pass
outputs = model(images)
loss = criterion(outputs, labels)
# Backward and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i + 1) % 100 == 0 and gpu == 0:
print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
epoch + 1,
args.epochs,
i + 1,
total_step,
loss.item())
)
if __name__ == '__main__':
main()
有 2 个节点,每个节点有 2 个 GPU。我从主节点的终端运行这个命令-
python mnist-distributed.py -n 2 -g 2 -nr 0
,然后这个来自另一个节点的终端-
python mnist-distributed.py -n 2 -g 2 -nr 1
但随后我的进程卡住了,两个终端上都没有输出。
使用以下命令在单个节点上运行相同的代码非常好-
python mnist-distributed.py -n 1 -g 2 -nr 0
最佳答案
我遇到了类似的问题。问题解决了
sudo vi /etc/default/grub
编辑它:
#GRUB_CMDLINE_LINUX="" <----- Original commented
GRUB_CMDLINE_LINUX="iommu=soft" <------ Change
sudo update-grub
重新启动以查看更改。
引用:https://github.com/pytorch/pytorch/issues/1637#issuecomment-338268158
关于deep-learning - 使用 PyTorch DistributedDataParallel 在多个节点上训练时进程卡住,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/63968082/