Do you want to try out this notebook? Get a free account (no credit card required) at hopsworks.ai. You can also install open-source Hopsworks or view tutorial videos here.
Maggy mixed precision training using PyTorch
Mixed precision training with Maggy
Starting with the Volta architecture, NVIDIA GPUs are equipped with so-called Tensor Cores. These cores accelerate computations in half precision and can be used to significantly speed up the training of neural networks, typically without loss of accuracy. This notebook shows a brief example of how to train an MNIST classifier with mixed precision on Maggy. For more information about mixed precision, see here.
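As a minimal standalone sketch of what automatic mixed precision does (independent of Maggy, and assuming a CUDA-capable GPU), the snippet below runs a matrix multiplication under autocast: the matmul executes in float16, using Tensor Cores where available, while precision-sensitive reductions such as sums are typically kept in float32 by the autocast policy.

import torch

# Illustrative only, not part of the Maggy training code below.
if torch.cuda.is_available():
    a = torch.randn(1024, 1024, device="cuda")
    b = torch.randn(1024, 1024, device="cuda")
    with torch.cuda.amp.autocast():
        c = a @ b        # matmul runs in half precision (Tensor Cores where available)
        s = c.sum()      # reduction is typically kept in float32 by the autocast policy
    print(c.dtype, s.dtype)  # expected: torch.float16 torch.float32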
from hops import hdfs
import torch
import torch.nn.functional as F
Define a classifier CNN
class Classifier(torch.nn.Module):

    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Conv2d(1, 10, 5)
        self.l2 = torch.nn.Conv2d(10, 20, 5)
        self.l3 = torch.nn.Linear(20*20*20, 10)

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        # Softmax over the class dimension (dim=1), not the batch dimension.
        x = F.softmax(self.l3(x.flatten(start_dim=1)), dim=1)
        return x
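As a quick shape check (a small usage sketch, assuming single-channel 28x28 MNIST inputs), a dummy batch can be pushed through the model: two 5x5 convolutions reduce 28x28 to 20x20, so the flattened feature vector has 20*20*20 = 8000 elements and the final layer maps it to 10 class scores.

model = Classifier()
dummy = torch.zeros(4, 1, 28, 28)  # batch of 4 single-channel 28x28 images
out = model(dummy)
print(out.shape)  # torch.Size([4, 10])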
Define the training function
As you can see from the example below, mixed precision in Maggy is distribution-transparent and can be employed just as in your normal PyTorch code. Note, however, that you currently cannot combine the GradScaler with ZeRO!
def train_fn(module, hparams, train_set, test_set):
    import time
    import torch
    from torch.cuda.amp import GradScaler, autocast
    from maggy.core.patching import MaggyPetastormDataLoader

    model = module(**hparams)

    n_epochs = 11
    batch_size = 64
    lr_base = 0.01

    # Parameters as in https://arxiv.org/pdf/1706.02677.pdf
    optimizer = torch.optim.SGD(model.parameters(), lr=lr_base, momentum=0.5)
    loss_criterion = torch.nn.CrossEntropyLoss()

    train_loader = MaggyPetastormDataLoader(train_set, batch_size=batch_size)
    test_loader = MaggyPetastormDataLoader(test_set, batch_size=batch_size)

    def eval_model(model, test_loader):
        acc = 0
        model.eval()
        img_cnt = 0
        with torch.no_grad():
            for data in test_loader:
                img, label = data["image"].float(), data["label"].float()
                with autocast():
                    prediction = model(img)
                acc += torch.sum(torch.argmax(prediction, dim=1) == label).detach()
                img_cnt += len(label.detach())
        acc = acc / float(img_cnt)
        print("Test accuracy: {:.3f}\n".format(acc) + 20*"-")
        return acc

    # Initialize a gradient scaler to keep precision of small gradients.
    scaler = GradScaler()

    model.train()
    t_0 = time.time()
    for epoch in range(n_epochs):
        model.train()
        for idx, data in enumerate(train_loader):
            optimizer.zero_grad()
            img, label = data["image"].float(), data["label"].long()
            # Forward pass runs under autocast so eligible ops use half precision.
            with autocast():
                prediction = model(img)
                loss = loss_criterion(prediction, label)
            # Scale the loss before backward; the scaler unscales gradients in step().
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        if epoch % 10 == 0:
            acc = eval_model(model, test_loader)
    t_1 = time.time()
    minutes, seconds = divmod(t_1 - t_0, 60)
    hours, minutes = divmod(minutes, 60)
    print("-"*20 + "\nTotal training time: {:.0f}h {:.0f}m {:.0f}s.".format(hours, minutes, seconds))
    return float(acc)
train_ds = hdfs.project_path() + "/DataSets/MNIST/PetastormMNIST/train_set"
test_ds = hdfs.project_path() + "/DataSets/MNIST/PetastormMNIST/test_set"
print(hdfs.exists(train_ds), hdfs.exists(test_ds))
Defining the config
For mixed precision, the config stays exactly the same as usual. You can now start mixed precision training!
from maggy import experiment
from maggy.experiment_config import TorchDistributedConfig
config = TorchDistributedConfig(name='torch_mixed_precision', module=Classifier, train_set=train_ds, test_set=test_ds, backend="torch")
experiment.lagom(train_fn, config)