Maggy mixed precision training using PyTorch

Mixed precision training with Maggy

Starting with the Volta architecture, NVIDIA GPUs are equipped with so-called Tensor Cores. These cores accelerate half-precision computations and can be used to significantly speed up the training of neural networks without loss of accuracy. This notebook shows a brief example of how to train an MNIST classifier with mixed precision on Maggy. For more information about mixed precision, see here.

from hops import hdfs
import torch
import torch.nn.functional as F

Define a classifier CNN

class Classifier(torch.nn.Module):
    """Small CNN that classifies single-channel images into 10 classes.

    Two unpadded 5x5 convolutions (1 -> 10 -> 20 channels) are followed by a
    linear layer over the flattened feature map. The linear layer expects
    20 * 20 * 20 input features, i.e. a 20x20 map after both convolutions,
    which corresponds to 28x28 input images.
    """

    def __init__(self):
        # Module.__init__ must run before any submodule is assigned; without
        # it torch raises an AttributeError on the first layer assignment.
        super().__init__()
        self.l1 = torch.nn.Conv2d(1, 10, 5)
        self.l2 = torch.nn.Conv2d(10, 20, 5)
        self.l3 = torch.nn.Linear(20 * 20 * 20, 10)

    def forward(self, x):
        """Return per-sample class probabilities of shape (batch, 10).

        Args:
            x: Float tensor of shape (batch, 1, 28, 28).
        """
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        logits = self.l3(x.flatten(start_dim=1))
        # Normalize over the class dimension (dim=1), not the batch dimension,
        # so that every sample gets its own probability distribution.
        # NOTE(review): torch.nn.CrossEntropyLoss applies log-softmax itself;
        # if this output is fed to that loss, consider returning the logits.
        return F.softmax(logits, dim=1)

Define the training function

As you can see from the example below, mixed precision in Maggy is distribution-transparent and can be employed just as in your normal PyTorch code. Note, however, that you currently cannot combine the GradScaler with ZeRO!

def train_fn(module, hparams, train_set, test_set):
    """Train ``module`` with automatic mixed precision and report test accuracy.

    Args:
        module: Model class; instantiated as ``module(**hparams)``.
        hparams: Keyword arguments forwarded to the model constructor.
        train_set: Training dataset handed to ``MaggyPetastormDataLoader``.
        test_set: Test dataset handed to ``MaggyPetastormDataLoader``.

    Returns:
        The accuracy of the most recent evaluation as a Python float.
    """
    import time
    import torch
    from torch.cuda.amp import GradScaler, autocast

    from maggy.core.patching import MaggyPetastormDataLoader

    model = module(**hparams)
    n_epochs = 11
    batch_size = 64
    lr_base = 0.01
    optimizer = torch.optim.SGD(model.parameters(), lr=lr_base, momentum=0.5)
    loss_criterion = torch.nn.CrossEntropyLoss()
    train_loader = MaggyPetastormDataLoader(train_set, batch_size=batch_size)
    test_loader = MaggyPetastormDataLoader(test_set, batch_size=batch_size)

    def eval_model(model, test_loader):
        """Compute and print the classification accuracy over ``test_loader``."""
        acc = 0
        img_cnt = 0
        with torch.no_grad():
            for data in test_loader:
                # Labels are class indices; keep them integral, as in training.
                img, label = data["image"].float(), data["label"].long()
                with autocast():
                    prediction = model(img)
                acc += torch.sum(torch.argmax(prediction, dim=1) == label).detach()
                img_cnt += len(label.detach())
        acc = acc/float(img_cnt)
        print("Test accuracy: {:.3f}\n".format(acc) + 20*"-")
        return acc

    # Initialize a gradient scaler to keep precision of small gradients.
    scaler = GradScaler()

    t_0 = time.time()
    for epoch in range(n_epochs):
        for data in train_loader:
            img, label = data["image"].float(), data["label"].long()
            optimizer.zero_grad()
            # Only the forward pass and the loss run under autocast.
            with autocast():
                prediction = model(img)
                loss = loss_criterion(prediction, label)
            # Scale the loss so small half-precision gradients do not underflow,
            # then let the scaler unscale them, step the optimizer and adapt
            # its scale factor for the next iteration.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        # Evaluate on the first epoch and every tenth epoch afterwards.
        if epoch % 10 == 0:
            acc = eval_model(model, test_loader)
    t_1 = time.time()
    minutes, seconds = divmod(t_1 - t_0, 60)
    hours, minutes = divmod(minutes, 60)
    print("-"*20 + "\nTotal training time: {:.0f}h {:.0f}m {:.0f}s.".format(hours, minutes, seconds))
    return float(acc)
# Resolve the HDFS locations of the Petastorm MNIST splits inside the project
# and report whether each of them exists.
train_ds = hdfs.project_path() + "/DataSets/MNIST/PetastormMNIST/train_set"
test_ds = hdfs.project_path() + "/DataSets/MNIST/PetastormMNIST/test_set"
train_found = hdfs.exists(train_ds)
test_found = hdfs.exists(test_ds)
print(train_found, test_found)

Defining the config

For mixed precision, the config stays exactly the same as usual. You can now start mixed precision training!

from maggy import experiment
from maggy.experiment_config import TorchDistributedConfig

# Describe the distributed PyTorch experiment: which model class to build and
# which datasets to feed it. Nothing here is specific to mixed precision.
config = TorchDistributedConfig(
    name='torch_mixed_precision',
    module=Classifier,
    train_set=train_ds,
    test_set=test_ds,
    backend="torch",
)

# Hand the training function and its configuration to Maggy for execution.
experiment.lagom(train_fn, config)