PyTorch Example - MNIST

Simple PyTorch mnist experiment

Tested with PyTorch 1.6.0

Tested with torchvision 0.7.0

Machine Learning on Hopsworks


The hops python module

hops is a helper library for Hops that facilitates development by hiding the complexity of running applications and interacting with services.

Have a feature request or encountered an issue? Please let us know on github.

Using the experiment module

To be able to run your Machine Learning code in Hopsworks, the code for the whole program needs to be provided and put inside a wrapper function. Everything, from importing libraries to reading data and defining the model and running the program needs to be put inside a wrapper function.

The experiment module provides an API to run Python programs such as TensorFlow, Keras and PyTorch on a Hopsworks cluster, on any number of machines and GPUs.

A single run of a Python program is what we refer to as an Experiment.

Grid search or genetic hyperparameter optimization such as differential evolution which runs several Experiments in parallel, which we refer to as Parallel Experiment.

ParameterServerStrategy, CollectiveAllReduceStrategy and MultiworkerMirroredStrategy making multi-machine/multi-gpu training as simple as invoking a function for orchestration. This mode is referred to as Distributed Training.

Using the tensorboard module

The tensorboard module allow us to get the log directory for summaries and checkpoints to be written to the TensorBoard we will see in a bit. The only function that we currently need to call is tensorboard.logdir(), which returns the path to the TensorBoard log directory. Furthermore, the content of this directory will be put in as a Dataset in your project’s Experiments folder.

The directory could in practice be used to store other data that should be accessible after the experiment is finished.

# Use this module to get the TensorBoard logdir
from hops import tensorboard
tensorboard_logdir = tensorboard.logdir()

Using the hdfs module

The hdfs module provides a method to get the path in HopsFS where your data is stored, namely by calling hdfs.project_path(). The path resolves to the root path for your project, which is the view that you see when you click Data Sets in HopsWorks. To point to where your actual data resides in the project you need to append the full path from there to your Dataset. For example if you create a mnist folder in your Resources Dataset, the path to the mnist data would be hdfs.project_path() + 'Resources/mnist'

# Use this module to get the path to your project in HopsFS, then append the path to your Dataset in your project
from hops import hdfs
project_path = hdfs.project_path()
# Downloading the mnist dataset to the current working directory
from hops import hdfs
mnist_hdfs_path = hdfs.project_path() + "Resources/mnist"
local_mnist_path = hdfs.copy_to_local(mnist_hdfs_path)


See the following links to learn more about running experiments in Hopsworks

Managing experiments

Experiments service provides a unified view of all the experiments run using the experiment module.
As demonstrated in the gif it provides general information about the experiment and the resulting metric. Experiments can be visualized meanwhile or after training in a TensorBoard.


def wrapper():
    """Train a small CNN on MNIST and export it to the Hopsworks model registry.

    Intended to be passed to hops ``experiment.launch``. Copies the MNIST
    dataset from the project into the local working directory, trains for two
    epochs, logs batch losses to TensorBoard, registers the trained model via
    hsml, saves an accuracy plot, and returns the final test accuracy (the
    metric hops records for the experiment) plus the plot's file name.
    """
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from torchvision import datasets, transforms
    from torch.utils.tensorboard import SummaryWriter
    import os
    from hops import tensorboard
    from hops import hdfs
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MaxNLocator
    import hsml
    import uuid

    # Per-epoch accuracies, appended to by train()/test() below.
    train_acc = []
    test_acc = []

    class Net(nn.Module):
        """LeNet-style CNN: two conv+max-pool stages, then two fully-connected layers."""

        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 20, 5, 1)
            self.conv2 = nn.Conv2d(20, 50, 5, 1)
            self.fc1 = nn.Linear(4*4*50, 500)
            self.fc2 = nn.Linear(500, 10)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            x = F.max_pool2d(x, 2, 2)
            x = F.relu(self.conv2(x))
            x = F.max_pool2d(x, 2, 2)
            # Flatten: a 28x28 input is 4x4x50 after the two conv/pool stages.
            x = x.view(-1, 4*4*50)
            x = F.relu(self.fc1(x))
            x = self.fc2(x)
            return F.log_softmax(x, dim=1)

    def train(model, device, train_loader, optimizer, writer):
        """Run one training epoch, log batch losses, and record epoch train accuracy."""
        model.train()
        correct = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            # Standard SGD step: without zero_grad/backward/step the model
            # would never learn.
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            pred = output.argmax(dim=1, keepdim=True)  # index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
            if batch_idx % 10 == 0:
                print('[{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))
                writer.add_scalar('Loss/train', loss.item(), batch_idx)
        train_acc.append(100. * correct / len(train_loader.dataset))

    def test(model, device, test_loader):
        """Evaluate on the test set, print a summary, and record epoch test accuracy."""
        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
                pred = output.argmax(dim=1, keepdim=True)  # index of the max log-probability
                correct += pred.eq(target.view_as(pred)).sum().item()
        test_loss /= len(test_loader.dataset)

        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))
        test_acc.append(100. * correct / len(test_loader.dataset))

    # Training settings
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    # Set SummaryWriter to point to the local directory containing the
    # tensorboard logs; hops copies this directory into the Experiments
    # dataset when the run finishes.
    writer = SummaryWriter(log_dir=tensorboard.logdir())

    # The same working directory may be used multiple times if running
    # multiple experiments -- make sure we only download the dataset once.
    if not os.path.exists(os.getcwd() + '/MNIST'):
        # Copy dataset from project to local filesystem.
        # NOTE(review): assumes the raw MNIST files live under
        # Resources/MNIST in the project -- confirm the dataset path.
        hdfs.copy_to_local(hdfs.project_path() + 'Resources/MNIST')

    mnist_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(os.getcwd(), train=True, download=False,
                       transform=mnist_transform),
        batch_size=64, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(os.getcwd(), train=False, download=False,
                       transform=mnist_transform),
        batch_size=1000, shuffle=True, **kwargs)

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

    # Train model
    epochs = range(1, 3)
    for _ in epochs:
        train(model, device, train_loader, optimizer, writer)
        test(model, device, test_loader)

    # Export model: save the weights locally, then register them with hsml.
    export_path = os.getcwd() + '/model-' + str(uuid.uuid4())
    os.mkdir(export_path)
    torch.save(model.state_dict(), export_path + "/model.pt")
    conn = hsml.connection()
    mr = conn.get_model_registry()
    torch_model = mr.torch.create_model(
        "mnist_torch",
        metrics={'train_acc': train_acc[-1], 'test_acc': test_acc[-1]})
    torch_model.save(export_path)

    # Plot per-epoch train/test accuracy and save it next to the TensorBoard
    # logs so it is archived with the experiment.
    data = {'Epoch': list(epochs), 'Train': train_acc, 'Test': test_acc}
    ax = plt.figure().gca()
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # integer epoch ticks
    plt.plot('Epoch', 'Train', data=data, marker='o', markerfacecolor='black',
             markersize=8, color='blue', linewidth=3)
    plt.plot('Epoch', 'Test', data=data, marker='o', markerfacecolor='black',
             markersize=8, color='purple', linewidth=3)
    plt.savefig(tensorboard.logdir() + '/train_summary.png')

    # Scalar metric (not a one-element list) so hops records it correctly,
    # consistent with the metrics dict passed to create_model above.
    return {'accuracy': test_acc[-1], 'train_summary': 'train_summary.png'}
Starting Spark application
IDYARN Application IDKindStateSpark UIDriver log
SparkSession available as 'spark'.
from hops import experiment
# Launch `wrapper` as a simple (single-run) experiment. local_logdir=True
# writes TensorBoard logs to the local FS instead of HDFS; hops copies the
# contents into the project's Experiments dataset after the run finishes.
experiment.launch(wrapper, local_logdir=True)