Cuda-aware MPI

General introduction: https://developer.nvidia.com/blog/introduction-cuda-aware-mpi/

Cuda-aware MPI Example

#include <stdlib.h>
#include <stdio.h>
#include <mpi.h>
#include <cuda_runtime.h>

int main(int argc, char *argv[]) {
    int myrank;
    float *val_device, *val_host;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    val_host = (float*)malloc(sizeof(float));
    cudaMalloc((void **)&val_device, sizeof(float));

    *val_host = -1.0;
    if (myrank != 0) {
      printf("%s %d %s %f\n", "I am rank", myrank, "and my initial value is:", *val_host);
    }

    if (myrank == 0) {
        *val_host = 42.0;
        cudaMemcpy(val_device, val_host, sizeof(float), cudaMemcpyHostToDevice);
        printf("%s %d %s %f\n", "I am rank", myrank, "and will broadcast value:", *val_host);
    }

    MPI_Bcast(val_device, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);

    if (myrank != 0) {
      cudaMemcpy(val_host, val_device, sizeof(float), cudaMemcpyDeviceToHost);
      printf("%s %d %s %f\n", "I am rank", myrank, "and received broadcasted value:", *val_host);
    }

    cudaFree(val_device);
    free(val_host);

    MPI_Finalize();
    return 0;
}

Cuda-aware MPI CCE

In order to compile the application with the CCE compiler, you need to set up the target acceleration matching the Nvidia A100 GPUs.

# Load the accelerator target module for the Nvidia A100 (nvidia80) GPUs.
module load craype-accel-nvidia80
# Set the accelerator target explicitly in the environment.
export CRAY_ACCEL_TARGET=nvidia80
# Compile for the nvidia80 accelerator target and link the CUDA runtime (-lcudart).
cc cuda-aware-mpi.c -o cuda-aware-mpi -lcudart -target-accel=nvidia80

The CUDA-aware application was dynamically linked with the Cray MPI libraries:

$ ldd cuda-aware-mpi | grep mpi
      libmpi_cray.so.12 => /opt/cray/pe/lib64/libmpi_cray.so.12 (0x00007fabda26a000)
      libmpi_gtl_cuda.so.0 => /opt/cray/pe/lib64/libmpi_gtl_cuda.so.0 (0x00007fabda024000)

Cuda-aware MPI Batch Job

In this example batch script we launch the executable on 2 nodes with 8 tasks per node (16 MPI ranks in total). To enable GPU support, the MPICH_GPU_SUPPORT_ENABLED environment variable must be set to 1.

#!/bin/bash
# Slurm batch job that launches the cuda-aware-mpi executable.
#
# Account and partition
#SBATCH --account=hpcteszt
#SBATCH --partition=ai
# Job name and output file
#SBATCH --job-name=cuda-aware-mpi
#SBATCH --output=cuda-aware-mpi.out
# Resources: 2 nodes, 8 tasks on each node, gpu:1 generic resource, 6 hour time limit
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=8
#SBATCH --gres=gpu:1
#SBATCH --time=06:00:00

# GPU support has to be enabled explicitly before the launch.
export MPICH_GPU_SUPPORT_ENABLED=1
srun ./cuda-aware-mpi