Commit e1334a58 authored by Erik Strand

Add MPI pi calculation (GPU)

parent fdf08d82
mpi_pi_gpu
output
# nvcc is needed for the __global__ kernels and <<<...>>> launches; mpic++ is
# passed as the host compiler so the MPI headers and libraries are picked up.
mpi_pi_gpu: mpi_pi_gpu.c
	nvcc -x cu -ccbin mpic++ $< -o $@
#!/bin/bash
module purge
module load spack git gcc/7.3.0 cuda/10.1 openmpi/3.1.4-pmi-cuda-ucx
// Neil Gershenfeld 10/14/20
// Erik Strand 3/6/2021
// calculation of pi by a CUDA+MPI sum
// assumes one GPU per MPI rank
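// Each term 0.5/((i-0.75)*(i-0.25)) splits by partial fractions into
// 4/(4i-3) - 4/(4i-1), so summing over i = 1, 2, 3, ... gives the Leibniz
// series 4*(1 - 1/3 + 1/5 - 1/7 + ...), which converges to pi.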
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <cuda_runtime.h>
#include <iostream>
#include <mpi.h>
using namespace std;
uint64_t const n_terms_per_thread = 100000;
uint64_t const n_threads_per_gpu = 1024;
uint64_t const n_terms_per_gpu = n_terms_per_thread * n_threads_per_gpu;
uint64_t const n_threads_per_block = 512;
uint64_t const n_blocks_per_gpu = (n_threads_per_gpu + n_threads_per_block - 1) / n_threads_per_block;
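// Ceiling division: with 1024 threads per GPU and 512 threads per block this gives 2 blocks.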
int nloop = 10;
__global__
void init(double *arr, int gpu_idx) {
    // Each thread sums its own contiguous chunk of n_terms_per_thread terms.
    uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
    uint64_t const start = n_terms_per_gpu * gpu_idx + n_terms_per_thread * thread_idx + 1;
    uint64_t const end = start + n_terms_per_thread;
    double sum = 0.0;
    for (uint64_t i = start; i < end; ++i) {
        sum += 0.5 / ((i - 0.75) * (i - 0.25));
    }
    arr[thread_idx] = sum;
}
__global__
void reduce_sum(double *arr, uint64_t stride) {
    uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (thread_idx < stride) {
        arr[thread_idx] += arr[thread_idx + stride];
    }
}
// Tree reduction: repeatedly fold the upper half of the array onto the lower
// half, leaving the total in arr[0]. Assumes n_threads_per_gpu is a power of two.
void reduce(double *arr) {
    uint64_t stride = n_threads_per_gpu >> 1;
    while (stride > 0) {
        reduce_sum<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, stride);
        stride = stride >> 1;
    }
}
int main(int argc, char** argv) {
    char* local_rank_str = NULL;
    int local_rank = 0;
    local_rank_str = getenv("SLURM_LOCALID");
    if (local_rank_str != NULL) {
        local_rank = atoi(local_rank_str);
        std::cout << "slurm local rank = " << local_rank << '\n';
    } else {
        std::cout << "slurm local rank not defined\n";
    }
    int n_gpus;
    cudaGetDeviceCount(&n_gpus);
    // relative to this node
    int local_gpu_idx = local_rank % n_gpus;
    //cudaSetDevice(local_gpu_idx);
    int n_tasks, rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &n_tasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    // We assume that all nodes have the same number of GPUs.
    int gpu_idx = n_gpus * rank + local_gpu_idx;
    int host_name_length;
    char host_name[MPI_MAX_PROCESSOR_NAME];
    MPI_Get_processor_name(host_name, &host_name_length);
    std::cout << "task: " << rank << " of " << n_tasks << '\n';
    std::cout << "node: " << host_name << '\n';
    std::cout << "local gpu idx: " << local_gpu_idx << '\n';
    std::cout << "global gpu idx: " << gpu_idx << '\n';
/*
int rank,nranks;
MPI_Init(&argc,&argv);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
MPI_Comm_size(MPI_COMM_WORLD,&nranks);
double *arr,result,pi;
cudaMalloc(&arr,nthreads*sizeof(double));
if (rank == 0) {
for (int i = 0; i < nloop; ++i) {
MPI_Barrier(MPI_COMM_WORLD);
double tstart = MPI_Wtime();
init<<<blocks,threads>>>(arr,npts,nthreads,rank);
reduce(arr);
cudaDeviceSynchronize();
cudaMemcpy(&result,arr,8,cudaMemcpyDeviceToHost);
MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
double tend = MPI_Wtime();
double dt = tend-tstart;
double gflops = npts*nthreads*nranks*5.0/dt/1e9;
printf("npts = %ld, nthreads = %ld, nranks = %d, pi = %lf\n",npts,nthreads,nranks,pi);
printf("time = %f, estimated GFlops = %f\n",dt,gflops);
}
}
else {
for (int i = 0; i < nloop; ++i) {
MPI_Barrier(MPI_COMM_WORLD);
init<<<blocks,threads>>>(arr,npts,nthreads,rank);
reduce(arr);
cudaDeviceSynchronize();
cudaMemcpy(&result,arr,8,cudaMemcpyDeviceToHost);
MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
}
}
cudaFree(arr);
*/
    MPI_Finalize();
    return 0;
}
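The timing loop at the end of main is still commented out and uses the variable names from the earlier single-file version (npts, nthreads, blocks, threads, nranks). As a sketch only, assuming the port keeps the same structure (barrier, init kernel, on-device reduction, cudaMemcpy of the partial sum, MPI_Reduce to rank 0), that loop rewritten against this file's constants and kernels might look like the following; it is not part of the commit.

// Sketch: hypothetical replacement for the commented-out block inside main().
double *arr, result, pi;
cudaMalloc(&arr, n_threads_per_gpu * sizeof(double));
for (int i = 0; i < nloop; ++i) {
    MPI_Barrier(MPI_COMM_WORLD);
    double tstart = MPI_Wtime();
    // One GPU per rank, so the rank doubles as the gpu_idx term offset in init.
    init<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, rank);
    reduce(arr); // leaves the per-rank partial sum in arr[0]
    cudaDeviceSynchronize();
    cudaMemcpy(&result, arr, sizeof(double), cudaMemcpyDeviceToHost);
    MPI_Reduce(&result, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    double tend = MPI_Wtime();
    if (rank == 0) {
        double dt = tend - tstart;
        // ~5 flops per term, summed over every thread on every rank
        double gflops = n_terms_per_gpu * n_tasks * 5.0 / dt / 1e9;
        std::cout << "pi = " << pi << ", time = " << dt
                  << " s, estimated GFlops = " << gflops << '\n';
    }
}
cudaFree(arr);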
#!/bin/bash
#SBATCH -J mpi_pi_gpu
#SBATCH -o output/%j.out
#SBATCH -e output/%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --gres=gpu:2
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-core=1
#SBATCH --threads-per-core=1
#SBATCH --mem=10G
#SBATCH --time 00:01:00
source ./load_modules.sh
srun ./mpi_pi_gpu
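To run this, assuming the pieces above are saved alongside each other (the Makefile, load_modules.sh, the CUDA+MPI source, and the batch script): build with make, create the output directory referenced by the #SBATCH -o/-e lines, and submit the batch script with sbatch. With the settings shown, srun launches two ranks on a single node with two GPUs, and stdout/stderr land in output/<jobid>.out and output/<jobid>.err.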