diff --git a/mpi_pi_gpu/.gitignore b/mpi_pi_gpu/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..1926ab2490cda6d1b83e58c191d87dc9b1f2879b --- /dev/null +++ b/mpi_pi_gpu/.gitignore @@ -0,0 +1,2 @@ +mpi_pi_gpu +output diff --git a/mpi_pi_gpu/Makefile b/mpi_pi_gpu/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..272faf7674608c4acbffba9328d0678974d1939f --- /dev/null +++ b/mpi_pi_gpu/Makefile @@ -0,0 +1,2 @@ +mpi_pi_gpu: mpi_pi_gpu.c + mpic++ $< -lcudart -o $@ diff --git a/mpi_pi_gpu/load_modules.sh b/mpi_pi_gpu/load_modules.sh new file mode 100755 index 0000000000000000000000000000000000000000..450477e7571b88a7dc52e6a251bcc1c112e4bd35 --- /dev/null +++ b/mpi_pi_gpu/load_modules.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +module purge +module load spack git gcc/7.3.0 cuda/10.1 openmpi/3.1.4-pmi-cuda-ucx diff --git a/mpi_pi_gpu/mpi_pi_gpu.cu b/mpi_pi_gpu/mpi_pi_gpu.cu new file mode 100644 index 0000000000000000000000000000000000000000..3f966243e401318d29bfb82cd465070497b0a5b6 --- /dev/null +++ b/mpi_pi_gpu/mpi_pi_gpu.cu @@ -0,0 +1,122 @@ +// Neil Gershenfeld 10/14/20 +// Erik Strand 3/6/2021 +// calculation of pi by a CUDA+MPI sum +// assumes one GPU per MPI rank + +#include <chrono> +#include <cstdint> +#include <cuda_runtime.h> +#include <iostream> +#include <mpi.h> + +using namespace std; + +uint64_t const n_terms_per_thread = 100000; +uint64_t const n_threads_per_gpu = 1024; +uint64_t const n_terms_per_gpu = n_terms_per_thread * n_threads_per_gpu; +uint64_t const n_threads_per_block = 512; +uint64_t const n_blocks_per_gpu = (n_threads_per_gpu + n_threads_per_block - 1) / n_threads_per_block; + +int nloop = 10; + +__global__ +void init(double *arr, int gpu_idx) { + uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + uint64_t const start = n_terms_per_gpu * gpu_idx + n_terms_per_thread * thread_idx + 1; + uint64_t const end = n_terms_per_gpu * (gpu_idx + 1) + n_terms_per_thread * thread_idx + 1; + double sum = 0.0; + for (uint64_t i = start; i < end; ++i) { + sum += 0.5 / ((i - 0.75) * (i - 0.25)); + } + arr[thread_idx] = sum; +} + +__global__ +void reduce_sum(double *arr, uint64_t stride) { + uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_idx < stride) { + arr[thread_idx] += arr[thread_idx + stride]; + } +} + +void reduce(double *arr) { + uint64_t stride = n_threads_per_gpu >> 1; + while (stride > 0) { + reduce_sum<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, stride); + stride = stride >> 1; + } +} + +int main(int argc, char** argv) { + char* local_rank_str = NULL; + int local_rank = 0; + local_rank_str = getenv("SLURM_LOCALID"); + if (local_rank_str != NULL) { + local_rank = atoi(local_rank_str); + std::cout << "slurm local rank = " << local_rank << '\n'; + } else { + std::cout << "slurm local rank not defined\n"; + } + + int n_gpus; + cudaGetDeviceCount(&n_gpus); + // relative to this node + int local_gpu_idx = local_rank % n_gpus; + //cudaSetDevice(local_gpu_idx); + + int n_tasks, rank; + MPI_Init(&argc, &argv); + + MPI_Comm_size(MPI_COMM_WORLD, &n_tasks); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + // We assume that all nodes have the same number of GPUs. + int gpu_idx = n_gpus * rank + local_gpu_idx; + + int host_name_length; + char host_name[MPI_MAX_PROCESSOR_NAME]; + MPI_Get_processor_name(host_name, &host_name_length); + + std::cout << "task: " << rank << " of " << n_tasks << '\n'; + std::cout << "node: " << host_name << '\n'; + std::cout << "local gpu idx: " << local_gpu_idx << '\n'; + std::cout << "global gpu idx: " << gpu_idx << '\n'; + + /* + int rank,nranks; + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + MPI_Comm_size(MPI_COMM_WORLD,&nranks); + double *arr,result,pi; + cudaMalloc(&arr,nthreads*sizeof(double)); + if (rank == 0) { + for (int i = 0; i < nloop; ++i) { + MPI_Barrier(MPI_COMM_WORLD); + double tstart = MPI_Wtime(); + init<<<blocks,threads>>>(arr,npts,nthreads,rank); + reduce(arr); + cudaDeviceSynchronize(); + cudaMemcpy(&result,arr,8,cudaMemcpyDeviceToHost); + MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + double tend = MPI_Wtime(); + double dt = tend-tstart; + double gflops = npts*nthreads*nranks*5.0/dt/1e9; + printf("npts = %ld, nthreads = %ld, nranks = %d, pi = %lf\n",npts,nthreads,nranks,pi); + printf("time = %f, estimated GFlops = %f\n",dt,gflops); + } + } + else { + for (int i = 0; i < nloop; ++i) { + MPI_Barrier(MPI_COMM_WORLD); + init<<<blocks,threads>>>(arr,npts,nthreads,rank); + reduce(arr); + cudaDeviceSynchronize(); + cudaMemcpy(&result,arr,8,cudaMemcpyDeviceToHost); + MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + } + } + cudaFree(arr); + */ + + MPI_Finalize(); + return 0; +} diff --git a/mpi_pi_gpu/mpi_pi_gpu.slurm b/mpi_pi_gpu/mpi_pi_gpu.slurm new file mode 100644 index 0000000000000000000000000000000000000000..f31083a37325b43d589dd1d312944613fc9c06d6 --- /dev/null +++ b/mpi_pi_gpu/mpi_pi_gpu.slurm @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH -J mpi_pi_gpu +#SBATCH -o output/%j.out +#SBATCH -e output/%j.err +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=2 +#SBATCH --gres=gpu:2 +#SBATCH --cpus-per-task=1 +#SBATCH --ntasks-per-core=1 +#SBATCH --threads-per-core=1 +#SBATCH --mem=10G +#SBATCH --time 00:01:00 + +source ./load_modules.sh +srun ./mpi_pi_gpu