//
// cudampipi.cu
// Neil Gershenfeld 10/14/20
// calculation of pi by a CUDA+MPI sum
// assumes one GPU per MPI rank
// pi = 3.14159265358979323846
//
#include <cstdio>
#include <cstdint>
#include <cinttypes>
#include <cstdlib>
#include <mpi.h>
//
// Abort loudly on any CUDA runtime error instead of computing garbage
// silently; kernel launches are checked via cudaGetLastError() plus the
// synchronizing calls below.
#define CUDA_CHECK(call)                                                   \
   do {                                                                    \
      cudaError_t err_ = (call);                                           \
      if (err_ != cudaSuccess) {                                           \
         fprintf(stderr,"CUDA error %s:%d: %s\n",__FILE__,__LINE__,        \
            cudaGetErrorString(err_));                                     \
         exit(EXIT_FAILURE);                                               \
      }                                                                    \
   } while (0)
//
uint64_t blocks = 1024;            // grid size for every launch
uint64_t threads = 1024;           // block size for every launch
uint64_t npts = 1000000;           // series terms summed per GPU thread
uint64_t nthreads = blocks*threads; // total GPU threads per rank
int nloop = 10;                    // timed benchmark repetitions
//
// Each thread sums npts consecutive terms of the series
//    pi = sum_{j>=1} 0.5/((j-0.75)*(j-0.25))
// over this rank's slice, writing its partial sum to arr[index].
// Launched as <<<blocks,threads>>> so the grid exactly covers arr
// (nthreads elements) and no bounds guard is needed.
__global__ void init(double *arr,uint64_t npts,uint64_t nthreads,int rank) {
   uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
   uint64_t start = npts*index+npts*nthreads*rank+1;
   uint64_t end = npts*(index+1)+npts*nthreads*rank+1;
   double sum = 0; // accumulate in a register, not global memory, per term
   for (uint64_t j = start; j < end; ++j)
      sum += 0.5/((j-0.75)*(j-0.25));
   arr[index] = sum;
}
//
// One pairwise fold step: arr[i] += arr[i+len] for every i < len.
__global__ void reduce_sum(double *arr,uint64_t len) {
   uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
   if (index < len)
      arr[index] += arr[index+len];
}
//
// Tree-reduce arr[0..nthreads) into arr[0] by repeated halving.
// nthreads is a power of two, so the halving sequence covers every element.
void reduce(double *arr) {
   for (uint64_t len = nthreads >> 1; len > 0; len >>= 1)
      reduce_sum<<<blocks,threads>>>(arr,len);
}
//
// Per rank: fill and reduce the partial sums on the GPU, copy arr[0]
// back, MPI-sum the per-rank partials into pi on rank 0, and (rank 0
// only) report pi, timing, and estimated GFlops (~5 flops per term).
int main(int argc, char** argv) {
   int rank,nranks;
   MPI_Init(&argc,&argv);
   MPI_Comm_rank(MPI_COMM_WORLD,&rank);
   MPI_Comm_size(MPI_COMM_WORLD,&nranks);
   double *arr,result,pi;
   CUDA_CHECK(cudaMalloc(&arr,nthreads*sizeof(double)));
   // Single loop for all ranks; every rank executes the identical MPI
   // call sequence (barrier, launch, reduce, memcpy, MPI_Reduce) — only
   // the reporting below is rank-0 specific.
   for (int i = 0; i < nloop; ++i) {
      MPI_Barrier(MPI_COMM_WORLD);
      double tstart = MPI_Wtime();
      init<<<blocks,threads>>>(arr,npts,nthreads,rank);
      reduce(arr);
      CUDA_CHECK(cudaGetLastError());      // catch bad launch configs
      CUDA_CHECK(cudaDeviceSynchronize()); // catch async kernel faults
      // copy the reduced sum in arr[0] back to the host
      CUDA_CHECK(cudaMemcpy(&result,arr,sizeof(double),cudaMemcpyDeviceToHost));
      MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
      if (rank == 0) {
         double tend = MPI_Wtime();
         double dt = tend-tstart;
         double gflops = npts*nthreads*nranks*5.0/dt/1e9;
         // PRIu64: uint64_t is not portably "%ld" (long is 32-bit on LLP64)
         printf("npts = %" PRIu64 ", nthreads = %" PRIu64 ", nranks = %d, pi = %lf\n",
            npts,nthreads,nranks,pi);
         printf("time = %f, estimated GFlops = %f\n",dt,gflops);
      }
   }
   CUDA_CHECK(cudaFree(arr));
   MPI_Finalize();
   return 0;
}