//
// cudampipi.cu
// Neil Gershenfeld 10/14/20
// calculation of pi by a CUDA+MPI sum
// assumes one GPU per MPI rank
// pi = 3.14159265358979323846 
//
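// math note: each term 0.5/((j-0.75)*(j-0.25)) equals 4/(4*j-3)-4/(4*j-1),
// so summing over j = 1,2,3,... gives the Leibniz series
// 4*(1-1/3+1/5-1/7+...) = pi; every thread sums a contiguous block of terms
//
// build/run sketch (one possibility, not verified here; exact flags depend on
// the local CUDA and MPI installs, with mpicxx assumed as the MPI wrapper):
//    nvcc -O3 -ccbin mpicxx cudampipi.cu -o cudampipi
//    mpirun -np <ranks> ./cudampipi
//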
#include <iostream>
#include <cstdio>
#include <cstdint>
#include <mpi.h>
//
using namespace std;
//
uint64_t blocks = 1024; // thread blocks per kernel launch
uint64_t threads = 1024; // threads per block
uint64_t npts = 1000000; // series terms summed by each thread
uint64_t nthreads = blocks*threads; // total threads (= partial sums) per rank
int nloop = 10; // number of timed repetitions
//
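// init: one partial sum per GPU thread; thread "index" on MPI rank "rank"
// accumulates the npts consecutive series terms starting at
// j = npts*index + npts*nthreads*rank + 1, so threads and ranks cover
// disjoint blocks of the series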
__global__ void init(double *arr,uint64_t npts,uint64_t nthreads,int rank) {
   uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
   uint64_t start = npts*index+npts*nthreads*rank+1;
   uint64_t end = npts*(index+1)+npts*nthreads*rank+1;
   arr[index] = 0;
   for (uint64_t j = start; j < end; ++j)
      arr[index] += 0.5/((j-0.75)*(j-0.25));
   }
//
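// reduce_sum: one step of a pairwise tree reduction; each of the first len
// elements absorbs its partner len slots away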
__global__ void reduce_sum(double *arr,uint64_t len) {
   uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
   if (index < len)
      arr[index] += arr[index+len];
   }
//
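// reduce: tree-reduce the nthreads partial sums on the device by halving the
// active length each step until arr[0] holds the total; successive launches on
// the default stream execute in order, so no synchronization is needed between
// steps (relies on nthreads being a power of two)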
void reduce(double *arr) {
   uint64_t len = nthreads >> 1;
   while (len > 0) {
      reduce_sum<<<blocks,threads>>>(arr,len);
      len >>= 1;
      }
   }
//
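// main: each rank fills and reduces its own device array, copies its partial
// sum back to the host, and MPI_Reduce adds the per-rank sums on rank 0;
// only rank 0 times the loop and prints the results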
int main(int argc, char** argv) {
   int rank,nranks;
   MPI_Init(&argc,&argv);
   MPI_Comm_rank(MPI_COMM_WORLD,&rank);
   MPI_Comm_size(MPI_COMM_WORLD,&nranks);
   double *arr,result,pi;
   cudaMalloc(&arr,nthreads*sizeof(double));
   if (rank == 0) {
      for (int i = 0; i < nloop; ++i) {
         MPI_Barrier(MPI_COMM_WORLD);
         double tstart = MPI_Wtime();
         init<<<blocks,threads>>>(arr,npts,nthreads,rank);
         reduce(arr);
         cudaDeviceSynchronize();
         cudaMemcpy(&result,arr,sizeof(double),cudaMemcpyDeviceToHost); // arr[0] holds this rank's partial sum
         MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
         double tend = MPI_Wtime();
         double dt = tend-tstart;
         double gflops = npts*nthreads*nranks*5.0/dt/1e9; // ~5 flops per series term
         printf("npts = %llu, nthreads = %llu, nranks = %d, pi = %lf\n",(unsigned long long)npts,(unsigned long long)nthreads,nranks,pi);
         printf("time = %f, estimated GFlops = %f\n",dt,gflops);
         }
      }
   else {
      for (int i = 0; i < nloop; ++i) {
         MPI_Barrier(MPI_COMM_WORLD);
         init<<<blocks,threads>>>(arr,npts,nthreads,rank);
         reduce(arr);
         cudaDeviceSynchronize();
         cudaMemcpy(&result,arr,sizeof(double),cudaMemcpyDeviceToHost);
         MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
         }
      }
   cudaFree(arr);
   MPI_Finalize();
   return 0;
   }