//
// mpithreadpi.cpp
// Neil Gershenfeld 3/3/20
// calculation of pi by an MPI thread sum
// pi = 3.14159265358979323846 
//
#include <iostream>
#include <chrono>
#include <thread>
#include <vector>
#include <cstdint>
#include <mpi.h>
#define nloop 10
#define npts 1e9
unsigned int nthreads = std::thread::hardware_concurrency();
std::vector<double> results(nthreads);
void partialsum(int index, int rank) {
   uint64_t start = npts*index+npts*nthreads*rank+1;
   uint64_t end = npts*(index+1)+npts*nthreads*rank+1;
   double result = 0;
   for (uint64_t i = start; i < end; ++i)
      result += 0.5/((i-0.75)*(i-0.25));
   results[index] = result;
   }
int main(int argc, char** argv) {
   double sum,pi,mflops,max;
   int i,j,rank,nproc;
   std::thread threads[nthreads];
   MPI_Init(&argc,&argv);
   MPI_Comm_rank(MPI_COMM_WORLD,&rank);
   MPI_Comm_size(MPI_COMM_WORLD,&nproc);
   if (rank == 0) {
      for (j = 0; j < nloop; ++j) {
         MPI_Barrier(MPI_COMM_WORLD);
         auto tstart = std::chrono::high_resolution_clock::now();        
         sum = 0.0;
         for (int i = 0; i < nthreads; ++i)
            threads[i] = std::thread(partialsum,i,rank);
         for (int i = 0; i < nthreads; ++i) {
            threads[i].join();
            sum += results[i];
            }
         MPI_Reduce(&sum,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
         auto tend = std::chrono::high_resolution_clock::now();        
         auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
         auto mflops = npts*nthreads*nproc*5.0/dt;
         if (mflops > max) max = mflops;
         std::cout << "npts: " << npts << " nthreads: " << nthreads
            << " nproc: " << nproc << " pi: " << pi << '\n';
         std::cout << "time: " << 1e-6*dt << " estimated MFlops: " << mflops << " max: " << max << '\n';
         }
      }
   else {
      for (j = 0; j < nloop; ++j) {
         MPI_Barrier(MPI_COMM_WORLD);
         sum = 0.0;
         for (int i = 0; i < nthreads; ++i)
            threads[i] = std::thread(partialsum,i,rank);
         for (int i = 0; i < nthreads; ++i) {
            threads[i].join();
            sum += results[i];
            }
         MPI_Reduce(&sum,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
         }
      }
   MPI_Finalize();
   return 0;
   }