Skip to content
Snippets Groups Projects
cudapip.cu 1.96 KiB
Newer Older
  • Learn to ignore specific revisions
  • Neil Gershenfeld's avatar
    Neil Gershenfeld committed
    //
    // cudapip.cu
    // Neil Gershenfeld 7/14/20
    // calculation of pi by a CUDA multi-GPU peer sum
    // pi = 3.14159265358979323846 
    //
    #include <iostream>
    #include <chrono>
    #include <cstdint>
    uint64_t blocks = 1024;
    uint64_t threads = 1024;
    uint64_t nloop = 1000000;
    uint64_t npts = blocks*threads;
    using namespace std;
    __global__ void init(double *arr,uint64_t nloop,uint64_t npts,int ngpus,int index) {
       uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
       uint64_t start = nloop*i+npts*nloop*index+1;
       uint64_t end = nloop*(i+1)+npts*nloop*index+1;
       arr[i+index*npts] = 0;
       for (uint64_t j = start; j < end; ++j)
          arr[i+index*npts] += 0.5/((j-0.75)*(j-0.25));
       }
    void cudaCheck(string msg) {
       cudaError err;
       err = cudaGetLastError();
       if (cudaSuccess != err)
       cerr << msg << ": " << cudaGetErrorString(err) << endl;
       }
    int main(void) {
       double *arr,*darr;
       int ngpus;
       cudaGetDeviceCount(&ngpus);
       arr = new double[ngpus*npts];
       cudaSetDevice(0);
       cudaMalloc(&darr,ngpus*npts*sizeof(double));
       for (int i = 1; i < ngpus; ++i) {
          cudaSetDevice(i);
          cudaDeviceEnablePeerAccess(0,0);
          cudaCheck("peer access");
          }
       auto tstart = chrono::high_resolution_clock::now();
       for (int i = 0; i < ngpus; ++i) {
          cudaSetDevice(i);
          init<<<blocks,threads>>>(darr,nloop,npts,ngpus,i);
          }
       for (int i = 1; i < ngpus; ++i) {
          cudaSetDevice(i);
          cudaDeviceSynchronize();
          }
       cudaSetDevice(0);
       cudaMemcpy(arr,darr,ngpus*npts*sizeof(double),cudaMemcpyDeviceToHost);
       double pi = 0;
       for (int i = 0; i < ngpus*npts; ++i)
          pi += arr[i];
       auto tend = chrono::high_resolution_clock::now();
    	auto dt = chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
       auto gflops = npts*nloop*ngpus*5.0/dt/1e3;
       std::cout << "npts: " << npts << " nloop: " << nloop << " ngpus: " << ngpus << " pi: " << pi << '\n';
       std::cout << "time: " << 1e-6*dt << " estimated GFlops: " << gflops << '\n';
       return 0;
       }