// // cudapi.cu // Neil Gershenfeld 3/1/20 // calculation of pi by a CUDA sum // pi = 3.14159265358979323846 // #include <iostream> #include <chrono> #include <string> using namespace std; uint64_t blocks = 1024; uint64_t threads = 1024; uint64_t nloop = 1000000; uint64_t npts = blocks*threads; void cudaCheck(string msg) { cudaError err; err = cudaGetLastError(); if (cudaSuccess != err) cerr << msg << ": " << cudaGetErrorString(err) << endl; } __global__ void init(double *arr,uint64_t nloop) { uint64_t i = blockIdx.x*blockDim.x+threadIdx.x; uint64_t start = nloop*i+1; uint64_t end = nloop*(i+1)+1; arr[i] = 0; for (uint64_t j = start; j < end; ++j) arr[i] += 0.5/((j-0.75)*(j-0.25)); } __global__ void reduce_sum(double *arr,uint64_t len) { uint64_t i = blockIdx.x*blockDim.x+threadIdx.x; if (i < len) arr[i] += arr[i+len]; } void reduce(double *arr) { uint64_t len = npts >> 1; while (1) { reduce_sum<<<blocks,threads>>>(arr,len); cudaCheck("reduce_sum"); len = len >> 1; if (len == 0) return; } } int main(void) { double harr[1],*darr; cudaMalloc(&darr,npts*sizeof(double)); cudaCheck("cudaMalloc"); auto tstart = std::chrono::high_resolution_clock::now(); init<<<blocks,threads>>>(darr,nloop); cudaCheck("init"); reduce(darr); cudaDeviceSynchronize(); cudaCheck("cudaDeviceSynchronize"); auto tend = std::chrono::high_resolution_clock::now(); auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count(); auto mflops = npts*nloop*5.0/dt; cudaMemcpy(harr,darr,8,cudaMemcpyDeviceToHost); cudaCheck("cudaMemcpy"); printf("npts = %ld, nloop = %ld, pi = %lf\n",npts,nloop,harr[0]); printf("time = %f, estimated MFlops = %f\n",1e-6*dt,mflops); cudaFree(darr); return 0; }