Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
//
// cudapip.cu
// Neil Gershenfeld 7/14/20
// calculation of pi by a CUDA multi-GPU peer sum
// pi = 3.14159265358979323846
//
#include <iostream>
#include <chrono>
#include <cstdint>
uint64_t blocks = 1024;
uint64_t threads = 1024;
uint64_t nloop = 1000000;
uint64_t npts = blocks*threads;
using namespace std;
__global__ void init(double *arr,uint64_t nloop,uint64_t npts,int ngpus,int index) {
uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
uint64_t start = nloop*i+npts*nloop*index+1;
uint64_t end = nloop*(i+1)+npts*nloop*index+1;
arr[i+index*npts] = 0;
for (uint64_t j = start; j < end; ++j)
arr[i+index*npts] += 0.5/((j-0.75)*(j-0.25));
}
void cudaCheck(string msg) {
cudaError err;
err = cudaGetLastError();
if (cudaSuccess != err)
cerr << msg << ": " << cudaGetErrorString(err) << endl;
}
int main(void) {
double *arr,*darr;
int ngpus;
cudaGetDeviceCount(&ngpus);
arr = new double[ngpus*npts];
cudaSetDevice(0);
cudaMalloc(&darr,ngpus*npts*sizeof(double));
for (int i = 1; i < ngpus; ++i) {
cudaSetDevice(i);
cudaDeviceEnablePeerAccess(0,0);
cudaCheck("peer access");
}
auto tstart = chrono::high_resolution_clock::now();
for (int i = 0; i < ngpus; ++i) {
cudaSetDevice(i);
init<<<blocks,threads>>>(darr,nloop,npts,ngpus,i);
}
for (int i = 1; i < ngpus; ++i) {
cudaSetDevice(i);
cudaDeviceSynchronize();
}
cudaSetDevice(0);
cudaMemcpy(arr,darr,ngpus*npts*sizeof(double),cudaMemcpyDeviceToHost);
double pi = 0;
for (int i = 0; i < ngpus*npts; ++i)
pi += arr[i];
auto tend = chrono::high_resolution_clock::now();
auto dt = chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
auto gflops = npts*nloop*ngpus*5.0/dt/1e3;
std::cout << "npts: " << npts << " nloop: " << nloop << " ngpus: " << ngpus << " pi: " << pi << '\n';
std::cout << "time: " << 1e-6*dt << " estimated GFlops: " << gflops << '\n';
return 0;
}