Commit d935c24d authored by Neil Gershenfeld

wip

parent 29d3f0dd
//
// cudampipi.cu
// Neil Gershenfeld 10/14/20
// calculation of pi by a CUDA+MPI sum
// assumes one GPU per MPI rank
// pi = 3.14159265358979323846
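// each term 0.5/((j-0.75)*(j-0.25)) equals 4/(4j-3)-4/(4j-1), one pair of Leibniz series terms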
//
#include <iostream>
#include <cstdio>
#include <cstdint>
#include <mpi.h>
//
using namespace std;
//
uint64_t blocks = 1024;
uint64_t threads = 1024;
uint64_t npts = 1000000;
uint64_t nthreads = blocks*threads;
int nloop = 10;
//
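// init: each GPU thread accumulates npts consecutive series terms,
// offset by rank so every thread across all ranks covers a disjoint index range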
__global__ void init(double *arr,uint64_t npts,uint64_t nthreads,int rank) {
   uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
   uint64_t start = npts*index+npts*nthreads*rank+1;
   uint64_t end = npts*(index+1)+npts*nthreads*rank+1;
   arr[index] = 0;
   for (uint64_t j = start; j < end; ++j)
      arr[index] += 0.5/((j-0.75)*(j-0.25));
   }
//
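// reduce_sum: one pairwise reduction step, folding the upper half of arr onto the lower half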
__global__ void reduce_sum(double *arr,uint64_t len) {
   uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
   if (index < len)
      arr[index] += arr[index+len];
   }
//
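// reduce: halve the active length each step until the per-rank partial sum is left in arr[0]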
void reduce(double *arr) {
   uint64_t len = nthreads >> 1;
   while (1) {
      reduce_sum<<<blocks,threads>>>(arr,len);
      len = len >> 1;
      if (len == 0)
         return;
      }
   }
//
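// main: each rank runs nloop timed iterations; per-rank sums are combined with
// MPI_Reduce, and rank 0 reports pi and the estimated throughput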
int main(int argc, char** argv) {
   int rank,nranks;
   MPI_Init(&argc,&argv);
   MPI_Comm_rank(MPI_COMM_WORLD,&rank);
   MPI_Comm_size(MPI_COMM_WORLD,&nranks);
   double *arr,result,pi;
   cudaMalloc(&arr,nthreads*sizeof(double));
   if (rank == 0) {
      for (int i = 0; i < nloop; ++i) {
         MPI_Barrier(MPI_COMM_WORLD);
         double tstart = MPI_Wtime();
         init<<<blocks,threads>>>(arr,npts,nthreads,rank);
         reduce(arr);
         cudaDeviceSynchronize();
         cudaMemcpy(&result,arr,sizeof(double),cudaMemcpyDeviceToHost);
         MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
         double tend = MPI_Wtime();
         double dt = tend-tstart;
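         // throughput estimate assumes 5 floating-point operations per series term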
         double gflops = npts*nthreads*nranks*5.0/dt/1e9;
         printf("npts = %lu, nthreads = %lu, nranks = %d, pi = %lf\n",npts,nthreads,nranks,pi);
         printf("time = %f, estimated GFlops = %f\n",dt,gflops);
         }
      }
   else {
      for (int i = 0; i < nloop; ++i) {
         MPI_Barrier(MPI_COMM_WORLD);
         init<<<blocks,threads>>>(arr,npts,nthreads,rank);
         reduce(arr);
         cudaDeviceSynchronize();
         cudaMemcpy(&result,arr,sizeof(double),cudaMemcpyDeviceToHost);
         MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
         }
      }
   cudaFree(arr);
   MPI_Finalize();
   return 0;
   }
@@ -3,6 +3,7 @@
|estimated GFlops|code|description|system|date|
|---|---|---|---|---|
|2,493,988|[cudampipi.cu](CUDA/cudampipi.cu)|C++, CUDA+MPI, 256 nodes, 1536 ranks<br>nvcc -arch=sm_70 -std=c++11|Oak Ridge OLCF Summit<br>IBM POWER9+NVIDIA V100|October 15, 2020|
|88,333|[mpimppi.c](hybrid/mpimppi.c)|C, MPI+OpenMP, 1024 nodes, 64 cores/node, 4 threads/core<br>cc mpimppi.c -o mpimppi -O3 -ffast-math -fopenmp|Argonne ALCF Theta<br>Cray XC40|Oct 9, 2019|
|12,589|[cudapit.cu](CUDA/cudapit.cu)|C++, CUDA, 8 GPUs, 5120 cores/GPU|NVIDIA V100|March 3, 2020|
|11,083|[mpithreadpi.cpp](hybrid/mpithreadpi.cpp)|C++, MPI+threads, 128 nodes, 64 cores/node, 4 threads/core<br>CC mpithreadpi.cpp -o mpithreadpi -O3 -ffast-math -std=c++11|Argonne ALCF Theta<br>Cray XC40|Mar 8, 2020|