Commit e300c310 authored by Neil Gershenfeld

wip

parent 21d19cbd
Pipeline #4933 passed
Python/numbapig.py
@@ -1,7 +1,7 @@
 #
 # numbapig.py
 # Neil Gershenfeld 2/9/20
-# calculation of pi by a Numba CUDA sum
+# calculation of pi by a Numba GPU sum
 # pi = 3.14159265358979323846
 #
 from numba import cuda
@@ -20,6 +20,7 @@ NPTS = grid_size*block_size
 def init(arr):
     i = 1+cuda.grid(1)
     arr[i-1] = 0.5/((i-0.75)*(i-0.25))
+    #arr[i-1] = i # for testing reduction
 #
 @cuda.reduce
 def Numba_reduce(a,b):
@@ -39,15 +40,25 @@ def CUDA_reduce(arr,NPTS):
     if (len == 0):
         return
 #
+@cuda.jit
+def CUDA_result(arr,result):
+    i = cuda.grid(1)
+    if (i == 0):
+        result[0] = arr[0]
+#
 # device array
 #
 arr = cuda.device_array(NPTS,np.float32)
+result = cuda.device_array(1,np.float32)
+#arr = cuda.device_array(NPTS,np.int64) # for testing reduction
+#result = cuda.device_array(1,np.int64) # for testing reduction
 #
-# compile kernels
+# compile kernels by calling them
 #
 init[grid_size,block_size](arr)
 pi = Numba_reduce(arr)
 CUDA_reduce(arr,NPTS)
+CUDA_result(arr,result)
 #
 # CUDA kernel array calculation
 #
@@ -94,9 +105,11 @@ print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
 start_time = time.time()
 init[grid_size,block_size](arr)
 CUDA_reduce(arr,NPTS)
+CUDA_result(arr,result)
 end_time = time.time()
-darr = arr.copy_to_host()
+pi = result.copy_to_host()
 mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
 print("both with CUDA kernel reduction:")
-print(" NPTS = %d, pi = %f"%(NPTS,darr[0]))
+print(" NPTS = %d, pi = %f"%(NPTS,pi[0]))
 print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
README.md
@@ -6,6 +6,7 @@
 |88,333|[mpimppi.c](hybrid/mpimppi.c)|C, MPI+OpenMP, 1024 nodes, 64 cores/node, 4 threads/core<br>cc mpimppi.c -o mpimppi -O3 -ffast-math -fopenmp|Argonne ALCF Theta<br>Cray XC40|Oct 9, 2019|
 |2,117|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 10 nodes, 96 cores/node<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019|
 |2,102|[mpipi2.py](Python/mpipi2.py)|Python, Numba, MPI<br>10 nodes, 96 cores/node|Intel 2x Xeon Platinum 8175M|Feb 6, 2020|
+|1,919|[numbapig.py](Python/numbapig.py)|Python, Numba, GPU<br>5120 cores|NVIDIA V100|Feb 9, 2020|
 |315|[numbapip.py](Python/numbapip.py)|Python, Numba, parallel, fastmath<br>96 cores|Intel 2x Xeon Platinum 8175M|Feb 7, 2020|
 |272|[threadpi.c](C/threadpi.c)|C, 96 threads<br>gcc threadpi.c -o threadpi -O3 -ffast-math -pthread|Intel 2x Xeon Platinum 8175M|Jun 3, 2019|
 |211|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 1 node, 96 cores<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019|