From 6c99a4d69dff7e2d1c9de284de6f5f90eda40aa6 Mon Sep 17 00:00:00 2001
From: Erik Strand <erik.strand@cba.mit.edu>
Date: Fri, 19 Mar 2021 13:49:15 -0400
Subject: [PATCH] Drop MPI, run single-GPU with larger workload, add bounds check and total timing

---
 pi_gpu/Makefile     |  2 +-
 pi_gpu/pi_gpu.cu    | 46 ++++++++++++++++++++++++---------------------
 pi_gpu/pi_gpu.slurm |  4 ++--
 3 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/pi_gpu/Makefile b/pi_gpu/Makefile
index 39aa56a..caead4f 100644
--- a/pi_gpu/Makefile
+++ b/pi_gpu/Makefile
@@ -1,2 +1,2 @@
 pi_gpu: pi_gpu.cu
-	nvcc $< -o $@
+	nvcc $< -o $@ --use_fast_math
diff --git a/pi_gpu/pi_gpu.cu b/pi_gpu/pi_gpu.cu
index bc84387..6080b2e 100644
--- a/pi_gpu/pi_gpu.cu
+++ b/pi_gpu/pi_gpu.cu
@@ -4,29 +4,30 @@
 // based on a series expansion of pi, but ignoring numeric concerns
 
 #include <chrono>
-#include <cuda_runtime.h>
 #include <iostream>
-#include <mpi.h>
-#include "constants.h"
-#include "kernels.h"
 
 using namespace std;
 
 // currently init_kernel assumes n_terms_per_thread is a multiple of 10
-uint64_t const n_terms_per_thread = 100000;
-uint64_t const n_threads_per_gpu = 1024 * 2;
+uint64_t const n_terms_per_thread = 1000000;
+uint64_t const n_threads_per_gpu = 1024 * 1024;
 uint64_t const n_terms_per_gpu = n_terms_per_thread * n_threads_per_gpu;
-uint64_t const n_threads_per_block = 512;
+uint64_t const n_threads_per_block = 1024;
 uint64_t const n_blocks_per_gpu = (n_threads_per_gpu + n_threads_per_block - 1) / n_threads_per_block;
 
-int const n_loops = 1;
+int const n_loops = 8;
 
 //--------------------------------------------------------------------------------------------------
 __global__
-void init_kernel(double *arr, int gpu_idx) {
+void init_kernel(double *arr) {
     uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    uint64_t const start = n_terms_per_gpu * gpu_idx + n_terms_per_thread * thread_idx + 1;
-    uint64_t const end = n_terms_per_gpu * (gpu_idx + 1) + n_terms_per_thread * thread_idx + 1;
+    if (thread_idx >= n_threads_per_gpu) {
+        return;
+    }
+
+    uint64_t const start = n_terms_per_thread * thread_idx + 1;
+    uint64_t const end = start + n_terms_per_thread;
+
     double sum = 0.0;
     uint64_t i = start;
     while (i < end) {
@@ -49,8 +50,8 @@ void reduce_sum_kernel(double *arr, uint64_t stride) {
 }
 
 //--------------------------------------------------------------------------------------------------
-void init(double *arr, int gpu_idx) {
-    init_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, gpu_idx);
+void init(double *arr) {
+    init_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr);
 }
 
 //--------------------------------------------------------------------------------------------------
@@ -72,15 +73,16 @@ int main(int argc, char** argv) {
     double result;
 
     // timing data
+    decltype(std::chrono::high_resolution_clock::now()) global_start;
+    decltype(std::chrono::high_resolution_clock::now()) global_stop;
     decltype(std::chrono::high_resolution_clock::now()) start;
     decltype(std::chrono::high_resolution_clock::now()) stop;
 
+    global_start = std::chrono::high_resolution_clock::now();
     for (int i = 0; i < n_loops; ++i) {
-        if (rank == 0) {
-            start = std::chrono::high_resolution_clock::now();
-        }
+        start = std::chrono::high_resolution_clock::now();
 
-        init(d_arr, gpu_idx);
+        init(d_arr);
         reduce(d_arr);
         cudaDeviceSynchronize();
         cudaMemcpy(&result, d_arr, sizeof(double), cudaMemcpyDeviceToHost);
@@ -88,16 +90,18 @@ int main(int argc, char** argv) {
         stop = std::chrono::high_resolution_clock::now();
         auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
         auto const millis = duration.count();
-        auto const n_terms_total = n_tasks * n_terms_per_gpu;
-        auto const gflops = n_terms_total * 5.0 / (millis * 1e-3) * 1e-9;
+        auto const gflops = n_terms_per_gpu * 5.0 / (millis * 1e-3) * 1e-9;
 
         std::cout << "loop " << i << '\n';
-        std::cout << "processes = " << n_tasks << ", terms per GPU = " << n_terms_per_gpu
-            << ", total terms = " << n_terms_total << '\n';
+        std::cout << "processes = " << 1 << ", terms per GPU = " << n_terms_per_gpu << '\n';
         std::cout << "time = " << millis * 1e-3 << "s, estimated GFlops = " << gflops << '\n';
         std::cout << "pi ~ " << result << '\n';
         std::cout << '\n';
     }
+    global_stop = std::chrono::high_resolution_clock::now();
+    auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(global_stop - global_start);
+    auto const millis = duration.count();
+    std::cout << "total time = " << millis * 1e-3 << "s\n";
 
     cudaFree(d_arr);
     return 0;
diff --git a/pi_gpu/pi_gpu.slurm b/pi_gpu/pi_gpu.slurm
index 492e0d4..07c78d0 100644
--- a/pi_gpu/pi_gpu.slurm
+++ b/pi_gpu/pi_gpu.slurm
@@ -8,8 +8,8 @@
 #SBATCH --cpus-per-task=1
 #SBATCH --ntasks-per-core=1
 #SBATCH --threads-per-core=1
-#SBATCH --mem=100M
-#SBATCH --time 00:01:00
+#SBATCH --mem=1G
+#SBATCH --time 00:05:00
 
 source ./load_modules.sh
 srun ./pi_gpu
-- 
GitLab