From 6c99a4d69dff7e2d1c9de284de6f5f90eda40aa6 Mon Sep 17 00:00:00 2001 From: Erik Strand <erik.strand@cba.mit.edu> Date: Fri, 19 Mar 2021 13:49:15 -0400 Subject: [PATCH] Fix things --- pi_gpu/Makefile | 2 +- pi_gpu/pi_gpu.cu | 46 ++++++++++++++++++++++++--------------------- pi_gpu/pi_gpu.slurm | 4 ++-- 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/pi_gpu/Makefile b/pi_gpu/Makefile index 39aa56a..caead4f 100644 --- a/pi_gpu/Makefile +++ b/pi_gpu/Makefile @@ -1,2 +1,2 @@ pi_gpu: pi_gpu.cu - nvcc $< -o $@ + nvcc $< -o $@ --use_fast_math diff --git a/pi_gpu/pi_gpu.cu b/pi_gpu/pi_gpu.cu index bc84387..6080b2e 100644 --- a/pi_gpu/pi_gpu.cu +++ b/pi_gpu/pi_gpu.cu @@ -4,29 +4,30 @@ // based on a series expansion of pi, but ignoring numeric concerns #include <chrono> -#include <cuda_runtime.h> #include <iostream> -#include <mpi.h> -#include "constants.h" -#include "kernels.h" using namespace std; // currently init_kernel assumes n_terms_per_thread is a multiple of 10 -uint64_t const n_terms_per_thread = 100000; -uint64_t const n_threads_per_gpu = 1024 * 2; +uint64_t const n_terms_per_thread = 1000000; +uint64_t const n_threads_per_gpu = 1024 * 1024; uint64_t const n_terms_per_gpu = n_terms_per_thread * n_threads_per_gpu; -uint64_t const n_threads_per_block = 512; +uint64_t const n_threads_per_block = 1024; uint64_t const n_blocks_per_gpu = (n_threads_per_gpu + n_threads_per_block - 1) / n_threads_per_block; -int const n_loops = 1; +int const n_loops = 8; //-------------------------------------------------------------------------------------------------- __global__ -void init_kernel(double *arr, int gpu_idx) { +void init_kernel(double *arr) { uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; - uint64_t const start = n_terms_per_gpu * gpu_idx + n_terms_per_thread * thread_idx + 1; - uint64_t const end = n_terms_per_gpu * (gpu_idx + 1) + n_terms_per_thread * thread_idx + 1; + if (thread_idx >= n_threads_per_gpu) { + return; + } + + uint64_t const start = n_terms_per_thread * thread_idx + 1; + uint64_t const end = start + n_terms_per_thread; + double sum = 0.0; uint64_t i = start; while (i < end) { @@ -49,8 +50,8 @@ void reduce_sum_kernel(double *arr, uint64_t stride) { } //-------------------------------------------------------------------------------------------------- -void init(double *arr, int gpu_idx) { - init_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, gpu_idx); +void init(double *arr) { + init_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr); } //-------------------------------------------------------------------------------------------------- @@ -72,15 +73,16 @@ int main(int argc, char** argv) { double result; // timing data + decltype(std::chrono::high_resolution_clock::now()) global_start; + decltype(std::chrono::high_resolution_clock::now()) global_stop; decltype(std::chrono::high_resolution_clock::now()) start; decltype(std::chrono::high_resolution_clock::now()) stop; + global_start = std::chrono::high_resolution_clock::now(); for (int i = 0; i < n_loops; ++i) { - if (rank == 0) { - start = std::chrono::high_resolution_clock::now(); - } + start = std::chrono::high_resolution_clock::now(); - init(d_arr, gpu_idx); + init(d_arr); reduce(d_arr); cudaDeviceSynchronize(); cudaMemcpy(&result, d_arr, sizeof(double), cudaMemcpyDeviceToHost); @@ -88,16 +90,18 @@ int main(int argc, char** argv) { stop = std::chrono::high_resolution_clock::now(); auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start); auto const millis = duration.count(); - auto const n_terms_total = n_tasks * n_terms_per_gpu; - auto const gflops = n_terms_total * 5.0 / (millis * 1e-3) * 1e-9; + auto const gflops = n_terms_per_gpu * 5.0 / (millis * 1e-3) * 1e-9; std::cout << "loop " << i << '\n'; - std::cout << "processes = " << n_tasks << ", terms per GPU = " << n_terms_per_gpu - << ", total terms = " << n_terms_total << '\n'; + std::cout << "processes = " << 1 << ", terms per GPU = " << n_terms_per_gpu << '\n'; std::cout << "time = " << millis * 1e-3 << "s, estimated GFlops = " << gflops << '\n'; std::cout << "pi ~ " << result << '\n'; std::cout << '\n'; } + global_stop = std::chrono::high_resolution_clock::now(); + auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(global_stop - global_start); + auto const millis = duration.count(); + std::cout << "total time = " << millis * 1e-3 << "s\n"; cudaFree(d_arr); return 0; diff --git a/pi_gpu/pi_gpu.slurm b/pi_gpu/pi_gpu.slurm index 492e0d4..07c78d0 100644 --- a/pi_gpu/pi_gpu.slurm +++ b/pi_gpu/pi_gpu.slurm @@ -8,8 +8,8 @@ #SBATCH --cpus-per-task=1 #SBATCH --ntasks-per-core=1 #SBATCH --threads-per-core=1 -#SBATCH --mem=100M -#SBATCH --time 00:01:00 +#SBATCH --mem=1G +#SBATCH --time 00:05:00 source ./load_modules.sh srun ./pi_gpu -- GitLab