Commit 6c99a4d6 authored by Erik Strand

Fix things

parent 08ca1b59
Makefile

 pi_gpu: pi_gpu.cu
-	nvcc $< -o $@
+	nvcc $< -o $@ --use_fast_math
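For reference, --use_fast_math is nvcc's fast-math bundle; per the nvcc documentation it implies the individual options shown below, so the new rule is roughly equivalent to:

    nvcc pi_gpu.cu -o pi_gpu --ftz=true --prec-div=false --prec-sqrt=false --fmad=true

These options mostly relax single-precision semantics (denormal flushing, approximate division and square root), so for a kernel that accumulates in double precision the speedup may be modest.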
pi_gpu.cu

@@ -4,29 +4,30 @@
 // based on a series expansion of pi, but ignoring numeric concerns
 #include <chrono>
+#include <cuda_runtime.h>
 #include <iostream>
-#include <mpi.h>
-#include "constants.h"
-#include "kernels.h"
 
 using namespace std;
 
 // currently init_kernel assumes n_terms_per_thread is a multiple of 10
-uint64_t const n_terms_per_thread = 100000;
-uint64_t const n_threads_per_gpu = 1024 * 2;
+uint64_t const n_terms_per_thread = 1000000;
+uint64_t const n_threads_per_gpu = 1024 * 1024;
 uint64_t const n_terms_per_gpu = n_terms_per_thread * n_threads_per_gpu;
 
-uint64_t const n_threads_per_block = 512;
+uint64_t const n_threads_per_block = 1024;
 uint64_t const n_blocks_per_gpu = (n_threads_per_gpu + n_threads_per_block - 1) / n_threads_per_block;
 
-int const n_loops = 1;
+int const n_loops = 8;
 
 //--------------------------------------------------------------------------------------------------
 __global__
-void init_kernel(double *arr, int gpu_idx) {
+void init_kernel(double *arr) {
     uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    uint64_t const start = n_terms_per_gpu * gpu_idx + n_terms_per_thread * thread_idx + 1;
-    uint64_t const end = n_terms_per_gpu * (gpu_idx + 1) + n_terms_per_thread * thread_idx + 1;
+    if (thread_idx >= n_threads_per_gpu) {
+        return;
+    }
+    uint64_t const start = n_terms_per_thread * thread_idx + 1;
+    uint64_t const end = start + n_terms_per_thread;
 
     double sum = 0.0;
     uint64_t i = start;
     while (i < end) {
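The loop body is truncated in this view. For orientation only, here is a minimal sketch of what a body like this could look like, assuming the Leibniz series pi/4 = 1 - 1/3 + 1/5 - ... (an assumption; the actual series and unrolling are not shown in the diff):

    // hypothetical body, assuming the Leibniz series; not the author's exact code
    while (i < end) {
        // handle terms in pairs so the alternating sign is explicit
        sum += 1.0 / (2.0 * i - 1.0);   // positive term
        sum -= 1.0 / (2.0 * i + 1.0);   // negative term
        i += 2;
    }
    arr[thread_idx] = sum;              // per-thread partial sum; the factor of 4 would be applied after reduction

Under this reading, the "multiple of 10" restriction in the comment above would come from unrolling the loop by a fixed factor.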
@@ -49,8 +50,8 @@ void reduce_sum_kernel(double *arr, uint64_t stride) {
 }
 
 //--------------------------------------------------------------------------------------------------
-void init(double *arr, int gpu_idx) {
-    init_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, gpu_idx);
+void init(double *arr) {
+    init_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr);
 }
 
 //--------------------------------------------------------------------------------------------------
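Only the hunk header above reveals reduce_sum_kernel's signature; its body and the reduce() wrapper are not part of this diff. A plausible sketch of a pairwise tree reduction matching that signature (hypothetical, for orientation only):

    // hypothetical: fold the top half of the array onto the bottom half
    __global__
    void reduce_sum_kernel(double *arr, uint64_t stride) {
        uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (thread_idx < stride) {
            arr[thread_idx] += arr[thread_idx + stride];
        }
    }

    // hypothetical host wrapper: halve the stride until one value remains in arr[0]
    void reduce(double *arr) {
        for (uint64_t stride = n_threads_per_gpu / 2; stride > 0; stride /= 2) {
            uint64_t const n_blocks = (stride + n_threads_per_block - 1) / n_threads_per_block;
            reduce_sum_kernel<<<n_blocks, n_threads_per_block>>>(arr, stride);
        }
    }

This works cleanly because n_threads_per_gpu is a power of two, and it is consistent with main() reading the final sum from d_arr[0] via cudaMemcpy below.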
@@ -72,15 +73,16 @@ int main(int argc, char** argv) {
     double result;
 
     // timing data
+    decltype(std::chrono::high_resolution_clock::now()) global_start;
+    decltype(std::chrono::high_resolution_clock::now()) global_stop;
     decltype(std::chrono::high_resolution_clock::now()) start;
     decltype(std::chrono::high_resolution_clock::now()) stop;
 
+    global_start = std::chrono::high_resolution_clock::now();
     for (int i = 0; i < n_loops; ++i) {
-        if (rank == 0) {
-            start = std::chrono::high_resolution_clock::now();
-        }
+        start = std::chrono::high_resolution_clock::now();
 
-        init(d_arr, gpu_idx);
+        init(d_arr);
         reduce(d_arr);
         cudaDeviceSynchronize();
         cudaMemcpy(&result, d_arr, sizeof(double), cudaMemcpyDeviceToHost);
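Worth noting why the explicit cudaDeviceSynchronize() matters for the timing: kernel launches return to the host immediately, so without a synchronization point the host clock would measure only launch overhead. Schematically:

    start = std::chrono::high_resolution_clock::now();
    init(d_arr);                 // asynchronous launch, returns immediately
    reduce(d_arr);               // also asynchronous
    cudaDeviceSynchronize();     // host blocks here until the GPU finishes
    stop = std::chrono::high_resolution_clock::now();   // stop - start now covers the kernels

(The blocking cudaMemcpy would also act as a synchronization point, but the explicit call makes the intent clear.)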
@@ -88,16 +90,18 @@ int main(int argc, char** argv) {
         stop = std::chrono::high_resolution_clock::now();
         auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
         auto const millis = duration.count();
-        auto const n_terms_total = n_tasks * n_terms_per_gpu;
-        auto const gflops = n_terms_total * 5.0 / (millis * 1e-3) * 1e-9;
+        auto const gflops = n_terms_per_gpu * 5.0 / (millis * 1e-3) * 1e-9;
 
         std::cout << "loop " << i << '\n';
-        std::cout << "processes = " << n_tasks << ", terms per GPU = " << n_terms_per_gpu
-            << ", total terms = " << n_terms_total << '\n';
+        std::cout << "processes = " << 1 << ", terms per GPU = " << n_terms_per_gpu << '\n';
         std::cout << "time = " << millis * 1e-3 << "s, estimated GFlops = " << gflops << '\n';
         std::cout << "pi ~ " << result << '\n';
         std::cout << '\n';
     }
+    global_stop = std::chrono::high_resolution_clock::now();
+    auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(global_stop - global_start);
+    auto const millis = duration.count();
+    std::cout << "total time = " << millis * 1e-3 << "s\n";
 
     cudaFree(d_arr);
     return 0;
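On the GFlops estimate: the 5.0 is an assumed flop count per series term. With the new constants, n_terms_per_gpu = 1,000,000 × (1024 × 1024) ≈ 1.05 × 10^12 terms, so each loop performs roughly 5.2 × 10^12 floating-point operations; a loop that takes t seconds therefore reports about 5243 / t GFlops.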
Slurm job script

@@ -8,8 +8,8 @@
 #SBATCH --cpus-per-task=1
 #SBATCH --ntasks-per-core=1
 #SBATCH --threads-per-core=1
-#SBATCH --mem=100M
-#SBATCH --time 00:01:00
+#SBATCH --mem=1G
+#SBATCH --time 00:05:00
 
 source ./load_modules.sh
 srun ./pi_gpu
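The resource bumps track the larger run: per-loop work grows by a factor of 10 × 512 = 5120 (terms per thread ×10, threads per GPU ×512) and n_loops goes from 1 to 8, hence the time limit moving from one minute to five. The partial-sum array itself is only n_threads_per_gpu doubles ≈ 8 MiB, and it lives in device memory anyway, so --mem=1G (host memory) is presumably just headroom.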