Skip to content
Snippets Groups Projects
Commit 6c99a4d6 authored by Erik Strand's avatar Erik Strand
Browse files

Fix things

parent 08ca1b59
Branches pi
No related tags found
No related merge requests found
# Build rule: compile the CUDA pi estimator with nvcc.
pi_gpu: pi_gpu.cu
# NOTE(review): diff residue — the first recipe line below is the pre-change
# version and the second is the post-change version (adds --use_fast_math,
# which trades ULP accuracy for speed). A real Makefile keeps only one.
nvcc $< -o $@
nvcc $< -o $@ --use_fast_math
......@@ -4,29 +4,30 @@
// based on a series expansion of pi, but ignoring numeric concerns
#include <chrono>
#include <cuda_runtime.h>
#include <iostream>
#include <mpi.h>
#include "constants.h"
#include "kernels.h"
using namespace std;
// currently init_kernel assumes n_terms_per_thread is a multiple of 10
// NOTE(review): diff residue — each constant below appears twice: the first
// occurrence is the pre-change value, the second the post-change value.
// A compilable file must keep exactly one definition of each.
uint64_t const n_terms_per_thread = 100000;
uint64_t const n_threads_per_gpu = 1024 * 2;
uint64_t const n_terms_per_thread = 1000000;
uint64_t const n_threads_per_gpu = 1024 * 1024;
// Total number of series terms processed by one GPU per loop iteration.
uint64_t const n_terms_per_gpu = n_terms_per_thread * n_threads_per_gpu;
uint64_t const n_threads_per_block = 512;
uint64_t const n_threads_per_block = 1024;
// Ceiling division so every thread is covered even when n_threads_per_gpu
// is not a multiple of n_threads_per_block.
uint64_t const n_blocks_per_gpu = (n_threads_per_gpu + n_threads_per_block - 1) / n_threads_per_block;
int const n_loops = 1;
int const n_loops = 8;
//--------------------------------------------------------------------------------------------------
__global__
void init_kernel(double *arr, int gpu_idx) {
void init_kernel(double *arr) {
uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
uint64_t const start = n_terms_per_gpu * gpu_idx + n_terms_per_thread * thread_idx + 1;
uint64_t const end = n_terms_per_gpu * (gpu_idx + 1) + n_terms_per_thread * thread_idx + 1;
if (thread_idx >= n_threads_per_gpu) {
return;
}
uint64_t const start = n_terms_per_thread * thread_idx + 1;
uint64_t const end = start + n_terms_per_thread;
double sum = 0.0;
uint64_t i = start;
while (i < end) {
......@@ -49,8 +50,8 @@ void reduce_sum_kernel(double *arr, uint64_t stride) {
}
//--------------------------------------------------------------------------------------------------
// Host-side wrapper that launches init_kernel over the full per-GPU grid.
// NOTE(review): diff residue — both the pre-change signature (taking gpu_idx,
// used to offset this GPU's slice of the series) and the post-change
// single-argument signature are shown interleaved; only one is current.
void init(double *arr, int gpu_idx) {
init_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, gpu_idx);
void init(double *arr) {
init_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr);
}
//--------------------------------------------------------------------------------------------------
......@@ -72,15 +73,16 @@ int main(int argc, char** argv) {
double result;
// timing data
decltype(std::chrono::high_resolution_clock::now()) global_start;
decltype(std::chrono::high_resolution_clock::now()) global_stop;
decltype(std::chrono::high_resolution_clock::now()) start;
decltype(std::chrono::high_resolution_clock::now()) stop;
global_start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < n_loops; ++i) {
if (rank == 0) {
start = std::chrono::high_resolution_clock::now();
}
init(d_arr, gpu_idx);
init(d_arr);
reduce(d_arr);
cudaDeviceSynchronize();
cudaMemcpy(&result, d_arr, sizeof(double), cudaMemcpyDeviceToHost);
......@@ -88,16 +90,18 @@ int main(int argc, char** argv) {
stop = std::chrono::high_resolution_clock::now();
auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
auto const millis = duration.count();
auto const n_terms_total = n_tasks * n_terms_per_gpu;
auto const gflops = n_terms_total * 5.0 / (millis * 1e-3) * 1e-9;
auto const gflops = n_terms_per_gpu * 5.0 / (millis * 1e-3) * 1e-9;
std::cout << "loop " << i << '\n';
std::cout << "processes = " << n_tasks << ", terms per GPU = " << n_terms_per_gpu
<< ", total terms = " << n_terms_total << '\n';
std::cout << "processes = " << 1 << ", terms per GPU = " << n_terms_per_gpu << '\n';
std::cout << "time = " << millis * 1e-3 << "s, estimated GFlops = " << gflops << '\n';
std::cout << "pi ~ " << result << '\n';
std::cout << '\n';
}
global_stop = std::chrono::high_resolution_clock::now();
auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(global_stop - global_start);
auto const millis = duration.count();
std::cout << "total time = " << millis * 1e-3 << "s\n";
cudaFree(d_arr);
return 0;
......
......@@ -8,8 +8,8 @@
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-core=1
#SBATCH --threads-per-core=1
# NOTE(review): diff residue — the next four lines are two old/new pairs:
# pre-change requested 100M memory and a 1-minute limit; post-change raises
# these to 1G and 5 minutes. A real script keeps only one of each directive.
#SBATCH --mem=100M
#SBATCH --time 00:01:00
#SBATCH --mem=1G
#SBATCH --time 00:05:00
# Load the cluster's environment modules, then run the GPU pi estimator.
source ./load_modules.sh
srun ./pi_gpu
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment