Project: Erik Strand / satori

Commit 6c99a4d6
authored 4 years ago by Erik Strand

Fix things

parent 08ca1b59
Branch: pi
Changes: 3 changed files with 28 additions and 24 deletions

  pi_gpu/Makefile       +1  -1
  pi_gpu/pi_gpu.cu     +25 -21
  pi_gpu/pi_gpu.slurm   +2  -2
pi_gpu/Makefile  (+1, -1)

 pi_gpu: pi_gpu.cu
-	nvcc $< -o $@
+	nvcc $< -o $@ --use_fast_math
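Aside: --use_fast_math trades floating-point accuracy for speed. Per the nvcc documentation it implies --ftz=true, --prec-div=false, --prec-sqrt=false, and --fmad=true, so an explicitly spelled-out rule would look roughly like the sketch below (an illustration only, not a rule from this repository):

pi_gpu: pi_gpu.cu
	nvcc $< -o $@ --ftz=true --prec-div=false --prec-sqrt=false --fmad=true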
pi_gpu/pi_gpu.cu  (+25, -21)

...
@@ -4,29 +4,30 @@
 // based on a series expansion of pi, but ignoring numeric concerns
 #include <chrono>
 #include <cuda_runtime.h>
 #include <iostream>
 #include <mpi.h>
 #include "constants.h"
 #include "kernels.h"

 using namespace std;

 // currently init_kernel assumes n_terms_per_thread is a multiple of 10
-uint64_t const n_terms_per_thread = 100000;
-uint64_t const n_threads_per_gpu = 1024 * 2;
+uint64_t const n_terms_per_thread = 1000000;
+uint64_t const n_threads_per_gpu = 1024 * 1024;
 uint64_t const n_terms_per_gpu = n_terms_per_thread * n_threads_per_gpu;
-uint64_t const n_threads_per_block = 512;
+uint64_t const n_threads_per_block = 1024;
 uint64_t const n_blocks_per_gpu = (n_threads_per_gpu + n_threads_per_block - 1) / n_threads_per_block;
-int const n_loops = 1;
+int const n_loops = 8;

 //--------------------------------------------------------------------------------------------------
 __global__
-void init_kernel(double* arr, int gpu_idx) {
+void init_kernel(double* arr) {
     uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    uint64_t const start = n_terms_per_gpu * gpu_idx + n_terms_per_thread * thread_idx + 1;
-    uint64_t const end = n_terms_per_gpu * (gpu_idx + 1) + n_terms_per_thread * thread_idx + 1;
+    if (thread_idx >= n_threads_per_gpu) { return; }
+    uint64_t const start = n_terms_per_thread * thread_idx + 1;
+    uint64_t const end = start + n_terms_per_thread;

     double sum = 0.0;
     uint64_t i = start;
     while (i < end) {
...
@@ -49,8 +50,8 @@ void reduce_sum_kernel(double *arr, uint64_t stride) {
 }

 //--------------------------------------------------------------------------------------------------
-void init(double* arr, int gpu_idx) {
-    init_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, gpu_idx);
+void init(double* arr) {
+    init_kernel<<<n_blocks_per_gpu, n_threads_per_block>>>(arr);
 }

 //--------------------------------------------------------------------------------------------------
...
@@ -72,15 +73,16 @@ int main(int argc, char** argv) {
     double result;
     // timing data
     decltype(std::chrono::high_resolution_clock::now()) global_start;
     decltype(std::chrono::high_resolution_clock::now()) global_stop;
     decltype(std::chrono::high_resolution_clock::now()) start;
     decltype(std::chrono::high_resolution_clock::now()) stop;

     global_start = std::chrono::high_resolution_clock::now();
     for (int i = 0; i < n_loops; ++i) {
         if (rank == 0) { start = std::chrono::high_resolution_clock::now(); }
-        init(d_arr, gpu_idx);
+        init(d_arr);
         reduce(d_arr);
         cudaDeviceSynchronize();
         cudaMemcpy(&result, d_arr, sizeof(double), cudaMemcpyDeviceToHost);
...
@@ -88,16 +90,18 @@ int main(int argc, char** argv) {
         stop = std::chrono::high_resolution_clock::now();
         auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
         auto const millis = duration.count();
-        auto const n_terms_total = n_tasks * n_terms_per_gpu;
-        auto const gflops = n_terms_total * 5.0 / (millis * 1e-3) * 1e-9;
+        auto const gflops = n_terms_per_gpu * 5.0 / (millis * 1e-3) * 1e-9;
         std::cout << "loop " << i << '\n';
-        std::cout << "processes = " << n_tasks << ", terms per GPU = " << n_terms_per_gpu << ", total terms = " << n_terms_total << '\n';
+        std::cout << "processes = " << 1 << ", terms per GPU = " << n_terms_per_gpu << '\n';
         std::cout << "time = " << millis * 1e-3 << "s, estimated GFlops = " << gflops << '\n';
         std::cout << "pi ~ " << result << '\n';
         std::cout << '\n';
     }
     global_stop = std::chrono::high_resolution_clock::now();
     auto const duration = std::chrono::duration_cast<std::chrono::milliseconds>(global_stop - global_start);
     auto const millis = duration.count();
     std::cout << "total time = " << millis * 1e-3 << "s\n";

     cudaFree(d_arr);
     return 0;
...
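The body of the summation loop sits in a collapsed part of this diff. For orientation only, here is a minimal sketch of a per-thread partial sum consistent with the visible declarations (double sum, a loop over [start, end)), assuming the Leibniz series pi/4 = 1 - 1/3 + 1/5 - ...; the kernel name, the series, and the final store are assumptions, not code from pi_gpu.cu. The file's own comment that n_terms_per_thread must be a multiple of 10 suggests the real loop is unrolled, which this sketch does not attempt.

// Hypothetical sketch only: assumes the Leibniz series and roughly 5 flops per term.
#include <cstdint>

uint64_t const n_terms_per_thread = 1000000;     // values from this commit
uint64_t const n_threads_per_gpu = 1024 * 1024;

__global__ void init_kernel_sketch(double* arr) {
    uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (thread_idx >= n_threads_per_gpu) { return; }

    // Each thread sums its own contiguous, 1-indexed block of terms,
    // mirroring the start/end indexing introduced in this commit.
    uint64_t const start = n_terms_per_thread * thread_idx + 1;
    uint64_t const end = start + n_terms_per_thread;

    double sum = 0.0;
    for (uint64_t i = start; i < end; ++i) {
        double const sign = (i % 2 == 0) ? -1.0 : 1.0;  // alternating series
        sum += sign / (2.0 * i - 1.0);                  // term i of pi/4
    }
    arr[thread_idx] = 4.0 * sum;  // partial result; a reduction (cf. reduce_sum_kernel) would sum these
}

For scale: with the new constants, n_terms_per_gpu = 1000000 * 1024 * 1024, about 1.05e12 terms per loop iteration, so the estimate in main() (5 flops per term) corresponds to roughly 5.2e12 floating-point operations per iteration.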
pi_gpu/pi_gpu.slurm  (+2, -2)

...
@@ -8,8 +8,8 @@
 #SBATCH --cpus-per-task=1
 #SBATCH --ntasks-per-core=1
 #SBATCH --threads-per-core=1
-#SBATCH --mem=100M
-#SBATCH --time 00:01:00
+#SBATCH --mem=1G
+#SBATCH --time 00:05:00

 source ./load_modules.sh
 srun ./pi_gpu
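The larger memory and walltime requests track the bigger run configured in pi_gpu.cu (eight timed loops of roughly 1e12 terms each). The script is submitted in the usual Slurm way, e.g.

sbatch pi_gpu.slurm

Any GPU or partition directives would be in the earlier lines of the script, which this diff does not show.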