diff --git a/mpi_pi_gpu/.gitignore b/mpi_pi_gpu/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..1926ab2490cda6d1b83e58c191d87dc9b1f2879b
--- /dev/null
+++ b/mpi_pi_gpu/.gitignore
@@ -0,0 +1,2 @@
+mpi_pi_gpu
+output
diff --git a/mpi_pi_gpu/Makefile b/mpi_pi_gpu/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..272faf7674608c4acbffba9328d0678974d1939f
--- /dev/null
+++ b/mpi_pi_gpu/Makefile
@@ -0,0 +1,2 @@
+mpi_pi_gpu: mpi_pi_gpu.c
+	mpic++ $< -lcudart -o $@
diff --git a/mpi_pi_gpu/load_modules.sh b/mpi_pi_gpu/load_modules.sh
new file mode 100755
index 0000000000000000000000000000000000000000..450477e7571b88a7dc52e6a251bcc1c112e4bd35
--- /dev/null
+++ b/mpi_pi_gpu/load_modules.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+module purge
+module load spack git gcc/7.3.0 cuda/10.1 openmpi/3.1.4-pmi-cuda-ucx
diff --git a/mpi_pi_gpu/mpi_pi_gpu.cu b/mpi_pi_gpu/mpi_pi_gpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3f966243e401318d29bfb82cd465070497b0a5b6
--- /dev/null
+++ b/mpi_pi_gpu/mpi_pi_gpu.cu
@@ -0,0 +1,122 @@
+// Neil Gershenfeld 10/14/20
+// Erik Strand 3/6/2021
+// calculation of pi by a CUDA+MPI sum
+// assumes one GPU per MPI rank
+
+#include <chrono>
+#include <cstdint>
+#include <cuda_runtime.h>
+#include <iostream>
+#include <mpi.h>
+
+using namespace std;
+
+uint64_t const n_terms_per_thread = 100000;
+uint64_t const n_threads_per_gpu = 1024;
+uint64_t const n_terms_per_gpu = n_terms_per_thread * n_threads_per_gpu;
+uint64_t const n_threads_per_block = 512;
+uint64_t const n_blocks_per_gpu = (n_threads_per_gpu + n_threads_per_block - 1) / n_threads_per_block;
+
+int nloop = 10;
+
+__global__
+void init(double *arr, int gpu_idx) {
+    uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    uint64_t const start = n_terms_per_gpu * gpu_idx + n_terms_per_thread * thread_idx + 1;
+    uint64_t const end = n_terms_per_gpu * (gpu_idx + 1) + n_terms_per_thread * thread_idx + 1;
+    double sum = 0.0;
+    for (uint64_t i = start; i < end; ++i) {
+        sum += 0.5 / ((i - 0.75) * (i - 0.25));
+    }
+    arr[thread_idx] = sum;
+}
+
+__global__
+void reduce_sum(double *arr, uint64_t stride) {
+    uint64_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_idx < stride) {
+        arr[thread_idx] += arr[thread_idx + stride];
+    }
+}
+
+void reduce(double *arr) {
+    uint64_t stride = n_threads_per_gpu >> 1;
+    while (stride > 0) {
+        reduce_sum<<<n_blocks_per_gpu, n_threads_per_block>>>(arr, stride);
+        stride = stride >> 1;
+    }
+}
+
+int main(int argc, char** argv) {
+    char* local_rank_str = NULL;
+    int local_rank = 0;
+    local_rank_str = getenv("SLURM_LOCALID");
+    if (local_rank_str != NULL) {
+        local_rank = atoi(local_rank_str);
+        std::cout << "slurm local rank = " << local_rank << '\n';
+    } else {
+        std::cout << "slurm local rank not defined\n";
+    }
+
+    int n_gpus;
+    cudaGetDeviceCount(&n_gpus);
+    // relative to this node
+    int local_gpu_idx = local_rank % n_gpus;
+    //cudaSetDevice(local_gpu_idx);
+
+    int n_tasks, rank;
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &n_tasks);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    // We assume that all nodes have the same number of GPUs.
+    int gpu_idx = n_gpus * rank + local_gpu_idx;
+
+    int host_name_length;
+    char host_name[MPI_MAX_PROCESSOR_NAME];
+    MPI_Get_processor_name(host_name, &host_name_length);
+
+    std::cout << "task: " << rank << " of " << n_tasks << '\n';
+    std::cout << "node: " << host_name << '\n';
+    std::cout << "local gpu idx: " << local_gpu_idx << '\n';
+    std::cout << "global gpu idx: " << gpu_idx << '\n';
+
+    /*
+    int rank,nranks;
+    MPI_Init(&argc,&argv);
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+    MPI_Comm_size(MPI_COMM_WORLD,&nranks);
+    double *arr,result,pi;
+    cudaMalloc(&arr,nthreads*sizeof(double));
+    if (rank == 0) {
+        for (int i = 0; i < nloop; ++i) {
+            MPI_Barrier(MPI_COMM_WORLD);
+            double tstart = MPI_Wtime();
+            init<<<blocks,threads>>>(arr,npts,nthreads,rank);
+            reduce(arr);
+            cudaDeviceSynchronize();
+            cudaMemcpy(&result,arr,8,cudaMemcpyDeviceToHost);
+            MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
+            double tend = MPI_Wtime();
+            double dt = tend-tstart;
+            double gflops = npts*nthreads*nranks*5.0/dt/1e9;
+            printf("npts = %ld, nthreads = %ld, nranks = %d, pi = %lf\n",npts,nthreads,nranks,pi);
+            printf("time = %f, estimated GFlops = %f\n",dt,gflops);
+        }
+    }
+    else {
+        for (int i = 0; i < nloop; ++i) {
+            MPI_Barrier(MPI_COMM_WORLD);
+            init<<<blocks,threads>>>(arr,npts,nthreads,rank);
+            reduce(arr);
+            cudaDeviceSynchronize();
+            cudaMemcpy(&result,arr,8,cudaMemcpyDeviceToHost);
+            MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
+        }
+    }
+    cudaFree(arr);
+    */
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/mpi_pi_gpu/mpi_pi_gpu.slurm b/mpi_pi_gpu/mpi_pi_gpu.slurm
new file mode 100644
index 0000000000000000000000000000000000000000..f31083a37325b43d589dd1d312944613fc9c06d6
--- /dev/null
+++ b/mpi_pi_gpu/mpi_pi_gpu.slurm
@@ -0,0 +1,15 @@
+#!/bin/bash
+#SBATCH -J mpi_pi_gpu
+#SBATCH -o output/%j.out
+#SBATCH -e output/%j.err
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=2
+#SBATCH --gres=gpu:2
+#SBATCH --cpus-per-task=1
+#SBATCH --ntasks-per-core=1
+#SBATCH --threads-per-core=1
+#SBATCH --mem=10G
+#SBATCH --time 00:01:00
+
+source ./load_modules.sh
+srun ./mpi_pi_gpu