diff --git a/jlse/job.pbs b/jlse/job.pbs
new file mode 100755
index 0000000..dea2f33
--- /dev/null
+++ b/jlse/job.pbs
@@ -0,0 +1,21 @@
+#!/bin/bash
+#COBALT -t 00:20:00
+#COBALT -n 2
+#COBALT --jobname cublas-nsys-test
+#COBALT -O cublas-nsys-test
+#COBALT -q gpu_v100_smx2
+
+cd $HOME/hpc/mpi-cuda/jlse
+pwd
+source ./setup.sh
+which mpirun
+which nsys
+
+./run.sh noum none 2 4
+./run.sh noum nsys 2 4
+./run.sh noum none 1 4 &
+./run.sh noum nsys 1 4
+wait
+./run.sh noum none 1 2 &
+./run.sh noum nsys 1 2
+wait
diff --git a/jlse/run.sh b/jlse/run.sh
new file mode 100755
index 0000000..b88a483
--- /dev/null
+++ b/jlse/run.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+if [ $# -ne 4 ]; then
+    echo "Usage: $0 um|noum nsys|nvprof|none nodes ppn"
+    exit 1
+fi
+
+um=$1
+prof=$2
+nodes=$3
+ppn=$4
+
+tag=${um}_${prof}_${nodes}_${ppn}
+
+if [ $prof == "nsys" ]; then
+    prof_cmd="nsys profile --kill=none -c cudaProfilerApi -o profile/${tag}.%q{PMIX_RANK}"
+elif [ $prof == "nvprof" ]; then
+    prof_cmd="nvprof -o profile/nvprof.%q{PMIX_RANK}.nvvp --profile-from-start off"
+else
+    prof_cmd=""
+fi
+
+if [ $um == "um" ]; then
+    cmd=./mpi_daxpy_nvtx_managed
+else
+    cmd=./mpi_daxpy_nvtx_unmanaged
+fi
+
+total_procs=$((ppn * nodes))
+
+set -x
+mpirun -np $total_procs \
+    $prof_cmd $cmd >out-${tag}.txt 2>&1
+set +x
diff --git a/jlse/setup.sh b/jlse/setup.sh
new file mode 100755
index 0000000..9a30d6d
--- /dev/null
+++ b/jlse/setup.sh
@@ -0,0 +1,5 @@
+source $HOME/fusion/spack/ivolta86/share/spack/setup-env.sh
+spack load -r openmpi
+
+module use $HOME/soft/modulefiles
+module load nsight-systems
diff --git a/mpi_daxpy_nvtx.cc b/mpi_daxpy_nvtx.cc
index aff8152..1f90ea4 100644
--- a/mpi_daxpy_nvtx.cc
+++ b/mpi_daxpy_nvtx.cc
@@ -69,6 +69,19 @@ void set_rank_device(int n_ranks, int rank) {
 }
 
 
+int get_node_count(int n_ranks) {
+    int shm_size;
+    MPI_Comm shm_comm;
+
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
+                        MPI_INFO_NULL, &shm_comm);
+    MPI_Comm_size(shm_comm, &shm_size);
+
+    MPI_Comm_free(&shm_comm);
+    return n_ranks / shm_size;
+}
+
+
 int main(int argc, char **argv) {
     const int n_per_node = 48*MB;
     int nodes = 1;
@@ -105,11 +118,15 @@
     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 
+    nodes = get_node_count(world_size);
+
     // hack: assume max 6 mpi per node, so we use bigger
     // arrays on multi-node runs
+    /*
     if (world_size > 6) {
         nodes = (world_size + 5) / 6;
     }
+    */
 
     nall = nodes * n_per_node;
     n = nall / world_size;
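
Note (not part of the patch): get_node_count above works by splitting MPI_COMM_WORLD into per-node shared-memory communicators and dividing the world size by the local size, which assumes every node runs the same number of ranks. A minimal standalone sketch of the same technique under that assumption follows; the file name and output format are illustrative only.

/* check_nodes.c (illustrative name): standalone sketch of the node-counting
 * technique used by get_node_count in the patch above. Ranks that share a
 * node land in the same MPI_COMM_TYPE_SHARED communicator, so
 * world_size / shm_size gives the node count when ranks-per-node is uniform. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv) {
    int world_size, world_rank, shm_size;
    MPI_Comm shm_comm;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    /* Group ranks by node via the shared-memory split. */
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &shm_comm);
    MPI_Comm_size(shm_comm, &shm_size);

    if (world_rank == 0)
        printf("ranks=%d ranks_per_node=%d nodes=%d\n",
               world_size, shm_size, world_size / shm_size);

    MPI_Comm_free(&shm_comm);
    MPI_Finalize();
    return 0;
}

Building with mpicc and launching with the same mpirun -np line run.sh uses is one way to confirm the reported node count matches the Cobalt -n request.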