add jlse runners, more flexible node counter
jlse/job.pbs (Executable file, 21 lines)
@@ -0,0 +1,21 @@
#!/bin/bash
#COBALT -t 00:20:00
#COBALT -n 2
#COBALT --jobname cublas-nsys-test
#COBALT -O cublas-nsys-test
#COBALT -q gpu_v100_smx2

cd $HOME/hpc/mpi-cuda/jlse
pwd
source ./setup.sh
which mpirun
which nsys

./run.sh noum none 2 4
./run.sh noum nsys 2 4
./run.sh noum none 1 4 &
./run.sh noum nsys 1 4
wait
./run.sh noum none 1 2 &
./run.sh noum nsys 1 2
wait
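The COBALT directives request 2 nodes for 20 minutes on the gpu_v100_smx2 queue; the body then sweeps run.sh over node/ppn combinations, launching each single-node unprofiled run in the background alongside its nsys-profiled counterpart. A minimal sketch of how the job would be submitted on a Cobalt system, assuming the usual workflow (site-specific flags such as a project/account may also be required):

    # Cobalt reads the #COBALT directives embedded in the script
    qsub job.pbs
    # check queue state; per-run output lands in out-<um>_<prof>_<nodes>_<ppn>.txt
    qstat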
jlse/run.sh (Executable file, 34 lines)
@@ -0,0 +1,34 @@
#!/bin/bash

if [ $# -ne 4 ]; then
    echo "Usage: $0 um|noum nsys|nvprof|none nodes ppn"
    exit 1
fi

um=$1
prof=$2
nodes=$3
ppn=$4

tag=${um}_${prof}_${nodes}_${ppn}

if [ $prof == "nsys" ]; then
    prof_cmd="nsys profile --kill=none -c cudaProfilerApi -o profile/${tag}.%q{PMIX_RANK}"
elif [ $prof == "nvprof" ]; then
    prof_cmd="nvprof -o profile/nvprof.%q{PMIX_RANK}.nvvp --profile-from-start off"
else
    prof_cmd=""
fi

if [ $um == "um" ]; then
    cmd=./mpi_daxpy_nvtx_managed
else
    cmd=./mpi_daxpy_nvtx_unmanaged
fi

total_procs=$((ppn * nodes))

set +x
mpirun -np $total_procs \
    $prof_cmd $cmd >out-${tag}.txt 2>&1
set -x
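For reference, a call such as ./run.sh noum nsys 2 4 yields tag=noum_nsys_2_4, total_procs=8, and the unmanaged binary, so the launch line expands to roughly the sketch below (the host/rank placement itself comes from the Cobalt allocation and the OpenMPI defaults, and the profile/ output directory is assumed to already exist):

    mpirun -np 8 \
        nsys profile --kill=none -c cudaProfilerApi -o profile/noum_nsys_2_4.%q{PMIX_RANK} \
        ./mpi_daxpy_nvtx_unmanaged >out-noum_nsys_2_4.txt 2>&1

The %q{PMIX_RANK} placeholder makes nsys name each rank's report after its PMIx rank, and both profiler modes defer collection until the application calls cudaProfilerStart() (nsys via -c cudaProfilerApi, nvprof via --profile-from-start off).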
jlse/setup.sh (Executable file, 5 lines)
@@ -0,0 +1,5 @@
source $HOME/fusion/spack/ivolta86/share/spack/setup-env.sh
spack load -r openmpi

module use $HOME/soft/modulefiles
module load nsight-systems
@@ -69,6 +69,19 @@ void set_rank_device(int n_ranks, int rank) {
}


int get_node_count(int n_ranks) {
    int shm_size;
    MPI_Comm shm_comm;

    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &shm_comm);
    MPI_Comm_size(shm_comm, &shm_size);

    MPI_Comm_free(&shm_comm);
    return n_ranks / shm_size;
}


int main(int argc, char **argv) {
    const int n_per_node = 48*MB;
    int nodes = 1;
@@ -105,11 +118,15 @@ int main(int argc, char **argv) {
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    nodes = get_node_count(world_size);

    // hack: assume max 6 mpi per node, so we use bigger
    // arrays on multi-node runs
    /*
    if (world_size > 6) {
        nodes = (world_size + 5) / 6;
    }
    */

    nall = nodes * n_per_node;
    n = nall / world_size;
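Once the job finishes, each run leaves a per-tag text log next to the scripts plus one report per rank under profile/. A sketch of how the results might be inspected, assuming that naming and noting that the report extension depends on the Nsight Systems release (.qdrep on older versions, .nsys-rep on newer ones):

    ls out-*.txt profile/
    # print the standard summary tables for rank 0 of one run
    nsys stats profile/noum_nsys_2_4.0.qdrep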