add jlse runners, more flexible node counter

Branch: main
Bryce Allen, 5 years ago
parent 12d76b4a42
commit cd6e6f7eb5

@@ -0,0 +1,21 @@
#!/bin/bash
#COBALT -t 00:20:00
#COBALT -n 2
#COBALT --jobname cublas-nsys-test
#COBALT -O cublas-nsys-test
#COBALT -q gpu_v100_smx2
cd $HOME/hpc/mpi-cuda/jlse
pwd
source ./setup.sh
which mpirun
which nsys
# two-node runs, 4 ranks per node, unprofiled then under nsys
./run.sh noum none 2 4
./run.sh noum nsys 2 4
# single-node pairs: background the unprofiled run so it executes
# concurrently with the profiled one, then join with wait
./run.sh noum none 1 4 &
./run.sh noum nsys 1 4
wait
./run.sh noum none 1 2 &
./run.sh noum nsys 1 2
wait

run.sh
@@ -0,0 +1,34 @@
#!/bin/bash
if [ $# -ne 4 ]; then
    echo "Usage: $0 um|noum nsys|nvprof|none nodes ppn"
    exit 1
fi
um=$1
prof=$2
nodes=$3
ppn=$4
tag=${um}_${prof}_${nodes}_${ppn}
if [ "$prof" = "nsys" ]; then
    # %q{VAR} expands to the value of an env var at runtime,
    # giving one profile output file per MPI rank
    prof_cmd="nsys profile --kill=none -c cudaProfilerApi -o profile/${tag}.%q{PMIX_RANK}"
elif [ "$prof" = "nvprof" ]; then
    prof_cmd="nvprof -o profile/nvprof.%q{PMIX_RANK}.nvvp --profile-from-start off"
else
    prof_cmd=""
fi
if [ "$um" = "um" ]; then
    cmd=./mpi_daxpy_nvtx_managed
else
    cmd=./mpi_daxpy_nvtx_unmanaged
fi
total_procs=$((ppn * nodes))
# trace the launch command so it appears in the job log, then turn tracing off
set -x
mpirun -np $total_procs \
    $prof_cmd $cmd >out-${tag}.txt 2>&1
set +x
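Both nsys's -c cudaProfilerApi and nvprof's --profile-from-start off defer capture until the application itself calls the CUDA profiler API, so the test binaries are expected to bracket the region of interest; the *_nvtx_* names suggest they also mark it with NVTX ranges. A minimal sketch of that pattern, assuming a placeholder daxpy kernel rather than the actual mpi_daxpy_nvtx source:

// profiler_sketch.cu — assumed bracketing pattern, not the actual test code.
// Build sketch: nvcc profiler_sketch.cu -lnvToolsExt
#include <cuda_profiler_api.h>
#include <nvToolsExt.h>

__global__ void daxpy(int n, double a, const double *x, double *y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];
}

int main(void) {
    const int n = 1 << 20;
    double *x, *y;
    cudaMalloc(&x, n * sizeof(double));
    cudaMalloc(&y, n * sizeof(double));

    cudaProfilerStart();          // capture starts here, not at program start
    nvtxRangePushA("daxpy");      // named range shows up on the timeline
    daxpy<<<(n + 255) / 256, 256>>>(n, 2.0, x, y);
    cudaDeviceSynchronize();
    nvtxRangePop();
    cudaProfilerStop();           // nothing after this point is captured

    cudaFree(x);
    cudaFree(y);
    return 0;
}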

setup.sh
@@ -0,0 +1,5 @@
source $HOME/fusion/spack/ivolta86/share/spack/setup-env.sh
spack load -r openmpi
module use $HOME/soft/modulefiles
module load nsight-systems

@@ -69,6 +69,19 @@ void set_rank_device(int n_ranks, int rank) {
}
int get_node_count(int n_ranks) {
    int shm_size;
    MPI_Comm shm_comm;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &shm_comm);
    MPI_Comm_size(shm_comm, &shm_size);
    MPI_Comm_free(&shm_comm);
    return n_ranks / shm_size;
}
int main(int argc, char **argv) {
    const int n_per_node = 48*MB;
    int nodes = 1;
@@ -105,11 +118,15 @@ int main(int argc, char **argv) {
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    nodes = get_node_count(world_size);
    // hack: assume max 6 mpi per node, so we use bigger
    // arrays on multi-node runs
    /*
    if (world_size > 6) {
        nodes = (world_size + 5) / 6;
    }
    */
    nall = nodes * n_per_node;
    n = nall / world_size;
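The new get_node_count is what makes the node counter "more flexible": MPI_COMM_TYPE_SHARED splits MPI_COMM_WORLD into one communicator per node, so n_ranks / shm_size recovers the node count, replacing the commented-out hack that hard-coded at most 6 ranks per node. The division still assumes every node hosts the same number of ranks; if the distribution could be uneven, summing one leader flag per node would count exactly. A sketch of that variant (function name hypothetical, not part of this commit):

#include <mpi.h>

// Hypothetical variant: exact node count even with uneven ranks-per-node.
// Every rank that is rank 0 of its per-node (shared-memory) communicator
// contributes 1 to a global sum.
int get_node_count_exact(void) {
    MPI_Comm shm_comm;
    int shm_rank, is_leader, node_count;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &shm_comm);
    MPI_Comm_rank(shm_comm, &shm_rank);
    is_leader = (shm_rank == 0) ? 1 : 0;
    MPI_Allreduce(&is_leader, &node_count, 1, MPI_INT, MPI_SUM,
                  MPI_COMM_WORLD);
    MPI_Comm_free(&shm_comm);
    return node_count;
}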
