diff --git a/jlse/job.pbs b/jlse/job.pbs
new file mode 100755
index 0000000..dea2f33
--- /dev/null
+++ b/jlse/job.pbs
@@ -0,0 +1,21 @@
+#!/bin/bash
+#COBALT -t 00:20:00
+#COBALT -n 2
+#COBALT --jobname cublas-nsys-test
+#COBALT -O cublas-nsys-test
+#COBALT -q gpu_v100_smx2
+
+cd $HOME/hpc/mpi-cuda/jlse
+pwd
+source ./setup.sh
+which mpirun
+which nsys
+
+./run.sh noum none 2 4
+./run.sh noum nsys 2 4
+./run.sh noum none 1 4 &
+./run.sh noum nsys 1 4
+wait
+./run.sh noum none 1 2 &
+./run.sh noum nsys 1 2
+wait
diff --git a/jlse/run.sh b/jlse/run.sh
new file mode 100755
index 0000000..b88a483
--- /dev/null
+++ b/jlse/run.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+if [ $# -ne 4 ]; then
+    echo "Usage: $0 um|noum nsys|nvprof|none nodes ppn"
+    exit 1
+fi
+
+um=$1
+prof=$2
+nodes=$3
+ppn=$4
+
+tag=${um}_${prof}_${nodes}_${ppn}
+
+if [ $prof == "nsys" ]; then
+    prof_cmd="nsys profile --kill=none -c cudaProfilerApi -o profile/${tag}.%q{PMIX_RANK}"
+elif [ $prof == "nvprof" ]; then
+    prof_cmd="nvprof -o profile/nvprof.%q{PMIX_RANK}.nvvp --profile-from-start off"
+else
+    prof_cmd=""
+fi
+
+if [ $um == "um" ]; then
+    cmd=./mpi_daxpy_nvtx_managed
+else
+    cmd=./mpi_daxpy_nvtx_unmanaged
+fi
+
+total_procs=$((ppn * nodes))
+
+set -x
+mpirun -np $total_procs \
+    $prof_cmd $cmd >out-${tag}.txt 2>&1
+set +x
diff --git a/jlse/setup.sh b/jlse/setup.sh
new file mode 100755
index 0000000..9a30d6d
--- /dev/null
+++ b/jlse/setup.sh
@@ -0,0 +1,5 @@
+source $HOME/fusion/spack/ivolta86/share/spack/setup-env.sh
+spack load -r openmpi
+
+module use $HOME/soft/modulefiles
+module load nsight-systems
diff --git a/mpi_daxpy_nvtx.cc b/mpi_daxpy_nvtx.cc
index aff8152..1f90ea4 100644
--- a/mpi_daxpy_nvtx.cc
+++ b/mpi_daxpy_nvtx.cc
@@ -69,6 +69,19 @@ void set_rank_device(int n_ranks, int rank) {
 }
 
 
+int get_node_count(int n_ranks) {
+    int shm_size;
+    MPI_Comm shm_comm;
+
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
+                        MPI_INFO_NULL, &shm_comm);
+    MPI_Comm_size(shm_comm, &shm_size);
+
+    MPI_Comm_free(&shm_comm);
+    return n_ranks / shm_size;
+}
+
+
 int main(int argc, char **argv) {
     const int n_per_node = 48*MB;
     int nodes = 1;
@@ -105,11 +118,15 @@
     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 
+    nodes = get_node_count(world_size);
+
     // hack: assume max 6 mpi per node, so we use bigger
     // arrays on multi-node runs
+    /*
     if (world_size > 6) {
         nodes = (world_size + 5) / 6;
     }
+    */
 
     nall = nodes * n_per_node;
     n = nall / world_size;
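
Note (not part of the patch): get_node_count above works by splitting MPI_COMM_WORLD into per-node shared-memory communicators and dividing the world size by the local size, which assumes every node runs the same number of ranks. A minimal standalone sketch of the same technique under that assumption follows; the file name and output format are illustrative only.

/* check_nodes.c (illustrative name): standalone sketch of the node-counting
 * technique used by get_node_count in the patch above. Ranks that share a
 * node land in the same MPI_COMM_TYPE_SHARED communicator, so
 * world_size / shm_size gives the node count when ranks-per-node is uniform. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv) {
    int world_size, world_rank, shm_size;
    MPI_Comm shm_comm;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    /* Group ranks by node via the shared-memory split. */
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &shm_comm);
    MPI_Comm_size(shm_comm, &shm_size);

    if (world_rank == 0)
        printf("ranks=%d ranks_per_node=%d nodes=%d\n",
               world_size, shm_size, world_size / shm_size);

    MPI_Comm_free(&shm_comm);
    MPI_Finalize();
    return 0;
}

Building with mpicc and launching with the same mpirun -np line run.sh uses is one way to confirm the reported node count matches the Cobalt -n request.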