add jlse runners, more flexible node counter

Branch: main
Bryce Allen, 5 years ago
parent 12d76b4a42
commit cd6e6f7eb5

@@ -0,0 +1,21 @@
#!/bin/bash
#COBALT -t 00:20:00
#COBALT -n 2
#COBALT --jobname cublas-nsys-test
#COBALT -O cublas-nsys-test
#COBALT -q gpu_v100_smx2
cd $HOME/hpc/mpi-cuda/jlse
pwd
source ./setup.sh
which mpirun
which nsys
# two-node runs, 4 ranks per node, unprofiled then under nsys
./run.sh noum none 2 4
./run.sh noum nsys 2 4
# single-node pairs: background the unprofiled run so it executes
# concurrently with the profiled one, then join with wait
./run.sh noum none 1 4 &
./run.sh noum nsys 1 4
wait
./run.sh noum none 1 2 &
./run.sh noum nsys 1 2
wait

run.sh
@@ -0,0 +1,34 @@
#!/bin/bash
if [ $# -ne 4 ]; then
    echo "Usage: $0 um|noum nsys|nvprof|none nodes ppn"
    exit 1
fi
um=$1
prof=$2
nodes=$3
ppn=$4
tag=${um}_${prof}_${nodes}_${ppn}
if [ "$prof" = "nsys" ]; then
    # %q{VAR} expands to the value of an env var at runtime,
    # giving one profile output file per MPI rank
    prof_cmd="nsys profile --kill=none -c cudaProfilerApi -o profile/${tag}.%q{PMIX_RANK}"
elif [ "$prof" = "nvprof" ]; then
    prof_cmd="nvprof -o profile/nvprof.%q{PMIX_RANK}.nvvp --profile-from-start off"
else
    prof_cmd=""
fi
if [ "$um" = "um" ]; then
    cmd=./mpi_daxpy_nvtx_managed
else
    cmd=./mpi_daxpy_nvtx_unmanaged
fi
total_procs=$((ppn * nodes))
# trace the launch command so it appears in the job log, then turn tracing off
set -x
mpirun -np $total_procs \
    $prof_cmd $cmd >out-${tag}.txt 2>&1
set +x
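Both nsys's -c cudaProfilerApi and nvprof's --profile-from-start off defer capture until the application itself calls the CUDA profiler API, so the test binaries are expected to bracket the region of interest; the *_nvtx_* names suggest they also mark it with NVTX ranges. A minimal sketch of that pattern, assuming a placeholder daxpy kernel rather than the actual mpi_daxpy_nvtx source:

// profiler_sketch.cu — assumed bracketing pattern, not the actual test code.
// Build sketch: nvcc profiler_sketch.cu -lnvToolsExt
#include <cuda_profiler_api.h>
#include <nvToolsExt.h>

__global__ void daxpy(int n, double a, const double *x, double *y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];
}

int main(void) {
    const int n = 1 << 20;
    double *x, *y;
    cudaMalloc(&x, n * sizeof(double));
    cudaMalloc(&y, n * sizeof(double));

    cudaProfilerStart();          // capture starts here, not at program start
    nvtxRangePushA("daxpy");      // named range shows up on the timeline
    daxpy<<<(n + 255) / 256, 256>>>(n, 2.0, x, y);
    cudaDeviceSynchronize();
    nvtxRangePop();
    cudaProfilerStop();           // nothing after this point is captured

    cudaFree(x);
    cudaFree(y);
    return 0;
}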

setup.sh
@@ -0,0 +1,5 @@
source $HOME/fusion/spack/ivolta86/share/spack/setup-env.sh
spack load -r openmpi
module use $HOME/soft/modulefiles
module load nsight-systems

@@ -69,6 +69,19 @@ void set_rank_device(int n_ranks, int rank) {
}
int get_node_count(int n_ranks) {
    int shm_size;
    MPI_Comm shm_comm;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &shm_comm);
    MPI_Comm_size(shm_comm, &shm_size);
    MPI_Comm_free(&shm_comm);
    return n_ranks / shm_size;
}
int main(int argc, char **argv) {
    const int n_per_node = 48*MB;
    int nodes = 1;
@@ -105,11 +118,15 @@ int main(int argc, char **argv) {
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    nodes = get_node_count(world_size);
    // hack: assume max 6 mpi per node, so we use bigger
    // arrays on multi-node runs
    /*
    if (world_size > 6) {
        nodes = (world_size + 5) / 6;
    }
    */
    nall = nodes * n_per_node;
    n = nall / world_size;
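The new get_node_count is what makes the node counter "more flexible": MPI_COMM_TYPE_SHARED splits MPI_COMM_WORLD into one communicator per node, so n_ranks / shm_size recovers the node count, replacing the commented-out hack that hard-coded at most 6 ranks per node. The division still assumes every node hosts the same number of ranks; if the distribution could be uneven, summing one leader flag per node would count exactly. A sketch of that variant (function name hypothetical, not part of this commit):

#include <mpi.h>

// Hypothetical variant: exact node count even with uneven ranks-per-node.
// Every rank that is rank 0 of its per-node (shared-memory) communicator
// contributes 1 to a global sum.
int get_node_count_exact(void) {
    MPI_Comm shm_comm;
    int shm_rank, is_leader, node_count;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &shm_comm);
    MPI_Comm_rank(shm_comm, &shm_rank);
    is_leader = (shm_rank == 0) ? 1 : 0;
    MPI_Allreduce(&is_leader, &node_count, 1, MPI_INT, MPI_SUM,
                  MPI_COMM_WORLD);
    MPI_Comm_free(&shm_comm);
    return node_count;
}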
