/*
 * ===========================================================================
 *
 *       Filename:  mpi_daxpy_nvtx.c
 *
 *    Description:  Adds MPI to the cuBLAS test, to debug an issue on Summit
 *
 *        Version:  1.0
 *        Created:  05/20/2019 10:33:30 AM
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  YOUR NAME (),
 *   Organization:
 *
 * ===========================================================================
 */

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#include "cublas_v2.h"
#include "cuda_runtime_api.h"
#include "nvToolsExt.h"
#include "cuda_profiler_api.h"

#define GPU_CHECK_CALLS
#include "cuda_error.h"

// column major
#define IDX2C(i,j,ld) (((j)*(ld))+(i))

//#define BARRIER

static cublasHandle_t handle;

static const int MB = 1024*1024;

// Map this MPI rank to a GPU device.  When there are more ranks than
// devices, the rank count must be a multiple of the device count and
// consecutive ranks share a device.
void set_rank_device(int n_ranks, int rank)
{
    int n_devices, device, ranks_per_device;
    size_t memory_per_rank;
    cudaDeviceProp device_prop;

    CHECK("get device count", cudaGetDeviceCount(&n_devices));
    if (n_ranks > n_devices) {
        if (n_ranks % n_devices != 0) {
            printf("ERROR: Number of ranks (%d) not a multiple of number of GPUs (%d)\n",
                   n_ranks, n_devices);
            exit(EXIT_FAILURE);
        }
        ranks_per_device = n_ranks / n_devices;
        device = rank / ranks_per_device;
    } else {
        ranks_per_device = 1;
        device = rank;
    }
    CHECK("get device props", cudaGetDeviceProperties(&device_prop, device));
    memory_per_rank = device_prop.totalGlobalMem / ranks_per_device;
    printf("RANK[%d/%d] => DEVICE[%d/%d] mem=%zu\n",
           rank+1, n_ranks, device+1, n_devices, memory_per_rank);
    CHECK("set device", cudaSetDevice(device));
}

// Count nodes by splitting MPI_COMM_WORLD into per-node (shared-memory)
// communicators; assumes every node has the same number of ranks.
int get_node_count(int n_ranks)
{
    int shm_size;
    MPI_Comm shm_comm;

    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &shm_comm);
    MPI_Comm_size(shm_comm, &shm_size);
    MPI_Comm_free(&shm_comm);
    return n_ranks / shm_size;
}

int main(int argc, char **argv)
{
    const int n_per_node = 48*MB;
    int nodes = 1;
    int nall = n_per_node;
    int n = 0;
    int world_size, world_rank;
    size_t free_mem, total_mem;
    double a = 2.0;
    double sum = 0.0;
    double start_time = 0.0;
    double end_time = 0.0;
    double k_start_time = 0.0;
    double k_end_time = 0.0;
    double g_start_time = 0.0;
    double g_end_time = 0.0;
    double b_start_time = 0.0;
    double b_end_time = 0.0;
#ifndef MANAGED
    double *h_x, *h_y;
    double *h_allx, *h_ally;
#endif
    double *d_x, *d_y;
    double *d_allx, *d_ally;
    char *mb_per_core;

    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    nodes = get_node_count(world_size);
    // hack: assume at most 6 MPI ranks per node, so we use bigger
    // arrays on multi-node runs
    /*
    if (world_size > 6) {
        nodes = (world_size + 5) / 6;
    }
    */
    nall = nodes * n_per_node;
    n = nall / world_size;

    if (world_rank == 0) {
        printf("%d nodes, %d ranks, %d elements each, total %d\n",
               nodes, world_size, n, nall);
    }

    /*
    x = (double *)malloc(n*sizeof(*x));
    if (x == NULL) {
        printf("host malloc(x) failed\n");
        return EXIT_FAILURE;
    }
    y = (double *)malloc(n*sizeof(*y));
    if (y == NULL) {
        printf("host malloc(y) failed\n");
        return EXIT_FAILURE;
    }
    */

    // DEBUG weirdness on Summit where GENE can't see MEMORY_PER_CORE,
    // possibly because the system Spectrum MPI uses it in some way.
    if (world_rank == 0) {
        mb_per_core = getenv("MEMORY_PER_CORE");
        if (mb_per_core == NULL) {
            printf("MEMORY_PER_CORE is not set\n");
        } else {
            printf("MEMORY_PER_CORE=%s\n", mb_per_core);
        }
    }

    set_rank_device(world_size, world_rank);
    //CHECK("setDevice", cudaSetDevice(0));

    cudaProfilerStart();
    start_time = MPI_Wtime();

    CHECK( "cublas", cublasCreate(&handle) );

    /*
    CHECK( "d_x", cudaMalloc((void**)&d_x, n*sizeof(*d_x)) );
    CHECK( "d_y", cudaMalloc((void**)&d_y, n*sizeof(*d_y)) );
    */

    nvtxRangePushA("allocateArrays");
#ifdef MANAGED
    CHECK( "d_x", cudaMallocManaged((void**)&d_x, n*sizeof(*d_x)) );
    CHECK( "d_y", cudaMallocManaged((void**)&d_y, n*sizeof(*d_y)) );
    CHECK( "d_allx", cudaMallocManaged((void**)&d_allx, n*sizeof(*d_allx)*world_size) );
    CHECK( "d_ally", cudaMallocManaged((void**)&d_ally, n*sizeof(*d_ally)*world_size) );
#else
    CHECK( "h_x", cudaMallocHost((void**)&h_x, n*sizeof(*h_x)) );
    CHECK( "h_y", cudaMallocHost((void**)&h_y, n*sizeof(*h_y)) );
    CHECK( "d_x", cudaMalloc((void**)&d_x, n*sizeof(*d_x)) );
    CHECK( "d_y", cudaMalloc((void**)&d_y, n*sizeof(*d_y)) );
    CHECK( "d_allx", cudaMalloc((void**)&d_allx, n*sizeof(*d_allx)*world_size) );
    CHECK( "d_ally", cudaMalloc((void**)&d_ally, n*sizeof(*d_ally)*world_size) );
    CHECK( "h_allx", cudaMallocHost((void**)&h_allx, n*sizeof(*h_allx)*world_size) );
    CHECK( "h_ally", cudaMallocHost((void**)&h_ally, n*sizeof(*h_ally)*world_size) );
#endif
    nvtxRangePop();

    if (world_rank == 0) {
        CHECK( "memInfo", cudaMemGetInfo(&free_mem, &total_mem) );
        printf("GPU memory %0.3f / %0.3f (%0.3f) MB\n",
               free_mem/(double)MB, (double)total_mem/MB,
               (double)(total_mem-free_mem)/MB);
    }

    nvtxRangePushA("initializeArrays");
#ifdef MANAGED
    for (int i=0; i