/* * ===================================================================================== * * Filename: mpi_daxpy.c * * Description: Adds MPI to cublas test, to debug issue on Summit * * Version: 1.0 * Created: 05/20/2019 10:33:30 AM * Revision: none * Compiler: gcc * * Author: YOUR NAME (), * Organization: * * ===================================================================================== */ #include #include #include #include "cublas_v2.h" #include "cuda_runtime_api.h" #define GPU_CHECK_CALLS #include "cuda_error.h" // column major #define IDX2C(i,j,ld) (((j)*(ld))+(i)) static cublasHandle_t handle; void set_rank_device(int n_ranks, int rank) { int n_devices, device, ranks_per_device; size_t memory_per_rank; cudaDeviceProp device_prop; CHECK("get device count", cudaGetDeviceCount(&n_devices)); if (n_ranks > n_devices) { if (n_ranks % n_devices != 0) { printf("ERROR: Number of ranks (%d) not a multiple of number of GPUs (%d)\n", n_ranks, n_devices); exit(EXIT_FAILURE); } ranks_per_device = n_ranks / n_devices; device = rank / ranks_per_device; } else { ranks_per_device = 1; device = rank; } CHECK("get device props", cudaGetDeviceProperties(&device_prop, device)); memory_per_rank = device_prop.totalGlobalMem / ranks_per_device; printf("RANK[%d/%d] => DEVICE[%d/%d] mem=%zd\n", rank+1, n_ranks, device+1, n_devices, memory_per_rank); CHECK("set device", cudaSetDevice(device)); } int main(int argc, char **argv) { int n = 1024; int world_size, world_rank; double a = 2.0; double sum = 0.0; double *x, *y, *d_x, *d_y; double *m_x, *m_y; char *mb_per_core; MPI_Init(NULL, NULL); MPI_Comm_size(MPI_COMM_WORLD, &world_size); MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); x = (double *)malloc(n*sizeof(*x)); if (x == NULL) { printf("host malloc(x) failed\n"); return EXIT_FAILURE; } y = (double *)malloc(n*sizeof(*y)); if (x == NULL) { printf("host malloc(y) failed\n"); return EXIT_FAILURE; } for (int i=0; i