commit 74b23dff0b

@@ -0,0 +1,2 @@
daxpy
mpi_daxpy
Makefile
@@ -0,0 +1,15 @@
.PHONY: all
all: daxpy mpi_daxpy

daxpy: daxpy.cu cuda_error.h
	nvcc -lcublas -o daxpy daxpy.cu

mpi_daxpy: mpi_daxpy.cc cuda_error.h
	mpic++ -lcudart -lcublas -I$(CUDA_HOME)/include -o mpi_daxpy mpi_daxpy.cc

.PHONY: clean
clean:
	rm -rf daxpy mpi_daxpy

.PHONY: force
force: clean all
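# Usage note (editor's illustrative sketch, not part of the original commit):
# `make` or `make all` builds both test binaries with nvcc / mpic++, and
# `make force` does a clean rebuild. CUDA_HOME is assumed to point at the CUDA
# install; if the CUDA libraries are not already on the linker search path,
# adding -L$(CUDA_HOME)/lib64 to the mpi_daxpy link line may also be needed.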
cuda_error.h
@@ -0,0 +1,138 @@
/** Error handling macros for CUDA. All CUDA calls should be wrapped in
 * either a CHECK or WARN call; CHECK will print the error and exit on
 * failure, while WARN will just print the error on failure. PTRINFO and
 * MEMINFO are convenience routines for debugging data that needs to be
 * moved to reg storage.
 *
 * Checking is enabled by default; define "GPU_NO_CHECK_CALLS" to skip the
 * checks for maximum performance on production runs.
 *
 * TODO: add fortran interface
 */
#include <stdio.h>   // fprintf, printf
#include <stdlib.h>  // exit, EXIT_FAILURE

#include "cuda_runtime_api.h"
#include "cublas_v2.h"


#ifndef GPU_NO_CHECK_CALLS
#define CHECK(msg, val) __checkCuda(msg, (val), __FILE__, __LINE__, true)
#define WARN(msg, val) __checkCuda(msg, (val), __FILE__, __LINE__, false)
#define PTRINFO(msg, ptr) __print_cuda_ptr_info(msg, ptr)
#define MEMINFO(msg, ptr, size) __print_cuda_mem_info(msg, ptr, size)
#else
// Checks disabled: the wrapped call is still evaluated, only its result is dropped.
#define CHECK(msg, val) { (void)(val); }
#define WARN(msg, val) { (void)(val); }
#define PTRINFO(msg, ptr) { (void)(ptr); }
#define MEMINFO(msg, ptr, size) { (void)(ptr); }
#endif

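/* Illustrative usage sketch (editor's addition, not part of this commit):
 * wrap every CUDA / cuBLAS call in CHECK (fatal on failure) or WARN (report
 * and continue). The buffer name below is hypothetical.
 *
 *   double *d_buf = NULL;
 *   CHECK("alloc d_buf", cudaMalloc((void **)&d_buf, 1024 * sizeof(*d_buf)));
 *   WARN("free d_buf", cudaFree(d_buf));
 */
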
inline int __checkCuda(const char *msg, cudaError_t val, const char *fname,
                       const int line, bool abort=true) {
  if (val != cudaSuccess) {
    fprintf(stderr,
            "%s(%i): CUDA Error (%s) %i: %s\n",
            fname, line, msg, val, cudaGetErrorString(val));
    if (abort) {
      cudaDeviceReset();
      exit(EXIT_FAILURE);
    }
  }
  return (int)val;
}


// overload for cublasStatus_t
inline int __checkCuda(const char *msg, cublasStatus_t val, const char *fname,
                       const int line, bool abort=true) {
  if (val != CUBLAS_STATUS_SUCCESS) {
    const char *err_s = "OTHER";
    if (val == CUBLAS_STATUS_NOT_INITIALIZED) {
      err_s = "NOT_INITIALIZED";
    } else if (val == CUBLAS_STATUS_INVALID_VALUE) {
      err_s = "INVALID_VALUE";
    }
    fprintf(stderr,
            "%s(%i): cuBLAS Error (%s) %i: %s\n",
            fname, line, msg, val, err_s);
    if (abort) {
      cudaDeviceReset();
      exit(EXIT_FAILURE);
    }
  }
  return (int)val;
}

// Print whether a pointer refers to host, device, or managed memory.
inline void __print_cuda_ptr_info(const char *label, void *ptr) {
  cudaError_t cu_err;
  cudaPointerAttributes attr;
  const char *type_name = NULL;

  if (ptr == NULL) {
    printf("CUDA pointer %s (%p): NULL\n", label, ptr);
    return;
  }

  // NB: the 'type' attribute was not added until CUDA 10.0, use memoryType
  // for better compatibility
  cu_err = cudaPointerGetAttributes(&attr, ptr);
  if (cu_err != cudaSuccess) {
    if (cu_err == cudaErrorInvalidValue) {
      type_name = "Invalid (non-unified addressing)";
    } else {
      WARN("get pointer attr", cu_err);
      return;
    }
  } else if (attr.memoryType == cudaMemoryTypeDevice) {
    if (attr.isManaged) {
      type_name = "Managed";
    } else {
      type_name = "Device";
    }
  } else if (attr.memoryType == cudaMemoryTypeHost) {
    type_name = "Host";
  } else {
    type_name = "Unknown";
  }
  printf("CUDA pointer %s (%p): %s\n", label, ptr, type_name);
}

inline void __print_cuda_mem_info(const char *label, void *ptr, size_t size) {
  cudaError_t cu_err;
  cudaPointerAttributes pointer_attr;
  int mem_attr = -123;
  bool is_managed = false;

  cu_err = cudaPointerGetAttributes(&pointer_attr, ptr);
  if (cu_err != cudaSuccess) {
    if (cu_err == cudaErrorInvalidValue) {
      printf("CUDA PreferredLocation of '%s' is NOT CUDA\n", label);
      return;
    } else {
      WARN("get pointer attr", cu_err);
      return;
    }
  } else if (pointer_attr.memoryType == cudaMemoryTypeDevice) {
    if (pointer_attr.isManaged) {
      is_managed = true;
    }
  }

  if (!is_managed) {
    printf("CUDA PreferredLocation of '%s' is UNMANAGED\n", label);
    return;
  }

  WARN("get mem range preferred location",
       cudaMemRangeGetAttribute(&mem_attr, sizeof(mem_attr),
                                cudaMemRangeAttributePreferredLocation,
                                ptr, size));
  if (mem_attr == cudaCpuDeviceId) {
    printf("CUDA PreferredLocation of '%s' is CPU (%d)\n", label, mem_attr);
  } else if (mem_attr == cudaInvalidDeviceId) {
    printf("CUDA PreferredLocation of '%s' is INVALID (%d)\n",
           label, mem_attr);
  } else {
    printf("CUDA PreferredLocation of '%s' is DEVICE (%d)\n",
           label, mem_attr);
  }
}

daxpy.cu
@@ -0,0 +1,94 @@
/*
 * =====================================================================================
 *
 *       Filename:  daxpy.cu
 *
 *    Description:  Test cuBLAS DAXPY, specifically to verify usage on
 *                  Summit with GPUMPS and all 6 GPUs shared over 42 procs.
 *
 *        Version:  1.0
 *        Created:  05/20/2019 10:33:30 AM
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  YOUR NAME (),
 *   Organization:
 *
 * =====================================================================================
 */

#include <stdio.h>
#include <stdlib.h>

#include "cublas_v2.h"
#include "cuda_runtime_api.h"

#define GPU_CHECK_CALLS
#include "cuda_error.h"

// column major
#define IDX2C(i,j,ld) (((j)*(ld))+(i))

static cublasHandle_t handle;

int main(int argc, char **argv) {
  int n = 1024;

  double a = 2.0;
  double sum = 0.0;

  double *x, *y, *d_x, *d_y;

  x = (double *)malloc(n*sizeof(*x));
  if (x == NULL) {
    printf("host malloc(x) failed\n");
    return EXIT_FAILURE;
  }

  y = (double *)malloc(n*sizeof(*y));
  if (y == NULL) {
    printf("host malloc(y) failed\n");
    return EXIT_FAILURE;
  }

  for (int i=0; i<n; i++) {
    x[i] = i+1;
    y[i] = -i-1;
  }

  //CHECK("setDevice", cudaSetDevice(0));

  CHECK( "cublas", cublasCreate(&handle) );

  CHECK( "d_x", cudaMalloc((void**)&d_x, n*sizeof(*d_x)) );
  CHECK( "d_y", cudaMalloc((void**)&d_y, n*sizeof(*d_y)) );

  CHECK("d_x = x",
        cudaMemcpy(d_x, x, n*sizeof(*x), cudaMemcpyHostToDevice) );
  CHECK("d_y = y",
        cudaMemcpy(d_y, y, n*sizeof(*y), cudaMemcpyHostToDevice) );

  CHECK("daxpy",
        cublasDaxpy(handle, n, &a, d_x, 1, d_y, 1) );

  CHECK("daxpy sync", cudaDeviceSynchronize());

  CHECK("y = d_y",
        cudaMemcpy(y, d_y, n*sizeof(*y), cudaMemcpyDeviceToHost) );

  CHECK("y = d_y sync", cudaDeviceSynchronize() );

  sum = 0.0;
  for (int i=0; i<n; i++) {
    printf("%f\n", y[i]);
    sum += y[i];
  }
  printf("SUM = %f\n", sum);

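  // Editor's note (not in the original source): with x[i] = i+1, y[i] = -(i+1)
  // and a = 2, DAXPY computes y = a*x + y, leaving y[i] = i+1; for n = 1024 the
  // printed SUM should therefore be 1024*1025/2 = 524800 if the GPU path worked.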
  // cleanup
  cudaFree(d_x);
  cudaFree(d_y);
  cublasDestroy(handle);
  free(x);
  free(y);
  return EXIT_SUCCESS;
}

mpi_daxpy.cc
@@ -0,0 +1,157 @@
/*
 * =====================================================================================
 *
 *       Filename:  mpi_daxpy.cc
 *
 *    Description:  Adds MPI to the cuBLAS test, to debug an issue on Summit
 *
 *        Version:  1.0
 *        Created:  05/20/2019 10:33:30 AM
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  YOUR NAME (),
 *   Organization:
 *
 * =====================================================================================
 */

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#include "cublas_v2.h"
#include "cuda_runtime_api.h"

#define GPU_CHECK_CALLS
#include "cuda_error.h"

// column major
#define IDX2C(i,j,ld) (((j)*(ld))+(i))


static cublasHandle_t handle;

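// Editor's illustrative note (not part of the commit, and an assumption about
// the site setup): on Summit a run like the one described above is typically
// launched with MPS enabled via the batch allocation (bsub -alloc_flags gpumps)
// and something like `jsrun -n6 -a7 -g1 ./mpi_daxpy`, i.e. 42 ranks sharing
// 6 GPUs with 7 ranks per device; exact flags depend on the site configuration.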
// Map each MPI rank to a GPU; when there are more ranks than GPUs, the rank
// count must be an even multiple of the device count.
void set_rank_device(int n_ranks, int rank) {
  int n_devices, device, ranks_per_device;
  size_t memory_per_rank;
  cudaDeviceProp device_prop;

  CHECK("get device count", cudaGetDeviceCount(&n_devices));

  if (n_ranks > n_devices) {
    if (n_ranks % n_devices != 0) {
      printf("ERROR: Number of ranks (%d) not a multiple of number of GPUs (%d)\n",
             n_ranks, n_devices);
      exit(EXIT_FAILURE);
    }
    ranks_per_device = n_ranks / n_devices;
    device = rank / ranks_per_device;
  } else {
    ranks_per_device = 1;
    device = rank;
  }
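  // Worked example (editor's addition, not in the original): with 42 ranks
  // over 6 GPUs, ranks_per_device = 7, so ranks 0-6 land on device 0,
  // ranks 7-13 on device 1, and so on.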

  CHECK("get device props", cudaGetDeviceProperties(&device_prop, device));
  memory_per_rank = device_prop.totalGlobalMem / ranks_per_device;
  printf("RANK[%d/%d] => DEVICE[%d/%d] mem=%zu\n", rank+1, n_ranks,
         device+1, n_devices, memory_per_rank);

  CHECK("set device", cudaSetDevice(device));
}

int main(int argc, char **argv) {
  int n = 1024;

  int world_size, world_rank;

  double a = 2.0;
  double sum = 0.0;

  double *x, *y, *d_x, *d_y;
  double *m_x, *m_y;

  MPI_Init(NULL, NULL);

  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  x = (double *)malloc(n*sizeof(*x));
  if (x == NULL) {
    printf("host malloc(x) failed\n");
    return EXIT_FAILURE;
  }

  y = (double *)malloc(n*sizeof(*y));
  if (y == NULL) {
    printf("host malloc(y) failed\n");
    return EXIT_FAILURE;
  }

  for (int i=0; i<n; i++) {
    x[i] = i+1;
    y[i] = -i-1;
  }

  set_rank_device(world_size, world_rank);
  //CHECK("setDevice", cudaSetDevice(0));

  CHECK( "cublas", cublasCreate(&handle) );

  CHECK( "d_x", cudaMalloc((void**)&d_x, n*sizeof(*d_x)) );
  CHECK( "d_y", cudaMalloc((void**)&d_y, n*sizeof(*d_y)) );

  CHECK( "m_x", cudaMallocManaged((void**)&m_x, n*sizeof(*m_x)) );
  CHECK( "m_y", cudaMallocManaged((void**)&m_y, n*sizeof(*m_y)) );

  CHECK("d_x = x",
        cudaMemcpy(d_x, x, n*sizeof(*x), cudaMemcpyHostToDevice) );
  CHECK("d_y = y",
        cudaMemcpy(d_y, y, n*sizeof(*y), cudaMemcpyHostToDevice) );

  CHECK("m_x = x",
        cudaMemcpy(m_x, x, n*sizeof(*x), cudaMemcpyHostToDevice) );
  CHECK("m_y = y",
        cudaMemcpy(m_y, y, n*sizeof(*y), cudaMemcpyHostToDevice) );

  MEMINFO("d_x", d_x, n*sizeof(*d_x));
  MEMINFO("d_y", d_y, n*sizeof(*d_y));

  MEMINFO("m_x", m_x, n*sizeof(*m_x));
  MEMINFO("m_y", m_y, n*sizeof(*m_y));

  MEMINFO("x", x, n*sizeof(*x));
  MEMINFO("y", y, n*sizeof(*y));
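  // Editor's note (not in the original): with this cuda_error.h the MEMINFO
  // calls above would be expected to report d_x/d_y and x/y as UNMANAGED or
  // NOT CUDA, while m_x/m_y (cudaMallocManaged) take the managed path and
  // query the preferred location, which is typically reported as INVALID
  // unless cudaMemAdvise has set one; exact wording depends on the CUDA version.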

  CHECK("daxpy",
        cublasDaxpy(handle, n, &a, m_x, 1, m_y, 1) );

  CHECK("daxpy sync", cudaDeviceSynchronize());

  /*
  CHECK("y = d_y",
        cudaMemcpy(y, m_y, n*sizeof(*y), cudaMemcpyDeviceToHost) );
  */

  CHECK("y = d_y sync", cudaDeviceSynchronize() );

  sum = 0.0;
  for (int i=0; i<n; i++) {
    //printf("%f\n", y[i]);
    sum += m_y[i];
  }
  printf("%d/%d SUM = %f\n", world_rank, world_size, sum);

  // cleanup
  cudaFree(d_x);
  cudaFree(d_y);
  cudaFree(m_x);
  cudaFree(m_y);
  cublasDestroy(handle);
  free(x);
  free(y);

  MPI_Finalize();

  return EXIT_SUCCESS;
}