commit 74b23dff0b

@@ -0,0 +1,2 @@
daxpy
mpi_daxpy
Makefile
@@ -0,0 +1,15 @@
.PHONY: all
all: daxpy mpi_daxpy

daxpy: daxpy.cu cuda_error.h
	nvcc -lcublas -o daxpy daxpy.cu

mpi_daxpy: mpi_daxpy.cc cuda_error.h
	mpic++ -lcudart -lcublas -I$(CUDA_HOME)/include -o mpi_daxpy mpi_daxpy.cc

.PHONY: clean
clean:
	rm -rf daxpy mpi_daxpy

.PHONY: force
force: clean all
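# Usage note (editor's illustrative sketch, not part of the original commit):
# `make` or `make all` builds both test binaries with nvcc / mpic++, and
# `make force` does a clean rebuild. CUDA_HOME is assumed to point at the CUDA
# install; if the CUDA libraries are not already on the linker search path,
# adding -L$(CUDA_HOME)/lib64 to the mpi_daxpy link line may also be needed.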
cuda_error.h
@@ -0,0 +1,138 @@
/** Error handling macros for CUDA. All CUDA calls should be wrapped in
 * either a CHECK or WARN call; CHECK will print the error and exit on
 * failure, while WARN will just print the error on failure. PTRINFO and
 * MEMINFO are convenience routines for debugging data that needs to be
 * moved to reg storage.
 *
 * Checking is enabled by default; define "GPU_NO_CHECK_CALLS" to skip the
 * checks for maximum performance on production runs.
 *
 * TODO: add fortran interface
 */
#include <stdio.h>   // fprintf, printf
#include <stdlib.h>  // exit, EXIT_FAILURE

#include "cuda_runtime_api.h"
#include "cublas_v2.h"


#ifndef GPU_NO_CHECK_CALLS
#define CHECK(msg, val) __checkCuda(msg, (val), __FILE__, __LINE__, true)
#define WARN(msg, val) __checkCuda(msg, (val), __FILE__, __LINE__, false)
#define PTRINFO(msg, ptr) __print_cuda_ptr_info(msg, ptr)
#define MEMINFO(msg, ptr, size) __print_cuda_mem_info(msg, ptr, size)
#else
// Checks disabled: the wrapped call is still evaluated, only its result is dropped.
#define CHECK(msg, val) { (void)(val); }
#define WARN(msg, val) { (void)(val); }
#define PTRINFO(msg, ptr) { (void)(ptr); }
#define MEMINFO(msg, ptr, size) { (void)(ptr); }
#endif

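/* Illustrative usage sketch (editor's addition, not part of this commit):
 * wrap every CUDA / cuBLAS call in CHECK (fatal on failure) or WARN (report
 * and continue). The buffer name below is hypothetical.
 *
 *   double *d_buf = NULL;
 *   CHECK("alloc d_buf", cudaMalloc((void **)&d_buf, 1024 * sizeof(*d_buf)));
 *   WARN("free d_buf", cudaFree(d_buf));
 */
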
inline int __checkCuda(const char *msg, cudaError_t val, const char *fname,
                       const int line, bool abort=true) {
  if (val != cudaSuccess) {
    fprintf(stderr,
            "%s(%i): CUDA Error (%s) %i: %s\n",
            fname, line, msg, val, cudaGetErrorString(val));
    if (abort) {
      cudaDeviceReset();
      exit(EXIT_FAILURE);
    }
  }
  return (int)val;
}


// overload for cublasStatus_t
inline int __checkCuda(const char *msg, cublasStatus_t val, const char *fname,
                       const int line, bool abort=true) {
  if (val != CUBLAS_STATUS_SUCCESS) {
    const char *err_s = "OTHER";
    if (val == CUBLAS_STATUS_NOT_INITIALIZED) {
      err_s = "NOT_INITIALIZED";
    } else if (val == CUBLAS_STATUS_INVALID_VALUE) {
      err_s = "INVALID_VALUE";
    }
    fprintf(stderr,
            "%s(%i): cuBLAS Error (%s) %i: %s\n",
            fname, line, msg, val, err_s);
    if (abort) {
      cudaDeviceReset();
      exit(EXIT_FAILURE);
    }
  }
  return (int)val;
}

// Print whether a pointer refers to host, device, or managed memory.
inline void __print_cuda_ptr_info(const char *label, void *ptr) {
  cudaError_t cu_err;
  cudaPointerAttributes attr;
  const char *type_name = NULL;

  if (ptr == NULL) {
    printf("CUDA pointer %s (%p): NULL\n", label, ptr);
    return;
  }

  // NB: the 'type' attribute was not added until CUDA 10.0, use memoryType
  // for better compatibility
  cu_err = cudaPointerGetAttributes(&attr, ptr);
  if (cu_err != cudaSuccess) {
    if (cu_err == cudaErrorInvalidValue) {
      type_name = "Invalid (non-unified addressing)";
    } else {
      WARN("get pointer attr", cu_err);
      return;
    }
  } else if (attr.memoryType == cudaMemoryTypeDevice) {
    if (attr.isManaged) {
      type_name = "Managed";
    } else {
      type_name = "Device";
    }
  } else if (attr.memoryType == cudaMemoryTypeHost) {
    type_name = "Host";
  } else {
    type_name = "Unknown";
  }
  printf("CUDA pointer %s (%p): %s\n", label, ptr, type_name);
}

inline void __print_cuda_mem_info(const char *label, void *ptr, size_t size) {
  cudaError_t cu_err;
  cudaPointerAttributes pointer_attr;
  int mem_attr = -123;
  bool is_managed = false;

  cu_err = cudaPointerGetAttributes(&pointer_attr, ptr);
  if (cu_err != cudaSuccess) {
    if (cu_err == cudaErrorInvalidValue) {
      printf("CUDA PreferredLocation of '%s' is NOT CUDA\n", label);
      return;
    } else {
      WARN("get pointer attr", cu_err);
      return;
    }
  } else if (pointer_attr.memoryType == cudaMemoryTypeDevice) {
    if (pointer_attr.isManaged) {
      is_managed = true;
    }
  }

  if (!is_managed) {
    printf("CUDA PreferredLocation of '%s' is UNMANAGED\n", label);
    return;
  }

  WARN("get mem range preferred location",
       cudaMemRangeGetAttribute(&mem_attr, sizeof(mem_attr),
                                cudaMemRangeAttributePreferredLocation,
                                ptr, size));
  if (mem_attr == cudaCpuDeviceId) {
    printf("CUDA PreferredLocation of '%s' is CPU (%d)\n", label, mem_attr);
  } else if (mem_attr == cudaInvalidDeviceId) {
    printf("CUDA PreferredLocation of '%s' is INVALID (%d)\n",
           label, mem_attr);
  } else {
    printf("CUDA PreferredLocation of '%s' is DEVICE (%d)\n",
           label, mem_attr);
  }
}

daxpy.cu
@@ -0,0 +1,94 @@
/*
 * =====================================================================================
 *
 *       Filename:  daxpy.cu
 *
 *    Description:  Test cuBLAS DAXPY, specifically to verify usage on
 *                  Summit with GPUMPS and all 6 GPUs shared over 42 procs.
 *
 *        Version:  1.0
 *        Created:  05/20/2019 10:33:30 AM
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  YOUR NAME (),
 *   Organization:
 *
 * =====================================================================================
 */

#include <stdio.h>
#include <stdlib.h>

#include "cublas_v2.h"
#include "cuda_runtime_api.h"

#define GPU_CHECK_CALLS
#include "cuda_error.h"

// column major
#define IDX2C(i,j,ld) (((j)*(ld))+(i))

static cublasHandle_t handle;

int main(int argc, char **argv) {
  int n = 1024;

  double a = 2.0;
  double sum = 0.0;

  double *x, *y, *d_x, *d_y;

  x = (double *)malloc(n*sizeof(*x));
  if (x == NULL) {
    printf("host malloc(x) failed\n");
    return EXIT_FAILURE;
  }

  y = (double *)malloc(n*sizeof(*y));
  if (y == NULL) {
    printf("host malloc(y) failed\n");
    return EXIT_FAILURE;
  }

  for (int i=0; i<n; i++) {
    x[i] = i+1;
    y[i] = -i-1;
  }

  //CHECK("setDevice", cudaSetDevice(0));

  CHECK( "cublas", cublasCreate(&handle) );

  CHECK( "d_x", cudaMalloc((void**)&d_x, n*sizeof(*d_x)) );
  CHECK( "d_y", cudaMalloc((void**)&d_y, n*sizeof(*d_y)) );

  CHECK("d_x = x",
        cudaMemcpy(d_x, x, n*sizeof(*x), cudaMemcpyHostToDevice) );
  CHECK("d_y = y",
        cudaMemcpy(d_y, y, n*sizeof(*y), cudaMemcpyHostToDevice) );

  CHECK("daxpy",
        cublasDaxpy(handle, n, &a, d_x, 1, d_y, 1) );

  CHECK("daxpy sync", cudaDeviceSynchronize());

  CHECK("y = d_y",
        cudaMemcpy(y, d_y, n*sizeof(*y), cudaMemcpyDeviceToHost) );

  CHECK("y = d_y sync", cudaDeviceSynchronize() );

  sum = 0.0;
  for (int i=0; i<n; i++) {
    printf("%f\n", y[i]);
    sum += y[i];
  }
  printf("SUM = %f\n", sum);

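  // Editor's note (not in the original source): with x[i] = i+1, y[i] = -(i+1)
  // and a = 2, DAXPY computes y = a*x + y, leaving y[i] = i+1; for n = 1024 the
  // printed SUM should therefore be 1024*1025/2 = 524800 if the GPU path worked.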
  // cleanup
  cudaFree(d_x);
  cudaFree(d_y);
  cublasDestroy(handle);
  free(x);
  free(y);
  return EXIT_SUCCESS;
}

mpi_daxpy.cc
@@ -0,0 +1,157 @@
/*
 * =====================================================================================
 *
 *       Filename:  mpi_daxpy.cc
 *
 *    Description:  Adds MPI to the cuBLAS test, to debug an issue on Summit
 *
 *        Version:  1.0
 *        Created:  05/20/2019 10:33:30 AM
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  YOUR NAME (),
 *   Organization:
 *
 * =====================================================================================
 */

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#include "cublas_v2.h"
#include "cuda_runtime_api.h"

#define GPU_CHECK_CALLS
#include "cuda_error.h"

// column major
#define IDX2C(i,j,ld) (((j)*(ld))+(i))


static cublasHandle_t handle;

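// Editor's illustrative note (not part of the commit, and an assumption about
// the site setup): on Summit a run like the one described above is typically
// launched with MPS enabled via the batch allocation (bsub -alloc_flags gpumps)
// and something like `jsrun -n6 -a7 -g1 ./mpi_daxpy`, i.e. 42 ranks sharing
// 6 GPUs with 7 ranks per device; exact flags depend on the site configuration.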
// Map each MPI rank to a GPU; when there are more ranks than GPUs, the rank
// count must be an even multiple of the device count.
void set_rank_device(int n_ranks, int rank) {
  int n_devices, device, ranks_per_device;
  size_t memory_per_rank;
  cudaDeviceProp device_prop;

  CHECK("get device count", cudaGetDeviceCount(&n_devices));

  if (n_ranks > n_devices) {
    if (n_ranks % n_devices != 0) {
      printf("ERROR: Number of ranks (%d) not a multiple of number of GPUs (%d)\n",
             n_ranks, n_devices);
      exit(EXIT_FAILURE);
    }
    ranks_per_device = n_ranks / n_devices;
    device = rank / ranks_per_device;
  } else {
    ranks_per_device = 1;
    device = rank;
  }
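  // Worked example (editor's addition, not in the original): with 42 ranks
  // over 6 GPUs, ranks_per_device = 7, so ranks 0-6 land on device 0,
  // ranks 7-13 on device 1, and so on.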

  CHECK("get device props", cudaGetDeviceProperties(&device_prop, device));
  memory_per_rank = device_prop.totalGlobalMem / ranks_per_device;
  printf("RANK[%d/%d] => DEVICE[%d/%d] mem=%zu\n", rank+1, n_ranks,
         device+1, n_devices, memory_per_rank);

  CHECK("set device", cudaSetDevice(device));
}

int main(int argc, char **argv) {
  int n = 1024;

  int world_size, world_rank;

  double a = 2.0;
  double sum = 0.0;

  double *x, *y, *d_x, *d_y;
  double *m_x, *m_y;

  MPI_Init(NULL, NULL);

  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  x = (double *)malloc(n*sizeof(*x));
  if (x == NULL) {
    printf("host malloc(x) failed\n");
    return EXIT_FAILURE;
  }

  y = (double *)malloc(n*sizeof(*y));
  if (y == NULL) {
    printf("host malloc(y) failed\n");
    return EXIT_FAILURE;
  }

  for (int i=0; i<n; i++) {
    x[i] = i+1;
    y[i] = -i-1;
  }

  set_rank_device(world_size, world_rank);
  //CHECK("setDevice", cudaSetDevice(0));

  CHECK( "cublas", cublasCreate(&handle) );

  CHECK( "d_x", cudaMalloc((void**)&d_x, n*sizeof(*d_x)) );
  CHECK( "d_y", cudaMalloc((void**)&d_y, n*sizeof(*d_y)) );

  CHECK( "m_x", cudaMallocManaged((void**)&m_x, n*sizeof(*m_x)) );
  CHECK( "m_y", cudaMallocManaged((void**)&m_y, n*sizeof(*m_y)) );

  CHECK("d_x = x",
        cudaMemcpy(d_x, x, n*sizeof(*x), cudaMemcpyHostToDevice) );
  CHECK("d_y = y",
        cudaMemcpy(d_y, y, n*sizeof(*y), cudaMemcpyHostToDevice) );

  CHECK("m_x = x",
        cudaMemcpy(m_x, x, n*sizeof(*x), cudaMemcpyHostToDevice) );
  CHECK("m_y = y",
        cudaMemcpy(m_y, y, n*sizeof(*y), cudaMemcpyHostToDevice) );

  MEMINFO("d_x", d_x, n*sizeof(*d_x));
  MEMINFO("d_y", d_y, n*sizeof(*d_y));

  MEMINFO("m_x", m_x, n*sizeof(*m_x));
  MEMINFO("m_y", m_y, n*sizeof(*m_y));

  MEMINFO("x", x, n*sizeof(*x));
  MEMINFO("y", y, n*sizeof(*y));
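  // Editor's note (not in the original): with this cuda_error.h the MEMINFO
  // calls above would be expected to report d_x/d_y and x/y as UNMANAGED or
  // NOT CUDA, while m_x/m_y (cudaMallocManaged) take the managed path and
  // query the preferred location, which is typically reported as INVALID
  // unless cudaMemAdvise has set one; exact wording depends on the CUDA version.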

  CHECK("daxpy",
        cublasDaxpy(handle, n, &a, m_x, 1, m_y, 1) );

  CHECK("daxpy sync", cudaDeviceSynchronize());

  /*
  CHECK("y = d_y",
        cudaMemcpy(y, m_y, n*sizeof(*y), cudaMemcpyDeviceToHost) );
  */

  CHECK("y = d_y sync", cudaDeviceSynchronize() );

  sum = 0.0;
  for (int i=0; i<n; i++) {
    //printf("%f\n", y[i]);
    sum += m_y[i];
  }
  printf("%d/%d SUM = %f\n", world_rank, world_size, sum);

  // cleanup
  cudaFree(d_x);
  cudaFree(d_y);
  cudaFree(m_x);
  cudaFree(m_y);
  cublasDestroy(handle);
  free(x);
  free(y);

  MPI_Finalize();

  return EXIT_SUCCESS;
}