Distribute the total element count evenly across MPI ranks

Useful for testing with fewer than 6 ranks per node.
This commit is contained in:
Bryce Allen
2020-08-07 17:53:16 -04:00
parent 538c22a22f
commit c32b86422f

View File

@@ -68,7 +68,8 @@ void set_rank_device(int n_ranks, int rank) {
int main(int argc, char **argv) { int main(int argc, char **argv) {
int n = 4*MB; const int nall = 48*MB;
int n = 0;
int world_size, world_rank; int world_size, world_rank;
size_t free_mem, total_mem; size_t free_mem, total_mem;
@@ -84,7 +85,8 @@ int main(int argc, char **argv) {
double g_end_time = 0.0; double g_end_time = 0.0;
#ifndef MANAGED #ifndef MANAGED
double *h_x, *h_y, *h_allx, *h_ally; double *h_x, *h_y;
double *h_allx, *h_ally;
#endif #endif
double *d_x, *d_y; double *d_x, *d_y;
@@ -97,6 +99,12 @@ int main(int argc, char **argv) {
MPI_Comm_size(MPI_COMM_WORLD, &world_size); MPI_Comm_size(MPI_COMM_WORLD, &world_size);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
n = nall / world_size;
if (world_rank == 0) {
printf("%d ranks, %d elements each, total %d\n", world_size, n, nall);
}
/* /*
x = (double *)malloc(n*sizeof(*x)); x = (double *)malloc(n*sizeof(*x));
if (x == NULL) { if (x == NULL) {
@@ -159,9 +167,11 @@ int main(int argc, char **argv) {
#endif #endif
nvtxRangePop(); nvtxRangePop();
if (world_rank == 0) {
CHECK( "memInfo", cudaMemGetInfo(&free_mem, &total_mem) ); CHECK( "memInfo", cudaMemGetInfo(&free_mem, &total_mem) );
printf("GPU memory %0.3f / %0.3f (%0.3f)\n", free_mem/(double)MB, printf("GPU memory %0.3f / %0.3f (%0.3f) MB\n", free_mem/(double)MB,
(double)total_mem/MB, (double)(total_mem-free_mem)/MB); (double)total_mem/MB, (double)(total_mem-free_mem)/MB);
}
nvtxRangePushA("initializeArrays"); nvtxRangePushA("initializeArrays");
#ifdef MANAGED #ifdef MANAGED
@@ -175,9 +185,9 @@ int main(int argc, char **argv) {
h_y[i] = -h_x[i]; h_y[i] = -h_x[i];
} }
nvtxRangePushA("copyInput"); nvtxRangePushA("copyInput");
CHECK("d_x = x", CHECK("d_x = h_x",
cudaMemcpy(d_x, h_x, n*sizeof(*h_x), cudaMemcpyHostToDevice) ); cudaMemcpy(d_x, h_x, n*sizeof(*h_x), cudaMemcpyHostToDevice) );
CHECK("d_y = y", CHECK("d_y = h_y",
cudaMemcpy(d_y, h_y, n*sizeof(*h_y), cudaMemcpyHostToDevice) ); cudaMemcpy(d_y, h_y, n*sizeof(*h_y), cudaMemcpyHostToDevice) );
nvtxRangePop(); nvtxRangePop();
#endif #endif
@@ -191,6 +201,13 @@ int main(int argc, char **argv) {
MEMINFO("d_x", d_x, sizeof(d_x)); MEMINFO("d_x", d_x, sizeof(d_x));
MEMINFO("d_y", d_y, sizeof(d_y)); MEMINFO("d_y", d_y, sizeof(d_y));
#ifndef MANAGED
MEMINFO("h_x", h_x, sizeof(h_x));
MEMINFO("h_y", h_y, sizeof(h_y));
MEMINFO("h_allx", h_allx, sizeof(h_allx));
MEMINFO("h_ally", h_ally, sizeof(h_ally));
#endif
k_start_time = MPI_Wtime(); k_start_time = MPI_Wtime();
nvtxRangePushA("cublasDaxpy"); nvtxRangePushA("cublasDaxpy");
CHECK("daxpy", CHECK("daxpy",
@@ -237,9 +254,9 @@ int main(int argc, char **argv) {
sum += d_ally[i]; sum += d_ally[i];
} }
#else #else
nvtxRangePushA("copyOutput"); nvtxRangePushA("copyAlly");
CHECK("h_ally = d_ally", CHECK("h_ally = d_ally",
cudaMemcpy(h_ally, d_ally, n*sizeof(*h_ally), cudaMemcpy(h_ally, d_ally, n*sizeof(*h_ally)*world_size,
cudaMemcpyDeviceToHost) ); cudaMemcpyDeviceToHost) );
nvtxRangePop(); nvtxRangePop();
for (int i=0; i<n*world_size; i++) { for (int i=0; i<n*world_size; i++) {
@@ -252,10 +269,10 @@ int main(int argc, char **argv) {
// cleanup // cleanup
nvtxRangePushA("free"); nvtxRangePushA("free");
#ifndef MANAGED #ifndef MANAGED
cudaFree(h_x); cudaFreeHost(h_x);
cudaFree(h_y); cudaFreeHost(h_y);
cudaFree(h_allx); cudaFreeHost(h_allx);
cudaFree(h_ally); cudaFreeHost(h_ally);
#endif #endif
cudaFree(d_x); cudaFree(d_x);
cudaFree(d_y); cudaFree(d_y);
@@ -270,9 +287,12 @@ int main(int argc, char **argv) {
cublasDestroy(handle); cublasDestroy(handle);
MPI_Finalize(); MPI_Finalize();
printf("total time: %0.3f\n", end_time-start_time); printf("%d/%d TIME total : %0.3f\n", world_rank, world_size,
printf("kernel time: %0.3f\n", k_end_time-k_start_time); end_time-start_time);
printf("gather time: %0.3f\n", g_end_time-g_start_time); printf("%d/%d TIME kernel: %0.3f\n", world_rank, world_size,
k_end_time-k_start_time);
printf("%d/%d TIME gather: %0.3f\n", world_rank, world_size,
g_end_time-g_start_time);
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }