Distribute the total element count across ranks.
Useful for testing with fewer than 6 ranks per node.
This commit is contained in:
@@ -68,7 +68,8 @@ void set_rank_device(int n_ranks, int rank) {
|
|||||||
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
int n = 4*MB;
|
const int nall = 48*MB;
|
||||||
|
int n = 0;
|
||||||
int world_size, world_rank;
|
int world_size, world_rank;
|
||||||
|
|
||||||
size_t free_mem, total_mem;
|
size_t free_mem, total_mem;
|
||||||
@@ -84,7 +85,8 @@ int main(int argc, char **argv) {
|
|||||||
double g_end_time = 0.0;
|
double g_end_time = 0.0;
|
||||||
|
|
||||||
#ifndef MANAGED
|
#ifndef MANAGED
|
||||||
double *h_x, *h_y, *h_allx, *h_ally;
|
double *h_x, *h_y;
|
||||||
|
double *h_allx, *h_ally;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
double *d_x, *d_y;
|
double *d_x, *d_y;
|
||||||
@@ -97,6 +99,12 @@ int main(int argc, char **argv) {
|
|||||||
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
|
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
|
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
|
||||||
|
|
||||||
|
n = nall / world_size;
|
||||||
|
|
||||||
|
if (world_rank == 0) {
|
||||||
|
printf("%d ranks, %d elements each, total %d\n", world_size, n, nall);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
x = (double *)malloc(n*sizeof(*x));
|
x = (double *)malloc(n*sizeof(*x));
|
||||||
if (x == NULL) {
|
if (x == NULL) {
|
||||||
@@ -159,9 +167,11 @@ int main(int argc, char **argv) {
|
|||||||
#endif
|
#endif
|
||||||
nvtxRangePop();
|
nvtxRangePop();
|
||||||
|
|
||||||
|
if (world_rank == 0) {
|
||||||
CHECK( "memInfo", cudaMemGetInfo(&free_mem, &total_mem) );
|
CHECK( "memInfo", cudaMemGetInfo(&free_mem, &total_mem) );
|
||||||
printf("GPU memory %0.3f / %0.3f (%0.3f)\n", free_mem/(double)MB,
|
printf("GPU memory %0.3f / %0.3f (%0.3f) MB\n", free_mem/(double)MB,
|
||||||
(double)total_mem/MB, (double)(total_mem-free_mem)/MB);
|
(double)total_mem/MB, (double)(total_mem-free_mem)/MB);
|
||||||
|
}
|
||||||
|
|
||||||
nvtxRangePushA("initializeArrays");
|
nvtxRangePushA("initializeArrays");
|
||||||
#ifdef MANAGED
|
#ifdef MANAGED
|
||||||
@@ -175,9 +185,9 @@ int main(int argc, char **argv) {
|
|||||||
h_y[i] = -h_x[i];
|
h_y[i] = -h_x[i];
|
||||||
}
|
}
|
||||||
nvtxRangePushA("copyInput");
|
nvtxRangePushA("copyInput");
|
||||||
CHECK("d_x = x",
|
CHECK("d_x = h_x",
|
||||||
cudaMemcpy(d_x, h_x, n*sizeof(*h_x), cudaMemcpyHostToDevice) );
|
cudaMemcpy(d_x, h_x, n*sizeof(*h_x), cudaMemcpyHostToDevice) );
|
||||||
CHECK("d_y = y",
|
CHECK("d_y = h_y",
|
||||||
cudaMemcpy(d_y, h_y, n*sizeof(*h_y), cudaMemcpyHostToDevice) );
|
cudaMemcpy(d_y, h_y, n*sizeof(*h_y), cudaMemcpyHostToDevice) );
|
||||||
nvtxRangePop();
|
nvtxRangePop();
|
||||||
#endif
|
#endif
|
||||||
@@ -191,6 +201,13 @@ int main(int argc, char **argv) {
|
|||||||
MEMINFO("d_x", d_x, sizeof(d_x));
|
MEMINFO("d_x", d_x, sizeof(d_x));
|
||||||
MEMINFO("d_y", d_y, sizeof(d_y));
|
MEMINFO("d_y", d_y, sizeof(d_y));
|
||||||
|
|
||||||
|
#ifndef MANAGED
|
||||||
|
MEMINFO("h_x", h_x, sizeof(h_x));
|
||||||
|
MEMINFO("h_y", h_y, sizeof(h_y));
|
||||||
|
MEMINFO("h_allx", h_allx, sizeof(h_allx));
|
||||||
|
MEMINFO("h_ally", h_ally, sizeof(h_ally));
|
||||||
|
#endif
|
||||||
|
|
||||||
k_start_time = MPI_Wtime();
|
k_start_time = MPI_Wtime();
|
||||||
nvtxRangePushA("cublasDaxpy");
|
nvtxRangePushA("cublasDaxpy");
|
||||||
CHECK("daxpy",
|
CHECK("daxpy",
|
||||||
@@ -237,9 +254,9 @@ int main(int argc, char **argv) {
|
|||||||
sum += d_ally[i];
|
sum += d_ally[i];
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
nvtxRangePushA("copyOutput");
|
nvtxRangePushA("copyAlly");
|
||||||
CHECK("h_ally = d_ally",
|
CHECK("h_ally = d_ally",
|
||||||
cudaMemcpy(h_ally, d_ally, n*sizeof(*h_ally),
|
cudaMemcpy(h_ally, d_ally, n*sizeof(*h_ally)*world_size,
|
||||||
cudaMemcpyDeviceToHost) );
|
cudaMemcpyDeviceToHost) );
|
||||||
nvtxRangePop();
|
nvtxRangePop();
|
||||||
for (int i=0; i<n*world_size; i++) {
|
for (int i=0; i<n*world_size; i++) {
|
||||||
@@ -252,10 +269,10 @@ int main(int argc, char **argv) {
|
|||||||
// cleanup
|
// cleanup
|
||||||
nvtxRangePushA("free");
|
nvtxRangePushA("free");
|
||||||
#ifndef MANAGED
|
#ifndef MANAGED
|
||||||
cudaFree(h_x);
|
cudaFreeHost(h_x);
|
||||||
cudaFree(h_y);
|
cudaFreeHost(h_y);
|
||||||
cudaFree(h_allx);
|
cudaFreeHost(h_allx);
|
||||||
cudaFree(h_ally);
|
cudaFreeHost(h_ally);
|
||||||
#endif
|
#endif
|
||||||
cudaFree(d_x);
|
cudaFree(d_x);
|
||||||
cudaFree(d_y);
|
cudaFree(d_y);
|
||||||
@@ -270,9 +287,12 @@ int main(int argc, char **argv) {
|
|||||||
cublasDestroy(handle);
|
cublasDestroy(handle);
|
||||||
MPI_Finalize();
|
MPI_Finalize();
|
||||||
|
|
||||||
printf("total time: %0.3f\n", end_time-start_time);
|
printf("%d/%d TIME total : %0.3f\n", world_rank, world_size,
|
||||||
printf("kernel time: %0.3f\n", k_end_time-k_start_time);
|
end_time-start_time);
|
||||||
printf("gather time: %0.3f\n", g_end_time-g_start_time);
|
printf("%d/%d TIME kernel: %0.3f\n", world_rank, world_size,
|
||||||
|
k_end_time-k_start_time);
|
||||||
|
printf("%d/%d TIME gather: %0.3f\n", world_rank, world_size,
|
||||||
|
g_end_time-g_start_time);
|
||||||
|
|
||||||
return EXIT_SUCCESS;
|
return EXIT_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user