Use MPI_IN_PLACE in one of the allgathers

Try to reproduce the nsys segfault seen when running GENE, where the
segfault's backtrace (BT) points to an in-place allgather.
This commit is contained in:
Bryce Allen
2020-09-02 16:34:37 -04:00
parent cff437eace
commit 7a1d10349e

View File

@@ -267,6 +267,10 @@ int main(int argc, char **argv) {
nvtxRangePop(); nvtxRangePop();
printf("%d/%d SUM = %f\n", world_rank, world_size, sum); printf("%d/%d SUM = %f\n", world_rank, world_size, sum);
nvtxRangePushA("copyPrepAllxInplace");
cudaMemcpy(d_allx+(world_rank*n), d_x, n*sizeof(*d_x), cudaMemcpyDeviceToDevice);
nvtxRangePop();
#ifdef BARRIER #ifdef BARRIER
b_start_time = MPI_Wtime(); b_start_time = MPI_Wtime();
nvtxRangePushA("mpiBarrier"); nvtxRangePushA("mpiBarrier");
@@ -278,7 +282,7 @@ int main(int argc, char **argv) {
g_start_time = MPI_Wtime(); g_start_time = MPI_Wtime();
nvtxRangePushA("mpiAllGather"); nvtxRangePushA("mpiAllGather");
nvtxRangePushA("x"); nvtxRangePushA("x");
MPI_Allgather(d_x, n, MPI_DOUBLE, d_allx, n, MPI_DOUBLE, MPI_COMM_WORLD); MPI_Allgather(MPI_IN_PLACE, n, MPI_DOUBLE, d_allx, n, MPI_DOUBLE, MPI_COMM_WORLD);
nvtxRangePop(); nvtxRangePop();
nvtxRangePushA("y"); nvtxRangePushA("y");
MPI_Allgather(d_y, n, MPI_DOUBLE, d_ally, n, MPI_DOUBLE, MPI_COMM_WORLD); MPI_Allgather(d_y, n, MPI_DOUBLE, d_ally, n, MPI_DOUBLE, MPI_COMM_WORLD);