WIP stencil example

fix mpi init/set device order
gt and cmake fixes
2022-10-23 01:32:50 +00:00 · 2021-07-17 14:23:50 +00:00 · 2021-07-16 22:07:00 -04:00 · 2021-07-16 21:36:50 -04:00 · 2020-09-02 18:42:48 -04:00
6 changed files with 419 additions and 1 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,41 @@
 cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 # create project
 project(mpi-daxpy-test)
 # add dependencies
 include(cmake/CPM.cmake)
 # NOTE(review): GIT_TAG names a branch, so the fetched gtensor revision can
 # change between configures; pin a commit hash once a suitable one is known.
 CPMFindPackage(NAME gtensor
                GITHUB_REPOSITORY bd4/gtensor
                GIT_TAG "pr/sycl-include-refactor"
                OPTIONS "GTENSOR_ENABLE_BLAS ON")
 find_package(MPI REQUIRED)
 # gtensor / gt-blas port of the MPI daxpy test
 add_executable(mpi_daxpy_gt)
 target_sources(mpi_daxpy_gt PRIVATE mpi_daxpy_gt.cc)
 target_link_libraries(mpi_daxpy_gt
                       PRIVATE gtensor::gtensor gtensor::blas MPI::MPI_CXX)
 # distributed 1d stencil test
 add_executable(mpi_stencil_gt)
 target_sources(mpi_stencil_gt PRIVATE mpi_stencil_gt.cc)
 target_link_libraries(mpi_stencil_gt
                       PRIVATE gtensor::gtensor MPI::MPI_CXX)
 # The .cc sources are compiled as CUDA when GTENSOR_DEVICE is "cuda" and as
 # plain C++ for every other value.
 if("${GTENSOR_DEVICE}" STREQUAL "cuda")
   enable_language(CUDA)
   set(mpi_gt_source_language CUDA)
 else()
   set(mpi_gt_source_language CXX)
 endif()
 set_source_files_properties(mpi_daxpy_gt.cc
                             TARGET_DIRECTORY mpi_daxpy_gt
                             PROPERTIES LANGUAGE ${mpi_gt_source_language})
 set_source_files_properties(mpi_stencil_gt.cc
                             TARGET_DIRECTORY mpi_stencil_gt
                             PROPERTIES LANGUAGE ${mpi_gt_source_language})
--- a/5
+++ b/5
@@ -1,5 +1,5 @@
 .PHONY: all
-all: daxpy mpi_daxpy mpienv daxpy_nvtx mpi_daxpy_nvtx_managed mpi_daxpy_nvtx_unmanaged
+all: daxpy mpi_daxpy mpienv daxpy_nvtx mpi_daxpy_nvtx_managed mpi_daxpy_nvtx_unmanaged mpigatherinplace
 CCFLAGS = -std=c++11
 CUDA_HOME ?= $(CUDA_DIR)
@@ -22,6 +22,9 @@ mpi_daxpy_nvtx_unmanaged: mpi_daxpy_nvtx.cc cuda_error.h
 # Fortran program built with the mpif90 compiler wrapper.
 mpienv: mpienv.f90
 	mpif90 -o mpienv mpienv.f90
 # Fortran program built with the mpifort compiler wrapper.
 mpigatherinplace: mpigatherinplace.f90
 	mpifort -o mpigatherinplace mpigatherinplace.f90
 .PHONY: clean
 # Remove every binary produced by the `all` target, including the Fortran
 # programs mpienv and mpigatherinplace.
 clean:
 	rm -rf daxpy mpi_daxpy mpienv daxpy_nvtx mpi_daxpy_nvtx_managed mpi_daxpy_nvtx_unmanaged mpigatherinplace
--- a/cmake/CPM.cmake
+++ b/cmake/CPM.cmake
@@ -0,0 +1,21 @@
 # Bootstrap for CPM.cmake: pick a location for the versioned CPM script
 # (CPM_SOURCE_CACHE variable, then CPM_SOURCE_CACHE environment variable, then
 # the build tree), download it there if it is missing, and include it.
 set(CPM_DOWNLOAD_VERSION 0.32.1)
 if(CPM_SOURCE_CACHE)
   # Expand relative path. This is important if the provided path contains a tilde (~)
   get_filename_component(CPM_SOURCE_CACHE "${CPM_SOURCE_CACHE}" ABSOLUTE)
   set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
 elseif(DEFINED ENV{CPM_SOURCE_CACHE})
   set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
 else()
   set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
 endif()
 if(NOT (EXISTS "${CPM_DOWNLOAD_LOCATION}"))
   message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
   file(DOWNLOAD
        https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
        "${CPM_DOWNLOAD_LOCATION}"
        STATUS cpm_download_status
   )
   # STATUS is a two-element list: numeric code (0 means success) and message.
   list(GET cpm_download_status 0 cpm_download_code)
   if(NOT cpm_download_code EQUAL 0)
     list(GET cpm_download_status 1 cpm_download_message)
     # Delete whatever was written so the EXISTS check above does not treat a
     # failed download as a valid script on the next configure.
     file(REMOVE "${CPM_DOWNLOAD_LOCATION}")
     message(FATAL_ERROR
             "Failed to download CPM.cmake v${CPM_DOWNLOAD_VERSION}: ${cpm_download_message}")
   endif()
 endif()
 include("${CPM_DOWNLOAD_LOCATION}")
--- a/mpi_daxpy_gt.cc
+++ b/mpi_daxpy_gt.cc
@@ -0,0 +1,97 @@
 /*
 * =====================================================================================
 *
 *       Filename:  mpi_daxpy_gt.cc
 *
 *    Description:  Port to gtensor / gt-blas
 *
 *        Version:  1.0
 *        Created:  05/20/2019 10:33:30 AM
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  YOUR NAME (), 
 *   Organization:  
 *
 * =====================================================================================
 */
 #include <mpi.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include "gtensor/gtensor.h"
 #include "gt-blas/blas.h"
 /*
  * Select the device used by this rank.
  *
  * With at most one rank per device, rank r uses device r. With more ranks
  * than devices, the rank count must be a multiple of the device count and
  * consecutive ranks are grouped onto the same device.
  *
  * n_ranks: total number of MPI ranks
  * rank:    this process's rank, 0 <= rank < n_ranks
  *
  * Prints an error and exits the process if no device is reported or if the
  * ranks cannot be divided evenly among the devices.
  */
 void set_rank_device(int n_ranks, int rank) {
     int n_devices = gt::backend::clib::device_get_count();
     // Guard the modulo/division below against a zero device count.
     if (n_devices <= 0) {
         printf("ERROR: No devices found (device count %d)\n", n_devices);
         exit(EXIT_FAILURE);
     }
     int device = rank;
     if (n_ranks > n_devices) {
         if (n_ranks % n_devices != 0) {
             printf("ERROR: Number of ranks (%d) not a multiple of number of GPUs (%d)\n",
                    n_ranks, n_devices);
             exit(EXIT_FAILURE);
         }
         int ranks_per_device = n_ranks / n_devices;
         device = rank / ranks_per_device;
     }
     gt::backend::clib::device_set(device);
 }
 /*
  * Each rank selects a device, computes y = a*x + y there through gt-blas for
  * x[i] = i+1 and y[i] = -(i+1), copies y back to the host and prints the sum
  * of its elements together with the rank, device id and vendor id.
  */
 int main(int argc, char **argv) {
     int n = 1024;
     int world_size, world_rank, device_id;
     uint32_t vendor_id;
     double a = 2.0;
     double sum = 0.0;
     // Hand the real command line to MPI instead of NULL so the library can
     // consume any launcher-specific arguments.
     MPI_Init(&argc, &argv);
     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
     // The device is chosen before any gtensor allocation is made.
     set_rank_device(world_size, world_rank);
     auto x = gt::empty<double>({n});
     auto y = gt::empty<double>({n});
     auto d_x = gt::empty_device<double>({n});
     auto d_y = gt::empty_device<double>({n});
     for (int i=0; i<n; i++) {
         x[i] =  i+1;
         y[i] = -i-1;
     }
     device_id = gt::backend::clib::device_get();
     vendor_id = gt::backend::clib::device_get_vendor_id(device_id);
     gt::blas::handle_t* h = gt::blas::create();
     gt::copy(x, d_x);
     gt::copy(y, d_y);
     gt::blas::axpy(h, a, d_x, d_y);
     gt::synchronize();
     gt::copy(d_y, y);
     // Release the BLAS handle now that the only BLAS call has completed.
     gt::blas::destroy(h);
     // With a = 2 each y[i] becomes i+1, so the expected sum is n*(n+1)/2.
     for (int i=0; i<n; i++) {
         sum += y[i];
     }
     printf("%d/%d [%d:0x%08x] SUM = %f\n", world_rank, world_size, device_id, vendor_id, sum);
     MPI_Finalize();
     return EXIT_SUCCESS;
 }
--- a/mpi_stencil_gt.cc
+++ b/mpi_stencil_gt.cc
@@ -0,0 +1,198 @@
 /*
 * Test GPU aware MPI on different platforms using a simple
 * distributed 1d stencil as an example. Gtensor is used so
 * a single source can be used for all platforms.
 */
 #include <cmath>
 #include <mpi.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include "gtensor/gtensor.h"
 #include "gtensor/reductions.h"
 using namespace gt::placeholders;
 // little hack to make code parameterizable on managed vs device memory
 namespace gt
 {
 namespace ext
 {
 namespace detail
 {
 template <typename T, gt::size_type N, typename S = gt::space::device>
 struct gthelper
 {
  using gtensor = gt::gtensor<T, N, S>;
 };
 #ifdef GTENSOR_HAVE_DEVICE
 template <typename T, gt::size_type N>
 struct gthelper<T, N, gt::space::managed>
 {
  using gtensor = gt::gtensor_container<gt::space::managed_vector<T>, N>;
 };
 #endif
 } // namespace detail
 template <typename T, gt::size_type N, typename S = gt::space::device>
 using gtensor2 = typename detail::gthelper<T, N, S>::gtensor;
 } // namespace ext
 } // namespace gt
 // Five coefficients {1/12, -2/3, 0, 2/3, -1/12}: the weights of the standard
 // five-point central-difference approximation of a first derivative on a
 // unit-spaced grid. The result still has to be scaled by 1/dx by the user.
 static const gt::gtensor<double, 1> stencil5 = {1.0 / 12.0, -2.0 / 3.0, 0.0,
                                                2.0 / 3.0, -1.0 / 12.0};
 /*
 * Return unevaluated expression that calculates the stencil.
 *
 * The expression is the sum of five shifted views of y, each multiplied by
 * the matching element of `stencil` (elements 0 through 4 are read).
 *
 * y:       input values including 2 boundary (ghost) points on each side
 * stencil: the five stencil coefficients
 *
 * Size of the result will be size of y minus 4 (the number of boundary points).
 */
 inline auto stencil1d_5(const gt::gtensor_device<double, 1>& y,
                        const gt::gtensor<double, 1>& stencil)
 {
  return stencil(0) * y.view(_s(0, -4)) + stencil(1) * y.view(_s(1, -3)) +
         stencil(2) * y.view(_s(2, -2)) + stencil(3) * y.view(_s(3, -1)) +
         stencil(4) * y.view(_s(4, _));
 }
 /*
 * Select the device used by this rank.
 *
 * With at most one rank per device, rank r uses device r. With more ranks
 * than devices, the rank count must be a multiple of the device count and
 * consecutive ranks are grouped onto the same device.
 *
 * Prints an error and exits the process if no device is reported or if the
 * ranks cannot be divided evenly among the devices.
 */
 void set_rank_device(int n_ranks, int rank)
 {
  int n_devices = gt::backend::clib::device_get_count();
  // Guard the modulo/division below against a zero device count.
  if (n_devices <= 0) {
    printf("ERROR: No devices found (device count %d)\n", n_devices);
    exit(EXIT_FAILURE);
  }
  int device = rank;
  if (n_ranks > n_devices) {
    if (n_ranks % n_devices != 0) {
      printf("ERROR: Number of ranks (%d) not a multiple of number of GPUs "
             "(%d)\n",
             n_ranks, n_devices);
      exit(EXIT_FAILURE);
    }
    int ranks_per_device = n_ranks / n_devices;
    device = rank / ranks_per_device;
  }
  gt::backend::clib::device_set(device);
 }
 /*
 * Exchange n_bnd boundary values with the left (rank - 1) and right
 * (rank + 1) neighbors, passing pointers into d_y directly to MPI.
 *
 * Layout used for d_y: the first n_bnd and last n_bnd elements are ghost
 * points that are received into; the n_bnd elements next to each ghost
 * region are the interior values that are sent.
 *
 * Tag 123 marks data travelling to the right (received into a left ghost
 * region), tag 456 marks data travelling to the left.
 *
 * Ranks without a neighbor on one side skip that side. Returns after all
 * posted requests have completed; MPI_Waitall failures are only printed.
 */
 void boundary_exchange(MPI_Comm comm, int world_size, int rank,
                       gt::gtensor_device<double, 1>& d_y, int n_bnd)
 {
  double* d_y_data = gt::raw_pointer_cast(d_y.data());
  double* d_y_data_end = d_y_data + d_y.size();
  MPI_Request req_l[2];
  MPI_Request req_r[2];
  const int rank_l = rank - 1;
  const int rank_r = rank + 1;
  const bool has_left = rank_l >= 0;
  const bool has_right = rank_r < world_size;
  if (has_left) {
    printf("%d left\n", rank);
    // receive into the left ghost points, send the leftmost interior points
    MPI_Irecv(d_y_data, n_bnd, MPI_DOUBLE, rank_l, 123, comm,
              &req_l[0]);
    MPI_Isend(d_y_data + n_bnd, n_bnd, MPI_DOUBLE, rank_l, 456, comm,
              &req_l[1]);
  }
  if (has_right) {
    printf("%d right\n", rank);
    // receive into the right ghost points, send the rightmost interior
    // points, which sit just before the right ghost region
    MPI_Irecv(d_y_data_end - n_bnd, n_bnd, MPI_DOUBLE, rank_r, 456,
              comm, &req_r[0]);
    MPI_Isend(d_y_data_end - 2 * n_bnd, n_bnd, MPI_DOUBLE, rank_r, 123,
              comm, &req_r[1]);
  }
  int mpi_rval;
  if (has_left) {
    printf("%d wait left\n", rank);
    mpi_rval = MPI_Waitall(2, req_l, MPI_STATUSES_IGNORE);
    if (mpi_rval != MPI_SUCCESS) {
      printf("send_l error: %d\n", mpi_rval);
    }
  }
  if (has_right) {
    printf("%d wait right\n", rank);
    mpi_rval = MPI_Waitall(2, req_r, MPI_STATUSES_IGNORE);
    if (mpi_rval != MPI_SUCCESS) {
      printf("send_r error: %d\n", mpi_rval);
    }
  }
 }
 int main(int argc, char** argv)
 {
  constexpr int n_global = 1024 * 1024 * 1024;
  constexpr int n_sten = 5;
  constexpr int n_bnd = (n_sten - 1) / 2;
  int world_size, world_rank, device_id;
  uint32_t vendor_id;
  MPI_Init(NULL, NULL);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  const int n_local = n_global / world_size;
  const int n_local_with_ghost = n_local + 2 * n_bnd;
  set_rank_device(world_size, world_rank);
  device_id = gt::backend::clib::device_get();
  vendor_id = gt::backend::clib::device_get_vendor_id(device_id);
  auto h_y = gt::empty<double>({n_local_with_ghost});
  auto d_y = gt::empty_device<double>({n_local_with_ghost});
  auto h_dydx_numeric = gt::empty<double>({n_local});
  auto h_dydx_actual = gt::empty<double>({n_local});
  auto d_dydx_numeric = gt::empty_device<double>({n_local});
  double lx = 8;
  double dx = lx / n_global;
  double lx_local = lx / world_rank;
  double scale = n_global / lx;
  auto fn_x_cubed = [](double x) { return x * x * x; };
  auto fn_x_cubed_deriv = [](double x) { return 3 * x * x; };
  printf("%d Init\n", world_rank);
  double x_start = world_rank * lx_local;
  for (int i = 0; i < n_local; i++) {
    double xtmp = x_start + i * dx;
    h_y(i + n_bnd) = fn_x_cubed(xtmp);
    h_dydx_actual(i) = fn_x_cubed_deriv(xtmp);
  }
  printf("%d Ex\n", world_rank);
  boundary_exchange(MPI_COMM_WORLD, world_size, world_rank, d_y, n_bnd);
  printf("%d Sten\n", world_rank);
  //d_dydx_numeric = stencil1d_5(d_y, stencil5) * scale;
  printf("Copy\n");
  gt::copy(d_dydx_numeric, h_dydx_numeric);
  printf("Err calc\n");
  double err_norm = std::sqrt(gt::sum_squares(h_dydx_numeric - h_dydx_actual));
  printf("%d/%d [%d:0x%08x] err_norm = %f\n", world_rank, world_size, device_id,
         vendor_id, err_norm);
  MPI_Finalize();
  return EXIT_SUCCESS;
 }
--- a/mpigatherinplace.f90
+++ b/mpigatherinplace.f90
@@ -0,0 +1,58 @@
 ! Every rank fills its own N-element slice of a shared-size array and then
 ! calls MPI_Allgather with MPI_IN_PLACE so that all ranks end up with every
 ! slice. Prints the rank, the rank count, the local slice sum and the sum of
 ! the whole gathered array.
 program mpigatherinplace
 use mpi
 implicit none
 integer :: rank, ierr, nmpi, i
 integer :: N
 ! 64-bit copies used for index and value arithmetic: N*nmpi, rank*N+i and
 ! rank*i exceed the range of a 32-bit default integer once 16 or more ranks
 ! are used.
 integer(kind=8) :: rank8, offset
 real(kind=8), dimension(:), allocatable :: allx
 ! Accumulate in the same precision as the data.
 real(kind=8) :: asum, lsum
 N = 128*1024*1024
 call MPI_Init(ierr)
 if (ierr /= 0) then
    print *, 'Failed MPI_Init: ', ierr
    stop
 end if
 call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr)
 if (ierr /= 0) then
    print *, 'Failed MPI_COMM_RANK: ', ierr
    stop
 end if
 call MPI_COMM_SIZE(MPI_COMM_WORLD, nmpi, ierr)
 if (ierr /= 0) then
    print *, 'Failed MPI_COMM_SIZE: ', ierr
    stop
 end if
 allocate(allx(int(N, kind=8)*nmpi))
 rank8 = int(rank, kind=8)
 offset = rank8*N
 lsum = 0
 do i=1, N
    ! Integer (truncating) division, carried out in 64 bits.
    allx(offset+i) = rank8*i/N
    lsum = lsum + allx(offset+i)
 end do
 ! With MPI_IN_PLACE the send count and send type arguments are ignored.
 ! NOTE(review): MPI_DOUBLE is the C datatype name; MPI_DOUBLE_PRECISION is
 ! the usual Fortran handle for real(kind=8) data. Left unchanged so the
 ! call stays exactly as written in the reproducer -- confirm.
 call MPI_Allgather(MPI_IN_PLACE, 0, MPI_DOUBLE, &
                 & allx, N, MPI_DOUBLE, MPI_COMM_WORLD, ierr)
 if (ierr /= 0) then
    print *, 'Failed MPI_Allgather: ', ierr
    stop
 end if
 asum = sum(allx)
 print *, rank, "/", nmpi, " ", lsum, " ", asum
 deallocate(allx)
 call MPI_Finalize(ierr)
 if (ierr /= 0) then
    print *, 'Failed MPI_Finalize: ', ierr
    stop
 end if
 end program mpigatherinplace
Author	SHA1	Message	Date
Bryce Allen	2139816f8c	WIP stencil example	2022-10-23 01:32:50 +00:00
Bryce Allen	349837e9c7	fix mpi init/set device order	2021-07-17 14:23:50 +00:00
Bryce Allen	df5f830a26	gt and cmake fixes	2021-07-16 22:07:00 -04:00
Bryce Allen	d791b81cb6	add gt port of mpi_daxpy	2021-07-16 21:36:50 -04:00
Bryce Allen	2434b39b53	add mpigatherinplace example for reproducing pmpi wrapper bug	2020-09-02 18:42:48 -04:00