Compare commits


5 Commits

Author       SHA1        Message                                                         Date
Bryce Allen  2139816f8c  WIP stencil example                                             2022-10-23 01:32:50 +00:00
Bryce Allen  349837e9c7  fix mpi init/set device order                                   2021-07-17 14:23:50 +00:00
Bryce Allen  df5f830a26  gt and cmake fixes                                              2021-07-16 22:07:00 -04:00
Bryce Allen  d791b81cb6  add gt port of mpi_daxpy                                        2021-07-16 21:36:50 -04:00
Bryce Allen  2434b39b53  add mpigatherinplace example for reproducing pmpi wrapper bug   2020-09-02 18:42:48 -04:00
6 changed files with 419 additions and 1 deletion

CMakeLists.txt (new file, +41)

@@ -0,0 +1,41 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)

# create project
project(mpi-daxpy-test)

# add dependencies
include(cmake/CPM.cmake)
CPMFindPackage(NAME gtensor
               GITHUB_REPOSITORY bd4/gtensor
               GIT_TAG "pr/sycl-include-refactor"
               OPTIONS "GTENSOR_ENABLE_BLAS ON")
find_package(MPI REQUIRED)

add_executable(mpi_daxpy_gt)
target_sources(mpi_daxpy_gt PRIVATE mpi_daxpy_gt.cc)
target_link_libraries(mpi_daxpy_gt gtensor::gtensor)
target_link_libraries(mpi_daxpy_gt gtensor::blas)
target_link_libraries(mpi_daxpy_gt MPI::MPI_CXX)

add_executable(mpi_stencil_gt)
target_sources(mpi_stencil_gt PRIVATE mpi_stencil_gt.cc)
target_link_libraries(mpi_stencil_gt gtensor::gtensor)
target_link_libraries(mpi_stencil_gt MPI::MPI_CXX)

if ("${GTENSOR_DEVICE}" STREQUAL "cuda")
  enable_language(CUDA)
  set_source_files_properties(mpi_daxpy_gt.cc
    TARGET_DIRECTORY mpi_daxpy_gt
    PROPERTIES LANGUAGE CUDA)
  set_source_files_properties(mpi_stencil_gt.cc
    TARGET_DIRECTORY mpi_stencil_gt
    PROPERTIES LANGUAGE CUDA)
else()
  set_source_files_properties(mpi_daxpy_gt.cc
    TARGET_DIRECTORY mpi_daxpy_gt
    PROPERTIES LANGUAGE CXX)
  set_source_files_properties(mpi_stencil_gt.cc
    TARGET_DIRECTORY mpi_stencil_gt
    PROPERTIES LANGUAGE CXX)
endif()

Makefile (+4, -1)

@@ -1,5 +1,5 @@
 .PHONY: all
-all: daxpy mpi_daxpy mpienv daxpy_nvtx mpi_daxpy_nvtx_managed mpi_daxpy_nvtx_unmanaged
+all: daxpy mpi_daxpy mpienv daxpy_nvtx mpi_daxpy_nvtx_managed mpi_daxpy_nvtx_unmanaged mpigatherinplace
 
 CCFLAGS = -std=c++11
 CUDA_HOME ?= $(CUDA_DIR)
@@ -22,6 +22,9 @@ mpi_daxpy_nvtx_unmanaged: mpi_daxpy_nvtx.cc cuda_error.h
 mpienv: mpienv.f90
 	mpif90 -o mpienv mpienv.f90
 
+mpigatherinplace: mpigatherinplace.f90
+	mpifort -o mpigatherinplace mpigatherinplace.f90
+
 .PHONY: clean
 clean:
 	rm -rf daxpy mpi_daxpy daxpy_nvtx mpi_daxpy_nvtx_managed mpi_daxpy_nvtx_unmanaged

cmake/CPM.cmake (new file, +21)

@@ -0,0 +1,21 @@
set(CPM_DOWNLOAD_VERSION 0.32.1)

if(CPM_SOURCE_CACHE)
  # Expand relative path. This is important if the provided path contains a tilde (~)
  get_filename_component(CPM_SOURCE_CACHE ${CPM_SOURCE_CACHE} ABSOLUTE)
  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
elseif(DEFINED ENV{CPM_SOURCE_CACHE})
  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
else()
  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
endif()

if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
  message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
  file(DOWNLOAD
       https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
       ${CPM_DOWNLOAD_LOCATION}
  )
endif()

include(${CPM_DOWNLOAD_LOCATION})

mpi_daxpy_gt.cc (new file, +97)

@@ -0,0 +1,97 @@
/*
 * mpi_daxpy_gt.cc
 *
 * Port of the mpi_daxpy example to gtensor / gt-blas, so a single source
 * can be used for all platforms.
 *
 * Created: 05/20/2019
 */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#include "gtensor/gtensor.h"
#include "gt-blas/blas.h"

// Map each MPI rank to a GPU; when ranks outnumber GPUs, require the rank
// count to be a multiple of the GPU count so devices are shared evenly.
void set_rank_device(int n_ranks, int rank) {
  int n_devices, device, ranks_per_device;

  n_devices = gt::backend::clib::device_get_count();
  if (n_ranks > n_devices) {
    if (n_ranks % n_devices != 0) {
      printf("ERROR: Number of ranks (%d) not a multiple of number of GPUs (%d)\n",
             n_ranks, n_devices);
      exit(EXIT_FAILURE);
    }
    ranks_per_device = n_ranks / n_devices;
    device = rank / ranks_per_device;
  } else {
    ranks_per_device = 1;
    device = rank;
  }

  gt::backend::clib::device_set(device);
}

int main(int argc, char **argv) {
  int n = 1024;
  int world_size, world_rank, device_id;
  uint32_t vendor_id;
  double a = 2.0;
  double sum = 0.0;

  MPI_Init(NULL, NULL);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  set_rank_device(world_size, world_rank);

  auto x = gt::empty<double>({n});
  auto y = gt::empty<double>({n});
  auto d_x = gt::empty_device<double>({n});
  auto d_y = gt::empty_device<double>({n});

  // initialize host data: x = 1..n, y = -(1..n)
  for (int i = 0; i < n; i++) {
    x[i] = i + 1;
    y[i] = -i - 1;
  }

  device_id = gt::backend::clib::device_get();
  vendor_id = gt::backend::clib::device_get_vendor_id(device_id);

  gt::blas::handle_t* h = gt::blas::create();

  // run y = a*x + y on the device via gt-blas, then copy back
  gt::copy(x, d_x);
  gt::copy(y, d_y);
  gt::blas::axpy(h, a, d_x, d_y);
  gt::synchronize();
  gt::copy(d_y, y);

  sum = 0.0;
  for (int i = 0; i < n; i++) {
    //printf("%f\n", y[i]);
    sum += y[i];
  }
  printf("%d/%d [%d:0x%08x] SUM = %f\n", world_rank, world_size, device_id,
         vendor_id, sum);

  MPI_Finalize();
  return EXIT_SUCCESS;
}
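With a = 2.0, x[i] = i+1 and y[i] = -(i+1), the axpy result is y[i] = 2*(i+1) - (i+1) = i+1, so every rank should print SUM = n*(n+1)/2 = 524800 for n = 1024, regardless of which GPU it ran on. A minimal host-only sketch of that check (plain C++, no gtensor/gt-blas or MPI; the 1e-9 tolerance is an arbitrary choice):

#include <cmath>
#include <cstdio>
#include <cstdlib>

// Host-only reference for the daxpy example above: y[i] = a*x[i] + y[i]
// with a = 2, x[i] = i+1, y[i] = -(i+1) reduces to y[i] = i+1, so the
// expected SUM is n*(n+1)/2.
int main() {
  const int n = 1024;
  const double a = 2.0;
  double sum = 0.0;
  for (int i = 0; i < n; i++) {
    double x = i + 1;
    double y = -(i + 1.0);
    y = a * x + y; // daxpy on the host
    sum += y;
  }
  double expected = n * (n + 1) / 2.0; // 524800 for n = 1024
  if (std::abs(sum - expected) > 1e-9) {
    printf("mismatch: got %f, expected %f\n", sum, expected);
    return EXIT_FAILURE;
  }
  printf("SUM = %f (expected %f)\n", sum, expected);
  return EXIT_SUCCESS;
}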

mpi_stencil_gt.cc (new file, +198)

@@ -0,0 +1,198 @@
/*
 * Test GPU aware MPI on different platforms using a simple
 * distributed 1d stencil as an example. Gtensor is used so
 * a single source can be used for all platforms.
 */

#include <cmath>

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#include "gtensor/gtensor.h"
#include "gtensor/reductions.h"

using namespace gt::placeholders;

// little hack to make code parameterizable on managed vs device memory
namespace gt
{
namespace ext
{
namespace detail
{

template <typename T, gt::size_type N, typename S = gt::space::device>
struct gthelper
{
  using gtensor = gt::gtensor<T, N, S>;
};

#ifdef GTENSOR_HAVE_DEVICE
template <typename T, gt::size_type N>
struct gthelper<T, N, gt::space::managed>
{
  using gtensor = gt::gtensor_container<gt::space::managed_vector<T>, N>;
};
#endif

} // namespace detail

template <typename T, gt::size_type N, typename S = gt::space::device>
using gtensor2 = typename detail::gthelper<T, N, S>::gtensor;

} // namespace ext
} // namespace gt

static const gt::gtensor<double, 1> stencil5 = {1.0 / 12.0, -2.0 / 3.0, 0.0,
                                                2.0 / 3.0, -1.0 / 12.0};

/*
 * Return unevaluated expression that calculates the stencil.
 *
 * Size of the result will be size of y minus 4 (the number of boundary points).
 */
inline auto stencil1d_5(const gt::gtensor_device<double, 1>& y,
                        const gt::gtensor<double, 1>& stencil)
{
  return stencil(0) * y.view(_s(0, -4)) + stencil(1) * y.view(_s(1, -3)) +
         stencil(2) * y.view(_s(2, -2)) + stencil(3) * y.view(_s(3, -1)) +
         stencil(4) * y.view(_s(4, _));
}
void set_rank_device(int n_ranks, int rank)
{
  int n_devices, device, ranks_per_device;

  n_devices = gt::backend::clib::device_get_count();
  if (n_ranks > n_devices) {
    if (n_ranks % n_devices != 0) {
      printf("ERROR: Number of ranks (%d) not a multiple of number of GPUs "
             "(%d)\n",
             n_ranks, n_devices);
      exit(EXIT_FAILURE);
    }
    ranks_per_device = n_ranks / n_devices;
    device = rank / ranks_per_device;
  } else {
    ranks_per_device = 1;
    device = rank;
  }

  gt::backend::clib::device_set(device);
}
void boundary_exchange(MPI_Comm comm, int world_size, int rank,
                       gt::gtensor_device<double, 1>& d_y, int n_bnd)
{
  double* d_y_data = gt::raw_pointer_cast(d_y.data());
  double* d_y_data_end = gt::raw_pointer_cast(d_y.data()) + d_y.size();

  MPI_Request req_l[2];
  MPI_Request req_r[2];

  int rank_l = rank - 1;
  int rank_r = rank + 1;

  if (rank_l >= 0) {
    printf("%d left\n", rank);
    // recv into the left ghost cells, send the first n_bnd interior points left
    MPI_Irecv(d_y_data, n_bnd, MPI_DOUBLE, rank_l, 123, comm, &req_l[0]);
    MPI_Isend(d_y_data + n_bnd, n_bnd, MPI_DOUBLE, rank_l, 456, comm,
              &req_l[1]);
  }
  if (rank_r < world_size) {
    printf("%d right\n", rank);
    // recv into the right ghost cells, send the last n_bnd interior points right
    MPI_Irecv(d_y_data_end - n_bnd, n_bnd, MPI_DOUBLE, rank_r, 456, comm,
              &req_r[0]);
    MPI_Isend(d_y_data_end - 2 * n_bnd, n_bnd, MPI_DOUBLE, rank_r, 123, comm,
              &req_r[1]);
  }

  int mpi_rval;
  if (rank_l >= 0) {
    printf("%d wait left\n", rank);
    mpi_rval = MPI_Waitall(2, req_l, MPI_STATUSES_IGNORE);
    if (mpi_rval != MPI_SUCCESS) {
      printf("send_l error: %d\n", mpi_rval);
    }
  }
  if (rank_r < world_size) {
    printf("%d wait right\n", rank);
    mpi_rval = MPI_Waitall(2, req_r, MPI_STATUSES_IGNORE);
    if (mpi_rval != MPI_SUCCESS) {
      printf("send_r error: %d\n", mpi_rval);
    }
  }
}
int main(int argc, char** argv)
{
  constexpr int n_global = 1024 * 1024 * 1024;
  constexpr int n_sten = 5;
  constexpr int n_bnd = (n_sten - 1) / 2;

  int world_size, world_rank, device_id;
  uint32_t vendor_id;

  MPI_Init(NULL, NULL);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  const int n_local = n_global / world_size;
  const int n_local_with_ghost = n_local + 2 * n_bnd;

  set_rank_device(world_size, world_rank);
  device_id = gt::backend::clib::device_get();
  vendor_id = gt::backend::clib::device_get_vendor_id(device_id);

  auto h_y = gt::empty<double>({n_local_with_ghost});
  auto d_y = gt::empty_device<double>({n_local_with_ghost});

  auto h_dydx_numeric = gt::empty<double>({n_local});
  auto h_dydx_actual = gt::empty<double>({n_local});
  auto d_dydx_numeric = gt::empty_device<double>({n_local});

  double lx = 8;
  double dx = lx / n_global;
  double lx_local = lx / world_size;
  double scale = n_global / lx;

  auto fn_x_cubed = [](double x) { return x * x * x; };
  auto fn_x_cubed_deriv = [](double x) { return 3 * x * x; };

  printf("%d Init\n", world_rank);
  double x_start = world_rank * lx_local;
  // fill local points including ghost cells, so the ghost cells at the ends of
  // the global domain also hold valid data; interior ghost cells are
  // overwritten by the boundary exchange below
  for (int i = 0; i < n_local_with_ghost; i++) {
    double xtmp = x_start + (i - n_bnd) * dx;
    h_y(i) = fn_x_cubed(xtmp);
  }
  for (int i = 0; i < n_local; i++) {
    h_dydx_actual(i) = fn_x_cubed_deriv(x_start + i * dx);
  }
  gt::copy(h_y, d_y);

  printf("%d Ex\n", world_rank);
  boundary_exchange(MPI_COMM_WORLD, world_size, world_rank, d_y, n_bnd);

  printf("%d Sten\n", world_rank);
  d_dydx_numeric = stencil1d_5(d_y, stencil5) * scale;

  printf("%d Copy\n", world_rank);
  gt::copy(d_dydx_numeric, h_dydx_numeric);

  printf("%d Err calc\n", world_rank);
  double err_norm = std::sqrt(gt::sum_squares(h_dydx_numeric - h_dydx_actual));

  printf("%d/%d [%d:0x%08x] err_norm = %f\n", world_rank, world_size, device_id,
         vendor_id, err_norm);

  MPI_Finalize();
  return EXIT_SUCCESS;
}
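The stencil5 coefficients are the standard fourth-order central difference for the first derivative, f'(x_j) ~= (y[j-2] - 8*y[j-1] + 8*y[j+1] - y[j+2]) / (12*dx), which is why stencil1d_5 drops two points at each end and why the result is multiplied by scale = n_global/lx = 1/dx. A host-only sketch of the same computation on plain std::vector data (deriv5 is an illustrative helper, not part of gtensor); the formula is exact for y = x^3, so the reported error is at rounding level:

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// 4th-order central difference on the host, mirroring stencil1d_5:
// output j uses inputs y[j..j+4], centered on y[j+2], so the result is
// 4 elements shorter than the input (2 ghost points per side).
std::vector<double> deriv5(const std::vector<double>& y, double dx) {
  const double c[5] = {1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0, -1.0 / 12.0};
  std::vector<double> dydx(y.size() - 4);
  for (std::size_t j = 0; j < dydx.size(); j++) {
    double acc = 0.0;
    for (int k = 0; k < 5; k++) {
      acc += c[k] * y[j + k];
    }
    dydx[j] = acc / dx; // same as multiplying by scale = 1/dx above
  }
  return dydx;
}

int main() {
  const int n = 64;
  const double lx = 8.0, dx = lx / n;
  std::vector<double> y(n + 4);
  for (int i = 0; i < n + 4; i++) {
    double x = (i - 2) * dx; // two ghost points on each side
    y[i] = x * x * x;
  }
  auto dydx = deriv5(y, dx);
  double err = 0.0;
  for (int j = 0; j < n; j++) {
    double x = j * dx;
    err += std::pow(dydx[j] - 3.0 * x * x, 2);
  }
  printf("err_norm = %e\n", std::sqrt(err));
  return 0;
}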

mpigatherinplace.f90 (new file, +58)

@@ -0,0 +1,58 @@
program mpigatherinplace
  use mpi
  implicit none

  integer :: rank, ierr, nmpi, i
  integer :: N, err
  real(kind=8), dimension(:), allocatable :: allx
  real :: asum, lsum

  N = 128*1024*1024

  call MPI_Init(ierr)
  if (ierr /= 0) then
    print *, 'Failed MPI_Init: ', ierr
    stop
  end if

  call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr)
  if (ierr /= 0) then
    print *, 'Failed MPI_COMM_RANK: ', ierr
    stop
  end if

  call MPI_COMM_SIZE(MPI_COMM_WORLD, nmpi, ierr)
  if (ierr /= 0) then
    print *, 'Failed MPI_COMM_SIZE: ', ierr
    stop
  end if

  allocate(allx(N*nmpi))

  lsum = 0
  do i=1, N
    allx(rank*N+i) = rank*i/N
    lsum = lsum + allx(rank*N+i)
  end do

  call MPI_Allgather(MPI_IN_PLACE, 0, MPI_DOUBLE, &
    &                allx, N, MPI_DOUBLE, MPI_COMM_WORLD, ierr)
  if (ierr /= 0) then
    print *, 'Failed MPI_Allgather: ', ierr
    stop
  end if

  asum = sum(allx)
  print *, rank, "/", nmpi, " ", lsum, " ", asum

  deallocate(allx)

  call MPI_Finalize(ierr)
  if (ierr /= 0) then
    print *, 'Failed MPI_Finalize: ', ierr
    stop
  end if

end program mpigatherinplace
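The Fortran repro calls MPI_Allgather with MPI_IN_PLACE as the send buffer: each rank fills its own N-element block of allx and the gather fills in the blocks owned by the other ranks; with MPI_IN_PLACE, the send count and send type arguments are ignored. A rough C++ sketch of the same call pattern (smaller N and simplified fill values, not a literal translation), which may be useful for checking a PMPI wrapper's handling of the in-place case outside of Fortran:

#include <mpi.h>
#include <cstdio>
#include <vector>

// Each rank owns block [rank*N, (rank+1)*N) of allx; the in-place Allgather
// distributes every rank's block to all ranks.
int main(int argc, char** argv) {
  const int N = 1024;
  int rank, nmpi;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nmpi);

  std::vector<double> allx(N * nmpi, 0.0);
  double lsum = 0.0;
  for (int i = 0; i < N; i++) {
    allx[rank * N + i] = rank + double(i) / N; // simple rank-dependent fill
    lsum += allx[rank * N + i];
  }

  // sendbuf = MPI_IN_PLACE: each rank's contribution is read from its own
  // block of the receive buffer; sendcount/sendtype are ignored.
  MPI_Allgather(MPI_IN_PLACE, 0, MPI_DOUBLE,
                allx.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);

  double asum = 0.0;
  for (double v : allx) {
    asum += v;
  }
  printf("%d/%d local sum %f global sum %f\n", rank, nmpi, lsum, asum);

  MPI_Finalize();
  return 0;
}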