Compare commits


5 Commits

Author       SHA1        Message                                                         Date
Bryce Allen  2139816f8c  WIP stencil example                                             2022-10-23 01:32:50 +00:00
Bryce Allen  349837e9c7  fix mpi init/set device order                                   2021-07-17 14:23:50 +00:00
Bryce Allen  df5f830a26  gt and cmake fixes                                              2021-07-16 22:07:00 -04:00
Bryce Allen  d791b81cb6  add gt port of mpi_daxpy                                        2021-07-16 21:36:50 -04:00
Bryce Allen  2434b39b53  add mpigatherinplace example for reproducing pmpi wrapper bug   2020-09-02 18:42:48 -04:00
6 changed files with 419 additions and 1 deletion

CMakeLists.txt (new file, +41)

@@ -0,0 +1,41 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)

# create project
project(mpi-daxpy-test)

# add dependencies
include(cmake/CPM.cmake)
CPMFindPackage(NAME gtensor
               GITHUB_REPOSITORY bd4/gtensor
               GIT_TAG "pr/sycl-include-refactor"
               OPTIONS "GTENSOR_ENABLE_BLAS ON")
find_package(MPI REQUIRED)

add_executable(mpi_daxpy_gt)
target_sources(mpi_daxpy_gt PRIVATE mpi_daxpy_gt.cc)
target_link_libraries(mpi_daxpy_gt gtensor::gtensor)
target_link_libraries(mpi_daxpy_gt gtensor::blas)
target_link_libraries(mpi_daxpy_gt MPI::MPI_CXX)

add_executable(mpi_stencil_gt)
target_sources(mpi_stencil_gt PRIVATE mpi_stencil_gt.cc)
target_link_libraries(mpi_stencil_gt gtensor::gtensor)
target_link_libraries(mpi_stencil_gt MPI::MPI_CXX)

if ("${GTENSOR_DEVICE}" STREQUAL "cuda")
  enable_language(CUDA)
  set_source_files_properties(mpi_daxpy_gt.cc
    TARGET_DIRECTORY mpi_daxpy_gt
    PROPERTIES LANGUAGE CUDA)
  set_source_files_properties(mpi_stencil_gt.cc
    TARGET_DIRECTORY mpi_stencil_gt
    PROPERTIES LANGUAGE CUDA)
else()
  set_source_files_properties(mpi_daxpy_gt.cc
    TARGET_DIRECTORY mpi_daxpy_gt
    PROPERTIES LANGUAGE CXX)
  set_source_files_properties(mpi_stencil_gt.cc
    TARGET_DIRECTORY mpi_stencil_gt
    PROPERTIES LANGUAGE CXX)
endif()

Makefile (+4, -1)

@@ -1,5 +1,5 @@
 .PHONY: all
-all: daxpy mpi_daxpy mpienv daxpy_nvtx mpi_daxpy_nvtx_managed mpi_daxpy_nvtx_unmanaged
+all: daxpy mpi_daxpy mpienv daxpy_nvtx mpi_daxpy_nvtx_managed mpi_daxpy_nvtx_unmanaged mpigatherinplace
 
 CCFLAGS = -std=c++11
 CUDA_HOME ?= $(CUDA_DIR)
@@ -22,6 +22,9 @@ mpi_daxpy_nvtx_unmanaged: mpi_daxpy_nvtx.cc cuda_error.h
 mpienv: mpienv.f90
 	mpif90 -o mpienv mpienv.f90
 
+mpigatherinplace: mpigatherinplace.f90
+	mpifort -o mpigatherinplace mpigatherinplace.f90
+
 .PHONY: clean
 clean:
 	rm -rf daxpy mpi_daxpy daxpy_nvtx mpi_daxpy_nvtx_managed mpi_daxpy_nvtx_unmanaged

cmake/CPM.cmake (new file, +21)

@@ -0,0 +1,21 @@
set(CPM_DOWNLOAD_VERSION 0.32.1)

if(CPM_SOURCE_CACHE)
  # Expand relative path. This is important if the provided path contains a tilde (~)
  get_filename_component(CPM_SOURCE_CACHE ${CPM_SOURCE_CACHE} ABSOLUTE)
  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
elseif(DEFINED ENV{CPM_SOURCE_CACHE})
  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
else()
  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
endif()

if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
  message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
  file(DOWNLOAD
       https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
       ${CPM_DOWNLOAD_LOCATION}
  )
endif()

include(${CPM_DOWNLOAD_LOCATION})

mpi_daxpy_gt.cc (new file, +97)

@@ -0,0 +1,97 @@
/*
 * mpi_daxpy_gt.cc
 *
 * Port of the mpi_daxpy example to gtensor / gt-blas, so a single source
 * can be used for all platforms.
 *
 * Created: 05/20/2019
 */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#include "gtensor/gtensor.h"
#include "gt-blas/blas.h"

// Map each MPI rank to a GPU; when ranks outnumber GPUs, require the rank
// count to be a multiple of the GPU count so devices are shared evenly.
void set_rank_device(int n_ranks, int rank) {
  int n_devices, device, ranks_per_device;

  n_devices = gt::backend::clib::device_get_count();
  if (n_ranks > n_devices) {
    if (n_ranks % n_devices != 0) {
      printf("ERROR: Number of ranks (%d) not a multiple of number of GPUs (%d)\n",
             n_ranks, n_devices);
      exit(EXIT_FAILURE);
    }
    ranks_per_device = n_ranks / n_devices;
    device = rank / ranks_per_device;
  } else {
    ranks_per_device = 1;
    device = rank;
  }

  gt::backend::clib::device_set(device);
}

int main(int argc, char **argv) {
  int n = 1024;
  int world_size, world_rank, device_id;
  uint32_t vendor_id;
  double a = 2.0;
  double sum = 0.0;

  MPI_Init(NULL, NULL);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  set_rank_device(world_size, world_rank);

  auto x = gt::empty<double>({n});
  auto y = gt::empty<double>({n});
  auto d_x = gt::empty_device<double>({n});
  auto d_y = gt::empty_device<double>({n});

  // initialize host data: x = 1..n, y = -(1..n)
  for (int i = 0; i < n; i++) {
    x[i] = i + 1;
    y[i] = -i - 1;
  }

  device_id = gt::backend::clib::device_get();
  vendor_id = gt::backend::clib::device_get_vendor_id(device_id);

  gt::blas::handle_t* h = gt::blas::create();

  // run y = a*x + y on the device via gt-blas, then copy back
  gt::copy(x, d_x);
  gt::copy(y, d_y);
  gt::blas::axpy(h, a, d_x, d_y);
  gt::synchronize();
  gt::copy(d_y, y);

  sum = 0.0;
  for (int i = 0; i < n; i++) {
    //printf("%f\n", y[i]);
    sum += y[i];
  }
  printf("%d/%d [%d:0x%08x] SUM = %f\n", world_rank, world_size, device_id,
         vendor_id, sum);

  MPI_Finalize();
  return EXIT_SUCCESS;
}
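With a = 2.0, x[i] = i+1 and y[i] = -(i+1), the axpy result is y[i] = 2*(i+1) - (i+1) = i+1, so every rank should print SUM = n*(n+1)/2 = 524800 for n = 1024, regardless of which GPU it ran on. A minimal host-only sketch of that check (plain C++, no gtensor/gt-blas or MPI; the 1e-9 tolerance is an arbitrary choice):

#include <cmath>
#include <cstdio>
#include <cstdlib>

// Host-only reference for the daxpy example above: y[i] = a*x[i] + y[i]
// with a = 2, x[i] = i+1, y[i] = -(i+1) reduces to y[i] = i+1, so the
// expected SUM is n*(n+1)/2.
int main() {
  const int n = 1024;
  const double a = 2.0;
  double sum = 0.0;
  for (int i = 0; i < n; i++) {
    double x = i + 1;
    double y = -(i + 1.0);
    y = a * x + y; // daxpy on the host
    sum += y;
  }
  double expected = n * (n + 1) / 2.0; // 524800 for n = 1024
  if (std::abs(sum - expected) > 1e-9) {
    printf("mismatch: got %f, expected %f\n", sum, expected);
    return EXIT_FAILURE;
  }
  printf("SUM = %f (expected %f)\n", sum, expected);
  return EXIT_SUCCESS;
}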

mpi_stencil_gt.cc (new file, +198)

@@ -0,0 +1,198 @@
/*
 * Test GPU aware MPI on different platforms using a simple
 * distributed 1d stencil as an example. Gtensor is used so
 * a single source can be used for all platforms.
 */

#include <cmath>

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#include "gtensor/gtensor.h"
#include "gtensor/reductions.h"

using namespace gt::placeholders;

// little hack to make code parameterizable on managed vs device memory
namespace gt
{
namespace ext
{
namespace detail
{

template <typename T, gt::size_type N, typename S = gt::space::device>
struct gthelper
{
  using gtensor = gt::gtensor<T, N, S>;
};

#ifdef GTENSOR_HAVE_DEVICE
template <typename T, gt::size_type N>
struct gthelper<T, N, gt::space::managed>
{
  using gtensor = gt::gtensor_container<gt::space::managed_vector<T>, N>;
};
#endif

} // namespace detail

template <typename T, gt::size_type N, typename S = gt::space::device>
using gtensor2 = typename detail::gthelper<T, N, S>::gtensor;

} // namespace ext
} // namespace gt

static const gt::gtensor<double, 1> stencil5 = {1.0 / 12.0, -2.0 / 3.0, 0.0,
                                                2.0 / 3.0, -1.0 / 12.0};

/*
 * Return unevaluated expression that calculates the stencil.
 *
 * Size of the result will be size of y minus 4 (the number of boundary points).
 */
inline auto stencil1d_5(const gt::gtensor_device<double, 1>& y,
                        const gt::gtensor<double, 1>& stencil)
{
  return stencil(0) * y.view(_s(0, -4)) + stencil(1) * y.view(_s(1, -3)) +
         stencil(2) * y.view(_s(2, -2)) + stencil(3) * y.view(_s(3, -1)) +
         stencil(4) * y.view(_s(4, _));
}
void set_rank_device(int n_ranks, int rank)
{
  int n_devices, device, ranks_per_device;

  n_devices = gt::backend::clib::device_get_count();
  if (n_ranks > n_devices) {
    if (n_ranks % n_devices != 0) {
      printf("ERROR: Number of ranks (%d) not a multiple of number of GPUs "
             "(%d)\n",
             n_ranks, n_devices);
      exit(EXIT_FAILURE);
    }
    ranks_per_device = n_ranks / n_devices;
    device = rank / ranks_per_device;
  } else {
    ranks_per_device = 1;
    device = rank;
  }

  gt::backend::clib::device_set(device);
}
void boundary_exchange(MPI_Comm comm, int world_size, int rank,
                       gt::gtensor_device<double, 1>& d_y, int n_bnd)
{
  double* d_y_data = gt::raw_pointer_cast(d_y.data());
  double* d_y_data_end = gt::raw_pointer_cast(d_y.data()) + d_y.size();

  MPI_Request req_l[2];
  MPI_Request req_r[2];

  int rank_l = rank - 1;
  int rank_r = rank + 1;

  if (rank_l >= 0) {
    printf("%d left\n", rank);
    // recv into the left ghost cells, send the first n_bnd interior points left
    MPI_Irecv(d_y_data, n_bnd, MPI_DOUBLE, rank_l, 123, comm, &req_l[0]);
    MPI_Isend(d_y_data + n_bnd, n_bnd, MPI_DOUBLE, rank_l, 456, comm,
              &req_l[1]);
  }
  if (rank_r < world_size) {
    printf("%d right\n", rank);
    // recv into the right ghost cells, send the last n_bnd interior points right
    MPI_Irecv(d_y_data_end - n_bnd, n_bnd, MPI_DOUBLE, rank_r, 456, comm,
              &req_r[0]);
    MPI_Isend(d_y_data_end - 2 * n_bnd, n_bnd, MPI_DOUBLE, rank_r, 123, comm,
              &req_r[1]);
  }

  int mpi_rval;
  if (rank_l >= 0) {
    printf("%d wait left\n", rank);
    mpi_rval = MPI_Waitall(2, req_l, MPI_STATUSES_IGNORE);
    if (mpi_rval != MPI_SUCCESS) {
      printf("send_l error: %d\n", mpi_rval);
    }
  }
  if (rank_r < world_size) {
    printf("%d wait right\n", rank);
    mpi_rval = MPI_Waitall(2, req_r, MPI_STATUSES_IGNORE);
    if (mpi_rval != MPI_SUCCESS) {
      printf("send_r error: %d\n", mpi_rval);
    }
  }
}
int main(int argc, char** argv)
{
  constexpr int n_global = 1024 * 1024 * 1024;
  constexpr int n_sten = 5;
  constexpr int n_bnd = (n_sten - 1) / 2;

  int world_size, world_rank, device_id;
  uint32_t vendor_id;

  MPI_Init(NULL, NULL);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  const int n_local = n_global / world_size;
  const int n_local_with_ghost = n_local + 2 * n_bnd;

  set_rank_device(world_size, world_rank);
  device_id = gt::backend::clib::device_get();
  vendor_id = gt::backend::clib::device_get_vendor_id(device_id);

  auto h_y = gt::empty<double>({n_local_with_ghost});
  auto d_y = gt::empty_device<double>({n_local_with_ghost});

  auto h_dydx_numeric = gt::empty<double>({n_local});
  auto h_dydx_actual = gt::empty<double>({n_local});
  auto d_dydx_numeric = gt::empty_device<double>({n_local});

  double lx = 8;
  double dx = lx / n_global;
  double lx_local = lx / world_size;
  double scale = n_global / lx;

  auto fn_x_cubed = [](double x) { return x * x * x; };
  auto fn_x_cubed_deriv = [](double x) { return 3 * x * x; };

  printf("%d Init\n", world_rank);
  double x_start = world_rank * lx_local;
  // fill local points including ghost cells, so the ghost cells at the ends of
  // the global domain also hold valid data; interior ghost cells are
  // overwritten by the boundary exchange below
  for (int i = 0; i < n_local_with_ghost; i++) {
    double xtmp = x_start + (i - n_bnd) * dx;
    h_y(i) = fn_x_cubed(xtmp);
  }
  for (int i = 0; i < n_local; i++) {
    h_dydx_actual(i) = fn_x_cubed_deriv(x_start + i * dx);
  }
  gt::copy(h_y, d_y);

  printf("%d Ex\n", world_rank);
  boundary_exchange(MPI_COMM_WORLD, world_size, world_rank, d_y, n_bnd);

  printf("%d Sten\n", world_rank);
  d_dydx_numeric = stencil1d_5(d_y, stencil5) * scale;

  printf("%d Copy\n", world_rank);
  gt::copy(d_dydx_numeric, h_dydx_numeric);

  printf("%d Err calc\n", world_rank);
  double err_norm = std::sqrt(gt::sum_squares(h_dydx_numeric - h_dydx_actual));

  printf("%d/%d [%d:0x%08x] err_norm = %f\n", world_rank, world_size, device_id,
         vendor_id, err_norm);

  MPI_Finalize();
  return EXIT_SUCCESS;
}
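The stencil5 coefficients are the standard fourth-order central difference for the first derivative, f'(x_j) ~= (y[j-2] - 8*y[j-1] + 8*y[j+1] - y[j+2]) / (12*dx), which is why stencil1d_5 drops two points at each end and why the result is multiplied by scale = n_global/lx = 1/dx. A host-only sketch of the same computation on plain std::vector data (deriv5 is an illustrative helper, not part of gtensor); the formula is exact for y = x^3, so the reported error is at rounding level:

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// 4th-order central difference on the host, mirroring stencil1d_5:
// output j uses inputs y[j..j+4], centered on y[j+2], so the result is
// 4 elements shorter than the input (2 ghost points per side).
std::vector<double> deriv5(const std::vector<double>& y, double dx) {
  const double c[5] = {1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0, -1.0 / 12.0};
  std::vector<double> dydx(y.size() - 4);
  for (std::size_t j = 0; j < dydx.size(); j++) {
    double acc = 0.0;
    for (int k = 0; k < 5; k++) {
      acc += c[k] * y[j + k];
    }
    dydx[j] = acc / dx; // same as multiplying by scale = 1/dx above
  }
  return dydx;
}

int main() {
  const int n = 64;
  const double lx = 8.0, dx = lx / n;
  std::vector<double> y(n + 4);
  for (int i = 0; i < n + 4; i++) {
    double x = (i - 2) * dx; // two ghost points on each side
    y[i] = x * x * x;
  }
  auto dydx = deriv5(y, dx);
  double err = 0.0;
  for (int j = 0; j < n; j++) {
    double x = j * dx;
    err += std::pow(dydx[j] - 3.0 * x * x, 2);
  }
  printf("err_norm = %e\n", std::sqrt(err));
  return 0;
}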

mpigatherinplace.f90 (new file, +58)

@@ -0,0 +1,58 @@
program mpigatherinplace
  use mpi
  implicit none

  integer :: rank, ierr, nmpi, i
  integer :: N, err
  real(kind=8), dimension(:), allocatable :: allx
  real :: asum, lsum

  N = 128*1024*1024

  call MPI_Init(ierr)
  if (ierr /= 0) then
    print *, 'Failed MPI_Init: ', ierr
    stop
  end if

  call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr)
  if (ierr /= 0) then
    print *, 'Failed MPI_COMM_RANK: ', ierr
    stop
  end if

  call MPI_COMM_SIZE(MPI_COMM_WORLD, nmpi, ierr)
  if (ierr /= 0) then
    print *, 'Failed MPI_COMM_SIZE: ', ierr
    stop
  end if

  allocate(allx(N*nmpi))

  lsum = 0
  do i=1, N
    allx(rank*N+i) = rank*i/N
    lsum = lsum + allx(rank*N+i)
  end do

  call MPI_Allgather(MPI_IN_PLACE, 0, MPI_DOUBLE, &
    &                allx, N, MPI_DOUBLE, MPI_COMM_WORLD, ierr)
  if (ierr /= 0) then
    print *, 'Failed MPI_Allgather: ', ierr
    stop
  end if

  asum = sum(allx)
  print *, rank, "/", nmpi, " ", lsum, " ", asum

  deallocate(allx)

  call MPI_Finalize(ierr)
  if (ierr /= 0) then
    print *, 'Failed MPI_Finalize: ', ierr
    stop
  end if

end program mpigatherinplace
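The Fortran repro calls MPI_Allgather with MPI_IN_PLACE as the send buffer: each rank fills its own N-element block of allx and the gather fills in the blocks owned by the other ranks; with MPI_IN_PLACE, the send count and send type arguments are ignored. A rough C++ sketch of the same call pattern (smaller N and simplified fill values, not a literal translation), which may be useful for checking a PMPI wrapper's handling of the in-place case outside of Fortran:

#include <mpi.h>
#include <cstdio>
#include <vector>

// Each rank owns block [rank*N, (rank+1)*N) of allx; the in-place Allgather
// distributes every rank's block to all ranks.
int main(int argc, char** argv) {
  const int N = 1024;
  int rank, nmpi;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nmpi);

  std::vector<double> allx(N * nmpi, 0.0);
  double lsum = 0.0;
  for (int i = 0; i < N; i++) {
    allx[rank * N + i] = rank + double(i) / N; // simple rank-dependent fill
    lsum += allx[rank * N + i];
  }

  // sendbuf = MPI_IN_PLACE: each rank's contribution is read from its own
  // block of the receive buffer; sendcount/sendtype are ignored.
  MPI_Allgather(MPI_IN_PLACE, 0, MPI_DOUBLE,
                allx.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);

  double asum = 0.0;
  for (double v : allx) {
    asum += v;
  }
  printf("%d/%d local sum %f global sum %f\n", rank, nmpi, lsum, asum);

  MPI_Finalize();
  return 0;
}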