diff --git a/mpi_stencil2d_sycl.cc b/mpi_stencil2d_sycl.cc index 60903ce..cba10d4 100644 --- a/mpi_stencil2d_sycl.cc +++ b/mpi_stencil2d_sycl.cc @@ -16,13 +16,116 @@ #include #include +#include #include #include #include #include +#include #include "sycl/sycl.hpp" +constexpr int idx2(int n, int row, int col) +{ + return row + col * n; +} + +template +class array2d +{ +public: + using value_type = T; + using allocator_type = sycl::usm_allocator; + using pointer = typename std::allocator_traits::pointer; + using const_pointer = + typename std::allocator_traits::const_pointer; + using reference = value_type&; + using const_reference = const value_type&; + using size_type = typename std::allocator_traits::size_type; + + array2d(sycl::queue& q, allocator_type& a, int nrows, int ncols) + : q_(q), allocator_(a), nrows_(nrows), ncols_(ncols) + { + data_ = a.allocate(nrows * ncols); + } + + ~array2d() { allocator_.deallocate(data_, nrows_ * ncols_); } + + reference operator()(int row, int col) + { + return data_[idx2(nrows_, row, col)]; + } + + const_reference operator()(int row, int col) const + { + return data_[idx2(nrows_, row, col)]; + } + + reference operator[](size_type i) { return data_[i]; } + + const_reference operator[](size_type i) const { return data_[i]; } + + auto copy_in_slice(array2d& source, int dim, int start, int end) + { + assert(dim == 0 || dim == 1); + if (dim == 0) { + auto range = sycl::range<2>(source.ncols_, end - start); + auto e = q_.submit([&](sycl::handler& cgh) { + cgh.parallel_for(range, [=](sycl::item<2> item) { + int row = item.get_id(1); + int col = item.get_id(0); + (*this)(start + row, col) = source(row, col); + }); + }); + return e; + } else { + auto range = sycl::range<2>(end - start, source.nrows_); + auto e = q_.submit([&](sycl::handler& cgh) { + cgh.parallel_for(range, [=](sycl::item<2> item) { + int row = item.get_id(1); + int col = item.get_id(0); + (*this)(row, start + col) = source(row, col); + }); + }); + return e; + } + } + + void copy_out_slice(array2d& dest, int dim, int start, + int end) const + { + assert(dim == 0 || dim == 1); + if (dim == 0) { + auto range = sycl::range<2>(dest.ncols_, end - start); + auto e = q_.submit([&](sycl::handler& cgh) { + cgh.parallel_for(range, [=](sycl::item<2> item) { + int row = item.get_id(1); + int col = item.get_id(0); + dest(row, col) = (*this)(start + row, col); + }); + }); + return e; + } else { + auto range = sycl::range<2>(end - start, dest.nrows_); + auto e = q_.submit([&](sycl::handler& cgh) { + cgh.parallel_for(range, [=](sycl::item<2> item) { + int row = item.get_id(1); + int col = item.get_id(0); + dest(row, col) = (*this)(row, start + col); + }); + }); + return e; + } + } + +private: + sycl::queue& q_; + allocator_type& allocator_; + int nrows_; + int ncols_; + T* data_; +}; + inline void check(const char* fname, int line, int mpi_rval) { if (mpi_rval != MPI_SUCCESS) { @@ -36,11 +139,6 @@ inline void check(const char* fname, int line, int mpi_rval) static constexpr double stencil5[] = {1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0, -1.0 / 12.0}; -constexpr int idx2(int n, int row, int col) -{ - return row + col * n; -} - /* * Calculate 1d stencil of second dimension of 2d array on GPU. Out array must * be contiguous column major nrows x ncols array, while in array must be