|
|
|
@ -16,13 +16,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
#include <cassert>
|
|
|
|
#include <cassert>
|
|
|
|
#include <cmath>
|
|
|
|
#include <cmath>
|
|
|
|
|
|
|
|
#include <memory>
|
|
|
|
#include <mpi.h>
|
|
|
|
#include <mpi.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <time.h>
|
|
|
|
#include <time.h>
|
|
|
|
|
|
|
|
#include <type_traits>
|
|
|
|
|
|
|
|
|
|
|
|
#include "sycl/sycl.hpp"
|
|
|
|
#include "sycl/sycl.hpp"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
constexpr int idx2(int n, int row, int col)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
return row + col * n;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T, sycl::usm::alloc AllocType>
|
|
|
|
|
|
|
|
class array2d
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
using value_type = T;
|
|
|
|
|
|
|
|
using allocator_type = sycl::usm_allocator<T, AllocType>;
|
|
|
|
|
|
|
|
using pointer = typename std::allocator_traits<allocator_type>::pointer;
|
|
|
|
|
|
|
|
using const_pointer =
|
|
|
|
|
|
|
|
typename std::allocator_traits<allocator_type>::const_pointer;
|
|
|
|
|
|
|
|
using reference = value_type&;
|
|
|
|
|
|
|
|
using const_reference = const value_type&;
|
|
|
|
|
|
|
|
using size_type = typename std::allocator_traits<allocator_type>::size_type;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
array2d(sycl::queue& q, allocator_type& a, int nrows, int ncols)
|
|
|
|
|
|
|
|
: q_(q), allocator_(a), nrows_(nrows), ncols_(ncols)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
data_ = a.allocate(nrows * ncols);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
~array2d() { allocator_.deallocate(data_, nrows_ * ncols_); }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reference operator()(int row, int col)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
return data_[idx2(nrows_, row, col)];
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const_reference operator()(int row, int col) const
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
return data_[idx2(nrows_, row, col)];
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reference operator[](size_type i) { return data_[i]; }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const_reference operator[](size_type i) const { return data_[i]; }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
auto copy_in_slice(array2d<T, AllocType>& source, int dim, int start, int end)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
assert(dim == 0 || dim == 1);
|
|
|
|
|
|
|
|
if (dim == 0) {
|
|
|
|
|
|
|
|
auto range = sycl::range<2>(source.ncols_, end - start);
|
|
|
|
|
|
|
|
auto e = q_.submit([&](sycl::handler& cgh) {
|
|
|
|
|
|
|
|
cgh.parallel_for(range, [=](sycl::item<2> item) {
|
|
|
|
|
|
|
|
int row = item.get_id(1);
|
|
|
|
|
|
|
|
int col = item.get_id(0);
|
|
|
|
|
|
|
|
(*this)(start + row, col) = source(row, col);
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
return e;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
auto range = sycl::range<2>(end - start, source.nrows_);
|
|
|
|
|
|
|
|
auto e = q_.submit([&](sycl::handler& cgh) {
|
|
|
|
|
|
|
|
cgh.parallel_for(range, [=](sycl::item<2> item) {
|
|
|
|
|
|
|
|
int row = item.get_id(1);
|
|
|
|
|
|
|
|
int col = item.get_id(0);
|
|
|
|
|
|
|
|
(*this)(row, start + col) = source(row, col);
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
return e;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void copy_out_slice(array2d<T, AllocType>& dest, int dim, int start,
|
|
|
|
|
|
|
|
int end) const
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
assert(dim == 0 || dim == 1);
|
|
|
|
|
|
|
|
if (dim == 0) {
|
|
|
|
|
|
|
|
auto range = sycl::range<2>(dest.ncols_, end - start);
|
|
|
|
|
|
|
|
auto e = q_.submit([&](sycl::handler& cgh) {
|
|
|
|
|
|
|
|
cgh.parallel_for(range, [=](sycl::item<2> item) {
|
|
|
|
|
|
|
|
int row = item.get_id(1);
|
|
|
|
|
|
|
|
int col = item.get_id(0);
|
|
|
|
|
|
|
|
dest(row, col) = (*this)(start + row, col);
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
return e;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
auto range = sycl::range<2>(end - start, dest.nrows_);
|
|
|
|
|
|
|
|
auto e = q_.submit([&](sycl::handler& cgh) {
|
|
|
|
|
|
|
|
cgh.parallel_for(range, [=](sycl::item<2> item) {
|
|
|
|
|
|
|
|
int row = item.get_id(1);
|
|
|
|
|
|
|
|
int col = item.get_id(0);
|
|
|
|
|
|
|
|
dest(row, col) = (*this)(row, start + col);
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
return e;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
|
|
|
sycl::queue& q_;
|
|
|
|
|
|
|
|
allocator_type& allocator_;
|
|
|
|
|
|
|
|
int nrows_;
|
|
|
|
|
|
|
|
int ncols_;
|
|
|
|
|
|
|
|
T* data_;
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
inline void check(const char* fname, int line, int mpi_rval)
|
|
|
|
inline void check(const char* fname, int line, int mpi_rval)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
if (mpi_rval != MPI_SUCCESS) {
|
|
|
|
if (mpi_rval != MPI_SUCCESS) {
|
|
|
|
@ -36,11 +139,6 @@ inline void check(const char* fname, int line, int mpi_rval)
|
|
|
|
static constexpr double stencil5[] = {1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0,
|
|
|
|
static constexpr double stencil5[] = {1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0,
|
|
|
|
-1.0 / 12.0};
|
|
|
|
-1.0 / 12.0};
|
|
|
|
|
|
|
|
|
|
|
|
constexpr int idx2(int n, int row, int col)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
return row + col * n;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
/*
|
|
|
|
* Calculate 1d stencil of second dimension of 2d array on GPU. Out array must
|
|
|
|
* Calculate 1d stencil of second dimension of 2d array on GPU. Out array must
|
|
|
|
* be contiguous column major nrows x ncols array, while in array must be
|
|
|
|
* be contiguous column major nrows x ncols array, while in array must be
|
|
|
|
|