add gt port of mpi_daxpy

main
Bryce Allen 4 years ago
parent 2434b39b53
commit d791b81cb6

@ -0,0 +1,30 @@
# 3.18 is required for set_source_files_properties(... TARGET_DIRECTORY ...)
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)

# create project
project(mpi-daxpy-test)

# Default gtensor's BLAS support ON before the dependency is configured.
set(GTENSOR_ENABLE_BLAS ON CACHE BOOL "Enable gtblas")

# add dependencies
include(cmake/CPM.cmake)
CPMFindPackage(NAME gtensor
               GITHUB_REPOSITORY wdmapp/gtensor
               GIT_TAG main
               OPTIONS "GTENSOR_ENABLE_BLAS ON")
find_package(MPI REQUIRED)

add_executable(mpi_daxpy_gt)
target_sources(mpi_daxpy_gt PRIVATE mpi_daxpy_gt.cc)
# PRIVATE: these libraries are build requirements of the executable only;
# nothing links against mpi_daxpy_gt, so nothing needs to inherit them.
target_link_libraries(mpi_daxpy_gt PRIVATE gtensor::gtensor MPI::MPI_CXX)

# For the CUDA backend the .cc file must be compiled as CUDA so gtensor's
# device kernels are generated; otherwise compile it as plain C++.
if("${GTENSOR_DEVICE}" STREQUAL "cuda")
  set_source_files_properties(mpi_daxpy_gt.cc
                              TARGET_DIRECTORY mpi_daxpy_gt
                              PROPERTIES LANGUAGE CUDA)
else()
  set_source_files_properties(mpi_daxpy_gt.cc
                              TARGET_DIRECTORY mpi_daxpy_gt
                              PROPERTIES LANGUAGE CXX)
endif()

@ -0,0 +1,21 @@
# Bootstrap CPM.cmake: download a pinned release into the build tree (or a
# shared source cache) and include it.
set(CPM_DOWNLOAD_VERSION 0.32.1)

if(CPM_SOURCE_CACHE)
  # Expand relative path. This is important if the provided path contains a tilde (~)
  get_filename_component(CPM_SOURCE_CACHE "${CPM_SOURCE_CACHE}" ABSOLUTE)
  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
elseif(DEFINED ENV{CPM_SOURCE_CACHE})
  # NOTE: environment is read at configure time only.
  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
else()
  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
endif()

if(NOT EXISTS "${CPM_DOWNLOAD_LOCATION}")
  message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
  file(DOWNLOAD
       https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
       "${CPM_DOWNLOAD_LOCATION}"
       STATUS cpm_download_status
  )
  # file(DOWNLOAD) does not abort the configure on failure; it leaves an
  # empty/partial file behind, so the include() below would fail with a
  # confusing error on this and every later run. Check the status, remove
  # the partial file so a re-configure retries, and fail loudly.
  list(GET cpm_download_status 0 cpm_download_code)
  if(NOT cpm_download_code EQUAL 0)
    list(GET cpm_download_status 1 cpm_download_msg)
    file(REMOVE "${CPM_DOWNLOAD_LOCATION}")
    message(FATAL_ERROR "Failed to download CPM.cmake: ${cpm_download_msg}")
  endif()
endif()

include("${CPM_DOWNLOAD_LOCATION}")

@ -0,0 +1,95 @@
/*
* =====================================================================================
*
 * Filename: mpi_daxpy_gt.cc
*
* Description: Port to gtensor / gt-blas
*
* Version: 1.0
* Created: 05/20/2019 10:33:30 AM
* Revision: none
* Compiler: gcc
*
* Author: YOUR NAME (),
* Organization:
*
* =====================================================================================
*/
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include "gtensor/gtensor.h"
#include "gt-blas/blas.h"
/*
 * Map this MPI rank onto one of the visible GPU devices and make it the
 * active device via gt::backend::device_set().
 *
 * n_ranks: total number of ranks on this node's communicator
 * rank:    this process's rank
 *
 * When there are more ranks than devices, n_ranks must divide evenly by
 * the device count (ranks are packed consecutively onto devices);
 * otherwise the process exits with EXIT_FAILURE. With at least as many
 * devices as ranks, rank i simply uses device i.
 *
 * NOTE(review): assumes all ranks passed in are on the same node — confirm
 * at the call site (here it is called with the world communicator size).
 */
void set_rank_device(int n_ranks, int rank) {
  // (removed unused locals `memory_per_rank` and the dead `ranks_per_device`
  //  assignment in the else branch of the original)
  int n_devices, device;

  n_devices = gt::backend::device_get_count();

  if (n_ranks > n_devices) {
    if (n_ranks % n_devices != 0) {
      printf("ERROR: Number of ranks (%d) not a multiple of number of GPUs (%d)\n",
             n_ranks, n_devices);
      exit(EXIT_FAILURE);
    }
    // Consecutive ranks share a device: ranks_per_device ranks per GPU.
    device = rank / (n_ranks / n_devices);
  } else {
    device = rank;
  }

  gt::backend::device_set(device);
}
/*
 * Per-rank DAXPY demo: computes y = a*x + y (a = 2.0) on this rank's GPU
 * via gt-blas, then prints the per-rank sum of y.
 *
 * With x[i] = i+1 and y[i] = -(i+1), the result y[i] = i+1, so each rank
 * should print SUM = n*(n+1)/2 = 524800.0 for n = 1024.
 */
int main(int argc, char **argv) {
  int n = 1024;
  int world_size, world_rank;
  double a = 2.0;
  double sum = 0.0;

  // Host buffers can be allocated before device selection.
  auto x = gt::empty<double>({n});
  auto y = gt::empty<double>({n});

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  for (int i = 0; i < n; i++) {
    x[i] = i + 1;
    y[i] = -i - 1;
  }

  // Select this rank's GPU *before* any device allocation. The original
  // allocated d_x/d_y at the top of main, which placed them on the default
  // device rather than the one chosen for this rank.
  set_rank_device(world_size, world_rank);

  auto d_x = gt::empty_device<double>({n});
  auto d_y = gt::empty_device<double>({n});

  gt::blas::handle_t* h = gt::blas::create();

  gt::copy(x, d_x);
  gt::copy(y, d_y);

  // d_y = a * d_x + d_y
  gt::blas::axpy(h, a, d_x, d_y);
  gt::synchronize();

  gt::copy(d_y, y);

  sum = 0.0;
  for (int i = 0; i < n; i++) {
    sum += y[i];
  }
  printf("%d/%d SUM = %f\n", world_rank, world_size, sum);

  // Release the BLAS handle (the original leaked it).
  gt::blas::destroy(h);
  MPI_Finalize();

  return EXIT_SUCCESS;
}
Loading…
Cancel
Save