#ifndef __CS_DISPATCH_H__
#define __CS_DISPATCH_H__

#if defined(SYCL_LANGUAGE_VERSION)
#include <sycl/sycl.hpp>
#endif

#if defined(__NVCC__)
#include "cs_alge_cuda.cuh"
#endif

/* With SYCL, the sum argument passed to reduction kernels is an opaque
   reducer type, so "auto" is used; otherwise it is a plain double. */

#if defined(SYCL_LANGUAGE_VERSION)
#define CS_DISPATCH_SUM_DOUBLE auto
#else
#define CS_DISPATCH_SUM_DOUBLE double
#endif

/* Type of sum used in face-based assembly loops. */

typedef enum {

  CS_DISPATCH_SUM_SIMPLE,  /* Plain sum; safe when no two concurrent
                              iterations write to the same destination */
  CS_DISPATCH_SUM_ATOMIC   /* Atomic sum; required when concurrent
                              iterations may update the same destination */

} cs_dispatch_sum_type_t;
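/* The sum type is what a face-assembly loop passes to cs_dispatch_sum()
   (defined at the end of this header): CS_DISPATCH_SUM_SIMPLE maps to a
   plain "+=", CS_DISPATCH_SUM_ATOMIC to an atomic add, so the same loop
   body works both with the host's conflict-free face numbering and with
   the device's fully concurrent loops. */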
 
/* Base dispatch context mixin (CRTP): derived context classes provide the
   actual parallel_for implementations; the defaults below forward the face
   loops to parallel_for. */

template <class Derived>
class cs_dispatch_context_mixin {

public:

  /* Iterate over n elements; must be redefined by the derived class. */
  template <class F, class... Args>
  decltype(auto)
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) = delete;

  /* Assembly loop over all interior faces of a mesh. */
  template <class F, class... Args>
  decltype(auto)
  parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args);

  /* Assembly loop over all boundary faces of a mesh. */
  template <class F, class... Args>
  decltype(auto)
  parallel_for_b_faces(const cs_mesh_t* m, F&& f, Args&&... args);

  /* Parallel reduction with a simple sum; must be redefined by the
     derived class. */
  template <class F, class... Args>
  decltype(auto)
  parallel_for_reduce_sum
    (cs_lnum_t n, double& sum, F&& f, Args&&... args) = delete;

  /* Query the sum type to use in interior-face assembly loops. */
  bool
  try_get_parallel_for_i_faces_sum_type(const cs_mesh_t*         m,
                                        cs_dispatch_sum_type_t&  st);

  /* Query the sum type to use in boundary-face assembly loops. */
  bool
  try_get_parallel_for_b_faces_sum_type(const cs_mesh_t*         m,
                                        cs_dispatch_sum_type_t&  st);

};
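/* Minimal sketch (illustration only, not part of the library): a serial
   context satisfying the mixin contract would look like

     class my_serial_context
       : public cs_dispatch_context_mixin<my_serial_context> {
     public:
       template <class F, class... Args>
       bool parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
         for (cs_lnum_t i = 0; i < n; i++) f(i, args...);
         return true;
       }
       template <class F, class... Args>
       bool parallel_for_reduce_sum(cs_lnum_t n, double& sum,
                                    F&& f, Args&&... args) {
         sum = 0;
         for (cs_lnum_t i = 0; i < n; i++) f(i, sum, args...);
         return true;
       }
     };

   The inherited defaults then supply the face loops and sum-type queries. */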
 
/* Default implementation of parallel_for_i_faces: loop over all interior
   faces using the derived context's parallel_for. */

template <class Derived>
template <class F, class... Args>
decltype(auto) cs_dispatch_context_mixin<Derived>::parallel_for_i_faces
  (const cs_mesh_t* m, F&& f, Args&&... args) {
  return static_cast<Derived*>(this)->parallel_for(m->n_i_faces,
                                                   static_cast<F&&>(f),
                                                   static_cast<Args&&>(args)...);
}
 
/* Default implementation of parallel_for_b_faces: loop over all boundary
   faces using the derived context's parallel_for. */

template <class Derived>
template <class F, class... Args>
decltype(auto) cs_dispatch_context_mixin<Derived>::parallel_for_b_faces
  (const cs_mesh_t* m, F&& f, Args&&... args) {
  return static_cast<Derived*>(this)->parallel_for(m->n_b_faces,
                                                   static_cast<F&&>(f),
                                                   static_cast<Args&&>(args)...);
}
 
/* Default sum type for face assembly: a plain sum. */

template <class Derived>
bool cs_dispatch_context_mixin<Derived>::try_get_parallel_for_i_faces_sum_type
  ([[maybe_unused]] const cs_mesh_t*   m,
   cs_dispatch_sum_type_t&             st) {
  st = CS_DISPATCH_SUM_SIMPLE;
  return true;
}

template <class Derived>
bool cs_dispatch_context_mixin<Derived>::try_get_parallel_for_b_faces_sum_type
  ([[maybe_unused]] const cs_mesh_t*   m,
   cs_dispatch_sum_type_t&             st) {
  st = CS_DISPATCH_SUM_SIMPLE;
  return true;
}
 
/* Host (CPU) context: loops run serially or with OpenMP, depending on size. */

class cs_host_context : public cs_dispatch_context_mixin<cs_host_context> {

private:

  cs_lnum_t  n_min_for_threads;  /* Under this size, loops stay single-threaded */

public:

  cs_host_context()
    : n_min_for_threads(CS_THR_MIN)
  {}

  /* Set the minimum loop size for CPU multithreading. */
  void
  set_n_min_for_cpu_threads(cs_lnum_t  n) {
    this->n_min_for_threads = n;
  }

  /* Get the minimum loop size for CPU multithreading. */
  cs_lnum_t
  n_min_for_cpu_threads(void) {
    return this->n_min_for_threads;
  }

  /* Iterate over n elements with an OpenMP parallel loop. */
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
#   pragma omp parallel for  if (n >= n_min_for_threads)
    for (cs_lnum_t i = 0; i < n; i++) {
      f(i, args...);
    }
    return true;
  }

  /* Loop over the interior faces of a mesh using the face numbering's
     groups and threads, so that threads of a group never update the same
     cell and a simple sum is safe. */
  template <class F, class... Args>
  bool
  parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    const int n_i_groups = m->i_face_numbering->n_groups;
    const int n_i_threads = m->i_face_numbering->n_threads;
    const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index;

    for (int g_id = 0; g_id < n_i_groups; g_id++) {
      #pragma omp parallel for
      for (int t_id = 0; t_id < n_i_threads; t_id++) {
        for (cs_lnum_t f_id = i_group_index[(t_id * n_i_groups + g_id) * 2];
             f_id < i_group_index[(t_id * n_i_groups + g_id) * 2 + 1];
             f_id++) {
          f(f_id, args...);
        }
      }
    }
    return true;
  }

  /* Loop over the boundary faces of a mesh using the face numbering's
     per-thread ranges. */
  template <class F, class... Args>
  bool
  parallel_for_b_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    const int n_b_threads = m->b_face_numbering->n_threads;
    const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index;

    #pragma omp parallel for
    for (int t_id = 0; t_id < n_b_threads; t_id++) {
      for (cs_lnum_t f_id = b_group_index[t_id*2];
           f_id < b_group_index[t_id*2 + 1];
           f_id++) {
        f(f_id, args...);
      }
    }
    return true;
  }
 
  /* Parallel reduction with a simple sum, using an OpenMP reduction clause. */
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    sum = 0;
#   pragma omp parallel for reduction(+:sum) if (n >= n_min_for_threads)
    for (cs_lnum_t i = 0; i < n; i++) {
      f(i, sum, args...);
    }
    return true;
  }

  /* On the host, the face numbering avoids write conflicts, so a simple
     sum is sufficient for both interior and boundary face assembly. */
  bool
  try_get_parallel_for_i_faces_sum_type([[maybe_unused]] const cs_mesh_t*  m,
                                        cs_dispatch_sum_type_t&            st) {
    st = CS_DISPATCH_SUM_SIMPLE;
    return true;
  }

  bool
  try_get_parallel_for_b_faces_sum_type([[maybe_unused]] const cs_mesh_t*  m,
                                        cs_dispatch_sum_type_t&            st) {
    st = CS_DISPATCH_SUM_SIMPLE;
    return true;
  }

};
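/* Usage sketch (illustration only, not part of the header), assuming a
   populated cs_mesh_t *m and cell-based arrays a and b:

     cs_host_context ctx;
     double s = 0;

     ctx.parallel_for(m->n_cells, [=](cs_lnum_t i) {
       a[i] += b[i];
     });

     ctx.parallel_for_reduce_sum(m->n_cells, s,
                                 [=](cs_lnum_t i, double& sum) {
       sum += a[i];
     });
*/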
 
#if defined(__NVCC__)

/* CUDA kernel launched by cs_device_context::parallel_for:
   grid-stride loop over n elements. */

template <class F, class... Args>
__global__ void cs_cuda_kernel_parallel_for(cs_lnum_t n, F f, Args... args) {
  // grid-stride loop
  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    f(id, args...);
  }
}

/* CUDA kernel for parallel_for_reduce_sum: each thread accumulates into
   shared memory, then the block is reduced to one value in b_res. */

template <class F, class... Args>
__global__ void
cs_cuda_kernel_parallel_for_reduce_sum(cs_lnum_t   n,
                                       double     *b_res,
                                       F           f,
                                       Args...     args) {
  extern double __shared__ stmp[];

  cs_lnum_t tid = threadIdx.x;
  stmp[tid] = 0.;

  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    f(id, stmp[tid], args...);
  }

  switch (blockDim.x) {
  case 1024:
    cs_blas_cuda_block_reduce_sum<1024, 1>(stmp, tid, b_res);
    break;
  case 512:
    cs_blas_cuda_block_reduce_sum<512, 1>(stmp, tid, b_res);
    break;
  case 256:
    cs_blas_cuda_block_reduce_sum<256, 1>(stmp, tid, b_res);
    break;
  case 128:
    cs_blas_cuda_block_reduce_sum<128, 1>(stmp, tid, b_res);
    break;
  default:
    assert(0);
  }
}
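/* Note: each thread accumulates its partial sum in the shared array stmp;
   cs_blas_cuda_block_reduce_sum then reduces within the block and writes one
   value per block to b_res. A second kernel (cs_blas_cuda_reduce_single_block,
   launched by the device context below) reduces those per-block values to the
   final sum. */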
 
/* Device context, based on CUDA. */

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:

  long          grid_size_;   /* Grid size; if < 1, it is computed from the
                                 number of elements at launch time */
  long          block_size_;  /* Block size */
  cudaStream_t  stream_;      /* Associated CUDA stream */
  int           device_;      /* Associated CUDA device id */

  bool          use_gpu_;     /* Should the GPU be used? */

public:

  /* Constructors: default to the main stream, a 256-thread block size,
     and the currently active CUDA device. */

  cs_device_context(void)
    : grid_size_(0), block_size_(256), stream_(cs_cuda_get_stream(0)),
      device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  cs_device_context(long          grid_size,
                    long          block_size,
                    cudaStream_t  stream,
                    int           device)
    : grid_size_(grid_size), block_size_(block_size), stream_(stream),
      device_(device), use_gpu_(true)
  {}

  cs_device_context(long          grid_size,
                    long          block_size,
                    cudaStream_t  stream)
    : grid_size_(grid_size), block_size_(block_size), stream_(stream),
      device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  cs_device_context(long  grid_size,
                    long  block_size)
    : grid_size_(grid_size), block_size_(block_size),
      stream_(cs_cuda_get_stream(0)), device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  cs_device_context(cudaStream_t  stream)
    : grid_size_(0), block_size_(256), stream_(stream), device_(0),
      use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }
 
  /* Change the grid and block size configuration. */
  void
  set_cuda_grid(long  grid_size,
                long  block_size) {
    this->grid_size_ = grid_size;
    this->block_size_ = block_size;
  }

  /* Change the associated CUDA stream. */
  void
  set_cuda_stream(cudaStream_t  stream) {
    this->stream_ = stream;
  }

  /* Change the associated CUDA stream, by id. */
  void
  set_cuda_stream(int  stream_id) {
    this->stream_ = cs_cuda_get_stream(stream_id);
  }

  /* Get the associated CUDA stream. */
  cudaStream_t
  cuda_stream(void) {
    return this->stream_;
  }

  /* Change the associated CUDA device. */
  void
  set_cuda_device(int  device) {
    this->device_ = device;
  }

  /* Enable or disable GPU execution for this context. */
  void
  set_use_gpu(bool  use_gpu) {
    this->use_gpu_ = use_gpu;
  }

  /* Is the GPU actually used by this context? */
  bool
  use_gpu(void) {
    return (device_ >= 0 && use_gpu_);
  }

  /* Preferred allocation mode for arrays used with this context. */
  cs_alloc_mode_t
  alloc_mode(bool readable_on_cpu) {
    cs_alloc_mode_t amode = CS_ALLOC_HOST;
    if (device_ >= 0 && use_gpu_) {
      if (readable_on_cpu)
        amode = CS_ALLOC_HOST_DEVICE_SHARED;
      else
        amode = CS_ALLOC_DEVICE;
    }
    return (amode);
  }
 
  /* Try to launch n iterations on the GPU; return false if the device
     is not usable so another context can take over. */
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
      (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }

  /* Assembly loop over all interior faces: on the GPU, all faces are
     handled by one kernel, so sums must be atomic. */
  template <class F, class... Args>
  bool
  parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    const cs_lnum_t n = m->n_i_faces;
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
      (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }
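  /* Worked example of the grid sizing above: with n = 1000 elements and
     block_size_ = 256, 1000 % 256 != 0, so l_grid_size = 1000/256 + 1 = 4,
     i.e. the rounded-up quotient ceil(n / block_size_). */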
 
  /* Launch a kernel with a sum reduction on the GPU; return false if the
     device is not usable. The reduction is done in two stages: one partial
     sum per block, then a single-block kernel to combine them. */
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    sum = 0;
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    double *r_grid_, *r_reduce_;
    cs_blas_cuda_get_2_stage_reduce_buffers
      (n, 1, l_grid_size, r_grid_, r_reduce_);

    int smem_size = block_size_ * sizeof(double);
    cs_cuda_kernel_parallel_for_reduce_sum
      <<<l_grid_size, block_size_, smem_size, stream_>>>
      (n, r_grid_, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    switch (block_size_) {
    case 1024:
      cs_blas_cuda_reduce_single_block<1024, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 512:
      cs_blas_cuda_reduce_single_block<512, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 256:
      cs_blas_cuda_reduce_single_block<256, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 128:
      cs_blas_cuda_reduce_single_block<128, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    default:
      cs_assert(0);
    }

    cudaStreamSynchronize(stream_);
    sum = r_reduce_[0];

    return true;
  }

  /* Synchronize the associated CUDA stream. */
  void
  wait(void) {
    if (device_ > -1 && use_gpu_)
      cudaStreamSynchronize(stream_);
  }
 
  /* On the GPU, all faces are processed concurrently, so face assembly
     requires atomic sums. */
  bool
  try_get_parallel_for_i_faces_sum_type(const cs_mesh_t         *m,
                                        cs_dispatch_sum_type_t  &st) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

  bool
  try_get_parallel_for_b_faces_sum_type(const cs_mesh_t         *m,
                                        cs_dispatch_sum_type_t  &st) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};
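/* Example sketch (illustration only, assuming a second CUDA stream with
   id 1 has been created elsewhere): a context bound to a non-default
   stream, with the grid size left at 0 so it is derived from the loop size:

     cs_device_context d_ctx(0, 256, cs_cuda_get_stream(1));
     d_ctx.set_use_gpu(true);
*/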
 
#elif defined(SYCL_LANGUAGE_VERSION)

/* Device context, based on SYCL. */

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:

  sycl::queue  &queue_;    /* Associated SYCL queue */
  bool          is_gpu;    /* Is the queue's device a GPU? */

  bool          use_gpu_;  /* Should the GPU be used? */

public:

  cs_device_context(void)
    : queue_(cs_glob_sycl_queue), is_gpu(false), use_gpu_(true)
  {
    is_gpu = queue_.get_device().is_gpu();
  }

  /* Enable or disable GPU execution for this context. */
  void
  set_use_gpu(bool  use_gpu) {
    this->use_gpu_ = use_gpu;
  }

  /* Is the GPU actually used by this context? */
  bool
  use_gpu(void) {
    return (is_gpu && use_gpu_);
  }

  /* Preferred allocation mode for arrays used with this context. */
  cs_alloc_mode_t
  alloc_mode([[maybe_unused]] bool readable_on_cpu) {
    cs_alloc_mode_t amode
      = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
    return (amode);
  }

  /* Try to launch n iterations on the device; return false if it is
     not usable so another context can take over. */
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);
    return true;
  }
 
  /* Assembly loop over all interior faces, launched on the device. */
  template <class F, class... Args>
  bool
  parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    const cs_lnum_t n = m->n_i_faces;
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);
    return true;
  }
 
  /* Parallel reduction with sum, using a SYCL reduction over a scalar
     allocated in shared USM. */
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    sum = 0;
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    double *sum_ptr = (double *)sycl::malloc_shared(sizeof(double), queue_);

    queue_.parallel_for(n,
                        sycl::reduction(sum_ptr, 0., sycl::plus<double>()),
                        static_cast<F&&>(f),
                        static_cast<Args&&>(args)...).wait();

    sum = sum_ptr[0];

    sycl::free((void *)sum_ptr, queue_);

    return true;
  }

  /* Synchronize the associated SYCL queue. */
  void
  wait(void) {
    if (is_gpu && use_gpu_)
      queue_.wait();
  }
 
  /* On the device, all faces are processed concurrently, so face assembly
     requires atomic sums. */
  bool
  try_get_parallel_for_i_faces_sum_type(const cs_mesh_t         *m,
                                        cs_dispatch_sum_type_t  &st) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

  bool
  try_get_parallel_for_b_faces_sum_type(const cs_mesh_t         *m,
                                        cs_dispatch_sum_type_t  &st) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};

#endif  /* defined(__NVCC__) or defined(SYCL_LANGUAGE_VERSION) */
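/* Note: both device contexts report failure (return false) whenever no
   usable device is present or use_gpu_ is off, which lets the combined
   context below fall back to the host context for the same call. */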
 
/* Void (empty) context: provides no-op placeholders for the device-related
   API when the matching backend is not compiled in. */

class cs_void_context : public cs_dispatch_context_mixin<cs_void_context> {

public:

  cs_void_context(void)
  {}

#if !defined(__NVCC__)

  /* CUDA-specific settings become no-ops when not building with NVCC. */

  void
  set_cuda_grid([[maybe_unused]] long  grid_size,
                [[maybe_unused]] long  block_size) {
  }

  void
  set_cuda_stream([[maybe_unused]] int  stream_id) {
  }

  void
  set_cuda_device([[maybe_unused]] int  device_id) {
  }

#endif  /* !defined(__NVCC__) */

#if !defined(__NVCC__) && !defined(SYCL_LANGUAGE_VERSION)

  /* Generic device settings become no-ops without any device backend. */

  void
  set_use_gpu([[maybe_unused]] bool  use_gpu) {
  }

  cs_alloc_mode_t
  alloc_mode([[maybe_unused]] bool readable_on_cpu) {
    return CS_ALLOC_HOST;
  }

  void
  wait(void) {
  }

#endif  /* no device backend */

  /* The void context never launches anything: always return false so the
     next context in a combined context is tried. */

  template <class F, class... Args>
  bool parallel_for([[maybe_unused]] cs_lnum_t  n,
                    [[maybe_unused]] F&&        f,
                    [[maybe_unused]] Args&&...  args) {
    return false;
  }

  template <class F, class... Args>
  bool parallel_for_reduce_sum([[maybe_unused]] cs_lnum_t  n,
                               [[maybe_unused]] double&    sum,
                               [[maybe_unused]] F&&        f,
                               [[maybe_unused]] Args&&...  args) {
    return false;
  }

};
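/* The combined context below inherits from each listed context and tries
   them in order: the comma-expression array expands once per context, and
   the short-circuit "launched || ..." stops calling further contexts once
   one of them reports success. */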
 
/* Combined context: dispatches each call to the first parent context able
   to run it. */

template <class... Contexts>
class cs_combined_context
  : public cs_dispatch_context_mixin<cs_combined_context<Contexts...>>,
    public Contexts... {

private:
  using mixin_t = cs_dispatch_context_mixin<cs_combined_context<Contexts...>>;

public:
  cs_combined_context() = default;
  cs_combined_context(Contexts... contexts)
    : Contexts(std::move(contexts))...
  {}

public:

  template <class F, class... Args>
  auto parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      (   launched = launched
       || Contexts::parallel_for_i_faces(m, f, args...), nullptr)...
    };
  }

  template <class F, class... Args>
  auto parallel_for_b_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      (   launched = launched
       || Contexts::parallel_for_b_faces(m, f, args...), nullptr)...
    };
  }

  template <class F, class... Args>
  auto parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      (   launched = launched
       || Contexts::parallel_for(n, f, args...), nullptr)...
    };
  }

  template <class F, class... Args>
  auto parallel_for_reduce_sum
    (cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      (   launched = launched
       || Contexts::parallel_for_reduce_sum(n, sum, f, args...),
       nullptr)...
    };
  }

  cs_dispatch_sum_type_t
  get_parallel_for_i_faces_sum_type(const cs_mesh_t* m) {
    cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
    bool known = false;
    [[maybe_unused]] decltype(nullptr) try_query[] = {
      (   known = known
       || Contexts::try_get_parallel_for_i_faces_sum_type(m, sum_type),
       nullptr)...
    };
    return sum_type;
  }

  cs_dispatch_sum_type_t
  get_parallel_for_b_faces_sum_type(const cs_mesh_t* m) {
    cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
    bool known = false;
    [[maybe_unused]] decltype(nullptr) try_query[] = {
      (   known = known
       || Contexts::try_get_parallel_for_b_faces_sum_type(m, sum_type),
       nullptr)...
    };
    return sum_type;
  }

};
 
/* Default dispatch context: the device context first when CUDA or SYCL is
   available (a void placeholder otherwise), then the host context. */

class cs_dispatch_context : public cs_combined_context<
#if defined(__NVCC__) || defined(SYCL_LANGUAGE_VERSION)
  cs_device_context,
#else
  cs_void_context,
#endif
  cs_host_context
>
{
private:
  using base_t = cs_combined_context<
#if defined(__NVCC__) || defined(SYCL_LANGUAGE_VERSION)
  cs_device_context,
#else
  cs_void_context,
#endif
  cs_host_context
>;

public:
  using base_t::base_t;
  using base_t::operator=;
};
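/* Usage sketch (illustration only), assuming arrays x and y allocated with
   the context's alloc_mode() and a scalar a: the same loop body is
   dispatched to the GPU when available and to OpenMP otherwise. With CUDA,
   the lambda additionally needs device annotations (an extended
   __host__ __device__ lambda).

     cs_dispatch_context ctx;

     ctx.parallel_for(n_cells, [=] (cs_lnum_t c_id) {
       y[c_id] += a * x[c_id];
     });

     ctx.wait();   // synchronize the stream/queue before reading y
*/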
 
/* Add src to *dest using the given dispatch sum type (scalar version). */

#ifdef __CUDA_ARCH__   /* Device-side (CUDA) version */

template <typename T>
__device__ static void __forceinline__
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
#if __CUDA_ARCH__ >= 700
    using sum_v = assembled_value<T>;
    sum_v v;

    v.get() = src;
    sum_v::ref(*dest).conflict_free_add(-1u, v);
#else
    atomicAdd(dest, src);
#endif
  }
  else if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
}

#elif defined(SYCL_LANGUAGE_VERSION)

template <typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    sycl::atomic_ref<T,
                     sycl::memory_order::relaxed,
                     sycl::memory_scope::device> aref(*dest);
    aref.fetch_add(src);
  }
}

#else  /* Plain C++ (host) version */

template <typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    #pragma omp atomic
    *dest += src;
  }
}

#endif
 
/* Add the dim-sized array src to dest using the given dispatch sum type. */

#ifdef __CUDA_ARCH__   /* Device-side (CUDA) version */

template <size_t dim, typename T>
__device__ static void __forceinline__
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
#if __CUDA_ARCH__ >= 700
    using sum_v = assembled_value<T, dim>;
    sum_v v;

    for (size_t i = 0; i < dim; i++) {
      v[i].get() = src[i];
    }

    sum_v &vs = reinterpret_cast<sum_v &>(*dest);
    vs.conflict_free_add(-1u, v);
#else
    for (size_t i = 0; i < dim; i++) {
      atomicAdd(&dest[i], src[i]);
    }
#endif
  }
}

#elif defined(SYCL_LANGUAGE_VERSION)

template <size_t dim, typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    for (size_t i = 0; i < dim; i++) {
      sycl::atomic_ref<T,
                       sycl::memory_order::relaxed,
                       sycl::memory_scope::device> aref(dest[i]);
      aref.fetch_add(src[i]);
    }
  }
}

#else  /* Plain C++ (host) version */

template <size_t dim, typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    for (size_t i = 0; i < dim; i++) {
      #pragma omp atomic
      dest[i] += src[i];
    }
  }
}

#endif

#endif /* __CS_DISPATCH_H__ */