#ifndef __CS_DISPATCH_H__
#define __CS_DISPATCH_H__

#if defined(SYCL_LANGUAGE_VERSION)
#include <sycl/sycl.hpp>
#endif

#if defined(__NVCC__)
#include "cs_math_cuda.cuh"
#endif

/* With SYCL, the sum argument passed to reduction lambdas is a reducer
   object rather than a plain double, so a generic parameter type is used. */

#if defined(SYCL_LANGUAGE_VERSION)
#define CS_DISPATCH_SUM_DOUBLE auto
#else
#define CS_DISPATCH_SUM_DOUBLE double
#endif

/* Algorithm type to use for face-based sums (assembly) */

typedef enum {

  CS_DISPATCH_SUM_SIMPLE,  /* Simple sum (assumes no thread conflicts) */
  CS_DISPATCH_SUM_ATOMIC   /* Atomic sum */

} cs_dispatch_sum_type_t;
/* Mixin providing default implementations for dispatch contexts (CRTP:
   the concrete context is known at compile time through `Derived`). */

template <class Derived>
class cs_dispatch_context_mixin {

public:

  /* Loop over `n` elements; must be provided by the derived context. */
  template <class F, class... Args>
  decltype(auto)
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) = delete;

  /* Loop over the interior faces of a mesh; by default, forwards to
     the derived context's parallel_for. */
  template <class M, class F, class... Args>
  decltype(auto)
  parallel_for_i_faces(const M* m, F&& f, Args&&... args);

  /* Loop over the boundary faces of a mesh; by default, forwards to
     the derived context's parallel_for. */
  template <class M, class F, class... Args>
  decltype(auto)
  parallel_for_b_faces(const M* m, F&& f, Args&&... args);

  /* Loop over `n` elements with a sum reduction; must be provided by
     the derived context. */
  template <class F, class... Args>
  decltype(auto)
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args)
    = delete;

  /* Query the sum type to use for interior-face assembly; returns true
     if the derived context handles such loops. */
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type(const M* m,
                                        cs_dispatch_sum_type_t& st);

  /* Query the sum type to use for boundary-face assembly; returns true
     if the derived context handles such loops. */
  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type(const M* m,
                                        cs_dispatch_sum_type_t& st);

};
/* Default implementation of parallel_for_i_faces: loop over all interior
   faces of the mesh with the derived context's parallel_for. */

template <class Derived>
template <class M, class F, class... Args>
decltype(auto)
cs_dispatch_context_mixin<Derived>::parallel_for_i_faces
  (const M* m, F&& f, Args&&... args)
{
  return static_cast<Derived*>(this)->parallel_for(m->n_i_faces,
                                                   static_cast<F&&>(f),
                                                   static_cast<Args&&>(args)...);
}
/* Default implementation of parallel_for_b_faces: loop over all boundary
   faces of the mesh with the derived context's parallel_for. */

template <class Derived>
template <class M, class F, class... Args>
decltype(auto)
cs_dispatch_context_mixin<Derived>::parallel_for_b_faces
  (const M* m, F&& f, Args&&... args)
{
  return static_cast<Derived*>(this)->parallel_for(m->n_b_faces,
                                                   static_cast<F&&>(f),
                                                   static_cast<Args&&>(args)...);
}
/* By default, a simple sum is assumed to be safe for interior-face loops. */

template <class Derived>
template <class M>
bool
cs_dispatch_context_mixin<Derived>::try_get_parallel_for_i_faces_sum_type
  ([[maybe_unused]] const M* m,
   cs_dispatch_sum_type_t& st)
{
  st = CS_DISPATCH_SUM_SIMPLE;
  return true;
}
/* By default, a simple sum is assumed to be safe for boundary-face loops. */

template <class Derived>
template <class M>
bool
cs_dispatch_context_mixin<Derived>::try_get_parallel_for_b_faces_sum_type
  ([[maybe_unused]] const M* m,
   cs_dispatch_sum_type_t& st)
{
  st = CS_DISPATCH_SUM_SIMPLE;
  return true;
}
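/* Usage sketch (illustrative only; `cs_serial_context` below is a
   hypothetical class, not part of this header): since the mixin uses the
   CRTP pattern, a context only needs to provide `parallel_for` (and, if
   needed, `parallel_for_reduce_sum`) to inherit working defaults for the
   face loops above, which forward to it through `static_cast<Derived*>`.

     class cs_serial_context
       : public cs_dispatch_context_mixin<cs_serial_context> {
     public:
       template <class F, class... Args>
       bool
       parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
         for (cs_lnum_t i = 0; i < n; i++)
           f(i, args...);
         return true;
       }
     };

     // cs_serial_context{}.parallel_for_i_faces(m, f) now resolves to the
     // default implementation above, i.e. parallel_for(m->n_i_faces, f).
*/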
/* Context for loops run on the host (CPU), parallelized with OpenMP. */

class cs_host_context : public cs_dispatch_context_mixin<cs_host_context> {

private:

  cs_lnum_t  n_min_per_thread;  /* Minimum number of elements per thread */
  int        n_threads_;        /* Number of threads to use (-1: automatic) */

public:

  cs_host_context()
    : n_min_per_thread(CS_THR_MIN), n_threads_(-1)
  {}

  /* Set the minimum number of elements per thread. */
  void
  set_n_min_per_cpu_thread(cs_lnum_t  n) {
    this->n_min_per_thread = n;
  }

  /* Get the minimum number of elements per thread. */
  cs_lnum_t
  n_min_per_cpu_thread(void) {
    return this->n_min_per_thread;
  }

  /* Set the number of threads to use (-1 for automatic choice). */
  void
  set_n_cpu_threads(int  n) {
    this->n_threads_ = n;
  }

  /* Get the number of threads associated with this context. */
  int
  n_cpu_threads(void) {
    return this->n_threads_;
  }
  /* Loop over `n` elements, using OpenMP threads on the host. */
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    int n_t = n_threads_;
    if (n_t < 0) {
      n_t = cs_glob_n_threads;
      int n_t_l = n / n_min_per_thread;
      if (n_t_l < n_t)
        n_t = n_t_l;
      if (n_t < 1)
        n_t = 1;
    }

#   pragma omp parallel for num_threads(n_t)
    for (cs_lnum_t i = 0; i < n; i++) {
      f(i, args...);
    }
    return true;
  }
  /* Loop over the interior faces of a mesh, using the face numbering
     (groups and threads) to avoid write conflicts between threads. */
  template <class M, class F, class... Args>
  bool
  parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
    const int n_i_groups  = m->i_face_numbering->n_groups;
    const int n_i_threads = m->i_face_numbering->n_threads;
    const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index;

    for (int g_id = 0; g_id < n_i_groups; g_id++) {
      #pragma omp parallel for
      for (int t_id = 0; t_id < n_i_threads; t_id++) {
        for (cs_lnum_t f_id = i_group_index[(t_id * n_i_groups + g_id) * 2];
             f_id < i_group_index[(t_id * n_i_groups + g_id) * 2 + 1];
             f_id++) {
          f(f_id, args...);
        }
      }
    }
    return true;
  }
  /* Loop over the boundary faces of a mesh, using the face numbering
     to avoid write conflicts between threads. */
  template <class M, class F, class... Args>
  bool
  parallel_for_b_faces(const M* m, F&& f, Args&&... args) {
    const int n_b_threads = m->b_face_numbering->n_threads;
    const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index;

    #pragma omp parallel for
    for (int t_id = 0; t_id < n_b_threads; t_id++) {
      for (cs_lnum_t f_id = b_group_index[t_id*2];
           f_id < b_group_index[t_id*2 + 1];
           f_id++) {
        f(f_id, args...);
      }
    }
    return true;
  }
  /* Loop over `n` elements with an OpenMP sum reduction on the host. */
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    int n_t = n_threads_;
    if (n_t < 0) {
      n_t = cs_glob_n_threads;
      int n_t_l = n / n_min_per_thread;
      if (n_t_l < n_t)
        n_t = n_t_l;
      if (n_t < 1)
        n_t = 1;
    }

    sum = 0;

#   pragma omp parallel for reduction(+:sum) num_threads(n_t)
    for (cs_lnum_t i = 0; i < n; i++) {
      f(i, sum, args...);
    }
    return true;
  }
  /* With the thread-safe face numbering used above, simple sums are allowed
     for face-based assembly on the host. */
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type([[maybe_unused]] const M* m,
                                        cs_dispatch_sum_type_t& st) {
    st = CS_DISPATCH_SUM_SIMPLE;
    return true;
  }

  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type([[maybe_unused]] const M* m,
                                        cs_dispatch_sum_type_t& st) {
    st = CS_DISPATCH_SUM_SIMPLE;
    return true;
  }

};
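/* Usage sketch (illustrative only; `n_cells` and the array `x` are assumed
   here, they are not defined in this header): direct use of the host
   context, tuning how OpenMP threads are assigned.

     void
     zero_array(cs_lnum_t n_cells, double *x)
     {
       cs_host_context ctx;
       ctx.set_n_min_per_cpu_thread(1024);  // avoid threading very small loops

       ctx.parallel_for(n_cells, [=](cs_lnum_t i) {
         x[i] = 0.;
       });
     }
*/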
#if defined(__NVCC__)

/* CUDA kernel: apply functor f to each of n elements, using a
   grid-stride loop. */

template <class F, class... Args>
__global__ void
cs_cuda_kernel_parallel_for(cs_lnum_t n, F f, Args... args) {
  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x;
       id < n;
       id += blockDim.x * gridDim.x) {
    f(id, args...);
  }
}
/* CUDA kernel: apply functor f to each of n elements, accumulating a sum
   per thread in shared memory, then reducing it per block into b_res. */

template <class F, class... Args>
__global__ void
cs_cuda_kernel_parallel_for_reduce_sum(cs_lnum_t   n,
                                       double     *b_res,
                                       F           f,
                                       Args...     args) {
  extern double __shared__ stmp[];

  cs_lnum_t tid = threadIdx.x;
  stmp[tid] = 0;

  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x;
       id < n;
       id += blockDim.x * gridDim.x) {
    f(id, stmp[tid], args...);
  }

  switch (blockDim.x) {
  case 1024:
    cs_blas_cuda_block_reduce_sum<1024, 1>(stmp, tid, b_res);
    break;
  case 512:
    cs_blas_cuda_block_reduce_sum<512, 1>(stmp, tid, b_res);
    break;
  case 256:
    cs_blas_cuda_block_reduce_sum<256, 1>(stmp, tid, b_res);
    break;
  case 128:
    cs_blas_cuda_block_reduce_sum<128, 1>(stmp, tid, b_res);
    break;
  }
}
/* Device (GPU) context, based on CUDA. */

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:

  long          grid_size_;   /* Grid size; if <= 0, determined automatically
                                 from the number of elements at launch time */
  long          block_size_;  /* Block size */
  cudaStream_t  stream_;      /* Associated CUDA stream */
  int           device_;      /* Associated CUDA device id */
  bool          use_gpu_;     /* Run on GPU ? */

public:

  /* Constructors */

  cs_device_context(void)
    : grid_size_(0), block_size_(256), stream_(cs_cuda_get_stream(0)),
      device_(0), use_gpu_(true)
  {
    device_ = cs_glob_cuda_device_id;
  }

  cs_device_context(long          grid_size,
                    long          block_size,
                    cudaStream_t  stream,
                    int           device)
    : grid_size_(grid_size), block_size_(block_size), stream_(stream),
      device_(device), use_gpu_(true)
  {}

  cs_device_context(long          grid_size,
                    long          block_size,
                    cudaStream_t  stream)
    : grid_size_(grid_size), block_size_(block_size), stream_(stream),
      device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  cs_device_context(long  grid_size,
                    long  block_size)
    : grid_size_(grid_size), block_size_(block_size),
      stream_(cs_cuda_get_stream(0)), device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  cs_device_context(cudaStream_t  stream)
    : grid_size_(0), block_size_(256), stream_(stream), device_(0),
      use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }
  /* Change grid and block size configuration. */
  void
  set_cuda_grid(long  grid_size,
                long  block_size) {
    this->grid_size_ = (grid_size > 0) ? grid_size : -1;
    this->block_size_ = block_size;
  }

  /* Change the associated CUDA stream. */
  void
  set_cuda_stream(cudaStream_t  stream) {
    this->stream_ = stream;
  }

  /* Change the associated CUDA stream, by id. */
  void
  set_cuda_stream(int  stream_id) {
    this->stream_ = cs_cuda_get_stream(stream_id);
  }

  /* Get the associated CUDA stream. */
  cudaStream_t
  cuda_stream(void) {
    return this->stream_;
  }

  /* Change the associated CUDA device. */
  void
  set_cuda_device(int  device) {
    this->device_ = device;
  }

  /* Enable or disable execution on the GPU. */
  void
  set_use_gpu(bool  use_gpu) {
    this->use_gpu_ = use_gpu;
  }

  /* Is the GPU actually used by this context ? */
  bool
  use_gpu(void) {
    return (device_ >= 0 && use_gpu_);
  }

  /* Preferred allocation mode for arrays used by this context. */
  cs_alloc_mode_t
  alloc_mode(bool  readable_on_cpu) {
    cs_alloc_mode_t amode = CS_ALLOC_HOST;
    if (device_ >= 0 && use_gpu_) {
      if (readable_on_cpu)
        amode = CS_ALLOC_HOST_DEVICE_SHARED;
      else
        amode = CS_ALLOC_DEVICE;
    }
    return amode;
  }
  /* Launch the CUDA kernel for a loop over n elements; returns false
     (so that another context may take over) if the GPU is not used. */
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
      (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }
  /* Launch the CUDA kernel for a loop over the interior faces of a mesh;
     returns false if the GPU is not used. */
  template <class M, class F, class... Args>
  bool
  parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
    const cs_lnum_t n = m->n_i_faces;
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
      (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }
  /* Launch the CUDA kernels for a loop over n elements with a sum
     reduction; returns false if the GPU is not used. */
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    double *r_grid_, *r_reduce_;
    cs_blas_cuda_get_2_stage_reduce_buffers
      (n, 1, l_grid_size, r_grid_, r_reduce_);

    int smem_size = block_size_ * sizeof(double);
    cs_cuda_kernel_parallel_for_reduce_sum
      <<<l_grid_size, block_size_, smem_size, stream_>>>
      (n, r_grid_, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    switch (block_size_) {
    case 1024:
      cs_blas_cuda_reduce_single_block<1024, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 512:
      cs_blas_cuda_reduce_single_block<512, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 256:
      cs_blas_cuda_reduce_single_block<256, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 128:
      cs_blas_cuda_reduce_single_block<128, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    }

    CS_CUDA_CHECK(cudaStreamSynchronize(stream_));
    CS_CUDA_CHECK(cudaGetLastError());

    sum = r_reduce_[0];

    return true;
  }
  /* Synchronize with the associated CUDA stream. */
  void
  wait(void) {
    if (device_ > -1 && use_gpu_) {
      CS_CUDA_CHECK(cudaStreamSynchronize(stream_));
      CS_CUDA_CHECK(cudaGetLastError());
    }
  }
  /* Face-based assembly on the GPU requires atomic sums. */
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type(const M                 *m,
                                        cs_dispatch_sum_type_t  &st) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type(const M                 *m,
                                        cs_dispatch_sum_type_t  &st) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};
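/* Usage sketch (illustrative only; the device array `x` and size `n` are
   assumed here): configuring the CUDA context.  With a grid size of 0 the
   grid is sized from the number of elements at each launch; `parallel_for`
   is asynchronous, so `wait()` is needed before using its results on the
   host.  The lambda must be a CUDA extended (`__device__`) lambda.

     void
     scale_on_device(cs_lnum_t n, double *x, double factor)
     {
       cs_device_context ctx;        // default stream, block size 256
       ctx.set_cuda_grid(0, 128);    // smaller blocks, automatic grid size

       bool launched = ctx.parallel_for(n, [=] __device__ (cs_lnum_t i) {
         x[i] *= factor;
       });

       ctx.wait();                   // synchronize the associated stream

       if (!launched) {
         // GPU disabled (use_gpu_ == false or no device): run a host loop here
       }
     }
*/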
#elif defined(SYCL_LANGUAGE_VERSION)

#if !defined(CS_GLOB_SYCL_QUEUE_IS_DEFINED)
extern sycl::queue  cs_glob_sycl_queue;
#define CS_GLOB_SYCL_QUEUE_IS_DEFINED 1
#endif

/* Device (GPU) context, based on SYCL. */

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:

  sycl::queue  &queue_;    /* Associated SYCL queue */
  bool          is_gpu;    /* Is the associated device a GPU ? */
  bool          use_gpu_;  /* Run on GPU ? */

public:

  cs_device_context(void)
    : queue_(cs_glob_sycl_queue), is_gpu(false), use_gpu_(true)
  {
    is_gpu = queue_.get_device().is_gpu();
  }

  /* Enable or disable execution on the GPU. */
  void
  set_use_gpu(bool  use_gpu) {
    this->use_gpu_ = use_gpu;
  }

  /* Is the GPU actually used by this context ? */
  bool
  use_gpu(void) {
    return (is_gpu && use_gpu_);
  }

  /* Preferred allocation mode for arrays used by this context; shared
     (unified) memory is always readable on the CPU, so readable_on_cpu
     is not needed here. */
  cs_alloc_mode_t
  alloc_mode([[maybe_unused]] bool  readable_on_cpu) {
    cs_alloc_mode_t amode
      = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
    return amode;
  }
  /* Queue a SYCL kernel for a loop over n elements; returns false
     if the GPU is not used. */
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }
  /* Queue a SYCL kernel for a loop over the interior faces of a mesh;
     returns false if the GPU is not used. */
  template <class M, class F, class... Args>
  bool
  parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
    const cs_lnum_t n = m->n_i_faces;
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }
  /* Queue a SYCL kernel for a loop over n elements with a sum reduction,
     using a shared USM scalar; returns false if the GPU is not used. */
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    double *sum_ptr = (double *)sycl::malloc_shared(sizeof(double), queue_);

    queue_.parallel_for(n,
                        sycl::reduction(sum_ptr, 0., sycl::plus<double>()),
                        static_cast<F&&>(f),
                        static_cast<Args&&>(args)...).wait();

    sum = sum_ptr[0];

    sycl::free((void *)sum_ptr, queue_);

    return true;
  }
  /* Synchronize with the associated SYCL queue. */
  void
  wait(void) {
    if (is_gpu && use_gpu_)
      queue_.wait();
  }
  /* Face-based assembly on the GPU requires atomic sums. */
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type(const M                 *m,
                                        cs_dispatch_sum_type_t  &st) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type(const M                 *m,
                                        cs_dispatch_sum_type_t  &st) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};
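/* Usage sketch (illustrative only; the queue `q` and array `x` are assumed
   here): the reduction above combines a shared USM scalar with a
   sycl::reduction object, so the result is visible on the host once the
   kernel has completed.  The same pattern in isolation:

     double
     sum_array(sycl::queue &q, cs_lnum_t n, const double *x)
     {
       double *sum_ptr = sycl::malloc_shared<double>(1, q);

       q.parallel_for(sycl::range<1>(n),
                      sycl::reduction(sum_ptr, 0., sycl::plus<double>()),
                      [=](sycl::id<1> i, auto &sum) {
         sum += x[i];
       }).wait();

       double result = *sum_ptr;
       sycl::free(sum_ptr, q);
       return result;
     }
*/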
#elif defined(HAVE_OPENMP_TARGET)

/* Device (GPU) context, based on OpenMP target offload. */

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:

  bool  is_gpu;    /* Is an OpenMP target device available ? */
  bool  use_gpu_;  /* Run on GPU ? */

public:

  cs_device_context(void)
    : is_gpu(false), use_gpu_(true)
  {
    is_gpu = (omp_get_num_devices() > 1) ? true : false;
  }

  /* Enable or disable execution on the GPU. */
  void
  set_use_gpu(bool  use_gpu) {
    this->use_gpu_ = use_gpu;
  }

  /* Is the GPU actually used by this context ? */
  bool
  use_gpu(void) {
    return (is_gpu && use_gpu_);
  }

  /* Preferred allocation mode for arrays used by this context; shared
     (unified) memory is always readable on the CPU, so readable_on_cpu
     is not needed here. */
  cs_alloc_mode_t
  alloc_mode([[maybe_unused]] bool  readable_on_cpu) {
    cs_alloc_mode_t amode
      = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
    return amode;
  }
  /* Loop over n elements, offloaded with OpenMP target; returns false
     if the GPU is not used. */
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

#   pragma omp target teams distribute parallel for
    for (cs_lnum_t i = 0; i < n; i++) {
      f(i, args...);
    }
    return true;
  }
  /* Loop over n elements with a sum reduction, offloaded with OpenMP target;
     returns false if the GPU is not used. */
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    sum = 0;

#   pragma omp target teams distribute parallel for reduction(+:sum)
    for (cs_lnum_t i = 0; i < n; i++) {
      f(i, sum, args...);
    }
    return true;
  }
  /* Face-based assembly on the GPU requires atomic sums. */
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type(const M                 *m,
                                        cs_dispatch_sum_type_t  &st) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type(const M                 *m,
                                        cs_dispatch_sum_type_t  &st) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};

#endif  /* device back-end selection */
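/* Usage sketch (illustrative only; the array `x` is assumed here): this
   back-end maps each dispatched loop onto an OpenMP "target teams
   distribute parallel for" construct, with a reduction clause carrying the
   sum back to the host.  The same pattern in isolation, with an explicit
   map clause:

     double
     sum_array(cs_lnum_t n, const double *x)
     {
       double sum = 0.;

       #pragma omp target teams distribute parallel for map(to: x[0:n]) reduction(+:sum)
       for (cs_lnum_t i = 0; i < n; i++)
         sum += x[i];

       return sum;
     }
*/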
/* "Void" device context: all of its parallel operations report that they
   did not launch anything, so in a combined context the work falls back to
   another (host) context.  It also provides no-op GPU setters so that
   calling code does not need to be guarded by build-time conditionals. */

class cs_void_context : public cs_dispatch_context_mixin<cs_void_context> {

public:

  /* Constructor */
  cs_void_context(void)
  {}

#if !defined(__NVCC__)

  /* CUDA-related setters: no-ops when not built with nvcc. */

  void
  set_cuda_grid([[maybe_unused]] long  grid_size,
                [[maybe_unused]] long  block_size) {
  }

  void
  set_cuda_stream([[maybe_unused]] int  stream_id) {
  }

  void
  set_cuda_device([[maybe_unused]] int  device_id) {
  }

#endif  /* !defined(__NVCC__) */

#if !defined(__NVCC__) \
  && !defined(SYCL_LANGUAGE_VERSION) \
  && !defined(HAVE_OPENMP_TARGET)

  /* General GPU-related methods: no-ops when no device back-end is built. */

  void
  set_use_gpu([[maybe_unused]] bool  use_gpu) {
  }

  bool
  use_gpu(void) {
    return false;
  }

  cs_alloc_mode_t
  alloc_mode([[maybe_unused]] bool  readable_on_cpu) {
    return CS_ALLOC_HOST;
  }

  void
  wait(void) {
  }

#endif  /* no device back-end */

  /* Void parallel loop: never launches anything. */
  template <class F, class... Args>
  bool parallel_for([[maybe_unused]] cs_lnum_t  n,
                    [[maybe_unused]] F&&        f,
                    [[maybe_unused]] Args&&...  args) {
    return false;
  }

  /* Void reduction loop: never launches anything. */
  template <class F, class... Args>
  bool parallel_for_reduce_sum([[maybe_unused]] cs_lnum_t  n,
                               [[maybe_unused]] double&    sum,
                               [[maybe_unused]] F&&        f,
                               [[maybe_unused]] Args&&...  args) {
    return false;
  }

};
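/* Usage sketch (illustrative only): thanks to the no-op setters above,
   caller code may tune GPU parameters unconditionally; in builds without
   the corresponding back-end the calls resolve to these empty methods, and
   the void loops always report "not launched" so the combined context
   defined below falls through to the host context.

     void
     configure(cs_dispatch_context  &ctx)
     {
       ctx.set_use_gpu(true);     // no-op in a CPU-only build
       ctx.set_cuda_stream(1);    // no-op when not built with nvcc
     }
*/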
/* Combined dispatch context: inherits from all of the given contexts and
   tries each of them in order until one reports that it launched the
   requested operation. */

template <class... Contexts>
class cs_combined_context
  : public cs_dispatch_context_mixin<cs_combined_context<Contexts...>>,
    public Contexts... {

private:

  using mixin_t = cs_dispatch_context_mixin<cs_combined_context<Contexts...>>;

public:

  cs_combined_context() = default;

  cs_combined_context(Contexts... contexts)
    : Contexts(std::move(contexts))...
  {}

public:

  /* Interior face loop: the first context that launches wins. */
  template <class M, class F, class... Args>
  auto parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      ( launched = launched
          || Contexts::parallel_for_i_faces(m, f, args...), nullptr)...
    };
  }

  /* Boundary face loop: the first context that launches wins. */
  template <class M, class F, class... Args>
  auto parallel_for_b_faces(const M* m, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      ( launched = launched
          || Contexts::parallel_for_b_faces(m, f, args...), nullptr)...
    };
  }

  /* Element loop: the first context that launches wins. */
  template <class F, class... Args>
  auto parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      ( launched = launched
          || Contexts::parallel_for(n, f, args...), nullptr)...
    };
  }

  /* Sum reduction loop: the first context that launches wins. */
  template <class F, class... Args>
  auto parallel_for_reduce_sum
    (cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      ( launched = launched
          || Contexts::parallel_for_reduce_sum(n, sum, f, args...),
        nullptr)...
    };
  }

  /* Sum type to use for interior-face assembly: the first context that
     answers determines the result. */
  template <class M>
  cs_dispatch_sum_type_t
  get_parallel_for_i_faces_sum_type(const M* m) {
    cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
    bool known = false;
    [[maybe_unused]] decltype(nullptr) try_query[] = {
      ( known = known
          || Contexts::try_get_parallel_for_i_faces_sum_type(m, sum_type),
        nullptr)...
    };
    return sum_type;
  }

  /* Sum type to use for boundary-face assembly: the first context that
     answers determines the result. */
  template <class M>
  cs_dispatch_sum_type_t
  get_parallel_for_b_faces_sum_type(const M* m) {
    cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
    bool known = false;
    [[maybe_unused]] decltype(nullptr) try_query[] = {
      ( known = known
          || Contexts::try_get_parallel_for_b_faces_sum_type(m, sum_type),
        nullptr)...
    };
    return sum_type;
  }

};
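/* Note (illustrative only): the `try_execute` arrays above are a braced-list
   fold.  Each element evaluates one comma expression of the form
   `(launched = launched || Contexts::parallel_for(...), nullptr)`, so the
   contexts are tried from left to right in the order of the template
   arguments, and the short-circuiting `||` skips the remaining back-ends
   once one of them reports a launch.  A minimal standalone version of the
   same idiom:

     template <class... Fs>
     void
     try_in_order(Fs... fs) {
       bool done = false;
       [[maybe_unused]] decltype(nullptr) try_execute[] = {
         ( done = done || fs(), nullptr)...
       };
     }

     // try_in_order([]{ return false; },   // tried, reports no launch
     //              []{ return true;  },   // tried, reports a launch
     //              []{ return true;  });  // skipped (short-circuit)
*/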
/* Default dispatch context: tries the device context (when a device
   back-end is built), then the void and host contexts. */

class cs_dispatch_context : public cs_combined_context<
#if defined(__NVCC__) \
 || defined(SYCL_LANGUAGE_VERSION) \
 || defined(HAVE_OPENMP_TARGET)
  cs_device_context,
#endif
  cs_void_context,
  cs_host_context
>
{

private:

  using base_t = cs_combined_context<
#if defined(__NVCC__) \
 || defined(SYCL_LANGUAGE_VERSION) \
 || defined(HAVE_OPENMP_TARGET)
  cs_device_context,
#endif
  cs_void_context,
  cs_host_context
>;

public:

  using base_t::base_t;
  using base_t::operator=;

};
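/* Usage sketch (illustrative only; the arrays and sizes are assumed here):
   typical use of the default dispatch context.  The lambdas are written
   once and run on whichever back-end launches first (device, then host);
   the CS_DISPATCH_SUM_DOUBLE macro defined at the top of this header keeps
   the same reduction lambda valid for the SYCL reducer type.  On CUDA
   builds the lambdas additionally need a device annotation.

     void
     axpy_and_norm2(cs_lnum_t n, double a, const double *x, double *y,
                    double &norm2)
     {
       cs_dispatch_context ctx;

       // y <- y + a*x
       ctx.parallel_for(n, [=](cs_lnum_t i) {
         y[i] += a * x[i];
       });

       // sum of squares of y, using the back-end's own reduction strategy
       double s = 0.;
       ctx.parallel_for_reduce_sum(n, s, [=](cs_lnum_t i,
                                             CS_DISPATCH_SUM_DOUBLE &sum) {
         sum += y[i]*y[i];
       });

       norm2 = s;
     }
*/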
#if defined(__NVCC__)

/* Add a contribution to a sum, honoring the given dispatch sum type.
   CUDA device version. */

template <typename T>
__device__ static void __forceinline__
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
#if __CUDA_ARCH__ >= 700
    using sum_v = assembled_value<T>;
    sum_v v;

    v.get() = src;
    sum_v::ref(*dest).conflict_free_add(-1u, v);
#else
    atomicAdd(dest, src);
#endif
  }
  else if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
}
#elif defined(SYCL_LANGUAGE_VERSION)

/* Add a contribution to a sum, honoring the given dispatch sum type.
   SYCL version. */

template <typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    sycl::atomic_ref<T,
                     sycl::memory_order::relaxed,
                     sycl::memory_scope::device> aref(*dest);
    aref.fetch_add(src);
  }
}
#else  /* neither CUDA device code nor SYCL */

/* Add a contribution to a sum, honoring the given dispatch sum type.
   General (host) version. */

template <typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    #pragma omp atomic
    *dest += src;
  }
}

#endif  /* scalar cs_dispatch_sum variants */
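/* Usage sketch (illustrative only; the cs_mesh_t fields and the arrays below
   are assumed here, and the device annotation of the lambda is omitted):
   face-to-cell assembly with cs_dispatch_sum.  The sum type is queried from
   the context, so the same lambda uses plain sums with the conflict-free
   host face numbering and atomic sums on the GPU.

     void
     assemble_face_values(const cs_mesh_t  *m,
                          const double     *face_values,
                          double           *cell_values)
     {
       cs_dispatch_context ctx;
       cs_dispatch_sum_type_t sum_type
         = ctx.get_parallel_for_i_faces_sum_type(m);

       const cs_lnum_2_t *i_face_cells = m->i_face_cells;

       ctx.parallel_for_i_faces(m, [=](cs_lnum_t f_id) {
         cs_lnum_t c_id_0 = i_face_cells[f_id][0];
         cs_lnum_t c_id_1 = i_face_cells[f_id][1];
         cs_dispatch_sum(&cell_values[c_id_0],  face_values[f_id], sum_type);
         cs_dispatch_sum(&cell_values[c_id_1], -face_values[f_id], sum_type);
       });
     }
*/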
#if defined(__NVCC__)

/* Add an array of dim contributions to a sum array, honoring the given
   dispatch sum type.  CUDA device version. */

template <size_t dim, typename T>
__device__ static void __forceinline__
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
#if __CUDA_ARCH__ >= 700
    using sum_v = assembled_value<T, dim>;
    sum_v v;

    for (size_t i = 0; i < dim; i++) {
      v[i].get() = src[i];
    }

    sum_v &vs = reinterpret_cast<sum_v &>(*dest);
    vs.conflict_free_add(-1u, v);
#else
    for (size_t i = 0; i < dim; i++) {
      atomicAdd(&dest[i], src[i]);
    }
#endif
  }
}
#elif defined(SYCL_LANGUAGE_VERSION)

/* Add an array of dim contributions to a sum array, honoring the given
   dispatch sum type.  SYCL version. */

template <size_t dim, typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    for (size_t i = 0; i < dim; i++) {
      sycl::atomic_ref<T,
                       sycl::memory_order::relaxed,
                       sycl::memory_scope::device> aref(dest[i]);
      aref.fetch_add(src[i]);
    }
  }
}
#else  /* neither CUDA device code nor SYCL */

/* Add an array of dim contributions to a sum array, honoring the given
   dispatch sum type.  General (host) version. */

template <size_t dim, typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    for (size_t i = 0; i < dim; i++) {
      #pragma omp atomic
      dest[i] += src[i];
    }
  }
}

#endif  /* vector cs_dispatch_sum variants */

#endif /* __CS_DISPATCH_H__ */