#ifndef __CS_DISPATCH_H__
#define __CS_DISPATCH_H__
#if defined(SYCL_LANGUAGE_VERSION)
#include <sycl/sycl.hpp>
#endif

#if defined(__NVCC__)
#include "cs_alge_cuda.cuh"
#endif
#if defined(SYCL_LANGUAGE_VERSION)
#define CS_DISPATCH_SUM_DOUBLE auto
#else
#define CS_DISPATCH_SUM_DOUBLE double
#endif
/*! Type of sum used in assembly loops */

typedef enum {

  CS_DISPATCH_SUM_SIMPLE,  /*!< simple sum */
  CS_DISPATCH_SUM_ATOMIC   /*!< atomic sum */

} cs_dispatch_sum_type_t;
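/*
 * Note: CS_DISPATCH_SUM_SIMPLE is appropriate when the loop structure already
 * avoids write conflicts (for example the threaded face-group numbering used
 * by the host context below), while the device contexts report
 * CS_DISPATCH_SUM_ATOMIC, since many GPU threads may update the same cell
 * concurrently. See the cs_dispatch_sum() helpers at the end of this header
 * and the assembly sketch that follows them.
 */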
/*!
 * Default implementations for dispatch contexts (CRTP mixin).
 */

template <class Derived>
class cs_dispatch_context_mixin {

public:

  // Loop over n elements; must be redefined by the derived class.
  template <class F, class... Args>
  decltype(auto)
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) = delete;

  // Loop over the interior faces of a mesh.
  template <class F, class... Args>
  decltype(auto)
  parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args);

  // Loop over the boundary faces of a mesh.
  template <class F, class... Args>
  decltype(auto)
  parallel_for_b_faces(const cs_mesh_t* m, F&& f, Args&&... args);

  // Parallel reduction with sum; must be redefined by the derived class.
  template <class F, class... Args>
  decltype(auto)
  parallel_for_reduce_sum
    (cs_lnum_t n, double& sum, F&& f, Args&&... args) = delete;

  // Query the sum type to use in interior-face assembly loops.
  bool
  try_get_parallel_for_i_faces_sum_type(const cs_mesh_t* m,
                                        cs_dispatch_sum_type_t& st);

  // Query the sum type to use in boundary-face assembly loops.
  bool
  try_get_parallel_for_b_faces_sum_type(const cs_mesh_t* m,
                                        cs_dispatch_sum_type_t& st);

};
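/*
 * The mixin uses CRTP: a derived context only needs to provide parallel_for
 * (and, if it can do better, the face and reduction variants); the default
 * definitions below forward face loops to Derived::parallel_for.
 * Illustrative sketch of a minimal, hypothetical serial context (not part of
 * this header):
 *
 * \code
 * class my_serial_context
 *   : public cs_dispatch_context_mixin<my_serial_context> {
 * public:
 *   template <class F, class... Args>
 *   bool parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
 *     for (cs_lnum_t i = 0; i < n; i++)
 *       f(i, args...);
 *     return true;   // loop handled
 *   }
 * };
 * \endcode
 */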
// Default implementation of parallel_for_i_faces, based on parallel_for.
template <class Derived>
template <class F, class... Args>
decltype(auto) cs_dispatch_context_mixin<Derived>::parallel_for_i_faces
  (const cs_mesh_t* m, F&& f, Args&&... args) {
  return static_cast<Derived*>(this)->parallel_for
                                        (m->n_i_faces,
                                         static_cast<F&&>(f),
                                         static_cast<Args&&>(args)...);
}
// Default implementation of parallel_for_b_faces, based on parallel_for.
template <class Derived>
template <class F, class... Args>
decltype(auto) cs_dispatch_context_mixin<Derived>::parallel_for_b_faces
  (const cs_mesh_t* m, F&& f, Args&&... args) {
  return static_cast<Derived*>(this)->parallel_for
                                        (m->n_b_faces,
                                         static_cast<F&&>(f),
                                         static_cast<Args&&>(args)...);
}
// Default sum type for interior-face assembly: simple sum.
template <class Derived>
bool cs_dispatch_context_mixin<Derived>::try_get_parallel_for_i_faces_sum_type
  ([[maybe_unused]] const cs_mesh_t* m,
   cs_dispatch_sum_type_t& st) {
  st = CS_DISPATCH_SUM_SIMPLE;
  return true;
}
// Default sum type for boundary-face assembly: simple sum.
template <class Derived>
bool cs_dispatch_context_mixin<Derived>::try_get_parallel_for_b_faces_sum_type
  ([[maybe_unused]] const cs_mesh_t* m,
   cs_dispatch_sum_type_t& st) {
  st = CS_DISPATCH_SUM_SIMPLE;
  return true;
}
/*! Context to execute loops with OpenMP on the host (CPU). */

class cs_host_context : public cs_dispatch_context_mixin<cs_host_context> {

private:
  cs_lnum_t  n_min_for_threads;  /*!< Under this size, loops stay single-threaded */

public:
  cs_host_context()
    : n_min_for_threads(CS_THR_MIN)
  {}

  //! Set the minimum number of elements before threads are used.
  void
  set_n_min_for_cpu_threads(cs_lnum_t  n) {
    this->n_min_for_threads = n;
  }

  //! Get the minimum number of elements before threads are used.
  cs_lnum_t
  n_min_for_cpu_threads(void) {
    return this->n_min_for_threads;
  }
  //! Iterate over n elements using a plain OpenMP parallel for.
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
#   pragma omp parallel for  if (n >= n_min_for_threads)
    for (cs_lnum_t i = 0; i < n; i++) {
      f(i, args...);
    }
    return true;
  }
  //! Loop over the interior faces of a mesh using the face numbering's
  //! groups and threads, which avoids write conflicts between threads.
  template <class F, class... Args>
  bool
  parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    const int n_i_groups  = m->i_face_numbering->n_groups;
    const int n_i_threads = m->i_face_numbering->n_threads;
    const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index;

    for (int g_id = 0; g_id < n_i_groups; g_id++) {
      #pragma omp parallel for
      for (int t_id = 0; t_id < n_i_threads; t_id++) {
        for (cs_lnum_t f_id = i_group_index[(t_id * n_i_groups + g_id) * 2];
             f_id < i_group_index[(t_id * n_i_groups + g_id) * 2 + 1];
             f_id++) {
          f(f_id, args...);
        }
      }
    }
    return true;
  }
  //! Loop over the boundary faces of a mesh using the face numbering's
  //! threads, which avoids write conflicts between threads.
  template <class F, class... Args>
  bool
  parallel_for_b_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    const int n_b_threads = m->b_face_numbering->n_threads;
    const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index;

    #pragma omp parallel for
    for (int t_id = 0; t_id < n_b_threads; t_id++) {
      for (cs_lnum_t f_id = b_group_index[t_id*2];
           f_id < b_group_index[t_id*2 + 1];
           f_id++) {
        f(f_id, args...);
      }
    }
    return true;
  }
  //! Parallel reduction with an OpenMP sum reduction.
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    sum = 0;
#   pragma omp parallel for reduction(+:sum)  if (n >= n_min_for_threads)
    for (cs_lnum_t i = 0; i < n; i++) {
      f(i, sum, args...);
    }
    return true;
  }
  //! On the host, the face numberings avoid conflicts, so a simple sum is enough.
  bool
  try_get_parallel_for_i_faces_sum_type([[maybe_unused]] const cs_mesh_t* m,
                                        cs_dispatch_sum_type_t& st) {
    st = CS_DISPATCH_SUM_SIMPLE;
    return true;
  }
  bool
  try_get_parallel_for_b_faces_sum_type([[maybe_unused]] const cs_mesh_t* m,
                                        cs_dispatch_sum_type_t& st) {
    st = CS_DISPATCH_SUM_SIMPLE;
    return true;
  }

};
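/*
 * Illustrative usage sketch for the host context (the array names x, y and
 * the scalar a are placeholders, not part of this header):
 *
 * \code
 * cs_host_context h_ctx;
 * h_ctx.parallel_for(n_cells, [=](cs_lnum_t c_id) {
 *   y[c_id] += a * x[c_id];   // threaded when n_cells >= n_min_for_threads
 * });
 * \endcode
 */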
#if defined(__NVCC__)
/* Default kernel for a loop over n elements (grid-stride loop). */

template <class F, class... Args>
__global__ void
cs_cuda_kernel_parallel_for(cs_lnum_t n, F f, Args... args) {
  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    f(id, args...);
  }
}
/* Default kernel for a reduction with sum over n elements. */

template <class F, class... Args>
__global__ void
cs_cuda_kernel_parallel_for_reduce_sum(cs_lnum_t n, double *b_res,
                                       F f, Args... args) {
  // Per-thread partial sums in dynamic shared memory (blockDim.x doubles).
  extern double __shared__ stmp[];
  cs_lnum_t tid = threadIdx.x;
  stmp[tid] = 0;

  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    f(id, stmp[tid], args...);
  }

  // Fold the per-thread sums into one value per block.
  switch (blockDim.x) {
  case 1024:
    cs_blas_cuda_block_reduce_sum<1024, 1>(stmp, tid, b_res);
    break;
  case 512:
    cs_blas_cuda_block_reduce_sum<512, 1>(stmp, tid, b_res);
    break;
  case 256:
    cs_blas_cuda_block_reduce_sum<256, 1>(stmp, tid, b_res);
    break;
  case 128:
    cs_blas_cuda_block_reduce_sum<128, 1>(stmp, tid, b_res);
    break;
  default:
    assert(0);
  }
}
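/*
 * Reduction scheme (as used by cs_device_context::parallel_for_reduce_sum
 * below): each block accumulates per-thread partial sums in the dynamically
 * sized shared array stmp (blockDim.x * sizeof(double) bytes, passed as the
 * third launch parameter), folds them into one value per block in b_res, and
 * a second, single-block kernel (cs_blas_cuda_reduce_single_block) then sums
 * those per-block results into the final value.
 */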
/*! Context to execute loops with CUDA on the device (GPU). */

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:
  long          grid_size_;   /*!< Grid size; if < 1, computed from the loop size */
  long          block_size_;  /*!< Block size */
  cudaStream_t  stream_;      /*!< Associated CUDA stream */
  int           device_;      /*!< Associated CUDA device id */

  bool          use_gpu_;     /*!< Run on GPU ? */

public:

  //! Default constructor
  cs_device_context(void)
    : grid_size_(0), block_size_(256), stream_(cs_cuda_get_stream(0)),
      device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }
  //! Constructor with given grid size, block size, stream and device.
  cs_device_context(long          grid_size,
                    long          block_size,
                    cudaStream_t  stream,
                    int           device)
    : grid_size_(grid_size), block_size_(block_size), stream_(stream),
      device_(device), use_gpu_(true)
  {}
  //! Constructor with given grid size, block size and stream.
  cs_device_context(long          grid_size,
                    long          block_size,
                    cudaStream_t  stream)
    : grid_size_(grid_size), block_size_(block_size), stream_(stream),
      device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }
  //! Constructor with given grid and block size.
  cs_device_context(long  grid_size,
                    long  block_size)
    : grid_size_(grid_size), block_size_(block_size),
      stream_(cs_cuda_get_stream(0)), device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }
  //! Constructor with given stream.
  cs_device_context(cudaStream_t  stream)
    : grid_size_(0), block_size_(256), stream_(stream), device_(0),
      use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }
  //! Change grid and block size, keeping the stream and device.
  void
  set_cuda_grid(long  grid_size,
                long  block_size) {
    this->grid_size_ = grid_size;
    this->block_size_ = block_size;
  }
  //! Change stream, keeping the grid and device configuration.
  void
  set_cuda_stream(cudaStream_t  stream) {
    this->stream_ = stream;
  }
  //! Change to a numbered stream, keeping the grid and device configuration.
  void
  set_cuda_stream(int  stream_id) {
    this->stream_ = cs_cuda_get_stream(stream_id);
  }
  //! Get the associated CUDA stream.
  cudaStream_t
  cuda_stream(void) {
    return this->stream_;
  }
  //! Change the associated CUDA device.
  void
  set_cuda_device(int  device) {
    this->device_ = device;
  }
  //! Set or unset execution on the GPU.
  void
  set_use_gpu(bool  use_gpu) {
    this->use_gpu_ = use_gpu;
  }
  //! Check whether this context is set to run on the GPU.
  bool
  use_gpu(void) {
    return (device_ >= 0 && use_gpu_);
  }
  //! Preferred allocation mode for arrays used with this context.
  cs_alloc_mode_t
  alloc_mode(bool  readable_on_cpu) {
    cs_alloc_mode_t amode = CS_ALLOC_HOST;
    if (device_ >= 0 && use_gpu_) {
      amode = (readable_on_cpu) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_DEVICE;
    }
    return amode;
  }
  //! Launch a loop kernel on the GPU; return false if the GPU is not usable,
  //! so that another context may handle the loop.
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
      (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }
  //! Launch an interior-face loop kernel on the GPU; return false if the
  //! GPU is not usable.
  template <class F, class... Args>
  bool
  parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    const cs_lnum_t n = m->n_i_faces;

    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
      (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }
  //! Launch a sum-reduction kernel on the GPU; return false if the GPU is
  //! not usable.
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    sum = 0;

    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    double *r_grid_, *r_reduce_;
    cs_blas_cuda_get_2_stage_reduce_buffers
      (n, 1, l_grid_size, r_grid_, r_reduce_);

    int smem_size = block_size_ * sizeof(double);
    cs_cuda_kernel_parallel_for_reduce_sum
      <<<l_grid_size, block_size_, smem_size, stream_>>>
      (n, r_grid_, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    switch (block_size_) {
    case 1024:
      cs_blas_cuda_reduce_single_block<1024, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 512:
      cs_blas_cuda_reduce_single_block<512, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 256:
      cs_blas_cuda_reduce_single_block<256, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 128:
      cs_blas_cuda_reduce_single_block<128, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    default:
      cs_assert(0);
    }

    cudaStreamSynchronize(stream_);
    sum = r_reduce_[0];

    return true;
  }
  //! Synchronize the associated stream.
  void
  wait(void) {
    if (device_ > -1 && use_gpu_)
      cudaStreamSynchronize(stream_);
  }
  //! On the GPU, face loops may update the same cells concurrently,
  //! so atomic sums are required.
  bool
  try_get_parallel_for_i_faces_sum_type([[maybe_unused]] const cs_mesh_t  *m,
                                        cs_dispatch_sum_type_t            &st) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }
  bool
  try_get_parallel_for_b_faces_sum_type([[maybe_unused]] const cs_mesh_t  *m,
                                        cs_dispatch_sum_type_t            &st) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};
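/*
 * Illustrative usage sketch for the CUDA context (stream id and array names
 * are placeholders; with nvcc the lambda must be device-callable, e.g. an
 * extended __device__ lambda):
 *
 * \code
 * cs_device_context d_ctx;
 * d_ctx.set_cuda_stream(1);       // use stream 1 instead of the default
 * bool launched = d_ctx.parallel_for(n, [=] __device__ (cs_lnum_t i) {
 *   y[i] += a * x[i];
 * });
 * d_ctx.wait();                   // synchronize the associated stream
 * // if launched is false, fall back to a host context
 * \endcode
 */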
#elif defined(SYCL_LANGUAGE_VERSION)
/*! Context to execute loops with SYCL on the device. */

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:
  sycl::queue  &queue_;    /*!< Associated SYCL queue */
  bool          is_gpu;    /*!< Is the associated device a GPU ? */

  bool          use_gpu_;  /*!< Run on GPU ? */

public:

  //! Constructor
  cs_device_context(void)
    : queue_(cs_glob_sycl_queue), is_gpu(false), use_gpu_(true)
  {
    is_gpu = queue_.get_device().is_gpu();
  }
  //! Set or unset execution on the GPU.
  void
  set_use_gpu(bool  use_gpu) {
    this->use_gpu_ = use_gpu;
  }

  //! Check whether this context is set to run on the GPU.
  bool
  use_gpu(void) {
    return (is_gpu && use_gpu_);
  }
  //! Preferred allocation mode for arrays used with this context; with SYCL,
  //! shared (USM) allocations remain readable on the CPU in either case.
  cs_alloc_mode_t
  alloc_mode([[maybe_unused]] bool  readable_on_cpu) {
    cs_alloc_mode_t amode
      = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
    return amode;
  }
  //! Launch a loop on the SYCL device; return false if the device is not a
  //! usable GPU, so that another context may handle the loop.
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }
  //! Launch an interior-face loop on the SYCL device; return false if the
  //! device is not a usable GPU.
  template <class F, class... Args>
  bool
  parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    const cs_lnum_t n = m->n_i_faces;

    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }
  //! Launch a sum reduction on the SYCL device; return false if the device
  //! is not a usable GPU.
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    sum = 0;

    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    double *sum_ptr = (double *)sycl::malloc_shared(sizeof(double), queue_);

    queue_.parallel_for(n,
                        sycl::reduction(sum_ptr, 0., sycl::plus<double>()),
                        static_cast<F&&>(f),
                        static_cast<Args&&>(args)...).wait();

    sum = sum_ptr[0];

    sycl::free((void *)sum_ptr, queue_);

    return true;
  }
  //! Synchronize the associated queue.
  void
  wait(void) {
    if (is_gpu && use_gpu_)
      queue_.wait();
  }
  //! On the device, face loops may update the same cells concurrently,
  //! so atomic sums are required.
  bool
  try_get_parallel_for_i_faces_sum_type([[maybe_unused]] const cs_mesh_t  *m,
                                        cs_dispatch_sum_type_t            &st) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }
  bool
  try_get_parallel_for_b_faces_sum_type([[maybe_unused]] const cs_mesh_t  *m,
                                        cs_dispatch_sum_type_t            &st) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};

#endif  // __NVCC__ or SYCL_LANGUAGE_VERSION
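/*
 * Note on the reduction convention: every context calls the reduction functor
 * as f(i, sum_accumulator, ...).  With SYCL the accumulator is the reducer
 * object created by sycl::reduction, which is why CS_DISPATCH_SUM_DOUBLE is
 * defined as "auto" in that case; a portable functor therefore declares its
 * second parameter as CS_DISPATCH_SUM_DOUBLE&.  Illustrative sketch (ctx
 * being any context from this header, x a placeholder array):
 *
 * \code
 * double total = 0.;
 * ctx.parallel_for_reduce_sum(n, total,
 *                             [=](cs_lnum_t i, CS_DISPATCH_SUM_DOUBLE &s) {
 *   s += x[i]*x[i];
 * });
 * \endcode
 */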
/*! Void device context: no-op placeholders used when no device is available. */

class cs_void_context : public cs_dispatch_context_mixin<cs_void_context> {

public:

  //! Constructor
  cs_void_context(void)
  {}

#if !defined(__NVCC__)

  /* Placeholders for CUDA-specific methods, callable on the combined
     cs_dispatch_context even in non-CUDA builds. */
  void
  set_cuda_grid([[maybe_unused]] long  grid_size,
                [[maybe_unused]] long  block_size) {
  }

  void
  set_cuda_stream([[maybe_unused]] int  stream_id) {
  }

  void
  set_cuda_device([[maybe_unused]] int  device_id) {
  }

#endif  // ! __NVCC__
#if !defined(__NVCC__) && !defined(SYCL_LANGUAGE_VERSION)
  /* Placeholders for device-related methods in CPU-only builds. */

  void
  set_use_gpu([[maybe_unused]] bool  use_gpu) {
  }

  bool
  use_gpu(void) {
    return false;
  }

  cs_alloc_mode_t
  alloc_mode([[maybe_unused]] bool  readable_on_cpu) {
    return CS_ALLOC_HOST;
  }

  void
  wait(void) {
  }

#endif  // ! __NVCC__ && ! SYCL_LANGUAGE_VERSION
public:

  //! No execution method is available here: abort if this is ever reached.
  template <class F, class... Args>
  bool parallel_for([[maybe_unused]] cs_lnum_t  n,
                    [[maybe_unused]] F&&        f,
                    [[maybe_unused]] Args&&...  args) {
    cs_assert(0);
    return false;
  }
  template <class F, class... Args>
  bool parallel_for_reduce_sum([[maybe_unused]] cs_lnum_t  n,
                               [[maybe_unused]] double&    sum,
                               [[maybe_unused]] F&&        f,
                               [[maybe_unused]] Args&&...  args) {
    cs_assert(0);
    return false;
  }

};
/*!
 * Combined context: tries each of the given contexts in order, until one of
 * them reports that it has handled the loop.
 */

template <class... Contexts>
class cs_combined_context
  : public cs_dispatch_context_mixin<cs_combined_context<Contexts...>>,
    public Contexts... {

private:
  using mixin_t = cs_dispatch_context_mixin<cs_combined_context<Contexts...>>;

public:
  cs_combined_context() = default;
  cs_combined_context(Contexts... contexts)
    : Contexts(std::move(contexts))...
  {}
public:

  template <class F, class... Args>
  auto parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      (   launched = launched
       || Contexts::parallel_for_i_faces(m, f, args...), nullptr)...
    };
  }
  template <class F, class... Args>
  auto parallel_for_b_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      (   launched = launched
       || Contexts::parallel_for_b_faces(m, f, args...), nullptr)...
    };
  }
  template <class F, class... Args>
  auto parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      (   launched = launched
       || Contexts::parallel_for(n, f, args...), nullptr)...
    };
  }
  template <class F, class... Args>
  auto parallel_for_reduce_sum
    (cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      (   launched = launched
       || Contexts::parallel_for_reduce_sum(n, sum, f, args...),
       nullptr)...
    };
  }
  cs_dispatch_sum_type_t
  get_parallel_for_i_faces_sum_type(const cs_mesh_t* m) {
    cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
    bool known = false;
    [[maybe_unused]] decltype(nullptr) try_query[] = {
      (   known = known
       || Contexts::try_get_parallel_for_i_faces_sum_type(m, sum_type),
       nullptr)...
    };
    return sum_type;
  }
  cs_dispatch_sum_type_t
  get_parallel_for_b_faces_sum_type(const cs_mesh_t* m) {
    cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
    bool known = false;
    [[maybe_unused]] decltype(nullptr) try_query[] = {
      (   known = known
       || Contexts::try_get_parallel_for_b_faces_sum_type(m, sum_type),
       nullptr)...
    };
    return sum_type;
  }

};
/*!
 * Default dispatch context: tries the device context (when built with CUDA
 * or SYCL), then the host context, then the void context.
 */

class cs_dispatch_context : public cs_combined_context<
#if defined(__NVCC__) || defined(SYCL_LANGUAGE_VERSION)
  cs_device_context,
#endif
  cs_host_context,
  cs_void_context
>
{

private:
  using base_t = cs_combined_context<
#if defined(__NVCC__) || defined(SYCL_LANGUAGE_VERSION)
  cs_device_context,
#endif
  cs_host_context,
  cs_void_context
>;

public:
  using base_t::base_t;
  using base_t::operator=;

};
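/*
 * Illustrative usage sketch of the default dispatch context (array names are
 * placeholders; when built with CUDA, the lambdas additionally need to be
 * device-callable):
 *
 * \code
 * cs_dispatch_context ctx;
 *
 * ctx.parallel_for(n_cells, [=](cs_lnum_t c_id) {
 *   rhs[c_id] = 0.;
 * });
 *
 * double total = 0.;
 * ctx.parallel_for_reduce_sum(n_cells, total,
 *                             [=](cs_lnum_t c_id, CS_DISPATCH_SUM_DOUBLE &s) {
 *   s += cell_vol[c_id];
 * });
 *
 * ctx.wait();
 * \endcode
 */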
/* Dispatch sum: scalar version. */

#ifdef __CUDA_ARCH__   // Device code when compiled with nvcc

template <typename T>
__device__ static void __forceinline__
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
#if __CUDA_ARCH__ >= 700
    using sum_v = assembled_value<T>;
    sum_v v;

    v.get() = src;
    sum_v::ref(*dest).conflict_free_add(-1u, v);
#else
    atomicAdd(dest, src);
#endif
  }
  else if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
}
#elif defined(SYCL_LANGUAGE_VERSION)

template <typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    sycl::atomic_ref<T,
                     sycl::memory_order::relaxed,
                     sycl::memory_scope::device> aref(*dest);
    aref.fetch_add(src);
  }
}
#else  // ! __CUDA_ARCH__ && ! SYCL_LANGUAGE_VERSION

template <typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    #pragma omp atomic
    *dest += src;
  }
}

#endif  // __CUDA_ARCH__
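/*
 * Illustrative interior-face assembly sketch combining a dispatch context,
 * the sum-type query, and cs_dispatch_sum (the i_face_cells, i_face_flux and
 * rhs arrays are placeholders):
 *
 * \code
 * cs_dispatch_context ctx;
 * cs_dispatch_sum_type_t i_sum_type
 *   = ctx.get_parallel_for_i_faces_sum_type(m);
 *
 * ctx.parallel_for_i_faces(m, [=](cs_lnum_t f_id) {
 *   cs_lnum_t c_id0 = i_face_cells[f_id][0];
 *   cs_lnum_t c_id1 = i_face_cells[f_id][1];
 *   cs_dispatch_sum(&rhs[c_id0],  i_face_flux[f_id], i_sum_type);
 *   cs_dispatch_sum(&rhs[c_id1], -i_face_flux[f_id], i_sum_type);
 * });
 * \endcode
 */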
/* Dispatch sum: array (dim-sized) version. */

#ifdef __CUDA_ARCH__   // Device code when compiled with nvcc

template <size_t dim, typename T>
__device__ static void __forceinline__
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
#if __CUDA_ARCH__ >= 700
    using sum_v = assembled_value<T, dim>;
    sum_v v;

    for (size_t i = 0; i < dim; i++) {
      v[i].get() = src[i];
    }

    sum_v &vs = reinterpret_cast<sum_v &>(*dest);
    vs.conflict_free_add(-1u, v);
#else
    for (size_t i = 0; i < dim; i++) {
      atomicAdd(&dest[i], src[i]);
    }
#endif
  }
}
#elif defined(SYCL_LANGUAGE_VERSION)

template <size_t dim, typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    for (size_t i = 0; i < dim; i++) {
      sycl::atomic_ref<T,
                       sycl::memory_order::relaxed,
                       sycl::memory_scope::device> aref(dest[i]);
      aref.fetch_add(src[i]);
    }
  }
}
#else  // ! __CUDA_ARCH__ && ! SYCL_LANGUAGE_VERSION

template <size_t dim, typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    for (size_t i = 0; i < dim; i++) {
      #pragma omp atomic
      dest[i] += src[i];
    }
  }
}

#endif  // __CUDA_ARCH__

#endif /* __CS_DISPATCH_H__ */