#ifndef __CS_DISPATCH_H__
#define __CS_DISPATCH_H__

#if defined(SYCL_LANGUAGE_VERSION)
#include <sycl/sycl.hpp>
#endif

#if defined(__NVCC__)
#include "cs_math_cuda.cuh"
#endif

#if defined(SYCL_LANGUAGE_VERSION)
#define CS_DISPATCH_SUM_DOUBLE auto
#else
#define CS_DISPATCH_SUM_DOUBLE double
#endif

/*! Type of sum used by cs_dispatch_sum. */

typedef enum {
  CS_DISPATCH_SUM_SIMPLE,  /*!< simple sum, no thread conflicts possible */
  CS_DISPATCH_SUM_ATOMIC   /*!< atomic sum, safe under concurrent updates */
} cs_dispatch_sum_type_t;
/* CRTP mixin providing the interface expected from all dispatch contexts,
   plus default implementations where possible. */

template <class Derived>
class cs_dispatch_context_mixin {

public:

  /* Loop over n elements; must be redefined by the derived context. */
  template <class F, class... Args>
  decltype(auto)
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) = delete;

  /* Loop over interior faces. */
  template <class M, class F, class... Args>
  decltype(auto)
  parallel_for_i_faces(const M* m, F&& f, Args&&... args);

  /* Loop over boundary faces. */
  template <class M, class F, class... Args>
  decltype(auto)
  parallel_for_b_faces(const M* m, F&& f, Args&&... args);

  /* Loop over n elements with a sum reduction;
     must be redefined by the derived context. */
  template <class F, class... Args>
  decltype(auto)
  parallel_for_reduce_sum
    (cs_lnum_t n, double& sum, F&& f, Args&&... args) = delete;

  /* Query the sum type to use in interior-face loops. */
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type(const M* m,
                                        cs_dispatch_sum_type_t& st);

  /* Query the sum type to use in boundary-face loops. */
  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type(const M* m,
                                        cs_dispatch_sum_type_t& st);

};
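/* Illustrative sketch (not part of the original header): a minimal custom
   context only needs to provide parallel_for and parallel_for_reduce_sum;
   the face loops and sum-type queries fall back to the mixin defaults.
   The cs_serial_context name below is hypothetical. */

#if 0  /* example only */

class cs_serial_context
  : public cs_dispatch_context_mixin<cs_serial_context> {

public:

  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    for (cs_lnum_t i = 0; i < n; i++)
      f(i, args...);
    return true;
  }

  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    sum = 0;
    for (cs_lnum_t i = 0; i < n; i++)
      f(i, sum, args...);
    return true;
  }

};

#endif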
/* Default implementation of parallel_for_i_faces:
   loop over all interior faces using the context's parallel_for. */

template <class Derived>
template <class M, class F, class... Args>
decltype(auto) cs_dispatch_context_mixin<Derived>::parallel_for_i_faces
  (const M* m, F&& f, Args&&... args) {
  return static_cast<Derived*>(this)->parallel_for
                                        (m->n_i_faces,
                                         static_cast<F&&>(f),
                                         static_cast<Args&&>(args)...);
}
/* Default implementation of parallel_for_b_faces:
   loop over all boundary faces using the context's parallel_for. */

template <class Derived>
template <class M, class F, class... Args>
decltype(auto) cs_dispatch_context_mixin<Derived>::parallel_for_b_faces
  (const M* m, F&& f, Args&&... args) {
  return static_cast<Derived*>(this)->parallel_for
                                        (m->n_b_faces,
                                         static_cast<F&&>(f),
                                         static_cast<Args&&>(args)...);
}
/* Default: a simple sum is always usable for interior-face loops. */

template <class Derived>
template <class M>
bool cs_dispatch_context_mixin<Derived>::try_get_parallel_for_i_faces_sum_type
  ([[maybe_unused]] const M* m,
   cs_dispatch_sum_type_t& st) {
  st = CS_DISPATCH_SUM_SIMPLE;
  return true;
}
/* Default: a simple sum is always usable for boundary-face loops. */

template <class Derived>
template <class M>
bool cs_dispatch_context_mixin<Derived>::try_get_parallel_for_b_faces_sum_type
  ([[maybe_unused]] const M* m,
   cs_dispatch_sum_type_t& st) {
  st = CS_DISPATCH_SUM_SIMPLE;
  return true;
}
/*! Context to run loops on the host, using OpenMP. */

class cs_host_context : public cs_dispatch_context_mixin<cs_host_context> {

private:

  cs_lnum_t  n_min_per_thread;  /*!< minimum number of elements per thread */
  int        n_threads_;        /*!< if > 0, forced number of threads */

public:

  cs_host_context()
    : n_min_per_thread(CS_THR_MIN), n_threads_(-1)
  {}

  /*! Set/get minimum number of elements per thread. */
  void set_n_min_per_cpu_thread(cs_lnum_t n) {
    this->n_min_per_thread = n;
  }
  cs_lnum_t n_min_per_cpu_thread(void) {
    return this->n_min_per_thread;
  }

  /*! Set/get number of threads (-1: use the OpenMP default). */
  void set_n_cpu_threads(int n) {
    this->n_threads_ = n;
  }
  int n_cpu_threads(void) {
    return this->n_threads_;
  }

  /*! Parallel loop over n elements, using OpenMP. */
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    int n_t = n_threads_;
    if (n_t < 0) {
      n_t = cs_glob_n_threads;
      int n_t_l = n / n_min_per_thread;
      if (n_t_l < n_t)
        n_t = n_t_l;
      if (n_t < 1)
        n_t = 1;
    }

#   pragma omp parallel for num_threads(n_t)
    for (cs_lnum_t i = 0; i < n; i++) {
      f(i, args...);
    }
    return true;
  }

  /*! Loop over the interior faces of a mesh, using the face numbering
      (groups and threads) that avoids write conflicts between threads. */
  template <class M, class F, class... Args>
  bool
  parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
    const int n_i_groups  = m->i_face_numbering->n_groups;
    const int n_i_threads = m->i_face_numbering->n_threads;
    const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index;

    for (int g_id = 0; g_id < n_i_groups; g_id++) {
      #pragma omp parallel for
      for (int t_id = 0; t_id < n_i_threads; t_id++) {
        for (cs_lnum_t f_id = i_group_index[(t_id * n_i_groups + g_id) * 2];
             f_id < i_group_index[(t_id * n_i_groups + g_id) * 2 + 1];
             f_id++) {
          f(f_id, args...);
        }
      }
    }
    return true;
  }

  /*! Loop over the boundary faces of a mesh, using the conflict-free
      face numbering. */
  template <class M, class F, class... Args>
  bool
  parallel_for_b_faces(const M* m, F&& f, Args&&... args) {
    const int n_b_threads = m->b_face_numbering->n_threads;
    const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index;

    #pragma omp parallel for
    for (int t_id = 0; t_id < n_b_threads; t_id++) {
      for (cs_lnum_t f_id = b_group_index[t_id*2];
           f_id < b_group_index[t_id*2 + 1];
           f_id++) {
        f(f_id, args...);
      }
    }
    return true;
  }

  /*! Parallel loop over n elements with a sum reduction, using OpenMP. */
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    int n_t = n_threads_;
    if (n_t < 0) {
      n_t = cs_glob_n_threads;
      int n_t_l = n / n_min_per_thread;
      if (n_t_l < n_t)
        n_t = n_t_l;
      if (n_t < 1)
        n_t = 1;
    }

    sum = 0;
#   pragma omp parallel for reduction(+:sum) num_threads(n_t)
    for (cs_lnum_t i = 0; i < n; i++) {
      f(i, sum, args...);
    }
    return true;
  }

  /*! On the host, face loops use a conflict-free numbering,
      so a simple sum is always sufficient. */
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type([[maybe_unused]] const M* m,
                                        cs_dispatch_sum_type_t& st) {
    st = CS_DISPATCH_SUM_SIMPLE;
    return true;
  }

  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type([[maybe_unused]] const M* m,
                                        cs_dispatch_sum_type_t& st) {
    st = CS_DISPATCH_SUM_SIMPLE;
    return true;
  }

};
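/* Illustrative sketch (not part of the original header): running a simple
   loop and a sum reduction with the host context. The array names below
   are hypothetical. */

#if 0  /* example only */

static double
example_host_axpy_and_norm(cs_lnum_t n, double a,
                           const double *x, double *y)
{
  cs_host_context ctx;

  ctx.parallel_for(n, [=](cs_lnum_t i) {
    y[i] += a * x[i];
  });

  double s = 0;
  ctx.parallel_for_reduce_sum(n, s, [=](cs_lnum_t i, double &sum) {
    sum += y[i] * y[i];
  });

  return s;
}

#endif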
#if defined(__NVCC__)

/* CUDA implementation */

/*! Kernel for lambda-based parallel loops, using a grid-stride loop. */

template <class F, class... Args>
__global__ void
cs_cuda_kernel_parallel_for(cs_lnum_t n, F f, Args... args) {
  // grid-stride loop
  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    f(id, args...);
  }
}
/*! Kernel for lambda-based parallel loops with a sum reduction: each block
    accumulates partial sums in shared memory, then reduces them into
    b_res[blockIdx.x]. */

template <class F, class... Args>
__global__ void
cs_cuda_kernel_parallel_for_reduce_sum(cs_lnum_t n,
                                       double  *b_res,
                                       F        f,
                                       Args...  args) {
  extern double __shared__ stmp[];

  cs_lnum_t tid = threadIdx.x;
  stmp[tid] = 0;

  // grid-stride loop; each thread sums its contributions locally
  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    f(id, stmp[tid], args...);
  }

  // block-level reduction, specialized on the block size
  switch (blockDim.x) {
  case 1024:
    cs_blas_cuda_block_reduce_sum<1024, 1>(stmp, tid, b_res);
    break;
  case 512:
    cs_blas_cuda_block_reduce_sum<512, 1>(stmp, tid, b_res);
    break;
  case 256:
    cs_blas_cuda_block_reduce_sum<256, 1>(stmp, tid, b_res);
    break;
  case 128:
    cs_blas_cuda_block_reduce_sum<128, 1>(stmp, tid, b_res);
    break;
  }
}
/*! Context to run loops on a CUDA device. */

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:

  long          grid_size_;   /*!< grid size; if <= 0, computed from n */
  long          block_size_;  /*!< block size */
  cudaStream_t  stream_;      /*!< associated CUDA stream */
  int           device_;      /*!< associated CUDA device id */
  bool          use_gpu_;     /*!< run on GPU ? */

public:

  /*! Constructors. */
  cs_device_context(void)
    : grid_size_(0), block_size_(256), stream_(cs_cuda_get_stream(0)),
      device_(0), use_gpu_(true)
  {
    device_ = cs_glob_cuda_device_id;
  }

  cs_device_context(long grid_size, long block_size,
                    cudaStream_t stream, int device)
    : grid_size_(grid_size), block_size_(block_size), stream_(stream),
      device_(device), use_gpu_(true)
  {}

  cs_device_context(long grid_size, long block_size, cudaStream_t stream)
    : grid_size_(grid_size), block_size_(block_size), stream_(stream),
      device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  cs_device_context(long grid_size, long block_size)
    : grid_size_(grid_size), block_size_(block_size),
      stream_(cs_cuda_get_stream(0)), device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  cs_device_context(cudaStream_t stream)
    : grid_size_(0), block_size_(256), stream_(stream), device_(0),
      use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  /*! Set the CUDA grid and block sizes (grid_size <= 0: automatic grid). */
  void set_cuda_grid(long grid_size, long block_size) {
    this->grid_size_ = (grid_size > 0) ? grid_size : -1;
    this->block_size_ = block_size;
  }

  /*! Set the associated CUDA stream, directly or by id. */
  void set_cuda_stream(cudaStream_t stream) {
    this->stream_ = stream;
  }
  void set_cuda_stream(int stream_id) {
    this->stream_ = cs_cuda_get_stream(stream_id);
  }

  /*! Get the associated CUDA stream. */
  cudaStream_t cuda_stream(void) {
    return this->stream_;
  }

  /*! Set the associated CUDA device id. */
  void set_cuda_device(int device) {
    this->device_ = device;
  }

  /*! Set or unset execution on the GPU; query whether the GPU is used. */
  void set_use_gpu(bool use_gpu) {
    this->use_gpu_ = use_gpu;
  }
  bool use_gpu(void) {
    return (device_ >= 0 && use_gpu_);
  }

  /*! Recommended allocation mode for arrays used with this context. */
  cs_alloc_mode_t
  alloc_mode(bool readable_on_cpu) {
    cs_alloc_mode_t amode = CS_ALLOC_HOST;
    if (device_ >= 0 && use_gpu_) {
      if (readable_on_cpu)
        amode = CS_ALLOC_HOST_DEVICE_SHARED;
      else
        amode = CS_ALLOC_DEVICE;
    }
    return amode;
  }
  /*! Parallel loop over n elements, on the device when enabled. */
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;  // fall back to another context
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
      (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }
  /*! Loop over the interior faces of a mesh, on the device when enabled. */
  template <class M, class F, class... Args>
  bool
  parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
    const cs_lnum_t n = m->n_i_faces;
    if (device_ < 0 || use_gpu_ == false) {
      return false;  // fall back to another context
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
      (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }
  /*! Parallel loop over n elements with a sum reduction, on the device
      when enabled. Uses a two-stage reduction: per-block partial sums,
      then a single-block final reduction. */
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    sum = 0;
    if (device_ < 0 || use_gpu_ == false) {
      return false;  // fall back to another context
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    double *r_grid_, *r_reduce_;
    cs_blas_cuda_get_2_stage_reduce_buffers
      (n, 1, l_grid_size, r_grid_, r_reduce_);

    int smem_size = block_size_ * sizeof(double);
    cs_cuda_kernel_parallel_for_reduce_sum
      <<<l_grid_size, block_size_, smem_size, stream_>>>
      (n, r_grid_, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    switch (block_size_) {
    case 1024:
      cs_blas_cuda_reduce_single_block<1024, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 512:
      cs_blas_cuda_reduce_single_block<512, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 256:
      cs_blas_cuda_reduce_single_block<256, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 128:
      cs_blas_cuda_reduce_single_block<128, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    }

    CS_CUDA_CHECK(cudaStreamSynchronize(stream_));
    CS_CUDA_CHECK(cudaGetLastError());

    sum = r_reduce_[0];

    return true;
  }
  /*! Synchronize the associated CUDA stream. */
  void
  wait(void) {
    if (device_ > -1 && use_gpu_) {
      CS_CUDA_CHECK(cudaStreamSynchronize(stream_));
      CS_CUDA_CHECK(cudaGetLastError());
    }
  }

  /*! On the GPU, face loops may update the same cell from several
      threads, so an atomic sum is required. */
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type(const M *m,
                                        cs_dispatch_sum_type_t &st) {
    if (device_ < 0 || use_gpu_ == false)
      return false;
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type(const M *m,
                                        cs_dispatch_sum_type_t &st) {
    if (device_ < 0 || use_gpu_ == false)
      return false;
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};
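/* Illustrative sketch (not part of the original header): configuring and
   using the CUDA device context. The lambda must be device-callable
   (e.g. compiled with extended lambdas and marked __device__), x is
   assumed to be device-accessible, and the array name is hypothetical. */

#if 0  /* example only */

static void
example_device_scale(cs_lnum_t n, double a, double *x)
{
  cs_device_context ctx;
  ctx.set_cuda_grid(0, 256);   /* automatic grid size, 256 threads/block */
  ctx.set_cuda_stream(0);      /* use stream 0 */

  bool launched = ctx.parallel_for(n, [=] __device__ (cs_lnum_t i) {
    x[i] *= a;
  });

  if (launched)
    ctx.wait();   /* synchronize before reusing x on the host */
}

#endif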
#elif defined(SYCL_LANGUAGE_VERSION)

/* SYCL implementation */

#if !defined(CS_GLOB_SYCL_QUEUE_IS_DEFINED)
extern sycl::queue cs_glob_sycl_queue;
#define CS_GLOB_SYCL_QUEUE_IS_DEFINED 1
#endif

/*! Context to run loops on a SYCL device. */

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:

  sycl::queue  &queue_;    /*!< associated SYCL queue */
  bool          is_gpu;    /*!< does the queue target a GPU ? */
  bool          use_gpu_;  /*!< run on GPU ? */

public:

  /*! Default constructor, based on the global SYCL queue. */
  cs_device_context(void)
    : queue_(cs_glob_sycl_queue), is_gpu(false), use_gpu_(true)
  {
    is_gpu = queue_.get_device().is_gpu();
  }

  /*! Set or unset execution on the GPU; query whether the GPU is used. */
  void set_use_gpu(bool use_gpu) {
    this->use_gpu_ = use_gpu;
  }
  bool use_gpu(void) {
    return (is_gpu && use_gpu_);
  }
  /*! Recommended allocation mode for arrays used with this context
      (shared allocations are always readable on the CPU). */
  cs_alloc_mode_t
  alloc_mode([[maybe_unused]] bool readable_on_cpu) {
    cs_alloc_mode_t amode
      = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
    return amode;
  }

  /*! Parallel loop over n elements, on the device when enabled. */
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;  // fall back to another context
    }

    queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);
    return true;
  }
  /*! Loop over the interior faces of a mesh, on the device when enabled. */
  template <class M, class F, class... Args>
  bool
  parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
    const cs_lnum_t n = m->n_i_faces;
    if (is_gpu == false || use_gpu_ == false) {
      return false;  // fall back to another context
    }

    queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);
    return true;
  }
  /*! Parallel loop over n elements with a sum reduction, on the device
      when enabled, using a SYCL reduction on a shared allocation. */
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    sum = 0;
    if (is_gpu == false || use_gpu_ == false) {
      return false;  // fall back to another context
    }

    double *sum_ptr = (double *)sycl::malloc_shared(sizeof(double), queue_);

    queue_.parallel_for(n,
                        sycl::reduction(sum_ptr, 0., sycl::plus<double>()),
                        static_cast<F&&>(f),
                        static_cast<Args&&>(args)...).wait();

    sum = sum_ptr[0];

    sycl::free((void *)sum_ptr, queue_);
    return true;
  }
  /*! Synchronize the associated SYCL queue. */
  void
  wait(void) {
    if (is_gpu && use_gpu_)
      queue_.wait();
  }

  /*! On the GPU, face loops may update the same cell from several
      work-items, so an atomic sum is required. */
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type(const M *m,
                                        cs_dispatch_sum_type_t &st) {
    if (is_gpu == false || use_gpu_ == false)
      return false;
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type(const M *m,
                                        cs_dispatch_sum_type_t &st) {
    if (is_gpu == false || use_gpu_ == false)
      return false;
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};
#elif defined(HAVE_OPENMP_TARGET)

/* OpenMP target (offload) implementation */

/*! Context to run loops on a device using OpenMP target offload. */

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:

  bool  is_gpu;    /*!< is an offload device available ? */
  bool  use_gpu_;  /*!< run on GPU ? */

public:

  /*! Default constructor. */
  cs_device_context(void)
    : is_gpu(false), use_gpu_(true)
  {
    is_gpu = (omp_get_num_devices() > 1) ? true : false;
  }

  /*! Set or unset execution on the GPU; query whether the GPU is used. */
  void set_use_gpu(bool use_gpu) {
    this->use_gpu_ = use_gpu;
  }
  bool use_gpu(void) {
    return (is_gpu && use_gpu_);
  }

  /*! Recommended allocation mode for arrays used with this context. */
  cs_alloc_mode_t
  alloc_mode([[maybe_unused]] bool readable_on_cpu) {
    cs_alloc_mode_t amode
      = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
    return amode;
  }
  /*! Parallel loop over n elements, offloaded when enabled. */
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;  // fall back to another context
    }

#   pragma omp target teams distribute parallel for
    for (cs_lnum_t i = 0; i < n; i++) {
      f(i, args...);
    }
    return true;
  }

  /*! Parallel loop over n elements with a sum reduction, offloaded
      when enabled. */
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    sum = 0;
    if (is_gpu == false || use_gpu_ == false) {
      return false;  // fall back to another context
    }

#   pragma omp target teams distribute parallel for reduction(+:sum)
    for (cs_lnum_t i = 0; i < n; i++) {
      f(i, sum, args...);
    }
    return true;
  }
  /*! On the GPU, face loops may update the same cell from several
      threads, so an atomic sum is required. */
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type(const M *m,
                                        cs_dispatch_sum_type_t &st) {
    if (is_gpu == false || use_gpu_ == false)
      return false;
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type(const M *m,
                                        cs_dispatch_sum_type_t &st) {
    if (is_gpu == false || use_gpu_ == false)
      return false;
    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};

#endif  /* __NVCC__, SYCL_LANGUAGE_VERSION, or HAVE_OPENMP_TARGET */
/*! Void (no-op) context: provides placeholders for device-related methods
    so that calling code does not need to depend on the available backends. */

class cs_void_context : public cs_dispatch_context_mixin<cs_void_context> {

public:

  /*! Default constructor. */
  cs_void_context(void)
  {}

#if !defined(__NVCC__)

  /* CUDA-related settings: no-ops when not built with CUDA. */

  void set_cuda_grid([[maybe_unused]] long grid_size,
                     [[maybe_unused]] long block_size) {
  }

  void set_cuda_stream([[maybe_unused]] int stream_id) {
  }

  void set_cuda_device([[maybe_unused]] int device_id) {
  }

#endif  /* !defined(__NVCC__) */

#if !defined(__NVCC__) \
  && !defined(SYCL_LANGUAGE_VERSION) \
  && !defined(HAVE_OPENMP_TARGET)

  /* Device-related settings: no-ops when no device backend is available. */

  void set_use_gpu([[maybe_unused]] bool use_gpu) {
  }

  bool use_gpu(void) {
    return false;
  }

  cs_alloc_mode_t alloc_mode([[maybe_unused]] bool readable_on_cpu) {
    return CS_ALLOC_HOST;
  }

  void wait(void) {
  }

#endif  /* no device backend */

  /*! The void context never launches anything itself. */
  template <class F, class... Args>
  bool parallel_for([[maybe_unused]] cs_lnum_t n,
                    [[maybe_unused]] F&& f,
                    [[maybe_unused]] Args&&... args) {
    return false;
  }

  template <class F, class... Args>
  bool parallel_for_reduce_sum([[maybe_unused]] cs_lnum_t n,
                               [[maybe_unused]] double& sum,
                               [[maybe_unused]] F&& f,
                               [[maybe_unused]] Args&&... args) {
    return false;
  }

};
/*! Combined context: tries each of its base contexts in order and uses
    the first one whose loop method actually launches. */

template <class... Contexts>
class cs_combined_context
  : public cs_dispatch_context_mixin<cs_combined_context<Contexts...>>,
    public Contexts... {

private:
  using mixin_t = cs_dispatch_context_mixin<cs_combined_context<Contexts...>>;

public:
  cs_combined_context() = default;
  cs_combined_context(Contexts... contexts)
    : Contexts(std::move(contexts))...
  {}
  /*! Loop over interior faces: try each context in order, stopping at
      the first one that launches the loop. */
  template <class M, class F, class... Args>
  auto parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      ( launched = launched
          || Contexts::parallel_for_i_faces(m, f, args...), nullptr)...
    };
  }

  /*! Loop over boundary faces: same strategy. */
  template <class M, class F, class... Args>
  auto parallel_for_b_faces(const M* m, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      ( launched = launched
          || Contexts::parallel_for_b_faces(m, f, args...), nullptr)...
    };
  }

  /*! Loop over n elements: same strategy. */
  template <class F, class... Args>
  auto parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      ( launched = launched
          || Contexts::parallel_for(n, f, args...), nullptr)...
    };
  }

  /*! Loop over n elements with a sum reduction: same strategy. */
  template <class F, class... Args>
  auto parallel_for_reduce_sum
    (cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      ( launched = launched
          || Contexts::parallel_for_reduce_sum(n, sum, f, args...),
        nullptr)...
    };
  }
  /*! Sum type to use in interior-face loops: the first context that
      answers the query wins. */
  template <class M>
  cs_dispatch_sum_type_t
  get_parallel_for_i_faces_sum_type(const M* m) {
    cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
    bool known = false;
    [[maybe_unused]] decltype(nullptr) try_query[] = {
      ( known = known
          || Contexts::try_get_parallel_for_i_faces_sum_type(m, sum_type),
        nullptr)...
    };
    return sum_type;
  }

  /*! Sum type to use in boundary-face loops: same strategy. */
  template <class M>
  cs_dispatch_sum_type_t
  get_parallel_for_b_faces_sum_type(const M* m) {
    cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
    bool known = false;
    [[maybe_unused]] decltype(nullptr) try_query[] = {
      ( known = known
          || Contexts::try_get_parallel_for_b_faces_sum_type(m, sum_type),
        nullptr)...
    };
    return sum_type;
  }

};
/*! Default dispatch context: a device context (when a device backend is
    available), then the host context, then the void context. */

class cs_dispatch_context : public cs_combined_context<
#if defined(__NVCC__) \
  || defined(SYCL_LANGUAGE_VERSION) \
  || defined(HAVE_OPENMP_TARGET)
  cs_device_context,
#endif
  cs_host_context,
  cs_void_context
>
{
private:

  using base_t = cs_combined_context<
#if defined(__NVCC__) \
  || defined(SYCL_LANGUAGE_VERSION) \
  || defined(HAVE_OPENMP_TARGET)
  cs_device_context,
#endif
  cs_host_context,
  cs_void_context
>;

public:

  using base_t::base_t;
  using base_t::operator=;

};
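/* Illustrative sketch (not part of the original header): the same loop
   runs on the device when a backend is active and on the host otherwise.
   Device-callable lambda annotations (e.g. __device__ for CUDA) are
   omitted here, and the array names are hypothetical. */

#if 0  /* example only */

static double
example_dot(cs_lnum_t n, const double *x, const double *y)
{
  cs_dispatch_context ctx;

  double s = 0;
  ctx.parallel_for_reduce_sum(n, s, [=](cs_lnum_t i, double &sum) {
    sum += x[i] * y[i];
  });
  ctx.wait();

  return s;
}

#endif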
#if defined(__NVCC__)

/*! Add src to *dest, honoring the requested dispatch sum type
    (CUDA device version). */

template <typename T>
__device__ static void __forceinline__
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
#if __CUDA_ARCH__ >= 700
    using sum_v = assembled_value<T>;
    sum_v v;

    v.get() = src;
    sum_v::ref(*dest).conflict_free_add(-1u, v);
#else
    atomicAdd(dest, src);
#endif
  }
  else if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
}
#elif defined(SYCL_LANGUAGE_VERSION)

/*! Add src to *dest, honoring the requested dispatch sum type
    (SYCL version). */

template <typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    sycl::atomic_ref<T,
                     sycl::memory_order::relaxed,
                     sycl::memory_scope::device> aref(*dest);
    aref.fetch_add(src);
  }
}
#else  /* host-only version */

/*! Add src to *dest, honoring the requested dispatch sum type. */

template <typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    #pragma omp atomic
    *dest += src;
  }
}

#endif  /* scalar cs_dispatch_sum variants */
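/* Illustrative sketch (not part of the original header): assembling a face
   quantity into a cell array with cs_dispatch_sum, using the sum type
   recommended by the context. The i_face_cells connectivity is assumed to
   follow the usual code_saturne mesh layout; array names are hypothetical,
   and device-lambda annotations are omitted. */

#if 0  /* example only */

template <class M>
static void
example_face_to_cell_sum(const M       *m,
                         const double  *face_flux,
                         double        *cell_sum)
{
  cs_dispatch_context ctx;

  cs_dispatch_sum_type_t sum_type
    = ctx.get_parallel_for_i_faces_sum_type(m);

  const auto *i_face_cells = m->i_face_cells;

  ctx.parallel_for_i_faces(m, [=](cs_lnum_t f_id) {
    cs_lnum_t c_id_0 = i_face_cells[f_id][0];
    cs_lnum_t c_id_1 = i_face_cells[f_id][1];

    /* a cell may be touched by several faces at once on the GPU,
       hence the dispatch sum */
    cs_dispatch_sum(&cell_sum[c_id_0],  face_flux[f_id], sum_type);
    cs_dispatch_sum(&cell_sum[c_id_1], -face_flux[f_id], sum_type);
  });

  ctx.wait();
}

#endif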
#if defined(__NVCC__)

/*! Add src[0..dim-1] to dest[0..dim-1], honoring the requested dispatch
    sum type (CUDA device version). */

template <size_t dim, typename T>
__device__ static void __forceinline__
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
#if __CUDA_ARCH__ >= 700
    using sum_v = assembled_value<T, dim>;
    sum_v v;

    for (size_t i = 0; i < dim; i++) {
      v[i].get() = src[i];
    }

    sum_v &vs = reinterpret_cast<sum_v &>(*dest);
    vs.conflict_free_add(-1u, v);
#else
    for (size_t i = 0; i < dim; i++) {
      atomicAdd(&dest[i], src[i]);
    }
#endif
  }
}
#elif defined(SYCL_LANGUAGE_VERSION)

/*! Add src[0..dim-1] to dest[0..dim-1], honoring the requested dispatch
    sum type (SYCL version). */

template <size_t dim, typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    for (size_t i = 0; i < dim; i++) {
      sycl::atomic_ref<T,
                       sycl::memory_order::relaxed,
                       sycl::memory_scope::device> aref(dest[i]);
      aref.fetch_add(src[i]);
    }
  }
}
#else  /* host-only version */

/*! Add src[0..dim-1] to dest[0..dim-1], honoring the requested dispatch
    sum type. */

template <size_t dim, typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    for (size_t i = 0; i < dim; i++) {
      #pragma omp atomic
      dest[i] += src[i];
    }
  }
}

#endif  /* vector cs_dispatch_sum variants */

#endif /* __CS_DISPATCH_H__ */