cs_dispatch.h
Go to the documentation of this file.
1#ifndef __CS_DISPATCH_H__
2#define __CS_DISPATCH_H__
3
4/*============================================================================
5 * Class to dispatch computation using various runtimes (OpenMP, CUDA, ...)
6 *============================================================================*/
7
8/*
9 This file is part of code_saturne, a general-purpose CFD tool.
10
11 Copyright (C) 1998-2024 EDF S.A.
12
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; either version 2 of the License, or (at your option) any later
16 version.
17
18 This program is distributed in the hope that it will be useful, but WITHOUT
19 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
21 details.
22
23 You should have received a copy of the GNU General Public License along with
24 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
25 Street, Fifth Floor, Boston, MA 02110-1301, USA.
26*/
27
28// Valid only for C++
29
30#ifdef __cplusplus
31
32/*----------------------------------------------------------------------------*/
33
34#include "cs_defs.h"
35
36/*----------------------------------------------------------------------------
37 * Standard C++ library headers
38 *----------------------------------------------------------------------------*/
39
40#include <utility>
41
42#if defined(SYCL_LANGUAGE_VERSION)
43#include <sycl/sycl.hpp>
44#endif
45
46/*----------------------------------------------------------------------------
47 * Local headers
48 *----------------------------------------------------------------------------*/
49
50#include "cs_assert.h"
51#include "cs_mesh.h"
52
53#ifdef __NVCC__
54#include "cs_base_cuda.h"
55#include "cs_blas_cuda.h"
56#include "cs_alge_cuda.cuh"
57#endif
58
59/*=============================================================================
60 * Macro definitions
61 *============================================================================*/
62
63#if defined(SYCL_LANGUAGE_VERSION)
64
65#define CS_DISPATCH_SUM_DOUBLE auto
66
67#else
68
69#define CS_DISPATCH_SUM_DOUBLE double
70
71#endif
72
73/*============================================================================
74 * Type definitions
75 *============================================================================*/
76
79
80typedef enum {
81
82 CS_DISPATCH_SUM_SIMPLE,
84 CS_DISPATCH_SUM_ATOMIC
85
86} cs_dispatch_sum_type_t;
87
93
94template <class Derived>
95class cs_dispatch_context_mixin {
96public:
97
98 // Loop over n elements
99 // Must be redefined by the child class
100 template <class F, class... Args>
101 decltype(auto)
102 parallel_for(cs_lnum_t n, F&& f, Args&&... args) = delete;
103
104 // Assembly loop over all internal faces
105 template <class F, class... Args>
106 decltype(auto)
107 parallel_for_i_faces(const cs_mesh_t* m,
108 F&& f,
109 Args&&... args);
110
111 // Assembly loop over all boundary faces
112 template <class F, class... Args>
113 decltype(auto)
114 parallel_for_b_faces(const cs_mesh_t* m,
115 F&& f,
116 Args&&... args);
117
118 // Parallel reduction with simple sum.
119 // Must be redefined by the child class
120 template <class F, class... Args>
121 decltype(auto)
122 parallel_for_reduce_sum
123 (cs_lnum_t n, double& sum, F&& f, Args&&... args) = delete;
124
125 // Query sum type for assembly loop over all interior faces
126 // Must be redefined by the child class
127 bool
128 try_get_parallel_for_i_faces_sum_type(const cs_mesh_t* m,
129 cs_dispatch_sum_type_t& st);
130
131 // Query sum type for assembly loop over all boundary faces
132 // Must be redefined by the child class
133 bool
134 try_get_parallel_for_b_faces_sum_type(const cs_mesh_t* m,
135 cs_dispatch_sum_type_t& st);
136
137};
138
139// Default implementation of parallel_for_i_faces based on parallel_for
140template <class Derived>
141template <class F, class... Args>
142decltype(auto) cs_dispatch_context_mixin<Derived>::parallel_for_i_faces
143 (const cs_mesh_t* m, F&& f, Args&&... args) {
144 return static_cast<Derived*>(this)->parallel_for
145 (m->n_i_faces,
146 static_cast<F&&>(f),
147 static_cast<Args&&>(args)...);
148}
149
150// Default implementation of parallel_for_b_faces based on parallel_for
151template <class Derived>
152template <class F, class... Args>
153decltype(auto) cs_dispatch_context_mixin<Derived>::parallel_for_b_faces
154 (const cs_mesh_t* m, F&& f, Args&&... args) {
155 return static_cast<Derived*>(this)->parallel_for
156 (m->n_b_faces,
157 static_cast<F&&>(f),
158 static_cast<Args&&>(args)...);
159}
160
161// Default implementation of get interior faces sum type
162template <class Derived>
163bool cs_dispatch_context_mixin<Derived>::try_get_parallel_for_i_faces_sum_type
164 ([[maybe_unused]]const cs_mesh_t* m,
165 cs_dispatch_sum_type_t& st) {
166 st = CS_DISPATCH_SUM_SIMPLE;
167 return true;
168}
169
170// Default implementation of get boundary faces sum type
171template <class Derived>
172bool cs_dispatch_context_mixin<Derived>::try_get_parallel_for_b_faces_sum_type
173 ([[maybe_unused]]const cs_mesh_t* m,
174 cs_dispatch_sum_type_t& st) {
175 st = CS_DISPATCH_SUM_SIMPLE;
176 return true;
177}
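/* For illustration only: a minimal sketch (the class name is hypothetical,
   and this class is not part of this file) of what a new context built on
   the mixin must provide, namely parallel_for and parallel_for_reduce_sum.
   The face loops and sum-type queries then fall back to the default
   implementations above. */

class cs_example_serial_context
  : public cs_dispatch_context_mixin<cs_example_serial_context> {

public:

  // Plain serial loop over n elements
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    for (cs_lnum_t i = 0; i < n; ++i)
      f(i, args...);
    return true;
  }

  // Plain serial reduction with simple sum
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    sum = 0;
    for (cs_lnum_t i = 0; i < n; ++i)
      f(i, sum, args...);
    return true;
  }

};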
178
179/*
180 * cs_context to execute loops with OpenMP on the CPU
181 */
182
183class cs_host_context : public cs_dispatch_context_mixin<cs_host_context> {
184
185private:
186
187 cs_lnum_t n_min_for_threads;
189
190public:
191
192 cs_host_context()
193 : n_min_for_threads(CS_THR_MIN)
194 {}
195
196public:
197
199 void
200 set_n_min_for_cpu_threads(cs_lnum_t n) {
201 this->n_min_for_threads = n;
202 }
203
205 cs_lnum_t
206 n_min_for_cpu_threads(void) {
207 return this->n_min_for_threads;
208 }
209
211 template <class F, class... Args>
212 bool
213 parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
214# pragma omp parallel for if (n >= n_min_for_threads)
215 for (cs_lnum_t i = 0; i < n; ++i) {
216 f(i, args...);
217 }
218 return true;
219 }
220
223 template <class F, class... Args>
224 bool
225 parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
226 const int n_i_groups = m->i_face_numbering->n_groups;
227 const int n_i_threads = m->i_face_numbering->n_threads;
228 const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index;
229 for (int g_id = 0; g_id < n_i_groups; g_id++) {
230 #pragma omp parallel for
231 for (int t_id = 0; t_id < n_i_threads; t_id++) {
232 for (cs_lnum_t f_id = i_group_index[(t_id * n_i_groups + g_id) * 2];
233 f_id < i_group_index[(t_id * n_i_groups + g_id) * 2 + 1];
234 f_id++) {
235 f(f_id, args...);
236 }
237 }
238 }
239 return true;
240 }
241
244 template <class F, class... Args>
245 bool
246 parallel_for_b_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
247 const int n_b_threads = m->b_face_numbering->n_threads;
248 const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index;
249
250 #pragma omp parallel for
251 for (int t_id = 0; t_id < n_b_threads; t_id++) {
252 for (cs_lnum_t f_id = b_group_index[t_id*2];
253 f_id < b_group_index[t_id*2 + 1];
254 f_id++) {
255 f(f_id, args...);
256 }
257 }
258 return true;
259 }
260
262 template <class F, class... Args>
263 bool
264 parallel_for_reduce_sum(cs_lnum_t n,
265 double& sum,
266 F&& f,
267 Args&&... args) {
268 sum = 0;
269# pragma omp parallel for reduction(+:sum) if (n >= n_min_for_threads)
270 for (cs_lnum_t i = 0; i < n; ++i) {
271 f(i, sum, args...);
272 }
273 return true;
274 }
275
276 // Get interior faces sum type associated with this context
277 bool
278 try_get_parallel_for_i_faces_sum_type([[maybe_unused]]const cs_mesh_t* m,
279 cs_dispatch_sum_type_t& st) {
280 st = CS_DISPATCH_SUM_SIMPLE;
281 return true;
282 }
283
284 // Get boundary faces sum type associated with this context
285 bool
286 try_get_parallel_for_b_faces_sum_type([[maybe_unused]]const cs_mesh_t* m,
287 cs_dispatch_sum_type_t& st) {
288 st = CS_DISPATCH_SUM_SIMPLE;
289 return true;
290 }
291
292};
293
294#if defined(__NVCC__)
295
296/* Default kernel that loops over an integer range and calls a device functor.
297 This kernel uses a grid_size-stride loop and thus guarantees that all
298 integers are processed, even if the grid is smaller.
299 All arguments *must* be passed by value to avoid passing CPU references
300 to the GPU. */
301
302template <class F, class... Args>
303__global__ void cs_cuda_kernel_parallel_for(cs_lnum_t n, F f, Args... args) {
304 // grid_size-stride loop
305 for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
306 id += blockDim.x * gridDim.x) {
307 f(id, args...);
308 }
309}
310
311/* Default kernel for a sum reduction over an integer range: it combines a
312 grid_size-stride loop with a block-wide shared-memory reduction, writing
313 one partial sum per block to b_res.
314 All arguments *must* be passed by value to avoid passing CPU references
315 to the GPU. */
316
317template <class F, class... Args>
318__global__ void
319cs_cuda_kernel_parallel_for_reduce_sum(cs_lnum_t n,
320 double *b_res,
321 F f,
322 Args... args) {
323 // grid_size-stride loop
324 extern double __shared__ stmp[];
325 const cs_lnum_t tid = threadIdx.x;
326
327 stmp[tid] = 0;
328
329 for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
330 id += blockDim.x * gridDim.x) {
331 f(id, stmp[tid], args...);
332 }
333
334 switch (blockDim.x) {
335 case 1024:
336 cs_blas_cuda_block_reduce_sum<1024, 1>(stmp, tid, b_res);
337 break;
338 case 512:
339 cs_blas_cuda_block_reduce_sum<512, 1>(stmp, tid, b_res);
340 break;
341 case 256:
342 cs_blas_cuda_block_reduce_sum<256, 1>(stmp, tid, b_res);
343 break;
344 case 128:
345 cs_blas_cuda_block_reduce_sum<128, 1>(stmp, tid, b_res);
346 break;
347 default:
348 assert(0);
349 }
350}
351
355
356class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {
357
358private:
359
360 long grid_size_;
363 long block_size_;
364 cudaStream_t stream_;
365 int device_;
366
367 bool use_gpu_;
368
369public:
370
372
373 cs_device_context(void)
374 : grid_size_(0), block_size_(256), stream_(cs_cuda_get_stream(0)),
375 device_(0), use_gpu_(true)
376 {
377 device_ = cs_base_cuda_get_device();
378 }
379
380 cs_device_context(long grid_size,
381 long block_size,
382 cudaStream_t stream,
383 int device)
384 : grid_size_(grid_size), block_size_(block_size), stream_(stream),
385 device_(device), use_gpu_(true)
386 {}
387
388 cs_device_context(long grid_size,
389 long block_size,
390 cudaStream_t stream)
391 : grid_size_(grid_size), block_size_(block_size), stream_(stream),
392 device_(0), use_gpu_(true)
393 {
394 device_ = cs_base_cuda_get_device();
395 }
396
397 cs_device_context(long grid_size,
398 long block_size)
399 : grid_size_(grid_size), block_size_(block_size),
400 stream_(cs_cuda_get_stream(0)), device_(0), use_gpu_(true)
401 {
402 device_ = cs_base_cuda_get_device();
403 }
404
405 cs_device_context(cudaStream_t stream)
406 : grid_size_(0), block_size_(256), stream_(stream), device_(0),
407 use_gpu_(true)
408 {
409 device_ = cs_base_cuda_get_device();
410 }
411
413
414 void
415 set_cuda_grid(long grid_size,
416 long block_size) {
417 this->grid_size_ = grid_size;
418 this->block_size_ = block_size;
419 }
420
422
423 void
424 set_cuda_stream(cudaStream_t stream) {
425 this->stream_ = stream;
426 }
427
429
430 void
431 set_cuda_stream(int stream_id) {
432 this->stream_ = cs_cuda_get_stream(stream_id);
433 }
434
436
437 cudaStream_t
438 cuda_stream(void) {
439 return this->stream_;
440 }
441
443
444 void
445 set_cuda_device(int device) {
446 this->device_ = device;
447 }
448
450
451 void
452 set_use_gpu(bool use_gpu) {
453 this->use_gpu_ = use_gpu;
454 }
455
457
458 bool
459 use_gpu(void) {
460 return (device_ >= 0 && use_gpu_);
461 }
462
464
465 cs_alloc_mode_t
466 alloc_mode(void) {
467 cs_alloc_mode_t amode
468 = (device_ >= 0 && use_gpu_) ? CS_ALLOC_DEVICE : CS_ALLOC_HOST;
469 return (amode);
470 }
471
472 cs_alloc_mode_t
473 alloc_mode(bool readable_on_cpu) {
474 cs_alloc_mode_t amode = CS_ALLOC_HOST;
475 if (device_ >= 0 && use_gpu_) {
476 if (readable_on_cpu)
477 amode = CS_ALLOC_HOST_DEVICE_SHARED;
478 else
479 amode = CS_ALLOC_DEVICE;
480 }
481 return (amode);
482 }
483
484public:
485
487 template <class F, class... Args>
488 bool
489 parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
490 if (device_ < 0 || use_gpu_ == false) {
491 return false;
492 }
493
494 long l_grid_size = grid_size_;
495 if (l_grid_size < 1) {
496 l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
497 }
498
499 cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
500 (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);
501
502 return true;
503 }
504
506 template <class F, class... Args>
507 bool
508 parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
509 const cs_lnum_t n = m->n_i_faces;
510 if (device_ < 0 || use_gpu_ == false) {
511 return false;
512 }
513
514 long l_grid_size = grid_size_;
515 if (l_grid_size < 1) {
516 l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
517 }
518
519 cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
520 (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);
521
522 return true;
523 }
524
527 template <class F, class... Args>
528 bool
529 parallel_for_reduce_sum(cs_lnum_t n,
530 double& sum,
531 F&& f,
532 Args&&... args) {
533 sum = 0;
534 if (device_ < 0 || use_gpu_ == false) {
535 return false;
536 }
537
538 long l_grid_size = grid_size_;
539 if (l_grid_size < 1) {
540 l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
541 }
542
543 double *r_grid_, *r_reduce_;
544 cs_blas_cuda_get_2_stage_reduce_buffers
545 (n, 1, l_grid_size, r_grid_, r_reduce_);
546
547 int smem_size = block_size_ * sizeof(double);
548 cs_cuda_kernel_parallel_for_reduce_sum
549 <<<l_grid_size, block_size_, smem_size, stream_>>>
550 (n, r_grid_, static_cast<F&&>(f), static_cast<Args&&>(args)...);
551
552 switch (block_size_) {
553 case 1024:
554 cs_blas_cuda_reduce_single_block<1024, 1>
555 <<<1, block_size_, 0, stream_>>>
556 (l_grid_size, r_grid_, r_reduce_);
557 break;
558 case 512:
559 cs_blas_cuda_reduce_single_block<512, 1>
560 <<<1, block_size_, 0, stream_>>>
561 (l_grid_size, r_grid_, r_reduce_);
562 break;
563 case 256:
564 cs_blas_cuda_reduce_single_block<256, 1>
565 <<<1, block_size_, 0, stream_>>>
566 (l_grid_size, r_grid_, r_reduce_);
567 break;
568 case 128:
569 cs_blas_cuda_reduce_single_block<128, 1>
570 <<<1, block_size_, 0, stream_>>>
571 (l_grid_size, r_grid_, r_reduce_);
572 break;
573 default:
574 cs_assert(0);
575 }
576
577 cudaStreamSynchronize(stream_);
578 sum = r_reduce_[0];
579
580 return true;
581 }
582
584 void
585 wait(void) {
586 if (device_ > -1 && use_gpu_)
587 cudaStreamSynchronize(stream_);
588 }
589
590 // Get interior faces sum type associated with this context
591 bool
592 try_get_parallel_for_i_faces_sum_type(const cs_mesh_t *m,
593 cs_dispatch_sum_type_t &st) {
594 if (device_ < 0 || use_gpu_ == false) {
595 return false;
596 }
597
598 st = CS_DISPATCH_SUM_ATOMIC;
599 return true;
600 }
601
602 // Get boundary faces sum type associated with this context
603 bool
604 try_get_parallel_for_b_faces_sum_type(const cs_mesh_t *m,
605 cs_dispatch_sum_type_t &st) {
606 if (device_ < 0 || use_gpu_ == false) {
607 return false;
608 }
609
610 st = CS_DISPATCH_SUM_ATOMIC;
611 return true;
612 }
613
614};
615
616#elif defined(SYCL_LANGUAGE_VERSION)
617
621
622class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {
623
624private:
625
626 sycl::queue &queue_;
627 bool is_gpu;
628
629 bool use_gpu_;
630
631public:
632
634
635 cs_device_context(void)
636 : queue_(cs_glob_sycl_queue), is_gpu(false), use_gpu_(true)
637 {
638 is_gpu = queue_.get_device().is_gpu();
639 }
640
642
643 void
644 set_use_gpu(bool use_gpu) {
645 this->use_gpu_ = use_gpu;
646 }
647
649
650 bool
651 use_gpu(void) {
652 return (is_gpu && use_gpu_);
653 }
654
656
657 cs_alloc_mode_t
658 alloc_mode(void) {
659 cs_alloc_mode_t amode
660 = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
661 return (amode);
662 }
663
664 cs_alloc_mode_t
665 alloc_mode([[maybe_unused]] bool readable_on_cpu) {
666 cs_alloc_mode_t amode
667 = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
668 return (amode);
669 }
670
671public:
672
674 template <class F, class... Args>
675 bool
676 parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
677 if (is_gpu == false || use_gpu_ == false) {
678 return false;
679 }
680
681 queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);
682
683 return true;
684 }
685
687 template <class F, class... Args>
688 bool
689 parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
690 const cs_lnum_t n = m->n_i_faces;
691 if (is_gpu == false || use_gpu_ == false) {
692 return false;
693 }
694
695 queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);
696
697 return true;
698 }
699
701 template <class F, class... Args>
702 bool
703 parallel_for_reduce_sum(cs_lnum_t n,
704 double& sum_,
705 F&& f,
706 Args&&... args) {
707 sum_ = 0;
708 if (is_gpu == false || use_gpu_ == false) {
709 return false;
710 }
711
712 // TODO: use persistent allocation as we do in CUDA BLAS to avoid
713 // excess allocation/deallocation.
714 double *sum_ptr = (double *)sycl::malloc_shared(sizeof(double), queue_);
715
716 queue_.parallel_for(n,
717 sycl::reduction(sum_ptr, 0., sycl::plus<double>()),
718 static_cast<F&&>(f),
719 static_cast<Args&&>(args)...).wait();
720
721 sum_ = sum_ptr[0];
722
723 sycl::free((void *)sum_ptr, queue_);
724
725 return true;
726 }
727
729 void
730 wait(void) {
731 if (is_gpu && use_gpu_)
732 queue_.wait();
733 }
734
735 // Get interior faces sum type associated with this context
736 bool
737 try_get_parallel_for_i_faces_sum_type(const cs_mesh_t *m,
738 cs_dispatch_sum_type_t &st) {
739 if (is_gpu == false || use_gpu_ == false) {
740 return false;
741 }
742
743 st = CS_DISPATCH_SUM_ATOMIC;
744 return true;
745 }
746
747 // Get boundary faces sum type associated with this context
748 bool
749 try_get_parallel_for_b_faces_sum_type(const cs_mesh_t *m,
750 cs_dispatch_sum_type_t &st) {
751 if (is_gpu == false || use_gpu_ == false) {
752 return false;
753 }
754
755 st = CS_DISPATCH_SUM_ATOMIC;
756 return true;
757 }
758
759};
760
761#endif // __NVCC__ or SYCL
762
766
767class cs_void_context : public cs_dispatch_context_mixin<cs_void_context> {
768
769public:
770
772
773 cs_void_context(void)
774 {}
775
776#if !defined(__NVCC__)
777
778 /* Fill-in for CUDA methods, so as to allow using these methods
779 in final cs_dispatch_context even when CUDA is not available,
780 and without requiring a static cast of the form
781
782 static_cast<cs_device_context&>(ctx).set_use_gpu(true);
783 */
784
785 void
786 set_cuda_grid([[maybe_unused]] long grid_size,
787 [[maybe_unused]] long block_size) {
788 }
789
790 void
791 set_cuda_stream([[maybe_unused]] int stream_id) {
792 }
793
794 void
795 set_cuda_device([[maybe_unused]] int device_id) {
796 }
797
798#endif // __NVCC__
799
800#if !defined(__NVCC__) && !defined(SYCL_LANGUAGE_VERSION)
801
802 /* Fill-in for device methods */
803
804 void
805 set_use_gpu([[maybe_unused]] bool use_gpu) {
806 }
807
809
810 bool
811 use_gpu(void) {
812 return false;
813 }
814
816
817 cs_alloc_mode_t
818 alloc_mode(void) {
819 return CS_ALLOC_HOST;
820 }
821
822 cs_alloc_mode_t
823 alloc_mode([[maybe_unused]] bool readable_on_cpu) {
824 return CS_ALLOC_HOST;
825 }
826
827 void
828 wait(void) {
829 }
830
831#endif // ! __NVCC__ && ! SYCL_LANGUAGE_VERSION
832
833public:
834
835 // Abort execution if no execution method is available.
836 template <class F, class... Args>
837 bool parallel_for([[maybe_unused]] cs_lnum_t n,
838 [[maybe_unused]] F&& f,
839 [[maybe_unused]] Args&&... args) {
840 cs_assert(0);
841 return false;
842 }
843
844 // Abort execution if no execution method is available.
845 template <class F, class... Args>
846 bool parallel_for_reduce_sum([[maybe_unused]] cs_lnum_t n,
847 [[maybe_unused]] double& sum,
848 [[maybe_unused]] F&& f,
849 [[maybe_unused]] Args&&... args) {
850 cs_assert(0);
851 return false;
852 }
853
854};
855
860
861template <class... Contexts>
862class cs_combined_context
863 : public cs_dispatch_context_mixin<cs_combined_context<Contexts...>>,
864 public Contexts... {
865
866private:
867 using mixin_t = cs_dispatch_context_mixin<cs_combined_context<Contexts...>>;
868
869public:
870 cs_combined_context() = default;
871 cs_combined_context(Contexts... contexts)
872 : Contexts(std::move(contexts))...
873 {}
874
875public:
876
877 template <class F, class... Args>
878 auto parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
879 bool launched = false;
880 [[maybe_unused]] decltype(nullptr) try_execute[] = {
881 ( launched = launched
882 || Contexts::parallel_for_i_faces(m, f, args...), nullptr)...
883 };
884 }
885
886 template <class F, class... Args>
887 auto parallel_for_b_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
888 bool launched = false;
889 [[maybe_unused]] decltype(nullptr) try_execute[] = {
890 ( launched = launched
891 || Contexts::parallel_for_b_faces(m, f, args...), nullptr)...
892 };
893 }
894
895 template <class F, class... Args>
896 auto parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
897 bool launched = false;
898 [[maybe_unused]] decltype(nullptr) try_execute[] = {
899 ( launched = launched
900 || Contexts::parallel_for(n, f, args...), nullptr)...
901 };
902 }
903
904 template <class F, class... Args>
905 auto parallel_for_reduce_sum
906 (cs_lnum_t n, double& sum, F&& f, Args&&... args) {
907 bool launched = false;
908 [[maybe_unused]] decltype(nullptr) try_execute[] = {
909 ( launched = launched
910 || Contexts::parallel_for_reduce_sum(n, sum, f, args...),
911 nullptr)...
912 };
913 }
914
915 cs_dispatch_sum_type_t
916 get_parallel_for_i_faces_sum_type(const cs_mesh_t* m) {
917 cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
918 bool known = false;
919 [[maybe_unused]] decltype(nullptr) try_query[] = {
920 ( known = known
921 || Contexts::try_get_parallel_for_i_faces_sum_type(m, sum_type),
922 nullptr)...
923 };
924 return sum_type;
925 }
926
927 cs_dispatch_sum_type_t
928 get_parallel_for_b_faces_sum_type(const cs_mesh_t* m) {
929 cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
930 bool known = false;
931 [[maybe_unused]] decltype(nullptr) try_query[] = {
932 ( known = known
933 || Contexts::try_get_parallel_for_b_faces_sum_type(m, sum_type),
934 nullptr)...
935 };
936 return sum_type;
937 }
938
939};
940
941/*----------------------------------------------------------------------------*/
946/*----------------------------------------------------------------------------*/
947
948class cs_dispatch_context : public cs_combined_context<
949#if defined(__NVCC__) || defined(SYCL_LANGUAGE_VERSION)
950 cs_device_context,
951#endif
952 cs_host_context,
953 cs_void_context
954>
955{
956
957private:
958 using base_t = cs_combined_context<
959#if defined(__NVCC__) || defined(SYCL_LANGUAGE_VERSION)
960 cs_device_context,
961#endif
962 cs_host_context,
963 cs_void_context
964>;
965
966public:
967 using base_t::base_t;
968 using base_t::operator=;
969
970};
971
972/*
973 Remarks:
974
975 Instantiation can simply be done using:
976
977 `cs_dispatch_context ctx;`
978
979 Instantiation can also be done with specific construction options,
980 for example:
981
982 `cs_dispatch_context ctx(cs_device_context(stream), {});`
983
984 or:
985
986 `cs_dispatch_context ctx(cs_device_context(), {});`
987
988*/
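/* For illustration only: a minimal usage sketch, assuming a host-only build
   (with CUDA or SYCL the lambdas must also be usable on the device); the
   function name is hypothetical. It scales an array, then reduces the sum of
   its entries through the same context. */

inline double
cs_dispatch_example_scale_and_sum(cs_lnum_t   n,
                                  cs_real_t  *x)
{
  cs_dispatch_context ctx;

  // Lower the OpenMP threshold so even small arrays are threaded (optional).
  ctx.set_n_min_for_cpu_threads(1);

  // Element-wise operation dispatched to the selected runtime.
  ctx.parallel_for(n, [=] (cs_lnum_t i) {
    x[i] *= 2.;
  });

  // Sum reduction; CS_DISPATCH_SUM_DOUBLE adapts the accumulator type
  // to the runtime (a plain double here, a SYCL reducer otherwise).
  double sum = 0.;
  ctx.parallel_for_reduce_sum(n, sum, [=] (cs_lnum_t i,
                                           CS_DISPATCH_SUM_DOUBLE &s) {
    s += x[i];
  });

  ctx.wait();  // make sure asynchronous (device) kernels have completed

  return sum;
}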
989
990/*=============================================================================
991 * Global variable definitions
992 *============================================================================*/
993
994/*=============================================================================
995 * Public function prototypes
996 *============================================================================*/
997
998/*----------------------------------------------------------------------------*/
1011/*----------------------------------------------------------------------------*/
1012
1013#ifdef __CUDA_ARCH__ // Test whether we are on GPU or CPU...
1014
1015template <typename T>
1016__device__ static void __forceinline__
1017cs_dispatch_sum(T *dest,
1018 const T src,
1019 cs_dispatch_sum_type_t sum_type)
1020{
1021 if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1022#if 1
1023 using sum_v = assembled_value<T>;
1024 sum_v v;
1025
1026 v.get() = src;
1027 sum_v::ref(*dest).conflict_free_add(-1u, v);
1028#else
1029 atomicAdd(dest, src);
1030#endif
1031 }
1032 else if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1033 *dest += src;
1034 }
1035}
1036
1037#elif defined(SYCL_LANGUAGE_VERSION)
1038
1039template <typename T>
1040inline void
1041cs_dispatch_sum(T *dest,
1042 const T src,
1043 cs_dispatch_sum_type_t sum_type)
1044{
1045 if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1046 *dest += src;
1047 }
1048 else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1049 sycl::atomic_ref<T,
1050 sycl::memory_order::relaxed,
1051 sycl::memory_scope::device> aref(*dest);
1052 aref.fetch_add(src);
1053 }
1054}
1055
1056#else // ! CUDA or SYCL
1057
1058template <typename T>
1059inline void
1060cs_dispatch_sum(T *dest,
1061 const T src,
1062 cs_dispatch_sum_type_t sum_type)
1063{
1064 if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1065 *dest += src;
1066 }
1067 else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1068 #pragma omp atomic
1069 *dest += src;
1070 }
1071}
1072
1073#endif // __CUDA_ARCH__
1074
1075/*----------------------------------------------------------------------------*/
1089/*----------------------------------------------------------------------------*/
1090
1091#ifdef __CUDA_ARCH__ // Test whether we are on GPU or CPU...
1092
1093template <size_t dim, typename T>
1094__device__ static void __forceinline__
1095cs_dispatch_sum(T *dest,
1096 const T *src,
1097 cs_dispatch_sum_type_t sum_type)
1098{
1099 if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1100 for (size_t i = 0; i < dim; i++) {
1101 dest[i] += src[i];
1102 }
1103 }
1104 else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1105#if __CUDA_ARCH__ >= 700
1106 using sum_v = assembled_value<T, dim>;
1107 sum_v v;
1108
1109 for (size_t i = 0; i < dim; i++) {
1110 v[i].get() = src[i];
1111 }
1112
1113 sum_v &vs = reinterpret_cast<sum_v &>(*dest);
1114 vs.conflict_free_add(-1u, v);
1115
1116 //sum_v::ref(dest).conflict_free_add(-1u, v);
1117#else
1118 for (size_t i = 0; i < dim; i++) {
1119 atomicAdd(&dest[i], src[i]);
1120 }
1121#endif
1122 }
1123}
1124
1125#elif defined(SYCL_LANGUAGE_VERSION)
1126
1127template <size_t dim, typename T>
1128inline void
1129cs_dispatch_sum(T *dest,
1130 const T *src,
1131 cs_dispatch_sum_type_t sum_type)
1132{
1133 if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1134 for (size_t i = 0; i < dim; i++) {
1135 dest[i] += src[i];
1136 }
1137 }
1138 else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1139 for (size_t i = 0; i < dim; i++) {
1140 sycl::atomic_ref<T,
1141 sycl::memory_order::relaxed,
1142 sycl::memory_scope::device> aref(dest[i]);
1143 aref.fetch_add(src[i]);
1144 }
1145 }
1146}
1147
1148#else // ! CUDA or SYCL
1149
1150template <size_t dim, typename T>
1151inline void
1152cs_dispatch_sum(T *dest,
1153 const T *src,
1154 cs_dispatch_sum_type_t sum_type)
1155{
1156 if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1157 for (size_t i = 0; i < dim; i++) {
1158 dest[i] += src[i];
1159 }
1160 }
1161 else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1162 for (size_t i = 0; i < dim; i++) {
1163 #pragma omp atomic
1164 dest[i] += src[i];
1165 }
1166 }
1167}
1168
1169#endif // __CUDA_ARCH__
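/* For illustration only: a minimal sketch (hypothetical helper, written for a
   host-only build so the lambda carries no device decoration) of the intended
   assembly pattern: sum a face-based quantity into the two adjacent cells.
   Querying the sum type from the context keeps the same lambda correct for
   numbered (simple-sum) host loops and for atomic-sum device loops. */

inline void
cs_dispatch_example_i_face_assembly(const cs_mesh_t  *m,
                                    const cs_real_t  *i_face_flux,
                                    cs_real_t        *cell_sum)
{
  cs_dispatch_context ctx;

  const cs_lnum_2_t *i_face_cells
    = (const cs_lnum_2_t *)m->i_face_cells;

  cs_dispatch_sum_type_t sum_type
    = ctx.get_parallel_for_i_faces_sum_type(m);

  ctx.parallel_for_i_faces(m, [=] (cs_lnum_t f_id) {
    cs_lnum_t c_id_0 = i_face_cells[f_id][0];
    cs_lnum_t c_id_1 = i_face_cells[f_id][1];

    // Outgoing from cell 0, incoming for cell 1.
    cs_dispatch_sum(&cell_sum[c_id_0],  i_face_flux[f_id], sum_type);
    cs_dispatch_sum(&cell_sum[c_id_1], -i_face_flux[f_id], sum_type);
  });

  ctx.wait();
}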
1170
1171#endif /* __cplusplus */
1172
1173/*----------------------------------------------------------------------------*/
1174
1175#endif /* __CS_DISPATCH_H__ */