cs_dispatch.h
#ifndef __CS_DISPATCH_H__
#define __CS_DISPATCH_H__

/*============================================================================
 * Class to dispatch computation using various runtimes (OpenMP, CUDA, ...)
 *============================================================================*/

/*
  This file is part of code_saturne, a general-purpose CFD tool.

  Copyright (C) 1998-2025 EDF S.A.

  This program is free software; you can redistribute it and/or modify it under
  the terms of the GNU General Public License as published by the Free Software
  Foundation; either version 2 of the License, or (at your option) any later
  version.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  details.

  You should have received a copy of the GNU General Public License along with
  this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
  Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

// Valid only for C++

#ifdef __cplusplus

/*----------------------------------------------------------------------------*/

#include "base/cs_defs.h"

/*----------------------------------------------------------------------------
 * Standard C++ library headers
 *----------------------------------------------------------------------------*/

#include <utility>

#if defined(SYCL_LANGUAGE_VERSION)
#include <sycl/sycl.hpp>
#endif

/*----------------------------------------------------------------------------
 * Local headers
 *----------------------------------------------------------------------------*/

#include "base/cs_assert.h"
#include "base/cs_mem.h"

#ifdef __NVCC__
#include "base/cs_base_cuda.h"
#include "alge/cs_blas_cuda.h"
#include "cs_math_cuda.cuh"
#endif

/*=============================================================================
 * Additional doxygen documentation
 *============================================================================*/

/*=============================================================================
 * Macro definitions
 *============================================================================*/

#if defined(SYCL_LANGUAGE_VERSION)

#define CS_DISPATCH_SUM_DOUBLE auto

#else

#define CS_DISPATCH_SUM_DOUBLE double

#endif

/*============================================================================
 * Type definitions
 *============================================================================*/

typedef enum {

  CS_DISPATCH_SUM_SIMPLE,
  CS_DISPATCH_SUM_ATOMIC

} cs_dispatch_sum_type_t;

template <class Derived>
class cs_dispatch_context_mixin {
public:

  // Loop over n elements
  // Must be redefined by the child class
  template <class F, class... Args>
  decltype(auto)
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) = delete;

  // Assembly loop over all interior faces
  template <class M, class F, class... Args>
  decltype(auto)
  parallel_for_i_faces(const M*   m,
                       F&&        f,
                       Args&&...  args);

  // Assembly loop over all boundary faces
  template <class M, class F, class... Args>
  decltype(auto)
  parallel_for_b_faces(const M*   m,
                       F&&        f,
                       Args&&...  args);

  // Parallel reduction with simple sum.
  // Must be redefined by the child class
  template <class F, class... Args>
  decltype(auto)
  parallel_for_reduce_sum
    (cs_lnum_t n, double& sum, F&& f, Args&&... args) = delete;

  // Query sum type for assembly loop over all interior faces
  // Must be redefined by the child class
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type(const M*                 m,
                                        cs_dispatch_sum_type_t&  st);

  // Query sum type for assembly loop over all boundary faces
  // Must be redefined by the child class
  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type(const M*                 m,
                                        cs_dispatch_sum_type_t&  st);

};

// Default implementation of parallel_for_i_faces based on parallel_for
template <class Derived>
template <class M, class F, class... Args>
decltype(auto) cs_dispatch_context_mixin<Derived>::parallel_for_i_faces
  (const M* m, F&& f, Args&&... args) {
  return static_cast<Derived*>(this)->parallel_for
           (m->n_i_faces,
            static_cast<F&&>(f),
            static_cast<Args&&>(args)...);
}

// Default implementation of parallel_for_b_faces based on parallel_for
template <class Derived>
template <class M, class F, class... Args>
decltype(auto) cs_dispatch_context_mixin<Derived>::parallel_for_b_faces
  (const M* m, F&& f, Args&&... args) {
  return static_cast<Derived*>(this)->parallel_for
           (m->n_b_faces,
            static_cast<F&&>(f),
            static_cast<Args&&>(args)...);
}

// Default implementation of get interior faces sum type
template <class Derived>
template <class M>
bool cs_dispatch_context_mixin<Derived>::try_get_parallel_for_i_faces_sum_type
  ([[maybe_unused]]const M*  m,
   cs_dispatch_sum_type_t&   st) {
  st = CS_DISPATCH_SUM_SIMPLE;
  return true;
}

// Default implementation of get boundary faces sum type
template <class Derived>
template <class M>
bool cs_dispatch_context_mixin<Derived>::try_get_parallel_for_b_faces_sum_type
  ([[maybe_unused]]const M*  m,
   cs_dispatch_sum_type_t&   st) {
  st = CS_DISPATCH_SUM_SIMPLE;
  return true;
}

/*
 * cs_context to execute loops with OpenMP on the CPU
 */

class cs_host_context : public cs_dispatch_context_mixin<cs_host_context> {

private:

  cs_lnum_t  n_min_per_thread;
  int        n_threads_;

public:

  cs_host_context()
    : n_min_per_thread(CS_THR_MIN), n_threads_(-1)
  {}

public:

  // Set the minimum number of elements per CPU thread.
  void
  set_n_min_per_cpu_thread(cs_lnum_t  n) {
    this->n_min_per_thread = n;
  }

  // Get the minimum number of elements per CPU thread.
  cs_lnum_t
  n_min_per_cpu_thread(void) {
    return this->n_min_per_thread;
  }

  void
  set_n_cpu_threads(int  n) {
    this->n_threads_ = n;
  }

  int
  n_cpu_threads(void) {
    return this->n_threads_;
  }

  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    int n_t = n_threads_;
    if (n_t < 0) {
      n_t = cs_glob_n_threads;
      int n_t_l = n / n_min_per_thread;
      if (n_t_l < n_t)
        n_t = n_t_l;
      if (n_t < 1)
        n_t = 1;
    }

#   pragma omp parallel for num_threads(n_t)
    for (cs_lnum_t i = 0; i < n; ++i) {
      f(i, args...);
    }
    return true;
  }

  template <class M, class F, class... Args>
  bool
  parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
    const int n_i_groups = m->i_face_numbering->n_groups;
    const int n_i_threads = m->i_face_numbering->n_threads;
    const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index;
    for (int g_id = 0; g_id < n_i_groups; g_id++) {
      #pragma omp parallel for
      for (int t_id = 0; t_id < n_i_threads; t_id++) {
        for (cs_lnum_t f_id = i_group_index[(t_id * n_i_groups + g_id) * 2];
             f_id < i_group_index[(t_id * n_i_groups + g_id) * 2 + 1];
             f_id++) {
          f(f_id, args...);
        }
      }
    }
    return true;
  }

  template <class M, class F, class... Args>
  bool
  parallel_for_b_faces(const M* m, F&& f, Args&&... args) {
    const int n_b_threads = m->b_face_numbering->n_threads;
    const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index;

    #pragma omp parallel for
    for (int t_id = 0; t_id < n_b_threads; t_id++) {
      for (cs_lnum_t f_id = b_group_index[t_id*2];
           f_id < b_group_index[t_id*2 + 1];
           f_id++) {
        f(f_id, args...);
      }
    }
    return true;
  }

  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t  n,
                          double&    sum,
                          F&&        f,
                          Args&&...  args) {
    int n_t = n_threads_;
    if (n_t < 0) {
      n_t = cs_glob_n_threads;
      int n_t_l = n / n_min_per_thread;
      if (n_t_l < n_t)
        n_t = n_t_l;
      if (n_t < 1)
        n_t = 1;
    }

    sum = 0;

#   pragma omp parallel for reduction(+:sum) num_threads(n_t)
    for (cs_lnum_t i = 0; i < n; ++i) {
      f(i, sum, args...);
    }
    return true;
  }

  // Get interior faces sum type associated with this context
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type([[maybe_unused]]const M*  m,
                                        cs_dispatch_sum_type_t&   st) {
    st = CS_DISPATCH_SUM_SIMPLE;
    return true;
  }

  // Get boundary faces sum type associated with this context
  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type([[maybe_unused]]const M*  m,
                                        cs_dispatch_sum_type_t&   st) {
    st = CS_DISPATCH_SUM_SIMPLE;
    return true;
  }

};
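
/*
  Usage sketch (illustrative only, not part of the dispatch API itself):
  scale an array on the host using this context. The function and array
  names below are hypothetical.

    static void
    _scale_array(cs_lnum_t   n,
                 cs_real_t   alpha,
                 cs_real_t  *x)
    {
      cs_host_context h_ctx;

      // Distributed over OpenMP threads; each index i is visited once.
      h_ctx.parallel_for(n, [=] (cs_lnum_t i) {
        x[i] *= alpha;
      });
    }
*/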

#if defined(__NVCC__)

/* Default kernel that loops over an integer range and calls a device functor.
   This kernel uses a grid-stride loop and thus guarantees that all
   integers are processed, even if the grid is smaller.
   All arguments *must* be passed by value to avoid passing CPU references
   to the GPU. */

template <class F, class... Args>
__global__ void cs_cuda_kernel_parallel_for(cs_lnum_t n, F f, Args... args) {
  // grid-stride loop
  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    f(id, args...);
  }
}

/* Default kernel that loops over an integer range, calls a device functor
   which accumulates into a per-thread partial sum, then reduces those
   partial sums to one value per block in b_res.
   This kernel uses a grid-stride loop and thus guarantees that all
   integers are processed, even if the grid is smaller.
   All arguments *must* be passed by value to avoid passing CPU references
   to the GPU. */

template <class F, class... Args>
__global__ void
cs_cuda_kernel_parallel_for_reduce_sum(cs_lnum_t   n,
                                       double     *b_res,
                                       F           f,
                                       Args...     args) {
  extern double __shared__ stmp[];
  const cs_lnum_t tid = threadIdx.x;

  stmp[tid] = 0;

  // grid-stride loop
  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    f(id, stmp[tid], args...);
  }

  switch (blockDim.x) {
  case 1024:
    cs_blas_cuda_block_reduce_sum<1024, 1>(stmp, tid, b_res);
    break;
  case 512:
    cs_blas_cuda_block_reduce_sum<512, 1>(stmp, tid, b_res);
    break;
  case 256:
    cs_blas_cuda_block_reduce_sum<256, 1>(stmp, tid, b_res);
    break;
  case 128:
    cs_blas_cuda_block_reduce_sum<128, 1>(stmp, tid, b_res);
    break;
  default:
    assert(0);
  }
}

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:

  long          grid_size_;
  long          block_size_;
  cudaStream_t  stream_;
  int           device_;

  bool          use_gpu_;

public:

  cs_device_context(void)
    : grid_size_(0), block_size_(256), stream_(cs_cuda_get_stream(0)),
      device_(0), use_gpu_(true)
  {
    device_ = cs_glob_cuda_device_id;
  }

  cs_device_context(long          grid_size,
                    long          block_size,
                    cudaStream_t  stream,
                    int           device)
    : grid_size_(grid_size), block_size_(block_size), stream_(stream),
      device_(device), use_gpu_(true)
  {}

  cs_device_context(long          grid_size,
                    long          block_size,
                    cudaStream_t  stream)
    : grid_size_(grid_size), block_size_(block_size), stream_(stream),
      device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  cs_device_context(long  grid_size,
                    long  block_size)
    : grid_size_(grid_size), block_size_(block_size),
      stream_(cs_cuda_get_stream(0)), device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  cs_device_context(cudaStream_t  stream)
    : grid_size_(0), block_size_(256), stream_(stream), device_(0),
      use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  // Set CUDA grid and block sizes for this context.
  //
  // \param[in]  grid_size   CUDA grid size, or -1 for automatic choice
  // \param[in]  block_size  CUDA block size (power of 2 if reduction is used)

  void
  set_cuda_grid(long  grid_size,
                long  block_size) {
    this->grid_size_ = (grid_size > 0) ? grid_size : -1;
    this->block_size_ = block_size;
  }

  // Set the CUDA stream used by this context.
  void
  set_cuda_stream(cudaStream_t  stream) {
    this->stream_ = stream;
  }

  // Set the CUDA stream used by this context from a stream id.
  void
  set_cuda_stream(int  stream_id) {
    this->stream_ = cs_cuda_get_stream(stream_id);
  }

  // Get the CUDA stream associated with this context.
  cudaStream_t
  cuda_stream(void) {
    return this->stream_;
  }

  // Set the CUDA device used by this context.
  void
  set_cuda_device(int  device) {
    this->device_ = device;
  }

  // Enable or disable GPU usage for this context.
  void
  set_use_gpu(bool  use_gpu) {
    this->use_gpu_ = use_gpu;
  }

  // Indicate whether loops from this context actually run on the GPU.
  bool
  use_gpu(void) {
    return (device_ >= 0 && use_gpu_);
  }

  // Preferred allocation mode for this context.
  cs_alloc_mode_t
  alloc_mode(void) {
    cs_alloc_mode_t amode
      = (device_ >= 0 && use_gpu_) ? CS_ALLOC_DEVICE : CS_ALLOC_HOST;
    return (amode);
  }

  // Preferred allocation mode for this context, with optional host access.
  cs_alloc_mode_t
  alloc_mode(bool readable_on_cpu) {
    cs_alloc_mode_t amode = CS_ALLOC_HOST;
    if (device_ >= 0 && use_gpu_) {
      if (readable_on_cpu)
        amode = CS_ALLOC_HOST_DEVICE_SHARED;
      else
        amode = CS_ALLOC_DEVICE;
    }
    return (amode);
  }

public:

  // Parallel loop over n elements, launched on the CUDA device.
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    if (n > 0)
      cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
        (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }

  // Assembly loop over interior faces, launched on the CUDA device
  // (atomic sums handle possible assembly conflicts).
  template <class M, class F, class... Args>
  bool
  parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
    const cs_lnum_t n = m->n_i_faces;
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    if (n > 0)
      cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
        (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }

  // Parallel reduction with simple sum over n elements,
  // using a 2-stage reduction on the CUDA device.
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t  n,
                          double&    sum,
                          F&&        f,
                          Args&&...  args) {
    sum = 0;
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }
    if (n == 0) {
      sum = 0;
      return true;
    }

    double *r_grid_, *r_reduce_;
    cs_blas_cuda_get_2_stage_reduce_buffers
      (n, 1, l_grid_size, r_grid_, r_reduce_);

    int smem_size = block_size_ * sizeof(double);
    cs_cuda_kernel_parallel_for_reduce_sum
      <<<l_grid_size, block_size_, smem_size, stream_>>>
      (n, r_grid_, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    switch (block_size_) {
    case 1024:
      cs_blas_cuda_reduce_single_block<1024, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 512:
      cs_blas_cuda_reduce_single_block<512, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 256:
      cs_blas_cuda_reduce_single_block<256, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 128:
      cs_blas_cuda_reduce_single_block<128, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    default:
      cs_assert(0);
    }

    CS_CUDA_CHECK(cudaStreamSynchronize(stream_));
    CS_CUDA_CHECK(cudaGetLastError());
    sum = r_reduce_[0];

    return true;
  }

  // Synchronize with the associated CUDA stream.
  void
  wait(void) {
    if (device_ > -1 && use_gpu_) {
      CS_CUDA_CHECK(cudaStreamSynchronize(stream_));
      CS_CUDA_CHECK(cudaGetLastError());
    }
  }

  // Get interior faces sum type associated with this context
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type([[maybe_unused]] const M  *m,
                                        cs_dispatch_sum_type_t    &st) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

  // Get boundary faces sum type associated with this context
  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type([[maybe_unused]] const M  *m,
                                        cs_dispatch_sum_type_t    &st) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};
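
/*
  Usage sketch (illustrative only): a CUDA device context with an explicit
  block size, used for a sum-of-squares reduction. The array `x` is
  hypothetical and assumed device-accessible (e.g. allocated with
  CS_ALLOC_DEVICE or CS_ALLOC_HOST_DEVICE_SHARED); the __device__ lambda
  requires nvcc's extended lambda support.

    cs_device_context d_ctx;
    d_ctx.set_cuda_grid(-1, 256);   // automatic grid size, block size 256

    double s = 0;
    d_ctx.parallel_for_reduce_sum
      (n, s, [=] __device__ (cs_lnum_t i, double &rsum) {
        rsum += x[i]*x[i];
      });

    // parallel_for_reduce_sum synchronizes on the stream before returning,
    // so s is ready for use here.
*/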

#elif defined(SYCL_LANGUAGE_VERSION)

#if !defined(CS_GLOB_SYCL_QUEUE_IS_DEFINED)
extern sycl::queue  cs_glob_sycl_queue;
#define CS_GLOB_SYCL_QUEUE_IS_DEFINED 1
#endif

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:

  sycl::queue  &queue_;
  bool          is_gpu;

  bool          use_gpu_;

public:

  cs_device_context(void)
    : queue_(cs_glob_sycl_queue), is_gpu(false), use_gpu_(true)
  {
    is_gpu = queue_.get_device().is_gpu();
  }

  // Enable or disable GPU usage for this context.
  void
  set_use_gpu(bool  use_gpu) {
    this->use_gpu_ = use_gpu;
  }

  // Indicate whether loops from this context actually run on the GPU.
  bool
  use_gpu(void) {
    return (is_gpu && use_gpu_);
  }

  // Preferred allocation mode for this context.
  cs_alloc_mode_t
  alloc_mode(void) {
    cs_alloc_mode_t amode
      = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
    return (amode);
  }

  cs_alloc_mode_t
  alloc_mode([[maybe_unused]] bool readable_on_cpu) {
    cs_alloc_mode_t amode
      = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
    return (amode);
  }

public:

  // Parallel loop over n elements, submitted to the SYCL queue.
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }

  // Assembly loop over interior faces, submitted to the SYCL queue
  // (atomic sums handle possible assembly conflicts).
  template <class M, class F, class... Args>
  bool
  parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
    const cs_lnum_t n = m->n_i_faces;
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }

  // Parallel reduction with simple sum over n elements,
  // using a SYCL reduction object.
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t  n,
                          double&    sum_,
                          F&&        f,
                          Args&&...  args) {
    sum_ = 0;
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    // TODO: use persistent allocation as we do in CUDA BLAS to avoid
    // excess allocation/deallocation.
    double *sum_ptr = (double *)sycl::malloc_shared(sizeof(double), queue_);

    queue_.parallel_for(n,
                        sycl::reduction(sum_ptr, 0., sycl::plus<double>()),
                        static_cast<F&&>(f),
                        static_cast<Args&&>(args)...).wait();

    sum_ = sum_ptr[0];

    sycl::free((void *)sum_ptr, queue_);

    return true;
  }

  // Wait for all operations submitted to the SYCL queue.
  void
  wait(void) {
    if (is_gpu && use_gpu_)
      queue_.wait();
  }

  // Get interior faces sum type associated with this context
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type([[maybe_unused]] const M  *m,
                                        cs_dispatch_sum_type_t    &st) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

  // Get boundary faces sum type associated with this context
  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type([[maybe_unused]] const M  *m,
                                        cs_dispatch_sum_type_t    &st) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};

#elif defined(HAVE_OPENMP_TARGET)

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:

  bool  is_gpu;

  bool  use_gpu_;

public:

  cs_device_context(void)
    : is_gpu(false), use_gpu_(true)
  {
    // This should be improved for any actual use of this approach
    // beyond basic testing
    is_gpu = (omp_get_num_devices() > 1) ? true : false;
  }

  // Enable or disable GPU usage for this context.
  void
  set_use_gpu(bool  use_gpu) {
    this->use_gpu_ = use_gpu;
  }

  // Indicate whether loops from this context actually run on the GPU.
  bool
  use_gpu(void) {
    return (is_gpu && use_gpu_);
  }

  // Preferred allocation mode for this context.
  cs_alloc_mode_t
  alloc_mode(void) {
    cs_alloc_mode_t amode
      = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
    return (amode);
  }

  cs_alloc_mode_t
  alloc_mode([[maybe_unused]] bool readable_on_cpu) {
    cs_alloc_mode_t amode
      = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
    return (amode);
  }

public:

  // Parallel loop over n elements, offloaded with OpenMP target.
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

#   pragma omp target teams distribute parallel for
    for (cs_lnum_t i = 0; i < n; ++i) {
      f(i, args...);
    }

    return true;
  }

  // Parallel reduction with simple sum over n elements,
  // offloaded with OpenMP target.
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t  n,
                          double&    sum,
                          F&&        f,
                          Args&&...  args) {
    sum = 0;
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

#   pragma omp target teams distribute parallel for reduction(+:sum)
    for (cs_lnum_t i = 0; i < n; ++i) {
      f(i, sum, args...);
    }

    return true;
  }

  // No explicit synchronization is needed with this approach,
  // as the target pragmas above are synchronous.
  void
  wait(void) {
    return;
  }

  // Get interior faces sum type associated with this context
  template <class M>
  bool
  try_get_parallel_for_i_faces_sum_type([[maybe_unused]] const M  *m,
                                        cs_dispatch_sum_type_t    &st) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

  // Get boundary faces sum type associated with this context
  template <class M>
  bool
  try_get_parallel_for_b_faces_sum_type([[maybe_unused]] const M  *m,
                                        cs_dispatch_sum_type_t    &st) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};

#endif // __NVCC__ or SYCL_LANGUAGE_VERSION or HAVE_OPENMP_TARGET

class cs_void_context : public cs_dispatch_context_mixin<cs_void_context> {

public:

  cs_void_context(void)
  {}

#if !defined(__NVCC__)

  /* Fill-in for CUDA methods, so as to allow using these methods
     in the final cs_dispatch_context even when CUDA is not available,
     and without requiring a static cast of the form

     static_cast<cs_device_context&>(ctx).set_use_gpu(true);
  */

  void
  set_cuda_grid([[maybe_unused]] long  grid_size,
                [[maybe_unused]] long  block_size) {
  }

  void
  set_cuda_stream([[maybe_unused]] int  stream_id) {
  }

  void
  set_cuda_device([[maybe_unused]] int  device_id) {
  }

#endif // __NVCC__

#if !defined(__NVCC__) \
  && !defined(SYCL_LANGUAGE_VERSION) \
  && !defined(HAVE_OPENMP_TARGET)

  /* Fill-in for device methods */

  void
  set_use_gpu([[maybe_unused]] bool  use_gpu) {
  }

  bool
  use_gpu(void) {
    return false;
  }

  cs_alloc_mode_t
  alloc_mode(void) {
    return CS_ALLOC_HOST;
  }

  cs_alloc_mode_t
  alloc_mode([[maybe_unused]] bool readable_on_cpu) {
    return CS_ALLOC_HOST;
  }

  void
  wait(void) {
  }

#endif // ! __NVCC__ && ! SYCL_LANGUAGE_VERSION && ! defined(HAVE_OPENMP_TARGET)

public:

  // Abort execution if no execution method is available.
  template <class F, class... Args>
  bool parallel_for([[maybe_unused]] cs_lnum_t  n,
                    [[maybe_unused]] F&&        f,
                    [[maybe_unused]] Args&&...  args) {
    cs_assert(0);
    return false;
  }

  // Abort execution if no execution method is available.
  template <class F, class... Args>
  bool parallel_for_reduce_sum([[maybe_unused]] cs_lnum_t  n,
                               [[maybe_unused]] double&    sum,
                               [[maybe_unused]] F&&        f,
                               [[maybe_unused]] Args&&...  args) {
    cs_assert(0);
    return false;
  }

};

template <class... Contexts>
class cs_combined_context
  : public cs_dispatch_context_mixin<cs_combined_context<Contexts...>>,
    public Contexts... {

private:
  using mixin_t = cs_dispatch_context_mixin<cs_combined_context<Contexts...>>;

public:
  cs_combined_context() = default;
  cs_combined_context(Contexts... contexts)
    : Contexts(std::move(contexts))...
  {}

public:

  // In the methods below, the comma-operator array tries each combined
  // context in pack order, stopping at the first one which reports
  // (by returning true) that it launched the loop or answered the query.

  template <class M, class F, class... Args>
  auto parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      ( launched = launched
          || Contexts::parallel_for_i_faces(m, f, args...), nullptr)...
    };
  }

  template <class M, class F, class... Args>
  auto parallel_for_b_faces(const M* m, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      ( launched = launched
          || Contexts::parallel_for_b_faces(m, f, args...), nullptr)...
    };
  }

  template <class F, class... Args>
  auto parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      ( launched = launched
          || Contexts::parallel_for(n, f, args...), nullptr)...
    };
  }

  template <class F, class... Args>
  auto parallel_for_reduce_sum
    (cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      ( launched = launched
          || Contexts::parallel_for_reduce_sum(n, sum, f, args...),
        nullptr)...
    };
  }

  template <class M>
  cs_dispatch_sum_type_t
  get_parallel_for_i_faces_sum_type(const M* m) {
    cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
    bool known = false;
    [[maybe_unused]] decltype(nullptr) try_query[] = {
      ( known = known
          || Contexts::try_get_parallel_for_i_faces_sum_type(m, sum_type),
        nullptr)...
    };
    return sum_type;
  }

  template <class M>
  cs_dispatch_sum_type_t
  get_parallel_for_b_faces_sum_type(const M* m) {
    cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
    bool known = false;
    [[maybe_unused]] decltype(nullptr) try_query[] = {
      ( known = known
          || Contexts::try_get_parallel_for_b_faces_sum_type(m, sum_type),
        nullptr)...
    };
    return sum_type;
  }

};

/*----------------------------------------------------------------------------*/
/*
 * Reference dispatch context, combining a device context (when CUDA, SYCL,
 * or OpenMP target offload is available), the host context, and the void
 * context, tried in that order.
 */
/*----------------------------------------------------------------------------*/

class cs_dispatch_context : public cs_combined_context<
#if defined(__NVCC__) \
  || defined(SYCL_LANGUAGE_VERSION) \
  || defined(HAVE_OPENMP_TARGET)
  cs_device_context,
#endif
  cs_host_context,
  cs_void_context
>
{

private:
  using base_t = cs_combined_context<
#if defined(__NVCC__) \
  || defined(SYCL_LANGUAGE_VERSION) \
  || defined(HAVE_OPENMP_TARGET)
  cs_device_context,
#endif
  cs_host_context,
  cs_void_context
>;

public:
  using base_t::base_t;
  using base_t::operator=;

};

/*
  Remarks:

  Instantiation can simply be done using:

  `cs_dispatch_context ctx;`

  Instantiation can also be done with specific construction options,
  for example:

  `cs_dispatch_context ctx(cs_device_context(stream), {});`

  or:

  `cs_dispatch_context ctx(cs_device_context(), {});`

*/
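
/*
  Example (illustrative sketch, not a verbatim excerpt from code_saturne):
  a typical cell loop dispatched either to the device or to OpenMP threads,
  depending on the active context. The variable names, and the
  CS_MALLOC_HD / CS_F_HOST_DEVICE helpers (assumed here to be the
  allocation macro from cs_mem.h and the host/device annotation macro from
  cs_defs.h), are assumptions.

    cs_dispatch_context ctx;

    cs_real_t *rhs = nullptr;
    CS_MALLOC_HD(rhs, n_cells, cs_real_t, ctx.alloc_mode());

    ctx.parallel_for(n_cells, [=] CS_F_HOST_DEVICE (cs_lnum_t c_id) {
      rhs[c_id] = 0.;
    });

    ctx.wait();  // ensure completion before rhs is used on the host
*/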

/*=============================================================================
 * Global variable definitions
 *============================================================================*/

/*=============================================================================
 * Public function prototypes
 *============================================================================*/

/*----------------------------------------------------------------------------*/
/*
 * Sum (assemble) a single value into a destination location, using a
 * simple or atomic sum depending on the requested sum type.
 *
 * parameters:
 *   dest     <-> destination of the sum
 *   src      <-- value to add
 *   sum_type <-- CS_DISPATCH_SUM_SIMPLE or CS_DISPATCH_SUM_ATOMIC
 */
/*----------------------------------------------------------------------------*/

#ifdef __CUDA_ARCH__   // Test whether we are on GPU or CPU...

template <typename T>
__device__ static void __forceinline__
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
#if 1
    using sum_v = assembled_value<T>;
    sum_v v;

    v.get() = src;
    sum_v::ref(*dest).conflict_free_add(-1u, v);
#else
    atomicAdd(dest, src);
#endif
  }
  else if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
}

#elif defined(SYCL_LANGUAGE_VERSION)

template <typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    sycl::atomic_ref<T,
                     sycl::memory_order::relaxed,
                     sycl::memory_scope::device> aref(*dest);
    aref.fetch_add(src);
  }
}

#else  // ! CUDA or SYCL

template <typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    #pragma omp atomic
    *dest += src;
  }
}

#endif // __CUDA_ARCH__
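
/*
  Example (illustrative sketch, not a verbatim excerpt): assembling face
  contributions to cell values inside an interior-face loop, with the sum
  type queried from the dispatch context. The mesh members used here
  (n_i_faces, i_face_cells) follow the cs_mesh_t layout; the flux and
  right-hand-side arrays and the CS_F_HOST_DEVICE macro are assumptions,
  and all arrays are assumed accessible from the active context.

    cs_dispatch_context ctx;
    cs_dispatch_sum_type_t i_sum_type
      = ctx.get_parallel_for_i_faces_sum_type(m);

    const cs_lnum_2_t *restrict i_face_cells
      = (const cs_lnum_2_t *)m->i_face_cells;

    ctx.parallel_for_i_faces(m, [=] CS_F_HOST_DEVICE (cs_lnum_t f_id) {
      cs_lnum_t c_id0 = i_face_cells[f_id][0];
      cs_lnum_t c_id1 = i_face_cells[f_id][1];
      cs_real_t flux  = i_mass_flux[f_id];
      cs_dispatch_sum(&rhs[c_id0],  flux, i_sum_type);
      cs_dispatch_sum(&rhs[c_id1], -flux, i_sum_type);
    });
*/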

/*----------------------------------------------------------------------------*/
/*
 * Sum (assemble) an array of dim values into a destination array, using a
 * simple or atomic sum depending on the requested sum type.
 *
 * template parameters:
 *   dim  array dimension (compile-time constant)
 *
 * parameters:
 *   dest     <-> destination of the sum (size: dim)
 *   src      <-- values to add (size: dim)
 *   sum_type <-- CS_DISPATCH_SUM_SIMPLE or CS_DISPATCH_SUM_ATOMIC
 */
/*----------------------------------------------------------------------------*/

#ifdef __CUDA_ARCH__   // Test whether we are on GPU or CPU...

template <size_t dim, typename T>
__device__ static void __forceinline__
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
#if __CUDA_ARCH__ >= 700
    using sum_v = assembled_value<T, dim>;
    sum_v v;

    for (size_t i = 0; i < dim; i++) {
      v[i].get() = src[i];
    }

    sum_v &vs = reinterpret_cast<sum_v &>(*dest);
    vs.conflict_free_add(-1u, v);

    //sum_v::ref(dest).conflict_free_add(-1u, v);
#else
    for (size_t i = 0; i < dim; i++) {
      atomicAdd(&dest[i], src[i]);
    }
#endif
  }
}

#elif defined(SYCL_LANGUAGE_VERSION)

template <size_t dim, typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    for (size_t i = 0; i < dim; i++) {
      sycl::atomic_ref<T,
                       sycl::memory_order::relaxed,
                       sycl::memory_scope::device> aref(dest[i]);
      aref.fetch_add(src[i]);
    }
  }
}

#else  // ! CUDA or SYCL

template <size_t dim, typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    for (size_t i = 0; i < dim; i++) {
      #pragma omp atomic
      dest[i] += src[i];
    }
  }
}

#endif // __CUDA_ARCH__
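
/*
  Example (illustrative sketch): the array variant with an explicit
  dimension template argument, summing a 3-component contribution into a
  vector right-hand side. The arrays here are hypothetical, with rhs_v
  assumed to be of type cs_real_3_t *.

    cs_real_t f_contrib[3] = {fx, fy, fz};
    cs_dispatch_sum<3>(rhs_v[c_id], f_contrib, sum_type);
*/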

#endif /* __cplusplus */

/*----------------------------------------------------------------------------*/

#endif /* __CS_DISPATCH_H__ */