cs_dispatch.h
1#ifndef __CS_DISPATCH_H__
2#define __CS_DISPATCH_H__
3
4/*============================================================================
5 * Class to dispatch computation using various runtimes (OpenMP, CUDA, ...)
6 *============================================================================*/
7
8/*
9 This file is part of code_saturne, a general-purpose CFD tool.
10
11 Copyright (C) 1998-2024 EDF S.A.
12
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; either version 2 of the License, or (at your option) any later
16 version.
17
18 This program is distributed in the hope that it will be useful, but WITHOUT
19 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
21 details.
22
23 You should have received a copy of the GNU General Public License along with
24 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
25 Street, Fifth Floor, Boston, MA 02110-1301, USA.
26*/
27
28// Valid only for C++
29
30#ifdef __cplusplus
31
32/*----------------------------------------------------------------------------*/
33
34#include "cs_defs.h"
35
36/*----------------------------------------------------------------------------
37 * Standard C++ library headers
38 *----------------------------------------------------------------------------*/
39
40#include <utility>
41
42#if defined(SYCL_LANGUAGE_VERSION)
43#include <sycl/sycl.hpp>
44#endif
45
46/*----------------------------------------------------------------------------
47 * Local headers
48 *----------------------------------------------------------------------------*/
49
50#include "cs_assert.h"
51
52#ifdef __NVCC__
53#include "cs_base_cuda.h"
54#include "cs_blas_cuda.h"
55#include "cs_math_cuda.cuh"
56#endif
57
58/*=============================================================================
59 * Macro definitions
60 *============================================================================*/
61
62#if defined(SYCL_LANGUAGE_VERSION)
63
64#define CS_DISPATCH_SUM_DOUBLE auto
65
66#else
67
68#define CS_DISPATCH_SUM_DOUBLE double
69
70#endif
71
72/*============================================================================
73 * Type definitions
74 *============================================================================*/
75
79typedef enum {
80
81 CS_DISPATCH_SUM_SIMPLE,
83 CS_DISPATCH_SUM_ATOMIC
85} cs_dispatch_sum_type_t;
86
93template <class Derived>
94class cs_dispatch_context_mixin {
95public:
96
97 // Loop over n elements
98 // Must be redefined by the child class
99 template <class F, class... Args>
100 decltype(auto)
101 parallel_for(cs_lnum_t n, F&& f, Args&&... args) = delete;
102
103 // Assembly loop over all internal faces
104 template <class M, class F, class... Args>
105 decltype(auto)
106 parallel_for_i_faces(const M* m,
107 F&& f,
108 Args&&... args);
109
110 // Assembly loop over all boundary faces
111 template <class M, class F, class... Args>
112 decltype(auto)
113 parallel_for_b_faces(const M* m,
114 F&& f,
115 Args&&... args);
116
117 // Parallel reduction with simple sum.
118 // Must be redefined by the child class
119 template <class F, class... Args>
120 decltype(auto)
121 parallel_for_reduce_sum
122 (cs_lnum_t n, double& sum, F&& f, Args&&... args) = delete;
123
124 // Query sum type for assembly loop over all interior faces
125 // Must be redefined by the child class
126 template <class M>
127 bool
128 try_get_parallel_for_i_faces_sum_type(const M* m,
129 cs_dispatch_sum_type_t& st);
130
131 // Query sum type for assembly loop over all boundary faces
132 // Must be redefined by the child class
133 template <class M>
134 bool
135 try_get_parallel_for_b_faces_sum_type(const M* m,
136 cs_dispatch_sum_type_t& st);
137
138};
139
140// Default implementation of parallel_for_i_faces based on parallel_for
141template <class Derived>
142template <class M, class F, class... Args>
143decltype(auto) cs_dispatch_context_mixin<Derived>::parallel_for_i_faces
144 (const M* m, F&& f, Args&&... args) {
145 return static_cast<Derived*>(this)->parallel_for
146 (m->n_i_faces,
147 static_cast<F&&>(f),
148 static_cast<Args&&>(args)...);
149}
150
151// Default implementation of parallel_for_b_faces based on parallel_for
152template <class Derived>
153template <class M, class F, class... Args>
154decltype(auto) cs_dispatch_context_mixin<Derived>::parallel_for_b_faces
155 (const M* m, F&& f, Args&&... args) {
156 return static_cast<Derived*>(this)->parallel_for
157 (m->n_b_faces,
158 static_cast<F&&>(f),
159 static_cast<Args&&>(args)...);
160}
161
162// Default implementation of get interior faces sum type
163template <class Derived>
164template <class M>
165bool cs_dispatch_context_mixin<Derived>::try_get_parallel_for_i_faces_sum_type
166 ([[maybe_unused]]const M* m,
167 cs_dispatch_sum_type_t& st) {
168 st = CS_DISPATCH_SUM_SIMPLE;
169 return true;
170}
171
172// Default implementation of get boundary faces sum type
173template <class Derived>
174template <class M>
175bool cs_dispatch_context_mixin<Derived>::try_get_parallel_for_b_faces_sum_type
176 ([[maybe_unused]]const M* m,
177 cs_dispatch_sum_type_t& st) {
178 st = CS_DISPATCH_SUM_SIMPLE;
179 return true;
180}
181
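/* Editor's sketch (not part of the library; the class name is hypothetical):
   a derived context only has to provide parallel_for and, if reductions are
   needed, parallel_for_reduce_sum; the face loops and sum-type queries then
   fall back to the mixin defaults above.

   class cs_serial_context : public cs_dispatch_context_mixin<cs_serial_context> {
   public:
     template <class F, class... Args>
     bool
     parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
       for (cs_lnum_t i = 0; i < n; i++)
         f(i, args...);
       return true;
     }

     template <class F, class... Args>
     bool
     parallel_for_reduce_sum(cs_lnum_t n, double& sum, F&& f, Args&&... args) {
       sum = 0;
       for (cs_lnum_t i = 0; i < n; i++)
         f(i, sum, args...);
       return true;
     }
   };
*/
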
182/*
183 * cs_context to execute loops with OpenMP on the CPU
184 */
185
186class cs_host_context : public cs_dispatch_context_mixin<cs_host_context> {
187
188private:
189
190 cs_lnum_t n_min_per_thread;
191 int n_threads_;
193public:
194
195 cs_host_context()
196 : n_min_per_thread(CS_THR_MIN), n_threads_(-1)
197 {}
198
199public:
200
202 void
203 set_n_min_per_cpu_thread(cs_lnum_t n) {
204 this->n_min_per_thread = n;
205 }
206
208 cs_lnum_t
209 n_min_per_cpu_thread(void) {
210 return this->n_min_per_thread;
211 }
212
214 void
215 set_n_cpu_threads(int n) {
216 this->n_threads_ = n;
217 }
218
220 int
221 n_cpu_threads(void) {
222 return this->n_threads_;
223 }
224
226 template <class F, class... Args>
227 bool
228 parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
229 int n_t = n_threads_;
230 if (n_t < 0) {
231 n_t = cs_glob_n_threads;
232 int n_t_l = n / n_min_per_thread;
233 if (n_t_l < n_t)
234 n_t = n_t_l;
235 if (n_t < 1)
236 n_t = 1;
237 }
238
239# pragma omp parallel for num_threads(n_t)
240 for (cs_lnum_t i = 0; i < n; ++i) {
241 f(i, args...);
242 }
243 return true;
244 }
245
248 template <class M, class F, class... Args>
249 bool
250 parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
251 const int n_i_groups = m->i_face_numbering->n_groups;
252 const int n_i_threads = m->i_face_numbering->n_threads;
253 const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index;
254 for (int g_id = 0; g_id < n_i_groups; g_id++) {
255 #pragma omp parallel for
256 for (int t_id = 0; t_id < n_i_threads; t_id++) {
257 for (cs_lnum_t f_id = i_group_index[(t_id * n_i_groups + g_id) * 2];
258 f_id < i_group_index[(t_id * n_i_groups + g_id) * 2 + 1];
259 f_id++) {
260 f(f_id, args...);
261 }
262 }
263 }
264 return true;
265 }
266
269 template <class M, class F, class... Args>
270 bool
271 parallel_for_b_faces(const M* m, F&& f, Args&&... args) {
272 const int n_b_threads = m->b_face_numbering->n_threads;
273 const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index;
274
275 #pragma omp parallel for
276 for (int t_id = 0; t_id < n_b_threads; t_id++) {
277 for (cs_lnum_t f_id = b_group_index[t_id*2];
278 f_id < b_group_index[t_id*2 + 1];
279 f_id++) {
280 f(f_id, args...);
281 }
282 }
283 return true;
284 }
285
287 template <class F, class... Args>
288 bool
289 parallel_for_reduce_sum(cs_lnum_t n,
290 double& sum,
291 F&& f,
292 Args&&... args) {
293 int n_t = n_threads_;
294 if (n_t < 0) {
295 n_t = cs_glob_n_threads;
296 int n_t_l = n / n_min_per_thread;
297 if (n_t_l < n_t)
298 n_t = n_t_l;
299 if (n_t < 1)
300 n_t = 1;
301 }
302
303 sum = 0;
304
305# pragma omp parallel for reduction(+:sum) num_threads(n_t)
306 for (cs_lnum_t i = 0; i < n; ++i) {
307 f(i, sum, args...);
308 }
309 return true;
310 }
311
312 // Get interior faces sum type associated with this context
313 template <class M>
314 bool
315 try_get_parallel_for_i_faces_sum_type([[maybe_unused]]const M* m,
316 cs_dispatch_sum_type_t& st) {
317 st = CS_DISPATCH_SUM_SIMPLE;
318 return true;
319 }
320
321 // Get boundary faces sum type associated with this context
322 template <class M>
323 bool
324 try_get_parallel_for_b_faces_sum_type([[maybe_unused]]const M* m,
325 cs_dispatch_sum_type_t& st) {
326 st = CS_DISPATCH_SUM_SIMPLE;
327 return true;
328 }
329
330};
331
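/* Editor's illustration (hypothetical usage; the helper name and array are
   assumed, not taken from the library): scaling an array with the host
   context. With n_threads_ left at -1, the thread count is derived from
   cs_glob_n_threads and n_min_per_thread, so small loops use fewer threads.

   static void
   example_scale(cs_lnum_t n, double *x)
   {
     cs_host_context h_ctx;
     h_ctx.set_n_min_per_cpu_thread(128);     // assumed threshold value
     h_ctx.parallel_for(n, [=](cs_lnum_t i) {
       x[i] *= 2.;
     });
   }
*/
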
332#if defined(__NVCC__)
333
334/* Default kernel that loops over an integer range and calls a device functor.
335 This kernel uses a grid_size-stride loop and thus guarantees that all
336 integers are processed, even if the grid is smaller.
337 All arguments *must* be passed by value to avoid passing CPU references
338 to the GPU. */
339
340template <class F, class... Args>
341__global__ void cs_cuda_kernel_parallel_for(cs_lnum_t n, F f, Args... args) {
342 // grid_size-stride loop
343 for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
344 id += blockDim.x * gridDim.x) {
345 f(id, args...);
346 }
347}
348
349/* Default kernel for a sum reduction: loops over an integer range, calls a
350 device functor which accumulates into a per-thread value, then reduces the
351 block's values into b_res. A grid_size-stride loop guarantees that all
352 integers are processed, even if the grid is smaller. All arguments *must*
353 be passed by value to avoid passing CPU references to the GPU. */
354
355template <class F, class... Args>
356__global__ void
357cs_cuda_kernel_parallel_for_reduce_sum(cs_lnum_t n,
358 double *b_res,
359 F f,
360 Args... args) {
361 // grid_size-stride loop
362 extern double __shared__ stmp[];
363 const cs_lnum_t tid = threadIdx.x;
364
365 stmp[tid] = 0;
366
367 for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
368 id += blockDim.x * gridDim.x) {
369 f(id, stmp[tid], args...);
370 }
371
372 switch (blockDim.x) {
373 case 1024:
374 cs_blas_cuda_block_reduce_sum<1024, 1>(stmp, tid, b_res);
375 break;
376 case 512:
377 cs_blas_cuda_block_reduce_sum<512, 1>(stmp, tid, b_res);
378 break;
379 case 256:
380 cs_blas_cuda_block_reduce_sum<256, 1>(stmp, tid, b_res);
381 break;
382 case 128:
383 cs_blas_cuda_block_reduce_sum<128, 1>(stmp, tid, b_res);
384 break;
385 default:
386 assert(0);
387 }
388}
389
394class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {
395
396private:
397
398 long grid_size_;
401 long block_size_;
402 cudaStream_t stream_;
403 int device_;
405 bool use_gpu_;
407public:
408
410
411 cs_device_context(void)
412 : grid_size_(0), block_size_(256), stream_(cs_cuda_get_stream(0)),
413 device_(0), use_gpu_(true)
414 {
415 device_ = cs_glob_cuda_device_id;
416 }
417
418 cs_device_context(long grid_size,
419 long block_size,
420 cudaStream_t stream,
421 int device)
422 : grid_size_(grid_size), block_size_(block_size), stream_(stream),
423 device_(device), use_gpu_(true)
424 {}
425
426 cs_device_context(long grid_size,
427 long block_size,
428 cudaStream_t stream)
429 : grid_size_(grid_size), block_size_(block_size), stream_(stream),
430 device_(0), use_gpu_(true)
431 {
432 device_ = cs_base_cuda_get_device();
433 }
434
435 cs_device_context(long grid_size,
436 long block_size)
437 : grid_size_(grid_size), block_size_(block_size),
438 stream_(cs_cuda_get_stream(0)), device_(0), use_gpu_(true)
439 {
440 device_ = cs_base_cuda_get_device();
441 }
442
443 cs_device_context(cudaStream_t stream)
444 : grid_size_(0), block_size_(256), stream_(stream), device_(0),
445 use_gpu_(true)
446 {
447 device_ = cs_base_cuda_get_device();
448 }
449
451 //
452 // \param[in] grid_size CUDA grid size, or -1 for automatic choice
453 // \param[in] block_size CUDA block size (power of 2 if reduction is used)
454
455 void
456 set_cuda_grid(long grid_size,
457 long block_size) {
458 this->grid_size_ = (grid_size > 0) ? grid_size : -1;
459 this->block_size_ = block_size;
460 }
461
463
464 void
465 set_cuda_stream(cudaStream_t stream) {
466 this->stream_ = stream;
467 }
468
470
471 void
472 set_cuda_stream(int stream_id) {
473 this->stream_ = cs_cuda_get_stream(stream_id);
474 }
475
477
478 cudaStream_t
479 cuda_stream(void) {
480 return this->stream_;
481 }
482
484
485 void
486 set_cuda_device(int device) {
487 this->device_ = device;
488 }
489
491
492 void
493 set_use_gpu(bool use_gpu) {
494 this->use_gpu_ = use_gpu;
495 }
496
498
499 bool
500 use_gpu(void) {
501 return (device_ >= 0 && use_gpu_);
502 }
503
505
506 cs_alloc_mode_t
507 alloc_mode(void) {
508 cs_alloc_mode_t amode
509 = (device_ >= 0 && use_gpu_) ? CS_ALLOC_DEVICE : CS_ALLOC_HOST;
510 return (amode);
511 }
512
513 cs_alloc_mode_t
514 alloc_mode(bool readable_on_cpu) {
515 cs_alloc_mode_t amode = CS_ALLOC_HOST;
516 if (device_ >= 0 && use_gpu_) {
517 if (readable_on_cpu)
518 amode = CS_ALLOC_HOST_DEVICE_SHARED;
519 else
520 amode = CS_ALLOC_DEVICE;
521 }
522 return (amode);
523 }
524
525public:
526
528 template <class F, class... Args>
529 bool
530 parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
531 if (device_ < 0 || use_gpu_ == false) {
532 return false;
533 }
534
535 long l_grid_size = grid_size_;
536 if (l_grid_size < 1) {
537 l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
538 }
539
540 if (n > 0)
541 cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
542 (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);
543
544 return true;
545 }
546
548 template <class M, class F, class... Args>
549 bool
550 parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
551 const cs_lnum_t n = m->n_i_faces;
552 if (device_ < 0 || use_gpu_ == false) {
553 return false;
554 }
555
556 long l_grid_size = grid_size_;
557 if (l_grid_size < 1) {
558 l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
559 }
560
561 if (n > 0)
562 cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
563 (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);
564
565 return true;
566 }
567
570 template <class F, class... Args>
571 bool
572 parallel_for_reduce_sum(cs_lnum_t n,
573 double& sum,
574 F&& f,
575 Args&&... args) {
576 sum = 0;
577 if (device_ < 0 || use_gpu_ == false) {
578 return false;
579 }
580
581 long l_grid_size = grid_size_;
582 if (l_grid_size < 1) {
583 l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
584 }
585 if (n == 0) {
586 sum = 0;
587 return true;
588 }
589
590 double *r_grid_, *r_reduce_;
591 cs_blas_cuda_get_2_stage_reduce_buffers
592 (n, 1, l_grid_size, r_grid_, r_reduce_);
593
594 int smem_size = block_size_ * sizeof(double);
595 cs_cuda_kernel_parallel_for_reduce_sum
596 <<<l_grid_size, block_size_, smem_size, stream_>>>
597 (n, r_grid_, static_cast<F&&>(f), static_cast<Args&&>(args)...);
598
599 switch (block_size_) {
600 case 1024:
601 cs_blas_cuda_reduce_single_block<1024, 1>
602 <<<1, block_size_, 0, stream_>>>
603 (l_grid_size, r_grid_, r_reduce_);
604 break;
605 case 512:
606 cs_blas_cuda_reduce_single_block<512, 1>
607 <<<1, block_size_, 0, stream_>>>
608 (l_grid_size, r_grid_, r_reduce_);
609 break;
610 case 256:
611 cs_blas_cuda_reduce_single_block<256, 1>
612 <<<1, block_size_, 0, stream_>>>
613 (l_grid_size, r_grid_, r_reduce_);
614 break;
615 case 128:
616 cs_blas_cuda_reduce_single_block<128, 1>
617 <<<1, block_size_, 0, stream_>>>
618 (l_grid_size, r_grid_, r_reduce_);
619 break;
620 default:
621 cs_assert(0);
622 }
623
624 CS_CUDA_CHECK(cudaStreamSynchronize(stream_));
625 CS_CUDA_CHECK(cudaGetLastError());
626 sum = r_reduce_[0];
627
628 return true;
629 }
630
632 void
633 wait(void) {
634 if (device_ > -1 && use_gpu_) {
635 CS_CUDA_CHECK(cudaStreamSynchronize(stream_));
636 CS_CUDA_CHECK(cudaGetLastError());
637 }
638 }
639
640 // Get interior faces sum type associated with this context
641 template <class M>
642 bool
643 try_get_parallel_for_i_faces_sum_type(const M *m,
644 cs_dispatch_sum_type_t &st) {
645 if (device_ < 0 || use_gpu_ == false) {
646 return false;
647 }
648
649 st = CS_DISPATCH_SUM_ATOMIC;
650 return true;
651 }
652
653 // Get boundary faces sum type associated with this context
654 template <class M>
655 bool
656 try_get_parallel_for_b_faces_sum_type(const M *m,
657 cs_dispatch_sum_type_t &st) {
658 if (device_ < 0 || use_gpu_ == false) {
659 return false;
660 }
661
662 st = CS_DISPATCH_SUM_ATOMIC;
663 return true;
664 }
665
666};
667
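/* Editor's illustration (hypothetical usage; x, y, a, n and use_gpu are
   assumed variables): configuring the CUDA context. parallel_for returns
   false when no usable device is present or the GPU is disabled, so a caller
   can fall back to a host context; the cs_dispatch_context defined further
   below chains contexts in exactly that way.

   cs_device_context d_ctx(cs_cuda_get_stream(0));
   d_ctx.set_cuda_grid(-1, 256);     // automatic grid size, block size 256
   d_ctx.set_use_gpu(use_gpu);       // runtime on/off switch

   bool launched = d_ctx.parallel_for(n, [=] __device__ (cs_lnum_t i) {
     y[i] += a*x[i];
   });

   if (launched)
     d_ctx.wait();   // synchronize the stream before using y on the host
*/
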
668#elif defined(SYCL_LANGUAGE_VERSION)
669
671#if !defined(CS_GLOB_SYCL_QUEUE_IS_DEFINED)
672extern sycl::queue cs_glob_sycl_queue;
673#define CS_GLOB_SYCL_QUEUE_IS_DEFINED 1
674#endif
675
680class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {
681
682private:
683
684 sycl::queue &queue_;
685 bool is_gpu;
687 bool use_gpu_;
689public:
690
692
693 cs_device_context(void)
694 : queue_(cs_glob_sycl_queue), is_gpu(false), use_gpu_(true)
695 {
696 is_gpu = queue_.get_device().is_gpu();
697 }
698
700
701 void
702 set_use_gpu(bool use_gpu) {
703 this->use_gpu_ = use_gpu;
704 }
705
707
708 bool
709 use_gpu(void) {
710 return (is_gpu && use_gpu_);
711 }
712
714
715 cs_alloc_mode_t
716 alloc_mode(void) {
717 cs_alloc_mode_t amode
718 = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
719 return (amode);
720 }
721
722 cs_alloc_mode_t
723 alloc_mode([[maybe_unused]] bool readable_on_cpu) {
724 cs_alloc_mode_t amode
725 = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
726 return (amode);
727 }
728
729public:
730
732 template <class F, class... Args>
733 bool
734 parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
735 if (is_gpu == false || use_gpu_ == false) {
736 return false;
737 }
738
739 queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);
740
741 return true;
742 }
743
745 template <class M, class F, class... Args>
746 bool
747 parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
748 const cs_lnum_t n = m->n_i_faces;
749 if (is_gpu == false || use_gpu_ == false) {
750 return false;
751 }
752
753 queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);
754
755 return true;
756 }
757
759 template <class F, class... Args>
760 bool
761 parallel_for_reduce_sum(cs_lnum_t n,
762 double& sum_,
763 F&& f,
764 Args&&... args) {
765 sum_ = 0;
766 if (is_gpu == false || use_gpu_ == false) {
767 return false;
768 }
769
770 // TODO: use persistent allocation as we do in CUDA BLAS to avoid
771 // excess allocation/deallocation.
772 double *sum_ptr = (double *)sycl::malloc_shared(sizeof(double), queue_);
773
774 queue_.parallel_for(n,
775 sycl::reduction(sum_ptr, 0., sycl::plus<double>()),
776 static_cast<F&&>(f),
777 static_cast<Args&&>(args)...).wait();
778
779 sum_ = sum_ptr[0];
780
781 sycl::free((void *)sum_ptr, queue_);
782
783 return true;
784 }
785
787 void
788 wait(void) {
789 if (is_gpu && use_gpu_)
790 queue_.wait();
791 }
792
793 // Get interior faces sum type associated with this context
794 template <class M>
795 bool
796 try_get_parallel_for_i_faces_sum_type(const M *m,
797 cs_dispatch_sum_type_t &st) {
798 if (is_gpu == false || use_gpu_ == false) {
799 return false;
800 }
801
802 st = CS_DISPATCH_SUM_ATOMIC;
803 return true;
804 }
805
806 // Get boundary faces sum type associated with this context
807 template <class M>
808 bool
809 try_get_parallel_for_b_faces_sum_type(const M *m,
810 cs_dispatch_sum_type_t &st) {
811 if (is_gpu == false || use_gpu_ == false) {
812 return false;
813 }
814
815 st = CS_DISPATCH_SUM_ATOMIC;
816 return true;
817 }
818
819};
820
821#elif defined(HAVE_OPENMP_TARGET)
822
827class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {
828
829private:
830
831 bool is_gpu;
833 bool use_gpu_;
835public:
836
838
839 cs_device_context(void)
840 : is_gpu(false), use_gpu_(true)
841 {
842 // This should be improved for any actual use of this approach
843 // beyond basic testing
844 is_gpu = (omp_get_num_devices() > 1) ? true : false;
845 }
846
848
849 void
850 set_use_gpu(bool use_gpu) {
851 this->use_gpu_ = use_gpu;
852 }
853
855
856 bool
857 use_gpu(void) {
858 return (is_gpu && use_gpu_);
859 }
860
862
863 cs_alloc_mode_t
864 alloc_mode(void) {
865 cs_alloc_mode_t amode
866 = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
867 return (amode);
868 }
869
870 cs_alloc_mode_t
871 alloc_mode([[maybe_unused]] bool readable_on_cpu) {
872 cs_alloc_mode_t amode
873 = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
874 return (amode);
875 }
876
877public:
878
880 template <class F, class... Args>
881 bool
882 parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
883 if (is_gpu == false || use_gpu_ == false) {
884 return false;
885 }
886
888# pragma omp target teams distribute parallel for
889 for (cs_lnum_t i = 0; i < n; ++i) {
890 f(i, args...);
891 }
892
893 return true;
894 }
895
897 template <class F, class... Args>
898 bool
899 parallel_for_reduce_sum(cs_lnum_t n,
900 double& sum,
901 F&& f,
902 Args&&... args) {
903 sum = 0;
904 if (is_gpu == false || use_gpu_ == false) {
905 return false;
906 }
907
909# pragma omp target teams distribute parallel for reduction(+:sum)
910 for (cs_lnum_t i = 0; i < n; ++i) {
911 f(i, sum, args...);
912 }
913
914 return true;
915 }
916
918 void
919 wait(void) {
920 return;
921 }
922
923 // Get interior faces sum type associated with this context
924 template <class M>
925 bool
926 try_get_parallel_for_i_faces_sum_type(const M *m,
927 cs_dispatch_sum_type_t &st) {
928 if (is_gpu == false || use_gpu_ == false) {
929 return false;
930 }
931
932 st = CS_DISPATCH_SUM_ATOMIC;
933 return true;
934 }
935
936 // Get boundary faces sum type associated with this context
937 template <class M>
938 bool
939 try_get_parallel_for_b_faces_sum_type(const M *m,
940 cs_dispatch_sum_type_t &st) {
941 if (is_gpu == false || use_gpu_ == false) {
942 return false;
943 }
944
945 st = CS_DISPATCH_SUM_ATOMIC;
946 return true;
947 }
948
949};
950
951#endif // __NVCC__ or SYCL or defined(HAVE_OPENMP_TARGET)
952
957class cs_void_context : public cs_dispatch_context_mixin<cs_void_context> {
958
959public:
960
962
963 cs_void_context(void)
964 {}
965
966#if !defined(__NVCC__)
967
968 /* Fill-in for CUDA methods, so as to allow using these methods
969 in final cs_dispatch_context even when CUDA is not available,
970 and without requiring a static cast of the form
971
972 static_cast<cs_device_context&>(ctx).set_use_gpu(true);
973 */
974
975 void
976 set_cuda_grid([[maybe_unused]] long grid_size,
977 [[maybe_unused]] long block_size) {
978 }
979
980 void
981 set_cuda_stream([[maybe_unused]] int stream_id) {
982 }
983
984 void
985 set_cuda_device([[maybe_unused]] int device_id) {
986 }
987
988#endif // __NVCC__
989
990#if !defined(__NVCC__) \
991 && !defined(SYCL_LANGUAGE_VERSION) \
992 && !defined(HAVE_OPENMP_TARGET)
993
994 /* Fill-in for device methods */
995
996 void
997 set_use_gpu([[maybe_unused]] bool use_gpu) {
998 }
999
1001
1002 bool
1003 use_gpu(void) {
1004 return false;
1005 }
1006
1008
1009 cs_alloc_mode_t
1010 alloc_mode(void) {
1011 return CS_ALLOC_HOST;
1012 }
1013
1014 cs_alloc_mode_t
1015 alloc_mode([[maybe_unused]] bool readable_on_cpu) {
1016 return CS_ALLOC_HOST;
1017 }
1018
1019 void
1020 wait(void) {
1021 }
1022
1023#endif // ! __NVCC__ && ! SYCL_LANGUAGE_VERSION && ! defined(HAVE_OPENMP_TARGET)
1024
1025public:
1026
1027 // Abort execution if no execution method is available.
1028 template <class F, class... Args>
1029 bool parallel_for([[maybe_unused]] cs_lnum_t n,
1030 [[maybe_unused]] F&& f,
1031 [[maybe_unused]] Args&&... args) {
1032 cs_assert(0);
1033 return false;
1034 }
1035
1036 // Abort execution if no execution method is available.
1037 template <class F, class... Args>
1038 bool parallel_for_reduce_sum([[maybe_unused]] cs_lnum_t n,
1039 [[maybe_unused]] double& sum,
1040 [[maybe_unused]] F&& f,
1041 [[maybe_unused]] Args&&... args) {
1042 cs_assert(0);
1043 return false;
1044 }
1045
1046};
1047
1053template <class... Contexts>
1054class cs_combined_context
1055 : public cs_dispatch_context_mixin<cs_combined_context<Contexts...>>,
1056 public Contexts... {
1057
1058private:
1059 using mixin_t = cs_dispatch_context_mixin<cs_combined_context<Contexts...>>;
1060
1061public:
1062 cs_combined_context() = default;
1063 cs_combined_context(Contexts... contexts)
1064 : Contexts(std::move(contexts))...
1065 {}
1066
1067public:
1068
1069 template <class M, class F, class... Args>
1070 auto parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
1071 bool launched = false;
1072 [[maybe_unused]] decltype(nullptr) try_execute[] = {
1073 ( launched = launched
1074 || Contexts::parallel_for_i_faces(m, f, args...), nullptr)...
1075 };
1076 }
1077
1078 template <class M, class F, class... Args>
1079 auto parallel_for_b_faces(const M* m, F&& f, Args&&... args) {
1080 bool launched = false;
1081 [[maybe_unused]] decltype(nullptr) try_execute[] = {
1082 ( launched = launched
1083 || Contexts::parallel_for_b_faces(m, f, args...), nullptr)...
1084 };
1085 }
1086
1087 template <class F, class... Args>
1088 auto parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
1089 bool launched = false;
1090 [[maybe_unused]] decltype(nullptr) try_execute[] = {
1091 ( launched = launched
1092 || Contexts::parallel_for(n, f, args...), nullptr)...
1093 };
1094 }
1095
1096 template <class F, class... Args>
1097 auto parallel_for_reduce_sum
1098 (cs_lnum_t n, double& sum, F&& f, Args&&... args) {
1099 bool launched = false;
1100 [[maybe_unused]] decltype(nullptr) try_execute[] = {
1101 ( launched = launched
1102 || Contexts::parallel_for_reduce_sum(n, sum, f, args...),
1103 nullptr)...
1104 };
1105 }
1106
1107 template <class M>
1108 cs_dispatch_sum_type_t
1109 get_parallel_for_i_faces_sum_type(const M* m) {
1110 cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
1111 bool known = false;
1112 [[maybe_unused]] decltype(nullptr) try_query[] = {
1113 ( known = known
1114 || Contexts::try_get_parallel_for_i_faces_sum_type(m, sum_type),
1115 nullptr)...
1116 };
1117 return sum_type;
1118 }
1119
1120 template <class M>
1121 cs_dispatch_sum_type_t
1122 get_parallel_for_b_faces_sum_type(const M* m) {
1123 cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
1124 bool known = false;
1125 [[maybe_unused]] decltype(nullptr) try_query[] = {
1126 ( known = known
1127 || Contexts::try_get_parallel_for_b_faces_sum_type(m, sum_type),
1128 nullptr)...
1129 };
1130 return sum_type;
1131 }
1132
1133};
1134
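/* Editor's note (behaviour inferred from the pack expansions above): each
   parallel_for* method tries the combined Contexts in the order they are
   listed and, thanks to the short-circuit on `launched`, stops at the first
   one that reports success. For example, assuming a CUDA build:

   cs_combined_context<cs_device_context, cs_host_context, cs_void_context> cc;
   cc.parallel_for(n, f);   // GPU if usable, else OpenMP on the host,
                            // else cs_void_context aborts
*/
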
1135/*----------------------------------------------------------------------------*/
1140/*----------------------------------------------------------------------------*/
1141
1142class cs_dispatch_context : public cs_combined_context<
1143#if defined(__NVCC__) \
1144 || defined(SYCL_LANGUAGE_VERSION) \
1145 || defined(HAVE_OPENMP_TARGET)
1146 cs_device_context,
1147#endif
1148 cs_host_context,
1149 cs_void_context
1150>
1151{
1152
1153private:
1154 using base_t = cs_combined_context<
1155#if defined(__NVCC__) \
1156 || defined(SYCL_LANGUAGE_VERSION) \
1157 || defined(HAVE_OPENMP_TARGET)
1158 cs_device_context,
1159#endif
1160 cs_host_context,
1161 cs_void_context
1162>;
1163
1164public:
1165 using base_t::base_t;
1166 using base_t::operator=;
1167
1168};
1169
1170/*
1171 Remarks:
1172
1173 Instantiation can simply be done using:
1174
1175 `cs_dispatch_context ctx;`
1176
1177 Instantiation can also be done with specific construction options,
1178 for example:
1179
1180 `cs_dispatch_context ctx(cs_device_context(stream), {});`
1181
1182 or:
1183
1184 `cs_dispatch_context ctx(cs_device_context(), {});`
1185
1186*/
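
/* Editor's illustration (hypothetical usage; n_cells, x, y and a are assumed
   variables, and CS_F_HOST_DEVICE is assumed to be the project's host/device
   function qualifier macro): a loop and a reduction written once and
   dispatched to the GPU or to host OpenMP depending on the build and on
   runtime settings.

   cs_dispatch_context ctx;

   // axpy-style update of y
   ctx.parallel_for(n_cells, [=] CS_F_HOST_DEVICE (cs_lnum_t c_id) {
     y[c_id] += a*x[c_id];
   });

   // global sum of y; CS_DISPATCH_SUM_DOUBLE adapts the accumulator type
   // to the active backend (see the macro definition above)
   double s = 0;
   ctx.parallel_for_reduce_sum
     (n_cells, s, [=] CS_F_HOST_DEVICE
      (cs_lnum_t c_id, CS_DISPATCH_SUM_DOUBLE &sum) {
     sum += y[c_id];
   });

   ctx.wait();   // make sure device work is finished before reading s or y
*/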
1187
1188/*=============================================================================
1189 * Global variable definitions
1190 *============================================================================*/
1191
1192/*=============================================================================
1193 * Public function prototypes
1194 *============================================================================*/
1195
1196/*----------------------------------------------------------------------------*/
1209/*----------------------------------------------------------------------------*/
1210
1211#ifdef __CUDA_ARCH__ // Test whether we are on GPU or CPU...
1212
1213template <typename T>
1214__device__ static void __forceinline__
1215cs_dispatch_sum(T *dest,
1216 const T src,
1217 cs_dispatch_sum_type_t sum_type)
1218{
1219 if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1220#if 1
1221 using sum_v = assembled_value<T>;
1222 sum_v v;
1223
1224 v.get() = src;
1225 sum_v::ref(*dest).conflict_free_add(-1u, v);
1226#else
1227 atomicAdd(dest, src);
1228#endif
1229 }
1230 else if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1231 *dest += src;
1232 }
1233}
1234
1235#elif defined(SYCL_LANGUAGE_VERSION)
1236
1237template <typename T>
1238inline void
1239cs_dispatch_sum(T *dest,
1240 const T src,
1241 cs_dispatch_sum_type_t sum_type)
1242{
1243 if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1244 *dest += src;
1245 }
1246 else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1247 sycl::atomic_ref<T,
1248 sycl::memory_order::relaxed,
1249 sycl::memory_scope::device> aref(*dest);
1250 aref.fetch_add(src);
1251 }
1252}
1253
1254#else // ! CUDA or SYCL
1255
1256template <typename T>
1257inline void
1258cs_dispatch_sum(T *dest,
1259 const T src,
1260 cs_dispatch_sum_type_t sum_type)
1261{
1262 if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1263 *dest += src;
1264 }
1265 else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1266 #pragma omp atomic
1267 *dest += src;
1268 }
1269}
1270
1271#endif // __CUDA_ARCH__
1272
1273/*----------------------------------------------------------------------------*/
1287/*----------------------------------------------------------------------------*/
1288
1289#ifdef __CUDA_ARCH__ // Test whether we are on GPU or CPU...
1290
1291template <size_t dim, typename T>
1292__device__ static void __forceinline__
1293cs_dispatch_sum(T *dest,
1294 const T *src,
1295 cs_dispatch_sum_type_t sum_type)
1296{
1297 if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1298 for (cs_lnum_t i = 0; i < dim; i++) {
1299 dest[i] += src[i];
1300 }
1301 }
1302 else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1303#if __CUDA_ARCH__ >= 700
1304 using sum_v = assembled_value<T, dim>;
1305 sum_v v;
1306
1307 for (size_t i = 0; i < dim; i++) {
1308 v[i].get() = src[i];
1309 }
1310
1311 sum_v &vs = reinterpret_cast<sum_v &>(*dest);
1312 vs.conflict_free_add(-1u, v);
1313
1314 //sum_v::ref(dest).conflict_free_add(-1u, v);
1315#else
1316 for (size_t i = 0; i < dim; i++) {
1317 atomicAdd(&dest[i], src[i]);
1318 }
1319#endif
1320 }
1321}
1322
1323#elif defined(SYCL_LANGUAGE_VERSION)
1324
1325template <size_t dim, typename T>
1326inline void
1327cs_dispatch_sum(T *dest,
1328 const T *src,
1329 cs_dispatch_sum_type_t sum_type)
1330{
1331 if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1332 for (size_t i = 0; i < dim; i++) {
1333 dest[i] += src[i];
1334 }
1335 }
1336 else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1337 for (size_t i = 0; i < dim; i++) {
1338 sycl::atomic_ref<T,
1339 sycl::memory_order::relaxed,
1340 sycl::memory_scope::device> aref(dest[i]);
1341 aref.fetch_add(src[i]);
1342 }
1343 }
1344}
1345
1346#else // ! CUDA or SYCL
1347
1348template <size_t dim, typename T>
1349inline void
1350cs_dispatch_sum(T *dest,
1351 const T *src,
1352 cs_dispatch_sum_type_t sum_type)
1353{
1354 if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1355 for (size_t i = 0; i < dim; i++) {
1356 dest[i] += src[i];
1357 }
1358 }
1359 else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1360 for (size_t i = 0; i < dim; i++) {
1361 #pragma omp atomic
1362 dest[i] += src[i];
1363 }
1364 }
1365}
1366
1367#endif // __CUDA_ARCH__
1368
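/* Editor's illustration (hypothetical usage; rhs and i_face_val are assumed
   arrays, and the mesh members n_i_faces and i_face_cells follow the usual
   code_saturne mesh structure): gathering face contributions into cells,
   with the sum mode matched to the context actually running the loop.

   cs_dispatch_context ctx;
   cs_dispatch_sum_type_t i_sum_type
     = ctx.get_parallel_for_i_faces_sum_type(m);

   ctx.parallel_for_i_faces(m, [=] CS_F_HOST_DEVICE (cs_lnum_t f_id) {
     cs_lnum_t c_id0 = i_face_cells[f_id][0];
     cs_lnum_t c_id1 = i_face_cells[f_id][1];
     double v = i_face_val[f_id];

     cs_dispatch_sum(&rhs[c_id0],  v, i_sum_type);
     cs_dispatch_sum(&rhs[c_id1], -v, i_sum_type);
   });
   ctx.wait();

   On the host, the face numbering groups faces so that threads do not write
   to the same cells, and CS_DISPATCH_SUM_SIMPLE is reported; device contexts
   run fully parallel loops and report CS_DISPATCH_SUM_ATOMIC instead.
*/
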
1369#endif /* __cplusplus */
1370
1371/*----------------------------------------------------------------------------*/
1372
1373#endif /* __CS_DISPATCH_H__ */