cs_dispatch.h
#ifndef __CS_DISPATCH_H__
#define __CS_DISPATCH_H__

/*============================================================================
 * Class to dispatch computation using various runtimes (OpenMP, CUDA, ...)
 *============================================================================*/

/*
  This file is part of code_saturne, a general-purpose CFD tool.

  Copyright (C) 1998-2024 EDF S.A.

  This program is free software; you can redistribute it and/or modify it under
  the terms of the GNU General Public License as published by the Free Software
  Foundation; either version 2 of the License, or (at your option) any later
  version.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  details.

  You should have received a copy of the GNU General Public License along with
  this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
  Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

// Valid only for C++

#ifdef __cplusplus

/*----------------------------------------------------------------------------*/

#include "cs_defs.h"

/*----------------------------------------------------------------------------
 * Standard C++ library headers
 *----------------------------------------------------------------------------*/

#include <utility>

#if defined(SYCL_LANGUAGE_VERSION)
#include <sycl/sycl.hpp>
#endif

/*----------------------------------------------------------------------------
 * Local headers
 *----------------------------------------------------------------------------*/

#include "cs_assert.h"
#include "cs_mesh.h"

#ifdef __NVCC__
#include "cs_base_cuda.h"
#include "cs_blas_cuda.h"
#include "cs_alge_cuda.cuh"
#endif

/*=============================================================================
 * Macro definitions
 *============================================================================*/

#if defined(SYCL_LANGUAGE_VERSION)

#define CS_DISPATCH_SUM_DOUBLE auto

#else

#define CS_DISPATCH_SUM_DOUBLE double

#endif

/*============================================================================
 * Type definitions
 *============================================================================*/

typedef enum {

  CS_DISPATCH_SUM_SIMPLE,  /*!< Simple sum; no concurrent writes expected */
  CS_DISPATCH_SUM_ATOMIC   /*!< Atomic sum; concurrent writes possible */

} cs_dispatch_sum_type_t;

/*!
 * Dispatch context mixin (CRTP): provides default implementations of some
 * methods for the derived context class.
 */

template <class Derived>
class cs_dispatch_context_mixin {
public:

  // Loop over n elements
  // Must be redefined by the child class
  template <class F, class... Args>
  decltype(auto)
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) = delete;

  // Assembly loop over all interior faces
  template <class F, class... Args>
  decltype(auto)
  parallel_for_i_faces(const cs_mesh_t* m,
                       F&& f,
                       Args&&... args);

  // Assembly loop over all boundary faces
  template <class F, class... Args>
  decltype(auto)
  parallel_for_b_faces(const cs_mesh_t* m,
                       F&& f,
                       Args&&... args);

  // Parallel reduction with simple sum.
  // Must be redefined by the child class
  template <class F, class... Args>
  decltype(auto)
  parallel_for_reduce_sum
    (cs_lnum_t n, double& sum, F&& f, Args&&... args) = delete;

  // Query sum type for assembly loop over all interior faces
  // May be redefined by the child class (a default is provided below)
  bool
  try_get_parallel_for_i_faces_sum_type(const cs_mesh_t* m,
                                        cs_dispatch_sum_type_t& st);

  // Query sum type for assembly loop over all boundary faces
  // May be redefined by the child class (a default is provided below)
  bool
  try_get_parallel_for_b_faces_sum_type(const cs_mesh_t* m,
                                        cs_dispatch_sum_type_t& st);

};

// Default implementation of parallel_for_i_faces based on parallel_for
template <class Derived>
template <class F, class... Args>
decltype(auto) cs_dispatch_context_mixin<Derived>::parallel_for_i_faces
  (const cs_mesh_t* m, F&& f, Args&&... args) {
  return static_cast<Derived*>(this)->parallel_for
           (m->n_i_faces,
            static_cast<F&&>(f),
            static_cast<Args&&>(args)...);
}

// Default implementation of parallel_for_b_faces based on parallel_for
template <class Derived>
template <class F, class... Args>
decltype(auto) cs_dispatch_context_mixin<Derived>::parallel_for_b_faces
  (const cs_mesh_t* m, F&& f, Args&&... args) {
  return static_cast<Derived*>(this)->parallel_for
           (m->n_b_faces,
            static_cast<F&&>(f),
            static_cast<Args&&>(args)...);
}

// Default implementation of get interior faces sum type
template <class Derived>
bool cs_dispatch_context_mixin<Derived>::try_get_parallel_for_i_faces_sum_type
  ([[maybe_unused]] const cs_mesh_t* m,
   cs_dispatch_sum_type_t& st) {
  st = CS_DISPATCH_SUM_SIMPLE;
  return true;
}

// Default implementation of get boundary faces sum type
template <class Derived>
bool cs_dispatch_context_mixin<Derived>::try_get_parallel_for_b_faces_sum_type
  ([[maybe_unused]] const cs_mesh_t* m,
   cs_dispatch_sum_type_t& st) {
  st = CS_DISPATCH_SUM_SIMPLE;
  return true;
}

/*!
 * Context to execute loops with OpenMP on the CPU.
 */

class cs_host_context : public cs_dispatch_context_mixin<cs_host_context> {

private:

  cs_lnum_t  n_min_for_threads;  //!< Minimum number of elements for which
                                 //!< OpenMP threading is used

public:

  cs_host_context()
    : n_min_for_threads(CS_THR_MIN)
  {}

public:

  //! Set the minimum number of elements for use of CPU threads
  void
  set_n_min_for_cpu_threads(cs_lnum_t n) {
    this->n_min_for_threads = n;
  }

  //! Get the minimum number of elements for use of CPU threads
  cs_lnum_t
  n_min_for_cpu_threads(void) {
    return this->n_min_for_threads;
  }

  //! Loop over n elements, threaded with OpenMP if n is large enough
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
#   pragma omp parallel for if (n >= n_min_for_threads)
    for (cs_lnum_t i = 0; i < n; ++i) {
      f(i, args...);
    }
    return true;
  }

  //! Assembly loop over all interior faces, using the face numbering's
  //! groups and threads to avoid write conflicts between threads
  template <class F, class... Args>
  bool
  parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    const int n_i_groups = m->i_face_numbering->n_groups;
    const int n_i_threads = m->i_face_numbering->n_threads;
    const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index;
    for (int g_id = 0; g_id < n_i_groups; g_id++) {
      #pragma omp parallel for
      for (int t_id = 0; t_id < n_i_threads; t_id++) {
        for (cs_lnum_t f_id = i_group_index[(t_id * n_i_groups + g_id) * 2];
             f_id < i_group_index[(t_id * n_i_groups + g_id) * 2 + 1];
             f_id++) {
          f(f_id, args...);
        }
      }
    }
    return true;
  }

  //! Assembly loop over all boundary faces, using the face numbering's
  //! threads to avoid write conflicts between threads
  template <class F, class... Args>
  bool
  parallel_for_b_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    const int n_b_threads = m->b_face_numbering->n_threads;
    const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index;

    #pragma omp parallel for
    for (int t_id = 0; t_id < n_b_threads; t_id++) {
      for (cs_lnum_t f_id = b_group_index[t_id*2];
           f_id < b_group_index[t_id*2 + 1];
           f_id++) {
        f(f_id, args...);
      }
    }
    return true;
  }

  //! Loop over n elements with an OpenMP sum reduction
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n,
                          double& sum,
                          F&& f,
                          Args&&... args) {
    sum = 0;
#   pragma omp parallel for reduction(+:sum) if (n >= n_min_for_threads)
    for (cs_lnum_t i = 0; i < n; ++i) {
      f(i, sum, args...);
    }
    return true;
  }

  // Get interior faces sum type associated with this context
  bool
  try_get_parallel_for_i_faces_sum_type([[maybe_unused]] const cs_mesh_t* m,
                                        cs_dispatch_sum_type_t& st) {
    st = CS_DISPATCH_SUM_SIMPLE;
    return true;
  }

  // Get boundary faces sum type associated with this context
  bool
  try_get_parallel_for_b_faces_sum_type([[maybe_unused]] const cs_mesh_t* m,
                                        cs_dispatch_sum_type_t& st) {
    st = CS_DISPATCH_SUM_SIMPLE;
    return true;
  }

};

#if defined(__NVCC__)

/* Default kernel that loops over an integer range and calls a device functor.
   This kernel uses a grid_size-stride loop and thus guarantees that all
   integers are processed, even if the grid is smaller.
   All arguments *must* be passed by value to avoid passing CPU references
   to the GPU. */

template <class F, class... Args>
__global__ void cs_cuda_kernel_parallel_for(cs_lnum_t n, F f, Args... args) {
  // grid_size-stride loop
  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    f(id, args...);
  }
}

/* Default kernel for a loop over an integer range with a sum reduction:
   each block accumulates partial sums in shared memory, then reduces them
   into b_res. It uses a grid_size-stride loop and thus guarantees that all
   integers are processed, even if the grid is smaller.
   All arguments *must* be passed by value to avoid passing CPU references
   to the GPU. */

template <class F, class... Args>
__global__ void
cs_cuda_kernel_parallel_for_reduce_sum(cs_lnum_t n,
                                       double *b_res,
                                       F f,
                                       Args... args) {
  // grid_size-stride loop
  extern double __shared__ stmp[];
  const cs_lnum_t tid = threadIdx.x;

  stmp[tid] = 0;

  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    f(id, stmp[tid], args...);
  }

  switch (blockDim.x) {
  case 1024:
    cs_blas_cuda_block_reduce_sum<1024, 1>(stmp, tid, b_res);
    break;
  case 512:
    cs_blas_cuda_block_reduce_sum<512, 1>(stmp, tid, b_res);
    break;
  case 256:
    cs_blas_cuda_block_reduce_sum<256, 1>(stmp, tid, b_res);
    break;
  case 128:
    cs_blas_cuda_block_reduce_sum<128, 1>(stmp, tid, b_res);
    break;
  default:
    assert(0);
  }
}

/*!
 * Context to execute loops with CUDA on the device.
 */

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:

  long          grid_size_;   //!< Associated grid size; if < 1, a grid size
                              //!< is computed from the number of elements
  long          block_size_;  //!< Associated block size
  cudaStream_t  stream_;      //!< Associated CUDA stream
  int           device_;      //!< Associated CUDA device id
  bool          use_gpu_;     //!< Run on GPU ?

public:

  //! Constructors

  cs_device_context(void)
    : grid_size_(0), block_size_(256), stream_(cs_cuda_get_stream(0)),
      device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  cs_device_context(long          grid_size,
                    long          block_size,
                    cudaStream_t  stream,
                    int           device)
    : grid_size_(grid_size), block_size_(block_size), stream_(stream),
      device_(device), use_gpu_(true)
  {}

  cs_device_context(long          grid_size,
                    long          block_size,
                    cudaStream_t  stream)
    : grid_size_(grid_size), block_size_(block_size), stream_(stream),
      device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  cs_device_context(long  grid_size,
                    long  block_size)
    : grid_size_(grid_size), block_size_(block_size),
      stream_(cs_cuda_get_stream(0)), device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  cs_device_context(cudaStream_t  stream)
    : grid_size_(0), block_size_(256), stream_(stream), device_(0),
      use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  //! Set CUDA grid and block sizes

  void
  set_cuda_grid(long  grid_size,
                long  block_size) {
    this->grid_size_ = grid_size;
    this->block_size_ = block_size;
  }

  //! Set associated CUDA stream

  void
  set_cuda_stream(cudaStream_t  stream) {
    this->stream_ = stream;
  }

  //! Set associated CUDA stream from stream id

  void
  set_cuda_stream(int  stream_id) {
    this->stream_ = cs_cuda_get_stream(stream_id);
  }

  //! Get associated CUDA stream

  cudaStream_t
  cuda_stream(void) {
    return this->stream_;
  }

  //! Set associated CUDA device id

  void
  set_cuda_device(int  device) {
    this->device_ = device;
  }

  //! Enable or disable execution on the GPU

  void
  set_use_gpu(bool  use_gpu) {
    this->use_gpu_ = use_gpu;
  }

  //! Check whether this context runs on the GPU

  bool
  use_gpu(void) {
    return (device_ >= 0 && use_gpu_);
  }

  //! Return the preferred allocation mode for this context

  cs_alloc_mode_t
  alloc_mode(void) {
    cs_alloc_mode_t amode
      = (device_ >= 0 && use_gpu_) ? CS_ALLOC_DEVICE : CS_ALLOC_HOST;
    return (amode);
  }

  cs_alloc_mode_t
  alloc_mode(bool readable_on_cpu) {
    cs_alloc_mode_t amode = CS_ALLOC_HOST;
    if (device_ >= 0 && use_gpu_) {
      if (readable_on_cpu)
        amode = CS_ALLOC_HOST_DEVICE_SHARED;
      else
        amode = CS_ALLOC_DEVICE;
    }
    return (amode);
  }

public:

  //! Launch a loop over n elements on the device
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
      (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }

  //! Launch an assembly loop over all interior faces on the device
  template <class F, class... Args>
  bool
  parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    const cs_lnum_t n = m->n_i_faces;
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
      (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }

  //! Launch a loop over n elements on the device, with a sum reduction
  //! using a 2-stage block reduction
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n,
                          double& sum,
                          F&& f,
                          Args&&... args) {
    sum = 0;
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    double *r_grid_, *r_reduce_;
    cs_blas_cuda_get_2_stage_reduce_buffers
      (n, 1, l_grid_size, r_grid_, r_reduce_);

    int smem_size = block_size_ * sizeof(double);
    cs_cuda_kernel_parallel_for_reduce_sum
      <<<l_grid_size, block_size_, smem_size, stream_>>>
      (n, r_grid_, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    switch (block_size_) {
    case 1024:
      cs_blas_cuda_reduce_single_block<1024, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 512:
      cs_blas_cuda_reduce_single_block<512, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 256:
      cs_blas_cuda_reduce_single_block<256, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    case 128:
      cs_blas_cuda_reduce_single_block<128, 1>
        <<<1, block_size_, 0, stream_>>>
        (l_grid_size, r_grid_, r_reduce_);
      break;
    default:
      cs_assert(0);
    }

    cudaStreamSynchronize(stream_);
    sum = r_reduce_[0];

    return true;
  }

  //! Synchronize the associated stream
  void
  wait(void) {
    if (device_ > -1 && use_gpu_)
      cudaStreamSynchronize(stream_);
  }

  // Get interior faces sum type associated with this context
  bool
  try_get_parallel_for_i_faces_sum_type(const cs_mesh_t *m,
                                        cs_dispatch_sum_type_t &st) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

  // Get boundary faces sum type associated with this context
  bool
  try_get_parallel_for_b_faces_sum_type(const cs_mesh_t *m,
                                        cs_dispatch_sum_type_t &st) {
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};

#elif defined(SYCL_LANGUAGE_VERSION)

/*!
 * Context to execute loops with SYCL on the device.
 */

class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {

private:

  sycl::queue  &queue_;    //!< Associated SYCL queue
  bool          is_gpu;    //!< Is the associated device a GPU ?
  bool          use_gpu_;  //!< Run on GPU ?

public:

  //! Constructor

  cs_device_context(void)
    : queue_(cs_glob_sycl_queue), is_gpu(false), use_gpu_(true)
  {
    is_gpu = queue_.get_device().is_gpu();
  }

  //! Enable or disable execution on the GPU

  void
  set_use_gpu(bool  use_gpu) {
    this->use_gpu_ = use_gpu;
  }

  //! Check whether this context runs on the GPU

  bool
  use_gpu(void) {
    return (is_gpu && use_gpu_);
  }

  //! Return the preferred allocation mode for this context

  cs_alloc_mode_t
  alloc_mode(void) {
    cs_alloc_mode_t amode
      = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
    return (amode);
  }

  cs_alloc_mode_t
  alloc_mode([[maybe_unused]] bool readable_on_cpu) {
    cs_alloc_mode_t amode
      = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
    return (amode);
  }

public:

  //! Launch a loop over n elements on the device
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }

  //! Launch an assembly loop over all interior faces on the device
  template <class F, class... Args>
  bool
  parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    const cs_lnum_t n = m->n_i_faces;
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }

  //! Launch a loop over n elements on the device, with a sum reduction
  template <class F, class... Args>
  bool
  parallel_for_reduce_sum(cs_lnum_t n,
                          double& sum_,
                          F&& f,
                          Args&&... args) {
    sum_ = 0;
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    // TODO: use persistent allocation as we do in CUDA BLAS to avoid
    // excess allocation/deallocation.
    double *sum_ptr = (double *)sycl::malloc_shared(sizeof(double), queue_);

    queue_.parallel_for(n,
                        sycl::reduction(sum_ptr, 0., sycl::plus<double>()),
                        static_cast<F&&>(f),
                        static_cast<Args&&>(args)...).wait();

    sum_ = sum_ptr[0];

    sycl::free((void *)sum_ptr, queue_);

    return true;
  }

  //! Synchronize the associated queue
  void
  wait(void) {
    if (is_gpu && use_gpu_)
      queue_.wait();
  }

  // Get interior faces sum type associated with this context
  bool
  try_get_parallel_for_i_faces_sum_type(const cs_mesh_t *m,
                                        cs_dispatch_sum_type_t &st) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

  // Get boundary faces sum type associated with this context
  bool
  try_get_parallel_for_b_faces_sum_type(const cs_mesh_t *m,
                                        cs_dispatch_sum_type_t &st) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    st = CS_DISPATCH_SUM_ATOMIC;
    return true;
  }

};

#endif // __NVCC__ or SYCL

/*!
 * Void context: provides no-op fill-ins for device-related methods and
 * aborts if no other context has handled a dispatched loop.
 */

class cs_void_context : public cs_dispatch_context_mixin<cs_void_context> {

public:

  //! Constructor

  cs_void_context(void)
  {}

#if !defined(__NVCC__)

  /* Fill-in for CUDA methods, so as to allow using these methods
     in the final cs_dispatch_context even when CUDA is not available,
     and without requiring a static cast of the form

     static_cast<cs_device_context&>(ctx).set_use_gpu(true);
  */
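
  /* Illustrative sketch (not part of the original header), showing the
     effect of these fill-ins; the variable name ctx is arbitrary.
     With them, calls such as

       cs_dispatch_context ctx;
       ctx.set_cuda_stream(1);  // no-op in builds without CUDA
       ctx.set_use_gpu(true);   // no-op when no device runtime is available

     compile in any build, with no static_cast and no conditional
     compilation on the caller's side. */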

  void
  set_cuda_grid([[maybe_unused]] long  grid_size,
                [[maybe_unused]] long  block_size) {
  }

  void
  set_cuda_stream([[maybe_unused]] int  stream_id) {
  }

  void
  set_cuda_device([[maybe_unused]] int  device_id) {
  }

#endif // __NVCC__

#if !defined(__NVCC__) && !defined(SYCL_LANGUAGE_VERSION)

  /* Fill-in for device methods */

  void
  set_use_gpu([[maybe_unused]] bool  use_gpu) {
  }

  //! Check whether this context runs on the GPU (always false here)

  bool
  use_gpu(void) {
    return false;
  }

  //! Return the (host-only) allocation mode for this context

  cs_alloc_mode_t
  alloc_mode(void) {
    return CS_ALLOC_HOST;
  }

  cs_alloc_mode_t
  alloc_mode([[maybe_unused]] bool  readable_on_cpu) {
    return CS_ALLOC_HOST;
  }

  void
  wait(void) {
  }

#endif // ! __NVCC__ && ! SYCL_LANGUAGE_VERSION

public:

  // Abort execution if no execution method is available.
  template <class F, class... Args>
  bool parallel_for([[maybe_unused]] cs_lnum_t n,
                    [[maybe_unused]] F&& f,
                    [[maybe_unused]] Args&&... args) {
    cs_assert(0);
    return false;
  }

  // Abort execution if no execution method is available.
  template <class F, class... Args>
  bool parallel_for_reduce_sum([[maybe_unused]] cs_lnum_t n,
                               [[maybe_unused]] double& sum,
                               [[maybe_unused]] F&& f,
                               [[maybe_unused]] Args&&... args) {
    cs_assert(0);
    return false;
  }

};

/*!
 * Combined context: each call is tried on the combined contexts, in order,
 * until one of them reports that it has handled the operation.
 */

template <class... Contexts>
class cs_combined_context
  : public cs_dispatch_context_mixin<cs_combined_context<Contexts...>>,
    public Contexts... {

private:
  using mixin_t = cs_dispatch_context_mixin<cs_combined_context<Contexts...>>;

public:
  cs_combined_context() = default;
  cs_combined_context(Contexts... contexts)
    : Contexts(std::move(contexts))...
  {}

public:

  template <class F, class... Args>
  auto parallel_for_i_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      (  launched = launched
      || Contexts::parallel_for_i_faces(m, f, args...), nullptr)...
    };
  }

  template <class F, class... Args>
  auto parallel_for_b_faces(const cs_mesh_t* m, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      (  launched = launched
      || Contexts::parallel_for_b_faces(m, f, args...), nullptr)...
    };
  }

  template <class F, class... Args>
  auto parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      (  launched = launched
      || Contexts::parallel_for(n, f, args...), nullptr)...
    };
  }

  template <class F, class... Args>
  auto parallel_for_reduce_sum
    (cs_lnum_t n, double& sum, F&& f, Args&&... args) {
    bool launched = false;
    [[maybe_unused]] decltype(nullptr) try_execute[] = {
      (  launched = launched
      || Contexts::parallel_for_reduce_sum(n, sum, f, args...),
         nullptr)...
    };
  }

  cs_dispatch_sum_type_t
  get_parallel_for_i_faces_sum_type(const cs_mesh_t* m) {
    cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
    bool known = false;
    [[maybe_unused]] decltype(nullptr) try_query[] = {
      (  known = known
      || Contexts::try_get_parallel_for_i_faces_sum_type(m, sum_type),
         nullptr)...
    };
    return sum_type;
  }

  cs_dispatch_sum_type_t
  get_parallel_for_b_faces_sum_type(const cs_mesh_t* m) {
    cs_dispatch_sum_type_t sum_type = CS_DISPATCH_SUM_ATOMIC;
    bool known = false;
    [[maybe_unused]] decltype(nullptr) try_query[] = {
      (  known = known
      || Contexts::try_get_parallel_for_b_faces_sum_type(m, sum_type),
         nullptr)...
    };
    return sum_type;
  }

};

/*----------------------------------------------------------------------------*/
/*!
 * Default dispatch context: combines the device context (when CUDA or SYCL
 * is available), the OpenMP host context, and the void fallback context.
 */
/*----------------------------------------------------------------------------*/

class cs_dispatch_context : public cs_combined_context<
#if defined(__NVCC__) || defined(SYCL_LANGUAGE_VERSION)
  cs_device_context,
#endif
  cs_host_context,
  cs_void_context
>
{

private:
  using base_t = cs_combined_context<
#if defined(__NVCC__) || defined(SYCL_LANGUAGE_VERSION)
  cs_device_context,
#endif
  cs_host_context,
  cs_void_context
>;

public:
  using base_t::base_t;
  using base_t::operator=;

};

/*
  Remarks:

  Instantiation can simply be done using:

  `cs_dispatch_context ctx;`

  Instantiation can also be done with specific construction options,
  for example:

  `cs_dispatch_context ctx(cs_device_context(stream), {});`

  or:

  `cs_dispatch_context ctx(cs_device_context(), {});`

*/
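
/*
  Usage sketch (illustrative, not part of the original header): dispatching
  a cell loop and a sum reduction through a context. The arrays a and b and
  the count n_cells are hypothetical, and in a CUDA build the lambdas would
  also need a host/device qualifier.

    cs_dispatch_context ctx;

    // Runs on the device when one is available, otherwise as an OpenMP loop.
    ctx.parallel_for(n_cells, [=] (cs_lnum_t c_id) {
      a[c_id] = 2. * b[c_id];
    });

    // Sum reduction; CS_DISPATCH_SUM_DOUBLE (defined above) expands to
    // double or auto depending on the backend.
    double s = 0.;
    ctx.parallel_for_reduce_sum
      (n_cells, s,
       [=] (cs_lnum_t c_id, CS_DISPATCH_SUM_DOUBLE &sum) {
         sum += a[c_id];
       });

    ctx.wait();  // ensure device kernels have finished before reading a on
                 // the host
*/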

/*=============================================================================
 * Global variable definitions
 *============================================================================*/

/*=============================================================================
 * Public function prototypes
 *============================================================================*/

/*----------------------------------------------------------------------------*/
/*!
 * \brief  Add a scalar value to a sum, using the given dispatch sum type
 *         (simple or atomic).
 */
/*----------------------------------------------------------------------------*/

#ifdef __CUDA_ARCH__   // Test whether we are on GPU or CPU...

template <typename T>
__device__ static void __forceinline__
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
#if 1
    using sum_v = assembled_value<T>;
    sum_v v;

    v.get() = src;
    sum_v::ref(*dest).conflict_free_add(-1u, v);
#else
    atomicAdd(dest, src);
#endif
  }
  else if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
}

#elif defined(SYCL_LANGUAGE_VERSION)

template <typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    sycl::atomic_ref<T,
                     sycl::memory_order::relaxed,
                     sycl::memory_scope::device> aref(*dest);
    aref.fetch_add(src);
  }
}

#else  // ! CUDA or SYCL

template <typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                  src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    *dest += src;
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    #pragma omp atomic
    *dest += src;
  }
}

#endif // __CUDA_ARCH__

/*----------------------------------------------------------------------------*/
/*!
 * \brief  Add an array of dim values to a sum, using the given dispatch sum
 *         type (simple or atomic).
 */
/*----------------------------------------------------------------------------*/

#ifdef __CUDA_ARCH__   // Test whether we are on GPU or CPU...

template <size_t dim, typename T>
__device__ static void __forceinline__
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (cs_lnum_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
#if __CUDA_ARCH__ >= 700
    using sum_v = assembled_value<T, dim>;
    sum_v v;

    for (size_t i = 0; i < dim; i++) {
      v[i].get() = src[i];
    }

    sum_v &vs = reinterpret_cast<sum_v &>(*dest);
    vs.conflict_free_add(-1u, v);

    //sum_v::ref(dest).conflict_free_add(-1u, v);
#else
    for (size_t i = 0; i < dim; i++) {
      atomicAdd(&dest[i], src[i]);
    }
#endif
  }
}

#elif defined(SYCL_LANGUAGE_VERSION)

template <size_t dim, typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    for (size_t i = 0; i < dim; i++) {
      sycl::atomic_ref<T,
                       sycl::memory_order::relaxed,
                       sycl::memory_scope::device> aref(dest[i]);
      aref.fetch_add(src[i]);
    }
  }
}

#else  // ! CUDA or SYCL

template <size_t dim, typename T>
inline void
cs_dispatch_sum(T                       *dest,
                const T                 *src,
                cs_dispatch_sum_type_t   sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    for (size_t i = 0; i < dim; i++) {
      #pragma omp atomic
      dest[i] += src[i];
    }
  }
}

#endif // __CUDA_ARCH__
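
/*
  Usage sketch (illustrative, not part of the original header): assembling
  interior face contributions into cell values, with the sum type adapted to
  the active context. The mesh pointer m and the arrays i_face_cells, xa and
  rhs are hypothetical; in a CUDA build the lambda would also need a
  host/device qualifier.

    cs_dispatch_context ctx;
    cs_dispatch_sum_type_t sum_type
      = ctx.get_parallel_for_i_faces_sum_type(m);

    ctx.parallel_for_i_faces(m, [=] (cs_lnum_t f_id) {
      cs_lnum_t c_id0 = i_face_cells[f_id][0];
      cs_lnum_t c_id1 = i_face_cells[f_id][1];
      cs_dispatch_sum(&rhs[c_id0],  xa[f_id], sum_type);
      cs_dispatch_sum(&rhs[c_id1], -xa[f_id], sum_type);
    });

  On the host context, the face numbering avoids write conflicts between
  threads, so a simple sum is reported; on a device context, atomic sums
  are requested instead.
*/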

#endif /* __cplusplus */

/*----------------------------------------------------------------------------*/

#endif /* __CS_DISPATCH_H__ */