8.0
general documentation
cs_blas_cuda.h
Go to the documentation of this file.
1 #ifndef __CS_BLAS_CUDA_H__
2 #define __CS_BLAS_CUDA_H__
3 
4 /*============================================================================
5  * BLAS (Basic Linear Algebra Subroutine) functions
6  *============================================================================*/
7 
8 /*
9  This file is part of code_saturne, a general-purpose CFD tool.
10 
11  Copyright (C) 1998-2023 EDF S.A.
12 
13  This program is free software; you can redistribute it and/or modify it under
14  the terms of the GNU General Public License as published by the Free Software
15  Foundation; either version 2 of the License, or (at your option) any later
16  version.
17 
18  This program is distributed in the hope that it will be useful, but WITHOUT
19  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
21  details.
22 
23  You should have received a copy of the GNU General Public License along with
24  this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
25  Street, Fifth Floor, Boston, MA 02110-1301, USA.
26 */
27 
28 /*----------------------------------------------------------------------------*/
29 
30 #include "cs_defs.h"
31 
32 /*----------------------------------------------------------------------------
33  * External library headers
34  *----------------------------------------------------------------------------*/
35 
36 /*----------------------------------------------------------------------------
37  * Local headers
38  *----------------------------------------------------------------------------*/
39 
40 #include "cs_base.h"
41 #include "cs_base_cuda.h"
42 
43 /*----------------------------------------------------------------------------*/
44 
46 
47 /*============================================================================
48  * Macro definitions
49  *============================================================================*/
50 
51 /*============================================================================
52  * Type definitions
53  *============================================================================*/
54 
56 
57 /*============================================================================
58  * Templated function definitions
59  *============================================================================*/
60 
61 #if defined(__CUDACC__)
62 
63 /*----------------------------------------------------------------------------*/
74 /*----------------------------------------------------------------------------*/
75 
/*----------------------------------------------------------------------------*/
/*!
 * \brief Final warp-level stage of a shared-memory sum reduction.
 *
 * Reduces the first 64*stride entries of \p stmp so that, on return,
 * stmp[0..stride-1] holds the warp's total. The caller must ensure that
 * only the first warp enters (tid < 32) and that a block-level barrier
 * (__syncthreads) precedes the call so higher-tid partial sums are visible.
 *
 * Since Volta (independent thread scheduling), implicit warp-synchronous
 * execution may no longer be assumed: a __syncwarp() is required between
 * dependent steps so each lane observes the previous step's stores
 * (the volatile qualifier alone does not order execution across lanes).
 * The blockSize guards are compile-time constants, so all 32 lanes take
 * the same path and reach every barrier.
 *
 * \tparam  blockSize  number of threads per block (power of 2)
 * \tparam  stride     number of interleaved components per element
 * \tparam  T          data type
 *
 * \param[in, out]  stmp  shared-memory work array (blockSize*stride values)
 * \param[in]       tid   thread id inside the block (expected < 32 here)
 */
/*----------------------------------------------------------------------------*/

template <size_t blockSize, size_t stride, typename T>
__device__ static void __forceinline__
cs_blas_cuda_warp_reduce_sum(volatile T  *stmp,
                             size_t       tid)
{
  if (stride == 1) {

    if (blockSize >= 64) {stmp[tid] += stmp[tid + 32]; __syncwarp();}
    if (blockSize >= 32) {stmp[tid] += stmp[tid + 16]; __syncwarp();}
    if (blockSize >= 16) {stmp[tid] += stmp[tid +  8]; __syncwarp();}
    if (blockSize >=  8) {stmp[tid] += stmp[tid +  4]; __syncwarp();}
    if (blockSize >=  4) {stmp[tid] += stmp[tid +  2]; __syncwarp();}
    if (blockSize >=  2) {stmp[tid] += stmp[tid +  1]; __syncwarp();}

  }
  else {

    if (blockSize >= 64) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 32)*stride + i];
      __syncwarp();
    }
    if (blockSize >= 32) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 16)*stride + i];
      __syncwarp();
    }
    if (blockSize >= 16) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 8)*stride + i];
      __syncwarp();
    }
    if (blockSize >= 8) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 4)*stride + i];
      __syncwarp();
    }
    if (blockSize >= 4) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 2)*stride + i];
      __syncwarp();
    }
    if (blockSize >= 2) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 1)*stride + i];
      __syncwarp();
    }
  }
}
125 
126 /*----------------------------------------------------------------------------*/
144 /*----------------------------------------------------------------------------*/
145 
/*----------------------------------------------------------------------------*/
/*!
 * \brief Block-level sum reduction of a shared-memory work array, writing
 *        one partial result (of stride components) per block.
 *
 * A tree reduction, explicitly unrolled over compile-time blockSize guards,
 * halves the active thread count down to one warp, which is finished by
 * cs_blas_cuda_warp_reduce_sum. The inner stride loop collapses to a single
 * assignment when stride == 1, so the same code covers scalar and
 * interleaved vector data.
 *
 * \tparam  blockSize  number of threads per block (power of 2)
 * \tparam  stride     number of interleaved components per element
 * \tparam  T          data type
 *
 * \param[in, out]  stmp       shared-memory work array
 *                             (blockSize*stride values)
 * \param[in]       tid        thread id inside the block
 * \param[out]      sum_block  per-block partial results
 *                             (written at blockIdx.x*stride by thread 0)
 */
/*----------------------------------------------------------------------------*/

template <size_t blockSize, size_t stride, typename T>
__device__ static void __forceinline__
cs_blas_cuda_block_reduce_sum(T       *stmp,
                              size_t   tid,
                              T       *sum_block)
{
  /* Ensure all per-thread contributions to stmp are visible. */
  __syncthreads();

  if (blockSize >= 1024) {
    if (tid < 512) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 512)*stride + i];
    }
    __syncthreads();
  }
  if (blockSize >= 512) {
    if (tid < 256) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 256)*stride + i];
    }
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 128)*stride + i];
    }
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 64)*stride + i];
    }
    __syncthreads();
  }

  /* Last 32 partial sums: handled within a single warp. */

  if (tid < 32)
    cs_blas_cuda_warp_reduce_sum<blockSize, stride>(stmp, tid);

  /* Output: partial result for this block. */

  if (tid == 0) {
    #pragma unroll
    for (size_t i = 0; i < stride; i++)
      sum_block[blockIdx.x*stride + i] = stmp[i];
  }
}
237 
238 /*----------------------------------------------------------------------------*/
248 /*----------------------------------------------------------------------------*/
249 
/*----------------------------------------------------------------------------*/
/*!
 * \brief Kernel reducing an array of partial sums to a single result.
 *
 * Intended for a single-block launch (only blockIdx.x == 0 contributes to
 * g_odata): each thread first serially accumulates a strided range of
 * g_idata into registers, then a shared-memory tree reduction combines the
 * per-thread sums, finished by cs_blas_cuda_warp_reduce_sum.
 *
 * Fix relative to previous version: in the vector (stride > 1) branch the
 * shared-memory store, __syncthreads() and tree reduction were nested
 * inside the accumulation loop. Threads execute differing iteration counts
 * when n is not a multiple of blockSize (and threads with
 * threadIdx.x >= n never enter the loop), making that __syncthreads()
 * divergent — undefined behavior — besides repeating the reduction
 * needlessly. The shared-memory stage now runs once, after accumulation,
 * matching the scalar path; both cases share the unified code below
 * (the stride loop degenerates to one assignment when stride == 1).
 *
 * \tparam  blockSize  number of threads per block (power of 2)
 * \tparam  stride     number of interleaved components per element
 * \tparam  T          data type
 *
 * \param[in]   n        number of (stride-component) elements to reduce
 * \param[in]   g_idata  input values (device memory, n*stride values)
 * \param[out]  g_odata  result (device memory, stride values)
 */
/*----------------------------------------------------------------------------*/

template <size_t blockSize, size_t stride, typename T>
__global__ static void
cs_blas_cuda_reduce_single_block(size_t   n,
                                 T       *g_idata,
                                 T       *g_odata)
{
  __shared__ T sdata[blockSize * stride];

  size_t tid = threadIdx.x;
  T r_s[stride];

  #pragma unroll
  for (size_t k = 0; k < stride; k++) {
    r_s[k] = 0;
    sdata[tid*stride + k] = 0.;  /* so idle threads contribute a zero term */
  }

  /* Serial per-thread accumulation in registers (block-stride loop). */

  for (size_t i = threadIdx.x; i < n; i += blockSize) {
    #pragma unroll
    for (size_t k = 0; k < stride; k++)
      r_s[k] += g_idata[i*stride + k];
  }

  #pragma unroll
  for (size_t k = 0; k < stride; k++)
    sdata[tid*stride + k] = r_s[k];
  __syncthreads();

  /* Shared-memory tree reduction down to one warp. */

  for (size_t j = blockSize/2; j > CS_CUDA_WARP_SIZE; j /= 2) {
    if (tid < j) {
      #pragma unroll
      for (size_t k = 0; k < stride; k++)
        sdata[tid*stride + k] += sdata[(tid + j)*stride + k];
    }
    __syncthreads();
  }

  if (tid < 32)
    cs_blas_cuda_warp_reduce_sum<blockSize, stride>(sdata, tid);

  if (tid == 0) {
    #pragma unroll
    for (size_t k = 0; k < stride; k++)
      g_odata[k] = sdata[k];
  }
}
322 
323 #endif /* defined(__CUDACC__) */
324 
326 
327 /*============================================================================
328  * Public function prototypes
329  *============================================================================*/
330 
331 /*----------------------------------------------------------------------------*/
337 /*----------------------------------------------------------------------------*/
338 
/* Finalize CUDA BLAS API (restored signature: the function name line was
   lost in extraction; name taken from this file's identifier index). */

void
cs_blas_cuda_finalize(void);
342 #if defined(__CUDACC__)
343 
344 /*----------------------------------------------------------------------------*/
355 /*----------------------------------------------------------------------------*/
356 
357 void
358 cs_blas_cuda_set_stream(cudaStream_t stream);
359 
360 /*----------------------------------------------------------------------------*/
377 /*----------------------------------------------------------------------------*/
378 
379 void
380 cs_blas_cuda_get_2_stage_reduce_buffers(cs_lnum_t n,
381  cs_lnum_t tuple_size,
382  unsigned int grid_size,
383  double* &r_grid,
384  double* &r_reduce);
385 
386 #endif /* defined(__CUDACC__) */
387 
388 /*----------------------------------------------------------------------------*/
397 /*----------------------------------------------------------------------------*/
398 
/* Return the absolute sum of vector values, using CUDA (restored signature:
   the function name line was lost in extraction; taken from this file's
   identifier index). */

double
cs_blas_cuda_asum(cs_lnum_t        n,
                  const cs_real_t  x[]);
402 
403 /*----------------------------------------------------------------------------*/
413 /*----------------------------------------------------------------------------*/
414 
/* Return the dot product of 2 vectors: x.y, using CUDA (restored signature:
   the function name line was lost in extraction; taken from this file's
   identifier index). */

double
cs_blas_cuda_dot(cs_lnum_t        n,
                 const cs_real_t  x[],
                 const cs_real_t  y[]);
419 
420 #if defined(HAVE_CUBLAS)
421 
422 /*----------------------------------------------------------------------------*/
431 /*----------------------------------------------------------------------------*/
432 
433 double
434 cs_blas_cublas_asum(cs_lnum_t n,
435  const cs_real_t x[]);
436 
437 /*----------------------------------------------------------------------------*/
447 /*----------------------------------------------------------------------------*/
448 
449 double
450 cs_blas_cublas_dot(cs_lnum_t n,
451  const cs_real_t x[],
452  const cs_real_t y[]);
453 
454 #endif /* defined(HAVE_CUBLAS) */
455 
456 /*----------------------------------------------------------------------------
457  * Compute y <- alpha.x + y
458  *
459  * This function may be set to use either cuBLAS or a local kernel.
460  *
461  * parameters:
462  * n <-- number of elements
463  * alpha <-- constant value (on device)
464  * x <-- vector of elements (on device)
465  * y <-> vector of elements (on device)
466  *----------------------------------------------------------------------------*/
467 
/* Compute y <- alpha.x + y, on device (restored signature: the function
   name line was lost in extraction; taken from this file's identifier
   index). */

void
cs_blas_cuda_axpy(cs_lnum_t                  n,
                  const cs_real_t           *alpha,
                  const cs_real_t *restrict  x,
                  cs_real_t *restrict        y);
473 
474 /*----------------------------------------------------------------------------
475  * Compute x <- alpha.x
476  *
477  * This function may be set to use either cuBLAS or a local kernel.
478  *
479  * parameters:
480  * n <-- number of elements
481  * alpha <-- constant value (on device)
482  * x <-> vector of elements (on device)
483  *----------------------------------------------------------------------------*/
484 
/* Compute x <- alpha.x, on device (restored signature: the function name
   line was lost in extraction; taken from this file's identifier index). */

void
cs_blas_cuda_scal(cs_lnum_t           n,
                  const cs_real_t    *alpha,
                  cs_real_t *restrict x);
489 
490 /*----------------------------------------------------------------------------*/
491 
493 
494 #endif /* __CS_BLAS_CUDA_H__ */
#define CS_CUDA_WARP_SIZE
Definition: cs_base_cuda.h:68
void cs_blas_cuda_scal(cs_lnum_t n, const cs_real_t *alpha, cs_real_t *restrict x)
void cs_blas_cuda_finalize(void)
Finalize CUDA BLAS API.
void cs_blas_cuda_axpy(cs_lnum_t n, const cs_real_t *alpha, const cs_real_t *restrict x, cs_real_t *restrict y)
double cs_blas_cuda_asum(cs_lnum_t n, const cs_real_t x[])
Return the absolute sum of vector values using CUDA.
double cs_blas_cuda_dot(cs_lnum_t n, const cs_real_t x[], const cs_real_t y[])
Return the dot product of 2 vectors: x.y using CUDA.
#define restrict
Definition: cs_defs.h:139
#define BEGIN_C_DECLS
Definition: cs_defs.h:509
double cs_real_t
Floating-point value.
Definition: cs_defs.h:319
#define END_C_DECLS
Definition: cs_defs.h:510
int cs_lnum_t
local mesh entity id
Definition: cs_defs.h:313
@ k
Definition: cs_field_pointer.h:70
double precision, dimension(ncharm), save alpha
Definition: cpincl.f90:99