8.1
general documentation
cs_blas_cuda.h
Go to the documentation of this file.
1 #ifndef __CS_BLAS_CUDA_H__
2 #define __CS_BLAS_CUDA_H__
3 
4 /*============================================================================
5  * BLAS (Basic Linear Algebra Subroutine) functions
6  *============================================================================*/
7 
8 /*
9  This file is part of code_saturne, a general-purpose CFD tool.
10 
11  Copyright (C) 1998-2023 EDF S.A.
12 
13  This program is free software; you can redistribute it and/or modify it under
14  the terms of the GNU General Public License as published by the Free Software
15  Foundation; either version 2 of the License, or (at your option) any later
16  version.
17 
18  This program is distributed in the hope that it will be useful, but WITHOUT
19  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
21  details.
22 
23  You should have received a copy of the GNU General Public License along with
24  this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
25  Street, Fifth Floor, Boston, MA 02110-1301, USA.
26 */
27 
28 /*----------------------------------------------------------------------------*/
29 
30 #include "cs_defs.h"
31 
32 /*----------------------------------------------------------------------------
33  * External library headers
34  *----------------------------------------------------------------------------*/
35 
36 /*----------------------------------------------------------------------------
37  * Local headers
38  *----------------------------------------------------------------------------*/
39 
40 #include "cs_base.h"
41 #include "cs_base_cuda.h"
42 
43 /*----------------------------------------------------------------------------*/
44 
46 
47 /*============================================================================
48  * Macro definitions
49  *============================================================================*/
50 
51 /*============================================================================
52  * Type definitions
53  *============================================================================*/
54 
56 
57 /*============================================================================
58  * Templated function definitions
59  *============================================================================*/
60 
61 #if defined(__CUDACC__)
62 
63 /*----------------------------------------------------------------------------*/
74 /*----------------------------------------------------------------------------*/
75 
template <size_t blockSize, size_t stride, typename T>
__device__ static void __forceinline__
cs_blas_cuda_warp_reduce_sum(volatile T  *stmp,
                             size_t       tid)
{
  /* Sum values within a single warp using shared memory.
   *
   * On return, stmp[0] (or stmp[0..stride-1] for interleaved vector data)
   * holds the warp's total. Only threads with tid < 32 may call this, and
   * all of them must call it together.
   *
   * blockSize : number of threads per block (compile-time constant)
   * stride    : number of interleaved components per element (1 = scalar)
   *
   * Note: the classic version of this pattern relied on implicit
   * warp-synchronous execution ("volatile" alone). Since Volta,
   * independent thread scheduling means warp lanes need not run in
   * lockstep, so an explicit __syncwarp() is required between dependent
   * steps (see CUDA C++ Programming Guide). This assumes the block size
   * is a multiple of the warp size, so all 32 lanes of the calling warp
   * exist and reach the barriers. */

  if (stride == 1) {

    if (blockSize >= 64) {stmp[tid] += stmp[tid + 32]; __syncwarp();}
    if (blockSize >= 32) {stmp[tid] += stmp[tid + 16]; __syncwarp();}
    if (blockSize >= 16) {stmp[tid] += stmp[tid +  8]; __syncwarp();}
    if (blockSize >=  8) {stmp[tid] += stmp[tid +  4]; __syncwarp();}
    if (blockSize >=  4) {stmp[tid] += stmp[tid +  2]; __syncwarp();}
    if (blockSize >=  2) {stmp[tid] += stmp[tid +  1]; __syncwarp();}

  }
  else { /* Interleaved (vector) data */

    if (blockSize >= 64) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 32)*stride + i];
      __syncwarp();
    }
    if (blockSize >= 32) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 16)*stride + i];
      __syncwarp();
    }
    if (blockSize >= 16) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 8)*stride + i];
      __syncwarp();
    }
    if (blockSize >= 8) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 4)*stride + i];
      __syncwarp();
    }
    if (blockSize >= 4) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 2)*stride + i];
      __syncwarp();
    }
    if (blockSize >= 2) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 1)*stride + i];
      __syncwarp();
    }
  }
}
125 
126 /*----------------------------------------------------------------------------*/
144 /*----------------------------------------------------------------------------*/
145 
/* Sum values within a thread block using shared memory.
 *
 * The caller must have written each thread's partial value(s) to
 * stmp[tid] (or stmp[tid*stride .. tid*stride + stride-1] for
 * interleaved vector data) before calling; the leading __syncthreads()
 * makes those writes visible block-wide.
 *
 * On return, the block's total is stored to sum_block[blockIdx.x]
 * (scalar case) or sum_block[blockIdx.x*stride + i] (vector case)
 * by thread 0.
 *
 * blockSize : number of threads per block (compile-time constant);
 *             stmp must hold at least blockSize*stride elements
 * stride    : number of interleaved components per element (1 = scalar)
 *
 * All threads of the block must call this function (it contains
 * __syncthreads() barriers). */

template <size_t blockSize, size_t stride, typename T>
__device__ static void __forceinline__
cs_blas_cuda_block_reduce_sum(T       *stmp,
                              size_t   tid,
                              T       *sum_block)
{
  __syncthreads();

  /* Tree reduction, loop explicitly unrolled so each "blockSize >= ..."
     test is resolved at compile time. */

  if (stride == 1) { /* Scalar data */

    if (blockSize >= 1024) {
      if (tid < 512) {
        stmp[tid] += stmp[tid + 512];
      }
      __syncthreads();  /* barrier outside the tid test: all threads reach it */
    }
    if (blockSize >= 512) {
      if (tid < 256) {
        stmp[tid] += stmp[tid + 256];
      }
      __syncthreads();
    }
    if (blockSize >= 256) {
      if (tid < 128) {
        stmp[tid] += stmp[tid + 128];
      }
      __syncthreads();
    }
    if (blockSize >= 128) {
      if (tid < 64) {
        stmp[tid] += stmp[tid + 64];
      }
      __syncthreads();
    }

    /* Final steps handled at warp level (no block barrier needed) */

    if (tid < 32) {
      cs_blas_cuda_warp_reduce_sum<blockSize, stride>(stmp, tid);
    }

    // Output: b_res for this block

    if (tid == 0) sum_block[blockIdx.x] = stmp[0];

  }

  else { /* Vector data, stride interleaved components per element */

    if (blockSize >= 1024) {
      if (tid < 512) {
        #pragma unroll
        for (size_t i = 0; i < stride; i++)
          stmp[tid*stride + i] += stmp[(tid + 512)*stride + i];
      }
      __syncthreads();
    }
    if (blockSize >= 512) {
      if (tid < 256) {
        #pragma unroll
        for (size_t i = 0; i < stride; i++)
          stmp[tid*stride + i] += stmp[(tid + 256)*stride + i];
      }
      __syncthreads();
    }
    if (blockSize >= 256) {
      if (tid < 128) {
        #pragma unroll
        for (size_t i = 0; i < stride; i++)
          stmp[tid*stride + i] += stmp[(tid + 128)*stride + i];
      }
      __syncthreads();
    }
    if (blockSize >= 128) {
      if (tid < 64) {
        #pragma unroll
        for (size_t i = 0; i < stride; i++)
          stmp[tid*stride + i] += stmp[(tid + 64)*stride + i];
      }
      __syncthreads();
    }

    if (tid < 32)
      cs_blas_cuda_warp_reduce_sum<blockSize, stride>(stmp, tid);

    // Output: b_res for this block

    if (tid == 0) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        sum_block[(blockIdx.x)*stride + i] = stmp[i];
    }

  }
}
237 
238 /*----------------------------------------------------------------------------*/
248 /*----------------------------------------------------------------------------*/
249 
/* Kernel summing n elements of g_idata into g_odata using a single block.
 *
 * Intended as the final stage of a multi-stage reduction: each thread
 * first accumulates a strided portion of g_idata into registers (the
 * loop steps by blockSize, ignoring gridDim, so the kernel must be
 * launched with exactly one block of blockSize threads), then partial
 * sums are reduced through shared memory and finally at warp level.
 *
 * blockSize : threads per block (compile-time constant)
 * stride    : interleaved components per element (1 = scalar);
 *             for stride > 1, g_odata receives stride values
 *
 * g_idata and g_odata are device pointers. */

template <size_t blockSize, size_t stride, typename T>
__global__ static void
cs_blas_cuda_reduce_single_block(size_t  n,
                                 T      *g_idata,
                                 T      *g_odata)
{
  __shared__ T sdata[blockSize * stride];

  size_t tid = threadIdx.x;
  T r_s[stride];  /* per-thread register accumulator */

  if (stride == 1) {

    r_s[0] = 0;
    sdata[tid] = 0.;

    /* Serial accumulation over this thread's strided slice of g_idata */

    for (size_t i = threadIdx.x; i < n; i+= blockSize)
      r_s[0] += g_idata[i];

    sdata[tid] = r_s[0];
    __syncthreads();

    /* Shared-memory tree reduction down to warp size; the "> warp size"
       bound leaves the last steps to the warp-level routine below */

    for (size_t j = blockSize/2; j > CS_CUDA_WARP_SIZE; j /= 2) {
      if (tid < j) {
        sdata[tid] += sdata[tid + j];
      }
      __syncthreads();
    }

    if (tid < 32) cs_blas_cuda_warp_reduce_sum<blockSize, stride>(sdata, tid);
    if (tid == 0) *g_odata = sdata[0];

  }
  else {

    #pragma unroll
    for (size_t k = 0; k < stride; k++) {
      r_s[k] = 0;
      sdata[tid*stride + k] = 0.;
    }

    for (size_t i = threadIdx.x; i < n; i+= blockSize) {
      #pragma unroll
      for (size_t k = 0; k < stride; k++) {
        r_s[k] += g_idata[i*stride + k];
      }
    }

    #pragma unroll
    for (size_t k = 0; k < stride; k++)
      sdata[tid*stride + k] = r_s[k];
    __syncthreads();

    for (size_t j = blockSize/2; j > CS_CUDA_WARP_SIZE; j /= 2) {
      if (tid < j) {
        #pragma unroll
        for (size_t k = 0; k < stride; k++)
          sdata[tid*stride + k] += sdata[(tid + j)*stride + k];
      }
      __syncthreads();
    }

    if (tid < 32) cs_blas_cuda_warp_reduce_sum<blockSize, stride>(sdata, tid);
    if (tid == 0) {
      #pragma unroll
      for (size_t k = 0; k < stride; k++)
        g_odata[k] = sdata[k];
    }

  }
}
321 
322 #endif /* defined(__CUDACC__) */
323 
325 
326 /*============================================================================
327  * Public function prototypes
328  *============================================================================*/
329 
330 /*----------------------------------------------------------------------------*/
336 /*----------------------------------------------------------------------------*/
337 
338 void
340 
341 #if defined(__CUDACC__)
342 
343 /*----------------------------------------------------------------------------*/
354 /*----------------------------------------------------------------------------*/
355 
/* Set the CUDA stream to be used by subsequent CUDA BLAS operations.
 * NOTE(review): original Doxygen comment was lost in extraction; exact
 * scope (per-call vs. global) to be confirmed against the implementation. */

void
cs_blas_cuda_set_stream(cudaStream_t  stream);
358 
359 /*----------------------------------------------------------------------------*/
376 /*----------------------------------------------------------------------------*/
377 
/* Obtain work buffers for a 2-stage reduction.
 *
 * n          : number of elements to reduce
 * tuple_size : number of interleaved components per element
 * grid_size  : number of blocks in the first-stage grid
 * r_grid     : output, per-block partial-result buffer
 * r_reduce   : output, final-result buffer
 *
 * NOTE(review): buffers appear to be managed by the module (returned by
 * reference, no free function visible here) — callers presumably must
 * not free them; confirm against the implementation. */

void
cs_blas_cuda_get_2_stage_reduce_buffers(cs_lnum_t      n,
                                        cs_lnum_t      tuple_size,
                                        unsigned int   grid_size,
                                        double*       &r_grid,
                                        double*       &r_reduce);
384 
385 #endif /* defined(__CUDACC__) */
386 
387 /*----------------------------------------------------------------------------*/
396 /*----------------------------------------------------------------------------*/
397 
398 double
400  const cs_real_t x[]);
401 
402 /*----------------------------------------------------------------------------*/
412 /*----------------------------------------------------------------------------*/
413 
414 double
416  const cs_real_t x[],
417  const cs_real_t y[]);
418 
419 #if defined(HAVE_CUBLAS)
420 
421 /*----------------------------------------------------------------------------*/
430 /*----------------------------------------------------------------------------*/
431 
/* Return the sum of absolute values of vector elements, using cuBLAS.
 *
 * n : number of elements
 * x : vector of elements (device pointer) */

double
cs_blas_cublas_asum(cs_lnum_t        n,
                    const cs_real_t  x[]);
435 
436 /*----------------------------------------------------------------------------*/
446 /*----------------------------------------------------------------------------*/
447 
/* Return the dot product of 2 vectors: x.y, using cuBLAS.
 *
 * n : number of elements
 * x : first vector of elements (device pointer)
 * y : second vector of elements (device pointer) */

double
cs_blas_cublas_dot(cs_lnum_t        n,
                   const cs_real_t  x[],
                   const cs_real_t  y[]);
452 
453 #endif /* defined(HAVE_CUBLAS) */
454 
455 /*----------------------------------------------------------------------------
456  * Compute y <- alpha.x + y
457  *
458  * This function may be set to use either cuBLAS or a local kernel.
459  *
460  * parameters:
461  * n <-- number of elements
462  * alpha <-- constant value (on device)
463  * x <-- vector of elements (on device)
464  * y <-> vector of elements (on device)
465  *----------------------------------------------------------------------------*/
466 
467 void
469  const cs_real_t *alpha,
470  const cs_real_t *restrict x,
471  cs_real_t *restrict y);
472 
473 /*----------------------------------------------------------------------------
474  * Compute x <- alpha.x
475  *
476  * This function may be set to use either cuBLAS or a local kernel.
477  *
478  * parameters:
479  * n <-- number of elements
480  * alpha <-- constant value (on device)
481  * x <-> vector of elements (on device)
482  *----------------------------------------------------------------------------*/
483 
484 void
486  const cs_real_t *alpha,
487  cs_real_t *restrict x);
488 
489 /*----------------------------------------------------------------------------*/
490 
492 
493 #endif /* __CS_BLAS_CUDA_H__ */
#define CS_CUDA_WARP_SIZE
Definition: cs_base_cuda.h:68
void cs_blas_cuda_scal(cs_lnum_t n, const cs_real_t *alpha, cs_real_t *restrict x)
void cs_blas_cuda_finalize(void)
Finalize CUDA BLAS API.
void cs_blas_cuda_axpy(cs_lnum_t n, const cs_real_t *alpha, const cs_real_t *restrict x, cs_real_t *restrict y)
double cs_blas_cuda_asum(cs_lnum_t n, const cs_real_t x[])
Return the absolute sum of vector values using CUDA.
double cs_blas_cuda_dot(cs_lnum_t n, const cs_real_t x[], const cs_real_t y[])
Return the dot product of 2 vectors: x.y using CUDA.
#define restrict
Definition: cs_defs.h:139
#define BEGIN_C_DECLS
Definition: cs_defs.h:514
double cs_real_t
Floating-point value.
Definition: cs_defs.h:319
#define END_C_DECLS
Definition: cs_defs.h:515
int cs_lnum_t
local mesh entity id
Definition: cs_defs.h:313
@ k
Definition: cs_field_pointer.h:70
double precision, dimension(ncharm), save alpha
Definition: cpincl.f90:96