cs_blas_cuda.h
#ifndef __CS_BLAS_CUDA_H__
#define __CS_BLAS_CUDA_H__

/*============================================================================
 * BLAS (Basic Linear Algebra Subroutine) functions
 *============================================================================*/

/*
  This file is part of code_saturne, a general-purpose CFD tool.

  Copyright (C) 1998-2022 EDF S.A.

  This program is free software; you can redistribute it and/or modify it under
  the terms of the GNU General Public License as published by the Free Software
  Foundation; either version 2 of the License, or (at your option) any later
  version.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  details.

  You should have received a copy of the GNU General Public License along with
  this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
  Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/*----------------------------------------------------------------------------*/

#include "cs_defs.h"

/*----------------------------------------------------------------------------
 * External library headers
 *----------------------------------------------------------------------------*/

/*----------------------------------------------------------------------------
 * Local headers
 *----------------------------------------------------------------------------*/

#include "cs_base.h"
#include "cs_base_cuda.h"

/*----------------------------------------------------------------------------*/

BEGIN_C_DECLS

/*============================================================================
 * Macro definitions
 *============================================================================*/

/*============================================================================
 * Type definitions
 *============================================================================*/

END_C_DECLS

/*============================================================================
 * Templated function definitions
 *============================================================================*/

#if defined(__CUDACC__)

/*----------------------------------------------------------------------------*/
/*!
 * \brief Compute warp-level sum reduction of shared values within a block.
 *
 * \param[in, out]  stmp  shared values to reduce
 * \param[in]       tid   thread id in block
 */
/*----------------------------------------------------------------------------*/

template <size_t blockSize, typename T>
__device__ static void __forceinline__
cs_blas_cuda_warp_reduce_sum(volatile T  *stmp,
                             size_t       tid)
{
  if (blockSize >= 64) stmp[tid] += stmp[tid + 32];
  if (blockSize >= 32) stmp[tid] += stmp[tid + 16];
  if (blockSize >= 16) stmp[tid] += stmp[tid + 8];
  if (blockSize >= 8)  stmp[tid] += stmp[tid + 4];
  if (blockSize >= 4)  stmp[tid] += stmp[tid + 2];
  if (blockSize >= 2)  stmp[tid] += stmp[tid + 1];
}

/*----------------------------------------------------------------------------*/
/*!
 * \brief Compute block-level sum reduction of shared values, saving the
 *        result for this block.
 *
 * \param[in, out]  stmp       shared values to reduce
 * \param[in]       tid        thread id in block
 * \param[out]      sum_block  per-block sums (size: number of blocks)
 */
/*----------------------------------------------------------------------------*/

template <size_t blockSize, typename T>
__device__ static void __forceinline__
cs_blas_cuda_block_reduce_sum(T       *stmp,
                              size_t   tid,
                              double  *sum_block)
{
  __syncthreads();

  /* Loop explicitly unrolled */

  if (blockSize >= 1024) {
    if (tid < 512) {
      stmp[tid] += stmp[tid + 512];
    }
    __syncthreads();
  }
  if (blockSize >= 512) {
    if (tid < 256) {
      stmp[tid] += stmp[tid + 256];
    }
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128) {
      stmp[tid] += stmp[tid + 128];
    }
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64) {
      stmp[tid] += stmp[tid + 64];
    }
    __syncthreads();
  }

  if (tid < 32)
    cs_blas_cuda_warp_reduce_sum<blockSize>(stmp, tid);

  /* Output: sum_block[blockIdx.x] for this block */

  if (tid == 0) sum_block[blockIdx.x] = stmp[0];
}

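/*
  Illustrative sketch (not part of this header): a possible stage-1 kernel
  computing per-block partial sums of x.y with cs_blas_cuda_block_reduce_sum.
  The kernel name "_dot_stage_1" and its launch configuration are assumptions
  for illustration only.

  template <size_t blockSize>
  __global__ static void
  _dot_stage_1(cs_lnum_t      n,
               const double  *x,
               const double  *y,
               double        *b_res)
  {
    __shared__ double stmp[blockSize];
    size_t tid = threadIdx.x;

    stmp[tid] = 0;

    // Grid-stride loop: each thread accumulates part of the products.
    for (cs_lnum_t i = blockIdx.x*blockDim.x + tid; i < n;
         i += gridDim.x*blockDim.x)
      stmp[tid] += x[i]*y[i];

    // Block-level reduction; thread 0 writes b_res[blockIdx.x].
    cs_blas_cuda_block_reduce_sum<blockSize>(stmp, tid, b_res);
  }
*/
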
/*----------------------------------------------------------------------------*/
/*!
 * \brief Kernel for the final stage of a sum reduction: sum partial results
 *        within a single block.
 *
 * \param[in]   n        number of values to sum
 * \param[in]   g_idata  input values (on device)
 * \param[out]  g_odata  resulting sum (single value, on device)
 */
/*----------------------------------------------------------------------------*/

template <size_t blockSize, typename T>
__global__ static void
cs_blas_cuda_reduce_single_block(size_t   n,
                                 T       *g_idata,
                                 T       *g_odata)
{
  __shared__ T sdata[blockSize];

  size_t tid = threadIdx.x;
  T r_s = 0;

  for (int i = threadIdx.x; i < n; i += blockSize)
    r_s += g_idata[i];

  sdata[tid] = r_s;
  __syncthreads();

  for (int j = blockSize/2; j > CS_CUDA_WARP_SIZE; j /= 2) {
    if (tid < j) {
      sdata[tid] += sdata[tid + j];
    }
    __syncthreads();
  }

  if (tid < 32) cs_blas_cuda_warp_reduce_sum<blockSize>(sdata, tid);
  if (tid == 0) *g_odata = sdata[0];
}

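/*
  Illustrative sketch (not part of this header): completing a 2-stage
  reduction on the host side, reusing the hypothetical "_dot_stage_1" kernel
  sketched above. Block size, grid size and variable names are assumptions.

  const unsigned int block_size = 256;
  unsigned int grid_size = (n % block_size) ? n/block_size + 1 : n/block_size;

  double *b_res = cs_blas_cuda_get_2_stage_reduce_buffer(n, 1, grid_size);

  // Stage 1: one partial sum per block.
  _dot_stage_1<block_size><<<grid_size, block_size>>>(n, x, y, b_res);

  // Stage 2: sum the partial results with a single block; the final value
  // lands in b_res[0] (all reads of b_res precede the write).
  cs_blas_cuda_reduce_single_block<block_size><<<1, block_size>>>
    (grid_size, b_res, b_res);

  double s;
  cudaMemcpy(&s, b_res, sizeof(double), cudaMemcpyDeviceToHost);
*/
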
#endif /* defined(__CUDACC__) */

BEGIN_C_DECLS

/*============================================================================
 * Public function prototypes
 *============================================================================*/

/*----------------------------------------------------------------------------*/
/*!
 * \brief Finalize CUDA BLAS API.
 */
/*----------------------------------------------------------------------------*/

void
cs_blas_cuda_finalize(void);

#if defined(__CUDACC__)

/*----------------------------------------------------------------------------*/
/*!
 * \brief Assign CUDA stream to use for subsequent CUDA BLAS operations.
 *
 * \param[in]  stream  CUDA stream to use
 */
/*----------------------------------------------------------------------------*/

void
cs_blas_cuda_set_stream(cudaStream_t  stream);

#endif /* defined(__CUDACC__) */

/*----------------------------------------------------------------------------*/
/*!
 * \brief Return pointer to reduction buffer needed for 2-stage reductions.
 *
 * \param[in]  n           number of elements to reduce
 * \param[in]  tuple_size  number of values per tuple
 * \param[in]  grid_size   associated grid size
 *
 * \return  pointer to reduction buffer (on device)
 */
/*----------------------------------------------------------------------------*/

double *
cs_blas_cuda_get_2_stage_reduce_buffer(cs_lnum_t     n,
                                       cs_lnum_t     tuple_size,
                                       unsigned int  grid_size);

/*----------------------------------------------------------------------------*/
/*!
 * \brief Return the absolute sum of vector values using CUDA.
 *
 * \param[in]  n  number of elements
 * \param[in]  x  vector of elements (on device)
 *
 * \return  sum of absolute values
 */
/*----------------------------------------------------------------------------*/

double
cs_blas_cuda_asum(cs_lnum_t        n,
                  const cs_real_t  x[]);

/*----------------------------------------------------------------------------*/
/*!
 * \brief Return the dot product of 2 vectors: x.y using CUDA.
 *
 * \param[in]  n  number of elements
 * \param[in]  x  vector of elements (on device)
 * \param[in]  y  vector of elements (on device)
 *
 * \return  dot product
 */
/*----------------------------------------------------------------------------*/

double
cs_blas_cuda_dot(cs_lnum_t        n,
                 const cs_real_t  x[],
                 const cs_real_t  y[]);

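/*
  Illustrative sketch (not part of this header): both functions expect
  device-resident arrays and return their result to the host. The variable
  names d_x and d_y are assumptions.

  double asum = cs_blas_cuda_asum(n, d_x);       // sum of |x[i]|
  double dot  = cs_blas_cuda_dot(n, d_x, d_y);   // x.y
*/
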
#if defined(HAVE_CUBLAS)

/*----------------------------------------------------------------------------*/
/*!
 * \brief Return the absolute sum of vector values using cuBLAS.
 *
 * \param[in]  n  number of elements
 * \param[in]  x  vector of elements (on device)
 *
 * \return  sum of absolute values
 */
/*----------------------------------------------------------------------------*/

double
cs_blas_cublas_asum(cs_lnum_t        n,
                    const cs_real_t  x[]);

/*----------------------------------------------------------------------------*/
/*!
 * \brief Return the dot product of 2 vectors: x.y using cuBLAS.
 *
 * \param[in]  n  number of elements
 * \param[in]  x  vector of elements (on device)
 * \param[in]  y  vector of elements (on device)
 *
 * \return  dot product
 */
/*----------------------------------------------------------------------------*/

double
cs_blas_cublas_dot(cs_lnum_t        n,
                   const cs_real_t  x[],
                   const cs_real_t  y[]);

#endif /* defined(HAVE_CUBLAS) */

/*----------------------------------------------------------------------------
 * Compute y <- alpha.x + y
 *
 * This function may be set to use either cuBLAS or a local kernel.
 *
 * parameters:
 *   n     <-- number of elements
 *   alpha <-- constant value (on device)
 *   x     <-- vector of elements (on device)
 *   y     <-> vector of elements (on device)
 *----------------------------------------------------------------------------*/

void
cs_blas_cuda_axpy(cs_lnum_t                  n,
                  const cs_real_t           *alpha,
                  const cs_real_t *restrict  x,
                  cs_real_t       *restrict  y);

/*----------------------------------------------------------------------------
 * Compute x <- alpha.x
 *
 * This function may be set to use either cuBLAS or a local kernel.
 *
 * parameters:
 *   n     <-- number of elements
 *   alpha <-- constant value (on device)
 *   x     <-> vector of elements (on device)
 *----------------------------------------------------------------------------*/

void
cs_blas_cuda_scal(cs_lnum_t                  n,
                  const cs_real_t           *alpha,
                  cs_real_t       *restrict  x);

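/*
  Illustrative sketch (not part of this header): as noted in the comments
  above, alpha must also reside on the device. The variable names d_alpha,
  d_x and d_y are assumptions.

  cs_blas_cuda_axpy(n, d_alpha, d_x, d_y);   // y <- alpha.x + y
  cs_blas_cuda_scal(n, d_alpha, d_x);        // x <- alpha.x
*/
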
/*----------------------------------------------------------------------------*/

END_C_DECLS

#endif /* __CS_BLAS_CUDA_H__ */