8.3
general documentation
cs_blas_cuda.h
Go to the documentation of this file.
1#ifndef __CS_BLAS_CUDA_H__
2#define __CS_BLAS_CUDA_H__
3
4/*============================================================================
5 * BLAS (Basic Linear Algebra Subroutine) functions
6 *============================================================================*/
7
8/*
9 This file is part of code_saturne, a general-purpose CFD tool.
10
11 Copyright (C) 1998-2024 EDF S.A.
12
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; either version 2 of the License, or (at your option) any later
16 version.
17
18 This program is distributed in the hope that it will be useful, but WITHOUT
19 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
21 details.
22
23 You should have received a copy of the GNU General Public License along with
24 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
25 Street, Fifth Floor, Boston, MA 02110-1301, USA.
26*/
27
28/*----------------------------------------------------------------------------*/
29
30#include "cs_defs.h"
31
32/*----------------------------------------------------------------------------
33 * External library headers
34 *----------------------------------------------------------------------------*/
35
36/*----------------------------------------------------------------------------
37 * Local headers
38 *----------------------------------------------------------------------------*/
39
40#include "cs_base.h"
41#include "cs_base_cuda.h"
42
43/*----------------------------------------------------------------------------*/
44
46
47/*============================================================================
48 * Macro definitions
49 *============================================================================*/
50
51/*============================================================================
52 * Type definitions
53 *============================================================================*/
54
56
57/*============================================================================
58 * Templated function definitions
59 *============================================================================*/
60
61#if defined(__CUDACC__)
62
63/*----------------------------------------------------------------------------*/
74/*----------------------------------------------------------------------------*/
75
/*----------------------------------------------------------------------------*/
/*!
 * \brief Warp-synchronous sum reduction of a shared-memory work array.
 *
 * Expected to be called by the first warp only (i.e. inside
 * "if (tid < 32)"), after a block-level barrier.
 *
 * NOTE(review): this relies on the legacy implicit warp-synchronous
 * execution model (volatile accesses, no __syncwarp between steps);
 * confirm this is intended for the targeted architectures.
 *
 * \tparam blockSize  number of threads per block
 * \tparam stride     number of interleaved components per element
 * \tparam T          data type
 *
 * \param[in, out]  stmp  shared-memory work array
 * \param[in]       tid   thread id in block (must be < 32)
 */
/*----------------------------------------------------------------------------*/

template <size_t blockSize, size_t stride, typename T>
__device__ static void __forceinline__
cs_blas_cuda_warp_reduce_sum(volatile T  *stmp,
                             size_t       tid)
{
  /* Halving offsets 32, 16, 8, 4, 2, 1; both blockSize and stride are
     compile-time constants, so the guards and inner loops are resolved
     at compilation (stride == 1 collapses to the scalar case). */

  #pragma unroll
  for (size_t j = 32; j > 0; j /= 2) {
    if (blockSize >= 2*j) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + j)*stride + i];
    }
  }
}
125
126/*----------------------------------------------------------------------------*/
144/*----------------------------------------------------------------------------*/
145
/*----------------------------------------------------------------------------*/
/*!
 * \brief Block-level sum reduction of a shared-memory work array, writing
 *        one result per block to global memory.
 *
 * \tparam blockSize  number of threads per block (power of 2, <= 1024)
 * \tparam stride     number of interleaved components per element
 * \tparam T          data type
 *
 * \param[in, out]  stmp       shared-memory work array, filled by the caller
 * \param[in]       tid        thread id in block
 * \param[out]      sum_block  per-block results; entries
 *                             blockIdx.x*stride .. blockIdx.x*stride+stride-1
 *                             receive this block's sums
 */
/*----------------------------------------------------------------------------*/

template <size_t blockSize, size_t stride, typename T>
__device__ static void __forceinline__
cs_blas_cuda_block_reduce_sum(T       *stmp,
                              size_t   tid,
                              T       *sum_block)
{
  /* Make the caller's writes to stmp visible to the whole block. */

  __syncthreads();

  /* Tree reduction with halving strides 512, 256, 128, 64; the guards on
     the (compile-time) block size keep only the applicable levels, and
     stride == 1 collapses the inner loops to the scalar case. The barrier
     is outside the "tid < j" test, so all threads reach it. */

  #pragma unroll
  for (size_t j = 512; j >= 64; j /= 2) {
    if (blockSize >= 2*j) {
      if (tid < j) {
        #pragma unroll
        for (size_t i = 0; i < stride; i++)
          stmp[tid*stride + i] += stmp[(tid + j)*stride + i];
      }
      __syncthreads();
    }
  }

  /* Final stage handled warp-synchronously by the first warp. */

  if (tid < 32)
    cs_blas_cuda_warp_reduce_sum<blockSize, stride>(stmp, tid);

  /* Output: result for this block */

  if (tid == 0) {
    #pragma unroll
    for (size_t i = 0; i < stride; i++)
      sum_block[blockIdx.x*stride + i] = stmp[i];
  }
}
237
238/*----------------------------------------------------------------------------*/
248/*----------------------------------------------------------------------------*/
249
/*----------------------------------------------------------------------------*/
/*!
 * \brief Kernel for sum reduction of a device array within a single block.
 *
 * The input is strided by blockSize only (blockIdx is not used for
 * indexing), so this kernel must be launched with a single block; it is
 * intended as the final stage of a multi-stage reduction.
 *
 * \tparam blockSize  number of threads per block (power of 2, <= 1024)
 * \tparam stride     number of interleaved components per element
 * \tparam T          data type
 *
 * \param[in]   n        number of (strided) elements to reduce
 * \param[in]   g_idata  input array (device memory, n*stride values)
 * \param[out]  g_odata  result (device memory, stride values)
 */
/*----------------------------------------------------------------------------*/

template <size_t blockSize, size_t stride, typename T>
__global__ static void
cs_blas_cuda_reduce_single_block(size_t   n,
                                 T       *g_idata,
                                 T       *g_odata)
{
  __shared__ T sdata[blockSize * stride];

  size_t tid = threadIdx.x;

  /* Per-thread partial sums accumulated in registers; sdata is written
     unconditionally below before any read, so the zero-initialization of
     shared memory done by the previous version was a dead store and has
     been removed. Both branches (scalar/vector) are also unified: with
     stride == 1 the inner loops collapse at compile time. */

  T r_s[stride];

  #pragma unroll
  for (size_t k = 0; k < stride; k++)
    r_s[k] = 0;

  for (size_t i = tid; i < n; i += blockSize) {
    #pragma unroll
    for (size_t k = 0; k < stride; k++)
      r_s[k] += g_idata[i*stride + k];
  }

  #pragma unroll
  for (size_t k = 0; k < stride; k++)
    sdata[tid*stride + k] = r_s[k];
  __syncthreads();

  /* Tree reduction down to warp size; the barrier is outside the
     "tid < j" test, so all threads reach it. */

  for (size_t j = blockSize/2; j > CS_CUDA_WARP_SIZE; j /= 2) {
    if (tid < j) {
      #pragma unroll
      for (size_t k = 0; k < stride; k++)
        sdata[tid*stride + k] += sdata[(tid + j)*stride + k];
    }
    __syncthreads();
  }

  /* Final stage handled warp-synchronously by the first warp. */

  if (tid < 32)
    cs_blas_cuda_warp_reduce_sum<blockSize, stride>(sdata, tid);

  if (tid == 0) {
    #pragma unroll
    for (size_t k = 0; k < stride; k++)
      g_odata[k] = sdata[k];
  }
}
321
322#endif /* defined(__CUDACC__) */
323
BEGIN_C_DECLS

326/*============================================================================
327 * Public function prototypes
328 *============================================================================*/
329
330/*----------------------------------------------------------------------------*/
336/*----------------------------------------------------------------------------*/
337
void
cs_blas_cuda_finalize(void);
341#if defined(__CUDACC__)
342
343/*----------------------------------------------------------------------------*/
354/*----------------------------------------------------------------------------*/
355
356void
357cs_blas_cuda_set_stream(cudaStream_t stream);
358
359/*----------------------------------------------------------------------------*/
376/*----------------------------------------------------------------------------*/
377
378void
379cs_blas_cuda_get_2_stage_reduce_buffers(cs_lnum_t n,
380 cs_lnum_t tuple_size,
381 unsigned int grid_size,
382 double* &r_grid,
383 double* &r_reduce);
384
385#endif /* defined(__CUDACC__) */
386
387/*----------------------------------------------------------------------------*/
396/*----------------------------------------------------------------------------*/
397
double
cs_blas_cuda_asum(cs_lnum_t n,
                  const cs_real_t x[]);
401
402/*----------------------------------------------------------------------------*/
412/*----------------------------------------------------------------------------*/
413
double
cs_blas_cuda_dot(cs_lnum_t n,
                 const cs_real_t x[],
                 const cs_real_t y[]);
418
419#if defined(HAVE_CUBLAS)
420
421/*----------------------------------------------------------------------------*/
430/*----------------------------------------------------------------------------*/
431
432double
433cs_blas_cublas_asum(cs_lnum_t n,
434 const cs_real_t x[]);
435
436/*----------------------------------------------------------------------------*/
446/*----------------------------------------------------------------------------*/
447
448double
449cs_blas_cublas_dot(cs_lnum_t n,
450 const cs_real_t x[],
451 const cs_real_t y[]);
452
453#endif /* defined(HAVE_CUBLAS) */
454
455/*----------------------------------------------------------------------------
456 * Compute y <- alpha.x + y
457 *
458 * This function may be set to use either cuBLAS or a local kernel.
459 *
460 * parameters:
461 * n <-- number of elements
462 * alpha <-- constant value (on device)
463 * x <-- vector of elements (on device)
464 * y <-> vector of elements (on device)
465 *----------------------------------------------------------------------------*/
466
void
cs_blas_cuda_axpy(cs_lnum_t n,
                  const cs_real_t *alpha,
                  const cs_real_t *x,
                  cs_real_t *y);
472
473/*----------------------------------------------------------------------------
474 * Compute x <- alpha.x
475 *
476 * This function may be set to use either cuBLAS or a local kernel.
477 *
478 * parameters:
479 * n <-- number of elements
480 * alpha <-- constant value (on device)
481 * x <-> vector of elements (on device)
482 *----------------------------------------------------------------------------*/
483
void
cs_blas_cuda_scal(cs_lnum_t n,
                  const cs_real_t *alpha,
                  cs_real_t *x);
488
489/*----------------------------------------------------------------------------*/
490
END_C_DECLS
493#endif /* __CS_BLAS_CUDA_H__ */
void cs_blas_cuda_finalize(void)
Finalize CUDA BLAS API.
double cs_blas_cuda_asum(cs_lnum_t n, const cs_real_t x[])
Return the absolute sum of vector values using CUDA.
void cs_blas_cuda_scal(cs_lnum_t n, const cs_real_t *alpha, cs_real_t *x)
double cs_blas_cuda_dot(cs_lnum_t n, const cs_real_t x[], const cs_real_t y[])
Return the dot product of 2 vectors: x.y using CUDA.
void cs_blas_cuda_axpy(cs_lnum_t n, const cs_real_t *alpha, const cs_real_t *x, cs_real_t *y)
#define BEGIN_C_DECLS
Definition: cs_defs.h:542
double cs_real_t
Floating-point value.
Definition: cs_defs.h:342
#define END_C_DECLS
Definition: cs_defs.h:543
int cs_lnum_t
local mesh entity id
Definition: cs_defs.h:335
@ k
Definition: cs_field_pointer.h:72