1 #ifndef __CS_BLAS_CUDA_H__
2 #define __CS_BLAS_CUDA_H__
61 #if defined(__CUDACC__)
76 template <
size_t blockSize,
size_t str
ide,
typename T>
77 __device__
static void __forceinline__
78 cs_blas_cuda_warp_reduce_sum(
volatile T *stmp,
83 if (blockSize >= 64) stmp[tid] += stmp[tid + 32];
84 if (blockSize >= 32) stmp[tid] += stmp[tid + 16];
85 if (blockSize >= 16) stmp[tid] += stmp[tid + 8];
86 if (blockSize >= 8) stmp[tid] += stmp[tid + 4];
87 if (blockSize >= 4) stmp[tid] += stmp[tid + 2];
88 if (blockSize >= 2) stmp[tid] += stmp[tid + 1];
93 if (blockSize >= 64) {
95 for (
size_t i = 0; i < stride; i++)
96 stmp[tid*stride + i] += stmp[(tid + 32)*stride + i];
98 if (blockSize >= 32) {
100 for (
size_t i = 0; i < stride; i++)
101 stmp[tid*stride + i] += stmp[(tid + 16)*stride + i];
103 if (blockSize >= 16) {
105 for (
size_t i = 0; i < stride; i++)
106 stmp[tid*stride + i] += stmp[(tid + 8)*stride + i];
108 if (blockSize >= 8) {
110 for (
size_t i = 0; i < stride; i++)
111 stmp[tid*stride + i] += stmp[(tid + 4)*stride + i];
113 if (blockSize >= 4) {
115 for (
size_t i = 0; i < stride; i++)
116 stmp[tid*stride + i] += stmp[(tid + 2)*stride + i];
118 if (blockSize >= 2) {
120 for (
size_t i = 0; i < stride; i++)
121 stmp[tid*stride + i] += stmp[(tid + 1)*stride + i];
146 template <
size_t blockSize,
size_t str
ide,
typename T>
147 __device__
static void __forceinline__
148 cs_blas_cuda_block_reduce_sum(T *stmp,
158 if (blockSize >= 1024) {
160 stmp[tid] += stmp[tid + 512];
164 if (blockSize >= 512) {
166 stmp[tid] += stmp[tid + 256];
170 if (blockSize >= 256) {
172 stmp[tid] += stmp[tid + 128];
175 if (blockSize >= 128) {
177 stmp[tid] += stmp[tid + 64];
182 cs_blas_cuda_warp_reduce_sum<blockSize, stride>(stmp, tid);
187 if (tid == 0) sum_block[blockIdx.x] = stmp[0];
193 if (blockSize >= 1024) {
196 for (
size_t i = 0; i < stride; i++)
197 stmp[tid*stride + i] += stmp[(tid + 512)*stride + i];
201 if (blockSize >= 512) {
204 for (
size_t i = 0; i < stride; i++)
205 stmp[tid*stride + i] += stmp[(tid + 256)*stride + i];
209 if (blockSize >= 256) {
212 for (
size_t i = 0; i < stride; i++)
213 stmp[tid*stride + i] += stmp[(tid + 128)*stride + i];
216 if (blockSize >= 128) {
219 for (
size_t i = 0; i < stride; i++)
220 stmp[tid*stride + i] += stmp[(tid + 64)*stride + i];
225 cs_blas_cuda_warp_reduce_sum<blockSize, stride>(stmp, tid);
231 for (
size_t i = 0; i < stride; i++)
232 sum_block[(blockIdx.x)*stride + i] = stmp[i];
250 template <
size_t blockSize,
size_t str
ide,
typename T>
251 __global__
static void
252 cs_blas_cuda_reduce_single_block(
size_t n,
256 __shared__ T sdata[blockSize * stride];
258 size_t tid = threadIdx.x;
266 for (
size_t i = threadIdx.x; i < n; i+= blockSize)
267 r_s[0] += g_idata[i];
274 sdata[tid] += sdata[tid + j];
279 if (tid < 32) cs_blas_cuda_warp_reduce_sum<blockSize, stride>(sdata, tid);
280 if (tid == 0) *g_odata = sdata[0];
286 for (
size_t k = 0;
k < stride;
k++) {
288 sdata[tid*stride +
k] = 0.;
291 for (
size_t i = threadIdx.x; i < n; i+= blockSize) {
293 for (
size_t k = 0;
k < stride;
k++) {
294 r_s[
k] += g_idata[i*stride +
k];
298 for (
size_t k = 0;
k < stride;
k++)
299 sdata[tid*stride +
k] = r_s[
k];
305 for (
size_t k = 0;
k < stride;
k++)
306 sdata[tid*stride +
k] += sdata[(tid + j)*stride +
k];
313 if (tid < 32) cs_blas_cuda_warp_reduce_sum<blockSize, stride>(sdata, tid);
316 for (
size_t k = 0;
k < stride;
k++)
317 g_odata[
k] = sdata[
k];
342 #if defined(__CUDACC__)
358 cs_blas_cuda_set_stream(cudaStream_t stream);
380 cs_blas_cuda_get_2_stage_reduce_buffers(
cs_lnum_t n,
382 unsigned int grid_size,
420 #if defined(HAVE_CUBLAS)
#define CS_CUDA_WARP_SIZE
Definition: cs_base_cuda.h:68
void cs_blas_cuda_scal(cs_lnum_t n, const cs_real_t *alpha, cs_real_t *restrict x)
void cs_blas_cuda_finalize(void)
Finalize CUDA BLAS API.
void cs_blas_cuda_axpy(cs_lnum_t n, const cs_real_t *alpha, const cs_real_t *restrict x, cs_real_t *restrict y)
double cs_blas_cuda_asum(cs_lnum_t n, const cs_real_t x[])
Return the absolute sum of vector values using CUDA.
double cs_blas_cuda_dot(cs_lnum_t n, const cs_real_t x[], const cs_real_t y[])
Return the dot product of 2 vectors: x.y using CUDA.
#define restrict
Definition: cs_defs.h:139
#define BEGIN_C_DECLS
Definition: cs_defs.h:509
double cs_real_t
Floating-point value.
Definition: cs_defs.h:319
#define END_C_DECLS
Definition: cs_defs.h:510
int cs_lnum_t
local mesh entity id
Definition: cs_defs.h:313
@ k
Definition: cs_field_pointer.h:70
double precision, dimension(ncharm), save alpha
Definition: cpincl.f90:99