1 #ifndef __CS_BLAS_CUDA_H__ 2 #define __CS_BLAS_CUDA_H__ 61 #if defined(__CUDACC__) 72 template <
size_t blockSize,
typename T>
73 __device__
static void __forceinline__
74 cs_blas_cuda_warp_reduce_sum(
volatile T *stmp,
77 if (blockSize >= 64) stmp[tid] += stmp[tid + 32];
78 if (blockSize >= 32) stmp[tid] += stmp[tid + 16];
79 if (blockSize >= 16) stmp[tid] += stmp[tid + 8];
80 if (blockSize >= 8) stmp[tid] += stmp[tid + 4];
81 if (blockSize >= 4) stmp[tid] += stmp[tid + 2];
82 if (blockSize >= 2) stmp[tid] += stmp[tid + 1];
101 template <
size_t blockSize,
typename T>
102 __device__
static void __forceinline__
103 cs_blas_cuda_block_reduce_sum(T *stmp,
111 if (blockSize >= 1024) {
113 stmp[tid] += stmp[tid + 512];
117 if (blockSize >= 512) {
119 stmp[tid] += stmp[tid + 256];
123 if (blockSize >= 256) {
125 stmp[tid] += stmp[tid + 128];
128 if (blockSize >= 128) {
130 stmp[tid] += stmp[tid + 64];
135 cs_blas_cuda_warp_reduce_sum<blockSize>(stmp, tid);
139 if (tid == 0) sum_block[blockIdx.x] = stmp[0];
152 template <
size_t blockSize,
typename T>
153 __global__
static void 154 cs_blas_cuda_reduce_single_block(
size_t n,
158 __shared__ T sdata[blockSize];
160 size_t tid = threadIdx.x;
163 for (
int i = threadIdx.x; i < n; i+= blockSize)
171 sdata[tid] += sdata[tid + j];
176 if (tid < 32) cs_blas_cuda_warp_reduce_sum<blockSize>(sdata, tid);
177 if (tid == 0) *g_odata = sdata[0];
199 #if defined(__CUDACC__) 215 cs_blas_cuda_set_stream(cudaStream_t stream);
241 unsigned int grid_size);
275 #if defined(HAVE_CUBLAS) void cs_blas_cuda_finalize(void)
Finalize CUDA BLAS API.
#define restrict
Definition: cs_defs.h:142
double cs_blas_cuda_dot(cs_lnum_t n, const cs_real_t x[], const cs_real_t y[])
Return the dot product of 2 vectors: x.y using CUDA.
double precision, dimension(ncharm), save alpha
Definition: cpincl.f90:99
#define BEGIN_C_DECLS
Definition: cs_defs.h:510
double cs_blas_cuda_asum(cs_lnum_t n, const cs_real_t x[])
Return the absolute sum of vector values using CUDA.
double cs_real_t
Floating-point value.
Definition: cs_defs.h:322
void cs_blas_cuda_scal(cs_lnum_t n, const cs_real_t *alpha, cs_real_t *restrict x)
#define CS_CUDA_WARP_SIZE
Definition: cs_base_cuda.h:68
void cs_blas_cuda_axpy(cs_lnum_t n, const cs_real_t *alpha, const cs_real_t *restrict x, cs_real_t *restrict y)
int cs_lnum_t
local mesh entity id
Definition: cs_defs.h:316
#define END_C_DECLS
Definition: cs_defs.h:511
double * cs_blas_cuda_get_2_stage_reduce_buffer(cs_lnum_t n, cs_lnum_t tuple_size, unsigned int grid_size)
Return pointer to reduction buffer needed for 2-stage reductions.