1 #ifndef __CS_BLAS_CUDA_H__ 
    2 #define __CS_BLAS_CUDA_H__ 
   61 #if defined(__CUDACC__) 
   76 template <
size_t blockSize, 
size_t str
ide, 
typename T>
 
   77 __device__ 
static void __forceinline__
 
   78 cs_blas_cuda_warp_reduce_sum(
volatile T  *stmp,
 
   83     if (blockSize >= 64) stmp[tid] += stmp[tid + 32];
 
   84     if (blockSize >= 32) stmp[tid] += stmp[tid + 16];
 
   85     if (blockSize >= 16) stmp[tid] += stmp[tid +  8];
 
   86     if (blockSize >=  8) stmp[tid] += stmp[tid +  4];
 
   87     if (blockSize >=  4) stmp[tid] += stmp[tid +  2];
 
   88     if (blockSize >=  2) stmp[tid] += stmp[tid +  1];
 
   93     if (blockSize >= 64) {
 
   95       for (
size_t i = 0; i < stride; i++)
 
   96         stmp[tid*stride + i] += stmp[(tid + 32)*stride + i];
 
   98     if (blockSize >= 32) {
 
  100       for (
size_t i = 0; i < stride; i++)
 
  101         stmp[tid*stride + i] += stmp[(tid + 16)*stride + i];
 
  103     if (blockSize >= 16) {
 
  105       for (
size_t i = 0; i < stride; i++)
 
  106         stmp[tid*stride + i] += stmp[(tid + 8)*stride + i];
 
  108     if (blockSize >= 8) {
 
  110       for (
size_t i = 0; i < stride; i++)
 
  111         stmp[tid*stride + i] += stmp[(tid + 4)*stride + i];
 
  113     if (blockSize >= 4) {
 
  115       for (
size_t i = 0; i < stride; i++)
 
  116         stmp[tid*stride + i] += stmp[(tid + 2)*stride + i];
 
  118     if (blockSize >= 2) {
 
  120       for (
size_t i = 0; i < stride; i++)
 
  121         stmp[tid*stride + i] += stmp[(tid + 1)*stride + i];
 
  146 template <
size_t blockSize, 
size_t str
ide, 
typename T>
 
  147 __device__ 
static void __forceinline__
 
  148 cs_blas_cuda_block_reduce_sum(T       *stmp,
 
  158     if (blockSize >= 1024) {
 
  160         stmp[tid] += stmp[tid + 512];
 
  164     if (blockSize >= 512) {
 
  166         stmp[tid] += stmp[tid + 256];
 
  170     if (blockSize >= 256) {
 
  172         stmp[tid] += stmp[tid + 128];
 
  175     if (blockSize >= 128) {
 
  177         stmp[tid] += stmp[tid +  64];
 
  182       cs_blas_cuda_warp_reduce_sum<blockSize, stride>(stmp, tid);
 
  187     if (tid == 0) sum_block[blockIdx.x] = stmp[0];
 
  193     if (blockSize >= 1024) {
 
  196         for (
size_t i = 0; i < stride; i++)
 
  197           stmp[tid*stride + i] += stmp[(tid + 512)*stride + i];
 
  201     if (blockSize >= 512) {
 
  204         for (
size_t i = 0; i < stride; i++)
 
  205           stmp[tid*stride + i] += stmp[(tid + 256)*stride + i];
 
  209     if (blockSize >= 256) {
 
  212         for (
size_t i = 0; i < stride; i++)
 
  213           stmp[tid*stride + i] += stmp[(tid + 128)*stride + i];
 
  216     if (blockSize >= 128) {
 
  219         for (
size_t i = 0; i < stride; i++)
 
  220           stmp[tid*stride + i] += stmp[(tid + 64)*stride + i];
 
  225       cs_blas_cuda_warp_reduce_sum<blockSize, stride>(stmp, tid);
 
  231       for (
size_t i = 0; i < stride; i++)
 
  232         sum_block[(blockIdx.x)*stride + i] = stmp[i];
 
  250 template <
size_t blockSize, 
size_t str
ide, 
typename T>
 
  251 __global__ 
static void 
  252 cs_blas_cuda_reduce_single_block(
size_t   n,
 
  256   __shared__ T sdata[blockSize * stride];
 
  258   size_t tid = threadIdx.x;
 
  266     for (
size_t i = threadIdx.x; i < n; i+= blockSize)
 
  267       r_s[0] += g_idata[i];
 
  272     for (
size_t j = blockSize/2; j > CS_CUDA_WARP_SIZE; j /= 2) {
 
  274         sdata[tid] += sdata[tid + j];
 
  279     if (tid < 32) cs_blas_cuda_warp_reduce_sum<blockSize, stride>(sdata, tid);
 
  280     if (tid == 0) *g_odata = sdata[0];
 
  286     for (
size_t k = 0; 
k < stride; 
k++) {
 
  288       sdata[tid*stride + 
k] = 0.;
 
  291     for (
size_t i = threadIdx.x; i < n; i+= blockSize) {
 
  293       for (
size_t k = 0; 
k < stride; 
k++) {
 
  294         r_s[
k] += g_idata[i*stride + 
k];
 
  299     for (
size_t k = 0; 
k < stride; 
k++)
 
  300       sdata[tid*stride + 
k] = r_s[
k];
 
  303     for (
size_t j = blockSize/2; j > CS_CUDA_WARP_SIZE; j /= 2) {
 
  306         for (
size_t k = 0; 
k < stride; 
k++)
 
  307             sdata[tid*stride + 
k] += sdata[(tid + j)*stride + 
k];
 
  312     if (tid < 32) cs_blas_cuda_warp_reduce_sum<blockSize, stride>(sdata, tid);
 
  315       for (
size_t k = 0; 
k < stride; 
k++)
 
  316         g_odata[
k] = sdata[
k];
 
  341 #if defined(__CUDACC__) 
  357 cs_blas_cuda_set_stream(cudaStream_t  stream);
 
  379 cs_blas_cuda_get_2_stage_reduce_buffers(
cs_lnum_t      n,
 
  381                                         unsigned int   grid_size,
 
  419 #if defined(HAVE_CUBLAS) 
void cs_blas_cuda_scal(cs_lnum_t n, const cs_real_t *alpha, cs_real_t *restrict x)
void cs_blas_cuda_finalize(void)
Finalize CUDA BLAS API.
void cs_blas_cuda_axpy(cs_lnum_t n, const cs_real_t *alpha, const cs_real_t *restrict x, cs_real_t *restrict y)
double cs_blas_cuda_asum(cs_lnum_t n, const cs_real_t x[])
Return the absolute sum of vector values using CUDA.
double cs_blas_cuda_dot(cs_lnum_t n, const cs_real_t x[], const cs_real_t y[])
Return the dot product of 2 vectors: x.y using CUDA.
#define restrict
Definition: cs_defs.h:141
#define BEGIN_C_DECLS
Definition: cs_defs.h:528
double cs_real_t
Floating-point value.
Definition: cs_defs.h:332
#define END_C_DECLS
Definition: cs_defs.h:529
int cs_lnum_t
local mesh entity id
Definition: cs_defs.h:325
@ k
Definition: cs_field_pointer.h:70