1 #ifndef __CS_CUDA_CONTRIB_H__ 
    2 #define __CS_CUDA_CONTRIB_H__ 
   38 __device__ __forceinline__ T 
warpReduceSum(
unsigned int mask, T mySum) {
 
   39   for (
int offset = warpSize / 2; offset > 0; offset /= 2) {
 
   40     mySum += __shfl_down_sync(mask, mySum, offset);
 
   45 #if __CUDA_ARCH__ >= 800 
   49 __device__ __forceinline__ 
int warpReduceSum<int>(
unsigned int mask,
 
   51   mySum = __reduce_add_sync(mask, mySum);
 
__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum)
Definition: cs_cuda_contrib.h:38