1 #ifndef __CS_CUDA_CONTRIB_H__
2 #define __CS_CUDA_CONTRIB_H__
38 __device__ __forceinline__ T
warpReduceSum(
unsigned int mask, T mySum) {
39 for (
int offset = warpSize / 2; offset > 0; offset /= 2) {
40 mySum += __shfl_down_sync(mask, mySum, offset);
45 #if __CUDA_ARCH__ >= 800
49 __device__ __forceinline__
int warpReduceSum<int>(
unsigned int mask,
51 mySum = __reduce_add_sync(mask, mySum);
__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum)
Definition: cs_cuda_contrib.h:38