1 #ifndef __CS_CUDA_CONTRIB_H__
2 #define __CS_CUDA_CONTRIB_H__
38 __device__ __forceinline__ T
warpReduceSum(
unsigned int mask, T mySum) {
39 for (
int offset = warpSize / 2; offset > 0; offset /= 2) {
40 mySum += __shfl_down_sync(mask, mySum, offset);
45 #if __CUDA_ARCH__ >= 800
49 __device__ __forceinline__
int warpReduceSum<int>(
unsigned int mask,
51 mySum = __reduce_add_sync(mask, mySum);
56 #if (__CUDA_ARCH__ < 600)
59 __device__
unsigned long long int atomicCAS(
unsigned long long int *address,
60 unsigned long long int compare,
61 unsigned long long int val);
64 unsigned long long int *address_as_ull = (
unsigned long long int *)address;
65 unsigned long long int old = *address_as_ull, assumed;
69 __double_as_longlong(val + __longlong_as_double(assumed)));
70 }
while (assumed != old);
71 return __longlong_as_double(old);
__device__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)
__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum)
Definition: cs_cuda_contrib.h:38
__device__ double atomicAddDouble(double *address, double val)
Definition: cs_cuda_contrib.h:63