1#ifndef __CS_CUDA_CONTRIB_H__
2#define __CS_CUDA_CONTRIB_H__
39 for (
int offset = warpSize / 2; offset > 0; offset /= 2) {
40 mySum += __shfl_down_sync(mask, mySum, offset);
45#if __CUDA_ARCH__ >= 800
51 mySum = __reduce_add_sync(mask, mySum);
56#if (__CUDA_ARCH__ < 600)
// Forward declaration only (no body in this file) of the atomicCAS overload
// that operates on unsigned long long int: it takes a pointer `address` plus
// `compare` and `val` operands of that same type and returns that type.
// NOTE(review): this prototype appears to sit under the
// `#if (__CUDA_ARCH__ < 600)` guard opened just above — confirm against the
// full header. The actual compare-and-swap behavior is supplied by the CUDA
// toolkit, not defined here.
59__device__
unsigned long long int atomicCAS(
unsigned long long int *address,
60 unsigned long long int compare,
61 unsigned long long int val);
64 unsigned long long int *address_as_ull = (
unsigned long long int *)address;
65 unsigned long long int old = *address_as_ull, assumed;
69 __double_as_longlong(val + __longlong_as_double(assumed)));
70 }
while (assumed != old);
71 return __longlong_as_double(old);
__device__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)
__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum)
Definition cs_cuda_contrib.h:38
__device__ double atomicAddDouble(double *address, double val)
Definition cs_cuda_contrib.h:63