doxygen/src/cs__cuda__contrib_8h_source.html

 #ifndef __CS_CUDA_CONTRIB_H__

 #define __CS_CUDA_CONTRIB_H__


 /*============================================================================

  * CUDA utility functions, taken from the CUDA libraries or examples themselves.

  *============================================================================*/


 /* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.

  *

  * Redistribution and use in source and binary forms, with or without

  * modification, are permitted provided that the following conditions

  * are met:

  *  * Redistributions of source code must retain the above copyright

  *    notice, this list of conditions and the following disclaimer.

  *  * Redistributions in binary form must reproduce the above copyright

  *    notice, this list of conditions and the following disclaimer in the

  *    documentation and/or other materials provided with the distribution.

  *  * Neither the name of NVIDIA CORPORATION nor the names of its

  *    contributors may be used to endorse or promote products derived

  *    from this software without specific prior written permission.

  *

  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY

  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR

  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR

  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY

  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  */


 /*----------------------------------------------------------------------------*/


 // Accumulate a per-lane value across a warp using shuffle-down exchanges.
 //
 // The exchange distance starts at half of warpSize and is halved after each
 // round until it reaches zero; in every round the value returned by
 // __shfl_down_sync for that distance is added to the caller's running sum.
 //
 // parameters:
 //   mask  <-- participation mask forwarded to __shfl_down_sync
 //   mySum <-- contribution of the calling lane
 //
 // returns:
 //   the calling lane's running sum after the final round
 template <class T>
 __device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum) {
   int stride = warpSize / 2;
   while (stride > 0) {
     mySum += __shfl_down_sync(mask, mySum, stride);
     stride /= 2;
   }
   return mySum;
 }


 #if __CUDA_ARCH__ >= 800

 // Explicit specialization of warpReduceSum for int inputs: on SM 8.0 or
 // higher (see the enclosing __CUDA_ARCH__ >= 800 guard), the sum is delegated
 // to the __reduce_add_sync intrinsic instead of a shuffle loop.
 //
 // parameters:
 //   mask  <-- participation mask forwarded to __reduce_add_sync
 //   mySum <-- contribution of the calling lane
 //
 // returns:
 //   the value produced by __reduce_add_sync
 template <>
 __device__ __forceinline__ int
 warpReduceSum<int>(unsigned int mask, int mySum) {
   return __reduce_add_sync(mask, mySum);
 }

 #endif


 #if (__CUDA_ARCH__ < 600)

 // Atomic double add for older GPUs.


 // Prototype of the 64-bit atomicCAS overload called by atomicAddDouble below:
 // it takes a target address, a comparison value and a new value, all as
 // unsigned long long int, and returns an unsigned long long int.
 // NOTE(review): presumably this redeclares the CUDA built-in compare-and-swap
 // so that the header parses where the built-in is not otherwise visible --
 // TODO confirm.
 __device__  unsigned long long int atomicCAS(unsigned long long int *address,

                                              unsigned long long int  compare,

                                              unsigned long long int  val);


 // Atomically add val to the double stored at address, for devices without a
 // native double-precision atomicAdd (the enclosing guard restricts this
 // definition to __CUDA_ARCH__ < 600).
 //
 // The double is reinterpreted as a 64-bit integer and updated through an
 // atomicCAS retry loop: the sum is recomputed from the most recently observed
 // value until the compare-and-swap reports that value unchanged.
 //
 // parameters:
 //   address <-> location of the double to update
 //   val     <-- value to add
 //
 // returns:
 //   the value held at address immediately before the successful update
 //
 // Marked __forceinline__, like the other helpers in this header, so that
 // including the header from several translation units does not produce
 // multiple definitions of the function.
 __device__ __forceinline__ double
 atomicAddDouble(double *address, double val) {
   unsigned long long int *address_as_ull = (unsigned long long int *)address;
   unsigned long long int old = *address_as_ull, assumed;

   do {
     assumed = old;
     old = atomicCAS(address_as_ull, assumed,
                     __double_as_longlong(val + __longlong_as_double(assumed)));
     // The comparison is done on the integer bit patterns rather than on
     // doubles, so a NaN (which never compares equal to itself as a double)
     // cannot keep the loop spinning forever.
   } while (assumed != old);

   return __longlong_as_double(old);
 }


 #endif


 /*----------------------------------------------------------------------------*/


 #endif /* __CS_CUDA_CONTRIB_H__ */

atomicCAS
__device__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)

warpReduceSum
__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum)
Definition: cs_cuda_contrib.h:38

atomicAddDouble
__device__ double atomicAddDouble(double *address, double val)
Definition: cs_cuda_contrib.h:63