cs_cuda_contrib.h
#ifndef __CS_CUDA_CONTRIB_H__
#define __CS_CUDA_CONTRIB_H__

/*============================================================================
 * CUDA utility functions, adapted from the CUDA libraries and samples.
 *============================================================================*/

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*----------------------------------------------------------------------------*/

/* Warp-level reduction using shuffle-down: with a full mask, lane 0 ends up
   holding the sum of mySum over the whole warp. */

template <class T>
__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    mySum += __shfl_down_sync(mask, mySum, offset);
  }
  return mySum;
}

#if __CUDA_ARCH__ >= 800
// Specialize warpReduceSum for int inputs to use the __reduce_add_sync
// intrinsic when on SM 8.0 or higher.
template <>
__device__ __forceinline__ int warpReduceSum<int>(unsigned int mask,
                                                  int mySum) {
  mySum = __reduce_add_sync(mask, mySum);
  return mySum;
}
#endif

#if (__CUDA_ARCH__ < 600)
// Atomic double add for older GPUs (compute capability < 6.0), which lack
// a native double-precision atomicAdd: emulate it with a compare-and-swap
// loop on the 64-bit integer representation of the value.

__device__ unsigned long long int atomicCAS(unsigned long long int *address,
                                            unsigned long long int compare,
                                            unsigned long long int val);

__device__ double atomicAddDouble(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // The integer comparison avoids hanging on NaN (since NaN != NaN).
  } while (assumed != old);
  return __longlong_as_double(old);
}

#endif

/*----------------------------------------------------------------------------*/

#endif /* __CS_CUDA_CONTRIB_H__ */
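
A minimal usage sketch for warpReduceSum (the kernel below and its name
warp_sums are illustrative, not part of this header): each full warp of 32
threads reduces its values, and lane 0 writes out the warp's sum. On SM 8.0
and above, the int specialization is selected automatically for integer data.

// Illustrative only: one partial sum per warp. Assumes the grid launches
// exactly one thread per input element and the element count is a multiple
// of warpSize.
__global__ void warp_sums(const double *x, double *warp_out) {
  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

  // All 32 lanes participate, so pass the full warp mask.
  double s = warpReduceSum(0xffffffffu, x[tid]);

  if ((tid % warpSize) == 0)
    warp_out[tid / warpSize] = s;  // lane 0 holds the warp's sum
}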
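
And a sketch of where the pre-Pascal fallback fits in (the dispatch wrapper
atomic_add_double and the kernel accumulate are hypothetical): on devices of
compute capability below 6.0, the CAS-based atomicAddDouble stands in for the
native double-precision atomicAdd that newer hardware provides.

// Hypothetical wrapper: pick the CAS-based fallback on pre-Pascal devices,
// the native double-precision atomicAdd (compute capability >= 6.0) otherwise.
__device__ __forceinline__ double atomic_add_double(double *address, double val) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
  return atomicAddDouble(address, val);
#else
  return atomicAdd(address, val);
#endif
}

// Example: every thread accumulates its element into a single global total.
__global__ void accumulate(const double *x, unsigned int n, double *total) {
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    atomic_add_double(total, x[i]);
}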