8.3
general documentation
cs_cuda_contrib.h
Go to the documentation of this file.
1#ifndef __CS_CUDA_CONTRIB_H__
2#define __CS_CUDA_CONTRIB_H__
3
4/*============================================================================
5 * CUDA utility functions, from CUDA libraries or examples itself.
6 *============================================================================*/
7
8/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * * Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * * Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * * Neither the name of NVIDIA CORPORATION nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
23 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
26 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
27 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
28 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
30 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35/*----------------------------------------------------------------------------*/
36
37template <class T>
38__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum) {
39 for (int offset = warpSize / 2; offset > 0; offset /= 2) {
40 mySum += __shfl_down_sync(mask, mySum, offset);
41 }
42 return mySum;
43}
44
45#if __CUDA_ARCH__ >= 800
46// Specialize warpReduceFunc for int inputs to use __reduce_add_sync intrinsic
47// when on SM 8.0 or higher
48template <>
49__device__ __forceinline__ int warpReduceSum<int>(unsigned int mask,
50 int mySum) {
51 mySum = __reduce_add_sync(mask, mySum);
52 return mySum;
53}
54#endif
55
56/*----------------------------------------------------------------------------*/
57
58#endif /* __CS_CUDA_CONTRIB_H__ */
__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum)
Definition: cs_cuda_contrib.h:38