doxygen/src/cs__cuda__contrib_8h_source.html

#ifndef __CS_CUDA_CONTRIB_H__

#define __CS_CUDA_CONTRIB_H__


/*============================================================================

 * CUDA utility functions, taken from the CUDA libraries or examples themselves.

 *============================================================================*/


/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions

 * are met:

 *  * Redistributions of source code must retain the above copyright

 *    notice, this list of conditions and the following disclaimer.

 *  * Redistributions in binary form must reproduce the above copyright

 *    notice, this list of conditions and the following disclaimer in the

 *    documentation and/or other materials provided with the distribution.

 *  * Neither the name of NVIDIA CORPORATION nor the names of its

 *    contributors may be used to endorse or promote products derived

 *    from this software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY

 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR

 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR

 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY

 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */


/*----------------------------------------------------------------------------*/


/*
 * Warp-level sum reduction based on shuffle-down exchanges.
 *
 * Starting from half the warp size and halving the distance at each step,
 * every calling thread adds to its own running value the value obtained
 * from __shfl_down_sync at the current lane offset.
 *
 * parameters:
 *   mask  <-- participation mask forwarded to __shfl_down_sync
 *   mySum <-- calling thread's contribution to the sum
 *
 * returns:
 *   the calling thread's accumulated value; when all lanes of the warp
 *   participate, lane 0 holds the complete sum, while the other lanes
 *   only hold partial sums.
 */
template <class T>
__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum) {
  int offset = warpSize / 2;

  while (offset > 0) {
    mySum += __shfl_down_sync(mask, mySum, offset);
    offset /= 2;
  }

  return mySum;
}


#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800

/*
 * Specialization of warpReduceSum for int inputs: when compiling device
 * code for SM 8.0 or higher, use the __reduce_add_sync intrinsic instead
 * of the shuffle-based loop of the generic template.
 *
 * parameters:
 *   mask  <-- participation mask forwarded to __reduce_add_sync
 *   mySum <-- calling thread's contribution to the sum
 *
 * returns:
 *   the value returned by __reduce_add_sync for the given mask
 */
template <>
__device__ __forceinline__ int warpReduceSum<int>(unsigned int mask,
                                                  int mySum) {
  return __reduce_add_sync(mask, mySum);
}

#endif /* __CUDA_ARCH__ >= 800 */


/*----------------------------------------------------------------------------*/


#endif /* __CS_CUDA_CONTRIB_H__ */

warpReduceSum
__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum)
Definition: cs_cuda_contrib.h:38