#include "cs_defs.h"
#include "cs_base.h"
#include "cs_base_cuda.h"

Include dependency graph for cs_blas_cuda.h:

Functions
void	cs_blas_cuda_finalize (void)
	Finalize CUDA BLAS API. More...

double *	cs_blas_cuda_get_2_stage_reduce_buffer (cs_lnum_t n, cs_lnum_t tuple_size, unsigned int grid_size)
	Return pointer to reduction buffer needed for 2-stage reductions. More...

double	cs_blas_cuda_asum (cs_lnum_t n, const cs_real_t x[])
	Return the absolute sum of vector values using CUDA. More...

double	cs_blas_cuda_dot (cs_lnum_t n, const cs_real_t x[], const cs_real_t y[])
	Return the dot product of 2 vectors: x.y using CUDA. More...

void	cs_blas_cuda_axpy (cs_lnum_t n, const cs_real_t *alpha, const cs_real_t *restrict x, cs_real_t *restrict y)

void	cs_blas_cuda_scal (cs_lnum_t n, const cs_real_t *alpha, cs_real_t *restrict x)

Function Documentation

◆ cs_blas_cuda_asum()

double cs_blas_cuda_asum	(	cs_lnum_t	n,
		const cs_real_t	x[]
	)

Return the absolute sum of vector values using CUDA.

Parameters

[in]	n	size of array x
[in]	x	array of floating-point values (on device)

Returns: sum of absolute array values

◆ cs_blas_cuda_axpy()

void cs_blas_cuda_axpy	(	cs_lnum_t	n,
		const cs_real_t *	alpha,
		const cs_real_t *restrict	x,
		cs_real_t *restrict	y
	)

◆ cs_blas_cuda_dot()

double cs_blas_cuda_dot	(	cs_lnum_t	n,
		const cs_real_t	x[],
		const cs_real_t	y[]
	)

Return the dot product of 2 vectors: x.y using CUDA.

Parameters

[in]	n	size of arrays x and y
[in]	x	array of floating-point values (on device)
[in]	y	array of floating-point values (on device)

Returns: dot product

◆ cs_blas_cuda_finalize()

void cs_blas_cuda_finalize ( void )

Finalize CUDA BLAS API.

This frees resources such as the cuBLAS handle, if used.

◆ cs_blas_cuda_get_2_stage_reduce_buffer()

double* cs_blas_cuda_get_2_stage_reduce_buffer	(	cs_lnum_t	n,
		cs_lnum_t	tuple_size,
		unsigned int	grid_size
	)

Return pointer to reduction buffer needed for 2-stage reductions.

This buffer is used internally by all cs_blas_cuda 2-stage operations; it is allocated and resized on demand, and freed when calling cs_blas_cuda_finalize, so it is assumed that no two operations (in different threads) use it simultaneously.

Also check initialization of work arrays.

Parameters

[in]	n	size of arrays
[in]	tuple_size	number of values per tuple simultaneously reduced
[in]	grid_size	associated grid size

Returns: pointer to reduction buffer.

◆ cs_blas_cuda_scal()

void cs_blas_cuda_scal	(	cs_lnum_t	n,
		const cs_real_t *	alpha,
		cs_real_t *restrict	x
	)

Functions

Function Documentation

◆ cs_blas_cuda_asum()

◆ cs_blas_cuda_axpy()

◆ cs_blas_cuda_dot()

◆ cs_blas_cuda_finalize()

◆ cs_blas_cuda_get_2_stage_reduce_buffer()

◆ cs_blas_cuda_scal()