8.3
general documentation
cs_base_cuda.h
Go to the documentation of this file.
1#ifndef __CS_BASE_CUDA_H__
2#define __CS_BASE_CUDA_H__
3
4/*============================================================================
5 * Definitions, global variables, and base functions for CUDA
6 *============================================================================*/
7
8/*
9 This file is part of code_saturne, a general-purpose CFD tool.
10
11 Copyright (C) 1998-2024 EDF S.A.
12
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; either version 2 of the License, or (at your option) any later
16 version.
17
18 This program is distributed in the hope that it will be useful, but WITHOUT
19 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
21 details.
22
23 You should have received a copy of the GNU General Public License along with
24 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
25 Street, Fifth Floor, Boston, MA 02110-1301, USA.
26*/
27
28/*----------------------------------------------------------------------------*/
29
30#include "cs_defs.h"
31
32#if defined(HAVE_CUDA)
33
34/*----------------------------------------------------------------------------
35 * Standard C library headers
36 *----------------------------------------------------------------------------*/
37
38#include <stdio.h>
39
40/*----------------------------------------------------------------------------
41 * Local headers
42 *----------------------------------------------------------------------------*/
43
44#include "cs_base_accel.h"
45#include "cs_log.h"
46
47/*=============================================================================
48 * Macro definitions
49 *============================================================================*/
50
/* Check the return code of a CUDA runtime call, aborting through bft_error
   (with current file and line info) on failure.

   Wrapped in do { ... } while (0) so the macro expands to a single
   statement and is safe in unbraced if/else branches (the original
   bare-brace form broke "if (c) CS_CUDA_CHECK(x); else ...;"). */

#define CS_CUDA_CHECK(a) do { \
    cudaError_t _l_ret_code = a; \
    if (cudaSuccess != _l_ret_code) { \
      bft_error(__FILE__, __LINE__, 0, "[CUDA error] %d: %s\n running: %s", \
                _l_ret_code, ::cudaGetErrorString(_l_ret_code), #a); \
    } \
  } while (0)
58
/* Check the return code of a CUDA runtime call, aborting through bft_error
   with caller-provided file name and line number (useful when the check is
   done in a wrapper and the call site's location should be reported).

   Wrapped in do { ... } while (0) so the macro expands to a single
   statement and is safe in unbraced if/else branches. */

#define CS_CUDA_CHECK_CALL(a, file_name, line_num) do { \
    cudaError_t _l_ret_code = a; \
    if (cudaSuccess != _l_ret_code) { \
      bft_error(file_name, line_num, 0, "[CUDA error] %d: %s\n running: %s", \
                _l_ret_code, ::cudaGetErrorString(_l_ret_code), #a); \
    } \
  } while (0)
66
/* For all current compute capabilities, the warp size is 32; If it ever
   changes, it can be obtained through cudaDeviceProp, so we could then
   replace this macro with a global variable */

#define CS_CUDA_WARP_SIZE 32
72
73/*----------------------------------------------------------------------------*/
74
76
77/*============================================================================
78 * Type definitions
79 *============================================================================*/
80
/*=============================================================================
 * Global variable definitions
 *============================================================================*/

/* Id of the CUDA device currently in use; NOTE(review): presumably -1 when
   no usable device is available (matching cs_base_cuda_get_device's
   documented return) — confirm in cs_base_cuda.c */

extern int cs_glob_cuda_device_id;

/* Other device parameters (queried from the active device) */

extern int cs_glob_cuda_max_threads_per_block;
extern int cs_glob_cuda_max_block_size;
extern int cs_glob_cuda_max_blocks;
extern int cs_glob_cuda_n_mp; /* Number of multiprocessors */

/* Allow graphs for kernel launches ? May interfere with profiling (nsys),
   so can be deactivated. */

extern bool cs_glob_cuda_allow_graph;
98
99/*============================================================================
100 * Semi-private function prototypes
101 *
102 * The following functions are intended to be used by the common
103 * host-device memory management functions from cs_base_accel.c, and
104 * not directly by the user.
105 *============================================================================*/
106
/*----------------------------------------------------------------------------*/
/*
 * \brief Copy data from host to device.
 *
 * This is simply a wrapper over cudaMemcpy.
 *
 * A safety check is added.
 *
 * \param [out]  dst   pointer to destination (device) data
 * \param [in]   src   pointer to source (host) data
 * \param [in]   size  size of data to copy, in bytes
 */
/*----------------------------------------------------------------------------*/

void
cs_cuda_copy_h2d(void *dst,
                 const void *src,
                 size_t size);
125
/*----------------------------------------------------------------------------*/
/*
 * \brief Copy data from host to device, possibly returning on the host
 *        before the copy is finished.
 *
 * This is simply a wrapper over cudaMemcpyAsync.
 *
 * A safety check is added.
 *
 * \param [out]  dst   pointer to destination (device) data
 * \param [in]   src   pointer to source (host) data
 * \param [in]   size  size of data to copy, in bytes
 */
/*----------------------------------------------------------------------------*/

void
cs_cuda_copy_h2d_async(void *dst,
                       const void *src,
                       size_t size);
147
/*----------------------------------------------------------------------------*/
/*
 * \brief Copy data from device to host.
 *
 * This is simply a wrapper over cudaMemcpy.
 *
 * A safety check is added.
 *
 * \param [out]  dst   pointer to destination (host) data
 * \param [in]   src   pointer to source (device) data
 * \param [in]   size  size of data to copy, in bytes
 */
/*----------------------------------------------------------------------------*/

void
cs_cuda_copy_d2h(void *dst,
                 const void *src,
                 size_t size);
168
/*----------------------------------------------------------------------------*/
/*
 * \brief Copy data from device to host, possibly returning on the host
 *        before the copy is finished.
 *
 * NOTE(review): presumably a wrapper over cudaMemcpyAsync, mirroring
 * cs_cuda_copy_h2d_async — confirm in the implementation.
 *
 * A safety check is added.
 *
 * \param [out]  dst   pointer to destination (host) data
 * \param [in]   src   pointer to source (device) data
 * \param [in]   size  size of data to copy, in bytes
 */
/*----------------------------------------------------------------------------*/

void
cs_cuda_copy_d2h_async(void *dst,
                       const void *src,
                       size_t size);
189
/*----------------------------------------------------------------------------*/
/*
 * \brief Copy data from device to device.
 *
 * This is simply a wrapper over cudaMemcpy.
 *
 * A safety check is added.
 *
 * \param [out]  dst   pointer to destination (device) data
 * \param [in]   src   pointer to source (device) data
 * \param [in]   size  size of data to copy, in bytes
 */
/*----------------------------------------------------------------------------*/

void
cs_cuda_copy_d2d(void *dst,
                 const void *src,
                 size_t size);
208
/*----------------------------------------------------------------------------*/
/*
 * \brief Get host pointer for a managed or device pointer.
 *
 * This function can be called with a pointer inside an allocated block of
 * memory, so is not restricted to values returned by CS_ALLOC_HD.
 *
 * This makes it possible to check whether a pointer to an array inside
 * a larger array is shared or accessible from the device only
 * (for example when grouping allocations).
 *
 * \param [in]  ptr  pointer to device data
 *
 * \return  pointer to host data if shared or mapped at the CUDA level,
 *          NULL otherwise.
 */
/*----------------------------------------------------------------------------*/

void *
cs_cuda_get_host_ptr(const void *ptr);
229
230/*=============================================================================
231 * Inline function prototypes
232 *============================================================================*/
233
234/*----------------------------------------------------------------------------*/
247/*----------------------------------------------------------------------------*/
248
249static inline unsigned int
250cs_cuda_grid_size(cs_lnum_t n,
251 unsigned int block_size)
252{
253 return (n % block_size) ? n/block_size + 1 : n/block_size;
254}
255
257
258#if defined(__NVCC__)
259
260/*----------------------------------------------------------------------------
261 * Synchronize of copy a cs_real_t type array from the host to a device.
262 *
263 * parameters:
264 * val_h <-- pointer to host data
265 * n_vals <-- number of data values
266 * device_id <-- associated device id
267 * stream <-- associated stream (for async prefetch only)
268 * val_d --> matching pointer on device
269 * buf_d --> matching allocation pointer on device (should be freed
270 * after use if non-null)
271 *----------------------------------------------------------------------------*/
272
273template <typename T>
274void
275cs_sync_or_copy_h2d(const T *val_h,
276 cs_lnum_t n_vals,
277 int device_id,
278 cudaStream_t stream,
279 const T **val_d,
280 void **buf_d)
281{
282 const T *_val_d = NULL;
283 void *_buf_d = NULL;
284
285 if (val_h != NULL) {
286
287 cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h);
288 size_t size = n_vals * sizeof(T);
289
290 if (alloc_mode == CS_ALLOC_HOST) {
291 CS_CUDA_CHECK(cudaMalloc(&_buf_d, size));
292 cs_cuda_copy_h2d(_buf_d, val_h, size);
293 _val_d = (const T *)_buf_d;
294 }
295 else {
296 _val_d = (const T *)cs_get_device_ptr_const((const void *)val_h);
297
298 if (alloc_mode != CS_ALLOC_HOST_DEVICE_SHARED)
299 cs_sync_h2d(val_h);
300 }
301
302 }
303
304 *val_d = _val_d;
305 *buf_d = _buf_d;
306}
307
308/*=============================================================================
309 * Public function prototypes
310 *============================================================================*/
311
/*----------------------------------------------------------------------------*/
/*
 * \brief Return stream handle from stream pool.
 *
 * If the requested stream id is higher than the current number of streams,
 * one or more new streams will be created, so that size of the stream pool
 * matches at least stream_id+1.
 *
 * By default, the first stream (with id 0) will be used for most operations,
 * while stream id 1 will be used for operations which can be done
 * concurrently, such as memory prefetching.
 *
 * Additional streams can be used for independent tasks, though opportunities
 * for this are limited in the current code (this would probably also require
 * associating different MPI communicators with each task).
 *
 * \param [in]  stream_id  id of requested stream
 *
 * \returns handle to requested stream
 */
/*----------------------------------------------------------------------------*/

cudaStream_t
cs_cuda_get_stream(int stream_id);
336
/*----------------------------------------------------------------------------*/
/*
 * \brief Return stream handle used for prefetching.
 *
 * By default, a single stream is created specifically for prefetching,
 * separate from the main operation streams of the stream pool.
 *
 * \returns handle to prefetching stream
 */
/*----------------------------------------------------------------------------*/

cudaStream_t
cs_cuda_get_stream_prefetch(void);
349
350#endif /* defined(__NVCC__) */
351
353
/*----------------------------------------------------------------------------*/
/*
 * \brief Log information on available CUDA devices.
 *
 * \param [in]  log_id  id of log file in which to print information
 */
/*----------------------------------------------------------------------------*/

void
cs_base_cuda_device_info(cs_log_t log_id);
364
/*----------------------------------------------------------------------------*/
/*
 * \brief Log information on the available CUDA version (runtime and driver).
 *
 * \param [in]  log_id  id of log file in which to print information
 */
/*----------------------------------------------------------------------------*/

void
cs_base_cuda_version_info(cs_log_t log_id);
375
/*----------------------------------------------------------------------------*/
/*
 * \brief Log information on the CUDA compiler used to build this code.
 *
 * \param [in]  log_id  id of log file in which to print information
 */
/*----------------------------------------------------------------------------*/

void
cs_base_cuda_compiler_info(cs_log_t log_id);
386
/*----------------------------------------------------------------------------*/
/*
 * \brief Set CUDA device based on MPI rank and number of devices.
 *
 * NOTE(review): this prototype takes no arguments, so the previously
 * documented \param entries (comm, ranks_per_node) were stale; rank and
 * device information is presumably obtained internally — confirm in the
 * implementation.
 *
 * \return  selected device id, or -1 if no usable device is available
 */
/*----------------------------------------------------------------------------*/

int
cs_base_cuda_select_default_device(void);
400
/*----------------------------------------------------------------------------*/
/*
 * \brief Return currently selected CUDA device id.
 *
 * \return  selected device id, or -1 if no usable device is available
 */
/*----------------------------------------------------------------------------*/

int
cs_base_cuda_get_device(void);
411
#endif /* defined(HAVE_CUDA) */
413
414/*----------------------------------------------------------------------------*/
415
417
418#endif /* __CS_BASE_CUDA_H__ */
#define BEGIN_C_DECLS
Definition: cs_defs.h:542
#define END_C_DECLS
Definition: cs_defs.h:543
int cs_lnum_t
local mesh entity id
Definition: cs_defs.h:335
cs_log_t
Definition: cs_log.h:48
static const void * cs_get_device_ptr_const(const void *ptr)
Return matching device pointer for a given constant pointer.
Definition: cs_mem.h:693
static cs_alloc_mode_t cs_check_device_ptr(const void *ptr)
Check if a pointer is associated with a device.
Definition: cs_mem.h:787
static void cs_sync_h2d(const void *ptr)
Synchronize data from host to device.
Definition: cs_mem.h:997
cs_alloc_mode_t
Definition: cs_mem.h:50
@ CS_ALLOC_HOST
Definition: cs_mem.h:52
@ CS_ALLOC_HOST_DEVICE_SHARED
Definition: cs_mem.h:57