8.2
general documentation
Loading...
Searching...
No Matches
cs_base_cuda.h
Go to the documentation of this file.
1#ifndef __CS_BASE_CUDA_H__
2#define __CS_BASE_CUDA_H__
3
4/*============================================================================
5 * Definitions, global variables, and base functions for CUDA
6 *============================================================================*/
7
8/*
9 This file is part of code_saturne, a general-purpose CFD tool.
10
11 Copyright (C) 1998-2024 EDF S.A.
12
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; either version 2 of the License, or (at your option) any later
16 version.
17
18 This program is distributed in the hope that it will be useful, but WITHOUT
19 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
21 details.
22
23 You should have received a copy of the GNU General Public License along with
24 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
25 Street, Fifth Floor, Boston, MA 02110-1301, USA.
26*/
27
28/*----------------------------------------------------------------------------*/
29
30#include "cs_defs.h"
31
32#if defined(HAVE_CUDA)
33
34/*----------------------------------------------------------------------------
35 * Standard C library headers
36 *----------------------------------------------------------------------------*/
37
38#include <stdio.h>
39
40/*----------------------------------------------------------------------------
41 * Local headers
42 *----------------------------------------------------------------------------*/
43
44#include "cs_base_accel.h"
45#include "cs_log.h"
46
47/*=============================================================================
48 * Macro definitions
49 *============================================================================*/
50
51#define CS_CUDA_CHECK(a) { \
52 cudaError_t _l_ret_code = a; \
53 if (cudaSuccess != _l_ret_code) { \
54 bft_error(__FILE__, __LINE__, 0, "[CUDA error] %d: %s\n running: %s", \
55 _l_ret_code, ::cudaGetErrorString(_l_ret_code), #a); \
56 } \
57 }
58
59#define CS_CUDA_CHECK_CALL(a, file_name, line_num) { \
60 cudaError_t _l_ret_code = a; \
61 if (cudaSuccess != _l_ret_code) { \
62 bft_error(file_name, line_num, 0, "[CUDA error] %d: %s\n running: %s", \
63 _l_ret_code, ::cudaGetErrorString(_l_ret_code), #a); \
64 } \
65 }
66
67/* For all current compute capabilities, the warp size is 32; If it ever
68 changes, it can be obtained through cudaDeviceProp, so we could then
69 replace this macro with a global variable */
70
71#define CS_CUDA_WARP_SIZE 32
72
73/*----------------------------------------------------------------------------*/
74
76
77/*============================================================================
78 * Type definitions
79 *============================================================================*/
80
81/*=============================================================================
82 * Global variable definitions
83 *============================================================================*/
84
85extern int cs_glob_cuda_device_id;
86
87/* Other device parameters */
88
89extern int cs_glob_cuda_max_threads_per_block;
90extern int cs_glob_cuda_max_block_size;
91extern int cs_glob_cuda_max_blocks;
92extern int cs_glob_cuda_n_mp; /* Number of multiprocessors */
93
94/* Allow graphs for kernel launches ? May interfere with profiling (nsys),
95 so can be deactivated. */
96
97extern bool cs_glob_cuda_allow_graph;
98
99/*============================================================================
100 * Semi-private function prototypes
101 *
102 * The following functions are intended to be used by the common
103 * host-device memory management functions from cs_base_accel.c, and
104 * not directly by the user.
105 *============================================================================*/
106
107/*----------------------------------------------------------------------------*/
108/*
109 * \brief Allocate n bytes of CUDA device memory.
110 *
111 * This function simply wraps cudaMalloc, which could probably be
112 * directly called from C or C++, but whose use in such manner is not
113 * well documented, and whose declaration in cuda_runtime.h requires
114 * support of function attributes by compiler.
115 *
116 * A safety check is added.
117 *
118 * \param [in] n element size
119 * \param [in] var_name allocated variable name string
120 * \param [in] file_name name of calling source file
121 * \param [in] line_num line number in calling source file
122 *
123 * \returns pointer to allocated memory.
124 */
125/*----------------------------------------------------------------------------*/
126
127void *
128cs_cuda_mem_malloc_device(size_t n,
129 const char *var_name,
130 const char *file_name,
131 int line_num);
132
133/*----------------------------------------------------------------------------*/
134/*
135 * \brief Allocate n bytes of host memory using CUDA.
136 *
137 * This function simply wraps cudaMallocHost, which could probably be
138 * directly called from C or C++, but whose use in such manner is not
139 * well documented, and whose declaration in cuda_runtime.h requires
140 * support of function attributes by compiler.
141 *
142 * A safety check is added.
143 *
144 * \param [in] n element size
145 * \param [in] var_name allocated variable name string
146 * \param [in] file_name name of calling source file
147 * \param [in] line_num line number in calling source file
148 *
149 * \returns pointer to allocated memory.
150 */
151/*----------------------------------------------------------------------------*/
152
153void *
154cs_cuda_mem_malloc_host(size_t n,
155 const char *var_name,
156 const char *file_name,
157 int line_num);
158
159/*----------------------------------------------------------------------------*/
160/*
161 * \brief Allocate n bytes of CUDA managed memory.
162 *
163 * This function simply wraps cudaMallocManaged, which could probably be
164 * directly called from C or C++, but whose use in such manner is not
165 * well documented, and whose declaration in cuda_runtime.h requires
166 * support of function attributes by compiler.
167 *
168 * A safety check is added.
169 *
170 * \param [in] n element size
171 * \param [in] var_name allocated variable name string
172 * \param [in] file_name name of calling source file
173 * \param [in] line_num line number in calling source file
174 *
175 * \returns pointer to allocated memory.
176 */
177/*----------------------------------------------------------------------------*/
178
179void *
180cs_cuda_mem_malloc_managed(size_t n,
181 const char *var_name,
182 const char *file_name,
183 int line_num);
184
185/*----------------------------------------------------------------------------*/
186/*
187 * \brief Free CUDA memory associated with a given pointer.
188 *
189 * This function simply wraps cudaFree, which could probably be
190 * directly called from C or C++, but whose use in such manner is not
191 * well documented, and whose declaration in cuda_runtime.h requires
192 * support of function attributes by compiler.
193 *
194 * A safety check is added.
195 *
196 * \param [in] p pointer to device memory
197 * \param [in] var_name allocated variable name string
198 * \param [in] file_name name of calling source file
199 * \param [in] line_num line number in calling source file
200 *
201 * \returns pointer to allocated memory.
202 */
203/*----------------------------------------------------------------------------*/
204
205void
206cs_cuda_mem_free(void *p,
207 const char *var_name,
208 const char *file_name,
209 int line_num);
210
211/*----------------------------------------------------------------------------*/
212/*
213 * \brief Free CUDA-allocated host memory associated with a given pointer.
214 *
215 * This function simply wraps cudaFreeHost, which could probably be
216 * directly called from C or C++, but whose use in such manner is not
217 * well documented, and whose declaration in cuda_runtime.h requires
218 * support of function attributes by compiler.
219 *
220 * A safety check is added.
221 *
222 * \param [in] p pointer to device memory
223 * \param [in] var_name allocated variable name string
224 * \param [in] file_name name of calling source file
225 * \param [in] line_num line number in calling source file
226 *
227 * \returns pointer to allocated memory.
228 */
229/*----------------------------------------------------------------------------*/
230
231void
232cs_cuda_mem_free_host(void *p,
233 const char *var_name,
234 const char *file_name,
235 int line_num);
236
237/*----------------------------------------------------------------------------*/
238/*
239 * \brief Copy data from host to device.
240 *
241 * This is simply a wrapper over cudaMemcpy.
242 *
243 * A safety check is added.
244 *
245 * \param [out] dst pointer to destination data
246 * \param [in] src pointer to source data
247 * \param [in] size size of data to copy
248 */
249/*----------------------------------------------------------------------------*/
250
251void
252cs_cuda_copy_h2d(void *dst,
253 const void *src,
254 size_t size);
255
256/*----------------------------------------------------------------------------*/
257/*
258 * \brief Copy data from host to device, possibly returning on the host
259 * before the copy is finished.
260 *
261 * This is simply a wrapper over cudaMemcpyAsync.
262 *
263 * A safety check is added.
264 *
265 * \param [out] dst pointer to destination data
266 * \param [in] src pointer to source data
267 * \param [in] size size of data to copy
268 *
269 * \returns pointer to allocated memory.
270 */
271/*----------------------------------------------------------------------------*/
272
273void
274cs_cuda_copy_h2d_async(void *dst,
275 const void *src,
276 size_t size);
277
278/*----------------------------------------------------------------------------*/
279/*
280 * \brief Copy data from device to host.
281 *
282 * This is simply a wrapper over cudaMemcpy.
283 *
284 * A safety check is added.
285 *
286 * \param [out] dst pointer to destination data
287 * \param [in] src pointer to source data
288 * \param [in] size size of data to copy
289 *
290 * \returns pointer to allocated memory.
291 */
292/*----------------------------------------------------------------------------*/
293
294void
295cs_cuda_copy_d2h(void *dst,
296 const void *src,
297 size_t size);
298
299/*----------------------------------------------------------------------------*/
300/*
301 * \brief Copy data from host to device.
302 *
303 * This is simply a wrapper over cudaMemcpy.
304 *
305 * A safety check is added.
306 *
307 * \param [out] dst pointer to destination data
308 * \param [in] src pointer to source data
309 * \param [in] size size of data to copy
310 *
311 * \returns pointer to allocated memory.
312 */
313/*----------------------------------------------------------------------------*/
314
315void
316cs_cuda_copy_d2h_async(void *dst,
317 const void *src,
318 size_t size);
319
320/*----------------------------------------------------------------------------*/
321/*
322 * \brief Prefetch data from host to device.
323 *
324 * This is simply a wrapper over cudaMemPrefetchAsync.
325 *
326 * A safety check is added.
327 *
328 * \param [out] dst pointer to destination data
329 * \param [in] src pointer to source data
330 * \param [in] size size of data to copy
331 *
332 * \returns pointer to allocated memory.
333 */
334/*----------------------------------------------------------------------------*/
335
336void
337cs_cuda_prefetch_h2d(void *dst,
338 size_t size);
339
340/*----------------------------------------------------------------------------*/
341/*
342 * \brief Prefetch data from device to host.
343 *
344 * This is simply a wrapper over cudaMemPrefetchAsync.
345 *
346 * A safety check is added.
347 *
348 * \param [out] dst pointer to destination data
349 * \param [in] src pointer to source data
350 * \param [in] size size of data to copy
351 *
352 * \returns pointer to allocated memory.
353 */
354/*----------------------------------------------------------------------------*/
355
356void
357cs_cuda_prefetch_d2h(void *dst,
358 size_t size);
359
360/*----------------------------------------------------------------------------*/
361/*
362 * \brief Copy data from device to device.
363 *
364 * This is simply a wrapper over cudaMemcpy.
365 *
366 * A safety check is added.
367 *
368 * \param [out] dst pointer to destination data
369 * \param [in] src pointer to source data
370 * \param [in] size size of data to copy
371 */
372/*----------------------------------------------------------------------------*/
373
374void
375cs_cuda_copy_d2d(void *dst,
376 const void *src,
377 size_t size);
378
379/*----------------------------------------------------------------------------*/
380/*
381 * \brief Get host pointer for a managed or device pointer.
382 *
383 * This function can be called with a pointer inside an allocated block of
384 * memory, so is not retricted to values returned by CS_ALLOC_HD.
385 *
386 * This makes it possible to check whether a pointer to an array inside
387 * a larger array is shared or accessible from the device only
388 * (for example when grouping allocations).
389 *
390 * \param [in] ptr pointer to device data
391 *
392 * \return pointer to host data if shared or mapped at the CUDA level,
393 * NULL otherwise.
394 */
395/*----------------------------------------------------------------------------*/
396
397void *
398cs_cuda_get_host_ptr(const void *ptr);
399
400/*----------------------------------------------------------------------------*/
401/*
402 * \brief Advise memory system that a given allocation will be mostly read.
403 *
404 * \param [in] ptr pointer to allocation
405 * \param [size] size associated data size
406 */
407/*----------------------------------------------------------------------------*/
408
409void
410cs_cuda_mem_set_advise_read_mostly(const void *ptr,
411 size_t size);
412
413/*----------------------------------------------------------------------------*/
414/*
415 * \brief Advise memory system that a given allocation will be mostly read.
416 *
417 * \param [in] ptr pointer to allocation
418 * \param [size] size associated data size
419 */
420/*----------------------------------------------------------------------------*/
421
422void
423cs_cuda_mem_unset_advise_read_mostly(const void *ptr,
424 size_t size);
425
426/*=============================================================================
427 * Inline function prototypes
428 *============================================================================*/
429
430/*----------------------------------------------------------------------------*/
443/*----------------------------------------------------------------------------*/
444
445static inline unsigned int
446cs_cuda_grid_size(cs_lnum_t n,
447 unsigned int block_size)
448{
449 return (n % block_size) ? n/block_size + 1 : n/block_size;
450}
451
453
454#if defined(__NVCC__)
455
456/*----------------------------------------------------------------------------
457 * Synchronize of copy a cs_real_t type array from the host to a device.
458 *
459 * parameters:
460 * val_h <-- pointer to host data
461 * n_vals <-- number of data values
462 * device_id <-- associated device id
463 * stream <-- associated stream (for async prefetch only)
464 * val_d --> matching pointer on device
465 * buf_d --> matching allocation pointer on device (should be freed
466 * after use if non-NULL)
467 *----------------------------------------------------------------------------*/
468
469template <typename T>
470void
471cs_sync_or_copy_h2d(const T *val_h,
472 cs_lnum_t n_vals,
473 int device_id,
474 cudaStream_t stream,
475 const T **val_d,
476 void **buf_d)
477{
478 const T *_val_d = NULL;
479 void *_buf_d = NULL;
480
481 if (val_h != NULL) {
482
483 cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h);
484 size_t size = n_vals * sizeof(T);
485
486 if (alloc_mode == CS_ALLOC_HOST) {
487 CS_CUDA_CHECK(cudaMalloc(&_buf_d, size));
488 cs_cuda_copy_h2d(_buf_d, val_h, size);
489 _val_d = (const T *)_buf_d;
490 }
491 else {
492 _val_d = (const T *)cs_get_device_ptr_const((const void *)val_h);
493
494 if (alloc_mode != CS_ALLOC_HOST_DEVICE_SHARED)
495 cs_sync_h2d(val_h);
496 }
497
498 }
499
500 *val_d = _val_d;
501 *buf_d = _buf_d;
502}
503
504/*=============================================================================
505 * Public function prototypes
506 *============================================================================*/
507
508/*----------------------------------------------------------------------------*/
509/*
510 * \brief Return stream handle from stream pool.
511 *
512 * If the requested stream id is higher than the current number of streams,
513 * one or more new streams will be created, so that size of the stream pool
514 * matches at least stream_id+1.
515 *
516 * By default, the first stream (with id 0) will be used for most operations,
517 * while stream id 1 will be used for operations which can be done
518 * concurrently, such as memory prefetching.
519 *
520 * Additional streams can be used for independent tasks, though opportunities
521 * for this are limited in the current code (this would probably also require
522 * associating different MPI communicators with each task).
523 *
524 * \param [in] stream_id id or requested stream
525 *
526 * \returns handle to requested stream
527 */
528/*----------------------------------------------------------------------------*/
529
530cudaStream_t
531cs_cuda_get_stream(int stream_id);
532
533#endif /* defined(__NVCC__) */
534
536
537/*----------------------------------------------------------------------------*/
538/*
539 * \brief Log information on available CUDA devices.
540 *
541 * \param[in] log_id id of log file in which to print information
542 */
543/*----------------------------------------------------------------------------*/
544
545void
546cs_base_cuda_device_info(cs_log_t log_id);
547
548/*----------------------------------------------------------------------------*/
549/*
550 * \brief Log information on available CUDA version.
551 *
552 * \param[in] log_id id of log file in which to print information
553 */
554/*----------------------------------------------------------------------------*/
555
556void
557cs_base_cuda_version_info(cs_log_t log_id);
558
559/*----------------------------------------------------------------------------*/
560/*
561 * \brief Log information on CUDA compiler.
562 *
563 * \param[in] log_id id of log file in which to print information
564 */
565/*----------------------------------------------------------------------------*/
566
567void
568cs_base_cuda_compiler_info(cs_log_t log_id);
569
570/*----------------------------------------------------------------------------*/
571/*
572 * \brief Set CUDA device based on MPI rank and number of devices.
573 *
574 * \param[in] comm associated MPI communicator
575 * \param[in] ranks_per_node number of ranks per node (min and max)
576 *
577 * \return selected device id, or -1 if no usable device is available
578 */
579/*----------------------------------------------------------------------------*/
580
581int
582cs_base_cuda_select_default_device(void);
583
584/*----------------------------------------------------------------------------*/
585/*
586 * \brief Return currently selected CUDA devices.
587 *
588 * \return selected device id, or -1 if no usable device is available
589 */
590/*----------------------------------------------------------------------------*/
591
592int
593cs_base_cuda_get_device(void);
594
595#endif /* CS_HAVE_CUDA */
596
597/*----------------------------------------------------------------------------*/
598
600
601#endif /* __CS_BASE_CUDA_H__ */
cs_alloc_mode_t
Definition bft_mem.h:50
@ CS_ALLOC_HOST
Definition bft_mem.h:52
@ CS_ALLOC_HOST_DEVICE_SHARED
Definition bft_mem.h:57
void cs_sync_h2d(const void *ptr)
Synchronize data from host to device.
Definition cs_base_accel.cxx:1256
cs_alloc_mode_t cs_check_device_ptr(const void *ptr)
Check if a pointer is associated with a device.
Definition cs_base_accel.cxx:1031
const void * cs_get_device_ptr_const(const void *ptr)
Return matching device pointer for a given constant pointer.
Definition cs_base_accel.cxx:888
#define BEGIN_C_DECLS
Definition cs_defs.h:528
#define END_C_DECLS
Definition cs_defs.h:529
int cs_lnum_t
local mesh entity id
Definition cs_defs.h:325
@ p
Definition cs_field_pointer.h:67
cs_log_t
Definition cs_log.h:48