8.2
general documentation
cs_base_cuda.h
Go to the documentation of this file.
1 #ifndef __CS_BASE_CUDA_H__
2 #define __CS_BASE_CUDA_H__
3 
4 /*============================================================================
5  * Definitions, global variables, and base functions for CUDA
6  *============================================================================*/
7 
8 /*
9  This file is part of code_saturne, a general-purpose CFD tool.
10 
11  Copyright (C) 1998-2024 EDF S.A.
12 
13  This program is free software; you can redistribute it and/or modify it under
14  the terms of the GNU General Public License as published by the Free Software
15  Foundation; either version 2 of the License, or (at your option) any later
16  version.
17 
18  This program is distributed in the hope that it will be useful, but WITHOUT
19  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
21  details.
22 
23  You should have received a copy of the GNU General Public License along with
24  this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
25  Street, Fifth Floor, Boston, MA 02110-1301, USA.
26 */
27 
28 /*----------------------------------------------------------------------------*/
29 
30 #include "cs_defs.h"
31 
32 #if defined(HAVE_CUDA)
33 
34 /*----------------------------------------------------------------------------
35  * Standard C library headers
36  *----------------------------------------------------------------------------*/
37 
38 #include <stdio.h>
39 
40 /*----------------------------------------------------------------------------
41  * Local headers
42  *----------------------------------------------------------------------------*/
43 
44 #include "cs_base_accel.h"
45 #include "cs_log.h"
46 
47 /*=============================================================================
48  * Macro definitions
49  *============================================================================*/
50 
/* Check a CUDA runtime call's return code; on error, abort through
   bft_error() with file/line context, the CUDA error string, and the
   text of the offending call.

   Wrapped in do { ... } while (0) so the macro expands to exactly one
   statement, making `CS_CUDA_CHECK(x);` safe in unbraced if/else
   branches (the previous bare-brace form left a stray ";"). */

#define CS_CUDA_CHECK(a) do { \
  cudaError_t _l_ret_code = a; \
  if (cudaSuccess != _l_ret_code) { \
    bft_error(__FILE__, __LINE__, 0, "[CUDA error] %d: %s\n running: %s", \
              _l_ret_code, ::cudaGetErrorString(_l_ret_code), #a); \
  } \
} while (0)
58 
/* Same as CS_CUDA_CHECK, but reports the caller-supplied file name and
   line number instead of the expansion site's (useful when the check is
   itself wrapped by another helper).

   Wrapped in do { ... } while (0) so the macro expands to exactly one
   statement, making it safe in unbraced if/else branches followed
   by ";". */

#define CS_CUDA_CHECK_CALL(a, file_name, line_num) do { \
  cudaError_t _l_ret_code = a; \
  if (cudaSuccess != _l_ret_code) { \
    bft_error(file_name, line_num, 0, "[CUDA error] %d: %s\n running: %s", \
              _l_ret_code, ::cudaGetErrorString(_l_ret_code), #a); \
  } \
} while (0)
66 
67 /* For all current compute capabilities, the warp size is 32; If it ever
68  changes, it can be obtained through cudaDeviceProp, so we could then
69  replace this macro with a global variable */
70 
71 #define CS_CUDA_WARP_SIZE 32
72 
73 /*----------------------------------------------------------------------------*/
74 
76 
77 /*============================================================================
78  * Type definitions
79  *============================================================================*/
80 
81 /*=============================================================================
82  * Global variable definitions
83  *============================================================================*/
84 
85 extern int cs_glob_cuda_device_id;
86 
87 /* Other device parameters */
88 
89 extern int cs_glob_cuda_max_threads_per_block;
90 extern int cs_glob_cuda_max_block_size;
91 extern int cs_glob_cuda_max_blocks;
92 extern int cs_glob_cuda_n_mp; /* Number of multiprocessors */
93 
94 /* Allow graphs for kernel launches ? May interfere with profiling (nsys),
95  so can be deactivated. */
96 
97 extern bool cs_glob_cuda_allow_graph;
98 
99 /*============================================================================
100  * Semi-private function prototypes
101  *
102  * The following functions are intended to be used by the common
103  * host-device memory management functions from cs_base_accel.c, and
104  * not directly by the user.
105  *============================================================================*/
106 
107 /*----------------------------------------------------------------------------*/
108 /*
109  * \brief Allocate n bytes of CUDA device memory.
110  *
111  * This function simply wraps cudaMalloc, which could probably be
112  * directly called from C or C++, but whose use in such manner is not
113  * well documented, and whose declaration in cuda_runtime.h requires
114  * support of function attributes by compiler.
115  *
116  * A safety check is added.
117  *
118  * \param [in] n element size
119  * \param [in] var_name allocated variable name string
120  * \param [in] file_name name of calling source file
121  * \param [in] line_num line number in calling source file
122  *
123  * \returns pointer to allocated memory.
124  */
125 /*----------------------------------------------------------------------------*/
126 
127 void *
128 cs_cuda_mem_malloc_device(size_t n,
129  const char *var_name,
130  const char *file_name,
131  int line_num);
132 
133 /*----------------------------------------------------------------------------*/
134 /*
135  * \brief Allocate n bytes of host memory using CUDA.
136  *
137  * This function simply wraps cudaMallocHost, which could probably be
138  * directly called from C or C++, but whose use in such manner is not
139  * well documented, and whose declaration in cuda_runtime.h requires
140  * support of function attributes by compiler.
141  *
142  * A safety check is added.
143  *
144  * \param [in] n element size
145  * \param [in] var_name allocated variable name string
146  * \param [in] file_name name of calling source file
147  * \param [in] line_num line number in calling source file
148  *
149  * \returns pointer to allocated memory.
150  */
151 /*----------------------------------------------------------------------------*/
152 
153 void *
154 cs_cuda_mem_malloc_host(size_t n,
155  const char *var_name,
156  const char *file_name,
157  int line_num);
158 
159 /*----------------------------------------------------------------------------*/
160 /*
161  * \brief Allocate n bytes of CUDA managed memory.
162  *
163  * This function simply wraps cudaMallocManaged, which could probably be
164  * directly called from C or C++, but whose use in such manner is not
165  * well documented, and whose declaration in cuda_runtime.h requires
166  * support of function attributes by compiler.
167  *
168  * A safety check is added.
169  *
170  * \param [in] n element size
171  * \param [in] var_name allocated variable name string
172  * \param [in] file_name name of calling source file
173  * \param [in] line_num line number in calling source file
174  *
175  * \returns pointer to allocated memory.
176  */
177 /*----------------------------------------------------------------------------*/
178 
179 void *
180 cs_cuda_mem_malloc_managed(size_t n,
181  const char *var_name,
182  const char *file_name,
183  int line_num);
184 
185 /*----------------------------------------------------------------------------*/
186 /*
187  * \brief Free CUDA memory associated with a given pointer.
188  *
189  * This function simply wraps cudaFree, which could probably be
190  * directly called from C or C++, but whose use in such manner is not
191  * well documented, and whose declaration in cuda_runtime.h requires
192  * support of function attributes by compiler.
193  *
194  * A safety check is added.
195  *
196  * \param [in] p pointer to device memory
197  * \param [in] var_name allocated variable name string
198  * \param [in] file_name name of calling source file
199  * \param [in] line_num line number in calling source file
 */
203 /*----------------------------------------------------------------------------*/
204 
205 void
206 cs_cuda_mem_free(void *p,
207  const char *var_name,
208  const char *file_name,
209  int line_num);
210 
211 /*----------------------------------------------------------------------------*/
212 /*
213  * \brief Free CUDA-allocated host memory associated with a given pointer.
214  *
215  * This function simply wraps cudaFreeHost, which could probably be
216  * directly called from C or C++, but whose use in such manner is not
217  * well documented, and whose declaration in cuda_runtime.h requires
218  * support of function attributes by compiler.
219  *
220  * A safety check is added.
221  *
222  * \param [in] p pointer to device memory
223  * \param [in] var_name allocated variable name string
224  * \param [in] file_name name of calling source file
225  * \param [in] line_num line number in calling source file
 */
229 /*----------------------------------------------------------------------------*/
230 
231 void
232 cs_cuda_mem_free_host(void *p,
233  const char *var_name,
234  const char *file_name,
235  int line_num);
236 
237 /*----------------------------------------------------------------------------*/
238 /*
239  * \brief Copy data from host to device.
240  *
241  * This is simply a wrapper over cudaMemcpy.
242  *
243  * A safety check is added.
244  *
245  * \param [out] dst pointer to destination data
246  * \param [in] src pointer to source data
247  * \param [in] size size of data to copy
248  */
249 /*----------------------------------------------------------------------------*/
250 
251 void
252 cs_cuda_copy_h2d(void *dst,
253  const void *src,
254  size_t size);
255 
256 /*----------------------------------------------------------------------------*/
257 /*
258  * \brief Copy data from host to device, possibly returning on the host
259  * before the copy is finished.
260  *
261  * This is simply a wrapper over cudaMemcpyAsync.
262  *
263  * A safety check is added.
264  *
265  * \param [out] dst pointer to destination data
266  * \param [in] src pointer to source data
267  * \param [in] size size of data to copy
 */
271 /*----------------------------------------------------------------------------*/
272 
273 void
274 cs_cuda_copy_h2d_async(void *dst,
275  const void *src,
276  size_t size);
277 
278 /*----------------------------------------------------------------------------*/
279 /*
280  * \brief Copy data from device to host.
281  *
282  * This is simply a wrapper over cudaMemcpy.
283  *
284  * A safety check is added.
285  *
286  * \param [out] dst pointer to destination data
287  * \param [in] src pointer to source data
288  * \param [in] size size of data to copy
 */
292 /*----------------------------------------------------------------------------*/
293 
294 void
295 cs_cuda_copy_d2h(void *dst,
296  const void *src,
297  size_t size);
298 
299 /*----------------------------------------------------------------------------*/
/*
 * \brief Copy data from device to host, possibly returning on the host
 *        before the copy is finished.
 *
 * This is simply a wrapper over cudaMemcpyAsync.
 *
 * A safety check is added.
 *
 * \param [out] dst   pointer to destination data
 * \param [in]  src   pointer to source data
 * \param [in]  size  size of data to copy
 */
313 /*----------------------------------------------------------------------------*/
314 
315 void
316 cs_cuda_copy_d2h_async(void *dst,
317  const void *src,
318  size_t size);
319 
320 /*----------------------------------------------------------------------------*/
/*
 * \brief Prefetch data from host to device.
 *
 * This is simply a wrapper over cudaMemPrefetchAsync.
 *
 * A safety check is added.
 *
 * \param [out] dst   pointer to data to prefetch
 * \param [in]  size  size of data to prefetch
 */
334 /*----------------------------------------------------------------------------*/
335 
336 void
337 cs_cuda_prefetch_h2d(void *dst,
338  size_t size);
339 
340 /*----------------------------------------------------------------------------*/
/*
 * \brief Prefetch data from device to host.
 *
 * This is simply a wrapper over cudaMemPrefetchAsync.
 *
 * A safety check is added.
 *
 * \param [out] dst   pointer to data to prefetch
 * \param [in]  size  size of data to prefetch
 */
354 /*----------------------------------------------------------------------------*/
355 
356 void
357 cs_cuda_prefetch_d2h(void *dst,
358  size_t size);
359 
360 /*----------------------------------------------------------------------------*/
361 /*
362  * \brief Copy data from device to device.
363  *
364  * This is simply a wrapper over cudaMemcpy.
365  *
366  * A safety check is added.
367  *
368  * \param [out] dst pointer to destination data
369  * \param [in] src pointer to source data
370  * \param [in] size size of data to copy
371  */
372 /*----------------------------------------------------------------------------*/
373 
374 void
375 cs_cuda_copy_d2d(void *dst,
376  const void *src,
377  size_t size);
378 
379 /*----------------------------------------------------------------------------*/
380 /*
381  * \brief Get host pointer for a managed or device pointer.
382  *
383  * This function can be called with a pointer inside an allocated block of
 * memory, so is not restricted to values returned by CS_ALLOC_HD.
385  *
386  * This makes it possible to check whether a pointer to an array inside
387  * a larger array is shared or accessible from the device only
388  * (for example when grouping allocations).
389  *
390  * \param [in] ptr pointer to device data
391  *
392  * \return pointer to host data if shared or mapped at the CUDA level,
393  * NULL otherwise.
394  */
395 /*----------------------------------------------------------------------------*/
396 
397 void *
398 cs_cuda_get_host_ptr(const void *ptr);
399 
400 /*----------------------------------------------------------------------------*/
401 /*
402  * \brief Advise memory system that a given allocation will be mostly read.
403  *
404  * \param [in] ptr pointer to allocation
 * \param [in] size  associated data size
406  */
407 /*----------------------------------------------------------------------------*/
408 
409 void
410 cs_cuda_mem_set_advise_read_mostly(const void *ptr,
411  size_t size);
412 
413 /*----------------------------------------------------------------------------*/
414 /*
415  * \brief Advise memory system that a given allocation will be mostly read.
416  *
417  * \param [in] ptr pointer to allocation
 * \param [in] size  associated data size
419  */
420 /*----------------------------------------------------------------------------*/
421 
422 void
423 cs_cuda_mem_unset_advise_read_mostly(const void *ptr,
424  size_t size);
425 
426 /*=============================================================================
427  * Inline function prototypes
428  *============================================================================*/
429 
430 /*----------------------------------------------------------------------------*/
443 /*----------------------------------------------------------------------------*/
444 
/*----------------------------------------------------------------------------*/
/*
 * \brief Compute the grid size (number of blocks) needed to cover
 *        n elements with the given block size (ceiling division).
 *
 * \param [in]  n           number of elements to cover
 * \param [in]  block_size  number of threads per block
 *
 * \returns number of blocks required to span n elements.
 */
/*----------------------------------------------------------------------------*/

static inline unsigned int
cs_cuda_grid_size(cs_lnum_t n,
                  unsigned int block_size)
{
  unsigned int n_blocks = n / block_size;
  if (n % block_size != 0)
    n_blocks += 1;
  return n_blocks;
}
451 
453 
454 #if defined(__NVCC__)
455 
/*----------------------------------------------------------------------------
 * Synchronize or copy an array of a given type from the host to a device.
 *
 * If the host data is in host-only memory, a temporary device buffer is
 * allocated and the data copied into it; otherwise the matching device
 * pointer is used, synchronizing first when the allocation is not shared.
 *
 * parameters:
 *   val_h     <-- pointer to host data
 *   n_vals    <-- number of data values
 *   device_id <-- associated device id (currently unused here;
 *                 reserved for async prefetch)
 *   stream    <-- associated stream (for async prefetch only)
 *   val_d     --> matching pointer on device
 *   buf_d     --> matching allocation pointer on device (should be freed
 *                 after use if non-NULL)
 *----------------------------------------------------------------------------*/

template <typename T>
void
cs_sync_or_copy_h2d(const T        *val_h,
                    cs_lnum_t       n_vals,
                    int             device_id,
                    cudaStream_t    stream,
                    const T       **val_d,
                    void          **buf_d)
{
  *val_d = NULL;
  *buf_d = NULL;

  if (val_h == NULL)
    return;

  const size_t n_bytes = n_vals * sizeof(T);
  cs_alloc_mode_t mode = cs_check_device_ptr(val_h);

  if (mode == CS_ALLOC_HOST) {
    /* Host-only allocation: stage a temporary device copy; the caller
       owns (and must later free) the returned buf_d. */
    void *d_copy = NULL;
    CS_CUDA_CHECK(cudaMalloc(&d_copy, n_bytes));
    cs_cuda_copy_h2d(d_copy, val_h, n_bytes);
    *buf_d = d_copy;
    *val_d = (const T *)d_copy;
  }
  else {
    /* Device-visible allocation: reuse the matching device pointer,
       synchronizing host data first unless memory is shared. */
    *val_d = (const T *)cs_get_device_ptr_const((const void *)val_h);
    if (mode != CS_ALLOC_HOST_DEVICE_SHARED)
      cs_sync_h2d(val_h);
  }
}
503 
504 /*=============================================================================
505  * Public function prototypes
506  *============================================================================*/
507 
508 /*----------------------------------------------------------------------------*/
509 /*
510  * \brief Return stream handle from stream pool.
511  *
512  * If the requested stream id is higher than the current number of streams,
513  * one or more new streams will be created, so that size of the stream pool
514  * matches at least stream_id+1.
515  *
516  * By default, the first stream (with id 0) will be used for most operations,
517  * while stream id 1 will be used for operations which can be done
518  * concurrently, such as memory prefetching.
519  *
520  * Additional streams can be used for independent tasks, though opportunities
521  * for this are limited in the current code (this would probably also require
522  * associating different MPI communicators with each task).
523  *
 * \param [in] stream_id  id of requested stream
525  *
526  * \returns handle to requested stream
527  */
528 /*----------------------------------------------------------------------------*/
529 
530 cudaStream_t
531 cs_cuda_get_stream(int stream_id);
532 
533 #endif /* defined(__NVCC__) */
534 
536 
537 /*----------------------------------------------------------------------------*/
538 /*
539  * \brief Log information on available CUDA devices.
540  *
541  * \param[in] log_id id of log file in which to print information
542  */
543 /*----------------------------------------------------------------------------*/
544 
545 void
546 cs_base_cuda_device_info(cs_log_t log_id);
547 
548 /*----------------------------------------------------------------------------*/
549 /*
550  * \brief Log information on available CUDA version.
551  *
552  * \param[in] log_id id of log file in which to print information
553  */
554 /*----------------------------------------------------------------------------*/
555 
556 void
557 cs_base_cuda_version_info(cs_log_t log_id);
558 
559 /*----------------------------------------------------------------------------*/
560 /*
561  * \brief Log information on CUDA compiler.
562  *
563  * \param[in] log_id id of log file in which to print information
564  */
565 /*----------------------------------------------------------------------------*/
566 
567 void
568 cs_base_cuda_compiler_info(cs_log_t log_id);
569 
570 /*----------------------------------------------------------------------------*/
/*
 * \brief Set CUDA device based on MPI rank and number of devices.
 *
 * (Note: rank and device information is determined internally; this
 * function takes no arguments.)
 *
 * \return selected device id, or -1 if no usable device is available
 */
579 /*----------------------------------------------------------------------------*/
580 
581 int
582 cs_base_cuda_select_default_device(void);
583 
584 /*----------------------------------------------------------------------------*/
585 /*
586  * \brief Return currently selected CUDA devices.
587  *
588  * \return selected device id, or -1 if no usable device is available
589  */
590 /*----------------------------------------------------------------------------*/
591 
592 int
593 cs_base_cuda_get_device(void);
594 
#endif /* HAVE_CUDA */
596 
597 /*----------------------------------------------------------------------------*/
598 
600 
601 #endif /* __CS_BASE_CUDA_H__ */
cs_alloc_mode_t
Definition: bft_mem.h:50
@ CS_ALLOC_HOST
Definition: bft_mem.h:52
@ CS_ALLOC_HOST_DEVICE_SHARED
Definition: bft_mem.h:57
const void * cs_get_device_ptr_const(const void *ptr)
Return matching device pointer for a given constant pointer.
Definition: cs_base_accel.cxx:888
void cs_sync_h2d(const void *ptr)
Synchronize data from host to device.
Definition: cs_base_accel.cxx:1256
cs_alloc_mode_t cs_check_device_ptr(const void *ptr)
Check if a pointer is associated with a device.
Definition: cs_base_accel.cxx:1031
#define BEGIN_C_DECLS
Definition: cs_defs.h:528
#define END_C_DECLS
Definition: cs_defs.h:529
int cs_lnum_t
local mesh entity id
Definition: cs_defs.h:325
@ p
Definition: cs_field_pointer.h:67
cs_log_t
Definition: cs_log.h:48