7.1 Performance tuning

Introduction

Examples of C user functions for performance tuning.

Several user functions are available, each dedicated to a different set of performance tuning parameters: mesh numbering, partitioning, parallel IO, and matrix tuning.
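A minimal sketch of where these snippets live is shown below. It assumes the usual cs_user_performance_tuning.c layout with one user function per topic (the function names and the umbrella header cs_headers.h are assumptions and may differ between code_saturne versions); each snippet from the following sections is placed in the body of the matching function.

#include "cs_headers.h"  /* assumed umbrella header; older versions
                            include individual cs_*.h headers instead */

/* Hypothetical skeleton: one user function per tuning topic. */

void
cs_user_numbering(void)
{
  /* mesh renumbering snippets (see "Advanced mesh numbering") */
}

void
cs_user_partition(void)
{
  /* partitioning snippets (see "Advanced partitioning") */
}

void
cs_user_parallel_io(void)
{
  /* parallel IO snippets (see "Parallel IO") */
}

void
cs_user_matrix_tuning(void)
{
  /* matrix tuning snippets (see "Matrix tuning") */
}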

Advanced mesh numbering

/* Force the target number of threads for mesh renumbering
   (by default, OMP_NUM_THREADS if OpenMP is enabled, 1 otherwise) */

cs_renumber_set_n_threads(4);

/* Set the minimum subset sizes when renumbering for threads. */

cs_renumber_set_min_subset_size(64,   /* min. interior_subset_size */
                                64);  /* min. boundary_subset_size */

/* Select renumbering algorithms */

cs_renumber_set_algorithm
  (false,                          /* halo_adjacent_cells_last */
   false,                          /* halo_adjacent_i_faces_last */
   CS_RENUMBER_ADJACENT_LOW,       /* interior face base ordering */
   CS_RENUMBER_CELLS_NONE,         /* cells_pre_numbering */
   CS_RENUMBER_CELLS_NONE,         /* cells_numbering */
   CS_RENUMBER_I_FACES_MULTIPASS,  /* interior faces numbering */
   CS_RENUMBER_B_FACES_THREAD,     /* boundary faces numbering */
   CS_RENUMBER_VERTICES_NONE);     /* vertices numbering */
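As a small usage variant (a sketch reusing the thread-count call above, not a recommendation), renumbering can be restricted to a single thread when trying to isolate threading-related effects:

/* Sketch: force single-threaded renumbering to rule out
   threading-related renumbering effects when profiling. */

cs_renumber_set_n_threads(1);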

Advanced partitioning

Example 1

{
  /* Example:
     Force PT-SCOTCH or SCOTCH for preprocessing partitioning,
     and Hilbert SFC for main partitioning.

     Available algorithms (subject to build with external libraries for
     SCOTCH and METIS) are:

     CS_PARTITION_DEFAULT           Default partitioning, based on stage
     CS_PARTITION_SFC_MORTON_BOX    Morton (Z) curve in bounding box
     CS_PARTITION_SFC_MORTON_CUBE   Morton (Z) curve in bounding cube
     CS_PARTITION_SFC_HILBERT_BOX   Peano-Hilbert curve in bounding box
     CS_PARTITION_SFC_HILBERT_CUBE  Peano-Hilbert curve in bounding cube
     CS_PARTITION_SCOTCH            PT-SCOTCH or SCOTCH
     CS_PARTITION_METIS             ParMETIS or METIS
     CS_PARTITION_BLOCK             Unoptimized (naive) block partitioning */

  cs_partition_set_algorithm(CS_PARTITION_FOR_PREPROCESS,
                             CS_PARTITION_SCOTCH,
                             1,       /* rank_step */
                             false);  /* ignore periodicity in graph */

  cs_partition_set_algorithm(CS_PARTITION_MAIN,
                             CS_PARTITION_SFC_HILBERT_BOX,
                             1,       /* rank_step */
                             false);  /* ignore periodicity in graph */
}
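As a variant of the example above (a sketch under the same assumptions), the main partitioning stage could instead be forced to ParMETIS or METIS, with a larger rank_step so that the partitioning itself runs on a subset of ranks (the exact semantics of rank_step should be checked in the cs_partition documentation):

/* Sketch: ParMETIS or METIS for the calculation (main) stage,
   with partitioning computed on every 4th rank
   (rank_step semantics assumed). */

cs_partition_set_algorithm(CS_PARTITION_MAIN,
                           CS_PARTITION_METIS,
                           4,       /* rank_step */
                           false);  /* ignore periodicity in graph */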

Example 2

{
  /* Example: set partitioning write to file option.
   *
   * Value of the write flag:  0: never
   *                           1: for graph-based partitioning only (default)
   *                           2: always */

  cs_partition_set_write_level(1);
}
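If partition maps should be written for every algorithm, including the space-filling-curve ones, the flag may be raised to 2 (same call as above):

cs_partition_set_write_level(2);  /* always write partitioning output */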

Example 3

{
  /* Example: force activation/deactivation of initial partitioning
   * for preprocessing (here, deactivate it). */

  cs_partition_set_preprocess(false);
}

Example 4

{
  /* Example: define list of extra partitionings to build.
   *
   * Partitionings in this list will be output to file, and may be used for
   * subsequent calculations.
   *
   * When partitioning for both preprocessing and calculation stages, output to
   * file of partitioning data or generation of additional partitionings
   * (see \ref cs_partition_add_partitions) will only be done for the
   * second stage. */

  int n_extra_partitions = 3;
  int extra_partitions_list[] = {12, 24, 48};

  cs_partition_add_partitions(n_extra_partitions, extra_partitions_list);
}
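Since the partitionings in this list are written to file, a later calculation run on 12, 24, or 48 ranks can reuse the corresponding partition map instead of recomputing it.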

Parallel IO

#if defined(HAVE_MPI_IO) && MPI_VERSION > 1
/* Example fine-tune parallel IO settings.
Available distributed block access methods
(subject to build with MPI IO) are:
CS_FILE_STDIO_SERIAL Serial standard C IO
(funnelled through rank 0 in parallel)
CS_FILE_STDIO_PARALLEL Per-process standard C IO
CS_FILE_MPI_INDEPENDENT Non-collective MPI-IO
with independent file open and close
CS_FILE_MPI_NON_COLLECTIVE Non-collective MPI-IO
with collective file open and close
CS_FILE_MPI_COLLECTIVE Collective MPI-IO
*/
int block_rank_step = 8;
int block_min_size = 1024*1024*8;

MPI_Info hints;
cs_file_access_t method = CS_FILE_MPI_COLLECTIVE;

/* Set MPI IO hints
   (see MPI-IO or your filesystem documentation;
   examples here may have no effect, improve, or degrade performance).

   For LUSTRE filesystems, many articles in the literature seem
   to recommend adjusting striping to improve performance.

   If using ROMIO, useful hints for collective buffering and data-sieving
   may take values: "enable", "disable", "automatic". */

MPI_Info_create(&hints);

MPI_Info_set(hints, "striping_factor", "8");
MPI_Info_set(hints, "striping_unit", "8388608");

MPI_Info_set(hints, "romio_cb_read", "automatic");
MPI_Info_set(hints, "romio_cb_write", "automatic");
MPI_Info_set(hints, "romio_ds_read", "automatic");
MPI_Info_set(hints, "romio_ds_write", "automatic");

/* Set default file access methods and communicator stride */

cs_file_set_default_access(CS_FILE_MODE_WRITE, method, hints);

MPI_Info_set(hints, "collective_buffering", "true");
MPI_Info_set(hints, "access_style", "read_once");

cs_file_set_default_access(CS_FILE_MODE_READ, method, hints);

MPI_Info_free(&hints);
#endif /* defined(HAVE_MPI_IO) && MPI_VERSION > 1 */
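The comment on default access methods also mentions the communicator stride, for which the snippet declares block_rank_step and block_min_size without applying them. A possible completion is sketched below; it assumes a cs_file_set_default_comm call taking the rank step, the minimum block size, and the communicator (the argument list has varied between code_saturne versions), so it should be checked against the version in use.

#if defined(HAVE_MPI)
/* Sketch (signature assumed): gather block IO on every 8th rank,
   with a minimum block size of 8 MB, over the global communicator. */
cs_file_set_default_comm(block_rank_step,
                         block_min_size,
                         cs_glob_mpi_comm);
#endif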

Matrix tuning

/* Activate tuning of matrix-vector operations */
/* Set tuning runs (defaults) */
cs_matrix_set_tuning_runs(20,    /* n_min_products */
                          0.5);  /* t_measure */
/* Force default for selected types */
/* Also allow tuning for multigrid for all expected levels
* (we rarely have more than 10 or 11 levels except for huge meshes). */
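The activation and multigrid comments above appear in this excerpt without their corresponding calls. A rough sketch of how they might read is given below; the names cs_matrix_set_tuning and cs_grid_set_matrix_tuning, the fill-type constants, and their signatures are assumptions based on older code_saturne versions and should be checked against the version in use.

/* Sketch (assumed API): activate tuning for scalar fill types. */
cs_matrix_set_tuning(CS_MATRIX_SCALAR, 1);
cs_matrix_set_tuning(CS_MATRIX_SCALAR_SYM, 1);

/* Sketch (assumed API): allow tuning for multigrid for all expected
   levels (rarely more than 10 or 11 except for huge meshes). */
cs_grid_set_matrix_tuning(CS_MATRIX_SCALAR_SYM, 12);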