Line data Source code
1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 : #include "grid_library.h"
8 : #include "grid_common.h"
9 : #include "grid_constants.h"
10 :
11 : #include "../../mpiwrap/cp_mpi.h"
12 : #include "../../offload/offload_runtime.h"
13 :
14 : #include <assert.h>
15 : #include <omp.h>
16 : #include <stddef.h>
17 : #include <stdio.h>
18 : #include <stdlib.h>
19 : #include <string.h>
20 :
// Invokes the print callback PRINT_FN with TEXT, the length of TEXT in bytes
// (as reported by strlen, i.e. without the terminating NUL), and UNIT.
// Note that TEXT is expanded twice, so it must be free of side effects.
#define GRID_LIBRARY_PRINT(PRINT_FN, TEXT, UNIT)                               \
  ((PRINT_FN)(TEXT, (int)strlen(TEXT), UNIT))
23 :
// Extents of the three counter dimensions: backend, kernel, and lp.
enum {
  GRID_NBACKENDS = 5,
  GRID_NKERNELS = 4,
  GRID_MAX_LP = 20,
};
28 :
29 : typedef struct {
30 : grid_sphere_cache sphere_cache;
31 : long counters[GRID_NBACKENDS * GRID_NKERNELS * GRID_MAX_LP];
32 : } grid_library_globals;
33 :
34 : static grid_library_globals **per_thread_globals = NULL;
35 : static bool library_initialized = false;
36 : static int max_threads = 0;
37 : static grid_library_config config = {
38 : .backend = GRID_BACKEND_AUTO, .validate = false, .apply_cutoff = false};
39 :
40 : #if !defined(_OPENMP)
41 : #error "OpenMP is required. Please add -fopenmp to your C compiler flags."
42 : #endif
43 :
44 : #if defined(NDEBUG)
45 : #error \
46 : "Please do not build CP2K with NDEBUG. There is no performance advantage and asserts will save your neck."
47 : #endif
48 :
49 : /*******************************************************************************
50 : * \brief Initializes the grid library.
51 : * \author Ole Schuett
52 : ******************************************************************************/
53 9286 : void grid_library_init(void) {
54 9286 : if (library_initialized) {
55 0 : printf("Error: Grid library was already initialized.\n");
56 0 : abort();
57 : }
58 :
59 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)
60 : // Reserve global GPU memory for storing the intermediate Cab matrix blocks.
61 : // CUDA does not allow to increase this limit after a kernel was launched.
62 : // Unfortunately, the required memory is hard to predict because we neither
63 : // know which tasks will be run nor how many thread blocks the available GPU
64 : // can execute in parallel... 64 MiB ought to be enough for anybody ;-)
65 : offloadEnsureMallocHeapSize(64 * 1024 * 1024);
66 : #endif
67 :
68 9286 : max_threads = omp_get_max_threads();
69 9286 : per_thread_globals = malloc(max_threads * sizeof(grid_library_globals *));
70 9286 : assert(per_thread_globals != NULL);
71 :
72 : // Using parallel regions to ensure memory is allocated near a thread's core.
73 : #pragma omp parallel default(none) shared(per_thread_globals) \
74 : num_threads(max_threads)
75 : {
76 : const int ithread = omp_get_thread_num();
77 : per_thread_globals[ithread] = malloc(sizeof(grid_library_globals));
78 : assert(per_thread_globals[ithread] != NULL);
79 : memset(per_thread_globals[ithread], 0, sizeof(grid_library_globals));
80 : }
81 :
82 9286 : library_initialized = true;
83 9286 : }
84 :
85 : /*******************************************************************************
86 : * \brief Finalizes the grid library.
87 : * \author Ole Schuett
88 : ******************************************************************************/
89 9286 : void grid_library_finalize(void) {
90 9286 : if (!library_initialized) {
91 0 : printf("Error: Grid library is not initialized.\n");
92 0 : abort();
93 : }
94 :
95 18572 : for (int i = 0; i < max_threads; i++) {
96 9286 : grid_sphere_cache_free(&per_thread_globals[i]->sphere_cache);
97 9286 : free(per_thread_globals[i]);
98 : }
99 9286 : free(per_thread_globals);
100 9286 : per_thread_globals = NULL;
101 9286 : library_initialized = false;
102 9286 : }
103 :
104 : /*******************************************************************************
105 : * \brief Returns a pointer to the thread local sphere cache.
106 : * \author Ole Schuett
107 : ******************************************************************************/
108 128252625 : grid_sphere_cache *grid_library_get_sphere_cache(void) {
109 128252625 : const int ithread = omp_get_thread_num();
110 128252625 : assert(ithread < max_threads);
111 128252625 : return &per_thread_globals[ithread]->sphere_cache;
112 : }
113 :
114 : /*******************************************************************************
115 : * \brief Configures the grid library.
116 : * \author Ole Schuett
117 : ******************************************************************************/
118 9402 : void grid_library_set_config(const enum grid_backend backend,
119 : const bool validate, const bool apply_cutoff) {
120 9402 : config.backend = backend;
121 9402 : config.validate = validate;
122 9402 : config.apply_cutoff = apply_cutoff;
123 9402 : }
124 :
125 : /*******************************************************************************
126 : * \brief Returns the library config.
127 : * \author Ole Schuett
128 : ******************************************************************************/
129 413238 : grid_library_config grid_library_get_config(void) { return config; }
130 :
131 : /*******************************************************************************
132 : * \brief Adds given increment to counter specified by lp, backend, and kernel.
133 : * \author Ole Schuett
134 : ******************************************************************************/
135 133992006 : void grid_library_counter_add(const int lp, const enum grid_backend backend,
136 : const enum grid_library_kernel kernel,
137 : const int increment) {
138 133992006 : assert(lp >= 0);
139 133992006 : assert(kernel < GRID_NKERNELS);
140 133992006 : const int back = backend - GRID_BACKEND_REF;
141 133992006 : assert(back < GRID_NBACKENDS);
142 133992006 : const int idx = back * GRID_NKERNELS * GRID_MAX_LP + kernel * GRID_MAX_LP +
143 133992006 : imin(lp, GRID_MAX_LP - 1);
144 133992006 : const int ithread = omp_get_thread_num();
145 133992006 : assert(ithread < max_threads);
146 133992006 : per_thread_globals[ithread]->counters[idx] += increment;
147 133992006 : }
148 :
/*******************************************************************************
 * \brief Comparator passed to qsort to compare two counters.
 *
 *        Only the first long behind each pointer (the count) is inspected.
 *        Larger counts order first, i.e. qsort sorts in descending order.
 * \param a  Pointer to the first counter entry.
 * \param b  Pointer to the second counter entry.
 * \return   Negative if *a > *b, positive if *a < *b, zero if equal.
 * \author Ole Schuett
 ******************************************************************************/
static int compare_counters(const void *a, const void *b) {
  const long count_a = *(const long *)a;
  const long count_b = *(const long *)b;
  // Do not return the difference: it would be truncated from long to int and
  // could change sign once two counts differ by 2^31 or more.
  return (count_b > count_a) - (count_b < count_a);
}
156 :
157 : /*******************************************************************************
158 : * \brief Prints statistics gathered by the grid library.
 *
 *        Sums every counter over all threads, passes each sum through
 *        cp_mpi_sum_long, sorts the counters with compare_counters, and
 *        prints one table row per non-zero counter together with its share
 *        of the total. Returns without printing when all counters are zero.
 *        Aborts if the library is not initialized.
 * \param fortran_comm  Communicator handle; converted with cp_mpi_comm_f2c.
 * \param print_func    Callback that receives each output line as
 *                      (message, strlen(message), output_unit).
 * \param output_unit   Passed through unchanged to print_func.
159 : * \author Ole Schuett
160 : ******************************************************************************/
161 9404 : void grid_library_print_stats(const int fortran_comm,
162 : void (*print_func)(const char *, int, int),
163 9404 : const int output_unit) {
164 9404 : if (!library_initialized) {
165 0 : printf("Error: Grid library is not initialized.\n");
166 0 : abort();
167 : }
168 :
169 9404 : const cp_mpi_comm_t comm = cp_mpi_comm_f2c(fortran_comm);
170 : // Sum all counters across threads and mpi ranks.
// Each row of `counters` is a pair: [0] holds the summed count, [1] holds the
// counter's original flat index.
// NOTE(review): ncounters is a const int rather than an integer constant
// expression, so `counters` is a variable length array.
171 9404 : const int ncounters = GRID_NBACKENDS * GRID_NKERNELS * GRID_MAX_LP;
172 9404 : long counters[ncounters][2];
173 9404 : memset(counters, 0, ncounters * 2 * sizeof(long));
174 9404 : double total = 0.0;
175 3771004 : for (int i = 0; i < ncounters; i++) {
176 3761600 : counters[i][1] = i; // needed as inverse index after qsort
177 7523200 : for (int j = 0; j < max_threads; j++) {
178 3761600 : counters[i][0] += per_thread_globals[j]->counters[i];
179 : }
// NOTE(review): cp_mpi_sum_long is called once per counter with a count of 1;
// presumably a collective reduction over comm - confirm whether these calls
// could be batched.
180 3761600 : cp_mpi_sum_long(&counters[i][0], 1, comm);
181 3761600 : total += counters[i][0];
182 : }
183 :
184 : // Sort counters.
// Rows are sorted as units of two longs; compare_counters only looks at the
// first long (the count) of each row.
185 9404 : qsort(counters, ncounters, 2 * sizeof(long), &compare_counters);
186 :
187 : // Determine if anything needs to be printed.
// The loop stops at the first non-zero count.
188 9404 : bool print = false;
189 1805530 : for (int i = 0; i < ncounters && !print; i++) {
190 1796126 : if (counters[i][0] != 0) {
191 4926 : print = true;
192 : }
193 : }
194 9404 : if (!print) {
195 4478 : return; // nothing to be printed
196 : }
197 :
198 : // Print counters.
// First the banner and the column headings, then one row per counter.
199 4926 : GRID_LIBRARY_PRINT(print_func, "\n", output_unit);
200 4926 : GRID_LIBRARY_PRINT(
201 : print_func,
202 : " ----------------------------------------------------------------"
203 : "---------------\n",
204 : output_unit);
205 4926 : GRID_LIBRARY_PRINT(
206 : print_func,
207 : " - "
208 : " -\n",
209 : output_unit);
210 4926 : GRID_LIBRARY_PRINT(
211 : print_func,
212 : " - GRID STATISTICS "
213 : " -\n",
214 : output_unit);
215 4926 : GRID_LIBRARY_PRINT(
216 : print_func,
217 : " - "
218 : " -\n",
219 : output_unit);
220 4926 : GRID_LIBRARY_PRINT(
221 : print_func,
222 : " ----------------------------------------------------------------"
223 : "---------------\n",
224 : output_unit);
225 4926 : GRID_LIBRARY_PRINT(
226 : print_func,
227 : " LP KERNEL BACKEND "
228 : "COUNT PERCENT\n",
229 : output_unit);
230 :
// Display names, indexed by the kernel and backend components that are
// decoded from the flat counter index below.
231 4926 : const char *kernel_names[] = {"collocate ortho", "integrate ortho",
232 : "collocate general", "integrate general"};
233 4926 : const char *backend_names[] = {"REF", "CPU", "DGEMM", "GPU", "HIP"};
234 :
235 1975326 : for (int i = 0; i < ncounters; i++) {
236 1970400 : if (counters[i][0] == 0)
237 1919088 : continue; // skip empty counters
238 51312 : const double percent = 100.0 * counters[i][0] / total;
// Decode the flat index stored before sorting: backend is the slowest
// component, then kernel, then lp (same layout as grid_library_counter_add).
239 51312 : const int idx = counters[i][1];
240 51312 : const int backend_stride = GRID_NKERNELS * GRID_MAX_LP;
241 51312 : const int back = idx / backend_stride;
242 51312 : const int kern = (idx % backend_stride) / GRID_MAX_LP;
243 51312 : const int lp = (idx % backend_stride) % GRID_MAX_LP;
244 51312 : char buffer[100];
245 51312 : snprintf(buffer, sizeof(buffer), " %-5i %-17s %-6s %34li %10.2f%%\n", lp,
246 : kernel_names[kern], backend_names[back], counters[i][0], percent);
247 51312 : GRID_LIBRARY_PRINT(print_func, buffer, output_unit);
248 : }
249 :
250 4926 : GRID_LIBRARY_PRINT(
251 : print_func,
252 : " ----------------------------------------------------------------"
253 : "---------------\n",
254 : output_unit);
255 : }
256 :
257 : // EOF
|