Line data Source code
1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 : #include "offload_mempool.h"
8 : #include "../mpiwrap/cp_mpi.h"
9 : #include "offload_library.h"
10 : #include "offload_runtime.h"
11 :
12 : #include <assert.h>
13 : #include <inttypes.h>
14 : #include <omp.h>
15 : #include <stdbool.h>
16 : #include <stdio.h>
17 : #include <stdlib.h>
18 : #include <string.h>
19 :
20 : #if defined(__parallel)
21 : #include <mpi.h>
22 : #endif
23 :
24 : #if defined(__LIBXSTREAM)
25 : #include <libxstream/libxstream.h>
26 : #include <libxstream/libxstream_opencl.h>
27 : #elif defined(__LIBXS)
28 : #include <libxs/libxs_malloc.h>
29 : #endif
30 :
// Calls the print callback PRINT_FN with TEXT, the strlen() of TEXT, and UNIT.
// Note: TEXT is expanded twice, so it must be free of side effects.
#define OFFLOAD_MEMPOOL_PRINT(PRINT_FN, TEXT, UNIT)                            \
  ((PRINT_FN)(TEXT, (int)strlen(TEXT), UNIT))

// Non-zero: non-offload builds with OpenMP >= 5.0 allocate via omp_alloc and
// the MPI_Alloc_mem code path is disabled.
#define OFFLOAD_MEMPOOL_OMPALLOC 1
34 :
35 : #if !defined(__LIBXSTREAM)
/*******************************************************************************
 * \brief Private list node describing one chunk of pooled memory.
 * \author Ole Schuett
 ******************************************************************************/
typedef struct offload_memchunk {
  void *mem; // kept as first member: allows to cast memchunk into mem-ptr...
  struct offload_memchunk *next; // following chunk in the same list (or NULL)
  size_t size;                   // size passed to the system allocation
  size_t used;                   // size of the latest request, for statistics
} offload_memchunk_t;
45 :
46 : /*******************************************************************************
47 : * \brief Private struct for storing a memory pool.
48 : * \author Ole Schuett
49 : ******************************************************************************/
50 : typedef struct offload_mempool {
51 : offload_memchunk_t *available_head, *allocated_head; // single-linked lists
52 : } offload_mempool_t;
53 :
54 : /*******************************************************************************
55 : * \brief Private pools for host and device memory.
56 : * \author Ole Schuett
57 : ******************************************************************************/
58 : #if !defined(__LIBXS)
59 : static offload_mempool_t mempool_host = {0};
60 : #endif
61 : #if !defined(__LIBXSTREAM)
62 : static offload_mempool_t mempool_device = {0};
63 : #endif
64 :
/*******************************************************************************
 * \brief Private counters for statistics (one set per pool).
 * \author Hans Pabst
 ******************************************************************************/
#if !defined(__LIBXS)
static struct {
  uint64_t mallocs; // number of system allocations performed
  uint64_t mempeak; // largest summed chunk size recorded so far
} host_stats = {.mallocs = 0, .mempeak = 0};
#endif
#if !defined(__LIBXSTREAM)
static struct {
  uint64_t mallocs; // number of system allocations performed
  uint64_t mempeak; // largest summed chunk size recorded so far
} device_stats = {.mallocs = 0, .mempeak = 0};
#endif
79 :
80 : /*******************************************************************************
81 : * \brief Private routine for actually allocating system memory.
82 : * \author Ole Schuett
83 : ******************************************************************************/
84 50041 : static void *actual_malloc(const size_t size, const bool on_device) {
85 50041 : if (size == 0) {
86 : return NULL;
87 : }
88 :
89 50041 : void *memory = NULL;
90 : #if defined(__OFFLOAD)
91 : if (on_device) {
92 : offload_activate_chosen_device();
93 : offloadMalloc(&memory, size);
94 : } else {
95 : offload_activate_chosen_device();
96 : offloadMallocHost(&memory, size);
97 : }
98 : #elif OFFLOAD_MEMPOOL_OMPALLOC && (201811 /*v5.0*/ <= _OPENMP)
99 : memory = omp_alloc(size, omp_null_allocator);
100 : #elif defined(__parallel) && !OFFLOAD_MEMPOOL_OMPALLOC
101 : if (MPI_SUCCESS != MPI_Alloc_mem((MPI_Aint)size, MPI_INFO_NULL, &memory)) {
102 : fprintf(stderr, "ERROR: MPI_Alloc_mem failed at %s:%i\n", name, __FILE__,
103 : __LINE__);
104 : MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
105 : }
106 : #else
107 50041 : memory = malloc(size);
108 : #endif
109 :
110 : // Update statistics.
111 50041 : if (on_device) {
112 50041 : #pragma omp atomic
113 : ++device_stats.mallocs;
114 : }
115 : #if !defined(__LIBXS)
116 : else {
117 : #pragma omp atomic
118 : ++host_stats.mallocs;
119 : }
120 : #endif
121 :
122 50041 : assert(memory != NULL);
123 : return memory;
124 : }
125 :
/*******************************************************************************
 * \brief Private routine for actually freeing system memory.
 *        A NULL pointer is ignored. The backend is selected at compile time
 *        and mirrors the one used by actual_malloc; on_device is only
 *        consulted by the offload backend.
 * \author Ole Schuett
 ******************************************************************************/
static void actual_free(void *memory, const bool on_device) {
  if (NULL == memory) {
    return;
  }

#if defined(__OFFLOAD)
  // Both flavors require the chosen device to be active.
  offload_activate_chosen_device();
  if (on_device) {
    offloadFree(memory);
  } else {
    offloadFreeHost(memory);
  }
#elif OFFLOAD_MEMPOOL_OMPALLOC && (201811 /*v5.0*/ <= _OPENMP)
  (void)on_device; // mark used
  omp_free(memory, omp_null_allocator);
#elif defined(__parallel) && !OFFLOAD_MEMPOOL_OMPALLOC
  (void)on_device; // mark used
  if (MPI_SUCCESS != MPI_Free_mem(memory)) {
    // Arguments must match the "%s:%i" format: file name and line number.
    fprintf(stderr, "ERROR: MPI_Free_mem failed at %s:%i\n", __FILE__,
            __LINE__);
    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
  }
#else
  (void)on_device; // mark used
  free(memory);
#endif
}
158 :
159 : /*******************************************************************************
160 : * \brief Private routine for allocating host or device memory from the pool.
161 : * \author Ole Schuett and Hans Pabst
162 : ******************************************************************************/
163 294355 : static void *internal_mempool_malloc(offload_mempool_t *pool, const size_t size,
164 : const bool on_device) {
165 294355 : if (size == 0) {
166 : return NULL;
167 : }
168 :
169 294355 : offload_memchunk_t *chunk;
170 :
171 588710 : #pragma omp critical(offload_mempool_modify)
172 : {
173 : // Find a possible chunk to reuse or reclaim in available list.
174 294355 : offload_memchunk_t **reuse = NULL,
175 294355 : **reclaim = NULL; // ** for easy list removal
176 294355 : offload_memchunk_t **indirect = &pool->available_head;
177 336729 : while (*indirect != NULL) {
178 282265 : const size_t s = (*indirect)->size;
179 282265 : if (size <= s && (reuse == NULL || s < (*reuse)->size)) {
180 246038 : reuse = indirect; // reuse smallest suitable chunk
181 246038 : if (s == size) {
182 : break; // perfect match, exit early
183 : }
184 36227 : } else if (reclaim == NULL || (*reclaim)->size < s) {
185 26835 : reclaim = indirect; // reclaim largest unsuitable chunk
186 : }
187 42374 : indirect = &(*indirect)->next;
188 : }
189 :
190 : // Select an existing chunk or allocate a new one.
191 294355 : if (reuse != NULL) {
192 : // Reusing an exising chunk that's already large enough.
193 244314 : chunk = *reuse;
194 244314 : *reuse = chunk->next; // remove chunk from available list.
195 50041 : } else if (reclaim != NULL) {
196 : // Reclaiming an existing chunk (resize will happen outside crit. region).
197 305 : chunk = *reclaim;
198 305 : *reclaim = chunk->next; // remove chunk from available list.
199 : } else {
200 : // Found no available chunk, allocate a new one.
201 49736 : chunk = calloc(1, sizeof(offload_memchunk_t));
202 49736 : assert(chunk != NULL);
203 : }
204 : }
205 :
206 : // Resize chunk outside of critical region before adding it to allocated list.
207 294355 : if (chunk->size < size) {
208 50041 : actual_free(chunk->mem, on_device);
209 50041 : chunk->mem = actual_malloc(size, on_device);
210 50041 : chunk->size = size;
211 : }
212 :
213 294355 : chunk->used = size; // for statistics
214 :
215 : // Insert chunk into allocated list.
216 294355 : #pragma omp critical(offload_mempool_modify)
217 : {
218 294355 : chunk->next = pool->allocated_head;
219 294355 : pool->allocated_head = chunk;
220 : }
221 :
222 294355 : return chunk->mem;
223 : }
224 :
225 : /*******************************************************************************
226 : * \brief Private routine for releasing memory back to the pool.
227 : * \author Ole Schuett
228 : ******************************************************************************/
229 294355 : static void internal_mempool_free(offload_mempool_t *pool, const void *mem) {
230 294355 : if (mem == NULL) {
231 : return;
232 : }
233 :
234 588710 : #pragma omp critical(offload_mempool_modify)
235 : {
236 294355 : offload_memchunk_t **indirect = &pool->allocated_head;
237 628362 : while (*indirect != NULL && (*indirect)->mem != mem) {
238 334007 : indirect = &(*indirect)->next;
239 : }
240 294355 : offload_memchunk_t *chunk = *indirect;
241 294355 : assert(chunk != NULL && chunk->mem == mem);
242 294355 : *indirect = chunk->next;
243 294355 : chunk->next = pool->available_head;
244 294355 : pool->available_head = chunk;
245 : }
246 : }
247 :
248 : /*******************************************************************************
249 : * \brief Private routine for freeing all memory in the pool.
250 : * \author Ole Schuett and Hans Pabst
251 : ******************************************************************************/
252 10328 : static void internal_mempool_clear(offload_mempool_t *pool,
253 : const bool on_device) {
254 20656 : #pragma omp critical(offload_mempool_modify)
255 : {
256 10328 : assert(pool->allocated_head == NULL);
257 60056 : while (pool->available_head != NULL) {
258 49728 : offload_memchunk_t *chunk = pool->available_head;
259 49728 : pool->available_head = chunk->next;
260 49728 : actual_free(chunk->mem, on_device);
261 49728 : free(chunk);
262 : }
263 : }
264 10328 : }
265 :
266 : /*******************************************************************************
267 : * \brief Private routine for summing alloc sizes of all chunks in given list.
268 : * \author Ole Schuett and Hans Pabst
269 : ******************************************************************************/
270 62432 : static uint64_t sum_chunks_size(const offload_memchunk_t *head, size_t offset) {
271 62432 : uint64_t result = 0;
272 212560 : for (const offload_memchunk_t *chunk = head; chunk != NULL;
273 150128 : chunk = chunk->next) {
274 150128 : result += *(const size_t *)((const char *)chunk + offset);
275 : }
276 62432 : return result;
277 : }
278 : #endif /* !defined(__LIBXSTREAM) */
279 :
280 : /*******************************************************************************
281 : * \brief Internal routine for allocating host memory from the pool.
282 : * \author Ole Schuett
283 : ******************************************************************************/
284 3830303 : void *offload_mempool_host_malloc(const size_t size) {
285 : #if defined(__LIBXSTREAM)
286 : return libxs_malloc(libxstream_opencl_config.pool_hst, size,
287 : LIBXS_MALLOC_AUTO);
288 : #elif defined(__LIBXS)
289 3830303 : return libxs_malloc(libxs_default_pool(), size, LIBXS_MALLOC_AUTO);
290 : #else
291 : return internal_mempool_malloc(&mempool_host, size, false);
292 : #endif
293 : }
294 :
295 : /*******************************************************************************
296 : * \brief Internal routine for allocating device memory from the pool
297 : * \author Ole Schuett
298 : ******************************************************************************/
299 294355 : void *offload_mempool_device_malloc(const size_t size) {
300 : #if defined(__LIBXSTREAM)
301 : void *memory = NULL;
302 : const int result = libxstream_mem_allocate(&memory, size);
303 : assert(EXIT_SUCCESS == result);
304 : return memory;
305 : #else
306 294355 : return internal_mempool_malloc(&mempool_device, size, true);
307 : #endif
308 : }
309 :
310 : /*******************************************************************************
311 : * \brief Internal routine for releasing memory back to the pool.
312 : * \author Ole Schuett
313 : ******************************************************************************/
314 4530658 : void offload_mempool_host_free(const void *memory) {
315 : #if defined(__LIBXSTREAM) || defined(__LIBXS)
316 4530658 : libxs_free((void *)memory);
317 : #else
318 : internal_mempool_free(&mempool_host, memory);
319 : #endif
320 4530658 : }
321 :
322 : /*******************************************************************************
323 : * \brief Internal routine for releasing memory back to the pool.
324 : * \author Ole Schuett
325 : ******************************************************************************/
326 294355 : void offload_mempool_device_free(const void *memory) {
327 : #if defined(__LIBXSTREAM)
328 : const int result = libxstream_mem_deallocate((void *)memory);
329 : assert(EXIT_SUCCESS == result);
330 : #else
331 294355 : internal_mempool_free(&mempool_device, memory);
332 : #endif
333 294355 : }
334 :
335 : /*******************************************************************************
336 : * \brief Internal routine for freeing all memory in the pool.
337 : * \author Ole Schuett
338 : ******************************************************************************/
339 10328 : void offload_mempool_clear(void) {
340 : #if defined(__LIBXSTREAM)
341 : (void)0;
342 : #elif defined(__LIBXS)
343 : {
344 10328 : const uint64_t size = sum_chunks_size(mempool_device.available_head,
345 : offsetof(offload_memchunk_t, size)) +
346 10328 : sum_chunks_size(mempool_device.allocated_head,
347 : offsetof(offload_memchunk_t, size));
348 10328 : if (device_stats.mempeak < size)
349 8925 : device_stats.mempeak = size;
350 : }
351 10328 : internal_mempool_clear(&mempool_device, true);
352 : #else
353 : {
354 : const uint64_t hsize = sum_chunks_size(mempool_host.available_head,
355 : offsetof(offload_memchunk_t, size)) +
356 : sum_chunks_size(mempool_host.allocated_head,
357 : offsetof(offload_memchunk_t, size));
358 : const uint64_t dsize = sum_chunks_size(mempool_device.available_head,
359 : offsetof(offload_memchunk_t, size)) +
360 : sum_chunks_size(mempool_device.allocated_head,
361 : offsetof(offload_memchunk_t, size));
362 : if (host_stats.mempeak < hsize)
363 : host_stats.mempeak = hsize;
364 : if (device_stats.mempeak < dsize)
365 : device_stats.mempeak = dsize;
366 : }
367 : internal_mempool_clear(&mempool_host, false);
368 : internal_mempool_clear(&mempool_device, true);
369 : #endif
370 10328 : }
371 :
372 : /*******************************************************************************
373 : * \brief Internal routine to query statistics.
374 : * \author Hans Pabst
375 : ******************************************************************************/
376 10444 : void offload_mempool_stats_get(offload_mempool_stats_t *memstats) {
377 10444 : assert(NULL != memstats);
378 20888 : #pragma omp critical(offload_mempool_modify)
379 : {
380 : #if defined(__LIBXSTREAM)
381 : if (NULL != libxstream_opencl_config.pool_hst) {
382 : libxs_malloc_pool_info_t info;
383 : libxs_malloc_pool_info(libxstream_opencl_config.pool_hst, &info);
384 : memstats->host_mallocs = info.nmallocs;
385 : memstats->host_used = info.used;
386 : memstats->host_size = info.size;
387 : memstats->host_peak = info.peak;
388 : } else {
389 : memstats->host_mallocs = 0;
390 : memstats->host_used = 0;
391 : memstats->host_size = 0;
392 : memstats->host_peak = 0;
393 : }
394 : if (NULL != libxstream_opencl_config.pool_dev) {
395 : libxs_malloc_pool_info_t info;
396 : libxs_malloc_pool_info(libxstream_opencl_config.pool_dev, &info);
397 : memstats->device_mallocs = info.nmallocs;
398 : memstats->device_used = info.used;
399 : memstats->device_size = info.size;
400 : memstats->device_peak = info.peak;
401 : } else {
402 : memstats->device_mallocs = 0;
403 : memstats->device_used = 0;
404 : memstats->device_size = 0;
405 : memstats->device_peak = 0;
406 : }
407 : #elif defined(__LIBXS)
408 : {
409 10444 : libxs_malloc_pool_info_t info;
410 20742 : if (NULL != libxs_default_pool() &&
411 10298 : EXIT_SUCCESS == libxs_malloc_pool_info(libxs_default_pool(), &info)) {
412 10298 : memstats->host_mallocs = info.nmallocs;
413 10298 : memstats->host_used = info.used;
414 10298 : memstats->host_size = info.size;
415 10298 : memstats->host_peak = info.peak;
416 : } else {
417 146 : memstats->host_mallocs = 0;
418 146 : memstats->host_used = 0;
419 146 : memstats->host_size = 0;
420 146 : memstats->host_peak = 0;
421 : }
422 : }
423 10444 : memstats->device_mallocs = device_stats.mallocs;
424 20888 : memstats->device_used =
425 10444 : sum_chunks_size(mempool_device.available_head,
426 10444 : offsetof(offload_memchunk_t, used)) +
427 10444 : sum_chunks_size(mempool_device.allocated_head,
428 : offsetof(offload_memchunk_t, used));
429 20888 : memstats->device_size =
430 10444 : sum_chunks_size(mempool_device.available_head,
431 10444 : offsetof(offload_memchunk_t, size)) +
432 10444 : sum_chunks_size(mempool_device.allocated_head,
433 : offsetof(offload_memchunk_t, size));
434 10444 : memstats->device_peak = memstats->device_size < device_stats.mempeak
435 : ? device_stats.mempeak
436 10444 : : memstats->device_size;
437 : #else
438 : memstats->host_mallocs = host_stats.mallocs;
439 : memstats->host_used = sum_chunks_size(mempool_host.available_head,
440 : offsetof(offload_memchunk_t, used)) +
441 : sum_chunks_size(mempool_host.allocated_head,
442 : offsetof(offload_memchunk_t, used));
443 : memstats->host_size = sum_chunks_size(mempool_host.available_head,
444 : offsetof(offload_memchunk_t, size)) +
445 : sum_chunks_size(mempool_host.allocated_head,
446 : offsetof(offload_memchunk_t, size));
447 : memstats->host_peak = memstats->host_size < host_stats.mempeak
448 : ? host_stats.mempeak
449 : : memstats->host_size;
450 : memstats->device_mallocs = device_stats.mallocs;
451 : memstats->device_used =
452 : sum_chunks_size(mempool_device.available_head,
453 : offsetof(offload_memchunk_t, used)) +
454 : sum_chunks_size(mempool_device.allocated_head,
455 : offsetof(offload_memchunk_t, used));
456 : memstats->device_size =
457 : sum_chunks_size(mempool_device.available_head,
458 : offsetof(offload_memchunk_t, size)) +
459 : sum_chunks_size(mempool_device.allocated_head,
460 : offsetof(offload_memchunk_t, size));
461 : memstats->device_peak = memstats->device_size < device_stats.mempeak
462 : ? device_stats.mempeak
463 : : memstats->device_size;
464 : #endif
465 : }
466 10444 : }
467 :
468 : /*******************************************************************************
469 : * \brief Print allocation statistics..
470 : * \author Hans Pabst
471 : ******************************************************************************/
472 10444 : void offload_mempool_stats_print(int fortran_comm,
473 : void (*print_func)(const char *, int, int),
474 : int output_unit) {
475 10444 : assert(omp_get_num_threads() == 1);
476 :
477 10444 : char buffer[100];
478 10444 : const cp_mpi_comm_t comm = cp_mpi_comm_f2c(fortran_comm);
479 10444 : offload_mempool_stats_t memstats;
480 10444 : offload_mempool_stats_get(&memstats);
481 10444 : cp_mpi_max_uint64(&memstats.device_mallocs, 1, comm);
482 10444 : cp_mpi_max_uint64(&memstats.host_mallocs, 1, comm);
483 :
484 10444 : if (0 != memstats.device_mallocs || 0 != memstats.host_mallocs) {
485 9066 : OFFLOAD_MEMPOOL_PRINT(print_func, "\n", output_unit);
486 9066 : OFFLOAD_MEMPOOL_PRINT(
487 : print_func,
488 : " ----------------------------------------------------------------"
489 : "---------------\n",
490 : output_unit);
491 9066 : OFFLOAD_MEMPOOL_PRINT(
492 : print_func,
493 : " - "
494 : " -\n",
495 : output_unit);
496 :
497 9066 : OFFLOAD_MEMPOOL_PRINT(
498 : print_func,
499 : " - OFFLOAD MEMPOOL STATISTICS "
500 : " -\n",
501 : output_unit);
502 9066 : OFFLOAD_MEMPOOL_PRINT(
503 : print_func,
504 : " - "
505 : " -\n",
506 : output_unit);
507 9066 : OFFLOAD_MEMPOOL_PRINT(
508 : print_func,
509 : " ----------------------------------------------------------------"
510 : "---------------\n",
511 : output_unit);
512 9066 : OFFLOAD_MEMPOOL_PRINT(print_func,
513 : " Memory consumption "
514 : " Number of allocations Used [MiB] Size [MiB]\n",
515 : output_unit);
516 : }
517 10444 : if (0 < memstats.device_mallocs) {
518 9052 : cp_mpi_max_uint64(&memstats.device_peak, 1, comm);
519 9052 : snprintf(buffer, sizeof(buffer),
520 : " Device "
521 : " %20" PRIuPTR " %10" PRIuPTR " %10" PRIuPTR "\n",
522 9052 : (uintptr_t)memstats.device_mallocs,
523 9052 : (uintptr_t)((memstats.device_used + (512U << 10)) >> 20),
524 9052 : (uintptr_t)((memstats.device_peak + (512U << 10)) >> 20));
525 9052 : OFFLOAD_MEMPOOL_PRINT(print_func, buffer, output_unit);
526 : }
527 10444 : if (0 < memstats.host_mallocs) {
528 8920 : cp_mpi_max_uint64(&memstats.host_peak, 1, comm);
529 8920 : snprintf(buffer, sizeof(buffer),
530 : " Host "
531 : " %20" PRIuPTR " %10" PRIuPTR " %10" PRIuPTR "\n",
532 8920 : (uintptr_t)memstats.host_mallocs,
533 8920 : (uintptr_t)((memstats.host_used + (512U << 10)) >> 20),
534 8920 : (uintptr_t)((memstats.host_peak + (512U << 10)) >> 20));
535 8920 : OFFLOAD_MEMPOOL_PRINT(print_func, buffer, output_unit);
536 : }
537 10444 : if (0 < memstats.device_mallocs || 0 < memstats.host_mallocs) {
538 9066 : OFFLOAD_MEMPOOL_PRINT(
539 : print_func,
540 : " ----------------------------------------------------------------"
541 : "---------------\n",
542 : output_unit);
543 : }
544 10444 : }
545 :
546 : // EOF
|