Line data Source code
1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 : #include "offload_mempool.h"
8 : #include "../mpiwrap/cp_mpi.h"
9 : #include "offload_library.h"
10 : #include "offload_runtime.h"
11 :
12 : #include <assert.h>
13 : #include <inttypes.h>
14 : #include <omp.h>
15 : #include <stdbool.h>
16 : #include <stdio.h>
17 : #include <stdlib.h>
18 : #include <string.h>
19 :
20 : #if defined(__parallel)
21 : #include <mpi.h>
22 : #endif
23 :
// Calls FN with the message, its length as int (via strlen), and the output
// unit. Note that MSG is expanded twice, so it must be free of side effects.
#define OFFLOAD_MEMPOOL_PRINT(FN, MSG, OUTPUT_UNIT)                            \
  ((FN)((MSG), (int)strlen((MSG)), (OUTPUT_UNIT)))
// When non-zero, omp_alloc/omp_free are used for host memory if the compiler
// reports OpenMP 5.0 or newer (see actual_malloc/actual_free).
#define OFFLOAD_MEMPOOL_OMPALLOC 1
27 :
/*******************************************************************************
 * \brief Private struct describing one chunk of pooled memory.
 *        Chunks are kept in singly-linked lists owned by a pool.
 * \author Ole Schuett
 ******************************************************************************/
typedef struct offload_memchunk {
  void *mem; // kept as first member so a chunk can be cast to its mem-ptr
  struct offload_memchunk *next; // next chunk in the same list, or NULL
  size_t size;                   // capacity of mem in bytes
  size_t used;                   // size of the latest request served by mem
} offload_memchunk_t;
37 :
38 : /*******************************************************************************
39 : * \brief Private struct for storing a memory pool.
40 : * \author Ole Schuett
41 : ******************************************************************************/
42 : typedef struct offload_mempool {
43 : offload_memchunk_t *available_head, *allocated_head; // single-linked lists
44 : uint64_t peak_size; // for statistics
45 : } offload_mempool_t;
46 :
47 : /*******************************************************************************
48 : * \brief Private pools for host and device memory.
49 : * \author Ole Schuett
50 : ******************************************************************************/
51 : static offload_mempool_t mempool_host = {0}, mempool_device = {0};
52 :
/*******************************************************************************
 * \brief Private counters for statistics: number of actual (non-pooled)
 *        allocations performed for host and device memory.
 * \author Hans Pabst
 ******************************************************************************/
static uint64_t host_malloc_counter = 0;
static uint64_t device_malloc_counter = 0;
58 :
/*******************************************************************************
 * \brief Returns the larger of two given integers (missing from the C
 *        standard). For equal arguments that common value is returned.
 * \author Ole Schuett
 ******************************************************************************/
static inline uint64_t imax(uint64_t x, uint64_t y) {
  if (x > y) {
    return x;
  }
  return y;
}
64 :
65 : /*******************************************************************************
66 : * \brief Private routine for actually allocating system memory.
67 : * \author Ole Schuett
68 : ******************************************************************************/
69 148138 : static void *actual_malloc(const size_t size, const bool on_device) {
70 148138 : if (size == 0) {
71 : return NULL;
72 : }
73 :
74 148138 : void *memory = NULL;
75 :
76 : #if defined(__OFFLOAD)
77 : if (on_device) {
78 : offload_activate_chosen_device();
79 : offloadMalloc(&memory, size);
80 : } else {
81 : offload_activate_chosen_device();
82 : offloadMallocHost(&memory, size);
83 : }
84 : #elif OFFLOAD_MEMPOOL_OMPALLOC && (201811 /*v5.0*/ <= _OPENMP)
85 : memory = omp_alloc(size, omp_null_allocator);
86 : #elif defined(__parallel) && !OFFLOAD_MEMPOOL_OMPALLOC
87 : if (MPI_SUCCESS != MPI_Alloc_mem((MPI_Aint)size, MPI_INFO_NULL, &memory)) {
88 : fprintf(stderr, "ERROR: MPI_Alloc_mem failed at %s:%i\n", name, __FILE__,
89 : __LINE__);
90 : MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
91 : }
92 : #else
93 148138 : memory = malloc(size);
94 : #endif
95 :
96 : // Update statistics.
97 148138 : if (on_device) {
98 50481 : #pragma omp atomic
99 : ++device_malloc_counter;
100 : } else {
101 97657 : #pragma omp atomic
102 : ++host_malloc_counter;
103 : }
104 :
105 148138 : assert(memory != NULL);
106 : return memory;
107 : }
108 :
/*******************************************************************************
 * \brief Private routine for actually freeing system memory.
 *        Passing NULL is a no-op. The backend mirrors actual_malloc and is
 *        selected at compile time (offload runtime, omp_free, MPI_Free_mem,
 *        or free).
 * \param memory    Pointer previously obtained from actual_malloc, or NULL.
 * \param on_device Selects offloadFree vs. offloadFreeHost in offload builds;
 *                  unused by the other backends.
 * \author Ole Schuett
 ******************************************************************************/
static void actual_free(void *memory, const bool on_device) {
  if (NULL == memory) {
    return;
  }

#if defined(__OFFLOAD)
  // Both kinds of release go through the offload runtime, hence the chosen
  // device gets activated in either case.
  offload_activate_chosen_device();
  if (on_device) {
    offloadFree(memory);
  } else {
    offloadFreeHost(memory);
  }
#elif OFFLOAD_MEMPOOL_OMPALLOC && (201811 /*v5.0*/ <= _OPENMP)
  (void)on_device; // mark used
  omp_free(memory, omp_null_allocator);
#elif defined(__parallel) && !OFFLOAD_MEMPOOL_OMPALLOC
  (void)on_device; // mark used
  if (MPI_SUCCESS != MPI_Free_mem(memory)) {
    // The format takes exactly two arguments: file name and line number.
    fprintf(stderr, "ERROR: MPI_Free_mem failed at %s:%i\n", __FILE__,
            __LINE__);
    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
  }
#else
  (void)on_device; // mark used
  free(memory);
#endif
}
141 :
142 : /*******************************************************************************
143 : * \brief Private routine for allocating host or device memory from the pool.
144 : * \author Ole Schuett and Hans Pabst
145 : ******************************************************************************/
146 4131371 : static void *internal_mempool_malloc(offload_mempool_t *pool, const size_t size,
147 : const bool on_device) {
148 4131371 : if (size == 0) {
149 : return NULL;
150 : }
151 :
152 4077383 : offload_memchunk_t *chunk;
153 :
154 8154766 : #pragma omp critical(offload_mempool_modify)
155 : {
156 : // Find a possible chunk to reuse or reclaim in available list.
157 4077383 : offload_memchunk_t **reuse = NULL,
158 4077383 : **reclaim = NULL; // ** for easy list removal
159 4077383 : offload_memchunk_t **indirect = &pool->available_head;
160 83196731 : while (*indirect != NULL) {
161 80162359 : const size_t s = (*indirect)->size;
162 80162359 : if (size <= s && (reuse == NULL || s < (*reuse)->size)) {
163 6020028 : reuse = indirect; // reuse smallest suitable chunk
164 6020028 : if (s == size) {
165 : break; // perfect match, exit early
166 : }
167 74142331 : } else if (reclaim == NULL || (*reclaim)->size < s) {
168 7538893 : reclaim = indirect; // reclaim largest unsuitable chunk
169 : }
170 79119348 : indirect = &(*indirect)->next;
171 : }
172 :
173 : // Select an existing chunk or allocate a new one.
174 4077383 : if (reuse != NULL) {
175 : // Reusing an exising chunk that's already large enough.
176 3929245 : chunk = *reuse;
177 3929245 : *reuse = chunk->next; // remove chunk from available list.
178 148138 : } else if (reclaim != NULL) {
179 : // Reclaiming an existing chunk (resize will happen outside crit. region).
180 24395 : chunk = *reclaim;
181 24395 : *reclaim = chunk->next; // remove chunk from available list.
182 : } else {
183 : // Found no available chunk, allocate a new one.
184 123743 : chunk = calloc(1, sizeof(offload_memchunk_t));
185 123743 : assert(chunk != NULL);
186 : }
187 : }
188 :
189 : // Resize chunk outside of critical region before adding it to allocated list.
190 4077383 : if (chunk->size < size) {
191 148138 : actual_free(chunk->mem, on_device);
192 148138 : chunk->mem = actual_malloc(size, on_device);
193 148138 : chunk->size = size;
194 : }
195 :
196 4077383 : chunk->used = size; // for statistics
197 :
198 : // Insert chunk into allocated list.
199 4077383 : #pragma omp critical(offload_mempool_modify)
200 : {
201 4077383 : chunk->next = pool->allocated_head;
202 4077383 : pool->allocated_head = chunk;
203 : }
204 :
205 4077383 : return chunk->mem;
206 : }
207 :
208 : /*******************************************************************************
209 : * \brief Internal routine for allocating host memory from the pool.
210 : * \author Ole Schuett
211 : ******************************************************************************/
212 3836360 : void *offload_mempool_host_malloc(const size_t size) {
213 3836360 : return internal_mempool_malloc(&mempool_host, size, false);
214 : }
215 :
216 : /*******************************************************************************
217 : * \brief Internal routine for allocating device memory from the pool
218 : * \author Ole Schuett
219 : ******************************************************************************/
220 295011 : void *offload_mempool_device_malloc(const size_t size) {
221 295011 : return internal_mempool_malloc(&mempool_device, size, true);
222 : }
223 :
224 : /*******************************************************************************
225 : * \brief Private routine for releasing memory back to the pool.
226 : * \author Ole Schuett
227 : ******************************************************************************/
228 4837938 : static void internal_mempool_free(offload_mempool_t *pool, const void *mem) {
229 4837938 : if (mem == NULL) {
230 : return;
231 : }
232 :
233 8154766 : #pragma omp critical(offload_mempool_modify)
234 : {
235 : // Find chunk in allocated list.
236 4077383 : offload_memchunk_t **indirect = &pool->allocated_head;
237 15484964 : while (*indirect != NULL && (*indirect)->mem != mem) {
238 11407581 : indirect = &(*indirect)->next;
239 : }
240 4077383 : offload_memchunk_t *chunk = *indirect;
241 4077383 : assert(chunk != NULL && chunk->mem == mem);
242 :
243 : // Remove chunk from allocated list.
244 4077383 : *indirect = chunk->next;
245 :
246 : // Add chunk to available list.
247 4077383 : chunk->next = pool->available_head;
248 4077383 : pool->available_head = chunk;
249 : }
250 : }
251 :
252 : /*******************************************************************************
253 : * \brief Internal routine for releasing memory back to the pool.
254 : * \author Ole Schuett
255 : ******************************************************************************/
256 4542927 : void offload_mempool_host_free(const void *memory) {
257 4542927 : internal_mempool_free(&mempool_host, memory);
258 4542927 : }
259 :
260 : /*******************************************************************************
261 : * \brief Internal routine for releasing memory back to the pool.
262 : * \author Ole Schuett
263 : ******************************************************************************/
264 295011 : void offload_mempool_device_free(const void *memory) {
265 295011 : internal_mempool_free(&mempool_device, memory);
266 295011 : }
267 :
268 : /*******************************************************************************
269 : * \brief Private routine for freeing all memory in the pool.
270 : * \author Ole Schuett
271 : ******************************************************************************/
272 20788 : static void internal_mempool_clear(offload_mempool_t *pool,
273 : const bool on_device) {
274 :
275 41576 : #pragma omp critical(offload_mempool_modify)
276 : {
277 20788 : uint64_t pool_size = 0;
278 :
279 : // Check for leaks, i.e. that the allocated list is empty.
280 20788 : assert(pool->allocated_head == NULL);
281 :
282 : // Free all chunks in available list.
283 144515 : while (pool->available_head != NULL) {
284 123727 : offload_memchunk_t *chunk = pool->available_head;
285 123727 : pool->available_head = chunk->next; // remove chunk
286 123727 : actual_free(chunk->mem, on_device);
287 123727 : pool_size += chunk->size;
288 123727 : free(chunk);
289 : }
290 :
291 : // Update stats.
292 20788 : pool->peak_size = imax(pool->peak_size, pool_size);
293 : }
294 20788 : }
295 :
296 : /*******************************************************************************
297 : * \brief Internal routine for freeing all memory in the pool.
298 : * \author Ole Schuett and Hans Pabst
299 : ******************************************************************************/
300 10394 : void offload_mempool_clear(void) {
301 10394 : internal_mempool_clear(&mempool_host, false);
302 10394 : internal_mempool_clear(&mempool_device, true);
303 10394 : }
304 :
305 : /*******************************************************************************
306 : * \brief Private routine for summing alloc sizes of all chunks in given list.
307 : * \author Ole Schuett
308 : ******************************************************************************/
309 42040 : static uint64_t sum_chunks_size(const offload_memchunk_t *head) {
310 42040 : uint64_t size_sum = 0;
311 166627 : for (const offload_memchunk_t *chunk = head; chunk != NULL;
312 124587 : chunk = chunk->next) {
313 124587 : size_sum += chunk->size;
314 : }
315 42040 : return size_sum;
316 : }
317 :
318 : /*******************************************************************************
319 : * \brief Private routine for summing used sizes of all chunks in given list.
320 : * \author Ole Schuett
321 : ******************************************************************************/
322 42040 : static uint64_t sum_chunks_used(const offload_memchunk_t *head) {
323 42040 : uint64_t used_sum = 0;
324 166627 : for (const offload_memchunk_t *chunk = head; chunk != NULL;
325 124587 : chunk = chunk->next) {
326 124587 : used_sum += chunk->used;
327 : }
328 42040 : return used_sum;
329 : }
330 :
331 : /*******************************************************************************
332 : * \brief Internal routine to query statistics.
333 : * \author Hans Pabst
334 : ******************************************************************************/
335 10510 : void offload_mempool_stats_get(offload_mempool_stats_t *memstats) {
336 10510 : assert(NULL != memstats);
337 21020 : #pragma omp critical(offload_mempool_modify)
338 : {
339 10510 : memstats->host_mallocs = host_malloc_counter;
340 10510 : memstats->host_used = sum_chunks_used(mempool_host.available_head) +
341 10510 : sum_chunks_used(mempool_host.allocated_head);
342 10510 : memstats->host_size = sum_chunks_size(mempool_host.available_head) +
343 10510 : sum_chunks_size(mempool_host.allocated_head);
344 10510 : memstats->host_peak = imax(mempool_host.peak_size, memstats->device_size);
345 :
346 10510 : memstats->device_mallocs = device_malloc_counter;
347 10510 : memstats->device_used = sum_chunks_used(mempool_device.available_head) +
348 10510 : sum_chunks_used(mempool_device.allocated_head);
349 10510 : memstats->device_size = sum_chunks_size(mempool_device.available_head) +
350 10510 : sum_chunks_size(mempool_device.allocated_head);
351 10510 : memstats->device_peak =
352 10510 : imax(mempool_device.peak_size, memstats->device_size);
353 : }
354 10510 : }
355 :
356 : /*******************************************************************************
357 : * \brief Print allocation statistics..
358 : * \author Hans Pabst
359 : ******************************************************************************/
360 10510 : void offload_mempool_stats_print(int fortran_comm,
361 : void (*print_func)(const char *, int, int),
362 : int output_unit) {
363 10510 : assert(omp_get_num_threads() == 1);
364 :
365 10510 : char buffer[100];
366 10510 : const cp_mpi_comm_t comm = cp_mpi_comm_f2c(fortran_comm);
367 10510 : offload_mempool_stats_t memstats;
368 10510 : offload_mempool_stats_get(&memstats);
369 10510 : cp_mpi_max_uint64(&memstats.device_mallocs, 1, comm);
370 10510 : cp_mpi_max_uint64(&memstats.host_mallocs, 1, comm);
371 :
372 10510 : if (0 != memstats.device_mallocs || 0 != memstats.host_mallocs) {
373 9132 : OFFLOAD_MEMPOOL_PRINT(print_func, "\n", output_unit);
374 9132 : OFFLOAD_MEMPOOL_PRINT(
375 : print_func,
376 : " ----------------------------------------------------------------"
377 : "---------------\n",
378 : output_unit);
379 9132 : OFFLOAD_MEMPOOL_PRINT(
380 : print_func,
381 : " - "
382 : " -\n",
383 : output_unit);
384 :
385 9132 : OFFLOAD_MEMPOOL_PRINT(
386 : print_func,
387 : " - OFFLOAD MEMPOOL STATISTICS "
388 : " -\n",
389 : output_unit);
390 9132 : OFFLOAD_MEMPOOL_PRINT(
391 : print_func,
392 : " - "
393 : " -\n",
394 : output_unit);
395 9132 : OFFLOAD_MEMPOOL_PRINT(
396 : print_func,
397 : " ----------------------------------------------------------------"
398 : "---------------\n",
399 : output_unit);
400 9132 : OFFLOAD_MEMPOOL_PRINT(print_func,
401 : " Memory consumption "
402 : " Number of allocations Used [MiB] Size [MiB]\n",
403 : output_unit);
404 : }
405 : #if defined(__OFFLOAD)
406 : if (0 < memstats.device_mallocs) {
407 : cp_mpi_max_uint64(&memstats.device_peak, 1, comm);
408 : snprintf(buffer, sizeof(buffer),
409 : " Device "
410 : " %20" PRIuPTR " %10" PRIuPTR " %10" PRIuPTR "\n",
411 : (uintptr_t)memstats.device_mallocs,
412 : (uintptr_t)((memstats.device_used + (512U << 10)) >> 20),
413 : (uintptr_t)((memstats.device_peak + (512U << 10)) >> 20));
414 : OFFLOAD_MEMPOOL_PRINT(print_func, buffer, output_unit);
415 : }
416 : #endif
417 10510 : if (0 < memstats.host_mallocs) {
418 9132 : cp_mpi_max_uint64(&memstats.host_peak, 1, comm);
419 9132 : snprintf(buffer, sizeof(buffer),
420 : " Host "
421 : " %20" PRIuPTR " %10" PRIuPTR " %10" PRIuPTR "\n",
422 9132 : (uintptr_t)memstats.host_mallocs,
423 9132 : (uintptr_t)((memstats.host_used + (512U << 10)) >> 20),
424 9132 : (uintptr_t)((memstats.host_peak + (512U << 10)) >> 20));
425 9132 : OFFLOAD_MEMPOOL_PRINT(print_func, buffer, output_unit);
426 : }
427 10510 : if (0 < memstats.device_mallocs || 0 < memstats.host_mallocs) {
428 9132 : OFFLOAD_MEMPOOL_PRINT(
429 : print_func,
430 : " ----------------------------------------------------------------"
431 : "---------------\n",
432 : output_unit);
433 : }
434 10510 : }
435 :
436 : // EOF
|