Line data Source code
1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 : #include "offload_mempool.h"
8 : #include "../mpiwrap/cp_mpi.h"
9 : #include "offload_library.h"
10 : #include "offload_runtime.h"
11 :
12 : #include <assert.h>
13 : #include <inttypes.h>
14 : #include <omp.h>
15 : #include <stdbool.h>
16 : #include <stdio.h>
17 : #include <stdlib.h>
18 : #include <string.h>
19 :
20 : #if defined(__parallel)
21 : #include <mpi.h>
22 : #endif
23 :
// Invokes the callback FN with MSG, the length of MSG (strlen, cast to int),
// and OUTPUT_UNIT. Note that MSG is expanded twice.
#define OFFLOAD_MEMPOOL_PRINT(FN, MSG, OUTPUT_UNIT)                            \
  ((FN)(MSG, (int)strlen(MSG), OUTPUT_UNIT))

// When non-zero, builds without __OFFLOAD use omp_alloc/omp_free if OpenMP 5.0
// or newer is available and plain malloc/free otherwise. When zero, builds
// with __parallel use MPI_Alloc_mem/MPI_Free_mem instead.
#define OFFLOAD_MEMPOOL_OMPALLOC 1
27 :
/*******************************************************************************
 * \brief Private struct describing one chunk of pooled memory.
 * \author Ole Schuett
 ******************************************************************************/
typedef struct offload_memchunk {
  void *mem; // kept as first member: allows to cast memchunk into mem-ptr
  struct offload_memchunk *next; // successor within a single-linked list
  size_t size;                   // size the buffer behind mem was allocated with
  size_t used;                   // size of the latest request served (statistics)
} offload_memchunk_t;
37 :
38 : /*******************************************************************************
39 : * \brief Private struct for storing a memory pool.
40 : * \author Ole Schuett
41 : ******************************************************************************/
42 : typedef struct offload_mempool {
43 : offload_memchunk_t *available_head, *allocated_head; // single-linked lists
44 : } offload_mempool_t;
45 :
46 : /*******************************************************************************
47 : * \brief Private pools for host and device memory.
48 : * \author Ole Schuett
49 : ******************************************************************************/
50 : static offload_mempool_t mempool_host = {0}, mempool_device = {0};
51 :
/*******************************************************************************
 * \brief Private counters for statistics: number of actual host and device
 *        allocations performed so far.
 * \author Hans Pabst
 ******************************************************************************/
static uint64_t host_malloc_counter = 0;
static uint64_t device_malloc_counter = 0;
57 :
58 : /*******************************************************************************
59 : * \brief Private routine for actually allocating system memory.
60 : * \author Ole Schuett
61 : ******************************************************************************/
62 131452 : static void *actual_malloc(const size_t size, const bool on_device) {
63 131452 : if (size == 0) {
64 : return NULL;
65 : }
66 :
67 131452 : void *memory = NULL;
68 :
69 : #if defined(__OFFLOAD)
70 : if (on_device) {
71 : offload_activate_chosen_device();
72 : offloadMalloc(&memory, size);
73 : } else {
74 : offload_activate_chosen_device();
75 : offloadMallocHost(&memory, size);
76 : }
77 : #elif OFFLOAD_MEMPOOL_OMPALLOC && (201811 /*v5.0*/ <= _OPENMP)
78 : memory = omp_alloc(size, omp_null_allocator);
79 : #elif defined(__parallel) && !OFFLOAD_MEMPOOL_OMPALLOC
80 : if (MPI_SUCCESS != MPI_Alloc_mem((MPI_Aint)size, MPI_INFO_NULL, &memory)) {
81 : fprintf(stderr, "ERROR: MPI_Alloc_mem failed at %s:%i\n", name, __FILE__,
82 : __LINE__);
83 : MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
84 : }
85 : #else
86 131452 : memory = malloc(size);
87 : #endif
88 :
89 : // Update statistics.
90 131452 : if (on_device) {
91 44109 : #pragma omp atomic
92 : ++device_malloc_counter;
93 : } else {
94 87343 : #pragma omp atomic
95 : ++host_malloc_counter;
96 : }
97 :
98 131452 : assert(memory != NULL);
99 : return memory;
100 : }
101 :
/*******************************************************************************
 * \brief Private routine for actually freeing system memory.
 *        NULL is accepted and ignored. The release call mirrors the backend
 *        that actual_malloc selects at compile time.
 * \param memory Buffer to release, may be NULL.
 * \param on_device Selects the device or host release call when built with
 *        __OFFLOAD; unused by the other backends.
 * \author Ole Schuett
 ******************************************************************************/
static void actual_free(void *memory, const bool on_device) {
  if (NULL == memory) {
    return;
  }

#if defined(__OFFLOAD)
  // The chosen device is activated before either kind of release.
  offload_activate_chosen_device();
  if (on_device) {
    offloadFree(memory);
  } else {
    offloadFreeHost(memory);
  }
#elif OFFLOAD_MEMPOOL_OMPALLOC && (201811 /*v5.0*/ <= _OPENMP)
  (void)on_device; // mark used
  omp_free(memory, omp_null_allocator);
#elif defined(__parallel) && !OFFLOAD_MEMPOOL_OMPALLOC
  (void)on_device; // mark used
  if (MPI_SUCCESS != MPI_Free_mem(memory)) {
    // The format takes exactly two arguments: file name and line number.
    fprintf(stderr, "ERROR: MPI_Free_mem failed at %s:%i\n", __FILE__,
            __LINE__);
    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
  }
#else
  (void)on_device; // mark used
  free(memory);
#endif
}
134 :
135 : /*******************************************************************************
136 : * \brief Private routine for allocating host or device memory from the pool.
137 : * \author Ole Schuett and Hans Pabst
138 : ******************************************************************************/
139 3679162 : static void *internal_mempool_malloc(offload_mempool_t *pool, const size_t size,
140 : const bool on_device) {
141 3679162 : if (size == 0) {
142 : return NULL;
143 : }
144 :
145 3624296 : offload_memchunk_t *chunk;
146 :
147 7248592 : #pragma omp critical(offload_mempool_modify)
148 : {
149 : // Find a possible chunk to reuse or reclaim in available list.
150 3624296 : offload_memchunk_t **reuse = NULL,
151 3624296 : **reclaim = NULL; // ** for easy list removal
152 3624296 : offload_memchunk_t **indirect = &pool->available_head;
153 68737362 : while (*indirect != NULL) {
154 66048949 : const size_t s = (*indirect)->size;
155 66048949 : if (size <= s && (reuse == NULL || s < (*reuse)->size)) {
156 5245463 : reuse = indirect; // reuse smallest suitable chunk
157 5245463 : if (s == size) {
158 : break; // perfect match, exit early
159 : }
160 60803486 : } else if (reclaim == NULL || (*reclaim)->size < s) {
161 6564603 : reclaim = indirect; // reclaim largest unsuitable chunk
162 : }
163 65113066 : indirect = &(*indirect)->next;
164 : }
165 :
166 : // Select an existing chunk or allocate a new one.
167 3624296 : if (reuse != NULL) {
168 : // Reusing an exising chunk that's already large enough.
169 3492844 : chunk = *reuse;
170 3492844 : *reuse = chunk->next; // remove chunk from available list.
171 131452 : } else if (reclaim != NULL) {
172 : // Reclaiming an existing chunk (resize will happen outside crit. region).
173 22340 : chunk = *reclaim;
174 22340 : *reclaim = chunk->next; // remove chunk from available list.
175 : } else {
176 : // Found no available chunk, allocate a new one.
177 109112 : chunk = calloc(1, sizeof(offload_memchunk_t));
178 109112 : assert(chunk != NULL);
179 : }
180 : }
181 :
182 : // Resize chunk outside of critical region before adding it to allocated list.
183 3624296 : if (chunk->size < size) {
184 131452 : actual_free(chunk->mem, on_device);
185 131452 : chunk->mem = actual_malloc(size, on_device);
186 131452 : chunk->size = size;
187 : }
188 :
189 3624296 : chunk->used = size; // for statistics
190 :
191 : // Insert chunk into allocated list.
192 3624296 : #pragma omp critical(offload_mempool_modify)
193 : {
194 3624296 : chunk->next = pool->allocated_head;
195 3624296 : pool->allocated_head = chunk;
196 : }
197 :
198 3624296 : return chunk->mem;
199 : }
200 :
201 : /*******************************************************************************
202 : * \brief Internal routine for allocating host memory from the pool.
203 : * \author Ole Schuett
204 : ******************************************************************************/
205 3404220 : void *offload_mempool_host_malloc(const size_t size) {
206 3404220 : return internal_mempool_malloc(&mempool_host, size, false);
207 : }
208 :
209 : /*******************************************************************************
210 : * \brief Internal routine for allocating device memory from the pool
211 : * \author Ole Schuett
212 : ******************************************************************************/
213 274942 : void *offload_mempool_device_malloc(const size_t size) {
214 274942 : return internal_mempool_malloc(&mempool_device, size, true);
215 : }
216 :
217 : /*******************************************************************************
218 : * \brief Private routine for releasing memory back to the pool.
219 : * \author Ole Schuett
220 : ******************************************************************************/
221 4296337 : static void internal_mempool_free(offload_mempool_t *pool, const void *mem) {
222 4296337 : if (mem == NULL) {
223 : return;
224 : }
225 :
226 7248592 : #pragma omp critical(offload_mempool_modify)
227 : {
228 : // Find chunk in allocated list.
229 3624296 : offload_memchunk_t **indirect = &pool->allocated_head;
230 13413686 : while (*indirect != NULL && (*indirect)->mem != mem) {
231 9789390 : indirect = &(*indirect)->next;
232 : }
233 3624296 : offload_memchunk_t *chunk = *indirect;
234 3624296 : assert(chunk != NULL && chunk->mem == mem);
235 :
236 : // Remove chunk from allocated list.
237 3624296 : *indirect = chunk->next;
238 :
239 : // Add chunk to available list.
240 3624296 : chunk->next = pool->available_head;
241 3624296 : pool->available_head = chunk;
242 : }
243 : }
244 :
245 : /*******************************************************************************
246 : * \brief Internal routine for releasing memory back to the pool.
247 : * \author Ole Schuett
248 : ******************************************************************************/
249 4021395 : void offload_mempool_host_free(const void *memory) {
250 4021395 : internal_mempool_free(&mempool_host, memory);
251 4021395 : }
252 :
253 : /*******************************************************************************
254 : * \brief Internal routine for releasing memory back to the pool.
255 : * \author Ole Schuett
256 : ******************************************************************************/
257 274942 : void offload_mempool_device_free(const void *memory) {
258 274942 : internal_mempool_free(&mempool_device, memory);
259 274942 : }
260 :
261 : /*******************************************************************************
262 : * \brief Private routine for freeing all memory in the pool.
263 : * \author Ole Schuett
264 : ******************************************************************************/
265 18600 : static void internal_mempool_clear(offload_mempool_t *pool,
266 : const bool on_device) {
267 37200 : #pragma omp critical(offload_mempool_modify)
268 : {
269 : // Check for leaks, i.e. that the allocated list is empty.
270 18600 : assert(pool->allocated_head == NULL);
271 :
272 : // Free all chunks in available list.
273 127696 : while (pool->available_head != NULL) {
274 109096 : offload_memchunk_t *chunk = pool->available_head;
275 109096 : pool->available_head = chunk->next; // remove chunk
276 109096 : actual_free(chunk->mem, on_device);
277 109096 : free(chunk);
278 : }
279 : }
280 18600 : }
281 :
282 : /*******************************************************************************
283 : * \brief Internal routine for freeing all memory in the pool.
284 : * \author Ole Schuett
285 : ******************************************************************************/
286 9300 : void offload_mempool_clear(void) {
287 9300 : internal_mempool_clear(&mempool_host, false);
288 9300 : internal_mempool_clear(&mempool_device, true);
289 9300 : }
290 :
291 : /*******************************************************************************
292 : * \brief Private routine for summing alloc sizes of all chunks in given list.
293 : * \author Ole Schuett
294 : ******************************************************************************/
295 37664 : static uint64_t sum_chunks_size(const offload_memchunk_t *head) {
296 37664 : uint64_t size_sum = 0;
297 147620 : for (const offload_memchunk_t *chunk = head; chunk != NULL;
298 109956 : chunk = chunk->next) {
299 109956 : size_sum += chunk->size;
300 : }
301 37664 : return size_sum;
302 : }
303 :
304 : /*******************************************************************************
305 : * \brief Private routine for summing used sizes of all chunks in given list.
306 : * \author Ole Schuett
307 : ******************************************************************************/
308 37664 : static uint64_t sum_chunks_used(const offload_memchunk_t *head) {
309 37664 : uint64_t used_sum = 0;
310 147620 : for (const offload_memchunk_t *chunk = head; chunk != NULL;
311 109956 : chunk = chunk->next) {
312 109956 : used_sum += chunk->used;
313 : }
314 37664 : return used_sum;
315 : }
316 :
317 : /*******************************************************************************
318 : * \brief Internal routine to query statistics.
319 : * \author Hans Pabst
320 : ******************************************************************************/
321 9416 : void offload_mempool_stats_get(offload_mempool_stats_t *memstats) {
322 9416 : assert(NULL != memstats);
323 18832 : #pragma omp critical(offload_mempool_modify)
324 : {
325 9416 : memstats->host_mallocs = host_malloc_counter;
326 9416 : memstats->host_used = sum_chunks_used(mempool_host.available_head) +
327 9416 : sum_chunks_used(mempool_host.allocated_head);
328 9416 : memstats->host_size = sum_chunks_size(mempool_host.available_head) +
329 9416 : sum_chunks_size(mempool_host.allocated_head);
330 :
331 9416 : memstats->device_mallocs = device_malloc_counter;
332 9416 : memstats->device_used = sum_chunks_used(mempool_device.available_head) +
333 9416 : sum_chunks_used(mempool_device.allocated_head);
334 9416 : memstats->device_size = sum_chunks_size(mempool_device.available_head) +
335 9416 : sum_chunks_size(mempool_device.allocated_head);
336 : }
337 9416 : }
338 :
339 : /*******************************************************************************
340 : * \brief Print allocation statistics..
341 : * \author Hans Pabst
342 : ******************************************************************************/
343 9416 : void offload_mempool_stats_print(int fortran_comm,
344 : void (*print_func)(const char *, int, int),
345 : int output_unit) {
346 9416 : assert(omp_get_num_threads() == 1);
347 :
348 9416 : char buffer[100];
349 9416 : const cp_mpi_comm_t comm = cp_mpi_comm_f2c(fortran_comm);
350 9416 : offload_mempool_stats_t memstats;
351 9416 : offload_mempool_stats_get(&memstats);
352 9416 : cp_mpi_max_uint64(&memstats.device_mallocs, 1, comm);
353 9416 : cp_mpi_max_uint64(&memstats.host_mallocs, 1, comm);
354 :
355 9416 : if (0 != memstats.device_mallocs || 0 != memstats.host_mallocs) {
356 8042 : OFFLOAD_MEMPOOL_PRINT(print_func, "\n", output_unit);
357 8042 : OFFLOAD_MEMPOOL_PRINT(
358 : print_func,
359 : " ----------------------------------------------------------------"
360 : "---------------\n",
361 : output_unit);
362 8042 : OFFLOAD_MEMPOOL_PRINT(
363 : print_func,
364 : " - "
365 : " -\n",
366 : output_unit);
367 :
368 8042 : OFFLOAD_MEMPOOL_PRINT(
369 : print_func,
370 : " - OFFLOAD MEMPOOL STATISTICS "
371 : " -\n",
372 : output_unit);
373 8042 : OFFLOAD_MEMPOOL_PRINT(
374 : print_func,
375 : " - "
376 : " -\n",
377 : output_unit);
378 8042 : OFFLOAD_MEMPOOL_PRINT(
379 : print_func,
380 : " ----------------------------------------------------------------"
381 : "---------------\n",
382 : output_unit);
383 8042 : OFFLOAD_MEMPOOL_PRINT(print_func,
384 : " Memory consumption "
385 : " Number of allocations Used [MiB] Size [MiB]\n",
386 : output_unit);
387 : }
388 9416 : if (0 < memstats.device_mallocs) {
389 8028 : cp_mpi_max_uint64(&memstats.device_size, 1, comm);
390 8028 : snprintf(buffer, sizeof(buffer),
391 : " Device "
392 : " %20" PRIuPTR " %10" PRIuPTR " %10" PRIuPTR "\n",
393 8028 : (uintptr_t)memstats.device_mallocs,
394 8028 : (uintptr_t)((memstats.device_used + (512U << 10)) >> 20),
395 8028 : (uintptr_t)((memstats.device_size + (512U << 10)) >> 20));
396 8028 : OFFLOAD_MEMPOOL_PRINT(print_func, buffer, output_unit);
397 : }
398 9416 : if (0 < memstats.host_mallocs) {
399 8042 : cp_mpi_max_uint64(&memstats.host_size, 1, comm);
400 8042 : snprintf(buffer, sizeof(buffer),
401 : " Host "
402 : " %20" PRIuPTR " %10" PRIuPTR " %10" PRIuPTR "\n",
403 8042 : (uintptr_t)memstats.host_mallocs,
404 8042 : (uintptr_t)((memstats.host_used + (512U << 10)) >> 20),
405 8042 : (uintptr_t)((memstats.host_size + (512U << 10)) >> 20));
406 8042 : OFFLOAD_MEMPOOL_PRINT(print_func, buffer, output_unit);
407 : }
408 9416 : if (0 < memstats.device_mallocs || 0 < memstats.host_mallocs) {
409 8042 : OFFLOAD_MEMPOOL_PRINT(
410 : print_func,
411 : " ----------------------------------------------------------------"
412 : "---------------\n",
413 : output_unit);
414 : }
415 9416 : }
416 :
417 : // EOF
|