LCOV - code coverage report
Current view: top level - src/offload - offload_mempool.c (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:c24029e) Lines: 100.0 % 137 137
Test Date: 2026-07-04 06:36:57 Functions: 100.0 % 14 14

            Line data    Source code
       1              : /*----------------------------------------------------------------------------*/
       2              : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3              : /*  Copyright 2000-2026 CP2K developers group <https://cp2k.org>              */
       4              : /*                                                                            */
       5              : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6              : /*----------------------------------------------------------------------------*/
       7              : #include "offload_mempool.h"
       8              : #include "../mpiwrap/cp_mpi.h"
       9              : #include "offload_library.h"
      10              : #include "offload_runtime.h"
      11              : 
      12              : #include <assert.h>
      13              : #include <inttypes.h>
      14              : #include <omp.h>
      15              : #include <stdbool.h>
      16              : #include <stdio.h>
      17              : #include <stdlib.h>
      18              : #include <string.h>
      19              : 
      20              : #if defined(__parallel)
      21              : #include <mpi.h>
      22              : #endif
      23              : 
      24              : #define OFFLOAD_MEMPOOL_PRINT(FN, MSG, OUTPUT_UNIT)                            \
      25              :   ((FN)(MSG, (int)strlen(MSG), OUTPUT_UNIT)) // calls FN(msg, length, unit); MSG is evaluated twice
      26              : #define OFFLOAD_MEMPOOL_OMPALLOC 1 // nonzero: use omp_alloc/omp_free when built without __OFFLOAD and with OpenMP >= 5.0
      27              : 
      28              : /*******************************************************************************
      29              :  * \brief Private struct for storing a chunk of memory.
      30              :  * \author Ole Schuett
      31              :  ******************************************************************************/
      32              : typedef struct offload_memchunk {
      33              :   void *mem; // first member: allows to cast a memchunk pointer into a mem-ptr
      34              :   struct offload_memchunk *next; // next chunk of the list this chunk is currently in
      35              :   size_t size, used; // size: bytes allocated for mem; used: bytes requested by the latest allocation
      36              : } offload_memchunk_t;
      37              : 
      38              : /*******************************************************************************
      39              :  * \brief Private struct for storing a memory pool.
      40              :  * \author Ole Schuett
      41              :  ******************************************************************************/
      42              : typedef struct offload_mempool {
      43              :   offload_memchunk_t *available_head, *allocated_head; // singly-linked lists: released chunks / chunks handed out
      44              :   uint64_t peak_size;                                  // for statistics; updated when the pool is cleared
      45              : } offload_mempool_t;
      46              : 
      47              : /*******************************************************************************
      48              :  * \brief Private pools for host and device memory.
      49              :  * \author Ole Schuett
      50              :  ******************************************************************************/
      51              : static offload_mempool_t mempool_host = {0}, mempool_device = {0}; // zero-initialized: empty lists, peak_size 0
      52              : 
      53              : /*******************************************************************************
      54              :  * \brief Private counters for statistics.
      55              :  * \author Hans Pabst
      56              :  ******************************************************************************/
      57              : static uint64_t host_malloc_counter = 0, device_malloc_counter = 0; // incremented atomically in actual_malloc()
      58              : 
      59              : /*******************************************************************************
      60              :  * \brief Returns the larger of two given integers (missing from the C standard)
      61              :  * \author Ole Schuett
      62              :  ******************************************************************************/
      63        41808 : static inline uint64_t imax(uint64_t x, uint64_t y) { return (x > y ? x : y); } // returns y when x == y; used for peak-size statistics
      64              : 
      65              : /*******************************************************************************
      66              :  * \brief Private routine for actually allocating system memory.
      67              :  * \author Ole Schuett
      68              :  ******************************************************************************/
      69       148138 : static void *actual_malloc(const size_t size, const bool on_device) { // size in bytes; on_device picks device vs. host memory
      70       148138 :   if (size == 0) {
      71              :     return NULL; // nothing to allocate; the malloc counters are left untouched
      72              :   }
      73              : 
      74       148138 :   void *memory = NULL;
      75              : 
      76              : #if defined(__OFFLOAD)
      77              :   if (on_device) {
      78              :     offload_activate_chosen_device();
      79              :     offloadMalloc(&memory, size);
      80              :   } else {
      81              :     offload_activate_chosen_device();
      82              :     offloadMallocHost(&memory, size);
      83              :   }
      84              : #elif OFFLOAD_MEMPOOL_OMPALLOC && (201811 /*v5.0*/ <= _OPENMP)
      85              :   memory = omp_alloc(size, omp_null_allocator);
      86              : #elif defined(__parallel) && !OFFLOAD_MEMPOOL_OMPALLOC
      87              :   if (MPI_SUCCESS != MPI_Alloc_mem((MPI_Aint)size, MPI_INFO_NULL, &memory)) {
      88              :     fprintf(stderr, "ERROR: MPI_Alloc_mem failed at %s:%i\n", __FILE__,
      89              :             __LINE__); // exactly one argument per conversion (%s, %i)
      90              :     MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
      91              :   }
      92              : #else
      93       148138 :   memory = malloc(size);
      94              : #endif
      95              : 
      96              :   // Update statistics.
      97       148138 :   if (on_device) { // counted per requested kind, even in builds where the allocator ignores on_device
      98        50481 : #pragma omp atomic
      99              :     ++device_malloc_counter;
     100              :   } else {
     101        97657 : #pragma omp atomic
     102              :     ++host_malloc_counter;
     103              :   }
     104              : 
     105       148138 :   assert(memory != NULL); // a failed allocation is only detected here when assertions are enabled
     106              :   return memory;
     107              : }
     108              : 
     109              : /*******************************************************************************
     110              :  * \brief Private routine for actually freeing system memory.
     111              :  * \author Ole Schuett
     112              :  ******************************************************************************/
     113       271865 : static void actual_free(void *memory, const bool on_device) { // on_device must match the kind used when allocating
     114       271865 :   if (NULL == memory) {
     115              :     return; // nothing to release
     116              :   }
     117              : 
     118              : #if defined(__OFFLOAD)
     119              :   if (on_device) {
     120              :     offload_activate_chosen_device();
     121              :     offloadFree(memory);
     122              :   } else {
     123              :     offload_activate_chosen_device();
     124              :     offloadFreeHost(memory);
     125              :   }
     126              : #elif OFFLOAD_MEMPOOL_OMPALLOC && (201811 /*v5.0*/ <= _OPENMP)
     127              :   (void)on_device; // mark used
     128              :   omp_free(memory, omp_null_allocator);
     129              : #elif defined(__parallel) && !OFFLOAD_MEMPOOL_OMPALLOC
     130              :   (void)on_device; // mark used
     131              :   if (MPI_SUCCESS != MPI_Free_mem(memory)) {
     132              :     fprintf(stderr, "ERROR: MPI_Free_mem failed at %s:%i\n", __FILE__,
     133              :             __LINE__); // exactly one argument per conversion (%s, %i)
     134              :     MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
     135              :   }
     136              : #else
     137       148122 :   (void)on_device; // mark used
     138       148122 :   free(memory); // counterpart of the plain malloc() fallback in actual_malloc()
     139              : #endif
     140              : }
     141              : 
     142              : /*******************************************************************************
     143              :  * \brief Private routine for allocating host or device memory from the pool.
     144              :  * \author Ole Schuett and Hans Pabst
     145              :  ******************************************************************************/
     146      4131371 : static void *internal_mempool_malloc(offload_mempool_t *pool, const size_t size,
     147              :                                      const bool on_device) {
     148      4131371 :   if (size == 0) {
     149              :     return NULL; // zero-sized requests never touch the pool
     150              :   }
     151              : 
     152      4077383 :   offload_memchunk_t *chunk;
     153              : 
     154      8154766 : #pragma omp critical(offload_mempool_modify)
     155              :   {
     156              :     // Find a possible chunk to reuse or reclaim in available list.
     157      4077383 :     offload_memchunk_t **reuse = NULL,
     158      4077383 :                        **reclaim = NULL; // ** for easy list removal
     159      4077383 :     offload_memchunk_t **indirect = &pool->available_head;
     160     83196731 :     while (*indirect != NULL) {
     161     80162359 :       const size_t s = (*indirect)->size;
     162     80162359 :       if (size <= s && (reuse == NULL || s < (*reuse)->size)) {
     163      6020028 :         reuse = indirect; // reuse smallest suitable chunk
     164      6020028 :         if (s == size) {
     165              :           break; // perfect match, exit early
     166              :         }
     167     74142331 :       } else if (reclaim == NULL || (*reclaim)->size < s) {
     168      7538893 :         reclaim = indirect; // reclaim largest unsuitable chunk (only consulted if no reuse candidate exists)
     169              :       }
     170     79119348 :       indirect = &(*indirect)->next;
     171              :     }
     172              : 
     173              :     // Select an existing chunk or allocate a new one.
     174      4077383 :     if (reuse != NULL) {
     175              :       // Reusing an existing chunk that's already large enough.
     176      3929245 :       chunk = *reuse;
     177      3929245 :       *reuse = chunk->next; // remove chunk from available list.
     178       148138 :     } else if (reclaim != NULL) {
     179              :       // Reclaiming an existing chunk (resize will happen outside crit. region).
     180        24395 :       chunk = *reclaim;
     181        24395 :       *reclaim = chunk->next; // remove chunk from available list.
     182              :     } else {
     183              :       // Found no available chunk, allocate a new one.
     184       123743 :       chunk = calloc(1, sizeof(offload_memchunk_t)); // zeroed, so size == 0 forces the allocation below
     185       123743 :       assert(chunk != NULL);
     186              :     }
     187              :   }
     188              : 
     189              :   // Resize chunk outside of critical region before adding it to allocated list.
     190      4077383 :   if (chunk->size < size) { // true for new and reclaimed chunks, false for reused ones
     191       148138 :     actual_free(chunk->mem, on_device); // drop the too-small buffer; contents are not preserved
     192       148138 :     chunk->mem = actual_malloc(size, on_device);
     193       148138 :     chunk->size = size;
     194              :   }
     195              : 
     196      4077383 :   chunk->used = size; // for statistics
     197              : 
     198              :   // Insert chunk into allocated list.
     199      4077383 : #pragma omp critical(offload_mempool_modify)
     200              :   {
     201      4077383 :     chunk->next = pool->allocated_head;
     202      4077383 :     pool->allocated_head = chunk;
     203              :   }
     204              : 
     205      4077383 :   return chunk->mem; // may be larger than requested when a chunk was reused
     206              : }
     207              : 
     208              : /*******************************************************************************
     209              :  * \brief Internal routine for allocating host memory from the pool.
     210              :  * \author Ole Schuett
     211              :  ******************************************************************************/
     212      3836360 : void *offload_mempool_host_malloc(const size_t size) {
     213      3836360 :   return internal_mempool_malloc(&mempool_host, size, false); // false: host memory; returns NULL for size 0
     214              : }
     215              : 
     216              : /*******************************************************************************
     217              :  * \brief Internal routine for allocating device memory from the pool
     218              :  * \author Ole Schuett
     219              :  ******************************************************************************/
     220       295011 : void *offload_mempool_device_malloc(const size_t size) {
     221       295011 :   return internal_mempool_malloc(&mempool_device, size, true); // true: device memory; returns NULL for size 0
     222              : }
     223              : 
     224              : /*******************************************************************************
     225              :  * \brief Private routine for releasing memory back to the pool.
     226              :  * \author Ole Schuett
     227              :  ******************************************************************************/
     228      4837938 : static void internal_mempool_free(offload_mempool_t *pool, const void *mem) {
     229      4837938 :   if (mem == NULL) {
     230              :     return; // releasing NULL is a no-op
     231              :   }
     232              : 
     233      8154766 : #pragma omp critical(offload_mempool_modify)
     234              :   {
     235              :     // Find chunk in allocated list.
     236      4077383 :     offload_memchunk_t **indirect = &pool->allocated_head;
     237     15484964 :     while (*indirect != NULL && (*indirect)->mem != mem) {
     238     11407581 :       indirect = &(*indirect)->next; // linear search by buffer address
     239              :     }
     240      4077383 :     offload_memchunk_t *chunk = *indirect;
     241      4077383 :     assert(chunk != NULL && chunk->mem == mem); // mem must have been handed out by this pool
     242              : 
     243              :     // Remove chunk from allocated list.
     244      4077383 :     *indirect = chunk->next;
     245              : 
     246              :     // Add chunk to available list.
     247      4077383 :     chunk->next = pool->available_head; // the buffer itself is kept for later reuse
     248      4077383 :     pool->available_head = chunk;
     249              :   }
     250              : }
     251              : 
     252              : /*******************************************************************************
     253              :  * \brief Internal routine for releasing memory back to the pool.
     254              :  * \author Ole Schuett
     255              :  ******************************************************************************/
     256      4542927 : void offload_mempool_host_free(const void *memory) {
     257      4542927 :   internal_mempool_free(&mempool_host, memory); // returns the chunk to the host pool; NULL is ignored
     258      4542927 : }
     259              : 
     260              : /*******************************************************************************
     261              :  * \brief Internal routine for releasing memory back to the pool.
     262              :  * \author Ole Schuett
     263              :  ******************************************************************************/
     264       295011 : void offload_mempool_device_free(const void *memory) {
     265       295011 :   internal_mempool_free(&mempool_device, memory); // returns the chunk to the device pool; NULL is ignored
     266       295011 : }
     267              : 
     268              : /*******************************************************************************
     269              :  * \brief Private routine for freeing all memory in the pool.
     270              :  * \author Ole Schuett
     271              :  ******************************************************************************/
     272        20788 : static void internal_mempool_clear(offload_mempool_t *pool,
     273              :                                    const bool on_device) {
     274              : 
     275        41576 : #pragma omp critical(offload_mempool_modify)
     276              :   {
     277        20788 :     uint64_t pool_size = 0; // total bytes held by the pool at the time of clearing
     278              : 
     279              :     // Check for leaks, i.e. that the allocated list is empty.
     280        20788 :     assert(pool->allocated_head == NULL);
     281              : 
     282              :     // Free all chunks in available list.
     283       144515 :     while (pool->available_head != NULL) {
     284       123727 :       offload_memchunk_t *chunk = pool->available_head;
     285       123727 :       pool->available_head = chunk->next; // remove chunk
     286       123727 :       actual_free(chunk->mem, on_device); // release the buffer ...
     287       123727 :       pool_size += chunk->size;
     288       123727 :       free(chunk); // ... and the bookkeeping node obtained via calloc()
     289              :     }
     290              : 
     291              :     // Update stats.
     292        20788 :     pool->peak_size = imax(pool->peak_size, pool_size); // peak survives across clears
     293              :   }
     294        20788 : }
     295              : 
     296              : /*******************************************************************************
     297              :  * \brief Internal routine for freeing all memory in the pool.
     298              :  * \author Ole Schuett and Hans Pabst
     299              :  ******************************************************************************/
     300        10394 : void offload_mempool_clear(void) {
     301        10394 :   internal_mempool_clear(&mempool_host, false); // host pool
     302        10394 :   internal_mempool_clear(&mempool_device, true); // device pool
     303        10394 : }
     304              : 
     305              : /*******************************************************************************
     306              :  * \brief Private routine for summing alloc sizes of all chunks in given list.
     307              :  * \author Ole Schuett
     308              :  ******************************************************************************/
     309        42040 : static uint64_t sum_chunks_size(const offload_memchunk_t *head) { // head may be NULL (empty list)
     310        42040 :   uint64_t size_sum = 0;
     311       166627 :   for (const offload_memchunk_t *chunk = head; chunk != NULL;
     312       124587 :        chunk = chunk->next) {
     313       124587 :     size_sum += chunk->size; // allocated bytes of this chunk
     314              :   }
     315        42040 :   return size_sum; // 0 for an empty list
     316              : }
     317              : 
     318              : /*******************************************************************************
     319              :  * \brief Private routine for summing used sizes of all chunks in given list.
     320              :  * \author Ole Schuett
     321              :  ******************************************************************************/
     322        42040 : static uint64_t sum_chunks_used(const offload_memchunk_t *head) { // head may be NULL (empty list)
     323        42040 :   uint64_t used_sum = 0;
     324       166627 :   for (const offload_memchunk_t *chunk = head; chunk != NULL;
     325       124587 :        chunk = chunk->next) {
     326       124587 :     used_sum += chunk->used; // bytes requested by the chunk's latest allocation
     327              :   }
     328        42040 :   return used_sum; // 0 for an empty list
     329              : }
     330              : 
     331              : /*******************************************************************************
     332              :  * \brief Internal routine to query statistics.
     333              :  * \author Hans Pabst
     334              :  ******************************************************************************/
     335        10510 : void offload_mempool_stats_get(offload_mempool_stats_t *memstats) { // fills *memstats; must not be NULL
     336        10510 :   assert(NULL != memstats);
     337        21020 : #pragma omp critical(offload_mempool_modify)
     338              :   {
     339        10510 :     memstats->host_mallocs = host_malloc_counter;
     340        10510 :     memstats->host_used = sum_chunks_used(mempool_host.available_head) + // released chunks keep their last 'used' value
     341        10510 :                           sum_chunks_used(mempool_host.allocated_head);
     342        10510 :     memstats->host_size = sum_chunks_size(mempool_host.available_head) +
     343        10510 :                           sum_chunks_size(mempool_host.allocated_head);
     344        10510 :     memstats->host_peak = imax(mempool_host.peak_size, memstats->host_size); // recorded peak vs. current host size
     345              : 
     346        10510 :     memstats->device_mallocs = device_malloc_counter;
     347        10510 :     memstats->device_used = sum_chunks_used(mempool_device.available_head) +
     348        10510 :                             sum_chunks_used(mempool_device.allocated_head);
     349        10510 :     memstats->device_size = sum_chunks_size(mempool_device.available_head) +
     350        10510 :                             sum_chunks_size(mempool_device.allocated_head);
     351        10510 :     memstats->device_peak =
     352        10510 :         imax(mempool_device.peak_size, memstats->device_size); // recorded peak vs. current device size
     353              :   }
     354        10510 : }
     355              : 
     356              : /*******************************************************************************
     357              :  * \brief Print allocation statistics.
     358              :  * \author Hans Pabst
     359              :  ******************************************************************************/
     360        10510 : void offload_mempool_stats_print(int fortran_comm,
     361              :                                  void (*print_func)(const char *, int, int),
     362              :                                  int output_unit) {
     363        10510 :   assert(omp_get_num_threads() == 1); // expects a single-thread team
     364              : 
     365        10510 :   char buffer[100]; // holds one formatted table row
     366        10510 :   const cp_mpi_comm_t comm = cp_mpi_comm_f2c(fortran_comm);
     367        10510 :   offload_mempool_stats_t memstats;
     368        10510 :   offload_mempool_stats_get(&memstats);
     369        10510 :   cp_mpi_max_uint64(&memstats.device_mallocs, 1, comm); // presumably reduces to the maximum over all ranks - TODO confirm
     370        10510 :   cp_mpi_max_uint64(&memstats.host_mallocs, 1, comm);
     371              : 
     372        10510 :   if (0 != memstats.device_mallocs || 0 != memstats.host_mallocs) { // print the header only if there is something to report
     373         9132 :     OFFLOAD_MEMPOOL_PRINT(print_func, "\n", output_unit);
     374         9132 :     OFFLOAD_MEMPOOL_PRINT(
     375              :         print_func,
     376              :         " ----------------------------------------------------------------"
     377              :         "---------------\n",
     378              :         output_unit);
     379         9132 :     OFFLOAD_MEMPOOL_PRINT(
     380              :         print_func,
     381              :         " -                                                               "
     382              :         "              -\n",
     383              :         output_unit);
     384              : 
     385         9132 :     OFFLOAD_MEMPOOL_PRINT(
     386              :         print_func,
     387              :         " -                          OFFLOAD MEMPOOL STATISTICS           "
     388              :         "              -\n",
     389              :         output_unit);
     390         9132 :     OFFLOAD_MEMPOOL_PRINT(
     391              :         print_func,
     392              :         " -                                                               "
     393              :         "              -\n",
     394              :         output_unit);
     395         9132 :     OFFLOAD_MEMPOOL_PRINT(
     396              :         print_func,
     397              :         " ----------------------------------------------------------------"
     398              :         "---------------\n",
     399              :         output_unit);
     400         9132 :     OFFLOAD_MEMPOOL_PRINT(print_func,
     401              :                           " Memory consumption               "
     402              :                           " Number of allocations  Used [MiB]  Size [MiB]\n",
     403              :                           output_unit);
     404              :   }
     405              : #if defined(__OFFLOAD)
     406              :   if (0 < memstats.device_mallocs) {
     407              :     cp_mpi_max_uint64(&memstats.device_peak, 1, comm);
     408              :     snprintf(buffer, sizeof(buffer),
     409              :              " Device                            "
     410              :              " %20" PRIuPTR "  %10" PRIuPTR "  %10" PRIuPTR "\n",
     411              :              (uintptr_t)memstats.device_mallocs,
     412              :              (uintptr_t)((memstats.device_used + (512U << 10)) >> 20), // bytes -> MiB, rounded to nearest
     413              :              (uintptr_t)((memstats.device_peak + (512U << 10)) >> 20)); // the "Size" column shows the peak value
     414              :     OFFLOAD_MEMPOOL_PRINT(print_func, buffer, output_unit);
     415              :   }
     416              : #endif
     417        10510 :   if (0 < memstats.host_mallocs) {
     418         9132 :     cp_mpi_max_uint64(&memstats.host_peak, 1, comm);
     419         9132 :     snprintf(buffer, sizeof(buffer),
     420              :              " Host                              "
     421              :              " %20" PRIuPTR "  %10" PRIuPTR "  %10" PRIuPTR "\n",
     422         9132 :              (uintptr_t)memstats.host_mallocs,
     423         9132 :              (uintptr_t)((memstats.host_used + (512U << 10)) >> 20), // bytes -> MiB, rounded to nearest
     424         9132 :              (uintptr_t)((memstats.host_peak + (512U << 10)) >> 20)); // the "Size" column shows the peak value
     425         9132 :     OFFLOAD_MEMPOOL_PRINT(print_func, buffer, output_unit);
     426              :   }
     427        10510 :   if (0 < memstats.device_mallocs || 0 < memstats.host_mallocs) { // closing rule, same condition as the header
     428         9132 :     OFFLOAD_MEMPOOL_PRINT(
     429              :         print_func,
     430              :         " ----------------------------------------------------------------"
     431              :         "---------------\n",
     432              :         output_unit);
     433              :   }
     434        10510 : }
     435              : 
     436              : // EOF
        

Generated by: LCOV version 2.0-1