LCOV - code coverage report
Current view: top level - src/swarm - swarm_mpi.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:936074a) Lines: 92.4 % 131 121
Test Date: 2025-12-04 06:27:48 Functions: 84.6 % 13 11

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief Handles the MPI communication of the swarm framework.
      10              : !> \author Ole Schuett
      11              : ! **************************************************************************************************
      12              : MODULE swarm_mpi
      13              :    USE cp_files,                        ONLY: close_file,&
      14              :                                               open_file
      15              :    USE cp_iter_types,                   ONLY: cp_iteration_info_create,&
      16              :                                               cp_iteration_info_release,&
      17              :                                               cp_iteration_info_type
      18              :    USE cp_log_handling,                 ONLY: cp_add_default_logger,&
      19              :                                               cp_get_default_logger,&
      20              :                                               cp_logger_create,&
      21              :                                               cp_logger_release,&
      22              :                                               cp_logger_type,&
      23              :                                               cp_rm_default_logger
      24              :    USE input_section_types,             ONLY: section_vals_type,&
      25              :                                               section_vals_val_set
      26              :    USE kinds,                           ONLY: default_path_length,&
      27              :                                               default_string_length
      28              :    USE machine,                         ONLY: default_output_unit
      29              :    USE message_passing,                 ONLY: mp_any_source,&
      30              :                                               mp_comm_type,&
      31              :                                               mp_para_env_release,&
      32              :                                               mp_para_env_type
      33              :    USE swarm_message,                   ONLY: swarm_message_get,&
      34              :                                               swarm_message_mpi_bcast,&
      35              :                                               swarm_message_mpi_recv,&
      36              :                                               swarm_message_mpi_send,&
      37              :                                               swarm_message_type
      38              : #include "../base/base_uses.f90"
      39              : 
      40              :    IMPLICIT NONE
      41              :    PRIVATE
      42              : 
      43              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'swarm_mpi'
      44              : 
      45              :    PUBLIC :: swarm_mpi_type, swarm_mpi_init, swarm_mpi_finalize
      46              :    PUBLIC :: swarm_mpi_send_report, swarm_mpi_recv_report
      47              :    PUBLIC :: swarm_mpi_send_command, swarm_mpi_recv_command
      48              : 
      49              :    TYPE swarm_mpi_type
      50              :       TYPE(mp_para_env_type), POINTER          :: world => Null()
      51              :       TYPE(mp_para_env_type), POINTER          :: worker => Null()
      52              :       TYPE(mp_para_env_type), POINTER          :: master => Null()
      53              :       INTEGER, DIMENSION(:), ALLOCATABLE       :: wid2group
      54              :       CHARACTER(LEN=default_path_length)       :: master_output_path = ""
      55              :    END TYPE swarm_mpi_type
      56              : 
      57              : CONTAINS
      58              : 
      59              : ! **************************************************************************************************
      60              : !> \brief Initialize MPI communicators for a swarm run.
      61              : !> \param swarm_mpi ...
      62              : !> \param world_para_env ...
      63              : !> \param root_section ...
      64              : !> \param n_workers ...
      65              : !> \param worker_id ...
      66              : !> \param iw ...
      67              : !> \author Ole Schuett
      68              : ! **************************************************************************************************
      69            6 :    SUBROUTINE swarm_mpi_init(swarm_mpi, world_para_env, root_section, n_workers, worker_id, iw)
      70              :       TYPE(swarm_mpi_type)                               :: swarm_mpi
      71              :       TYPE(mp_para_env_type), POINTER                    :: world_para_env
      72              :       TYPE(section_vals_type), POINTER                   :: root_section
      73              :       INTEGER, INTENT(IN)                                :: n_workers
      74              :       INTEGER, INTENT(OUT)                               :: worker_id
      75              :       INTEGER, INTENT(IN)                                :: iw
      76              : 
      77              :       INTEGER                                            :: n_groups_created, pe_per_worker, &
      78              :                                                             subgroup_rank, subgroup_size
      79              :       TYPE(mp_comm_type)                                 :: subgroup
      80              :       LOGICAL                                            :: im_the_master
      81            6 :       INTEGER, DIMENSION(:), POINTER                     :: group_distribution_p
      82              :       INTEGER, DIMENSION(0:world_para_env%num_pe-2), &
      83           12 :          TARGET                                          :: group_distribution
      84              : 
      85              : ! ====== Setup of MPI-Groups ======
      86              : 
      87            6 :       worker_id = -1
      88            6 :       swarm_mpi%world => world_para_env
      89              : 
      90            6 :       IF (MOD(swarm_mpi%world%num_pe - 1, n_workers) /= 0) THEN
      91            0 :          CPABORT("number of processors-1 is not divisible by n_workers.")
      92              :       END IF
      93            6 :       IF (swarm_mpi%world%num_pe < n_workers + 1) THEN
      94            0 :          CPABORT("There are not enough processes for n_workers + 1. Aborting.")
      95              :       END IF
      96              : 
      97            6 :       pe_per_worker = (swarm_mpi%world%num_pe - 1)/n_workers
      98              : 
      99            6 :       IF (iw > 0) THEN
     100            3 :          WRITE (iw, '(A,45X,I8)') " SWARM| Number of mpi ranks", swarm_mpi%world%num_pe
     101            3 :          WRITE (iw, '(A,47X,I8)') " SWARM| Number of workers", n_workers
     102              :       END IF
     103              : 
     104              :       ! the last task becomes the master. Preseves node-alignment of other tasks.
     105            6 :       im_the_master = (swarm_mpi%world%mepos == swarm_mpi%world%num_pe - 1)
     106              : 
     107              :       ! First split split para_env into a master- and a workers-groups...
     108            6 :       IF (im_the_master) THEN
     109            3 :          ALLOCATE (swarm_mpi%master)
     110            3 :          CALL swarm_mpi%master%from_split(swarm_mpi%world, 1)
     111            3 :          subgroup_size = swarm_mpi%master%num_pe
     112            3 :          subgroup_rank = swarm_mpi%master%mepos
     113            3 :          IF (swarm_mpi%master%num_pe /= 1) CPABORT("mp_comm_split_direct failed (master)")
     114              :       ELSE
     115            3 :          CALL subgroup%from_split(swarm_mpi%world, 2)
     116            3 :          subgroup_size = subgroup%num_pe
     117            3 :          subgroup_rank = subgroup%mepos
     118            3 :          IF (subgroup_size /= swarm_mpi%world%num_pe - 1) CPABORT("mp_comm_split_direct failed (worker)")
     119              :       END IF
     120              : 
     121           18 :       ALLOCATE (swarm_mpi%wid2group(n_workers))
     122           12 :       swarm_mpi%wid2group = 0
     123              : 
     124            6 :       IF (.NOT. im_the_master) THEN
     125              :          ! ...then split workers-group into n_workers groups - one for each worker.
     126            3 :          group_distribution_p => group_distribution
     127            3 :          ALLOCATE (swarm_mpi%worker)
     128            3 :          CALL swarm_mpi%worker%from_split(subgroup, n_groups_created, group_distribution_p, n_subgroups=n_workers)
     129            3 :          worker_id = group_distribution(subgroup_rank) + 1 ! shall start by 1
     130            3 :          IF (n_groups_created /= n_workers) CPABORT("mp_comm_split failed.")
     131            3 :          CALL subgroup%free()
     132              : 
     133              :          !WRITE (*,*) "this is worker ", worker_id, swarm_mpi%worker%mepos, swarm_mpi%worker%num_pe
     134              : 
     135              :          ! collect world-ranks of each worker groups rank-0 node
     136            3 :          IF (swarm_mpi%worker%mepos == 0) &
     137            3 :             swarm_mpi%wid2group(worker_id) = swarm_mpi%world%mepos
     138              : 
     139              :       END IF
     140              : 
     141            6 :       CALL swarm_mpi%world%sum(swarm_mpi%wid2group)
     142              :       !WRITE (*,*), "wid2group table: ",swarm_mpi%wid2group
     143              : 
     144            6 :       CALL logger_init_master(swarm_mpi)
     145            6 :       CALL logger_init_worker(swarm_mpi, root_section, worker_id)
     146            6 :    END SUBROUTINE swarm_mpi_init
     147              : 
     148              : ! **************************************************************************************************
     149              : !> \brief Helper routine for swarm_mpi_init, configures the master's logger.
     150              : !> \param swarm_mpi ...
     151              : !> \author Ole Schuett
     152              : ! **************************************************************************************************
     153            6 :    SUBROUTINE logger_init_master(swarm_mpi)
     154              :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     155              : 
     156              :       INTEGER                                            :: output_unit
     157              :       TYPE(cp_logger_type), POINTER                      :: logger
     158              : 
     159              : ! broadcast master_output_path to all ranks
     160              : 
     161            6 :       IF (swarm_mpi%world%is_source()) THEN
     162            3 :          logger => cp_get_default_logger()
     163            3 :          output_unit = logger%default_local_unit_nr
     164            3 :          swarm_mpi%master_output_path = output_unit2path(output_unit)
     165            3 :          IF (output_unit /= default_output_unit) &
     166            0 :             CLOSE (output_unit)
     167              :       END IF
     168              : 
     169            6 :       CALL swarm_mpi%world%bcast(swarm_mpi%master_output_path)
     170              : 
     171            6 :       IF (ASSOCIATED(swarm_mpi%master)) &
     172            3 :          CALL error_add_new_logger(swarm_mpi%master, swarm_mpi%master_output_path)
     173            6 :    END SUBROUTINE logger_init_master
     174              : 
     175              : ! **************************************************************************************************
     176              : !> \brief Helper routine for logger_init_master, inquires filename for given unit.
     177              : !> \param output_unit ...
     178              : !> \return ...
     179              : !> \author Ole Schuett
     180              : ! **************************************************************************************************
     181            3 :    FUNCTION output_unit2path(output_unit) RESULT(output_path)
     182              :       INTEGER, INTENT(IN)                                :: output_unit
     183              :       CHARACTER(LEN=default_path_length)                 :: output_path
     184              : 
     185            3 :       output_path = "__STD_OUT__"
     186            3 :       IF (output_unit /= default_output_unit) &
     187            0 :          INQUIRE (unit=output_unit, name=output_path)
     188            3 :    END FUNCTION output_unit2path
     189              : 
     190              : ! **************************************************************************************************
     191              : !> \brief Helper routine for swarm_mpi_init, configures the workers's logger.
     192              : !> \param swarm_mpi ...
     193              : !> \param root_section ...
     194              : !> \param worker_id ...
     195              : !> \author Ole Schuett
     196              : ! **************************************************************************************************
     197            6 :    SUBROUTINE logger_init_worker(swarm_mpi, root_section, worker_id)
     198              :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     199              :       TYPE(section_vals_type), POINTER                   :: root_section
     200              :       INTEGER                                            :: worker_id
     201              : 
     202              :       CHARACTER(LEN=default_path_length)                 :: output_path
     203              :       CHARACTER(len=default_string_length)               :: new_project_name, project_name, &
     204              :                                                             worker_name
     205              :       TYPE(cp_iteration_info_type), POINTER              :: new_iter_info
     206              :       TYPE(cp_logger_type), POINTER                      :: old_logger
     207              : 
     208            6 :       NULLIFY (old_logger, new_iter_info)
     209            6 :       IF (ASSOCIATED(swarm_mpi%worker)) THEN
     210            3 :          old_logger => cp_get_default_logger()
     211            3 :          project_name = old_logger%iter_info%project_name
     212            3 :          IF (worker_id > 99999) THEN
     213            0 :             CPABORT("Did not expect so many workers.")
     214              :          END IF
     215            3 :          WRITE (worker_name, "(A,I5.5)") 'WORKER', worker_id
     216            3 :          IF (LEN_TRIM(project_name) + 1 + LEN_TRIM(worker_name) > default_string_length) THEN
     217            0 :             CPABORT("project name too long")
     218              :          END IF
     219            3 :          output_path = TRIM(project_name)//"-"//TRIM(worker_name)//".out"
     220            3 :          new_project_name = TRIM(project_name)//"-"//TRIM(worker_name)
     221            3 :          CALL section_vals_val_set(root_section, "GLOBAL%PROJECT_NAME", c_val=new_project_name)
     222            3 :          CALL cp_iteration_info_create(new_iter_info, new_project_name)
     223            3 :          CALL error_add_new_logger(swarm_mpi%worker, output_path, new_iter_info)
     224            3 :          CALL cp_iteration_info_release(new_iter_info)
     225              :       END IF
     226            6 :    END SUBROUTINE logger_init_worker
     227              : 
     228              : ! **************************************************************************************************
     229              : !> \brief Helper routine for logger_init_master and logger_init_worker
     230              : !> \param para_env ...
     231              : !> \param output_path ...
     232              : !> \param iter_info ...
     233              : !> \author Ole Schuett
     234              : ! **************************************************************************************************
     235            6 :    SUBROUTINE error_add_new_logger(para_env, output_path, iter_info)
     236              :       TYPE(mp_para_env_type), POINTER                    :: para_env
     237              :       CHARACTER(LEN=default_path_length)                 :: output_path
     238              :       TYPE(cp_iteration_info_type), OPTIONAL, POINTER    :: iter_info
     239              : 
     240              :       INTEGER                                            :: output_unit
     241              :       TYPE(cp_logger_type), POINTER                      :: new_logger, old_logger
     242              : 
     243            6 :       NULLIFY (new_logger, old_logger)
     244            6 :       output_unit = -1
     245            6 :       IF (para_env%is_source()) THEN
     246              :          ! open output_unit according to output_path
     247            6 :          output_unit = default_output_unit
     248            6 :          IF (output_path /= "__STD_OUT__") &
     249              :             CALL open_file(file_name=output_path, file_status="UNKNOWN", &
     250            3 :                            file_action="WRITE", file_position="APPEND", unit_number=output_unit)
     251              :       END IF
     252              : 
     253            6 :       old_logger => cp_get_default_logger()
     254              :       CALL cp_logger_create(new_logger, para_env=para_env, &
     255              :                             default_global_unit_nr=output_unit, close_global_unit_on_dealloc=.FALSE., &
     256            6 :                             template_logger=old_logger, iter_info=iter_info)
     257              : 
     258            6 :       CALL cp_add_default_logger(new_logger)
     259            6 :       CALL cp_logger_release(new_logger)
     260            6 :    END SUBROUTINE error_add_new_logger
     261              : 
     262              : ! **************************************************************************************************
     263              : !> \brief Finalizes the MPI communicators of a swarm run.
     264              : !> \param swarm_mpi ...
     265              : !> \param root_section ...
     266              : !> \author Ole Schuett
     267              : ! **************************************************************************************************
     268            6 :    SUBROUTINE swarm_mpi_finalize(swarm_mpi, root_section)
     269              :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     270              :       TYPE(section_vals_type), POINTER                   :: root_section
     271              : 
     272            6 :       CALL swarm_mpi%world%sync()
     273            6 :       CALL logger_finalize(swarm_mpi, root_section)
     274              : 
     275            6 :       IF (ASSOCIATED(swarm_mpi%worker)) CALL mp_para_env_release(swarm_mpi%worker)
     276            6 :       IF (ASSOCIATED(swarm_mpi%master)) CALL mp_para_env_release(swarm_mpi%master)
     277            6 :       NULLIFY (swarm_mpi%worker, swarm_mpi%master)
     278            6 :       DEALLOCATE (swarm_mpi%wid2group)
     279            6 :    END SUBROUTINE swarm_mpi_finalize
     280              : 
     281              : ! **************************************************************************************************
     282              : !> \brief Helper routine for swarm_mpi_finalize, restores the original loggers
     283              : !> \param swarm_mpi ...
     284              : !> \param root_section ...
     285              : !> \author Ole Schuett
     286              : ! **************************************************************************************************
     287            6 :    SUBROUTINE logger_finalize(swarm_mpi, root_section)
     288              :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     289              :       TYPE(section_vals_type), POINTER                   :: root_section
     290              : 
     291              :       INTEGER                                            :: output_unit
     292              :       TYPE(cp_logger_type), POINTER                      :: logger, old_logger
     293              : 
     294            6 :       NULLIFY (logger, old_logger)
     295            6 :       logger => cp_get_default_logger()
     296            6 :       output_unit = logger%default_local_unit_nr
     297            6 :       IF (output_unit > 0 .AND. output_unit /= default_output_unit) &
     298            0 :          CALL close_file(output_unit)
     299              : 
     300            6 :       CALL cp_rm_default_logger() !pops the top-most logger
     301            6 :       old_logger => cp_get_default_logger()
     302              : 
     303              :       ! restore GLOBAL%PROJECT_NAME
     304              :       CALL section_vals_val_set(root_section, "GLOBAL%PROJECT_NAME", &
     305            6 :                                 c_val=old_logger%iter_info%project_name)
     306              : 
     307            6 :       CALL swarm_mpi%world%sync()
     308              : 
     309              :       ! do this only on master's rank 0
     310            6 :       IF (swarm_mpi%world%is_source() .AND. output_unit /= default_output_unit) THEN
     311            0 :          output_unit = old_logger%default_local_unit_nr
     312              :          OPEN (unit=output_unit, file=swarm_mpi%master_output_path, &
     313            0 :                status="UNKNOWN", action="WRITE", position="APPEND")
     314              :       END IF
     315            6 :    END SUBROUTINE logger_finalize
     316              : 
     317              : ! **************************************************************************************************
     318              : !> \brief Sends a report via MPI
     319              : !> \param swarm_mpi ...
     320              : !> \param report ...
     321              : !> \author Ole Schuett
     322              : ! **************************************************************************************************
     323           28 :    SUBROUTINE swarm_mpi_send_report(swarm_mpi, report)
     324              :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     325              :       TYPE(swarm_message_type)                           :: report
     326              : 
     327              :       INTEGER                                            :: dest, tag
     328              : 
     329              : ! Only rank-0 of worker group sends its report
     330              : 
     331           28 :       IF (swarm_mpi%worker%is_source()) THEN
     332           28 :          dest = swarm_mpi%world%num_pe - 1
     333           28 :          tag = 42
     334           28 :          CALL swarm_message_mpi_send(report, group=swarm_mpi%world, dest=dest, tag=tag)
     335              :       END IF
     336              : 
     337           28 :    END SUBROUTINE swarm_mpi_send_report
     338              : 
     339              : ! **************************************************************************************************
     340              : !> \brief Receives a report via MPI
     341              : !> \param swarm_mpi ...
     342              : !> \param report ...
     343              : !> \author Ole Schuett
     344              : ! **************************************************************************************************
     345           28 :    SUBROUTINE swarm_mpi_recv_report(swarm_mpi, report)
     346              :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     347              :       TYPE(swarm_message_type), INTENT(OUT)              :: report
     348              : 
     349              :       INTEGER                                            :: src, tag
     350              : 
     351           28 :       tag = 42
     352           28 :       src = mp_any_source
     353              : 
     354           28 :       CALL swarm_message_mpi_recv(report, group=swarm_mpi%world, src=src, tag=tag)
     355              : 
     356           28 :    END SUBROUTINE swarm_mpi_recv_report
     357              : 
     358              : ! **************************************************************************************************
     359              : !> \brief Sends a command via MPI
     360              : !> \param swarm_mpi ...
     361              : !> \param cmd ...
     362              : !> \author Ole Schuett
     363              : ! **************************************************************************************************
     364           28 :    SUBROUTINE swarm_mpi_send_command(swarm_mpi, cmd)
     365              :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     366              :       TYPE(swarm_message_type)                           :: cmd
     367              : 
     368              :       INTEGER                                            :: dest, tag, worker_id
     369              : 
     370           28 :       CALL swarm_message_get(cmd, "worker_id", worker_id)
     371           28 :       tag = 42
     372           28 :       dest = swarm_mpi%wid2group(worker_id)
     373              : 
     374           28 :       CALL swarm_message_mpi_send(cmd, group=swarm_mpi%world, dest=dest, tag=tag)
     375              : 
     376           28 :    END SUBROUTINE swarm_mpi_send_command
     377              : 
     378              : ! **************************************************************************************************
     379              : !> \brief Receives a command via MPI and broadcasts it within a worker.
     380              : !> \param swarm_mpi ...
     381              : !> \param cmd ...
     382              : !> \author Ole Schuett
     383              : ! **************************************************************************************************
     384           28 :    SUBROUTINE swarm_mpi_recv_command(swarm_mpi, cmd)
     385              :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     386              :       TYPE(swarm_message_type), INTENT(OUT)              :: cmd
     387              : 
     388              :       INTEGER                                            :: src, tag
     389              : 
     390              : ! This is a two step communication schema.
     391              : ! First: The rank-0 of the worker groups receives the command from the master.
     392              : 
     393           28 :       IF (swarm_mpi%worker%is_source()) THEN
     394           28 :          src = swarm_mpi%world%num_pe - 1 !
     395           28 :          tag = 42
     396           28 :          CALL swarm_message_mpi_recv(cmd, group=swarm_mpi%world, src=src, tag=tag)
     397              : 
     398              :       END IF
     399              : 
     400              : !     ! Second: The command is broadcasted within the worker group.
     401           28 :       CALL swarm_message_mpi_bcast(cmd, src=swarm_mpi%worker%source, group=swarm_mpi%worker)
     402              : 
     403           28 :    END SUBROUTINE swarm_mpi_recv_command
     404              : 
     405            0 : END MODULE swarm_mpi
     406              : 
        

Generated by: LCOV version 2.0-1