LCOV - code coverage report
Current view: top level - src/swarm - swarm_master.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:42dac4a) Lines: 93.5 % 124 116
Test Date: 2025-07-25 12:55:17 Functions: 71.4 % 7 5

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief Master's routines for the swarm-framework
      10              : !> \author Ole Schuett
      11              : ! **************************************************************************************************
      12              : MODULE swarm_master
      13              :    USE cp_external_control,             ONLY: external_control
      14              :    USE cp_log_handling,                 ONLY: cp_get_default_logger,&
      15              :                                               cp_logger_type
      16              :    USE cp_output_handling,              ONLY: cp_print_key_finished_output,&
      17              :                                               cp_print_key_unit_nr
      18              :    USE cp_parser_types,                 ONLY: cp_parser_type,&
      19              :                                               parser_create,&
      20              :                                               parser_release
      21              :    USE glbopt_master,                   ONLY: glbopt_master_finalize,&
      22              :                                               glbopt_master_init,&
      23              :                                               glbopt_master_steer,&
      24              :                                               glbopt_master_type
      25              :    USE global_types,                    ONLY: global_environment_type
      26              :    USE input_constants,                 ONLY: swarm_do_glbopt
      27              :    USE input_section_types,             ONLY: section_vals_get_subs_vals,&
      28              :                                               section_vals_type,&
      29              :                                               section_vals_val_get
      30              :    USE kinds,                           ONLY: default_path_length,&
      31              :                                               default_string_length
      32              :    USE message_passing,                 ONLY: mp_para_env_type
      33              :    USE swarm_message,                   ONLY: swarm_message_add,&
      34              :                                               swarm_message_equal,&
      35              :                                               swarm_message_file_read,&
      36              :                                               swarm_message_file_write,&
      37              :                                               swarm_message_free,&
      38              :                                               swarm_message_get,&
      39              :                                               swarm_message_type
      40              : #include "../base/base_uses.f90"
      41              : 
      42              :    IMPLICIT NONE
      43              :    PRIVATE ! module entities are private unless listed as PUBLIC below
      44              : 
      45              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'swarm_master' ! name of this module
      46              : 
      47              :    PUBLIC :: swarm_master_type ! state of the swarm master, all components are private
      48              :    PUBLIC :: swarm_master_init, swarm_master_finalize ! set-up and tear-down of that state
      49              :    PUBLIC :: swarm_master_steer ! answers one worker report with one command
      50              : 
      51              :    TYPE swarm_message_p_type ! wrapper so that an array of message pointers can be declared
      52              :       TYPE(swarm_message_type), POINTER                   :: p => Null() ! unassociated means no message is stored
      53              :    END TYPE swarm_message_p_type
      54              : 
      55              :    TYPE swarm_master_type ! complete state of the swarm master, filled by swarm_master_init
      56              :       PRIVATE
      57              :       INTEGER                                             :: behavior = -1 ! value of the BEHAVIOR keyword, only swarm_do_glbopt is handled here
      58              :       TYPE(glbopt_master_type), POINTER                   :: glbopt => Null() ! state of the glbopt behavior, allocated in swarm_master_init
      59              :       !possibly more behaviors ...
      60              :       INTEGER                                             :: iw = 0 ! output unit of PRINT%MASTER_RUN_INFO, written to only if > 0
      61              :       INTEGER                                             :: i_iteration = 0 ! iterations counted so far by swarm_master_steer
      62              :       INTEGER                                             :: max_iter = 0 ! value of the MAX_ITER keyword
      63              :       LOGICAL                                             :: should_stop = .FALSE. ! once .TRUE. every further command is a shutdown
      64              :       INTEGER                                             :: n_workers = -1 ! number of workers, size of queued_commands
      65              :       INTEGER                                             :: comlog_unit = -1 ! output unit of PRINT%COMMUNICATION_LOG
      66              :       TYPE(section_vals_type), POINTER                    :: swarm_section => Null() ! the SWARM subsection of the input
      67              :       TYPE(mp_para_env_type), POINTER                     :: para_env => Null() ! parallel environment, handed to the comlog parser
      68              :       TYPE(swarm_message_p_type), DIMENSION(:), POINTER   :: queued_commands => Null() ! per worker: command to send out before normal steering resumes
      69              :       TYPE(global_environment_type), POINTER              :: globenv => Null() ! handed to external_control
      70              :       LOGICAL                                             :: ignore_last_iteration = .FALSE. ! .TRUE. after a wait_done report was answered with wait
      71              :       INTEGER                                             :: n_waiting = 0 ! number of workers currently told to wait
      72              :    END TYPE swarm_master_type
      73              : 
      74              : CONTAINS
      75              : 
      76              : ! **************************************************************************************************
      77              : !> \brief Initializes the swarm master from the SWARM input section and replays a communication log if one is given
      78              : !> \param master swarm master state to be filled in
      79              : !> \param para_env parallel environment, stored in master and passed on to glbopt_master_init
      80              : !> \param globenv global environment, stored in master
      81              : !> \param root_section root of the input, its SWARM subsection is stored in master
      82              : !> \param n_workers number of workers, determines the size of the queue of pending commands
      83              : !> \author Ole Schuett
      84              : ! **************************************************************************************************
      85            3 :    SUBROUTINE swarm_master_init(master, para_env, globenv, root_section, n_workers)
      86              :       TYPE(swarm_master_type)                            :: master
      87              :       TYPE(mp_para_env_type), POINTER                    :: para_env
      88              :       TYPE(global_environment_type), POINTER             :: globenv
      89              :       TYPE(section_vals_type), POINTER                   :: root_section
      90              :       INTEGER, INTENT(IN)                                :: n_workers
      91              : 
      92              :       TYPE(cp_logger_type), POINTER                      :: logger
      93              : 
      94            3 :       master%swarm_section => section_vals_get_subs_vals(root_section, "SWARM")
      95              : 
      96            3 :       logger => cp_get_default_logger()
      97            3 :       master%n_workers = n_workers
      98            3 :       master%para_env => para_env
      99            3 :       master%globenv => globenv
     100           12 :       ALLOCATE (master%queued_commands(master%n_workers)) ! one slot per worker, each p starts unassociated
     101              :       master%iw = cp_print_key_unit_nr(logger, master%swarm_section, &
     102            3 :                                        "PRINT%MASTER_RUN_INFO", extension=".masterLog")
     103              : 
     104            3 :       CALL section_vals_val_get(master%swarm_section, "BEHAVIOR", i_val=master%behavior)
     105              : 
     106              :       ! uses logger%iter_info%project_name to construct filename
     107              :       master%comlog_unit = cp_print_key_unit_nr(logger, master%swarm_section, "PRINT%COMMUNICATION_LOG", &
     108              :                                                 !middle_name="comlog", extension=".xyz", &
     109              :                                                 extension=".comlog", &
     110            3 :                                                 file_action="WRITE", file_position="REWIND")
     111              : 
     112            3 :       CALL section_vals_val_get(master%swarm_section, "MAX_ITER", i_val=master%max_iter)
     113              : 
     114            6 :       SELECT CASE (master%behavior) ! set up the behavior-specific part of the master
     115              :       CASE (swarm_do_glbopt)
     116            3 :          ALLOCATE (master%glbopt)
     117            3 :          CALL glbopt_master_init(master%glbopt, para_env, root_section, n_workers, master%iw)
     118              :       CASE DEFAULT
     119            3 :          CPABORT("got unknown behavior")
     120              :       END SELECT
     121              : 
     122            3 :       CALL replay_comlog(master) ! returns immediately unless REPLAY_COMMUNICATION_LOG was given explicitly
     123            3 :    END SUBROUTINE swarm_master_init
     124              : 
     125              : ! **************************************************************************************************
     126              : !> \brief Helper routine for swarm_master_init, restarts a calculation by feeding the reports of a communication log back into swarm_master_steer
     127              : !> \param master swarm master state, advanced by every replayed report; ends up with one queued command per worker
     128              : !> \author Ole Schuett
     129              : ! **************************************************************************************************
     130            3 :    SUBROUTINE replay_comlog(master)
     131              :       TYPE(swarm_master_type)                            :: master
     132              : 
     133              :       CHARACTER(LEN=default_path_length)                 :: filename
     134              :       CHARACTER(LEN=default_string_length)               :: command_log
     135              :       INTEGER                                            :: handle, i, worker_id
     136              :       LOGICAL                                            :: at_end, explicit
     137              :       TYPE(cp_parser_type)                               :: parser
     138              :       TYPE(swarm_message_type)                           :: cmd_log, report_log
     139              :       TYPE(swarm_message_type), &
     140            9 :          DIMENSION(master%n_workers)                     :: last_commands
     141              :       TYPE(swarm_message_type), POINTER                  :: cmd_now
     142              : 
     143              :       ! Look up the file name of the communication log that should be replayed (if any)
     144              :       CALL section_vals_val_get(master%swarm_section, "REPLAY_COMMUNICATION_LOG", &
     145            3 :                                 c_val=filename, explicit=explicit)
     146              : 
     147            3 :       IF (.NOT. explicit) RETURN ! no replay requested, leaves before the timer is started
     148            1 :       IF (master%iw > 0) WRITE (master%iw, '(A,A)') &
     149            1 :          " SWARM| Starting replay of communication-log: ", TRIM(filename)
     150              : 
     151            1 :       CALL timeset("swarm_master_replay_comlog", handle)
     152            1 :       CALL parser_create(parser, filename, para_env=master%para_env)
     153              : 
     154            1 :       at_end = .FALSE.
     155           21 :       DO ! each pass consumes one report followed by one command from the log
     156            6 :          CALL swarm_message_file_read(report_log, parser, at_end)
     157            6 :          IF (at_end) EXIT
     158              : 
     159            5 :          CALL swarm_message_file_read(cmd_log, parser, at_end)
     160            5 :          IF (at_end) EXIT
     161              : 
     162            5 :          ALLOCATE (cmd_now)
     163            5 :          CALL swarm_master_steer(master, report_log, cmd_now) ! recompute the command for the logged report
     164              : 
     165              :          !TODO: maybe we should just exit the loop instead of stopping?
     166            5 :          CALL swarm_message_get(cmd_log, "command", command_log)
     167            5 :          IF (TRIM(command_log) /= "shutdown") THEN ! logged shutdown commands are not compared
     168            4 :             IF (.NOT. commands_equal(cmd_now, cmd_log, master%iw)) CPABORT("wrong behaviour")
     169              :          END IF
     170              : 
     171            5 :          CALL swarm_message_free(cmd_log)
     172            5 :          CALL swarm_message_free(report_log)
     173            5 :          CALL swarm_message_get(cmd_now, "worker_id", worker_id)
     174            5 :          CALL swarm_message_free(last_commands(worker_id)) ! NOTE(review): worker_id is not range-checked against n_workers
     175            5 :          last_commands(worker_id) = cmd_now ! keep only the most recent command of each worker
     176            5 :          DEALLOCATE (cmd_now) ! presumably frees only the container while the content lives on in last_commands - TODO confirm
     177              :       END DO
     178              : 
     179            1 :       CALL swarm_message_free(report_log) !don't worry about double-frees
     180            1 :       CALL swarm_message_free(cmd_log)
     181              : 
     182            1 :       IF (master%iw > 0) WRITE (master%iw, '(A,A)') &
     183            1 :          " SWARM| Reached end of communication log. Queueing last commands."
     184              : 
     185            2 :       DO i = 1, master%n_workers ! NOTE(review): also queues an entry for workers that never appeared in the log - confirm intended
     186            1 :          ALLOCATE (master%queued_commands(i)%p)
     187            2 :          master%queued_commands(i)%p = last_commands(i) ! sent out by swarm_master_steer on the next report of worker i
     188              :       END DO
     189              : 
     190            1 :       CALL parser_release(parser)
     191            1 :       CALL timestop(handle)
     192            9 :    END SUBROUTINE replay_comlog
     193              : 
     194              : ! **************************************************************************************************
     195              : !> \brief Helper routine for replay_comlog, compares two commands and writes both to iw if they differ
     196              : !> \param cmd1 first command
     197              : !> \param cmd2 second command
     198              : !> \param iw output unit for the diagnostic dump, nothing is written unless iw > 0
     199              : !> \return result of swarm_message_equal for the two commands
     200              : !> \author Ole Schuett
     201              : ! **************************************************************************************************
     202            4 :    FUNCTION commands_equal(cmd1, cmd2, iw) RESULT(res)
     203              :       TYPE(swarm_message_type)                           :: cmd1, cmd2 ! NOTE(review): no INTENT declared on the dummies
     204              :       INTEGER                                            :: iw
     205              :       LOGICAL                                            :: res
     206              : 
     207            4 :       res = swarm_message_equal(cmd1, cmd2)
     208            4 :       IF (.NOT. res .AND. iw > 0) THEN ! dump both commands so that the mismatch can be inspected
     209            0 :          WRITE (iw, *) "Command 1:"
     210            0 :          CALL swarm_message_file_write(cmd1, iw)
     211            0 :          WRITE (iw, *) "Command 2:"
     212            0 :          CALL swarm_message_file_write(cmd2, iw)
     213              :       END IF
     214            4 :    END FUNCTION commands_equal
     215              : 
     216              : ! **************************************************************************************************
     217              : !> \brief Central steering routine of the swarm master, answers one worker report with one command
     218              : !> \param master swarm master state; iteration counter, stop flag, command queue and waiting count are updated
     219              : !> \param report report received from a worker, its worker_id and status entries are read
     220              : !> \param cmd command to send back to the reporting worker
     221              : !> \author Ole Schuett
     222              : ! **************************************************************************************************
     223           66 :    SUBROUTINE swarm_master_steer(master, report, cmd)
     224              :       TYPE(swarm_master_type), INTENT(INOUT)             :: master
     225              :       TYPE(swarm_message_type), INTENT(IN)               :: report
     226              :       TYPE(swarm_message_type), INTENT(OUT)              :: cmd
     227              : 
     228              :       CHARACTER(len=default_string_length)               :: command, status
     229              :       INTEGER                                            :: handle, worker_id
     230              :       LOGICAL                                            :: should_stop
     231              : 
     232           33 :       should_stop = .FALSE. ! local flag set by the behavior, master%should_stop is the persistent one
     233              : 
     234           33 :       CALL timeset("swarm_master_steer", handle)
     235              : 
     236              :       ! First check if there are queued commands for this worker
     237           33 :       CALL swarm_message_get(report, "worker_id", worker_id)
     238              : 
     239           33 :       IF (ASSOCIATED(master%queued_commands(worker_id)%p)) THEN ! NOTE(review): worker_id is not range-checked against n_workers
     240            1 :          cmd = master%queued_commands(worker_id)%p
     241            1 :          DEALLOCATE (master%queued_commands(worker_id)%p) ! empties the slot, the next report is processed normally
     242            2 :          IF (master%iw > 0) WRITE (master%iw, '(A,A,A,I9,1X,A)') ' SWARM| ', &
     243            1 :             REPEAT("*", 9), " Sending out queued command to worker: ", &
     244            2 :             worker_id, REPEAT("*", 9)
     245            1 :          CALL timestop(handle)
     246            1 :          RETURN ! queued commands bypass the iteration counter, the behavior and the comlog
     247              :       END IF
     248              : 
     249           32 :       IF (.NOT. master%ignore_last_iteration) THEN ! not counted right after a wait_done report was answered with wait
     250              :          ! There are no queued commands. Do the normal processing.
     251           32 :          master%i_iteration = master%i_iteration + 1
     252              : 
     253           64 :          IF (master%iw > 0) WRITE (master%iw, '(A,A,1X,I8,A,A)') ' SWARM| ', REPEAT("*", 15), &
     254           64 :             master%i_iteration, ' Master / Worker Communication  ', REPEAT("*", 15)
     255              :       END IF
     256              : 
     257           32 :       IF (master%i_iteration >= master%max_iter .AND. .NOT. master%should_stop) THEN ! first time the limit is hit
     258            1 :          IF (master%iw > 0) WRITE (master%iw, '(A)') " SWARM| Reached MAX_ITER. Quitting."
     259            1 :          master%should_stop = .TRUE.
     260              :       END IF
     261              : 
     262           32 :       IF (.NOT. master%should_stop) THEN ! external_control may set master%should_stop
     263           31 :          CALL external_control(master%should_stop, "SWARM", master%globenv)
     264           31 :          IF (master%should_stop .AND. master%iw > 0) &
     265            0 :             WRITE (master%iw, *) " SWARM| Received stop from external_control. Quitting."
     266              :       END IF
     267              : 
     268              :       !IF(unit > 0) &
     269              : 
     270           32 :       IF (master%should_stop) THEN ! stopping: the behavior is not consulted any more
     271            1 :          CALL swarm_message_add(cmd, "command", "shutdown")
     272            1 :          IF (master%iw > 0) WRITE (master%iw, '(1X,A,T71,I10)') &
     273            1 :             "SWARM| Sending shutdown command to worker", worker_id
     274              :       ELSE
     275           62 :          SELECT CASE (master%behavior) ! let the selected behavior build the command
     276              :          CASE (swarm_do_glbopt)
     277           31 :             CALL glbopt_master_steer(master%glbopt, report, cmd, should_stop)
     278              :          CASE DEFAULT
     279           31 :             CPABORT("got unknown behavior")
     280              :          END SELECT
     281              : 
     282           31 :          IF (should_stop) THEN ! the behavior asked to stop, its command is replaced by a shutdown
     283            2 :             CALL swarm_message_free(cmd)
     284            2 :             CALL swarm_message_add(cmd, "command", "shutdown") !overwrite command
     285            2 :             IF (master%iw > 0) WRITE (master%iw, '(1X,A,T71,I10)') &
     286            2 :                "SWARM| Sending shutdown command to worker", worker_id
     287            2 :             master%should_stop = .TRUE.
     288              :          END IF
     289              :       END IF
     290              : 
     291           32 :       CALL swarm_message_add(cmd, "worker_id", worker_id) ! address the command to the reporting worker
     292              : 
     293              :       ! Don't pollute comlog with "continue waiting"-commands.
     294           32 :       CALL swarm_message_get(report, "status", status)
     295           32 :       CALL swarm_message_get(cmd, "command", command)
     296           32 :       IF (TRIM(status) == "wait_done") master%n_waiting = master%n_waiting - 1 ! this worker finished waiting
     297           32 :       IF (TRIM(command) == "wait") master%n_waiting = master%n_waiting + 1 ! this worker is told to wait
     298           32 :       IF (master%n_waiting < 0) CPABORT("master%n_waiting < 0")
     299           32 :       IF (TRIM(status) /= "wait_done" .OR. TRIM(command) /= "wait") THEN ! anything but wait_done answered with wait
     300           32 :          CALL swarm_message_file_write(report, master%comlog_unit) ! NOTE(review): no comlog_unit > 0 guard here - confirm the callee tolerates it
     301           32 :          CALL swarm_message_file_write(cmd, master%comlog_unit)
     302           32 :          IF (master%n_waiting > 0 .AND. master%iw > 0) WRITE (master%iw, '(1X,A,T71,I10)') &
     303            0 :             "SWARM| Number of waiting workers:", master%n_waiting
     304           32 :          master%ignore_last_iteration = .FALSE.
     305              :       ELSE
     306            0 :          master%ignore_last_iteration = .TRUE. ! the next call will not advance i_iteration
     307              :       END IF
     308           32 :       CALL timestop(handle)
     309              :    END SUBROUTINE swarm_master_steer
     310              : 
     311              : ! **************************************************************************************************
     312              : !> \brief Finalizes the swarm master: reports the iteration count, finalizes the behavior, releases the command queue and the output units
     313              : !> \param master swarm master state to tear down
     314              : !> \author Ole Schuett
     315              : ! **************************************************************************************************
     316            3 :    SUBROUTINE swarm_master_finalize(master)
     317              :       TYPE(swarm_master_type)                            :: master
     318              : 
     319              :       TYPE(cp_logger_type), POINTER                      :: logger
     320              : 
     321            3 :       IF (master%iw > 0) THEN
     322            3 :          WRITE (master%iw, "(1X,A,T71,I10)") "SWARM| Total number of iterations ", master%i_iteration
     323            3 :          WRITE (master%iw, "(A)") " SWARM| Shutting down the master."
     324              :       END IF
     325              : 
     326            6 :       SELECT CASE (master%behavior) ! tear down the behavior-specific part set up in swarm_master_init
     327              :       CASE (swarm_do_glbopt)
     328            3 :          CALL glbopt_master_finalize(master%glbopt)
     329            3 :          DEALLOCATE (master%glbopt)
     330              :       CASE DEFAULT
     331            3 :          CPABORT("got unknown behavior")
     332              :       END SELECT
     333              : 
     334            3 :       DEALLOCATE (master%queued_commands) ! NOTE(review): still-associated queued_commands(i)%p are not released first - possible leak, confirm
     335              : 
     336            3 :       logger => cp_get_default_logger()
     337              :       CALL cp_print_key_finished_output(master%iw, logger, &
     338            3 :                                         master%swarm_section, "PRINT%MASTER_RUN_INFO")
     339              :       CALL cp_print_key_finished_output(master%comlog_unit, logger, &
     340            3 :                                         master%swarm_section, "PRINT%COMMUNICATION_LOG")
     341              : 
     342              :       !CALL rm_timer_env() !pops the top-most timer
     343            3 :    END SUBROUTINE swarm_master_finalize
     344              : 
     345            0 : END MODULE swarm_master
     346              : 
        

Generated by: LCOV version 2.0-1