LCOV - code coverage report
Current view: top level - src - hfx_load_balance_methods.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:1f285aa) Lines: 713 1045 68.2 %
Date: 2024-04-23 06:49:27 Functions: 9 14 64.3 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \brief Routines for optimizing load balance between processes in HFX calculations
      10             : !> \par History
      11             : !>      04.2008 created [Manuel Guidon]
      12             : !> \author Manuel Guidon
      13             : ! **************************************************************************************************
      14             : MODULE hfx_load_balance_methods
      15             :    USE cell_types, ONLY: cell_type
      16             :    USE cp_files, ONLY: close_file, &
      17             :                        open_file
      18             :    USE message_passing, ONLY: mp_para_env_type
      19             :    USE hfx_pair_list_methods, ONLY: build_atomic_pair_list, &
      20             :                                     build_pair_list
      21             :    USE hfx_types, ONLY: &
      22             :       hfx_basis_type, hfx_block_range_type, hfx_distribution, hfx_load_balance_type, hfx_p_kind, &
      23             :       hfx_screen_coeff_type, hfx_set_distr_energy, hfx_set_distr_forces, hfx_type, &
      24             :       pair_list_type, pair_set_list_type
      25             :    USE input_constants, ONLY: hfx_do_eval_energy, &
      26             :                               hfx_do_eval_forces
      27             :    USE kinds, ONLY: dp, &
      28             :                     int_8
      29             :    USE message_passing, ONLY: mp_waitall, mp_request_type
      30             :    USE parallel_rng_types, ONLY: UNIFORM, &
      31             :                                  rng_stream_type
      32             :    USE particle_types, ONLY: particle_type
      33             :    USE util, ONLY: sort
      34             : #include "./base/base_uses.f90"
      35             : 
      36             :    IMPLICIT NONE
      37             :    PRIVATE
      38             : 
      39             :    PUBLIC :: hfx_load_balance, &
      40             :              hfx_update_load_balance, &
      41             :              collect_load_balance_info, cost_model, p1_energy, p2_energy, p3_energy
      42             : 
      43             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'hfx_load_balance_methods'
      44             : 
      45             :    REAL(KIND=dp), PARAMETER :: p1_energy(12) = (/2.9461408209700424_dp, 1.0624718662999657_dp, &
      46             :                                                  -1.91570128356921242E-002_dp, -1.6668495454436603_dp, &
      47             :                                                  1.7512639006523709_dp, -9.76074323945336081E-002_dp, &
      48             :                                                  2.6230786127311889_dp, -0.31870737623014189_dp, &
      49             :                                                  7.9588203912690973_dp, 1.8331423413134813_dp, &
      50             :                                                  -0.15427618665346299_dp, 0.19749436090711650_dp/)
      51             :    REAL(KIND=dp), PARAMETER :: p2_energy(12) = (/2.3104682960662593_dp, 1.8744052737304417_dp, &
      52             :                                                  -9.36564055598656797E-002_dp, 0.64284973765086939_dp, &
      53             :                                                  1.0137565430060556_dp, -6.80088178288954567E-003_dp, &
      54             :                                                  1.1692629207374552_dp, -2.6314710080507573_dp, &
      55             :                                                  19.237814781880786_dp, 1.0505934173661349_dp, &
      56             :                                                  0.80382371955699250_dp, 0.49903401991818103_dp/)
      57             :    REAL(KIND=dp), PARAMETER :: p3_energy(2) = (/7.82336287670072350E-002_dp, 0.38073304105744837_dp/)
      58             :    REAL(KIND=dp), PARAMETER :: p1_forces(12) = (/2.5746279948798874_dp, 1.3420575378609276_dp, &
      59             :                                                  -9.41673106447732111E-002_dp, 0.94568006899317825_dp, &
      60             :                                                  -1.4511897117448544_dp, 0.59178934677316952_dp, &
      61             :                                                  2.7291149361757236_dp, -0.50555512044800210_dp, &
      62             :                                                  8.3508180969609871_dp, 1.6829982496141809_dp, &
      63             :                                                  -0.74895370472152600_dp, 0.43801726744197500_dp/)
      64             :    REAL(KIND=dp), PARAMETER :: p2_forces(12) = (/2.6398568961569020_dp, 2.3024918834564101_dp, &
      65             :                                                  5.33216585432061581E-003_dp, 0.45572145697283628_dp, &
      66             :                                                  1.8119743851500618_dp, -0.12533918548421166_dp, &
      67             :                                                  -1.4040312084552751_dp, -4.5331650463917859_dp, &
      68             :                                                  12.593431549069477_dp, 1.1311978374487595_dp, &
      69             :                                                  1.4245996087624646_dp, 1.1425350529853495_dp/)
      70             :    REAL(KIND=dp), PARAMETER :: p3_forces(2) = (/0.12051930516830946_dp, 1.3828051586144336_dp/)
      71             : 
      72             : !***
      73             : 
      74             : CONTAINS
      75             : 
      76             : ! **************************************************************************************************
      77             : !> \brief Distributes the computation of eri's to all available processes.
      78             : !> \param x_data Object that stores the indices array
      79             : !> \param eps_schwarz screening parameter
      80             : !> \param particle_set , atomic_kind_set, para_env ...
      81             : !> \param max_set Maximum number of set to be considered
      82             : !> \param para_env para_env
      83             : !> \param coeffs_set screening functions
      84             : !> \param coeffs_kind screening functions
      85             : !> \param is_assoc_atomic_block_global KS-matrix sparsity
      86             : !> \param do_periodic flag for periodicity
      87             : !> \param load_balance_parameter Parameters for Monte-Carlo routines
      88             : !> \param kind_of helper array for mapping
      89             : !> \param basis_parameter Basis set parameters
      90             : !> \param pmax_set Initial screening matrix
      91             : !> \param pmax_atom ...
      92             : !> \param i_thread Process ID of current Thread
      93             : !> \param n_threads Total Number of Threads
      94             : !> \param cell cell
      95             : !> \param do_p_screening Flag for initial p screening
      96             : !> \param map_atom_to_kind_atom ...
      97             : !> \param nkind ...
      98             : !> \param eval_type ...
      99             : !> \param pmax_block ...
     100             : !> \param use_virial ...
     101             : !> \par History
     102             : !>      06.2007 created [Manuel Guidon]
     103             : !>      08.2007 new parallel scheme [Manuel Guidon]
     104             : !>      09.2007 new 'modulo' parellel scheme and Monte Carlo step [Manuel Guidon]
     105             : !>      11.2007 parallelize load balance on box_idx1 [Manuel Guidon]
     106             : !>      02.2009 completely refactored [Manuel Guidon]
     107             : !> \author Manuel Guidon
     108             : !> \note
     109             : !>      The optimization is done via a binning procedure followed by simple
     110             : !>      Monte Carlo procedure:
     111             : !>      In a first step the total amount of integrals in the system is calculated,
     112             : !>      taking into account the sparsity of the KS-matrix , the screening based
     113             : !>      on near/farfield approximations and if desired the screening on an initial
     114             : !>      density matrix.
     115             : !>      In a second step, bins are generate that contain approximately the same number
     116             : !>      of integrals, and a cost for these bins is estimated (currently the number of integrals)
     117             : !>      In a third step, a Monte Carlo procedure optimizes the assignment
     118             : !>      of the different loads to each process
     119             : !>      At the end each process owns an unique array of *atomic* indices-ranges
     120             : !>      that are used to decide whether a process has to calculate a certain
     121             : !>      bunch of integrals or not
     122             : ! **************************************************************************************************
     123        1744 :    SUBROUTINE hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, &
     124             :                                coeffs_set, coeffs_kind, &
     125        1744 :                                is_assoc_atomic_block_global, do_periodic, &
     126             :                                load_balance_parameter, kind_of, basis_parameter, pmax_set, &
     127             :                                pmax_atom, i_thread, n_threads, cell, &
     128             :                                do_p_screening, map_atom_to_kind_atom, nkind, eval_type, &
     129             :                                pmax_block, use_virial)
     130             :       TYPE(hfx_type), POINTER                            :: x_data
     131             :       REAL(dp), INTENT(IN)                               :: eps_schwarz
     132             :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     133             :       INTEGER, INTENT(IN)                                :: max_set
     134             :       TYPE(mp_para_env_type), POINTER                    :: para_env
     135             :       TYPE(hfx_screen_coeff_type), &
     136             :          DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
     137             :       TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
     138             :          POINTER                                         :: coeffs_kind
     139             :       INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
     140             :       LOGICAL                                            :: do_periodic
     141             :       TYPE(hfx_load_balance_type), POINTER               :: load_balance_parameter
     142             :       INTEGER                                            :: kind_of(*)
     143             :       TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
     144             :       TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
     145             :       REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
     146             :       INTEGER, INTENT(IN)                                :: i_thread, n_threads
     147             :       TYPE(cell_type), POINTER                           :: cell
     148             :       LOGICAL, INTENT(IN)                                :: do_p_screening
     149             :       INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
     150             :       INTEGER, INTENT(IN)                                :: nkind, eval_type
     151             :       REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_block
     152             :       LOGICAL, INTENT(IN)                                :: use_virial
     153             : 
     154             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'hfx_load_balance'
     155             : 
     156             :       CHARACTER(LEN=512)                                 :: error_msg
     157             :       INTEGER :: block_size, current_block_id, data_from, dest, handle, handle_inner, &
     158             :                  handle_range, i, iatom_block, iatom_end, iatom_start, ibin, icpu, j, jatom_block, &
     159             :                  jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, &
     160             :                  latom_start, mepos, my_process_id, n_processes, natom, nbins, nblocks, ncpu, &
     161             :                  new_iatom_end, new_iatom_start, new_jatom_end, new_jatom_start, non_empty_blocks, &
     162             :                  objective_block_size, objective_nblocks, source, total_blocks
     163        5232 :       TYPE(mp_request_type), DIMENSION(2) :: req
     164             :       INTEGER(int_8) :: atom_block, cost_per_bin, cost_per_core, current_cost, &
     165             :                         distribution_counter_end, distribution_counter_start, global_quartet_counter, &
     166             :                         local_quartet_counter, self_cost_per_block, tmp_block, total_block_self_cost
     167        1744 :       INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: buffer_in, buffer_out
     168        1744 :       INTEGER(int_8), DIMENSION(:), POINTER              :: local_cost_matrix, recbuffer, &
     169        1744 :                                                             sendbuffer, swapbuffer
     170             :       INTEGER(int_8), DIMENSION(:), POINTER, SAVE        :: cost_matrix
     171             :       INTEGER(int_8), SAVE                               :: shm_global_quartet_counter, &
     172             :                                                             shm_local_quartet_counter
     173        3488 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: rcount, rdispl, tmp_index, tmp_pos, &
     174        1744 :                                                             to_be_sorted
     175             :       INTEGER, DIMENSION(:), POINTER, SAVE               :: shm_distribution_vector
     176             :       INTEGER, SAVE                                      :: shm_nblocks
     177             :       LOGICAL                                            :: changed, last_bin_needs_to_be_filled, &
     178             :                                                             optimized
     179             :       LOGICAL, DIMENSION(:, :), POINTER, SAVE            :: atomic_pair_list
     180             :       REAL(dp)                                           :: coeffs_kind_max0, log10_eps_schwarz, &
     181             :                                                             log_2, pmax_blocks
     182        3488 :       TYPE(hfx_block_range_type), DIMENSION(:), POINTER  :: blocks_guess, tmp_blocks, tmp_blocks2
     183             :       TYPE(hfx_block_range_type), DIMENSION(:), &
     184             :          POINTER, SAVE                                   :: shm_blocks
     185        5232 :       TYPE(hfx_distribution), DIMENSION(:), POINTER      :: binned_dist, ptr_to_tmp_dist, tmp_dist
     186             :       TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
     187             :          SAVE                                            :: full_dist
     188             :       TYPE(pair_list_type)                               :: list_ij, list_kl
     189             :       TYPE(pair_set_list_type), ALLOCATABLE, &
     190        1744 :          DIMENSION(:)                                    :: set_list_ij, set_list_kl
     191             : 
     192        1744 : !$OMP BARRIER
     193        3488 : !$OMP MASTER
     194        1744 :       CALL timeset(routineN, handle)
     195             : !$OMP END MASTER
     196        1744 : !$OMP BARRIER
     197             : 
     198        1744 :       log10_eps_schwarz = LOG10(eps_schwarz)
     199        1744 :       log_2 = LOG10(2.0_dp)
     200       12020 :       coeffs_kind_max0 = MAXVAL(coeffs_kind(:, :)%x(2))
     201        1744 :       ncpu = para_env%num_pe
     202        1744 :       n_processes = ncpu*n_threads
     203        1744 :       natom = SIZE(particle_set)
     204             : 
     205        1744 :       block_size = load_balance_parameter%block_size
     206       33348 :       ALLOCATE (set_list_ij((max_set*block_size)**2))
     207       33348 :       ALLOCATE (set_list_kl((max_set*block_size)**2))
     208             : 
     209        1744 :       IF (.NOT. load_balance_parameter%blocks_initialized) THEN
     210        1216 : !$OMP BARRIER
     211        1216 : !$OMP MASTER
     212        1216 :          CALL timeset(routineN//"_range", handle_range)
     213             : 
     214        1216 :          nblocks = MAX((natom + block_size - 1)/block_size, 1)
     215        7478 :          ALLOCATE (blocks_guess(nblocks))
     216        7478 :          ALLOCATE (tmp_blocks(natom))
     217        7478 :          ALLOCATE (tmp_blocks2(natom))
     218             : 
     219        1216 :          pmax_blocks = 0.0_dp
     220        2432 :          SELECT CASE (eval_type)
     221             :          CASE (hfx_do_eval_energy)
     222        1216 :             atomic_pair_list => x_data%atomic_pair_list
     223             :          CASE (hfx_do_eval_forces)
     224        1216 :             atomic_pair_list => x_data%atomic_pair_list_forces
     225             :          END SELECT
     226       20716 :          atomic_pair_list = .TRUE.
     227             :          CALL init_blocks(nkind, para_env, natom, block_size, nblocks, blocks_guess, &
     228             :                           list_ij, list_kl, set_list_ij, set_list_kl, &
     229             :                           particle_set, &
     230             :                           coeffs_set, coeffs_kind, &
     231             :                           is_assoc_atomic_block_global, do_periodic, &
     232             :                           kind_of, basis_parameter, pmax_set, pmax_atom, &
     233             :                           pmax_blocks, cell, &
     234             :                           do_p_screening, map_atom_to_kind_atom, eval_type, &
     235        1216 :                           log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     236             : 
     237        1216 :          total_block_self_cost = 0
     238             : 
     239        5046 :          DO i = 1, nblocks
     240        5046 :             total_block_self_cost = total_block_self_cost + blocks_guess(i)%cost
     241             :          END DO
     242             : 
     243        1216 :          CALL para_env%sum(total_block_self_cost)
     244             : 
     245        1216 :          objective_block_size = load_balance_parameter%block_size
     246        1216 :          objective_nblocks = MAX((natom + objective_block_size - 1)/objective_block_size, 1)
     247             : 
     248        1216 :          self_cost_per_block = (total_block_self_cost + objective_nblocks - 1)/(objective_nblocks)
     249             : 
     250        5046 :          DO i = 1, nblocks
     251        5046 :             tmp_blocks2(i) = blocks_guess(i)
     252             :          END DO
     253             : 
     254             :          optimized = .FALSE.
     255             :          i = 0
     256        2432 :          DO WHILE (.NOT. optimized)
     257        1216 :             i = i + 1
     258        1216 :             current_block_id = 0
     259        1216 :             changed = .FALSE.
     260        5046 :             DO atom_block = 1, nblocks
     261        3830 :                current_block_id = current_block_id + 1
     262        3830 :                iatom_start = tmp_blocks2(atom_block)%istart
     263        3830 :                iatom_end = tmp_blocks2(atom_block)%iend
     264        5046 :                IF (tmp_blocks2(atom_block)%cost > 1.5_dp*self_cost_per_block .AND. iatom_end - iatom_start > 0) THEN
     265           0 :                   changed = .TRUE.
     266           0 :                   new_iatom_start = iatom_start
     267           0 :                   new_iatom_end = (iatom_end - iatom_start + 1)/2 + iatom_start - 1
     268           0 :                   new_jatom_start = new_iatom_end + 1
     269           0 :                   new_jatom_end = iatom_end
     270           0 :                   tmp_blocks(current_block_id)%istart = new_iatom_start
     271           0 :                   tmp_blocks(current_block_id)%iend = new_iatom_end
     272             :                   tmp_blocks(current_block_id)%cost = estimate_block_cost( &
     273             :                                                       natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
     274             :                                                       new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
     275             :                                                       new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
     276             :                                                       particle_set, &
     277             :                                                       coeffs_set, coeffs_kind, &
     278             :                                                       is_assoc_atomic_block_global, do_periodic, &
     279             :                                                       kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
     280             :                                                       cell, &
     281             :                                                       do_p_screening, map_atom_to_kind_atom, eval_type, &
     282           0 :                                                       log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     283           0 :                   current_block_id = current_block_id + 1
     284           0 :                   tmp_blocks(current_block_id)%istart = new_jatom_start
     285           0 :                   tmp_blocks(current_block_id)%iend = new_jatom_end
     286             :                   tmp_blocks(current_block_id)%cost = estimate_block_cost( &
     287             :                                                       natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
     288             :                                                       new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
     289             :                                                       new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
     290             :                                                       particle_set, &
     291             :                                                       coeffs_set, coeffs_kind, &
     292             :                                                       is_assoc_atomic_block_global, do_periodic, &
     293             :                                                       kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
     294             :                                                       cell, &
     295             :                                                       do_p_screening, map_atom_to_kind_atom, eval_type, &
     296           0 :                                                       log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     297             :                ELSE
     298        3830 :                   tmp_blocks(current_block_id)%istart = iatom_start
     299        3830 :                   tmp_blocks(current_block_id)%iend = iatom_end
     300        3830 :                   tmp_blocks(current_block_id)%cost = tmp_blocks2(atom_block)%cost
     301             :                END IF
     302             :             END DO
     303        1216 :             IF (.NOT. changed) optimized = .TRUE.
     304        1216 :             IF (i > 20) optimized = .TRUE.
     305        1216 :             nblocks = current_block_id
     306        6262 :             DO atom_block = 1, nblocks
     307        5046 :                tmp_blocks2(atom_block) = tmp_blocks(atom_block)
     308             :             END DO
     309             :          END DO
     310             : 
     311        1216 :          DEALLOCATE (tmp_blocks2)
     312             : 
     313             :          ! ** count number of non empty blocks on each node
     314        1216 :          non_empty_blocks = 0
     315        5046 :          DO atom_block = 1, nblocks
     316        3830 :             IF (tmp_blocks(atom_block)%istart == 0) CYCLE
     317        5046 :             non_empty_blocks = non_empty_blocks + 1
     318             :          END DO
     319             : 
     320        3648 :          ALLOCATE (rcount(ncpu))
     321        3634 :          rcount = 0
     322        1216 :          rcount(para_env%mepos + 1) = non_empty_blocks
     323        1216 :          CALL para_env%sum(rcount)
     324             : 
     325             :          ! ** sum all non_empty_blocks
     326        1216 :          total_blocks = 0
     327        3634 :          DO i = 1, ncpu
     328        3634 :             total_blocks = total_blocks + rcount(i)
     329             :          END DO
     330             : 
     331             :          ! ** calculate offsets
     332        3648 :          ALLOCATE (rdispl(ncpu))
     333        3634 :          rcount(:) = rcount(:)*3
     334        1216 :          rdispl(1) = 0
     335        2418 :          DO i = 2, ncpu
     336        2418 :             rdispl(i) = rdispl(i - 1) + rcount(i - 1)
     337             :          END DO
     338             : 
     339        3620 :          ALLOCATE (buffer_in(3*non_empty_blocks))
     340             : 
     341        1216 :          non_empty_blocks = 0
     342        5046 :          DO atom_block = 1, nblocks
     343        3830 :             IF (tmp_blocks(atom_block)%istart == 0) CYCLE
     344        1936 :             buffer_in(non_empty_blocks*3 + 1) = tmp_blocks(atom_block)%istart
     345        1936 :             buffer_in(non_empty_blocks*3 + 2) = tmp_blocks(atom_block)%iend
     346        1936 :             buffer_in(non_empty_blocks*3 + 3) = tmp_blocks(atom_block)%cost
     347        5046 :             non_empty_blocks = non_empty_blocks + 1
     348             :          END DO
     349             : 
     350        1216 :          nblocks = total_blocks
     351             : 
     352        7478 :          ALLOCATE (tmp_blocks2(nblocks))
     353             : 
     354        3648 :          ALLOCATE (buffer_out(3*nblocks))
     355             : 
     356             :          ! ** Gather all three arrays
     357        1216 :          CALL para_env%allgatherv(buffer_in, buffer_out, rcount, rdispl)
     358             : 
     359        5046 :          DO i = 1, nblocks
     360        3830 :             tmp_blocks2(i)%istart = INT(buffer_out((i - 1)*3 + 1))
     361        3830 :             tmp_blocks2(i)%iend = INT(buffer_out((i - 1)*3 + 2))
     362        5046 :             tmp_blocks2(i)%cost = buffer_out((i - 1)*3 + 3)
     363             :          END DO
     364             : 
     365             :          ! ** Now we sort the blocks
     366        3648 :          ALLOCATE (to_be_sorted(nblocks))
     367        3648 :          ALLOCATE (tmp_index(nblocks))
     368             : 
     369        5046 :          DO atom_block = 1, nblocks
     370        5046 :             to_be_sorted(atom_block) = tmp_blocks2(atom_block)%istart
     371             :          END DO
     372             : 
     373        1216 :          CALL sort(to_be_sorted, nblocks, tmp_index)
     374             : 
     375        7478 :          ALLOCATE (x_data%blocks(nblocks))
     376             : 
     377        5046 :          DO atom_block = 1, nblocks
     378        5046 :             x_data%blocks(atom_block) = tmp_blocks2(tmp_index(atom_block))
     379             :          END DO
     380             : 
     381        1216 :          shm_blocks => x_data%blocks
     382        1216 :          shm_nblocks = nblocks
     383             : 
     384             :          ! ** Set nblocks in structure
     385        1216 :          load_balance_parameter%nblocks = nblocks
     386             : 
     387        1216 :          DEALLOCATE (blocks_guess, tmp_blocks, tmp_blocks2)
     388             : 
     389        1216 :          DEALLOCATE (rcount, rdispl, buffer_in, buffer_out, to_be_sorted, tmp_index)
     390             : 
     391        1216 :          load_balance_parameter%blocks_initialized = .TRUE.
     392             : 
     393        8876 :          x_data%blocks = shm_blocks
     394        1216 :          load_balance_parameter%nblocks = shm_nblocks
     395        1216 :          load_balance_parameter%blocks_initialized = .TRUE.
     396             : 
     397        4864 :          ALLOCATE (x_data%pmax_block(shm_nblocks, shm_nblocks))
     398       20716 :          x_data%pmax_block = 0.0_dp
     399        1216 :          pmax_block => x_data%pmax_block
     400        2432 :          CALL timestop(handle_range)
     401             : !$OMP END MASTER
     402        1216 : !$OMP BARRIER
     403             : 
     404        1216 :          IF (.NOT. load_balance_parameter%blocks_initialized) THEN
     405           0 :             ALLOCATE (x_data%blocks(shm_nblocks))
     406           0 :             x_data%blocks = shm_blocks
     407           0 :             load_balance_parameter%nblocks = shm_nblocks
     408           0 :             load_balance_parameter%blocks_initialized = .TRUE.
     409             :          END IF
     410             :          !! ** precalculate maximum density matrix elements in blocks
     411        1216 : !$OMP BARRIER
     412             :       END IF
     413             : 
     414        1744 : !$OMP BARRIER
     415        1744 : !$OMP MASTER
     416        1744 :       pmax_block => x_data%pmax_block
     417       28132 :       pmax_block = 0.0_dp
     418        1744 :       IF (do_p_screening) THEN
     419        1454 :          DO iatom_block = 1, shm_nblocks
     420        1118 :             iatom_start = x_data%blocks(iatom_block)%istart
     421        1118 :             iatom_end = x_data%blocks(iatom_block)%iend
     422        5648 :             DO jatom_block = 1, shm_nblocks
     423        4194 :                jatom_start = x_data%blocks(jatom_block)%istart
     424        4194 :                jatom_end = x_data%blocks(jatom_block)%iend
     425       13700 :                pmax_block(iatom_block, jatom_block) = MAXVAL(pmax_atom(iatom_start:iatom_end, jatom_start:jatom_end))
     426             :             END DO
     427             :          END DO
     428             :       END IF
     429             : 
     430        2960 :       SELECT CASE (eval_type)
     431             :       CASE (hfx_do_eval_energy)
     432        1216 :          atomic_pair_list => x_data%atomic_pair_list
     433             :       CASE (hfx_do_eval_forces)
     434        1744 :          atomic_pair_list => x_data%atomic_pair_list_forces
     435             :       END SELECT
     436             :       CALL build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, &
     437             :                                   do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, &
     438        1744 :                                   x_data%blocks)
     439             : 
     440             : !$OMP END MASTER
     441        1744 : !$OMP BARRIER
     442             : 
     443             :       !! If there is only 1 cpu skip the binning
     444        1744 :       IF (n_processes == 1) THEN
     445          28 :          ALLOCATE (tmp_dist(1))
     446          14 :          tmp_dist(1)%number_of_atom_quartets = HUGE(tmp_dist(1)%number_of_atom_quartets)
     447          14 :          tmp_dist(1)%istart = 0_int_8
     448          14 :          ptr_to_tmp_dist => tmp_dist(:)
     449          28 :          SELECT CASE (eval_type)
     450             :          CASE (hfx_do_eval_energy)
     451          14 :             CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
     452             :          CASE (hfx_do_eval_forces)
     453          14 :             CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
     454             :          END SELECT
     455          14 :          DEALLOCATE (tmp_dist)
     456             :       ELSE
     457             :          !! Calculate total numbers of integrals that have to be calculated (wrt screening and symmetry)
     458        1730 : !$OMP BARRIER
     459        1730 : !$OMP MASTER
     460        1730 :          CALL timeset(routineN//"_count", handle_inner)
     461             : !$OMP END MASTER
     462        1730 : !$OMP BARRIER
     463             : 
     464        1730 :          cost_per_core = 0_int_8
     465        1730 :          my_process_id = para_env%mepos*n_threads + i_thread
     466        1730 :          nblocks = load_balance_parameter%nblocks
     467             : 
     468      481024 :          DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
     469             : 
     470      479294 :             latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
     471      479294 :             tmp_block = atom_block/nblocks
     472      479294 :             katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
     473      479294 :             IF (latom_block < katom_block) CYCLE
     474      268410 :             tmp_block = tmp_block/nblocks
     475      268410 :             jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
     476      268410 :             tmp_block = tmp_block/nblocks
     477      268410 :             iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
     478      268410 :             IF (jatom_block < iatom_block) CYCLE
     479             : 
     480      151191 :             iatom_start = x_data%blocks(iatom_block)%istart
     481      151191 :             iatom_end = x_data%blocks(iatom_block)%iend
     482      151191 :             jatom_start = x_data%blocks(jatom_block)%istart
     483      151191 :             jatom_end = x_data%blocks(jatom_block)%iend
     484      151191 :             katom_start = x_data%blocks(katom_block)%istart
     485      151191 :             katom_end = x_data%blocks(katom_block)%iend
     486      151191 :             latom_start = x_data%blocks(latom_block)%istart
     487      151191 :             latom_end = x_data%blocks(latom_block)%iend
     488             : 
     489      286182 :             SELECT CASE (eval_type)
     490             :             CASE (hfx_do_eval_energy)
     491             :                pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
     492             :                                  pmax_block(latom_block, jatom_block), &
     493             :                                  pmax_block(latom_block, iatom_block), &
     494      134991 :                                  pmax_block(katom_block, jatom_block))
     495             :             CASE (hfx_do_eval_forces)
     496             :                pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
     497             :                                  pmax_block(latom_block, jatom_block), &
     498             :                                  pmax_block(latom_block, iatom_block) + &
     499      151191 :                                  pmax_block(katom_block, jatom_block))
     500             :             END SELECT
     501             : 
     502      151191 :             IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
     503             : 
     504             :             cost_per_core = cost_per_core &
     505             :                             + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
     506             :                                                   iatom_start, iatom_end, jatom_start, jatom_end, &
     507             :                                                   katom_start, katom_end, latom_start, latom_end, &
     508             :                                                   particle_set, &
     509             :                                                   coeffs_set, coeffs_kind, &
     510             :                                                   is_assoc_atomic_block_global, do_periodic, &
     511             :                                                   kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
     512             :                                                   cell, &
     513             :                                                   do_p_screening, map_atom_to_kind_atom, eval_type, &
     514      481024 :                                                   log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     515             : 
     516             :          END DO ! atom_block
     517             : 
     518        1730 :          nbins = load_balance_parameter%nbins
     519        1730 :          cost_per_bin = (cost_per_core + nbins - 1)/(nbins)
     520             : 
     521        1730 : !$OMP BARRIER
     522        1730 : !$OMP MASTER
     523        1730 :          CALL timestop(handle_inner)
     524             : !$OMP END MASTER
     525        1730 : !$OMP BARRIER
     526             : 
     527             : ! new load balancing test
     528             :          IF (.FALSE.) THEN
     529             :             CALL hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
     530             :                                             natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
     531             :                                             particle_set, &
     532             :                                             coeffs_set, coeffs_kind, &
     533             :                                             is_assoc_atomic_block_global, do_periodic, &
     534             :                                             kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
     535             :                                             cell, x_data, para_env, pmax_block, &
     536             :                                             do_p_screening, map_atom_to_kind_atom, eval_type, &
     537             :                                             log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     538             :          END IF
     539             : 
     540        1730 : !$OMP BARRIER
     541        1730 : !$OMP MASTER
     542        1730 :          CALL timeset(routineN//"_bin", handle_inner)
     543             : !$OMP END MASTER
     544        1730 : !$OMP BARRIER
     545             : 
     546      115910 :          ALLOCATE (binned_dist(nbins))
     547      112450 :          binned_dist(:)%istart = -1_int_8
     548      112450 :          binned_dist(:)%number_of_atom_quartets = 0_int_8
     549      112450 :          binned_dist(:)%cost = 0_int_8
     550      112450 :          binned_dist(:)%time_first_scf = 0.0_dp
     551      112450 :          binned_dist(:)%time_other_scf = 0.0_dp
     552      112450 :          binned_dist(:)%time_forces = 0.0_dp
     553             : 
     554        1730 :          current_cost = 0
     555        1730 :          mepos = 1
     556        1730 :          distribution_counter_start = 1
     557        1730 :          distribution_counter_end = 0
     558        1730 :          ibin = 1
     559             : 
     560        1730 :          global_quartet_counter = 0
     561        1730 :          local_quartet_counter = 0
     562        1730 :          last_bin_needs_to_be_filled = .FALSE.
     563      481024 :          DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
     564      479294 :             latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
     565      479294 :             tmp_block = atom_block/nblocks
     566      479294 :             katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
     567      479294 :             IF (latom_block < katom_block) CYCLE
     568      268410 :             tmp_block = tmp_block/nblocks
     569      268410 :             jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
     570      268410 :             tmp_block = tmp_block/nblocks
     571      268410 :             iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
     572      268410 :             IF (jatom_block < iatom_block) CYCLE
     573             : 
     574      151191 :             distribution_counter_end = distribution_counter_end + 1
     575      151191 :             global_quartet_counter = global_quartet_counter + 1
     576      151191 :             last_bin_needs_to_be_filled = .TRUE.
     577             : 
     578      151191 :             IF (binned_dist(ibin)%istart == -1_int_8) binned_dist(ibin)%istart = atom_block
     579             : 
     580      151191 :             iatom_start = x_data%blocks(iatom_block)%istart
     581      151191 :             iatom_end = x_data%blocks(iatom_block)%iend
     582      151191 :             jatom_start = x_data%blocks(jatom_block)%istart
     583      151191 :             jatom_end = x_data%blocks(jatom_block)%iend
     584      151191 :             katom_start = x_data%blocks(katom_block)%istart
     585      151191 :             katom_end = x_data%blocks(katom_block)%iend
     586      151191 :             latom_start = x_data%blocks(latom_block)%istart
     587      151191 :             latom_end = x_data%blocks(latom_block)%iend
     588             : 
     589      286182 :             SELECT CASE (eval_type)
     590             :             CASE (hfx_do_eval_energy)
     591             :                pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
     592             :                                  pmax_block(latom_block, jatom_block), &
     593             :                                  pmax_block(latom_block, iatom_block), &
     594      134991 :                                  pmax_block(katom_block, jatom_block))
     595             :             CASE (hfx_do_eval_forces)
     596             :                pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
     597             :                                  pmax_block(latom_block, jatom_block), &
     598             :                                  pmax_block(latom_block, iatom_block) + &
     599      151191 :                                  pmax_block(katom_block, jatom_block))
     600             :             END SELECT
     601             : 
     602      151191 :             IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
     603             : 
     604             :             current_cost = current_cost &
     605             :                            + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
     606             :                                                  iatom_start, iatom_end, jatom_start, jatom_end, &
     607             :                                                  katom_start, katom_end, latom_start, latom_end, &
     608             :                                                  particle_set, &
     609             :                                                  coeffs_set, coeffs_kind, &
     610             :                                                  is_assoc_atomic_block_global, do_periodic, &
     611             :                                                  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
     612             :                                                  cell, &
     613             :                                                  do_p_screening, map_atom_to_kind_atom, eval_type, &
     614      150565 :                                                  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     615             : 
     616      152295 :             IF (current_cost >= cost_per_bin) THEN
     617       18688 :                IF (ibin == nbins) THEN
     618             :                   binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
     619           0 :                                                               distribution_counter_end - distribution_counter_start + 1
     620             :                ELSE
     621       18688 :                   binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
     622             :                END IF
     623       18688 :                binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
     624       18688 :                ibin = MIN(ibin + 1, nbins)
     625       18688 :                distribution_counter_start = distribution_counter_end + 1
     626       18688 :                current_cost = 0
     627       18688 :                last_bin_needs_to_be_filled = .FALSE.
     628             :             END IF
     629             :          END DO
     630             : 
     631        1730 : !$OMP BARRIER
     632        1730 : !$OMP MASTER
     633        1730 :          CALL timestop(handle_inner)
     634        1730 :          CALL timeset(routineN//"_dist", handle_inner)
     635             : !$OMP END MASTER
     636        1730 : !$OMP BARRIER
     637             :          !! Fill the last bin if necessary
     638        1730 :          IF (last_bin_needs_to_be_filled) THEN
     639         510 :             binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
     640         510 :             IF (ibin == nbins) THEN
     641             :                binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
     642           0 :                                                            distribution_counter_end - distribution_counter_start + 1
     643             :             ELSE
     644         510 :                binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
     645             :             END IF
     646             :          END IF
     647             : 
     648             :          !! Sanity-Check
     649      112450 :          DO ibin = 1, nbins
     650      112450 :             local_quartet_counter = local_quartet_counter + binned_dist(ibin)%number_of_atom_quartets
     651             :          END DO
     652        1730 : !$OMP BARRIER
     653        1730 : !$OMP MASTER
     654        1730 :          shm_local_quartet_counter = 0
     655        1730 :          shm_global_quartet_counter = 0
     656             : !$OMP END MASTER
     657        1730 : !$OMP BARRIER
     658        1730 : !$OMP ATOMIC
     659             :          shm_local_quartet_counter = shm_local_quartet_counter + local_quartet_counter
     660        1730 : !$OMP ATOMIC
     661             :          shm_global_quartet_counter = shm_global_quartet_counter + global_quartet_counter
     662             : 
     663        1730 : !$OMP BARRIER
     664        1730 : !$OMP MASTER
     665        1730 :          CALL para_env%sum(shm_local_quartet_counter)
     666        1730 :          CALL para_env%sum(shm_global_quartet_counter)
     667        1730 :          IF (para_env%is_source()) THEN
     668         865 :             IF (shm_local_quartet_counter /= shm_global_quartet_counter) THEN
     669             :                WRITE (error_msg, '(A,I0,A,I0,A)') "HFX Sanity check for parallel distribution failed. "// &
     670           0 :                   "Number of local quartets (", shm_local_quartet_counter, &
     671           0 :                   ") and number of global quartets (", shm_global_quartet_counter, &
     672           0 :                   ") are different. Please send in a bug report."
     673           0 :                CPABORT(error_msg)
     674             :             END IF
     675             :          END IF
     676             : !$OMP END MASTER
     677             : 
     678        1730 : !$OMP BARRIER
     679        1730 : !$OMP MASTER
     680        5190 :          ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
     681      223170 :          cost_matrix = 0
     682             : !$OMP END MASTER
     683        1730 : !$OMP BARRIER
     684        1730 :          icpu = para_env%mepos + 1
     685      112450 :          DO i = 1, nbins
     686      112450 :             cost_matrix((icpu - 1)*nbins*n_threads + i_thread*nbins + i) = binned_dist(i)%cost
     687             :          END DO
     688        1730 :          mepos = para_env%mepos
     689        1730 : !$OMP BARRIER
     690             : 
     691        1730 : !$OMP MASTER
     692             :          ! sync before/after ring of isendrecv
     693        1730 :          CALL para_env%sync()
     694             : 
     695        5190 :          ALLOCATE (sendbuffer(nbins*n_threads))
     696        5190 :          ALLOCATE (recbuffer(nbins*n_threads))
     697             : 
     698      223170 :          sendbuffer = cost_matrix(mepos*nbins*n_threads + 1:mepos*nbins*n_threads + nbins*n_threads)
     699             : 
     700        1730 :          dest = MODULO(mepos + 1, ncpu)
     701        1730 :          source = MODULO(mepos - 1, ncpu)
     702        5190 :          DO icpu = 0, ncpu - 1
     703        3460 :             IF (icpu .NE. ncpu - 1) THEN
     704             :                CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
     705        1730 :                                        req(1), req(2), 13)
     706             :             END IF
     707        3460 :             data_from = MODULO(mepos - icpu, ncpu)
     708      446340 :             cost_matrix(data_from*nbins*n_threads + 1:data_from*nbins*n_threads + nbins*n_threads) = sendbuffer
     709        3460 :             IF (icpu .NE. ncpu - 1) THEN
     710        1730 :                CALL mp_waitall(req)
     711             :             END IF
     712        3460 :             swapbuffer => sendbuffer
     713        3460 :             sendbuffer => recbuffer
     714        5190 :             recbuffer => swapbuffer
     715             :          END DO
     716        1730 :          DEALLOCATE (recbuffer, sendbuffer)
     717             : !$OMP END MASTER
     718        1730 : !$OMP BARRIER
     719             : 
     720        1730 : !$OMP BARRIER
     721        1730 : !$OMP MASTER
     722        1730 :          CALL timestop(handle_inner)
     723        1730 :          CALL timeset(routineN//"_opt", handle_inner)
     724             : !$OMP END MASTER
     725        1730 : !$OMP BARRIER
     726             : 
     727             :          !! Find an optimal distribution i.e. assign each element of the cost matrix to a certain process
     728        1730 : !$OMP BARRIER
     729        5190 :          ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
     730      444610 :          local_cost_matrix = cost_matrix
     731        1730 : !$OMP MASTER
     732        5190 :          ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
     733             : 
     734             :          CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
     735        1730 :                                     shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
     736             : 
     737        1730 :          CALL timestop(handle_inner)
     738        1730 :          CALL timeset(routineN//"_redist", handle_inner)
     739             :          !! Collect local data to global array
     740      339080 :          ALLOCATE (full_dist(ncpu*n_threads, nbins))
     741             : 
     742      333890 :          full_dist(:, :)%istart = 0_int_8
     743      333890 :          full_dist(:, :)%number_of_atom_quartets = 0_int_8
     744      333890 :          full_dist(:, :)%cost = 0_int_8
     745      333890 :          full_dist(:, :)%time_first_scf = 0.0_dp
     746      333890 :          full_dist(:, :)%time_other_scf = 0.0_dp
     747      335620 :          full_dist(:, :)%time_forces = 0.0_dp
     748             : !$OMP END MASTER
     749        1730 : !$OMP BARRIER
     750        1730 :          mepos = para_env%mepos + 1
     751      223170 :          full_dist((mepos - 1)*n_threads + i_thread + 1, :) = binned_dist(:)
     752             : 
     753        1730 : !$OMP BARRIER
     754        1730 : !$OMP MASTER
     755        5190 :          ALLOCATE (sendbuffer(3*nbins*n_threads))
     756        5190 :          ALLOCATE (recbuffer(3*nbins*n_threads))
     757        1730 :          mepos = para_env%mepos
     758        3460 :          DO j = 1, n_threads
     759      114180 :             DO i = 1, nbins
     760      110720 :                sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
     761      110720 :                sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
     762      112450 :                sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
     763             :             END DO
     764             :          END DO
     765             : 
     766             :          ! sync before/after ring of isendrecv
     767        1730 :          CALL para_env%sync()
     768        1730 :          dest = MODULO(mepos + 1, ncpu)
     769        1730 :          source = MODULO(mepos - 1, ncpu)
     770        5190 :          DO icpu = 0, ncpu - 1
     771        3460 :             IF (icpu .NE. ncpu - 1) THEN
     772             :                CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
     773        1730 :                                        req(1), req(2), 13)
     774             :             END IF
     775        3460 :             data_from = MODULO(mepos - icpu, ncpu)
     776        6920 :             DO j = 1, n_threads
     777      228360 :                DO i = 1, nbins
     778      221440 :                   full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1)
     779      221440 :                   full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2)
     780      224900 :                   full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3)
     781             :                END DO
     782             :             END DO
     783             : 
     784        3460 :             IF (icpu .NE. ncpu - 1) THEN
     785        1730 :                CALL mp_waitall(req)
     786             :             END IF
     787        3460 :             swapbuffer => sendbuffer
     788        3460 :             sendbuffer => recbuffer
     789        5190 :             recbuffer => swapbuffer
     790             :          END DO
     791        1730 :          DEALLOCATE (recbuffer, sendbuffer)
     792             : 
     793             :          ! sync before/after ring of isendrecv
     794        1730 :          CALL para_env%sync()
     795             : !$OMP END MASTER
     796        1730 : !$OMP BARRIER
     797             :          !! reorder the distribution according to the distribution vector
     798        5190 :          ALLOCATE (tmp_pos(ncpu*n_threads))
     799        5190 :          tmp_pos = 1
     800      226630 :          ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
     801             : 
     802      223170 :          tmp_dist(:)%istart = 0_int_8
     803      223170 :          tmp_dist(:)%number_of_atom_quartets = 0_int_8
     804      223170 :          tmp_dist(:)%cost = 0_int_8
     805      223170 :          tmp_dist(:)%time_first_scf = 0.0_dp
     806      223170 :          tmp_dist(:)%time_other_scf = 0.0_dp
     807      223170 :          tmp_dist(:)%time_forces = 0.0_dp
     808             : 
     809        5190 :          DO icpu = 1, n_processes
     810      226630 :             DO i = 1, nbins
     811      221440 :                mepos = my_process_id + 1
     812      224900 :                IF (shm_distribution_vector((icpu - 1)*nbins + i) == mepos) THEN
     813      110720 :                   tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
     814      110720 :                   tmp_pos(mepos) = tmp_pos(mepos) + 1
     815             :                END IF
     816             :             END DO
     817             :          END DO
     818             : 
     819             :          !! Assign the load to each process
     820             :          NULLIFY (ptr_to_tmp_dist)
     821        1730 :          mepos = my_process_id + 1
     822        1730 :          ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
     823        2932 :          SELECT CASE (eval_type)
     824             :          CASE (hfx_do_eval_energy)
     825        1202 :             CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
     826             :          CASE (hfx_do_eval_forces)
     827        1730 :             CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
     828             :          END SELECT
     829             : 
     830        1730 : !$OMP BARRIER
     831        1730 : !$OMP MASTER
     832        1730 :          DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
     833             : !$OMP END MASTER
     834        1730 : !$OMP BARRIER
     835        1730 :          DEALLOCATE (tmp_dist, tmp_pos)
     836        1730 :          DEALLOCATE (binned_dist, local_cost_matrix)
     837        1730 :          DEALLOCATE (set_list_ij, set_list_kl)
     838             : 
     839        1730 : !$OMP BARRIER
     840        1730 : !$OMP MASTER
     841        1730 :          CALL timestop(handle_inner)
     842             : !$OMP END MASTER
     843        1730 : !$OMP BARRIER
     844             :       END IF
     845        1744 : !$OMP BARRIER
     846        1744 : !$OMP MASTER
     847        1744 :       CALL timestop(handle)
     848             : !$OMP END MASTER
     849        1744 : !$OMP BARRIER
     850     3618800 :    END SUBROUTINE hfx_load_balance
     851             : 
     852             : ! **************************************************************************************************
     853             : !> \brief Reference implementation of new recursive load balancing routine
     854             : !>        Computes a local list of atom_blocks (p_atom_blocks,q_atom_blocks) for
     855             : !>        each process in a P-Q grid such that every process has more or less the
     856             : !>        same amount of work. Has no output at the moment (not used) but writes
     857             : !>        its computed load balance values into a file. Possible output is ready
     858             : !>        to use in the two arrays p_atom_blocks & q_atom_blocks
     859             : !> \param n_processes ...
     860             : !> \param my_process_id ...
     861             : !> \param nblocks ...
     862             : !> \param natom ...
     863             : !> \param nkind ...
     864             : !> \param list_ij ...
     865             : !> \param list_kl ...
     866             : !> \param set_list_ij ...
     867             : !> \param set_list_kl ...
     868             : !> \param particle_set ...
     869             : !> \param coeffs_set ...
     870             : !> \param coeffs_kind ...
     871             : !> \param is_assoc_atomic_block_global ...
     872             : !> \param do_periodic ...
     873             : !> \param kind_of ...
     874             : !> \param basis_parameter ...
     875             : !> \param pmax_set ...
     876             : !> \param pmax_atom ...
     877             : !> \param pmax_blocks ...
     878             : !> \param cell ...
     879             : !> \param x_data ...
     880             : !> \param para_env ...
     881             : !> \param pmax_block ...
     882             : !> \param do_p_screening ...
     883             : !> \param map_atom_to_kind_atom ...
     884             : !> \param eval_type ...
     885             : !> \param log10_eps_schwarz ...
     886             : !> \param log_2 ...
     887             : !> \param coeffs_kind_max0 ...
     888             : !> \param use_virial ...
     889             : !> \param atomic_pair_list ...
     890             : !> \par History
     891             : !>      03.2011 created [Michael Steinlechner]
     892             : !> \author Michael Steinlechner
     893             : ! **************************************************************************************************
     894             : 
     895           0 :    SUBROUTINE hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
     896             :                                          natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
     897             :                                          particle_set, &
     898             :                                          coeffs_set, coeffs_kind, &
     899           0 :                                          is_assoc_atomic_block_global, do_periodic, &
     900             :                                          kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
     901             :                                          cell, x_data, para_env, pmax_block, &
     902             :                                          do_p_screening, map_atom_to_kind_atom, eval_type, &
     903             :                                          log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     904             : 
     905             : ! input variables:
     906             :       INTEGER, INTENT(IN)                                :: n_processes, my_process_id, nblocks, &
     907             :                                                             natom, nkind
     908             :       TYPE(pair_list_type), INTENT(IN)                   :: list_ij, list_kl
     909             :       TYPE(pair_set_list_type), ALLOCATABLE, &
     910             :          DIMENSION(:), INTENT(IN)                        :: set_list_ij, set_list_kl
     911             :       TYPE(particle_type), DIMENSION(:), INTENT(IN), &
     912             :          POINTER                                         :: particle_set
     913             :       TYPE(hfx_screen_coeff_type), &
     914             :          DIMENSION(:, :, :, :), INTENT(IN), POINTER      :: coeffs_set
     915             :       TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
     916             :          INTENT(IN), POINTER                             :: coeffs_kind
     917             :       INTEGER, DIMENSION(:, :), INTENT(IN)               :: is_assoc_atomic_block_global
     918             :       LOGICAL, INTENT(IN)                                :: do_periodic
     919             :       INTEGER, INTENT(IN)                                :: kind_of(*)
     920             :       TYPE(hfx_basis_type), DIMENSION(:), INTENT(IN), &
     921             :          POINTER                                         :: basis_parameter
     922             :       TYPE(hfx_p_kind), DIMENSION(:), INTENT(IN), &
     923             :          POINTER                                         :: pmax_set
     924             :       REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER     :: pmax_atom
     925             :       REAL(dp)                                           :: pmax_blocks
     926             :       TYPE(cell_type), INTENT(IN), POINTER               :: cell
     927             :       TYPE(hfx_type), INTENT(IN), POINTER                :: x_data
     928             :       TYPE(mp_para_env_type), INTENT(IN)        :: para_env
     929             :       REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER     :: pmax_block
     930             :       LOGICAL, INTENT(IN)                                :: do_p_screening
     931             :       INTEGER, DIMENSION(:), INTENT(IN), POINTER         :: map_atom_to_kind_atom
     932             :       INTEGER, INTENT(IN)                                :: eval_type
     933             :       REAL(dp), INTENT(IN)                               :: log10_eps_schwarz, log_2, &
     934             :                                                             coeffs_kind_max0
     935             :       LOGICAL, INTENT(IN)                                :: use_virial
     936             :       LOGICAL, DIMENSION(:, :), INTENT(IN), POINTER      :: atomic_pair_list
     937             : 
     938             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_recursive_load_balance'
     939             : 
     940             :       INTEGER :: handle, i, iatom_block, iatom_end, iatom_start, j, jatom_block, jatom_end, &
     941             :                  jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, latom_start, &
     942             :                  nP, nQ, numBins, p, q, sizeP, sizeQ, unit_nr
     943             :       INTEGER(int_8)                                     :: local_cost, pidx, qidx, sumP, sumQ
     944           0 :       INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: local_cost_vector
     945           0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: blocksize, p_atom_blocks, permute, &
     946           0 :                                                             q_atom_blocks
     947             :       REAL(dp)                                           :: maximum, mean
     948             : 
     949             : ! internal variables:
     950             : 
     951           0 : !$OMP BARRIER
     952           0 : !$OMP MASTER
     953           0 :       CALL timeset(routineN, handle)
     954             : !$OMP END MASTER
     955           0 : !$OMP BARRIER
     956             : 
     957             :       ! calculate best p/q distribution grid for the n_processes
     958           0 :       CALL hfx_calculate_PQ(p, q, numBins, n_processes)
     959             : 
     960           0 :       ALLOCATE (blocksize(numBins))
     961           0 :       ALLOCATE (permute(nblocks**2))
     962           0 :       DO i = 1, nblocks**2
     963           0 :          permute(i) = i
     964             :       END DO
     965             : 
     966             :       ! call the main recursive permutation routine.
     967             :       ! Output:
     968             :       !   blocksize :: vector (size numBins) with the sizes for each column/row block
     969             :       !   permute   :: permutation vector
     970             :       CALL hfx_recursive_permute(blocksize, 1, nblocks**2, numBins, &
     971             :                                  permute, 1, &
     972             :                                  my_process_id, n_processes, nblocks, &
     973             :                                  natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
     974             :                                  particle_set, &
     975             :                                  coeffs_set, coeffs_kind, &
     976             :                                  is_assoc_atomic_block_global, do_periodic, &
     977             :                                  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
     978             :                                  cell, x_data, para_env, pmax_block, &
     979             :                                  do_p_screening, map_atom_to_kind_atom, eval_type, &
     980           0 :                                  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     981             : 
     982             :       ! number of blocks per processor in p-direction (vertical)
     983           0 :       nP = numBins/p
     984             :       ! number of blocks per processor in q-direction (horizontal)
     985           0 :       nQ = numBins/q
     986             : 
     987             :       ! calc own position in P-Q-processor grid (PQ-grid is column-major)
     988           0 :       pidx = MODULO(INT(my_process_id), INT(p)) + 1
     989           0 :       qidx = my_process_id/p + 1
     990             : 
     991           0 :       sizeP = SUM(blocksize((nP*(pidx - 1) + 1):(nP*pidx)))
     992           0 :       sizeQ = SUM(blocksize((nQ*(qidx - 1) + 1):(nQ*qidx)))
     993             : 
     994           0 :       sumP = SUM(blocksize(1:(nP*(pidx - 1))))
     995           0 :       sumQ = SUM(blocksize(1:(nQ*(qidx - 1))))
     996             : 
     997           0 :       ALLOCATE (p_atom_blocks(sizeP))
     998           0 :       ALLOCATE (q_atom_blocks(sizeQ))
     999             : 
    1000           0 :       p_atom_blocks(:) = permute((sumP + 1):(sumP + sizeP))
    1001           0 :       q_atom_blocks(:) = permute((sumQ + 1):(sumQ + sizeQ))
    1002             : 
    1003             :       ! from here on, we are actually finished, each process has been
    1004             :       ! assigned a (p_atom_blocks,q_atom_blocks) pair list.
    1005             :       ! what follows is just a small routine to calculate the local cost
    1006             :       ! for each processor which is then written to a file.
    1007             : 
    1008             :       ! calculate local cost for each processor!
    1009             :       ! ****************************************
    1010             :       local_cost = 0
    1011           0 :       DO i = 1, sizeP
    1012           0 :          DO j = 1, sizeQ
    1013             : 
    1014             :             !       get corresponding 4D block indices out of our own P-Q-block
    1015           0 :             latom_block = MODULO(q_atom_blocks(j), nblocks)
    1016           0 :             iatom_block = q_atom_blocks(j)/nblocks + 1
    1017           0 :             jatom_block = MODULO(p_atom_blocks(i), nblocks)
    1018           0 :             katom_block = p_atom_blocks(i)/nblocks + 1
    1019             : 
    1020             :             !       symmetry checks.
    1021           0 :             IF (latom_block < katom_block) CYCLE
    1022           0 :             IF (jatom_block < iatom_block) CYCLE
    1023             : 
    1024           0 :             iatom_start = x_data%blocks(iatom_block)%istart
    1025           0 :             iatom_end = x_data%blocks(iatom_block)%iend
    1026           0 :             jatom_start = x_data%blocks(jatom_block)%istart
    1027           0 :             jatom_end = x_data%blocks(jatom_block)%iend
    1028           0 :             katom_start = x_data%blocks(katom_block)%istart
    1029           0 :             katom_end = x_data%blocks(katom_block)%iend
    1030           0 :             latom_start = x_data%blocks(latom_block)%istart
    1031           0 :             latom_end = x_data%blocks(latom_block)%iend
    1032             : 
    1033             :             !       whatever.
    1034           0 :             SELECT CASE (eval_type)
    1035             :             CASE (hfx_do_eval_energy)
    1036             :                pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
    1037             :                                  pmax_block(latom_block, jatom_block), &
    1038             :                                  pmax_block(latom_block, iatom_block), &
    1039           0 :                                  pmax_block(katom_block, jatom_block))
    1040             :             CASE (hfx_do_eval_forces)
    1041             :                pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
    1042             :                                  pmax_block(latom_block, jatom_block), &
    1043             :                                  pmax_block(latom_block, iatom_block) + &
    1044           0 :                                  pmax_block(katom_block, jatom_block))
    1045             :             END SELECT
    1046             : 
    1047             :             !       screening.
    1048           0 :             IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
    1049             : 
    1050             :             !       estimate the cost of this atom_block.
    1051             :             local_cost = local_cost + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
    1052             :                                                           set_list_kl, &
    1053             :                                                           iatom_start, iatom_end, jatom_start, jatom_end, &
    1054             :                                                           katom_start, katom_end, latom_start, latom_end, &
    1055             :                                                           particle_set, &
    1056             :                                                           coeffs_set, coeffs_kind, &
    1057             :                                                           is_assoc_atomic_block_global, do_periodic, &
    1058             :                                                           kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
    1059             :                                                           cell, &
    1060             :                                                           do_p_screening, map_atom_to_kind_atom, eval_type, &
    1061           0 :                                                           log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
    1062             :          END DO
    1063             :       END DO
    1064             : 
    1065           0 :       ALLOCATE (local_cost_vector(n_processes))
    1066           0 :       local_cost_vector = 0
    1067           0 :       local_cost_vector(my_process_id + 1) = local_cost
    1068           0 :       CALL para_env%sum(local_cost_vector)
    1069             : 
    1070           0 :       mean = SUM(local_cost_vector)/n_processes
    1071           0 :       maximum = MAXVAL(local_cost_vector)
    1072             : 
    1073           0 : !$OMP     BARRIER
    1074           0 : !$OMP     MASTER
    1075             :       ! only output once
    1076           0 :       IF (my_process_id == 0) THEN
    1077           0 :          CALL open_file(unit_number=unit_nr, file_name="loads.dat")
    1078           0 :          WRITE (unit_nr, *) 'maximum cost:', maximum
    1079           0 :          WRITE (unit_nr, *) 'mean cost:', mean
    1080           0 :          WRITE (unit_nr, *) 'load balance ratio max/mean: ', maximum/mean
    1081           0 :          WRITE (unit_nr, *) '-------- detailed per-process costs ---------'
    1082           0 :          DO i = 1, n_processes
    1083           0 :             WRITE (unit_nr, *) local_cost_vector(i)
    1084             :          END DO
    1085           0 :          CALL close_file(unit_nr)
    1086             :       END IF
    1087             : !$OMP     END MASTER
    1088           0 : !$OMP     BARRIER
    1089             : 
    1090           0 :       DEALLOCATE (local_cost_vector)
    1091           0 :       DEALLOCATE (p_atom_blocks, q_atom_blocks)
    1092           0 :       DEALLOCATE (blocksize, permute)
    1093             : 
    1094           0 : !$OMP BARRIER
    1095           0 : !$OMP MASTER
    1096           0 :       CALL timestop(handle)
    1097             : !$OMP END MASTER
    1098           0 : !$OMP BARRIER
    1099             : 
    1100           0 :    END SUBROUTINE hfx_recursive_load_balance
    1101             : 
    1102             : ! **************************************************************************************************
    1103             : !> \brief Small routine to calculate the optimal P-Q-processor grid distribution
    1104             : !>        for a given number of processors N
    1105             : !>        and the corresponding number of Bins for the load balancing routine
    1106             : !> \param p     number of rows on P-Q process grid (output)
    1107             : !> \param q     number of columns on P-Q process grid (output)
    1108             : !> \param nBins number of Bins (output)
    1109             : !> \param N     number of processes (input)
    1110             : !> \par History
    1111             : !>      03.2011 created [Michael Steinlechner]
    1112             : !> \author Michael Steinlechner
    1113             : ! **************************************************************************************************
    1114           0 :    SUBROUTINE hfx_calculate_PQ(p, q, nBins, N)
    1115             : 
    1116             :       INTEGER, INTENT(OUT)                               :: p, q, nBins
    1117             :       INTEGER, INTENT(IN)                                :: N
    1118             : 
    1119             :       INTEGER                                            :: a, b, k
    1120             :       REAL(dp)                                           :: sqN
    1121             : 
    1122           0 :       k = 2
    1123           0 :       sqN = SQRT(REAL(N, KIND=dp))
    1124           0 :       p = 1
    1125             : 
    1126           0 :       DO WHILE (REAL(k, KIND=dp) <= sqN)
    1127           0 :          IF (MODULO(N, k) == 0) THEN
    1128           0 :             p = k
    1129             :          END IF
    1130           0 :          k = k + 1
    1131             :       END DO
    1132           0 :       q = N/p
    1133             : 
    1134             :       ! now compute the least common multiple of p & q to get the number of necessary bins
    1135             :       ! compute using the relation LCM(p,q) = abs(p*q) / GCD(p,q)
    1136             :       ! and use euclid's algorithm for GCD computation.
    1137           0 :       a = p
    1138           0 :       b = q
    1139             : 
    1140           0 :       DO WHILE (b .NE. 0)
    1141           0 :          IF (a > b) THEN
    1142           0 :             a = a - b
    1143             :          ELSE
    1144           0 :             b = b - a
    1145             :          END IF
    1146             :       END DO
    1147             :       ! gcd(p,q) is now saved in a
    1148             : 
    1149           0 :       nBins = p*q/a
    1150             : 
    1151           0 :    END SUBROUTINE
    1152             : 
    1153             : ! **************************************************************************************************
    1154             : !> \brief Recursive permutation routine for the load balancing of the integral
    1155             : !>       computation
    1156             : !> \param blocksize     vector of blocksizes, size(nProc), which contains for
    1157             : !>                      each process the local blocksize (OUTPUT)
    1158             : !> \param blockstart    starting row/column idx of the block which is to be examined
    1159             : !>                      at this point (INPUT)
    1160             : !> \param blockend      ending row/column idx of the block which is to be examined
    1161             : !>                      (INPUT)
    1162             : !> \param nProc_in      number of bins into which the current block has to be divided
    1163             : !>                      (INPUT)
    1164             : !> \param permute       permutation vector which balances column/row cost
    1165             : !>                      size(nblocks^2). (OUTPUT)
    1166             : !> \param step ...
    1167             : !> \param my_process_id ...
    1168             : !> \param n_processes ...
    1169             : !> \param nblocks ...
    1170             : !> \param natom ...
    1171             : !> \param nkind ...
    1172             : !> \param list_ij ...
    1173             : !> \param list_kl ...
    1174             : !> \param set_list_ij ...
    1175             : !> \param set_list_kl ...
    1176             : !> \param particle_set ...
    1177             : !> \param coeffs_set ...
    1178             : !> \param coeffs_kind ...
    1179             : !> \param is_assoc_atomic_block_global ...
    1180             : !> \param do_periodic ...
    1181             : !> \param kind_of ...
    1182             : !> \param basis_parameter ...
    1183             : !> \param pmax_set ...
    1184             : !> \param pmax_atom ...
    1185             : !> \param pmax_blocks ...
    1186             : !> \param cell ...
    1187             : !> \param x_data ...
    1188             : !> \param para_env ...
    1189             : !> \param pmax_block ...
    1190             : !> \param do_p_screening ...
    1191             : !> \param map_atom_to_kind_atom ...
    1192             : !> \param eval_type ...
    1193             : !> \param log10_eps_schwarz ...
    1194             : !> \param log_2 ...
    1195             : !> \param coeffs_kind_max0 ...
    1196             : !> \param use_virial ...
    1197             : !> \param atomic_pair_list ...
    1198             : !> \par History
    1199             : !>      03.2011 created [Michael Steinlechner]
    1200             : !> \author Michael Steinlechner
    1201             : ! **************************************************************************************************
    1202           0 :    RECURSIVE SUBROUTINE hfx_recursive_permute(blocksize, blockstart, blockend, nProc_in, &
    1203           0 :                                               permute, step, &
    1204             :                                               my_process_id, n_processes, nblocks, &
    1205             :                                               natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
    1206             :                                               particle_set, &
    1207             :                                               coeffs_set, coeffs_kind, &
    1208           0 :                                               is_assoc_atomic_block_global, do_periodic, &
    1209             :                                               kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
    1210             :                                               cell, x_data, para_env, pmax_block, &
    1211             :                                               do_p_screening, map_atom_to_kind_atom, eval_type, &
    1212             :                                               log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
    1213             : 
    1214             :       INTEGER                                            :: nProc_in, blockend, blockstart
    1215             :       INTEGER, DIMENSION(nProc_in)                       :: blocksize
    1216             :       INTEGER                                            :: nblocks, n_processes, my_process_id
    1217             :       INTEGER, INTENT(IN)                                :: step
    1218             :       INTEGER, DIMENSION(nblocks*nblocks)                :: permute
    1219             :       INTEGER                                            :: natom
    1220             :       INTEGER, INTENT(IN)                                :: nkind
    1221             :       TYPE(pair_list_type)                               :: list_ij, list_kl
    1222             :       TYPE(pair_set_list_type), ALLOCATABLE, &
    1223             :          DIMENSION(:)                                    :: set_list_ij, set_list_kl
    1224             :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
    1225             :       TYPE(hfx_screen_coeff_type), &
    1226             :          DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
    1227             :       TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
    1228             :          POINTER                                         :: coeffs_kind
    1229             :       INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
    1230             :       LOGICAL                                            :: do_periodic
    1231             :       INTEGER                                            :: kind_of(*)
    1232             :       TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
    1233             :       TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
    1234             :       REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
    1235             :       REAL(dp)                                           :: pmax_blocks
    1236             :       TYPE(cell_type), POINTER                           :: cell
    1237             :       TYPE(hfx_type), POINTER                            :: x_data
    1238             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
    1239             :       REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_block
    1240             :       LOGICAL, INTENT(IN)                                :: do_p_screening
    1241             :       INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
    1242             :       INTEGER, INTENT(IN)                                :: eval_type
    1243             :       REAL(dp)                                           :: log10_eps_schwarz, log_2, &
    1244             :                                                             coeffs_kind_max0
    1245             :       LOGICAL, INTENT(IN)                                :: use_virial
    1246             :       LOGICAL, DIMENSION(:, :), POINTER                  :: atomic_pair_list
    1247             : 
    1248             :       INTEGER :: col, endoffset, i, iatom_block, iatom_end, iatom_start, idx, inv_perm, &
    1249             :                  jatom_block, jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, &
    1250             :                  latom_end, latom_start, nbins, nProc, row, startoffset
    1251             :       INTEGER(int_8)                                     :: atom_block, tmp_block
    1252           0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: ithblocksize, localblocksize
    1253           0 :       INTEGER, DIMENSION(blockend - blockstart + 1)          :: bin_perm, tmp_perm
    1254             :       REAL(dp)                                           :: partialcost
    1255           0 :       REAL(dp), DIMENSION(nblocks*nblocks)               :: cost_vector
    1256             : 
    1257           0 :       nProc = nProc_in
    1258           0 :       cost_vector = 0.0_dp
    1259             : 
    1260             : !   loop over local atom_blocks.
    1261           0 :       DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
    1262             : 
    1263             : !       get corresponding 4D block indices
    1264           0 :          latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
    1265           0 :          tmp_block = atom_block/nblocks
    1266           0 :          katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
    1267           0 :          IF (latom_block < katom_block) CYCLE
    1268           0 :          tmp_block = tmp_block/nblocks
    1269           0 :          jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
    1270           0 :          tmp_block = tmp_block/nblocks
    1271           0 :          iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
    1272           0 :          IF (jatom_block < iatom_block) CYCLE
    1273             : 
    1274             : !       get 2D indices of this atom_block (with permutation applied)
    1275             : !       for this, we need to invert the permutation, this means
    1276             : !       find position in permutation vector where value==idx
    1277             : 
    1278           0 :          row = (katom_block - 1)*nblocks + jatom_block
    1279           0 :          inv_perm = 1
    1280           0 :          DO WHILE (permute(inv_perm) .NE. row)
    1281           0 :             inv_perm = inv_perm + 1
    1282             :          END DO
    1283           0 :          row = inv_perm
    1284             : 
    1285           0 :          col = (iatom_block - 1)*nblocks + latom_block
    1286           0 :          inv_perm = 1
    1287           0 :          DO WHILE (permute(inv_perm) .NE. col)
    1288           0 :             inv_perm = inv_perm + 1
    1289             :          END DO
    1290           0 :          col = inv_perm
    1291             : 
    1292             : !       if row/col outside our current diagonal block, skip calculation.
    1293           0 :          IF (col < blockstart .OR. col > blockend) CYCLE
    1294           0 :          IF (row < blockstart .OR. row > blockend) CYCLE
    1295             : 
    1296           0 :          iatom_start = x_data%blocks(iatom_block)%istart
    1297           0 :          iatom_end = x_data%blocks(iatom_block)%iend
    1298           0 :          jatom_start = x_data%blocks(jatom_block)%istart
    1299           0 :          jatom_end = x_data%blocks(jatom_block)%iend
    1300           0 :          katom_start = x_data%blocks(katom_block)%istart
    1301           0 :          katom_end = x_data%blocks(katom_block)%iend
    1302           0 :          latom_start = x_data%blocks(latom_block)%istart
    1303           0 :          latom_end = x_data%blocks(latom_block)%iend
    1304             : 
    1305             : !       whatever.
    1306           0 :          SELECT CASE (eval_type)
    1307             :          CASE (hfx_do_eval_energy)
    1308             :             pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
    1309             :                               pmax_block(latom_block, jatom_block), &
    1310             :                               pmax_block(latom_block, iatom_block), &
    1311           0 :                               pmax_block(katom_block, jatom_block))
    1312             :          CASE (hfx_do_eval_forces)
    1313             :             pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
    1314             :                               pmax_block(latom_block, jatom_block), &
    1315             :                               pmax_block(latom_block, iatom_block) + &
    1316           0 :                               pmax_block(katom_block, jatom_block))
    1317             :          END SELECT
    1318             : 
    1319             : !       screening.
    1320           0 :          IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
    1321             : 
    1322             : !       every second recursion step, compute row sum instead of column sum
    1323             : 
    1324           0 :          IF (MODULO(step, 2) .EQ. 0) THEN
    1325             :             idx = row
    1326             :          ELSE
    1327           0 :             idx = col
    1328             :          END IF
    1329             : 
    1330             : !       estimate the cost of this atom_block.
    1331             :          partialcost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
    1332             :                                            set_list_kl, &
    1333             :                                            iatom_start, iatom_end, jatom_start, jatom_end, &
    1334             :                                            katom_start, katom_end, latom_start, latom_end, &
    1335             :                                            particle_set, &
    1336             :                                            coeffs_set, coeffs_kind, &
    1337             :                                            is_assoc_atomic_block_global, do_periodic, &
    1338             :                                            kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
    1339             :                                            cell, &
    1340             :                                            do_p_screening, map_atom_to_kind_atom, eval_type, &
    1341           0 :                                            log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
    1342             : 
    1343           0 :          cost_vector(idx) = cost_vector(idx) + partialcost
    1344             :       END DO ! atom_block
    1345             : 
    1346             : !   sum costvector over all processes
    1347           0 :       CALL para_env%sum(cost_vector)
    1348             : 
    1349             : !   calculate next prime factor of nProc
    1350           0 :       nBins = 2
    1351           0 :       DO WHILE (MODULO(INT(nProc), INT(nBins)) .NE. 0)
    1352           0 :          nBins = nBins + 1
    1353             :       END DO
    1354             : 
    1355           0 :       nProc = nProc/nBins
    1356             : 
    1357             : ! ... do the binning...
    1358             : 
    1359           0 :       ALLOCATE (localblocksize(nBins))
    1360           0 :       CALL hfx_permute_binning(nBins, cost_vector(blockstart:blockend), blockend - blockstart + 1, bin_perm, localblocksize)
    1361             : 
    1362             : !... and update the permutation vector
    1363             : 
    1364           0 :       tmp_perm = permute(blockstart:blockend)
    1365           0 :       permute(blockstart:blockend) = tmp_perm(bin_perm)
    1366             : 
    1367             : !   split recursion into the nBins Bins
    1368           0 :       IF (nProc > 1) THEN
    1369           0 :          ALLOCATE (ithblocksize(nProc))
    1370           0 :          DO i = 1, nBins
    1371           0 :             startoffset = SUM(localblocksize(1:(i - 1)))
    1372           0 :             endoffset = SUM(localblocksize(1:i)) - 1
    1373             : 
    1374             :             CALL hfx_recursive_permute(ithblocksize, blockstart + startoffset, blockstart + endoffset, nProc, &
    1375             :                                        permute, step + 1, &
    1376             :                                        my_process_id, n_processes, nblocks, &
    1377             :                                        natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
    1378             :                                        particle_set, &
    1379             :                                        coeffs_set, coeffs_kind, &
    1380             :                                        is_assoc_atomic_block_global, do_periodic, &
    1381             :                                        kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
    1382             :                                        cell, x_data, para_env, pmax_block, &
    1383             :                                        do_p_screening, map_atom_to_kind_atom, eval_type, &
    1384           0 :                                        log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
    1385           0 :             blocksize(((i - 1)*nProc + 1):(i*nProc)) = ithblocksize
    1386             :          END DO
    1387           0 :          DEALLOCATE (ithblocksize)
    1388             :       ELSE
    1389           0 :          DO i = 1, nBins
    1390           0 :             blocksize(i) = localblocksize(i)
    1391             :          END DO
    1392             :       END IF
    1393             : 
    1394           0 :       DEALLOCATE (localblocksize)
    1395             : 
    1396           0 :    END SUBROUTINE hfx_recursive_permute
    1397             : 
    1398             : ! **************************************************************************************************
    1399             : !> \brief small binning routine for the recursive load balancing
    1400             : !>
    1401             : !> \param nBins         number of Bins (INPUT)
    1402             : !> \param costvector    vector of current row/column costs which have to be binned (INPUT)
    1403             : !> \param maxbinsize    upper bound for bin size (INPUT)
    1404             : !> \param perm          resulting permutation due to be binning routine (OUTPUT)
    1405             : !> \param block_count   vector of size(nbins) which contains the size of each bin (OUTPUT)
    1406             : !> \par History
    1407             : !>      03.2011 created [Michael Steinlechner]
    1408             : !> \author Michael Steinlechner
    1409             : ! **************************************************************************************************
    1410           0 :    SUBROUTINE hfx_permute_binning(nBins, costvector, maxbinsize, perm, block_count)
    1411             : 
    1412             :       INTEGER, INTENT(IN)                                :: nBins, maxbinsize
    1413             :       REAL(dp), DIMENSION(maxbinsize), INTENT(IN)        :: costvector
    1414             :       INTEGER, DIMENSION(maxbinsize), INTENT(OUT)        :: perm
    1415             :       INTEGER, DIMENSION(nBins), INTENT(OUT)             :: block_count
    1416             : 
    1417             :       INTEGER                                            :: i, j, mod_idx, offset
    1418           0 :       INTEGER, DIMENSION(nBins, maxbinsize)              :: bin
    1419           0 :       INTEGER, DIMENSION(nBins)                          :: bin_idx
    1420           0 :       INTEGER, DIMENSION(maxbinsize)                     :: idx
    1421           0 :       REAL(dp), DIMENSION(maxbinsize)                    :: vec
    1422           0 :       REAL(dp), DIMENSION(nBins)                         :: bincosts
    1423             : 
    1424             : ! be careful not to change costvector (copy it!)
    1425             : 
    1426           0 :       vec = costvector
    1427           0 :       block_count = 0
    1428           0 :       bincosts = 0
    1429             : 
    1430             :       !sort the array (ascending)
    1431           0 :       CALL sort(vec, maxbinsize, idx)
    1432             : 
    1433             :       ! count the loop down to distribute the largest cols/rows first
    1434           0 :       DO i = maxbinsize, 1, -1
    1435           0 :          IF (vec(i) == 0) THEN
    1436             :             ! spread zero-cost col/rows evenly among procs
    1437           0 :             mod_idx = MODULO(i, nBins) + 1 !(note the fortran offset by one!)
    1438           0 :             block_count(mod_idx) = block_count(mod_idx) + 1
    1439           0 :             bin(mod_idx, block_count(mod_idx)) = idx(i)
    1440             :          ELSE
    1441             :             ! sort the bins so that the one with the lowest cost is at the
    1442             :             ! first place, where we then assign the current col/row
    1443           0 :             CALL sort(bincosts, nBins, bin_idx)
    1444           0 :             block_count = block_count(bin_idx)
    1445           0 :             bin = bin(bin_idx, :)
    1446             : 
    1447           0 :             bincosts(1) = bincosts(1) + vec(i)
    1448           0 :             block_count(1) = block_count(1) + 1
    1449           0 :             bin(1, block_count(1)) = idx(i)
    1450             :          END IF
    1451             :       END DO
    1452             : 
    1453             :       ! construct permutation vector from the binning
    1454             :       offset = 0
    1455           0 :       DO i = 1, nBins
    1456           0 :          DO j = 1, block_count(i)
    1457           0 :             perm(offset + j) = bin(i, j)
    1458             :          END DO
    1459           0 :          offset = offset + block_count(i)
    1460             :       END DO
    1461             : 
    1462           0 :    END SUBROUTINE hfx_permute_binning
    1463             : 
    1464             : ! **************************************************************************************************
    1465             : !> \brief Cheap way of redistributing the eri's
    1466             : !> \param x_data Object that stores the indices array
    1467             : !> \param para_env para_env
    1468             : !> \param load_balance_parameter contains parmameter for Monte-Carlo routines
    1469             : !> \param i_thread current thread ID
    1470             : !> \param n_threads Total Number of threads
    1471             : !> \param eval_type ...
    1472             : !> \par History
    1473             : !>      12.2007 created [Manuel Guidon]
    1474             : !>      02.2009 optimize Memory Usage [Manuel Guidon]
    1475             : !> \author Manuel Guidon
    1476             : !> \note
    1477             : !>      The cost matrix is given by the walltime for each bin that is measured
    1478             : !>      during the calculation
    1479             : ! **************************************************************************************************
    1480        1712 :    SUBROUTINE hfx_update_load_balance(x_data, para_env, &
    1481             :                                       load_balance_parameter, &
    1482             :                                       i_thread, n_threads, eval_type)
    1483             : 
    1484             :       TYPE(hfx_type), POINTER                            :: x_data
    1485             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
    1486             :       TYPE(hfx_load_balance_type)                        :: load_balance_parameter
    1487             :       INTEGER, INTENT(IN)                                :: i_thread, n_threads, eval_type
    1488             : 
    1489             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_update_load_balance'
    1490             : 
    1491             :       INTEGER :: data_from, dest, end_idx, handle, i, ibin, icpu, iprocess, j, mepos, my_bin_size, &
    1492             :                  my_global_start_idx, my_process_id, n_processes, nbins, ncpu, source, start_idx
    1493        5136 :       TYPE(mp_request_type), DIMENSION(2) :: req
    1494        1712 :       INTEGER(int_8), DIMENSION(:), POINTER              :: local_cost_matrix, recbuffer, &
    1495        1712 :                                                             sendbuffer, swapbuffer
    1496             :       INTEGER(int_8), DIMENSION(:), POINTER, SAVE        :: cost_matrix
    1497        1712 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: tmp_pos
    1498             :       INTEGER, ALLOCATABLE, DIMENSION(:), SAVE           :: bins_per_rank
    1499             :       INTEGER, ALLOCATABLE, DIMENSION(:, :), SAVE        :: bin_histogram
    1500             :       INTEGER, DIMENSION(:), POINTER, SAVE               :: shm_distribution_vector
    1501             :       INTEGER, SAVE                                      :: max_bin_size
    1502        3424 :       TYPE(hfx_distribution), DIMENSION(:), POINTER      :: binned_dist, ptr_to_tmp_dist, tmp_dist
    1503             :       TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
    1504             :          SAVE                                            :: full_dist
    1505             : 
    1506        1712 : !$OMP BARRIER
    1507        3424 : !$OMP MASTER
    1508        1712 :       CALL timeset(routineN, handle)
    1509             : !$OMP END MASTER
    1510        1712 : !$OMP BARRIER
    1511             : 
    1512        1712 :       ncpu = para_env%num_pe
    1513        1712 :       n_processes = ncpu*n_threads
    1514             :       !! If there is only 1 cpu skip the binning
    1515        1712 :       IF (n_processes == 1) THEN
    1516           0 :          ALLOCATE (tmp_dist(1))
    1517           0 :          tmp_dist(1)%number_of_atom_quartets = HUGE(tmp_dist(1)%number_of_atom_quartets)
    1518           0 :          tmp_dist(1)%istart = 0_int_8
    1519           0 :          ptr_to_tmp_dist => tmp_dist(:)
    1520           0 :          SELECT CASE (eval_type)
    1521             :          CASE (hfx_do_eval_energy)
    1522           0 :             CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
    1523             :          CASE (hfx_do_eval_forces)
    1524           0 :             CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
    1525             :          END SELECT
    1526           0 :          DEALLOCATE (tmp_dist)
    1527             :       ELSE
    1528        1712 :          mepos = para_env%mepos
    1529        1712 :          my_process_id = para_env%mepos*n_threads + i_thread
    1530        1712 :          nbins = load_balance_parameter%nbins
    1531        1712 : !$OMP MASTER
    1532        5136 :          ALLOCATE (bin_histogram(n_processes, 2))
    1533       11984 :          bin_histogram = 0
    1534             : !$OMP END MASTER
    1535        1712 : !$OMP BARRIER
    1536        2622 :          SELECT CASE (eval_type)
    1537             :          CASE (hfx_do_eval_energy)
    1538         910 :             my_bin_size = SIZE(x_data%distribution_energy)
    1539             :          CASE (hfx_do_eval_forces)
    1540        1712 :             my_bin_size = SIZE(x_data%distribution_forces)
    1541             :          END SELECT
    1542        1712 :          bin_histogram(my_process_id + 1, 1) = my_bin_size
    1543        1712 : !$OMP BARRIER
    1544        1712 : !$OMP MASTER
    1545        1712 :          CALL para_env%sum(bin_histogram(:, 1))
    1546        1712 :          bin_histogram(1, 2) = bin_histogram(1, 1)
    1547        3424 :          DO iprocess = 2, n_processes
    1548        3424 :             bin_histogram(iprocess, 2) = bin_histogram(iprocess - 1, 2) + bin_histogram(iprocess, 1)
    1549             :          END DO
    1550             : 
    1551        3424 :          max_bin_size = MAXVAL(bin_histogram(para_env%mepos*n_threads + 1:para_env%mepos*n_threads + n_threads, 1))
    1552        1712 :          CALL para_env%max(max_bin_size)
    1553             : !$OMP END MASTER
    1554        1712 : !$OMP BARRIER
    1555      114698 :          ALLOCATE (binned_dist(my_bin_size))
    1556             :          !! Use old binned_dist, but with timings cost
    1557         910 :          SELECT CASE (eval_type)
    1558             :          CASE (hfx_do_eval_energy)
    1559      117390 :             binned_dist = x_data%distribution_energy
    1560             :          CASE (hfx_do_eval_forces)
    1561      104368 :             binned_dist = x_data%distribution_forces
    1562             :          END SELECT
    1563             : 
    1564      111280 :          DO ibin = 1, my_bin_size
    1565      111280 :             IF (binned_dist(ibin)%number_of_atom_quartets == 0) THEN
    1566       94384 :                binned_dist(ibin)%cost = 0
    1567             :             ELSE
    1568        8808 :                SELECT CASE (eval_type)
    1569             :                CASE (hfx_do_eval_energy)
    1570        8808 :                   IF (.NOT. load_balance_parameter%rtp_redistribute) THEN
    1571             :                      binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_first_scf + &
    1572        8808 :                                                    binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
    1573             :                   ELSE
    1574           0 :                      binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
    1575             :                   END IF
    1576             :                CASE (hfx_do_eval_forces)
    1577       15184 :                   binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_forces)*10000.0_dp, int_8)
    1578             :                END SELECT
    1579             :             END IF
    1580             :          END DO
    1581        1712 : !$OMP BARRIER
    1582        1712 : !$OMP MASTER
    1583             :          !! store all local results in a big cost matrix
    1584        5136 :          ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
    1585      220848 :          cost_matrix = 0
    1586        5136 :          ALLOCATE (sendbuffer(max_bin_size*n_threads))
    1587        5136 :          ALLOCATE (recbuffer(max_bin_size*n_threads))
    1588             : !$OMP END MASTER
    1589        1712 : !$OMP BARRIER
    1590        1712 :          my_global_start_idx = bin_histogram(my_process_id + 1, 2) - my_bin_size
    1591        1712 :          icpu = para_env%mepos + 1
    1592      111280 :          DO i = 1, my_bin_size
    1593      111280 :             cost_matrix(my_global_start_idx + i) = binned_dist(i)%cost
    1594             :          END DO
    1595             : 
    1596        1712 :          mepos = para_env%mepos
    1597        1712 : !$OMP BARRIER
    1598        1712 : !$OMP MASTER
    1599        5136 :          ALLOCATE (bins_per_rank(ncpu))
    1600        5136 :          bins_per_rank = 0
    1601        5136 :          DO icpu = 1, ncpu
    1602        8560 :             bins_per_rank(icpu) = SUM(bin_histogram((icpu - 1)*n_threads + 1:(icpu - 1)*n_threads + n_threads, 1))
    1603             :          END DO
    1604             :          sendbuffer(1:bins_per_rank(para_env%mepos + 1)) = &
    1605      220848 :             cost_matrix(my_global_start_idx + 1:my_global_start_idx + bins_per_rank(para_env%mepos + 1))
    1606             : 
    1607        1712 :          dest = MODULO(mepos + 1, ncpu)
    1608        1712 :          source = MODULO(mepos - 1, ncpu)
    1609             :          ! sync before/after ring of isendrecv
    1610        1712 :          CALL para_env%sync()
    1611        5136 :          DO icpu = 0, ncpu - 1
    1612        3424 :             IF (icpu .NE. ncpu - 1) THEN
    1613             :                CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
    1614        1712 :                                        req(1), req(2), 13)
    1615             :             END IF
    1616        3424 :             data_from = MODULO(mepos - icpu, ncpu)
    1617        8560 :             start_idx = SUM(bins_per_rank(1:data_from + 1)) - bins_per_rank(data_from + 1) + 1
    1618        3424 :             end_idx = start_idx + bins_per_rank(data_from + 1) - 1
    1619      441696 :             cost_matrix(start_idx:end_idx) = sendbuffer(1:end_idx - start_idx + 1)
    1620             : 
    1621        3424 :             IF (icpu .NE. ncpu - 1) THEN
    1622        1712 :                CALL mp_waitall(req)
    1623             :             END IF
    1624        3424 :             swapbuffer => sendbuffer
    1625        3424 :             sendbuffer => recbuffer
    1626        5136 :             recbuffer => swapbuffer
    1627             :          END DO
    1628        1712 :          DEALLOCATE (recbuffer, sendbuffer)
    1629             :          ! sync before/after ring of isendrecv
    1630        1712 :          CALL para_env%sync()
    1631             : !$OMP END MASTER
    1632        1712 : !$OMP BARRIER
    1633        5136 :          ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
    1634      439984 :          local_cost_matrix = cost_matrix
    1635        1712 : !$OMP MASTER
    1636        5136 :          ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
    1637             :          CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
    1638        1712 :                                     shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
    1639             : 
    1640      620168 :          ALLOCATE (full_dist(ncpu*n_threads, max_bin_size))
    1641             : 
    1642      615032 :          full_dist(:, :)%istart = 0_int_8
    1643      615032 :          full_dist(:, :)%number_of_atom_quartets = 0_int_8
    1644      615032 :          full_dist(:, :)%cost = 0_int_8
    1645      615032 :          full_dist(:, :)%time_first_scf = 0.0_dp
    1646      615032 :          full_dist(:, :)%time_other_scf = 0.0_dp
    1647      615032 :          full_dist(:, :)%time_forces = 0.0_dp
    1648             : !$OMP END MASTER
    1649             : 
    1650        1712 : !$OMP BARRIER
    1651        1712 :          mepos = para_env%mepos + 1
    1652      220848 :          full_dist((mepos - 1)*n_threads + i_thread + 1, 1:my_bin_size) = binned_dist(1:my_bin_size)
    1653        1712 : !$OMP BARRIER
    1654        1712 : !$OMP MASTER
    1655        5136 :          ALLOCATE (sendbuffer(3*max_bin_size*n_threads))
    1656        5136 :          ALLOCATE (recbuffer(3*max_bin_size*n_threads))
    1657        1712 :          mepos = para_env%mepos
    1658        3424 :          DO j = 1, n_threads
    1659      207864 :             DO i = 1, max_bin_size
    1660      204440 :                sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
    1661      204440 :                sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
    1662      206152 :                sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
    1663             :             END DO
    1664             :          END DO
    1665        1712 :          dest = MODULO(mepos + 1, ncpu)
    1666        1712 :          source = MODULO(mepos - 1, ncpu)
    1667             :          ! sync before/after ring of isendrecv
    1668        1712 :          CALL para_env%sync()
    1669        5136 :          DO icpu = 0, ncpu - 1
    1670        3424 :             IF (icpu .NE. ncpu - 1) THEN
    1671             :                CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
    1672        1712 :                                        req(1), req(2), 13)
    1673             :             END IF
    1674        3424 :             data_from = MODULO(mepos - icpu, ncpu)
    1675        6848 :             DO j = 1, n_threads
    1676      415728 :                DO i = 1, max_bin_size
    1677      408880 :                   full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1)
    1678      408880 :                   full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2)
    1679      412304 :                   full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3)
    1680             :                END DO
    1681             :             END DO
    1682             : 
    1683        3424 :             IF (icpu .NE. ncpu - 1) THEN
    1684        1712 :                CALL mp_waitall(req)
    1685             :             END IF
    1686        3424 :             swapbuffer => sendbuffer
    1687        3424 :             sendbuffer => recbuffer
    1688        5136 :             recbuffer => swapbuffer
    1689             :          END DO
    1690             :          ! sync before/after ring of isendrecv
    1691        1712 :          DEALLOCATE (recbuffer, sendbuffer)
    1692        1712 :          CALL para_env%sync()
    1693             : !$OMP END MASTER
    1694        1712 : !$OMP BARRIER
    1695             :          !! reorder the distribution according to the distribution vector
    1696        5136 :          ALLOCATE (tmp_pos(ncpu*n_threads))
    1697        5136 :          tmp_pos = 1
    1698      224272 :          ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
    1699             : 
    1700      220848 :          tmp_dist(:)%istart = 0_int_8
    1701      220848 :          tmp_dist(:)%number_of_atom_quartets = 0_int_8
    1702      220848 :          tmp_dist(:)%cost = 0_int_8
    1703      220848 :          tmp_dist(:)%time_first_scf = 0.0_dp
    1704      220848 :          tmp_dist(:)%time_other_scf = 0.0_dp
    1705      220848 :          tmp_dist(:)%time_forces = 0.0_dp
    1706             : 
    1707        5136 :          mepos = my_process_id + 1
    1708        5136 :          DO icpu = 1, n_processes
    1709      224272 :             DO i = 1, bin_histogram(icpu, 1)
    1710      222560 :                IF (shm_distribution_vector(bin_histogram(icpu, 2) - bin_histogram(icpu, 1) + i) == mepos) THEN
    1711      109568 :                   tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
    1712      109568 :                   tmp_pos(mepos) = tmp_pos(mepos) + 1
    1713             :                END IF
    1714             :             END DO
    1715             :          END DO
    1716             : 
    1717             :          !! Assign the load to each process
    1718             :          NULLIFY (ptr_to_tmp_dist)
    1719        1712 :          mepos = my_process_id + 1
    1720        1712 :          ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
    1721         910 :          SELECT CASE (eval_type)
    1722             :          CASE (hfx_do_eval_energy)
    1723         910 :             CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
    1724             :          CASE (hfx_do_eval_forces)
    1725        1712 :             CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
    1726             :          END SELECT
    1727             : 
    1728        1712 : !$OMP BARRIER
    1729        1712 : !$OMP MASTER
    1730        1712 :          DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
    1731        1712 :          DEALLOCATE (bins_per_rank, bin_histogram)
    1732             : !$OMP END MASTER
    1733        1712 : !$OMP BARRIER
    1734        1712 :          DEALLOCATE (tmp_dist, tmp_pos)
    1735        1712 :          DEALLOCATE (binned_dist, local_cost_matrix)
    1736             :       END IF
    1737        1712 : !$OMP BARRIER
    1738        1712 : !$OMP MASTER
    1739        1712 :       CALL timestop(handle)
    1740             : !$OMP END MASTER
    1741        1712 : !$OMP BARRIER
    1742             : 
    1743        3424 :    END SUBROUTINE hfx_update_load_balance
    1744             : 
    1745             : ! **************************************************************************************************
    1746             : !> \brief estimates the cost of a set quartet with info available at load balance time
    1747             : !>        i.e. without much info on the primitives primitives
    1748             : !> \param nsa ...
    1749             : !> \param nsb ...
    1750             : !> \param nsc ...
    1751             : !> \param nsd ...
    1752             : !> \param npgfa ...
    1753             : !> \param npgfb ...
    1754             : !> \param npgfc ...
    1755             : !> \param npgfd ...
    1756             : !> \param ratio ...
    1757             : !> \param p1 ...
    1758             : !> \param p2 ...
    1759             : !> \param p3 ...
    1760             : !> \return ...
    1761             : !> \par History
    1762             : !>      08.2009 created Joost VandeVondele
    1763             : !> \author Joost VandeVondele
    1764             : ! **************************************************************************************************
    1765     8831621 :    FUNCTION cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3) RESULT(res)
    1766             :       IMPLICIT NONE
    1767             :       REAL(KIND=dp) :: estimate1, estimate2, estimate, ratio, switch, mu, sigma
    1768             :       INTEGER(KIND=int_8) :: res
    1769             :       REAL(KIND=dp), INTENT(IN) :: p1(12), p2(12), p3(2)
    1770             : 
    1771             :       INTEGER   :: nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd
    1772             : 
    1773     8831621 :       estimate1 = estimate_basic(p1)
    1774     8831621 :       estimate2 = estimate_basic(p2)
    1775     8831621 :       mu = LOG(ABS(1.0E6_dp*p3(1)) + 1)
    1776     8831621 :       sigma = p3(2)*0.1_dp*mu
    1777     8831621 :       switch = 1.0_dp/(1.0_dp + EXP((LOG(estimate1) - mu)/sigma))
    1778     8831621 :       estimate = estimate1*(1.0_dp - switch) + estimate2*switch
    1779     8831621 :       res = INT(estimate*0.001_dp, KIND=int_8) + 1
    1780             : 
    1781             :    CONTAINS
    1782             : 
    1783             : ! **************************************************************************************************
    1784             : !> \brief ...
    1785             : !> \param p ...
    1786             : !> \return ...
    1787             : ! **************************************************************************************************
    1788    17663242 :       REAL(KIND=dp) FUNCTION estimate_basic(p) RESULT(res)
    1789             :          REAL(KIND=dp)                                      :: p(12)
    1790             : 
    1791             :          REAL(KIND=dp)                                      :: p1, p10, p11, p12, p2, p3, p4, p5, p6, &
    1792             :                                                                p7, p8, p9
    1793             : 
    1794    17663242 :          p1 = p(1); p2 = p(2); p3 = p(3); p4 = p(4)
    1795    17663242 :          p5 = p(5); p6 = p(6); p7 = p(7); p8 = p(8)
    1796    17663242 :          p9 = p(9); p10 = p(10); p11 = p(11); p12 = p(12)
    1797             :          res = poly2(nsa, p1, p2, p3)*poly2(nsb, p1, p2, p3)*poly2(nsc, p1, p2, p3)*poly2(nsd, p1, p2, p3)* &
    1798             :                poly2(npgfa, p4, p5, p6)*poly2(npgfb, p4, p5, p6)*poly2(npgfc, p4, p5, p6)* &
    1799             :                poly2(npgfd, p4, p5, p6)*EXP(-p7*ratio + p8*ratio**2) + &
    1800    17663242 :               1000.0_dp*p9 + poly2(nsa, p10, p11, p12)*poly2(nsb, p10, p11, p12)*poly2(nsc, p10, p11, p12)*poly2(nsd, p10, p11, p12)
    1801    17663242 :          res = 1 + ABS(res)
    1802    17663242 :       END FUNCTION estimate_basic
    1803             : 
    1804             : ! **************************************************************************************************
    1805             : !> \brief ...
    1806             : !> \param x ...
    1807             : !> \param a0 ...
    1808             : !> \param a1 ...
    1809             : !> \param a2 ...
    1810             : !> \return ...
    1811             : ! **************************************************************************************************
    1812   211958904 :       REAL(KIND=dp) FUNCTION poly2(x, a0, a1, a2)
    1813             :          INTEGER                                            :: x
    1814             :          REAL(KIND=dp)                                      :: a0, a1, a2
    1815             : 
    1816   211958904 :          poly2 = a0 + a1*x + a2*x*x
    1817   211958904 :       END FUNCTION poly2
    1818             : 
    1819             :    END FUNCTION cost_model
    1820             : ! **************************************************************************************************
    1821             : !> \brief Minimizes the maximum cost per cpu by shuffling around all bins
    1822             : !> \param total_number_of_bins ...
    1823             : !> \param number_of_processes ...
    1824             : !> \param bin_costs costs per bin
    1825             : !> \param distribution_vector will contain the final distribution
    1826             : !> \param do_randomize ...
    1827             : !> \par History
    1828             : !>      03.2009 created from a hack by Joost [Manuel Guidon]
    1829             : !> \author Manuel Guidon
    1830             : ! **************************************************************************************************
    1831        3442 :    SUBROUTINE optimize_distribution(total_number_of_bins, number_of_processes, bin_costs, &
    1832             :                                     distribution_vector, do_randomize)
    1833             :       INTEGER                                            :: total_number_of_bins, number_of_processes
    1834             :       INTEGER(int_8), DIMENSION(:), POINTER              :: bin_costs
    1835             :       INTEGER, DIMENSION(:), POINTER                     :: distribution_vector
    1836             :       LOGICAL, INTENT(IN)                                :: do_randomize
    1837             : 
    1838             :       INTEGER                                            :: i, itmp, j, nstep
    1839        3442 :       INTEGER(int_8), DIMENSION(:), POINTER              :: my_cost_cpu, tmp_cost, tmp_cpu_cost
    1840             :       INTEGER, DIMENSION(:), POINTER                     :: tmp_cpu_index, tmp_index
    1841        3442 :       TYPE(rng_stream_type), ALLOCATABLE                 :: rng_stream
    1842             : 
    1843        3442 :       nstep = MAX(1, INT(number_of_processes)/2)
    1844             : 
    1845       10326 :       ALLOCATE (tmp_cost(total_number_of_bins))
    1846       10326 :       ALLOCATE (tmp_index(total_number_of_bins))
    1847       10326 :       ALLOCATE (tmp_cpu_cost(number_of_processes))
    1848       10326 :       ALLOCATE (tmp_cpu_index(number_of_processes))
    1849       10326 :       ALLOCATE (my_cost_cpu(number_of_processes))
    1850      888036 :       tmp_cost = bin_costs
    1851             : 
    1852        3442 :       CALL sort(tmp_cost, total_number_of_bins, tmp_index)
    1853       10326 :       my_cost_cpu = 0
    1854             :       !
    1855             :       ! assign the largest remaining bin to the CPU with the smallest load
    1856             :       ! gives near perfect distributions for a sufficient number of bins ...
    1857             :       ! doing this in chunks of nstep (where nstep ~ number_of_processes) makes this n log n and gives
    1858             :       ! each cpu a similar number of tasks.
    1859             :       ! it also avoids degenerate cases where thousands of zero sized tasks
    1860             :       ! are assigned to the same (least loaded) cpu
    1861             :       !
    1862        3442 :       IF (do_randomize) &
    1863             :          rng_stream = rng_stream_type(name="uniform_rng", &
    1864           0 :                                       distribution_type=UNIFORM)
    1865             : 
    1866      444018 :       DO i = total_number_of_bins, 1, -nstep
    1867     2643456 :          tmp_cpu_cost = my_cost_cpu
    1868      440576 :          CALL sort(tmp_cpu_cost, INT(number_of_processes), tmp_cpu_index)
    1869      440576 :          IF (do_randomize) THEN
    1870           0 :             CALL rng_stream%shuffle(tmp_cpu_index(1:MIN(i, nstep)))
    1871             :          END IF
    1872      884594 :          DO j = 1, MIN(i, nstep)
    1873      440576 :             itmp = tmp_cpu_index(j)
    1874      440576 :             distribution_vector(tmp_index(i - j + 1)) = itmp
    1875      881152 :             my_cost_cpu(itmp) = my_cost_cpu(itmp) + bin_costs(tmp_index(i - j + 1))
    1876             :          END DO
    1877             :       END DO
    1878             : 
    1879        3442 :       DEALLOCATE (tmp_cost, tmp_index, tmp_cpu_cost)
    1880        3442 :       DEALLOCATE (tmp_cpu_index, my_cost_cpu)
    1881        6884 :    END SUBROUTINE optimize_distribution
    1882             : 
    1883             : ! **************************************************************************************************
    1884             : !> \brief Given a 2d index pair, this function returns a 1d index pair for
    1885             : !>        a symmetric upper triangle NxN matrix
    1886             : !>        The compiler should inline this function, therefore it appears in
    1887             : !>        several modules
    1888             : !> \param i 2d index
    1889             : !> \param j 2d index
    1890             : !> \param N matrix size
    1891             : !> \return ...
    1892             : !> \par History
    1893             : !>      03.2009 created [Manuel Guidon]
    1894             : !> \author Manuel Guidon
    1895             : ! **************************************************************************************************
    1896       64808 :    PURE FUNCTION get_1D_idx(i, j, N)
    1897             :       INTEGER, INTENT(IN)                                :: i, j
    1898             :       INTEGER(int_8), INTENT(IN)                         :: N
    1899             :       INTEGER(int_8)                                     :: get_1D_idx
    1900             : 
    1901             :       INTEGER(int_8)                                     :: min_ij
    1902             : 
    1903       64808 :       min_ij = MIN(i, j)
    1904       64808 :       get_1D_idx = min_ij*N + MAX(i, j) - (min_ij - 1)*min_ij/2 - N
    1905             : 
    1906       64808 :    END FUNCTION get_1D_idx
    1907             : 
    1908             : ! **************************************************************************************************
    1909             : !> \brief ...
    1910             : !> \param natom ...
    1911             : !> \param nkind ...
    1912             : !> \param list_ij ...
    1913             : !> \param list_kl ...
    1914             : !> \param set_list_ij ...
    1915             : !> \param set_list_kl ...
    1916             : !> \param iatom_start ...
    1917             : !> \param iatom_end ...
    1918             : !> \param jatom_start ...
    1919             : !> \param jatom_end ...
    1920             : !> \param katom_start ...
    1921             : !> \param katom_end ...
    1922             : !> \param latom_start ...
    1923             : !> \param latom_end ...
    1924             : !> \param particle_set ...
    1925             : !> \param coeffs_set ...
    1926             : !> \param coeffs_kind ...
    1927             : !> \param is_assoc_atomic_block_global ...
    1928             : !> \param do_periodic ...
    1929             : !> \param kind_of ...
    1930             : !> \param basis_parameter ...
    1931             : !> \param pmax_set ...
    1932             : !> \param pmax_atom ...
    1933             : !> \param pmax_blocks ...
    1934             : !> \param cell ...
    1935             : !> \param do_p_screening ...
    1936             : !> \param map_atom_to_kind_atom ...
    1937             : !> \param eval_type ...
    1938             : !> \param log10_eps_schwarz ...
    1939             : !> \param log_2 ...
    1940             : !> \param coeffs_kind_max0 ...
    1941             : !> \param use_virial ...
    1942             : !> \param atomic_pair_list ...
    1943             : !> \return ...
    1944             : ! **************************************************************************************************
    1945      303066 :    FUNCTION estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
    1946             :                                 iatom_start, iatom_end, jatom_start, jatom_end, &
    1947             :                                 katom_start, katom_end, latom_start, latom_end, &
    1948             :                                 particle_set, &
    1949      303066 :                                 coeffs_set, coeffs_kind, &
    1950      303066 :                                 is_assoc_atomic_block_global, do_periodic, &
    1951             :                                 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
    1952             :                                 cell, &
    1953             :                                 do_p_screening, map_atom_to_kind_atom, eval_type, &
    1954             :                                 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
    1955      303066 :                                 atomic_pair_list)
    1956             : 
    1957             :       INTEGER, INTENT(IN)                                :: natom, nkind
    1958             :       TYPE(pair_list_type)                               :: list_ij, list_kl
    1959             :       TYPE(pair_set_list_type), DIMENSION(:)             :: set_list_ij, set_list_kl
    1960             :       INTEGER, INTENT(IN)                                :: iatom_start, iatom_end, jatom_start, &
    1961             :                                                             jatom_end, katom_start, katom_end, &
    1962             :                                                             latom_start, latom_end
    1963             :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
    1964             :       TYPE(hfx_screen_coeff_type), &
    1965             :          DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
    1966             :       TYPE(hfx_screen_coeff_type), &
    1967             :          DIMENSION(nkind, nkind)                         :: coeffs_kind
    1968             :       INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
    1969             :       LOGICAL                                            :: do_periodic
    1970             :       INTEGER                                            :: kind_of(*)
    1971             :       TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
    1972             :       TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
    1973             :       REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
    1974             :       REAL(dp)                                           :: pmax_blocks
    1975             :       TYPE(cell_type), POINTER                           :: cell
    1976             :       LOGICAL, INTENT(IN)                                :: do_p_screening
    1977             :       INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
    1978             :       INTEGER, INTENT(IN)                                :: eval_type
    1979             :       REAL(dp)                                           :: log10_eps_schwarz, log_2, &
    1980             :                                                             coeffs_kind_max0
    1981             :       LOGICAL, INTENT(IN)                                :: use_virial
    1982             :       LOGICAL, DIMENSION(natom, natom)                   :: atomic_pair_list
    1983             :       INTEGER(int_8)                                     :: estimate_block_cost
    1984             : 
    1985             :       INTEGER :: i_list_ij, i_list_kl, i_set_list_ij, i_set_list_ij_start, i_set_list_ij_stop, &
    1986             :                  i_set_list_kl, i_set_list_kl_start, i_set_list_kl_stop, iatom, ikind, iset, jatom, jkind, &
    1987             :                  jset, katom, kind_kind_idx, kkind, kset, latom, lkind, lset, swap_id
    1988      303066 :       INTEGER, DIMENSION(:), POINTER                     :: npgfa, npgfb, npgfc, npgfd, nsgfa, &
    1989      303066 :                                                             nsgfb, nsgfc, nsgfd
    1990             :       REAL(dp)                                           :: actual_pmax_atom, cost_tmp, max_val1, &
    1991             :                                                             max_val2, pmax_entry, rab2, rcd2, &
    1992             :                                                             screen_kind_ij, screen_kind_kl
    1993      303066 :       REAL(dp), DIMENSION(:, :), POINTER                 :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
    1994             : 
    1995      303066 :       estimate_block_cost = 0_int_8
    1996             : 
    1997             :       CALL build_pair_list(natom, list_ij, set_list_ij, iatom_start, iatom_end, jatom_start, jatom_end, &
    1998             :                            kind_of, basis_parameter, particle_set, &
    1999             :                            do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
    2000      303066 :                            log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
    2001             : 
    2002             :       CALL build_pair_list(natom, list_kl, set_list_kl, katom_start, katom_end, latom_start, latom_end, &
    2003             :                            kind_of, basis_parameter, particle_set, &
    2004             :                            do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
    2005      303066 :                            log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
    2006             : 
    2007      575868 :       DO i_list_ij = 1, list_ij%n_element
    2008      272802 :          iatom = list_ij%elements(i_list_ij)%pair(1)
    2009      272802 :          jatom = list_ij%elements(i_list_ij)%pair(2)
    2010      272802 :          i_set_list_ij_start = list_ij%elements(i_list_ij)%set_bounds(1)
    2011      272802 :          i_set_list_ij_stop = list_ij%elements(i_list_ij)%set_bounds(2)
    2012      272802 :          ikind = list_ij%elements(i_list_ij)%kind_pair(1)
    2013      272802 :          jkind = list_ij%elements(i_list_ij)%kind_pair(2)
    2014      272802 :          rab2 = list_ij%elements(i_list_ij)%dist2
    2015             : 
    2016      272802 :          nsgfa => basis_parameter(ikind)%nsgf
    2017      272802 :          nsgfb => basis_parameter(jkind)%nsgf
    2018      272802 :          npgfa => basis_parameter(ikind)%npgf
    2019      272802 :          npgfb => basis_parameter(jkind)%npgf
    2020             : 
    2021      830352 :          DO i_list_kl = 1, list_kl%n_element
    2022             : 
    2023      254484 :             katom = list_kl%elements(i_list_kl)%pair(1)
    2024      254484 :             latom = list_kl%elements(i_list_kl)%pair(2)
    2025             : 
    2026      254484 :             IF (.NOT. (katom + latom <= iatom + jatom)) CYCLE
    2027      140198 :             IF (((iatom + jatom) .EQ. (katom + latom)) .AND. (katom < iatom)) CYCLE
    2028             : 
    2029      134582 :             IF (eval_type == hfx_do_eval_forces) THEN
    2030       14558 :                IF (.NOT. use_virial) THEN
    2031       13176 :                   IF ((iatom == jatom .AND. iatom == katom .AND. iatom == latom)) CYCLE
    2032             :                END IF
    2033             :             END IF
    2034             : 
    2035      133266 :             i_set_list_kl_start = list_kl%elements(i_list_kl)%set_bounds(1)
    2036      133266 :             i_set_list_kl_stop = list_kl%elements(i_list_kl)%set_bounds(2)
    2037      133266 :             kkind = list_kl%elements(i_list_kl)%kind_pair(1)
    2038      133266 :             lkind = list_kl%elements(i_list_kl)%kind_pair(2)
    2039      133266 :             rcd2 = list_kl%elements(i_list_kl)%dist2
    2040             : 
    2041      133266 :             nsgfc => basis_parameter(kkind)%nsgf
    2042      133266 :             nsgfd => basis_parameter(lkind)%nsgf
    2043      133266 :             npgfc => basis_parameter(kkind)%npgf
    2044      133266 :             npgfd => basis_parameter(lkind)%npgf
    2045             : 
    2046      133266 :             IF (do_p_screening) THEN
    2047             :                actual_pmax_atom = MAX(pmax_atom(katom, iatom), &
    2048             :                                       pmax_atom(latom, jatom), &
    2049             :                                       pmax_atom(latom, iatom), &
    2050       16562 :                                       pmax_atom(katom, jatom))
    2051             :             ELSE
    2052             :                actual_pmax_atom = 0.0_dp
    2053             :             END IF
    2054             : 
    2055             :             screen_kind_ij = coeffs_kind(jkind, ikind)%x(1)*rab2 + &
    2056      133266 :                              coeffs_kind(jkind, ikind)%x(2)
    2057             :             screen_kind_kl = coeffs_kind(lkind, kkind)%x(1)*rcd2 + &
    2058      133266 :                              coeffs_kind(lkind, kkind)%x(2)
    2059      133266 :             IF (screen_kind_ij + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) CYCLE
    2060             : 
    2061      126088 :             IF (.NOT. (is_assoc_atomic_block_global(latom, iatom) >= 1 .AND. &
    2062             :                        is_assoc_atomic_block_global(katom, iatom) >= 1 .AND. &
    2063             :                        is_assoc_atomic_block_global(katom, jatom) >= 1 .AND. &
    2064             :                        is_assoc_atomic_block_global(latom, jatom) >= 1)) CYCLE
    2065             : 
    2066      122904 :             IF (do_p_screening) THEN
    2067        6378 :                SELECT CASE (eval_type)
    2068             :                CASE (hfx_do_eval_energy)
    2069        6378 :                   swap_id = 0
    2070        6378 :                   kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8)))
    2071        6378 :                   IF (ikind >= kkind) THEN
    2072             :                      ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2073             :                                                                map_atom_to_kind_atom(katom), &
    2074        5746 :                                                                map_atom_to_kind_atom(iatom))
    2075             :                   ELSE
    2076             :                      ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2077             :                                                                map_atom_to_kind_atom(iatom), &
    2078         632 :                                                                map_atom_to_kind_atom(katom))
    2079         632 :                      swap_id = swap_id + 1
    2080             :                   END IF
    2081        6378 :                   kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8)))
    2082        6378 :                   IF (jkind >= lkind) THEN
    2083             :                      ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2084             :                                                                map_atom_to_kind_atom(latom), &
    2085        5978 :                                                                map_atom_to_kind_atom(jatom))
    2086             :                   ELSE
    2087             :                      ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2088             :                                                                map_atom_to_kind_atom(jatom), &
    2089         400 :                                                                map_atom_to_kind_atom(latom))
    2090         400 :                      swap_id = swap_id + 2
    2091             :                   END IF
    2092        6378 :                   kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8)))
    2093        6378 :                   IF (ikind >= lkind) THEN
    2094             :                      ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2095             :                                                                map_atom_to_kind_atom(latom), &
    2096        3594 :                                                                map_atom_to_kind_atom(iatom))
    2097             :                   ELSE
    2098             :                      ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2099             :                                                                map_atom_to_kind_atom(iatom), &
    2100        2784 :                                                                map_atom_to_kind_atom(latom))
    2101        2784 :                      swap_id = swap_id + 4
    2102             :                   END IF
    2103        6378 :                   kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8)))
    2104        6378 :                   IF (jkind >= kkind) THEN
    2105             :                      ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2106             :                                                                map_atom_to_kind_atom(katom), &
    2107        6348 :                                                                map_atom_to_kind_atom(jatom))
    2108             :                   ELSE
    2109             :                      ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2110             :                                                                map_atom_to_kind_atom(jatom), &
    2111          30 :                                                                map_atom_to_kind_atom(katom))
    2112          30 :                      swap_id = swap_id + 8
    2113             :                   END IF
    2114             :                CASE (hfx_do_eval_forces)
    2115        9824 :                   swap_id = 16
    2116        9824 :                   kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8)))
    2117        9824 :                   IF (ikind >= kkind) THEN
    2118             :                      ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2119             :                                                                map_atom_to_kind_atom(katom), &
    2120        9290 :                                                                map_atom_to_kind_atom(iatom))
    2121             :                   ELSE
    2122             :                      ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2123             :                                                                map_atom_to_kind_atom(iatom), &
    2124         534 :                                                                map_atom_to_kind_atom(katom))
    2125         534 :                      swap_id = swap_id + 1
    2126             :                   END IF
    2127        9824 :                   kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8)))
    2128        9824 :                   IF (jkind >= lkind) THEN
    2129             :                      ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2130             :                                                                map_atom_to_kind_atom(latom), &
    2131        9824 :                                                                map_atom_to_kind_atom(jatom))
    2132             :                   ELSE
    2133             :                      ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2134             :                                                                map_atom_to_kind_atom(jatom), &
    2135           0 :                                                                map_atom_to_kind_atom(latom))
    2136           0 :                      swap_id = swap_id + 2
    2137             :                   END IF
    2138        9824 :                   kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8)))
    2139        9824 :                   IF (ikind >= lkind) THEN
    2140             :                      ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2141             :                                                                map_atom_to_kind_atom(latom), &
    2142        8074 :                                                                map_atom_to_kind_atom(iatom))
    2143             :                   ELSE
    2144             :                      ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2145             :                                                                map_atom_to_kind_atom(iatom), &
    2146        1750 :                                                                map_atom_to_kind_atom(latom))
    2147        1750 :                      swap_id = swap_id + 4
    2148             :                   END IF
    2149        9824 :                   kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8)))
    2150       26026 :                   IF (jkind >= kkind) THEN
    2151             :                      ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2152             :                                                                map_atom_to_kind_atom(katom), &
    2153        9824 :                                                                map_atom_to_kind_atom(jatom))
    2154             :                   ELSE
    2155             :                      ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2156             :                                                                map_atom_to_kind_atom(jatom), &
    2157           0 :                                                                map_atom_to_kind_atom(katom))
    2158           0 :                      swap_id = swap_id + 8
    2159             :                   END IF
    2160             :                END SELECT
    2161             :             END IF
    2162             : 
    2163     1080388 :             DO i_set_list_ij = i_set_list_ij_start, i_set_list_ij_stop
    2164      684682 :                iset = set_list_ij(i_set_list_ij)%pair(1)
    2165      684682 :                jset = set_list_ij(i_set_list_ij)%pair(2)
    2166             : 
    2167             :                max_val1 = coeffs_set(jset, iset, jkind, ikind)%x(1)*rab2 + &
    2168      684682 :                           coeffs_set(jset, iset, jkind, ikind)%x(2)
    2169             : 
    2170      684682 :                IF (max_val1 + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) CYCLE
    2171     9928560 :                DO i_set_list_kl = i_set_list_kl_start, i_set_list_kl_stop
    2172     9005754 :                   kset = set_list_kl(i_set_list_kl)%pair(1)
    2173     9005754 :                   lset = set_list_kl(i_set_list_kl)%pair(2)
    2174             : 
    2175             :                   max_val2 = max_val1 + (coeffs_set(lset, kset, lkind, kkind)%x(1)*rcd2 + &
    2176     9005754 :                                          coeffs_set(lset, kset, lkind, kkind)%x(2))
    2177             : 
    2178     9005754 :                   IF (max_val2 + actual_pmax_atom < log10_eps_schwarz) CYCLE
    2179     8001728 :                   IF (do_p_screening) THEN
    2180             :                      CALL get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, &
    2181             :                                        iset, jset, kset, lset, &
    2182      995549 :                                        pmax_entry, swap_id)
    2183      995549 :                      IF (eval_type == hfx_do_eval_forces) THEN
    2184      578154 :                         pmax_entry = log_2 + pmax_entry
    2185             :                      END IF
    2186             :                   ELSE
    2187     7006179 :                      pmax_entry = 0.0_dp
    2188             :                   END IF
    2189     8001728 :                   max_val2 = max_val2 + pmax_entry
    2190     8001728 :                   IF (max_val2 < log10_eps_schwarz) CYCLE
    2191      684682 :                   SELECT CASE (eval_type)
    2192             :                   CASE (hfx_do_eval_energy)
    2193             :                      cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
    2194             :                                            npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
    2195             :                                            max_val2/log10_eps_schwarz, &
    2196     6905536 :                                            p1_energy, p2_energy, p3_energy)
    2197     6905536 :                      estimate_block_cost = estimate_block_cost + INT(cost_tmp, KIND=int_8)
    2198             :                   CASE (hfx_do_eval_forces)
    2199             :                      cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
    2200             :                                            npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
    2201             :                                            max_val2/log10_eps_schwarz, &
    2202      914300 :                                            p1_forces, p2_forces, p3_forces)
    2203     8734136 :                      estimate_block_cost = estimate_block_cost + INT(cost_tmp, KIND=int_8)
    2204             :                   END SELECT
    2205             :                END DO ! i_set_list_kl
    2206             :             END DO ! i_set_list_ij
    2207             :          END DO ! i_list_kl
    2208             :       END DO ! i_list_ij
    2209             : 
    2210      303066 :    END FUNCTION estimate_block_cost
    2211             : 
    2212             : ! **************************************************************************************************
    2213             : !> \brief ...
    2214             : !> \param nkind ...
    2215             : !> \param para_env ...
    2216             : !> \param natom ...
    2217             : !> \param block_size ...
    2218             : !> \param nblock ...
    2219             : !> \param blocks ...
    2220             : !> \param list_ij ...
    2221             : !> \param list_kl ...
    2222             : !> \param set_list_ij ...
    2223             : !> \param set_list_kl ...
    2224             : !> \param particle_set ...
    2225             : !> \param coeffs_set ...
    2226             : !> \param coeffs_kind ...
    2227             : !> \param is_assoc_atomic_block_global ...
    2228             : !> \param do_periodic ...
    2229             : !> \param kind_of ...
    2230             : !> \param basis_parameter ...
    2231             : !> \param pmax_set ...
    2232             : !> \param pmax_atom ...
    2233             : !> \param pmax_blocks ...
    2234             : !> \param cell ...
    2235             : !> \param do_p_screening ...
    2236             : !> \param map_atom_to_kind_atom ...
    2237             : !> \param eval_type ...
    2238             : !> \param log10_eps_schwarz ...
    2239             : !> \param log_2 ...
    2240             : !> \param coeffs_kind_max0 ...
    2241             : !> \param use_virial ...
    2242             : !> \param atomic_pair_list ...
    2243             : ! **************************************************************************************************
    2244        1216 :    SUBROUTINE init_blocks(nkind, para_env, natom, block_size, nblock, blocks, &
    2245        1216 :                           list_ij, list_kl, set_list_ij, set_list_kl, &
    2246             :                           particle_set, &
    2247             :                           coeffs_set, coeffs_kind, &
    2248        1216 :                           is_assoc_atomic_block_global, do_periodic, &
    2249             :                           kind_of, basis_parameter, pmax_set, pmax_atom, &
    2250             :                           pmax_blocks, cell, &
    2251             :                           do_p_screening, map_atom_to_kind_atom, eval_type, &
    2252             :                           log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
    2253        1216 :                           atomic_pair_list)
    2254             : 
    2255             :       INTEGER, INTENT(IN)                                :: nkind
    2256             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
    2257             :       INTEGER                                            :: natom, block_size, nblock
    2258             :       TYPE(hfx_block_range_type), DIMENSION(1:nblock)    :: blocks
    2259             :       TYPE(pair_list_type)                               :: list_ij, list_kl
    2260             :       TYPE(pair_set_list_type), DIMENSION(:)             :: set_list_ij, set_list_kl
    2261             :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
    2262             :       TYPE(hfx_screen_coeff_type), &
    2263             :          DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
    2264             :       TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
    2265             :          POINTER                                         :: coeffs_kind
    2266             :       INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
    2267             :       LOGICAL                                            :: do_periodic
    2268             :       INTEGER                                            :: kind_of(*)
    2269             :       TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
    2270             :       TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
    2271             :       REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
    2272             :       REAL(dp)                                           :: pmax_blocks
    2273             :       TYPE(cell_type), POINTER                           :: cell
    2274             :       LOGICAL, INTENT(IN)                                :: do_p_screening
    2275             :       INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
    2276             :       INTEGER, INTENT(IN)                                :: eval_type
    2277             :       REAL(dp)                                           :: log10_eps_schwarz, log_2, &
    2278             :                                                             coeffs_kind_max0
    2279             :       LOGICAL, INTENT(IN)                                :: use_virial
    2280             :       LOGICAL, DIMENSION(natom, natom)                   :: atomic_pair_list
    2281             : 
    2282             :       INTEGER                                            :: atom_block, i, iatom_block, iatom_end, &
    2283             :                                                             iatom_start, my_cpu_rank, ncpus
    2284             : 
    2285        5046 :       DO atom_block = 0, nblock - 1
    2286        3830 :          iatom_block = MODULO(atom_block, nblock) + 1
    2287        3830 :          iatom_start = (iatom_block - 1)*block_size + 1
    2288        3830 :          iatom_end = MIN(iatom_block*block_size, natom)
    2289        3830 :          blocks(atom_block + 1)%istart = iatom_start
    2290        3830 :          blocks(atom_block + 1)%iend = iatom_end
    2291        5046 :          blocks(atom_block + 1)%cost = 0_int_8
    2292             :       END DO
    2293             : 
    2294        1216 :       ncpus = para_env%num_pe
    2295        1216 :       my_cpu_rank = para_env%mepos
    2296        5046 :       DO i = 1, nblock
    2297        3830 :          IF (MODULO(i, ncpus) /= my_cpu_rank) THEN
    2298        1894 :             blocks(i)%istart = 0
    2299        1894 :             blocks(i)%iend = 0
    2300        1894 :             CYCLE
    2301             :          END IF
    2302        1936 :          iatom_start = blocks(i)%istart
    2303        1936 :          iatom_end = blocks(i)%iend
    2304             :          blocks(i)%cost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
    2305             :                                               iatom_start, iatom_end, iatom_start, iatom_end, &
    2306             :                                               iatom_start, iatom_end, iatom_start, iatom_end, &
    2307             :                                               particle_set, &
    2308             :                                               coeffs_set, coeffs_kind, &
    2309             :                                               is_assoc_atomic_block_global, do_periodic, &
    2310             :                                               kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
    2311             :                                               cell, &
    2312             :                                               do_p_screening, map_atom_to_kind_atom, eval_type, &
    2313        3152 :                                               log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
    2314             : 
    2315             :       END DO
    2316        1216 :    END SUBROUTINE init_blocks
    2317             : 
    2318             : ! **************************************************************************************************
    2319             : !> \brief ...
    2320             : !> \param para_env ...
    2321             : !> \param x_data ...
    2322             : !> \param iw ...
    2323             : !> \param n_threads ...
    2324             : !> \param i_thread ...
    2325             : !> \param eval_type ...
    2326             : ! **************************************************************************************************
    2327           0 :    SUBROUTINE collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, &
    2328             :                                         eval_type)
    2329             : 
    2330             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
    2331             :       TYPE(hfx_type), POINTER                            :: x_data
    2332             :       INTEGER, INTENT(IN)                                :: iw, n_threads, i_thread, eval_type
    2333             : 
    2334             :       INTEGER                                            :: i, j, k, my_rank, nbins, nranks, &
    2335             :                                                             total_bins
    2336             :       INTEGER(int_8)                                     :: avg_bin, avg_rank, max_bin, max_rank, &
    2337             :                                                             min_bin, min_rank, sum_bin, sum_rank
    2338           0 :       INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: buffer, buffer_in, buffer_out, summary
    2339             :       INTEGER(int_8), ALLOCATABLE, DIMENSION(:), SAVE    :: shm_cost_vector
    2340           0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: bins_per_rank, rdispl, sort_idx
    2341             :       INTEGER, ALLOCATABLE, DIMENSION(:), SAVE           :: shm_bins_per_rank, shm_displ
    2342             : 
    2343           0 :       SELECT CASE (eval_type)
    2344             :       CASE (hfx_do_eval_energy)
    2345           0 :          nbins = SIZE(x_data%distribution_energy)
    2346             :       CASE (hfx_do_eval_forces)
    2347           0 :          nbins = SIZE(x_data%distribution_forces)
    2348             :       END SELECT
    2349             : 
    2350           0 : !$OMP MASTER
    2351           0 :       ALLOCATE (shm_bins_per_rank(n_threads))
    2352           0 :       ALLOCATE (shm_displ(n_threads + 1))
    2353             : !$OMP END MASTER
    2354           0 : !$OMP BARRIER
    2355             : 
    2356           0 :       shm_bins_per_rank(i_thread + 1) = nbins
    2357           0 : !$OMP BARRIER
    2358           0 :       nbins = 0
    2359           0 :       DO i = 1, n_threads
    2360           0 :          nbins = nbins + shm_bins_per_rank(i)
    2361             :       END DO
    2362           0 :       my_rank = para_env%mepos
    2363           0 :       nranks = para_env%num_pe
    2364             : 
    2365           0 : !$OMP BARRIER
    2366           0 : !$OMP MASTER
    2367           0 :       ALLOCATE (bins_per_rank(nranks))
    2368           0 :       bins_per_rank = 0
    2369             : 
    2370           0 :       bins_per_rank(my_rank + 1) = nbins
    2371             : 
    2372           0 :       CALL para_env%sum(bins_per_rank)
    2373             : 
    2374           0 :       total_bins = 0
    2375           0 :       DO i = 1, nranks
    2376           0 :          total_bins = total_bins + bins_per_rank(i)
    2377             :       END DO
    2378             : 
    2379           0 :       ALLOCATE (shm_cost_vector(2*total_bins))
    2380           0 :       shm_cost_vector = -1_int_8
    2381           0 :       shm_displ(1) = 1
    2382           0 :       DO i = 2, n_threads
    2383           0 :          shm_displ(i) = shm_displ(i - 1) + shm_bins_per_rank(i - 1)
    2384             :       END DO
    2385           0 :       shm_displ(n_threads + 1) = nbins + 1
    2386             : !$OMP END MASTER
    2387           0 : !$OMP BARRIER
    2388           0 :       j = 0
    2389           0 :       SELECT CASE (eval_type)
    2390             :       CASE (hfx_do_eval_energy)
    2391           0 :          DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
    2392           0 :             j = j + 1
    2393           0 :             shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_energy(j)%cost
    2394           0 :             shm_cost_vector(2*i) = INT(x_data%distribution_energy(j)%time_first_scf*10000.0_dp, KIND=int_8)
    2395             :          END DO
    2396             :       CASE (hfx_do_eval_forces)
    2397           0 :          DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
    2398           0 :             j = j + 1
    2399           0 :             shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_forces(j)%cost
    2400           0 :             shm_cost_vector(2*i) = INT(x_data%distribution_forces(j)%time_forces*10000.0_dp, KIND=int_8)
    2401             :          END DO
    2402             :       END SELECT
    2403           0 : !$OMP BARRIER
    2404           0 : !$OMP MASTER
    2405             :       ! ** calculate offsets
    2406           0 :       ALLOCATE (rdispl(nranks))
    2407           0 :       bins_per_rank(:) = bins_per_rank(:)*2
    2408           0 :       rdispl(1) = 0
    2409           0 :       DO i = 2, nranks
    2410           0 :          rdispl(i) = rdispl(i - 1) + bins_per_rank(i - 1)
    2411             :       END DO
    2412             : 
    2413           0 :       ALLOCATE (buffer_in(2*nbins))
    2414           0 :       ALLOCATE (buffer_out(2*total_bins))
    2415             : 
    2416           0 :       DO i = 1, nbins
    2417           0 :          buffer_in(2*(i - 1) + 1) = shm_cost_vector(2*(i - 1) + 1)
    2418           0 :          buffer_in(2*i) = shm_cost_vector(2*i)
    2419             :       END DO
    2420             : 
    2421           0 :       CALL para_env%gatherv(buffer_in, buffer_out, bins_per_rank, rdispl)
    2422             : 
    2423           0 :       IF (iw > 0) THEN
    2424             : 
    2425           0 :          ALLOCATE (summary(2*nranks))
    2426           0 :          summary = 0_int_8
    2427             : 
    2428           0 :          WRITE (iw, '( /, 1X, 79("-") )')
    2429           0 :          WRITE (iw, '( " -", 77X, "-" )')
    2430           0 :          SELECT CASE (eval_type)
    2431             :          CASE (hfx_do_eval_energy)
    2432           0 :             WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - ENERGY '
    2433             :          CASE (hfx_do_eval_forces)
    2434           0 :             WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - FORCES '
    2435             :          END SELECT
    2436           0 :          WRITE (iw, '( " -", 77X, "-" )')
    2437           0 :          WRITE (iw, '( 1X, 79("-") )')
    2438             : 
    2439           0 :          WRITE (iw, FMT="(T3,A,T15,A,T35,A,T55,A)") "MPI RANK", "BIN #", "EST cost", "Processing time [s]"
    2440           0 :          WRITE (iw, '( 1X, 79("-"), / )')
    2441           0 :          k = 0
    2442           0 :          DO i = 1, nranks
    2443           0 :             DO j = 1, bins_per_rank(i)/2
    2444           0 :                k = k + 1
    2445             :                WRITE (iw, FMT="(T6,I5,T15,I5,T27,I16,T55,F19.8)") &
    2446           0 :                   i - 1, j, buffer_out(2*(k - 1) + 1), REAL(buffer_out(2*k), dp)/10000.0_dp
    2447           0 :                summary(2*(i - 1) + 1) = summary(2*(i - 1) + 1) + buffer_out(2*(k - 1) + 1)
    2448           0 :                summary(2*i) = summary(2*i) + buffer_out(2*k)
    2449             :             END DO
    2450             :          END DO
    2451             : 
    2452             :          !** Summary
    2453             :          max_bin = 0_int_8
    2454             :          min_bin = HUGE(min_bin)
    2455             :          sum_bin = 0_int_8
    2456           0 :          DO i = 1, total_bins
    2457           0 :             sum_bin = sum_bin + buffer_out(2*i)
    2458           0 :             max_bin = MAX(max_bin, buffer_out(2*i))
    2459           0 :             min_bin = MIN(min_bin, buffer_out(2*i))
    2460             :          END DO
    2461           0 :          avg_bin = sum_bin/total_bins
    2462             : 
    2463           0 :          max_rank = 0_int_8
    2464           0 :          min_rank = HUGE(min_rank)
    2465           0 :          sum_rank = 0_int_8
    2466           0 :          DO i = 1, nranks
    2467           0 :             sum_rank = sum_rank + summary(2*i)
    2468           0 :             max_rank = MAX(max_rank, summary(2*i))
    2469           0 :             min_rank = MIN(min_rank, summary(2*i))
    2470             :          END DO
    2471           0 :          avg_rank = sum_rank/nranks
    2472             : 
    2473           0 :          WRITE (iw, FMT='(/,T3,A,/)') "SUMMARY:"
    2474           0 :          WRITE (iw, FMT="(T3,A,T35,F19.8)") "Max bin", REAL(max_bin, dp)/10000.0_dp
    2475           0 :          WRITE (iw, FMT="(T3,A,T35,F19.8)") "Min bin", REAL(min_bin, dp)/10000.0_dp
    2476           0 :          WRITE (iw, FMT="(T3,A,T35,F19.8)") "Sum bin", REAL(sum_bin, dp)/10000.0_dp
    2477           0 :          WRITE (iw, FMT="(T3,A,T35,F19.8,/)") "Avg bin", REAL(avg_bin, dp)/10000.0_dp
    2478           0 :          WRITE (iw, FMT="(T3,A,T35,F19.8)") "Max rank", REAL(max_rank, dp)/10000.0_dp
    2479           0 :          WRITE (iw, FMT="(T3,A,T35,F19.8)") "Min rank", REAL(min_rank, dp)/10000.0_dp
    2480           0 :          WRITE (iw, FMT="(T3,A,T35,F19.8)") "Sum rank", REAL(sum_rank, dp)/10000.0_dp
    2481           0 :          WRITE (iw, FMT="(T3,A,T35,F19.8,/)") "Avg rank", REAL(avg_rank, dp)/10000.0_dp
    2482             : 
    2483           0 :          ALLOCATE (buffer(nranks))
    2484           0 :          ALLOCATE (sort_idx(nranks))
    2485             : 
    2486           0 :          DO i = 1, nranks
    2487           0 :             buffer(i) = summary(2*i)
    2488             :          END DO
    2489             : 
    2490           0 :          CALL sort(buffer, nranks, sort_idx)
    2491             : 
    2492           0 :          WRITE (iw, FMT="(T3,A,T35,A,T55,A,/)") "MPI RANK", "EST cost", "Processing time [s]"
    2493           0 :          DO i = nranks, 1, -1
    2494           0 :        WRITE (iw, FMT="(T6,I5,T27,I16,T55,F19.8)") sort_idx(i) - 1, summary(2*(sort_idx(i) - 1) + 1), REAL(buffer(i), dp)/10000.0_dp
    2495             :          END DO
    2496             : 
    2497           0 :          DEALLOCATE (summary, buffer, sort_idx)
    2498             : 
    2499             :       END IF
    2500             : 
    2501           0 :       DEALLOCATE (buffer_in, buffer_out, rdispl)
    2502             : 
    2503           0 :       CALL para_env%sync()
    2504             : 
    2505           0 :       DEALLOCATE (shm_bins_per_rank, shm_displ, shm_cost_vector)
    2506             : !$OMP END MASTER
    2507           0 : !$OMP BARRIER
    2508             : 
    2509           0 :    END SUBROUTINE collect_load_balance_info
    2510             : 
    2511             :    #:include 'hfx_get_pmax_val.fypp'
    2512             : 
    2513             : END MODULE hfx_load_balance_methods

Generated by: LCOV version 1.15