LCOV - code coverage report
Current view: top level - src - hfx_load_balance_methods.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:936074a) Lines: 68.1 % 1046 712
Test Date: 2025-12-04 06:27:48 Functions: 64.3 % 14 9

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief Routines for optimizing load balance between processes in HFX calculations
      10              : !> \par History
      11              : !>      04.2008 created [Manuel Guidon]
      12              : !> \author Manuel Guidon
      13              : ! **************************************************************************************************
      14              : MODULE hfx_load_balance_methods
      15              :    USE cell_types, ONLY: cell_type
      16              :    USE cp_files, ONLY: close_file, &
      17              :                        open_file
      18              :    USE message_passing, ONLY: mp_para_env_type
      19              :    USE hfx_pair_list_methods, ONLY: build_atomic_pair_list, &
      20              :                                     build_pair_list
      21              :    USE hfx_types, ONLY: &
      22              :       hfx_basis_type, hfx_block_range_type, hfx_distribution, hfx_load_balance_type, hfx_p_kind, &
      23              :       hfx_screen_coeff_type, hfx_set_distr_energy, hfx_set_distr_forces, hfx_type, &
      24              :       pair_list_type, pair_set_list_type
      25              :    USE input_constants, ONLY: hfx_do_eval_energy, &
      26              :                               hfx_do_eval_forces
      27              :    USE kinds, ONLY: dp, &
      28              :                     int_8
      29              :    USE message_passing, ONLY: mp_waitall, mp_request_type
      30              :    USE parallel_rng_types, ONLY: UNIFORM, &
      31              :                                  rng_stream_type
      32              :    USE particle_types, ONLY: particle_type
      33              :    USE util, ONLY: sort
      34              : #include "./base/base_uses.f90"
      35              : 
      36              :    IMPLICIT NONE
      37              :    PRIVATE
      38              : 
      39              :    PUBLIC :: hfx_load_balance, &
      40              :              hfx_update_load_balance, &
      41              :              collect_load_balance_info, cost_model, p1_energy, p2_energy, p3_energy
      42              : 
      43              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'hfx_load_balance_methods'
      44              : 
      45              :    REAL(KIND=dp), PARAMETER :: p1_energy(12) = [2.9461408209700424_dp, 1.0624718662999657_dp, &
      46              :                                                 -1.91570128356921242E-002_dp, -1.6668495454436603_dp, &
      47              :                                                 1.7512639006523709_dp, -9.76074323945336081E-002_dp, &
      48              :                                                 2.6230786127311889_dp, -0.31870737623014189_dp, &
      49              :                                                 7.9588203912690973_dp, 1.8331423413134813_dp, &
      50              :                                                 -0.15427618665346299_dp, 0.19749436090711650_dp]
      51              :    REAL(KIND=dp), PARAMETER :: p2_energy(12) = [2.3104682960662593_dp, 1.8744052737304417_dp, &
      52              :                                                 -9.36564055598656797E-002_dp, 0.64284973765086939_dp, &
      53              :                                                 1.0137565430060556_dp, -6.80088178288954567E-003_dp, &
      54              :                                                 1.1692629207374552_dp, -2.6314710080507573_dp, &
      55              :                                                 19.237814781880786_dp, 1.0505934173661349_dp, &
      56              :                                                 0.80382371955699250_dp, 0.49903401991818103_dp]
      57              :    REAL(KIND=dp), PARAMETER :: p3_energy(2) = [7.82336287670072350E-002_dp, 0.38073304105744837_dp]
      58              :    REAL(KIND=dp), PARAMETER :: p1_forces(12) = [2.5746279948798874_dp, 1.3420575378609276_dp, &
      59              :                                                 -9.41673106447732111E-002_dp, 0.94568006899317825_dp, &
      60              :                                                 -1.4511897117448544_dp, 0.59178934677316952_dp, &
      61              :                                                 2.7291149361757236_dp, -0.50555512044800210_dp, &
      62              :                                                 8.3508180969609871_dp, 1.6829982496141809_dp, &
      63              :                                                 -0.74895370472152600_dp, 0.43801726744197500_dp]
      64              :    REAL(KIND=dp), PARAMETER :: p2_forces(12) = [2.6398568961569020_dp, 2.3024918834564101_dp, &
      65              :                                                 5.33216585432061581E-003_dp, 0.45572145697283628_dp, &
      66              :                                                 1.8119743851500618_dp, -0.12533918548421166_dp, &
      67              :                                                 -1.4040312084552751_dp, -4.5331650463917859_dp, &
      68              :                                                 12.593431549069477_dp, 1.1311978374487595_dp, &
      69              :                                                 1.4245996087624646_dp, 1.1425350529853495_dp]
      70              :    REAL(KIND=dp), PARAMETER :: p3_forces(2) = [0.12051930516830946_dp, 1.3828051586144336_dp]
      71              : 
      72              : !***
      73              : 
      74              : CONTAINS
      75              : 
      76              : ! **************************************************************************************************
      77              : !> \brief Distributes the computation of eri's to all available processes.
      78              : !> \param x_data Object that stores the indices array
      79              : !> \param eps_schwarz screening parameter
      80              : !> \param particle_set , atomic_kind_set, para_env ...
      81              : !> \param max_set Maximum number of set to be considered
      82              : !> \param para_env para_env
      83              : !> \param coeffs_set screening functions
      84              : !> \param coeffs_kind screening functions
      85              : !> \param is_assoc_atomic_block_global KS-matrix sparsity
      86              : !> \param do_periodic flag for periodicity
      87              : !> \param load_balance_parameter Parameters for Monte-Carlo routines
      88              : !> \param kind_of helper array for mapping
      89              : !> \param basis_parameter Basis set parameters
      90              : !> \param pmax_set Initial screening matrix
      91              : !> \param pmax_atom ...
      92              : !> \param i_thread Process ID of current Thread
      93              : !> \param n_threads Total Number of Threads
      94              : !> \param cell cell
      95              : !> \param do_p_screening Flag for initial p screening
      96              : !> \param map_atom_to_kind_atom ...
      97              : !> \param nkind ...
      98              : !> \param eval_type ...
      99              : !> \param pmax_block ...
     100              : !> \param use_virial ...
     101              : !> \par History
     102              : !>      06.2007 created [Manuel Guidon]
     103              : !>      08.2007 new parallel scheme [Manuel Guidon]
     104              : !>      09.2007 new 'modulo' parellel scheme and Monte Carlo step [Manuel Guidon]
     105              : !>      11.2007 parallelize load balance on box_idx1 [Manuel Guidon]
     106              : !>      02.2009 completely refactored [Manuel Guidon]
     107              : !> \author Manuel Guidon
     108              : !> \note
     109              : !>      The optimization is done via a binning procedure followed by simple
     110              : !>      Monte Carlo procedure:
     111              : !>      In a first step the total amount of integrals in the system is calculated,
     112              : !>      taking into account the sparsity of the KS-matrix , the screening based
     113              : !>      on near/farfield approximations and if desired the screening on an initial
     114              : !>      density matrix.
     115              : !>      In a second step, bins are generate that contain approximately the same number
     116              : !>      of integrals, and a cost for these bins is estimated (currently the number of integrals)
     117              : !>      In a third step, a Monte Carlo procedure optimizes the assignment
     118              : !>      of the different loads to each process
     119              : !>      At the end each process owns an unique array of *atomic* indices-ranges
     120              : !>      that are used to decide whether a process has to calculate a certain
     121              : !>      bunch of integrals or not
     122              : ! **************************************************************************************************
     123         1792 :    SUBROUTINE hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, &
     124              :                                coeffs_set, coeffs_kind, &
     125         1792 :                                is_assoc_atomic_block_global, do_periodic, &
     126              :                                load_balance_parameter, kind_of, basis_parameter, pmax_set, &
     127              :                                pmax_atom, i_thread, n_threads, cell, &
     128              :                                do_p_screening, map_atom_to_kind_atom, nkind, eval_type, &
     129              :                                pmax_block, use_virial)
     130              :       TYPE(hfx_type), POINTER                            :: x_data
     131              :       REAL(dp), INTENT(IN)                               :: eps_schwarz
     132              :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     133              :       INTEGER, INTENT(IN)                                :: max_set
     134              :       TYPE(mp_para_env_type), POINTER                    :: para_env
     135              :       TYPE(hfx_screen_coeff_type), &
     136              :          DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
     137              :       TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
     138              :          POINTER                                         :: coeffs_kind
     139              :       INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
     140              :       LOGICAL                                            :: do_periodic
     141              :       TYPE(hfx_load_balance_type), POINTER               :: load_balance_parameter
     142              :       INTEGER                                            :: kind_of(*)
     143              :       TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
     144              :       TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
     145              :       REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
     146              :       INTEGER, INTENT(IN)                                :: i_thread, n_threads
     147              :       TYPE(cell_type), POINTER                           :: cell
     148              :       LOGICAL, INTENT(IN)                                :: do_p_screening
     149              :       INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
     150              :       INTEGER, INTENT(IN)                                :: nkind, eval_type
     151              :       REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_block
     152              :       LOGICAL, INTENT(IN)                                :: use_virial
     153              : 
     154              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'hfx_load_balance'
     155              : 
     156              :       CHARACTER(LEN=512)                                 :: error_msg
     157              :       INTEGER :: block_size, current_block_id, data_from, dest, handle, handle_inner, &
     158              :                  handle_range, i, iatom_block, iatom_end, iatom_start, ibin, icpu, j, jatom_block, &
     159              :                  jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, &
     160              :                  latom_start, mepos, my_process_id, n_processes, natom, nbins, nblocks, ncpu, &
     161              :                  new_iatom_end, new_iatom_start, new_jatom_end, new_jatom_start, non_empty_blocks, &
     162              :                  objective_block_size, objective_nblocks, source, total_blocks
     163         5376 :       TYPE(mp_request_type), DIMENSION(2) :: req
     164              :       INTEGER(int_8) :: atom_block, cost_per_bin, cost_per_core, current_cost, &
     165              :                         distribution_counter_end, distribution_counter_start, global_quartet_counter, &
     166              :                         local_quartet_counter, self_cost_per_block, tmp_block, total_block_self_cost
     167         1792 :       INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: buffer_in, buffer_out
     168         1792 :       INTEGER(int_8), DIMENSION(:), POINTER              :: local_cost_matrix, recbuffer, &
     169         1792 :                                                             sendbuffer, swapbuffer
     170              :       INTEGER(int_8), DIMENSION(:), POINTER, SAVE        :: cost_matrix
     171              :       INTEGER(int_8), SAVE                               :: shm_global_quartet_counter, &
     172              :                                                             shm_local_quartet_counter
     173         3584 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: rcount, rdispl, tmp_index, tmp_pos, &
     174         1792 :                                                             to_be_sorted
     175              :       INTEGER, DIMENSION(:), POINTER, SAVE               :: shm_distribution_vector
     176              :       INTEGER, SAVE                                      :: shm_nblocks
     177              :       LOGICAL                                            :: changed, last_bin_needs_to_be_filled, &
     178              :                                                             optimized
     179              :       LOGICAL, DIMENSION(:, :), POINTER, SAVE            :: atomic_pair_list
     180              :       REAL(dp)                                           :: coeffs_kind_max0, log10_eps_schwarz, &
     181              :                                                             log_2, pmax_blocks
     182         3584 :       TYPE(hfx_block_range_type), DIMENSION(:), POINTER  :: blocks_guess, tmp_blocks, tmp_blocks2
     183              :       TYPE(hfx_block_range_type), DIMENSION(:), &
     184              :          POINTER, SAVE                                   :: shm_blocks
     185         5376 :       TYPE(hfx_distribution), DIMENSION(:), POINTER      :: binned_dist, ptr_to_tmp_dist, tmp_dist
     186              :       TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
     187              :          SAVE                                            :: full_dist
     188              :       TYPE(pair_list_type)                               :: list_ij, list_kl
     189              :       TYPE(pair_set_list_type), ALLOCATABLE, &
     190         1792 :          DIMENSION(:)                                    :: set_list_ij, set_list_kl
     191              : 
     192         1792 : !$OMP BARRIER
     193         3584 : !$OMP MASTER
     194         1792 :       CALL timeset(routineN, handle)
     195              : !$OMP END MASTER
     196         1792 : !$OMP BARRIER
     197              : 
     198         1792 :       log10_eps_schwarz = LOG10(eps_schwarz)
     199         1792 :       log_2 = LOG10(2.0_dp)
     200        12332 :       coeffs_kind_max0 = MAXVAL(coeffs_kind(:, :)%x(2))
     201         1792 :       ncpu = para_env%num_pe
     202         1792 :       n_processes = ncpu*n_threads
     203         1792 :       natom = SIZE(particle_set)
     204              : 
     205         1792 :       block_size = load_balance_parameter%block_size
     206        34824 :       ALLOCATE (set_list_ij((max_set*block_size)**2))
     207        33032 :       ALLOCATE (set_list_kl((max_set*block_size)**2))
     208              : 
     209         1792 :       IF (.NOT. load_balance_parameter%blocks_initialized) THEN
     210         1236 : !$OMP BARRIER
     211         1236 : !$OMP MASTER
     212         1236 :          CALL timeset(routineN//"_range", handle_range)
     213              : 
     214         1236 :          nblocks = MAX((natom + block_size - 1)/block_size, 1)
     215         7612 :          ALLOCATE (blocks_guess(nblocks))
     216         7612 :          ALLOCATE (tmp_blocks(natom))
     217         6376 :          ALLOCATE (tmp_blocks2(natom))
     218              : 
     219         1236 :          pmax_blocks = 0.0_dp
     220         2472 :          SELECT CASE (eval_type)
     221              :          CASE (hfx_do_eval_energy)
     222         1236 :             atomic_pair_list => x_data%atomic_pair_list
     223              :          CASE (hfx_do_eval_forces)
     224         1236 :             atomic_pair_list => x_data%atomic_pair_list_forces
     225              :          END SELECT
     226        21076 :          atomic_pair_list = .TRUE.
     227              :          CALL init_blocks(nkind, para_env, natom, block_size, nblocks, blocks_guess, &
     228              :                           list_ij, list_kl, set_list_ij, set_list_kl, &
     229              :                           particle_set, &
     230              :                           coeffs_set, coeffs_kind, &
     231              :                           is_assoc_atomic_block_global, do_periodic, &
     232              :                           kind_of, basis_parameter, pmax_set, pmax_atom, &
     233              :                           pmax_blocks, cell, &
     234              :                           do_p_screening, map_atom_to_kind_atom, eval_type, &
     235         1236 :                           log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     236              : 
     237         1236 :          total_block_self_cost = 0
     238              : 
     239         5140 :          DO i = 1, nblocks
     240         5140 :             total_block_self_cost = total_block_self_cost + blocks_guess(i)%cost
     241              :          END DO
     242              : 
     243         1236 :          CALL para_env%sum(total_block_self_cost)
     244              : 
     245         1236 :          objective_block_size = load_balance_parameter%block_size
     246         1236 :          objective_nblocks = MAX((natom + objective_block_size - 1)/objective_block_size, 1)
     247              : 
     248         1236 :          self_cost_per_block = (total_block_self_cost + objective_nblocks - 1)/(objective_nblocks)
     249              : 
     250         5140 :          DO i = 1, nblocks
     251         5140 :             tmp_blocks2(i) = blocks_guess(i)
     252              :          END DO
     253              : 
     254              :          optimized = .FALSE.
     255              :          i = 0
     256         2472 :          DO WHILE (.NOT. optimized)
     257         1236 :             i = i + 1
     258         1236 :             current_block_id = 0
     259         1236 :             changed = .FALSE.
     260         5140 :             DO atom_block = 1, nblocks
     261         3904 :                current_block_id = current_block_id + 1
     262         3904 :                iatom_start = tmp_blocks2(atom_block)%istart
     263         3904 :                iatom_end = tmp_blocks2(atom_block)%iend
     264         5140 :                IF (tmp_blocks2(atom_block)%cost > 1.5_dp*self_cost_per_block .AND. iatom_end - iatom_start > 0) THEN
     265            0 :                   changed = .TRUE.
     266            0 :                   new_iatom_start = iatom_start
     267            0 :                   new_iatom_end = (iatom_end - iatom_start + 1)/2 + iatom_start - 1
     268            0 :                   new_jatom_start = new_iatom_end + 1
     269            0 :                   new_jatom_end = iatom_end
     270            0 :                   tmp_blocks(current_block_id)%istart = new_iatom_start
     271            0 :                   tmp_blocks(current_block_id)%iend = new_iatom_end
     272              :                   tmp_blocks(current_block_id)%cost = estimate_block_cost( &
     273              :                                                       natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
     274              :                                                       new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
     275              :                                                       new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
     276              :                                                       particle_set, &
     277              :                                                       coeffs_set, coeffs_kind, &
     278              :                                                       is_assoc_atomic_block_global, do_periodic, &
     279              :                                                       kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
     280              :                                                       cell, &
     281              :                                                       do_p_screening, map_atom_to_kind_atom, eval_type, &
     282            0 :                                                       log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     283            0 :                   current_block_id = current_block_id + 1
     284            0 :                   tmp_blocks(current_block_id)%istart = new_jatom_start
     285            0 :                   tmp_blocks(current_block_id)%iend = new_jatom_end
     286              :                   tmp_blocks(current_block_id)%cost = estimate_block_cost( &
     287              :                                                       natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
     288              :                                                       new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
     289              :                                                       new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
     290              :                                                       particle_set, &
     291              :                                                       coeffs_set, coeffs_kind, &
     292              :                                                       is_assoc_atomic_block_global, do_periodic, &
     293              :                                                       kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
     294              :                                                       cell, &
     295              :                                                       do_p_screening, map_atom_to_kind_atom, eval_type, &
     296            0 :                                                       log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     297              :                ELSE
     298         3904 :                   tmp_blocks(current_block_id)%istart = iatom_start
     299         3904 :                   tmp_blocks(current_block_id)%iend = iatom_end
     300         3904 :                   tmp_blocks(current_block_id)%cost = tmp_blocks2(atom_block)%cost
     301              :                END IF
     302              :             END DO
     303         1236 :             IF (.NOT. changed) optimized = .TRUE.
     304         1236 :             IF (i > 20) optimized = .TRUE.
     305         1236 :             nblocks = current_block_id
     306         6376 :             DO atom_block = 1, nblocks
     307         5140 :                tmp_blocks2(atom_block) = tmp_blocks(atom_block)
     308              :             END DO
     309              :          END DO
     310              : 
     311         1236 :          DEALLOCATE (tmp_blocks2)
     312              : 
     313              :          ! ** count number of non empty blocks on each node
     314         1236 :          non_empty_blocks = 0
     315         5140 :          DO atom_block = 1, nblocks
     316         3904 :             IF (tmp_blocks(atom_block)%istart == 0) CYCLE
     317         5140 :             non_empty_blocks = non_empty_blocks + 1
     318              :          END DO
     319              : 
     320         3708 :          ALLOCATE (rcount(ncpu))
     321         3706 :          rcount = 0
     322         1236 :          rcount(para_env%mepos + 1) = non_empty_blocks
     323         1236 :          CALL para_env%sum(rcount)
     324              : 
     325              :          ! ** sum all non_empty_blocks
     326         1236 :          total_blocks = 0
     327         3706 :          DO i = 1, ncpu
     328         3706 :             total_blocks = total_blocks + rcount(i)
     329              :          END DO
     330              : 
     331              :          ! ** calculate offsets
     332         2472 :          ALLOCATE (rdispl(ncpu))
     333         3706 :          rcount(:) = rcount(:)*3
     334         1236 :          rdispl(1) = 0
     335         2470 :          DO i = 2, ncpu
     336         2470 :             rdispl(i) = rdispl(i - 1) + rcount(i - 1)
     337              :          END DO
     338              : 
     339         3680 :          ALLOCATE (buffer_in(3*non_empty_blocks))
     340              : 
     341         1236 :          non_empty_blocks = 0
     342         5140 :          DO atom_block = 1, nblocks
     343         3904 :             IF (tmp_blocks(atom_block)%istart == 0) CYCLE
     344         1955 :             buffer_in(non_empty_blocks*3 + 1) = tmp_blocks(atom_block)%istart
     345         1955 :             buffer_in(non_empty_blocks*3 + 2) = tmp_blocks(atom_block)%iend
     346         1955 :             buffer_in(non_empty_blocks*3 + 3) = tmp_blocks(atom_block)%cost
     347         5140 :             non_empty_blocks = non_empty_blocks + 1
     348              :          END DO
     349              : 
     350         1236 :          nblocks = total_blocks
     351              : 
     352         7612 :          ALLOCATE (tmp_blocks2(nblocks))
     353              : 
     354         3708 :          ALLOCATE (buffer_out(3*nblocks))
     355              : 
     356              :          ! ** Gather all three arrays
     357         1236 :          CALL para_env%allgatherv(buffer_in, buffer_out, rcount, rdispl)
     358              : 
     359         5140 :          DO i = 1, nblocks
     360         3904 :             tmp_blocks2(i)%istart = INT(buffer_out((i - 1)*3 + 1))
     361         3904 :             tmp_blocks2(i)%iend = INT(buffer_out((i - 1)*3 + 2))
     362         5140 :             tmp_blocks2(i)%cost = buffer_out((i - 1)*3 + 3)
     363              :          END DO
     364              : 
     365              :          ! ** Now we sort the blocks
     366         3708 :          ALLOCATE (to_be_sorted(nblocks))
     367         2472 :          ALLOCATE (tmp_index(nblocks))
     368              : 
     369         5140 :          DO atom_block = 1, nblocks
     370         5140 :             to_be_sorted(atom_block) = tmp_blocks2(atom_block)%istart
     371              :          END DO
     372              : 
     373         1236 :          CALL sort(to_be_sorted, nblocks, tmp_index)
     374              : 
     375         6376 :          ALLOCATE (x_data%blocks(nblocks))
     376              : 
     377         5140 :          DO atom_block = 1, nblocks
     378         5140 :             x_data%blocks(atom_block) = tmp_blocks2(tmp_index(atom_block))
     379              :          END DO
     380              : 
     381         1236 :          shm_blocks => x_data%blocks
     382         1236 :          shm_nblocks = nblocks
     383              : 
     384              :          ! ** Set nblocks in structure
     385         1236 :          load_balance_parameter%nblocks = nblocks
     386              : 
     387         1236 :          DEALLOCATE (blocks_guess, tmp_blocks, tmp_blocks2)
     388              : 
     389         1236 :          DEALLOCATE (rcount, rdispl, buffer_in, buffer_out, to_be_sorted, tmp_index)
     390              : 
     391         1236 :          load_balance_parameter%blocks_initialized = .TRUE.
     392              : 
     393         9044 :          x_data%blocks = shm_blocks
     394         1236 :          load_balance_parameter%nblocks = shm_nblocks
     395         1236 :          load_balance_parameter%blocks_initialized = .TRUE.
     396              : 
     397         4944 :          ALLOCATE (x_data%pmax_block(shm_nblocks, shm_nblocks))
     398        21076 :          x_data%pmax_block = 0.0_dp
     399         1236 :          pmax_block => x_data%pmax_block
     400         2472 :          CALL timestop(handle_range)
     401              : !$OMP END MASTER
     402         1236 : !$OMP BARRIER
     403              : 
     404         1236 :          IF (.NOT. load_balance_parameter%blocks_initialized) THEN
     405            0 :             ALLOCATE (x_data%blocks(shm_nblocks))
     406            0 :             x_data%blocks = shm_blocks
     407            0 :             load_balance_parameter%nblocks = shm_nblocks
     408            0 :             load_balance_parameter%blocks_initialized = .TRUE.
     409              :          END IF
     410              :          !! ** precalculate maximum density matrix elements in blocks
     411         1236 : !$OMP BARRIER
     412              :       END IF
     413              : 
     414         1792 : !$OMP BARRIER
     415         1792 : !$OMP MASTER
     416         1792 :       pmax_block => x_data%pmax_block
     417        28760 :       pmax_block = 0.0_dp
     418         1792 :       IF (do_p_screening) THEN
     419         1470 :          DO iatom_block = 1, shm_nblocks
     420         1128 :             iatom_start = x_data%blocks(iatom_block)%istart
     421         1128 :             iatom_end = x_data%blocks(iatom_block)%iend
     422         5678 :             DO jatom_block = 1, shm_nblocks
     423         4208 :                jatom_start = x_data%blocks(jatom_block)%istart
     424         4208 :                jatom_end = x_data%blocks(jatom_block)%iend
     425        13752 :                pmax_block(iatom_block, jatom_block) = MAXVAL(pmax_atom(iatom_start:iatom_end, jatom_start:jatom_end))
     426              :             END DO
     427              :          END DO
     428              :       END IF
     429              : 
     430         3028 :       SELECT CASE (eval_type)
     431              :       CASE (hfx_do_eval_energy)
     432         1236 :          atomic_pair_list => x_data%atomic_pair_list
     433              :       CASE (hfx_do_eval_forces)
     434         1792 :          atomic_pair_list => x_data%atomic_pair_list_forces
     435              :       END SELECT
     436              :       CALL build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, &
     437              :                                   do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, &
     438         1792 :                                   x_data%blocks)
     439              : 
     440              : !$OMP END MASTER
     441         1792 : !$OMP BARRIER
     442              : 
     443              :       !! If there is only 1 cpu skip the binning
     444         1792 :       IF (n_processes == 1) THEN
     445            4 :          ALLOCATE (tmp_dist(1))
     446            2 :          tmp_dist(1)%number_of_atom_quartets = HUGE(tmp_dist(1)%number_of_atom_quartets)
     447            2 :          tmp_dist(1)%istart = 0_int_8
     448            2 :          ptr_to_tmp_dist => tmp_dist(:)
     449            4 :          SELECT CASE (eval_type)
     450              :          CASE (hfx_do_eval_energy)
     451            2 :             CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
     452              :          CASE (hfx_do_eval_forces)
     453            2 :             CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
     454              :          END SELECT
     455            2 :          DEALLOCATE (tmp_dist)
     456              :       ELSE
     457              :          !! Calculate total numbers of integrals that have to be calculated (wrt screening and symmetry)
     458         1790 : !$OMP BARRIER
     459         1790 : !$OMP MASTER
     460         1790 :          CALL timeset(routineN//"_count", handle_inner)
     461              : !$OMP END MASTER
     462         1790 : !$OMP BARRIER
     463              : 
     464         1790 :          cost_per_core = 0_int_8
     465         1790 :          my_process_id = para_env%mepos*n_threads + i_thread
     466         1790 :          nblocks = load_balance_parameter%nblocks
     467              : 
     468       483973 :          DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
     469              : 
     470       482183 :             latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
     471       482183 :             tmp_block = atom_block/nblocks
     472       482183 :             katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
     473       482183 :             IF (latom_block < katom_block) CYCLE
     474       270289 :             tmp_block = tmp_block/nblocks
     475       270289 :             jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
     476       270289 :             tmp_block = tmp_block/nblocks
     477       270289 :             iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
     478       270289 :             IF (jatom_block < iatom_block) CYCLE
     479              : 
     480       152416 :             iatom_start = x_data%blocks(iatom_block)%istart
     481       152416 :             iatom_end = x_data%blocks(iatom_block)%iend
     482       152416 :             jatom_start = x_data%blocks(jatom_block)%istart
     483       152416 :             jatom_end = x_data%blocks(jatom_block)%iend
     484       152416 :             katom_start = x_data%blocks(katom_block)%istart
     485       152416 :             katom_end = x_data%blocks(katom_block)%iend
     486       152416 :             latom_start = x_data%blocks(latom_block)%istart
     487       152416 :             latom_end = x_data%blocks(latom_block)%iend
     488              : 
     489       288344 :             SELECT CASE (eval_type)
     490              :             CASE (hfx_do_eval_energy)
     491              :                pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
     492              :                                  pmax_block(latom_block, jatom_block), &
     493              :                                  pmax_block(latom_block, iatom_block), &
     494       135928 :                                  pmax_block(katom_block, jatom_block))
     495              :             CASE (hfx_do_eval_forces)
     496              :                pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
     497              :                                  pmax_block(latom_block, jatom_block), &
     498              :                                  pmax_block(latom_block, iatom_block) + &
     499       152416 :                                  pmax_block(katom_block, jatom_block))
     500              :             END SELECT
     501              : 
     502       152416 :             IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
     503              : 
     504              :             cost_per_core = cost_per_core &
     505              :                             + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
     506              :                                                   iatom_start, iatom_end, jatom_start, jatom_end, &
     507              :                                                   katom_start, katom_end, latom_start, latom_end, &
     508              :                                                   particle_set, &
     509              :                                                   coeffs_set, coeffs_kind, &
     510              :                                                   is_assoc_atomic_block_global, do_periodic, &
     511              :                                                   kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
     512              :                                                   cell, &
     513              :                                                   do_p_screening, map_atom_to_kind_atom, eval_type, &
     514       483973 :                                                   log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     515              : 
     516              :          END DO ! atom_block
     517              : 
     518         1790 :          nbins = load_balance_parameter%nbins
     519         1790 :          cost_per_bin = (cost_per_core + nbins - 1)/(nbins)
     520              : 
     521         1790 : !$OMP BARRIER
     522         1790 : !$OMP MASTER
     523         1790 :          CALL timestop(handle_inner)
     524              : !$OMP END MASTER
     525         1790 : !$OMP BARRIER
     526              : 
     527              : ! new load balancing test
     528              :          IF (.FALSE.) THEN
     529              :             CALL hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
     530              :                                             natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
     531              :                                             particle_set, &
     532              :                                             coeffs_set, coeffs_kind, &
     533              :                                             is_assoc_atomic_block_global, do_periodic, &
     534              :                                             kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
     535              :                                             cell, x_data, para_env, pmax_block, &
     536              :                                             do_p_screening, map_atom_to_kind_atom, eval_type, &
     537              :                                             log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     538              :          END IF
     539              : 
     540         1790 : !$OMP BARRIER
     541         1790 : !$OMP MASTER
     542         1790 :          CALL timeset(routineN//"_bin", handle_inner)
     543              : !$OMP END MASTER
     544         1790 : !$OMP BARRIER
     545              : 
     546       119930 :          ALLOCATE (binned_dist(nbins))
     547       116350 :          binned_dist(:)%istart = -1_int_8
     548       116350 :          binned_dist(:)%number_of_atom_quartets = 0_int_8
     549       116350 :          binned_dist(:)%cost = 0_int_8
     550       116350 :          binned_dist(:)%time_first_scf = 0.0_dp
     551       116350 :          binned_dist(:)%time_other_scf = 0.0_dp
     552       116350 :          binned_dist(:)%time_forces = 0.0_dp
     553              : 
     554         1790 :          current_cost = 0
     555         1790 :          mepos = 1
     556         1790 :          distribution_counter_start = 1
     557         1790 :          distribution_counter_end = 0
     558         1790 :          ibin = 1
     559              : 
     560         1790 :          global_quartet_counter = 0
     561         1790 :          local_quartet_counter = 0
     562         1790 :          last_bin_needs_to_be_filled = .FALSE.
     563       483973 :          DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
     564       482183 :             latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
     565       482183 :             tmp_block = atom_block/nblocks
     566       482183 :             katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
     567       482183 :             IF (latom_block < katom_block) CYCLE
     568       270289 :             tmp_block = tmp_block/nblocks
     569       270289 :             jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
     570       270289 :             tmp_block = tmp_block/nblocks
     571       270289 :             iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
     572       270289 :             IF (jatom_block < iatom_block) CYCLE
     573              : 
     574       152416 :             distribution_counter_end = distribution_counter_end + 1
     575       152416 :             global_quartet_counter = global_quartet_counter + 1
     576       152416 :             last_bin_needs_to_be_filled = .TRUE.
     577              : 
     578       152416 :             IF (binned_dist(ibin)%istart == -1_int_8) binned_dist(ibin)%istart = atom_block
     579              : 
     580       152416 :             iatom_start = x_data%blocks(iatom_block)%istart
     581       152416 :             iatom_end = x_data%blocks(iatom_block)%iend
     582       152416 :             jatom_start = x_data%blocks(jatom_block)%istart
     583       152416 :             jatom_end = x_data%blocks(jatom_block)%iend
     584       152416 :             katom_start = x_data%blocks(katom_block)%istart
     585       152416 :             katom_end = x_data%blocks(katom_block)%iend
     586       152416 :             latom_start = x_data%blocks(latom_block)%istart
     587       152416 :             latom_end = x_data%blocks(latom_block)%iend
     588              : 
     589       288344 :             SELECT CASE (eval_type)
     590              :             CASE (hfx_do_eval_energy)
     591              :                pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
     592              :                                  pmax_block(latom_block, jatom_block), &
     593              :                                  pmax_block(latom_block, iatom_block), &
     594       135928 :                                  pmax_block(katom_block, jatom_block))
     595              :             CASE (hfx_do_eval_forces)
     596              :                pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
     597              :                                  pmax_block(latom_block, jatom_block), &
     598              :                                  pmax_block(latom_block, iatom_block) + &
     599       152416 :                                  pmax_block(katom_block, jatom_block))
     600              :             END SELECT
     601              : 
     602       152416 :             IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
     603              : 
     604              :             current_cost = current_cost &
     605              :                            + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
     606              :                                                  iatom_start, iatom_end, jatom_start, jatom_end, &
     607              :                                                  katom_start, katom_end, latom_start, latom_end, &
     608              :                                                  particle_set, &
     609              :                                                  coeffs_set, coeffs_kind, &
     610              :                                                  is_assoc_atomic_block_global, do_periodic, &
     611              :                                                  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
     612              :                                                  cell, &
     613              :                                                  do_p_screening, map_atom_to_kind_atom, eval_type, &
     614       151790 :                                                  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     615              : 
     616       153580 :             IF (current_cost >= cost_per_bin) THEN
     617        19253 :                IF (ibin == nbins) THEN
     618              :                   binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
     619            0 :                                                               distribution_counter_end - distribution_counter_start + 1
     620              :                ELSE
     621        19253 :                   binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
     622              :                END IF
     623        19253 :                binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
     624        19253 :                ibin = MIN(ibin + 1, nbins)
     625        19253 :                distribution_counter_start = distribution_counter_end + 1
     626        19253 :                current_cost = 0
     627        19253 :                last_bin_needs_to_be_filled = .FALSE.
     628              :             END IF
     629              :          END DO
     630              : 
     631         1790 : !$OMP BARRIER
     632         1790 : !$OMP MASTER
     633         1790 :          CALL timestop(handle_inner)
     634         1790 :          CALL timeset(routineN//"_dist", handle_inner)
     635              : !$OMP END MASTER
     636         1790 : !$OMP BARRIER
     637              :          !! Fill the last bin if necessary
     638         1790 :          IF (last_bin_needs_to_be_filled) THEN
     639          531 :             binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
     640          531 :             IF (ibin == nbins) THEN
     641              :                binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
     642            0 :                                                            distribution_counter_end - distribution_counter_start + 1
     643              :             ELSE
     644          531 :                binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
     645              :             END IF
     646              :          END IF
     647              : 
     648              :          !! Sanity-Check
     649       116350 :          DO ibin = 1, nbins
     650       116350 :             local_quartet_counter = local_quartet_counter + binned_dist(ibin)%number_of_atom_quartets
     651              :          END DO
     652         1790 : !$OMP BARRIER
     653         1790 : !$OMP MASTER
     654         1790 :          shm_local_quartet_counter = 0
     655         1790 :          shm_global_quartet_counter = 0
     656              : !$OMP END MASTER
     657         1790 : !$OMP BARRIER
     658         1790 : !$OMP ATOMIC
     659              :          shm_local_quartet_counter = shm_local_quartet_counter + local_quartet_counter
     660         1790 : !$OMP ATOMIC
     661              :          shm_global_quartet_counter = shm_global_quartet_counter + global_quartet_counter
     662              : 
     663         1790 : !$OMP BARRIER
     664         1790 : !$OMP MASTER
     665         1790 :          CALL para_env%sum(shm_local_quartet_counter)
     666         1790 :          CALL para_env%sum(shm_global_quartet_counter)
     667         1790 :          IF (para_env%is_source()) THEN
     668          895 :             IF (shm_local_quartet_counter /= shm_global_quartet_counter) THEN
     669              :                WRITE (error_msg, '(A,I0,A,I0,A)') "HFX Sanity check for parallel distribution failed. "// &
     670            0 :                   "Number of local quartets (", shm_local_quartet_counter, &
     671            0 :                   ") and number of global quartets (", shm_global_quartet_counter, &
     672            0 :                   ") are different. Please send in a bug report."
     673            0 :                CPABORT(error_msg)
     674              :             END IF
     675              :          END IF
     676              : !$OMP END MASTER
     677              : 
     678         1790 : !$OMP BARRIER
     679         1790 : !$OMP MASTER
     680         5370 :          ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
     681       230910 :          cost_matrix = 0
     682              : !$OMP END MASTER
     683         1790 : !$OMP BARRIER
     684         1790 :          icpu = para_env%mepos + 1
     685       116350 :          DO i = 1, nbins
     686       116350 :             cost_matrix((icpu - 1)*nbins*n_threads + i_thread*nbins + i) = binned_dist(i)%cost
     687              :          END DO
     688         1790 :          mepos = para_env%mepos
     689         1790 : !$OMP BARRIER
     690              : 
     691         1790 : !$OMP MASTER
     692              :          ! sync before/after ring of isendrecv
     693         1790 :          CALL para_env%sync()
     694              : 
     695         5370 :          ALLOCATE (sendbuffer(nbins*n_threads))
     696         3580 :          ALLOCATE (recbuffer(nbins*n_threads))
     697              : 
     698       230910 :          sendbuffer = cost_matrix(mepos*nbins*n_threads + 1:mepos*nbins*n_threads + nbins*n_threads)
     699              : 
     700         1790 :          dest = MODULO(mepos + 1, ncpu)
     701         1790 :          source = MODULO(mepos - 1, ncpu)
     702         5370 :          DO icpu = 0, ncpu - 1
     703         3580 :             IF (icpu /= ncpu - 1) THEN
     704              :                CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
     705         1790 :                                        req(1), req(2), 13)
     706              :             END IF
     707         3580 :             data_from = MODULO(mepos - icpu, ncpu)
     708       461820 :             cost_matrix(data_from*nbins*n_threads + 1:data_from*nbins*n_threads + nbins*n_threads) = sendbuffer
     709         3580 :             IF (icpu /= ncpu - 1) THEN
     710         1790 :                CALL mp_waitall(req)
     711              :             END IF
     712         3580 :             swapbuffer => sendbuffer
     713         3580 :             sendbuffer => recbuffer
     714         5370 :             recbuffer => swapbuffer
     715              :          END DO
     716         1790 :          DEALLOCATE (recbuffer, sendbuffer)
     717              : !$OMP END MASTER
     718         1790 : !$OMP BARRIER
     719              : 
     720         1790 : !$OMP BARRIER
     721         1790 : !$OMP MASTER
     722         1790 :          CALL timestop(handle_inner)
     723         1790 :          CALL timeset(routineN//"_opt", handle_inner)
     724              : !$OMP END MASTER
     725         1790 : !$OMP BARRIER
     726              : 
     727              :          !! Find an optimal distribution i.e. assign each element of the cost matrix to a certain process
     728         1790 : !$OMP BARRIER
     729         5370 :          ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
     730       460030 :          local_cost_matrix = cost_matrix
     731         1790 : !$OMP MASTER
     732         5370 :          ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
     733              : 
     734              :          CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
     735         1790 :                                     shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
     736              : 
     737         1790 :          CALL timestop(handle_inner)
     738         1790 :          CALL timeset(routineN//"_redist", handle_inner)
     739              :          !! Collect local data to global array
     740       350840 :          ALLOCATE (full_dist(ncpu*n_threads, nbins))
     741              : 
     742       345470 :          full_dist(:, :)%istart = 0_int_8
     743       345470 :          full_dist(:, :)%number_of_atom_quartets = 0_int_8
     744       345470 :          full_dist(:, :)%cost = 0_int_8
     745       345470 :          full_dist(:, :)%time_first_scf = 0.0_dp
     746       345470 :          full_dist(:, :)%time_other_scf = 0.0_dp
     747       347260 :          full_dist(:, :)%time_forces = 0.0_dp
     748              : !$OMP END MASTER
     749         1790 : !$OMP BARRIER
     750         1790 :          mepos = para_env%mepos + 1
     751       230910 :          full_dist((mepos - 1)*n_threads + i_thread + 1, :) = binned_dist(:)
     752              : 
     753         1790 : !$OMP BARRIER
     754         1790 : !$OMP MASTER
     755         5370 :          ALLOCATE (sendbuffer(3*nbins*n_threads))
     756         3580 :          ALLOCATE (recbuffer(3*nbins*n_threads))
     757         1790 :          mepos = para_env%mepos
     758         3580 :          DO j = 1, n_threads
     759       118140 :             DO i = 1, nbins
     760       114560 :                sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
     761       114560 :                sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
     762       116350 :                sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
     763              :             END DO
     764              :          END DO
     765              : 
     766              :          ! sync before/after ring of isendrecv
     767         1790 :          CALL para_env%sync()
     768         1790 :          dest = MODULO(mepos + 1, ncpu)
     769         1790 :          source = MODULO(mepos - 1, ncpu)
     770         5370 :          DO icpu = 0, ncpu - 1
     771         3580 :             IF (icpu /= ncpu - 1) THEN
     772              :                CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
     773         1790 :                                        req(1), req(2), 13)
     774              :             END IF
     775         3580 :             data_from = MODULO(mepos - icpu, ncpu)
     776         7160 :             DO j = 1, n_threads
     777       236280 :                DO i = 1, nbins
     778       229120 :                   full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1)
     779       229120 :                   full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2)
     780       232700 :                   full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3)
     781              :                END DO
     782              :             END DO
     783              : 
     784         3580 :             IF (icpu /= ncpu - 1) THEN
     785         1790 :                CALL mp_waitall(req)
     786              :             END IF
     787         3580 :             swapbuffer => sendbuffer
     788         3580 :             sendbuffer => recbuffer
     789         5370 :             recbuffer => swapbuffer
     790              :          END DO
     791         1790 :          DEALLOCATE (recbuffer, sendbuffer)
     792              : 
     793              :          ! sync before/after ring of isendrecv
     794         1790 :          CALL para_env%sync()
     795              : !$OMP END MASTER
     796         1790 : !$OMP BARRIER
     797              :          !! reorder the distribution according to the distribution vector
     798         5370 :          ALLOCATE (tmp_pos(ncpu*n_threads))
     799         5370 :          tmp_pos = 1
     800       234490 :          ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
     801              : 
     802       230910 :          tmp_dist(:)%istart = 0_int_8
     803       230910 :          tmp_dist(:)%number_of_atom_quartets = 0_int_8
     804       230910 :          tmp_dist(:)%cost = 0_int_8
     805       230910 :          tmp_dist(:)%time_first_scf = 0.0_dp
     806       230910 :          tmp_dist(:)%time_other_scf = 0.0_dp
     807       230910 :          tmp_dist(:)%time_forces = 0.0_dp
     808              : 
     809         5370 :          DO icpu = 1, n_processes
     810       234490 :             DO i = 1, nbins
     811       229120 :                mepos = my_process_id + 1
     812       232700 :                IF (shm_distribution_vector((icpu - 1)*nbins + i) == mepos) THEN
     813       114560 :                   tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
     814       114560 :                   tmp_pos(mepos) = tmp_pos(mepos) + 1
     815              :                END IF
     816              :             END DO
     817              :          END DO
     818              : 
     819              :          !! Assign the load to each process
     820              :          NULLIFY (ptr_to_tmp_dist)
     821         1790 :          mepos = my_process_id + 1
     822         1790 :          ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
     823         3024 :          SELECT CASE (eval_type)
     824              :          CASE (hfx_do_eval_energy)
     825         1234 :             CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
     826              :          CASE (hfx_do_eval_forces)
     827         1790 :             CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
     828              :          END SELECT
     829              : 
     830         1790 : !$OMP BARRIER
     831         1790 : !$OMP MASTER
     832         1790 :          DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
     833              : !$OMP END MASTER
     834         1790 : !$OMP BARRIER
     835         1790 :          DEALLOCATE (tmp_dist, tmp_pos)
     836         1790 :          DEALLOCATE (binned_dist, local_cost_matrix)
     837         1790 :          DEALLOCATE (set_list_ij, set_list_kl)
     838              : 
     839         1790 : !$OMP BARRIER
     840         1790 : !$OMP MASTER
     841         1790 :          CALL timestop(handle_inner)
     842              : !$OMP END MASTER
     843         1790 : !$OMP BARRIER
     844              :       END IF
     845         1792 : !$OMP BARRIER
     846         1792 : !$OMP MASTER
     847         1792 :       CALL timestop(handle)
     848              : !$OMP END MASTER
     849         1792 : !$OMP BARRIER
     850      3718400 :    END SUBROUTINE hfx_load_balance
     851              : 
     852              : ! **************************************************************************************************
     853              : !> \brief Reference implementation of new recursive load balancing routine
     854              : !>        Computes a local list of atom_blocks (p_atom_blocks,q_atom_blocks) for
     855              : !>        each process in a P-Q grid such that every process has more or less the
     856              : !>        same amount of work. Has no output at the moment (not used) but writes
     857              : !>        its computed load balance values into a file. Possible output is ready
     858              : !>        to use in the two arrays p_atom_blocks & q_atom_blocks
     859              : !> \param n_processes ...
     860              : !> \param my_process_id ...
     861              : !> \param nblocks ...
     862              : !> \param natom ...
     863              : !> \param nkind ...
     864              : !> \param list_ij ...
     865              : !> \param list_kl ...
     866              : !> \param set_list_ij ...
     867              : !> \param set_list_kl ...
     868              : !> \param particle_set ...
     869              : !> \param coeffs_set ...
     870              : !> \param coeffs_kind ...
     871              : !> \param is_assoc_atomic_block_global ...
     872              : !> \param do_periodic ...
     873              : !> \param kind_of ...
     874              : !> \param basis_parameter ...
     875              : !> \param pmax_set ...
     876              : !> \param pmax_atom ...
     877              : !> \param pmax_blocks ...
     878              : !> \param cell ...
     879              : !> \param x_data ...
     880              : !> \param para_env ...
     881              : !> \param pmax_block ...
     882              : !> \param do_p_screening ...
     883              : !> \param map_atom_to_kind_atom ...
     884              : !> \param eval_type ...
     885              : !> \param log10_eps_schwarz ...
     886              : !> \param log_2 ...
     887              : !> \param coeffs_kind_max0 ...
     888              : !> \param use_virial ...
     889              : !> \param atomic_pair_list ...
     890              : !> \par History
     891              : !>      03.2011 created [Michael Steinlechner]
     892              : !> \author Michael Steinlechner
     893              : ! **************************************************************************************************
     894              : 
     895            0 :    SUBROUTINE hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
     896              :                                          natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
     897              :                                          particle_set, &
     898              :                                          coeffs_set, coeffs_kind, &
     899            0 :                                          is_assoc_atomic_block_global, do_periodic, &
     900              :                                          kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
     901              :                                          cell, x_data, para_env, pmax_block, &
     902              :                                          do_p_screening, map_atom_to_kind_atom, eval_type, &
     903              :                                          log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     904              : 
     905              : ! input variables:
     906              :       INTEGER, INTENT(IN)                                :: n_processes, my_process_id, nblocks, &
     907              :                                                             natom, nkind
     908              :       TYPE(pair_list_type), INTENT(IN)                   :: list_ij, list_kl
     909              :       TYPE(pair_set_list_type), ALLOCATABLE, &
     910              :          DIMENSION(:), INTENT(IN)                        :: set_list_ij, set_list_kl
     911              :       TYPE(particle_type), DIMENSION(:), INTENT(IN), &
     912              :          POINTER                                         :: particle_set
     913              :       TYPE(hfx_screen_coeff_type), &
     914              :          DIMENSION(:, :, :, :), INTENT(IN), POINTER      :: coeffs_set
     915              :       TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
     916              :          INTENT(IN), POINTER                             :: coeffs_kind
     917              :       INTEGER, DIMENSION(:, :), INTENT(IN)               :: is_assoc_atomic_block_global
     918              :       LOGICAL, INTENT(IN)                                :: do_periodic
     919              :       INTEGER, INTENT(IN)                                :: kind_of(*)
     920              :       TYPE(hfx_basis_type), DIMENSION(:), INTENT(IN), &
     921              :          POINTER                                         :: basis_parameter
     922              :       TYPE(hfx_p_kind), DIMENSION(:), INTENT(IN), &
     923              :          POINTER                                         :: pmax_set
     924              :       REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER     :: pmax_atom
     925              :       REAL(dp)                                           :: pmax_blocks
     926              :       TYPE(cell_type), INTENT(IN), POINTER               :: cell
     927              :       TYPE(hfx_type), INTENT(IN), POINTER                :: x_data
     928              :       TYPE(mp_para_env_type), INTENT(IN)        :: para_env
     929              :       REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER     :: pmax_block
     930              :       LOGICAL, INTENT(IN)                                :: do_p_screening
     931              :       INTEGER, DIMENSION(:), INTENT(IN), POINTER         :: map_atom_to_kind_atom
     932              :       INTEGER, INTENT(IN)                                :: eval_type
     933              :       REAL(dp), INTENT(IN)                               :: log10_eps_schwarz, log_2, &
     934              :                                                             coeffs_kind_max0
     935              :       LOGICAL, INTENT(IN)                                :: use_virial
     936              :       LOGICAL, DIMENSION(:, :), INTENT(IN), POINTER      :: atomic_pair_list
     937              : 
     938              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_recursive_load_balance'
     939              : 
     940              :       INTEGER :: handle, i, iatom_block, iatom_end, iatom_start, j, jatom_block, jatom_end, &
     941              :                  jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, latom_start, &
     942              :                  nP, nQ, numBins, p, q, sizeP, sizeQ, unit_nr
     943              :       INTEGER(int_8)                                     :: local_cost, pidx, qidx, sumP, sumQ
     944            0 :       INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: local_cost_vector
     945            0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: blocksize, p_atom_blocks, permute, &
     946            0 :                                                             q_atom_blocks
     947              :       REAL(dp)                                           :: maximum, mean
     948              : 
     949              : ! internal variables:
     950              : 
     951            0 : !$OMP BARRIER
     952            0 : !$OMP MASTER
     953            0 :       CALL timeset(routineN, handle)
     954              : !$OMP END MASTER
     955            0 : !$OMP BARRIER
     956              : 
     957              :       ! calculate best p/q distribution grid for the n_processes
     958            0 :       CALL hfx_calculate_PQ(p, q, numBins, n_processes)
     959              : 
     960            0 :       ALLOCATE (blocksize(numBins))
     961            0 :       ALLOCATE (permute(nblocks**2))
     962            0 :       DO i = 1, nblocks**2
     963            0 :          permute(i) = i
     964              :       END DO
     965              : 
     966              :       ! call the main recursive permutation routine.
     967              :       ! Output:
     968              :       !   blocksize :: vector (size numBins) with the sizes for each column/row block
     969              :       !   permute   :: permutation vector
     970              :       CALL hfx_recursive_permute(blocksize, 1, nblocks**2, numBins, &
     971              :                                  permute, 1, &
     972              :                                  my_process_id, n_processes, nblocks, &
     973              :                                  natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
     974              :                                  particle_set, &
     975              :                                  coeffs_set, coeffs_kind, &
     976              :                                  is_assoc_atomic_block_global, do_periodic, &
     977              :                                  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
     978              :                                  cell, x_data, para_env, pmax_block, &
     979              :                                  do_p_screening, map_atom_to_kind_atom, eval_type, &
     980            0 :                                  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
     981              : 
     982              :       ! number of blocks per processor in p-direction (vertical)
     983            0 :       nP = numBins/p
     984              :       ! number of blocks per processor in q-direction (horizontal)
     985            0 :       nQ = numBins/q
     986              : 
     987              :       ! calc own position in P-Q-processor grid (PQ-grid is column-major)
     988            0 :       pidx = MODULO(INT(my_process_id), INT(p)) + 1
     989            0 :       qidx = my_process_id/p + 1
     990              : 
     991            0 :       sizeP = SUM(blocksize((nP*(pidx - 1) + 1):(nP*pidx)))
     992            0 :       sizeQ = SUM(blocksize((nQ*(qidx - 1) + 1):(nQ*qidx)))
     993              : 
     994            0 :       sumP = SUM(blocksize(1:(nP*(pidx - 1))))
     995            0 :       sumQ = SUM(blocksize(1:(nQ*(qidx - 1))))
     996              : 
     997            0 :       ALLOCATE (p_atom_blocks(sizeP))
     998            0 :       ALLOCATE (q_atom_blocks(sizeQ))
     999              : 
    1000            0 :       p_atom_blocks(:) = permute((sumP + 1):(sumP + sizeP))
    1001            0 :       q_atom_blocks(:) = permute((sumQ + 1):(sumQ + sizeQ))
    1002              : 
    1003              :       ! from here on, we are actually finished, each process has been
    1004              :       ! assigned a (p_atom_blocks,q_atom_blocks) pair list.
    1005              :       ! what follows is just a small routine to calculate the local cost
    1006              :       ! for each processor which is then written to a file.
    1007              : 
    1008              :       ! calculate local cost for each processor!
    1009              :       ! ****************************************
    1010              :       local_cost = 0
    1011            0 :       DO i = 1, sizeP
    1012            0 :          DO j = 1, sizeQ
    1013              : 
    1014              :             !       get corresponding 4D block indices out of our own P-Q-block
    1015            0 :             latom_block = MODULO(q_atom_blocks(j), nblocks)
    1016            0 :             iatom_block = q_atom_blocks(j)/nblocks + 1
    1017            0 :             jatom_block = MODULO(p_atom_blocks(i), nblocks)
    1018            0 :             katom_block = p_atom_blocks(i)/nblocks + 1
    1019              : 
    1020              :             !       symmetry checks.
    1021            0 :             IF (latom_block < katom_block) CYCLE
    1022            0 :             IF (jatom_block < iatom_block) CYCLE
    1023              : 
    1024            0 :             iatom_start = x_data%blocks(iatom_block)%istart
    1025            0 :             iatom_end = x_data%blocks(iatom_block)%iend
    1026            0 :             jatom_start = x_data%blocks(jatom_block)%istart
    1027            0 :             jatom_end = x_data%blocks(jatom_block)%iend
    1028            0 :             katom_start = x_data%blocks(katom_block)%istart
    1029            0 :             katom_end = x_data%blocks(katom_block)%iend
    1030            0 :             latom_start = x_data%blocks(latom_block)%istart
    1031            0 :             latom_end = x_data%blocks(latom_block)%iend
    1032              : 
    1033              :             !       whatever.
    1034            0 :             SELECT CASE (eval_type)
    1035              :             CASE (hfx_do_eval_energy)
    1036              :                pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
    1037              :                                  pmax_block(latom_block, jatom_block), &
    1038              :                                  pmax_block(latom_block, iatom_block), &
    1039            0 :                                  pmax_block(katom_block, jatom_block))
    1040              :             CASE (hfx_do_eval_forces)
    1041              :                pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
    1042              :                                  pmax_block(latom_block, jatom_block), &
    1043              :                                  pmax_block(latom_block, iatom_block) + &
    1044            0 :                                  pmax_block(katom_block, jatom_block))
    1045              :             END SELECT
    1046              : 
    1047              :             !       screening.
    1048            0 :             IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
    1049              : 
    1050              :             !       estimate the cost of this atom_block.
    1051              :             local_cost = local_cost + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
    1052              :                                                           set_list_kl, &
    1053              :                                                           iatom_start, iatom_end, jatom_start, jatom_end, &
    1054              :                                                           katom_start, katom_end, latom_start, latom_end, &
    1055              :                                                           particle_set, &
    1056              :                                                           coeffs_set, coeffs_kind, &
    1057              :                                                           is_assoc_atomic_block_global, do_periodic, &
    1058              :                                                           kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
    1059              :                                                           cell, &
    1060              :                                                           do_p_screening, map_atom_to_kind_atom, eval_type, &
    1061            0 :                                                           log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
    1062              :          END DO
    1063              :       END DO
    1064              : 
    1065            0 :       ALLOCATE (local_cost_vector(n_processes))
    1066            0 :       local_cost_vector = 0
    1067            0 :       local_cost_vector(my_process_id + 1) = local_cost
    1068            0 :       CALL para_env%sum(local_cost_vector)
    1069              : 
    1070            0 :       mean = SUM(local_cost_vector)/n_processes
    1071            0 :       maximum = MAXVAL(local_cost_vector)
    1072              : 
    1073            0 : !$OMP     BARRIER
    1074            0 : !$OMP     MASTER
    1075              :       ! only output once
    1076            0 :       IF (my_process_id == 0) THEN
    1077            0 :          CALL open_file(unit_number=unit_nr, file_name="loads.dat")
    1078            0 :          WRITE (unit_nr, *) 'maximum cost:', maximum
    1079            0 :          WRITE (unit_nr, *) 'mean cost:', mean
    1080            0 :          WRITE (unit_nr, *) 'load balance ratio max/mean: ', maximum/mean
    1081            0 :          WRITE (unit_nr, *) '-------- detailed per-process costs ---------'
    1082            0 :          DO i = 1, n_processes
    1083            0 :             WRITE (unit_nr, *) local_cost_vector(i)
    1084              :          END DO
    1085            0 :          CALL close_file(unit_nr)
    1086              :       END IF
    1087              : !$OMP     END MASTER
    1088            0 : !$OMP     BARRIER
    1089              : 
    1090            0 :       DEALLOCATE (local_cost_vector)
    1091            0 :       DEALLOCATE (p_atom_blocks, q_atom_blocks)
    1092            0 :       DEALLOCATE (blocksize, permute)
    1093              : 
    1094            0 : !$OMP BARRIER
    1095            0 : !$OMP MASTER
    1096            0 :       CALL timestop(handle)
    1097              : !$OMP END MASTER
    1098            0 : !$OMP BARRIER
    1099              : 
    1100            0 :    END SUBROUTINE hfx_recursive_load_balance
    1101              : 
    1102              : ! **************************************************************************************************
    1103              : !> \brief Small routine to calculate the optimal P-Q-processor grid distribution
    1104              : !>        for a given number of processors N
    1105              : !>        and the corresponding number of Bins for the load balancing routine
    1106              : !> \param p     number of rows on P-Q process grid (output)
    1107              : !> \param q     number of columns on P-Q process grid (output)
    1108              : !> \param nBins number of Bins (output)
    1109              : !> \param N     number of processes (input)
    1110              : !> \par History
    1111              : !>      03.2011 created [Michael Steinlechner]
    1112              : !> \author Michael Steinlechner
    1113              : ! **************************************************************************************************
    1114            0 :    SUBROUTINE hfx_calculate_PQ(p, q, nBins, N)
    1115              : 
    1116              :       INTEGER, INTENT(OUT)                               :: p, q, nBins
    1117              :       INTEGER, INTENT(IN)                                :: N
    1118              : 
    1119              :       INTEGER                                            :: a, b, k
    1120              :       REAL(dp)                                           :: sqN
    1121              : 
    1122            0 :       k = 2
    1123            0 :       sqN = SQRT(REAL(N, KIND=dp))
    1124            0 :       p = 1
    1125              : 
    1126            0 :       DO WHILE (REAL(k, KIND=dp) <= sqN)
    1127            0 :          IF (MODULO(N, k) == 0) THEN
    1128            0 :             p = k
    1129              :          END IF
    1130            0 :          k = k + 1
    1131              :       END DO
    1132            0 :       q = N/p
    1133              : 
    1134              :       ! now compute the least common multiple of p & q to get the number of necessary bins
    1135              :       ! compute using the relation LCM(p,q) = abs(p*q) / GCD(p,q)
    1136              :       ! and use euclid's algorithm for GCD computation.
    1137            0 :       a = p
    1138            0 :       b = q
    1139              : 
    1140            0 :       DO WHILE (b /= 0)
    1141            0 :          IF (a > b) THEN
    1142            0 :             a = a - b
    1143              :          ELSE
    1144            0 :             b = b - a
    1145              :          END IF
    1146              :       END DO
    1147              :       ! gcd(p,q) is now saved in a
    1148              : 
    1149            0 :       nBins = p*q/a
    1150              : 
    1151            0 :    END SUBROUTINE hfx_calculate_PQ
    1152              : 
    1153              : ! **************************************************************************************************
    1154              : !> \brief Recursive permutation routine for the load balancing of the integral
    1155              : !>       computation
    1156              : !> \param blocksize     vector of blocksizes, size(nProc), which contains for
    1157              : !>                      each process the local blocksize (OUTPUT)
    1158              : !> \param blockstart    starting row/column idx of the block which is to be examined
    1159              : !>                      at this point (INPUT)
    1160              : !> \param blockend      ending row/column idx of the block which is to be examined
    1161              : !>                      (INPUT)
    1162              : !> \param nProc_in      number of bins into which the current block has to be divided
    1163              : !>                      (INPUT)
    1164              : !> \param permute       permutation vector which balances column/row cost
    1165              : !>                      size(nblocks^2). (OUTPUT)
    1166              : !> \param step ...
    1167              : !> \param my_process_id ...
    1168              : !> \param n_processes ...
    1169              : !> \param nblocks ...
    1170              : !> \param natom ...
    1171              : !> \param nkind ...
    1172              : !> \param list_ij ...
    1173              : !> \param list_kl ...
    1174              : !> \param set_list_ij ...
    1175              : !> \param set_list_kl ...
    1176              : !> \param particle_set ...
    1177              : !> \param coeffs_set ...
    1178              : !> \param coeffs_kind ...
    1179              : !> \param is_assoc_atomic_block_global ...
    1180              : !> \param do_periodic ...
    1181              : !> \param kind_of ...
    1182              : !> \param basis_parameter ...
    1183              : !> \param pmax_set ...
    1184              : !> \param pmax_atom ...
    1185              : !> \param pmax_blocks ...
    1186              : !> \param cell ...
    1187              : !> \param x_data ...
    1188              : !> \param para_env ...
    1189              : !> \param pmax_block ...
    1190              : !> \param do_p_screening ...
    1191              : !> \param map_atom_to_kind_atom ...
    1192              : !> \param eval_type ...
    1193              : !> \param log10_eps_schwarz ...
    1194              : !> \param log_2 ...
    1195              : !> \param coeffs_kind_max0 ...
    1196              : !> \param use_virial ...
    1197              : !> \param atomic_pair_list ...
    1198              : !> \par History
    1199              : !>      03.2011 created [Michael Steinlechner]
    1200              : !> \author Michael Steinlechner
    1201              : ! **************************************************************************************************
    1202            0 :    RECURSIVE SUBROUTINE hfx_recursive_permute(blocksize, blockstart, blockend, nProc_in, &
    1203            0 :                                               permute, step, &
    1204              :                                               my_process_id, n_processes, nblocks, &
    1205              :                                               natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
    1206              :                                               particle_set, &
    1207              :                                               coeffs_set, coeffs_kind, &
    1208            0 :                                               is_assoc_atomic_block_global, do_periodic, &
    1209              :                                               kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
    1210              :                                               cell, x_data, para_env, pmax_block, &
    1211              :                                               do_p_screening, map_atom_to_kind_atom, eval_type, &
    1212              :                                               log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
    1213              : 
    1214              :       INTEGER                                            :: nProc_in, blockend, blockstart
    1215              :       INTEGER, DIMENSION(nProc_in)                       :: blocksize
    1216              :       INTEGER                                            :: nblocks, n_processes, my_process_id
    1217              :       INTEGER, INTENT(IN)                                :: step
    1218              :       INTEGER, DIMENSION(nblocks*nblocks)                :: permute
    1219              :       INTEGER                                            :: natom
    1220              :       INTEGER, INTENT(IN)                                :: nkind
    1221              :       TYPE(pair_list_type)                               :: list_ij, list_kl
    1222              :       TYPE(pair_set_list_type), ALLOCATABLE, &
    1223              :          DIMENSION(:)                                    :: set_list_ij, set_list_kl
    1224              :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
    1225              :       TYPE(hfx_screen_coeff_type), &
    1226              :          DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
    1227              :       TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
    1228              :          POINTER                                         :: coeffs_kind
    1229              :       INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
    1230              :       LOGICAL                                            :: do_periodic
    1231              :       INTEGER                                            :: kind_of(*)
    1232              :       TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
    1233              :       TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
    1234              :       REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
    1235              :       REAL(dp)                                           :: pmax_blocks
    1236              :       TYPE(cell_type), POINTER                           :: cell
    1237              :       TYPE(hfx_type), POINTER                            :: x_data
    1238              :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
    1239              :       REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_block
    1240              :       LOGICAL, INTENT(IN)                                :: do_p_screening
    1241              :       INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
    1242              :       INTEGER, INTENT(IN)                                :: eval_type
    1243              :       REAL(dp)                                           :: log10_eps_schwarz, log_2, &
    1244              :                                                             coeffs_kind_max0
    1245              :       LOGICAL, INTENT(IN)                                :: use_virial
    1246              :       LOGICAL, DIMENSION(:, :), POINTER                  :: atomic_pair_list
    1247              : 
    1248              :       INTEGER :: col, endoffset, i, iatom_block, iatom_end, iatom_start, idx, inv_perm, &
    1249              :                  jatom_block, jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, &
    1250              :                  latom_end, latom_start, nbins, nProc, row, startoffset
    1251              :       INTEGER(int_8)                                     :: atom_block, tmp_block
    1252            0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: ithblocksize, localblocksize
    1253            0 :       INTEGER, DIMENSION(blockend - blockstart + 1)          :: bin_perm, tmp_perm
    1254              :       REAL(dp)                                           :: partialcost
    1255            0 :       REAL(dp), DIMENSION(nblocks*nblocks)               :: cost_vector
    1256              : 
    1257            0 :       nProc = nProc_in
    1258            0 :       cost_vector = 0.0_dp
    1259              : 
    1260              : !   loop over local atom_blocks.
    1261            0 :       DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
    1262              : 
    1263              : !       get corresponding 4D block indices
    1264            0 :          latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
    1265            0 :          tmp_block = atom_block/nblocks
    1266            0 :          katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
    1267            0 :          IF (latom_block < katom_block) CYCLE
    1268            0 :          tmp_block = tmp_block/nblocks
    1269            0 :          jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
    1270            0 :          tmp_block = tmp_block/nblocks
    1271            0 :          iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
    1272            0 :          IF (jatom_block < iatom_block) CYCLE
    1273              : 
    1274              : !       get 2D indices of this atom_block (with permutation applied)
    1275              : !       for this, we need to invert the permutation, this means
    1276              : !       find position in permutation vector where value==idx
    1277              : 
    1278            0 :          row = (katom_block - 1)*nblocks + jatom_block
    1279            0 :          inv_perm = 1
    1280            0 :          DO WHILE (permute(inv_perm) /= row)
    1281            0 :             inv_perm = inv_perm + 1
    1282              :          END DO
    1283            0 :          row = inv_perm
    1284              : 
    1285            0 :          col = (iatom_block - 1)*nblocks + latom_block
    1286            0 :          inv_perm = 1
    1287            0 :          DO WHILE (permute(inv_perm) /= col)
    1288            0 :             inv_perm = inv_perm + 1
    1289              :          END DO
    1290            0 :          col = inv_perm
    1291              : 
    1292              : !       if row/col outside our current diagonal block, skip calculation.
    1293            0 :          IF (col < blockstart .OR. col > blockend) CYCLE
    1294            0 :          IF (row < blockstart .OR. row > blockend) CYCLE
    1295              : 
    1296            0 :          iatom_start = x_data%blocks(iatom_block)%istart
    1297            0 :          iatom_end = x_data%blocks(iatom_block)%iend
    1298            0 :          jatom_start = x_data%blocks(jatom_block)%istart
    1299            0 :          jatom_end = x_data%blocks(jatom_block)%iend
    1300            0 :          katom_start = x_data%blocks(katom_block)%istart
    1301            0 :          katom_end = x_data%blocks(katom_block)%iend
    1302            0 :          latom_start = x_data%blocks(latom_block)%istart
    1303            0 :          latom_end = x_data%blocks(latom_block)%iend
    1304              : 
    1305              : !       whatever.
    1306            0 :          SELECT CASE (eval_type)
    1307              :          CASE (hfx_do_eval_energy)
    1308              :             pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
    1309              :                               pmax_block(latom_block, jatom_block), &
    1310              :                               pmax_block(latom_block, iatom_block), &
    1311            0 :                               pmax_block(katom_block, jatom_block))
    1312              :          CASE (hfx_do_eval_forces)
    1313              :             pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
    1314              :                               pmax_block(latom_block, jatom_block), &
    1315              :                               pmax_block(latom_block, iatom_block) + &
    1316            0 :                               pmax_block(katom_block, jatom_block))
    1317              :          END SELECT
    1318              : 
    1319              : !       screening.
    1320            0 :          IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
    1321              : 
    1322              : !       every second recursion step, compute row sum instead of column sum
    1323              : 
    1324            0 :          IF (MODULO(step, 2) == 0) THEN
    1325              :             idx = row
    1326              :          ELSE
    1327            0 :             idx = col
    1328              :          END IF
    1329              : 
    1330              : !       estimate the cost of this atom_block.
    1331              :          partialcost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
    1332              :                                            set_list_kl, &
    1333              :                                            iatom_start, iatom_end, jatom_start, jatom_end, &
    1334              :                                            katom_start, katom_end, latom_start, latom_end, &
    1335              :                                            particle_set, &
    1336              :                                            coeffs_set, coeffs_kind, &
    1337              :                                            is_assoc_atomic_block_global, do_periodic, &
    1338              :                                            kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
    1339              :                                            cell, &
    1340              :                                            do_p_screening, map_atom_to_kind_atom, eval_type, &
    1341            0 :                                            log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
    1342              : 
    1343            0 :          cost_vector(idx) = cost_vector(idx) + partialcost
    1344              :       END DO ! atom_block
    1345              : 
    1346              : !   sum costvector over all processes
    1347            0 :       CALL para_env%sum(cost_vector)
    1348              : 
    1349              : !   calculate next prime factor of nProc
    1350            0 :       nBins = 2
    1351            0 :       DO WHILE (MODULO(INT(nProc), INT(nBins)) /= 0)
    1352            0 :          nBins = nBins + 1
    1353              :       END DO
    1354              : 
    1355            0 :       nProc = nProc/nBins
    1356              : 
    1357              : ! ... do the binning...
    1358              : 
    1359            0 :       ALLOCATE (localblocksize(nBins))
    1360            0 :       CALL hfx_permute_binning(nBins, cost_vector(blockstart:blockend), blockend - blockstart + 1, bin_perm, localblocksize)
    1361              : 
    1362              : !... and update the permutation vector
    1363              : 
    1364            0 :       tmp_perm = permute(blockstart:blockend)
    1365            0 :       permute(blockstart:blockend) = tmp_perm(bin_perm)
    1366              : 
    1367              : !   split recursion into the nBins Bins
    1368            0 :       IF (nProc > 1) THEN
    1369            0 :          ALLOCATE (ithblocksize(nProc))
    1370            0 :          DO i = 1, nBins
    1371            0 :             startoffset = SUM(localblocksize(1:(i - 1)))
    1372            0 :             endoffset = SUM(localblocksize(1:i)) - 1
    1373              : 
    1374              :             CALL hfx_recursive_permute(ithblocksize, blockstart + startoffset, blockstart + endoffset, nProc, &
    1375              :                                        permute, step + 1, &
    1376              :                                        my_process_id, n_processes, nblocks, &
    1377              :                                        natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
    1378              :                                        particle_set, &
    1379              :                                        coeffs_set, coeffs_kind, &
    1380              :                                        is_assoc_atomic_block_global, do_periodic, &
    1381              :                                        kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
    1382              :                                        cell, x_data, para_env, pmax_block, &
    1383              :                                        do_p_screening, map_atom_to_kind_atom, eval_type, &
    1384            0 :                                        log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
    1385            0 :             blocksize(((i - 1)*nProc + 1):(i*nProc)) = ithblocksize
    1386              :          END DO
    1387            0 :          DEALLOCATE (ithblocksize)
    1388              :       ELSE
    1389            0 :          DO i = 1, nBins
    1390            0 :             blocksize(i) = localblocksize(i)
    1391              :          END DO
    1392              :       END IF
    1393              : 
    1394            0 :       DEALLOCATE (localblocksize)
    1395              : 
    1396            0 :    END SUBROUTINE hfx_recursive_permute
    1397              : 
    1398              : ! **************************************************************************************************
    1399              : !> \brief small binning routine for the recursive load balancing
    1400              : !>
    1401              : !> \param nBins         number of Bins (INPUT)
    1402              : !> \param costvector    vector of current row/column costs which have to be binned (INPUT)
    1403              : !> \param maxbinsize    upper bound for bin size (INPUT)
    1404              : !> \param perm          resulting permutation due to be binning routine (OUTPUT)
    1405              : !> \param block_count   vector of size(nbins) which contains the size of each bin (OUTPUT)
    1406              : !> \par History
    1407              : !>      03.2011 created [Michael Steinlechner]
    1408              : !> \author Michael Steinlechner
    1409              : ! **************************************************************************************************
    1410            0 :    SUBROUTINE hfx_permute_binning(nBins, costvector, maxbinsize, perm, block_count)
    1411              : 
    1412              :       INTEGER, INTENT(IN)                                :: nBins, maxbinsize
    1413              :       REAL(dp), DIMENSION(maxbinsize), INTENT(IN)        :: costvector
    1414              :       INTEGER, DIMENSION(maxbinsize), INTENT(OUT)        :: perm
    1415              :       INTEGER, DIMENSION(nBins), INTENT(OUT)             :: block_count
    1416              : 
    1417              :       INTEGER                                            :: i, j, mod_idx, offset
    1418            0 :       INTEGER, DIMENSION(nBins, maxbinsize)              :: bin
    1419            0 :       INTEGER, DIMENSION(nBins)                          :: bin_idx
    1420            0 :       INTEGER, DIMENSION(maxbinsize)                     :: idx
    1421            0 :       REAL(dp), DIMENSION(maxbinsize)                    :: vec
    1422            0 :       REAL(dp), DIMENSION(nBins)                         :: bincosts
    1423              : 
    1424              : ! be careful not to change costvector (copy it!)
    1425              : 
    1426            0 :       vec = costvector
    1427            0 :       block_count = 0
    1428            0 :       bincosts = 0
    1429              : 
    1430              :       !sort the array (ascending)
    1431            0 :       CALL sort(vec, maxbinsize, idx)
    1432              : 
    1433              :       ! count the loop down to distribute the largest cols/rows first
    1434            0 :       DO i = maxbinsize, 1, -1
    1435            0 :          IF (vec(i) == 0) THEN
    1436              :             ! spread zero-cost col/rows evenly among procs
    1437            0 :             mod_idx = MODULO(i, nBins) + 1 !(note the fortran offset by one!)
    1438            0 :             block_count(mod_idx) = block_count(mod_idx) + 1
    1439            0 :             bin(mod_idx, block_count(mod_idx)) = idx(i)
    1440              :          ELSE
    1441              :             ! sort the bins so that the one with the lowest cost is at the
    1442              :             ! first place, where we then assign the current col/row
    1443            0 :             CALL sort(bincosts, nBins, bin_idx)
    1444            0 :             block_count = block_count(bin_idx)
    1445            0 :             bin = bin(bin_idx, :)
    1446              : 
    1447            0 :             bincosts(1) = bincosts(1) + vec(i)
    1448            0 :             block_count(1) = block_count(1) + 1
    1449            0 :             bin(1, block_count(1)) = idx(i)
    1450              :          END IF
    1451              :       END DO
    1452              : 
    1453              :       ! construct permutation vector from the binning
    1454              :       offset = 0
    1455            0 :       DO i = 1, nBins
    1456            0 :          DO j = 1, block_count(i)
    1457            0 :             perm(offset + j) = bin(i, j)
    1458              :          END DO
    1459            0 :          offset = offset + block_count(i)
    1460              :       END DO
    1461              : 
    1462            0 :    END SUBROUTINE hfx_permute_binning
    1463              : 
    1464              : ! **************************************************************************************************
    1465              : !> \brief Cheap way of redistributing the eri's
    1466              : !> \param x_data Object that stores the indices array
    1467              : !> \param para_env para_env
    1468              : !> \param load_balance_parameter contains parmameter for Monte-Carlo routines
    1469              : !> \param i_thread current thread ID
    1470              : !> \param n_threads Total Number of threads
    1471              : !> \param eval_type ...
    1472              : !> \par History
    1473              : !>      12.2007 created [Manuel Guidon]
    1474              : !>      02.2009 optimize Memory Usage [Manuel Guidon]
    1475              : !> \author Manuel Guidon
    1476              : !> \note
    1477              : !>      The cost matrix is given by the walltime for each bin that is measured
    1478              : !>      during the calculation
    1479              : ! **************************************************************************************************
    1480         1772 :    SUBROUTINE hfx_update_load_balance(x_data, para_env, &
    1481              :                                       load_balance_parameter, &
    1482              :                                       i_thread, n_threads, eval_type)
    1483              : 
    1484              :       TYPE(hfx_type), POINTER                            :: x_data
    1485              :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
    1486              :       TYPE(hfx_load_balance_type)                        :: load_balance_parameter
    1487              :       INTEGER, INTENT(IN)                                :: i_thread, n_threads, eval_type
    1488              : 
    1489              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_update_load_balance'
    1490              : 
    1491              :       INTEGER :: data_from, dest, end_idx, handle, i, ibin, icpu, iprocess, j, mepos, my_bin_size, &
    1492              :                  my_global_start_idx, my_process_id, n_processes, nbins, ncpu, source, start_idx
    1493         5316 :       TYPE(mp_request_type), DIMENSION(2) :: req
    1494         1772 :       INTEGER(int_8), DIMENSION(:), POINTER              :: local_cost_matrix, recbuffer, &
    1495         1772 :                                                             sendbuffer, swapbuffer
    1496              :       INTEGER(int_8), DIMENSION(:), POINTER, SAVE        :: cost_matrix
    1497         1772 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: tmp_pos
    1498              :       INTEGER, ALLOCATABLE, DIMENSION(:), SAVE           :: bins_per_rank
    1499              :       INTEGER, ALLOCATABLE, DIMENSION(:, :), SAVE        :: bin_histogram
    1500              :       INTEGER, DIMENSION(:), POINTER, SAVE               :: shm_distribution_vector
    1501              :       INTEGER, SAVE                                      :: max_bin_size
    1502         3544 :       TYPE(hfx_distribution), DIMENSION(:), POINTER      :: binned_dist, ptr_to_tmp_dist, tmp_dist
    1503              :       TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
    1504              :          SAVE                                            :: full_dist
    1505              : 
    1506         1772 : !$OMP BARRIER
    1507         3544 : !$OMP MASTER
    1508         1772 :       CALL timeset(routineN, handle)
    1509              : !$OMP END MASTER
    1510         1772 : !$OMP BARRIER
    1511              : 
    1512         1772 :       ncpu = para_env%num_pe
    1513         1772 :       n_processes = ncpu*n_threads
    1514              :       !! If there is only 1 cpu skip the binning
    1515         1772 :       IF (n_processes == 1) THEN
    1516            0 :          ALLOCATE (tmp_dist(1))
    1517            0 :          tmp_dist(1)%number_of_atom_quartets = HUGE(tmp_dist(1)%number_of_atom_quartets)
    1518            0 :          tmp_dist(1)%istart = 0_int_8
    1519            0 :          ptr_to_tmp_dist => tmp_dist(:)
    1520            0 :          SELECT CASE (eval_type)
    1521              :          CASE (hfx_do_eval_energy)
    1522            0 :             CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
    1523              :          CASE (hfx_do_eval_forces)
    1524            0 :             CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
    1525              :          END SELECT
    1526            0 :          DEALLOCATE (tmp_dist)
    1527              :       ELSE
    1528         1772 :          mepos = para_env%mepos
    1529         1772 :          my_process_id = para_env%mepos*n_threads + i_thread
    1530         1772 :          nbins = load_balance_parameter%nbins
    1531         1772 : !$OMP MASTER
    1532         5316 :          ALLOCATE (bin_histogram(n_processes, 2))
    1533        12404 :          bin_histogram = 0
    1534              : !$OMP END MASTER
    1535         1772 : !$OMP BARRIER
    1536         2724 :          SELECT CASE (eval_type)
    1537              :          CASE (hfx_do_eval_energy)
    1538          952 :             my_bin_size = SIZE(x_data%distribution_energy)
    1539              :          CASE (hfx_do_eval_forces)
    1540         1772 :             my_bin_size = SIZE(x_data%distribution_forces)
    1541              :          END SELECT
    1542         1772 :          bin_histogram(my_process_id + 1, 1) = my_bin_size
    1543         1772 : !$OMP BARRIER
    1544         1772 : !$OMP MASTER
    1545         1772 :          CALL para_env%sum(bin_histogram(:, 1))
    1546         1772 :          bin_histogram(1, 2) = bin_histogram(1, 1)
    1547         3544 :          DO iprocess = 2, n_processes
    1548         3544 :             bin_histogram(iprocess, 2) = bin_histogram(iprocess - 1, 2) + bin_histogram(iprocess, 1)
    1549              :          END DO
    1550              : 
    1551         3544 :          max_bin_size = MAXVAL(bin_histogram(para_env%mepos*n_threads + 1:para_env%mepos*n_threads + n_threads, 1))
    1552         1772 :          CALL para_env%max(max_bin_size)
    1553              : !$OMP END MASTER
    1554         1772 : !$OMP BARRIER
    1555       118718 :          ALLOCATE (binned_dist(my_bin_size))
    1556              :          !! Use old binned_dist, but with timings cost
    1557          952 :          SELECT CASE (eval_type)
    1558              :          CASE (hfx_do_eval_energy)
    1559       122808 :             binned_dist = x_data%distribution_energy
    1560              :          CASE (hfx_do_eval_forces)
    1561       106732 :             binned_dist = x_data%distribution_forces
    1562              :          END SELECT
    1563              : 
    1564       115180 :          DO ibin = 1, my_bin_size
    1565       115180 :             IF (binned_dist(ibin)%number_of_atom_quartets == 0) THEN
    1566        98141 :                binned_dist(ibin)%cost = 0
    1567              :             ELSE
    1568         8763 :                SELECT CASE (eval_type)
    1569              :                CASE (hfx_do_eval_energy)
    1570         8763 :                   IF (.NOT. load_balance_parameter%rtp_redistribute) THEN
    1571              :                      binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_first_scf + &
    1572         8763 :                                                    binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
    1573              :                   ELSE
    1574            0 :                      binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
    1575              :                   END IF
    1576              :                CASE (hfx_do_eval_forces)
    1577        15267 :                   binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_forces)*10000.0_dp, int_8)
    1578              :                END SELECT
    1579              :             END IF
    1580              :          END DO
    1581         1772 : !$OMP BARRIER
    1582         1772 : !$OMP MASTER
    1583              :          !! store all local results in a big cost matrix
    1584         5316 :          ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
    1585       228588 :          cost_matrix = 0
    1586         5316 :          ALLOCATE (sendbuffer(max_bin_size*n_threads))
    1587         3544 :          ALLOCATE (recbuffer(max_bin_size*n_threads))
    1588              : !$OMP END MASTER
    1589         1772 : !$OMP BARRIER
    1590         1772 :          my_global_start_idx = bin_histogram(my_process_id + 1, 2) - my_bin_size
    1591         1772 :          icpu = para_env%mepos + 1
    1592       115180 :          DO i = 1, my_bin_size
    1593       115180 :             cost_matrix(my_global_start_idx + i) = binned_dist(i)%cost
    1594              :          END DO
    1595              : 
    1596         1772 :          mepos = para_env%mepos
    1597         1772 : !$OMP BARRIER
    1598         1772 : !$OMP MASTER
    1599         5316 :          ALLOCATE (bins_per_rank(ncpu))
    1600         5316 :          bins_per_rank = 0
    1601         5316 :          DO icpu = 1, ncpu
    1602         8860 :             bins_per_rank(icpu) = SUM(bin_histogram((icpu - 1)*n_threads + 1:(icpu - 1)*n_threads + n_threads, 1))
    1603              :          END DO
    1604              :          sendbuffer(1:bins_per_rank(para_env%mepos + 1)) = &
    1605       228588 :             cost_matrix(my_global_start_idx + 1:my_global_start_idx + bins_per_rank(para_env%mepos + 1))
    1606              : 
    1607         1772 :          dest = MODULO(mepos + 1, ncpu)
    1608         1772 :          source = MODULO(mepos - 1, ncpu)
    1609              :          ! sync before/after ring of isendrecv
    1610         1772 :          CALL para_env%sync()
    1611         5316 :          DO icpu = 0, ncpu - 1
    1612         3544 :             IF (icpu /= ncpu - 1) THEN
    1613              :                CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
    1614         1772 :                                        req(1), req(2), 13)
    1615              :             END IF
    1616         3544 :             data_from = MODULO(mepos - icpu, ncpu)
    1617         8860 :             start_idx = SUM(bins_per_rank(1:data_from + 1)) - bins_per_rank(data_from + 1) + 1
    1618         3544 :             end_idx = start_idx + bins_per_rank(data_from + 1) - 1
    1619       457176 :             cost_matrix(start_idx:end_idx) = sendbuffer(1:end_idx - start_idx + 1)
    1620              : 
    1621         3544 :             IF (icpu /= ncpu - 1) THEN
    1622         1772 :                CALL mp_waitall(req)
    1623              :             END IF
    1624         3544 :             swapbuffer => sendbuffer
    1625         3544 :             sendbuffer => recbuffer
    1626         5316 :             recbuffer => swapbuffer
    1627              :          END DO
    1628         1772 :          DEALLOCATE (recbuffer, sendbuffer)
    1629              :          ! sync before/after ring of isendrecv
    1630         1772 :          CALL para_env%sync()
    1631              : !$OMP END MASTER
    1632         1772 : !$OMP BARRIER
    1633         5316 :          ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
    1634       455404 :          local_cost_matrix = cost_matrix
    1635         1772 : !$OMP MASTER
    1636         5316 :          ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
    1637              :          CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
    1638         1772 :                                     shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
    1639              : 
    1640       643790 :          ALLOCATE (full_dist(ncpu*n_threads, max_bin_size))
    1641              : 
    1642       638474 :          full_dist(:, :)%istart = 0_int_8
    1643       638474 :          full_dist(:, :)%number_of_atom_quartets = 0_int_8
    1644       638474 :          full_dist(:, :)%cost = 0_int_8
    1645       638474 :          full_dist(:, :)%time_first_scf = 0.0_dp
    1646       638474 :          full_dist(:, :)%time_other_scf = 0.0_dp
    1647       638474 :          full_dist(:, :)%time_forces = 0.0_dp
    1648              : !$OMP END MASTER
    1649              : 
    1650         1772 : !$OMP BARRIER
    1651         1772 :          mepos = para_env%mepos + 1
    1652       228588 :          full_dist((mepos - 1)*n_threads + i_thread + 1, 1:my_bin_size) = binned_dist(1:my_bin_size)
    1653         1772 : !$OMP BARRIER
    1654         1772 : !$OMP MASTER
    1655         5316 :          ALLOCATE (sendbuffer(3*max_bin_size*n_threads))
    1656         3544 :          ALLOCATE (recbuffer(3*max_bin_size*n_threads))
    1657         1772 :          mepos = para_env%mepos
    1658         3544 :          DO j = 1, n_threads
    1659       215778 :             DO i = 1, max_bin_size
    1660       212234 :                sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
    1661       212234 :                sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
    1662       214006 :                sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
    1663              :             END DO
    1664              :          END DO
    1665         1772 :          dest = MODULO(mepos + 1, ncpu)
    1666         1772 :          source = MODULO(mepos - 1, ncpu)
    1667              :          ! sync before/after ring of isendrecv
    1668         1772 :          CALL para_env%sync()
    1669         5316 :          DO icpu = 0, ncpu - 1
    1670         3544 :             IF (icpu /= ncpu - 1) THEN
    1671              :                CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
    1672         1772 :                                        req(1), req(2), 13)
    1673              :             END IF
    1674         3544 :             data_from = MODULO(mepos - icpu, ncpu)
    1675         7088 :             DO j = 1, n_threads
    1676       431556 :                DO i = 1, max_bin_size
    1677       424468 :                   full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1)
    1678       424468 :                   full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2)
    1679       428012 :                   full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3)
    1680              :                END DO
    1681              :             END DO
    1682              : 
    1683         3544 :             IF (icpu /= ncpu - 1) THEN
    1684         1772 :                CALL mp_waitall(req)
    1685              :             END IF
    1686         3544 :             swapbuffer => sendbuffer
    1687         3544 :             sendbuffer => recbuffer
    1688         5316 :             recbuffer => swapbuffer
    1689              :          END DO
    1690              :          ! sync before/after ring of isendrecv
    1691         1772 :          DEALLOCATE (recbuffer, sendbuffer)
    1692         1772 :          CALL para_env%sync()
    1693              : !$OMP END MASTER
    1694         1772 : !$OMP BARRIER
    1695              :          !! reorder the distribution according to the distribution vector
    1696         5316 :          ALLOCATE (tmp_pos(ncpu*n_threads))
    1697         5316 :          tmp_pos = 1
    1698       232132 :          ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
    1699              : 
    1700       228588 :          tmp_dist(:)%istart = 0_int_8
    1701       228588 :          tmp_dist(:)%number_of_atom_quartets = 0_int_8
    1702       228588 :          tmp_dist(:)%cost = 0_int_8
    1703       228588 :          tmp_dist(:)%time_first_scf = 0.0_dp
    1704       228588 :          tmp_dist(:)%time_other_scf = 0.0_dp
    1705       228588 :          tmp_dist(:)%time_forces = 0.0_dp
    1706              : 
    1707         5316 :          mepos = my_process_id + 1
    1708         5316 :          DO icpu = 1, n_processes
    1709       232132 :             DO i = 1, bin_histogram(icpu, 1)
    1710       230360 :                IF (shm_distribution_vector(bin_histogram(icpu, 2) - bin_histogram(icpu, 1) + i) == mepos) THEN
    1711       113408 :                   tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
    1712       113408 :                   tmp_pos(mepos) = tmp_pos(mepos) + 1
    1713              :                END IF
    1714              :             END DO
    1715              :          END DO
    1716              : 
    1717              :          !! Assign the load to each process
    1718              :          NULLIFY (ptr_to_tmp_dist)
    1719         1772 :          mepos = my_process_id + 1
    1720         1772 :          ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
    1721          952 :          SELECT CASE (eval_type)
    1722              :          CASE (hfx_do_eval_energy)
    1723          952 :             CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
    1724              :          CASE (hfx_do_eval_forces)
    1725         1772 :             CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
    1726              :          END SELECT
    1727              : 
    1728         1772 : !$OMP BARRIER
    1729         1772 : !$OMP MASTER
    1730         1772 :          DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
    1731         1772 :          DEALLOCATE (bins_per_rank, bin_histogram)
    1732              : !$OMP END MASTER
    1733         1772 : !$OMP BARRIER
    1734         1772 :          DEALLOCATE (tmp_dist, tmp_pos)
    1735         1772 :          DEALLOCATE (binned_dist, local_cost_matrix)
    1736              :       END IF
    1737         1772 : !$OMP BARRIER
    1738         1772 : !$OMP MASTER
    1739         1772 :       CALL timestop(handle)
    1740              : !$OMP END MASTER
    1741         1772 : !$OMP BARRIER
    1742              : 
    1743         3544 :    END SUBROUTINE hfx_update_load_balance
    1744              : 
    1745              : ! **************************************************************************************************
    1746              : !> \brief estimates the cost of a set quartet with info available at load balance time
    1747              : !>        i.e. without much info on the primitives primitives
    1748              : !> \param nsa ...
    1749              : !> \param nsb ...
    1750              : !> \param nsc ...
    1751              : !> \param nsd ...
    1752              : !> \param npgfa ...
    1753              : !> \param npgfb ...
    1754              : !> \param npgfc ...
    1755              : !> \param npgfd ...
    1756              : !> \param ratio ...
    1757              : !> \param p1 ...
    1758              : !> \param p2 ...
    1759              : !> \param p3 ...
    1760              : !> \return ...
    1761              : !> \par History
    1762              : !>      08.2009 created Joost VandeVondele
    1763              : !> \author Joost VandeVondele
    1764              : ! **************************************************************************************************
    1765      9087360 :    FUNCTION cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3) RESULT(res)
    1766              :       IMPLICIT NONE
    1767              :       REAL(KIND=dp) :: estimate1, estimate2, estimate, ratio, switch, mu, sigma
    1768              :       INTEGER(KIND=int_8) :: res
    1769              :       REAL(KIND=dp), INTENT(IN) :: p1(12), p2(12), p3(2)
    1770              : 
    1771              :       INTEGER   :: nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd
    1772              : 
    1773      9087360 :       estimate1 = estimate_basic(p1)
    1774      9087360 :       estimate2 = estimate_basic(p2)
    1775      9087360 :       mu = LOG(ABS(1.0E6_dp*p3(1)) + 1)
    1776      9087360 :       sigma = p3(2)*0.1_dp*mu
    1777      9087360 :       switch = 1.0_dp/(1.0_dp + EXP((LOG(estimate1) - mu)/sigma))
    1778      9087360 :       estimate = estimate1*(1.0_dp - switch) + estimate2*switch
    1779      9087360 :       res = INT(estimate*0.001_dp, KIND=int_8) + 1
    1780              : 
    1781              :    CONTAINS
    1782              : 
    1783              : ! **************************************************************************************************
    1784              : !> \brief ...
    1785              : !> \param p ...
    1786              : !> \return ...
    1787              : ! **************************************************************************************************
    1788     18174720 :       REAL(KIND=dp) FUNCTION estimate_basic(p) RESULT(res)
    1789              :          REAL(KIND=dp)                                      :: p(12)
    1790              : 
    1791              :          REAL(KIND=dp)                                      :: p1, p10, p11, p12, p2, p3, p4, p5, p6, &
    1792              :                                                                p7, p8, p9
    1793              : 
    1794     18174720 :          p1 = p(1); p2 = p(2); p3 = p(3); p4 = p(4)
    1795     18174720 :          p5 = p(5); p6 = p(6); p7 = p(7); p8 = p(8)
    1796     18174720 :          p9 = p(9); p10 = p(10); p11 = p(11); p12 = p(12)
    1797              :          res = poly2(nsa, p1, p2, p3)*poly2(nsb, p1, p2, p3)*poly2(nsc, p1, p2, p3)*poly2(nsd, p1, p2, p3)* &
    1798              :                poly2(npgfa, p4, p5, p6)*poly2(npgfb, p4, p5, p6)*poly2(npgfc, p4, p5, p6)* &
    1799              :                poly2(npgfd, p4, p5, p6)*EXP(-p7*ratio + p8*ratio**2) + &
    1800     18174720 :               1000.0_dp*p9 + poly2(nsa, p10, p11, p12)*poly2(nsb, p10, p11, p12)*poly2(nsc, p10, p11, p12)*poly2(nsd, p10, p11, p12)
    1801     18174720 :          res = 1 + ABS(res)
    1802     18174720 :       END FUNCTION estimate_basic
    1803              : 
    1804              : ! **************************************************************************************************
    1805              : !> \brief ...
    1806              : !> \param x ...
    1807              : !> \param a0 ...
    1808              : !> \param a1 ...
    1809              : !> \param a2 ...
    1810              : !> \return ...
    1811              : ! **************************************************************************************************
    1812    218096640 :       REAL(KIND=dp) FUNCTION poly2(x, a0, a1, a2)
    1813              :          INTEGER, INTENT(IN)                                :: x
    1814              :          REAL(KIND=dp), INTENT(IN)                          :: a0, a1, a2
    1815              :          REAL(KIND=dp)                                      :: r
    1816              : 
    1817    218096640 :          r = REAL(x, KIND=dp)
    1818    218096640 :          poly2 = a0 + (a1 + a2*r)*r
    1819    218096640 :       END FUNCTION poly2
    1820              : 
    1821              :    END FUNCTION cost_model
    1822              : ! **************************************************************************************************
    1823              : !> \brief Minimizes the maximum cost per cpu by shuffling around all bins
    1824              : !> \param total_number_of_bins ...
    1825              : !> \param number_of_processes ...
    1826              : !> \param bin_costs costs per bin
    1827              : !> \param distribution_vector will contain the final distribution
    1828              : !> \param do_randomize ...
    1829              : !> \par History
    1830              : !>      03.2009 created from a hack by Joost [Manuel Guidon]
    1831              : !> \author Manuel Guidon
    1832              : ! **************************************************************************************************
    1833         3562 :    SUBROUTINE optimize_distribution(total_number_of_bins, number_of_processes, bin_costs, &
    1834              :                                     distribution_vector, do_randomize)
    1835              :       INTEGER                                            :: total_number_of_bins, number_of_processes
    1836              :       INTEGER(int_8), DIMENSION(:), POINTER              :: bin_costs
    1837              :       INTEGER, DIMENSION(:), POINTER                     :: distribution_vector
    1838              :       LOGICAL, INTENT(IN)                                :: do_randomize
    1839              : 
    1840              :       INTEGER                                            :: i, itmp, j, nstep
    1841         3562 :       INTEGER(int_8), DIMENSION(:), POINTER              :: my_cost_cpu, tmp_cost, tmp_cpu_cost
    1842              :       INTEGER, DIMENSION(:), POINTER                     :: tmp_cpu_index, tmp_index
    1843         3562 :       TYPE(rng_stream_type), ALLOCATABLE                 :: rng_stream
    1844              : 
    1845         3562 :       nstep = MAX(1, INT(number_of_processes)/2)
    1846              : 
    1847        10686 :       ALLOCATE (tmp_cost(total_number_of_bins))
    1848        10686 :       ALLOCATE (tmp_index(total_number_of_bins))
    1849        10686 :       ALLOCATE (tmp_cpu_cost(number_of_processes))
    1850        10686 :       ALLOCATE (tmp_cpu_index(number_of_processes))
    1851         7124 :       ALLOCATE (my_cost_cpu(number_of_processes))
    1852       918996 :       tmp_cost = bin_costs
    1853              : 
    1854         3562 :       CALL sort(tmp_cost, total_number_of_bins, tmp_index)
    1855        10686 :       my_cost_cpu = 0
    1856              :       !
    1857              :       ! assign the largest remaining bin to the CPU with the smallest load
    1858              :       ! gives near perfect distributions for a sufficient number of bins ...
    1859              :       ! doing this in chunks of nstep (where nstep ~ number_of_processes) makes this n log n and gives
    1860              :       ! each cpu a similar number of tasks.
    1861              :       ! it also avoids degenerate cases where thousands of zero sized tasks
    1862              :       ! are assigned to the same (least loaded) cpu
    1863              :       !
    1864         3562 :       IF (do_randomize) &
    1865              :          rng_stream = rng_stream_type(name="uniform_rng", &
    1866            0 :                                       distribution_type=UNIFORM)
    1867              : 
    1868       459498 :       DO i = total_number_of_bins, 1, -nstep
    1869      2735616 :          tmp_cpu_cost = my_cost_cpu
    1870       455936 :          CALL sort(tmp_cpu_cost, INT(number_of_processes), tmp_cpu_index)
    1871       455936 :          IF (do_randomize) THEN
    1872            0 :             CALL rng_stream%shuffle(tmp_cpu_index(1:MIN(i, nstep)))
    1873              :          END IF
    1874       915434 :          DO j = 1, MIN(i, nstep)
    1875       455936 :             itmp = tmp_cpu_index(j)
    1876       455936 :             distribution_vector(tmp_index(i - j + 1)) = itmp
    1877       911872 :             my_cost_cpu(itmp) = my_cost_cpu(itmp) + bin_costs(tmp_index(i - j + 1))
    1878              :          END DO
    1879              :       END DO
    1880              : 
    1881         3562 :       DEALLOCATE (tmp_cost, tmp_index, tmp_cpu_cost)
    1882         3562 :       DEALLOCATE (tmp_cpu_index, my_cost_cpu)
    1883         7124 :    END SUBROUTINE optimize_distribution
    1884              : 
    1885              : ! **************************************************************************************************
    1886              : !> \brief Given a 2d index pair, this function returns a 1d index pair for
    1887              : !>        a symmetric upper triangle NxN matrix
    1888              : !>        The compiler should inline this function, therefore it appears in
    1889              : !>        several modules
    1890              : !> \param i 2d index
    1891              : !> \param j 2d index
    1892              : !> \param N matrix size
    1893              : !> \return ...
    1894              : !> \par History
    1895              : !>      03.2009 created [Manuel Guidon]
    1896              : !> \author Manuel Guidon
    1897              : ! **************************************************************************************************
    1898        64652 :    PURE FUNCTION get_1D_idx(i, j, N)
    1899              :       INTEGER, INTENT(IN)                                :: i, j
    1900              :       INTEGER(int_8), INTENT(IN)                         :: N
    1901              :       INTEGER(int_8)                                     :: get_1D_idx
    1902              : 
    1903              :       INTEGER(int_8)                                     :: min_ij
    1904              : 
    1905        64652 :       min_ij = MIN(i, j)
    1906        64652 :       get_1D_idx = min_ij*N + MAX(i, j) - (min_ij - 1)*min_ij/2 - N
    1907              : 
    1908        64652 :    END FUNCTION get_1D_idx
    1909              : 
    1910              : ! **************************************************************************************************
    1911              : !> \brief ...
    1912              : !> \param natom ...
    1913              : !> \param nkind ...
    1914              : !> \param list_ij ...
    1915              : !> \param list_kl ...
    1916              : !> \param set_list_ij ...
    1917              : !> \param set_list_kl ...
    1918              : !> \param iatom_start ...
    1919              : !> \param iatom_end ...
    1920              : !> \param jatom_start ...
    1921              : !> \param jatom_end ...
    1922              : !> \param katom_start ...
    1923              : !> \param katom_end ...
    1924              : !> \param latom_start ...
    1925              : !> \param latom_end ...
    1926              : !> \param particle_set ...
    1927              : !> \param coeffs_set ...
    1928              : !> \param coeffs_kind ...
    1929              : !> \param is_assoc_atomic_block_global ...
    1930              : !> \param do_periodic ...
    1931              : !> \param kind_of ...
    1932              : !> \param basis_parameter ...
    1933              : !> \param pmax_set ...
    1934              : !> \param pmax_atom ...
    1935              : !> \param pmax_blocks ...
    1936              : !> \param cell ...
    1937              : !> \param do_p_screening ...
    1938              : !> \param map_atom_to_kind_atom ...
    1939              : !> \param eval_type ...
    1940              : !> \param log10_eps_schwarz ...
    1941              : !> \param log_2 ...
    1942              : !> \param coeffs_kind_max0 ...
    1943              : !> \param use_virial ...
    1944              : !> \param atomic_pair_list ...
    1945              : !> \return ...
    1946              : ! **************************************************************************************************
    1947       305535 :    FUNCTION estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
    1948              :                                 iatom_start, iatom_end, jatom_start, jatom_end, &
    1949              :                                 katom_start, katom_end, latom_start, latom_end, &
    1950              :                                 particle_set, &
    1951       305535 :                                 coeffs_set, coeffs_kind, &
    1952       305535 :                                 is_assoc_atomic_block_global, do_periodic, &
    1953              :                                 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
    1954              :                                 cell, &
    1955              :                                 do_p_screening, map_atom_to_kind_atom, eval_type, &
    1956              :                                 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
    1957       305535 :                                 atomic_pair_list)
    1958              : 
    1959              :       INTEGER, INTENT(IN)                                :: natom, nkind
    1960              :       TYPE(pair_list_type)                               :: list_ij, list_kl
    1961              :       TYPE(pair_set_list_type), DIMENSION(:)             :: set_list_ij, set_list_kl
    1962              :       INTEGER, INTENT(IN)                                :: iatom_start, iatom_end, jatom_start, &
    1963              :                                                             jatom_end, katom_start, katom_end, &
    1964              :                                                             latom_start, latom_end
    1965              :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
    1966              :       TYPE(hfx_screen_coeff_type), &
    1967              :          DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
    1968              :       TYPE(hfx_screen_coeff_type), &
    1969              :          DIMENSION(nkind, nkind)                         :: coeffs_kind
    1970              :       INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
    1971              :       LOGICAL                                            :: do_periodic
    1972              :       INTEGER                                            :: kind_of(*)
    1973              :       TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
    1974              :       TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
    1975              :       REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
    1976              :       REAL(dp)                                           :: pmax_blocks
    1977              :       TYPE(cell_type), POINTER                           :: cell
    1978              :       LOGICAL, INTENT(IN)                                :: do_p_screening
    1979              :       INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
    1980              :       INTEGER, INTENT(IN)                                :: eval_type
    1981              :       REAL(dp)                                           :: log10_eps_schwarz, log_2, &
    1982              :                                                             coeffs_kind_max0
    1983              :       LOGICAL, INTENT(IN)                                :: use_virial
    1984              :       LOGICAL, DIMENSION(natom, natom)                   :: atomic_pair_list
    1985              :       INTEGER(int_8)                                     :: estimate_block_cost
    1986              : 
    1987              :       INTEGER :: i_list_ij, i_list_kl, i_set_list_ij, i_set_list_ij_start, i_set_list_ij_stop, &
    1988              :                  i_set_list_kl, i_set_list_kl_start, i_set_list_kl_stop, iatom, ikind, iset, jatom, jkind, &
    1989              :                  jset, katom, kind_kind_idx, kkind, kset, latom, lkind, lset, swap_id
    1990       305535 :       INTEGER, DIMENSION(:), POINTER                     :: npgfa, npgfb, npgfc, npgfd, nsgfa, &
    1991       305535 :                                                             nsgfb, nsgfc, nsgfd
    1992              :       REAL(dp)                                           :: actual_pmax_atom, cost_tmp, max_val1, &
    1993              :                                                             max_val2, pmax_entry, rab2, rcd2, &
    1994              :                                                             screen_kind_ij, screen_kind_kl
    1995       305535 :       REAL(dp), DIMENSION(:, :), POINTER                 :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
    1996              : 
    1997       305535 :       estimate_block_cost = 0_int_8
    1998              : 
    1999              :       CALL build_pair_list(natom, list_ij, set_list_ij, iatom_start, iatom_end, jatom_start, jatom_end, &
    2000              :                            kind_of, basis_parameter, particle_set, &
    2001              :                            do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
    2002       305535 :                            log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
    2003              : 
    2004              :       CALL build_pair_list(natom, list_kl, set_list_kl, katom_start, katom_end, latom_start, latom_end, &
    2005              :                            kind_of, basis_parameter, particle_set, &
    2006              :                            do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
    2007       305535 :                            log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
    2008              : 
    2009       580784 :       DO i_list_ij = 1, list_ij%n_element
    2010       275249 :          iatom = list_ij%elements(i_list_ij)%pair(1)
    2011       275249 :          jatom = list_ij%elements(i_list_ij)%pair(2)
    2012       275249 :          i_set_list_ij_start = list_ij%elements(i_list_ij)%set_bounds(1)
    2013       275249 :          i_set_list_ij_stop = list_ij%elements(i_list_ij)%set_bounds(2)
    2014       275249 :          ikind = list_ij%elements(i_list_ij)%kind_pair(1)
    2015       275249 :          jkind = list_ij%elements(i_list_ij)%kind_pair(2)
    2016       275249 :          rab2 = list_ij%elements(i_list_ij)%dist2
    2017              : 
    2018       275249 :          nsgfa => basis_parameter(ikind)%nsgf
    2019       275249 :          nsgfb => basis_parameter(jkind)%nsgf
    2020       275249 :          npgfa => basis_parameter(ikind)%npgf
    2021       275249 :          npgfb => basis_parameter(jkind)%npgf
    2022              : 
    2023       837719 :          DO i_list_kl = 1, list_kl%n_element
    2024              : 
    2025       256935 :             katom = list_kl%elements(i_list_kl)%pair(1)
    2026       256935 :             latom = list_kl%elements(i_list_kl)%pair(2)
    2027              : 
    2028       256935 :             IF (.NOT. (katom + latom <= iatom + jatom)) CYCLE
    2029       141681 :             IF (((iatom + jatom) == (katom + latom)) .AND. (katom < iatom)) CYCLE
    2030              : 
    2031       135997 :             IF (eval_type == hfx_do_eval_forces) THEN
    2032        14896 :                IF (.NOT. use_virial) THEN
    2033        13514 :                   IF ((iatom == jatom .AND. iatom == katom .AND. iatom == latom)) CYCLE
    2034              :                END IF
    2035              :             END IF
    2036              : 
    2037       134615 :             i_set_list_kl_start = list_kl%elements(i_list_kl)%set_bounds(1)
    2038       134615 :             i_set_list_kl_stop = list_kl%elements(i_list_kl)%set_bounds(2)
    2039       134615 :             kkind = list_kl%elements(i_list_kl)%kind_pair(1)
    2040       134615 :             lkind = list_kl%elements(i_list_kl)%kind_pair(2)
    2041       134615 :             rcd2 = list_kl%elements(i_list_kl)%dist2
    2042              : 
    2043       134615 :             nsgfc => basis_parameter(kkind)%nsgf
    2044       134615 :             nsgfd => basis_parameter(lkind)%nsgf
    2045       134615 :             npgfc => basis_parameter(kkind)%npgf
    2046       134615 :             npgfd => basis_parameter(lkind)%npgf
    2047              : 
    2048       134615 :             IF (do_p_screening) THEN
    2049              :                actual_pmax_atom = MAX(pmax_atom(katom, iatom), &
    2050              :                                       pmax_atom(latom, jatom), &
    2051              :                                       pmax_atom(latom, iatom), &
    2052        16523 :                                       pmax_atom(katom, jatom))
    2053              :             ELSE
    2054              :                actual_pmax_atom = 0.0_dp
    2055              :             END IF
    2056              : 
    2057              :             screen_kind_ij = coeffs_kind(jkind, ikind)%x(1)*rab2 + &
    2058       134615 :                              coeffs_kind(jkind, ikind)%x(2)
    2059              :             screen_kind_kl = coeffs_kind(lkind, kkind)%x(1)*rcd2 + &
    2060       134615 :                              coeffs_kind(lkind, kkind)%x(2)
    2061       134615 :             IF (screen_kind_ij + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) CYCLE
    2062              : 
    2063       127437 :             IF (.NOT. (is_assoc_atomic_block_global(latom, iatom) >= 1 .AND. &
    2064              :                        is_assoc_atomic_block_global(katom, iatom) >= 1 .AND. &
    2065              :                        is_assoc_atomic_block_global(katom, jatom) >= 1 .AND. &
    2066              :                        is_assoc_atomic_block_global(latom, jatom) >= 1)) CYCLE
    2067              : 
    2068       124253 :             IF (do_p_screening) THEN
    2069         6243 :                SELECT CASE (eval_type)
    2070              :                CASE (hfx_do_eval_energy)
    2071         6243 :                   swap_id = 0
    2072         6243 :                   kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8)))
    2073         6243 :                   IF (ikind >= kkind) THEN
    2074              :                      ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2075              :                                                                map_atom_to_kind_atom(katom), &
    2076         5629 :                                                                map_atom_to_kind_atom(iatom))
    2077              :                   ELSE
    2078              :                      ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2079              :                                                                map_atom_to_kind_atom(iatom), &
    2080          614 :                                                                map_atom_to_kind_atom(katom))
    2081          614 :                      swap_id = swap_id + 1
    2082              :                   END IF
    2083         6243 :                   kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8)))
    2084         6243 :                   IF (jkind >= lkind) THEN
    2085              :                      ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2086              :                                                                map_atom_to_kind_atom(latom), &
    2087         5879 :                                                                map_atom_to_kind_atom(jatom))
    2088              :                   ELSE
    2089              :                      ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2090              :                                                                map_atom_to_kind_atom(jatom), &
    2091          364 :                                                                map_atom_to_kind_atom(latom))
    2092          364 :                      swap_id = swap_id + 2
    2093              :                   END IF
    2094         6243 :                   kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8)))
    2095         6243 :                   IF (ikind >= lkind) THEN
    2096              :                      ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2097              :                                                                map_atom_to_kind_atom(latom), &
    2098         3489 :                                                                map_atom_to_kind_atom(iatom))
    2099              :                   ELSE
    2100              :                      ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2101              :                                                                map_atom_to_kind_atom(iatom), &
    2102         2754 :                                                                map_atom_to_kind_atom(latom))
    2103         2754 :                      swap_id = swap_id + 4
    2104              :                   END IF
    2105         6243 :                   kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8)))
    2106         6243 :                   IF (jkind >= kkind) THEN
    2107              :                      ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2108              :                                                                map_atom_to_kind_atom(katom), &
    2109         6243 :                                                                map_atom_to_kind_atom(jatom))
    2110              :                   ELSE
    2111              :                      ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2112              :                                                                map_atom_to_kind_atom(jatom), &
    2113            0 :                                                                map_atom_to_kind_atom(katom))
    2114            0 :                      swap_id = swap_id + 8
    2115              :                   END IF
    2116              :                CASE (hfx_do_eval_forces)
    2117         9920 :                   swap_id = 16
    2118         9920 :                   kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8)))
    2119         9920 :                   IF (ikind >= kkind) THEN
    2120              :                      ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2121              :                                                                map_atom_to_kind_atom(katom), &
    2122         9382 :                                                                map_atom_to_kind_atom(iatom))
    2123              :                   ELSE
    2124              :                      ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2125              :                                                                map_atom_to_kind_atom(iatom), &
    2126          538 :                                                                map_atom_to_kind_atom(katom))
    2127          538 :                      swap_id = swap_id + 1
    2128              :                   END IF
    2129         9920 :                   kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8)))
    2130         9920 :                   IF (jkind >= lkind) THEN
    2131              :                      ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2132              :                                                                map_atom_to_kind_atom(latom), &
    2133         9920 :                                                                map_atom_to_kind_atom(jatom))
    2134              :                   ELSE
    2135              :                      ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2136              :                                                                map_atom_to_kind_atom(jatom), &
    2137            0 :                                                                map_atom_to_kind_atom(latom))
    2138            0 :                      swap_id = swap_id + 2
    2139              :                   END IF
    2140         9920 :                   kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8)))
    2141         9920 :                   IF (ikind >= lkind) THEN
    2142              :                      ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2143              :                                                                map_atom_to_kind_atom(latom), &
    2144         8154 :                                                                map_atom_to_kind_atom(iatom))
    2145              :                   ELSE
    2146              :                      ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2147              :                                                                map_atom_to_kind_atom(iatom), &
    2148         1766 :                                                                map_atom_to_kind_atom(latom))
    2149         1766 :                      swap_id = swap_id + 4
    2150              :                   END IF
    2151         9920 :                   kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8)))
    2152        26083 :                   IF (jkind >= kkind) THEN
    2153              :                      ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2154              :                                                                map_atom_to_kind_atom(katom), &
    2155         9920 :                                                                map_atom_to_kind_atom(jatom))
    2156              :                   ELSE
    2157              :                      ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
    2158              :                                                                map_atom_to_kind_atom(jatom), &
    2159            0 :                                                                map_atom_to_kind_atom(katom))
    2160            0 :                      swap_id = swap_id + 8
    2161              :                   END IF
    2162              :                END SELECT
    2163              :             END IF
    2164              : 
    2165      1098243 :             DO i_set_list_ij = i_set_list_ij_start, i_set_list_ij_stop
    2166       698741 :                iset = set_list_ij(i_set_list_ij)%pair(1)
    2167       698741 :                jset = set_list_ij(i_set_list_ij)%pair(2)
    2168              : 
    2169              :                max_val1 = coeffs_set(jset, iset, jkind, ikind)%x(1)*rab2 + &
    2170       698741 :                           coeffs_set(jset, iset, jkind, ikind)%x(2)
    2171              : 
    2172       698741 :                IF (max_val1 + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) CYCLE
    2173     10202695 :                DO i_set_list_kl = i_set_list_kl_start, i_set_list_kl_stop
    2174      9263379 :                   kset = set_list_kl(i_set_list_kl)%pair(1)
    2175      9263379 :                   lset = set_list_kl(i_set_list_kl)%pair(2)
    2176              : 
    2177              :                   max_val2 = max_val1 + (coeffs_set(lset, kset, lkind, kkind)%x(1)*rcd2 + &
    2178      9263379 :                                          coeffs_set(lset, kset, lkind, kkind)%x(2))
    2179              : 
    2180      9263379 :                   IF (max_val2 + actual_pmax_atom < log10_eps_schwarz) CYCLE
    2181      8257499 :                   IF (do_p_screening) THEN
    2182              :                      CALL get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, &
    2183              :                                        iset, jset, kset, lset, &
    2184      1006828 :                                        pmax_entry, swap_id)
    2185      1006828 :                      IF (eval_type == hfx_do_eval_forces) THEN
    2186       598336 :                         pmax_entry = log_2 + pmax_entry
    2187              :                      END IF
    2188              :                   ELSE
    2189      7250671 :                      pmax_entry = 0.0_dp
    2190              :                   END IF
    2191      8257499 :                   max_val2 = max_val2 + pmax_entry
    2192      8257499 :                   IF (max_val2 < log10_eps_schwarz) CYCLE
    2193       698741 :                   SELECT CASE (eval_type)
    2194              :                   CASE (hfx_do_eval_energy)
    2195              :                      cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
    2196              :                                            npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
    2197              :                                            max_val2/log10_eps_schwarz, &
    2198      7113469 :                                            p1_energy, p2_energy, p3_energy)
    2199      7113469 :                      estimate_block_cost = estimate_block_cost + INT(cost_tmp, KIND=int_8)
    2200              :                   CASE (hfx_do_eval_forces)
    2201              :                      cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
    2202              :                                            npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
    2203              :                                            max_val2/log10_eps_schwarz, &
    2204       962122 :                                            p1_forces, p2_forces, p3_forces)
    2205      9037713 :                      estimate_block_cost = estimate_block_cost + INT(cost_tmp, KIND=int_8)
    2206              :                   END SELECT
    2207              :                END DO ! i_set_list_kl
    2208              :             END DO ! i_set_list_ij
    2209              :          END DO ! i_list_kl
    2210              :       END DO ! i_list_ij
    2211              : 
    2212       305535 :    END FUNCTION estimate_block_cost
    2213              : 
    2214              : ! **************************************************************************************************
    2215              : !> \brief ...
    2216              : !> \param nkind ...
    2217              : !> \param para_env ...
    2218              : !> \param natom ...
    2219              : !> \param block_size ...
    2220              : !> \param nblock ...
    2221              : !> \param blocks ...
    2222              : !> \param list_ij ...
    2223              : !> \param list_kl ...
    2224              : !> \param set_list_ij ...
    2225              : !> \param set_list_kl ...
    2226              : !> \param particle_set ...
    2227              : !> \param coeffs_set ...
    2228              : !> \param coeffs_kind ...
    2229              : !> \param is_assoc_atomic_block_global ...
    2230              : !> \param do_periodic ...
    2231              : !> \param kind_of ...
    2232              : !> \param basis_parameter ...
    2233              : !> \param pmax_set ...
    2234              : !> \param pmax_atom ...
    2235              : !> \param pmax_blocks ...
    2236              : !> \param cell ...
    2237              : !> \param do_p_screening ...
    2238              : !> \param map_atom_to_kind_atom ...
    2239              : !> \param eval_type ...
    2240              : !> \param log10_eps_schwarz ...
    2241              : !> \param log_2 ...
    2242              : !> \param coeffs_kind_max0 ...
    2243              : !> \param use_virial ...
    2244              : !> \param atomic_pair_list ...
    2245              : ! **************************************************************************************************
    2246         1236 :    SUBROUTINE init_blocks(nkind, para_env, natom, block_size, nblock, blocks, &
    2247         1236 :                           list_ij, list_kl, set_list_ij, set_list_kl, &
    2248              :                           particle_set, &
    2249              :                           coeffs_set, coeffs_kind, &
    2250         1236 :                           is_assoc_atomic_block_global, do_periodic, &
    2251              :                           kind_of, basis_parameter, pmax_set, pmax_atom, &
    2252              :                           pmax_blocks, cell, &
    2253              :                           do_p_screening, map_atom_to_kind_atom, eval_type, &
    2254              :                           log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
    2255         1236 :                           atomic_pair_list)
    2256              : 
    2257              :       INTEGER, INTENT(IN)                                :: nkind
    2258              :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
    2259              :       INTEGER                                            :: natom, block_size, nblock
    2260              :       TYPE(hfx_block_range_type), DIMENSION(1:nblock)    :: blocks
    2261              :       TYPE(pair_list_type)                               :: list_ij, list_kl
    2262              :       TYPE(pair_set_list_type), DIMENSION(:)             :: set_list_ij, set_list_kl
    2263              :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
    2264              :       TYPE(hfx_screen_coeff_type), &
    2265              :          DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
    2266              :       TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
    2267              :          POINTER                                         :: coeffs_kind
    2268              :       INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
    2269              :       LOGICAL                                            :: do_periodic
    2270              :       INTEGER                                            :: kind_of(*)
    2271              :       TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
    2272              :       TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
    2273              :       REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
    2274              :       REAL(dp)                                           :: pmax_blocks
    2275              :       TYPE(cell_type), POINTER                           :: cell
    2276              :       LOGICAL, INTENT(IN)                                :: do_p_screening
    2277              :       INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
    2278              :       INTEGER, INTENT(IN)                                :: eval_type
    2279              :       REAL(dp)                                           :: log10_eps_schwarz, log_2, &
    2280              :                                                             coeffs_kind_max0
    2281              :       LOGICAL, INTENT(IN)                                :: use_virial
    2282              :       LOGICAL, DIMENSION(natom, natom)                   :: atomic_pair_list
    2283              : 
    2284              :       INTEGER                                            :: atom_block, i, iatom_block, iatom_end, &
    2285              :                                                             iatom_start, my_cpu_rank, ncpus
    2286              : 
    2287         5140 :       DO atom_block = 0, nblock - 1
    2288         3904 :          iatom_block = MODULO(atom_block, nblock) + 1
    2289         3904 :          iatom_start = (iatom_block - 1)*block_size + 1
    2290         3904 :          iatom_end = MIN(iatom_block*block_size, natom)
    2291         3904 :          blocks(atom_block + 1)%istart = iatom_start
    2292         3904 :          blocks(atom_block + 1)%iend = iatom_end
    2293         5140 :          blocks(atom_block + 1)%cost = 0_int_8
    2294              :       END DO
    2295              : 
    2296         1236 :       ncpus = para_env%num_pe
    2297         1236 :       my_cpu_rank = para_env%mepos
    2298         5140 :       DO i = 1, nblock
    2299         3904 :          IF (MODULO(i, ncpus) /= my_cpu_rank) THEN
    2300         1949 :             blocks(i)%istart = 0
    2301         1949 :             blocks(i)%iend = 0
    2302         1949 :             CYCLE
    2303              :          END IF
    2304         1955 :          iatom_start = blocks(i)%istart
    2305         1955 :          iatom_end = blocks(i)%iend
    2306              :          blocks(i)%cost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
    2307              :                                               iatom_start, iatom_end, iatom_start, iatom_end, &
    2308              :                                               iatom_start, iatom_end, iatom_start, iatom_end, &
    2309              :                                               particle_set, &
    2310              :                                               coeffs_set, coeffs_kind, &
    2311              :                                               is_assoc_atomic_block_global, do_periodic, &
    2312              :                                               kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
    2313              :                                               cell, &
    2314              :                                               do_p_screening, map_atom_to_kind_atom, eval_type, &
    2315         3191 :                                               log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
    2316              : 
    2317              :       END DO
    2318         1236 :    END SUBROUTINE init_blocks
    2319              : 
    2320              : ! **************************************************************************************************
    2321              : !> \brief ...
    2322              : !> \param para_env ...
    2323              : !> \param x_data ...
    2324              : !> \param iw ...
    2325              : !> \param n_threads ...
    2326              : !> \param i_thread ...
    2327              : !> \param eval_type ...
    2328              : ! **************************************************************************************************
    2329            0 :    SUBROUTINE collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, &
    2330              :                                         eval_type)
    2331              : 
    2332              :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
    2333              :       TYPE(hfx_type), POINTER                            :: x_data
    2334              :       INTEGER, INTENT(IN)                                :: iw, n_threads, i_thread, eval_type
    2335              : 
    2336              :       INTEGER                                            :: i, j, k, my_rank, nbins, nranks, &
    2337              :                                                             total_bins
    2338              :       INTEGER(int_8)                                     :: avg_bin, avg_rank, max_bin, max_rank, &
    2339              :                                                             min_bin, min_rank, sum_bin, sum_rank
    2340            0 :       INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: buffer, buffer_in, buffer_out, summary
    2341              :       INTEGER(int_8), ALLOCATABLE, DIMENSION(:), SAVE    :: shm_cost_vector
    2342            0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: bins_per_rank, rdispl, sort_idx
    2343              :       INTEGER, ALLOCATABLE, DIMENSION(:), SAVE           :: shm_bins_per_rank, shm_displ
    2344              : 
    2345            0 :       SELECT CASE (eval_type)
    2346              :       CASE (hfx_do_eval_energy)
    2347            0 :          nbins = SIZE(x_data%distribution_energy)
    2348              :       CASE (hfx_do_eval_forces)
    2349            0 :          nbins = SIZE(x_data%distribution_forces)
    2350              :       END SELECT
    2351              : 
    2352            0 : !$OMP MASTER
    2353            0 :       ALLOCATE (shm_bins_per_rank(n_threads))
    2354            0 :       ALLOCATE (shm_displ(n_threads + 1))
    2355              : !$OMP END MASTER
    2356            0 : !$OMP BARRIER
    2357              : 
    2358            0 :       shm_bins_per_rank(i_thread + 1) = nbins
    2359            0 : !$OMP BARRIER
    2360            0 :       nbins = 0
    2361            0 :       DO i = 1, n_threads
    2362            0 :          nbins = nbins + shm_bins_per_rank(i)
    2363              :       END DO
    2364            0 :       my_rank = para_env%mepos
    2365            0 :       nranks = para_env%num_pe
    2366              : 
    2367            0 : !$OMP BARRIER
    2368            0 : !$OMP MASTER
    2369            0 :       ALLOCATE (bins_per_rank(nranks))
    2370            0 :       bins_per_rank = 0
    2371              : 
    2372            0 :       bins_per_rank(my_rank + 1) = nbins
    2373              : 
    2374            0 :       CALL para_env%sum(bins_per_rank)
    2375              : 
    2376            0 :       total_bins = 0
    2377            0 :       DO i = 1, nranks
    2378            0 :          total_bins = total_bins + bins_per_rank(i)
    2379              :       END DO
    2380              : 
    2381            0 :       ALLOCATE (shm_cost_vector(2*total_bins))
    2382            0 :       shm_cost_vector = -1_int_8
    2383            0 :       shm_displ(1) = 1
    2384            0 :       DO i = 2, n_threads
    2385            0 :          shm_displ(i) = shm_displ(i - 1) + shm_bins_per_rank(i - 1)
    2386              :       END DO
    2387            0 :       shm_displ(n_threads + 1) = nbins + 1
    2388              : !$OMP END MASTER
    2389            0 : !$OMP BARRIER
    2390            0 :       j = 0
    2391            0 :       SELECT CASE (eval_type)
    2392              :       CASE (hfx_do_eval_energy)
    2393            0 :          DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
    2394            0 :             j = j + 1
    2395            0 :             shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_energy(j)%cost
    2396            0 :             shm_cost_vector(2*i) = INT(x_data%distribution_energy(j)%time_first_scf*10000.0_dp, KIND=int_8)
    2397              :          END DO
    2398              :       CASE (hfx_do_eval_forces)
    2399            0 :          DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
    2400            0 :             j = j + 1
    2401            0 :             shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_forces(j)%cost
    2402            0 :             shm_cost_vector(2*i) = INT(x_data%distribution_forces(j)%time_forces*10000.0_dp, KIND=int_8)
    2403              :          END DO
    2404              :       END SELECT
    2405            0 : !$OMP BARRIER
    2406            0 : !$OMP MASTER
    2407              :       ! ** calculate offsets
    2408            0 :       ALLOCATE (rdispl(nranks))
    2409            0 :       bins_per_rank(:) = bins_per_rank(:)*2
    2410            0 :       rdispl(1) = 0
    2411            0 :       DO i = 2, nranks
    2412            0 :          rdispl(i) = rdispl(i - 1) + bins_per_rank(i - 1)
    2413              :       END DO
    2414              : 
    2415            0 :       ALLOCATE (buffer_in(2*nbins))
    2416            0 :       ALLOCATE (buffer_out(2*total_bins))
    2417              : 
    2418            0 :       DO i = 1, nbins
    2419            0 :          buffer_in(2*(i - 1) + 1) = shm_cost_vector(2*(i - 1) + 1)
    2420            0 :          buffer_in(2*i) = shm_cost_vector(2*i)
    2421              :       END DO
    2422              : 
    2423            0 :       CALL para_env%gatherv(buffer_in, buffer_out, bins_per_rank, rdispl)
    2424              : 
    2425            0 :       IF (iw > 0) THEN
    2426              : 
    2427            0 :          ALLOCATE (summary(2*nranks))
    2428            0 :          summary = 0_int_8
    2429              : 
    2430            0 :          WRITE (iw, '( /, 1X, 79("-") )')
    2431            0 :          WRITE (iw, '( " -", 77X, "-" )')
    2432            0 :          SELECT CASE (eval_type)
    2433              :          CASE (hfx_do_eval_energy)
    2434            0 :             WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - ENERGY '
    2435              :          CASE (hfx_do_eval_forces)
    2436            0 :             WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - FORCES '
    2437              :          END SELECT
    2438            0 :          WRITE (iw, '( " -", 77X, "-" )')
    2439            0 :          WRITE (iw, '( 1X, 79("-") )')
    2440              : 
    2441            0 :          WRITE (iw, FMT="(T3,A,T15,A,T35,A,T55,A)") "MPI RANK", "BIN #", "EST cost", "Processing time [s]"
    2442            0 :          WRITE (iw, '( 1X, 79("-"), / )')
    2443            0 :          k = 0
    2444            0 :          DO i = 1, nranks
    2445            0 :             DO j = 1, bins_per_rank(i)/2
    2446            0 :                k = k + 1
    2447              :                WRITE (iw, FMT="(T6,I5,T15,I5,T27,I16,T55,F19.8)") &
    2448            0 :                   i - 1, j, buffer_out(2*(k - 1) + 1), REAL(buffer_out(2*k), dp)/10000.0_dp
    2449            0 :                summary(2*(i - 1) + 1) = summary(2*(i - 1) + 1) + buffer_out(2*(k - 1) + 1)
    2450            0 :                summary(2*i) = summary(2*i) + buffer_out(2*k)
    2451              :             END DO
    2452              :          END DO
    2453              : 
    2454              :          !** Summary
    2455              :          max_bin = 0_int_8
    2456              :          min_bin = HUGE(min_bin)
    2457              :          sum_bin = 0_int_8
    2458            0 :          DO i = 1, total_bins
    2459            0 :             sum_bin = sum_bin + buffer_out(2*i)
    2460            0 :             max_bin = MAX(max_bin, buffer_out(2*i))
    2461            0 :             min_bin = MIN(min_bin, buffer_out(2*i))
    2462              :          END DO
    2463            0 :          avg_bin = sum_bin/total_bins
    2464              : 
    2465            0 :          max_rank = 0_int_8
    2466            0 :          min_rank = HUGE(min_rank)
    2467            0 :          sum_rank = 0_int_8
    2468            0 :          DO i = 1, nranks
    2469            0 :             sum_rank = sum_rank + summary(2*i)
    2470            0 :             max_rank = MAX(max_rank, summary(2*i))
    2471            0 :             min_rank = MIN(min_rank, summary(2*i))
    2472              :          END DO
    2473            0 :          avg_rank = sum_rank/nranks
    2474              : 
    2475            0 :          WRITE (iw, FMT='(/,T3,A,/)') "SUMMARY:"
    2476            0 :          WRITE (iw, FMT="(T3,A,T35,F19.8)") "Max bin", REAL(max_bin, dp)/10000.0_dp
    2477            0 :          WRITE (iw, FMT="(T3,A,T35,F19.8)") "Min bin", REAL(min_bin, dp)/10000.0_dp
    2478            0 :          WRITE (iw, FMT="(T3,A,T35,F19.8)") "Sum bin", REAL(sum_bin, dp)/10000.0_dp
    2479            0 :          WRITE (iw, FMT="(T3,A,T35,F19.8,/)") "Avg bin", REAL(avg_bin, dp)/10000.0_dp
    2480            0 :          WRITE (iw, FMT="(T3,A,T35,F19.8)") "Max rank", REAL(max_rank, dp)/10000.0_dp
    2481            0 :          WRITE (iw, FMT="(T3,A,T35,F19.8)") "Min rank", REAL(min_rank, dp)/10000.0_dp
    2482            0 :          WRITE (iw, FMT="(T3,A,T35,F19.8)") "Sum rank", REAL(sum_rank, dp)/10000.0_dp
    2483            0 :          WRITE (iw, FMT="(T3,A,T35,F19.8,/)") "Avg rank", REAL(avg_rank, dp)/10000.0_dp
    2484              : 
    2485            0 :          ALLOCATE (buffer(nranks))
    2486            0 :          ALLOCATE (sort_idx(nranks))
    2487              : 
    2488            0 :          DO i = 1, nranks
    2489            0 :             buffer(i) = summary(2*i)
    2490              :          END DO
    2491              : 
    2492            0 :          CALL sort(buffer, nranks, sort_idx)
    2493              : 
    2494            0 :          WRITE (iw, FMT="(T3,A,T35,A,T55,A,/)") "MPI RANK", "EST cost", "Processing time [s]"
    2495            0 :          DO i = nranks, 1, -1
    2496            0 :        WRITE (iw, FMT="(T6,I5,T27,I16,T55,F19.8)") sort_idx(i) - 1, summary(2*(sort_idx(i) - 1) + 1), REAL(buffer(i), dp)/10000.0_dp
    2497              :          END DO
    2498              : 
    2499            0 :          DEALLOCATE (summary, buffer, sort_idx)
    2500              : 
    2501              :       END IF
    2502              : 
    2503            0 :       DEALLOCATE (buffer_in, buffer_out, rdispl)
    2504              : 
    2505            0 :       CALL para_env%sync()
    2506              : 
    2507            0 :       DEALLOCATE (shm_bins_per_rank, shm_displ, shm_cost_vector)
    2508              : !$OMP END MASTER
    2509            0 : !$OMP BARRIER
    2510              : 
    2511            0 :    END SUBROUTINE collect_load_balance_info
    2512              : 
    2513              :    #:include 'hfx_get_pmax_val.fypp'
    2514              : 
    2515              : END MODULE hfx_load_balance_methods
        

Generated by: LCOV version 2.0-1