LCOV - code coverage report
Current view: top level - src - task_list_methods.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:936074a) Lines: 85.2 % 1072 913
Test Date: 2025-12-04 06:27:48 Functions: 97.1 % 35 34

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief generate the tasks lists used by collocate and integrate routines
      10              : !> \par History
      11              : !>      01.2008 [Joost VandeVondele] refactered out of qs_collocate / qs_integrate
      12              : !> \author Joost VandeVondele
      13              : ! **************************************************************************************************
      14              : MODULE task_list_methods
      15              :    USE offload_api, ONLY: offload_create_buffer, offload_buffer_type
      16              :    USE grid_api, ONLY: grid_create_basis_set, grid_create_task_list
      17              :    USE ao_util, ONLY: exp_radius_very_extended
      18              :    USE basis_set_types, ONLY: get_gto_basis_set, &
      19              :                               gto_basis_set_p_type, &
      20              :                               gto_basis_set_type
      21              :    USE cell_types, ONLY: cell_type, &
      22              :                          pbc
      23              :    USE cp_control_types, ONLY: dft_control_type
      24              :    USE cube_utils, ONLY: compute_cube_center, &
      25              :                          cube_info_type, &
      26              :                          return_cube, &
      27              :                          return_cube_nonortho
      28              :    USE cp_dbcsr_api, ONLY: dbcsr_convert_sizes_to_offsets, &
      29              :                            dbcsr_finalize, &
      30              :                            dbcsr_get_block_p, &
      31              :                            dbcsr_get_info, &
      32              :                            dbcsr_p_type, &
      33              :                            dbcsr_put_block, &
      34              :                            dbcsr_type, &
      35              :                            dbcsr_work_create
      36              :    USE gaussian_gridlevels, ONLY: gaussian_gridlevel, &
      37              :                                   gridlevel_info_type
      38              :    USE kinds, ONLY: default_string_length, &
      39              :                     dp, &
      40              :                     int_8
      41              :    USE kpoint_types, ONLY: get_kpoint_info, &
      42              :                            kpoint_type
      43              :    USE memory_utilities, ONLY: reallocate
      44              :    USE message_passing, ONLY: &
      45              :       mp_comm_type
      46              :    USE particle_types, ONLY: particle_type
      47              :    USE particle_methods, ONLY: get_particle_set
      48              :    USE pw_env_types, ONLY: pw_env_get, &
      49              :                            pw_env_type
      50              :    USE qs_kind_types, ONLY: get_qs_kind, &
      51              :                             qs_kind_type
      52              :    USE qs_ks_types, ONLY: get_ks_env, &
      53              :                           qs_ks_env_type
      54              :    USE qs_neighbor_list_types, ONLY: neighbor_list_set_p_type
      55              :    USE realspace_grid_types, ONLY: realspace_grid_desc_p_type, &
      56              :                                    realspace_grid_desc_type, &
      57              :                                    rs_grid_create, &
      58              :                                    rs_grid_locate_rank, &
      59              :                                    rs_grid_release, &
      60              :                                    rs_grid_reorder_ranks, realspace_grid_type
      61              :    USE task_list_types, ONLY: deserialize_task, &
      62              :                               reallocate_tasks, &
      63              :                               serialize_task, &
      64              :                               task_list_type, &
      65              :                               atom_pair_type, &
      66              :                               task_size_in_int8, &
      67              :                               task_type
      68              :    USE util, ONLY: sort
      69              : 
      70              : !$ USE OMP_LIB, ONLY: omp_destroy_lock, omp_get_num_threads, omp_init_lock, &
      71              : !$                    omp_lock_kind, omp_set_lock, omp_unset_lock, omp_get_max_threads
      72              : #include "./base/base_uses.f90"
      73              : 
      74              :    #:include './common/array_sort.fypp'
      75              : 
      76              :    IMPLICIT NONE
      77              : 
      78              :    LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .FALSE.
      79              : 
      80              :    PRIVATE
      81              : 
      82              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'task_list_methods'
      83              : 
      84              :    PUBLIC :: generate_qs_task_list, &
      85              :              task_list_inner_loop
      86              :    PUBLIC :: distribute_tasks, &
      87              :              rs_distribute_matrix, &
      88              :              rs_scatter_matrices, &
      89              :              rs_gather_matrices, &
      90              :              rs_copy_to_buffer, &
      91              :              rs_copy_to_matrices
      92              : 
      93              : CONTAINS
      94              : 
      95              : ! **************************************************************************************************
      96              : !> \brief ...
      97              : !> \param ks_env ...
      98              : !> \param task_list ...
      99              : !> \param basis_type ...
     100              : !> \param reorder_rs_grid_ranks Flag that indicates if this routine should
     101              : !>        or should not overwrite the rs descriptor (see comment below)
     102              : !> \param skip_load_balance_distributed ...
     103              : !> \param pw_env_external ...
     104              : !> \param sab_orb_external ...
     105              : !> \par History
     106              : !>      01.2008 factored out of calculate_rho_elec [Joost VandeVondele]
     107              : !>      04.2010 divides tasks into grid levels and atom pairs for integrate/collocate [Iain Bethune]
     108              : !>              (c) The Numerical Algorithms Group (NAG) Ltd, 2010 on behalf of the HECToR project
     109              : !>      06.2015 adjusted to be used with multiple images (k-points) [JGH]
     110              : !> \note  If this routine is called several times with different task lists,
     111              : !>        the default behaviour is to re-optimize the grid ranks and overwrite
     112              : !>        the rs descriptor and grids. reorder_rs_grid_ranks = .FALSE. prevents the code
     113              : !>        of performing a new optimization by leaving the rank order in
     114              : !>        its current state.
     115              : ! **************************************************************************************************
     116        14110 :    SUBROUTINE generate_qs_task_list(ks_env, task_list, basis_type, &
     117              :                                     reorder_rs_grid_ranks, skip_load_balance_distributed, &
     118              :                                     pw_env_external, sab_orb_external)
     119              : 
     120              :       TYPE(qs_ks_env_type), POINTER                      :: ks_env
     121              :       TYPE(task_list_type), POINTER                      :: task_list
     122              :       CHARACTER(LEN=*), INTENT(IN)                       :: basis_type
     123              :       LOGICAL, INTENT(IN)                                :: reorder_rs_grid_ranks, &
     124              :                                                             skip_load_balance_distributed
     125              :       TYPE(pw_env_type), OPTIONAL, POINTER               :: pw_env_external
     126              :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
     127              :          OPTIONAL, POINTER                               :: sab_orb_external
     128              : 
     129              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'generate_qs_task_list'
     130              :       INTEGER, PARAMETER                                 :: max_tasks = 2000
     131              : 
     132              :       INTEGER :: cindex, curr_tasks, handle, i, iatom, iatom_old, igrid_level, igrid_level_old, &
     133              :                  ikind, ilevel, img, img_old, ipair, ipgf, iset, itask, jatom, jatom_old, jkind, jpgf, &
     134              :                  jset, maxpgf, maxset, natoms, nimages, nkind, nseta, nsetb, slot
     135        14110 :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: blocks
     136              :       INTEGER, DIMENSION(3)                              :: cellind
     137        14110 :       INTEGER, DIMENSION(:), POINTER                     :: la_max, la_min, lb_max, lb_min, npgfa, &
     138        14110 :                                                             npgfb, nsgf
     139        14110 :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
     140              :       LOGICAL                                            :: dokp
     141              :       REAL(KIND=dp)                                      :: kind_radius_a, kind_radius_b
     142              :       REAL(KIND=dp), DIMENSION(3)                        :: ra, rab
     143        14110 :       REAL(KIND=dp), DIMENSION(:), POINTER               :: set_radius_a, set_radius_b
     144        14110 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: rpgfa, rpgfb, zeta, zetb
     145              :       TYPE(cell_type), POINTER                           :: cell
     146        14110 :       TYPE(cube_info_type), DIMENSION(:), POINTER        :: cube_info
     147              :       TYPE(dft_control_type), POINTER                    :: dft_control
     148              :       TYPE(gridlevel_info_type), POINTER                 :: gridlevel_info
     149        14110 :       TYPE(gto_basis_set_p_type), DIMENSION(:), POINTER  :: basis_set_list
     150              :       TYPE(gto_basis_set_type), POINTER                  :: basis_set_a, basis_set_b, orb_basis_set
     151              :       TYPE(kpoint_type), POINTER                         :: kpoints
     152              :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
     153        14110 :          POINTER                                         :: sab_orb
     154        14110 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     155              :       TYPE(pw_env_type), POINTER                         :: pw_env
     156        14110 :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
     157              :       TYPE(qs_kind_type), POINTER                        :: qs_kind
     158              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
     159        14110 :          POINTER                                         :: rs_descs
     160        14110 :       TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
     161        14110 :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     162              : 
     163        14110 :       CALL timeset(routineN, handle)
     164              : 
     165              :       CALL get_ks_env(ks_env, &
     166              :                       qs_kind_set=qs_kind_set, &
     167              :                       cell=cell, &
     168              :                       particle_set=particle_set, &
     169        14110 :                       dft_control=dft_control)
     170              : 
     171        14110 :       CALL get_ks_env(ks_env, sab_orb=sab_orb)
     172        14110 :       IF (PRESENT(sab_orb_external)) sab_orb => sab_orb_external
     173              : 
     174        14110 :       CALL get_ks_env(ks_env, pw_env=pw_env)
     175        14110 :       IF (PRESENT(pw_env_external)) pw_env => pw_env_external
     176        14110 :       CALL pw_env_get(pw_env, rs_descs=rs_descs, rs_grids=rs_grids)
     177              : 
     178              :       ! *** assign from pw_env
     179        14110 :       gridlevel_info => pw_env%gridlevel_info
     180        14110 :       cube_info => pw_env%cube_info
     181              : 
     182              :       ! find maximum numbers
     183        14110 :       nkind = SIZE(qs_kind_set)
     184        14110 :       natoms = SIZE(particle_set)
     185        14110 :       maxset = 0
     186        14110 :       maxpgf = 0
     187        39275 :       DO ikind = 1, nkind
     188        25165 :          qs_kind => qs_kind_set(ikind)
     189              :          CALL get_qs_kind(qs_kind=qs_kind, &
     190        25165 :                           basis_set=orb_basis_set, basis_type=basis_type)
     191              : 
     192        25165 :          IF (.NOT. ASSOCIATED(orb_basis_set)) CYCLE
     193        25165 :          CALL get_gto_basis_set(gto_basis_set=orb_basis_set, npgf=npgfa, nset=nseta)
     194              : 
     195        25165 :          maxset = MAX(nseta, maxset)
     196        92336 :          maxpgf = MAX(MAXVAL(npgfa), maxpgf)
     197              :       END DO
     198              : 
     199              :       ! kpoint related
     200        14110 :       nimages = dft_control%nimages
     201        14110 :       IF (nimages > 1) THEN
     202          286 :          dokp = .TRUE.
     203          286 :          NULLIFY (kpoints)
     204          286 :          CALL get_ks_env(ks_env=ks_env, kpoints=kpoints)
     205          286 :          CALL get_kpoint_info(kpoint=kpoints, cell_to_index=cell_to_index)
     206              :       ELSE
     207        13824 :          dokp = .FALSE.
     208        13824 :          NULLIFY (cell_to_index)
     209              :       END IF
     210              : 
     211              :       ! free the atom_pair lists if allocated
     212        14110 :       IF (ASSOCIATED(task_list%atom_pair_send)) DEALLOCATE (task_list%atom_pair_send)
     213        14110 :       IF (ASSOCIATED(task_list%atom_pair_recv)) DEALLOCATE (task_list%atom_pair_recv)
     214              : 
     215              :       ! construct a list of all tasks
     216        14110 :       IF (.NOT. ASSOCIATED(task_list%tasks)) THEN
     217         8432 :          CALL reallocate_tasks(task_list%tasks, max_tasks)
     218              :       END IF
     219        14110 :       task_list%ntasks = 0
     220        14110 :       curr_tasks = SIZE(task_list%tasks)
     221              : 
     222        67495 :       ALLOCATE (basis_set_list(nkind))
     223        39275 :       DO ikind = 1, nkind
     224        25165 :          qs_kind => qs_kind_set(ikind)
     225              :          CALL get_qs_kind(qs_kind=qs_kind, basis_set=basis_set_a, &
     226        25165 :                           basis_type=basis_type)
     227        39275 :          IF (ASSOCIATED(basis_set_a)) THEN
     228        25165 :             basis_set_list(ikind)%gto_basis_set => basis_set_a
     229              :          ELSE
     230            0 :             NULLIFY (basis_set_list(ikind)%gto_basis_set)
     231              :          END IF
     232              :       END DO
     233              : !!$OMP PARALLEL DEFAULT(NONE) &
     234              : !!$OMP SHARED (sab_orb, dokp, basis_set_list, task_list, rs_descs, dft_control, cube_info, gridlevel_info,  &
     235              : !!$OMP         curr_tasks, maxpgf, maxset, natoms, nimages, particle_set, cell_to_index, cell) &
     236              : !!$OMP PRIVATE (ikind, jkind, iatom, jatom, rab, cellind, basis_set_a, basis_set_b, ra, &
     237              : !!$OMP          la_max, la_min, npgfa, nseta, rpgfa, set_radius_a, kind_radius_a, zeta, &
     238              : !!$OMP          lb_max, lb_min, npgfb, nsetb, rpgfb, set_radius_b, kind_radius_b, zetb, &
     239              : !!$OMP          cindex, slot)
     240              :       ! Loop over neighbor list
     241              : !!$OMP DO SCHEDULE(GUIDED)
     242      1381393 :       DO slot = 1, sab_orb(1)%nl_size
     243      1367283 :          ikind = sab_orb(1)%nlist_task(slot)%ikind
     244      1367283 :          jkind = sab_orb(1)%nlist_task(slot)%jkind
     245      1367283 :          iatom = sab_orb(1)%nlist_task(slot)%iatom
     246      1367283 :          jatom = sab_orb(1)%nlist_task(slot)%jatom
     247      5469132 :          rab(1:3) = sab_orb(1)%nlist_task(slot)%r(1:3)
     248      5469132 :          cellind(1:3) = sab_orb(1)%nlist_task(slot)%cell(1:3)
     249              : 
     250      1367283 :          basis_set_a => basis_set_list(ikind)%gto_basis_set
     251      1367283 :          IF (.NOT. ASSOCIATED(basis_set_a)) CYCLE
     252      1367283 :          basis_set_b => basis_set_list(jkind)%gto_basis_set
     253      1367283 :          IF (.NOT. ASSOCIATED(basis_set_b)) CYCLE
     254      1367283 :          ra(:) = pbc(particle_set(iatom)%r, cell)
     255              :          ! basis ikind
     256      1367283 :          la_max => basis_set_a%lmax
     257      1367283 :          la_min => basis_set_a%lmin
     258      1367283 :          npgfa => basis_set_a%npgf
     259      1367283 :          nseta = basis_set_a%nset
     260      1367283 :          rpgfa => basis_set_a%pgf_radius
     261      1367283 :          set_radius_a => basis_set_a%set_radius
     262      1367283 :          kind_radius_a = basis_set_a%kind_radius
     263      1367283 :          zeta => basis_set_a%zet
     264              :          ! basis jkind
     265      1367283 :          lb_max => basis_set_b%lmax
     266      1367283 :          lb_min => basis_set_b%lmin
     267      1367283 :          npgfb => basis_set_b%npgf
     268      1367283 :          nsetb = basis_set_b%nset
     269      1367283 :          rpgfb => basis_set_b%pgf_radius
     270      1367283 :          set_radius_b => basis_set_b%set_radius
     271      1367283 :          kind_radius_b = basis_set_b%kind_radius
     272      1367283 :          zetb => basis_set_b%zet
     273              : 
     274      1367283 :          IF (dokp) THEN
     275       154885 :             cindex = cell_to_index(cellind(1), cellind(2), cellind(3))
     276              :          ELSE
     277      1212398 :             cindex = 1
     278              :          END IF
     279              : 
     280              :          CALL task_list_inner_loop(task_list%tasks, task_list%ntasks, curr_tasks, &
     281              :                                    rs_descs, dft_control, cube_info, gridlevel_info, cindex, &
     282              :                                    iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, &
     283              :                                    set_radius_a, set_radius_b, ra, rab, &
     284      1381393 :                                    la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
     285              : 
     286              :       END DO
     287              : !!$OMP END PARALLEL
     288              : 
     289              :       ! redistribute the task list so that all tasks map on the local rs grids
     290              :       CALL distribute_tasks( &
     291              :          rs_descs=rs_descs, ntasks=task_list%ntasks, natoms=natoms, &
     292              :          tasks=task_list%tasks, atom_pair_send=task_list%atom_pair_send, &
     293              :          atom_pair_recv=task_list%atom_pair_recv, symmetric=.TRUE., &
     294              :          reorder_rs_grid_ranks=reorder_rs_grid_ranks, &
     295        14110 :          skip_load_balance_distributed=skip_load_balance_distributed)
     296              : 
     297              :       ! compute offsets for rs_scatter_matrix / rs_copy_matrix
     298        42330 :       ALLOCATE (nsgf(natoms))
     299        14110 :       CALL get_particle_set(particle_set, qs_kind_set, basis=basis_set_list, nsgf=nsgf)
     300        14110 :       IF (ASSOCIATED(task_list%atom_pair_send)) THEN
     301              :          ! only needed when there is a distributed grid
     302              :          CALL rs_calc_offsets(pairs=task_list%atom_pair_send, &
     303              :                               nsgf=nsgf, &
     304              :                               group_size=rs_descs(1)%rs_desc%group_size, &
     305              :                               pair_offsets=task_list%pair_offsets_send, &
     306              :                               rank_offsets=task_list%rank_offsets_send, &
     307              :                               rank_sizes=task_list%rank_sizes_send, &
     308           48 :                               buffer_size=task_list%buffer_size_send)
     309              :       END IF
     310              :       CALL rs_calc_offsets(pairs=task_list%atom_pair_recv, &
     311              :                            nsgf=nsgf, &
     312              :                            group_size=rs_descs(1)%rs_desc%group_size, &
     313              :                            pair_offsets=task_list%pair_offsets_recv, &
     314              :                            rank_offsets=task_list%rank_offsets_recv, &
     315              :                            rank_sizes=task_list%rank_sizes_recv, &
     316        14110 :                            buffer_size=task_list%buffer_size_recv)
     317        14110 :       DEALLOCATE (basis_set_list, nsgf)
     318              : 
     319              :       ! If the rank order has changed, reallocate any of the distributed rs_grids
     320        14110 :       IF (reorder_rs_grid_ranks) THEN
     321        60040 :          DO i = 1, gridlevel_info%ngrid_levels
     322        60040 :             IF (rs_descs(i)%rs_desc%distributed) THEN
     323           54 :                CALL rs_grid_release(rs_grids(i))
     324           54 :                CALL rs_grid_create(rs_grids(i), rs_descs(i)%rs_desc)
     325              :             END IF
     326              :          END DO
     327              :       END IF
     328              : 
     329              :       CALL create_grid_task_list(task_list=task_list, &
     330              :                                  qs_kind_set=qs_kind_set, &
     331              :                                  particle_set=particle_set, &
     332              :                                  cell=cell, &
     333              :                                  basis_type=basis_type, &
     334        14110 :                                  rs_grids=rs_grids)
     335              : 
     336              :       ! Now we have the final list of tasks, setup the task_list with the
     337              :       ! data needed for the loops in integrate_v/calculate_rho
     338              : 
     339        14110 :       IF (ASSOCIATED(task_list%taskstart)) THEN
     340         5678 :          DEALLOCATE (task_list%taskstart)
     341              :       END IF
     342        14110 :       IF (ASSOCIATED(task_list%taskstop)) THEN
     343         5678 :          DEALLOCATE (task_list%taskstop)
     344              :       END IF
     345        14110 :       IF (ASSOCIATED(task_list%npairs)) THEN
     346         5678 :          DEALLOCATE (task_list%npairs)
     347              :       END IF
     348              : 
     349              :       ! First, count the number of unique atom pairs per grid level
     350              : 
     351        42330 :       ALLOCATE (task_list%npairs(SIZE(rs_descs)))
     352              : 
     353        14110 :       iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
     354        14110 :       ipair = 0
     355        70090 :       task_list%npairs = 0
     356              : 
     357      7827117 :       DO i = 1, task_list%ntasks
     358      7813007 :          igrid_level = task_list%tasks(i)%grid_level
     359      7813007 :          img = task_list%tasks(i)%image
     360      7813007 :          iatom = task_list%tasks(i)%iatom
     361      7813007 :          jatom = task_list%tasks(i)%jatom
     362      7813007 :          iset = task_list%tasks(i)%iset
     363      7813007 :          jset = task_list%tasks(i)%jset
     364      7813007 :          ipgf = task_list%tasks(i)%ipgf
     365      7813007 :          jpgf = task_list%tasks(i)%jpgf
     366      7827117 :          IF (igrid_level /= igrid_level_old) THEN
     367        38350 :             IF (igrid_level_old /= -1) THEN
     368        24472 :                task_list%npairs(igrid_level_old) = ipair
     369              :             END IF
     370              :             ipair = 1
     371              :             igrid_level_old = igrid_level
     372              :             iatom_old = iatom
     373              :             jatom_old = jatom
     374              :             img_old = img
     375      7774657 :          ELSE IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
     376       513449 :             ipair = ipair + 1
     377       513449 :             iatom_old = iatom
     378       513449 :             jatom_old = jatom
     379       513449 :             img_old = img
     380              :          END IF
     381              :       END DO
     382              :       ! Take care of the last iteration
     383        14110 :       IF (task_list%ntasks /= 0) THEN
     384        13878 :          task_list%npairs(igrid_level) = ipair
     385              :       END IF
     386              : 
     387              :       ! Second, for each atom pair, find the indices in the task list
     388              :       ! of the first and last task
     389              : 
     390              :       ! Array sized for worst case
     391       112188 :       ALLOCATE (task_list%taskstart(MAXVAL(task_list%npairs), SIZE(rs_descs)))
     392       112188 :       ALLOCATE (task_list%taskstop(MAXVAL(task_list%npairs), SIZE(rs_descs)))
     393              : 
     394        14110 :       iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
     395        14110 :       ipair = 0
     396      1160804 :       task_list%taskstart = 0
     397      1160804 :       task_list%taskstop = 0
     398              : 
     399      7827117 :       DO i = 1, task_list%ntasks
     400      7813007 :          igrid_level = task_list%tasks(i)%grid_level
     401      7813007 :          img = task_list%tasks(i)%image
     402      7813007 :          iatom = task_list%tasks(i)%iatom
     403      7813007 :          jatom = task_list%tasks(i)%jatom
     404      7813007 :          iset = task_list%tasks(i)%iset
     405      7813007 :          jset = task_list%tasks(i)%jset
     406      7813007 :          ipgf = task_list%tasks(i)%ipgf
     407      7813007 :          jpgf = task_list%tasks(i)%jpgf
     408      7827117 :          IF (igrid_level /= igrid_level_old) THEN
     409        38350 :             IF (igrid_level_old /= -1) THEN
     410        24472 :                task_list%taskstop(ipair, igrid_level_old) = i - 1
     411              :             END IF
     412        38350 :             ipair = 1
     413        38350 :             task_list%taskstart(ipair, igrid_level) = i
     414        38350 :             igrid_level_old = igrid_level
     415        38350 :             iatom_old = iatom
     416        38350 :             jatom_old = jatom
     417        38350 :             img_old = img
     418      7774657 :          ELSE IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
     419       513449 :             ipair = ipair + 1
     420       513449 :             task_list%taskstart(ipair, igrid_level) = i
     421       513449 :             task_list%taskstop(ipair - 1, igrid_level) = i - 1
     422       513449 :             iatom_old = iatom
     423       513449 :             jatom_old = jatom
     424       513449 :             img_old = img
     425              :          END IF
     426              :       END DO
     427              :       ! Take care of the last iteration
     428        14110 :       IF (task_list%ntasks /= 0) THEN
     429        13878 :          task_list%taskstop(ipair, igrid_level) = task_list%ntasks
     430              :       END IF
     431              : 
     432              :       ! Debug task destribution
     433              :       IF (debug_this_module) THEN
     434              :          tasks => task_list%tasks
     435              :          WRITE (6, *)
     436              :          WRITE (6, *) "Total number of tasks              ", task_list%ntasks
     437              :          DO igrid_level = 1, gridlevel_info%ngrid_levels
     438              :             WRITE (6, *) "Total number of pairs(grid_level)  ", &
     439              :                igrid_level, task_list%npairs(igrid_level)
     440              :          END DO
     441              :          WRITE (6, *)
     442              : 
     443              :          DO igrid_level = 1, gridlevel_info%ngrid_levels
     444              : 
     445              :             ALLOCATE (blocks(natoms, natoms, nimages))
     446              :             blocks = -1
     447              :             DO ipair = 1, task_list%npairs(igrid_level)
     448              :                itask = task_list%taskstart(ipair, igrid_level)
     449              :                ilevel = task_list%tasks(itask)%grid_level
     450              :                img = task_list%tasks(itask)%image
     451              :                iatom = task_list%tasks(itask)%iatom
     452              :                jatom = task_list%tasks(itask)%jatom
     453              :                iset = task_list%tasks(itask)%iset
     454              :                jset = task_list%tasks(itask)%jset
     455              :                ipgf = task_list%tasks(itask)%ipgf
     456              :                jpgf = task_list%tasks(itask)%jpgf
     457              :                IF (blocks(iatom, jatom, img) == -1 .AND. blocks(jatom, iatom, img) == -1) THEN
     458              :                   blocks(iatom, jatom, img) = 1
     459              :                   blocks(jatom, iatom, img) = 1
     460              :                ELSE
     461              :                   WRITE (6, *) "TASK LIST CONFLICT IN PAIR       ", ipair
     462              :                   WRITE (6, *) "Reuse of iatom, jatom, image     ", iatom, jatom, img
     463              :                END IF
     464              : 
     465              :                iatom_old = iatom
     466              :                jatom_old = jatom
     467              :                img_old = img
     468              :                DO itask = task_list%taskstart(ipair, igrid_level), task_list%taskstop(ipair, igrid_level)
     469              :                   ilevel = task_list%tasks(itask)%grid_level
     470              :                   img = task_list%tasks(itask)%image
     471              :                   iatom = task_list%tasks(itask)%iatom
     472              :                   jatom = task_list%tasks(itask)%jatom
     473              :                   iset = task_list%tasks(itask)%iset
     474              :                   jset = task_list%tasks(itask)%jset
     475              :                   ipgf = task_list%tasks(itask)%ipgf
     476              :                   jpgf = task_list%tasks(itask)%jpgf
     477              :                   IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
     478              :                      WRITE (6, *) "TASK LIST CONFLICT IN TASK       ", itask
     479              :                      WRITE (6, *) "Inconsistent iatom, jatom, image ", iatom, jatom, img
     480              :                      WRITE (6, *) "Should be    iatom, jatom, image ", iatom_old, jatom_old, img_old
     481              :                   END IF
     482              : 
     483              :                END DO
     484              :             END DO
     485              :             DEALLOCATE (blocks)
     486              : 
     487              :          END DO
     488              : 
     489              :       END IF
     490              : 
     491        14110 :       CALL timestop(handle)
     492              : 
     493        14110 :    END SUBROUTINE generate_qs_task_list
     494              : 
     495              : ! **************************************************************************************************
     496              : !> \brief Sends the task list data to the grid API.
     497              : !> \author Ole Schuett
     498              : ! **************************************************************************************************
     499        14110 :    SUBROUTINE create_grid_task_list(task_list, qs_kind_set, particle_set, cell, basis_type, rs_grids)
     500              :       TYPE(task_list_type), POINTER                      :: task_list
     501              :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
     502              :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     503              :       TYPE(cell_type), POINTER                           :: cell
     504              :       CHARACTER(LEN=*)                                   :: basis_type
     505              :       TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
     506              : 
     507              :       TYPE(gto_basis_set_type), POINTER                  :: orb_basis_set
     508              :       INTEGER                                            :: nset, natoms, nkinds, ntasks, &
     509              :                                                             ikind, iatom, itask, nsgf
     510        14110 :       INTEGER, DIMENSION(:), ALLOCATABLE                 :: atom_kinds, level_list, iatom_list, jatom_list, &
     511        14110 :                                                             iset_list, jset_list, ipgf_list, jpgf_list, &
     512        14110 :                                                             border_mask_list, block_num_list
     513        14110 :       REAL(KIND=dp), DIMENSION(:), ALLOCATABLE           :: radius_list
     514        14110 :       REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE        :: rab_list, atom_positions
     515        14110 :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     516        14110 :       INTEGER, DIMENSION(:, :), POINTER                  :: first_sgf
     517        14110 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: sphi, zet
     518        14110 :       INTEGER, DIMENSION(:), POINTER                     :: lmax, lmin, npgf, nsgf_set
     519              : 
     520        14110 :       nkinds = SIZE(qs_kind_set)
     521        14110 :       natoms = SIZE(particle_set)
     522        14110 :       ntasks = task_list%ntasks
     523        14110 :       tasks => task_list%tasks
     524              : 
     525        14110 :       IF (.NOT. ASSOCIATED(task_list%grid_basis_sets)) THEN
     526              :          ! Basis sets do not change during simulation - only need to create them once.
     527        40273 :          ALLOCATE (task_list%grid_basis_sets(nkinds))
     528        23409 :          DO ikind = 1, nkinds
     529        14977 :             CALL get_qs_kind(qs_kind_set(ikind), basis_type=basis_type, basis_set=orb_basis_set)
     530              :             CALL get_gto_basis_set(gto_basis_set=orb_basis_set, &
     531              :                                    nset=nset, &
     532              :                                    nsgf=nsgf, &
     533              :                                    nsgf_set=nsgf_set, &
     534              :                                    npgf=npgf, &
     535              :                                    first_sgf=first_sgf, &
     536              :                                    lmax=lmax, &
     537              :                                    lmin=lmin, &
     538              :                                    sphi=sphi, &
     539        14977 :                                    zet=zet)
     540              :             CALL grid_create_basis_set(nset=nset, &
     541              :                                        nsgf=nsgf, &
     542              :                                        maxco=SIZE(sphi, 1), &
     543              :                                        maxpgf=SIZE(zet, 1), &
     544              :                                        lmin=lmin, &
     545              :                                        lmax=lmax, &
     546              :                                        npgf=npgf, &
     547              :                                        nsgf_set=nsgf_set, &
     548              :                                        first_sgf=first_sgf, &
     549              :                                        sphi=sphi, &
     550              :                                        zet=zet, &
     551        23409 :                                        basis_set=task_list%grid_basis_sets(ikind))
     552              :          END DO
     553              :       END IF
     554              : 
     555              :       ! Pack task list infos
     556        70550 :       ALLOCATE (atom_kinds(natoms), atom_positions(3, natoms))
     557        65807 :       DO iatom = 1, natoms
     558        51697 :          atom_kinds(iatom) = particle_set(iatom)%atomic_kind%kind_number
     559        65807 :          atom_positions(:, iatom) = pbc(particle_set(iatom)%r, cell)
     560              :       END DO
     561              : 
     562        69854 :       ALLOCATE (level_list(ntasks), iatom_list(ntasks), jatom_list(ntasks))
     563        69622 :       ALLOCATE (iset_list(ntasks), jset_list(ntasks), ipgf_list(ntasks), jpgf_list(ntasks))
     564        41866 :       ALLOCATE (border_mask_list(ntasks), block_num_list(ntasks))
     565        70086 :       ALLOCATE (radius_list(ntasks), rab_list(3, ntasks))
     566              : 
     567      7827117 :       DO itask = 1, ntasks
     568      7813007 :          level_list(itask) = tasks(itask)%grid_level
     569      7813007 :          iatom_list(itask) = tasks(itask)%iatom
     570      7813007 :          jatom_list(itask) = tasks(itask)%jatom
     571      7813007 :          iset_list(itask) = tasks(itask)%iset
     572      7813007 :          jset_list(itask) = tasks(itask)%jset
     573      7813007 :          ipgf_list(itask) = tasks(itask)%ipgf
     574      7813007 :          jpgf_list(itask) = tasks(itask)%jpgf
     575      7813007 :          IF (tasks(itask)%dist_type == 2) THEN
     576            0 :             border_mask_list(itask) = IAND(63, NOT(tasks(itask)%subpatch_pattern))  ! invert last 6 bits
     577              :          ELSE
     578      7813007 :             border_mask_list(itask) = 0 ! no masking
     579              :          END IF
     580      7813007 :          block_num_list(itask) = tasks(itask)%pair_index  ! change of nomenclature pair_index -> block_num
     581      7813007 :          radius_list(itask) = tasks(itask)%radius
     582     31266138 :          rab_list(:, itask) = tasks(itask)%rab(:)
     583              :       END DO
     584              : 
     585              :       CALL grid_create_task_list(ntasks=ntasks, &
     586              :                                  natoms=natoms, &
     587              :                                  nkinds=nkinds, &
     588              :                                  nblocks=SIZE(task_list%pair_offsets_recv), &
     589              :                                  block_offsets=task_list%pair_offsets_recv, &
     590              :                                  atom_positions=atom_positions, &
     591              :                                  atom_kinds=atom_kinds, &
     592              :                                  basis_sets=task_list%grid_basis_sets, &
     593              :                                  level_list=level_list, &
     594              :                                  iatom_list=iatom_list, &
     595              :                                  jatom_list=jatom_list, &
     596              :                                  iset_list=iset_list, &
     597              :                                  jset_list=jset_list, &
     598              :                                  ipgf_list=ipgf_list, &
     599              :                                  jpgf_list=jpgf_list, &
     600              :                                  border_mask_list=border_mask_list, &
     601              :                                  block_num_list=block_num_list, &
     602              :                                  radius_list=radius_list, &
     603              :                                  rab_list=rab_list, &
     604              :                                  rs_grids=rs_grids, &
     605        14110 :                                  task_list=task_list%grid_task_list)
     606              : 
     607        14110 :       CALL offload_create_buffer(task_list%buffer_size_recv, task_list%pab_buffer)
     608        14110 :       CALL offload_create_buffer(task_list%buffer_size_recv, task_list%hab_buffer)
     609              : 
     610        28220 :    END SUBROUTINE create_grid_task_list
     611              : 
     612              : ! **************************************************************************************************
     613              : !> \brief ...
     614              : !> \param tasks ...
     615              : !> \param ntasks ...
     616              : !> \param curr_tasks ...
     617              : !> \param rs_descs ...
     618              : !> \param dft_control ...
     619              : !> \param cube_info ...
     620              : !> \param gridlevel_info ...
     621              : !> \param cindex ...
     622              : !> \param iatom ...
     623              : !> \param jatom ...
     624              : !> \param rpgfa ...
     625              : !> \param rpgfb ...
     626              : !> \param zeta ...
     627              : !> \param zetb ...
     628              : !> \param kind_radius_b ...
     629              : !> \param set_radius_a ...
     630              : !> \param set_radius_b ...
     631              : !> \param ra ...
     632              : !> \param rab ...
     633              : !> \param la_max ...
     634              : !> \param la_min ...
     635              : !> \param lb_max ...
     636              : !> \param lb_min ...
     637              : !> \param npgfa ...
     638              : !> \param npgfb ...
     639              : !> \param nseta ...
     640              : !> \param nsetb ...
     641              : !> \par History
     642              : !>      Joost VandeVondele: 10.2008 refactored
     643              : ! **************************************************************************************************
     644      1378740 :    SUBROUTINE task_list_inner_loop(tasks, ntasks, curr_tasks, rs_descs, dft_control, &
     645              :                                    cube_info, gridlevel_info, cindex, &
     646              :                                    iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, &
     647              :                                    la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
     648              : 
     649              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     650              :       INTEGER                                            :: ntasks, curr_tasks
     651              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
     652              :          POINTER                                         :: rs_descs
     653              :       TYPE(dft_control_type), POINTER                    :: dft_control
     654              :       TYPE(cube_info_type), DIMENSION(:), POINTER        :: cube_info
     655              :       TYPE(gridlevel_info_type), POINTER                 :: gridlevel_info
     656              :       INTEGER                                            :: cindex, iatom, jatom
     657              :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: rpgfa, rpgfb, zeta, zetb
     658              :       REAL(KIND=dp)                                      :: kind_radius_b
     659              :       REAL(KIND=dp), DIMENSION(:), POINTER               :: set_radius_a, set_radius_b
     660              :       REAL(KIND=dp), DIMENSION(3)                        :: ra, rab
     661              :       INTEGER, DIMENSION(:), POINTER                     :: la_max, la_min, lb_max, lb_min, npgfa, &
     662              :                                                             npgfb
     663              :       INTEGER                                            :: nseta, nsetb
     664              : 
     665              :       INTEGER                                            :: cube_center(3), igrid_level, ipgf, iset, &
     666              :                                                             jpgf, jset, lb_cube(3), ub_cube(3)
     667              :       REAL(KIND=dp)                                      :: dab, rab2, radius, zetp
     668              : 
     669      1378740 :       rab2 = rab(1)*rab(1) + rab(2)*rab(2) + rab(3)*rab(3)
     670      1378740 :       dab = SQRT(rab2)
     671              : 
     672      4428179 :       loop_iset: DO iset = 1, nseta
     673              : 
     674      3049439 :          IF (set_radius_a(iset) + kind_radius_b < dab) CYCLE loop_iset
     675              : 
     676      9192002 :          loop_jset: DO jset = 1, nsetb
     677              : 
     678      5698482 :             IF (set_radius_a(iset) + set_radius_b(jset) < dab) CYCLE loop_jset
     679              : 
     680     14876692 :             loop_ipgf: DO ipgf = 1, npgfa(iset)
     681              : 
     682      8330579 :                IF (rpgfa(ipgf, iset) + set_radius_b(jset) < dab) CYCLE loop_ipgf
     683              : 
     684     25157284 :                loop_jpgf: DO jpgf = 1, npgfb(jset)
     685              : 
     686     14320508 :                   IF (rpgfa(ipgf, iset) + rpgfb(jpgf, jset) < dab) CYCLE loop_jpgf
     687              : 
     688      7976897 :                   zetp = zeta(ipgf, iset) + zetb(jpgf, jset)
     689      7976897 :                   igrid_level = gaussian_gridlevel(gridlevel_info, zetp)
     690              : 
     691              :                   CALL compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
     692              :                                               rs_descs(igrid_level)%rs_desc, cube_info(igrid_level), &
     693              :                                               la_max(iset), zeta(ipgf, iset), la_min(iset), &
     694              :                                               lb_max(jset), zetb(jpgf, jset), lb_min(jset), &
     695      7976897 :                                               ra, rab, rab2, dft_control%qs_control%eps_rho_rspace)
     696              : 
     697              :                   CALL pgf_to_tasks(tasks, ntasks, curr_tasks, &
     698              :                                     rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
     699              :                                     la_max(iset), lb_max(jset), rs_descs(igrid_level)%rs_desc, &
     700              :                                     igrid_level, gridlevel_info%ngrid_levels, cube_center, &
     701     22651087 :                                     lb_cube, ub_cube, radius)
     702              : 
     703              :                END DO loop_jpgf
     704              : 
     705              :             END DO loop_ipgf
     706              : 
     707              :          END DO loop_jset
     708              : 
     709              :       END DO loop_iset
     710              : 
     711      1378740 :    END SUBROUTINE task_list_inner_loop
     712              : 
     713              : ! **************************************************************************************************
     714              : !> \brief combines the calculation of several basic properties of a given pgf:
     715              : !>  its center, the bounding cube, the radius, the cost,
     716              : !>  tries to predict the time needed for processing this task
     717              : !>      in this way an improved load balance might be obtained
     718              : !> \param cube_center ...
     719              : !> \param lb_cube ...
     720              : !> \param ub_cube ...
     721              : !> \param radius ...
     722              : !> \param rs_desc ...
     723              : !> \param cube_info ...
     724              : !> \param la_max ...
     725              : !> \param zeta ...
     726              : !> \param la_min ...
     727              : !> \param lb_max ...
     728              : !> \param zetb ...
     729              : !> \param lb_min ...
     730              : !> \param ra ...
     731              : !> \param rab ...
     732              : !> \param rab2 ...
     733              : !> \param eps ...
     734              : !> \par History
     735              : !>      10.2008 refactored [Joost VandeVondele]
     736              : !> \note
     737              : !>      -) this requires the radius to be computed in the same way as
     738              : !>      collocate_pgf_product, we should factor that part into a subroutine
     739              : !>      -) we're assuming that integrate_pgf and collocate_pgf are the same cost for load balancing
     740              : !>         this is more or less true for map_consistent
     741              : !>      -) in principle, the computed radius could be recycled in integrate_pgf/collocate_pgf if it is certainly
     742              : !>         the same, this could lead to a small speedup
     743              : !>      -) the cost function is a fit through the median cost of mapping a pgf with a given l and a given radius (in grid points)
     744              : !>         fitting the measured data on an opteron/g95 using the expression
     745              : !>         a*(l+b)(r+c)**3+d which is based on the innerloop of the collocating routines
     746              : ! **************************************************************************************************
     747      7976897 :    SUBROUTINE compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
     748              :                                      rs_desc, cube_info, la_max, zeta, la_min, lb_max, zetb, lb_min, ra, rab, rab2, eps)
     749              : 
     750              :       INTEGER, DIMENSION(3), INTENT(OUT)                 :: cube_center, lb_cube, ub_cube
     751              :       REAL(KIND=dp), INTENT(OUT)                         :: radius
     752              :       TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
     753              :       TYPE(cube_info_type), INTENT(IN)                   :: cube_info
     754              :       INTEGER, INTENT(IN)                                :: la_max
     755              :       REAL(KIND=dp), INTENT(IN)                          :: zeta
     756              :       INTEGER, INTENT(IN)                                :: la_min, lb_max
     757              :       REAL(KIND=dp), INTENT(IN)                          :: zetb
     758              :       INTEGER, INTENT(IN)                                :: lb_min
     759              :       REAL(KIND=dp), INTENT(IN)                          :: ra(3), rab(3), rab2, eps
     760              : 
     761              :       INTEGER                                            :: extent(3)
     762      7976897 :       INTEGER, DIMENSION(:), POINTER                     :: sphere_bounds
     763              :       REAL(KIND=dp)                                      :: cutoff, f, prefactor, rb(3), zetp
     764              :       REAL(KIND=dp), DIMENSION(3)                        :: rp
     765              : 
     766              : ! the radius for this task
     767              : 
     768      7976897 :       zetp = zeta + zetb
     769     31907588 :       rp(:) = ra(:) + zetb/zetp*rab(:)
     770     31907588 :       rb(:) = ra(:) + rab(:)
     771      7976897 :       cutoff = 1.0_dp
     772      7976897 :       f = zetb/zetp
     773      7976897 :       prefactor = EXP(-zeta*f*rab2)
     774              :       radius = exp_radius_very_extended(la_min, la_max, lb_min, lb_max, ra=ra, rb=rb, rp=rp, &
     775      7976897 :                                         zetp=zetp, eps=eps, prefactor=prefactor, cutoff=cutoff)
     776              : 
     777      7976897 :       CALL compute_cube_center(cube_center, rs_desc, zeta, zetb, ra, rab)
     778              :       ! compute cube_center, the center of the gaussian product to map (folded to within the unit cell)
     779     31907588 :       cube_center(:) = MODULO(cube_center(:), rs_desc%npts(:))
     780     31907588 :       cube_center(:) = cube_center(:) + rs_desc%lb(:)
     781              : 
     782      7976897 :       IF (rs_desc%orthorhombic) THEN
     783      7541105 :          CALL return_cube(cube_info, radius, lb_cube, ub_cube, sphere_bounds)
     784              :       ELSE
     785       435792 :          CALL return_cube_nonortho(cube_info, radius, lb_cube, ub_cube, rp)
     786              :          !? unclear if extent is computed correctly.
     787      1743168 :          extent(:) = ub_cube(:) - lb_cube(:)
     788      1743168 :          lb_cube(:) = -extent(:)/2 - 1
     789      1743168 :          ub_cube(:) = extent(:)/2
     790              :       END IF
     791              : 
     792      7976897 :    END SUBROUTINE compute_pgf_properties
     793              : ! **************************************************************************************************
     794              : !> \brief predicts the cost of a task in kcycles for a given task
     795              : !>        the model is based on a fit of actual data, and might need updating
     796              : !>        as collocate_pgf_product changes (or CPUs/compilers change)
     797              : !>        maybe some dynamic approach, improving the cost model on the fly could
     798              : !>        work as well
     799              : !>        the cost model does not yet take into account the fraction of space
     800              : !>        that is mapped locally for a given cube and rs_grid (generalised tasks)
     801              : !> \param lb_cube ...
     802              : !> \param ub_cube ...
     803              : !> \param fraction ...
     804              : !> \param lmax ...
     805              : !> \param is_ortho ...
     806              : !> \return ...
     807              : ! **************************************************************************************************
     808      7976897 :    INTEGER FUNCTION cost_model(lb_cube, ub_cube, fraction, lmax, is_ortho)
     809              :       INTEGER, DIMENSION(3), INTENT(IN)                  :: lb_cube, ub_cube
     810              :       REAL(KIND=dp), INTENT(IN)                          :: fraction
     811              :       INTEGER                                            :: lmax
     812              :       LOGICAL                                            :: is_ortho
     813              : 
     814              :       INTEGER                                            :: cmax
     815              :       REAL(KIND=dp)                                      :: v1, v2, v3, v4, v5
     816              : 
     817     31907588 :       cmax = MAXVAL(((ub_cube - lb_cube) + 1)/2)
     818              : 
     819      7976897 :       IF (is_ortho) THEN
     820              :          v1 = 1.504760E+00_dp
     821              :          v2 = 3.126770E+00_dp
     822              :          v3 = 5.074106E+00_dp
     823              :          v4 = 1.091568E+00_dp
     824              :          v5 = 1.070187E+00_dp
     825              :       ELSE
     826       435792 :          v1 = 7.831105E+00_dp
     827       435792 :          v2 = 2.675174E+00_dp
     828       435792 :          v3 = 7.546553E+00_dp
     829       435792 :          v4 = 6.122446E-01_dp
     830       435792 :          v5 = 3.886382E+00_dp
     831              :       END IF
     832      7976897 :       cost_model = CEILING(((lmax + v1)*(cmax + v2)**3*v3*fraction + v4 + v5*lmax**7)/1000.0_dp)
     833              : 
     834      7976897 :    END FUNCTION cost_model
     835              : ! **************************************************************************************************
     836              : !> \brief pgf_to_tasks converts a given pgf to one or more tasks, in particular
     837              : !>        this determines by which CPUs a given pgf gets collocated
     838              : !>        the format of the task array is as follows
     839              : !>        tasks(1,i) := destination
     840              : !>        tasks(2,i) := source
     841              : !>        tasks(3,i) := compressed type (iatom, jatom, ....)
     842              : !>        tasks(4,i) := type (0: replicated, 1: distributed local, 2: distributed generalised)
     843              : !>        tasks(5,i) := cost
     844              : !>        tasks(6,i) := alternate destination code (0 if none available)
     845              : !>
     846              : !> \param tasks ...
     847              : !> \param ntasks ...
     848              : !> \param curr_tasks ...
     849              : !> \param rab ...
     850              : !> \param cindex ...
     851              : !> \param iatom ...
     852              : !> \param jatom ...
     853              : !> \param iset ...
     854              : !> \param jset ...
     855              : !> \param ipgf ...
     856              : !> \param jpgf ...
     857              : !> \param la_max ...
     858              : !> \param lb_max ...
     859              : !> \param rs_desc ...
     860              : !> \param igrid_level ...
     861              : !> \param n_levels ...
     862              : !> \param cube_center ...
     863              : !> \param lb_cube ...
     864              : !> \param ub_cube ...
     865              : !> \par History
     866              : !>      10.2008 Refactored based on earlier routines by MattW [Joost VandeVondele]
     867              : ! **************************************************************************************************
     868      7976897 :    SUBROUTINE pgf_to_tasks(tasks, ntasks, curr_tasks, &
     869              :                            rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
     870              :                            la_max, lb_max, rs_desc, igrid_level, n_levels, &
     871              :                            cube_center, lb_cube, ub_cube, radius)
     872              : 
     873              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     874              :       INTEGER, INTENT(INOUT)                             :: ntasks, curr_tasks
     875              :       REAL(KIND=dp), DIMENSION(3), INTENT(IN)            :: rab
     876              :       INTEGER, INTENT(IN)                                :: cindex, iatom, jatom, iset, jset, ipgf, &
     877              :                                                             jpgf, la_max, lb_max
     878              :       TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
     879              :       INTEGER, INTENT(IN)                                :: igrid_level, n_levels
     880              :       INTEGER, DIMENSION(3), INTENT(IN)                  :: cube_center, lb_cube, ub_cube
     881              :       REAL(KIND=dp), INTENT(IN)                          :: radius
     882              : 
     883              :       INTEGER, PARAMETER                                 :: add_tasks = 1000
     884              :       REAL(kind=dp), PARAMETER                           :: mult_tasks = 2.0_dp
     885              : 
     886              :       INTEGER                                            :: added_tasks, cost, j, lmax
     887              :       LOGICAL                                            :: is_ortho
     888              :       REAL(KIND=dp)                                      :: tfraction
     889              : 
     890              : !$OMP SINGLE
     891      7976897 :       ntasks = ntasks + 1
     892      7976897 :       IF (ntasks > curr_tasks) THEN
     893          990 :          curr_tasks = INT((curr_tasks + add_tasks)*mult_tasks)
     894          990 :          CALL reallocate_tasks(tasks, curr_tasks)
     895              :       END IF
     896              : !$OMP END SINGLE
     897              : 
     898      7976897 :       IF (rs_desc%distributed) THEN
     899              : 
     900              :          ! finds the node(s) that need to process this task
     901              :          ! on exit tasks(:)%dist_type is 1 for distributed tasks and 2 for generalised tasks
     902              :          CALL rs_find_node(rs_desc, igrid_level, n_levels, cube_center, &
     903         1380 :                            ntasks=ntasks, tasks=tasks, lb_cube=lb_cube, ub_cube=ub_cube, added_tasks=added_tasks)
     904              : 
     905              :       ELSE
     906      7975517 :          tasks(ntasks)%destination = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
     907      7975517 :          tasks(ntasks)%dist_type = 0
     908      7975517 :          tasks(ntasks)%subpatch_pattern = 0
     909      7975517 :          added_tasks = 1
     910              :       END IF
     911              : 
     912      7976897 :       lmax = la_max + lb_max
     913      7976897 :       is_ortho = (tasks(ntasks)%dist_type == 0 .OR. tasks(ntasks)%dist_type == 1) .AND. rs_desc%orthorhombic
     914              :       ! we assume the load is shared equally between processes dealing with a generalised Gaussian.
     915              :       ! this could be refined in the future
     916      7976897 :       tfraction = 1.0_dp/added_tasks
     917              : 
     918      7976897 :       cost = cost_model(lb_cube, ub_cube, tfraction, lmax, is_ortho)
     919              : 
     920     15953794 :       DO j = 1, added_tasks
     921      7976897 :          tasks(ntasks - added_tasks + j)%source = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
     922      7976897 :          tasks(ntasks - added_tasks + j)%cost = cost
     923      7976897 :          tasks(ntasks - added_tasks + j)%grid_level = igrid_level
     924      7976897 :          tasks(ntasks - added_tasks + j)%image = cindex
     925      7976897 :          tasks(ntasks - added_tasks + j)%iatom = iatom
     926      7976897 :          tasks(ntasks - added_tasks + j)%jatom = jatom
     927      7976897 :          tasks(ntasks - added_tasks + j)%iset = iset
     928      7976897 :          tasks(ntasks - added_tasks + j)%jset = jset
     929      7976897 :          tasks(ntasks - added_tasks + j)%ipgf = ipgf
     930      7976897 :          tasks(ntasks - added_tasks + j)%jpgf = jpgf
     931     31907588 :          tasks(ntasks - added_tasks + j)%rab = rab
     932     15953794 :          tasks(ntasks - added_tasks + j)%radius = radius
     933              :       END DO
     934              : 
     935      7976897 :    END SUBROUTINE pgf_to_tasks
     936              : 
     937              : ! **************************************************************************************************
     938              : !> \brief performs load balancing of the tasks on the distributed grids
     939              : !> \param tasks ...
     940              : !> \param ntasks ...
     941              : !> \param rs_descs ...
     942              : !> \param grid_level ...
     943              : !> \param natoms ...
     944              : !> \par History
     945              : !>      created 2008-10-03 [Joost VandeVondele]
     946              : ! **************************************************************************************************
     947           54 :    SUBROUTINE load_balance_distributed(tasks, ntasks, rs_descs, grid_level, natoms)
     948              : 
     949              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     950              :       INTEGER                                            :: ntasks
     951              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
     952              :          POINTER                                         :: rs_descs
     953              :       INTEGER                                            :: grid_level, natoms
     954              : 
     955              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_distributed'
     956              : 
     957              :       INTEGER                                            :: handle
     958           54 :       INTEGER, DIMENSION(:, :, :), POINTER               :: list
     959              : 
     960           54 :       CALL timeset(routineN, handle)
     961              : 
     962           54 :       NULLIFY (list)
     963              :       ! here we create for each cpu (0:ncpu-1) a list of possible destinations.
     964              :       ! if a destination would not be in this list, it is a bug
     965           54 :       CALL create_destination_list(list, rs_descs, grid_level)
     966              : 
     967              :       ! now, walk over the tasks, filling in the loads of each destination
     968           54 :       CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.TRUE.)
     969              : 
     970              :       ! optimize loads & fluxes
     971           54 :       CALL optimize_load_list(list, rs_descs(1)%rs_desc%group, rs_descs(1)%rs_desc%my_pos)
     972              : 
     973              :       ! now, walk over the tasks, using the list to set the destinations
     974           54 :       CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.FALSE.)
     975              : 
     976           54 :       DEALLOCATE (list)
     977              : 
     978           54 :       CALL timestop(handle)
     979              : 
     980           54 :    END SUBROUTINE load_balance_distributed
     981              : 
     982              : ! **************************************************************************************************
     983              : !> \brief this serial routine adjusts the fluxes in the global list
     984              : !>
     985              : !> \param list_global ...
     986              : !> \par History
     987              : !>      created 2008-10-06 [Joost VandeVondele]
     988              : ! **************************************************************************************************
     989           27 :    SUBROUTINE balance_global_list(list_global)
     990              :       INTEGER, DIMENSION(:, :, 0:)                       :: list_global
     991              : 
     992              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'balance_global_list'
     993              :       INTEGER, PARAMETER                                 :: Max_Iter = 100
     994              :       REAL(KIND=dp), PARAMETER                           :: Tolerance_factor = 0.005_dp
     995              : 
     996              :       INTEGER                                            :: dest, handle, icpu, idest, iflux, &
     997              :                                                             ilocal, k, maxdest, Ncpu, Nflux
     998           27 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: flux_connections
     999              :       LOGICAL                                            :: solution_optimal
    1000              :       REAL(KIND=dp)                                      :: average, load_shift, max_load_shift, &
    1001              :                                                             tolerance
    1002           27 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: load, optimized_flux, optimized_load
    1003           27 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: flux_limits
    1004              : 
    1005           27 :       CALL timeset(routineN, handle)
    1006              : 
    1007           27 :       Ncpu = SIZE(list_global, 3)
    1008           27 :       maxdest = SIZE(list_global, 2)
    1009           81 :       ALLOCATE (load(0:Ncpu - 1))
    1010           81 :       load = 0.0_dp
    1011           54 :       ALLOCATE (optimized_load(0:Ncpu - 1))
    1012              : 
    1013              :       ! figure out the number of fluxes
    1014              :       ! we assume that the global_list is symmetric
    1015           81 :       Nflux = 0
    1016           81 :       DO icpu = 0, ncpu - 1
    1017          189 :          DO idest = 1, maxdest
    1018          108 :             dest = list_global(1, idest, icpu)
    1019          162 :             IF (dest < ncpu .AND. dest > icpu) Nflux = Nflux + 1
    1020              :          END DO
    1021              :       END DO
    1022           81 :       ALLOCATE (optimized_flux(Nflux))
    1023           81 :       ALLOCATE (flux_limits(2, Nflux))
    1024           81 :       ALLOCATE (flux_connections(2, Nflux))
    1025              : 
    1026              :       ! reorder data
    1027          108 :       flux_limits = 0
    1028              :       Nflux = 0
    1029           81 :       DO icpu = 0, ncpu - 1
    1030          162 :          load(icpu) = SUM(list_global(2, :, icpu))
    1031          189 :          DO idest = 1, maxdest
    1032          108 :             dest = list_global(1, idest, icpu)
    1033          162 :             IF (dest < ncpu) THEN
    1034          108 :                IF (dest /= icpu) THEN
    1035           54 :                   IF (dest > icpu) THEN
    1036           27 :                      Nflux = Nflux + 1
    1037           27 :                      flux_limits(2, Nflux) = list_global(2, idest, icpu)
    1038           27 :                      flux_connections(1, Nflux) = icpu
    1039           27 :                      flux_connections(2, Nflux) = dest
    1040              :                   ELSE
    1041           27 :                      DO iflux = 1, Nflux
    1042           27 :                         IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
    1043           27 :                            flux_limits(1, iflux) = -list_global(2, idest, icpu)
    1044           27 :                            EXIT
    1045              :                         END IF
    1046              :                      END DO
    1047              :                   END IF
    1048              :                END IF
    1049              :             END IF
    1050              :          END DO
    1051              :       END DO
    1052              : 
    1053              :       solution_optimal = .FALSE.
    1054           54 :       optimized_flux = 0.0_dp
    1055              : 
    1056              :       ! an iterative solver, if iterated till convergence the maximum load is minimal
    1057              :       ! we terminate before things are fully converged, since this does show up in the timings
    1058              :       ! once the largest shift becomes less than a small fraction of the average load, we're done
    1059              :       ! we're perfectly happy if the load balance is within 1 percent or so
    1060              :       ! the maximum load normally converges even faster
    1061           81 :       average = SUM(load)/SIZE(load)
    1062           27 :       tolerance = Tolerance_factor*average
    1063              : 
    1064           81 :       optimized_load(:) = load
    1065          332 :       DO k = 1, Max_iter
    1066              :          max_load_shift = 0.0_dp
    1067          658 :          DO iflux = 1, Nflux
    1068          329 :             load_shift = (optimized_load(flux_connections(1, iflux)) - optimized_load(flux_connections(2, iflux)))/2
    1069          329 :             load_shift = MAX(flux_limits(1, iflux) - optimized_flux(iflux), load_shift)
    1070          329 :             load_shift = MIN(flux_limits(2, iflux) - optimized_flux(iflux), load_shift)
    1071          329 :             max_load_shift = MAX(ABS(load_shift), max_load_shift)
    1072          329 :             optimized_load(flux_connections(1, iflux)) = optimized_load(flux_connections(1, iflux)) - load_shift
    1073          329 :             optimized_load(flux_connections(2, iflux)) = optimized_load(flux_connections(2, iflux)) + load_shift
    1074          658 :             optimized_flux(iflux) = optimized_flux(iflux) + load_shift
    1075              :          END DO
    1076          332 :          IF (max_load_shift < tolerance) THEN
    1077              :             solution_optimal = .TRUE.
    1078              :             EXIT
    1079              :          END IF
    1080              :       END DO
    1081              : 
    1082              :       ! now adjust the load list to reflect the optimized fluxes
    1083              :       ! reorder data
    1084              :       Nflux = 0
    1085           81 :       DO icpu = 0, ncpu - 1
    1086          162 :          DO idest = 1, maxdest
    1087          162 :             IF (list_global(1, idest, icpu) == icpu) ilocal = idest
    1088              :          END DO
    1089          189 :          DO idest = 1, maxdest
    1090          108 :             dest = list_global(1, idest, icpu)
    1091          162 :             IF (dest < ncpu) THEN
    1092          108 :                IF (dest /= icpu) THEN
    1093           54 :                   IF (dest > icpu) THEN
    1094           27 :                      Nflux = Nflux + 1
    1095           27 :                      IF (optimized_flux(Nflux) > 0) THEN
    1096              :                         list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
    1097            0 :                                                        list_global(2, idest, icpu) - NINT(optimized_flux(Nflux))
    1098            0 :                         list_global(2, idest, icpu) = NINT(optimized_flux(Nflux))
    1099              :                      ELSE
    1100              :                         list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
    1101           27 :                                                        list_global(2, idest, icpu)
    1102           27 :                         list_global(2, idest, icpu) = 0
    1103              :                      END IF
    1104              :                   ELSE
    1105           27 :                      DO iflux = 1, Nflux
    1106           27 :                         IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
    1107           27 :                            IF (optimized_flux(iflux) > 0) THEN
    1108              :                               list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
    1109            0 :                                                              list_global(2, idest, icpu)
    1110            0 :                               list_global(2, idest, icpu) = 0
    1111              :                            ELSE
    1112              :                               list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
    1113           27 :                                                              list_global(2, idest, icpu) + NINT(optimized_flux(iflux))
    1114           27 :                               list_global(2, idest, icpu) = -NINT(optimized_flux(iflux))
    1115              :                            END IF
    1116              :                            EXIT
    1117              :                         END IF
    1118              :                      END DO
    1119              :                   END IF
    1120              :                END IF
    1121              :             END IF
    1122              :          END DO
    1123              :       END DO
    1124              : 
    1125           27 :       CALL timestop(handle)
    1126              : 
    1127           54 :    END SUBROUTINE balance_global_list
    1128              : 
    1129              : ! **************************************************************************************************
    1130              : !> \brief this routine gets back optimized loads for all destinations
    1131              : !>
    1132              : !> \param list ...
    1133              : !> \param group ...
    1134              : !> \param my_pos ...
    1135              : !> \par History
    1136              : !>      created 2008-10-06 [Joost VandeVondele]
    1137              : !>      Modified 2016-01   [EPCC] Reduce memory requirements on P processes
    1138              : !>                                from O(P^2) to O(P)
    1139              : ! **************************************************************************************************
    1140           54 :    SUBROUTINE optimize_load_list(list, group, my_pos)
    1141              :       INTEGER, DIMENSION(:, :, 0:)                       :: list
    1142              :       TYPE(mp_comm_type), INTENT(IN) :: group
    1143              :       INTEGER, INTENT(IN)                                :: my_pos
    1144              : 
    1145              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'optimize_load_list'
    1146              :       INTEGER, PARAMETER                                 :: rank_of_root = 0
    1147              : 
    1148              :       INTEGER                                            :: handle, icpu, idest, maxdest, ncpu
    1149              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: load_all
    1150           54 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: load_partial
    1151              :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: list_global
    1152              : 
    1153           54 :       CALL timeset(routineN, handle)
    1154              : 
    1155           54 :       ncpu = SIZE(list, 3)
    1156           54 :       maxdest = SIZE(list, 2)
    1157              : 
    1158              :       !find total workload ...
    1159          162 :       ALLOCATE (load_all(maxdest*ncpu))
    1160          108 :       load_all(:) = RESHAPE(list(2, :, :), [maxdest*ncpu])
    1161           54 :       CALL group%sum(load_all(:), rank_of_root)
    1162              : 
    1163              :       ! ... and optimise the work per process
    1164          216 :       ALLOCATE (list_global(2, maxdest, ncpu))
    1165           54 :       IF (rank_of_root == my_pos) THEN
    1166          189 :          list_global(1, :, :) = list(1, :, :)
    1167          243 :          list_global(2, :, :) = RESHAPE(load_all, [maxdest, ncpu])
    1168           27 :          CALL balance_global_list(list_global)
    1169              :       END IF
    1170           54 :       CALL group%bcast(list_global, rank_of_root)
    1171              : 
    1172              :       !figure out how much can be sent to other processes
    1173          216 :       ALLOCATE (load_partial(maxdest, ncpu))
    1174              :       ! send 'load_all', which is a copy of 'list' (but without leading dimension/stride)
    1175          162 :       CALL group%sum_partial(RESHAPE(load_all, [maxdest, ncpu]), load_partial(:, :))
    1176              : 
    1177          162 :       DO icpu = 1, ncpu
    1178          378 :          DO idest = 1, maxdest
    1179              : 
    1180              :             !need to deduct 1 because `list' was passed in to this routine as being indexed from zero
    1181          324 :             IF (load_partial(idest, icpu) > list_global(2, idest, icpu)) THEN
    1182           37 :                IF (load_partial(idest, icpu) - list(2, idest, icpu - 1) < list_global(2, idest, icpu)) THEN
    1183              :                   list(2, idest, icpu - 1) = list_global(2, idest, icpu) &
    1184           10 :                                              - (load_partial(idest, icpu) - list(2, idest, icpu - 1))
    1185              :                ELSE
    1186           27 :                   list(2, idest, icpu - 1) = 0
    1187              :                END IF
    1188              :             END IF
    1189              : 
    1190              :          END DO
    1191              :       END DO
    1192              : 
    1193              :       !clean up before leaving
    1194           54 :       DEALLOCATE (load_all)
    1195           54 :       DEALLOCATE (list_global)
    1196           54 :       DEALLOCATE (load_partial)
    1197              : 
    1198           54 :       CALL timestop(handle)
    1199           54 :    END SUBROUTINE optimize_load_list
    1200              : 
    1201              : ! **************************************************************************************************
    1202              : !> \brief fill the load list with values derived from the tasks array
    1203              : !>        from the alternate locations, we select the alternate location that
    1204              : !>        can be used without increasing the number of matrix blocks needed to
    1205              : !>        distribute.
    1206              : !>        Replicated tasks are not yet considered
    1207              : !>
    1208              : !> \param list ...
    1209              : !> \param rs_descs ...
    1210              : !> \param grid_level ...
    1211              : !> \param tasks ...
    1212              : !> \param ntasks ...
    1213              : !> \param natoms ...
    1214              : !> \param create_list ...
    1215              : !> \par History
    1216              : !>      created 2008-10-06 [Joost VandeVondele]
    1217              : ! **************************************************************************************************
    1218          108 :    SUBROUTINE compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list)
    1219              :       INTEGER, DIMENSION(:, :, 0:)                       :: list
    1220              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1221              :          POINTER                                         :: rs_descs
    1222              :       INTEGER                                            :: grid_level
    1223              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1224              :       INTEGER                                            :: ntasks, natoms
    1225              :       LOGICAL                                            :: create_list
    1226              : 
    1227              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_load_list'
    1228              : 
    1229              :       INTEGER :: cost, dest, handle, i, iatom, ilevel, img, img_old, iopt, ipgf, iset, itask, &
    1230              :                  itask_start, itask_stop, jatom, jpgf, jset, li, maxdest, ncpu, ndest_pair, nopt, nshort, &
    1231              :                  rank
    1232              :       INTEGER(KIND=int_8)                                :: bit_pattern, ipair, ipair_old, natom8
    1233              :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: loads
    1234          108 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: all_dests, index
    1235              :       INTEGER, DIMENSION(6)                              :: options
    1236              : 
    1237          108 :       CALL timeset(routineN, handle)
    1238              : 
    1239          324 :       ALLOCATE (loads(0:rs_descs(grid_level)%rs_desc%group_size - 1))
    1240          108 :       CALL get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks=.FALSE.)
    1241              : 
    1242          108 :       maxdest = SIZE(list, 2)
    1243          108 :       ncpu = SIZE(list, 3)
    1244          108 :       natom8 = natoms
    1245              : 
    1246              :       ! first find the tasks that deal with the same atom pair
    1247          108 :       itask_stop = 0
    1248          108 :       ipair_old = HUGE(ipair_old)
    1249          108 :       img_old = -1
    1250          108 :       ALLOCATE (all_dests(0))
    1251          108 :       ALLOCATE (INDEX(0))
    1252              : 
    1253              :       DO
    1254              : 
    1255              :          ! first find the range of tasks that deal with the same atom pair
    1256          290 :          itask_start = itask_stop + 1
    1257          290 :          itask_stop = itask_start
    1258          290 :          IF (itask_stop > ntasks) EXIT
    1259          182 :          ilevel = tasks(itask_stop)%grid_level
    1260          182 :          img_old = tasks(itask_stop)%image
    1261          182 :          iatom = tasks(itask_stop)%iatom
    1262          182 :          jatom = tasks(itask_stop)%jatom
    1263          182 :          iset = tasks(itask_stop)%iset
    1264          182 :          jset = tasks(itask_stop)%jset
    1265          182 :          ipgf = tasks(itask_stop)%ipgf
    1266          182 :          jpgf = tasks(itask_stop)%jpgf
    1267              : 
    1268          182 :          ipair_old = (iatom - 1)*natom8 + (jatom - 1)
    1269              :          DO
    1270         7386 :             IF (itask_stop + 1 > ntasks) EXIT
    1271         7298 :             ilevel = tasks(itask_stop + 1)%grid_level
    1272         7298 :             img = tasks(itask_stop + 1)%image
    1273         7298 :             iatom = tasks(itask_stop + 1)%iatom
    1274         7298 :             jatom = tasks(itask_stop + 1)%jatom
    1275         7298 :             iset = tasks(itask_stop + 1)%iset
    1276         7298 :             jset = tasks(itask_stop + 1)%jset
    1277         7298 :             ipgf = tasks(itask_stop + 1)%ipgf
    1278         7298 :             jpgf = tasks(itask_stop + 1)%jpgf
    1279              : 
    1280         7298 :             ipair = (iatom - 1)*natom8 + (jatom - 1)
    1281         7386 :             IF (ipair == ipair_old .AND. img == img_old) THEN
    1282              :                itask_stop = itask_stop + 1
    1283              :             ELSE
    1284              :                EXIT
    1285              :             END IF
    1286              :          END DO
    1287          182 :          ipair = ipair_old
    1288          182 :          nshort = itask_stop - itask_start + 1
    1289              : 
    1290              :          ! find the unique list of destinations on this grid level only
    1291          182 :          DEALLOCATE (all_dests)
    1292          546 :          ALLOCATE (all_dests(nshort))
    1293          182 :          DEALLOCATE (index)
    1294          364 :          ALLOCATE (INDEX(nshort))
    1295         7568 :          DO i = 1, nshort
    1296         7386 :             ilevel = tasks(itask_start + i - 1)%grid_level
    1297         7386 :             img = tasks(itask_start + i - 1)%image
    1298         7386 :             iatom = tasks(itask_start + i - 1)%iatom
    1299         7386 :             jatom = tasks(itask_start + i - 1)%jatom
    1300         7386 :             iset = tasks(itask_start + i - 1)%iset
    1301         7386 :             jset = tasks(itask_start + i - 1)%jset
    1302         7386 :             ipgf = tasks(itask_start + i - 1)%ipgf
    1303         7386 :             jpgf = tasks(itask_start + i - 1)%jpgf
    1304              : 
    1305         7568 :             IF (ilevel == grid_level) THEN
    1306         2760 :                all_dests(i) = decode_rank(tasks(itask_start + i - 1)%destination, SIZE(rs_descs))
    1307              :             ELSE
    1308         4626 :                all_dests(i) = HUGE(all_dests(i))
    1309              :             END IF
    1310              :          END DO
    1311          182 :          CALL sort(all_dests, nshort, index)
    1312          182 :          ndest_pair = 1
    1313         7386 :          DO i = 2, nshort
    1314         7386 :             IF ((all_dests(ndest_pair) /= all_dests(i)) .AND. (all_dests(i) /= HUGE(all_dests(i)))) THEN
    1315           10 :                ndest_pair = ndest_pair + 1
    1316           10 :                all_dests(ndest_pair) = all_dests(i)
    1317              :             END IF
    1318              :          END DO
    1319              : 
    1320         7676 :          DO itask = itask_start, itask_stop
    1321              : 
    1322         7386 :             dest = decode_rank(tasks(itask)%destination, SIZE(rs_descs)) ! notice that dest can be changed
    1323         7386 :             ilevel = tasks(itask)%grid_level
    1324         7386 :             img = tasks(itask)%image
    1325         7386 :             iatom = tasks(itask)%iatom
    1326         7386 :             jatom = tasks(itask)%jatom
    1327         7386 :             iset = tasks(itask)%iset
    1328         7386 :             jset = tasks(itask)%jset
    1329         7386 :             ipgf = tasks(itask)%ipgf
    1330         7386 :             jpgf = tasks(itask)%jpgf
    1331              : 
    1332              :             ! Only proceed with tasks which are on this grid level
    1333         7386 :             IF (ilevel /= grid_level) CYCLE
    1334         2760 :             ipair = (iatom - 1)*natom8 + (jatom - 1)
    1335         2760 :             cost = INT(tasks(itask)%cost)
    1336              : 
    1337          182 :             SELECT CASE (tasks(itask)%dist_type)
    1338              :             CASE (1)
    1339         2760 :                bit_pattern = tasks(itask)%subpatch_pattern
    1340         2760 :                nopt = 0
    1341         2760 :                IF (BTEST(bit_pattern, 0)) THEN
    1342            0 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [-1, 0, 0])
    1343            0 :                   IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
    1344            0 :                      nopt = nopt + 1
    1345            0 :                      options(nopt) = rank
    1346              :                   END IF
    1347              :                END IF
    1348         2760 :                IF (BTEST(bit_pattern, 1)) THEN
    1349            0 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [+1, 0, 0])
    1350            0 :                   IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
    1351            0 :                      nopt = nopt + 1
    1352            0 :                      options(nopt) = rank
    1353              :                   END IF
    1354              :                END IF
    1355         2760 :                IF (BTEST(bit_pattern, 2)) THEN
    1356           48 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, -1, 0])
    1357           96 :                   IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
    1358            0 :                      nopt = nopt + 1
    1359            0 :                      options(nopt) = rank
    1360              :                   END IF
    1361              :                END IF
    1362         2760 :                IF (BTEST(bit_pattern, 3)) THEN
    1363            0 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, +1, 0])
    1364            0 :                   IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
    1365            0 :                      nopt = nopt + 1
    1366            0 :                      options(nopt) = rank
    1367              :                   END IF
    1368              :                END IF
    1369         2760 :                IF (BTEST(bit_pattern, 4)) THEN
    1370         1150 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, 0, -1])
    1371         2200 :                   IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
    1372          100 :                      nopt = nopt + 1
    1373          100 :                      options(nopt) = rank
    1374              :                   END IF
    1375              :                END IF
    1376         2760 :                IF (BTEST(bit_pattern, 5)) THEN
    1377          500 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, 0, +1])
    1378         1000 :                   IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
    1379           80 :                      nopt = nopt + 1
    1380           80 :                      options(nopt) = rank
    1381              :                   END IF
    1382              :                END IF
    1383         2760 :                IF (nopt > 0) THEN
    1384              :                   ! set it to the rank with the lowest load
    1385          180 :                   rank = options(1)
    1386          180 :                   DO iopt = 2, nopt
    1387          180 :                      IF (loads(rank) > loads(options(iopt))) rank = options(iopt)
    1388              :                   END DO
    1389              :                ELSE
    1390         2580 :                   rank = dest
    1391              :                END IF
    1392         2760 :                li = list_index(list, rank, dest)
    1393         2760 :                IF (create_list) THEN
    1394         1380 :                   list(2, li, dest) = list(2, li, dest) + cost
    1395              :                ELSE
    1396         1380 :                   IF (list(1, li, dest) == dest) THEN
    1397         1290 :                      tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
    1398              :                   ELSE
    1399           90 :                      IF (list(2, li, dest) >= cost) THEN
    1400           10 :                         list(2, li, dest) = list(2, li, dest) - cost
    1401           10 :                         tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
    1402              :                      ELSE
    1403           80 :                         tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
    1404              :                      END IF
    1405              :                   END IF
    1406              :                END IF
    1407              :             CASE (2) ! generalised
    1408            0 :                li = list_index(list, dest, dest)
    1409            0 :                IF (create_list) THEN
    1410            0 :                   list(2, li, dest) = list(2, li, dest) + cost
    1411              :                ELSE
    1412            0 :                   IF (list(1, li, dest) == dest) THEN
    1413            0 :                      tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
    1414              :                   ELSE
    1415            0 :                      IF (list(2, li, dest) >= cost) THEN
    1416            0 :                         list(2, li, dest) = list(2, li, dest) - cost
    1417            0 :                         tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
    1418              :                      ELSE
    1419            0 :                         tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
    1420              :                      END IF
    1421              :                   END IF
    1422              :                END IF
    1423              :             CASE DEFAULT
    1424         7386 :                CPABORT("")
    1425              :             END SELECT
    1426              : 
    1427              :          END DO
    1428              : 
    1429              :       END DO
    1430              : 
    1431          108 :       CALL timestop(handle)
    1432              : 
    1433          216 :    END SUBROUTINE compute_load_list
    1434              : ! **************************************************************************************************
    1435              : !> \brief small helper function to return the proper index in the list array
    1436              : !>
    1437              : !> \param list ...
    1438              : !> \param rank ...
    1439              : !> \param dest ...
    1440              : !> \return ...
    1441              : !> \par History
    1442              : !>      created 2008-10-06 [Joost VandeVondele]
    1443              : ! **************************************************************************************************
    1444         2760 :    INTEGER FUNCTION list_index(list, rank, dest)
    1445              :       INTEGER, DIMENSION(:, :, 0:), INTENT(IN)           :: list
    1446              :       INTEGER, INTENT(IN)                                :: rank, dest
    1447              : 
    1448         2760 :       list_index = 1
    1449         1424 :       DO
    1450         4184 :          IF (list(1, list_index, dest) == rank) EXIT
    1451         1424 :          list_index = list_index + 1
    1452              :       END DO
    1453         2760 :    END FUNCTION list_index
    1454              : ! **************************************************************************************************
    1455              : !> \brief create a list with possible destinations (i.e. the central cpu and neighbors) for each cpu
    1456              : !>        note that we allocate it with an additional field to store the load of this destination
    1457              : !>
    1458              : !> \param list ...
    1459              : !> \param rs_descs ...
    1460              : !> \param grid_level ...
    1461              : !> \par History
    1462              : !>      created 2008-10-06 [Joost VandeVondele]
    1463              : ! **************************************************************************************************
    1464           54 :    SUBROUTINE create_destination_list(list, rs_descs, grid_level)
    1465              :       INTEGER, DIMENSION(:, :, :), POINTER               :: list
    1466              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1467              :          POINTER                                         :: rs_descs
    1468              :       INTEGER, INTENT(IN)                                :: grid_level
    1469              : 
    1470              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_destination_list'
    1471              : 
    1472              :       INTEGER                                            :: handle, i, icpu, j, maxcount, ncpu, &
    1473              :                                                             ultimate_max
    1474           54 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: index, sublist
    1475              : 
    1476           54 :       CALL timeset(routineN, handle)
    1477              : 
    1478           54 :       CPASSERT(.NOT. ASSOCIATED(list))
    1479           54 :       ncpu = rs_descs(grid_level)%rs_desc%group_size
    1480           54 :       ultimate_max = 7
    1481              : 
    1482          162 :       ALLOCATE (list(2, ultimate_max, 0:ncpu - 1))
    1483              : 
    1484           54 :       ALLOCATE (INDEX(ultimate_max))
    1485           54 :       ALLOCATE (sublist(ultimate_max))
    1486          432 :       sublist = HUGE(sublist)
    1487              : 
    1488           54 :       maxcount = 1
    1489          162 :       DO icpu = 0, ncpu - 1
    1490          108 :          sublist(1) = icpu
    1491          108 :          sublist(2) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [-1, 0, 0])
    1492          108 :          sublist(3) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [+1, 0, 0])
    1493          108 :          sublist(4) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, -1, 0])
    1494          108 :          sublist(5) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, +1, 0])
    1495          108 :          sublist(6) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, 0, -1])
    1496          108 :          sublist(7) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, 0, +1])
    1497              :          ! only retain unique values of the destination
    1498          108 :          CALL sort(sublist, ultimate_max, index)
    1499          108 :          j = 1
    1500          756 :          DO i = 2, 7
    1501          756 :             IF (sublist(i) /= sublist(j)) THEN
    1502          108 :                j = j + 1
    1503          108 :                sublist(j) = sublist(i)
    1504              :             END IF
    1505              :          END DO
    1506          108 :          maxcount = MAX(maxcount, j)
    1507          648 :          sublist(j + 1:ultimate_max) = HUGE(sublist)
    1508          864 :          list(1, :, icpu) = sublist
    1509          918 :          list(2, :, icpu) = 0
    1510              :       END DO
    1511              : 
    1512           54 :       CALL reallocate(list, 1, 2, 1, maxcount, 0, ncpu - 1)
    1513              : 
    1514           54 :       CALL timestop(handle)
    1515              : 
    1516          108 :    END SUBROUTINE create_destination_list
    1517              : 
    1518              : ! **************************************************************************************************
    1519              : !> \brief given a task list, compute the load of each process everywhere
    1520              : !>        giving this function the ability to loop over a (sub)set of rs_grids,
    1521              : !>        and do all the communication in one shot, would speed it up
    1522              : !> \param loads ...
    1523              : !> \param rs_descs ...
    1524              : !> \param grid_level ...
    1525              : !> \param ntasks ...
    1526              : !> \param tasks ...
    1527              : !> \param use_reordered_ranks ...
    1528              : !> \par History
    1529              : !>      none
    1530              : !> \author MattW 21/11/2007
    1531              : ! **************************************************************************************************
    1532          480 :    SUBROUTINE get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks)
    1533              :       INTEGER(KIND=int_8), DIMENSION(:)                  :: loads
    1534              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1535              :          POINTER                                         :: rs_descs
    1536              :       INTEGER                                            :: grid_level, ntasks
    1537              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1538              :       LOGICAL, INTENT(IN)                                :: use_reordered_ranks
    1539              : 
    1540              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'get_current_loads'
    1541              : 
    1542              :       INTEGER                                            :: handle, i, iatom, ilevel, img, ipgf, &
    1543              :                                                             iset, jatom, jpgf, jset
    1544              :       INTEGER(KIND=int_8)                                :: total_cost_local
    1545              :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: recv_buf_i, send_buf_i
    1546              :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    1547              : 
    1548          480 :       CALL timeset(routineN, handle)
    1549              : 
    1550          480 :       desc => rs_descs(grid_level)%rs_desc
    1551              : 
    1552              :       ! allocate local arrays
    1553         1440 :       ALLOCATE (send_buf_i(desc%group_size))
    1554          960 :       ALLOCATE (recv_buf_i(desc%group_size))
    1555              : 
    1556              :       ! communication step 1 : compute the total local cost of the tasks
    1557              :       !                        each proc needs to know the amount of work he will receive
    1558              : 
    1559              :       ! send buffer now contains for each target the cost of the tasks it will receive
    1560         1440 :       send_buf_i = 0
    1561        36792 :       DO i = 1, ntasks
    1562        36312 :          ilevel = tasks(i)%grid_level
    1563        36312 :          img = tasks(i)%image
    1564        36312 :          iatom = tasks(i)%iatom
    1565        36312 :          jatom = tasks(i)%jatom
    1566        36312 :          iset = tasks(i)%iset
    1567        36312 :          jset = tasks(i)%jset
    1568        36312 :          ipgf = tasks(i)%ipgf
    1569        36312 :          jpgf = tasks(i)%jpgf
    1570        36312 :          IF (ilevel /= grid_level) CYCLE
    1571        10476 :          IF (use_reordered_ranks) THEN
    1572              :             send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) = &
    1573              :                send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) &
    1574         3618 :                + tasks(i)%cost
    1575              :          ELSE
    1576              :             send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) = &
    1577              :                send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) &
    1578         6378 :                + tasks(i)%cost
    1579              :          END IF
    1580              :       END DO
    1581          480 :       CALL desc%group%alltoall(send_buf_i, recv_buf_i, 1)
    1582              : 
    1583              :       ! communication step 2 : compute the global cost of the tasks
    1584         1440 :       total_cost_local = SUM(recv_buf_i)
    1585              : 
    1586              :       ! after this step, the recv buffer contains the local cost for each CPU
    1587         1440 :       CALL desc%group%allgather(total_cost_local, loads)
    1588              : 
    1589          480 :       CALL timestop(handle)
    1590              : 
    1591          960 :    END SUBROUTINE get_current_loads
    1592              : ! **************************************************************************************************
    1593              : !> \brief performs load balancing shifting tasks on the replicated grids
    1594              : !>        this modifies the destination of some of the tasks on replicated
    1595              : !>        grids, and in this way balances the load
    1596              : !> \param rs_descs ...
    1597              : !> \param ntasks ...
    1598              : !> \param tasks ...
    1599              : !> \par History
    1600              : !>      none
    1601              : !> \author MattW 21/11/2007
    1602              : ! **************************************************************************************************
    1603           48 :    SUBROUTINE load_balance_replicated(rs_descs, ntasks, tasks)
    1604              : 
    1605              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1606              :          POINTER                                         :: rs_descs
    1607              :       INTEGER                                            :: ntasks
    1608              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1609              : 
    1610              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_replicated'
    1611              : 
    1612              :       INTEGER                                            :: handle, i, iatom, ilevel, img, ipgf, &
    1613              :                                                             iset, j, jatom, jpgf, jset, &
    1614              :                                                             no_overloaded, no_underloaded, &
    1615              :                                                             proc_receiving
    1616              :       INTEGER(KIND=int_8)                                :: average_cost, cost_task_rep, count, &
    1617              :                                                             offset, total_cost_global
    1618           48 :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: load_imbalance, loads, recv_buf_i
    1619           48 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: index
    1620              :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    1621              : 
    1622           48 :       CALL timeset(routineN, handle)
    1623              : 
    1624           48 :       desc => rs_descs(1)%rs_desc
    1625              : 
    1626              :       ! allocate local arrays
    1627          144 :       ALLOCATE (recv_buf_i(desc%group_size))
    1628           96 :       ALLOCATE (loads(desc%group_size))
    1629              : 
    1630          144 :       recv_buf_i = 0
    1631          234 :       DO i = 1, SIZE(rs_descs)
    1632          186 :          CALL get_current_loads(loads, rs_descs, i, ntasks, tasks, use_reordered_ranks=.TRUE.)
    1633          606 :          recv_buf_i(:) = recv_buf_i + loads
    1634              :       END DO
    1635              : 
    1636          144 :       total_cost_global = SUM(recv_buf_i)
    1637           48 :       average_cost = total_cost_global/desc%group_size
    1638              : 
    1639              :       !
    1640              :       ! compute how to redistribute the replicated tasks so that the average cost is reached
    1641              :       !
    1642              : 
    1643              :       ! load imbalance measures the load of a given CPU relative
    1644              :       ! to the optimal load distribution (load=average)
    1645          144 :       ALLOCATE (load_imbalance(desc%group_size))
    1646          144 :       ALLOCATE (INDEX(desc%group_size))
    1647              : 
    1648          144 :       load_imbalance(:) = recv_buf_i - average_cost
    1649           48 :       no_overloaded = 0
    1650           48 :       no_underloaded = 0
    1651              : 
    1652          144 :       DO i = 1, desc%group_size
    1653           96 :          IF (load_imbalance(i) > 0) no_overloaded = no_overloaded + 1
    1654          144 :          IF (load_imbalance(i) < 0) no_underloaded = no_underloaded + 1
    1655              :       END DO
    1656              : 
    1657              :       ! sort the recv_buffer on number of tasks, gives us index which provides a
    1658              :       ! mapping between processor ranks and how overloaded the processor
    1659           48 :       CALL sort(recv_buf_i, SIZE(recv_buf_i), index)
    1660              : 
    1661              :       ! find out the number of replicated tasks each proc has
    1662              :       ! but only those tasks which have not yet been assigned
    1663           48 :       cost_task_rep = 0
    1664         3666 :       DO i = 1, ntasks
    1665              :          IF (tasks(i)%dist_type == 0 &
    1666         3666 :              .AND. decode_rank(tasks(i)%destination, SIZE(rs_descs)) == decode_rank(tasks(i)%source, SIZE(rs_descs))) THEN
    1667         2238 :             cost_task_rep = cost_task_rep + tasks(i)%cost
    1668              :          END IF
    1669              :       END DO
    1670              : 
    1671              :       ! now, correct the load imbalance for the overloaded CPUs
    1672              :       ! they will send away not more than the total load of replicated tasks
    1673           48 :       CALL desc%group%allgather(cost_task_rep, recv_buf_i)
    1674              : 
    1675          144 :       DO i = 1, desc%group_size
    1676              :          ! At the moment we can only offload replicated tasks
    1677           96 :          IF (load_imbalance(i) > 0) &
    1678           96 :             load_imbalance(i) = MIN(load_imbalance(i), recv_buf_i(i))
    1679              :       END DO
    1680              : 
    1681              :       ! simplest algorithm I can think of of is that the processor with the most
    1682              :       ! excess tasks fills up the process needing most, then moves on to next most.
    1683              :       ! At the moment if we've got less replicated tasks than we're overloaded then
    1684              :       ! task balancing will be incomplete
    1685              : 
    1686              :       ! only need to do anything if I've excess tasks
    1687           48 :       IF (load_imbalance(desc%my_pos + 1) > 0) THEN
    1688              : 
    1689           22 :          count = 0 ! weighted amount of tasks offloaded
    1690           22 :          offset = 0 ! no of underloaded processes already filled by other more overloaded procs
    1691              : 
    1692              :          ! calculate offset
    1693           22 :          DO i = desc%group_size, desc%group_size - no_overloaded + 1, -1
    1694           22 :             IF (INDEX(i) == desc%my_pos + 1) THEN
    1695              :                EXIT
    1696              :             ELSE
    1697            0 :                offset = offset + load_imbalance(INDEX(i))
    1698              :             END IF
    1699              :          END DO
    1700              : 
    1701              :          ! find my starting processor to send to
    1702           22 :          proc_receiving = HUGE(proc_receiving)
    1703           22 :          DO i = 1, no_underloaded
    1704           22 :             offset = offset + load_imbalance(INDEX(i))
    1705           22 :             IF (offset <= 0) THEN
    1706              :                proc_receiving = i
    1707              :                EXIT
    1708              :             END IF
    1709              :          END DO
    1710              : 
    1711              :          ! offset now contains minus the number of tasks proc_receiving requires
    1712              :          ! we fill this up by adjusting the destination of tasks on the replicated grid,
    1713              :          ! then move to next most underloaded proc
    1714         1380 :          DO j = 1, ntasks
    1715              :             IF (tasks(j)%dist_type == 0 &
    1716         1380 :                 .AND. decode_rank(tasks(j)%destination, SIZE(rs_descs)) == decode_rank(tasks(j)%source, SIZE(rs_descs))) THEN
    1717              :                ! just avoid sending to non existing procs due to integer truncation
    1718              :                ! in the computation of the average
    1719          808 :                IF (proc_receiving > no_underloaded) EXIT
    1720              :                ! set new destination
    1721          808 :                ilevel = tasks(j)%grid_level
    1722          808 :                img = tasks(j)%image
    1723          808 :                iatom = tasks(j)%iatom
    1724          808 :                jatom = tasks(j)%jatom
    1725          808 :                iset = tasks(j)%iset
    1726          808 :                jset = tasks(j)%jset
    1727          808 :                ipgf = tasks(j)%ipgf
    1728          808 :                jpgf = tasks(j)%jpgf
    1729          808 :                tasks(j)%destination = encode_rank(INDEX(proc_receiving) - 1, ilevel, SIZE(rs_descs))
    1730          808 :                offset = offset + tasks(j)%cost
    1731          808 :                count = count + tasks(j)%cost
    1732          808 :                IF (count >= load_imbalance(desc%my_pos + 1)) EXIT
    1733          786 :                IF (offset > 0) THEN
    1734            0 :                   proc_receiving = proc_receiving + 1
    1735              :                   ! just avoid sending to non existing procs due to integer truncation
    1736              :                   ! in the computation of the average
    1737            0 :                   IF (proc_receiving > no_underloaded) EXIT
    1738            0 :                   offset = load_imbalance(INDEX(proc_receiving))
    1739              :                END IF
    1740              :             END IF
    1741              :          END DO
    1742              :       END IF
    1743              : 
    1744           48 :       DEALLOCATE (index)
    1745           48 :       DEALLOCATE (load_imbalance)
    1746              : 
    1747           48 :       CALL timestop(handle)
    1748              : 
    1749           96 :    END SUBROUTINE load_balance_replicated
    1750              : 
    1751              : ! **************************************************************************************************
    1752              : !> \brief given an input task list, redistribute so that all tasks can be processed locally,
    1753              : !>        i.e. dest equals rank
    1754              : !> \param rs_descs ...
    1755              : !> \param ntasks ...
    1756              : !> \param tasks ...
    1757              : !> \param ntasks_recv ...
    1758              : !> \param tasks_recv ...
    1759              : !> \par History
    1760              : !>      none
    1761              : !> \author MattW 21/11/2007
    1762              : ! **************************************************************************************************
    1763           48 :    SUBROUTINE create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
    1764              : 
    1765              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1766              :          POINTER                                         :: rs_descs
    1767              :       INTEGER                                            :: ntasks
    1768              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1769              :       INTEGER                                            :: ntasks_recv
    1770              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks_recv
    1771              : 
    1772              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_local_tasks'
    1773              : 
    1774              :       INTEGER                                            :: handle, i, j, k, l, rank
    1775              :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: recv_buf, send_buf
    1776              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: recv_disps, recv_sizes, send_disps, &
    1777              :                                                             send_sizes
    1778              :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    1779              : 
    1780           48 :       CALL timeset(routineN, handle)
    1781              : 
    1782           48 :       desc => rs_descs(1)%rs_desc
    1783              : 
    1784              :       ! allocate local arrays
    1785          144 :       ALLOCATE (send_sizes(desc%group_size))
    1786           96 :       ALLOCATE (recv_sizes(desc%group_size))
    1787           96 :       ALLOCATE (send_disps(desc%group_size))
    1788           96 :       ALLOCATE (recv_disps(desc%group_size))
    1789          144 :       ALLOCATE (send_buf(desc%group_size))
    1790           96 :       ALLOCATE (recv_buf(desc%group_size))
    1791              : 
    1792              :       ! fill send buffer, now counting how many tasks will be send (stored in an int8 array for convenience only).
    1793          144 :       send_buf = 0
    1794         3666 :       DO i = 1, ntasks
    1795              :          rank = rs_descs(decode_level(tasks(i)%destination, SIZE(rs_descs))) &
    1796         3618 :                 %rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs)))
    1797         3666 :          send_buf(rank + 1) = send_buf(rank + 1) + 1
    1798              :       END DO
    1799              : 
    1800           48 :       CALL desc%group%alltoall(send_buf, recv_buf, 1)
    1801              : 
    1802              :       ! pack the tasks, and send them around
    1803              : 
    1804          144 :       send_sizes = 0
    1805          144 :       send_disps = 0
    1806          144 :       recv_sizes = 0
    1807          144 :       recv_disps = 0
    1808              : 
    1809           48 :       send_sizes(1) = INT(send_buf(1)*task_size_in_int8)
    1810           48 :       recv_sizes(1) = INT(recv_buf(1)*task_size_in_int8)
    1811           96 :       DO i = 2, desc%group_size
    1812           48 :          send_sizes(i) = INT(send_buf(i)*task_size_in_int8)
    1813           48 :          recv_sizes(i) = INT(recv_buf(i)*task_size_in_int8)
    1814           48 :          send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
    1815           96 :          recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
    1816              :       END DO
    1817              : 
    1818              :       ! deallocate old send/recv buffers
    1819           48 :       DEALLOCATE (send_buf)
    1820           48 :       DEALLOCATE (recv_buf)
    1821              : 
    1822              :       ! allocate them with new sizes
    1823          233 :       ALLOCATE (send_buf(SUM(send_sizes)))
    1824          238 :       ALLOCATE (recv_buf(SUM(recv_sizes)))
    1825              : 
    1826              :       ! do packing
    1827        61554 :       send_buf = 0
    1828          144 :       send_sizes = 0
    1829         3666 :       DO j = 1, ntasks
    1830              :          i = rs_descs(decode_level(tasks(j)%destination, SIZE(rs_descs))) &
    1831         3618 :              %rs_desc%virtual2real(decode_rank(tasks(j)%destination, SIZE(rs_descs))) + 1
    1832         3618 :          l = send_disps(i) + send_sizes(i)
    1833         3618 :          CALL serialize_task(tasks(j), send_buf(l + 1:l + task_size_in_int8))
    1834         3666 :          send_sizes(i) = send_sizes(i) + task_size_in_int8
    1835              :       END DO
    1836              : 
    1837              :       ! do communication
    1838           48 :       CALL desc%group%alltoall(send_buf, send_sizes, send_disps, recv_buf, recv_sizes, recv_disps)
    1839              : 
    1840           48 :       DEALLOCATE (send_buf)
    1841              : 
    1842          144 :       ntasks_recv = SUM(recv_sizes)/task_size_in_int8
    1843         3904 :       ALLOCATE (tasks_recv(ntasks_recv))
    1844              : 
    1845              :       ! do unpacking
    1846           48 :       l = 0
    1847          144 :       DO i = 1, desc%group_size
    1848         3762 :          DO j = 0, recv_sizes(i)/task_size_in_int8 - 1
    1849         3618 :             l = l + 1
    1850         3618 :             k = recv_disps(i) + j*task_size_in_int8
    1851         3714 :             CALL deserialize_task(tasks_recv(l), recv_buf(k + 1:k + task_size_in_int8))
    1852              :          END DO
    1853              :       END DO
    1854              : 
    1855           48 :       DEALLOCATE (recv_buf)
    1856           48 :       DEALLOCATE (send_sizes)
    1857           48 :       DEALLOCATE (recv_sizes)
    1858           48 :       DEALLOCATE (send_disps)
    1859           48 :       DEALLOCATE (recv_disps)
    1860              : 
    1861           48 :       CALL timestop(handle)
    1862              : 
    1863           48 :    END SUBROUTINE create_local_tasks
    1864              : 
    1865              : ! **************************************************************************************************
    1866              : !> \brief Assembles tasks to be performed on local grid
    1867              : !> \param rs_descs the grids
    1868              : !> \param ntasks Number of tasks for local processing
    1869              : !> \param natoms ...
    1870              : !> \param nimages ...
    1871              : !> \param tasks the task set generated on this processor
    1872              : !> \param rval ...
    1873              : !> \param atom_pair_send ...
    1874              : !> \param atom_pair_recv ...
    1875              : !> \param symmetric ...
    1876              : !> \param reorder_rs_grid_ranks ...
    1877              : !> \param skip_load_balance_distributed ...
    1878              : !> \par History
    1879              : !>      none
    1880              : !> \author MattW 21/11/2007
    1881              : ! **************************************************************************************************
    1882        16504 :    SUBROUTINE distribute_tasks(rs_descs, ntasks, natoms, &
    1883              :                                tasks, atom_pair_send, atom_pair_recv, &
    1884              :                                symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed)
    1885              : 
    1886              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1887              :          POINTER                                         :: rs_descs
    1888              :       INTEGER                                            :: ntasks, natoms
    1889              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1890              :       TYPE(atom_pair_type), DIMENSION(:), POINTER        :: atom_pair_send, atom_pair_recv
    1891              :       LOGICAL, INTENT(IN)                                :: symmetric, reorder_rs_grid_ranks, &
    1892              :                                                             skip_load_balance_distributed
    1893              : 
    1894              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'distribute_tasks'
    1895              : 
    1896              :       INTEGER                                            :: handle, igrid_level, irank, ntasks_recv
    1897              :       INTEGER(KIND=int_8)                                :: load_gap, max_load, replicated_load
    1898        16504 :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: total_loads, total_loads_tmp, trial_loads
    1899        16504 :       INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: loads
    1900        16504 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: indices, real2virtual, total_index
    1901              :       LOGICAL                                            :: distributed_grids, fixed_first_grid
    1902              :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    1903        16504 :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks_recv
    1904              : 
    1905        16504 :       CALL timeset(routineN, handle)
    1906              : 
    1907        16504 :       CPASSERT(ASSOCIATED(tasks))
    1908              : 
    1909              :       ! *** figure out if we have distributed grids
    1910        16504 :       distributed_grids = .FALSE.
    1911        82024 :       DO igrid_level = 1, SIZE(rs_descs)
    1912        82024 :          IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
    1913           54 :             distributed_grids = .TRUE.
    1914              :          END IF
    1915              :       END DO
    1916        16504 :       desc => rs_descs(1)%rs_desc
    1917              : 
    1918        16504 :       IF (distributed_grids) THEN
    1919              : 
    1920          192 :          ALLOCATE (loads(0:desc%group_size - 1, SIZE(rs_descs)))
    1921          144 :          ALLOCATE (total_loads(0:desc%group_size - 1))
    1922              : 
    1923          144 :          total_loads = 0
    1924              : 
    1925              :          ! First round of balancing on the distributed grids
    1926              :          ! we just balance each of the distributed grids independently
    1927          234 :          DO igrid_level = 1, SIZE(rs_descs)
    1928          234 :             IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
    1929              : 
    1930           54 :                IF (.NOT. skip_load_balance_distributed) &
    1931           54 :                   CALL load_balance_distributed(tasks, ntasks, rs_descs, igrid_level, natoms)
    1932              : 
    1933              :                CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
    1934           54 :                                       tasks, use_reordered_ranks=.FALSE.)
    1935              : 
    1936          162 :                total_loads(:) = total_loads + loads(:, igrid_level)
    1937              : 
    1938              :             END IF
    1939              :          END DO
    1940              : 
    1941              :          ! calculate the total load of replicated tasks, so we can decide if it is worth
    1942              :          ! reordering the distributed grid levels
    1943              : 
    1944           48 :          replicated_load = 0
    1945          234 :          DO igrid_level = 1, SIZE(rs_descs)
    1946          234 :             IF (.NOT. rs_descs(igrid_level)%rs_desc%distributed) THEN
    1947              :                CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
    1948          132 :                                       tasks, use_reordered_ranks=.FALSE.)
    1949          396 :                replicated_load = replicated_load + SUM(loads(:, igrid_level))
    1950              :             END IF
    1951              :          END DO
    1952              : 
    1953              :          !IF (desc%my_pos==0) THEN
    1954              :          ! WRITE(*,*) "Total replicated load is ",replicated_load
    1955              :          !END IF
    1956              : 
    1957              :          ! Now we adjust the rank ordering based on the current loads
    1958              :          ! we leave the first distributed level and all the replicated levels in the default order
    1959           48 :          IF (reorder_rs_grid_ranks) THEN
    1960           48 :             fixed_first_grid = .FALSE.
    1961          234 :             DO igrid_level = 1, SIZE(rs_descs)
    1962          234 :                IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
    1963           54 :                   IF (fixed_first_grid .EQV. .FALSE.) THEN
    1964          144 :                      total_loads(:) = loads(:, igrid_level)
    1965              :                      fixed_first_grid = .TRUE.
    1966              :                   ELSE
    1967           18 :                      ALLOCATE (trial_loads(0:desc%group_size - 1))
    1968              : 
    1969           18 :                      trial_loads(:) = total_loads + loads(:, igrid_level)
    1970           18 :                      max_load = MAXVAL(trial_loads)
    1971              :                      load_gap = 0
    1972           18 :                      DO irank = 0, desc%group_size - 1
    1973           18 :                         load_gap = load_gap + max_load - trial_loads(irank)
    1974              :                      END DO
    1975              : 
    1976              :                      ! If there is not enough replicated load to load balance well enough
    1977              :                      ! then we will reorder this grid level
    1978            6 :                      IF (load_gap > replicated_load*1.05_dp) THEN
    1979              : 
    1980           18 :                         ALLOCATE (indices(0:desc%group_size - 1))
    1981           12 :                         ALLOCATE (total_index(0:desc%group_size - 1))
    1982           12 :                         ALLOCATE (total_loads_tmp(0:desc%group_size - 1))
    1983           12 :                         ALLOCATE (real2virtual(0:desc%group_size - 1))
    1984              : 
    1985           18 :                         total_loads_tmp(:) = total_loads
    1986            6 :                         CALL sort(total_loads_tmp, desc%group_size, total_index)
    1987            6 :                         CALL sort(loads(:, igrid_level), desc%group_size, indices)
    1988              : 
    1989              :                         ! Reorder so that the rank with smallest load on this grid level is paired with
    1990              :                         ! the highest load in total
    1991           18 :                         DO irank = 0, desc%group_size - 1
    1992              :                            total_loads(total_index(irank) - 1) = total_loads(total_index(irank) - 1) + &
    1993           12 :                                                                  loads(desc%group_size - irank - 1, igrid_level)
    1994           18 :                            real2virtual(total_index(irank) - 1) = indices(desc%group_size - irank - 1) - 1
    1995              :                         END DO
    1996              : 
    1997            6 :                         CALL rs_grid_reorder_ranks(rs_descs(igrid_level)%rs_desc, real2virtual)
    1998              : 
    1999            6 :                         DEALLOCATE (indices)
    2000            6 :                         DEALLOCATE (total_index)
    2001            6 :                         DEALLOCATE (total_loads_tmp)
    2002            6 :                         DEALLOCATE (real2virtual)
    2003              :                      ELSE
    2004            0 :                         total_loads(:) = trial_loads
    2005              :                      END IF
    2006              : 
    2007            6 :                      DEALLOCATE (trial_loads)
    2008              : 
    2009              :                   END IF
    2010              :                END IF
    2011              :             END DO
    2012              :          END IF
    2013              : 
    2014              :          ! Now we use the replicated tasks to balance out the rest of the load
    2015           48 :          CALL load_balance_replicated(rs_descs, ntasks, tasks)
    2016              : 
    2017              :          !total_loads = 0
    2018              :          !DO igrid_level=1,SIZE(rs_descs)
    2019              :          !  CALL get_current_loads(loads(:,igrid_level), rs_descs, igrid_level, ntasks, &
    2020              :          !                         tasks, use_reordered_ranks=.TRUE.)
    2021              :          !  total_loads = total_loads + loads(:, igrid_level)
    2022              :          !END DO
    2023              : 
    2024              :          !IF (desc%my_pos==0) THEN
    2025              :          !  WRITE(*,*) ""
    2026              :          !  WRITE(*,*) "At the end of the load balancing procedure"
    2027              :          !  WRITE(*,*) "Maximum  load:",MAXVAL(total_loads)
    2028              :          !  WRITE(*,*) "Average  load:",SUM(total_loads)/SIZE(total_loads)
    2029              :          !  WRITE(*,*) "Minimum  load:",MINVAL(total_loads)
    2030              :          !ENDIF
    2031              : 
    2032              :          ! given a list of tasks, this will do the needed reshuffle so that all tasks will be local
    2033           48 :          CALL create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
    2034              : 
    2035              :          !
    2036              :          ! tasks list are complete, we can compute the list of atomic blocks (atom pairs)
    2037              :          ! we will be sending. These lists are needed for redistribute_matrix.
    2038              :          !
    2039           48 :          CALL get_atom_pair(atom_pair_send, tasks, ntasks=ntasks, send=.TRUE., symmetric=symmetric, rs_descs=rs_descs)
    2040              : 
    2041              :          ! natom_send=SIZE(atom_pair_send)
    2042              :          ! CALL desc%group%sum(natom_send)
    2043              :          ! IF (desc%my_pos==0) THEN
    2044              :          !     WRITE(*,*) ""
    2045              :          !     WRITE(*,*) "Total number of atomic blocks to be send:",natom_send
    2046              :          ! ENDIF
    2047              : 
    2048           48 :          CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.FALSE., symmetric=symmetric, rs_descs=rs_descs)
    2049              : 
    2050              :          ! cleanup, at this point we  don't need the original tasks anymore
    2051           48 :          DEALLOCATE (tasks)
    2052           48 :          DEALLOCATE (loads)
    2053           48 :          DEALLOCATE (total_loads)
    2054              : 
    2055              :       ELSE
    2056        16456 :          tasks_recv => tasks
    2057        16456 :          ntasks_recv = ntasks
    2058        16456 :          CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.FALSE., symmetric=symmetric, rs_descs=rs_descs)
    2059              :          ! not distributed, hence atom_pair_send not needed
    2060              :       END IF
    2061              : 
    2062              :       ! here we sort the task list we will process locally.
    2063        49229 :       ALLOCATE (indices(ntasks_recv))
    2064        16504 :       CALL tasks_sort(tasks_recv, ntasks_recv, indices)
    2065        16504 :       DEALLOCATE (indices)
    2066              : 
    2067              :       !
    2068              :       ! final lists are ready
    2069              :       !
    2070              : 
    2071        16504 :       tasks => tasks_recv
    2072        16504 :       ntasks = ntasks_recv
    2073              : 
    2074        16504 :       CALL timestop(handle)
    2075              : 
    2076        33008 :    END SUBROUTINE distribute_tasks
    2077              : 
    2078              : ! **************************************************************************************************
    2079              : !> \brief ...
    2080              : !> \param atom_pair ...
    2081              : !> \param my_tasks ...
    2082              : !> \param send ...
    2083              : !> \param symmetric ...
    2084              : !> \param natoms ...
    2085              : !> \param nimages ...
    2086              : !> \param rs_descs ...
    2087              : ! **************************************************************************************************
    2088        16552 :    SUBROUTINE get_atom_pair(atom_pair, tasks, ntasks, send, symmetric, rs_descs)
    2089              : 
    2090              :       TYPE(atom_pair_type), DIMENSION(:), POINTER        :: atom_pair
    2091              :       TYPE(task_type), DIMENSION(:), INTENT(INOUT)       :: tasks
    2092              :       INTEGER, INTENT(IN)                                :: ntasks
    2093              :       LOGICAL, INTENT(IN)                                :: send, symmetric
    2094              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), INTENT(IN) :: rs_descs
    2095              : 
    2096              :       INTEGER                                            :: i, ilevel, iatom, jatom, npairs, virt_rank
    2097        16552 :       INTEGER, DIMENSION(:), ALLOCATABLE                 :: indices
    2098        16552 :       TYPE(atom_pair_type), DIMENSION(:), ALLOCATABLE    :: atom_pair_tmp
    2099              : 
    2100        16552 :       CPASSERT(.NOT. ASSOCIATED(atom_pair))
    2101        16552 :       IF (ntasks == 0) THEN
    2102          290 :          ALLOCATE (atom_pair(0))
    2103              :          RETURN
    2104              :       END IF
    2105              : 
    2106              :       ! calculate list of atom pairs
    2107              :       ! fill pair list taking into account symmetry
    2108      8029301 :       ALLOCATE (atom_pair_tmp(ntasks))
    2109      7996777 :       DO i = 1, ntasks
    2110      7980515 :          atom_pair_tmp(i)%image = tasks(i)%image
    2111      7980515 :          iatom = tasks(i)%iatom
    2112      7980515 :          jatom = tasks(i)%jatom
    2113      7980515 :          IF (symmetric .AND. iatom > jatom) THEN
    2114              :             ! iatom / jatom swapped
    2115      3008071 :             atom_pair_tmp(i)%row = jatom
    2116      3008071 :             atom_pair_tmp(i)%col = iatom
    2117              :          ELSE
    2118      4972444 :             atom_pair_tmp(i)%row = iatom
    2119      4972444 :             atom_pair_tmp(i)%col = jatom
    2120              :          END IF
    2121              : 
    2122      7996777 :          IF (send) THEN
    2123              :             ! If sending, we need to use the 'real rank' as the pair has to be sent to the process which
    2124              :             ! actually has the correct part of the rs_grid to do the mapping
    2125         3618 :             ilevel = tasks(i)%grid_level
    2126         3618 :             virt_rank = decode_rank(tasks(i)%destination, SIZE(rs_descs))
    2127         3618 :             atom_pair_tmp(i)%rank = rs_descs(ilevel)%rs_desc%virtual2real(virt_rank)
    2128              :          ELSE
    2129              :             ! If we are receiving, then no conversion is needed as the rank is that of the process with the
    2130              :             ! required matrix block, and the ordering of the rs grid is irrelevant
    2131      7976897 :             atom_pair_tmp(i)%rank = decode_rank(tasks(i)%source, SIZE(rs_descs))
    2132              :          END IF
    2133              :       END DO
    2134              : 
    2135              :       ! find unique atom pairs that I'm sending/receiving
    2136        48786 :       ALLOCATE (indices(ntasks))
    2137        16262 :       CALL atom_pair_sort(atom_pair_tmp, ntasks, indices)
    2138        16262 :       npairs = 1
    2139        16262 :       tasks(indices(1))%pair_index = 1
    2140      7980515 :       DO i = 2, ntasks
    2141      7964253 :          IF (atom_pair_less_than(atom_pair_tmp(i - 1), atom_pair_tmp(i))) THEN
    2142       267845 :             npairs = npairs + 1
    2143       267845 :             atom_pair_tmp(npairs) = atom_pair_tmp(i)
    2144              :          END IF
    2145      7980515 :          tasks(indices(i))%pair_index = npairs
    2146              :       END DO
    2147        16262 :       DEALLOCATE (indices)
    2148              : 
    2149              :       ! Copy unique pairs to final location.
    2150       332893 :       ALLOCATE (atom_pair(npairs))
    2151       300369 :       atom_pair(:) = atom_pair_tmp(:npairs)
    2152        16262 :       DEALLOCATE (atom_pair_tmp)
    2153              : 
    2154              :    END SUBROUTINE get_atom_pair
    2155              : 
    2156              : ! **************************************************************************************************
    2157              : !> \brief redistributes the matrix so that it can be used in realspace operations
    2158              : !>        i.e. according to the task lists for collocate and integrate.
    2159              : !>        This routine can become a bottleneck in large calculations.
    2160              : !> \param rs_descs ...
    2161              : !> \param pmats ...
    2162              : !> \param atom_pair_send ...
    2163              : !> \param atom_pair_recv ...
    2164              : !> \param natoms ...
    2165              : !> \param nimages ...
    2166              : !> \param scatter ...
    2167              : !> \param hmats ...
    2168              : ! **************************************************************************************************
    2169            0 :    SUBROUTINE rs_distribute_matrix(rs_descs, pmats, atom_pair_send, atom_pair_recv, &
    2170              :                                    nimages, scatter, hmats)
    2171              : 
    2172              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    2173              :          POINTER                                         :: rs_descs
    2174              :       TYPE(dbcsr_p_type), DIMENSION(:), POINTER          :: pmats
    2175              :       TYPE(atom_pair_type), DIMENSION(:), POINTER        :: atom_pair_send, atom_pair_recv
    2176              :       INTEGER                                            :: nimages
    2177              :       LOGICAL                                            :: scatter
    2178              :       TYPE(dbcsr_p_type), DIMENSION(:), OPTIONAL, &
    2179              :          POINTER                                         :: hmats
    2180              : 
    2181              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_distribute_matrix'
    2182              : 
    2183              :       INTEGER                                            :: acol, arow, handle, i, img, j, k, l, me, &
    2184              :                                                             nblkcols_total, nblkrows_total, ncol, &
    2185              :                                                             nrow, nthread, nthread_left
    2186            0 :       INTEGER, ALLOCATABLE, DIMENSION(:) :: first_col, first_row, last_col, last_row, recv_disps, &
    2187            0 :                                             recv_pair_count, recv_pair_disps, recv_sizes, send_disps, send_pair_count, &
    2188            0 :                                             send_pair_disps, send_sizes
    2189            0 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, row_blk_size
    2190              :       LOGICAL                                            :: found
    2191            0 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET   :: recv_buf_r, send_buf_r
    2192            0 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: h_block, p_block
    2193              :       TYPE(dbcsr_type), POINTER                          :: hmat, pmat
    2194              :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    2195            0 :       REAL(kind=dp), DIMENSION(:), POINTER                            :: vector
    2196              : 
    2197            0 : !$    INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
    2198              : 
    2199            0 :       CALL timeset(routineN, handle)
    2200              : 
    2201            0 :       IF (.NOT. scatter) THEN
    2202            0 :          CPASSERT(PRESENT(hmats))
    2203              :       END IF
    2204              : 
    2205            0 :       desc => rs_descs(1)%rs_desc
    2206            0 :       me = desc%my_pos + 1
    2207              : 
    2208              :       ! allocate local arrays
    2209            0 :       ALLOCATE (send_sizes(desc%group_size))
    2210            0 :       ALLOCATE (recv_sizes(desc%group_size))
    2211            0 :       ALLOCATE (send_disps(desc%group_size))
    2212            0 :       ALLOCATE (recv_disps(desc%group_size))
    2213            0 :       ALLOCATE (send_pair_count(desc%group_size))
    2214            0 :       ALLOCATE (recv_pair_count(desc%group_size))
    2215            0 :       ALLOCATE (send_pair_disps(desc%group_size))
    2216            0 :       ALLOCATE (recv_pair_disps(desc%group_size))
    2217              : 
    2218            0 :       pmat => pmats(1)%matrix
    2219              :       CALL dbcsr_get_info(pmat, &
    2220              :                           row_blk_size=row_blk_size, &
    2221              :                           col_blk_size=col_blk_size, &
    2222              :                           nblkrows_total=nblkrows_total, &
    2223            0 :                           nblkcols_total=nblkcols_total)
    2224            0 :       ALLOCATE (first_row(nblkrows_total), last_row(nblkrows_total), &
    2225            0 :                 first_col(nblkcols_total), last_col(nblkcols_total))
    2226            0 :       CALL dbcsr_convert_sizes_to_offsets(row_blk_size, first_row, last_row)
    2227            0 :       CALL dbcsr_convert_sizes_to_offsets(col_blk_size, first_col, last_col)
    2228              : 
    2229              :       ! set up send buffer sizes
    2230            0 :       send_sizes = 0
    2231            0 :       send_pair_count = 0
    2232            0 :       DO i = 1, SIZE(atom_pair_send)
    2233            0 :          k = atom_pair_send(i)%rank + 1 ! proc we're sending this block to
    2234            0 :          arow = atom_pair_send(i)%row
    2235            0 :          acol = atom_pair_send(i)%col
    2236            0 :          nrow = last_row(arow) - first_row(arow) + 1
    2237            0 :          ncol = last_col(acol) - first_col(acol) + 1
    2238            0 :          send_sizes(k) = send_sizes(k) + nrow*ncol
    2239            0 :          send_pair_count(k) = send_pair_count(k) + 1
    2240              :       END DO
    2241              : 
    2242            0 :       send_disps = 0
    2243            0 :       send_pair_disps = 0
    2244            0 :       DO i = 2, desc%group_size
    2245            0 :          send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
    2246            0 :          send_pair_disps(i) = send_pair_disps(i - 1) + send_pair_count(i - 1)
    2247              :       END DO
    2248              : 
    2249            0 :       ALLOCATE (send_buf_r(SUM(send_sizes)))
    2250              : 
    2251              :       ! set up recv buffer
    2252              : 
    2253            0 :       recv_sizes = 0
    2254            0 :       recv_pair_count = 0
    2255            0 :       DO i = 1, SIZE(atom_pair_recv)
    2256            0 :          k = atom_pair_recv(i)%rank + 1 ! proc we're receiving this data from
    2257            0 :          arow = atom_pair_recv(i)%row
    2258            0 :          acol = atom_pair_recv(i)%col
    2259            0 :          nrow = last_row(arow) - first_row(arow) + 1
    2260            0 :          ncol = last_col(acol) - first_col(acol) + 1
    2261            0 :          recv_sizes(k) = recv_sizes(k) + nrow*ncol
    2262            0 :          recv_pair_count(k) = recv_pair_count(k) + 1
    2263              :       END DO
    2264              : 
    2265            0 :       recv_disps = 0
    2266            0 :       recv_pair_disps = 0
    2267            0 :       DO i = 2, desc%group_size
    2268            0 :          recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
    2269            0 :          recv_pair_disps(i) = recv_pair_disps(i - 1) + recv_pair_count(i - 1)
    2270              :       END DO
    2271            0 :       ALLOCATE (recv_buf_r(SUM(recv_sizes)))
    2272              : 
    2273              : !$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
    2274              : !$OMP          SHARED(desc,send_pair_count,send_pair_disps,nimages),&
    2275              : !$OMP          SHARED(last_row,first_row,last_col,first_col),&
    2276              : !$OMP          SHARED(pmats,send_buf_r,send_disps,send_sizes),&
    2277              : !$OMP          SHARED(atom_pair_send,me,hmats,nblkrows_total),&
    2278              : !$OMP          SHARED(atom_pair_recv,recv_buf_r,scatter,recv_pair_disps), &
    2279              : !$OMP          SHARED(recv_sizes,recv_disps,recv_pair_count,locks), &
    2280              : !$OMP          PRIVATE(i,img,arow,acol,nrow,ncol,p_block,found,j,k,l),&
    2281            0 : !$OMP          PRIVATE(nthread,h_block,nthread_left,hmat,pmat,vector)
    2282              : 
    2283              :       nthread = 1
    2284              : !$    nthread = omp_get_num_threads()
    2285              :       nthread_left = 1
    2286              : !$    nthread_left = MAX(1, nthread - 1)
    2287              : 
    2288              :       ! do packing
    2289              : !$OMP DO schedule(guided)
    2290              :       DO l = 1, desc%group_size
    2291              :          IF (l == me) CYCLE
    2292              :          send_sizes(l) = 0
    2293              :          DO i = 1, send_pair_count(l)
    2294              :             arow = atom_pair_send(send_pair_disps(l) + i)%row
    2295              :             acol = atom_pair_send(send_pair_disps(l) + i)%col
    2296              :             img = atom_pair_send(send_pair_disps(l) + i)%image
    2297              :             nrow = last_row(arow) - first_row(arow) + 1
    2298              :             ncol = last_col(acol) - first_col(acol) + 1
    2299              :             pmat => pmats(img)%matrix
    2300              :             CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
    2301              :             CPASSERT(found)
    2302              : 
    2303              :             DO k = 1, ncol
    2304              :                DO j = 1, nrow
    2305              :                   send_buf_r(send_disps(l) + send_sizes(l) + j + (k - 1)*nrow) = p_block(j, k)
    2306              :                END DO
    2307              :             END DO
    2308              :             send_sizes(l) = send_sizes(l) + nrow*ncol
    2309              :          END DO
    2310              :       END DO
    2311              : !$OMP END DO
    2312              : 
    2313              :       IF (.NOT. scatter) THEN
    2314              :          ! We need locks to protect concurrent summation into H
    2315              : !$OMP SINGLE
    2316              : !$       ALLOCATE (locks(nthread*10))
    2317              : !$OMP END SINGLE
    2318              : 
    2319              : !$OMP DO
    2320              : !$       do i = 1, nthread*10
    2321              : !$          call omp_init_lock(locks(i))
    2322              : !$       end do
    2323              : !$OMP END DO
    2324              :       END IF
    2325              : 
    2326              : !$OMP MASTER
    2327              :       ! do communication
    2328              :       CALL desc%group%alltoall(send_buf_r, send_sizes, send_disps, &
    2329              :                                recv_buf_r, recv_sizes, recv_disps)
    2330              : !$OMP END MASTER
    2331              : 
    2332              :       ! If this is a scatter, then no need to copy local blocks,
    2333              :       ! If not, we sum them directly into H (bypassing the alltoall)
    2334              :       IF (.NOT. scatter) THEN
    2335              : 
    2336              :          ! Distribute work over remaining threads assuming one is still in the alltoall
    2337              : !$OMP DO schedule(dynamic,MAX(1,send_pair_count(me)/nthread_left))
    2338              :          DO i = 1, send_pair_count(me)
    2339              :             arow = atom_pair_send(send_pair_disps(me) + i)%row
    2340              :             acol = atom_pair_send(send_pair_disps(me) + i)%col
    2341              :             img = atom_pair_send(send_pair_disps(me) + i)%image
    2342              :             nrow = last_row(arow) - first_row(arow) + 1
    2343              :             ncol = last_col(acol) - first_col(acol) + 1
    2344              :             hmat => hmats(img)%matrix
    2345              :             pmat => pmats(img)%matrix
    2346              :             CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, BLOCK=h_block, found=found)
    2347              :             CPASSERT(found)
    2348              :             CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, BLOCK=p_block, found=found)
    2349              :             CPASSERT(found)
    2350              : 
    2351              : !$          call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
    2352              :             DO k = 1, ncol
    2353              :                DO j = 1, nrow
    2354              :                   h_block(j, k) = h_block(j, k) + p_block(j, k)
    2355              :                END DO
    2356              :             END DO
    2357              : !$          call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
    2358              :          END DO
    2359              : !$OMP END DO
    2360              :       ELSE
    2361              :          ! We will insert new blocks into P, so create mutable work matrices
    2362              :          DO img = 1, nimages
    2363              :             pmat => pmats(img)%matrix
    2364              :             CALL dbcsr_work_create(pmat, work_mutable=.TRUE., &
    2365              :                                    nblks_guess=SIZE(atom_pair_recv)/nthread, sizedata_guess=SIZE(recv_buf_r)/nthread, &
    2366              :                                    n=nthread)
    2367              :          END DO
    2368              :       END IF
    2369              : 
    2370              : ! wait for comm and setup to finish
    2371              : !$OMP BARRIER
    2372              : 
    2373              :       !do unpacking
    2374              : !$OMP DO schedule(guided)
    2375              :       DO l = 1, desc%group_size
    2376              :          IF (l == me) CYCLE
    2377              :          recv_sizes(l) = 0
    2378              :          DO i = 1, recv_pair_count(l)
    2379              :             arow = atom_pair_recv(recv_pair_disps(l) + i)%row
    2380              :             acol = atom_pair_recv(recv_pair_disps(l) + i)%col
    2381              :             img = atom_pair_recv(recv_pair_disps(l) + i)%image
    2382              :             nrow = last_row(arow) - first_row(arow) + 1
    2383              :             ncol = last_col(acol) - first_col(acol) + 1
    2384              :             pmat => pmats(img)%matrix
    2385              :             NULLIFY (p_block)
    2386              :             CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, BLOCK=p_block, found=found)
    2387              : 
    2388              :             IF (PRESENT(hmats)) THEN
    2389              :                hmat => hmats(img)%matrix
    2390              :                CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, BLOCK=h_block, found=found)
    2391              :                CPASSERT(found)
    2392              :             END IF
    2393              : 
    2394              :             IF (scatter .AND. .NOT. ASSOCIATED(p_block)) THEN
    2395              :                vector => recv_buf_r(recv_disps(l) + recv_sizes(l) + 1:recv_disps(l) + recv_sizes(l) + nrow*ncol)
    2396              :                CALL dbcsr_put_block(pmat, arow, acol, block=RESHAPE(vector, [nrow, ncol]))
    2397              :             END IF
    2398              :             IF (.NOT. scatter) THEN
    2399              : !$             call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
    2400              :                DO k = 1, ncol
    2401              :                   DO j = 1, nrow
    2402              :                      h_block(j, k) = h_block(j, k) + recv_buf_r(recv_disps(l) + recv_sizes(l) + j + (k - 1)*nrow)
    2403              :                   END DO
    2404              :                END DO
    2405              : !$             call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
    2406              :             END IF
    2407              :             recv_sizes(l) = recv_sizes(l) + nrow*ncol
    2408              :          END DO
    2409              :       END DO
    2410              : !$OMP END DO
    2411              : 
    2412              : !$    IF (.not. scatter) THEN
    2413              : !$OMP DO
    2414              : !$       do i = 1, nthread*10
    2415              : !$          call omp_destroy_lock(locks(i))
    2416              : !$       end do
    2417              : !$OMP END DO
    2418              : !$    END IF
    2419              : 
    2420              : !$OMP SINGLE
    2421              : !$    IF (.not. scatter) THEN
    2422              : !$       DEALLOCATE (locks)
    2423              : !$    END IF
    2424              : !$OMP END SINGLE NOWAIT
    2425              : 
    2426              :       IF (scatter) THEN
    2427              :          ! Blocks were added to P
    2428              :          DO img = 1, nimages
    2429              :             pmat => pmats(img)%matrix
    2430              :             CALL dbcsr_finalize(pmat)
    2431              :          END DO
    2432              :       END IF
    2433              : !$OMP END PARALLEL
    2434              : 
    2435            0 :       DEALLOCATE (send_buf_r)
    2436            0 :       DEALLOCATE (recv_buf_r)
    2437              : 
    2438            0 :       DEALLOCATE (send_sizes)
    2439            0 :       DEALLOCATE (recv_sizes)
    2440            0 :       DEALLOCATE (send_disps)
    2441            0 :       DEALLOCATE (recv_disps)
    2442            0 :       DEALLOCATE (send_pair_count)
    2443            0 :       DEALLOCATE (recv_pair_count)
    2444            0 :       DEALLOCATE (send_pair_disps)
    2445            0 :       DEALLOCATE (recv_pair_disps)
    2446              : 
    2447            0 :       DEALLOCATE (first_row, last_row, first_col, last_col)
    2448              : 
    2449            0 :       CALL timestop(handle)
    2450              : 
    2451            0 :    END SUBROUTINE rs_distribute_matrix
    2452              : 
    2453              : ! **************************************************************************************************
    2454              : !> \brief Calculates offsets and sizes for rs_scatter_matrix and rs_copy_matrix.
    2455              : !> \author Ole Schuett
    2456              : ! **************************************************************************************************
    2457        14158 :    SUBROUTINE rs_calc_offsets(pairs, nsgf, group_size, &
    2458              :                               pair_offsets, rank_offsets, rank_sizes, buffer_size)
    2459              :       TYPE(atom_pair_type), DIMENSION(:), INTENT(IN)     :: pairs
    2460              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: nsgf
    2461              :       INTEGER, INTENT(IN)                                :: group_size
    2462              :       INTEGER, DIMENSION(:), POINTER                     :: pair_offsets, rank_offsets, rank_sizes
    2463              :       INTEGER, INTENT(INOUT)                             :: buffer_size
    2464              : 
    2465              :       INTEGER                                            :: acol, arow, i, block_size, total_size, k, prev_k
    2466              : 
    2467        14158 :       IF (ASSOCIATED(pair_offsets)) DEALLOCATE (pair_offsets)
    2468        14158 :       IF (ASSOCIATED(rank_offsets)) DEALLOCATE (rank_offsets)
    2469        14158 :       IF (ASSOCIATED(rank_sizes)) DEALLOCATE (rank_sizes)
    2470              : 
    2471              :       ! calculate buffer_size and pair_offsets
    2472        42235 :       ALLOCATE (pair_offsets(SIZE(pairs)))
    2473        14158 :       total_size = 0
    2474       286847 :       DO i = 1, SIZE(pairs)
    2475       272689 :          pair_offsets(i) = total_size
    2476       272689 :          arow = pairs(i)%row
    2477       272689 :          acol = pairs(i)%col
    2478       272689 :          block_size = nsgf(arow)*nsgf(acol)
    2479       286847 :          total_size = total_size + block_size
    2480              :       END DO
    2481        14158 :       buffer_size = total_size
    2482              : 
    2483              :       ! calculate rank_offsets and rank_sizes
    2484        42474 :       ALLOCATE (rank_offsets(group_size))
    2485        28316 :       ALLOCATE (rank_sizes(group_size))
    2486        41254 :       rank_offsets = 0
    2487        41254 :       rank_sizes = 0
    2488        14158 :       IF (SIZE(pairs) > 0) THEN
    2489        13919 :          prev_k = pairs(1)%rank + 1
    2490       286608 :          DO i = 1, SIZE(pairs)
    2491       272689 :             k = pairs(i)%rank + 1
    2492       272689 :             CPASSERT(k >= prev_k) ! expecting the pairs to be ordered by rank
    2493       286608 :             IF (k > prev_k) THEN
    2494           61 :                rank_offsets(k) = pair_offsets(i)
    2495           61 :                rank_sizes(prev_k) = rank_offsets(k) - rank_offsets(prev_k)
    2496           61 :                prev_k = k
    2497              :             END IF
    2498              :          END DO
    2499        13919 :          rank_sizes(k) = buffer_size - rank_offsets(k) ! complete last rank
    2500              :       END IF
    2501              : 
    2502        14158 :    END SUBROUTINE rs_calc_offsets
    2503              : 
    2504              : ! **************************************************************************************************
    2505              : !> \brief Scatters dbcsr matrix blocks and receives them into a buffer as needed before collocation.
    2506              : !> \author Ole Schuett
    2507              : ! **************************************************************************************************
    2508          266 :    SUBROUTINE rs_scatter_matrices(src_matrices, dest_buffer, task_list, group)
    2509              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN)       :: src_matrices
    2510              :       TYPE(offload_buffer_type), INTENT(INOUT)           :: dest_buffer
    2511              :       TYPE(task_list_type), INTENT(IN)                   :: task_list
    2512              :       TYPE(mp_comm_type), INTENT(IN)                     :: group
    2513              : 
    2514              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_scatter_matrices'
    2515              : 
    2516              :       INTEGER                                            :: handle
    2517              :       REAL(KIND=dp), DIMENSION(:), ALLOCATABLE           :: buffer_send
    2518              : 
    2519          266 :       CALL timeset(routineN, handle)
    2520          777 :       ALLOCATE (buffer_send(task_list%buffer_size_send))
    2521              : 
    2522              :       ! pack dbcsr blocks into send buffer
    2523          266 :       CPASSERT(ASSOCIATED(task_list%atom_pair_send))
    2524              :       CALL rs_pack_buffer(src_matrices=src_matrices, &
    2525              :                           dest_buffer=buffer_send, &
    2526              :                           atom_pair=task_list%atom_pair_send, &
    2527          266 :                           pair_offsets=task_list%pair_offsets_send)
    2528              : 
    2529              :       ! mpi all-to-all communication, receiving directly into blocks_recv%buffer.
    2530              :       CALL group%alltoall(buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send, &
    2531              :                           dest_buffer%host_buffer, &
    2532         2394 :                           task_list%rank_sizes_recv, task_list%rank_offsets_recv)
    2533              : 
    2534          266 :       DEALLOCATE (buffer_send)
    2535          266 :       CALL timestop(handle)
    2536              : 
    2537          266 :    END SUBROUTINE rs_scatter_matrices
    2538              : 
    2539              : ! **************************************************************************************************
    2540              : !> \brief Gather the dbcsr matrix blocks and receives them into a buffer as needed after integration.
    2541              : !> \author Ole Schuett
    2542              : ! **************************************************************************************************
    2543          218 :    SUBROUTINE rs_gather_matrices(src_buffer, dest_matrices, task_list, group)
    2544              :       TYPE(offload_buffer_type), INTENT(IN)              :: src_buffer
    2545              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT)    :: dest_matrices
    2546              :       TYPE(task_list_type), INTENT(IN)                   :: task_list
    2547              :       TYPE(mp_comm_type), INTENT(IN)                     :: group
    2548              : 
    2549              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_gather_matrices'
    2550              : 
    2551              :       INTEGER                                            :: handle
    2552              :       REAL(KIND=dp), DIMENSION(:), ALLOCATABLE           :: buffer_send
    2553              : 
    2554          218 :       CALL timeset(routineN, handle)
    2555              : 
    2556              :       ! Caution: The meaning of send and recv are reversed in this routine.
    2557          640 :       ALLOCATE (buffer_send(task_list%buffer_size_send)) ! e.g. this is actually used for receiving
    2558              : 
    2559              :       ! mpi all-to-all communication
    2560              :       CALL group%alltoall(src_buffer%host_buffer, task_list%rank_sizes_recv, task_list%rank_offsets_recv, &
    2561         1962 :                           buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send)
    2562              : 
    2563              :       ! unpack dbcsr blocks from send buffer
    2564          218 :       CPASSERT(ASSOCIATED(task_list%atom_pair_send))
    2565              :       CALL rs_unpack_buffer(src_buffer=buffer_send, &
    2566              :                             dest_matrices=dest_matrices, &
    2567              :                             atom_pair=task_list%atom_pair_send, &
    2568          218 :                             pair_offsets=task_list%pair_offsets_send)
    2569              : 
    2570          218 :       DEALLOCATE (buffer_send)
    2571          218 :       CALL timestop(handle)
    2572              : 
    2573          218 :    END SUBROUTINE rs_gather_matrices
    2574              : 
    2575              : ! **************************************************************************************************
    2576              : !> \brief Copies the DBCSR blocks into buffer, replaces rs_scatter_matrix for non-distributed grids.
    2577              : !> \author Ole Schuett
    2578              : ! **************************************************************************************************
    2579       231477 :    SUBROUTINE rs_copy_to_buffer(src_matrices, dest_buffer, task_list)
    2580              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN)       :: src_matrices
    2581              :       TYPE(offload_buffer_type), INTENT(INOUT)           :: dest_buffer
    2582              :       TYPE(task_list_type), INTENT(IN)                   :: task_list
    2583              : 
    2584              :       CALL rs_pack_buffer(src_matrices=src_matrices, &
    2585              :                           dest_buffer=dest_buffer%host_buffer, &
    2586              :                           atom_pair=task_list%atom_pair_recv, &
    2587       231477 :                           pair_offsets=task_list%pair_offsets_recv)
    2588              : 
    2589       231477 :    END SUBROUTINE rs_copy_to_buffer
    2590              : 
    2591              : ! **************************************************************************************************
    2592              : !> \brief Copies from buffer into DBCSR matrics, replaces rs_gather_matrix for non-distributed grids.
    2593              : !> \author Ole Schuett
    2594              : ! **************************************************************************************************
    2595       190600 :    SUBROUTINE rs_copy_to_matrices(src_buffer, dest_matrices, task_list)
    2596              :       TYPE(offload_buffer_type), INTENT(IN)              :: src_buffer
    2597              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT)    :: dest_matrices
    2598              :       TYPE(task_list_type), INTENT(IN)                   :: task_list
    2599              : 
    2600              :       CALL rs_unpack_buffer(src_buffer=src_buffer%host_buffer, &
    2601              :                             dest_matrices=dest_matrices, &
    2602              :                             atom_pair=task_list%atom_pair_recv, &
    2603       190600 :                             pair_offsets=task_list%pair_offsets_recv)
    2604              : 
    2605       190600 :    END SUBROUTINE rs_copy_to_matrices
    2606              : 
    2607              : ! **************************************************************************************************
    2608              : !> \brief Helper routine for rs_scatter_matrix and rs_copy_to_buffer.
    2609              : !> \author Ole Schuett
    2610              : ! **************************************************************************************************
    2611       231743 :    SUBROUTINE rs_pack_buffer(src_matrices, dest_buffer, atom_pair, pair_offsets)
    2612              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN)       :: src_matrices
    2613              :       REAL(KIND=dp), DIMENSION(:), INTENT(INOUT)         :: dest_buffer
    2614              :       TYPE(atom_pair_type), DIMENSION(:), INTENT(IN)     :: atom_pair
    2615              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: pair_offsets
    2616              : 
    2617              :       INTEGER                                            :: acol, arow, img, i, offset, block_size
    2618              :       LOGICAL                                            :: found
    2619       231743 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: block
    2620              : 
    2621              : !$OMP PARALLEL DEFAULT(NONE), &
    2622              : !$OMP          SHARED(src_matrices,atom_pair,pair_offsets,dest_buffer), &
    2623       231743 : !$OMP          PRIVATE(acol,arow,img,i,offset,block_size,found,block)
    2624              : !$OMP DO schedule(guided)
    2625              :       DO i = 1, SIZE(atom_pair)
    2626              :          arow = atom_pair(i)%row
    2627              :          acol = atom_pair(i)%col
    2628              :          img = atom_pair(i)%image
    2629              :          CALL dbcsr_get_block_p(matrix=src_matrices(img)%matrix, row=arow, col=acol, &
    2630              :                                 block=block, found=found)
    2631              :          CPASSERT(found)
    2632              :          block_size = SIZE(block)
    2633              :          offset = pair_offsets(i)
    2634              :          dest_buffer(offset + 1:offset + block_size) = RESHAPE(block, shape=[block_size])
    2635              :       END DO
    2636              : !$OMP END DO
    2637              : !$OMP END PARALLEL
    2638              : 
    2639       231743 :    END SUBROUTINE rs_pack_buffer
    2640              : 
    2641              : ! **************************************************************************************************
    2642              : !> \brief Helper routine for rs_gather_matrix and rs_copy_to_matrices.
    2643              : !> \author Ole Schuett
    2644              : ! **************************************************************************************************
    2645       190818 :    SUBROUTINE rs_unpack_buffer(src_buffer, dest_matrices, atom_pair, pair_offsets)
    2646              :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: src_buffer
    2647              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT)    :: dest_matrices
    2648              :       TYPE(atom_pair_type), DIMENSION(:), INTENT(IN)     :: atom_pair
    2649              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: pair_offsets
    2650              : 
    2651              :       INTEGER                                            :: acol, arow, img, i, offset, &
    2652              :                                                             nrows, ncols, lock_num
    2653              :       LOGICAL                                            :: found
    2654       190818 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: block
    2655       190818 :       INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
    2656              : 
    2657              :       ! initialize locks
    2658       572454 :       ALLOCATE (locks(10*omp_get_max_threads()))
    2659      2098998 :       DO i = 1, SIZE(locks)
    2660      2098998 :          CALL omp_init_lock(locks(i))
    2661              :       END DO
    2662              : 
    2663              : !$OMP PARALLEL DEFAULT(NONE), &
    2664              : !$OMP          SHARED(src_buffer,atom_pair,pair_offsets,dest_matrices,locks), &
    2665       190818 : !$OMP          PRIVATE(acol,arow,img,i,offset,nrows,ncols,lock_num,found,block)
    2666              : !$OMP DO schedule(guided)
    2667              :       DO i = 1, SIZE(atom_pair)
    2668              :          arow = atom_pair(i)%row
    2669              :          acol = atom_pair(i)%col
    2670              :          img = atom_pair(i)%image
    2671              :          CALL dbcsr_get_block_p(matrix=dest_matrices(img)%matrix, row=arow, col=acol, &
    2672              :                                 block=block, found=found)
    2673              :          CPASSERT(found)
    2674              :          nrows = SIZE(block, 1)
    2675              :          ncols = SIZE(block, 2)
    2676              :          offset = pair_offsets(i)
    2677              :          lock_num = MODULO(arow, SIZE(locks)) + 1  ! map matrix rows round-robin to available locks
    2678              : 
    2679              :          CALL omp_set_lock(locks(lock_num))
    2680              :          block = block + RESHAPE(src_buffer(offset + 1:offset + nrows*ncols), shape=[nrows, ncols])
    2681              :          CALL omp_unset_lock(locks(lock_num))
    2682              :       END DO
    2683              : !$OMP END DO
    2684              : !$OMP END PARALLEL
    2685              : 
    2686              :       ! destroy locks
    2687      2098998 :       DO i = 1, SIZE(locks)
    2688      2098998 :          CALL omp_destroy_lock(locks(i))
    2689              :       END DO
    2690       190818 :       DEALLOCATE (locks)
    2691              : 
    2692       190818 :    END SUBROUTINE rs_unpack_buffer
    2693              : 
    2694              : ! **************************************************************************************************
    2695              : !> \brief determines the rank of the processor who's real rs grid contains point
    2696              : !> \param rs_desc ...
    2697              : !> \param igrid_level ...
    2698              : !> \param n_levels ...
    2699              : !> \param cube_center ...
    2700              : !> \param ntasks ...
    2701              : !> \param tasks ...
    2702              : !> \param lb_cube ...
    2703              : !> \param ub_cube ...
    2704              : !> \param added_tasks ...
    2705              : !> \par History
    2706              : !>      11.2007 created [MattW]
    2707              : !>      10.2008 rewritten [Joost VandeVondele]
    2708              : !> \author MattW
    2709              : ! **************************************************************************************************
    2710         1380 :    SUBROUTINE rs_find_node(rs_desc, igrid_level, n_levels, cube_center, ntasks, tasks, &
    2711              :                            lb_cube, ub_cube, added_tasks)
    2712              : 
    2713              :       TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
    2714              :       INTEGER, INTENT(IN)                                :: igrid_level, n_levels
    2715              :       INTEGER, DIMENSION(3), INTENT(IN)                  :: cube_center
    2716              :       INTEGER, INTENT(INOUT)                             :: ntasks
    2717              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    2718              :       INTEGER, DIMENSION(3), INTENT(IN)                  :: lb_cube, ub_cube
    2719              :       INTEGER, INTENT(OUT)                               :: added_tasks
    2720              : 
    2721              :       INTEGER, PARAMETER                                 :: add_tasks = 1000
    2722              :       REAL(kind=dp), PARAMETER                           :: mult_tasks = 2.0_dp
    2723              : 
    2724              :       INTEGER :: bit_index, coord(3), curr_tasks, dest, i, icoord(3), idest, itask, ix, iy, iz, &
    2725              :                  lb_coord(3), lb_domain(3), lbc(3), ub_coord(3), ub_domain(3), ubc(3)
    2726              :       INTEGER                                            :: bit_pattern
    2727              :       LOGICAL                                            :: dir_periodic(3)
    2728              : 
    2729         1380 :       coord(1) = rs_desc%x2coord(cube_center(1))
    2730         1380 :       coord(2) = rs_desc%y2coord(cube_center(2))
    2731         1380 :       coord(3) = rs_desc%z2coord(cube_center(3))
    2732         1380 :       dest = rs_desc%coord2rank(coord(1), coord(2), coord(3))
    2733              : 
    2734              :       ! the real cube coordinates
    2735         5520 :       lbc = lb_cube + cube_center
    2736         5520 :       ubc = ub_cube + cube_center
    2737              : 
    2738         9660 :       IF (ALL((rs_desc%lb_global(:, dest) - rs_desc%border) <= lbc) .AND. &
    2739              :           ALL((rs_desc%ub_global(:, dest) + rs_desc%border) >= ubc)) THEN
    2740              :          !standard distributed collocation/integration
    2741         1380 :          tasks(ntasks)%destination = encode_rank(dest, igrid_level, n_levels)
    2742         1380 :          tasks(ntasks)%dist_type = 1
    2743         1380 :          tasks(ntasks)%subpatch_pattern = 0
    2744         1380 :          added_tasks = 1
    2745              : 
    2746              :          ! here we figure out if there is an alternate location for this task
    2747              :          ! i.e. even though the cube_center is not in the real local domain,
    2748              :          ! it might fully fit in the halo of the neighbor
    2749              :          ! if its radius is smaller than the maximum radius
    2750              :          ! the list of possible neighbors is stored here as a bit pattern
    2751              :          ! to reconstruct what the rank of the neigbor is,
    2752              :          ! we can use (note this requires the correct rs_grid)
    2753              :          !  IF (BTEST(bit_pattern,0)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[-1,0,0])
    2754              :          !  IF (BTEST(bit_pattern,1)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[+1,0,0])
    2755              :          !  IF (BTEST(bit_pattern,2)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,-1,0])
    2756              :          !  IF (BTEST(bit_pattern,3)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,+1,0])
    2757              :          !  IF (BTEST(bit_pattern,4)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,0,-1])
    2758              :          !  IF (BTEST(bit_pattern,5)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,0,+1])
    2759         1380 :          bit_index = 0
    2760         1380 :          bit_pattern = 0
    2761         5520 :          DO i = 1, 3
    2762         5520 :             IF (rs_desc%perd(i) == 1) THEN
    2763         2760 :                bit_pattern = IBCLR(bit_pattern, bit_index)
    2764         2760 :                bit_index = bit_index + 1
    2765         2760 :                bit_pattern = IBCLR(bit_pattern, bit_index)
    2766         2760 :                bit_index = bit_index + 1
    2767              :             ELSE
    2768              :                ! fits the left neighbor ?
    2769         1380 :                IF (ubc(i) <= rs_desc%lb_global(i, dest) - 1 + rs_desc%border) THEN
    2770          599 :                   bit_pattern = IBSET(bit_pattern, bit_index)
    2771          599 :                   bit_index = bit_index + 1
    2772              :                ELSE
    2773          781 :                   bit_pattern = IBCLR(bit_pattern, bit_index)
    2774          781 :                   bit_index = bit_index + 1
    2775              :                END IF
    2776              :                ! fits the right neighbor ?
    2777         1380 :                IF (lbc(i) >= rs_desc%ub_global(i, dest) + 1 - rs_desc%border) THEN
    2778          250 :                   bit_pattern = IBSET(bit_pattern, bit_index)
    2779          250 :                   bit_index = bit_index + 1
    2780              :                ELSE
    2781         1130 :                   bit_pattern = IBCLR(bit_pattern, bit_index)
    2782         1130 :                   bit_index = bit_index + 1
    2783              :                END IF
    2784              :             END IF
    2785              :          END DO
    2786         1380 :          tasks(ntasks)%subpatch_pattern = bit_pattern
    2787              : 
    2788              :       ELSE
    2789              :          ! generalised collocation/integration needed
    2790              :          ! first we figure out how many neighbors we have to add to include the lbc/ubc
    2791              :          ! in the available domains (inclusive of halo points)
    2792              :          ! first we 'ignore' periodic boundary conditions
    2793              :          ! i.e. ub_coord-lb_coord+1 might be larger than group_dim
    2794            0 :          lb_coord = coord
    2795            0 :          ub_coord = coord
    2796            0 :          lb_domain = rs_desc%lb_global(:, dest) - rs_desc%border
    2797            0 :          ub_domain = rs_desc%ub_global(:, dest) + rs_desc%border
    2798            0 :          DO i = 1, 3
    2799              :             ! only if the grid is not periodic in this direction we need to take care of adding neighbors
    2800            0 :             IF (rs_desc%perd(i) == 0) THEN
    2801              :                ! if the domain lower bound is greater than the lbc we need to add the size of the neighbor domain
    2802            0 :                DO
    2803            0 :                   IF (lb_domain(i) > lbc(i)) THEN
    2804            0 :                      lb_coord(i) = lb_coord(i) - 1
    2805            0 :                      icoord = MODULO(lb_coord, rs_desc%group_dim)
    2806            0 :                      idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
    2807            0 :                      lb_domain(i) = lb_domain(i) - (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
    2808              :                   ELSE
    2809              :                      EXIT
    2810              :                   END IF
    2811              :                END DO
    2812              :                ! same for the upper bound
    2813            0 :                DO
    2814            0 :                   IF (ub_domain(i) < ubc(i)) THEN
    2815            0 :                      ub_coord(i) = ub_coord(i) + 1
    2816            0 :                      icoord = MODULO(ub_coord, rs_desc%group_dim)
    2817            0 :                      idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
    2818            0 :                      ub_domain(i) = ub_domain(i) + (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
    2819              :                   ELSE
    2820              :                      EXIT
    2821              :                   END IF
    2822              :                END DO
    2823              :             END IF
    2824              :          END DO
    2825              : 
    2826              :          ! some care is needed for the periodic boundaries
    2827            0 :          DO i = 1, 3
    2828            0 :             IF (ub_domain(i) - lb_domain(i) + 1 >= rs_desc%npts(i)) THEN
    2829            0 :                dir_periodic(i) = .TRUE.
    2830            0 :                lb_coord(i) = 0
    2831            0 :                ub_coord(i) = rs_desc%group_dim(i) - 1
    2832              :             ELSE
    2833            0 :                dir_periodic(i) = .FALSE.
    2834              :             END IF
    2835              :          END DO
    2836              : 
    2837            0 :          added_tasks = PRODUCT(ub_coord - lb_coord + 1)
    2838            0 :          itask = ntasks
    2839            0 :          ntasks = ntasks + added_tasks - 1
    2840            0 :          IF (ntasks > SIZE(tasks)) THEN
    2841            0 :             curr_tasks = INT((SIZE(tasks) + add_tasks)*mult_tasks)
    2842            0 :             CALL reallocate_tasks(tasks, curr_tasks)
    2843              :          END IF
    2844            0 :          DO iz = lb_coord(3), ub_coord(3)
    2845            0 :          DO iy = lb_coord(2), ub_coord(2)
    2846            0 :          DO ix = lb_coord(1), ub_coord(1)
    2847            0 :             icoord = MODULO([ix, iy, iz], rs_desc%group_dim)
    2848            0 :             idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
    2849            0 :             tasks(itask)%destination = encode_rank(idest, igrid_level, n_levels)
    2850            0 :             tasks(itask)%dist_type = 2
    2851            0 :             tasks(itask)%subpatch_pattern = 0
    2852              :             ! encode the domain size for this task
    2853              :             ! if the bit is set, we need to add the border in that direction
    2854            0 :             IF (ix == lb_coord(1) .AND. .NOT. dir_periodic(1)) &
    2855            0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 0)
    2856            0 :             IF (ix == ub_coord(1) .AND. .NOT. dir_periodic(1)) &
    2857            0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 1)
    2858            0 :             IF (iy == lb_coord(2) .AND. .NOT. dir_periodic(2)) &
    2859            0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 2)
    2860            0 :             IF (iy == ub_coord(2) .AND. .NOT. dir_periodic(2)) &
    2861            0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 3)
    2862            0 :             IF (iz == lb_coord(3) .AND. .NOT. dir_periodic(3)) &
    2863            0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 4)
    2864            0 :             IF (iz == ub_coord(3) .AND. .NOT. dir_periodic(3)) &
    2865            0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 5)
    2866            0 :             itask = itask + 1
    2867              :          END DO
    2868              :          END DO
    2869              :          END DO
    2870              :       END IF
    2871              : 
    2872         1380 :    END SUBROUTINE rs_find_node
    2873              : 
    2874              : ! **************************************************************************************************
    2875              : !> \brief utility functions for encoding the grid level with a rank, allowing
    2876              : !>        different grid levels to maintain a different rank ordering without
    2877              : !>        losing information.  These encoded_ints are stored in the tasks(1:2,:) array
    2878              : !> \param rank ...
    2879              : !> \param grid_level ...
    2880              : !> \param n_levels ...
    2881              : !> \return ...
    2882              : !> \par History
    2883              : !>      4.2009 created [Iain Bethune]
    2884              : !>        (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
    2885              : ! **************************************************************************************************
    2886     15955982 :    FUNCTION encode_rank(rank, grid_level, n_levels) RESULT(encoded_int)
    2887              : 
    2888              :       INTEGER, INTENT(IN)                                :: rank, grid_level, n_levels
    2889              :       INTEGER                                            :: encoded_int
    2890              : 
    2891              : ! ordered so can still sort by rank
    2892              : 
    2893     15955982 :       encoded_int = rank*n_levels + grid_level - 1
    2894              : 
    2895     15955982 :    END FUNCTION encode_rank
    2896              : 
    2897              : ! **************************************************************************************************
    2898              : !> \brief ...
    2899              : !> \param encoded_int ...
    2900              : !> \param n_levels ...
    2901              : !> \return ...
    2902              : ! **************************************************************************************************
    2903      8023981 :    FUNCTION decode_rank(encoded_int, n_levels) RESULT(rank)
    2904              : 
    2905              :       INTEGER, INTENT(IN)                                :: encoded_int
    2906              :       INTEGER, INTENT(IN)                                :: n_levels
    2907              :       INTEGER                                            :: rank
    2908              : 
    2909      8023981 :       rank = INT(encoded_int/n_levels)
    2910              : 
    2911      8023981 :    END FUNCTION decode_rank
    2912              : 
    2913              : ! **************************************************************************************************
    2914              : !> \brief ...
    2915              : !> \param encoded_int ...
    2916              : !> \param n_levels ...
    2917              : !> \return ...
    2918              : ! **************************************************************************************************
    2919         7236 :    FUNCTION decode_level(encoded_int, n_levels) RESULT(grid_level)
    2920              : 
    2921              :       INTEGER, INTENT(IN)                                :: encoded_int
    2922              :       INTEGER, INTENT(IN)                                :: n_levels
    2923              :       INTEGER                                            :: grid_level
    2924              : 
    2925         7236 :       grid_level = INT(MODULO(encoded_int, n_levels)) + 1
    2926              : 
    2927         7236 :    END FUNCTION decode_level
    2928              : 
    2929              : ! **************************************************************************************************
    2930              : !> \brief Sort pgf index pair (ipgf,iset,iatom),(jpgf,jset,jatom) for which all atom pairs are
    2931              : !>        grouped, and for each atom pair all set pairs are grouped and for each set pair,
    2932              : !>        all pgfs are grouped.
    2933              : !>        This yields the right order of the tasks for collocation after the sort
    2934              : !>        in distribute_tasks. E.g. for a atom pair, all sets and pgfs are computed in one go.
    2935              : !>        The exception is the gridlevel. Tasks are first ordered wrt to grid_level. This implies
    2936              : !>        that a given density matrix block will be decontracted several times,
    2937              : !>        but cache effects on the grid make up for this.
    2938              : !> \param a ...
    2939              : !> \param b ...
    2940              : !> \return ...
    2941              : !> \author Ole Schuett
    2942              : ! **************************************************************************************************
    2943     75058299 :    PURE FUNCTION tasks_less_than(a, b) RESULT(res)
    2944              :       TYPE(task_type), INTENT(IN)                        :: a, b
    2945              :       LOGICAL                                            :: res
    2946              : 
    2947     75058299 :       IF (a%grid_level /= b%grid_level) THEN
    2948     19126837 :          res = a%grid_level < b%grid_level
    2949              : 
    2950     55931462 :       ELSE IF (a%image /= b%image) THEN
    2951      5606317 :          res = a%image < b%image
    2952              : 
    2953     50325145 :       ELSE IF (a%iatom /= b%iatom) THEN
    2954      6459612 :          res = a%iatom < b%iatom
    2955              : 
    2956     43865533 :       ELSE IF (a%jatom /= b%jatom) THEN
    2957      6984787 :          res = a%jatom < b%jatom
    2958              : 
    2959     36880746 :       ELSE IF (a%iset /= b%iset) THEN
    2960      5908172 :          res = a%iset < b%iset
    2961              : 
    2962     30972574 :       ELSE IF (a%jset /= b%jset) THEN
    2963      6311613 :          res = a%jset < b%jset
    2964              : 
    2965     24660961 :       ELSE IF (a%ipgf /= b%ipgf) THEN
    2966      6299435 :          res = a%ipgf < b%ipgf
    2967              : 
    2968              :       ELSE
    2969     18361526 :          res = a%jpgf < b%jpgf
    2970              : 
    2971              :       END IF
    2972     75058299 :    END FUNCTION tasks_less_than
    2973              : 
    2974    178193482 :    #:call array_sort(prefix='tasks', type='TYPE(task_type)')
    2975              :    #:endcall
    2976              : 
    2977              : ! **************************************************************************************************
    2978              : !> \brief Order atom pairs to find duplicates.
    2979              : !> \param a ...
    2980              : !> \param b ...
    2981              : !> \return ...
    2982              : !> \author Ole Schuett
    2983              : ! **************************************************************************************************
    2984     32091542 :    PURE FUNCTION atom_pair_less_than(a, b) RESULT(res)
    2985              :       TYPE(atom_pair_type), INTENT(IN)                   :: a, b
    2986              :       LOGICAL                                            :: res
    2987              : 
    2988     32091542 :       IF (a%rank /= b%rank) THEN
    2989         4243 :          res = a%rank < b%rank
    2990              : 
    2991     32087299 :       ELSE IF (a%row /= b%row) THEN
    2992      5223410 :          res = a%row < b%row
    2993              : 
    2994     26863889 :       ELSE IF (a%col /= b%col) THEN
    2995      5214176 :          res = a%col < b%col
    2996              : 
    2997              :       ELSE
    2998     21649713 :          res = a%image < b%image
    2999              : 
    3000              :       END IF
    3001     32091542 :    END FUNCTION atom_pair_less_than
    3002              : 
    3003     67944060 :    #:call array_sort(prefix='atom_pair', type='TYPE(atom_pair_type)')
    3004              :    #:endcall
    3005              : 
    3006              : END MODULE task_list_methods
        

Generated by: LCOV version 2.0-1