LCOV - code coverage report
Current view: top level - src - task_list_methods.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:3db43b4) Lines: 85.2 % 1077 918
Test Date: 2026-04-03 06:55:30 Functions: 97.1 % 35 34

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2026 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief generate the tasks lists used by collocate and integrate routines
      10              : !> \par History
      11              : !>      01.2008 [Joost VandeVondele] refactered out of qs_collocate / qs_integrate
      12              : !> \author Joost VandeVondele
      13              : ! **************************************************************************************************
      14              : MODULE task_list_methods
      15              :    USE offload_api, ONLY: offload_create_buffer, offload_buffer_type
      16              :    USE grid_api, ONLY: grid_create_basis_set, grid_create_task_list
      17              :    USE ao_util, ONLY: exp_radius_very_extended
      18              :    USE basis_set_types, ONLY: get_gto_basis_set, &
      19              :                               gto_basis_set_p_type, &
      20              :                               gto_basis_set_type
      21              :    USE cell_types, ONLY: cell_type, &
      22              :                          pbc
      23              :    USE cp_control_types, ONLY: dft_control_type
      24              :    USE cube_utils, ONLY: compute_cube_center, &
      25              :                          cube_info_type, &
      26              :                          return_cube, &
      27              :                          return_cube_nonortho
      28              :    USE cp_dbcsr_api, ONLY: dbcsr_convert_sizes_to_offsets, &
      29              :                            dbcsr_finalize, &
      30              :                            dbcsr_get_block_p, &
      31              :                            dbcsr_get_info, &
      32              :                            dbcsr_p_type, &
      33              :                            dbcsr_put_block, &
      34              :                            dbcsr_type, &
      35              :                            dbcsr_work_create
      36              :    USE gaussian_gridlevels, ONLY: gaussian_gridlevel, &
      37              :                                   gridlevel_info_type
      38              :    USE kinds, ONLY: default_string_length, &
      39              :                     dp, &
      40              :                     int_8
      41              :    USE kpoint_types, ONLY: get_kpoint_info, &
      42              :                            kpoint_type
      43              :    USE memory_utilities, ONLY: reallocate
      44              :    USE message_passing, ONLY: &
      45              :       mp_comm_type
      46              :    USE particle_types, ONLY: particle_type
      47              :    USE particle_methods, ONLY: get_particle_set
      48              :    USE pw_env_types, ONLY: pw_env_get, &
      49              :                            pw_env_type
      50              :    USE qs_kind_types, ONLY: get_qs_kind, &
      51              :                             qs_kind_type
      52              :    USE qs_ks_types, ONLY: get_ks_env, &
      53              :                           qs_ks_env_type
      54              :    USE qs_neighbor_list_types, ONLY: neighbor_list_set_p_type
      55              :    USE realspace_grid_types, ONLY: realspace_grid_desc_p_type, &
      56              :                                    realspace_grid_desc_type, &
      57              :                                    rs_grid_create, &
      58              :                                    rs_grid_locate_rank, &
      59              :                                    rs_grid_release, &
      60              :                                    rs_grid_reorder_ranks, realspace_grid_type
      61              :    USE task_list_types, ONLY: deserialize_task, &
      62              :                               reallocate_tasks, &
      63              :                               serialize_task, &
      64              :                               task_list_type, &
      65              :                               atom_pair_type, &
      66              :                               task_size_in_int8, &
      67              :                               task_type
      68              :    USE util, ONLY: sort
      69              : 
      70              : !$ USE OMP_LIB, ONLY: omp_destroy_lock, omp_get_num_threads, omp_init_lock, &
      71              : !$                    omp_lock_kind, omp_set_lock, omp_unset_lock, omp_get_max_threads
      72              : #include "./base/base_uses.f90"
      73              : 
      74              :    #:include './common/array_sort.fypp'
      75              : 
      76              :    IMPLICIT NONE
      77              : 
      78              :    LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .FALSE.
      79              : 
      80              :    PRIVATE
      81              : 
      82              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'task_list_methods'
      83              : 
      84              :    PUBLIC :: generate_qs_task_list, &
      85              :              task_list_inner_loop
      86              :    PUBLIC :: distribute_tasks, &
      87              :              rs_distribute_matrix, &
      88              :              rs_scatter_matrices, &
      89              :              rs_gather_matrices, &
      90              :              rs_copy_to_buffer, &
      91              :              rs_copy_to_matrices
      92              : 
      93              : CONTAINS
      94              : 
      95              : ! **************************************************************************************************
      96              : !> \brief ...
      97              : !> \param ks_env ...
      98              : !> \param task_list ...
      99              : !> \param basis_type ...
     100              : !> \param reorder_rs_grid_ranks Flag that indicates if this routine should
     101              : !>        or should not overwrite the rs descriptor (see comment below)
     102              : !> \param skip_load_balance_distributed ...
     103              : !> \param pw_env_external ...
     104              : !> \param sab_orb_external ...
     105              : !> \param ext_kpoints ...
     106              : !> \par History
     107              : !>      01.2008 factored out of calculate_rho_elec [Joost VandeVondele]
     108              : !>      04.2010 divides tasks into grid levels and atom pairs for integrate/collocate [Iain Bethune]
     109              : !>              (c) The Numerical Algorithms Group (NAG) Ltd, 2010 on behalf of the HECToR project
     110              : !>      06.2015 adjusted to be used with multiple images (k-points) [JGH]
     111              : !> \note  If this routine is called several times with different task lists,
     112              : !>        the default behaviour is to re-optimize the grid ranks and overwrite
     113              : !>        the rs descriptor and grids. reorder_rs_grid_ranks = .FALSE. prevents the code
     114              : !>        of performing a new optimization by leaving the rank order in
     115              : !>        its current state.
     116              : ! **************************************************************************************************
     117        15024 :    SUBROUTINE generate_qs_task_list(ks_env, task_list, basis_type, &
     118              :                                     reorder_rs_grid_ranks, skip_load_balance_distributed, &
     119              :                                     pw_env_external, sab_orb_external, ext_kpoints)
     120              : 
     121              :       TYPE(qs_ks_env_type), POINTER                      :: ks_env
     122              :       TYPE(task_list_type), POINTER                      :: task_list
     123              :       CHARACTER(LEN=*), INTENT(IN)                       :: basis_type
     124              :       LOGICAL, INTENT(IN)                                :: reorder_rs_grid_ranks, &
     125              :                                                             skip_load_balance_distributed
     126              :       TYPE(pw_env_type), OPTIONAL, POINTER               :: pw_env_external
     127              :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
     128              :          OPTIONAL, POINTER                               :: sab_orb_external
     129              :       TYPE(kpoint_type), OPTIONAL, POINTER               :: ext_kpoints
     130              : 
     131              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'generate_qs_task_list'
     132              :       INTEGER, PARAMETER                                 :: max_tasks = 2000
     133              : 
     134              :       INTEGER :: cindex, curr_tasks, handle, i, iatom, iatom_old, igrid_level, igrid_level_old, &
     135              :                  ikind, ilevel, img, img_old, ipair, ipgf, iset, itask, jatom, jatom_old, jkind, jpgf, &
     136              :                  jset, maxpgf, maxset, natoms, nimages, nkind, nseta, nsetb, slot
     137        15024 :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: blocks
     138              :       INTEGER, DIMENSION(3)                              :: cellind
     139        15024 :       INTEGER, DIMENSION(:), POINTER                     :: la_max, la_min, lb_max, lb_min, npgfa, &
     140        15024 :                                                             npgfb, nsgf
     141        15024 :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
     142              :       LOGICAL                                            :: dokp
     143              :       REAL(KIND=dp)                                      :: kind_radius_a, kind_radius_b
     144              :       REAL(KIND=dp), DIMENSION(3)                        :: ra, rab
     145        15024 :       REAL(KIND=dp), DIMENSION(:), POINTER               :: set_radius_a, set_radius_b
     146        15024 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: rpgfa, rpgfb, zeta, zetb
     147              :       TYPE(cell_type), POINTER                           :: cell
     148        15024 :       TYPE(cube_info_type), DIMENSION(:), POINTER        :: cube_info
     149              :       TYPE(dft_control_type), POINTER                    :: dft_control
     150              :       TYPE(gridlevel_info_type), POINTER                 :: gridlevel_info
     151        15024 :       TYPE(gto_basis_set_p_type), DIMENSION(:), POINTER  :: basis_set_list
     152              :       TYPE(gto_basis_set_type), POINTER                  :: basis_set_a, basis_set_b, orb_basis_set
     153              :       TYPE(kpoint_type), POINTER                         :: kpoints
     154              :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
     155        15024 :          POINTER                                         :: sab_orb
     156        15024 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     157              :       TYPE(pw_env_type), POINTER                         :: pw_env
     158        15024 :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
     159              :       TYPE(qs_kind_type), POINTER                        :: qs_kind
     160              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
     161        15024 :          POINTER                                         :: rs_descs
     162        15024 :       TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
     163        15024 :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     164              : 
     165        15024 :       CALL timeset(routineN, handle)
     166              : 
     167              :       CALL get_ks_env(ks_env, &
     168              :                       qs_kind_set=qs_kind_set, &
     169              :                       cell=cell, &
     170              :                       particle_set=particle_set, &
     171        15024 :                       dft_control=dft_control)
     172              : 
     173        15024 :       CALL get_ks_env(ks_env, sab_orb=sab_orb)
     174        15024 :       IF (PRESENT(sab_orb_external)) sab_orb => sab_orb_external
     175              : 
     176        15024 :       CALL get_ks_env(ks_env, pw_env=pw_env)
     177        15024 :       IF (PRESENT(pw_env_external)) pw_env => pw_env_external
     178        15024 :       CALL pw_env_get(pw_env, rs_descs=rs_descs, rs_grids=rs_grids)
     179              : 
     180              :       ! *** assign from pw_env
     181        15024 :       gridlevel_info => pw_env%gridlevel_info
     182        15024 :       cube_info => pw_env%cube_info
     183              : 
     184              :       ! find maximum numbers
     185        15024 :       nkind = SIZE(qs_kind_set)
     186        15024 :       natoms = SIZE(particle_set)
     187        15024 :       maxset = 0
     188        15024 :       maxpgf = 0
     189        41965 :       DO ikind = 1, nkind
     190        26941 :          qs_kind => qs_kind_set(ikind)
     191              :          CALL get_qs_kind(qs_kind=qs_kind, &
     192        26941 :                           basis_set=orb_basis_set, basis_type=basis_type)
     193              : 
     194        26941 :          IF (.NOT. ASSOCIATED(orb_basis_set)) CYCLE
     195        26941 :          CALL get_gto_basis_set(gto_basis_set=orb_basis_set, npgf=npgfa, nset=nseta)
     196              : 
     197        26941 :          maxset = MAX(nseta, maxset)
     198        98842 :          maxpgf = MAX(MAXVAL(npgfa), maxpgf)
     199              :       END DO
     200              : 
     201              :       ! kpoint related
     202        15024 :       nimages = dft_control%nimages
     203        15024 :       dokp = .FALSE.
     204        15024 :       NULLIFY (cell_to_index)
     205        15024 :       NULLIFY (kpoints)
     206        15024 :       IF (PRESENT(ext_kpoints)) THEN
     207          832 :          IF (ASSOCIATED(ext_kpoints)) THEN
     208           10 :             dokp = .TRUE.
     209           10 :             CALL get_kpoint_info(kpoint=ext_kpoints, cell_to_index=cell_to_index)
     210           10 :             nimages = SIZE(ext_kpoints%index_to_cell, 2)
     211              :          END IF
     212              :       END IF
     213        15024 :       IF (.NOT. dokp .AND. nimages > 1) THEN
     214          336 :          dokp = .TRUE.
     215          336 :          CALL get_ks_env(ks_env=ks_env, kpoints=kpoints)
     216          336 :          CALL get_kpoint_info(kpoint=kpoints, cell_to_index=cell_to_index)
     217              :       END IF
     218              : 
     219              :       ! free the atom_pair lists if allocated
     220        15024 :       IF (ASSOCIATED(task_list%atom_pair_send)) DEALLOCATE (task_list%atom_pair_send)
     221        15024 :       IF (ASSOCIATED(task_list%atom_pair_recv)) DEALLOCATE (task_list%atom_pair_recv)
     222              : 
     223              :       ! construct a list of all tasks
     224        15024 :       IF (.NOT. ASSOCIATED(task_list%tasks)) THEN
     225         8992 :          CALL reallocate_tasks(task_list%tasks, max_tasks)
     226              :       END IF
     227        15024 :       task_list%ntasks = 0
     228        15024 :       curr_tasks = SIZE(task_list%tasks)
     229              : 
     230        72013 :       ALLOCATE (basis_set_list(nkind))
     231        41965 :       DO ikind = 1, nkind
     232        26941 :          qs_kind => qs_kind_set(ikind)
     233              :          CALL get_qs_kind(qs_kind=qs_kind, basis_set=basis_set_a, &
     234        26941 :                           basis_type=basis_type)
     235        41965 :          IF (ASSOCIATED(basis_set_a)) THEN
     236        26941 :             basis_set_list(ikind)%gto_basis_set => basis_set_a
     237              :          ELSE
     238            0 :             NULLIFY (basis_set_list(ikind)%gto_basis_set)
     239              :          END IF
     240              :       END DO
     241              : !!$OMP PARALLEL DEFAULT(NONE) &
     242              : !!$OMP SHARED (sab_orb, dokp, basis_set_list, task_list, rs_descs, dft_control, cube_info, gridlevel_info,  &
     243              : !!$OMP         curr_tasks, maxpgf, maxset, natoms, nimages, particle_set, cell_to_index, cell) &
     244              : !!$OMP PRIVATE (ikind, jkind, iatom, jatom, rab, cellind, basis_set_a, basis_set_b, ra, &
     245              : !!$OMP          la_max, la_min, npgfa, nseta, rpgfa, set_radius_a, kind_radius_a, zeta, &
     246              : !!$OMP          lb_max, lb_min, npgfb, nsetb, rpgfb, set_radius_b, kind_radius_b, zetb, &
     247              : !!$OMP          cindex, slot)
     248              :       ! Loop over neighbor list
     249              : !!$OMP DO SCHEDULE(GUIDED)
     250      1510140 :       DO slot = 1, sab_orb(1)%nl_size
     251      1495116 :          ikind = sab_orb(1)%nlist_task(slot)%ikind
     252      1495116 :          jkind = sab_orb(1)%nlist_task(slot)%jkind
     253      1495116 :          iatom = sab_orb(1)%nlist_task(slot)%iatom
     254      1495116 :          jatom = sab_orb(1)%nlist_task(slot)%jatom
     255      5980464 :          rab(1:3) = sab_orb(1)%nlist_task(slot)%r(1:3)
     256      5980464 :          cellind(1:3) = sab_orb(1)%nlist_task(slot)%cell(1:3)
     257              : 
     258      1495116 :          basis_set_a => basis_set_list(ikind)%gto_basis_set
     259      1495116 :          IF (.NOT. ASSOCIATED(basis_set_a)) CYCLE
     260      1495116 :          basis_set_b => basis_set_list(jkind)%gto_basis_set
     261      1495116 :          IF (.NOT. ASSOCIATED(basis_set_b)) CYCLE
     262      1495116 :          ra(:) = pbc(particle_set(iatom)%r, cell)
     263              :          ! basis ikind
     264      1495116 :          la_max => basis_set_a%lmax
     265      1495116 :          la_min => basis_set_a%lmin
     266      1495116 :          npgfa => basis_set_a%npgf
     267      1495116 :          nseta = basis_set_a%nset
     268      1495116 :          rpgfa => basis_set_a%pgf_radius
     269      1495116 :          set_radius_a => basis_set_a%set_radius
     270      1495116 :          kind_radius_a = basis_set_a%kind_radius
     271      1495116 :          zeta => basis_set_a%zet
     272              :          ! basis jkind
     273      1495116 :          lb_max => basis_set_b%lmax
     274      1495116 :          lb_min => basis_set_b%lmin
     275      1495116 :          npgfb => basis_set_b%npgf
     276      1495116 :          nsetb = basis_set_b%nset
     277      1495116 :          rpgfb => basis_set_b%pgf_radius
     278      1495116 :          set_radius_b => basis_set_b%set_radius
     279      1495116 :          kind_radius_b = basis_set_b%kind_radius
     280      1495116 :          zetb => basis_set_b%zet
     281              : 
     282      1495116 :          IF (dokp) THEN
     283       250624 :             cindex = cell_to_index(cellind(1), cellind(2), cellind(3))
     284              :          ELSE
     285      1244492 :             cindex = 1
     286              :          END IF
     287              : 
     288              :          CALL task_list_inner_loop(task_list%tasks, task_list%ntasks, curr_tasks, &
     289              :                                    rs_descs, dft_control, cube_info, gridlevel_info, cindex, &
     290              :                                    iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, &
     291              :                                    set_radius_a, set_radius_b, ra, rab, &
     292      1510140 :                                    la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
     293              : 
     294              :       END DO
     295              : !!$OMP END PARALLEL
     296              : 
     297              :       ! redistribute the task list so that all tasks map on the local rs grids
     298              :       CALL distribute_tasks( &
     299              :          rs_descs=rs_descs, ntasks=task_list%ntasks, natoms=natoms, &
     300              :          tasks=task_list%tasks, atom_pair_send=task_list%atom_pair_send, &
     301              :          atom_pair_recv=task_list%atom_pair_recv, symmetric=.TRUE., &
     302              :          reorder_rs_grid_ranks=reorder_rs_grid_ranks, &
     303        15024 :          skip_load_balance_distributed=skip_load_balance_distributed)
     304              : 
     305              :       ! compute offsets for rs_scatter_matrix / rs_copy_matrix
     306        45072 :       ALLOCATE (nsgf(natoms))
     307        15024 :       CALL get_particle_set(particle_set, qs_kind_set, basis=basis_set_list, nsgf=nsgf)
     308        15024 :       IF (ASSOCIATED(task_list%atom_pair_send)) THEN
     309              :          ! only needed when there is a distributed grid
     310              :          CALL rs_calc_offsets(pairs=task_list%atom_pair_send, &
     311              :                               nsgf=nsgf, &
     312              :                               group_size=rs_descs(1)%rs_desc%group_size, &
     313              :                               pair_offsets=task_list%pair_offsets_send, &
     314              :                               rank_offsets=task_list%rank_offsets_send, &
     315              :                               rank_sizes=task_list%rank_sizes_send, &
     316           48 :                               buffer_size=task_list%buffer_size_send)
     317              :       END IF
     318              :       CALL rs_calc_offsets(pairs=task_list%atom_pair_recv, &
     319              :                            nsgf=nsgf, &
     320              :                            group_size=rs_descs(1)%rs_desc%group_size, &
     321              :                            pair_offsets=task_list%pair_offsets_recv, &
     322              :                            rank_offsets=task_list%rank_offsets_recv, &
     323              :                            rank_sizes=task_list%rank_sizes_recv, &
     324        15024 :                            buffer_size=task_list%buffer_size_recv)
     325        15024 :       DEALLOCATE (basis_set_list, nsgf)
     326              : 
     327              :       ! If the rank order has changed, reallocate any of the distributed rs_grids
     328        15024 :       IF (reorder_rs_grid_ranks) THEN
     329        63134 :          DO i = 1, gridlevel_info%ngrid_levels
     330        63134 :             IF (rs_descs(i)%rs_desc%distributed) THEN
     331           54 :                CALL rs_grid_release(rs_grids(i))
     332           54 :                CALL rs_grid_create(rs_grids(i), rs_descs(i)%rs_desc)
     333              :             END IF
     334              :          END DO
     335              :       END IF
     336              : 
     337              :       CALL create_grid_task_list(task_list=task_list, &
     338              :                                  qs_kind_set=qs_kind_set, &
     339              :                                  particle_set=particle_set, &
     340              :                                  cell=cell, &
     341              :                                  basis_type=basis_type, &
     342        15024 :                                  rs_grids=rs_grids)
     343              : 
     344              :       ! Now we have the final list of tasks, setup the task_list with the
     345              :       ! data needed for the loops in integrate_v/calculate_rho
     346              : 
     347        15024 :       IF (ASSOCIATED(task_list%taskstart)) THEN
     348         6032 :          DEALLOCATE (task_list%taskstart)
     349              :       END IF
     350        15024 :       IF (ASSOCIATED(task_list%taskstop)) THEN
     351         6032 :          DEALLOCATE (task_list%taskstop)
     352              :       END IF
     353        15024 :       IF (ASSOCIATED(task_list%npairs)) THEN
     354         6032 :          DEALLOCATE (task_list%npairs)
     355              :       END IF
     356              : 
     357              :       ! First, count the number of unique atom pairs per grid level
     358              : 
     359        45072 :       ALLOCATE (task_list%npairs(SIZE(rs_descs)))
     360              : 
     361        15024 :       iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
     362        15024 :       ipair = 0
     363        74604 :       task_list%npairs = 0
     364              : 
     365      8446080 :       DO i = 1, task_list%ntasks
     366      8431056 :          igrid_level = task_list%tasks(i)%grid_level
     367      8431056 :          img = task_list%tasks(i)%image
     368      8431056 :          iatom = task_list%tasks(i)%iatom
     369      8431056 :          jatom = task_list%tasks(i)%jatom
     370      8431056 :          iset = task_list%tasks(i)%iset
     371      8431056 :          jset = task_list%tasks(i)%jset
     372      8431056 :          ipgf = task_list%tasks(i)%ipgf
     373      8431056 :          jpgf = task_list%tasks(i)%jpgf
     374      8446080 :          IF (igrid_level /= igrid_level_old) THEN
     375        41041 :             IF (igrid_level_old /= -1) THEN
     376        26258 :                task_list%npairs(igrid_level_old) = ipair
     377              :             END IF
     378              :             ipair = 1
     379              :             igrid_level_old = igrid_level
     380              :             iatom_old = iatom
     381              :             jatom_old = jatom
     382              :             img_old = img
     383      8390015 :          ELSE IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
     384       684146 :             ipair = ipair + 1
     385       684146 :             iatom_old = iatom
     386       684146 :             jatom_old = jatom
     387       684146 :             img_old = img
     388              :          END IF
     389              :       END DO
     390              :       ! Take care of the last iteration
     391        15024 :       IF (task_list%ntasks /= 0) THEN
     392        14783 :          task_list%npairs(igrid_level) = ipair
     393              :       END IF
     394              : 
     395              :       ! Second, for each atom pair, find the indices in the task list
     396              :       ! of the first and last task
     397              : 
     398              :       ! Array sized for worst case
     399       119435 :       ALLOCATE (task_list%taskstart(MAXVAL(task_list%npairs), SIZE(rs_descs)))
     400       119435 :       ALLOCATE (task_list%taskstop(MAXVAL(task_list%npairs), SIZE(rs_descs)))
     401              : 
     402        15024 :       iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
     403        15024 :       ipair = 0
     404      1561340 :       task_list%taskstart = 0
     405      1561340 :       task_list%taskstop = 0
     406              : 
     407      8446080 :       DO i = 1, task_list%ntasks
     408      8431056 :          igrid_level = task_list%tasks(i)%grid_level
     409      8431056 :          img = task_list%tasks(i)%image
     410      8431056 :          iatom = task_list%tasks(i)%iatom
     411      8431056 :          jatom = task_list%tasks(i)%jatom
     412      8431056 :          iset = task_list%tasks(i)%iset
     413      8431056 :          jset = task_list%tasks(i)%jset
     414      8431056 :          ipgf = task_list%tasks(i)%ipgf
     415      8431056 :          jpgf = task_list%tasks(i)%jpgf
     416      8446080 :          IF (igrid_level /= igrid_level_old) THEN
     417        41041 :             IF (igrid_level_old /= -1) THEN
     418        26258 :                task_list%taskstop(ipair, igrid_level_old) = i - 1
     419              :             END IF
     420        41041 :             ipair = 1
     421        41041 :             task_list%taskstart(ipair, igrid_level) = i
     422        41041 :             igrid_level_old = igrid_level
     423        41041 :             iatom_old = iatom
     424        41041 :             jatom_old = jatom
     425        41041 :             img_old = img
     426      8390015 :          ELSE IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
     427       684146 :             ipair = ipair + 1
     428       684146 :             task_list%taskstart(ipair, igrid_level) = i
     429       684146 :             task_list%taskstop(ipair - 1, igrid_level) = i - 1
     430       684146 :             iatom_old = iatom
     431       684146 :             jatom_old = jatom
     432       684146 :             img_old = img
     433              :          END IF
     434              :       END DO
     435              :       ! Take care of the last iteration
     436        15024 :       IF (task_list%ntasks /= 0) THEN
     437        14783 :          task_list%taskstop(ipair, igrid_level) = task_list%ntasks
     438              :       END IF
     439              : 
     440              :       ! Debug task destribution
     441              :       IF (debug_this_module) THEN
     442              :          tasks => task_list%tasks
     443              :          WRITE (6, *)
     444              :          WRITE (6, *) "Total number of tasks              ", task_list%ntasks
     445              :          DO igrid_level = 1, gridlevel_info%ngrid_levels
     446              :             WRITE (6, *) "Total number of pairs(grid_level)  ", &
     447              :                igrid_level, task_list%npairs(igrid_level)
     448              :          END DO
     449              :          WRITE (6, *)
     450              : 
     451              :          DO igrid_level = 1, gridlevel_info%ngrid_levels
     452              : 
     453              :             ALLOCATE (blocks(natoms, natoms, nimages))
     454              :             blocks = -1
     455              :             DO ipair = 1, task_list%npairs(igrid_level)
     456              :                itask = task_list%taskstart(ipair, igrid_level)
     457              :                ilevel = task_list%tasks(itask)%grid_level
     458              :                img = task_list%tasks(itask)%image
     459              :                iatom = task_list%tasks(itask)%iatom
     460              :                jatom = task_list%tasks(itask)%jatom
     461              :                iset = task_list%tasks(itask)%iset
     462              :                jset = task_list%tasks(itask)%jset
     463              :                ipgf = task_list%tasks(itask)%ipgf
     464              :                jpgf = task_list%tasks(itask)%jpgf
     465              :                IF (blocks(iatom, jatom, img) == -1 .AND. blocks(jatom, iatom, img) == -1) THEN
     466              :                   blocks(iatom, jatom, img) = 1
     467              :                   blocks(jatom, iatom, img) = 1
     468              :                ELSE
     469              :                   WRITE (6, *) "TASK LIST CONFLICT IN PAIR       ", ipair
     470              :                   WRITE (6, *) "Reuse of iatom, jatom, image     ", iatom, jatom, img
     471              :                END IF
     472              : 
     473              :                iatom_old = iatom
     474              :                jatom_old = jatom
     475              :                img_old = img
     476              :                DO itask = task_list%taskstart(ipair, igrid_level), task_list%taskstop(ipair, igrid_level)
     477              :                   ilevel = task_list%tasks(itask)%grid_level
     478              :                   img = task_list%tasks(itask)%image
     479              :                   iatom = task_list%tasks(itask)%iatom
     480              :                   jatom = task_list%tasks(itask)%jatom
     481              :                   iset = task_list%tasks(itask)%iset
     482              :                   jset = task_list%tasks(itask)%jset
     483              :                   ipgf = task_list%tasks(itask)%ipgf
     484              :                   jpgf = task_list%tasks(itask)%jpgf
     485              :                   IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
     486              :                      WRITE (6, *) "TASK LIST CONFLICT IN TASK       ", itask
     487              :                      WRITE (6, *) "Inconsistent iatom, jatom, image ", iatom, jatom, img
     488              :                      WRITE (6, *) "Should be    iatom, jatom, image ", iatom_old, jatom_old, img_old
     489              :                   END IF
     490              : 
     491              :                END DO
     492              :             END DO
     493              :             DEALLOCATE (blocks)
     494              : 
     495              :          END DO
     496              : 
     497              :       END IF
     498              : 
     499        15024 :       CALL timestop(handle)
     500              : 
     501        15024 :    END SUBROUTINE generate_qs_task_list
     502              : 
     503              : ! **************************************************************************************************
     504              : !> \brief Sends the task list data to the grid API.
     505              : !> \author Ole Schuett
     506              : ! **************************************************************************************************
     507        15024 :    SUBROUTINE create_grid_task_list(task_list, qs_kind_set, particle_set, cell, basis_type, rs_grids)
     508              :       TYPE(task_list_type), POINTER                      :: task_list
     509              :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
     510              :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     511              :       TYPE(cell_type), POINTER                           :: cell
     512              :       CHARACTER(LEN=*)                                   :: basis_type
     513              :       TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
     514              : 
     515              :       TYPE(gto_basis_set_type), POINTER                  :: orb_basis_set
     516              :       INTEGER                                            :: nset, natoms, nkinds, ntasks, &
     517              :                                                             ikind, iatom, itask, nsgf
     518        15024 :       INTEGER, DIMENSION(:), ALLOCATABLE                 :: atom_kinds, level_list, iatom_list, jatom_list, &
     519        15024 :                                                             iset_list, jset_list, ipgf_list, jpgf_list, &
     520        15024 :                                                             border_mask_list, block_num_list
     521        15024 :       REAL(KIND=dp), DIMENSION(:), ALLOCATABLE           :: radius_list
     522        15024 :       REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE        :: rab_list, atom_positions
     523        15024 :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     524        15024 :       INTEGER, DIMENSION(:, :), POINTER                  :: first_sgf
     525        15024 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: sphi, zet
     526        15024 :       INTEGER, DIMENSION(:), POINTER                     :: lmax, lmin, npgf, nsgf_set
     527              : 
     528        15024 :       nkinds = SIZE(qs_kind_set)
     529        15024 :       natoms = SIZE(particle_set)
     530        15024 :       ntasks = task_list%ntasks
     531        15024 :       tasks => task_list%tasks
     532              : 
     533        15024 :       IF (.NOT. ASSOCIATED(task_list%grid_basis_sets)) THEN
     534              :          ! Basis sets do not change during simulation - only need to create them once.
     535        43009 :          ALLOCATE (task_list%grid_basis_sets(nkinds))
     536        25025 :          DO ikind = 1, nkinds
     537        16033 :             CALL get_qs_kind(qs_kind_set(ikind), basis_type=basis_type, basis_set=orb_basis_set)
     538              :             CALL get_gto_basis_set(gto_basis_set=orb_basis_set, &
     539              :                                    nset=nset, &
     540              :                                    nsgf=nsgf, &
     541              :                                    nsgf_set=nsgf_set, &
     542              :                                    npgf=npgf, &
     543              :                                    first_sgf=first_sgf, &
     544              :                                    lmax=lmax, &
     545              :                                    lmin=lmin, &
     546              :                                    sphi=sphi, &
     547        16033 :                                    zet=zet)
     548              :             CALL grid_create_basis_set(nset=nset, &
     549              :                                        nsgf=nsgf, &
     550              :                                        maxco=SIZE(sphi, 1), &
     551              :                                        maxpgf=SIZE(zet, 1), &
     552              :                                        lmin=lmin, &
     553              :                                        lmax=lmax, &
     554              :                                        npgf=npgf, &
     555              :                                        nsgf_set=nsgf_set, &
     556              :                                        first_sgf=first_sgf, &
     557              :                                        sphi=sphi, &
     558              :                                        zet=zet, &
     559        25025 :                                        basis_set=task_list%grid_basis_sets(ikind))
     560              :          END DO
     561              :       END IF
     562              : 
     563              :       ! Pack task list infos
     564        75120 :       ALLOCATE (atom_kinds(natoms), atom_positions(3, natoms))
     565        69545 :       DO iatom = 1, natoms
     566        54521 :          atom_kinds(iatom) = particle_set(iatom)%atomic_kind%kind_number
     567        69545 :          atom_positions(:, iatom) = pbc(particle_set(iatom)%r, cell)
     568              :       END DO
     569              : 
     570        74397 :       ALLOCATE (level_list(ntasks), iatom_list(ntasks), jatom_list(ntasks))
     571        74156 :       ALLOCATE (iset_list(ntasks), jset_list(ntasks), ipgf_list(ntasks), jpgf_list(ntasks))
     572        44590 :       ALLOCATE (border_mask_list(ntasks), block_num_list(ntasks))
     573        74638 :       ALLOCATE (radius_list(ntasks), rab_list(3, ntasks))
     574              : 
     575      8446080 :       DO itask = 1, ntasks
     576      8431056 :          level_list(itask) = tasks(itask)%grid_level
     577      8431056 :          iatom_list(itask) = tasks(itask)%iatom
     578      8431056 :          jatom_list(itask) = tasks(itask)%jatom
     579      8431056 :          iset_list(itask) = tasks(itask)%iset
     580      8431056 :          jset_list(itask) = tasks(itask)%jset
     581      8431056 :          ipgf_list(itask) = tasks(itask)%ipgf
     582      8431056 :          jpgf_list(itask) = tasks(itask)%jpgf
     583      8431056 :          IF (tasks(itask)%dist_type == 2) THEN
     584            0 :             border_mask_list(itask) = IAND(63, NOT(tasks(itask)%subpatch_pattern))  ! invert last 6 bits
     585              :          ELSE
     586      8431056 :             border_mask_list(itask) = 0 ! no masking
     587              :          END IF
     588      8431056 :          block_num_list(itask) = tasks(itask)%pair_index  ! change of nomenclature pair_index -> block_num
     589      8431056 :          radius_list(itask) = tasks(itask)%radius
     590     33739248 :          rab_list(:, itask) = tasks(itask)%rab(:)
     591              :       END DO
     592              : 
     593              :       CALL grid_create_task_list(ntasks=ntasks, &
     594              :                                  natoms=natoms, &
     595              :                                  nkinds=nkinds, &
     596              :                                  nblocks=SIZE(task_list%pair_offsets_recv), &
     597              :                                  block_offsets=task_list%pair_offsets_recv, &
     598              :                                  atom_positions=atom_positions, &
     599              :                                  atom_kinds=atom_kinds, &
     600              :                                  basis_sets=task_list%grid_basis_sets, &
     601              :                                  level_list=level_list, &
     602              :                                  iatom_list=iatom_list, &
     603              :                                  jatom_list=jatom_list, &
     604              :                                  iset_list=iset_list, &
     605              :                                  jset_list=jset_list, &
     606              :                                  ipgf_list=ipgf_list, &
     607              :                                  jpgf_list=jpgf_list, &
     608              :                                  border_mask_list=border_mask_list, &
     609              :                                  block_num_list=block_num_list, &
     610              :                                  radius_list=radius_list, &
     611              :                                  rab_list=rab_list, &
     612              :                                  rs_grids=rs_grids, &
     613        15024 :                                  task_list=task_list%grid_task_list)
     614              : 
     615        15024 :       CALL offload_create_buffer(task_list%buffer_size_recv, task_list%pab_buffer)
     616        15024 :       CALL offload_create_buffer(task_list%buffer_size_recv, task_list%hab_buffer)
     617              : 
     618        30048 :    END SUBROUTINE create_grid_task_list
     619              : 
     620              : ! **************************************************************************************************
     621              : !> \brief ...
     622              : !> \param tasks ...
     623              : !> \param ntasks ...
     624              : !> \param curr_tasks ...
     625              : !> \param rs_descs ...
     626              : !> \param dft_control ...
     627              : !> \param cube_info ...
     628              : !> \param gridlevel_info ...
     629              : !> \param cindex ...
     630              : !> \param iatom ...
     631              : !> \param jatom ...
     632              : !> \param rpgfa ...
     633              : !> \param rpgfb ...
     634              : !> \param zeta ...
     635              : !> \param zetb ...
     636              : !> \param kind_radius_b ...
     637              : !> \param set_radius_a ...
     638              : !> \param set_radius_b ...
     639              : !> \param ra ...
     640              : !> \param rab ...
     641              : !> \param la_max ...
     642              : !> \param la_min ...
     643              : !> \param lb_max ...
     644              : !> \param lb_min ...
     645              : !> \param npgfa ...
     646              : !> \param npgfb ...
     647              : !> \param nseta ...
     648              : !> \param nsetb ...
     649              : !> \par History
     650              : !>      Joost VandeVondele: 10.2008 refactored
     651              : ! **************************************************************************************************
     652      1506573 :    SUBROUTINE task_list_inner_loop(tasks, ntasks, curr_tasks, rs_descs, dft_control, &
     653              :                                    cube_info, gridlevel_info, cindex, &
     654              :                                    iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, &
     655              :                                    la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
     656              : 
     657              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     658              :       INTEGER                                            :: ntasks, curr_tasks
     659              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
     660              :          POINTER                                         :: rs_descs
     661              :       TYPE(dft_control_type), POINTER                    :: dft_control
     662              :       TYPE(cube_info_type), DIMENSION(:), POINTER        :: cube_info
     663              :       TYPE(gridlevel_info_type), POINTER                 :: gridlevel_info
     664              :       INTEGER                                            :: cindex, iatom, jatom
     665              :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: rpgfa, rpgfb, zeta, zetb
     666              :       REAL(KIND=dp)                                      :: kind_radius_b
     667              :       REAL(KIND=dp), DIMENSION(:), POINTER               :: set_radius_a, set_radius_b
     668              :       REAL(KIND=dp), DIMENSION(3)                        :: ra, rab
     669              :       INTEGER, DIMENSION(:), POINTER                     :: la_max, la_min, lb_max, lb_min, npgfa, &
     670              :                                                             npgfb
     671              :       INTEGER                                            :: nseta, nsetb
     672              : 
     673              :       INTEGER                                            :: cube_center(3), igrid_level, ipgf, iset, &
     674              :                                                             jpgf, jset, lb_cube(3), ub_cube(3)
     675              :       REAL(KIND=dp)                                      :: dab, rab2, radius, zetp
     676              : 
     677      1506573 :       rab2 = rab(1)*rab(1) + rab(2)*rab(2) + rab(3)*rab(3)
     678      1506573 :       dab = SQRT(rab2)
     679              : 
     680      4694580 :       loop_iset: DO iset = 1, nseta
     681              : 
     682      3188007 :          IF (set_radius_a(iset) + kind_radius_b < dab) CYCLE loop_iset
     683              : 
     684      9619107 :          loop_jset: DO jset = 1, nsetb
     685              : 
     686      5861550 :             IF (set_radius_a(iset) + set_radius_b(jset) < dab) CYCLE loop_jset
     687              : 
     688     15804494 :             loop_ipgf: DO ipgf = 1, npgfa(iset)
     689              : 
     690      8961907 :                IF (rpgfa(ipgf, iset) + set_radius_b(jset) < dab) CYCLE loop_ipgf
     691              : 
     692     26968954 :                loop_jpgf: DO jpgf = 1, npgfb(jset)
     693              : 
     694     15649003 :                   IF (rpgfa(ipgf, iset) + rpgfb(jpgf, jset) < dab) CYCLE loop_jpgf
     695              : 
     696      8594946 :                   zetp = zeta(ipgf, iset) + zetb(jpgf, jset)
     697      8594946 :                   igrid_level = gaussian_gridlevel(gridlevel_info, zetp)
     698              : 
     699              :                   CALL compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
     700              :                                               rs_descs(igrid_level)%rs_desc, cube_info(igrid_level), &
     701              :                                               la_max(iset), zeta(ipgf, iset), la_min(iset), &
     702              :                                               lb_max(jset), zetb(jpgf, jset), lb_min(jset), &
     703      8594946 :                                               ra, rab, rab2, dft_control%qs_control%eps_rho_rspace)
     704              : 
     705              :                   CALL pgf_to_tasks(tasks, ntasks, curr_tasks, &
     706              :                                     rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
     707              :                                     la_max(iset), lb_max(jset), rs_descs(igrid_level)%rs_desc, &
     708              :                                     igrid_level, gridlevel_info%ngrid_levels, cube_center, &
     709     24610910 :                                     lb_cube, ub_cube, radius)
     710              : 
     711              :                END DO loop_jpgf
     712              : 
     713              :             END DO loop_ipgf
     714              : 
     715              :          END DO loop_jset
     716              : 
     717              :       END DO loop_iset
     718              : 
     719      1506573 :    END SUBROUTINE task_list_inner_loop
     720              : 
     721              : ! **************************************************************************************************
     722              : !> \brief combines the calculation of several basic properties of a given pgf:
     723              : !>  its center, the bounding cube, the radius, the cost,
     724              : !>  tries to predict the time needed for processing this task
     725              : !>      in this way an improved load balance might be obtained
     726              : !> \param cube_center ...
     727              : !> \param lb_cube ...
     728              : !> \param ub_cube ...
     729              : !> \param radius ...
     730              : !> \param rs_desc ...
     731              : !> \param cube_info ...
     732              : !> \param la_max ...
     733              : !> \param zeta ...
     734              : !> \param la_min ...
     735              : !> \param lb_max ...
     736              : !> \param zetb ...
     737              : !> \param lb_min ...
     738              : !> \param ra ...
     739              : !> \param rab ...
     740              : !> \param rab2 ...
     741              : !> \param eps ...
     742              : !> \par History
     743              : !>      10.2008 refactored [Joost VandeVondele]
     744              : !> \note
     745              : !>      -) this requires the radius to be computed in the same way as
     746              : !>      collocate_pgf_product, we should factor that part into a subroutine
     747              : !>      -) we're assuming that integrate_pgf and collocate_pgf are the same cost for load balancing
     748              : !>         this is more or less true for map_consistent
     749              : !>      -) in principle, the computed radius could be recycled in integrate_pgf/collocate_pgf if it is certainly
     750              : !>         the same, this could lead to a small speedup
     751              : !>      -) the cost function is a fit through the median cost of mapping a pgf with a given l and a given radius (in grid points)
     752              : !>         fitting the measured data on an opteron/g95 using the expression
     753              : !>         a*(l+b)(r+c)**3+d which is based on the innerloop of the collocating routines
     754              : ! **************************************************************************************************
     755      8594946 :    SUBROUTINE compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
     756              :                                      rs_desc, cube_info, la_max, zeta, la_min, lb_max, zetb, lb_min, ra, rab, rab2, eps)
     757              : 
     758              :       INTEGER, DIMENSION(3), INTENT(OUT)                 :: cube_center, lb_cube, ub_cube
     759              :       REAL(KIND=dp), INTENT(OUT)                         :: radius
     760              :       TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
     761              :       TYPE(cube_info_type), INTENT(IN)                   :: cube_info
     762              :       INTEGER, INTENT(IN)                                :: la_max
     763              :       REAL(KIND=dp), INTENT(IN)                          :: zeta
     764              :       INTEGER, INTENT(IN)                                :: la_min, lb_max
     765              :       REAL(KIND=dp), INTENT(IN)                          :: zetb
     766              :       INTEGER, INTENT(IN)                                :: lb_min
     767              :       REAL(KIND=dp), INTENT(IN)                          :: ra(3), rab(3), rab2, eps
     768              : 
     769              :       INTEGER                                            :: extent(3)
     770      8594946 :       INTEGER, DIMENSION(:), POINTER                     :: sphere_bounds
     771              :       REAL(KIND=dp)                                      :: cutoff, f, prefactor, rb(3), zetp
     772              :       REAL(KIND=dp), DIMENSION(3)                        :: rp
     773              : 
     774              : ! the radius for this task
     775              : 
     776      8594946 :       zetp = zeta + zetb
     777     34379784 :       rp(:) = ra(:) + zetb/zetp*rab(:)
     778     34379784 :       rb(:) = ra(:) + rab(:)
     779      8594946 :       cutoff = 1.0_dp
     780      8594946 :       f = zetb/zetp
     781      8594946 :       prefactor = EXP(-zeta*f*rab2)
     782              :       radius = exp_radius_very_extended(la_min, la_max, lb_min, lb_max, ra=ra, rb=rb, rp=rp, &
     783      8594946 :                                         zetp=zetp, eps=eps, prefactor=prefactor, cutoff=cutoff)
     784              : 
     785      8594946 :       CALL compute_cube_center(cube_center, rs_desc, zeta, zetb, ra, rab)
     786              :       ! compute cube_center, the center of the gaussian product to map (folded to within the unit cell)
     787     34379784 :       cube_center(:) = MODULO(cube_center(:), rs_desc%npts(:))
     788     34379784 :       cube_center(:) = cube_center(:) + rs_desc%lb(:)
     789              : 
     790      8594946 :       IF (rs_desc%orthorhombic) THEN
     791      8144401 :          CALL return_cube(cube_info, radius, lb_cube, ub_cube, sphere_bounds)
     792              :       ELSE
     793       450545 :          CALL return_cube_nonortho(cube_info, radius, lb_cube, ub_cube, rp)
     794              :          !? unclear if extent is computed correctly.
     795      1802180 :          extent(:) = ub_cube(:) - lb_cube(:)
     796      1802180 :          lb_cube(:) = -extent(:)/2 - 1
     797      1802180 :          ub_cube(:) = extent(:)/2
     798              :       END IF
     799              : 
     800      8594946 :    END SUBROUTINE compute_pgf_properties
     801              : ! **************************************************************************************************
     802              : !> \brief predicts the cost of a task in kcycles for a given task
     803              : !>        the model is based on a fit of actual data, and might need updating
     804              : !>        as collocate_pgf_product changes (or CPUs/compilers change)
     805              : !>        maybe some dynamic approach, improving the cost model on the fly could
     806              : !>        work as well
     807              : !>        the cost model does not yet take into account the fraction of space
     808              : !>        that is mapped locally for a given cube and rs_grid (generalised tasks)
     809              : !> \param lb_cube ...
     810              : !> \param ub_cube ...
     811              : !> \param fraction ...
     812              : !> \param lmax ...
     813              : !> \param is_ortho ...
     814              : !> \return ...
     815              : ! **************************************************************************************************
     816      8594946 :    INTEGER FUNCTION cost_model(lb_cube, ub_cube, fraction, lmax, is_ortho)
     817              :       INTEGER, DIMENSION(3), INTENT(IN)                  :: lb_cube, ub_cube
     818              :       REAL(KIND=dp), INTENT(IN)                          :: fraction
     819              :       INTEGER                                            :: lmax
     820              :       LOGICAL                                            :: is_ortho
     821              : 
     822              :       INTEGER                                            :: cmax
     823              :       REAL(KIND=dp)                                      :: v1, v2, v3, v4, v5
     824              : 
     825     34379784 :       cmax = MAXVAL(((ub_cube - lb_cube) + 1)/2)
     826              : 
     827      8594946 :       IF (is_ortho) THEN
     828              :          v1 = 1.504760E+00_dp
     829              :          v2 = 3.126770E+00_dp
     830              :          v3 = 5.074106E+00_dp
     831              :          v4 = 1.091568E+00_dp
     832              :          v5 = 1.070187E+00_dp
     833              :       ELSE
     834       450545 :          v1 = 7.831105E+00_dp
     835       450545 :          v2 = 2.675174E+00_dp
     836       450545 :          v3 = 7.546553E+00_dp
     837       450545 :          v4 = 6.122446E-01_dp
     838       450545 :          v5 = 3.886382E+00_dp
     839              :       END IF
     840      8594946 :       cost_model = CEILING(((lmax + v1)*(cmax + v2)**3*v3*fraction + v4 + v5*lmax**7)/1000.0_dp)
     841              : 
     842      8594946 :    END FUNCTION cost_model
     843              : ! **************************************************************************************************
     844              : !> \brief pgf_to_tasks converts a given pgf to one or more tasks, in particular
     845              : !>        this determines by which CPUs a given pgf gets collocated
     846              : !>        the format of the task array is as follows
     847              : !>        tasks(1,i) := destination
     848              : !>        tasks(2,i) := source
     849              : !>        tasks(3,i) := compressed type (iatom, jatom, ....)
     850              : !>        tasks(4,i) := type (0: replicated, 1: distributed local, 2: distributed generalised)
     851              : !>        tasks(5,i) := cost
     852              : !>        tasks(6,i) := alternate destination code (0 if none available)
     853              : !>
     854              : !> \param tasks ...
     855              : !> \param ntasks ...
     856              : !> \param curr_tasks ...
     857              : !> \param rab ...
     858              : !> \param cindex ...
     859              : !> \param iatom ...
     860              : !> \param jatom ...
     861              : !> \param iset ...
     862              : !> \param jset ...
     863              : !> \param ipgf ...
     864              : !> \param jpgf ...
     865              : !> \param la_max ...
     866              : !> \param lb_max ...
     867              : !> \param rs_desc ...
     868              : !> \param igrid_level ...
     869              : !> \param n_levels ...
     870              : !> \param cube_center ...
     871              : !> \param lb_cube ...
     872              : !> \param ub_cube ...
     873              : !> \par History
     874              : !>      10.2008 Refactored based on earlier routines by MattW [Joost VandeVondele]
     875              : ! **************************************************************************************************
     876      8594946 :    SUBROUTINE pgf_to_tasks(tasks, ntasks, curr_tasks, &
     877              :                            rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
     878              :                            la_max, lb_max, rs_desc, igrid_level, n_levels, &
     879              :                            cube_center, lb_cube, ub_cube, radius)
     880              : 
     881              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     882              :       INTEGER, INTENT(INOUT)                             :: ntasks, curr_tasks
     883              :       REAL(KIND=dp), DIMENSION(3), INTENT(IN)            :: rab
     884              :       INTEGER, INTENT(IN)                                :: cindex, iatom, jatom, iset, jset, ipgf, &
     885              :                                                             jpgf, la_max, lb_max
     886              :       TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
     887              :       INTEGER, INTENT(IN)                                :: igrid_level, n_levels
     888              :       INTEGER, DIMENSION(3), INTENT(IN)                  :: cube_center, lb_cube, ub_cube
     889              :       REAL(KIND=dp), INTENT(IN)                          :: radius
     890              : 
     891              :       INTEGER, PARAMETER                                 :: add_tasks = 1000
     892              :       REAL(kind=dp), PARAMETER                           :: mult_tasks = 2.0_dp
     893              : 
     894              :       INTEGER                                            :: added_tasks, cost, j, lmax
     895              :       LOGICAL                                            :: is_ortho
     896              :       REAL(KIND=dp)                                      :: tfraction
     897              : 
     898              : !$OMP SINGLE
     899      8594946 :       ntasks = ntasks + 1
     900      8594946 :       IF (ntasks > curr_tasks) THEN
     901         1058 :          curr_tasks = INT((curr_tasks + add_tasks)*mult_tasks)
     902         1058 :          CALL reallocate_tasks(tasks, curr_tasks)
     903              :       END IF
     904              : !$OMP END SINGLE
     905              : 
     906      8594946 :       IF (rs_desc%distributed) THEN
     907              : 
     908              :          ! finds the node(s) that need to process this task
     909              :          ! on exit tasks(:)%dist_type is 1 for distributed tasks and 2 for generalised tasks
     910              :          CALL rs_find_node(rs_desc, igrid_level, n_levels, cube_center, &
     911         1380 :                            ntasks=ntasks, tasks=tasks, lb_cube=lb_cube, ub_cube=ub_cube, added_tasks=added_tasks)
     912              : 
     913              :       ELSE
     914      8593566 :          tasks(ntasks)%destination = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
     915      8593566 :          tasks(ntasks)%dist_type = 0
     916      8593566 :          tasks(ntasks)%subpatch_pattern = 0
     917      8593566 :          added_tasks = 1
     918              :       END IF
     919              : 
     920      8594946 :       lmax = la_max + lb_max
     921      8594946 :       is_ortho = (tasks(ntasks)%dist_type == 0 .OR. tasks(ntasks)%dist_type == 1) .AND. rs_desc%orthorhombic
     922              :       ! we assume the load is shared equally between processes dealing with a generalised Gaussian.
     923              :       ! this could be refined in the future
     924      8594946 :       tfraction = 1.0_dp/added_tasks
     925              : 
     926      8594946 :       cost = cost_model(lb_cube, ub_cube, tfraction, lmax, is_ortho)
     927              : 
     928     17189892 :       DO j = 1, added_tasks
     929      8594946 :          tasks(ntasks - added_tasks + j)%source = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
     930      8594946 :          tasks(ntasks - added_tasks + j)%cost = cost
     931      8594946 :          tasks(ntasks - added_tasks + j)%grid_level = igrid_level
     932      8594946 :          tasks(ntasks - added_tasks + j)%image = cindex
     933      8594946 :          tasks(ntasks - added_tasks + j)%iatom = iatom
     934      8594946 :          tasks(ntasks - added_tasks + j)%jatom = jatom
     935      8594946 :          tasks(ntasks - added_tasks + j)%iset = iset
     936      8594946 :          tasks(ntasks - added_tasks + j)%jset = jset
     937      8594946 :          tasks(ntasks - added_tasks + j)%ipgf = ipgf
     938      8594946 :          tasks(ntasks - added_tasks + j)%jpgf = jpgf
     939     34379784 :          tasks(ntasks - added_tasks + j)%rab = rab
     940     17189892 :          tasks(ntasks - added_tasks + j)%radius = radius
     941              :       END DO
     942              : 
     943      8594946 :    END SUBROUTINE pgf_to_tasks
     944              : 
     945              : ! **************************************************************************************************
     946              : !> \brief performs load balancing of the tasks on the distributed grids
     947              : !> \param tasks ...
     948              : !> \param ntasks ...
     949              : !> \param rs_descs ...
     950              : !> \param grid_level ...
     951              : !> \param natoms ...
     952              : !> \par History
     953              : !>      created 2008-10-03 [Joost VandeVondele]
     954              : ! **************************************************************************************************
     955           54 :    SUBROUTINE load_balance_distributed(tasks, ntasks, rs_descs, grid_level, natoms)
     956              : 
     957              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     958              :       INTEGER                                            :: ntasks
     959              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
     960              :          POINTER                                         :: rs_descs
     961              :       INTEGER                                            :: grid_level, natoms
     962              : 
     963              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_distributed'
     964              : 
     965              :       INTEGER                                            :: handle
     966           54 :       INTEGER, DIMENSION(:, :, :), POINTER               :: list
     967              : 
     968           54 :       CALL timeset(routineN, handle)
     969              : 
     970           54 :       NULLIFY (list)
     971              :       ! here we create for each cpu (0:ncpu-1) a list of possible destinations.
     972              :       ! if a destination would not be in this list, it is a bug
     973           54 :       CALL create_destination_list(list, rs_descs, grid_level)
     974              : 
     975              :       ! now, walk over the tasks, filling in the loads of each destination
     976           54 :       CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.TRUE.)
     977              : 
     978              :       ! optimize loads & fluxes
     979           54 :       CALL optimize_load_list(list, rs_descs(1)%rs_desc%group, rs_descs(1)%rs_desc%my_pos)
     980              : 
     981              :       ! now, walk over the tasks, using the list to set the destinations
     982           54 :       CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.FALSE.)
     983              : 
     984           54 :       DEALLOCATE (list)
     985              : 
     986           54 :       CALL timestop(handle)
     987              : 
     988           54 :    END SUBROUTINE load_balance_distributed
     989              : 
     990              : ! **************************************************************************************************
     991              : !> \brief this serial routine adjusts the fluxes in the global list
     992              : !>
     993              : !> \param list_global ...
     994              : !> \par History
     995              : !>      created 2008-10-06 [Joost VandeVondele]
     996              : ! **************************************************************************************************
     997           27 :    SUBROUTINE balance_global_list(list_global)
     998              :       INTEGER, DIMENSION(:, :, 0:)                       :: list_global
     999              : 
    1000              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'balance_global_list'
    1001              :       INTEGER, PARAMETER                                 :: Max_Iter = 100
    1002              :       REAL(KIND=dp), PARAMETER                           :: Tolerance_factor = 0.005_dp
    1003              : 
    1004              :       INTEGER                                            :: dest, handle, icpu, idest, iflux, &
    1005              :                                                             ilocal, k, maxdest, Ncpu, Nflux
    1006           27 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: flux_connections
    1007              :       LOGICAL                                            :: solution_optimal
    1008              :       REAL(KIND=dp)                                      :: average, load_shift, max_load_shift, &
    1009              :                                                             tolerance
    1010           27 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: load, optimized_flux, optimized_load
    1011           27 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: flux_limits
    1012              : 
    1013           27 :       CALL timeset(routineN, handle)
    1014              : 
    1015           27 :       Ncpu = SIZE(list_global, 3)
    1016           27 :       maxdest = SIZE(list_global, 2)
    1017           81 :       ALLOCATE (load(0:Ncpu - 1))
    1018           81 :       load = 0.0_dp
    1019           54 :       ALLOCATE (optimized_load(0:Ncpu - 1))
    1020              : 
    1021              :       ! figure out the number of fluxes
    1022              :       ! we assume that the global_list is symmetric
    1023           81 :       Nflux = 0
    1024           81 :       DO icpu = 0, ncpu - 1
    1025          189 :          DO idest = 1, maxdest
    1026          108 :             dest = list_global(1, idest, icpu)
    1027          162 :             IF (dest < ncpu .AND. dest > icpu) Nflux = Nflux + 1
    1028              :          END DO
    1029              :       END DO
    1030           81 :       ALLOCATE (optimized_flux(Nflux))
    1031           81 :       ALLOCATE (flux_limits(2, Nflux))
    1032           81 :       ALLOCATE (flux_connections(2, Nflux))
    1033              : 
    1034              :       ! reorder data
    1035          108 :       flux_limits = 0
    1036              :       Nflux = 0
    1037           81 :       DO icpu = 0, ncpu - 1
    1038          162 :          load(icpu) = SUM(list_global(2, :, icpu))
    1039          189 :          DO idest = 1, maxdest
    1040          108 :             dest = list_global(1, idest, icpu)
    1041          162 :             IF (dest < ncpu) THEN
    1042          108 :                IF (dest /= icpu) THEN
    1043           54 :                   IF (dest > icpu) THEN
    1044           27 :                      Nflux = Nflux + 1
    1045           27 :                      flux_limits(2, Nflux) = list_global(2, idest, icpu)
    1046           27 :                      flux_connections(1, Nflux) = icpu
    1047           27 :                      flux_connections(2, Nflux) = dest
    1048              :                   ELSE
    1049           27 :                      DO iflux = 1, Nflux
    1050           27 :                         IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
    1051           27 :                            flux_limits(1, iflux) = -list_global(2, idest, icpu)
    1052           27 :                            EXIT
    1053              :                         END IF
    1054              :                      END DO
    1055              :                   END IF
    1056              :                END IF
    1057              :             END IF
    1058              :          END DO
    1059              :       END DO
    1060              : 
    1061              :       solution_optimal = .FALSE.
    1062           54 :       optimized_flux = 0.0_dp
    1063              : 
    1064              :       ! an iterative solver, if iterated till convergence the maximum load is minimal
    1065              :       ! we terminate before things are fully converged, since this does show up in the timings
    1066              :       ! once the largest shift becomes less than a small fraction of the average load, we're done
    1067              :       ! we're perfectly happy if the load balance is within 1 percent or so
    1068              :       ! the maximum load normally converges even faster
    1069           81 :       average = SUM(load)/SIZE(load)
    1070           27 :       tolerance = Tolerance_factor*average
    1071              : 
    1072           81 :       optimized_load(:) = load
    1073          332 :       DO k = 1, Max_iter
    1074              :          max_load_shift = 0.0_dp
    1075          658 :          DO iflux = 1, Nflux
    1076          329 :             load_shift = (optimized_load(flux_connections(1, iflux)) - optimized_load(flux_connections(2, iflux)))/2
    1077          329 :             load_shift = MAX(flux_limits(1, iflux) - optimized_flux(iflux), load_shift)
    1078          329 :             load_shift = MIN(flux_limits(2, iflux) - optimized_flux(iflux), load_shift)
    1079          329 :             max_load_shift = MAX(ABS(load_shift), max_load_shift)
    1080          329 :             optimized_load(flux_connections(1, iflux)) = optimized_load(flux_connections(1, iflux)) - load_shift
    1081          329 :             optimized_load(flux_connections(2, iflux)) = optimized_load(flux_connections(2, iflux)) + load_shift
    1082          658 :             optimized_flux(iflux) = optimized_flux(iflux) + load_shift
    1083              :          END DO
    1084          332 :          IF (max_load_shift < tolerance) THEN
    1085              :             solution_optimal = .TRUE.
    1086              :             EXIT
    1087              :          END IF
    1088              :       END DO
    1089              : 
    1090              :       ! now adjust the load list to reflect the optimized fluxes
    1091              :       ! reorder data
    1092              :       Nflux = 0
    1093           81 :       DO icpu = 0, ncpu - 1
    1094          162 :          DO idest = 1, maxdest
    1095          162 :             IF (list_global(1, idest, icpu) == icpu) ilocal = idest
    1096              :          END DO
    1097          189 :          DO idest = 1, maxdest
    1098          108 :             dest = list_global(1, idest, icpu)
    1099          162 :             IF (dest < ncpu) THEN
    1100          108 :                IF (dest /= icpu) THEN
    1101           54 :                   IF (dest > icpu) THEN
    1102           27 :                      Nflux = Nflux + 1
    1103           27 :                      IF (optimized_flux(Nflux) > 0) THEN
    1104              :                         list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
    1105            0 :                                                        list_global(2, idest, icpu) - NINT(optimized_flux(Nflux))
    1106            0 :                         list_global(2, idest, icpu) = NINT(optimized_flux(Nflux))
    1107              :                      ELSE
    1108              :                         list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
    1109           27 :                                                        list_global(2, idest, icpu)
    1110           27 :                         list_global(2, idest, icpu) = 0
    1111              :                      END IF
    1112              :                   ELSE
    1113           27 :                      DO iflux = 1, Nflux
    1114           27 :                         IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
    1115           27 :                            IF (optimized_flux(iflux) > 0) THEN
    1116              :                               list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
    1117            0 :                                                              list_global(2, idest, icpu)
    1118            0 :                               list_global(2, idest, icpu) = 0
    1119              :                            ELSE
    1120              :                               list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
    1121           27 :                                                              list_global(2, idest, icpu) + NINT(optimized_flux(iflux))
    1122           27 :                               list_global(2, idest, icpu) = -NINT(optimized_flux(iflux))
    1123              :                            END IF
    1124              :                            EXIT
    1125              :                         END IF
    1126              :                      END DO
    1127              :                   END IF
    1128              :                END IF
    1129              :             END IF
    1130              :          END DO
    1131              :       END DO
    1132              : 
    1133           27 :       CALL timestop(handle)
    1134              : 
    1135           54 :    END SUBROUTINE balance_global_list
    1136              : 
    1137              : ! **************************************************************************************************
    1138              : !> \brief this routine gets back optimized loads for all destinations
    1139              : !>
    1140              : !> \param list ...
    1141              : !> \param group ...
    1142              : !> \param my_pos ...
    1143              : !> \par History
    1144              : !>      created 2008-10-06 [Joost VandeVondele]
    1145              : !>      Modified 2016-01   [EPCC] Reduce memory requirements on P processes
    1146              : !>                                from O(P^2) to O(P)
    1147              : ! **************************************************************************************************
    1148           54 :    SUBROUTINE optimize_load_list(list, group, my_pos)
    1149              :       INTEGER, DIMENSION(:, :, 0:)                       :: list
    1150              :       TYPE(mp_comm_type), INTENT(IN) :: group
    1151              :       INTEGER, INTENT(IN)                                :: my_pos
    1152              : 
    1153              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'optimize_load_list'
    1154              :       INTEGER, PARAMETER                                 :: rank_of_root = 0
    1155              : 
    1156              :       INTEGER                                            :: handle, icpu, idest, maxdest, ncpu
    1157              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: load_all
    1158           54 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: load_partial
    1159              :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: list_global
    1160              : 
    1161           54 :       CALL timeset(routineN, handle)
    1162              : 
    1163           54 :       ncpu = SIZE(list, 3)
    1164           54 :       maxdest = SIZE(list, 2)
    1165              : 
    1166              :       !find total workload ...
    1167          162 :       ALLOCATE (load_all(maxdest*ncpu))
    1168          108 :       load_all(:) = RESHAPE(list(2, :, :), [maxdest*ncpu])
    1169           54 :       CALL group%sum(load_all(:), rank_of_root)
    1170              : 
    1171              :       ! ... and optimise the work per process
    1172          216 :       ALLOCATE (list_global(2, maxdest, ncpu))
    1173           54 :       IF (rank_of_root == my_pos) THEN
    1174          189 :          list_global(1, :, :) = list(1, :, :)
    1175          243 :          list_global(2, :, :) = RESHAPE(load_all, [maxdest, ncpu])
    1176           27 :          CALL balance_global_list(list_global)
    1177              :       END IF
    1178           54 :       CALL group%bcast(list_global, rank_of_root)
    1179              : 
    1180              :       !figure out how much can be sent to other processes
    1181          216 :       ALLOCATE (load_partial(maxdest, ncpu))
    1182              :       ! send 'load_all', which is a copy of 'list' (but without leading dimension/stride)
    1183          162 :       CALL group%sum_partial(RESHAPE(load_all, [maxdest, ncpu]), load_partial(:, :))
    1184              : 
    1185          162 :       DO icpu = 1, ncpu
    1186          378 :          DO idest = 1, maxdest
    1187              : 
    1188              :             !need to deduct 1 because `list' was passed in to this routine as being indexed from zero
    1189          324 :             IF (load_partial(idest, icpu) > list_global(2, idest, icpu)) THEN
    1190           37 :                IF (load_partial(idest, icpu) - list(2, idest, icpu - 1) < list_global(2, idest, icpu)) THEN
    1191              :                   list(2, idest, icpu - 1) = list_global(2, idest, icpu) &
    1192           10 :                                              - (load_partial(idest, icpu) - list(2, idest, icpu - 1))
    1193              :                ELSE
    1194           27 :                   list(2, idest, icpu - 1) = 0
    1195              :                END IF
    1196              :             END IF
    1197              : 
    1198              :          END DO
    1199              :       END DO
    1200              : 
    1201              :       !clean up before leaving
    1202           54 :       DEALLOCATE (load_all)
    1203           54 :       DEALLOCATE (list_global)
    1204           54 :       DEALLOCATE (load_partial)
    1205              : 
    1206           54 :       CALL timestop(handle)
    1207           54 :    END SUBROUTINE optimize_load_list
    1208              : 
    1209              : ! **************************************************************************************************
    1210              : !> \brief fill the load list with values derived from the tasks array
    1211              : !>        from the alternate locations, we select the alternate location that
    1212              : !>        can be used without increasing the number of matrix blocks needed to
    1213              : !>        distribute.
    1214              : !>        Replicated tasks are not yet considered
    1215              : !>
    1216              : !> \param list ...
    1217              : !> \param rs_descs ...
    1218              : !> \param grid_level ...
    1219              : !> \param tasks ...
    1220              : !> \param ntasks ...
    1221              : !> \param natoms ...
    1222              : !> \param create_list ...
    1223              : !> \par History
    1224              : !>      created 2008-10-06 [Joost VandeVondele]
    1225              : ! **************************************************************************************************
    1226          108 :    SUBROUTINE compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list)
    1227              :       INTEGER, DIMENSION(:, :, 0:)                       :: list
    1228              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1229              :          POINTER                                         :: rs_descs
    1230              :       INTEGER                                            :: grid_level
    1231              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1232              :       INTEGER                                            :: ntasks, natoms
    1233              :       LOGICAL                                            :: create_list
    1234              : 
    1235              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_load_list'
    1236              : 
    1237              :       INTEGER :: cost, dest, handle, i, iatom, ilevel, img, img_old, iopt, ipgf, iset, itask, &
    1238              :                  itask_start, itask_stop, jatom, jpgf, jset, li, maxdest, ncpu, ndest_pair, nopt, nshort, &
    1239              :                  rank
    1240              :       INTEGER(KIND=int_8)                                :: bit_pattern, ipair, ipair_old, natom8
    1241              :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: loads
    1242          108 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: all_dests, index
    1243              :       INTEGER, DIMENSION(6)                              :: options
    1244              : 
    1245          108 :       CALL timeset(routineN, handle)
    1246              : 
    1247          324 :       ALLOCATE (loads(0:rs_descs(grid_level)%rs_desc%group_size - 1))
    1248          108 :       CALL get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks=.FALSE.)
    1249              : 
    1250          108 :       maxdest = SIZE(list, 2)
    1251          108 :       ncpu = SIZE(list, 3)
    1252          108 :       natom8 = natoms
    1253              : 
    1254              :       ! first find the tasks that deal with the same atom pair
    1255          108 :       itask_stop = 0
    1256          108 :       ipair_old = HUGE(ipair_old)
    1257          108 :       img_old = -1
    1258          108 :       ALLOCATE (all_dests(0))
    1259          108 :       ALLOCATE (INDEX(0))
    1260              : 
    1261              :       DO
    1262              : 
    1263              :          ! first find the range of tasks that deal with the same atom pair
    1264          290 :          itask_start = itask_stop + 1
    1265          290 :          itask_stop = itask_start
    1266          290 :          IF (itask_stop > ntasks) EXIT
    1267          182 :          ilevel = tasks(itask_stop)%grid_level
    1268          182 :          img_old = tasks(itask_stop)%image
    1269          182 :          iatom = tasks(itask_stop)%iatom
    1270          182 :          jatom = tasks(itask_stop)%jatom
    1271          182 :          iset = tasks(itask_stop)%iset
    1272          182 :          jset = tasks(itask_stop)%jset
    1273          182 :          ipgf = tasks(itask_stop)%ipgf
    1274          182 :          jpgf = tasks(itask_stop)%jpgf
    1275              : 
    1276          182 :          ipair_old = (iatom - 1)*natom8 + (jatom - 1)
    1277              :          DO
    1278         7386 :             IF (itask_stop + 1 > ntasks) EXIT
    1279         7298 :             ilevel = tasks(itask_stop + 1)%grid_level
    1280         7298 :             img = tasks(itask_stop + 1)%image
    1281         7298 :             iatom = tasks(itask_stop + 1)%iatom
    1282         7298 :             jatom = tasks(itask_stop + 1)%jatom
    1283         7298 :             iset = tasks(itask_stop + 1)%iset
    1284         7298 :             jset = tasks(itask_stop + 1)%jset
    1285         7298 :             ipgf = tasks(itask_stop + 1)%ipgf
    1286         7298 :             jpgf = tasks(itask_stop + 1)%jpgf
    1287              : 
    1288         7298 :             ipair = (iatom - 1)*natom8 + (jatom - 1)
    1289         7386 :             IF (ipair == ipair_old .AND. img == img_old) THEN
    1290              :                itask_stop = itask_stop + 1
    1291              :             ELSE
    1292              :                EXIT
    1293              :             END IF
    1294              :          END DO
    1295          182 :          ipair = ipair_old
    1296          182 :          nshort = itask_stop - itask_start + 1
    1297              : 
    1298              :          ! find the unique list of destinations on this grid level only
    1299          182 :          DEALLOCATE (all_dests)
    1300          546 :          ALLOCATE (all_dests(nshort))
    1301          182 :          DEALLOCATE (index)
    1302          364 :          ALLOCATE (INDEX(nshort))
    1303         7568 :          DO i = 1, nshort
    1304         7386 :             ilevel = tasks(itask_start + i - 1)%grid_level
    1305         7386 :             img = tasks(itask_start + i - 1)%image
    1306         7386 :             iatom = tasks(itask_start + i - 1)%iatom
    1307         7386 :             jatom = tasks(itask_start + i - 1)%jatom
    1308         7386 :             iset = tasks(itask_start + i - 1)%iset
    1309         7386 :             jset = tasks(itask_start + i - 1)%jset
    1310         7386 :             ipgf = tasks(itask_start + i - 1)%ipgf
    1311         7386 :             jpgf = tasks(itask_start + i - 1)%jpgf
    1312              : 
    1313         7568 :             IF (ilevel == grid_level) THEN
    1314         2760 :                all_dests(i) = decode_rank(tasks(itask_start + i - 1)%destination, SIZE(rs_descs))
    1315              :             ELSE
    1316         4626 :                all_dests(i) = HUGE(all_dests(i))
    1317              :             END IF
    1318              :          END DO
    1319          182 :          CALL sort(all_dests, nshort, index)
    1320          182 :          ndest_pair = 1
    1321         7386 :          DO i = 2, nshort
    1322         7386 :             IF ((all_dests(ndest_pair) /= all_dests(i)) .AND. (all_dests(i) /= HUGE(all_dests(i)))) THEN
    1323           10 :                ndest_pair = ndest_pair + 1
    1324           10 :                all_dests(ndest_pair) = all_dests(i)
    1325              :             END IF
    1326              :          END DO
    1327              : 
    1328         7676 :          DO itask = itask_start, itask_stop
    1329              : 
    1330         7386 :             dest = decode_rank(tasks(itask)%destination, SIZE(rs_descs)) ! notice that dest can be changed
    1331         7386 :             ilevel = tasks(itask)%grid_level
    1332         7386 :             img = tasks(itask)%image
    1333         7386 :             iatom = tasks(itask)%iatom
    1334         7386 :             jatom = tasks(itask)%jatom
    1335         7386 :             iset = tasks(itask)%iset
    1336         7386 :             jset = tasks(itask)%jset
    1337         7386 :             ipgf = tasks(itask)%ipgf
    1338         7386 :             jpgf = tasks(itask)%jpgf
    1339              : 
    1340              :             ! Only proceed with tasks which are on this grid level
    1341         7386 :             IF (ilevel /= grid_level) CYCLE
    1342         2760 :             ipair = (iatom - 1)*natom8 + (jatom - 1)
    1343         2760 :             cost = INT(tasks(itask)%cost)
    1344              : 
    1345          182 :             SELECT CASE (tasks(itask)%dist_type)
    1346              :             CASE (1)
    1347         2760 :                bit_pattern = tasks(itask)%subpatch_pattern
    1348         2760 :                nopt = 0
    1349         2760 :                IF (BTEST(bit_pattern, 0)) THEN
    1350            0 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [-1, 0, 0])
    1351            0 :                   IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
    1352            0 :                      nopt = nopt + 1
    1353            0 :                      options(nopt) = rank
    1354              :                   END IF
    1355              :                END IF
    1356         2760 :                IF (BTEST(bit_pattern, 1)) THEN
    1357            0 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [+1, 0, 0])
    1358            0 :                   IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
    1359            0 :                      nopt = nopt + 1
    1360            0 :                      options(nopt) = rank
    1361              :                   END IF
    1362              :                END IF
    1363         2760 :                IF (BTEST(bit_pattern, 2)) THEN
    1364           48 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, -1, 0])
    1365           96 :                   IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
    1366            0 :                      nopt = nopt + 1
    1367            0 :                      options(nopt) = rank
    1368              :                   END IF
    1369              :                END IF
    1370         2760 :                IF (BTEST(bit_pattern, 3)) THEN
    1371            0 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, +1, 0])
    1372            0 :                   IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
    1373            0 :                      nopt = nopt + 1
    1374            0 :                      options(nopt) = rank
    1375              :                   END IF
    1376              :                END IF
    1377         2760 :                IF (BTEST(bit_pattern, 4)) THEN
    1378         1150 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, 0, -1])
    1379         2200 :                   IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
    1380          100 :                      nopt = nopt + 1
    1381          100 :                      options(nopt) = rank
    1382              :                   END IF
    1383              :                END IF
    1384         2760 :                IF (BTEST(bit_pattern, 5)) THEN
    1385          500 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, 0, +1])
    1386         1000 :                   IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
    1387           80 :                      nopt = nopt + 1
    1388           80 :                      options(nopt) = rank
    1389              :                   END IF
    1390              :                END IF
    1391         2760 :                IF (nopt > 0) THEN
    1392              :                   ! set it to the rank with the lowest load
    1393          180 :                   rank = options(1)
    1394          180 :                   DO iopt = 2, nopt
    1395          180 :                      IF (loads(rank) > loads(options(iopt))) rank = options(iopt)
    1396              :                   END DO
    1397              :                ELSE
    1398         2580 :                   rank = dest
    1399              :                END IF
    1400         2760 :                li = list_index(list, rank, dest)
    1401         2760 :                IF (create_list) THEN
    1402         1380 :                   list(2, li, dest) = list(2, li, dest) + cost
    1403              :                ELSE
    1404         1380 :                   IF (list(1, li, dest) == dest) THEN
    1405         1290 :                      tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
    1406              :                   ELSE
    1407           90 :                      IF (list(2, li, dest) >= cost) THEN
    1408           10 :                         list(2, li, dest) = list(2, li, dest) - cost
    1409           10 :                         tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
    1410              :                      ELSE
    1411           80 :                         tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
    1412              :                      END IF
    1413              :                   END IF
    1414              :                END IF
    1415              :             CASE (2) ! generalised
    1416            0 :                li = list_index(list, dest, dest)
    1417            0 :                IF (create_list) THEN
    1418            0 :                   list(2, li, dest) = list(2, li, dest) + cost
    1419              :                ELSE
    1420            0 :                   IF (list(1, li, dest) == dest) THEN
    1421            0 :                      tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
    1422              :                   ELSE
    1423            0 :                      IF (list(2, li, dest) >= cost) THEN
    1424            0 :                         list(2, li, dest) = list(2, li, dest) - cost
    1425            0 :                         tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
    1426              :                      ELSE
    1427            0 :                         tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
    1428              :                      END IF
    1429              :                   END IF
    1430              :                END IF
    1431              :             CASE DEFAULT
    1432         7386 :                CPABORT("")
    1433              :             END SELECT
    1434              : 
    1435              :          END DO
    1436              : 
    1437              :       END DO
    1438              : 
    1439          108 :       CALL timestop(handle)
    1440              : 
    1441          216 :    END SUBROUTINE compute_load_list
    1442              : ! **************************************************************************************************
    1443              : !> \brief small helper function to return the proper index in the list array
    1444              : !>
    1445              : !> \param list ...
    1446              : !> \param rank ...
    1447              : !> \param dest ...
    1448              : !> \return ...
    1449              : !> \par History
    1450              : !>      created 2008-10-06 [Joost VandeVondele]
    1451              : ! **************************************************************************************************
    1452         2760 :    INTEGER FUNCTION list_index(list, rank, dest)
    1453              :       INTEGER, DIMENSION(:, :, 0:), INTENT(IN)           :: list
    1454              :       INTEGER, INTENT(IN)                                :: rank, dest
    1455              : 
    1456         2760 :       list_index = 1
    1457         1424 :       DO
    1458         4184 :          IF (list(1, list_index, dest) == rank) EXIT
    1459         1424 :          list_index = list_index + 1
    1460              :       END DO
    1461         2760 :    END FUNCTION list_index
    1462              : ! **************************************************************************************************
    1463              : !> \brief create a list with possible destinations (i.e. the central cpu and neighbors) for each cpu
    1464              : !>        note that we allocate it with an additional field to store the load of this destination
    1465              : !>
    1466              : !> \param list ...
    1467              : !> \param rs_descs ...
    1468              : !> \param grid_level ...
    1469              : !> \par History
    1470              : !>      created 2008-10-06 [Joost VandeVondele]
    1471              : ! **************************************************************************************************
    1472           54 :    SUBROUTINE create_destination_list(list, rs_descs, grid_level)
    1473              :       INTEGER, DIMENSION(:, :, :), POINTER               :: list
    1474              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1475              :          POINTER                                         :: rs_descs
    1476              :       INTEGER, INTENT(IN)                                :: grid_level
    1477              : 
    1478              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_destination_list'
    1479              : 
    1480              :       INTEGER                                            :: handle, i, icpu, j, maxcount, ncpu, &
    1481              :                                                             ultimate_max
    1482           54 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: index, sublist
    1483              : 
    1484           54 :       CALL timeset(routineN, handle)
    1485              : 
    1486           54 :       CPASSERT(.NOT. ASSOCIATED(list))
    1487           54 :       ncpu = rs_descs(grid_level)%rs_desc%group_size
    1488           54 :       ultimate_max = 7
    1489              : 
    1490          162 :       ALLOCATE (list(2, ultimate_max, 0:ncpu - 1))
    1491              : 
    1492           54 :       ALLOCATE (INDEX(ultimate_max))
    1493           54 :       ALLOCATE (sublist(ultimate_max))
    1494          432 :       sublist = HUGE(sublist)
    1495              : 
    1496           54 :       maxcount = 1
    1497          162 :       DO icpu = 0, ncpu - 1
    1498          108 :          sublist(1) = icpu
    1499          108 :          sublist(2) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [-1, 0, 0])
    1500          108 :          sublist(3) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [+1, 0, 0])
    1501          108 :          sublist(4) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, -1, 0])
    1502          108 :          sublist(5) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, +1, 0])
    1503          108 :          sublist(6) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, 0, -1])
    1504          108 :          sublist(7) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, 0, +1])
    1505              :          ! only retain unique values of the destination
    1506          108 :          CALL sort(sublist, ultimate_max, index)
    1507          108 :          j = 1
    1508          756 :          DO i = 2, 7
    1509          756 :             IF (sublist(i) /= sublist(j)) THEN
    1510          108 :                j = j + 1
    1511          108 :                sublist(j) = sublist(i)
    1512              :             END IF
    1513              :          END DO
    1514          108 :          maxcount = MAX(maxcount, j)
    1515          648 :          sublist(j + 1:ultimate_max) = HUGE(sublist)
    1516          864 :          list(1, :, icpu) = sublist
    1517          918 :          list(2, :, icpu) = 0
    1518              :       END DO
    1519              : 
    1520           54 :       CALL reallocate(list, 1, 2, 1, maxcount, 0, ncpu - 1)
    1521              : 
    1522           54 :       CALL timestop(handle)
    1523              : 
    1524          108 :    END SUBROUTINE create_destination_list
    1525              : 
    1526              : ! **************************************************************************************************
    1527              : !> \brief given a task list, compute the load of each process everywhere
    1528              : !>        giving this function the ability to loop over a (sub)set of rs_grids,
    1529              : !>        and do all the communication in one shot, would speed it up
    1530              : !> \param loads ...
    1531              : !> \param rs_descs ...
    1532              : !> \param grid_level ...
    1533              : !> \param ntasks ...
    1534              : !> \param tasks ...
    1535              : !> \param use_reordered_ranks ...
    1536              : !> \par History
    1537              : !>      none
    1538              : !> \author MattW 21/11/2007
    1539              : ! **************************************************************************************************
    1540          480 :    SUBROUTINE get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks)
    1541              :       INTEGER(KIND=int_8), DIMENSION(:)                  :: loads
    1542              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1543              :          POINTER                                         :: rs_descs
    1544              :       INTEGER                                            :: grid_level, ntasks
    1545              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1546              :       LOGICAL, INTENT(IN)                                :: use_reordered_ranks
    1547              : 
    1548              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'get_current_loads'
    1549              : 
    1550              :       INTEGER                                            :: handle, i, iatom, ilevel, img, ipgf, &
    1551              :                                                             iset, jatom, jpgf, jset
    1552              :       INTEGER(KIND=int_8)                                :: total_cost_local
    1553              :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: recv_buf_i, send_buf_i
    1554              :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    1555              : 
    1556          480 :       CALL timeset(routineN, handle)
    1557              : 
    1558          480 :       desc => rs_descs(grid_level)%rs_desc
    1559              : 
    1560              :       ! allocate local arrays
    1561         1440 :       ALLOCATE (send_buf_i(desc%group_size))
    1562          960 :       ALLOCATE (recv_buf_i(desc%group_size))
    1563              : 
    1564              :       ! communication step 1 : compute the total local cost of the tasks
    1565              :       !                        each proc needs to know the amount of work he will receive
    1566              : 
    1567              :       ! send buffer now contains for each target the cost of the tasks it will receive
    1568         1440 :       send_buf_i = 0
    1569        36792 :       DO i = 1, ntasks
    1570        36312 :          ilevel = tasks(i)%grid_level
    1571        36312 :          img = tasks(i)%image
    1572        36312 :          iatom = tasks(i)%iatom
    1573        36312 :          jatom = tasks(i)%jatom
    1574        36312 :          iset = tasks(i)%iset
    1575        36312 :          jset = tasks(i)%jset
    1576        36312 :          ipgf = tasks(i)%ipgf
    1577        36312 :          jpgf = tasks(i)%jpgf
    1578        36312 :          IF (ilevel /= grid_level) CYCLE
    1579        10476 :          IF (use_reordered_ranks) THEN
    1580              :             send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) = &
    1581              :                send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) &
    1582         3618 :                + tasks(i)%cost
    1583              :          ELSE
    1584              :             send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) = &
    1585              :                send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) &
    1586         6378 :                + tasks(i)%cost
    1587              :          END IF
    1588              :       END DO
    1589          480 :       CALL desc%group%alltoall(send_buf_i, recv_buf_i, 1)
    1590              : 
    1591              :       ! communication step 2 : compute the global cost of the tasks
    1592         1440 :       total_cost_local = SUM(recv_buf_i)
    1593              : 
    1594              :       ! after this step, the recv buffer contains the local cost for each CPU
    1595         1440 :       CALL desc%group%allgather(total_cost_local, loads)
    1596              : 
    1597          480 :       CALL timestop(handle)
    1598              : 
    1599          960 :    END SUBROUTINE get_current_loads
    1600              : ! **************************************************************************************************
    1601              : !> \brief performs load balancing shifting tasks on the replicated grids
    1602              : !>        this modifies the destination of some of the tasks on replicated
    1603              : !>        grids, and in this way balances the load
    1604              : !> \param rs_descs ...
    1605              : !> \param ntasks ...
    1606              : !> \param tasks ...
    1607              : !> \par History
    1608              : !>      none
    1609              : !> \author MattW 21/11/2007
    1610              : ! **************************************************************************************************
    1611           48 :    SUBROUTINE load_balance_replicated(rs_descs, ntasks, tasks)
    1612              : 
    1613              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1614              :          POINTER                                         :: rs_descs
    1615              :       INTEGER                                            :: ntasks
    1616              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1617              : 
    1618              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_replicated'
    1619              : 
    1620              :       INTEGER                                            :: handle, i, iatom, ilevel, img, ipgf, &
    1621              :                                                             iset, j, jatom, jpgf, jset, &
    1622              :                                                             no_overloaded, no_underloaded, &
    1623              :                                                             proc_receiving
    1624              :       INTEGER(KIND=int_8)                                :: average_cost, cost_task_rep, count, &
    1625              :                                                             offset, total_cost_global
    1626           48 :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: load_imbalance, loads, recv_buf_i
    1627           48 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: index
    1628              :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    1629              : 
    1630           48 :       CALL timeset(routineN, handle)
    1631              : 
    1632           48 :       desc => rs_descs(1)%rs_desc
    1633              : 
    1634              :       ! allocate local arrays
    1635          144 :       ALLOCATE (recv_buf_i(desc%group_size))
    1636           96 :       ALLOCATE (loads(desc%group_size))
    1637              : 
    1638          144 :       recv_buf_i = 0
    1639          234 :       DO i = 1, SIZE(rs_descs)
    1640          186 :          CALL get_current_loads(loads, rs_descs, i, ntasks, tasks, use_reordered_ranks=.TRUE.)
    1641          606 :          recv_buf_i(:) = recv_buf_i + loads
    1642              :       END DO
    1643              : 
    1644          144 :       total_cost_global = SUM(recv_buf_i)
    1645           48 :       average_cost = total_cost_global/desc%group_size
    1646              : 
    1647              :       !
    1648              :       ! compute how to redistribute the replicated tasks so that the average cost is reached
    1649              :       !
    1650              : 
    1651              :       ! load imbalance measures the load of a given CPU relative
    1652              :       ! to the optimal load distribution (load=average)
    1653          144 :       ALLOCATE (load_imbalance(desc%group_size))
    1654          144 :       ALLOCATE (INDEX(desc%group_size))
    1655              : 
    1656          144 :       load_imbalance(:) = recv_buf_i - average_cost
    1657           48 :       no_overloaded = 0
    1658           48 :       no_underloaded = 0
    1659              : 
    1660          144 :       DO i = 1, desc%group_size
    1661           96 :          IF (load_imbalance(i) > 0) no_overloaded = no_overloaded + 1
    1662          144 :          IF (load_imbalance(i) < 0) no_underloaded = no_underloaded + 1
    1663              :       END DO
    1664              : 
    1665              :       ! sort the recv_buffer on number of tasks, gives us index which provides a
    1666              :       ! mapping between processor ranks and how overloaded the processor
    1667           48 :       CALL sort(recv_buf_i, SIZE(recv_buf_i), index)
    1668              : 
    1669              :       ! find out the number of replicated tasks each proc has
    1670              :       ! but only those tasks which have not yet been assigned
    1671           48 :       cost_task_rep = 0
    1672         3666 :       DO i = 1, ntasks
    1673              :          IF (tasks(i)%dist_type == 0 &
    1674         3666 :              .AND. decode_rank(tasks(i)%destination, SIZE(rs_descs)) == decode_rank(tasks(i)%source, SIZE(rs_descs))) THEN
    1675         2238 :             cost_task_rep = cost_task_rep + tasks(i)%cost
    1676              :          END IF
    1677              :       END DO
    1678              : 
    1679              :       ! now, correct the load imbalance for the overloaded CPUs
    1680              :       ! they will send away not more than the total load of replicated tasks
    1681           48 :       CALL desc%group%allgather(cost_task_rep, recv_buf_i)
    1682              : 
    1683          144 :       DO i = 1, desc%group_size
    1684              :          ! At the moment we can only offload replicated tasks
    1685           96 :          IF (load_imbalance(i) > 0) &
    1686           96 :             load_imbalance(i) = MIN(load_imbalance(i), recv_buf_i(i))
    1687              :       END DO
    1688              : 
    1689              :       ! simplest algorithm I can think of of is that the processor with the most
    1690              :       ! excess tasks fills up the process needing most, then moves on to next most.
    1691              :       ! At the moment if we've got less replicated tasks than we're overloaded then
    1692              :       ! task balancing will be incomplete
    1693              : 
    1694              :       ! only need to do anything if I've excess tasks
    1695           48 :       IF (load_imbalance(desc%my_pos + 1) > 0) THEN
    1696              : 
    1697           22 :          count = 0 ! weighted amount of tasks offloaded
    1698           22 :          offset = 0 ! no of underloaded processes already filled by other more overloaded procs
    1699              : 
    1700              :          ! calculate offset
    1701           22 :          DO i = desc%group_size, desc%group_size - no_overloaded + 1, -1
    1702           22 :             IF (INDEX(i) == desc%my_pos + 1) THEN
    1703              :                EXIT
    1704              :             ELSE
    1705            0 :                offset = offset + load_imbalance(INDEX(i))
    1706              :             END IF
    1707              :          END DO
    1708              : 
    1709              :          ! find my starting processor to send to
    1710           22 :          proc_receiving = HUGE(proc_receiving)
    1711           22 :          DO i = 1, no_underloaded
    1712           22 :             offset = offset + load_imbalance(INDEX(i))
    1713           22 :             IF (offset <= 0) THEN
    1714              :                proc_receiving = i
    1715              :                EXIT
    1716              :             END IF
    1717              :          END DO
    1718              : 
    1719              :          ! offset now contains minus the number of tasks proc_receiving requires
    1720              :          ! we fill this up by adjusting the destination of tasks on the replicated grid,
    1721              :          ! then move to next most underloaded proc
    1722         1380 :          DO j = 1, ntasks
    1723              :             IF (tasks(j)%dist_type == 0 &
    1724         1380 :                 .AND. decode_rank(tasks(j)%destination, SIZE(rs_descs)) == decode_rank(tasks(j)%source, SIZE(rs_descs))) THEN
    1725              :                ! just avoid sending to non existing procs due to integer truncation
    1726              :                ! in the computation of the average
    1727          808 :                IF (proc_receiving > no_underloaded) EXIT
    1728              :                ! set new destination
    1729          808 :                ilevel = tasks(j)%grid_level
    1730          808 :                img = tasks(j)%image
    1731          808 :                iatom = tasks(j)%iatom
    1732          808 :                jatom = tasks(j)%jatom
    1733          808 :                iset = tasks(j)%iset
    1734          808 :                jset = tasks(j)%jset
    1735          808 :                ipgf = tasks(j)%ipgf
    1736          808 :                jpgf = tasks(j)%jpgf
    1737          808 :                tasks(j)%destination = encode_rank(INDEX(proc_receiving) - 1, ilevel, SIZE(rs_descs))
    1738          808 :                offset = offset + tasks(j)%cost
    1739          808 :                count = count + tasks(j)%cost
    1740          808 :                IF (count >= load_imbalance(desc%my_pos + 1)) EXIT
    1741          786 :                IF (offset > 0) THEN
    1742            0 :                   proc_receiving = proc_receiving + 1
    1743              :                   ! just avoid sending to non existing procs due to integer truncation
    1744              :                   ! in the computation of the average
    1745            0 :                   IF (proc_receiving > no_underloaded) EXIT
    1746            0 :                   offset = load_imbalance(INDEX(proc_receiving))
    1747              :                END IF
    1748              :             END IF
    1749              :          END DO
    1750              :       END IF
    1751              : 
    1752           48 :       DEALLOCATE (index)
    1753           48 :       DEALLOCATE (load_imbalance)
    1754              : 
    1755           48 :       CALL timestop(handle)
    1756              : 
    1757           96 :    END SUBROUTINE load_balance_replicated
    1758              : 
    1759              : ! **************************************************************************************************
    1760              : !> \brief given an input task list, redistribute so that all tasks can be processed locally,
    1761              : !>        i.e. dest equals rank
    1762              : !> \param rs_descs ...
    1763              : !> \param ntasks ...
    1764              : !> \param tasks ...
    1765              : !> \param ntasks_recv ...
    1766              : !> \param tasks_recv ...
    1767              : !> \par History
    1768              : !>      none
    1769              : !> \author MattW 21/11/2007
    1770              : ! **************************************************************************************************
    1771           48 :    SUBROUTINE create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
    1772              : 
    1773              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1774              :          POINTER                                         :: rs_descs
    1775              :       INTEGER                                            :: ntasks
    1776              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1777              :       INTEGER                                            :: ntasks_recv
    1778              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks_recv
    1779              : 
    1780              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_local_tasks'
    1781              : 
    1782              :       INTEGER                                            :: handle, i, j, k, l, rank
    1783              :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: recv_buf, send_buf
    1784              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: recv_disps, recv_sizes, send_disps, &
    1785              :                                                             send_sizes
    1786              :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    1787              : 
    1788           48 :       CALL timeset(routineN, handle)
    1789              : 
    1790           48 :       desc => rs_descs(1)%rs_desc
    1791              : 
    1792              :       ! allocate local arrays
    1793          144 :       ALLOCATE (send_sizes(desc%group_size))
    1794           96 :       ALLOCATE (recv_sizes(desc%group_size))
    1795           96 :       ALLOCATE (send_disps(desc%group_size))
    1796           96 :       ALLOCATE (recv_disps(desc%group_size))
    1797          144 :       ALLOCATE (send_buf(desc%group_size))
    1798           96 :       ALLOCATE (recv_buf(desc%group_size))
    1799              : 
    1800              :       ! fill send buffer, now counting how many tasks will be send (stored in an int8 array for convenience only).
    1801          144 :       send_buf = 0
    1802         3666 :       DO i = 1, ntasks
    1803              :          rank = rs_descs(decode_level(tasks(i)%destination, SIZE(rs_descs))) &
    1804         3618 :                 %rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs)))
    1805         3666 :          send_buf(rank + 1) = send_buf(rank + 1) + 1
    1806              :       END DO
    1807              : 
    1808           48 :       CALL desc%group%alltoall(send_buf, recv_buf, 1)
    1809              : 
    1810              :       ! pack the tasks, and send them around
    1811              : 
    1812          144 :       send_sizes = 0
    1813          144 :       send_disps = 0
    1814          144 :       recv_sizes = 0
    1815          144 :       recv_disps = 0
    1816              : 
    1817           48 :       send_sizes(1) = INT(send_buf(1)*task_size_in_int8)
    1818           48 :       recv_sizes(1) = INT(recv_buf(1)*task_size_in_int8)
    1819           96 :       DO i = 2, desc%group_size
    1820           48 :          send_sizes(i) = INT(send_buf(i)*task_size_in_int8)
    1821           48 :          recv_sizes(i) = INT(recv_buf(i)*task_size_in_int8)
    1822           48 :          send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
    1823           96 :          recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
    1824              :       END DO
    1825              : 
    1826              :       ! deallocate old send/recv buffers
    1827           48 :       DEALLOCATE (send_buf)
    1828           48 :       DEALLOCATE (recv_buf)
    1829              : 
    1830              :       ! allocate them with new sizes
    1831          233 :       ALLOCATE (send_buf(SUM(send_sizes)))
    1832          238 :       ALLOCATE (recv_buf(SUM(recv_sizes)))
    1833              : 
    1834              :       ! do packing
    1835        61554 :       send_buf = 0
    1836          144 :       send_sizes = 0
    1837         3666 :       DO j = 1, ntasks
    1838              :          i = rs_descs(decode_level(tasks(j)%destination, SIZE(rs_descs))) &
    1839         3618 :              %rs_desc%virtual2real(decode_rank(tasks(j)%destination, SIZE(rs_descs))) + 1
    1840         3618 :          l = send_disps(i) + send_sizes(i)
    1841         3618 :          CALL serialize_task(tasks(j), send_buf(l + 1:l + task_size_in_int8))
    1842         3666 :          send_sizes(i) = send_sizes(i) + task_size_in_int8
    1843              :       END DO
    1844              : 
    1845              :       ! do communication
    1846           48 :       CALL desc%group%alltoall(send_buf, send_sizes, send_disps, recv_buf, recv_sizes, recv_disps)
    1847              : 
    1848           48 :       DEALLOCATE (send_buf)
    1849              : 
    1850          144 :       ntasks_recv = SUM(recv_sizes)/task_size_in_int8
    1851         3904 :       ALLOCATE (tasks_recv(ntasks_recv))
    1852              : 
    1853              :       ! do unpacking
    1854           48 :       l = 0
    1855          144 :       DO i = 1, desc%group_size
    1856         3762 :          DO j = 0, recv_sizes(i)/task_size_in_int8 - 1
    1857         3618 :             l = l + 1
    1858         3618 :             k = recv_disps(i) + j*task_size_in_int8
    1859         3714 :             CALL deserialize_task(tasks_recv(l), recv_buf(k + 1:k + task_size_in_int8))
    1860              :          END DO
    1861              :       END DO
    1862              : 
    1863           48 :       DEALLOCATE (recv_buf)
    1864           48 :       DEALLOCATE (send_sizes)
    1865           48 :       DEALLOCATE (recv_sizes)
    1866           48 :       DEALLOCATE (send_disps)
    1867           48 :       DEALLOCATE (recv_disps)
    1868              : 
    1869           48 :       CALL timestop(handle)
    1870              : 
    1871           48 :    END SUBROUTINE create_local_tasks
    1872              : 
    1873              : ! **************************************************************************************************
    1874              : !> \brief Assembles tasks to be performed on local grid
    1875              : !> \param rs_descs the grids
    1876              : !> \param ntasks Number of tasks for local processing
    1877              : !> \param natoms ...
    1878              : !> \param nimages ...
    1879              : !> \param tasks the task set generated on this processor
    1880              : !> \param rval ...
    1881              : !> \param atom_pair_send ...
    1882              : !> \param atom_pair_recv ...
    1883              : !> \param symmetric ...
    1884              : !> \param reorder_rs_grid_ranks ...
    1885              : !> \param skip_load_balance_distributed ...
    1886              : !> \par History
    1887              : !>      none
    1888              : !> \author MattW 21/11/2007
    1889              : ! **************************************************************************************************
    1890        17418 :    SUBROUTINE distribute_tasks(rs_descs, ntasks, natoms, &
    1891              :                                tasks, atom_pair_send, atom_pair_recv, &
    1892              :                                symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed)
    1893              : 
    1894              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1895              :          POINTER                                         :: rs_descs
    1896              :       INTEGER                                            :: ntasks, natoms
    1897              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1898              :       TYPE(atom_pair_type), DIMENSION(:), POINTER        :: atom_pair_send, atom_pair_recv
    1899              :       LOGICAL, INTENT(IN)                                :: symmetric, reorder_rs_grid_ranks, &
    1900              :                                                             skip_load_balance_distributed
    1901              : 
    1902              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'distribute_tasks'
    1903              : 
    1904              :       INTEGER                                            :: handle, igrid_level, irank, ntasks_recv
    1905              :       INTEGER(KIND=int_8)                                :: load_gap, max_load, replicated_load
    1906        17418 :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: total_loads, total_loads_tmp, trial_loads
    1907        17418 :       INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: loads
    1908        17418 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: indices, real2virtual, total_index
    1909              :       LOGICAL                                            :: distributed_grids, fixed_first_grid
    1910              :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    1911        17418 :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks_recv
    1912              : 
    1913        17418 :       CALL timeset(routineN, handle)
    1914              : 
    1915        17418 :       CPASSERT(ASSOCIATED(tasks))
    1916              : 
    1917              :       ! *** figure out if we have distributed grids
    1918        17418 :       distributed_grids = .FALSE.
    1919        86538 :       DO igrid_level = 1, SIZE(rs_descs)
    1920        86538 :          IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
    1921           54 :             distributed_grids = .TRUE.
    1922              :          END IF
    1923              :       END DO
    1924        17418 :       desc => rs_descs(1)%rs_desc
    1925              : 
    1926        17418 :       IF (distributed_grids) THEN
    1927              : 
    1928          192 :          ALLOCATE (loads(0:desc%group_size - 1, SIZE(rs_descs)))
    1929          144 :          ALLOCATE (total_loads(0:desc%group_size - 1))
    1930              : 
    1931          144 :          total_loads = 0
    1932              : 
    1933              :          ! First round of balancing on the distributed grids
    1934              :          ! we just balance each of the distributed grids independently
    1935          234 :          DO igrid_level = 1, SIZE(rs_descs)
    1936          234 :             IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
    1937              : 
    1938           54 :                IF (.NOT. skip_load_balance_distributed) &
    1939           54 :                   CALL load_balance_distributed(tasks, ntasks, rs_descs, igrid_level, natoms)
    1940              : 
    1941              :                CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
    1942           54 :                                       tasks, use_reordered_ranks=.FALSE.)
    1943              : 
    1944          162 :                total_loads(:) = total_loads + loads(:, igrid_level)
    1945              : 
    1946              :             END IF
    1947              :          END DO
    1948              : 
    1949              :          ! calculate the total load of replicated tasks, so we can decide if it is worth
    1950              :          ! reordering the distributed grid levels
    1951              : 
    1952           48 :          replicated_load = 0
    1953          234 :          DO igrid_level = 1, SIZE(rs_descs)
    1954          234 :             IF (.NOT. rs_descs(igrid_level)%rs_desc%distributed) THEN
    1955              :                CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
    1956          132 :                                       tasks, use_reordered_ranks=.FALSE.)
    1957          396 :                replicated_load = replicated_load + SUM(loads(:, igrid_level))
    1958              :             END IF
    1959              :          END DO
    1960              : 
    1961              :          !IF (desc%my_pos==0) THEN
    1962              :          ! WRITE(*,*) "Total replicated load is ",replicated_load
    1963              :          !END IF
    1964              : 
    1965              :          ! Now we adjust the rank ordering based on the current loads
    1966              :          ! we leave the first distributed level and all the replicated levels in the default order
    1967           48 :          IF (reorder_rs_grid_ranks) THEN
    1968           48 :             fixed_first_grid = .FALSE.
    1969          234 :             DO igrid_level = 1, SIZE(rs_descs)
    1970          234 :                IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
    1971           54 :                   IF (fixed_first_grid .EQV. .FALSE.) THEN
    1972          144 :                      total_loads(:) = loads(:, igrid_level)
    1973              :                      fixed_first_grid = .TRUE.
    1974              :                   ELSE
    1975           18 :                      ALLOCATE (trial_loads(0:desc%group_size - 1))
    1976              : 
    1977           18 :                      trial_loads(:) = total_loads + loads(:, igrid_level)
    1978           18 :                      max_load = MAXVAL(trial_loads)
    1979              :                      load_gap = 0
    1980           18 :                      DO irank = 0, desc%group_size - 1
    1981           18 :                         load_gap = load_gap + max_load - trial_loads(irank)
    1982              :                      END DO
    1983              : 
    1984              :                      ! If there is not enough replicated load to load balance well enough
    1985              :                      ! then we will reorder this grid level
    1986            6 :                      IF (load_gap > replicated_load*1.05_dp) THEN
    1987              : 
    1988           18 :                         ALLOCATE (indices(0:desc%group_size - 1))
    1989           12 :                         ALLOCATE (total_index(0:desc%group_size - 1))
    1990           12 :                         ALLOCATE (total_loads_tmp(0:desc%group_size - 1))
    1991           12 :                         ALLOCATE (real2virtual(0:desc%group_size - 1))
    1992              : 
    1993           18 :                         total_loads_tmp(:) = total_loads
    1994            6 :                         CALL sort(total_loads_tmp, desc%group_size, total_index)
    1995            6 :                         CALL sort(loads(:, igrid_level), desc%group_size, indices)
    1996              : 
    1997              :                         ! Reorder so that the rank with smallest load on this grid level is paired with
    1998              :                         ! the highest load in total
    1999           18 :                         DO irank = 0, desc%group_size - 1
    2000              :                            total_loads(total_index(irank) - 1) = total_loads(total_index(irank) - 1) + &
    2001           12 :                                                                  loads(desc%group_size - irank - 1, igrid_level)
    2002           18 :                            real2virtual(total_index(irank) - 1) = indices(desc%group_size - irank - 1) - 1
    2003              :                         END DO
    2004              : 
    2005            6 :                         CALL rs_grid_reorder_ranks(rs_descs(igrid_level)%rs_desc, real2virtual)
    2006              : 
    2007            6 :                         DEALLOCATE (indices)
    2008            6 :                         DEALLOCATE (total_index)
    2009            6 :                         DEALLOCATE (total_loads_tmp)
    2010            6 :                         DEALLOCATE (real2virtual)
    2011              :                      ELSE
    2012            0 :                         total_loads(:) = trial_loads
    2013              :                      END IF
    2014              : 
    2015            6 :                      DEALLOCATE (trial_loads)
    2016              : 
    2017              :                   END IF
    2018              :                END IF
    2019              :             END DO
    2020              :          END IF
    2021              : 
    2022              :          ! Now we use the replicated tasks to balance out the rest of the load
    2023           48 :          CALL load_balance_replicated(rs_descs, ntasks, tasks)
    2024              : 
    2025              :          !total_loads = 0
    2026              :          !DO igrid_level=1,SIZE(rs_descs)
    2027              :          !  CALL get_current_loads(loads(:,igrid_level), rs_descs, igrid_level, ntasks, &
    2028              :          !                         tasks, use_reordered_ranks=.TRUE.)
    2029              :          !  total_loads = total_loads + loads(:, igrid_level)
    2030              :          !END DO
    2031              : 
    2032              :          !IF (desc%my_pos==0) THEN
    2033              :          !  WRITE(*,*) ""
    2034              :          !  WRITE(*,*) "At the end of the load balancing procedure"
    2035              :          !  WRITE(*,*) "Maximum  load:",MAXVAL(total_loads)
    2036              :          !  WRITE(*,*) "Average  load:",SUM(total_loads)/SIZE(total_loads)
    2037              :          !  WRITE(*,*) "Minimum  load:",MINVAL(total_loads)
    2038              :          !ENDIF
    2039              : 
    2040              :          ! given a list of tasks, this will do the needed reshuffle so that all tasks will be local
    2041           48 :          CALL create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
    2042              : 
    2043              :          !
    2044              :          ! tasks list are complete, we can compute the list of atomic blocks (atom pairs)
    2045              :          ! we will be sending. These lists are needed for redistribute_matrix.
    2046              :          !
    2047           48 :          CALL get_atom_pair(atom_pair_send, tasks, ntasks=ntasks, send=.TRUE., symmetric=symmetric, rs_descs=rs_descs)
    2048              : 
    2049              :          ! natom_send=SIZE(atom_pair_send)
    2050              :          ! CALL desc%group%sum(natom_send)
    2051              :          ! IF (desc%my_pos==0) THEN
    2052              :          !     WRITE(*,*) ""
    2053              :          !     WRITE(*,*) "Total number of atomic blocks to be send:",natom_send
    2054              :          ! ENDIF
    2055              : 
    2056           48 :          CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.FALSE., symmetric=symmetric, rs_descs=rs_descs)
    2057              : 
    2058              :          ! cleanup, at this point we  don't need the original tasks anymore
    2059           48 :          DEALLOCATE (tasks)
    2060           48 :          DEALLOCATE (loads)
    2061           48 :          DEALLOCATE (total_loads)
    2062              : 
    2063              :       ELSE
    2064        17370 :          tasks_recv => tasks
    2065        17370 :          ntasks_recv = ntasks
    2066        17370 :          CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.FALSE., symmetric=symmetric, rs_descs=rs_descs)
    2067              :          ! not distributed, hence atom_pair_send not needed
    2068              :       END IF
    2069              : 
    2070              :       ! here we sort the task list we will process locally.
    2071        51962 :       ALLOCATE (indices(ntasks_recv))
    2072        17418 :       CALL tasks_sort(tasks_recv, ntasks_recv, indices)
    2073        17418 :       DEALLOCATE (indices)
    2074              : 
    2075              :       !
    2076              :       ! final lists are ready
    2077              :       !
    2078              : 
    2079        17418 :       tasks => tasks_recv
    2080        17418 :       ntasks = ntasks_recv
    2081              : 
    2082        17418 :       CALL timestop(handle)
    2083              : 
    2084        34836 :    END SUBROUTINE distribute_tasks
    2085              : 
    2086              : ! **************************************************************************************************
    2087              : !> \brief ...
    2088              : !> \param atom_pair ...
    2089              : !> \param my_tasks ...
    2090              : !> \param send ...
    2091              : !> \param symmetric ...
    2092              : !> \param natoms ...
    2093              : !> \param nimages ...
    2094              : !> \param rs_descs ...
    2095              : ! **************************************************************************************************
    2096        17466 :    SUBROUTINE get_atom_pair(atom_pair, tasks, ntasks, send, symmetric, rs_descs)
    2097              : 
    2098              :       TYPE(atom_pair_type), DIMENSION(:), POINTER        :: atom_pair
    2099              :       TYPE(task_type), DIMENSION(:), INTENT(INOUT)       :: tasks
    2100              :       INTEGER, INTENT(IN)                                :: ntasks
    2101              :       LOGICAL, INTENT(IN)                                :: send, symmetric
    2102              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), INTENT(IN) :: rs_descs
    2103              : 
    2104              :       INTEGER                                            :: i, ilevel, iatom, jatom, npairs, virt_rank
    2105        17466 :       INTEGER, DIMENSION(:), ALLOCATABLE                 :: indices
    2106        17466 :       TYPE(atom_pair_type), DIMENSION(:), ALLOCATABLE    :: atom_pair_tmp
    2107              : 
    2108        17466 :       CPASSERT(.NOT. ASSOCIATED(atom_pair))
    2109        17466 :       IF (ntasks == 0) THEN
    2110          299 :          ALLOCATE (atom_pair(0))
    2111              :          RETURN
    2112              :       END IF
    2113              : 
    2114              :       ! calculate list of atom pairs
    2115              :       ! fill pair list taking into account symmetry
    2116      8650065 :       ALLOCATE (atom_pair_tmp(ntasks))
    2117      8615731 :       DO i = 1, ntasks
    2118      8598564 :          atom_pair_tmp(i)%image = tasks(i)%image
    2119      8598564 :          iatom = tasks(i)%iatom
    2120      8598564 :          jatom = tasks(i)%jatom
    2121      8598564 :          IF (symmetric .AND. iatom > jatom) THEN
    2122              :             ! iatom / jatom swapped
    2123      3272316 :             atom_pair_tmp(i)%row = jatom
    2124      3272316 :             atom_pair_tmp(i)%col = iatom
    2125              :          ELSE
    2126      5326248 :             atom_pair_tmp(i)%row = iatom
    2127      5326248 :             atom_pair_tmp(i)%col = jatom
    2128              :          END IF
    2129              : 
    2130      8615731 :          IF (send) THEN
    2131              :             ! If sending, we need to use the 'real rank' as the pair has to be sent to the process which
    2132              :             ! actually has the correct part of the rs_grid to do the mapping
    2133         3618 :             ilevel = tasks(i)%grid_level
    2134         3618 :             virt_rank = decode_rank(tasks(i)%destination, SIZE(rs_descs))
    2135         3618 :             atom_pair_tmp(i)%rank = rs_descs(ilevel)%rs_desc%virtual2real(virt_rank)
    2136              :          ELSE
    2137              :             ! If we are receiving, then no conversion is needed as the rank is that of the process with the
    2138              :             ! required matrix block, and the ordering of the rs grid is irrelevant
    2139      8594946 :             atom_pair_tmp(i)%rank = decode_rank(tasks(i)%source, SIZE(rs_descs))
    2140              :          END IF
    2141              :       END DO
    2142              : 
    2143              :       ! find unique atom pairs that I'm sending/receiving
    2144        51501 :       ALLOCATE (indices(ntasks))
    2145        17167 :       CALL atom_pair_sort(atom_pair_tmp, ntasks, indices)
    2146        17167 :       npairs = 1
    2147        17167 :       tasks(indices(1))%pair_index = 1
    2148      8598564 :       DO i = 2, ntasks
    2149      8581397 :          IF (atom_pair_less_than(atom_pair_tmp(i - 1), atom_pair_tmp(i))) THEN
    2150       365224 :             npairs = npairs + 1
    2151       365224 :             atom_pair_tmp(npairs) = atom_pair_tmp(i)
    2152              :          END IF
    2153      8598564 :          tasks(indices(i))%pair_index = npairs
    2154              :       END DO
    2155        17167 :       DEALLOCATE (indices)
    2156              : 
    2157              :       ! Copy unique pairs to final location.
    2158       433892 :       ALLOCATE (atom_pair(npairs))
    2159       399558 :       atom_pair(:) = atom_pair_tmp(:npairs)
    2160        17167 :       DEALLOCATE (atom_pair_tmp)
    2161              : 
    2162              :    END SUBROUTINE get_atom_pair
    2163              : 
    2164              : ! **************************************************************************************************
    2165              : !> \brief redistributes the matrix so that it can be used in realspace operations
    2166              : !>        i.e. according to the task lists for collocate and integrate.
    2167              : !>        This routine can become a bottleneck in large calculations.
    2168              : !> \param rs_descs ...
    2169              : !> \param pmats ...
    2170              : !> \param atom_pair_send ...
    2171              : !> \param atom_pair_recv ...
    2172              : !> \param natoms ...
    2173              : !> \param nimages ...
    2174              : !> \param scatter ...
    2175              : !> \param hmats ...
    2176              : ! **************************************************************************************************
    2177            0 :    SUBROUTINE rs_distribute_matrix(rs_descs, pmats, atom_pair_send, atom_pair_recv, &
    2178              :                                    nimages, scatter, hmats)
    2179              : 
    2180              :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    2181              :          POINTER                                         :: rs_descs
    2182              :       TYPE(dbcsr_p_type), DIMENSION(:), POINTER          :: pmats
    2183              :       TYPE(atom_pair_type), DIMENSION(:), POINTER        :: atom_pair_send, atom_pair_recv
    2184              :       INTEGER                                            :: nimages
    2185              :       LOGICAL                                            :: scatter
    2186              :       TYPE(dbcsr_p_type), DIMENSION(:), OPTIONAL, &
    2187              :          POINTER                                         :: hmats
    2188              : 
    2189              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_distribute_matrix'
    2190              : 
    2191              :       INTEGER                                            :: acol, arow, handle, i, img, j, k, l, me, &
    2192              :                                                             nblkcols_total, nblkrows_total, ncol, &
    2193              :                                                             nrow, nthread, nthread_left
    2194            0 :       INTEGER, ALLOCATABLE, DIMENSION(:) :: first_col, first_row, last_col, last_row, recv_disps, &
    2195            0 :                                             recv_pair_count, recv_pair_disps, recv_sizes, send_disps, send_pair_count, &
    2196            0 :                                             send_pair_disps, send_sizes
    2197            0 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, row_blk_size
    2198              :       LOGICAL                                            :: found
    2199            0 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET   :: recv_buf_r, send_buf_r
    2200            0 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: h_block, p_block
    2201              :       TYPE(dbcsr_type), POINTER                          :: hmat, pmat
    2202              :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    2203            0 :       REAL(kind=dp), DIMENSION(:), POINTER                            :: vector
    2204              : 
    2205            0 : !$    INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
    2206              : 
    2207            0 :       CALL timeset(routineN, handle)
    2208              : 
    2209            0 :       IF (.NOT. scatter) THEN
    2210            0 :          CPASSERT(PRESENT(hmats))
    2211              :       END IF
    2212              : 
    2213            0 :       desc => rs_descs(1)%rs_desc
    2214            0 :       me = desc%my_pos + 1
    2215              : 
    2216              :       ! allocate local arrays
    2217            0 :       ALLOCATE (send_sizes(desc%group_size))
    2218            0 :       ALLOCATE (recv_sizes(desc%group_size))
    2219            0 :       ALLOCATE (send_disps(desc%group_size))
    2220            0 :       ALLOCATE (recv_disps(desc%group_size))
    2221            0 :       ALLOCATE (send_pair_count(desc%group_size))
    2222            0 :       ALLOCATE (recv_pair_count(desc%group_size))
    2223            0 :       ALLOCATE (send_pair_disps(desc%group_size))
    2224            0 :       ALLOCATE (recv_pair_disps(desc%group_size))
    2225              : 
    2226            0 :       pmat => pmats(1)%matrix
    2227              :       CALL dbcsr_get_info(pmat, &
    2228              :                           row_blk_size=row_blk_size, &
    2229              :                           col_blk_size=col_blk_size, &
    2230              :                           nblkrows_total=nblkrows_total, &
    2231            0 :                           nblkcols_total=nblkcols_total)
    2232            0 :       ALLOCATE (first_row(nblkrows_total), last_row(nblkrows_total), &
    2233            0 :                 first_col(nblkcols_total), last_col(nblkcols_total))
    2234            0 :       CALL dbcsr_convert_sizes_to_offsets(row_blk_size, first_row, last_row)
    2235            0 :       CALL dbcsr_convert_sizes_to_offsets(col_blk_size, first_col, last_col)
    2236              : 
    2237              :       ! set up send buffer sizes
    2238            0 :       send_sizes = 0
    2239            0 :       send_pair_count = 0
    2240            0 :       DO i = 1, SIZE(atom_pair_send)
    2241            0 :          k = atom_pair_send(i)%rank + 1 ! proc we're sending this block to
    2242            0 :          arow = atom_pair_send(i)%row
    2243            0 :          acol = atom_pair_send(i)%col
    2244            0 :          nrow = last_row(arow) - first_row(arow) + 1
    2245            0 :          ncol = last_col(acol) - first_col(acol) + 1
    2246            0 :          send_sizes(k) = send_sizes(k) + nrow*ncol
    2247            0 :          send_pair_count(k) = send_pair_count(k) + 1
    2248              :       END DO
    2249              : 
    2250            0 :       send_disps = 0
    2251            0 :       send_pair_disps = 0
    2252            0 :       DO i = 2, desc%group_size
    2253            0 :          send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
    2254            0 :          send_pair_disps(i) = send_pair_disps(i - 1) + send_pair_count(i - 1)
    2255              :       END DO
    2256              : 
    2257            0 :       ALLOCATE (send_buf_r(SUM(send_sizes)))
    2258              : 
    2259              :       ! set up recv buffer
    2260              : 
    2261            0 :       recv_sizes = 0
    2262            0 :       recv_pair_count = 0
    2263            0 :       DO i = 1, SIZE(atom_pair_recv)
    2264            0 :          k = atom_pair_recv(i)%rank + 1 ! proc we're receiving this data from
    2265            0 :          arow = atom_pair_recv(i)%row
    2266            0 :          acol = atom_pair_recv(i)%col
    2267            0 :          nrow = last_row(arow) - first_row(arow) + 1
    2268            0 :          ncol = last_col(acol) - first_col(acol) + 1
    2269            0 :          recv_sizes(k) = recv_sizes(k) + nrow*ncol
    2270            0 :          recv_pair_count(k) = recv_pair_count(k) + 1
    2271              :       END DO
    2272              : 
    2273            0 :       recv_disps = 0
    2274            0 :       recv_pair_disps = 0
    2275            0 :       DO i = 2, desc%group_size
    2276            0 :          recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
    2277            0 :          recv_pair_disps(i) = recv_pair_disps(i - 1) + recv_pair_count(i - 1)
    2278              :       END DO
    2279            0 :       ALLOCATE (recv_buf_r(SUM(recv_sizes)))
    2280              : 
    2281              : !$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
    2282              : !$OMP          SHARED(desc,send_pair_count,send_pair_disps,nimages),&
    2283              : !$OMP          SHARED(last_row,first_row,last_col,first_col),&
    2284              : !$OMP          SHARED(pmats,send_buf_r,send_disps,send_sizes),&
    2285              : !$OMP          SHARED(atom_pair_send,me,hmats,nblkrows_total),&
    2286              : !$OMP          SHARED(atom_pair_recv,recv_buf_r,scatter,recv_pair_disps), &
    2287              : !$OMP          SHARED(recv_sizes,recv_disps,recv_pair_count,locks), &
    2288              : !$OMP          PRIVATE(i,img,arow,acol,nrow,ncol,p_block,found,j,k,l),&
    2289            0 : !$OMP          PRIVATE(nthread,h_block,nthread_left,hmat,pmat,vector)
    2290              : 
    2291              :       nthread = 1
    2292              : !$    nthread = omp_get_num_threads()
    2293              :       nthread_left = 1
    2294              : !$    nthread_left = MAX(1, nthread - 1)
    2295              : 
    2296              :       ! do packing
    2297              : !$OMP DO schedule(guided)
    2298              :       DO l = 1, desc%group_size
    2299              :          IF (l == me) CYCLE
    2300              :          send_sizes(l) = 0
    2301              :          DO i = 1, send_pair_count(l)
    2302              :             arow = atom_pair_send(send_pair_disps(l) + i)%row
    2303              :             acol = atom_pair_send(send_pair_disps(l) + i)%col
    2304              :             img = atom_pair_send(send_pair_disps(l) + i)%image
    2305              :             nrow = last_row(arow) - first_row(arow) + 1
    2306              :             ncol = last_col(acol) - first_col(acol) + 1
    2307              :             pmat => pmats(img)%matrix
    2308              :             CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
    2309              :             CPASSERT(found)
    2310              : 
    2311              :             DO k = 1, ncol
    2312              :                DO j = 1, nrow
    2313              :                   send_buf_r(send_disps(l) + send_sizes(l) + j + (k - 1)*nrow) = p_block(j, k)
    2314              :                END DO
    2315              :             END DO
    2316              :             send_sizes(l) = send_sizes(l) + nrow*ncol
    2317              :          END DO
    2318              :       END DO
    2319              : !$OMP END DO
    2320              : 
    2321              :       IF (.NOT. scatter) THEN
    2322              :          ! We need locks to protect concurrent summation into H
    2323              : !$OMP SINGLE
    2324              : !$       ALLOCATE (locks(nthread*10))
    2325              : !$OMP END SINGLE
    2326              : 
    2327              : !$OMP DO
    2328              : !$       do i = 1, nthread*10
    2329              : !$          call omp_init_lock(locks(i))
    2330              : !$       end do
    2331              : !$OMP END DO
    2332              :       END IF
    2333              : 
    2334              : !$OMP MASTER
    2335              :       ! do communication
    2336              :       CALL desc%group%alltoall(send_buf_r, send_sizes, send_disps, &
    2337              :                                recv_buf_r, recv_sizes, recv_disps)
    2338              : !$OMP END MASTER
    2339              : 
    2340              :       ! If this is a scatter, then no need to copy local blocks,
    2341              :       ! If not, we sum them directly into H (bypassing the alltoall)
    2342              :       IF (.NOT. scatter) THEN
    2343              : 
    2344              :          ! Distribute work over remaining threads assuming one is still in the alltoall
    2345              : !$OMP DO schedule(dynamic,MAX(1,send_pair_count(me)/nthread_left))
    2346              :          DO i = 1, send_pair_count(me)
    2347              :             arow = atom_pair_send(send_pair_disps(me) + i)%row
    2348              :             acol = atom_pair_send(send_pair_disps(me) + i)%col
    2349              :             img = atom_pair_send(send_pair_disps(me) + i)%image
    2350              :             nrow = last_row(arow) - first_row(arow) + 1
    2351              :             ncol = last_col(acol) - first_col(acol) + 1
    2352              :             hmat => hmats(img)%matrix
    2353              :             pmat => pmats(img)%matrix
    2354              :             CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, BLOCK=h_block, found=found)
    2355              :             CPASSERT(found)
    2356              :             CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, BLOCK=p_block, found=found)
    2357              :             CPASSERT(found)
    2358              : 
    2359              : !$          call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
    2360              :             DO k = 1, ncol
    2361              :                DO j = 1, nrow
    2362              :                   h_block(j, k) = h_block(j, k) + p_block(j, k)
    2363              :                END DO
    2364              :             END DO
    2365              : !$          call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
    2366              :          END DO
    2367              : !$OMP END DO
    2368              :       ELSE
    2369              :          ! We will insert new blocks into P, so create mutable work matrices
    2370              :          DO img = 1, nimages
    2371              :             pmat => pmats(img)%matrix
    2372              :             CALL dbcsr_work_create(pmat, work_mutable=.TRUE., &
    2373              :                                    nblks_guess=SIZE(atom_pair_recv)/nthread, sizedata_guess=SIZE(recv_buf_r)/nthread, &
    2374              :                                    n=nthread)
    2375              :          END DO
    2376              :       END IF
    2377              : 
    2378              : ! wait for comm and setup to finish
    2379              : !$OMP BARRIER
    2380              : 
    2381              :       !do unpacking
    2382              : !$OMP DO schedule(guided)
    2383              :       DO l = 1, desc%group_size
    2384              :          IF (l == me) CYCLE
    2385              :          recv_sizes(l) = 0
    2386              :          DO i = 1, recv_pair_count(l)
    2387              :             arow = atom_pair_recv(recv_pair_disps(l) + i)%row
    2388              :             acol = atom_pair_recv(recv_pair_disps(l) + i)%col
    2389              :             img = atom_pair_recv(recv_pair_disps(l) + i)%image
    2390              :             nrow = last_row(arow) - first_row(arow) + 1
    2391              :             ncol = last_col(acol) - first_col(acol) + 1
    2392              :             pmat => pmats(img)%matrix
    2393              :             NULLIFY (p_block)
    2394              :             CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, BLOCK=p_block, found=found)
    2395              : 
    2396              :             IF (PRESENT(hmats)) THEN
    2397              :                hmat => hmats(img)%matrix
    2398              :                CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, BLOCK=h_block, found=found)
    2399              :                CPASSERT(found)
    2400              :             END IF
    2401              : 
    2402              :             IF (scatter .AND. .NOT. ASSOCIATED(p_block)) THEN
    2403              :                vector => recv_buf_r(recv_disps(l) + recv_sizes(l) + 1:recv_disps(l) + recv_sizes(l) + nrow*ncol)
    2404              :                CALL dbcsr_put_block(pmat, arow, acol, block=RESHAPE(vector, [nrow, ncol]))
    2405              :             END IF
    2406              :             IF (.NOT. scatter) THEN
    2407              : !$             call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
    2408              :                DO k = 1, ncol
    2409              :                   DO j = 1, nrow
    2410              :                      h_block(j, k) = h_block(j, k) + recv_buf_r(recv_disps(l) + recv_sizes(l) + j + (k - 1)*nrow)
    2411              :                   END DO
    2412              :                END DO
    2413              : !$             call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
    2414              :             END IF
    2415              :             recv_sizes(l) = recv_sizes(l) + nrow*ncol
    2416              :          END DO
    2417              :       END DO
    2418              : !$OMP END DO
    2419              : 
    2420              : !$    IF (.not. scatter) THEN
    2421              : !$OMP DO
    2422              : !$       do i = 1, nthread*10
    2423              : !$          call omp_destroy_lock(locks(i))
    2424              : !$       end do
    2425              : !$OMP END DO
    2426              : !$    END IF
    2427              : 
    2428              : !$OMP SINGLE
    2429              : !$    IF (.not. scatter) THEN
    2430              : !$       DEALLOCATE (locks)
    2431              : !$    END IF
    2432              : !$OMP END SINGLE NOWAIT
    2433              : 
    2434              :       IF (scatter) THEN
    2435              :          ! Blocks were added to P
    2436              :          DO img = 1, nimages
    2437              :             pmat => pmats(img)%matrix
    2438              :             CALL dbcsr_finalize(pmat)
    2439              :          END DO
    2440              :       END IF
    2441              : !$OMP END PARALLEL
    2442              : 
    2443            0 :       DEALLOCATE (send_buf_r)
    2444            0 :       DEALLOCATE (recv_buf_r)
    2445              : 
    2446            0 :       DEALLOCATE (send_sizes)
    2447            0 :       DEALLOCATE (recv_sizes)
    2448            0 :       DEALLOCATE (send_disps)
    2449            0 :       DEALLOCATE (recv_disps)
    2450            0 :       DEALLOCATE (send_pair_count)
    2451            0 :       DEALLOCATE (recv_pair_count)
    2452            0 :       DEALLOCATE (send_pair_disps)
    2453            0 :       DEALLOCATE (recv_pair_disps)
    2454              : 
    2455            0 :       DEALLOCATE (first_row, last_row, first_col, last_col)
    2456              : 
    2457            0 :       CALL timestop(handle)
    2458              : 
    2459            0 :    END SUBROUTINE rs_distribute_matrix
    2460              : 
    2461              : ! **************************************************************************************************
    2462              : !> \brief Calculates offsets and sizes for rs_scatter_matrix and rs_copy_matrix.
    2463              : !> \author Ole Schuett
    2464              : ! **************************************************************************************************
    2465        15072 :    SUBROUTINE rs_calc_offsets(pairs, nsgf, group_size, &
    2466              :                               pair_offsets, rank_offsets, rank_sizes, buffer_size)
    2467              :       TYPE(atom_pair_type), DIMENSION(:), INTENT(IN)     :: pairs
    2468              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: nsgf
    2469              :       INTEGER, INTENT(IN)                                :: group_size
    2470              :       INTEGER, DIMENSION(:), POINTER                     :: pair_offsets, rank_offsets, rank_sizes
    2471              :       INTEGER, INTENT(INOUT)                             :: buffer_size
    2472              : 
    2473              :       INTEGER                                            :: acol, arow, i, block_size, total_size, k, prev_k
    2474              : 
    2475        15072 :       IF (ASSOCIATED(pair_offsets)) DEALLOCATE (pair_offsets)
    2476        15072 :       IF (ASSOCIATED(rank_offsets)) DEALLOCATE (rank_offsets)
    2477        15072 :       IF (ASSOCIATED(rank_sizes)) DEALLOCATE (rank_sizes)
    2478              : 
    2479              :       ! calculate buffer_size and pair_offsets
    2480        44968 :       ALLOCATE (pair_offsets(SIZE(pairs)))
    2481        15072 :       total_size = 0
    2482       386045 :       DO i = 1, SIZE(pairs)
    2483       370973 :          pair_offsets(i) = total_size
    2484       370973 :          arow = pairs(i)%row
    2485       370973 :          acol = pairs(i)%col
    2486       370973 :          block_size = nsgf(arow)*nsgf(acol)
    2487       386045 :          total_size = total_size + block_size
    2488              :       END DO
    2489        15072 :       buffer_size = total_size
    2490              : 
    2491              :       ! calculate rank_offsets and rank_sizes
    2492        45216 :       ALLOCATE (rank_offsets(group_size))
    2493        30144 :       ALLOCATE (rank_sizes(group_size))
    2494        43992 :       rank_offsets = 0
    2495        43992 :       rank_sizes = 0
    2496        15072 :       IF (SIZE(pairs) > 0) THEN
    2497        14824 :          prev_k = pairs(1)%rank + 1
    2498       385797 :          DO i = 1, SIZE(pairs)
    2499       370973 :             k = pairs(i)%rank + 1
    2500       370973 :             CPASSERT(k >= prev_k) ! expecting the pairs to be ordered by rank
    2501       385797 :             IF (k > prev_k) THEN
    2502           61 :                rank_offsets(k) = pair_offsets(i)
    2503           61 :                rank_sizes(prev_k) = rank_offsets(k) - rank_offsets(prev_k)
    2504           61 :                prev_k = k
    2505              :             END IF
    2506              :          END DO
    2507        14824 :          rank_sizes(k) = buffer_size - rank_offsets(k) ! complete last rank
    2508              :       END IF
    2509              : 
    2510        15072 :    END SUBROUTINE rs_calc_offsets
    2511              : 
    2512              : ! **************************************************************************************************
    2513              : !> \brief Scatters dbcsr matrix blocks and receives them into a buffer as needed before collocation.
    2514              : !> \author Ole Schuett
    2515              : ! **************************************************************************************************
    2516          266 :    SUBROUTINE rs_scatter_matrices(src_matrices, dest_buffer, task_list, group)
    2517              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN)       :: src_matrices
    2518              :       TYPE(offload_buffer_type), INTENT(INOUT)           :: dest_buffer
    2519              :       TYPE(task_list_type), INTENT(IN)                   :: task_list
    2520              :       TYPE(mp_comm_type), INTENT(IN)                     :: group
    2521              : 
    2522              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_scatter_matrices'
    2523              : 
    2524              :       INTEGER                                            :: handle
    2525              :       REAL(KIND=dp), DIMENSION(:), ALLOCATABLE           :: buffer_send
    2526              : 
    2527          266 :       CALL timeset(routineN, handle)
    2528          777 :       ALLOCATE (buffer_send(task_list%buffer_size_send))
    2529              : 
    2530              :       ! pack dbcsr blocks into send buffer
    2531          266 :       CPASSERT(ASSOCIATED(task_list%atom_pair_send))
    2532              :       CALL rs_pack_buffer(src_matrices=src_matrices, &
    2533              :                           dest_buffer=buffer_send, &
    2534              :                           atom_pair=task_list%atom_pair_send, &
    2535          266 :                           pair_offsets=task_list%pair_offsets_send)
    2536              : 
    2537              :       ! mpi all-to-all communication, receiving directly into blocks_recv%buffer.
    2538              :       CALL group%alltoall(buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send, &
    2539              :                           dest_buffer%host_buffer, &
    2540         2394 :                           task_list%rank_sizes_recv, task_list%rank_offsets_recv)
    2541              : 
    2542          266 :       DEALLOCATE (buffer_send)
    2543          266 :       CALL timestop(handle)
    2544              : 
    2545          266 :    END SUBROUTINE rs_scatter_matrices
    2546              : 
    2547              : ! **************************************************************************************************
    2548              : !> \brief Gather the dbcsr matrix blocks and receives them into a buffer as needed after integration.
    2549              : !> \author Ole Schuett
    2550              : ! **************************************************************************************************
    2551          218 :    SUBROUTINE rs_gather_matrices(src_buffer, dest_matrices, task_list, group)
    2552              :       TYPE(offload_buffer_type), INTENT(IN)              :: src_buffer
    2553              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT)    :: dest_matrices
    2554              :       TYPE(task_list_type), INTENT(IN)                   :: task_list
    2555              :       TYPE(mp_comm_type), INTENT(IN)                     :: group
    2556              : 
    2557              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_gather_matrices'
    2558              : 
    2559              :       INTEGER                                            :: handle
    2560              :       REAL(KIND=dp), DIMENSION(:), ALLOCATABLE           :: buffer_send
    2561              : 
    2562          218 :       CALL timeset(routineN, handle)
    2563              : 
    2564              :       ! Caution: The meaning of send and recv are reversed in this routine.
    2565          640 :       ALLOCATE (buffer_send(task_list%buffer_size_send)) ! e.g. this is actually used for receiving
    2566              : 
    2567              :       ! mpi all-to-all communication
    2568              :       CALL group%alltoall(src_buffer%host_buffer, task_list%rank_sizes_recv, task_list%rank_offsets_recv, &
    2569         1962 :                           buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send)
    2570              : 
    2571              :       ! unpack dbcsr blocks from send buffer
    2572          218 :       CPASSERT(ASSOCIATED(task_list%atom_pair_send))
    2573              :       CALL rs_unpack_buffer(src_buffer=buffer_send, &
    2574              :                             dest_matrices=dest_matrices, &
    2575              :                             atom_pair=task_list%atom_pair_send, &
    2576          218 :                             pair_offsets=task_list%pair_offsets_send)
    2577              : 
    2578          218 :       DEALLOCATE (buffer_send)
    2579          218 :       CALL timestop(handle)
    2580              : 
    2581          218 :    END SUBROUTINE rs_gather_matrices
    2582              : 
    2583              : ! **************************************************************************************************
    2584              : !> \brief Copies the DBCSR blocks into buffer, replaces rs_scatter_matrix for non-distributed grids.
    2585              : !> \author Ole Schuett
    2586              : ! **************************************************************************************************
    2587       245735 :    SUBROUTINE rs_copy_to_buffer(src_matrices, dest_buffer, task_list)
    2588              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN)       :: src_matrices
    2589              :       TYPE(offload_buffer_type), INTENT(INOUT)           :: dest_buffer
    2590              :       TYPE(task_list_type), INTENT(IN)                   :: task_list
    2591              : 
    2592              :       CALL rs_pack_buffer(src_matrices=src_matrices, &
    2593              :                           dest_buffer=dest_buffer%host_buffer, &
    2594              :                           atom_pair=task_list%atom_pair_recv, &
    2595       245735 :                           pair_offsets=task_list%pair_offsets_recv)
    2596              : 
    2597       245735 :    END SUBROUTINE rs_copy_to_buffer
    2598              : 
    2599              : ! **************************************************************************************************
    2600              : !> \brief Copies from buffer into DBCSR matrics, replaces rs_gather_matrix for non-distributed grids.
    2601              : !> \author Ole Schuett
    2602              : ! **************************************************************************************************
    2603       201630 :    SUBROUTINE rs_copy_to_matrices(src_buffer, dest_matrices, task_list)
    2604              :       TYPE(offload_buffer_type), INTENT(IN)              :: src_buffer
    2605              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT)    :: dest_matrices
    2606              :       TYPE(task_list_type), INTENT(IN)                   :: task_list
    2607              : 
    2608              :       CALL rs_unpack_buffer(src_buffer=src_buffer%host_buffer, &
    2609              :                             dest_matrices=dest_matrices, &
    2610              :                             atom_pair=task_list%atom_pair_recv, &
    2611       201630 :                             pair_offsets=task_list%pair_offsets_recv)
    2612              : 
    2613       201630 :    END SUBROUTINE rs_copy_to_matrices
    2614              : 
    2615              : ! **************************************************************************************************
    2616              : !> \brief Helper routine for rs_scatter_matrix and rs_copy_to_buffer.
    2617              : !> \author Ole Schuett
    2618              : ! **************************************************************************************************
    2619       246001 :    SUBROUTINE rs_pack_buffer(src_matrices, dest_buffer, atom_pair, pair_offsets)
    2620              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN)       :: src_matrices
    2621              :       REAL(KIND=dp), DIMENSION(:), INTENT(INOUT)         :: dest_buffer
    2622              :       TYPE(atom_pair_type), DIMENSION(:), INTENT(IN)     :: atom_pair
    2623              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: pair_offsets
    2624              : 
    2625              :       INTEGER                                            :: acol, arow, img, i, offset, block_size
    2626              :       LOGICAL                                            :: found
    2627       246001 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: block
    2628              : 
    2629              : !$OMP PARALLEL DEFAULT(NONE), &
    2630              : !$OMP          SHARED(src_matrices,atom_pair,pair_offsets,dest_buffer), &
    2631       246001 : !$OMP          PRIVATE(acol,arow,img,i,offset,block_size,found,block)
    2632              : !$OMP DO schedule(guided)
    2633              :       DO i = 1, SIZE(atom_pair)
    2634              :          arow = atom_pair(i)%row
    2635              :          acol = atom_pair(i)%col
    2636              :          img = atom_pair(i)%image
    2637              :          CALL dbcsr_get_block_p(matrix=src_matrices(img)%matrix, row=arow, col=acol, &
    2638              :                                 block=block, found=found)
    2639              :          CPASSERT(found)
    2640              :          block_size = SIZE(block)
    2641              :          offset = pair_offsets(i)
    2642              :          dest_buffer(offset + 1:offset + block_size) = RESHAPE(block, shape=[block_size])
    2643              :       END DO
    2644              : !$OMP END DO
    2645              : !$OMP END PARALLEL
    2646              : 
    2647       246001 :    END SUBROUTINE rs_pack_buffer
    2648              : 
    2649              : ! **************************************************************************************************
    2650              : !> \brief Helper routine for rs_gather_matrix and rs_copy_to_matrices.
    2651              : !> \author Ole Schuett
    2652              : ! **************************************************************************************************
    2653       201848 :    SUBROUTINE rs_unpack_buffer(src_buffer, dest_matrices, atom_pair, pair_offsets)
    2654              :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: src_buffer
    2655              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT)    :: dest_matrices
    2656              :       TYPE(atom_pair_type), DIMENSION(:), INTENT(IN)     :: atom_pair
    2657              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: pair_offsets
    2658              : 
    2659              :       INTEGER                                            :: acol, arow, img, i, offset, &
    2660              :                                                             nrows, ncols, lock_num
    2661              :       LOGICAL                                            :: found
    2662       201848 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: block
    2663       201848 :       INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
    2664              : 
    2665              :       ! initialize locks
    2666       605544 :       ALLOCATE (locks(10*omp_get_max_threads()))
    2667      2220328 :       DO i = 1, SIZE(locks)
    2668      2220328 :          CALL omp_init_lock(locks(i))
    2669              :       END DO
    2670              : 
    2671              : !$OMP PARALLEL DEFAULT(NONE), &
    2672              : !$OMP          SHARED(src_buffer,atom_pair,pair_offsets,dest_matrices,locks), &
    2673       201848 : !$OMP          PRIVATE(acol,arow,img,i,offset,nrows,ncols,lock_num,found,block)
    2674              : !$OMP DO schedule(guided)
    2675              :       DO i = 1, SIZE(atom_pair)
    2676              :          arow = atom_pair(i)%row
    2677              :          acol = atom_pair(i)%col
    2678              :          img = atom_pair(i)%image
    2679              :          CALL dbcsr_get_block_p(matrix=dest_matrices(img)%matrix, row=arow, col=acol, &
    2680              :                                 block=block, found=found)
    2681              :          CPASSERT(found)
    2682              :          nrows = SIZE(block, 1)
    2683              :          ncols = SIZE(block, 2)
    2684              :          offset = pair_offsets(i)
    2685              :          lock_num = MODULO(arow, SIZE(locks)) + 1  ! map matrix rows round-robin to available locks
    2686              : 
    2687              :          CALL omp_set_lock(locks(lock_num))
    2688              :          block = block + RESHAPE(src_buffer(offset + 1:offset + nrows*ncols), shape=[nrows, ncols])
    2689              :          CALL omp_unset_lock(locks(lock_num))
    2690              :       END DO
    2691              : !$OMP END DO
    2692              : !$OMP END PARALLEL
    2693              : 
    2694              :       ! destroy locks
    2695      2220328 :       DO i = 1, SIZE(locks)
    2696      2220328 :          CALL omp_destroy_lock(locks(i))
    2697              :       END DO
    2698       201848 :       DEALLOCATE (locks)
    2699              : 
    2700       201848 :    END SUBROUTINE rs_unpack_buffer
    2701              : 
    2702              : ! **************************************************************************************************
    2703              : !> \brief determines the rank of the processor who's real rs grid contains point
    2704              : !> \param rs_desc ...
    2705              : !> \param igrid_level ...
    2706              : !> \param n_levels ...
    2707              : !> \param cube_center ...
    2708              : !> \param ntasks ...
    2709              : !> \param tasks ...
    2710              : !> \param lb_cube ...
    2711              : !> \param ub_cube ...
    2712              : !> \param added_tasks ...
    2713              : !> \par History
    2714              : !>      11.2007 created [MattW]
    2715              : !>      10.2008 rewritten [Joost VandeVondele]
    2716              : !> \author MattW
    2717              : ! **************************************************************************************************
    2718         1380 :    SUBROUTINE rs_find_node(rs_desc, igrid_level, n_levels, cube_center, ntasks, tasks, &
    2719              :                            lb_cube, ub_cube, added_tasks)
    2720              : 
    2721              :       TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
    2722              :       INTEGER, INTENT(IN)                                :: igrid_level, n_levels
    2723              :       INTEGER, DIMENSION(3), INTENT(IN)                  :: cube_center
    2724              :       INTEGER, INTENT(INOUT)                             :: ntasks
    2725              :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    2726              :       INTEGER, DIMENSION(3), INTENT(IN)                  :: lb_cube, ub_cube
    2727              :       INTEGER, INTENT(OUT)                               :: added_tasks
    2728              : 
    2729              :       INTEGER, PARAMETER                                 :: add_tasks = 1000
    2730              :       REAL(kind=dp), PARAMETER                           :: mult_tasks = 2.0_dp
    2731              : 
    2732              :       INTEGER :: bit_index, coord(3), curr_tasks, dest, i, icoord(3), idest, itask, ix, iy, iz, &
    2733              :                  lb_coord(3), lb_domain(3), lbc(3), ub_coord(3), ub_domain(3), ubc(3)
    2734              :       INTEGER                                            :: bit_pattern
    2735              :       LOGICAL                                            :: dir_periodic(3)
    2736              : 
    2737         1380 :       coord(1) = rs_desc%x2coord(cube_center(1))
    2738         1380 :       coord(2) = rs_desc%y2coord(cube_center(2))
    2739         1380 :       coord(3) = rs_desc%z2coord(cube_center(3))
    2740         1380 :       dest = rs_desc%coord2rank(coord(1), coord(2), coord(3))
    2741              : 
    2742              :       ! the real cube coordinates
    2743         5520 :       lbc = lb_cube + cube_center
    2744         5520 :       ubc = ub_cube + cube_center
    2745              : 
    2746         9660 :       IF (ALL((rs_desc%lb_global(:, dest) - rs_desc%border) <= lbc) .AND. &
    2747              :           ALL((rs_desc%ub_global(:, dest) + rs_desc%border) >= ubc)) THEN
    2748              :          !standard distributed collocation/integration
    2749         1380 :          tasks(ntasks)%destination = encode_rank(dest, igrid_level, n_levels)
    2750         1380 :          tasks(ntasks)%dist_type = 1
    2751         1380 :          tasks(ntasks)%subpatch_pattern = 0
    2752         1380 :          added_tasks = 1
    2753              : 
    2754              :          ! here we figure out if there is an alternate location for this task
    2755              :          ! i.e. even though the cube_center is not in the real local domain,
    2756              :          ! it might fully fit in the halo of the neighbor
    2757              :          ! if its radius is smaller than the maximum radius
    2758              :          ! the list of possible neighbors is stored here as a bit pattern
    2759              :          ! to reconstruct what the rank of the neigbor is,
    2760              :          ! we can use (note this requires the correct rs_grid)
    2761              :          !  IF (BTEST(bit_pattern,0)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[-1,0,0])
    2762              :          !  IF (BTEST(bit_pattern,1)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[+1,0,0])
    2763              :          !  IF (BTEST(bit_pattern,2)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,-1,0])
    2764              :          !  IF (BTEST(bit_pattern,3)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,+1,0])
    2765              :          !  IF (BTEST(bit_pattern,4)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,0,-1])
    2766              :          !  IF (BTEST(bit_pattern,5)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,0,+1])
    2767         1380 :          bit_index = 0
    2768         1380 :          bit_pattern = 0
    2769         5520 :          DO i = 1, 3
    2770         5520 :             IF (rs_desc%perd(i) == 1) THEN
    2771         2760 :                bit_pattern = IBCLR(bit_pattern, bit_index)
    2772         2760 :                bit_index = bit_index + 1
    2773         2760 :                bit_pattern = IBCLR(bit_pattern, bit_index)
    2774         2760 :                bit_index = bit_index + 1
    2775              :             ELSE
    2776              :                ! fits the left neighbor ?
    2777         1380 :                IF (ubc(i) <= rs_desc%lb_global(i, dest) - 1 + rs_desc%border) THEN
    2778          599 :                   bit_pattern = IBSET(bit_pattern, bit_index)
    2779          599 :                   bit_index = bit_index + 1
    2780              :                ELSE
    2781          781 :                   bit_pattern = IBCLR(bit_pattern, bit_index)
    2782          781 :                   bit_index = bit_index + 1
    2783              :                END IF
    2784              :                ! fits the right neighbor ?
    2785         1380 :                IF (lbc(i) >= rs_desc%ub_global(i, dest) + 1 - rs_desc%border) THEN
    2786          250 :                   bit_pattern = IBSET(bit_pattern, bit_index)
    2787          250 :                   bit_index = bit_index + 1
    2788              :                ELSE
    2789         1130 :                   bit_pattern = IBCLR(bit_pattern, bit_index)
    2790         1130 :                   bit_index = bit_index + 1
    2791              :                END IF
    2792              :             END IF
    2793              :          END DO
    2794         1380 :          tasks(ntasks)%subpatch_pattern = bit_pattern
    2795              : 
    2796              :       ELSE
    2797              :          ! generalised collocation/integration needed
    2798              :          ! first we figure out how many neighbors we have to add to include the lbc/ubc
    2799              :          ! in the available domains (inclusive of halo points)
    2800              :          ! first we 'ignore' periodic boundary conditions
    2801              :          ! i.e. ub_coord-lb_coord+1 might be larger than group_dim
    2802            0 :          lb_coord = coord
    2803            0 :          ub_coord = coord
    2804            0 :          lb_domain = rs_desc%lb_global(:, dest) - rs_desc%border
    2805            0 :          ub_domain = rs_desc%ub_global(:, dest) + rs_desc%border
    2806            0 :          DO i = 1, 3
    2807              :             ! only if the grid is not periodic in this direction we need to take care of adding neighbors
    2808            0 :             IF (rs_desc%perd(i) == 0) THEN
    2809              :                ! if the domain lower bound is greater than the lbc we need to add the size of the neighbor domain
    2810            0 :                DO
    2811            0 :                   IF (lb_domain(i) > lbc(i)) THEN
    2812            0 :                      lb_coord(i) = lb_coord(i) - 1
    2813            0 :                      icoord = MODULO(lb_coord, rs_desc%group_dim)
    2814            0 :                      idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
    2815            0 :                      lb_domain(i) = lb_domain(i) - (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
    2816              :                   ELSE
    2817              :                      EXIT
    2818              :                   END IF
    2819              :                END DO
    2820              :                ! same for the upper bound
    2821            0 :                DO
    2822            0 :                   IF (ub_domain(i) < ubc(i)) THEN
    2823            0 :                      ub_coord(i) = ub_coord(i) + 1
    2824            0 :                      icoord = MODULO(ub_coord, rs_desc%group_dim)
    2825            0 :                      idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
    2826            0 :                      ub_domain(i) = ub_domain(i) + (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
    2827              :                   ELSE
    2828              :                      EXIT
    2829              :                   END IF
    2830              :                END DO
    2831              :             END IF
    2832              :          END DO
    2833              : 
    2834              :          ! some care is needed for the periodic boundaries
    2835            0 :          DO i = 1, 3
    2836            0 :             IF (ub_domain(i) - lb_domain(i) + 1 >= rs_desc%npts(i)) THEN
    2837            0 :                dir_periodic(i) = .TRUE.
    2838            0 :                lb_coord(i) = 0
    2839            0 :                ub_coord(i) = rs_desc%group_dim(i) - 1
    2840              :             ELSE
    2841            0 :                dir_periodic(i) = .FALSE.
    2842              :             END IF
    2843              :          END DO
    2844              : 
    2845            0 :          added_tasks = PRODUCT(ub_coord - lb_coord + 1)
    2846            0 :          itask = ntasks
    2847            0 :          ntasks = ntasks + added_tasks - 1
    2848            0 :          IF (ntasks > SIZE(tasks)) THEN
    2849            0 :             curr_tasks = INT((SIZE(tasks) + add_tasks)*mult_tasks)
    2850            0 :             CALL reallocate_tasks(tasks, curr_tasks)
    2851              :          END IF
    2852            0 :          DO iz = lb_coord(3), ub_coord(3)
    2853            0 :          DO iy = lb_coord(2), ub_coord(2)
    2854            0 :          DO ix = lb_coord(1), ub_coord(1)
    2855            0 :             icoord = MODULO([ix, iy, iz], rs_desc%group_dim)
    2856            0 :             idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
    2857            0 :             tasks(itask)%destination = encode_rank(idest, igrid_level, n_levels)
    2858            0 :             tasks(itask)%dist_type = 2
    2859            0 :             tasks(itask)%subpatch_pattern = 0
    2860              :             ! encode the domain size for this task
    2861              :             ! if the bit is set, we need to add the border in that direction
    2862            0 :             IF (ix == lb_coord(1) .AND. .NOT. dir_periodic(1)) &
    2863            0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 0)
    2864            0 :             IF (ix == ub_coord(1) .AND. .NOT. dir_periodic(1)) &
    2865            0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 1)
    2866            0 :             IF (iy == lb_coord(2) .AND. .NOT. dir_periodic(2)) &
    2867            0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 2)
    2868            0 :             IF (iy == ub_coord(2) .AND. .NOT. dir_periodic(2)) &
    2869            0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 3)
    2870            0 :             IF (iz == lb_coord(3) .AND. .NOT. dir_periodic(3)) &
    2871            0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 4)
    2872            0 :             IF (iz == ub_coord(3) .AND. .NOT. dir_periodic(3)) &
    2873            0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 5)
    2874            0 :             itask = itask + 1
    2875              :          END DO
    2876              :          END DO
    2877              :          END DO
    2878              :       END IF
    2879              : 
    2880         1380 :    END SUBROUTINE rs_find_node
    2881              : 
    2882              : ! **************************************************************************************************
    2883              : !> \brief utility functions for encoding the grid level with a rank, allowing
    2884              : !>        different grid levels to maintain a different rank ordering without
    2885              : !>        losing information.  These encoded_ints are stored in the tasks(1:2,:) array
    2886              : !> \param rank ...
    2887              : !> \param grid_level ...
    2888              : !> \param n_levels ...
    2889              : !> \return ...
    2890              : !> \par History
    2891              : !>      4.2009 created [Iain Bethune]
    2892              : !>        (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
    2893              : ! **************************************************************************************************
    2894     17192080 :    FUNCTION encode_rank(rank, grid_level, n_levels) RESULT(encoded_int)
    2895              : 
    2896              :       INTEGER, INTENT(IN)                                :: rank, grid_level, n_levels
    2897              :       INTEGER                                            :: encoded_int
    2898              : 
    2899              : ! ordered so can still sort by rank
    2900              : 
    2901     17192080 :       encoded_int = rank*n_levels + grid_level - 1
    2902              : 
    2903     17192080 :    END FUNCTION encode_rank
    2904              : 
    2905              : ! **************************************************************************************************
    2906              : !> \brief ...
    2907              : !> \param encoded_int ...
    2908              : !> \param n_levels ...
    2909              : !> \return ...
    2910              : ! **************************************************************************************************
    2911      8642030 :    FUNCTION decode_rank(encoded_int, n_levels) RESULT(rank)
    2912              : 
    2913              :       INTEGER, INTENT(IN)                                :: encoded_int
    2914              :       INTEGER, INTENT(IN)                                :: n_levels
    2915              :       INTEGER                                            :: rank
    2916              : 
    2917      8642030 :       rank = INT(encoded_int/n_levels)
    2918              : 
    2919      8642030 :    END FUNCTION decode_rank
    2920              : 
    2921              : ! **************************************************************************************************
    2922              : !> \brief ...
    2923              : !> \param encoded_int ...
    2924              : !> \param n_levels ...
    2925              : !> \return ...
    2926              : ! **************************************************************************************************
    2927         7236 :    FUNCTION decode_level(encoded_int, n_levels) RESULT(grid_level)
    2928              : 
    2929              :       INTEGER, INTENT(IN)                                :: encoded_int
    2930              :       INTEGER, INTENT(IN)                                :: n_levels
    2931              :       INTEGER                                            :: grid_level
    2932              : 
    2933         7236 :       grid_level = INT(MODULO(encoded_int, n_levels)) + 1
    2934              : 
    2935         7236 :    END FUNCTION decode_level
    2936              : 
    2937              : ! **************************************************************************************************
    2938              : !> \brief Sort pgf index pair (ipgf,iset,iatom),(jpgf,jset,jatom) for which all atom pairs are
    2939              : !>        grouped, and for each atom pair all set pairs are grouped and for each set pair,
    2940              : !>        all pgfs are grouped.
    2941              : !>        This yields the right order of the tasks for collocation after the sort
    2942              : !>        in distribute_tasks. E.g. for a atom pair, all sets and pgfs are computed in one go.
    2943              : !>        The exception is the gridlevel. Tasks are first ordered wrt to grid_level. This implies
    2944              : !>        that a given density matrix block will be decontracted several times,
    2945              : !>        but cache effects on the grid make up for this.
    2946              : !> \param a ...
    2947              : !> \param b ...
    2948              : !> \return ...
    2949              : !> \author Ole Schuett
    2950              : ! **************************************************************************************************
    2951     81441977 :    PURE FUNCTION tasks_less_than(a, b) RESULT(res)
    2952              :       TYPE(task_type), INTENT(IN)                        :: a, b
    2953              :       LOGICAL                                            :: res
    2954              : 
    2955     81441977 :       IF (a%grid_level /= b%grid_level) THEN
    2956     20279384 :          res = a%grid_level < b%grid_level
    2957              : 
    2958     61162593 :       ELSE IF (a%image /= b%image) THEN
    2959      7834043 :          res = a%image < b%image
    2960              : 
    2961     53328550 :       ELSE IF (a%iatom /= b%iatom) THEN
    2962      6934692 :          res = a%iatom < b%iatom
    2963              : 
    2964     46393858 :       ELSE IF (a%jatom /= b%jatom) THEN
    2965      7562331 :          res = a%jatom < b%jatom
    2966              : 
    2967     38831527 :       ELSE IF (a%iset /= b%iset) THEN
    2968      6046190 :          res = a%iset < b%iset
    2969              : 
    2970     32785337 :       ELSE IF (a%jset /= b%jset) THEN
    2971      6427889 :          res = a%jset < b%jset
    2972              : 
    2973     26357448 :       ELSE IF (a%ipgf /= b%ipgf) THEN
    2974      6848925 :          res = a%ipgf < b%ipgf
    2975              : 
    2976              :       ELSE
    2977     19508523 :          res = a%jpgf < b%jpgf
    2978              : 
    2979              :       END IF
    2980     81441977 :    END FUNCTION tasks_less_than
    2981              : 
    2982    193275283 :    #:call array_sort(prefix='tasks', type='TYPE(task_type)')
    2983              :    #:endcall
    2984              : 
    2985              : ! **************************************************************************************************
    2986              : !> \brief Order atom pairs to find duplicates.
    2987              : !> \param a ...
    2988              : !> \param b ...
    2989              : !> \return ...
    2990              : !> \author Ole Schuett
    2991              : ! **************************************************************************************************
    2992     36460913 :    PURE FUNCTION atom_pair_less_than(a, b) RESULT(res)
    2993              :       TYPE(atom_pair_type), INTENT(IN)                   :: a, b
    2994              :       LOGICAL                                            :: res
    2995              : 
    2996     36460913 :       IF (a%rank /= b%rank) THEN
    2997         4243 :          res = a%rank < b%rank
    2998              : 
    2999     36456670 :       ELSE IF (a%row /= b%row) THEN
    3000      5764761 :          res = a%row < b%row
    3001              : 
    3002     30691909 :       ELSE IF (a%col /= b%col) THEN
    3003      5784008 :          res = a%col < b%col
    3004              : 
    3005              :       ELSE
    3006     24907901 :          res = a%image < b%image
    3007              : 
    3008              :       END IF
    3009     36460913 :    END FUNCTION atom_pair_less_than
    3010              : 
    3011     77467935 :    #:call array_sort(prefix='atom_pair', type='TYPE(atom_pair_type)')
    3012              :    #:endcall
    3013              : 
    3014              : END MODULE task_list_methods
        

Generated by: LCOV version 2.0-1