LCOV - code coverage report
Current view: top level - src/pw - realspace_grid_types.F (source / functions)
Test: CP2K Regtests (git:1a29073)
Date: 2024-04-17 06:30:47
Coverage: Lines: 742 / 791 (93.8 %)    Functions: 19 / 27 (70.4 %)

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \note
      10             : !>      Basic type for real space grid methods
      11             : !> \par History
      12             : !>      JGH (22-May-2002) : New routine rs_grid_zero
      13             : !>      JGH (12-Jun-2002) : Bug fix for mpi groups
      14             : !>      JGH (19-Jun-2003) : Added routine for task distribution
      15             : !>      JGH (23-Nov-2003) : Added routine for task loop separation
      16             : !> \author JGH (18-Mar-2001)
      17             : ! **************************************************************************************************
      18             : MODULE realspace_grid_types
      19             :    USE cp_array_utils,                  ONLY: cp_1d_r_p_type
      20             :    USE cp_log_handling,                 ONLY: cp_to_string
      21             :    USE kahan_sum,                       ONLY: accurate_sum
      22             :    USE kinds,                           ONLY: dp,&
      23             :                                               int_8
      24             :    USE machine,                         ONLY: m_memory
      25             :    USE mathlib,                         ONLY: det_3x3
      26             :    USE message_passing,                 ONLY: mp_comm_null,&
      27             :                                               mp_comm_type,&
      28             :                                               mp_request_null,&
      29             :                                               mp_request_type,&
      30             :                                               mp_waitall,&
      31             :                                               mp_waitany
      32             :    USE offload_api,                     ONLY: offload_buffer_type,&
      33             :                                               offload_create_buffer,&
      34             :                                               offload_free_buffer
      35             :    USE pw_grid_types,                   ONLY: PW_MODE_LOCAL,&
      36             :                                               pw_grid_type
      37             :    USE pw_grids,                        ONLY: pw_grid_release,&
      38             :                                               pw_grid_retain
      39             :    USE pw_methods,                      ONLY: pw_integrate_function
      40             :    USE pw_types,                        ONLY: pw_r3d_rs_type
      41             :    USE util,                            ONLY: get_limit
      42             : 
      43             : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
      44             : 
      45             : #include "../base/base_uses.f90"
      46             : 
      47             :    IMPLICIT NONE
      48             : 
      49             :    PRIVATE
      50             :    PUBLIC :: realspace_grid_type, &
      51             :              realspace_grid_desc_type, &
      52             :              realspace_grid_p_type, &
      53             :              realspace_grid_desc_p_type, &
      54             :              realspace_grid_input_type
      55             : 
      56             :    PUBLIC :: transfer_rs2pw, &
      57             :              transfer_pw2rs, &
      58             :              rs_grid_zero, &
      59             :              rs_grid_set_box, &
      60             :              rs_grid_create, &
      61             :              rs_grid_create_descriptor, &
      62             :              rs_grid_retain_descriptor, &
      63             :              rs_grid_release, &
      64             :              rs_grid_release_descriptor, &
      65             :              rs_grid_reorder_ranks, &
      66             :              rs_grid_print, &
      67             :              rs_grid_locate_rank, &
      68             :              rs_grid_max_ngpts, &
      69             :              rs_grid_mult_and_add, &
      70             :              map_gaussian_here
      71             : 
      72             :    INTEGER, PARAMETER, PUBLIC               :: rsgrid_distributed = 0, &
      73             :                                                rsgrid_replicated = 1, &
      74             :                                                rsgrid_automatic = 2
      75             : 
      76             :    LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .FALSE.
      77             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'realspace_grid_types'
      78             : 
      79             : ! **************************************************************************************************
      80             :    TYPE realspace_grid_input_type
      81             :       INTEGER       :: distribution_type = rsgrid_replicated
      82             :       INTEGER       :: distribution_layout(3) = -1
      83             :       REAL(KIND=dp) :: memory_factor = 0.0_dp
      84             :       LOGICAL       :: lock_distribution = .FALSE.
      85             :       INTEGER       :: nsmax = -1
      86             :       REAL(KIND=dp) :: halo_reduction_factor = 1.0_dp
      87             :    END TYPE realspace_grid_input_type
      88             : 
      89             : ! **************************************************************************************************
      90             :    TYPE realspace_grid_desc_type
      91             :       TYPE(pw_grid_type), POINTER   :: pw => NULL() ! the pw grid
      92             : 
      93             :       INTEGER :: ref_count = 0 ! reference count
      94             : 
      95             :       INTEGER(int_8) :: ngpts = 0_int_8 ! # grid points
      96             :       INTEGER, DIMENSION(3) :: npts = 0 ! # grid points per dimension
      97             :       INTEGER, DIMENSION(3) :: lb = 0 ! lower bounds
      98             :       INTEGER, DIMENSION(3) :: ub = 0 ! upper bounds
      99             : 
     100             :       INTEGER :: border = 0 ! border points
     101             : 
     102             :       INTEGER, DIMENSION(3) :: perd = -1 ! periodicity enforced
     103             :       REAL(KIND=dp), DIMENSION(3, 3) :: dh = 0.0_dp ! incremental grid matrix
     104             :       REAL(KIND=dp), DIMENSION(3, 3) :: dh_inv = 0.0_dp ! inverse incremental grid matrix
     105             :       LOGICAL :: orthorhombic = .TRUE. ! grid symmetry
     106             : 
     107             :       LOGICAL :: parallel = .TRUE. ! whether the corresponding pw grid is distributed
     108             :       LOGICAL :: distributed = .TRUE. ! whether the rs grid is distributed
      109             :       ! whether these MPI-related quantities are meaningful depends on how the grid has been laid out
     110             :       ! they are most useful for fully distributed grids, where they reflect the topology of the grid
     111             :       TYPE(mp_comm_type) :: group = mp_comm_null
     112             :       INTEGER :: my_pos = -1
     113             :       LOGICAL :: group_head = .FALSE.
     114             :       INTEGER :: group_size = 0
     115             :       INTEGER, DIMENSION(3) :: group_dim = -1
     116             :       INTEGER, DIMENSION(3) :: group_coor = -1
     117             :       INTEGER, DIMENSION(3) :: neighbours = -1
     118             :       ! only meaningful on distributed grids
     119             :       ! a list of bounds for each CPU
     120             :       INTEGER, DIMENSION(:, :), ALLOCATABLE :: lb_global
     121             :       INTEGER, DIMENSION(:, :), ALLOCATABLE :: ub_global
     122             :       ! a mapping from linear rank to 3d coord
     123             :       INTEGER, DIMENSION(:, :), ALLOCATABLE :: rank2coord
     124             :       INTEGER, DIMENSION(:, :, :), ALLOCATABLE :: coord2rank
      125             :       ! a mapping from index to rank (which makes it easy to determine on which rank a given grid point resides)
     126             :       INTEGER, DIMENSION(:), ALLOCATABLE :: x2coord
     127             :       INTEGER, DIMENSION(:), ALLOCATABLE :: y2coord
     128             :       INTEGER, DIMENSION(:), ALLOCATABLE :: z2coord
     129             : 
     130             :       INTEGER                :: my_virtual_pos = -1
     131             :       INTEGER, DIMENSION(3) :: virtual_group_coor = -1
     132             : 
     133             :       INTEGER, DIMENSION(:), ALLOCATABLE :: virtual2real, real2virtual
     134             : 
     135             :    END TYPE realspace_grid_desc_type
     136             : 
     137             :    TYPE realspace_grid_type
     138             : 
     139             :       TYPE(realspace_grid_desc_type), POINTER :: desc => NULL()
     140             : 
     141             :       INTEGER :: ngpts_local = -1 ! local dimensions
     142             :       INTEGER, DIMENSION(3) :: npts_local = -1
     143             :       INTEGER, DIMENSION(3) :: lb_local = -1
     144             :       INTEGER, DIMENSION(3) :: ub_local = -1
     145             :       INTEGER, DIMENSION(3) :: lb_real = -1 ! lower bounds of the real local data
     146             :       INTEGER, DIMENSION(3) :: ub_real = -1 ! upper bounds of the real local data
     147             : 
     148             :       INTEGER, DIMENSION(:), ALLOCATABLE         :: px, py, pz ! index translators
     149             :       TYPE(offload_buffer_type)                  :: buffer = offload_buffer_type() ! owner of the grid's memory
     150             :       REAL(KIND=dp), DIMENSION(:, :, :), CONTIGUOUS, POINTER :: r => NULL() ! the grid (pointer to buffer%host_buffer)
     151             : 
     152             :    END TYPE realspace_grid_type
     153             : 
     154             : ! **************************************************************************************************
     155             :    TYPE realspace_grid_p_type
     156             :       TYPE(realspace_grid_type), POINTER :: rs_grid => NULL()
     157             :    END TYPE realspace_grid_p_type
     158             : 
     159             :    TYPE realspace_grid_desc_p_type
     160             :       TYPE(realspace_grid_desc_type), POINTER :: rs_desc => NULL()
     161             :    END TYPE realspace_grid_desc_p_type
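
! **************************************************************************************************
! Illustration (not part of the covered source): a minimal usage sketch of the public API listed
! above. It assumes an already initialised pw_grid / pw pair; the example routine name and the
! collocation step are placeholders, while the called routines and argument shapes follow the
! interfaces declared in this module.
!
!    SUBROUTINE example_rs_grid_usage(pw_grid, pw)
!       USE pw_grid_types, ONLY: pw_grid_type
!       USE pw_types, ONLY: pw_r3d_rs_type
!       USE realspace_grid_types, ONLY: realspace_grid_desc_type, realspace_grid_type, &
!                                       realspace_grid_input_type, rs_grid_create_descriptor, &
!                                       rs_grid_create, rs_grid_zero, transfer_rs2pw, &
!                                       rs_grid_release, rs_grid_release_descriptor
!       TYPE(pw_grid_type), INTENT(INOUT), TARGET :: pw_grid
!       TYPE(pw_r3d_rs_type), INTENT(INOUT)       :: pw
!       TYPE(realspace_grid_desc_type), POINTER   :: desc
!       TYPE(realspace_grid_type)                 :: rs
!       TYPE(realspace_grid_input_type)           :: settings ! defaults give a replicated layout
!
!       NULLIFY (desc)
!       CALL rs_grid_create_descriptor(desc, pw_grid, settings) ! distribution is decided here
!       CALL rs_grid_create(rs, desc)                           ! allocates the local data rs%r
!       CALL rs_grid_zero(rs)
!       ! ... collocate data into rs%r here ...
!       CALL transfer_rs2pw(rs, pw)                             ! sum/copy onto the plane-wave grid
!       CALL rs_grid_release(rs)
!       CALL rs_grid_release_descriptor(desc)
!    END SUBROUTINE example_rs_grid_usage
! **************************************************************************************************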
     162             : 
     163             : CONTAINS
     164             : 
     165             : ! **************************************************************************************************
     166             : !> \brief returns the 1D rank of the task which is a cartesian shift away from 1D rank rank_in
     167             : !>        only possible if rs_grid is a distributed grid
     168             : !> \param rs_desc ...
     169             : !> \param rank_in ...
     170             : !> \param shift ...
     171             : !> \return ...
     172             : ! **************************************************************************************************
     173        2346 :    PURE FUNCTION rs_grid_locate_rank(rs_desc, rank_in, shift) RESULT(rank_out)
     174             :       TYPE(realspace_grid_desc_type), INTENT(IN)         :: rs_desc
     175             :       INTEGER, INTENT(IN)                                :: rank_in
     176             :       INTEGER, DIMENSION(3), INTENT(IN)                  :: shift
     177             :       INTEGER                                            :: rank_out
     178             : 
     179             :       INTEGER                                            :: coord(3)
     180             : 
     181        9384 :       coord = MODULO(rs_desc%rank2coord(:, rank_in) + shift, rs_desc%group_dim)
     182        2346 :       rank_out = rs_desc%coord2rank(coord(1), coord(2), coord(3))
     183        2346 :    END FUNCTION rs_grid_locate_rank
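! Worked example (illustrative values, assuming the row-major rank2coord mapping set up in
! rs_grid_create_descriptor below): with group_dim = (/2, 2, 1/), rank 3 has coordinates
! (1, 1, 0); a shift of (/1, 0, 0/) wraps around to (0, 1, 0), so
! rs_grid_locate_rank(desc, 3, (/1, 0, 0/)) returns 1.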
     184             : 
     185             : ! **************************************************************************************************
     186             : !> \brief Determine the setup of real space grids - this is divided up into the
     187             : !>        creation of a descriptor and the actual grid itself (see rs_grid_create)
     188             : !> \param desc ...
     189             : !> \param pw_grid ...
     190             : !> \param input_settings ...
     191             : !> \param border_points ...
     192             : !> \par History
     193             : !>      JGH (08-Jun-2003) : nsmax <= 0 indicates fully replicated grid
     194             : !>      Iain Bethune (05-Sep-2008) : modified cut heuristic
     195             : !>      (c) The Numerical Algorithms Group (NAG) Ltd, 2008 on behalf of the HECToR project
     196             : !>      - Create a descriptor for realspace grids with a number of border
     197             : !>        points as exactly given by the optional argument border_points.
     198             : !>        These grids are always distributed.
     199             : !>        (27.11.2013, Matthias Krack)
     200             : !> \author JGH (18-Mar-2001)
     201             : ! **************************************************************************************************
     202       30296 :    SUBROUTINE rs_grid_create_descriptor(desc, pw_grid, input_settings, border_points)
     203             :       TYPE(realspace_grid_desc_type), POINTER            :: desc
     204             :       TYPE(pw_grid_type), INTENT(INOUT), TARGET          :: pw_grid
     205             :       TYPE(realspace_grid_input_type), INTENT(IN)        :: input_settings
     206             :       INTEGER, INTENT(IN), OPTIONAL                      :: border_points
     207             : 
     208             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_grid_create_descriptor'
     209             : 
     210             :       INTEGER                                            :: border_size, dir, handle, i, j, k, l, &
     211             :                                                             lb(2), min_npts_real, n_slices(3), &
     212             :                                                             n_slices_tmp(3), nmin
     213             :       LOGICAL                                            :: overlap
     214             :       REAL(KIND=dp)                                      :: ratio, ratio_best, volume, volume_dist
     215             : 
     216       30296 :       CALL timeset(routineN, handle)
     217             : 
     218       30296 :       IF (PRESENT(border_points)) THEN
     219         128 :          border_size = border_points
     220             :       ELSE
     221             :          border_size = 0
     222             :       END IF
     223             : 
     224     1484504 :       ALLOCATE (desc)
     225             : 
     226       30296 :       CALL pw_grid%para%group%sync()
     227             : 
     228       30296 :       desc%pw => pw_grid
     229       30296 :       CALL pw_grid_retain(desc%pw)
     230             : 
     231      393848 :       desc%dh = pw_grid%dh
     232      393848 :       desc%dh_inv = pw_grid%dh_inv
     233       30296 :       desc%orthorhombic = pw_grid%orthorhombic
     234       30296 :       desc%ref_count = 1
     235             : 
     236       30296 :       IF (pw_grid%para%mode == PW_MODE_LOCAL) THEN
     237             :          ! The corresponding group has dimension 1
     238             :          ! All operations will be done locally
     239       15624 :          desc%npts = pw_grid%npts
     240       15624 :          desc%ngpts = PRODUCT(INT(desc%npts, KIND=int_8))
     241       15624 :          desc%lb = pw_grid%bounds(1, :)
     242       15624 :          desc%ub = pw_grid%bounds(2, :)
     243        3906 :          desc%border = border_size
     244        3906 :          IF (border_size == 0) THEN
     245       15624 :             desc%perd = 1
     246             :          ELSE
     247           0 :             desc%perd = 0
     248             :          END IF
     249        3906 :          desc%parallel = .FALSE.
     250        3906 :          desc%distributed = .FALSE.
     251        3906 :          desc%group = mp_comm_null
     252        3906 :          desc%group_size = 1
     253        3906 :          desc%group_head = .TRUE.
     254       15624 :          desc%group_dim = 1
     255       15624 :          desc%group_coor = 0
     256        3906 :          desc%my_pos = 0
     257             :       ELSE
     258             :          ! group size of desc grid
     259             :          ! global grid dimensions are still the same
     260       26390 :          desc%group_size = pw_grid%para%group_size
     261      105560 :          desc%npts = pw_grid%npts
     262      105560 :          desc%ngpts = PRODUCT(INT(desc%npts, KIND=int_8))
     263      105560 :          desc%lb = pw_grid%bounds(1, :)
     264      105560 :          desc%ub = pw_grid%bounds(2, :)
     265             : 
     266             :          ! this is the eventual border size
     267       26390 :          IF (border_size == 0) THEN
     268       26262 :             nmin = (input_settings%nsmax + 1)/2
     269       26262 :             nmin = MAX(0, NINT(nmin*input_settings%halo_reduction_factor))
     270             :          ELSE
     271             :             ! Set explicitly the requested border size
     272             :             nmin = border_size
     273             :          END IF
     274             : 
     275       26390 :          IF (input_settings%distribution_type == rsgrid_replicated) THEN
     276             : 
     277       43448 :             n_slices = 1
     278       10862 :             IF (border_size > 0) THEN
     279             :                CALL cp_abort(__LOCATION__, &
     280             :                              "An explicit border size > 0 is not yet working for "// &
     281             :                              "replicated realspace grids. Request DISTRIBUTION_TYPE "// &
     282           0 :                              "distributed for RS_GRID explicitly.")
     283             :             END IF
     284             : 
     285             :          ELSE
     286             : 
     287       62112 :             n_slices = 1
     288       15528 :             ratio_best = -HUGE(ratio_best)
     289             : 
     290             :             ! don't allow distributions with more processors than real grid points
     291       46584 :             DO k = 1, MIN(desc%npts(3), desc%group_size)
     292      108696 :             DO j = 1, MIN(desc%npts(2), desc%group_size)
     293       62112 :                i = MIN(desc%npts(1), desc%group_size/(j*k))
     294      248448 :                n_slices_tmp = (/i, j, k/)
     295             : 
      296             :                ! skip layouts that do not match the actual number of CPUs
     297      248448 :                IF (PRODUCT(n_slices_tmp) .NE. desc%group_size) CYCLE
     298             : 
      299             :                ! check whether an input constraint has been given,
      300             :                ! i.e. if a layout entry is not -1 we need to fulfil it
     301      372930 :                IF (.NOT. ALL(PACK(n_slices_tmp == input_settings%distribution_layout, &
     302             :                                   (/-1, -1, -1/) /= input_settings%distribution_layout) &
     303       46584 :                              )) CYCLE
     304             : 
      305             :                ! We cannot work with a grid that has more local than global grid points.
     306             :                ! This can happen when a halo region wraps around and overlaps with the other halo.
     307       46358 :                overlap = .FALSE.
     308      185432 :                DO dir = 1, 3
     309      185432 :                   IF (n_slices_tmp(dir) > 1) THEN
     310      139074 :                      DO l = 0, n_slices_tmp(dir) - 1
     311       92716 :                         lb = get_limit(desc%npts(dir), n_slices_tmp(dir), l)
     312      139074 :                         IF (lb(2) - lb(1) + 1 + 2*nmin > desc%npts(dir)) overlap = .TRUE.
     313             :                      END DO
     314             :                   END IF
     315             :                END DO
     316       46358 :                IF (overlap) CYCLE
     317             : 
      318             :                ! a heuristic optimisation to reduce the memory usage:
      319             :                ! we pick the layout with the smallest ratio of local (halo-padded) to real volume,
      320             :                ! i.e. volume of the box without the wings / volume of the box with the wings,
      321             :                ! with prefactors that penalise cuts along X most and slightly favour cuts along Z
     322             :                ratio = PRODUCT(REAL(desc%npts, KIND=dp)/n_slices_tmp)/ &
     323             :                        PRODUCT(REAL(desc%npts, KIND=dp)/n_slices_tmp + &
     324       56854 :                                MERGE((/0.0, 0.0, 0.0/), 2*(/1.06*nmin, 1.05*nmin, 1.03*nmin/), n_slices_tmp == (/1, 1, 1/)))
     325       39178 :                IF (ratio > ratio_best) THEN
     326        8106 :                   ratio_best = ratio
     327        8106 :                   n_slices = n_slices_tmp
     328             :                END IF
     329             : 
     330             :             END DO
     331             :             END DO
     332             : 
      333             :             ! if automatic, we can still decide this is a replicated grid
      334             :             ! if the memory gain (or the gain in messages) is too small.
     335       15528 :             IF (input_settings%distribution_type == rsgrid_automatic) THEN
     336       59912 :                volume = PRODUCT(REAL(desc%npts, KIND=dp))
     337             :                volume_dist = PRODUCT(REAL(desc%npts, KIND=dp)/n_slices + &
     338       59912 :                                      MERGE((/0, 0, 0/), 2*(/nmin, nmin, nmin/), n_slices == (/1, 1, 1/)))
     339       14978 :                IF (volume < volume_dist*input_settings%memory_factor) THEN
     340       59912 :                   n_slices = 1
     341             :                END IF
     342             :             END IF
     343             : 
     344             :          END IF
     345             : 
     346      105560 :          desc%group_dim(:) = n_slices(:)
     347       26390 :          CALL desc%group%from_dup(pw_grid%para%group)
     348       26390 :          desc%group_size = desc%group%num_pe
     349       26390 :          desc%my_pos = desc%group%mepos
     350             : 
     351      105382 :          IF (ALL(n_slices == 1)) THEN
     352             :             ! CASE 1 : only one slice: we do not need overlapping regions and special
     353             :             !          recombination of the total density
     354       26232 :             desc%border = border_size
     355       26232 :             IF (border_size == 0) THEN
     356      104928 :                desc%perd = 1
     357             :             ELSE
     358           0 :                desc%perd = 0
     359             :             END IF
     360       26232 :             desc%distributed = .FALSE.
     361       26232 :             desc%parallel = .TRUE.
     362       26232 :             desc%group_head = pw_grid%para%group_head
     363      104928 :             desc%group_coor(:) = 0
     364       26232 :             desc%my_virtual_pos = 0
     365             : 
     366       78696 :             ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
     367       78696 :             ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
     368             :             ! Start with no reordering
     369       78696 :             DO i = 0, desc%group_size - 1
     370       52464 :                desc%virtual2real(i) = i
     371       78696 :                desc%real2virtual(i) = i
     372             :             END DO
     373             :          ELSE
     374             :             ! CASE 2 : general case
      375             :             ! periodicity is no longer enforced along distributed (or bordered) directions
     376         158 :             IF (border_size == 0) THEN
     377         120 :                desc%perd = 1
     378         120 :                DO dir = 1, 3
     379         120 :                   IF (n_slices(dir) > 1) desc%perd(dir) = 0
     380             :                END DO
     381             :             ELSE
     382         512 :                desc%perd(:) = 0
     383             :             END IF
     384             :             ! we keep a border of nmin points
     385         158 :             desc%border = nmin
     386             :             ! we are going parallel on the real space grid
     387         158 :             desc%parallel = .TRUE.
     388         158 :             desc%distributed = .TRUE.
     389         158 :             desc%group_head = (desc%my_pos == 0)
     390             : 
     391             :             ! set up global info about the distribution
     392         474 :             ALLOCATE (desc%rank2coord(3, 0:desc%group_size - 1))
     393         790 :             ALLOCATE (desc%coord2rank(0:desc%group_dim(1) - 1, 0:desc%group_dim(2) - 1, 0:desc%group_dim(3) - 1))
     394         474 :             ALLOCATE (desc%lb_global(3, 0:desc%group_size - 1))
     395         474 :             ALLOCATE (desc%ub_global(3, 0:desc%group_size - 1))
     396         474 :             ALLOCATE (desc%x2coord(desc%lb(1):desc%ub(1)))
     397         474 :             ALLOCATE (desc%y2coord(desc%lb(2):desc%ub(2)))
     398         474 :             ALLOCATE (desc%z2coord(desc%lb(3):desc%ub(3)))
     399             : 
     400         474 :             DO i = 0, desc%group_size - 1
     401             :                ! Calculate coordinates in a row-major order (to be SMP-friendly)
     402         316 :                desc%rank2coord(1, i) = i/(desc%group_dim(2)*desc%group_dim(3))
     403             :                desc%rank2coord(2, i) = MODULO(i, desc%group_dim(2)*desc%group_dim(3)) &
     404         316 :                                        /desc%group_dim(3)
     405         316 :                desc%rank2coord(3, i) = MODULO(i, desc%group_dim(3))
     406             : 
     407         316 :                IF (i == desc%my_pos) THEN
     408         632 :                   desc%group_coor = desc%rank2coord(:, i)
     409             :                END IF
     410             : 
     411         316 :                desc%coord2rank(desc%rank2coord(1, i), desc%rank2coord(2, i), desc%rank2coord(3, i)) = i
     412             :                ! the lb_global and ub_global correspond to lb_real and ub_real of each task
     413        1264 :                desc%lb_global(:, i) = desc%lb
     414        1264 :                desc%ub_global(:, i) = desc%ub
     415        1422 :                DO dir = 1, 3
     416        1264 :                   IF (desc%group_dim(dir) .GT. 1) THEN
     417         316 :                      lb = get_limit(desc%npts(dir), desc%group_dim(dir), desc%rank2coord(dir, i))
     418         316 :                      desc%lb_global(dir, i) = lb(1) + desc%lb(dir) - 1
     419         316 :                      desc%ub_global(dir, i) = lb(2) + desc%lb(dir) - 1
     420             :                   END IF
     421             :                END DO
     422             :             END DO
     423             : 
     424             :             ! map a grid point to a CPU coord
     425         632 :             DO dir = 1, 3
     426        1264 :                DO l = 0, desc%group_dim(dir) - 1
     427         632 :                   IF (desc%group_dim(dir) .GT. 1) THEN
     428         316 :                      lb = get_limit(desc%npts(dir), desc%group_dim(dir), l)
     429         948 :                      lb = lb + desc%lb(dir) - 1
     430             :                   ELSE
     431         316 :                      lb(1) = desc%lb(dir)
     432         316 :                      lb(2) = desc%ub(dir)
     433             :                   END IF
     434         474 :                   SELECT CASE (dir)
     435             :                   CASE (1)
     436       11696 :                      desc%x2coord(lb(1):lb(2)) = l
     437             :                   CASE (2)
     438       12104 :                      desc%y2coord(lb(1):lb(2)) = l
     439             :                   CASE (3)
     440       12200 :                      desc%z2coord(lb(1):lb(2)) = l
     441             :                   END SELECT
     442             :                END DO
     443             :             END DO
     444             : 
     445             :             ! an upper bound for the number of neighbours the border is overlapping with
     446         632 :             DO dir = 1, 3
     447         474 :                desc%neighbours(dir) = 0
     448         632 :                IF ((n_slices(dir) > 1) .OR. (border_size > 0)) THEN
     449         414 :                   min_npts_real = HUGE(0)
     450         986 :                   DO l = 0, n_slices(dir) - 1
     451         572 :                      lb = get_limit(desc%npts(dir), n_slices(dir), l)
     452         986 :                      min_npts_real = MIN(lb(2) - lb(1) + 1, min_npts_real)
     453             :                   END DO
     454         414 :                   desc%neighbours(dir) = (desc%border + min_npts_real - 1)/min_npts_real
     455             :                END IF
     456             :             END DO
     457             : 
     458         474 :             ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
     459         474 :             ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
     460             :             ! Start with no reordering
     461         474 :             DO i = 0, desc%group_size - 1
     462         316 :                desc%virtual2real(i) = i
     463         474 :                desc%real2virtual(i) = i
     464             :             END DO
     465             : 
     466         158 :             desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
     467         632 :             desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
     468             : 
     469             :          END IF
     470             :       END IF
     471             : 
     472       30296 :       CALL timestop(handle)
     473             : 
     474       30296 :    END SUBROUTINE rs_grid_create_descriptor
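! Illustration (assumed values, not CP2K defaults): requesting a descriptor that is explicitly
! distributed and cut only along Z, with the halo width derived from nsmax as in the heuristic
! above; desc and pw_grid are as in the usage sketch near the top of this file.
!
!    TYPE(realspace_grid_input_type) :: settings
!
!    settings%distribution_type = rsgrid_distributed
!    settings%distribution_layout = (/1, 1, -1/) ! force one slice in X and Y, let Z be chosen
!    settings%nsmax = 25                         ! halo of about (nsmax+1)/2 points per side
!    CALL rs_grid_create_descriptor(desc, pw_grid, settings)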
     475             : 
     476             : ! **************************************************************************************************
     477             : !> \brief ...
     478             : !> \param rs ...
     479             : !> \param desc ...
     480             : ! **************************************************************************************************
     481     4465440 :    SUBROUTINE rs_grid_create(rs, desc)
     482             :       TYPE(realspace_grid_type), INTENT(OUT)             :: rs
     483             :       TYPE(realspace_grid_desc_type), INTENT(INOUT), &
     484             :          TARGET                                          :: desc
     485             : 
     486             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'rs_grid_create'
     487             : 
     488             :       INTEGER                                            :: handle
     489             : 
     490      223272 :       CALL timeset(routineN, handle)
     491             : 
     492      223272 :       rs%desc => desc
     493      223272 :       CALL rs_grid_retain_descriptor(rs%desc)
     494             : 
     495      223272 :       IF (desc%pw%para%mode == PW_MODE_LOCAL) THEN
     496             :          ! The corresponding group has dimension 1
     497             :          ! All operations will be done locally
     498       63224 :          rs%lb_real = desc%lb
     499       63224 :          rs%ub_real = desc%ub
     500       63224 :          rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
     501       63224 :          rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
     502       63224 :          rs%npts_local = rs%ub_local - rs%lb_local + 1
     503       63224 :          rs%ngpts_local = PRODUCT(rs%npts_local)
     504             :       END IF
     505             : 
     506      890704 :       IF (ALL(rs%desc%group_dim == 1)) THEN
     507             :          ! CASE 1 : only one slice: we do not need overlapping regions and special
     508             :          !          recombination of the total density
     509      886032 :          rs%lb_real = desc%lb
     510      886032 :          rs%ub_real = desc%ub
     511      886032 :          rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
     512      886032 :          rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
     513      886032 :          rs%npts_local = rs%ub_local - rs%lb_local + 1
     514      886032 :          rs%ngpts_local = PRODUCT(rs%npts_local)
     515             :       ELSE
     516             :          ! CASE 2 : general case
     517             :          ! extract some more derived quantities about the local grid
     518        7056 :          rs%lb_real = desc%lb_global(:, desc%my_virtual_pos)
     519        7056 :          rs%ub_real = desc%ub_global(:, desc%my_virtual_pos)
     520        7056 :          rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
     521        7056 :          rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
     522        7056 :          rs%npts_local = rs%ub_local - rs%lb_local + 1
     523        7056 :          rs%ngpts_local = PRODUCT(rs%npts_local)
     524             :       END IF
     525             : 
     526      223272 :       CALL offload_create_buffer(rs%ngpts_local, rs%buffer)
     527             :       rs%r(rs%lb_local(1):rs%ub_local(1), &
     528             :            rs%lb_local(2):rs%ub_local(2), &
     529      223272 :            rs%lb_local(3):rs%ub_local(3)) => rs%buffer%host_buffer
     530             : 
     531      669816 :       ALLOCATE (rs%px(desc%npts(1)))
     532      669816 :       ALLOCATE (rs%py(desc%npts(2)))
     533      669816 :       ALLOCATE (rs%pz(desc%npts(3)))
     534             : 
     535      223272 :       CALL timestop(handle)
     536             : 
     537      223272 :    END SUBROUTINE rs_grid_create
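! The pointer assignment with explicit bounds above is standard Fortran 2008 rank remapping:
! rs%r becomes a shifted three-dimensional view of the flat host buffer. A minimal standalone
! sketch of the same idiom (n1, n2, n3 and lb1, lb2, lb3 are assumed extents and lower bounds,
! independent of offload_buffer_type):
!
!    REAL(KIND=dp), DIMENSION(:), CONTIGUOUS, POINTER       :: flat
!    REAL(KIND=dp), DIMENSION(:, :, :), CONTIGUOUS, POINTER :: r3d
!    ALLOCATE (flat(n1*n2*n3))
!    r3d(lb1:lb1 + n1 - 1, lb2:lb2 + n2 - 1, lb3:lb3 + n3 - 1) => flat ! 3D view with shifted lower bounds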
     538             : 
     539             : ! **************************************************************************************************
     540             : !> \brief Defines a new ordering of ranks on this realspace grid, recalculating
     541             : !>        the data bounds and reallocating the grid.  As a result, each MPI process
     542             : !>        now has a real rank (i.e., its rank in the MPI communicator from the pw grid)
     543             : !>        and a virtual rank (the rank of the process where the data now owned by this
     544             : !>        process would reside in an ordinary cartesian distribution).
     545             : !>        NB. Since the grid size required may change, the caller should be sure to release
     546             : !>        and recreate the corresponding rs_grids
     547             : !>        The desc%real2virtual and desc%virtual2real arrays can be used to map
     548             : !>        a physical rank to the 'rank' of data owned by that process and vice versa
     549             : !> \param desc ...
     550             : !> \param real2virtual ...
     551             : !> \par History
     552             : !>        04-2009 created [Iain Bethune]
     553             : !>          (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
     554             : ! **************************************************************************************************
     555           6 :    PURE SUBROUTINE rs_grid_reorder_ranks(desc, real2virtual)
     556             : 
     557             :       TYPE(realspace_grid_desc_type), INTENT(INOUT)      :: desc
     558             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: real2virtual
     559             : 
     560             :       INTEGER                                            :: i
     561             : 
     562          18 :       desc%real2virtual(:) = real2virtual
     563             : 
     564          18 :       DO i = 0, desc%group_size - 1
     565          18 :          desc%virtual2real(desc%real2virtual(i)) = i
     566             :       END DO
     567             : 
     568           6 :       desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
     569             : 
     570          12 :       IF (.NOT. ALL(desc%group_dim == 1)) THEN
     571          24 :          desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
     572             :       END IF
     573             : 
     574           6 :    END SUBROUTINE
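! Worked example (illustrative values): with group_size = 4 and real2virtual = (/1, 2, 3, 0/),
! the loop above yields virtual2real = (/3, 0, 1, 2/); the process with my_pos = 0 then gets
! my_virtual_pos = 1, i.e. it now owns the data that rank 1 would hold in the ordinary
! cartesian distribution.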
     575             : 
     576             : ! **************************************************************************************************
     577             : !> \brief Print information on grids to output
     578             : !> \param rs ...
     579             : !> \param iounit ...
     580             : !> \author JGH (17-May-2007)
     581             : ! **************************************************************************************************
     582       13434 :    SUBROUTINE rs_grid_print(rs, iounit)
     583             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     584             :       INTEGER, INTENT(in)                                :: iounit
     585             : 
     586             :       INTEGER                                            :: dir, i, nn
     587             :       REAL(KIND=dp)                                      :: pp(3)
     588             : 
     589       13434 :       IF (rs%desc%parallel) THEN
     590       13152 :          IF (iounit > 0) THEN
     591             :             WRITE (iounit, '(/,A,T71,I10)') &
     592        5864 :                " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
     593       23456 :             DO i = 1, 3
     594       17592 :                WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID|   Bounds ", &
     595       41048 :                   i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
     596             :             END DO
     597        5864 :             IF (.NOT. rs%desc%distributed) THEN
     598        5849 :                WRITE (iounit, '(A)') " RS_GRID| Real space fully replicated"
     599             :                WRITE (iounit, '(A,T71,I10)') &
     600        5849 :                   " RS_GRID| Group size ", rs%desc%group_dim(2)
     601             :             ELSE
     602          60 :                DO dir = 1, 3
     603          60 :                   IF (rs%desc%perd(dir) /= 1) THEN
     604             :                      WRITE (iounit, '(A,T71,I3,A)') &
     605          15 :                         " RS_GRID| Real space distribution over ", rs%desc%group_dim(dir), " groups"
     606             :                      WRITE (iounit, '(A,T71,I10)') &
     607          15 :                         " RS_GRID| Real space distribution along direction ", dir
     608             :                      WRITE (iounit, '(A,T71,I10)') &
     609          15 :                         " RS_GRID| Border size ", rs%desc%border
     610             :                   END IF
     611             :                END DO
     612             :             END IF
     613             :          END IF
     614       13152 :          IF (rs%desc%distributed) THEN
     615         120 :             DO dir = 1, 3
     616         120 :                IF (rs%desc%perd(dir) /= 1) THEN
     617          30 :                   nn = rs%npts_local(dir)
     618          30 :                   CALL rs%desc%group%sum(nn)
     619         120 :                   pp(1) = REAL(nn, KIND=dp)/REAL(PRODUCT(rs%desc%group_dim), KIND=dp)
     620          30 :                   nn = rs%npts_local(dir)
     621          30 :                   CALL rs%desc%group%max(nn)
     622          30 :                   pp(2) = REAL(nn, KIND=dp)
     623          30 :                   nn = rs%npts_local(dir)
     624          30 :                   CALL rs%desc%group%min(nn)
     625          30 :                   pp(3) = REAL(nn, KIND=dp)
     626          30 :                   IF (iounit > 0) THEN
     627          15 :                      WRITE (iounit, '(A,T48,A)') " RS_GRID|   Distribution", &
     628          30 :                         "  Average         Max         Min"
     629          15 :                      WRITE (iounit, '(A,T45,F12.1,2I12)') " RS_GRID|   Planes   ", &
     630          30 :                         pp(1), NINT(pp(2)), NINT(pp(3))
     631             :                   END IF
     632             :                END IF
     633             :             END DO
     634             : !          WRITE ( iounit, '(/)' )
     635             :          END IF
     636             :       ELSE
     637         282 :          IF (iounit > 0) THEN
     638             :             WRITE (iounit, '(/,A,T71,I10)') &
     639         172 :                " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
     640         688 :             DO i = 1, 3
     641         516 :                WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID|   Bounds ", &
     642        1204 :                   i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
     643             :             END DO
     644             : !         WRITE ( iounit, '(/)' )
     645             :          END IF
     646             :       END IF
     647             : 
     648       13434 :    END SUBROUTINE rs_grid_print
     649             : 
     650             : ! **************************************************************************************************
     651             : !> \brief ...
     652             : !> \param rs ...
     653             : !> \param pw ...
     654             : ! **************************************************************************************************
     655     1038488 :    SUBROUTINE transfer_rs2pw(rs, pw)
     656             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     657             :       TYPE(pw_r3d_rs_type), INTENT(INOUT)                :: pw
     658             : 
     659             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'transfer_rs2pw'
     660             : 
     661             :       INTEGER                                            :: handle, handle2, i
     662             : 
     663     1038488 :       CALL timeset(routineN, handle2)
     664     1038488 :       CALL timeset(routineN//"_"//TRIM(ADJUSTL(cp_to_string(CEILING(pw%pw_grid%cutoff/10)*10))), handle)
     665             : 
     666     1038488 :       IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
      667           0 :          CPABORT("Different rs and pw identifiers")
     668             : 
     669     1038488 :       IF (rs%desc%distributed) THEN
     670        1840 :          CALL transfer_rs2pw_distributed(rs, pw)
     671     1036648 :       ELSE IF (rs%desc%parallel) THEN
     672      835202 :          CALL transfer_rs2pw_replicated(rs, pw)
     673             :       ELSE ! treat simple serial case locally
     674      201446 :          IF (rs%desc%border == 0) THEN
     675      201446 :             CALL dcopy(SIZE(rs%r), rs%r, 1, pw%array, 1)
     676             :          ELSE
     677           0 :             CPASSERT(LBOUND(pw%array, 3) .EQ. rs%lb_real(3))
     678           0 : !$OMP          PARALLEL DO DEFAULT(NONE) SHARED(pw,rs)
     679             :             DO i = rs%lb_real(3), rs%ub_real(3)
     680             :                pw%array(:, :, i) = rs%r(rs%lb_real(1):rs%ub_real(1), &
     681             :                                         rs%lb_real(2):rs%ub_real(2), i)
     682             :             END DO
     683             : !$OMP          END PARALLEL DO
     684             :          END IF
     685             :       END IF
     686             : 
     687     1038488 :       CALL timestop(handle)
     688     1038488 :       CALL timestop(handle2)
     689             : 
     690     1038488 :    END SUBROUTINE transfer_rs2pw
     691             : 
     692             : ! **************************************************************************************************
     693             : !> \brief ...
     694             : !> \param rs ...
     695             : !> \param pw ...
     696             : ! **************************************************************************************************
     697     1013115 :    SUBROUTINE transfer_pw2rs(rs, pw)
     698             : 
     699             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     700             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw
     701             : 
     702             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'transfer_pw2rs'
     703             : 
     704             :       INTEGER                                            :: handle, handle2, i, im, j, jm, k, km
     705             : 
     706     1013115 :       CALL timeset(routineN, handle2)
     707     1013115 :       CALL timeset(routineN//"_"//TRIM(ADJUSTL(cp_to_string(CEILING(pw%pw_grid%cutoff/10)*10))), handle)
     708             : 
     709     1013115 :       IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
      710           0 :          CPABORT("Different rs and pw identifiers")
     711             : 
     712     1013115 :       IF (rs%desc%distributed) THEN
     713         868 :          CALL transfer_pw2rs_distributed(rs, pw)
     714     1012247 :       ELSE IF (rs%desc%parallel) THEN
     715      775858 :          CALL transfer_pw2rs_replicated(rs, pw)
     716             :       ELSE ! treat simple serial case locally
     717      236389 :          IF (rs%desc%border == 0) THEN
     718      236389 :             CALL dcopy(SIZE(rs%r), pw%array, 1, rs%r, 1)
     719             :          ELSE
     720             : !$OMP          PARALLEL DO DEFAULT(NONE) &
     721             : !$OMP                      PRIVATE(i,im,j,jm,k,km) &
     722           0 : !$OMP                      SHARED(pw,rs)
     723             :             DO k = rs%lb_local(3), rs%ub_local(3)
     724             :                IF (k < rs%lb_real(3)) THEN
     725             :                   km = k + rs%desc%npts(3)
     726             :                ELSE IF (k > rs%ub_real(3)) THEN
     727             :                   km = k - rs%desc%npts(3)
     728             :                ELSE
     729             :                   km = k
     730             :                END IF
     731             :                DO j = rs%lb_local(2), rs%ub_local(2)
     732             :                   IF (j < rs%lb_real(2)) THEN
     733             :                      jm = j + rs%desc%npts(2)
     734             :                   ELSE IF (j > rs%ub_real(2)) THEN
     735             :                      jm = j - rs%desc%npts(2)
     736             :                   ELSE
     737             :                      jm = j
     738             :                   END IF
     739             :                   DO i = rs%lb_local(1), rs%ub_local(1)
     740             :                      IF (i < rs%lb_real(1)) THEN
     741             :                         im = i + rs%desc%npts(1)
     742             :                      ELSE IF (i > rs%ub_real(1)) THEN
     743             :                         im = i - rs%desc%npts(1)
     744             :                      ELSE
     745             :                         im = i
     746             :                      END IF
     747             :                      rs%r(i, j, k) = pw%array(im, jm, km)
     748             :                   END DO
     749             :                END DO
     750             :             END DO
     751             : !$OMP          END PARALLEL DO
     752             :          END IF
     753             :       END IF
     754             : 
     755     1013115 :       CALL timestop(handle)
     756     1013115 :       CALL timestop(handle2)
     757             : 
     758     1013115 :    END SUBROUTINE transfer_pw2rs
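! Note on the serial branch above: halo points (indices outside [lb_real, ub_real]) are filled
! by periodic wrapping, i.e. an index is mapped back into the real range by adding or
! subtracting desc%npts before reading from pw%array.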
     759             : 
     760             : ! **************************************************************************************************
     761             : !> \brief transfer from a realspace grid to a planewave grid
     762             : !> \param rs ...
     763             : !> \param pw ...
     764             : ! **************************************************************************************************
     765      835202 :    SUBROUTINE transfer_rs2pw_replicated(rs, pw)
     766             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     767             :       TYPE(pw_r3d_rs_type), INTENT(INOUT)                :: pw
     768             : 
     769             :       INTEGER                                            :: dest, ii, ip, ix, iy, iz, nma, nn, s(3), &
     770             :                                                             source
     771      835202 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: rcount
     772             :       INTEGER, DIMENSION(3)                              :: lb, ub
     773      835202 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: recvbuf, sendbuf, swaparray
     774             : 
     775             :       ASSOCIATE (np => pw%pw_grid%para%group_size, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group_size - 1, 1), &
     776             :                  pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%rs_group, mepos => pw%pw_grid%para%rs_mpo, &
     777             :                  grid => rs%r)
     778     2505606 :          ALLOCATE (rcount(0:np - 1))
     779     2505606 :          DO ip = 1, np
     780     7516818 :             rcount(ip - 1) = PRODUCT(bo(2, :, ip) - bo(1, :, ip) + 1)
     781             :          END DO
     782     2505606 :          nma = MAXVAL(rcount(0:np - 1))
     783     4176010 :          ALLOCATE (sendbuf(nma), recvbuf(nma))
     784 31497273648 :          sendbuf = 1.0E99_dp; recvbuf = 1.0E99_dp ! init mpi'ed buffers to silence warnings under valgrind
     785             : 
     786             :          !sample peak memory
     787      835202 :          CALL m_memory()
     788             : 
     789      835202 :          dest = MODULO(mepos + 1, np)
     790      835202 :          source = MODULO(mepos - 1, np)
     791 15748636824 :          sendbuf = 0.0_dp
     792             : 
     793     1670404 :          DO ip = 1, np
     794             : 
     795     6681616 :             lb = pbo(1, :) + bo(1, :, MODULO(mepos - ip, np) + 1) - 1
     796     6681616 :             ub = pbo(1, :) + bo(2, :, MODULO(mepos - ip, np) + 1) - 1
     797             :             ! this loop takes about the same time as the message passing call
      798             :             ! note that ix spans only a small fraction of the first index of grid;
      799             :             ! it therefore seems faster to keep the second index as the innermost loop
      800             :             ! when this runs on many CPUs
      801             :             ! (tested on itanium, pentium4, opteron, ultrasparc...)
     802     6681616 :             s = ub - lb + 1
     803    41834180 :             DO iz = lb(3), ub(3)
     804   741220000 :                DO ix = lb(1), ub(1)
     805   699385820 :                   ii = (iz - lb(3))*s(1)*s(2) + (ix - lb(1)) + 1
     806 32023406526 :                   DO iy = lb(2), ub(2)
     807 31283856930 :                      sendbuf(ii) = sendbuf(ii) + grid(ix, iy, iz)
     808 31983242750 :                      ii = ii + s(1)
     809             :                   END DO
     810             :                END DO
     811             :             END DO
     812     1670404 :             IF (ip .EQ. np) EXIT
     813      835202 :             CALL group%sendrecv(sendbuf, dest, recvbuf, source, 13)
     814      835202 :             CALL MOVE_ALLOC(sendbuf, swaparray)
     815      835202 :             CALL MOVE_ALLOC(recvbuf, sendbuf)
     816     1670404 :             CALL MOVE_ALLOC(swaparray, recvbuf)
     817             :          END DO
     818      835202 :          nn = rcount(mepos)
     819             :       END ASSOCIATE
     820             : 
     821      835202 :       CALL dcopy(nn, sendbuf, 1, pw%array, 1)
     822             : 
     823      835202 :       DEALLOCATE (rcount)
     824      835202 :       DEALLOCATE (sendbuf)
     825      835202 :       DEALLOCATE (recvbuf)
     826             : 
     827      835202 :    END SUBROUTINE transfer_rs2pw_replicated
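! Note on the routine above: the ip loop implements a ring reduce-scatter. A zero-initialised
! buffer is passed from mepos to mepos+1; at step ip each rank adds its local copy of the slab
! owned by rank MODULO(mepos-ip, np), so after np-1 shifts every rank holds the fully summed
! data of its own pw slab, which is finally copied into pw%array.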
     828             : 
     829             : ! **************************************************************************************************
     830             : !> \brief transfer from a planewave grid to a realspace grid
     831             : !> \param rs ...
     832             : !> \param pw ...
     833             : ! **************************************************************************************************
     834      775858 :    SUBROUTINE transfer_pw2rs_replicated(rs, pw)
     835             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     836             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw
     837             : 
     838             :       INTEGER                                            :: dest, i, ii, im, ip, ix, iy, iz, j, jm, &
     839             :                                                             k, km, nma, nn, source
     840      775858 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: rcount
     841             :       INTEGER, DIMENSION(3)                              :: lb, ub
     842      775858 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: recvbuf, sendbuf, swaparray
     843     2327574 :       TYPE(mp_request_type), DIMENSION(2)                :: req
     844             : 
     845             :       ASSOCIATE (np => pw%pw_grid%para%group_size, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group_size - 1, 1), &
     846             :                  pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%rs_group, mepos => pw%pw_grid%para%rs_mpo, &
     847             :                  grid => rs%r)
     848     2327574 :          ALLOCATE (rcount(0:np - 1))
     849     2327574 :          DO ip = 1, np
     850     6982722 :             rcount(ip - 1) = PRODUCT(bo(2, :, ip) - bo(1, :, ip) + 1)
     851             :          END DO
     852     2327574 :          nma = MAXVAL(rcount(0:np - 1))
     853     3879290 :          ALLOCATE (sendbuf(nma), recvbuf(nma))
     854 30713859552 :          sendbuf = 1.0E99_dp; recvbuf = 1.0E99_dp ! init mpi'ed buffers to silence warnings under valgrind
     855             : 
      856             :          ! sample peak memory
     857      775858 :          CALL m_memory()
     858             : 
     859      775858 :          nn = rcount(mepos)
     860      775858 :          CALL dcopy(nn, pw%array, 1, sendbuf, 1)
     861             : 
     862      775858 :          dest = MODULO(mepos + 1, np)
     863      775858 :          source = MODULO(mepos - 1, np)
     864             : 
     865     2327574 :          DO ip = 0, np - 1
     866             :             ! we must shift the buffer only np-1 times around
     867     1551716 :             IF (ip .NE. np - 1) THEN
     868             :                CALL group%isendrecv(sendbuf, dest, recvbuf, source, &
     869      775858 :                                     req(1), req(2), 13)
     870             :             END IF
     871     6206864 :             lb = pbo(1, :) + bo(1, :, MODULO(mepos - ip, np) + 1) - 1
     872     6206864 :             ub = pbo(1, :) + bo(2, :, MODULO(mepos - ip, np) + 1) - 1
     873     1551716 :             ii = 0
      874             :             ! this loop takes about the same time as the message passing call,
      875             :             ! so it is effectively overlapped with the non-blocking isendrecv posted above
     876    39896756 :             DO iz = lb(3), ub(3)
     877  1382097252 :                DO iy = lb(2), ub(2)
     878 31890640682 :                   DO ix = lb(1), ub(1)
     879 30510095146 :                      ii = ii + 1
     880 31852295642 :                      grid(ix, iy, iz) = sendbuf(ii)
     881             :                   END DO
     882             :                END DO
     883             :             END DO
     884     1551716 :             IF (ip .NE. np - 1) THEN
     885      775858 :                CALL mp_waitall(req)
     886             :             END IF
     887     1551716 :             CALL MOVE_ALLOC(sendbuf, swaparray)
     888     1551716 :             CALL MOVE_ALLOC(recvbuf, sendbuf)
     889     2327574 :             CALL MOVE_ALLOC(swaparray, recvbuf)
     890             :          END DO
     891     1551716 :          IF (rs%desc%border > 0) THEN
     892             : !$OMP       PARALLEL DO DEFAULT(NONE) &
     893             : !$OMP                   PRIVATE(i,im,j,jm,k,km) &
     894           0 : !$OMP                   SHARED(rs)
     895             :             DO k = rs%lb_local(3), rs%ub_local(3)
     896             :                IF (k < rs%lb_real(3)) THEN
     897             :                   km = k + rs%desc%npts(3)
     898             :                ELSE IF (k > rs%ub_real(3)) THEN
     899             :                   km = k - rs%desc%npts(3)
     900             :                ELSE
     901             :                   km = k
     902             :                END IF
     903             :                DO j = rs%lb_local(2), rs%ub_local(2)
     904             :                   IF (j < rs%lb_real(2)) THEN
     905             :                      jm = j + rs%desc%npts(2)
     906             :                   ELSE IF (j > rs%ub_real(2)) THEN
     907             :                      jm = j - rs%desc%npts(2)
     908             :                   ELSE
     909             :                      jm = j
     910             :                   END IF
     911             :                   DO i = rs%lb_local(1), rs%ub_local(1)
     912             :                      IF (i < rs%lb_real(1)) THEN
     913             :                         im = i + rs%desc%npts(1)
     914             :                      ELSE IF (i > rs%ub_real(1)) THEN
     915             :                         im = i - rs%desc%npts(1)
     916             :                      ELSE
     917             :                         im = i
     918             :                      END IF
     919             :                      rs%r(i, j, k) = rs%r(im, jm, km)
     920             :                   END DO
     921             :                END DO
     922             :             END DO
     923             : !$OMP       END PARALLEL DO
     924             :          END IF
     925             :       END ASSOCIATE
     926             : 
     927      775858 :       DEALLOCATE (rcount)
     928      775858 :       DEALLOCATE (sendbuf)
     929      775858 :       DEALLOCATE (recvbuf)
     930             : 
     931      775858 :    END SUBROUTINE transfer_pw2rs_replicated
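
! **************************************************************************************************
!> \brief minimal sketch (illustration only; the name and interface below are ad hoc and not
!>        referenced anywhere in the module) of the periodic index folding applied to the
!>        border points in transfer_pw2rs_replicated above: a halo index outside the owned
!>        (real) range is mapped onto its periodic image by shifting it by the full number of
!>        grid points along that direction
!> \param i local grid index, assumed to lie at most one period outside [lb_real, ub_real]
!> \param lb_real lower bound of the owned range in this direction
!> \param ub_real upper bound of the owned range in this direction
!> \param npts total number of grid points in this direction
!> \return index of the periodic image inside [lb_real, ub_real]
! **************************************************************************************************
   ELEMENTAL FUNCTION fold_halo_index_sketch(i, lb_real, ub_real, npts) RESULT(im)
      INTEGER, INTENT(IN)                                :: i, lb_real, ub_real, npts
      INTEGER                                            :: im

      IF (i < lb_real) THEN
         im = i + npts      ! below the owned range: wrap upwards by one period
      ELSE IF (i > ub_real) THEN
         im = i - npts      ! above the owned range: wrap downwards by one period
      ELSE
         im = i             ! already inside the owned range
      END IF
   END FUNCTION fold_halo_index_sketch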
     932             : 
     933             : ! **************************************************************************************************
     934             : !> \brief does the rs2pw transfer in the case where the rs grid is
     935             : !>       distributed (3D domain decomposition)
     936             : !> \param rs ...
     937             : !> \param pw ...
     938             : !> \par History
     939             : !>      12.2007 created [Matt Watkins]
     940             : !>      9.2008 reduced amount of halo data sent [Iain Bethune]
     941             : !>      10.2008 added non-blocking communication [Iain Bethune]
     942             : !>      4.2009 added support for rank-reordering on the grid [Iain Bethune]
     943             : !>      12.2009 added OMP and sparse alltoall [Iain Bethune]
     944             : !>              (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
     945             : !> \note
     946             : !>       the transfer is a two step procedure. For example, for the rs2pw transfer:
     947             : !>
     948             : !>       1) Halo-exchange in 3D so that the local part of the rs_grid contains the full data
     949             : !>       2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
     950             : !>
      951             : !>       the halo exchange is most expensive on a large number of CPUs. A peculiarity of this halo
      952             : !>       exchange is that the border region is rather large (e.g. 20 points) and that it might overlap
      953             : !>       with the central domain of several CPUs (i.e. next-nearest neighbours)
     954             : ! **************************************************************************************************
     955        1840 :    SUBROUTINE transfer_rs2pw_distributed(rs, pw)
     956             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     957             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw
     958             : 
     959             :       CHARACTER(LEN=200)                                 :: error_string
     960             :       INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
     961             :          n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
     962        1840 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: dshifts, recv_disps, recv_sizes, &
     963        1840 :                                                             send_disps, send_sizes, ushifts
     964        3680 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: bounds, recv_tasks, send_tasks
     965             :       INTEGER, DIMENSION(2)                              :: neighbours, pos
     966             :       INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
     967             :          lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
     968             :       LOGICAL, DIMENSION(3)                              :: halo_swapped
     969             :       REAL(KIND=dp)                                      :: pw_sum, rs_sum
     970        1840 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: recv_buf_3d_down, recv_buf_3d_up, &
     971        1840 :                                                             send_buf_3d_down, send_buf_3d_up
     972        3680 :       TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:)    :: recv_bufs, send_bufs
     973        1840 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: recv_reqs, send_reqs
     974        9200 :       TYPE(mp_request_type), DIMENSION(4)                :: req
     975             : 
     976        1840 :       num_threads = 1
     977        1840 :       my_id = 0
     978             : 
      979             :       ! safety check, to be removed once we're absolutely sure the routine is correct
     980             :       IF (debug_this_module) THEN
     981             :          rs_sum = accurate_sum(rs%r)*ABS(det_3x3(rs%desc%dh))
     982             :          CALL rs%desc%group%sum(rs_sum)
     983             :       END IF
     984             : 
     985        1840 :       halo_swapped = .FALSE.
     986             :       ! We don't need to send the 'edges' of the halos that have already been sent
     987             :       ! Halos are contiguous in memory in z-direction only, so swap these first,
     988             :       ! and send less data in the y and x directions which are more expensive
     989             : 
     990        7360 :       DO idir = 3, 1, -1
     991             : 
     992        5520 :          IF (rs%desc%perd(idir) .NE. 1) THEN
     993             : 
     994       14304 :             ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
     995       14304 :             ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
     996             : 
     997       14304 :             ushifts = 0
     998       14304 :             dshifts = 0
     999             : 
     1000             :             ! make sure we don't try to send data to ourselves
    1001        6608 :             DO n_shifts = 1, MIN(rs%desc%neighbours(idir), rs%desc%group_dim(idir) - 1)
    1002             : 
    1003             :                ! need to take into account the possible varying widths of neighbouring cells
     1004             :                ! ushifts and dshifts hold the real size of the neighbouring cells
    1005        1840 :                position = MODULO(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
    1006        1840 :                neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
    1007        1840 :                dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
    1008             : 
    1009        1840 :                position = MODULO(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
    1010        1840 :                neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
    1011        1840 :                ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
    1012             : 
     1013             :                ! The border data has to be sent to / received from the neighbours
    1014             :                ! First we calculate the source and destination processes for the shift
    1015             :                ! We do both shifts at once to allow for more overlap of communication and buffer packing/unpacking
    1016             : 
    1017        1840 :                CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
    1018             : 
    1019        7360 :                lb_send_down(:) = rs%lb_local(:)
    1020        7360 :                lb_recv_down(:) = rs%lb_local(:)
    1021        7360 :                ub_recv_down(:) = rs%ub_local(:)
    1022        7360 :                ub_send_down(:) = rs%ub_local(:)
    1023             : 
    1024        1840 :                IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
    1025        1840 :                   ub_send_down(idir) = lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
    1026             :                   lb_send_down(idir) = MAX(lb_send_down(idir), &
    1027        1840 :                                            lb_send_down(idir) + rs%desc%border - dshifts(n_shifts))
    1028             : 
    1029        1840 :                   ub_recv_down(idir) = ub_recv_down(idir) - rs%desc%border
    1030             :                   lb_recv_down(idir) = MAX(lb_recv_down(idir) + rs%desc%border, &
    1031        1840 :                                            ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
    1032             :                ELSE
    1033           0 :                   lb_send_down(idir) = 0
    1034           0 :                   ub_send_down(idir) = -1
    1035           0 :                   lb_recv_down(idir) = 0
    1036           0 :                   ub_recv_down(idir) = -1
    1037             :                END IF
    1038             : 
    1039        7360 :                DO i = 1, 3
    1040        7360 :                   IF (halo_swapped(i)) THEN
    1041         554 :                      lb_send_down(i) = rs%lb_real(i)
    1042         554 :                      ub_send_down(i) = rs%ub_real(i)
    1043         554 :                      lb_recv_down(i) = rs%lb_real(i)
    1044         554 :                      ub_recv_down(i) = rs%ub_real(i)
    1045             :                   END IF
    1046             :                END DO
    1047             : 
    1048             :                ! post the receive
    1049           0 :                ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
    1050        9200 :                                           lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
    1051        1840 :                CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
    1052             : 
    1053             :                ! now allocate, pack and send the send buffer
    1054        7360 :                nn = PRODUCT(ub_send_down - lb_send_down + 1)
    1055           0 :                ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
    1056        9200 :                                           lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
    1057             : 
    1058             : !$OMP PARALLEL DEFAULT(NONE), &
    1059             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1060        1840 : !$OMP          SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
    1061             : !$             num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
    1062             : !$             my_id = omp_get_thread_num()
    1063             :                IF (my_id < num_threads) THEN
    1064             :                   lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
    1065             :                   ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
    1066             : 
    1067             :                   send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
    1068             :                                    lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
    1069             :                                                  lb_send_down(2):ub_send_down(2), lb:ub)
    1070             :                END IF
    1071             : !$OMP END PARALLEL
    1072             : 
    1073        1840 :                CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
    1074             : 
    1075             :                ! Now for the other direction
    1076        1840 :                CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
    1077             : 
    1078        7360 :                lb_send_up(:) = rs%lb_local(:)
    1079        7360 :                lb_recv_up(:) = rs%lb_local(:)
    1080        7360 :                ub_recv_up(:) = rs%ub_local(:)
    1081        7360 :                ub_send_up(:) = rs%ub_local(:)
    1082             : 
    1083        1840 :                IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
    1084             : 
    1085        1840 :                   lb_send_up(idir) = ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
    1086             :                   ub_send_up(idir) = MIN(ub_send_up(idir), &
    1087        1840 :                                          ub_send_up(idir) - rs%desc%border + ushifts(n_shifts))
    1088             : 
    1089        1840 :                   lb_recv_up(idir) = lb_recv_up(idir) + rs%desc%border
    1090             :                   ub_recv_up(idir) = MIN(ub_recv_up(idir) - rs%desc%border, &
    1091        1840 :                                          lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
    1092             :                ELSE
    1093           0 :                   lb_send_up(idir) = 0
    1094           0 :                   ub_send_up(idir) = -1
    1095           0 :                   lb_recv_up(idir) = 0
    1096           0 :                   ub_recv_up(idir) = -1
    1097             :                END IF
    1098             : 
    1099        7360 :                DO i = 1, 3
    1100        7360 :                   IF (halo_swapped(i)) THEN
    1101         554 :                      lb_send_up(i) = rs%lb_real(i)
    1102         554 :                      ub_send_up(i) = rs%ub_real(i)
    1103         554 :                      lb_recv_up(i) = rs%lb_real(i)
    1104         554 :                      ub_recv_up(i) = rs%ub_real(i)
    1105             :                   END IF
    1106             :                END DO
    1107             : 
    1108             :                ! post the receive
    1109           0 :                ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
    1110        9200 :                                         lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
    1111        1840 :                CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
    1112             : 
     1113             :                ! now allocate, pack and send the send buffer
    1114        7360 :                nn = PRODUCT(ub_send_up - lb_send_up + 1)
    1115           0 :                ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
    1116        9200 :                                         lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
    1117             : 
    1118             : !$OMP PARALLEL DEFAULT(NONE), &
    1119             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1120        1840 : !$OMP          SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
    1121             : !$             num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
    1122             : !$             my_id = omp_get_thread_num()
    1123             :                IF (my_id < num_threads) THEN
    1124             :                   lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
    1125             :                   ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
    1126             : 
    1127             :                   send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
    1128             :                                  lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
    1129             :                                                lb_send_up(2):ub_send_up(2), lb:ub)
    1130             :                END IF
    1131             : !$OMP END PARALLEL
    1132             : 
    1133        1840 :                CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
    1134             : 
    1135             :                ! wait for a recv to complete, then we can unpack
    1136             : 
    1137        5520 :                DO i = 1, 2
    1138             : 
    1139        3680 :                   CALL mp_waitany(req(1:2), completed)
    1140             : 
    1141        5520 :                   IF (completed .EQ. 1) THEN
    1142             : 
    1143             :                      ! only some procs may need later shifts
    1144        1840 :                      IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
    1145             :                         ! Sum the data in the RS Grid
    1146             : !$OMP PARALLEL DEFAULT(NONE), &
    1147             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1148        1840 : !$OMP          SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
    1149             : !$                      num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
    1150             : !$                      my_id = omp_get_thread_num()
    1151             :                         IF (my_id < num_threads) THEN
    1152             :                            lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
    1153             :                            ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
    1154             : 
    1155             :                            rs%r(lb_recv_down(1):ub_recv_down(1), &
    1156             :                                 lb_recv_down(2):ub_recv_down(2), lb:ub) = &
    1157             :                               rs%r(lb_recv_down(1):ub_recv_down(1), &
    1158             :                                    lb_recv_down(2):ub_recv_down(2), lb:ub) + &
    1159             :                               recv_buf_3d_down(:, :, lb:ub)
    1160             :                         END IF
    1161             : !$OMP END PARALLEL
    1162             :                      END IF
    1163        1840 :                      DEALLOCATE (recv_buf_3d_down)
    1164             :                   ELSE
    1165             : 
    1166             :                      ! only some procs may need later shifts
    1167        1840 :                      IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
    1168             :                         ! Sum the data in the RS Grid
    1169             : !$OMP PARALLEL DEFAULT(NONE), &
    1170             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1171        1840 : !$OMP          SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
    1172             : !$                      num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
    1173             : !$                      my_id = omp_get_thread_num()
    1174             :                         IF (my_id < num_threads) THEN
    1175             :                            lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
    1176             :                            ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
    1177             : 
    1178             :                            rs%r(lb_recv_up(1):ub_recv_up(1), &
    1179             :                                 lb_recv_up(2):ub_recv_up(2), lb:ub) = &
    1180             :                               rs%r(lb_recv_up(1):ub_recv_up(1), &
    1181             :                                    lb_recv_up(2):ub_recv_up(2), lb:ub) + &
    1182             :                               recv_buf_3d_up(:, :, lb:ub)
    1183             :                         END IF
    1184             : !$OMP END PARALLEL
    1185             :                      END IF
    1186        1840 :                      DEALLOCATE (recv_buf_3d_up)
    1187             :                   END IF
    1188             : 
    1189             :                END DO
    1190             : 
    1191             :                ! make sure the sends have completed before we deallocate
    1192             : 
    1193        1840 :                CALL mp_waitall(req(3:4))
    1194             : 
    1195        1840 :                DEALLOCATE (send_buf_3d_down)
    1196        8448 :                DEALLOCATE (send_buf_3d_up)
    1197             :             END DO
    1198             : 
    1199        4768 :             DEALLOCATE (dshifts)
    1200        4768 :             DEALLOCATE (ushifts)
    1201             : 
    1202             :          END IF
    1203             : 
    1204        7360 :          halo_swapped(idir) = .TRUE.
    1205             : 
    1206             :       END DO
    1207             : 
    1208             :       ! This is the real redistribution
    1209        7360 :       ALLOCATE (bounds(0:pw%pw_grid%para%group_size - 1, 1:4))
    1210             : 
    1211             :       ! work out the pw grid points each proc holds
    1212        5520 :       DO i = 0, pw%pw_grid%para%group_size - 1
    1213       11040 :          bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
    1214       11040 :          bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
    1215       11040 :          bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
    1216       12880 :          bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
    1217             :       END DO
    1218             : 
    1219        7360 :       ALLOCATE (send_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
    1220        5520 :       ALLOCATE (send_sizes(0:pw%pw_grid%para%group_size - 1))
    1221        5520 :       ALLOCATE (send_disps(0:pw%pw_grid%para%group_size - 1))
    1222        7360 :       ALLOCATE (recv_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
    1223        5520 :       ALLOCATE (recv_sizes(0:pw%pw_grid%para%group_size - 1))
    1224        5520 :       ALLOCATE (recv_disps(0:pw%pw_grid%para%group_size - 1))
    1225        5520 :       send_tasks(:, 1) = 1
    1226        5520 :       send_tasks(:, 2) = 0
    1227        5520 :       send_tasks(:, 3) = 1
    1228        5520 :       send_tasks(:, 4) = 0
    1229        5520 :       send_tasks(:, 5) = 1
    1230        5520 :       send_tasks(:, 6) = 0
    1231        5520 :       send_sizes = 0
    1232        5520 :       recv_sizes = 0
    1233             : 
    1234        1840 :       my_rs_rank = rs%desc%my_pos
    1235        1840 :       my_pw_rank = pw%pw_grid%para%rs_mpo
    1236             : 
    1237             :       ! find the processors that should hold our data
    1238             :       ! should be part of the rs grid type
    1239             :       ! this is a loop over real ranks (i.e. the in-order cartesian ranks)
    1240             :       ! do the recv and send tasks in two separate loops which will
    1241             :       ! load balance better for OpenMP with large numbers of MPI tasks
    1242             : 
    1243             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1244             : !$OMP             PRIVATE(coords,idir,pos,lb_send,ub_send), &
    1245        1840 : !$OMP             SHARED(rs,bounds,my_rs_rank,recv_tasks,recv_sizes)
    1246             :       DO i = 0, rs%desc%group_size - 1
    1247             : 
    1248             :          coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
     1249             :          ! calculate the rs grid points on each processor
     1250             :          ! coords is the part of the grid that rank i actually holds
    1251             :          DO idir = 1, 3
    1252             :             pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
    1253             :             pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
    1254             :             lb_send(idir) = pos(1)
    1255             :             ub_send(idir) = pos(2)
    1256             :          END DO
    1257             : 
    1258             :          IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) CYCLE
    1259             :          IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) CYCLE
    1260             :          IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) CYCLE
    1261             :          IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) CYCLE
    1262             : 
    1263             :          recv_tasks(i, 1) = MAX(lb_send(1), bounds(my_rs_rank, 1))
    1264             :          recv_tasks(i, 2) = MIN(ub_send(1), bounds(my_rs_rank, 2))
    1265             :          recv_tasks(i, 3) = MAX(lb_send(2), bounds(my_rs_rank, 3))
    1266             :          recv_tasks(i, 4) = MIN(ub_send(2), bounds(my_rs_rank, 4))
    1267             :          recv_tasks(i, 5) = lb_send(3)
    1268             :          recv_tasks(i, 6) = ub_send(3)
    1269             :          recv_sizes(i) = (recv_tasks(i, 2) - recv_tasks(i, 1) + 1)* &
    1270             :                          (recv_tasks(i, 4) - recv_tasks(i, 3) + 1)*(recv_tasks(i, 6) - recv_tasks(i, 5) + 1)
    1271             : 
    1272             :       END DO
    1273             : !$OMP END PARALLEL DO
    1274             : 
    1275        7360 :       coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
    1276        7360 :       DO idir = 1, 3
    1277        5520 :          pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
    1278       16560 :          pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
    1279        5520 :          lb_send(idir) = pos(1)
    1280        7360 :          ub_send(idir) = pos(2)
    1281             :       END DO
    1282             : 
    1283        1840 :       lb_recv(:) = lb_send(:)
    1284        1840 :       ub_recv(:) = ub_send(:)
    1285             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1286        1840 : !$OMP             SHARED(pw,lb_send,ub_send,bounds,send_tasks,send_sizes)
    1287             :       DO j = 0, pw%pw_grid%para%group_size - 1
    1288             : 
    1289             :          IF (lb_send(1) .GT. bounds(j, 2)) CYCLE
    1290             :          IF (ub_send(1) .LT. bounds(j, 1)) CYCLE
    1291             :          IF (lb_send(2) .GT. bounds(j, 4)) CYCLE
    1292             :          IF (ub_send(2) .LT. bounds(j, 3)) CYCLE
    1293             : 
    1294             :          send_tasks(j, 1) = MAX(lb_send(1), bounds(j, 1))
    1295             :          send_tasks(j, 2) = MIN(ub_send(1), bounds(j, 2))
    1296             :          send_tasks(j, 3) = MAX(lb_send(2), bounds(j, 3))
    1297             :          send_tasks(j, 4) = MIN(ub_send(2), bounds(j, 4))
    1298             :          send_tasks(j, 5) = lb_send(3)
    1299             :          send_tasks(j, 6) = ub_send(3)
    1300             :          send_sizes(j) = (send_tasks(j, 2) - send_tasks(j, 1) + 1)* &
    1301             :                          (send_tasks(j, 4) - send_tasks(j, 3) + 1)*(send_tasks(j, 6) - send_tasks(j, 5) + 1)
    1302             : 
    1303             :       END DO
    1304             : !$OMP END PARALLEL DO
    1305             : 
    1306        1840 :       send_disps(0) = 0
    1307        1840 :       recv_disps(0) = 0
    1308        3680 :       DO i = 1, pw%pw_grid%para%group_size - 1
    1309        1840 :          send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
    1310        3680 :          recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
    1311             :       END DO
    1312             : 
    1313       12880 :       CPASSERT(SUM(send_sizes) == PRODUCT(ub_recv - lb_recv + 1))
    1314             : 
    1315        9200 :       ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
    1316       11040 :       ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
    1317             : 
    1318        5520 :       DO i = 0, rs%desc%group_size - 1
    1319        3680 :          IF (send_sizes(i) .NE. 0) THEN
    1320       10302 :             ALLOCATE (send_bufs(i)%array(send_sizes(i)))
    1321             :          ELSE
    1322         246 :             NULLIFY (send_bufs(i)%array)
    1323             :          END IF
    1324        5520 :          IF (recv_sizes(i) .NE. 0) THEN
    1325       10302 :             ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
    1326             :          ELSE
    1327         246 :             NULLIFY (recv_bufs(i)%array)
    1328             :          END IF
    1329             :       END DO
    1330             : 
    1331        9200 :       ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
    1332        5520 :       recv_reqs = mp_request_null
    1333             : 
    1334        5520 :       DO i = 0, rs%desc%group_size - 1
    1335        5520 :          IF (recv_sizes(i) .NE. 0) THEN
    1336        3434 :             CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
    1337             :          END IF
    1338             :       END DO
    1339             : 
    1340             :       ! do packing
    1341             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1342             : !$OMP             PRIVATE(k,z,y,x), &
    1343        1840 : !$OMP             SHARED(rs,send_tasks,send_bufs,send_disps)
    1344             :       DO i = 0, rs%desc%group_size - 1
    1345             :          k = 0
    1346             :          DO z = send_tasks(i, 5), send_tasks(i, 6)
    1347             :             DO y = send_tasks(i, 3), send_tasks(i, 4)
    1348             :                DO x = send_tasks(i, 1), send_tasks(i, 2)
    1349             :                   k = k + 1
    1350             :                   send_bufs(i)%array(k) = rs%r(x, y, z)
    1351             :                END DO
    1352             :             END DO
    1353             :          END DO
    1354             :       END DO
    1355             : !$OMP END PARALLEL DO
    1356             : 
    1357        9200 :       ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
    1358        5520 :       send_reqs = mp_request_null
    1359             : 
    1360        5520 :       DO i = 0, rs%desc%group_size - 1
    1361        5520 :          IF (send_sizes(i) .NE. 0) THEN
    1362        3434 :             CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
    1363             :          END IF
    1364             :       END DO
    1365             : 
    1366             :       ! do unpacking
    1367             :       ! no OMP here so we can unpack each message as it arrives
    1368        5520 :       DO i = 0, rs%desc%group_size - 1
    1369        3680 :          IF (recv_sizes(i) .EQ. 0) CYCLE
    1370             : 
    1371        3434 :          CALL mp_waitany(recv_reqs, completed)
    1372        3434 :          k = 0
    1373      143312 :          DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
    1374    10769386 :             DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
    1375   455339438 :                DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
    1376   444573732 :                   k = k + 1
    1377   455201400 :                   pw%array(x, y, z) = recv_bufs(completed - 1)%array(k)
    1378             :                END DO
    1379             :             END DO
    1380             :          END DO
    1381             :       END DO
    1382             : 
    1383        1840 :       CALL mp_waitall(send_reqs)
    1384             : 
    1385        1840 :       DEALLOCATE (recv_reqs)
    1386        1840 :       DEALLOCATE (send_reqs)
    1387             : 
    1388        5520 :       DO i = 0, rs%desc%group_size - 1
    1389        3680 :          IF (ASSOCIATED(send_bufs(i)%array)) THEN
    1390        3434 :             DEALLOCATE (send_bufs(i)%array)
    1391             :          END IF
    1392        5520 :          IF (ASSOCIATED(recv_bufs(i)%array)) THEN
    1393        3434 :             DEALLOCATE (recv_bufs(i)%array)
    1394             :          END IF
    1395             :       END DO
    1396             : 
    1397        1840 :       DEALLOCATE (send_bufs)
    1398        1840 :       DEALLOCATE (recv_bufs)
    1399        1840 :       DEALLOCATE (send_tasks)
    1400        1840 :       DEALLOCATE (send_sizes)
    1401        1840 :       DEALLOCATE (send_disps)
    1402        1840 :       DEALLOCATE (recv_tasks)
    1403        1840 :       DEALLOCATE (recv_sizes)
    1404        1840 :       DEALLOCATE (recv_disps)
    1405             : 
    1406             :       IF (debug_this_module) THEN
     1407             :          ! safety check, to be removed once we're absolutely sure the routine is correct
    1408             :          pw_sum = pw_integrate_function(pw)
    1409             :          IF (ABS(pw_sum - rs_sum)/MAX(1.0_dp, ABS(pw_sum), ABS(rs_sum)) > EPSILON(rs_sum)*1000) THEN
    1410             :             WRITE (error_string, '(A,6(1X,I4.4),3F25.16)') "rs_pw_transfer_distributed", &
    1411             :                rs%desc%npts, rs%desc%group_dim, pw_sum, rs_sum, ABS(pw_sum - rs_sum)
    1412             :             CALL cp_abort(__LOCATION__, &
    1413             :                           error_string//" Please report this bug ... quick workaround: use "// &
    1414             :                           "DISTRIBUTION_TYPE REPLICATED")
    1415             :          END IF
    1416             :       END IF
    1417             : 
    1418        1840 :    END SUBROUTINE transfer_rs2pw_distributed
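
! **************************************************************************************************
!> \brief minimal sketch (illustration only; the name and interface below are ad hoc and not
!>        referenced anywhere in the module) of the sparse alltoall used in
!>        transfer_rs2pw_distributed above: only the non-empty messages are posted, and
!>        mp_waitany is used so that each receive buffer can be unpacked as soon as it
!>        arrives (mp_waitany returns the 1-based index of the completed request)
!> \param group communicator of the realspace grid descriptor
!> \param sizes number of reals to exchange with every rank (0 means no message)
!> \param send_bufs packed send buffers, one per rank
!> \param recv_bufs preallocated receive buffers, one per rank
! **************************************************************************************************
   SUBROUTINE sparse_alltoall_sketch(group, sizes, send_bufs, recv_bufs)
      TYPE(mp_comm_type), INTENT(IN)                     :: group
      INTEGER, DIMENSION(0:), INTENT(IN)                 :: sizes
      TYPE(cp_1d_r_p_type), DIMENSION(0:), INTENT(INOUT) :: send_bufs, recv_bufs

      INTEGER                                            :: completed, i
      TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: recv_reqs, send_reqs

      ALLOCATE (recv_reqs(0:SIZE(sizes) - 1), send_reqs(0:SIZE(sizes) - 1))
      recv_reqs = mp_request_null
      send_reqs = mp_request_null

      ! post all receives first, then all sends, skipping the empty messages
      DO i = 0, SIZE(sizes) - 1
         IF (sizes(i) .NE. 0) CALL group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
      END DO
      DO i = 0, SIZE(sizes) - 1
         IF (sizes(i) .NE. 0) CALL group%isend(send_bufs(i)%array, i, send_reqs(i))
      END DO

      ! wait for the receives one by one; the real routine unpacks
      ! recv_bufs(completed - 1)%array into the target grid at this point
      DO i = 0, SIZE(sizes) - 1
         IF (sizes(i) .EQ. 0) CYCLE
         CALL mp_waitany(recv_reqs, completed)
      END DO

      ! the send buffers may only be released once all sends have completed
      CALL mp_waitall(send_reqs)

      DEALLOCATE (recv_reqs, send_reqs)
   END SUBROUTINE sparse_alltoall_sketch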
    1419             : 
    1420             : ! **************************************************************************************************
    1421             : !> \brief does the pw2rs transfer in the case where the rs grid is
    1422             : !>       distributed (3D domain decomposition)
    1423             : !> \param rs ...
    1424             : !> \param pw ...
    1425             : !> \par History
    1426             : !>      12.2007 created [Matt Watkins]
    1427             : !>      9.2008 reduced amount of halo data sent [Iain Bethune]
    1428             : !>      10.2008 added non-blocking communication [Iain Bethune]
    1429             : !>      4.2009 added support for rank-reordering on the grid [Iain Bethune]
    1430             : !>      12.2009 added OMP and sparse alltoall [Iain Bethune]
    1431             : !>              (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
    1432             : !> \note
     1433             : !>       the transfer is a two step procedure (performed here in reverse order). For example, for the rs2pw transfer:
    1434             : !>
    1435             : !>       1) Halo-exchange in 3D so that the local part of the rs_grid contains the full data
    1436             : !>       2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
    1437             : !>
     1438             : !>       the halo exchange is most expensive on a large number of CPUs. A peculiarity of this halo
     1439             : !>       exchange is that the border region is rather large (e.g. 20 points) and that it might overlap
     1440             : !>       with the central domain of several CPUs (i.e. next-nearest neighbours)
    1441             : ! **************************************************************************************************
    1442         868 :    SUBROUTINE transfer_pw2rs_distributed(rs, pw)
    1443             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
    1444             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw
    1445             : 
    1446             :       INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
    1447             :          n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
    1448         868 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: dshifts, recv_disps, recv_sizes, &
    1449         868 :                                                             send_disps, send_sizes, ushifts
    1450        1736 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: bounds, recv_tasks, send_tasks
    1451             :       INTEGER, DIMENSION(2)                              :: neighbours, pos
    1452             :       INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
    1453             :          lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
    1454             :       LOGICAL, DIMENSION(3)                              :: halo_swapped
    1455         868 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: recv_buf_3d_down, recv_buf_3d_up, &
    1456         868 :                                                             send_buf_3d_down, send_buf_3d_up
    1457        1736 :       TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:)    :: recv_bufs, send_bufs
    1458         868 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: recv_reqs, send_reqs
    1459        4340 :       TYPE(mp_request_type), DIMENSION(4)                :: req
    1460             : 
    1461         868 :       num_threads = 1
    1462         868 :       my_id = 0
    1463             : 
    1464         868 :       CALL rs_grid_zero(rs)
    1465             : 
    1466             :       ! This is the real redistribution
    1467             : 
    1468        3472 :       ALLOCATE (bounds(0:pw%pw_grid%para%group_size - 1, 1:4))
    1469             : 
    1470        2604 :       DO i = 0, pw%pw_grid%para%group_size - 1
    1471        5208 :          bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
    1472        5208 :          bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
    1473        5208 :          bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
    1474        6076 :          bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
    1475             :       END DO
    1476             : 
    1477        3472 :       ALLOCATE (send_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
    1478        2604 :       ALLOCATE (send_sizes(0:pw%pw_grid%para%group_size - 1))
    1479        2604 :       ALLOCATE (send_disps(0:pw%pw_grid%para%group_size - 1))
    1480        3472 :       ALLOCATE (recv_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
    1481        2604 :       ALLOCATE (recv_sizes(0:pw%pw_grid%para%group_size - 1))
    1482        2604 :       ALLOCATE (recv_disps(0:pw%pw_grid%para%group_size - 1))
    1483             : 
    1484       16492 :       send_tasks = 0
    1485        2604 :       send_tasks(:, 1) = 1
    1486        2604 :       send_tasks(:, 2) = 0
    1487        2604 :       send_tasks(:, 3) = 1
    1488        2604 :       send_tasks(:, 4) = 0
    1489        2604 :       send_tasks(:, 5) = 1
    1490        2604 :       send_tasks(:, 6) = 0
    1491        2604 :       send_sizes = 0
    1492             : 
    1493       16492 :       recv_tasks = 0
    1494        2604 :       recv_tasks(:, 1) = 1
    1495        2604 :       recv_tasks(:, 2) = 0
     1496        2604 :       recv_tasks(:, 3) = 1
     1497        2604 :       recv_tasks(:, 4) = 0
     1498        2604 :       recv_tasks(:, 5) = 1
     1499        2604 :       recv_tasks(:, 6) = 0
    1500        2604 :       recv_sizes = 0
    1501             : 
    1502         868 :       my_rs_rank = rs%desc%my_pos
    1503         868 :       my_pw_rank = pw%pw_grid%para%rs_mpo
    1504             : 
    1505             :       ! find the processors that should hold our data
    1506             :       ! should be part of the rs grid type
    1507             :       ! this is a loop over real ranks (i.e. the in-order cartesian ranks)
    1508             :       ! do the recv and send tasks in two separate loops which will
    1509             :       ! load balance better for OpenMP with large numbers of MPI tasks
    1510             : 
    1511             :       ! this is the reverse of rs2pw: what were the sends are now the recvs
    1512             : 
    1513             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1514             : !$OMP             PRIVATE(coords,idir,pos,lb_send,ub_send), &
    1515         868 : !$OMP             SHARED(rs,bounds,my_rs_rank,send_tasks,send_sizes,pw)
    1516             :       DO i = 0, pw%pw_grid%para%group_size - 1
    1517             : 
    1518             :          coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
     1519             :          ! calculate the real rs grid points on each processor
     1520             :          ! coords is the part of the grid that rank i actually holds
    1521             :          DO idir = 1, 3
    1522             :             pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
    1523             :             pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
    1524             :             lb_send(idir) = pos(1)
    1525             :             ub_send(idir) = pos(2)
    1526             :          END DO
    1527             : 
    1528             :          IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) CYCLE
    1529             :          IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) CYCLE
    1530             :          IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) CYCLE
    1531             :          IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) CYCLE
    1532             : 
    1533             :          send_tasks(i, 1) = MAX(lb_send(1), bounds(my_rs_rank, 1))
    1534             :          send_tasks(i, 2) = MIN(ub_send(1), bounds(my_rs_rank, 2))
    1535             :          send_tasks(i, 3) = MAX(lb_send(2), bounds(my_rs_rank, 3))
    1536             :          send_tasks(i, 4) = MIN(ub_send(2), bounds(my_rs_rank, 4))
    1537             :          send_tasks(i, 5) = lb_send(3)
    1538             :          send_tasks(i, 6) = ub_send(3)
    1539             :          send_sizes(i) = (send_tasks(i, 2) - send_tasks(i, 1) + 1)* &
    1540             :                          (send_tasks(i, 4) - send_tasks(i, 3) + 1)*(send_tasks(i, 6) - send_tasks(i, 5) + 1)
    1541             : 
    1542             :       END DO
    1543             : !$OMP END PARALLEL DO
    1544             : 
    1545        3472 :       coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
    1546        3472 :       DO idir = 1, 3
    1547        2604 :          pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
    1548        7812 :          pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
    1549        2604 :          lb_send(idir) = pos(1)
    1550        3472 :          ub_send(idir) = pos(2)
    1551             :       END DO
    1552             : 
    1553         868 :       lb_recv(:) = lb_send(:)
    1554         868 :       ub_recv(:) = ub_send(:)
    1555             : 
    1556             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1557         868 : !$OMP             SHARED(pw,lb_send,ub_send,bounds,recv_tasks,recv_sizes)
    1558             :       DO j = 0, pw%pw_grid%para%group_size - 1
    1559             : 
    1560             :          IF (ub_send(1) .LT. bounds(j, 1)) CYCLE
    1561             :          IF (lb_send(1) .GT. bounds(j, 2)) CYCLE
    1562             :          IF (ub_send(2) .LT. bounds(j, 3)) CYCLE
    1563             :          IF (lb_send(2) .GT. bounds(j, 4)) CYCLE
    1564             : 
    1565             :          recv_tasks(j, 1) = MAX(lb_send(1), bounds(j, 1))
    1566             :          recv_tasks(j, 2) = MIN(ub_send(1), bounds(j, 2))
    1567             :          recv_tasks(j, 3) = MAX(lb_send(2), bounds(j, 3))
    1568             :          recv_tasks(j, 4) = MIN(ub_send(2), bounds(j, 4))
    1569             :          recv_tasks(j, 5) = lb_send(3)
    1570             :          recv_tasks(j, 6) = ub_send(3)
    1571             :          recv_sizes(j) = (recv_tasks(j, 2) - recv_tasks(j, 1) + 1)* &
    1572             :                          (recv_tasks(j, 4) - recv_tasks(j, 3) + 1)*(recv_tasks(j, 6) - recv_tasks(j, 5) + 1)
    1573             : 
    1574             :       END DO
    1575             : !$OMP END PARALLEL DO
    1576             : 
    1577         868 :       send_disps(0) = 0
    1578         868 :       recv_disps(0) = 0
    1579        1736 :       DO i = 1, pw%pw_grid%para%group_size - 1
    1580         868 :          send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
    1581        1736 :          recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
    1582             :       END DO
    1583             : 
    1584        6076 :       CPASSERT(SUM(recv_sizes) == PRODUCT(ub_recv - lb_recv + 1))
    1585             : 
    1586        4340 :       ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
    1587        5208 :       ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
    1588             : 
    1589        2604 :       DO i = 0, rs%desc%group_size - 1
    1590        1736 :          IF (send_sizes(i) .NE. 0) THEN
    1591        4962 :             ALLOCATE (send_bufs(i)%array(send_sizes(i)))
    1592             :          ELSE
    1593          82 :             NULLIFY (send_bufs(i)%array)
    1594             :          END IF
    1595        2604 :          IF (recv_sizes(i) .NE. 0) THEN
    1596        4962 :             ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
    1597             :          ELSE
    1598          82 :             NULLIFY (recv_bufs(i)%array)
    1599             :          END IF
    1600             :       END DO
    1601             : 
    1602        4340 :       ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
    1603        2604 :       recv_reqs = mp_request_null
    1604             : 
    1605        2604 :       DO i = 0, rs%desc%group_size - 1
    1606        2604 :          IF (recv_sizes(i) .NE. 0) THEN
    1607        1654 :             CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
    1608             :          END IF
    1609             :       END DO
    1610             : 
    1611             :       ! do packing
    1612             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1613             : !$OMP             PRIVATE(k,z,y,x), &
    1614         868 : !$OMP             SHARED(pw,rs,send_tasks,send_bufs,send_disps)
    1615             :       DO i = 0, rs%desc%group_size - 1
    1616             :          k = 0
    1617             :          DO z = send_tasks(i, 5), send_tasks(i, 6)
    1618             :             DO y = send_tasks(i, 3), send_tasks(i, 4)
    1619             :                DO x = send_tasks(i, 1), send_tasks(i, 2)
    1620             :                   k = k + 1
    1621             :                   send_bufs(i)%array(k) = pw%array(x, y, z)
    1622             :                END DO
    1623             :             END DO
    1624             :          END DO
    1625             :       END DO
    1626             : !$OMP END PARALLEL DO
    1627             : 
    1628        4340 :       ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
    1629        2604 :       send_reqs = mp_request_null
    1630             : 
    1631        2604 :       DO i = 0, rs%desc%group_size - 1
    1632        2604 :          IF (send_sizes(i) .NE. 0) THEN
    1633        1654 :             CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
    1634             :          END IF
    1635             :       END DO
    1636             : 
    1637             :       ! do unpacking
    1638             :       ! no OMP here so we can unpack each message as it arrives
    1639             : 
    1640        2604 :       DO i = 0, rs%desc%group_size - 1
    1641        1736 :          IF (recv_sizes(i) .EQ. 0) CYCLE
    1642             : 
    1643        1654 :          CALL mp_waitany(recv_reqs, completed)
    1644        1654 :          k = 0
    1645       66400 :          DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
    1646     4768332 :             DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
    1647   195756703 :                DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
    1648   190990107 :                   k = k + 1
    1649   195692825 :                   rs%r(x, y, z) = recv_bufs(completed - 1)%array(k)
    1650             :                END DO
    1651             :             END DO
    1652             :          END DO
    1653             :       END DO
    1654             : 
    1655         868 :       CALL mp_waitall(send_reqs)
    1656             : 
    1657         868 :       DEALLOCATE (recv_reqs)
    1658         868 :       DEALLOCATE (send_reqs)
    1659             : 
    1660        2604 :       DO i = 0, rs%desc%group_size - 1
    1661        1736 :          IF (ASSOCIATED(send_bufs(i)%array)) THEN
    1662        1654 :             DEALLOCATE (send_bufs(i)%array)
    1663             :          END IF
    1664        2604 :          IF (ASSOCIATED(recv_bufs(i)%array)) THEN
    1665        1654 :             DEALLOCATE (recv_bufs(i)%array)
    1666             :          END IF
    1667             :       END DO
    1668             : 
    1669         868 :       DEALLOCATE (send_bufs)
    1670         868 :       DEALLOCATE (recv_bufs)
    1671         868 :       DEALLOCATE (send_tasks)
    1672         868 :       DEALLOCATE (send_sizes)
    1673         868 :       DEALLOCATE (send_disps)
    1674         868 :       DEALLOCATE (recv_tasks)
    1675         868 :       DEALLOCATE (recv_sizes)
    1676         868 :       DEALLOCATE (recv_disps)
    1677             : 
     1678             :       ! now pass the halo regions ("wings") around
    1679         868 :       halo_swapped = .FALSE.
    1680             : 
    1681        3472 :       DO idir = 1, 3
    1682             : 
    1683        2604 :          IF (rs%desc%perd(idir) /= 1) THEN
    1684             : 
    1685        5532 :             ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
    1686        5532 :             ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
    1687        5532 :             ushifts = 0
    1688        5532 :             dshifts = 0
    1689             : 
    1690        3688 :             DO n_shifts = 1, rs%desc%neighbours(idir)
    1691             : 
    1692             :                ! need to take into account the possible varying widths of neighbouring cells
    1693             :                ! ushifts and dshifts hold the real size of the neighbouring cells
    1694             : 
    1695        1844 :                position = MODULO(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
    1696        1844 :                neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
    1697        1844 :                dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
    1698             : 
    1699        1844 :                position = MODULO(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
    1700        1844 :                neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
    1701        1844 :                ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
    1702             : 
     1703             :                ! The border data has to be sent to / received from the neighbours
    1704             :                ! First we calculate the source and destination processes for the shift
    1705             :                ! The first shift is "downwards"
    1706             : 
    1707        1844 :                CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
    1708             : 
    1709        7376 :                lb_send_down(:) = rs%lb_local(:)
    1710        7376 :                ub_send_down(:) = rs%ub_local(:)
    1711        7376 :                lb_recv_down(:) = rs%lb_local(:)
    1712        7376 :                ub_recv_down(:) = rs%ub_local(:)
    1713             : 
    1714        1844 :                IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
    1715        1844 :                   lb_send_down(idir) = lb_send_down(idir) + rs%desc%border
    1716             :                   ub_send_down(idir) = MIN(ub_send_down(idir) - rs%desc%border, &
    1717        1844 :                                            lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
    1718             : 
    1719        1844 :                   lb_recv_down(idir) = ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
    1720             :                   ub_recv_down(idir) = MIN(ub_recv_down(idir), &
    1721        1844 :                                            ub_recv_down(idir) - rs%desc%border + ushifts(n_shifts))
    1722             :                ELSE
    1723           0 :                   lb_send_down(idir) = 0
    1724           0 :                   ub_send_down(idir) = -1
    1725           0 :                   lb_recv_down(idir) = 0
    1726           0 :                   ub_recv_down(idir) = -1
    1727             :                END IF
    1728             : 
    1729        7376 :                DO i = 1, 3
    1730        7376 :                   IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
    1731        1548 :                      lb_send_down(i) = rs%lb_real(i)
    1732        1548 :                      ub_send_down(i) = rs%ub_real(i)
    1733        1548 :                      lb_recv_down(i) = rs%lb_real(i)
    1734        1548 :                      ub_recv_down(i) = rs%ub_real(i)
    1735             :                   END IF
    1736             :                END DO
    1737             : 
    1738             :                ! allocate the recv buffer
    1739        7376 :                nn = PRODUCT(ub_recv_down - lb_recv_down + 1)
    1740           0 :                ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
    1741        9220 :                                           lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
    1742             : 
    1743             :                ! recv buffer is now ready, so post the receive
    1744        1844 :                CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
    1745             : 
    1746             :                ! now allocate, pack and send the send buffer
    1747        7376 :                nn = PRODUCT(ub_send_down - lb_send_down + 1)
    1748           0 :                ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
    1749        9220 :                                           lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
    1750             : 
    1751             : !$OMP PARALLEL DEFAULT(NONE), &
    1752             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1753        1844 : !$OMP          SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
    1754             : !$             num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
    1755             : !$             my_id = omp_get_thread_num()
    1756             :                IF (my_id < num_threads) THEN
    1757             :                   lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
    1758             :                   ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
    1759             : 
    1760             :                   send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
    1761             :                                    lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
    1762             :                                                  lb_send_down(2):ub_send_down(2), lb:ub)
    1763             :                END IF
    1764             : !$OMP END PARALLEL
    1765             : 
    1766        1844 :                CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
    1767             : 
    1768             :                ! Now for the other direction
    1769             : 
    1770        1844 :                CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
    1771             : 
    1772        7376 :                lb_send_up(:) = rs%lb_local(:)
    1773        7376 :                ub_send_up(:) = rs%ub_local(:)
    1774        7376 :                lb_recv_up(:) = rs%lb_local(:)
    1775        7376 :                ub_recv_up(:) = rs%ub_local(:)
    1776             : 
    1777        1844 :                IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
    1778        1844 :                   ub_send_up(idir) = ub_send_up(idir) - rs%desc%border
    1779             :                   lb_send_up(idir) = MAX(lb_send_up(idir) + rs%desc%border, &
    1780        1844 :                                          ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
    1781             : 
    1782        1844 :                   ub_recv_up(idir) = lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
    1783             :                   lb_recv_up(idir) = MAX(lb_recv_up(idir), &
    1784        1844 :                                          lb_recv_up(idir) + rs%desc%border - dshifts(n_shifts))
    1785             :                ELSE
    1786           0 :                   lb_send_up(idir) = 0
    1787           0 :                   ub_send_up(idir) = -1
    1788           0 :                   lb_recv_up(idir) = 0
    1789           0 :                   ub_recv_up(idir) = -1
    1790             :                END IF
    1791             : 
    1792        7376 :                DO i = 1, 3
    1793        7376 :                   IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
    1794        1548 :                      lb_send_up(i) = rs%lb_real(i)
    1795        1548 :                      ub_send_up(i) = rs%ub_real(i)
    1796        1548 :                      lb_recv_up(i) = rs%lb_real(i)
    1797        1548 :                      ub_recv_up(i) = rs%ub_real(i)
    1798             :                   END IF
    1799             :                END DO
    1800             : 
    1801             :                ! allocate the recv buffer
    1802        7376 :                nn = PRODUCT(ub_recv_up - lb_recv_up + 1)
    1803           0 :                ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
    1804        9220 :                                         lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
    1805             : 
    1806             :                ! recv buffer is now ready, so post the receive
    1807             : 
    1808        1844 :                CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
    1809             : 
    1810             :                ! now allocate, pack and send the send buffer
    1811        7376 :                nn = PRODUCT(ub_send_up - lb_send_up + 1)
    1812           0 :                ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
    1813        9220 :                                         lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
    1814             : 
    1815             : !$OMP PARALLEL DEFAULT(NONE), &
    1816             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1817        1844 : !$OMP          SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
    1818             : !$             num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
    1819             : !$             my_id = omp_get_thread_num()
    1820             :                IF (my_id < num_threads) THEN
    1821             :                   lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
    1822             :                   ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
    1823             : 
    1824             :                   send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
    1825             :                                  lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
    1826             :                                                lb_send_up(2):ub_send_up(2), lb:ub)
    1827             :                END IF
    1828             : !$OMP END PARALLEL
    1829             : 
    1830        1844 :                CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
    1831             : 
    1832             :                ! wait for a recv to complete, then we can unpack
    1833             : 
    1834        5532 :                DO i = 1, 2
    1835             : 
    1836        3688 :                   CALL mp_waitany(req(1:2), completed)
    1837             : 
    1838        5532 :                   IF (completed .EQ. 1) THEN
    1839             : 
    1840             :                      ! only some procs may need later shifts
    1841        1844 :                      IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
    1842             : 
    1843             :                         ! Copy the received data into the rs grid
    1844             : !$OMP PARALLEL DEFAULT(NONE), &
    1845             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1846        1844 : !$OMP          SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
    1847             : !$                      num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
    1848             : !$                      my_id = omp_get_thread_num()
    1849             :                         IF (my_id < num_threads) THEN
    1850             :                            lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
    1851             :                            ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
    1852             : 
    1853             :                            rs%r(lb_recv_down(1):ub_recv_down(1), lb_recv_down(2):ub_recv_down(2), &
    1854             :                                 lb:ub) = recv_buf_3d_down(:, :, lb:ub)
    1855             :                         END IF
    1856             : !$OMP END PARALLEL
    1857             :                      END IF
    1858             : 
    1859        1844 :                      DEALLOCATE (recv_buf_3d_down)
    1860             :                   ELSE
    1861             : 
    1862             :                      ! only some procs may need later shifts
    1863        1844 :                      IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
    1864             : 
    1865             :                         ! Copy the received data into the rs grid
    1866             : !$OMP PARALLEL DEFAULT(NONE), &
    1867             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1868        1844 : !$OMP          SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
    1869             : !$                      num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
    1870             : !$                      my_id = omp_get_thread_num()
    1871             :                         IF (my_id < num_threads) THEN
    1872             :                            lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
    1873             :                            ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
    1874             : 
    1875             :                            rs%r(lb_recv_up(1):ub_recv_up(1), lb_recv_up(2):ub_recv_up(2), &
    1876             :                                 lb:ub) = recv_buf_3d_up(:, :, lb:ub)
    1877             :                         END IF
    1878             : !$OMP END PARALLEL
    1879             :                      END IF
    1880             : 
    1881        1844 :                      DEALLOCATE (recv_buf_3d_up)
    1882             :                   END IF
    1883             :                END DO
    1884             : 
    1885        1844 :                CALL mp_waitall(req(3:4))
    1886             : 
    1887        1844 :                DEALLOCATE (send_buf_3d_down)
    1888        5532 :                DEALLOCATE (send_buf_3d_up)
    1889             :             END DO
    1890             : 
    1891        1844 :             DEALLOCATE (ushifts)
    1892        1844 :             DEALLOCATE (dshifts)
    1893             :          END IF
    1894             : 
    1895        3472 :          halo_swapped(idir) = .TRUE.
    1896             : 
    1897             :       END DO
    1898             : 
    1899         868 :    END SUBROUTINE transfer_pw2rs_distributed
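
! **************************************************************************************************
!> \note
!>      Editor's worked example (values assumed for illustration only, not part of the original
!>      module) of the dshifts/ushifts bookkeeping in transfer_pw2rs_distributed above.
!>      Suppose npts(idir) = 10 grid points are split over group_dim(idir) = 3 processes and
!>      get_limit assigns 4, 3 and 3 points to virtual coordinates 0, 1 and 2, with a halo
!>      border of width 2. For the process at virtual coordinate 1 and n_shifts = 1:
!>         downward neighbour at MODULO(1 - 1, 3) = 0 owns 4 points  ->  dshifts(1) = 0 + 4 = 4
!>         upward   neighbour at MODULO(1 + 1, 3) = 2 owns 3 points  ->  ushifts(1) = 0 + 3 = 3
!>      Because dshifts(0) = 0 <= border, the first downward exchange carries data; because
!>      dshifts(1) = 4 already exceeds the border of 2, a second downward shift (if the descriptor
!>      requested one) would use the empty bounds lb = 0, ub = -1 and exchange nothing.
! **************************************************************************************************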
    1900             : 
    1901             : ! **************************************************************************************************
    1902             : !> \brief Initialize grid to zero
    1903             : !> \param rs ...
    1904             : !> \par History
    1905             : !>      none
    1906             : !> \author JGH (23-Mar-2002)
    1907             : ! **************************************************************************************************
    1908      326052 :    SUBROUTINE rs_grid_zero(rs)
    1909             : 
    1910             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
    1911             : 
    1912             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'rs_grid_zero'
    1913             : 
    1914             :       INTEGER                                            :: handle, i, j, k, l(3), u(3)
    1915             : 
    1916      326052 :       CALL timeset(routineN, handle)
    1917      978156 :       l(1) = LBOUND(rs%r, 1); l(2) = LBOUND(rs%r, 2); l(3) = LBOUND(rs%r, 3)
    1918      978156 :       u(1) = UBOUND(rs%r, 1); u(2) = UBOUND(rs%r, 2); u(3) = UBOUND(rs%r, 3)
    1919             : !$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
    1920             : !$OMP             PRIVATE(i,j,k) &
    1921      326052 : !$OMP             SHARED(rs,l,u)
    1922             :       DO k = l(3), u(3)
    1923             :       DO j = l(2), u(2)
    1924             :       DO i = l(1), u(1)
    1925             :          rs%r(i, j, k) = 0.0_dp
    1926             :       END DO
    1927             :       END DO
    1928             :       END DO
    1929             : !$OMP END PARALLEL DO
    1930      326052 :       CALL timestop(handle)
    1931             : 
    1932      326052 :    END SUBROUTINE rs_grid_zero
    1933             : 
    1934             : ! **************************************************************************************************
    1935             : !> \brief rs1(i) = rs1(i) + scalar*rs2(i)*rs3(i)
    1936             : !> \param rs1 ...
    1937             : !> \param rs2 ...
    1938             : !> \param rs3 ...
    1939             : !> \param scalar ...
    1940             : !> \par History
    1941             : !>      none
    1942             : !> \author
    1943             : ! **************************************************************************************************
    1944        1404 :    SUBROUTINE rs_grid_mult_and_add(rs1, rs2, rs3, scalar)
    1945             : 
    1946             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs1, rs2, rs3
    1947             :       REAL(dp), INTENT(IN)                               :: scalar
    1948             : 
    1949             :       CHARACTER(len=*), PARAMETER :: routineN = 'rs_grid_mult_and_add'
    1950             : 
    1951             :       INTEGER                                            :: handle, i, j, k, l(3), u(3)
    1952             : 
    1953             : !-----------------------------------------------------------------------------!
    1954             : 
    1955        1404 :       CALL timeset(routineN, handle)
    1956        1404 :       IF (scalar .NE. 0.0_dp) THEN
    1957        4212 :          l(1) = LBOUND(rs1%r, 1); l(2) = LBOUND(rs1%r, 2); l(3) = LBOUND(rs1%r, 3)
    1958        4212 :          u(1) = UBOUND(rs1%r, 1); u(2) = UBOUND(rs1%r, 2); u(3) = UBOUND(rs1%r, 3)
    1959             : !$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
    1960             : !$OMP             PRIVATE(i,j,k) &
    1961        1404 : !$OMP             SHARED(rs1,rs2,rs3,scalar,l,u)
    1962             :          DO k = l(3), u(3)
    1963             :          DO j = l(2), u(2)
    1964             :          DO i = l(1), u(1)
    1965             :             rs1%r(i, j, k) = rs1%r(i, j, k) + scalar*rs2%r(i, j, k)*rs3%r(i, j, k)
    1966             :          END DO
    1967             :          END DO
    1968             :          END DO
    1969             : !$OMP END PARALLEL DO
    1970             :       END IF
    1971        1404 :       CALL timestop(handle)
    1972        1404 :    END SUBROUTINE rs_grid_mult_and_add
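
! **************************************************************************************************
!> \brief Editor's illustrative usage sketch (not part of the original module): zero a grid and
!>        accumulate scalar*rs2*rs3 into it using the two routines above. It assumes rs1, rs2 and
!>        rs3 were created from the same descriptor, so their local bounds coincide.
!> \param rs1 grid that is zeroed and then accumulated into
!> \param rs2 ...
!> \param rs3 ...
!> \param scalar ...
! **************************************************************************************************
   SUBROUTINE rs_grid_zero_and_accumulate_sketch(rs1, rs2, rs3, scalar)

      TYPE(realspace_grid_type), INTENT(IN)              :: rs1, rs2, rs3
      REAL(dp), INTENT(IN)                               :: scalar

      ! wipe the target grid, then add the pointwise product rs2*rs3 scaled by "scalar"
      CALL rs_grid_zero(rs1)
      CALL rs_grid_mult_and_add(rs1, rs2, rs3, scalar)

   END SUBROUTINE rs_grid_zero_and_accumulate_sketch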
    1973             : 
    1974             : ! **************************************************************************************************
    1975             : !> \brief Set box matrix info for real space grid
    1976             : !>      This is needed for variable cell simulations
    1977             : !> \param pw_grid ...
    1978             : !> \param rs ...
    1979             : !> \par History
    1980             : !>      none
    1981             : !> \author JGH (15-May-2007)
    1982             : ! **************************************************************************************************
    1983      166222 :    SUBROUTINE rs_grid_set_box(pw_grid, rs)
    1984             : 
    1985             :       TYPE(pw_grid_type), INTENT(IN), TARGET             :: pw_grid
    1986             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
    1987             : 
    1988      166222 :       CPASSERT(ASSOCIATED(rs%desc%pw, pw_grid))
    1989     2160886 :       rs%desc%dh = pw_grid%dh
    1990     2160886 :       rs%desc%dh_inv = pw_grid%dh_inv
    1991             : 
    1992      166222 :    END SUBROUTINE rs_grid_set_box
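
! Editor's usage note: in a variable-cell run, once the underlying pw_grid has been updated for the
! new cell, the matching realspace grid is refreshed with
!    CALL rs_grid_set_box(pw_grid, rs=rs)
! which copies dh and dh_inv; the CPASSERT in rs_grid_set_box requires that rs was created from this
! pw_grid (rs%desc%pw is associated with it).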
    1993             : 
    1994             : ! **************************************************************************************************
    1995             : !> \brief retains the given rs grid descriptor (see doc/ReferenceCounting.html)
    1996             : !> \param rs_desc the grid descriptor to retain
    1997             : !> \par History
    1998             : !>      04.2009 created [Iain Bethune]
    1999             : !>        (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
    2000             : ! **************************************************************************************************
    2001      225574 :    SUBROUTINE rs_grid_retain_descriptor(rs_desc)
    2002             :       TYPE(realspace_grid_desc_type), INTENT(INOUT)      :: rs_desc
    2003             : 
    2004      225574 :       CPASSERT(rs_desc%ref_count > 0)
    2005      225574 :       rs_desc%ref_count = rs_desc%ref_count + 1
    2006      225574 :    END SUBROUTINE rs_grid_retain_descriptor
    2007             : 
    2008             : ! **************************************************************************************************
    2009             : !> \brief releases the given rs grid (see doc/ReferenceCounting.html)
    2010             : !> \param rs_grid the rs grid to release
    2011             : !> \par History
    2012             : !>      03.2003 created [fawzi]
    2013             : !> \author fawzi
    2014             : ! **************************************************************************************************
    2015      225024 :    SUBROUTINE rs_grid_release(rs_grid)
    2016             :       TYPE(realspace_grid_type), INTENT(INOUT)           :: rs_grid
    2017             : 
    2018      225024 :       CALL rs_grid_release_descriptor(rs_grid%desc)
    2019             : 
    2020      225024 :       CALL offload_free_buffer(rs_grid%buffer)
    2021      225024 :       NULLIFY (rs_grid%r)
    2022             : 
    2023      225024 :       IF (ALLOCATED(rs_grid%px)) DEALLOCATE (rs_grid%px)
    2024      225024 :       IF (ALLOCATED(rs_grid%py)) DEALLOCATE (rs_grid%py)
    2025      225024 :       IF (ALLOCATED(rs_grid%pz)) DEALLOCATE (rs_grid%pz)
    2026      225024 :    END SUBROUTINE rs_grid_release
    2027             : 
    2028             : ! **************************************************************************************************
    2029             : !> \brief releases the given rs grid descriptor (see doc/ReferenceCounting.html)
    2030             : !> \param rs_desc the rs grid descriptor to release
    2031             : !> \par History
    2032             : !>      04.2009 created [Iain Bethune]
    2033             : !>        (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
    2034             : ! **************************************************************************************************
    2035      259489 :    SUBROUTINE rs_grid_release_descriptor(rs_desc)
    2036             :       TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
    2037             : 
    2038      259489 :       IF (ASSOCIATED(rs_desc)) THEN
    2039      255870 :          CPASSERT(rs_desc%ref_count > 0)
    2040      255870 :          rs_desc%ref_count = rs_desc%ref_count - 1
    2041      255870 :          IF (rs_desc%ref_count == 0) THEN
    2042             : 
    2043       30296 :             CALL pw_grid_release(rs_desc%pw)
    2044             : 
    2045       30296 :             IF (rs_desc%parallel) THEN
    2046             :                ! release the group communicator
    2047       26390 :                CALL rs_desc%group%free()
    2048             : 
    2049       26390 :                DEALLOCATE (rs_desc%virtual2real)
    2050       26390 :                DEALLOCATE (rs_desc%real2virtual)
    2051             :             END IF
    2052             : 
    2053       30296 :             IF (rs_desc%distributed) THEN
    2054         158 :                DEALLOCATE (rs_desc%rank2coord)
    2055         158 :                DEALLOCATE (rs_desc%coord2rank)
    2056         158 :                DEALLOCATE (rs_desc%lb_global)
    2057         158 :                DEALLOCATE (rs_desc%ub_global)
    2058         158 :                DEALLOCATE (rs_desc%x2coord)
    2059         158 :                DEALLOCATE (rs_desc%y2coord)
    2060         158 :                DEALLOCATE (rs_desc%z2coord)
    2061             :             END IF
    2062             : 
    2063       30296 :             DEALLOCATE (rs_desc)
    2064             :          END IF
    2065             :       END IF
    2066      259489 :       NULLIFY (rs_desc)
    2067      259489 :    END SUBROUTINE rs_grid_release_descriptor
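
! **************************************************************************************************
!> \brief Editor's illustrative sketch (not part of the original module) of the reference-counting
!>        pattern used by the retain/release routines above (see doc/ReferenceCounting.html):
!>        every extra reference taken with rs_grid_retain_descriptor must be matched by one
!>        rs_grid_release_descriptor; the descriptor is deallocated only when ref_count reaches 0.
!> \param desc_in an associated descriptor with ref_count > 0 (assumption)
!> \param desc_out second pointer that temporarily shares ownership of the descriptor
! **************************************************************************************************
   SUBROUTINE rs_grid_descriptor_share_sketch(desc_in, desc_out)

      TYPE(realspace_grid_desc_type), POINTER            :: desc_in, desc_out

      ! take an additional reference before storing a second pointer to the descriptor ...
      desc_out => desc_in
      CALL rs_grid_retain_descriptor(desc_out)

      ! ... and give it back when the second owner is done; this nullifies desc_out and only
      ! deallocates the descriptor if no other reference remains
      CALL rs_grid_release_descriptor(desc_out)

   END SUBROUTINE rs_grid_descriptor_share_sketch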
    2068             : 
    2069             : ! **************************************************************************************************
    2070             : !> \brief emulates the function of an MPI_cart_shift operation, but the shift is
    2071             : !>        done in virtual coordinates, and the corresponding real ranks are returned
    2072             : !> \param rs_grid ...
    2073             : !> \param dir ...
    2074             : !> \param disp ...
    2075             : !> \param source ...
    2076             : !> \param dest ...
    2077             : !> \par History
    2078             : !>      04.2009 created [Iain Bethune]
    2079             : !>        (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
    2080             : ! **************************************************************************************************
    2081        7368 :    PURE SUBROUTINE cart_shift(rs_grid, dir, disp, source, dest)
    2082             : 
    2083             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs_grid
    2084             :       INTEGER, INTENT(IN)                                :: dir, disp
    2085             :       INTEGER, INTENT(OUT)                               :: source, dest
    2086             : 
    2087             :       INTEGER, DIMENSION(3)                              :: shift_coords
    2088             : 
    2089       29472 :       shift_coords = rs_grid%desc%virtual_group_coor
    2090        7368 :       shift_coords(dir) = MODULO(shift_coords(dir) + disp, rs_grid%desc%group_dim(dir))
    2091        7368 :       dest = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
    2092       29472 :       shift_coords = rs_grid%desc%virtual_group_coor
    2093        7368 :       shift_coords(dir) = MODULO(shift_coords(dir) - disp, rs_grid%desc%group_dim(dir))
    2094        7368 :       source = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
    2095             : 
    2096        7368 :    END SUBROUTINE
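
! Editor's worked example (values assumed for illustration): with group_dim(dir) = 4 and the calling
! process at virtual coordinate 1 along dir, a shift of disp = 1 gives
!    dest   at virtual coordinate MODULO(1 + 1, 4) = 2
!    source at virtual coordinate MODULO(1 - 1, 4) = 0
! while disp = 3 wraps periodically to MODULO(1 + 3, 4) = 0 and MODULO(1 - 3, 4) = 2. The virtual
! coordinates are then translated to real MPI ranks through coord2rank and virtual2real.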
    2097             : 
    2098             : ! **************************************************************************************************
    2099             : !> \brief returns the maximum number of points in the local grid of any process
    2100             : !>        to account for the case where the grid may later be reordered
    2101             : !> \param desc ...
    2102             : !> \return ...
    2103             : !> \par History
    2104             : !>      10.2011 created [Iain Bethune]
    2105             : ! **************************************************************************************************
    2106           0 :    FUNCTION rs_grid_max_ngpts(desc) RESULT(max_ngpts)
    2107             :       TYPE(realspace_grid_desc_type), INTENT(IN)         :: desc
    2108             :       INTEGER                                            :: max_ngpts
    2109             : 
    2110             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'rs_grid_max_ngpts'
    2111             : 
    2112             :       INTEGER                                            :: handle, i
    2113             :       INTEGER, DIMENSION(3)                              :: lb, ub
    2114             : 
    2115           0 :       CALL timeset(routineN, handle)
    2116             : 
    2117           0 :       max_ngpts = 0
    2118           0 :       IF ((desc%pw%para%mode == PW_MODE_LOCAL) .OR. &
    2119             :           (ALL(desc%group_dim == 1))) THEN
    2120           0 :          CPASSERT(PRODUCT(INT(desc%npts, KIND=int_8)) < HUGE(1))
    2121           0 :          max_ngpts = PRODUCT(desc%npts)
    2122             :       ELSE
    2123           0 :          DO i = 0, desc%group_size - 1
    2124           0 :             lb = desc%lb_global(:, i)
    2125           0 :             ub = desc%ub_global(:, i)
    2126           0 :             lb = lb - desc%border*(1 - desc%perd)
    2127           0 :             ub = ub + desc%border*(1 - desc%perd)
    2128           0 :             CPASSERT(PRODUCT(INT(ub - lb + 1, KIND=int_8)) < HUGE(1))
    2129           0 :             max_ngpts = MAX(max_ngpts, PRODUCT(ub - lb + 1))
    2130             :          END DO
    2131             :       END IF
    2132             : 
    2133           0 :       CALL timestop(handle)
    2134             : 
    2135           0 :    END FUNCTION rs_grid_max_ngpts
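
! Editor's worked example (numbers assumed for illustration): for a distributed grid where one
! process owns lb_global(:,i) = (1,1,1) and ub_global(:,i) = (20,20,10), with border = 2 and
! perd = (1,1,0), only the non-periodic z direction is padded:
!    lb = (1,1,1) - 2*(0,0,1) = (1,1,-1)   and   ub = (20,20,10) + 2*(0,0,1) = (20,20,12)
! i.e. 20*20*14 = 5600 points for this process; max_ngpts is the maximum of this count over all
! processes.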
    2136             : 
    2137             : ! **************************************************************************************************
    2138             : !> \brief ...
    2139             : !> \param rs_grid ...
    2140             : !> \param h_inv ...
    2141             : !> \param ra ...
    2142             : !> \param offset ...
    2143             : !> \param group_size ...
    2144             : !> \param my_pos ...
    2145             : !> \return ...
    2146             : ! **************************************************************************************************
    2147     1500159 :    PURE LOGICAL FUNCTION map_gaussian_here(rs_grid, h_inv, ra, offset, group_size, my_pos) RESULT(res)
    2148             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs_grid
    2149             :       REAL(KIND=dp), DIMENSION(3, 3), INTENT(IN)         :: h_inv
    2150             :       REAL(KIND=dp), DIMENSION(3), INTENT(IN)            :: ra
    2151             :       INTEGER, INTENT(IN), OPTIONAL                      :: offset, group_size, my_pos
    2152             : 
    2153             :       INTEGER                                            :: dir, lb(3), location(3), tp(3), ub(3)
    2154             : 
    2155     1500159 :       res = .FALSE.
    2156             : 
    2157     6000628 :       IF (.NOT. ALL(rs_grid%desc%perd == 1)) THEN
    2158          32 :          DO dir = 1, 3
    2159             :             ! bounds of the local grid, with the halo 'wings' removed for non-periodic (distributed) directions
    2160          96 :             tp(dir) = FLOOR(DOT_PRODUCT(h_inv(dir, :), ra)*rs_grid%desc%npts(dir))
    2161          24 :             tp(dir) = MODULO(tp(dir), rs_grid%desc%npts(dir))
    2162          24 :             IF (rs_grid%desc%perd(dir) .NE. 1) THEN
    2163           8 :                lb(dir) = rs_grid%lb_local(dir) + rs_grid%desc%border
    2164           8 :                ub(dir) = rs_grid%ub_local(dir) - rs_grid%desc%border
    2165             :             ELSE
    2166          16 :                lb(dir) = rs_grid%lb_local(dir)
    2167          16 :                ub(dir) = rs_grid%ub_local(dir)
    2168             :             END IF
    2169             :             ! global grid index of the Gaussian centre; it is mapped only if it falls inside the local grid
    2170          32 :             location(dir) = tp(dir) + rs_grid%desc%lb(dir)
    2171             :          END DO
    2172          60 :          IF (ALL(lb(:) <= location(:)) .AND. ALL(location(:) <= ub(:))) THEN
    2173           4 :             res = .TRUE.
    2174             :          END IF
    2175             :       ELSE
    2176     1500151 :          IF (PRESENT(offset) .AND. PRESENT(group_size) .AND. PRESENT(my_pos)) THEN
    2177             :             ! not distributed, just a round-robin distribution over the full set of CPUs
    2178     1500151 :             IF (MODULO(offset, group_size) == my_pos) res = .TRUE.
    2179             :          END IF
    2180             :       END IF
    2181             : 
    2182     1500159 :    END FUNCTION map_gaussian_here
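
! Editor's illustrative note (numbers assumed): in the replicated branch above the mapping work is
! distributed round-robin over the ranks, e.g. with group_size = 4 and my_pos = 2 this rank maps the
! Gaussians with offset = 2, 6, 10, ...; in the distributed branch a Gaussian is mapped only if its
! centre falls inside this rank's local grid (with the halo excluded in non-periodic directions).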
    2183             : 
    2184           0 : END MODULE realspace_grid_types

Generated by: LCOV version 1.15