LCOV - code coverage report
Current view: top level - src/pw - realspace_grid_types.F (source / functions)
Test: CP2K Regtests (git:1a29073)
Date: 2024-04-17 06:30:47
Coverage: Lines: 742 / 791 (93.8 %)    Functions: 19 / 27 (70.4 %)

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \note
      10             : !>      Basic type for real space grid methods
      11             : !> \par History
      12             : !>      JGH (22-May-2002) : New routine rs_grid_zero
      13             : !>      JGH (12-Jun-2002) : Bug fix for mpi groups
      14             : !>      JGH (19-Jun-2003) : Added routine for task distribution
      15             : !>      JGH (23-Nov-2003) : Added routine for task loop separation
      16             : !> \author JGH (18-Mar-2001)
      17             : ! **************************************************************************************************
      18             : MODULE realspace_grid_types
      19             :    USE cp_array_utils,                  ONLY: cp_1d_r_p_type
      20             :    USE cp_log_handling,                 ONLY: cp_to_string
      21             :    USE kahan_sum,                       ONLY: accurate_sum
      22             :    USE kinds,                           ONLY: dp,&
      23             :                                               int_8
      24             :    USE machine,                         ONLY: m_memory
      25             :    USE mathlib,                         ONLY: det_3x3
      26             :    USE message_passing,                 ONLY: mp_comm_null,&
      27             :                                               mp_comm_type,&
      28             :                                               mp_request_null,&
      29             :                                               mp_request_type,&
      30             :                                               mp_waitall,&
      31             :                                               mp_waitany
      32             :    USE offload_api,                     ONLY: offload_buffer_type,&
      33             :                                               offload_create_buffer,&
      34             :                                               offload_free_buffer
      35             :    USE pw_grid_types,                   ONLY: PW_MODE_LOCAL,&
      36             :                                               pw_grid_type
      37             :    USE pw_grids,                        ONLY: pw_grid_release,&
      38             :                                               pw_grid_retain
      39             :    USE pw_methods,                      ONLY: pw_integrate_function
      40             :    USE pw_types,                        ONLY: pw_r3d_rs_type
      41             :    USE util,                            ONLY: get_limit
      42             : 
      43             : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
      44             : 
      45             : #include "../base/base_uses.f90"
      46             : 
      47             :    IMPLICIT NONE
      48             : 
      49             :    PRIVATE
      50             :    PUBLIC :: realspace_grid_type, &
      51             :              realspace_grid_desc_type, &
      52             :              realspace_grid_p_type, &
      53             :              realspace_grid_desc_p_type, &
      54             :              realspace_grid_input_type
      55             : 
      56             :    PUBLIC :: transfer_rs2pw, &
      57             :              transfer_pw2rs, &
      58             :              rs_grid_zero, &
      59             :              rs_grid_set_box, &
      60             :              rs_grid_create, &
      61             :              rs_grid_create_descriptor, &
      62             :              rs_grid_retain_descriptor, &
      63             :              rs_grid_release, &
      64             :              rs_grid_release_descriptor, &
      65             :              rs_grid_reorder_ranks, &
      66             :              rs_grid_print, &
      67             :              rs_grid_locate_rank, &
      68             :              rs_grid_max_ngpts, &
      69             :              rs_grid_mult_and_add, &
      70             :              map_gaussian_here
      71             : 
      72             :    INTEGER, PARAMETER, PUBLIC               :: rsgrid_distributed = 0, &
      73             :                                                rsgrid_replicated = 1, &
      74             :                                                rsgrid_automatic = 2
      75             : 
      76             :    LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .FALSE.
      77             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'realspace_grid_types'
      78             : 
      79             : ! **************************************************************************************************
      80             :    TYPE realspace_grid_input_type
      81             :       INTEGER       :: distribution_type = rsgrid_replicated
      82             :       INTEGER       :: distribution_layout(3) = -1
      83             :       REAL(KIND=dp) :: memory_factor = 0.0_dp
      84             :       LOGICAL       :: lock_distribution = .FALSE.
      85             :       INTEGER       :: nsmax = -1
      86             :       REAL(KIND=dp) :: halo_reduction_factor = 1.0_dp
      87             :    END TYPE realspace_grid_input_type
      88             : 
      89             : ! **************************************************************************************************
      90             :    TYPE realspace_grid_desc_type
      91             :       TYPE(pw_grid_type), POINTER   :: pw => NULL() ! the pw grid
      92             : 
      93             :       INTEGER :: ref_count = 0 ! reference count
      94             : 
      95             :       INTEGER(int_8) :: ngpts = 0_int_8 ! # grid points
      96             :       INTEGER, DIMENSION(3) :: npts = 0 ! # grid points per dimension
      97             :       INTEGER, DIMENSION(3) :: lb = 0 ! lower bounds
      98             :       INTEGER, DIMENSION(3) :: ub = 0 ! upper bounds
      99             : 
     100             :       INTEGER :: border = 0 ! border points
     101             : 
     102             :       INTEGER, DIMENSION(3) :: perd = -1 ! periodicity enforced
     103             :       REAL(KIND=dp), DIMENSION(3, 3) :: dh = 0.0_dp ! incremental grid matrix
     104             :       REAL(KIND=dp), DIMENSION(3, 3) :: dh_inv = 0.0_dp ! inverse incremental grid matrix
     105             :       LOGICAL :: orthorhombic = .TRUE. ! grid symmetry
     106             : 
     107             :       LOGICAL :: parallel = .TRUE. ! whether the corresponding pw grid is distributed
     108             :       LOGICAL :: distributed = .TRUE. ! whether the rs grid is distributed
      109             :       ! whether these MPI-related quantities are meaningful depends on how the grid has been laid out
     110             :       ! they are most useful for fully distributed grids, where they reflect the topology of the grid
     111             :       TYPE(mp_comm_type) :: group = mp_comm_null
     112             :       INTEGER :: my_pos = -1
     113             :       LOGICAL :: group_head = .FALSE.
     114             :       INTEGER :: group_size = 0
     115             :       INTEGER, DIMENSION(3) :: group_dim = -1
     116             :       INTEGER, DIMENSION(3) :: group_coor = -1
     117             :       INTEGER, DIMENSION(3) :: neighbours = -1
     118             :       ! only meaningful on distributed grids
     119             :       ! a list of bounds for each CPU
     120             :       INTEGER, DIMENSION(:, :), ALLOCATABLE :: lb_global
     121             :       INTEGER, DIMENSION(:, :), ALLOCATABLE :: ub_global
     122             :       ! a mapping from linear rank to 3d coord
     123             :       INTEGER, DIMENSION(:, :), ALLOCATABLE :: rank2coord
     124             :       INTEGER, DIMENSION(:, :, :), ALLOCATABLE :: coord2rank
      125             :       ! a mapping from index to rank (which makes it easy to determine on which rank a given grid point resides)
     126             :       INTEGER, DIMENSION(:), ALLOCATABLE :: x2coord
     127             :       INTEGER, DIMENSION(:), ALLOCATABLE :: y2coord
     128             :       INTEGER, DIMENSION(:), ALLOCATABLE :: z2coord
     129             : 
     130             :       INTEGER                :: my_virtual_pos = -1
     131             :       INTEGER, DIMENSION(3) :: virtual_group_coor = -1
     132             : 
     133             :       INTEGER, DIMENSION(:), ALLOCATABLE :: virtual2real, real2virtual
     134             : 
     135             :    END TYPE realspace_grid_desc_type
     136             : 
     137             :    TYPE realspace_grid_type
     138             : 
     139             :       TYPE(realspace_grid_desc_type), POINTER :: desc => NULL()
     140             : 
     141             :       INTEGER :: ngpts_local = -1 ! local dimensions
     142             :       INTEGER, DIMENSION(3) :: npts_local = -1
     143             :       INTEGER, DIMENSION(3) :: lb_local = -1
     144             :       INTEGER, DIMENSION(3) :: ub_local = -1
     145             :       INTEGER, DIMENSION(3) :: lb_real = -1 ! lower bounds of the real local data
     146             :       INTEGER, DIMENSION(3) :: ub_real = -1 ! upper bounds of the real local data
     147             : 
     148             :       INTEGER, DIMENSION(:), ALLOCATABLE         :: px, py, pz ! index translators
     149             :       TYPE(offload_buffer_type)                  :: buffer = offload_buffer_type() ! owner of the grid's memory
     150             :       REAL(KIND=dp), DIMENSION(:, :, :), CONTIGUOUS, POINTER :: r => NULL() ! the grid (pointer to buffer%host_buffer)
     151             : 
     152             :    END TYPE realspace_grid_type
     153             : 
     154             : ! **************************************************************************************************
     155             :    TYPE realspace_grid_p_type
     156             :       TYPE(realspace_grid_type), POINTER :: rs_grid => NULL()
     157             :    END TYPE realspace_grid_p_type
     158             : 
     159             :    TYPE realspace_grid_desc_p_type
     160             :       TYPE(realspace_grid_desc_type), POINTER :: rs_desc => NULL()
     161             :    END TYPE realspace_grid_desc_p_type
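
! **************************************************************************************************
! Illustration (not part of the covered source): a minimal usage sketch of the public API listed
! above. It assumes an already initialised pw_grid / pw pair; the example routine name and the
! collocation step are placeholders, while the called routines and argument shapes follow the
! interfaces declared in this module.
!
!    SUBROUTINE example_rs_grid_usage(pw_grid, pw)
!       USE pw_grid_types, ONLY: pw_grid_type
!       USE pw_types, ONLY: pw_r3d_rs_type
!       USE realspace_grid_types, ONLY: realspace_grid_desc_type, realspace_grid_type, &
!                                       realspace_grid_input_type, rs_grid_create_descriptor, &
!                                       rs_grid_create, rs_grid_zero, transfer_rs2pw, &
!                                       rs_grid_release, rs_grid_release_descriptor
!       TYPE(pw_grid_type), INTENT(INOUT), TARGET :: pw_grid
!       TYPE(pw_r3d_rs_type), INTENT(INOUT)       :: pw
!       TYPE(realspace_grid_desc_type), POINTER   :: desc
!       TYPE(realspace_grid_type)                 :: rs
!       TYPE(realspace_grid_input_type)           :: settings ! defaults give a replicated layout
!
!       NULLIFY (desc)
!       CALL rs_grid_create_descriptor(desc, pw_grid, settings) ! distribution is decided here
!       CALL rs_grid_create(rs, desc)                           ! allocates the local data rs%r
!       CALL rs_grid_zero(rs)
!       ! ... collocate data into rs%r here ...
!       CALL transfer_rs2pw(rs, pw)                             ! sum/copy onto the plane-wave grid
!       CALL rs_grid_release(rs)
!       CALL rs_grid_release_descriptor(desc)
!    END SUBROUTINE example_rs_grid_usage
! **************************************************************************************************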
     162             : 
     163             : CONTAINS
     164             : 
     165             : ! **************************************************************************************************
     166             : !> \brief returns the 1D rank of the task which is a cartesian shift away from 1D rank rank_in
     167             : !>        only possible if rs_grid is a distributed grid
     168             : !> \param rs_desc ...
     169             : !> \param rank_in ...
     170             : !> \param shift ...
     171             : !> \return ...
     172             : ! **************************************************************************************************
     173        2346 :    PURE FUNCTION rs_grid_locate_rank(rs_desc, rank_in, shift) RESULT(rank_out)
     174             :       TYPE(realspace_grid_desc_type), INTENT(IN)         :: rs_desc
     175             :       INTEGER, INTENT(IN)                                :: rank_in
     176             :       INTEGER, DIMENSION(3), INTENT(IN)                  :: shift
     177             :       INTEGER                                            :: rank_out
     178             : 
     179             :       INTEGER                                            :: coord(3)
     180             : 
     181        9384 :       coord = MODULO(rs_desc%rank2coord(:, rank_in) + shift, rs_desc%group_dim)
     182        2346 :       rank_out = rs_desc%coord2rank(coord(1), coord(2), coord(3))
     183        2346 :    END FUNCTION rs_grid_locate_rank
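! Worked example (illustrative values, assuming the row-major rank2coord mapping set up in
! rs_grid_create_descriptor below): with group_dim = (/2, 2, 1/), rank 3 has coordinates
! (1, 1, 0); a shift of (/1, 0, 0/) wraps around to (0, 1, 0), so
! rs_grid_locate_rank(desc, 3, (/1, 0, 0/)) returns 1.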
     184             : 
     185             : ! **************************************************************************************************
     186             : !> \brief Determine the setup of real space grids - this is divided up into the
     187             : !>        creation of a descriptor and the actual grid itself (see rs_grid_create)
     188             : !> \param desc ...
     189             : !> \param pw_grid ...
     190             : !> \param input_settings ...
     191             : !> \param border_points ...
     192             : !> \par History
     193             : !>      JGH (08-Jun-2003) : nsmax <= 0 indicates fully replicated grid
     194             : !>      Iain Bethune (05-Sep-2008) : modified cut heuristic
     195             : !>      (c) The Numerical Algorithms Group (NAG) Ltd, 2008 on behalf of the HECToR project
     196             : !>      - Create a descriptor for realspace grids with a number of border
     197             : !>        points as exactly given by the optional argument border_points.
     198             : !>        These grids are always distributed.
     199             : !>        (27.11.2013, Matthias Krack)
     200             : !> \author JGH (18-Mar-2001)
     201             : ! **************************************************************************************************
     202       30296 :    SUBROUTINE rs_grid_create_descriptor(desc, pw_grid, input_settings, border_points)
     203             :       TYPE(realspace_grid_desc_type), POINTER            :: desc
     204             :       TYPE(pw_grid_type), INTENT(INOUT), TARGET          :: pw_grid
     205             :       TYPE(realspace_grid_input_type), INTENT(IN)        :: input_settings
     206             :       INTEGER, INTENT(IN), OPTIONAL                      :: border_points
     207             : 
     208             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_grid_create_descriptor'
     209             : 
     210             :       INTEGER                                            :: border_size, dir, handle, i, j, k, l, &
     211             :                                                             lb(2), min_npts_real, n_slices(3), &
     212             :                                                             n_slices_tmp(3), nmin
     213             :       LOGICAL                                            :: overlap
     214             :       REAL(KIND=dp)                                      :: ratio, ratio_best, volume, volume_dist
     215             : 
     216       30296 :       CALL timeset(routineN, handle)
     217             : 
     218       30296 :       IF (PRESENT(border_points)) THEN
     219         128 :          border_size = border_points
     220             :       ELSE
     221             :          border_size = 0
     222             :       END IF
     223             : 
     224     1484504 :       ALLOCATE (desc)
     225             : 
     226       30296 :       CALL pw_grid%para%group%sync()
     227             : 
     228       30296 :       desc%pw => pw_grid
     229       30296 :       CALL pw_grid_retain(desc%pw)
     230             : 
     231      393848 :       desc%dh = pw_grid%dh
     232      393848 :       desc%dh_inv = pw_grid%dh_inv
     233       30296 :       desc%orthorhombic = pw_grid%orthorhombic
     234       30296 :       desc%ref_count = 1
     235             : 
     236       30296 :       IF (pw_grid%para%mode == PW_MODE_LOCAL) THEN
     237             :          ! The corresponding group has dimension 1
     238             :          ! All operations will be done locally
     239       15624 :          desc%npts = pw_grid%npts
     240       15624 :          desc%ngpts = PRODUCT(INT(desc%npts, KIND=int_8))
     241       15624 :          desc%lb = pw_grid%bounds(1, :)
     242       15624 :          desc%ub = pw_grid%bounds(2, :)
     243        3906 :          desc%border = border_size
     244        3906 :          IF (border_size == 0) THEN
     245       15624 :             desc%perd = 1
     246             :          ELSE
     247           0 :             desc%perd = 0
     248             :          END IF
     249        3906 :          desc%parallel = .FALSE.
     250        3906 :          desc%distributed = .FALSE.
     251        3906 :          desc%group = mp_comm_null
     252        3906 :          desc%group_size = 1
     253        3906 :          desc%group_head = .TRUE.
     254       15624 :          desc%group_dim = 1
     255       15624 :          desc%group_coor = 0
     256        3906 :          desc%my_pos = 0
     257             :       ELSE
     258             :          ! group size of desc grid
     259             :          ! global grid dimensions are still the same
     260       26390 :          desc%group_size = pw_grid%para%group_size
     261      105560 :          desc%npts = pw_grid%npts
     262      105560 :          desc%ngpts = PRODUCT(INT(desc%npts, KIND=int_8))
     263      105560 :          desc%lb = pw_grid%bounds(1, :)
     264      105560 :          desc%ub = pw_grid%bounds(2, :)
     265             : 
     266             :          ! this is the eventual border size
     267       26390 :          IF (border_size == 0) THEN
     268       26262 :             nmin = (input_settings%nsmax + 1)/2
     269       26262 :             nmin = MAX(0, NINT(nmin*input_settings%halo_reduction_factor))
     270             :          ELSE
     271             :             ! Set explicitly the requested border size
     272             :             nmin = border_size
     273             :          END IF
     274             : 
     275       26390 :          IF (input_settings%distribution_type == rsgrid_replicated) THEN
     276             : 
     277       43448 :             n_slices = 1
     278       10862 :             IF (border_size > 0) THEN
     279             :                CALL cp_abort(__LOCATION__, &
     280             :                              "An explicit border size > 0 is not yet working for "// &
     281             :                              "replicated realspace grids. Request DISTRIBUTION_TYPE "// &
     282           0 :                              "distributed for RS_GRID explicitly.")
     283             :             END IF
     284             : 
     285             :          ELSE
     286             : 
     287       62112 :             n_slices = 1
     288       15528 :             ratio_best = -HUGE(ratio_best)
     289             : 
     290             :             ! don't allow distributions with more processors than real grid points
     291       46584 :             DO k = 1, MIN(desc%npts(3), desc%group_size)
     292      108696 :             DO j = 1, MIN(desc%npts(2), desc%group_size)
     293       62112 :                i = MIN(desc%npts(1), desc%group_size/(j*k))
     294      248448 :                n_slices_tmp = (/i, j, k/)
     295             : 
      296             :                ! skip layouts that do not match the actual number of CPUs
     297      248448 :                IF (PRODUCT(n_slices_tmp) .NE. desc%group_size) CYCLE
     298             : 
      299             :                ! check whether an input constraint has been given,
      300             :                ! i.e. if a layout entry is not -1 we need to fulfil it
     301      372930 :                IF (.NOT. ALL(PACK(n_slices_tmp == input_settings%distribution_layout, &
     302             :                                   (/-1, -1, -1/) /= input_settings%distribution_layout) &
     303       46584 :                              )) CYCLE
     304             : 
      305             :                ! We cannot work with a grid that has more local than global grid points.
     306             :                ! This can happen when a halo region wraps around and overlaps with the other halo.
     307       46358 :                overlap = .FALSE.
     308      185432 :                DO dir = 1, 3
     309      185432 :                   IF (n_slices_tmp(dir) > 1) THEN
     310      139074 :                      DO l = 0, n_slices_tmp(dir) - 1
     311       92716 :                         lb = get_limit(desc%npts(dir), n_slices_tmp(dir), l)
     312      139074 :                         IF (lb(2) - lb(1) + 1 + 2*nmin > desc%npts(dir)) overlap = .TRUE.
     313             :                      END DO
     314             :                   END IF
     315             :                END DO
     316       46358 :                IF (overlap) CYCLE
     317             : 
      318             :                ! a heuristic optimisation to reduce the memory usage:
      319             :                ! we pick the layout with the smallest ratio of local (halo-padded) to real volume,
      320             :                ! i.e. volume of the box without the wings / volume of the box with the wings,
      321             :                ! with prefactors that penalise cuts along X most and slightly favour cuts along Z
     322             :                ratio = PRODUCT(REAL(desc%npts, KIND=dp)/n_slices_tmp)/ &
     323             :                        PRODUCT(REAL(desc%npts, KIND=dp)/n_slices_tmp + &
     324       56854 :                                MERGE((/0.0, 0.0, 0.0/), 2*(/1.06*nmin, 1.05*nmin, 1.03*nmin/), n_slices_tmp == (/1, 1, 1/)))
     325       39178 :                IF (ratio > ratio_best) THEN
     326        8106 :                   ratio_best = ratio
     327        8106 :                   n_slices = n_slices_tmp
     328             :                END IF
     329             : 
     330             :             END DO
     331             :             END DO
     332             : 
      333             :             ! if automatic, we can still decide this is a replicated grid
      334             :             ! if the memory gain (or the gain in messages) is too small.
     335       15528 :             IF (input_settings%distribution_type == rsgrid_automatic) THEN
     336       59912 :                volume = PRODUCT(REAL(desc%npts, KIND=dp))
     337             :                volume_dist = PRODUCT(REAL(desc%npts, KIND=dp)/n_slices + &
     338       59912 :                                      MERGE((/0, 0, 0/), 2*(/nmin, nmin, nmin/), n_slices == (/1, 1, 1/)))
     339       14978 :                IF (volume < volume_dist*input_settings%memory_factor) THEN
     340       59912 :                   n_slices = 1
     341             :                END IF
     342             :             END IF
     343             : 
     344             :          END IF
     345             : 
     346      105560 :          desc%group_dim(:) = n_slices(:)
     347       26390 :          CALL desc%group%from_dup(pw_grid%para%group)
     348       26390 :          desc%group_size = desc%group%num_pe
     349       26390 :          desc%my_pos = desc%group%mepos
     350             : 
     351      105382 :          IF (ALL(n_slices == 1)) THEN
     352             :             ! CASE 1 : only one slice: we do not need overlapping regions and special
     353             :             !          recombination of the total density
     354       26232 :             desc%border = border_size
     355       26232 :             IF (border_size == 0) THEN
     356      104928 :                desc%perd = 1
     357             :             ELSE
     358           0 :                desc%perd = 0
     359             :             END IF
     360       26232 :             desc%distributed = .FALSE.
     361       26232 :             desc%parallel = .TRUE.
     362       26232 :             desc%group_head = pw_grid%para%group_head
     363      104928 :             desc%group_coor(:) = 0
     364       26232 :             desc%my_virtual_pos = 0
     365             : 
     366       78696 :             ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
     367       78696 :             ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
     368             :             ! Start with no reordering
     369       78696 :             DO i = 0, desc%group_size - 1
     370       52464 :                desc%virtual2real(i) = i
     371       78696 :                desc%real2virtual(i) = i
     372             :             END DO
     373             :          ELSE
     374             :             ! CASE 2 : general case
      375             :             ! periodicity is no longer enforced along distributed (or bordered) directions
     376         158 :             IF (border_size == 0) THEN
     377         120 :                desc%perd = 1
     378         120 :                DO dir = 1, 3
     379         120 :                   IF (n_slices(dir) > 1) desc%perd(dir) = 0
     380             :                END DO
     381             :             ELSE
     382         512 :                desc%perd(:) = 0
     383             :             END IF
     384             :             ! we keep a border of nmin points
     385         158 :             desc%border = nmin
     386             :             ! we are going parallel on the real space grid
     387         158 :             desc%parallel = .TRUE.
     388         158 :             desc%distributed = .TRUE.
     389         158 :             desc%group_head = (desc%my_pos == 0)
     390             : 
     391             :             ! set up global info about the distribution
     392         474 :             ALLOCATE (desc%rank2coord(3, 0:desc%group_size - 1))
     393         790 :             ALLOCATE (desc%coord2rank(0:desc%group_dim(1) - 1, 0:desc%group_dim(2) - 1, 0:desc%group_dim(3) - 1))
     394         474 :             ALLOCATE (desc%lb_global(3, 0:desc%group_size - 1))
     395         474 :             ALLOCATE (desc%ub_global(3, 0:desc%group_size - 1))
     396         474 :             ALLOCATE (desc%x2coord(desc%lb(1):desc%ub(1)))
     397         474 :             ALLOCATE (desc%y2coord(desc%lb(2):desc%ub(2)))
     398         474 :             ALLOCATE (desc%z2coord(desc%lb(3):desc%ub(3)))
     399             : 
     400         474 :             DO i = 0, desc%group_size - 1
     401             :                ! Calculate coordinates in a row-major order (to be SMP-friendly)
     402         316 :                desc%rank2coord(1, i) = i/(desc%group_dim(2)*desc%group_dim(3))
     403             :                desc%rank2coord(2, i) = MODULO(i, desc%group_dim(2)*desc%group_dim(3)) &
     404         316 :                                        /desc%group_dim(3)
     405         316 :                desc%rank2coord(3, i) = MODULO(i, desc%group_dim(3))
     406             : 
     407         316 :                IF (i == desc%my_pos) THEN
     408         632 :                   desc%group_coor = desc%rank2coord(:, i)
     409             :                END IF
     410             : 
     411         316 :                desc%coord2rank(desc%rank2coord(1, i), desc%rank2coord(2, i), desc%rank2coord(3, i)) = i
     412             :                ! the lb_global and ub_global correspond to lb_real and ub_real of each task
     413        1264 :                desc%lb_global(:, i) = desc%lb
     414        1264 :                desc%ub_global(:, i) = desc%ub
     415        1422 :                DO dir = 1, 3
     416        1264 :                   IF (desc%group_dim(dir) .GT. 1) THEN
     417         316 :                      lb = get_limit(desc%npts(dir), desc%group_dim(dir), desc%rank2coord(dir, i))
     418         316 :                      desc%lb_global(dir, i) = lb(1) + desc%lb(dir) - 1
     419         316 :                      desc%ub_global(dir, i) = lb(2) + desc%lb(dir) - 1
     420             :                   END IF
     421             :                END DO
     422             :             END DO
     423             : 
     424             :             ! map a grid point to a CPU coord
     425         632 :             DO dir = 1, 3
     426        1264 :                DO l = 0, desc%group_dim(dir) - 1
     427         632 :                   IF (desc%group_dim(dir) .GT. 1) THEN
     428         316 :                      lb = get_limit(desc%npts(dir), desc%group_dim(dir), l)
     429         948 :                      lb = lb + desc%lb(dir) - 1
     430             :                   ELSE
     431         316 :                      lb(1) = desc%lb(dir)
     432         316 :                      lb(2) = desc%ub(dir)
     433             :                   END IF
     434         474 :                   SELECT CASE (dir)
     435             :                   CASE (1)
     436       11696 :                      desc%x2coord(lb(1):lb(2)) = l
     437             :                   CASE (2)
     438       12104 :                      desc%y2coord(lb(1):lb(2)) = l
     439             :                   CASE (3)
     440       12200 :                      desc%z2coord(lb(1):lb(2)) = l
     441             :                   END SELECT
     442             :                END DO
     443             :             END DO
     444             : 
     445             :             ! an upper bound for the number of neighbours the border is overlapping with
     446         632 :             DO dir = 1, 3
     447         474 :                desc%neighbours(dir) = 0
     448         632 :                IF ((n_slices(dir) > 1) .OR. (border_size > 0)) THEN
     449         414 :                   min_npts_real = HUGE(0)
     450         986 :                   DO l = 0, n_slices(dir) - 1
     451         572 :                      lb = get_limit(desc%npts(dir), n_slices(dir), l)
     452         986 :                      min_npts_real = MIN(lb(2) - lb(1) + 1, min_npts_real)
     453             :                   END DO
     454         414 :                   desc%neighbours(dir) = (desc%border + min_npts_real - 1)/min_npts_real
     455             :                END IF
     456             :             END DO
     457             : 
     458         474 :             ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
     459         474 :             ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
     460             :             ! Start with no reordering
     461         474 :             DO i = 0, desc%group_size - 1
     462         316 :                desc%virtual2real(i) = i
     463         474 :                desc%real2virtual(i) = i
     464             :             END DO
     465             : 
     466         158 :             desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
     467         632 :             desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
     468             : 
     469             :          END IF
     470             :       END IF
     471             : 
     472       30296 :       CALL timestop(handle)
     473             : 
     474       30296 :    END SUBROUTINE rs_grid_create_descriptor
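! Illustration (assumed values, not CP2K defaults): requesting a descriptor that is explicitly
! distributed and cut only along Z, with the halo width derived from nsmax as in the heuristic
! above; desc and pw_grid are as in the usage sketch near the top of this file.
!
!    TYPE(realspace_grid_input_type) :: settings
!
!    settings%distribution_type = rsgrid_distributed
!    settings%distribution_layout = (/1, 1, -1/) ! force one slice in X and Y, let Z be chosen
!    settings%nsmax = 25                         ! halo of about (nsmax+1)/2 points per side
!    CALL rs_grid_create_descriptor(desc, pw_grid, settings)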
     475             : 
     476             : ! **************************************************************************************************
     477             : !> \brief ...
     478             : !> \param rs ...
     479             : !> \param desc ...
     480             : ! **************************************************************************************************
     481     4465440 :    SUBROUTINE rs_grid_create(rs, desc)
     482             :       TYPE(realspace_grid_type), INTENT(OUT)             :: rs
     483             :       TYPE(realspace_grid_desc_type), INTENT(INOUT), &
     484             :          TARGET                                          :: desc
     485             : 
     486             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'rs_grid_create'
     487             : 
     488             :       INTEGER                                            :: handle
     489             : 
     490      223272 :       CALL timeset(routineN, handle)
     491             : 
     492      223272 :       rs%desc => desc
     493      223272 :       CALL rs_grid_retain_descriptor(rs%desc)
     494             : 
     495      223272 :       IF (desc%pw%para%mode == PW_MODE_LOCAL) THEN
     496             :          ! The corresponding group has dimension 1
     497             :          ! All operations will be done locally
     498       63224 :          rs%lb_real = desc%lb
     499       63224 :          rs%ub_real = desc%ub
     500       63224 :          rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
     501       63224 :          rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
     502       63224 :          rs%npts_local = rs%ub_local - rs%lb_local + 1
     503       63224 :          rs%ngpts_local = PRODUCT(rs%npts_local)
     504             :       END IF
     505             : 
     506      890704 :       IF (ALL(rs%desc%group_dim == 1)) THEN
     507             :          ! CASE 1 : only one slice: we do not need overlapping regions and special
     508             :          !          recombination of the total density
     509      886032 :          rs%lb_real = desc%lb
     510      886032 :          rs%ub_real = desc%ub
     511      886032 :          rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
     512      886032 :          rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
     513      886032 :          rs%npts_local = rs%ub_local - rs%lb_local + 1
     514      886032 :          rs%ngpts_local = PRODUCT(rs%npts_local)
     515             :       ELSE
     516             :          ! CASE 2 : general case
     517             :          ! extract some more derived quantities about the local grid
     518        7056 :          rs%lb_real = desc%lb_global(:, desc%my_virtual_pos)
     519        7056 :          rs%ub_real = desc%ub_global(:, desc%my_virtual_pos)
     520        7056 :          rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
     521        7056 :          rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
     522        7056 :          rs%npts_local = rs%ub_local - rs%lb_local + 1
     523        7056 :          rs%ngpts_local = PRODUCT(rs%npts_local)
     524             :       END IF
     525             : 
     526      223272 :       CALL offload_create_buffer(rs%ngpts_local, rs%buffer)
     527             :       rs%r(rs%lb_local(1):rs%ub_local(1), &
     528             :            rs%lb_local(2):rs%ub_local(2), &
     529      223272 :            rs%lb_local(3):rs%ub_local(3)) => rs%buffer%host_buffer
     530             : 
     531      669816 :       ALLOCATE (rs%px(desc%npts(1)))
     532      669816 :       ALLOCATE (rs%py(desc%npts(2)))
     533      669816 :       ALLOCATE (rs%pz(desc%npts(3)))
     534             : 
     535      223272 :       CALL timestop(handle)
     536             : 
     537      223272 :    END SUBROUTINE rs_grid_create
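! The pointer assignment with explicit bounds above is standard Fortran 2008 rank remapping:
! rs%r becomes a shifted three-dimensional view of the flat host buffer. A minimal standalone
! sketch of the same idiom (n1, n2, n3 and lb1, lb2, lb3 are assumed extents and lower bounds,
! independent of offload_buffer_type):
!
!    REAL(KIND=dp), DIMENSION(:), CONTIGUOUS, POINTER       :: flat
!    REAL(KIND=dp), DIMENSION(:, :, :), CONTIGUOUS, POINTER :: r3d
!    ALLOCATE (flat(n1*n2*n3))
!    r3d(lb1:lb1 + n1 - 1, lb2:lb2 + n2 - 1, lb3:lb3 + n3 - 1) => flat ! 3D view with shifted lower bounds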
     538             : 
     539             : ! **************************************************************************************************
     540             : !> \brief Defines a new ordering of ranks on this realspace grid, recalculating
     541             : !>        the data bounds and reallocating the grid.  As a result, each MPI process
     542             : !>        now has a real rank (i.e., its rank in the MPI communicator from the pw grid)
     543             : !>        and a virtual rank (the rank of the process where the data now owned by this
     544             : !>        process would reside in an ordinary cartesian distribution).
     545             : !>        NB. Since the grid size required may change, the caller should be sure to release
     546             : !>        and recreate the corresponding rs_grids
     547             : !>        The desc%real2virtual and desc%virtual2real arrays can be used to map
     548             : !>        a physical rank to the 'rank' of data owned by that process and vice versa
     549             : !> \param desc ...
     550             : !> \param real2virtual ...
     551             : !> \par History
     552             : !>        04-2009 created [Iain Bethune]
     553             : !>          (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
     554             : ! **************************************************************************************************
     555           6 :    PURE SUBROUTINE rs_grid_reorder_ranks(desc, real2virtual)
     556             : 
     557             :       TYPE(realspace_grid_desc_type), INTENT(INOUT)      :: desc
     558             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: real2virtual
     559             : 
     560             :       INTEGER                                            :: i
     561             : 
     562          18 :       desc%real2virtual(:) = real2virtual
     563             : 
     564          18 :       DO i = 0, desc%group_size - 1
     565          18 :          desc%virtual2real(desc%real2virtual(i)) = i
     566             :       END DO
     567             : 
     568           6 :       desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
     569             : 
     570          12 :       IF (.NOT. ALL(desc%group_dim == 1)) THEN
     571          24 :          desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
     572             :       END IF
     573             : 
     574           6 :    END SUBROUTINE
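! Worked example (illustrative values): with group_size = 4 and real2virtual = (/1, 2, 3, 0/),
! the loop above yields virtual2real = (/3, 0, 1, 2/); the process with my_pos = 0 then gets
! my_virtual_pos = 1, i.e. it now owns the data that rank 1 would hold in the ordinary
! cartesian distribution.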
     575             : 
     576             : ! **************************************************************************************************
     577             : !> \brief Print information on grids to output
     578             : !> \param rs ...
     579             : !> \param iounit ...
     580             : !> \author JGH (17-May-2007)
     581             : ! **************************************************************************************************
     582       13434 :    SUBROUTINE rs_grid_print(rs, iounit)
     583             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     584             :       INTEGER, INTENT(in)                                :: iounit
     585             : 
     586             :       INTEGER                                            :: dir, i, nn
     587             :       REAL(KIND=dp)                                      :: pp(3)
     588             : 
     589       13434 :       IF (rs%desc%parallel) THEN
     590       13152 :          IF (iounit > 0) THEN
     591             :             WRITE (iounit, '(/,A,T71,I10)') &
     592        5864 :                " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
     593       23456 :             DO i = 1, 3
     594       17592 :                WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID|   Bounds ", &
     595       41048 :                   i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
     596             :             END DO
     597        5864 :             IF (.NOT. rs%desc%distributed) THEN
     598        5849 :                WRITE (iounit, '(A)') " RS_GRID| Real space fully replicated"
     599             :                WRITE (iounit, '(A,T71,I10)') &
     600        5849 :                   " RS_GRID| Group size ", rs%desc%group_dim(2)
     601             :             ELSE
     602          60 :                DO dir = 1, 3
     603          60 :                   IF (rs%desc%perd(dir) /= 1) THEN
     604             :                      WRITE (iounit, '(A,T71,I3,A)') &
     605          15 :                         " RS_GRID| Real space distribution over ", rs%desc%group_dim(dir), " groups"
     606             :                      WRITE (iounit, '(A,T71,I10)') &
     607          15 :                         " RS_GRID| Real space distribution along direction ", dir
     608             :                      WRITE (iounit, '(A,T71,I10)') &
     609          15 :                         " RS_GRID| Border size ", rs%desc%border
     610             :                   END IF
     611             :                END DO
     612             :             END IF
     613             :          END IF
     614       13152 :          IF (rs%desc%distributed) THEN
     615         120 :             DO dir = 1, 3
     616         120 :                IF (rs%desc%perd(dir) /= 1) THEN
     617          30 :                   nn = rs%npts_local(dir)
     618          30 :                   CALL rs%desc%group%sum(nn)
     619         120 :                   pp(1) = REAL(nn, KIND=dp)/REAL(PRODUCT(rs%desc%group_dim), KIND=dp)
     620          30 :                   nn = rs%npts_local(dir)
     621          30 :                   CALL rs%desc%group%max(nn)
     622          30 :                   pp(2) = REAL(nn, KIND=dp)
     623          30 :                   nn = rs%npts_local(dir)
     624          30 :                   CALL rs%desc%group%min(nn)
     625          30 :                   pp(3) = REAL(nn, KIND=dp)
     626          30 :                   IF (iounit > 0) THEN
     627          15 :                      WRITE (iounit, '(A,T48,A)') " RS_GRID|   Distribution", &
     628          30 :                         "  Average         Max         Min"
     629          15 :                      WRITE (iounit, '(A,T45,F12.1,2I12)') " RS_GRID|   Planes   ", &
     630          30 :                         pp(1), NINT(pp(2)), NINT(pp(3))
     631             :                   END IF
     632             :                END IF
     633             :             END DO
     634             : !          WRITE ( iounit, '(/)' )
     635             :          END IF
     636             :       ELSE
     637         282 :          IF (iounit > 0) THEN
     638             :             WRITE (iounit, '(/,A,T71,I10)') &
     639         172 :                " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
     640         688 :             DO i = 1, 3
     641         516 :                WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID|   Bounds ", &
     642        1204 :                   i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
     643             :             END DO
     644             : !         WRITE ( iounit, '(/)' )
     645             :          END IF
     646             :       END IF
     647             : 
     648       13434 :    END SUBROUTINE rs_grid_print
     649             : 
     650             : ! **************************************************************************************************
     651             : !> \brief ...
     652             : !> \param rs ...
     653             : !> \param pw ...
     654             : ! **************************************************************************************************
     655     1038488 :    SUBROUTINE transfer_rs2pw(rs, pw)
     656             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     657             :       TYPE(pw_r3d_rs_type), INTENT(INOUT)                :: pw
     658             : 
     659             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'transfer_rs2pw'
     660             : 
     661             :       INTEGER                                            :: handle, handle2, i
     662             : 
     663     1038488 :       CALL timeset(routineN, handle2)
     664     1038488 :       CALL timeset(routineN//"_"//TRIM(ADJUSTL(cp_to_string(CEILING(pw%pw_grid%cutoff/10)*10))), handle)
     665             : 
     666     1038488 :       IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
      667           0 :          CPABORT("Different rs and pw identifiers")
     668             : 
     669     1038488 :       IF (rs%desc%distributed) THEN
     670        1840 :          CALL transfer_rs2pw_distributed(rs, pw)
     671     1036648 :       ELSE IF (rs%desc%parallel) THEN
     672      835202 :          CALL transfer_rs2pw_replicated(rs, pw)
     673             :       ELSE ! treat simple serial case locally
     674      201446 :          IF (rs%desc%border == 0) THEN
     675      201446 :             CALL dcopy(SIZE(rs%r), rs%r, 1, pw%array, 1)
     676             :          ELSE
     677           0 :             CPASSERT(LBOUND(pw%array, 3) .EQ. rs%lb_real(3))
     678           0 : !$OMP          PARALLEL DO DEFAULT(NONE) SHARED(pw,rs)
     679             :             DO i = rs%lb_real(3), rs%ub_real(3)
     680             :                pw%array(:, :, i) = rs%r(rs%lb_real(1):rs%ub_real(1), &
     681             :                                         rs%lb_real(2):rs%ub_real(2), i)
     682             :             END DO
     683             : !$OMP          END PARALLEL DO
     684             :          END IF
     685             :       END IF
     686             : 
     687     1038488 :       CALL timestop(handle)
     688     1038488 :       CALL timestop(handle2)
     689             : 
     690     1038488 :    END SUBROUTINE transfer_rs2pw
     691             : 
     692             : ! **************************************************************************************************
     693             : !> \brief ...
     694             : !> \param rs ...
     695             : !> \param pw ...
     696             : ! **************************************************************************************************
     697     1013115 :    SUBROUTINE transfer_pw2rs(rs, pw)
     698             : 
     699             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     700             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw
     701             : 
     702             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'transfer_pw2rs'
     703             : 
     704             :       INTEGER                                            :: handle, handle2, i, im, j, jm, k, km
     705             : 
     706     1013115 :       CALL timeset(routineN, handle2)
     707     1013115 :       CALL timeset(routineN//"_"//TRIM(ADJUSTL(cp_to_string(CEILING(pw%pw_grid%cutoff/10)*10))), handle)
     708             : 
     709     1013115 :       IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
      710           0 :          CPABORT("Different rs and pw identifiers")
     711             : 
     712     1013115 :       IF (rs%desc%distributed) THEN
     713         868 :          CALL transfer_pw2rs_distributed(rs, pw)
     714     1012247 :       ELSE IF (rs%desc%parallel) THEN
     715      775858 :          CALL transfer_pw2rs_replicated(rs, pw)
     716             :       ELSE ! treat simple serial case locally
     717      236389 :          IF (rs%desc%border == 0) THEN
     718      236389 :             CALL dcopy(SIZE(rs%r), pw%array, 1, rs%r, 1)
     719             :          ELSE
     720             : !$OMP          PARALLEL DO DEFAULT(NONE) &
     721             : !$OMP                      PRIVATE(i,im,j,jm,k,km) &
     722           0 : !$OMP                      SHARED(pw,rs)
     723             :             DO k = rs%lb_local(3), rs%ub_local(3)
     724             :                IF (k < rs%lb_real(3)) THEN
     725             :                   km = k + rs%desc%npts(3)
     726             :                ELSE IF (k > rs%ub_real(3)) THEN
     727             :                   km = k - rs%desc%npts(3)
     728             :                ELSE
     729             :                   km = k
     730             :                END IF
     731             :                DO j = rs%lb_local(2), rs%ub_local(2)
     732             :                   IF (j < rs%lb_real(2)) THEN
     733             :                      jm = j + rs%desc%npts(2)
     734             :                   ELSE IF (j > rs%ub_real(2)) THEN
     735             :                      jm = j - rs%desc%npts(2)
     736             :                   ELSE
     737             :                      jm = j
     738             :                   END IF
     739             :                   DO i = rs%lb_local(1), rs%ub_local(1)
     740             :                      IF (i < rs%lb_real(1)) THEN
     741             :                         im = i + rs%desc%npts(1)
     742             :                      ELSE IF (i > rs%ub_real(1)) THEN
     743             :                         im = i - rs%desc%npts(1)
     744             :                      ELSE
     745             :                         im = i
     746             :                      END IF
     747             :                      rs%r(i, j, k) = pw%array(im, jm, km)
     748             :                   END DO
     749             :                END DO
     750             :             END DO
     751             : !$OMP          END PARALLEL DO
     752             :          END IF
     753             :       END IF
     754             : 
     755     1013115 :       CALL timestop(handle)
     756     1013115 :       CALL timestop(handle2)
     757             : 
     758     1013115 :    END SUBROUTINE transfer_pw2rs
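! Note on the serial branch above: halo points (indices outside [lb_real, ub_real]) are filled
! by periodic wrapping, i.e. an index is mapped back into the real range by adding or
! subtracting desc%npts before reading from pw%array.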
     759             : 
     760             : ! **************************************************************************************************
     761             : !> \brief transfer from a realspace grid to a planewave grid
     762             : !> \param rs ...
     763             : !> \param pw ...
     764             : ! **************************************************************************************************
     765      835202 :    SUBROUTINE transfer_rs2pw_replicated(rs, pw)
     766             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     767             :       TYPE(pw_r3d_rs_type), INTENT(INOUT)                :: pw
     768             : 
     769             :       INTEGER                                            :: dest, ii, ip, ix, iy, iz, nma, nn, s(3), &
     770             :                                                             source
     771      835202 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: rcount
     772             :       INTEGER, DIMENSION(3)                              :: lb, ub
     773      835202 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: recvbuf, sendbuf, swaparray
     774             : 
     775             :       ASSOCIATE (np => pw%pw_grid%para%group_size, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group_size - 1, 1), &
     776             :                  pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%rs_group, mepos => pw%pw_grid%para%rs_mpo, &
     777             :                  grid => rs%r)
     778     2505606 :          ALLOCATE (rcount(0:np - 1))
     779     2505606 :          DO ip = 1, np
     780     7516818 :             rcount(ip - 1) = PRODUCT(bo(2, :, ip) - bo(1, :, ip) + 1)
     781             :          END DO
     782     2505606 :          nma = MAXVAL(rcount(0:np - 1))
     783     4176010 :          ALLOCATE (sendbuf(nma), recvbuf(nma))
     784 31497273648 :          sendbuf = 1.0E99_dp; recvbuf = 1.0E99_dp ! init mpi'ed buffers to silence warnings under valgrind
     785             : 
     786             :          !sample peak memory
     787      835202 :          CALL m_memory()
     788             : 
     789      835202 :          dest = MODULO(mepos + 1, np)
     790      835202 :          source = MODULO(mepos - 1, np)
     791 15748636824 :          sendbuf = 0.0_dp
     792             : 
     793     1670404 :          DO ip = 1, np
     794             : 
     795     6681616 :             lb = pbo(1, :) + bo(1, :, MODULO(mepos - ip, np) + 1) - 1
     796     6681616 :             ub = pbo(1, :) + bo(2, :, MODULO(mepos - ip, np) + 1) - 1
     797             :             ! this loop takes about the same time as the message passing call
      798             :             ! note that ix spans only a small fraction of the first index of grid;
      799             :             ! it therefore seems faster to keep the second index as the innermost loop
      800             :             ! when this runs on many CPUs
      801             :             ! (tested on itanium, pentium4, opteron, ultrasparc...)
     802     6681616 :             s = ub - lb + 1
     803    41834180 :             DO iz = lb(3), ub(3)
     804   741220000 :                DO ix = lb(1), ub(1)
     805   699385820 :                   ii = (iz - lb(3))*s(1)*s(2) + (ix - lb(1)) + 1
     806 32023406526 :                   DO iy = lb(2), ub(2)
     807 31283856930 :                      sendbuf(ii) = sendbuf(ii) + grid(ix, iy, iz)
     808 31983242750 :                      ii = ii + s(1)
     809             :                   END DO
     810             :                END DO
     811             :             END DO
     812     1670404 :             IF (ip .EQ. np) EXIT
     813      835202 :             CALL group%sendrecv(sendbuf, dest, recvbuf, source, 13)
     814      835202 :             CALL MOVE_ALLOC(sendbuf, swaparray)
     815      835202 :             CALL MOVE_ALLOC(recvbuf, sendbuf)
     816     1670404 :             CALL MOVE_ALLOC(swaparray, recvbuf)
     817             :          END DO
     818      835202 :          nn = rcount(mepos)
     819             :       END ASSOCIATE
     820             : 
     821      835202 :       CALL dcopy(nn, sendbuf, 1, pw%array, 1)
     822             : 
     823      835202 :       DEALLOCATE (rcount)
     824      835202 :       DEALLOCATE (sendbuf)
     825      835202 :       DEALLOCATE (recvbuf)
     826             : 
     827      835202 :    END SUBROUTINE transfer_rs2pw_replicated
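! Note on the routine above: the ip loop implements a ring reduce-scatter. A zero-initialised
! buffer is passed from mepos to mepos+1; at step ip each rank adds its local copy of the slab
! owned by rank MODULO(mepos-ip, np), so after np-1 shifts every rank holds the fully summed
! data of its own pw slab, which is finally copied into pw%array.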
     828             : 
     829             : ! **************************************************************************************************
     830             : !> \brief transfer from a planewave grid to a realspace grid
     831             : !> \param rs ...
     832             : !> \param pw ...
     833             : ! **************************************************************************************************
     834      775858 :    SUBROUTINE transfer_pw2rs_replicated(rs, pw)
     835             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     836             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw
     837             : 
     838             :       INTEGER                                            :: dest, i, ii, im, ip, ix, iy, iz, j, jm, &
     839             :                                                             k, km, nma, nn, source
     840      775858 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: rcount
     841             :       INTEGER, DIMENSION(3)                              :: lb, ub
     842      775858 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: recvbuf, sendbuf, swaparray
     843     2327574 :       TYPE(mp_request_type), DIMENSION(2)                :: req
     844             : 
     845             :       ASSOCIATE (np => pw%pw_grid%para%group_size, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group_size - 1, 1), &
     846             :                  pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%rs_group, mepos => pw%pw_grid%para%rs_mpo, &
     847             :                  grid => rs%r)
     848     2327574 :          ALLOCATE (rcount(0:np - 1))
     849     2327574 :          DO ip = 1, np
     850     6982722 :             rcount(ip - 1) = PRODUCT(bo(2, :, ip) - bo(1, :, ip) + 1)
     851             :          END DO
     852     2327574 :          nma = MAXVAL(rcount(0:np - 1))
     853     3879290 :          ALLOCATE (sendbuf(nma), recvbuf(nma))
     854 30713859552 :          sendbuf = 1.0E99_dp; recvbuf = 1.0E99_dp ! init mpi'ed buffers to silence warnings under valgrind
     855             : 
      856             :          ! sample peak memory
     857      775858 :          CALL m_memory()
     858             : 
     859      775858 :          nn = rcount(mepos)
     860      775858 :          CALL dcopy(nn, pw%array, 1, sendbuf, 1)
     861             : 
     862      775858 :          dest = MODULO(mepos + 1, np)
     863      775858 :          source = MODULO(mepos - 1, np)
     864             : 
     865     2327574 :          DO ip = 0, np - 1
     866             :             ! we must shift the buffer only np-1 times around
     867     1551716 :             IF (ip .NE. np - 1) THEN
     868             :                CALL group%isendrecv(sendbuf, dest, recvbuf, source, &
     869      775858 :                                     req(1), req(2), 13)
     870             :             END IF
     871     6206864 :             lb = pbo(1, :) + bo(1, :, MODULO(mepos - ip, np) + 1) - 1
     872     6206864 :             ub = pbo(1, :) + bo(2, :, MODULO(mepos - ip, np) + 1) - 1
     873     1551716 :             ii = 0
      874             :             ! this loop takes about the same time as the message passing call,
      875             :             ! so it is effectively overlapped with the non-blocking isendrecv posted above
     876    39896756 :             DO iz = lb(3), ub(3)
     877  1382097252 :                DO iy = lb(2), ub(2)
     878 31890640682 :                   DO ix = lb(1), ub(1)
     879 30510095146 :                      ii = ii + 1
     880 31852295642 :                      grid(ix, iy, iz) = sendbuf(ii)
     881             :                   END DO
     882             :                END DO
     883             :             END DO
     884     1551716 :             IF (ip .NE. np - 1) THEN
     885      775858 :                CALL mp_waitall(req)
     886             :             END IF
     887     1551716 :             CALL MOVE_ALLOC(sendbuf, swaparray)
     888     1551716 :             CALL MOVE_ALLOC(recvbuf, sendbuf)
     889     2327574 :             CALL MOVE_ALLOC(swaparray, recvbuf)
     890             :          END DO
     891     1551716 :          IF (rs%desc%border > 0) THEN
     892             : !$OMP       PARALLEL DO DEFAULT(NONE) &
     893             : !$OMP                   PRIVATE(i,im,j,jm,k,km) &
     894           0 : !$OMP                   SHARED(rs)
     895             :             DO k = rs%lb_local(3), rs%ub_local(3)
     896             :                IF (k < rs%lb_real(3)) THEN
     897             :                   km = k + rs%desc%npts(3)
     898             :                ELSE IF (k > rs%ub_real(3)) THEN
     899             :                   km = k - rs%desc%npts(3)
     900             :                ELSE
     901             :                   km = k
     902             :                END IF
     903             :                DO j = rs%lb_local(2), rs%ub_local(2)
     904             :                   IF (j < rs%lb_real(2)) THEN
     905             :                      jm = j + rs%desc%npts(2)
     906             :                   ELSE IF (j > rs%ub_real(2)) THEN
     907             :                      jm = j - rs%desc%npts(2)
     908             :                   ELSE
     909             :                      jm = j
     910             :                   END IF
     911             :                   DO i = rs%lb_local(1), rs%ub_local(1)
     912             :                      IF (i < rs%lb_real(1)) THEN
     913             :                         im = i + rs%desc%npts(1)
     914             :                      ELSE IF (i > rs%ub_real(1)) THEN
     915             :                         im = i - rs%desc%npts(1)
     916             :                      ELSE
     917             :                         im = i
     918             :                      END IF
     919             :                      rs%r(i, j, k) = rs%r(im, jm, km)
     920             :                   END DO
     921             :                END DO
     922             :             END DO
     923             : !$OMP       END PARALLEL DO
     924             :          END IF
     925             :       END ASSOCIATE
     926             : 
     927      775858 :       DEALLOCATE (rcount)
     928      775858 :       DEALLOCATE (sendbuf)
     929      775858 :       DEALLOCATE (recvbuf)
     930             : 
     931      775858 :    END SUBROUTINE transfer_pw2rs_replicated
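
! **************************************************************************************************
!> \brief minimal sketch (illustration only; the name and interface below are ad hoc and not
!>        referenced anywhere in the module) of the periodic index folding applied to the
!>        border points in transfer_pw2rs_replicated above: a halo index outside the owned
!>        (real) range is mapped onto its periodic image by shifting it by the full number of
!>        grid points along that direction
!> \param i local grid index, assumed to lie at most one period outside [lb_real, ub_real]
!> \param lb_real lower bound of the owned range in this direction
!> \param ub_real upper bound of the owned range in this direction
!> \param npts total number of grid points in this direction
!> \return index of the periodic image inside [lb_real, ub_real]
! **************************************************************************************************
   ELEMENTAL FUNCTION fold_halo_index_sketch(i, lb_real, ub_real, npts) RESULT(im)
      INTEGER, INTENT(IN)                                :: i, lb_real, ub_real, npts
      INTEGER                                            :: im

      IF (i < lb_real) THEN
         im = i + npts      ! below the owned range: wrap upwards by one period
      ELSE IF (i > ub_real) THEN
         im = i - npts      ! above the owned range: wrap downwards by one period
      ELSE
         im = i             ! already inside the owned range
      END IF
   END FUNCTION fold_halo_index_sketch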
     932             : 
     933             : ! **************************************************************************************************
     934             : !> \brief does the rs2pw transfer in the case where the rs grid is
     935             : !>       distributed (3D domain decomposition)
     936             : !> \param rs ...
     937             : !> \param pw ...
     938             : !> \par History
     939             : !>      12.2007 created [Matt Watkins]
     940             : !>      9.2008 reduced amount of halo data sent [Iain Bethune]
     941             : !>      10.2008 added non-blocking communication [Iain Bethune]
     942             : !>      4.2009 added support for rank-reordering on the grid [Iain Bethune]
     943             : !>      12.2009 added OMP and sparse alltoall [Iain Bethune]
     944             : !>              (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
     945             : !> \note
     946             : !>       the transfer is a two step procedure. For example, for the rs2pw transfer:
     947             : !>
     948             : !>       1) Halo-exchange in 3D so that the local part of the rs_grid contains the full data
     949             : !>       2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
     950             : !>
      951             : !>       the halo exchange is most expensive on a large number of CPUs. A peculiarity of this halo
      952             : !>       exchange is that the border region is rather large (e.g. 20 points) and that it might overlap
      953             : !>       with the central domain of several CPUs (i.e. next-nearest neighbours)
     954             : ! **************************************************************************************************
     955        1840 :    SUBROUTINE transfer_rs2pw_distributed(rs, pw)
     956             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     957             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw
     958             : 
     959             :       CHARACTER(LEN=200)                                 :: error_string
     960             :       INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
     961             :          n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
     962        1840 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: dshifts, recv_disps, recv_sizes, &
     963        1840 :                                                             send_disps, send_sizes, ushifts
     964        3680 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: bounds, recv_tasks, send_tasks
     965             :       INTEGER, DIMENSION(2)                              :: neighbours, pos
     966             :       INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
     967             :          lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
     968             :       LOGICAL, DIMENSION(3)                              :: halo_swapped
     969             :       REAL(KIND=dp)                                      :: pw_sum, rs_sum
     970        1840 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: recv_buf_3d_down, recv_buf_3d_up, &
     971        1840 :                                                             send_buf_3d_down, send_buf_3d_up
     972        3680 :       TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:)    :: recv_bufs, send_bufs
     973        1840 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: recv_reqs, send_reqs
     974        9200 :       TYPE(mp_request_type), DIMENSION(4)                :: req
     975             : 
     976        1840 :       num_threads = 1
     977        1840 :       my_id = 0
     978             : 
      979             :       ! safety check, to be removed once we're absolutely sure the routine is correct
     980             :       IF (debug_this_module) THEN
     981             :          rs_sum = accurate_sum(rs%r)*ABS(det_3x3(rs%desc%dh))
     982             :          CALL rs%desc%group%sum(rs_sum)
     983             :       END IF
     984             : 
     985        1840 :       halo_swapped = .FALSE.
     986             :       ! We don't need to send the 'edges' of the halos that have already been sent
     987             :       ! Halos are contiguous in memory in z-direction only, so swap these first,
     988             :       ! and send less data in the y and x directions which are more expensive
     989             : 
     990        7360 :       DO idir = 3, 1, -1
     991             : 
     992        5520 :          IF (rs%desc%perd(idir) .NE. 1) THEN
     993             : 
     994       14304 :             ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
     995       14304 :             ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
     996             : 
     997       14304 :             ushifts = 0
     998       14304 :             dshifts = 0
     999             : 
     1000             :             ! make sure we don't try to send data to ourselves
    1001        6608 :             DO n_shifts = 1, MIN(rs%desc%neighbours(idir), rs%desc%group_dim(idir) - 1)
    1002             : 
    1003             :                ! need to take into account the possible varying widths of neighbouring cells
     1004             :                ! ushifts and dshifts hold the real size of the neighbouring cells
    1005        1840 :                position = MODULO(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
    1006        1840 :                neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
    1007        1840 :                dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
    1008             : 
    1009        1840 :                position = MODULO(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
    1010        1840 :                neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
    1011        1840 :                ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
    1012             : 
     1013             :                ! The border data has to be sent to / received from the neighbours
    1014             :                ! First we calculate the source and destination processes for the shift
    1015             :                ! We do both shifts at once to allow for more overlap of communication and buffer packing/unpacking
    1016             : 
    1017        1840 :                CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
    1018             : 
    1019        7360 :                lb_send_down(:) = rs%lb_local(:)
    1020        7360 :                lb_recv_down(:) = rs%lb_local(:)
    1021        7360 :                ub_recv_down(:) = rs%ub_local(:)
    1022        7360 :                ub_send_down(:) = rs%ub_local(:)
    1023             : 
    1024        1840 :                IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
    1025        1840 :                   ub_send_down(idir) = lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
    1026             :                   lb_send_down(idir) = MAX(lb_send_down(idir), &
    1027        1840 :                                            lb_send_down(idir) + rs%desc%border - dshifts(n_shifts))
    1028             : 
    1029        1840 :                   ub_recv_down(idir) = ub_recv_down(idir) - rs%desc%border
    1030             :                   lb_recv_down(idir) = MAX(lb_recv_down(idir) + rs%desc%border, &
    1031        1840 :                                            ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
    1032             :                ELSE
    1033           0 :                   lb_send_down(idir) = 0
    1034           0 :                   ub_send_down(idir) = -1
    1035           0 :                   lb_recv_down(idir) = 0
    1036           0 :                   ub_recv_down(idir) = -1
    1037             :                END IF
    1038             : 
    1039        7360 :                DO i = 1, 3
    1040        7360 :                   IF (halo_swapped(i)) THEN
    1041         554 :                      lb_send_down(i) = rs%lb_real(i)
    1042         554 :                      ub_send_down(i) = rs%ub_real(i)
    1043         554 :                      lb_recv_down(i) = rs%lb_real(i)
    1044         554 :                      ub_recv_down(i) = rs%ub_real(i)
    1045             :                   END IF
    1046             :                END DO
    1047             : 
    1048             :                ! post the receive
    1049           0 :                ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
    1050        9200 :                                           lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
    1051        1840 :                CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
    1052             : 
    1053             :                ! now allocate, pack and send the send buffer
    1054        7360 :                nn = PRODUCT(ub_send_down - lb_send_down + 1)
    1055           0 :                ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
    1056        9200 :                                           lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
    1057             : 
    1058             : !$OMP PARALLEL DEFAULT(NONE), &
    1059             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1060        1840 : !$OMP          SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
    1061             : !$             num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
    1062             : !$             my_id = omp_get_thread_num()
    1063             :                IF (my_id < num_threads) THEN
    1064             :                   lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
    1065             :                   ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
    1066             : 
    1067             :                   send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
    1068             :                                    lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
    1069             :                                                  lb_send_down(2):ub_send_down(2), lb:ub)
    1070             :                END IF
    1071             : !$OMP END PARALLEL
    1072             : 
    1073        1840 :                CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
    1074             : 
    1075             :                ! Now for the other direction
    1076        1840 :                CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
    1077             : 
    1078        7360 :                lb_send_up(:) = rs%lb_local(:)
    1079        7360 :                lb_recv_up(:) = rs%lb_local(:)
    1080        7360 :                ub_recv_up(:) = rs%ub_local(:)
    1081        7360 :                ub_send_up(:) = rs%ub_local(:)
    1082             : 
    1083        1840 :                IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
    1084             : 
    1085        1840 :                   lb_send_up(idir) = ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
    1086             :                   ub_send_up(idir) = MIN(ub_send_up(idir), &
    1087        1840 :                                          ub_send_up(idir) - rs%desc%border + ushifts(n_shifts))
    1088             : 
    1089        1840 :                   lb_recv_up(idir) = lb_recv_up(idir) + rs%desc%border
    1090             :                   ub_recv_up(idir) = MIN(ub_recv_up(idir) - rs%desc%border, &
    1091        1840 :                                          lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
    1092             :                ELSE
    1093           0 :                   lb_send_up(idir) = 0
    1094           0 :                   ub_send_up(idir) = -1
    1095           0 :                   lb_recv_up(idir) = 0
    1096           0 :                   ub_recv_up(idir) = -1
    1097             :                END IF
    1098             : 
    1099        7360 :                DO i = 1, 3
    1100        7360 :                   IF (halo_swapped(i)) THEN
    1101         554 :                      lb_send_up(i) = rs%lb_real(i)
    1102         554 :                      ub_send_up(i) = rs%ub_real(i)
    1103         554 :                      lb_recv_up(i) = rs%lb_real(i)
    1104         554 :                      ub_recv_up(i) = rs%ub_real(i)
    1105             :                   END IF
    1106             :                END DO
    1107             : 
    1108             :                ! post the receive
    1109           0 :                ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
    1110        9200 :                                         lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
    1111        1840 :                CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
    1112             : 
     1113             :                ! now allocate, pack and send the send buffer
    1114        7360 :                nn = PRODUCT(ub_send_up - lb_send_up + 1)
    1115           0 :                ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
    1116        9200 :                                         lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
    1117             : 
    1118             : !$OMP PARALLEL DEFAULT(NONE), &
    1119             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1120        1840 : !$OMP          SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
    1121             : !$             num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
    1122             : !$             my_id = omp_get_thread_num()
    1123             :                IF (my_id < num_threads) THEN
    1124             :                   lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
    1125             :                   ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
    1126             : 
    1127             :                   send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
    1128             :                                  lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
    1129             :                                                lb_send_up(2):ub_send_up(2), lb:ub)
    1130             :                END IF
    1131             : !$OMP END PARALLEL
    1132             : 
    1133        1840 :                CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
    1134             : 
    1135             :                ! wait for a recv to complete, then we can unpack
    1136             : 
    1137        5520 :                DO i = 1, 2
    1138             : 
    1139        3680 :                   CALL mp_waitany(req(1:2), completed)
    1140             : 
    1141        5520 :                   IF (completed .EQ. 1) THEN
    1142             : 
    1143             :                      ! only some procs may need later shifts
    1144        1840 :                      IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
    1145             :                         ! Sum the data in the RS Grid
    1146             : !$OMP PARALLEL DEFAULT(NONE), &
    1147             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1148        1840 : !$OMP          SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
    1149             : !$                      num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
    1150             : !$                      my_id = omp_get_thread_num()
    1151             :                         IF (my_id < num_threads) THEN
    1152             :                            lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
    1153             :                            ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
    1154             : 
    1155             :                            rs%r(lb_recv_down(1):ub_recv_down(1), &
    1156             :                                 lb_recv_down(2):ub_recv_down(2), lb:ub) = &
    1157             :                               rs%r(lb_recv_down(1):ub_recv_down(1), &
    1158             :                                    lb_recv_down(2):ub_recv_down(2), lb:ub) + &
    1159             :                               recv_buf_3d_down(:, :, lb:ub)
    1160             :                         END IF
    1161             : !$OMP END PARALLEL
    1162             :                      END IF
    1163        1840 :                      DEALLOCATE (recv_buf_3d_down)
    1164             :                   ELSE
    1165             : 
    1166             :                      ! only some procs may need later shifts
    1167        1840 :                      IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
    1168             :                         ! Sum the data in the RS Grid
    1169             : !$OMP PARALLEL DEFAULT(NONE), &
    1170             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1171        1840 : !$OMP          SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
    1172             : !$                      num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
    1173             : !$                      my_id = omp_get_thread_num()
    1174             :                         IF (my_id < num_threads) THEN
    1175             :                            lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
    1176             :                            ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
    1177             : 
    1178             :                            rs%r(lb_recv_up(1):ub_recv_up(1), &
    1179             :                                 lb_recv_up(2):ub_recv_up(2), lb:ub) = &
    1180             :                               rs%r(lb_recv_up(1):ub_recv_up(1), &
    1181             :                                    lb_recv_up(2):ub_recv_up(2), lb:ub) + &
    1182             :                               recv_buf_3d_up(:, :, lb:ub)
    1183             :                         END IF
    1184             : !$OMP END PARALLEL
    1185             :                      END IF
    1186        1840 :                      DEALLOCATE (recv_buf_3d_up)
    1187             :                   END IF
    1188             : 
    1189             :                END DO
    1190             : 
    1191             :                ! make sure the sends have completed before we deallocate
    1192             : 
    1193        1840 :                CALL mp_waitall(req(3:4))
    1194             : 
    1195        1840 :                DEALLOCATE (send_buf_3d_down)
    1196        8448 :                DEALLOCATE (send_buf_3d_up)
    1197             :             END DO
    1198             : 
    1199        4768 :             DEALLOCATE (dshifts)
    1200        4768 :             DEALLOCATE (ushifts)
    1201             : 
    1202             :          END IF
    1203             : 
    1204        7360 :          halo_swapped(idir) = .TRUE.
    1205             : 
    1206             :       END DO
    1207             : 
    1208             :       ! This is the real redistribution
    1209        7360 :       ALLOCATE (bounds(0:pw%pw_grid%para%group_size - 1, 1:4))
    1210             : 
    1211             :       ! work out the pw grid points each proc holds
    1212        5520 :       DO i = 0, pw%pw_grid%para%group_size - 1
    1213       11040 :          bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
    1214       11040 :          bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
    1215       11040 :          bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
    1216       12880 :          bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
    1217             :       END DO
    1218             : 
    1219        7360 :       ALLOCATE (send_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
    1220        5520 :       ALLOCATE (send_sizes(0:pw%pw_grid%para%group_size - 1))
    1221        5520 :       ALLOCATE (send_disps(0:pw%pw_grid%para%group_size - 1))
    1222        7360 :       ALLOCATE (recv_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
    1223        5520 :       ALLOCATE (recv_sizes(0:pw%pw_grid%para%group_size - 1))
    1224        5520 :       ALLOCATE (recv_disps(0:pw%pw_grid%para%group_size - 1))
    1225        5520 :       send_tasks(:, 1) = 1
    1226        5520 :       send_tasks(:, 2) = 0
    1227        5520 :       send_tasks(:, 3) = 1
    1228        5520 :       send_tasks(:, 4) = 0
    1229        5520 :       send_tasks(:, 5) = 1
    1230        5520 :       send_tasks(:, 6) = 0
    1231        5520 :       send_sizes = 0
    1232        5520 :       recv_sizes = 0
    1233             : 
    1234        1840 :       my_rs_rank = rs%desc%my_pos
    1235        1840 :       my_pw_rank = pw%pw_grid%para%rs_mpo
    1236             : 
    1237             :       ! find the processors that should hold our data
    1238             :       ! should be part of the rs grid type
    1239             :       ! this is a loop over real ranks (i.e. the in-order cartesian ranks)
    1240             :       ! do the recv and send tasks in two separate loops which will
    1241             :       ! load balance better for OpenMP with large numbers of MPI tasks
    1242             : 
    1243             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1244             : !$OMP             PRIVATE(coords,idir,pos,lb_send,ub_send), &
    1245        1840 : !$OMP             SHARED(rs,bounds,my_rs_rank,recv_tasks,recv_sizes)
    1246             :       DO i = 0, rs%desc%group_size - 1
    1247             : 
    1248             :          coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
     1249             :          ! calculate the rs grid points on each processor
     1250             :          ! coords is the part of the grid that rank i actually holds
    1251             :          DO idir = 1, 3
    1252             :             pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
    1253             :             pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
    1254             :             lb_send(idir) = pos(1)
    1255             :             ub_send(idir) = pos(2)
    1256             :          END DO
    1257             : 
    1258             :          IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) CYCLE
    1259             :          IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) CYCLE
    1260             :          IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) CYCLE
    1261             :          IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) CYCLE
    1262             : 
    1263             :          recv_tasks(i, 1) = MAX(lb_send(1), bounds(my_rs_rank, 1))
    1264             :          recv_tasks(i, 2) = MIN(ub_send(1), bounds(my_rs_rank, 2))
    1265             :          recv_tasks(i, 3) = MAX(lb_send(2), bounds(my_rs_rank, 3))
    1266             :          recv_tasks(i, 4) = MIN(ub_send(2), bounds(my_rs_rank, 4))
    1267             :          recv_tasks(i, 5) = lb_send(3)
    1268             :          recv_tasks(i, 6) = ub_send(3)
    1269             :          recv_sizes(i) = (recv_tasks(i, 2) - recv_tasks(i, 1) + 1)* &
    1270             :                          (recv_tasks(i, 4) - recv_tasks(i, 3) + 1)*(recv_tasks(i, 6) - recv_tasks(i, 5) + 1)
    1271             : 
    1272             :       END DO
    1273             : !$OMP END PARALLEL DO
    1274             : 
    1275        7360 :       coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
    1276        7360 :       DO idir = 1, 3
    1277        5520 :          pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
    1278       16560 :          pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
    1279        5520 :          lb_send(idir) = pos(1)
    1280        7360 :          ub_send(idir) = pos(2)
    1281             :       END DO
    1282             : 
    1283        1840 :       lb_recv(:) = lb_send(:)
    1284        1840 :       ub_recv(:) = ub_send(:)
    1285             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1286        1840 : !$OMP             SHARED(pw,lb_send,ub_send,bounds,send_tasks,send_sizes)
    1287             :       DO j = 0, pw%pw_grid%para%group_size - 1
    1288             : 
    1289             :          IF (lb_send(1) .GT. bounds(j, 2)) CYCLE
    1290             :          IF (ub_send(1) .LT. bounds(j, 1)) CYCLE
    1291             :          IF (lb_send(2) .GT. bounds(j, 4)) CYCLE
    1292             :          IF (ub_send(2) .LT. bounds(j, 3)) CYCLE
    1293             : 
    1294             :          send_tasks(j, 1) = MAX(lb_send(1), bounds(j, 1))
    1295             :          send_tasks(j, 2) = MIN(ub_send(1), bounds(j, 2))
    1296             :          send_tasks(j, 3) = MAX(lb_send(2), bounds(j, 3))
    1297             :          send_tasks(j, 4) = MIN(ub_send(2), bounds(j, 4))
    1298             :          send_tasks(j, 5) = lb_send(3)
    1299             :          send_tasks(j, 6) = ub_send(3)
    1300             :          send_sizes(j) = (send_tasks(j, 2) - send_tasks(j, 1) + 1)* &
    1301             :                          (send_tasks(j, 4) - send_tasks(j, 3) + 1)*(send_tasks(j, 6) - send_tasks(j, 5) + 1)
    1302             : 
    1303             :       END DO
    1304             : !$OMP END PARALLEL DO
    1305             : 
    1306        1840 :       send_disps(0) = 0
    1307        1840 :       recv_disps(0) = 0
    1308        3680 :       DO i = 1, pw%pw_grid%para%group_size - 1
    1309        1840 :          send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
    1310        3680 :          recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
    1311             :       END DO
    1312             : 
    1313       12880 :       CPASSERT(SUM(send_sizes) == PRODUCT(ub_recv - lb_recv + 1))
    1314             : 
    1315        9200 :       ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
    1316       11040 :       ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
    1317             : 
    1318        5520 :       DO i = 0, rs%desc%group_size - 1
    1319        3680 :          IF (send_sizes(i) .NE. 0) THEN
    1320       10302 :             ALLOCATE (send_bufs(i)%array(send_sizes(i)))
    1321             :          ELSE
    1322         246 :             NULLIFY (send_bufs(i)%array)
    1323             :          END IF
    1324        5520 :          IF (recv_sizes(i) .NE. 0) THEN
    1325       10302 :             ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
    1326             :          ELSE
    1327         246 :             NULLIFY (recv_bufs(i)%array)
    1328             :          END IF
    1329             :       END DO
    1330             : 
    1331        9200 :       ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
    1332        5520 :       recv_reqs = mp_request_null
    1333             : 
    1334        5520 :       DO i = 0, rs%desc%group_size - 1
    1335        5520 :          IF (recv_sizes(i) .NE. 0) THEN
    1336        3434 :             CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
    1337             :          END IF
    1338             :       END DO
    1339             : 
    1340             :       ! do packing
    1341             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1342             : !$OMP             PRIVATE(k,z,y,x), &
    1343        1840 : !$OMP             SHARED(rs,send_tasks,send_bufs,send_disps)
    1344             :       DO i = 0, rs%desc%group_size - 1
    1345             :          k = 0
    1346             :          DO z = send_tasks(i, 5), send_tasks(i, 6)
    1347             :             DO y = send_tasks(i, 3), send_tasks(i, 4)
    1348             :                DO x = send_tasks(i, 1), send_tasks(i, 2)
    1349             :                   k = k + 1
    1350             :                   send_bufs(i)%array(k) = rs%r(x, y, z)
    1351             :                END DO
    1352             :             END DO
    1353             :          END DO
    1354             :       END DO
    1355             : !$OMP END PARALLEL DO
    1356             : 
    1357        9200 :       ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
    1358        5520 :       send_reqs = mp_request_null
    1359             : 
    1360        5520 :       DO i = 0, rs%desc%group_size - 1
    1361        5520 :          IF (send_sizes(i) .NE. 0) THEN
    1362        3434 :             CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
    1363             :          END IF
    1364             :       END DO
    1365             : 
    1366             :       ! do unpacking
    1367             :       ! no OMP here so we can unpack each message as it arrives
    1368        5520 :       DO i = 0, rs%desc%group_size - 1
    1369        3680 :          IF (recv_sizes(i) .EQ. 0) CYCLE
    1370             : 
    1371        3434 :          CALL mp_waitany(recv_reqs, completed)
    1372        3434 :          k = 0
    1373      143312 :          DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
    1374    10769386 :             DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
    1375   455339438 :                DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
    1376   444573732 :                   k = k + 1
    1377   455201400 :                   pw%array(x, y, z) = recv_bufs(completed - 1)%array(k)
    1378             :                END DO
    1379             :             END DO
    1380             :          END DO
    1381             :       END DO
    1382             : 
    1383        1840 :       CALL mp_waitall(send_reqs)
    1384             : 
    1385        1840 :       DEALLOCATE (recv_reqs)
    1386        1840 :       DEALLOCATE (send_reqs)
    1387             : 
    1388        5520 :       DO i = 0, rs%desc%group_size - 1
    1389        3680 :          IF (ASSOCIATED(send_bufs(i)%array)) THEN
    1390        3434 :             DEALLOCATE (send_bufs(i)%array)
    1391             :          END IF
    1392        5520 :          IF (ASSOCIATED(recv_bufs(i)%array)) THEN
    1393        3434 :             DEALLOCATE (recv_bufs(i)%array)
    1394             :          END IF
    1395             :       END DO
    1396             : 
    1397        1840 :       DEALLOCATE (send_bufs)
    1398        1840 :       DEALLOCATE (recv_bufs)
    1399        1840 :       DEALLOCATE (send_tasks)
    1400        1840 :       DEALLOCATE (send_sizes)
    1401        1840 :       DEALLOCATE (send_disps)
    1402        1840 :       DEALLOCATE (recv_tasks)
    1403        1840 :       DEALLOCATE (recv_sizes)
    1404        1840 :       DEALLOCATE (recv_disps)
    1405             : 
    1406             :       IF (debug_this_module) THEN
     1407             :          ! safety check, to be removed once we're absolutely sure the routine is correct
    1408             :          pw_sum = pw_integrate_function(pw)
    1409             :          IF (ABS(pw_sum - rs_sum)/MAX(1.0_dp, ABS(pw_sum), ABS(rs_sum)) > EPSILON(rs_sum)*1000) THEN
    1410             :             WRITE (error_string, '(A,6(1X,I4.4),3F25.16)') "rs_pw_transfer_distributed", &
    1411             :                rs%desc%npts, rs%desc%group_dim, pw_sum, rs_sum, ABS(pw_sum - rs_sum)
    1412             :             CALL cp_abort(__LOCATION__, &
    1413             :                           error_string//" Please report this bug ... quick workaround: use "// &
    1414             :                           "DISTRIBUTION_TYPE REPLICATED")
    1415             :          END IF
    1416             :       END IF
    1417             : 
    1418        1840 :    END SUBROUTINE transfer_rs2pw_distributed
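
! **************************************************************************************************
!> \brief minimal sketch (illustration only; the name and interface below are ad hoc and not
!>        referenced anywhere in the module) of the sparse alltoall used in
!>        transfer_rs2pw_distributed above: only the non-empty messages are posted, and
!>        mp_waitany is used so that each receive buffer can be unpacked as soon as it
!>        arrives (mp_waitany returns the 1-based index of the completed request)
!> \param group communicator of the realspace grid descriptor
!> \param sizes number of reals to exchange with every rank (0 means no message)
!> \param send_bufs packed send buffers, one per rank
!> \param recv_bufs preallocated receive buffers, one per rank
! **************************************************************************************************
   SUBROUTINE sparse_alltoall_sketch(group, sizes, send_bufs, recv_bufs)
      TYPE(mp_comm_type), INTENT(IN)                     :: group
      INTEGER, DIMENSION(0:), INTENT(IN)                 :: sizes
      TYPE(cp_1d_r_p_type), DIMENSION(0:), INTENT(INOUT) :: send_bufs, recv_bufs

      INTEGER                                            :: completed, i
      TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: recv_reqs, send_reqs

      ALLOCATE (recv_reqs(0:SIZE(sizes) - 1), send_reqs(0:SIZE(sizes) - 1))
      recv_reqs = mp_request_null
      send_reqs = mp_request_null

      ! post all receives first, then all sends, skipping the empty messages
      DO i = 0, SIZE(sizes) - 1
         IF (sizes(i) .NE. 0) CALL group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
      END DO
      DO i = 0, SIZE(sizes) - 1
         IF (sizes(i) .NE. 0) CALL group%isend(send_bufs(i)%array, i, send_reqs(i))
      END DO

      ! wait for the receives one by one; the real routine unpacks
      ! recv_bufs(completed - 1)%array into the target grid at this point
      DO i = 0, SIZE(sizes) - 1
         IF (sizes(i) .EQ. 0) CYCLE
         CALL mp_waitany(recv_reqs, completed)
      END DO

      ! the send buffers may only be released once all sends have completed
      CALL mp_waitall(send_reqs)

      DEALLOCATE (recv_reqs, send_reqs)
   END SUBROUTINE sparse_alltoall_sketch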
    1419             : 
    1420             : ! **************************************************************************************************
    1421             : !> \brief does the pw2rs transfer in the case where the rs grid is
    1422             : !>       distributed (3D domain decomposition)
    1423             : !> \param rs ...
    1424             : !> \param pw ...
    1425             : !> \par History
    1426             : !>      12.2007 created [Matt Watkins]
    1427             : !>      9.2008 reduced amount of halo data sent [Iain Bethune]
    1428             : !>      10.2008 added non-blocking communication [Iain Bethune]
    1429             : !>      4.2009 added support for rank-reordering on the grid [Iain Bethune]
    1430             : !>      12.2009 added OMP and sparse alltoall [Iain Bethune]
    1431             : !>              (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
    1432             : !> \note
     1433             : !>       the transfer is a two step procedure (performed here in reverse order). For example, for the rs2pw transfer:
    1434             : !>
    1435             : !>       1) Halo-exchange in 3D so that the local part of the rs_grid contains the full data
    1436             : !>       2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
    1437             : !>
     1438             : !>       the halo exchange is most expensive on a large number of CPUs. A peculiarity of this halo
     1439             : !>       exchange is that the border region is rather large (e.g. 20 points) and that it might overlap
     1440             : !>       with the central domain of several CPUs (i.e. next-nearest neighbours)
    1441             : ! **************************************************************************************************
    1442         868 :    SUBROUTINE transfer_pw2rs_distributed(rs, pw)
    1443             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
    1444             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw
    1445             : 
    1446             :       INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
    1447             :          n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
    1448         868 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: dshifts, recv_disps, recv_sizes, &
    1449         868 :                                                             send_disps, send_sizes, ushifts
    1450        1736 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: bounds, recv_tasks, send_tasks
    1451             :       INTEGER, DIMENSION(2)                              :: neighbours, pos
    1452             :       INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
    1453             :          lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
    1454             :       LOGICAL, DIMENSION(3)                              :: halo_swapped
    1455         868 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: recv_buf_3d_down, recv_buf_3d_up, &
    1456         868 :                                                             send_buf_3d_down, send_buf_3d_up
    1457        1736 :       TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:)    :: recv_bufs, send_bufs
    1458         868 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: recv_reqs, send_reqs
    1459        4340 :       TYPE(mp_request_type), DIMENSION(4)                :: req
    1460             : 
    1461         868 :       num_threads = 1
    1462         868 :       my_id = 0
    1463             : 
    1464         868 :       CALL rs_grid_zero(rs)
    1465             : 
    1466             :       ! This is the real redistribution
    1467             : 
    1468        3472 :       ALLOCATE (bounds(0:pw%pw_grid%para%group_size - 1, 1:4))
    1469             : 
    1470        2604 :       DO i = 0, pw%pw_grid%para%group_size - 1
    1471        5208 :          bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
    1472        5208 :          bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
    1473        5208 :          bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
    1474        6076 :          bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
    1475             :       END DO
    1476             : 
    1477        3472 :       ALLOCATE (send_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
    1478        2604 :       ALLOCATE (send_sizes(0:pw%pw_grid%para%group_size - 1))
    1479        2604 :       ALLOCATE (send_disps(0:pw%pw_grid%para%group_size - 1))
    1480        3472 :       ALLOCATE (recv_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
    1481        2604 :       ALLOCATE (recv_sizes(0:pw%pw_grid%para%group_size - 1))
    1482        2604 :       ALLOCATE (recv_disps(0:pw%pw_grid%para%group_size - 1))
    1483             : 
    1484       16492 :       send_tasks = 0
    1485        2604 :       send_tasks(:, 1) = 1
    1486        2604 :       send_tasks(:, 2) = 0
    1487        2604 :       send_tasks(:, 3) = 1
    1488        2604 :       send_tasks(:, 4) = 0
    1489        2604 :       send_tasks(:, 5) = 1
    1490        2604 :       send_tasks(:, 6) = 0
    1491        2604 :       send_sizes = 0
    1492             : 
    1493       16492 :       recv_tasks = 0
    1494        2604 :       recv_tasks(:, 1) = 1
    1495        2604 :       recv_tasks(:, 2) = 0
     1496        2604 :       recv_tasks(:, 3) = 1
     1497        2604 :       recv_tasks(:, 4) = 0
     1498        2604 :       recv_tasks(:, 5) = 1
     1499        2604 :       recv_tasks(:, 6) = 0
    1500        2604 :       recv_sizes = 0
    1501             : 
    1502         868 :       my_rs_rank = rs%desc%my_pos
    1503         868 :       my_pw_rank = pw%pw_grid%para%rs_mpo
    1504             : 
    1505             :       ! find the processors that should hold our data
    1506             :       ! should be part of the rs grid type
    1507             :       ! this is a loop over real ranks (i.e. the in-order cartesian ranks)
    1508             :       ! do the recv and send tasks in two separate loops which will
    1509             :       ! load balance better for OpenMP with large numbers of MPI tasks
    1510             : 
    1511             :       ! this is the reverse of rs2pw: what were the sends are now the recvs
    1512             : 
    1513             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1514             : !$OMP             PRIVATE(coords,idir,pos,lb_send,ub_send), &
    1515         868 : !$OMP             SHARED(rs,bounds,my_rs_rank,send_tasks,send_sizes,pw)
    1516             :       DO i = 0, pw%pw_grid%para%group_size - 1
    1517             : 
    1518             :          coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
     1519             :          ! calculate the real rs grid points on each processor
     1520             :          ! coords is the part of the grid that rank i actually holds
    1521             :          DO idir = 1, 3
    1522             :             pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
    1523             :             pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
    1524             :             lb_send(idir) = pos(1)
    1525             :             ub_send(idir) = pos(2)
    1526             :          END DO
    1527             : 
    1528             :          IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) CYCLE
    1529             :          IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) CYCLE
    1530             :          IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) CYCLE
    1531             :          IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) CYCLE
    1532             : 
    1533             :          send_tasks(i, 1) = MAX(lb_send(1), bounds(my_rs_rank, 1))
    1534             :          send_tasks(i, 2) = MIN(ub_send(1), bounds(my_rs_rank, 2))
    1535             :          send_tasks(i, 3) = MAX(lb_send(2), bounds(my_rs_rank, 3))
    1536             :          send_tasks(i, 4) = MIN(ub_send(2), bounds(my_rs_rank, 4))
    1537             :          send_tasks(i, 5) = lb_send(3)
    1538             :          send_tasks(i, 6) = ub_send(3)
    1539             :          send_sizes(i) = (send_tasks(i, 2) - send_tasks(i, 1) + 1)* &
    1540             :                          (send_tasks(i, 4) - send_tasks(i, 3) + 1)*(send_tasks(i, 6) - send_tasks(i, 5) + 1)
    1541             : 
    1542             :       END DO
    1543             : !$OMP END PARALLEL DO
    1544             : 
    1545        3472 :       coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
    1546        3472 :       DO idir = 1, 3
    1547        2604 :          pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
    1548        7812 :          pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
    1549        2604 :          lb_send(idir) = pos(1)
    1550        3472 :          ub_send(idir) = pos(2)
    1551             :       END DO
    1552             : 
    1553         868 :       lb_recv(:) = lb_send(:)
    1554         868 :       ub_recv(:) = ub_send(:)
    1555             : 
    1556             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1557         868 : !$OMP             SHARED(pw,lb_send,ub_send,bounds,recv_tasks,recv_sizes)
    1558             :       DO j = 0, pw%pw_grid%para%group_size - 1
    1559             : 
    1560             :          IF (ub_send(1) .LT. bounds(j, 1)) CYCLE
    1561             :          IF (lb_send(1) .GT. bounds(j, 2)) CYCLE
    1562             :          IF (ub_send(2) .LT. bounds(j, 3)) CYCLE
    1563             :          IF (lb_send(2) .GT. bounds(j, 4)) CYCLE
    1564             : 
    1565             :          recv_tasks(j, 1) = MAX(lb_send(1), bounds(j, 1))
    1566             :          recv_tasks(j, 2) = MIN(ub_send(1), bounds(j, 2))
    1567             :          recv_tasks(j, 3) = MAX(lb_send(2), bounds(j, 3))
    1568             :          recv_tasks(j, 4) = MIN(ub_send(2), bounds(j, 4))
    1569             :          recv_tasks(j, 5) = lb_send(3)
    1570             :          recv_tasks(j, 6) = ub_send(3)
    1571             :          recv_sizes(j) = (recv_tasks(j, 2) - recv_tasks(j, 1) + 1)* &
    1572             :                          (recv_tasks(j, 4) - recv_tasks(j, 3) + 1)*(recv_tasks(j, 6) - recv_tasks(j, 5) + 1)
    1573             : 
    1574             :       END DO
    1575             : !$OMP END PARALLEL DO
    1576             : 
    1577         868 :       send_disps(0) = 0
    1578         868 :       recv_disps(0) = 0
    1579        1736 :       DO i = 1, pw%pw_grid%para%group_size - 1
    1580         868 :          send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
    1581        1736 :          recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
    1582             :       END DO
    1583             : 
    1584        6076 :       CPASSERT(SUM(recv_sizes) == PRODUCT(ub_recv - lb_recv + 1))
    1585             : 
    1586        4340 :       ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
    1587        5208 :       ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
    1588             : 
    1589        2604 :       DO i = 0, rs%desc%group_size - 1
    1590        1736 :          IF (send_sizes(i) .NE. 0) THEN
    1591        4962 :             ALLOCATE (send_bufs(i)%array(send_sizes(i)))
    1592             :          ELSE
    1593          82 :             NULLIFY (send_bufs(i)%array)
    1594             :          END IF
    1595        2604 :          IF (recv_sizes(i) .NE. 0) THEN
    1596        4962 :             ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
    1597             :          ELSE
    1598          82 :             NULLIFY (recv_bufs(i)%array)
    1599             :          END IF
    1600             :       END DO
    1601             : 
    1602        4340 :       ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
    1603        2604 :       recv_reqs = mp_request_null
    1604             : 
    1605        2604 :       DO i = 0, rs%desc%group_size - 1
    1606        2604 :          IF (recv_sizes(i) .NE. 0) THEN
    1607        1654 :             CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
    1608             :          END IF
    1609             :       END DO
    1610             : 
    1611             :       ! do packing
    1612             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1613             : !$OMP             PRIVATE(k,z,y,x), &
    1614         868 : !$OMP             SHARED(pw,rs,send_tasks,send_bufs,send_disps)
    1615             :       DO i = 0, rs%desc%group_size - 1
    1616             :          k = 0
    1617             :          DO z = send_tasks(i, 5), send_tasks(i, 6)
    1618             :             DO y = send_tasks(i, 3), send_tasks(i, 4)
    1619             :                DO x = send_tasks(i, 1), send_tasks(i, 2)
    1620             :                   k = k + 1
    1621             :                   send_bufs(i)%array(k) = pw%array(x, y, z)
    1622             :                END DO
    1623             :             END DO
    1624             :          END DO
    1625             :       END DO
    1626             : !$OMP END PARALLEL DO
    1627             : 
    1628        4340 :       ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
    1629        2604 :       send_reqs = mp_request_null
    1630             : 
    1631        2604 :       DO i = 0, rs%desc%group_size - 1
    1632        2604 :          IF (send_sizes(i) .NE. 0) THEN
    1633        1654 :             CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
    1634             :          END IF
    1635             :       END DO
    1636             : 
    1637             :       ! do unpacking
    1638             :       ! no OMP here so we can unpack each message as it arrives
    1639             : 
    1640        2604 :       DO i = 0, rs%desc%group_size - 1
    1641        1736 :          IF (recv_sizes(i) .EQ. 0) CYCLE
    1642             : 
    1643        1654 :          CALL mp_waitany(recv_reqs, completed)
    1644        1654 :          k = 0
    1645       66400 :          DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
    1646     4768332 :             DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
    1647   195756703 :                DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
    1648   190990107 :                   k = k + 1
    1649   195692825 :                   rs%r(x, y, z) = recv_bufs(completed - 1)%array(k)
    1650             :                END DO
    1651             :             END DO
    1652             :          END DO
    1653             :       END DO
    1654             : 
    1655         868 :       CALL mp_waitall(send_reqs)
    1656             : 
    1657         868 :       DEALLOCATE (recv_reqs)
    1658         868 :       DEALLOCATE (send_reqs)
    1659             : 
    1660        2604 :       DO i = 0, rs%desc%group_size - 1
    1661        1736 :          IF (ASSOCIATED(send_bufs(i)%array)) THEN
    1662        1654 :             DEALLOCATE (send_bufs(i)%array)
    1663             :          END IF
    1664        2604 :          IF (ASSOCIATED(recv_bufs(i)%array)) THEN
    1665        1654 :             DEALLOCATE (recv_bufs(i)%array)
    1666             :          END IF
    1667             :       END DO
    1668             : 
    1669         868 :       DEALLOCATE (send_bufs)
    1670         868 :       DEALLOCATE (recv_bufs)
    1671         868 :       DEALLOCATE (send_tasks)
    1672         868 :       DEALLOCATE (send_sizes)
    1673         868 :       DEALLOCATE (send_disps)
    1674         868 :       DEALLOCATE (recv_tasks)
    1675         868 :       DEALLOCATE (recv_sizes)
    1676         868 :       DEALLOCATE (recv_disps)
    1677             : 
     1678             :       ! now pass the halo regions ("wings") around
    1679         868 :       halo_swapped = .FALSE.
    1680             : 
    1681        3472 :       DO idir = 1, 3
    1682             : 
    1683        2604 :          IF (rs%desc%perd(idir) /= 1) THEN
    1684             : 
    1685        5532 :             ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
    1686        5532 :             ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
    1687        5532 :             ushifts = 0
    1688        5532 :             dshifts = 0
    1689             : 
    1690        3688 :             DO n_shifts = 1, rs%desc%neighbours(idir)
    1691             : 
    1692             :                ! need to take into account the possible varying widths of neighbouring cells
    1693             :                ! ushifts and dshifts hold the real size of the neighbouring cells
    1694             : 
    1695        1844 :                position = MODULO(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
    1696        1844 :                neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
    1697        1844 :                dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
    1698             : 
    1699        1844 :                position = MODULO(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
    1700        1844 :                neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
    1701        1844 :                ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
    1702             : 
     1703             :                ! The border data has to be sent to / received from the neighbours
    1704             :                ! First we calculate the source and destination processes for the shift
    1705             :                ! The first shift is "downwards"
    1706             : 
    1707        1844 :                CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
    1708             : 
    1709        7376 :                lb_send_down(:) = rs%lb_local(:)
    1710        7376 :                ub_send_down(:) = rs%ub_local(:)
    1711        7376 :                lb_recv_down(:) = rs%lb_local(:)
    1712        7376 :                ub_recv_down(:) = rs%ub_local(:)
    1713             : 
    1714        1844 :                IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
    1715        1844 :                   lb_send_down(idir) = lb_send_down(idir) + rs%desc%border
    1716             :                   ub_send_down(idir) = MIN(ub_send_down(idir) - rs%desc%border, &
    1717        1844 :                                            lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
    1718             : 
    1719        1844 :                   lb_recv_down(idir) = ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
    1720             :                   ub_recv_down(idir) = MIN(ub_recv_down(idir), &
    1721        1844 :                                            ub_recv_down(idir) - rs%desc%border + ushifts(n_shifts))
    1722             :                ELSE
    1723           0 :                   lb_send_down(idir) = 0
    1724           0 :                   ub_send_down(idir) = -1
    1725           0 :                   lb_recv_down(idir) = 0
    1726           0 :                   ub_recv_down(idir) = -1
    1727             :                END IF
    1728             : 
    1729        7376 :                DO i = 1, 3
    1730        7376 :                   IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
    1731        1548 :                      lb_send_down(i) = rs%lb_real(i)
    1732        1548 :                      ub_send_down(i) = rs%ub_real(i)
    1733        1548 :                      lb_recv_down(i) = rs%lb_real(i)
    1734        1548 :                      ub_recv_down(i) = rs%ub_real(i)
    1735             :                   END IF
    1736             :                END DO
    1737             : 
    1738             :                ! allocate the recv buffer
    1739        7376 :                nn = PRODUCT(ub_recv_down - lb_recv_down + 1)
    1740           0 :                ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
    1741        9220 :                                           lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
    1742             : 
    1743             :                ! recv buffer is now ready, so post the receive
    1744        1844 :                CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
    1745             : 
    1746             :                ! now allocate, pack and send the send buffer
    1747        7376 :                nn = PRODUCT(ub_send_down - lb_send_down + 1)
    1748           0 :                ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
    1749        9220 :                                           lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
    1750             : 
    1751             : !$OMP PARALLEL DEFAULT(NONE), &
    1752             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1753        1844 : !$OMP          SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
    1754             : !$             num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
    1755             : !$             my_id = omp_get_thread_num()
    1756             :                IF (my_id < num_threads) THEN
    1757             :                   lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
    1758             :                   ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
    1759             : 
    1760             :                   send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
    1761             :                                    lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
    1762             :                                                  lb_send_down(2):ub_send_down(2), lb:ub)
    1763             :                END IF
    1764             : !$OMP END PARALLEL
    1765             : 
    1766        1844 :                CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
    1767             : 
    1768             :                ! Now for the other direction
    1769             : 
    1770        1844 :                CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
    1771             : 
    1772        7376 :                lb_send_up(:) = rs%lb_local(:)
    1773        7376 :                ub_send_up(:) = rs%ub_local(:)
    1774        7376 :                lb_recv_up(:) = rs%lb_local(:)
    1775        7376 :                ub_recv_up(:) = rs%ub_local(:)
    1776             : 
    1777        1844 :                IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
    1778        1844 :                   ub_send_up(idir) = ub_send_up(idir) - rs%desc%border
    1779             :                   lb_send_up(idir) = MAX(lb_send_up(idir) + rs%desc%border, &
    1780        1844 :                                          ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
    1781             : 
    1782        1844 :                   ub_recv_up(idir) = lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
    1783             :                   lb_recv_up(idir) = MAX(lb_recv_up(idir), &
    1784        1844 :                                          lb_recv_up(idir) + rs%desc%border - dshifts(n_shifts))
    1785             :                ELSE
    1786           0 :                   lb_send_up(idir) = 0
    1787           0 :                   ub_send_up(idir) = -1
    1788           0 :                   lb_recv_up(idir) = 0
    1789           0 :                   ub_recv_up(idir) = -1
    1790             :                END IF
    1791             : 
    1792        7376 :                DO i = 1, 3
    1793        7376 :                   IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
    1794        1548 :                      lb_send_up(i) = rs%lb_real(i)
    1795        1548 :                      ub_send_up(i) = rs%ub_real(i)
    1796        1548 :                      lb_recv_up(i) = rs%lb_real(i)
    1797        1548 :                      ub_recv_up(i) = rs%ub_real(i)
    1798             :                   END IF
    1799             :                END DO
    1800             : 
    1801             :                ! allocate the recv buffer
    1802        7376 :                nn = PRODUCT(ub_recv_up - lb_recv_up + 1)
    1803           0 :                ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
    1804        9220 :                                         lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
    1805             : 
    1806             :                ! recv buffer is now ready, so post the receive
    1807             : 
    1808        1844 :                CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
    1809             : 
    1810             :                ! now allocate, pack and send the send buffer
    1811        7376 :                nn = PRODUCT(ub_send_up - lb_send_up + 1)
    1812           0 :                ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
    1813        9220 :                                         lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
    1814             : 
    1815             : !$OMP PARALLEL DEFAULT(NONE), &
    1816             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1817        1844 : !$OMP          SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
    1818             : !$             num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
    1819             : !$             my_id = omp_get_thread_num()
    1820             :                IF (my_id < num_threads) THEN
    1821             :                   lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
    1822             :                   ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
    1823             : 
    1824             :                   send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
    1825             :                                  lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
    1826             :                                                lb_send_up(2):ub_send_up(2), lb:ub)
    1827             :                END IF
    1828             : !$OMP END PARALLEL
    1829             : 
    1830        1844 :                CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
    1831             : 
    1832             :                ! wait for a recv to complete, then we can unpack
    1833             : 
    1834        5532 :                DO i = 1, 2
    1835             : 
    1836        3688 :                   CALL mp_waitany(req(1:2), completed)
    1837             : 
    1838        5532 :                   IF (completed .EQ. 1) THEN
    1839             : 
    1840             :                      ! only some procs may need later shifts
    1841        1844 :                      IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
    1842             : 
    1843             :                         ! Copy the received data into the rs grid
    1844             : !$OMP PARALLEL DEFAULT(NONE), &
    1845             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1846        1844 : !$OMP          SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
    1847             : !$                      num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
    1848             : !$                      my_id = omp_get_thread_num()
    1849             :                         IF (my_id < num_threads) THEN
    1850             :                            lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
    1851             :                            ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
    1852             : 
    1853             :                            rs%r(lb_recv_down(1):ub_recv_down(1), lb_recv_down(2):ub_recv_down(2), &
    1854             :                                 lb:ub) = recv_buf_3d_down(:, :, lb:ub)
    1855             :                         END IF
    1856             : !$OMP END PARALLEL
    1857             :                      END IF
    1858             : 
    1859        1844 :                      DEALLOCATE (recv_buf_3d_down)
    1860             :                   ELSE
    1861             : 
    1862             :                      ! only some procs may need later shifts
    1863        1844 :                      IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
    1864             : 
    1865             :                         ! Copy the received data into the rs grid
    1866             : !$OMP PARALLEL DEFAULT(NONE), &
    1867             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1868        1844 : !$OMP          SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
    1869             : !$                      num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
    1870             : !$                      my_id = omp_get_thread_num()
    1871             :                         IF (my_id < num_threads) THEN
    1872             :                            lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
    1873             :                            ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
    1874             : 
    1875             :                            rs%r(lb_recv_up(1):ub_recv_up(1), lb_recv_up(2):ub_recv_up(2), &
    1876             :                                 lb:ub) = recv_buf_3d_up(:, :, lb:ub)
    1877             :                         END IF
    1878             : !$OMP END PARALLEL
    1879             :                      END IF
    1880             : 
    1881        1844 :                      DEALLOCATE (recv_buf_3d_up)
    1882             :                   END IF
    1883             :                END DO
    1884             : 
    1885        1844 :                CALL mp_waitall(req(3:4))
    1886             : 
    1887        1844 :                DEALLOCATE (send_buf_3d_down)
    1888        5532 :                DEALLOCATE (send_buf_3d_up)
    1889             :             END DO
    1890             : 
    1891        1844 :             DEALLOCATE (ushifts)
    1892        1844 :             DEALLOCATE (dshifts)
    1893             :          END IF
    1894             : 
    1895        3472 :          halo_swapped(idir) = .TRUE.
    1896             : 
    1897             :       END DO
    1898             : 
    1899         868 :    END SUBROUTINE transfer_pw2rs_distributed
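
! **************************************************************************************************
!> \note
!>      Editor's worked example (values assumed for illustration only, not part of the original
!>      module) of the dshifts/ushifts bookkeeping in transfer_pw2rs_distributed above.
!>      Suppose npts(idir) = 10 grid points are split over group_dim(idir) = 3 processes and
!>      get_limit assigns 4, 3 and 3 points to virtual coordinates 0, 1 and 2, with a halo
!>      border of width 2. For the process at virtual coordinate 1 and n_shifts = 1:
!>         downward neighbour at MODULO(1 - 1, 3) = 0 owns 4 points  ->  dshifts(1) = 0 + 4 = 4
!>         upward   neighbour at MODULO(1 + 1, 3) = 2 owns 3 points  ->  ushifts(1) = 0 + 3 = 3
!>      Because dshifts(0) = 0 <= border, the first downward exchange carries data; because
!>      dshifts(1) = 4 already exceeds the border of 2, a second downward shift (if the descriptor
!>      requested one) would use the empty bounds lb = 0, ub = -1 and exchange nothing.
! **************************************************************************************************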
    1900             : 
    1901             : ! **************************************************************************************************
    1902             : !> \brief Initialize grid to zero
    1903             : !> \param rs ...
    1904             : !> \par History
    1905             : !>      none
    1906             : !> \author JGH (23-Mar-2002)
    1907             : ! **************************************************************************************************
    1908      326052 :    SUBROUTINE rs_grid_zero(rs)
    1909             : 
    1910             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
    1911             : 
    1912             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'rs_grid_zero'
    1913             : 
    1914             :       INTEGER                                            :: handle, i, j, k, l(3), u(3)
    1915             : 
    1916      326052 :       CALL timeset(routineN, handle)
    1917      978156 :       l(1) = LBOUND(rs%r, 1); l(2) = LBOUND(rs%r, 2); l(3) = LBOUND(rs%r, 3)
    1918      978156 :       u(1) = UBOUND(rs%r, 1); u(2) = UBOUND(rs%r, 2); u(3) = UBOUND(rs%r, 3)
    1919             : !$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
    1920             : !$OMP             PRIVATE(i,j,k) &
    1921      326052 : !$OMP             SHARED(rs,l,u)
    1922             :       DO k = l(3), u(3)
    1923             :       DO j = l(2), u(2)
    1924             :       DO i = l(1), u(1)
    1925             :          rs%r(i, j, k) = 0.0_dp
    1926             :       END DO
    1927             :       END DO
    1928             :       END DO
    1929             : !$OMP END PARALLEL DO
    1930      326052 :       CALL timestop(handle)
    1931             : 
    1932      326052 :    END SUBROUTINE rs_grid_zero
    1933             : 
    1934             : ! **************************************************************************************************
    1935             : !> \brief rs1(i) = rs1(i) + scalar*rs2(i)*rs3(i)
    1936             : !> \param rs1 ...
    1937             : !> \param rs2 ...
    1938             : !> \param rs3 ...
    1939             : !> \param scalar ...
    1940             : !> \par History
    1941             : !>      none
    1942             : !> \author
    1943             : ! **************************************************************************************************
    1944        1404 :    SUBROUTINE rs_grid_mult_and_add(rs1, rs2, rs3, scalar)
    1945             : 
    1946             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs1, rs2, rs3
    1947             :       REAL(dp), INTENT(IN)                               :: scalar
    1948             : 
    1949             :       CHARACTER(len=*), PARAMETER :: routineN = 'rs_grid_mult_and_add'
    1950             : 
    1951             :       INTEGER                                            :: handle, i, j, k, l(3), u(3)
    1952             : 
    1953             : !-----------------------------------------------------------------------------!
    1954             : 
    1955        1404 :       CALL timeset(routineN, handle)
    1956        1404 :       IF (scalar .NE. 0.0_dp) THEN
    1957        4212 :          l(1) = LBOUND(rs1%r, 1); l(2) = LBOUND(rs1%r, 2); l(3) = LBOUND(rs1%r, 3)
    1958        4212 :          u(1) = UBOUND(rs1%r, 1); u(2) = UBOUND(rs1%r, 2); u(3) = UBOUND(rs1%r, 3)
    1959             : !$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
    1960             : !$OMP             PRIVATE(i,j,k) &
    1961        1404 : !$OMP             SHARED(rs1,rs2,rs3,scalar,l,u)
    1962             :          DO k = l(3), u(3)
    1963             :          DO j = l(2), u(2)
    1964             :          DO i = l(1), u(1)
    1965             :             rs1%r(i, j, k) = rs1%r(i, j, k) + scalar*rs2%r(i, j, k)*rs3%r(i, j, k)
    1966             :          END DO
    1967             :          END DO
    1968             :          END DO
    1969             : !$OMP END PARALLEL DO
    1970             :       END IF
    1971        1404 :       CALL timestop(handle)
    1972        1404 :    END SUBROUTINE rs_grid_mult_and_add
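
! **************************************************************************************************
!> \brief Editor's illustrative usage sketch (not part of the original module): zero a grid and
!>        accumulate scalar*rs2*rs3 into it using the two routines above. It assumes rs1, rs2 and
!>        rs3 were created from the same descriptor, so their local bounds coincide.
!> \param rs1 grid that is zeroed and then accumulated into
!> \param rs2 ...
!> \param rs3 ...
!> \param scalar ...
! **************************************************************************************************
   SUBROUTINE rs_grid_zero_and_accumulate_sketch(rs1, rs2, rs3, scalar)

      TYPE(realspace_grid_type), INTENT(IN)              :: rs1, rs2, rs3
      REAL(dp), INTENT(IN)                               :: scalar

      ! wipe the target grid, then add the pointwise product rs2*rs3 scaled by "scalar"
      CALL rs_grid_zero(rs1)
      CALL rs_grid_mult_and_add(rs1, rs2, rs3, scalar)

   END SUBROUTINE rs_grid_zero_and_accumulate_sketch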
    1973             : 
    1974             : ! **************************************************************************************************
    1975             : !> \brief Set box matrix info for real space grid
    1976             : !>      This is needed for variable cell simulations
    1977             : !> \param pw_grid ...
    1978             : !> \param rs ...
    1979             : !> \par History
    1980             : !>      none
    1981             : !> \author JGH (15-May-2007)
    1982             : ! **************************************************************************************************
    1983      166222 :    SUBROUTINE rs_grid_set_box(pw_grid, rs)
    1984             : 
    1985             :       TYPE(pw_grid_type), INTENT(IN), TARGET             :: pw_grid
    1986             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
    1987             : 
    1988      166222 :       CPASSERT(ASSOCIATED(rs%desc%pw, pw_grid))
    1989     2160886 :       rs%desc%dh = pw_grid%dh
    1990     2160886 :       rs%desc%dh_inv = pw_grid%dh_inv
    1991             : 
    1992      166222 :    END SUBROUTINE rs_grid_set_box
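
! Editor's usage note: in a variable-cell run, once the underlying pw_grid has been updated for the
! new cell, the matching realspace grid is refreshed with
!    CALL rs_grid_set_box(pw_grid, rs=rs)
! which copies dh and dh_inv; the CPASSERT in rs_grid_set_box requires that rs was created from this
! pw_grid (rs%desc%pw is associated with it).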
    1993             : 
    1994             : ! **************************************************************************************************
    1995             : !> \brief retains the given rs grid descriptor (see doc/ReferenceCounting.html)
    1996             : !> \param rs_desc the grid descriptor to retain
    1997             : !> \par History
    1998             : !>      04.2009 created [Iain Bethune]
    1999             : !>        (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
    2000             : ! **************************************************************************************************
    2001      225574 :    SUBROUTINE rs_grid_retain_descriptor(rs_desc)
    2002             :       TYPE(realspace_grid_desc_type), INTENT(INOUT)      :: rs_desc
    2003             : 
    2004      225574 :       CPASSERT(rs_desc%ref_count > 0)
    2005      225574 :       rs_desc%ref_count = rs_desc%ref_count + 1
    2006      225574 :    END SUBROUTINE rs_grid_retain_descriptor
    2007             : 
    2008             : ! **************************************************************************************************
    2009             : !> \brief releases the given rs grid (see doc/ReferenceCounting.html)
    2010             : !> \param rs_grid the rs grid to release
    2011             : !> \par History
    2012             : !>      03.2003 created [fawzi]
    2013             : !> \author fawzi
    2014             : ! **************************************************************************************************
    2015      225024 :    SUBROUTINE rs_grid_release(rs_grid)
    2016             :       TYPE(realspace_grid_type), INTENT(INOUT)           :: rs_grid
    2017             : 
    2018      225024 :       CALL rs_grid_release_descriptor(rs_grid%desc)
    2019             : 
    2020      225024 :       CALL offload_free_buffer(rs_grid%buffer)
    2021      225024 :       NULLIFY (rs_grid%r)
    2022             : 
    2023      225024 :       IF (ALLOCATED(rs_grid%px)) DEALLOCATE (rs_grid%px)
    2024      225024 :       IF (ALLOCATED(rs_grid%py)) DEALLOCATE (rs_grid%py)
    2025      225024 :       IF (ALLOCATED(rs_grid%pz)) DEALLOCATE (rs_grid%pz)
    2026      225024 :    END SUBROUTINE rs_grid_release
    2027             : 
    2028             : ! **************************************************************************************************
    2029             : !> \brief releases the given rs grid descriptor (see doc/ReferenceCounting.html)
    2030             : !> \param rs_desc the rs grid descriptor to release
    2031             : !> \par History
    2032             : !>      04.2009 created [Iain Bethune]
    2033             : !>        (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
    2034             : ! **************************************************************************************************
    2035      259489 :    SUBROUTINE rs_grid_release_descriptor(rs_desc)
    2036             :       TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
    2037             : 
    2038      259489 :       IF (ASSOCIATED(rs_desc)) THEN
    2039      255870 :          CPASSERT(rs_desc%ref_count > 0)
    2040      255870 :          rs_desc%ref_count = rs_desc%ref_count - 1
    2041      255870 :          IF (rs_desc%ref_count == 0) THEN
    2042             : 
    2043       30296 :             CALL pw_grid_release(rs_desc%pw)
    2044             : 
    2045       30296 :             IF (rs_desc%parallel) THEN
    2046             :                ! release the group communicator
    2047       26390 :                CALL rs_desc%group%free()
    2048             : 
    2049       26390 :                DEALLOCATE (rs_desc%virtual2real)
    2050       26390 :                DEALLOCATE (rs_desc%real2virtual)
    2051             :             END IF
    2052             : 
    2053       30296 :             IF (rs_desc%distributed) THEN
    2054         158 :                DEALLOCATE (rs_desc%rank2coord)
    2055         158 :                DEALLOCATE (rs_desc%coord2rank)
    2056         158 :                DEALLOCATE (rs_desc%lb_global)
    2057         158 :                DEALLOCATE (rs_desc%ub_global)
    2058         158 :                DEALLOCATE (rs_desc%x2coord)
    2059         158 :                DEALLOCATE (rs_desc%y2coord)
    2060         158 :                DEALLOCATE (rs_desc%z2coord)
    2061             :             END IF
    2062             : 
    2063       30296 :             DEALLOCATE (rs_desc)
    2064             :          END IF
    2065             :       END IF
    2066      259489 :       NULLIFY (rs_desc)
    2067      259489 :    END SUBROUTINE rs_grid_release_descriptor
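
! **************************************************************************************************
!> \brief Editor's illustrative sketch (not part of the original module) of the reference-counting
!>        pattern used by the retain/release routines above (see doc/ReferenceCounting.html):
!>        every extra reference taken with rs_grid_retain_descriptor must be matched by one
!>        rs_grid_release_descriptor; the descriptor is deallocated only when ref_count reaches 0.
!> \param desc_in an associated descriptor with ref_count > 0 (assumption)
!> \param desc_out second pointer that temporarily shares ownership of the descriptor
! **************************************************************************************************
   SUBROUTINE rs_grid_descriptor_share_sketch(desc_in, desc_out)

      TYPE(realspace_grid_desc_type), POINTER            :: desc_in, desc_out

      ! take an additional reference before storing a second pointer to the descriptor ...
      desc_out => desc_in
      CALL rs_grid_retain_descriptor(desc_out)

      ! ... and give it back when the second owner is done; this nullifies desc_out and only
      ! deallocates the descriptor if no other reference remains
      CALL rs_grid_release_descriptor(desc_out)

   END SUBROUTINE rs_grid_descriptor_share_sketch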
    2068             : 
    2069             : ! **************************************************************************************************
    2070             : !> \brief emulates the function of an MPI_cart_shift operation, but the shift is
    2071             : !>        done in virtual coordinates, and the corresponding real ranks are returned
    2072             : !> \param rs_grid ...
    2073             : !> \param dir ...
    2074             : !> \param disp ...
    2075             : !> \param source ...
    2076             : !> \param dest ...
    2077             : !> \par History
    2078             : !>      04.2009 created [Iain Bethune]
    2079             : !>        (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
    2080             : ! **************************************************************************************************
    2081        7368 :    PURE SUBROUTINE cart_shift(rs_grid, dir, disp, source, dest)
    2082             : 
    2083             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs_grid
    2084             :       INTEGER, INTENT(IN)                                :: dir, disp
    2085             :       INTEGER, INTENT(OUT)                               :: source, dest
    2086             : 
    2087             :       INTEGER, DIMENSION(3)                              :: shift_coords
    2088             : 
    2089       29472 :       shift_coords = rs_grid%desc%virtual_group_coor
    2090        7368 :       shift_coords(dir) = MODULO(shift_coords(dir) + disp, rs_grid%desc%group_dim(dir))
    2091        7368 :       dest = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
    2092       29472 :       shift_coords = rs_grid%desc%virtual_group_coor
    2093        7368 :       shift_coords(dir) = MODULO(shift_coords(dir) - disp, rs_grid%desc%group_dim(dir))
    2094        7368 :       source = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
    2095             : 
    2096        7368 :    END SUBROUTINE
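
! Editor's worked example (values assumed for illustration): with group_dim(dir) = 4 and the calling
! process at virtual coordinate 1 along dir, a shift of disp = 1 gives
!    dest   at virtual coordinate MODULO(1 + 1, 4) = 2
!    source at virtual coordinate MODULO(1 - 1, 4) = 0
! while disp = 3 wraps periodically to MODULO(1 + 3, 4) = 0 and MODULO(1 - 3, 4) = 2. The virtual
! coordinates are then translated to real MPI ranks through coord2rank and virtual2real.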
    2097             : 
    2098             : ! **************************************************************************************************
    2099             : !> \brief returns the maximum number of points in the local grid of any process
    2100             : !>        to account for the case where the grid may later be reordered
    2101             : !> \param desc ...
    2102             : !> \return ...
    2103             : !> \par History
    2104             : !>      10.2011 created [Iain Bethune]
    2105             : ! **************************************************************************************************
    2106           0 :    FUNCTION rs_grid_max_ngpts(desc) RESULT(max_ngpts)
    2107             :       TYPE(realspace_grid_desc_type), INTENT(IN)         :: desc
    2108             :       INTEGER                                            :: max_ngpts
    2109             : 
    2110             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'rs_grid_max_ngpts'
    2111             : 
    2112             :       INTEGER                                            :: handle, i
    2113             :       INTEGER, DIMENSION(3)                              :: lb, ub
    2114             : 
    2115           0 :       CALL timeset(routineN, handle)
    2116             : 
    2117           0 :       max_ngpts = 0
    2118           0 :       IF ((desc%pw%para%mode == PW_MODE_LOCAL) .OR. &
    2119             :           (ALL(desc%group_dim == 1))) THEN
    2120           0 :          CPASSERT(PRODUCT(INT(desc%npts, KIND=int_8)) < HUGE(1))
    2121           0 :          max_ngpts = PRODUCT(desc%npts)
    2122             :       ELSE
    2123           0 :          DO i = 0, desc%group_size - 1
    2124           0 :             lb = desc%lb_global(:, i)
    2125           0 :             ub = desc%ub_global(:, i)
    2126           0 :             lb = lb - desc%border*(1 - desc%perd)
    2127           0 :             ub = ub + desc%border*(1 - desc%perd)
    2128           0 :             CPASSERT(PRODUCT(INT(ub - lb + 1, KIND=int_8)) < HUGE(1))
    2129           0 :             max_ngpts = MAX(max_ngpts, PRODUCT(ub - lb + 1))
    2130             :          END DO
    2131             :       END IF
    2132             : 
    2133           0 :       CALL timestop(handle)
    2134             : 
    2135           0 :    END FUNCTION rs_grid_max_ngpts
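
! Editor's worked example (numbers assumed for illustration): for a distributed grid where one
! process owns lb_global(:,i) = (1,1,1) and ub_global(:,i) = (20,20,10), with border = 2 and
! perd = (1,1,0), only the non-periodic z direction is padded:
!    lb = (1,1,1) - 2*(0,0,1) = (1,1,-1)   and   ub = (20,20,10) + 2*(0,0,1) = (20,20,12)
! i.e. 20*20*14 = 5600 points for this process; max_ngpts is the maximum of this count over all
! processes.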
    2136             : 
    2137             : ! **************************************************************************************************
    2138             : !> \brief ...
    2139             : !> \param rs_grid ...
    2140             : !> \param h_inv ...
    2141             : !> \param ra ...
    2142             : !> \param offset ...
    2143             : !> \param group_size ...
    2144             : !> \param my_pos ...
    2145             : !> \return ...
    2146             : ! **************************************************************************************************
    2147     1500159 :    PURE LOGICAL FUNCTION map_gaussian_here(rs_grid, h_inv, ra, offset, group_size, my_pos) RESULT(res)
    2148             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs_grid
    2149             :       REAL(KIND=dp), DIMENSION(3, 3), INTENT(IN)         :: h_inv
    2150             :       REAL(KIND=dp), DIMENSION(3), INTENT(IN)            :: ra
    2151             :       INTEGER, INTENT(IN), OPTIONAL                      :: offset, group_size, my_pos
    2152             : 
    2153             :       INTEGER                                            :: dir, lb(3), location(3), tp(3), ub(3)
    2154             : 
    2155     1500159 :       res = .FALSE.
    2156             : 
    2157     6000628 :       IF (.NOT. ALL(rs_grid%desc%perd == 1)) THEN
    2158          32 :          DO dir = 1, 3
    2159             :             ! bounds of the local grid, with the halo 'wings' removed for non-periodic (distributed) directions
    2160          96 :             tp(dir) = FLOOR(DOT_PRODUCT(h_inv(dir, :), ra)*rs_grid%desc%npts(dir))
    2161          24 :             tp(dir) = MODULO(tp(dir), rs_grid%desc%npts(dir))
    2162          24 :             IF (rs_grid%desc%perd(dir) .NE. 1) THEN
    2163           8 :                lb(dir) = rs_grid%lb_local(dir) + rs_grid%desc%border
    2164           8 :                ub(dir) = rs_grid%ub_local(dir) - rs_grid%desc%border
    2165             :             ELSE
    2166          16 :                lb(dir) = rs_grid%lb_local(dir)
    2167          16 :                ub(dir) = rs_grid%ub_local(dir)
    2168             :             END IF
    2169             :             ! global grid index of the Gaussian centre; it is mapped only if it falls inside the local grid
    2170          32 :             location(dir) = tp(dir) + rs_grid%desc%lb(dir)
    2171             :          END DO
    2172          60 :          IF (ALL(lb(:) <= location(:)) .AND. ALL(location(:) <= ub(:))) THEN
    2173           4 :             res = .TRUE.
    2174             :          END IF
    2175             :       ELSE
    2176     1500151 :          IF (PRESENT(offset) .AND. PRESENT(group_size) .AND. PRESENT(my_pos)) THEN
    2177             :             ! not distributed, just a round-robin distribution over the full set of CPUs
    2178     1500151 :             IF (MODULO(offset, group_size) == my_pos) res = .TRUE.
    2179             :          END IF
    2180             :       END IF
    2181             : 
    2182     1500159 :    END FUNCTION map_gaussian_here
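
! Editor's illustrative note (numbers assumed): in the replicated branch above the mapping work is
! distributed round-robin over the ranks, e.g. with group_size = 4 and my_pos = 2 this rank maps the
! Gaussians with offset = 2, 6, 10, ...; in the distributed branch a Gaussian is mapped only if its
! centre falls inside this rank's local grid (with the halo excluded in non-periodic directions).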
    2183             : 
    2184           0 : END MODULE realspace_grid_types

Generated by: LCOV version 1.15