Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \note
10 : !> Basic type for real space grid methods
11 : !> \par History
12 : !> JGH (22-May-2002) : New routine rs_grid_zero
13 : !> JGH (12-Jun-2002) : Bug fix for mpi groups
14 : !> JGH (19-Jun-2003) : Added routine for task distribution
15 : !> JGH (23-Nov-2003) : Added routine for task loop separation
16 : !> \author JGH (18-Mar-2001)
17 : ! **************************************************************************************************
18 : MODULE realspace_grid_types
19 : USE cp_array_utils, ONLY: cp_1d_r_p_type
20 : USE cp_log_handling, ONLY: cp_to_string
21 : USE kahan_sum, ONLY: accurate_sum
22 : USE kinds, ONLY: dp,&
23 : int_8
24 : USE machine, ONLY: m_memory
25 : USE mathlib, ONLY: det_3x3
26 : USE message_passing, ONLY: mp_comm_null,&
27 : mp_comm_type,&
28 : mp_request_null,&
29 : mp_request_type,&
30 : mp_waitall,&
31 : mp_waitany
32 : USE offload_api, ONLY: offload_buffer_type,&
33 : offload_create_buffer,&
34 : offload_free_buffer
35 : USE pw_grid_types, ONLY: PW_MODE_LOCAL,&
36 : pw_grid_type
37 : USE pw_grids, ONLY: pw_grid_release,&
38 : pw_grid_retain
39 : USE pw_methods, ONLY: pw_integrate_function
40 : USE pw_types, ONLY: pw_r3d_rs_type
41 : USE util, ONLY: get_limit
42 :
43 : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
44 :
45 : #include "../base/base_uses.f90"
46 :
47 : IMPLICIT NONE
48 :
49 : PRIVATE
50 : PUBLIC :: realspace_grid_type, &
51 : realspace_grid_desc_type, &
52 : realspace_grid_p_type, &
53 : realspace_grid_desc_p_type, &
54 : realspace_grid_input_type
55 :
56 : PUBLIC :: transfer_rs2pw, &
57 : transfer_pw2rs, &
58 : rs_grid_zero, &
59 : rs_grid_set_box, &
60 : rs_grid_create, &
61 : rs_grid_create_descriptor, &
62 : rs_grid_retain_descriptor, &
63 : rs_grid_release, &
64 : rs_grid_release_descriptor, &
65 : rs_grid_reorder_ranks, &
66 : rs_grid_print, &
67 : rs_grid_locate_rank, &
68 : rs_grid_max_ngpts, &
69 : rs_grid_mult_and_add, &
70 : map_gaussian_here
71 :
72 : INTEGER, PARAMETER, PUBLIC :: rsgrid_distributed = 0, &
73 : rsgrid_replicated = 1, &
74 : rsgrid_automatic = 2
75 :
76 : LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .FALSE.
77 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'realspace_grid_types'
78 :
79 : ! **************************************************************************************************
80 : TYPE realspace_grid_input_type
81 : INTEGER :: distribution_type = rsgrid_replicated
82 : INTEGER :: distribution_layout(3) = -1
83 : REAL(KIND=dp) :: memory_factor = 0.0_dp
84 : LOGICAL :: lock_distribution = .FALSE.
85 : INTEGER :: nsmax = -1
86 : REAL(KIND=dp) :: halo_reduction_factor = 1.0_dp
87 : END TYPE realspace_grid_input_type
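      :
      : ! A typical initialisation of these settings might look as follows (editor's
      : ! illustrative sketch; values are examples, unset components keep the defaults above):
      : !   TYPE(realspace_grid_input_type) :: settings
      : !   settings%distribution_type = rsgrid_automatic
      : !   settings%distribution_layout = (/-1, -1, -1/) ! no layout constraint
      : !   settings%nsmax = 25 ! sets the halo width via nmin = (nsmax + 1)/2 below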
88 :
89 : ! **************************************************************************************************
90 : TYPE realspace_grid_desc_type
91 : TYPE(pw_grid_type), POINTER :: pw => NULL() ! the pw grid
92 :
93 : INTEGER :: ref_count = 0 ! reference count
94 :
95 : INTEGER(int_8) :: ngpts = 0_int_8 ! # grid points
96 : INTEGER, DIMENSION(3) :: npts = 0 ! # grid points per dimension
97 : INTEGER, DIMENSION(3) :: lb = 0 ! lower bounds
98 : INTEGER, DIMENSION(3) :: ub = 0 ! upper bounds
99 :
100 : INTEGER :: border = 0 ! border points
101 :
102 : INTEGER, DIMENSION(3) :: perd = -1 ! periodicity enforced
103 : REAL(KIND=dp), DIMENSION(3, 3) :: dh = 0.0_dp ! incremental grid matrix
104 : REAL(KIND=dp), DIMENSION(3, 3) :: dh_inv = 0.0_dp ! inverse incremental grid matrix
105 : LOGICAL :: orthorhombic = .TRUE. ! grid symmetry
106 :
107 : LOGICAL :: parallel = .TRUE. ! whether the corresponding pw grid is distributed
108 : LOGICAL :: distributed = .TRUE. ! whether the rs grid is distributed
109 : ! whether these MPI-related quantities are meaningful depends on how the grid has been laid out
110 : ! they are most useful for fully distributed grids, where they reflect the topology of the grid
111 : TYPE(mp_comm_type) :: group = mp_comm_null
112 : INTEGER :: my_pos = -1
113 : LOGICAL :: group_head = .FALSE.
114 : INTEGER :: group_size = 0
115 : INTEGER, DIMENSION(3) :: group_dim = -1
116 : INTEGER, DIMENSION(3) :: group_coor = -1
117 : INTEGER, DIMENSION(3) :: neighbours = -1
118 : ! only meaningful on distributed grids
119 : ! a list of bounds for each CPU
120 : INTEGER, DIMENSION(:, :), ALLOCATABLE :: lb_global
121 : INTEGER, DIMENSION(:, :), ALLOCATABLE :: ub_global
122 : ! a mapping from linear rank to 3d coord
123 : INTEGER, DIMENSION(:, :), ALLOCATABLE :: rank2coord
124 : INTEGER, DIMENSION(:, :, :), ALLOCATABLE :: coord2rank
125 : ! a mapping from grid index to coordinate (together with coord2rank this easily gives the rank holding a given grid point)
126 : INTEGER, DIMENSION(:), ALLOCATABLE :: x2coord
127 : INTEGER, DIMENSION(:), ALLOCATABLE :: y2coord
128 : INTEGER, DIMENSION(:), ALLOCATABLE :: z2coord
129 :
130 : INTEGER :: my_virtual_pos = -1
131 : INTEGER, DIMENSION(3) :: virtual_group_coor = -1
132 :
133 : INTEGER, DIMENSION(:), ALLOCATABLE :: virtual2real, real2virtual
134 :
135 : END TYPE realspace_grid_desc_type
136 :
137 : TYPE realspace_grid_type
138 :
139 : TYPE(realspace_grid_desc_type), POINTER :: desc => NULL()
140 :
141 : INTEGER :: ngpts_local = -1 ! local dimensions
142 : INTEGER, DIMENSION(3) :: npts_local = -1
143 : INTEGER, DIMENSION(3) :: lb_local = -1
144 : INTEGER, DIMENSION(3) :: ub_local = -1
145 : INTEGER, DIMENSION(3) :: lb_real = -1 ! lower bounds of the real local data
146 : INTEGER, DIMENSION(3) :: ub_real = -1 ! upper bounds of the real local data
147 :
148 : INTEGER, DIMENSION(:), ALLOCATABLE :: px, py, pz ! index translators
149 : TYPE(offload_buffer_type) :: buffer = offload_buffer_type() ! owner of the grid's memory
150 : REAL(KIND=dp), DIMENSION(:, :, :), CONTIGUOUS, POINTER :: r => NULL() ! the grid (pointer to buffer%host_buffer)
151 :
152 : END TYPE realspace_grid_type
153 :
154 : ! **************************************************************************************************
155 : TYPE realspace_grid_p_type
156 : TYPE(realspace_grid_type), POINTER :: rs_grid => NULL()
157 : END TYPE realspace_grid_p_type
158 :
159 : TYPE realspace_grid_desc_p_type
160 : TYPE(realspace_grid_desc_type), POINTER :: rs_desc => NULL()
161 : END TYPE realspace_grid_desc_p_type
162 :
163 : CONTAINS
164 :
165 : ! **************************************************************************************************
166 : !> \brief returns the 1D rank of the task which is a cartesian shift away from 1D rank rank_in
167 : !> only possible if rs_grid is a distributed grid
168 : !> \param rs_desc ...
169 : !> \param rank_in ...
170 : !> \param shift ...
171 : !> \return ...
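171 : !> \note worked example: with group_dim = (/4, 1, 1/), a task at coordinate
171 : !> (3,0,0) shifted by (/1, 0, 0/) wraps around to (0,0,0) via MODULO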
172 : ! **************************************************************************************************
173 2346 : PURE FUNCTION rs_grid_locate_rank(rs_desc, rank_in, shift) RESULT(rank_out)
174 : TYPE(realspace_grid_desc_type), INTENT(IN) :: rs_desc
175 : INTEGER, INTENT(IN) :: rank_in
176 : INTEGER, DIMENSION(3), INTENT(IN) :: shift
177 : INTEGER :: rank_out
178 :
179 : INTEGER :: coord(3)
180 :
181 9384 : coord = MODULO(rs_desc%rank2coord(:, rank_in) + shift, rs_desc%group_dim)
182 2346 : rank_out = rs_desc%coord2rank(coord(1), coord(2), coord(3))
183 2346 : END FUNCTION rs_grid_locate_rank
184 :
185 : ! **************************************************************************************************
186 : !> \brief Determine the setup of real space grids - this is divided up into the
187 : !> creation of a descriptor and the actual grid itself (see rs_grid_create)
188 : !> \param desc ...
189 : !> \param pw_grid ...
190 : !> \param input_settings ...
191 : !> \param border_points ...
192 : !> \par History
193 : !> JGH (08-Jun-2003) : nsmax <= 0 indicates fully replicated grid
194 : !> Iain Bethune (05-Sep-2008) : modified cut heuristic
195 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2008 on behalf of the HECToR project
196 : !> - Create a descriptor for realspace grids with a number of border
197 : !> points as exactly given by the optional argument border_points.
198 : !> These grids are always distributed.
199 : !> (27.11.2013, Matthias Krack)
200 : !> \author JGH (18-Mar-2001)
201 : ! **************************************************************************************************
202 30296 : SUBROUTINE rs_grid_create_descriptor(desc, pw_grid, input_settings, border_points)
203 : TYPE(realspace_grid_desc_type), POINTER :: desc
204 : TYPE(pw_grid_type), INTENT(INOUT), TARGET :: pw_grid
205 : TYPE(realspace_grid_input_type), INTENT(IN) :: input_settings
206 : INTEGER, INTENT(IN), OPTIONAL :: border_points
207 :
208 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_grid_create_descriptor'
209 :
210 : INTEGER :: border_size, dir, handle, i, j, k, l, &
211 : lb(2), min_npts_real, n_slices(3), &
212 : n_slices_tmp(3), nmin
213 : LOGICAL :: overlap
214 : REAL(KIND=dp) :: ratio, ratio_best, volume, volume_dist
215 :
216 30296 : CALL timeset(routineN, handle)
217 :
218 30296 : IF (PRESENT(border_points)) THEN
219 128 : border_size = border_points
220 : ELSE
221 : border_size = 0
222 : END IF
223 :
224 1484504 : ALLOCATE (desc)
225 :
226 30296 : CALL pw_grid%para%group%sync()
227 :
228 30296 : desc%pw => pw_grid
229 30296 : CALL pw_grid_retain(desc%pw)
230 :
231 393848 : desc%dh = pw_grid%dh
232 393848 : desc%dh_inv = pw_grid%dh_inv
233 30296 : desc%orthorhombic = pw_grid%orthorhombic
234 30296 : desc%ref_count = 1
235 :
236 30296 : IF (pw_grid%para%mode == PW_MODE_LOCAL) THEN
237 : ! The corresponding group has dimension 1
238 : ! All operations will be done locally
239 15624 : desc%npts = pw_grid%npts
240 15624 : desc%ngpts = PRODUCT(INT(desc%npts, KIND=int_8))
241 15624 : desc%lb = pw_grid%bounds(1, :)
242 15624 : desc%ub = pw_grid%bounds(2, :)
243 3906 : desc%border = border_size
244 3906 : IF (border_size == 0) THEN
245 15624 : desc%perd = 1
246 : ELSE
247 0 : desc%perd = 0
248 : END IF
249 3906 : desc%parallel = .FALSE.
250 3906 : desc%distributed = .FALSE.
251 3906 : desc%group = mp_comm_null
252 3906 : desc%group_size = 1
253 3906 : desc%group_head = .TRUE.
254 15624 : desc%group_dim = 1
255 15624 : desc%group_coor = 0
256 3906 : desc%my_pos = 0
257 : ELSE
258 : ! group size of desc grid
259 : ! global grid dimensions are still the same
260 26390 : desc%group_size = pw_grid%para%group_size
261 105560 : desc%npts = pw_grid%npts
262 105560 : desc%ngpts = PRODUCT(INT(desc%npts, KIND=int_8))
263 105560 : desc%lb = pw_grid%bounds(1, :)
264 105560 : desc%ub = pw_grid%bounds(2, :)
265 :
266 : ! this is the eventual border size
267 26390 : IF (border_size == 0) THEN
268 26262 : nmin = (input_settings%nsmax + 1)/2
269 26262 : nmin = MAX(0, NINT(nmin*input_settings%halo_reduction_factor))
270 : ELSE
271 : ! Set explicitly the requested border size
272 : nmin = border_size
273 : END IF
274 :
275 26390 : IF (input_settings%distribution_type == rsgrid_replicated) THEN
276 :
277 43448 : n_slices = 1
278 10862 : IF (border_size > 0) THEN
279 : CALL cp_abort(__LOCATION__, &
280 : "An explicit border size > 0 is not yet working for "// &
281 : "replicated realspace grids. Request DISTRIBUTION_TYPE "// &
282 0 : "distributed for RS_GRID explicitly.")
283 : END IF
284 :
285 : ELSE
286 :
287 62112 : n_slices = 1
288 15528 : ratio_best = -HUGE(ratio_best)
289 :
290 : ! don't allow distributions with more processors than real grid points
291 46584 : DO k = 1, MIN(desc%npts(3), desc%group_size)
292 108696 : DO j = 1, MIN(desc%npts(2), desc%group_size)
293 62112 : i = MIN(desc%npts(1), desc%group_size/(j*k))
294 248448 : n_slices_tmp = (/i, j, k/)
295 :
296 : ! skip layouts that do not match the actual number of CPUs
297 248448 : IF (PRODUCT(n_slices_tmp) .NE. desc%group_size) CYCLE
298 :
299 : ! check whether an input constraint has been given:
300 : ! i.e. if the layout is not -1 we need to fulfil it
301 372930 : IF (.NOT. ALL(PACK(n_slices_tmp == input_settings%distribution_layout, &
302 : (/-1, -1, -1/) /= input_settings%distribution_layout) &
303 46584 : )) CYCLE
304 :
305 : ! We cannot work with a grid that has more local than global grid points.
306 : ! This can happen when a halo region wraps around and overlaps with the other halo.
307 46358 : overlap = .FALSE.
308 185432 : DO dir = 1, 3
309 185432 : IF (n_slices_tmp(dir) > 1) THEN
310 139074 : DO l = 0, n_slices_tmp(dir) - 1
311 92716 : lb = get_limit(desc%npts(dir), n_slices_tmp(dir), l)
312 139074 : IF (lb(2) - lb(1) + 1 + 2*nmin > desc%npts(dir)) overlap = .TRUE.
313 : END DO
314 : END IF
315 : END DO
316 46358 : IF (overlap) CYCLE
317 :
318 : ! a heuristic optimisation to reduce the memory usage
319 : ! we go for the smallest local-to-real volume ratio
320 : ! volume of the box without the wings / volume of the box with the wings
321 : ! with prefactors to promote fewer cuts in the Z dimension
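      : ! worked example: npts = (/100, 100, 100/), nmin = 20, n_slices_tmp = (/1, 1, 2/)
      : !   ratio = (100*100*50)/(100*100*(50 + 2*1.03*20)) = 50/91.2 ~ 0.55
      : ! layouts that waste less halo volume give a ratio closer to 1 and are preferred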
322 : ratio = PRODUCT(REAL(desc%npts, KIND=dp)/n_slices_tmp)/ &
323 : PRODUCT(REAL(desc%npts, KIND=dp)/n_slices_tmp + &
324 56854 : MERGE((/0.0, 0.0, 0.0/), 2*(/1.06*nmin, 1.05*nmin, 1.03*nmin/), n_slices_tmp == (/1, 1, 1/)))
325 39178 : IF (ratio > ratio_best) THEN
326 8106 : ratio_best = ratio
327 8106 : n_slices = n_slices_tmp
328 : END IF
329 :
330 : END DO
331 : END DO
332 :
333 : ! if automatic we can still decide this is a replicated grid
334 : ! if the memory gain (or the gain in messages) is too small.
335 15528 : IF (input_settings%distribution_type == rsgrid_automatic) THEN
336 59912 : volume = PRODUCT(REAL(desc%npts, KIND=dp))
337 : volume_dist = PRODUCT(REAL(desc%npts, KIND=dp)/n_slices + &
338 59912 : MERGE((/0, 0, 0/), 2*(/nmin, nmin, nmin/), n_slices == (/1, 1, 1/)))
339 14978 : IF (volume < volume_dist*input_settings%memory_factor) THEN
340 59912 : n_slices = 1
341 : END IF
342 : END IF
343 :
344 : END IF
345 :
346 105560 : desc%group_dim(:) = n_slices(:)
347 26390 : CALL desc%group%from_dup(pw_grid%para%group)
348 26390 : desc%group_size = desc%group%num_pe
349 26390 : desc%my_pos = desc%group%mepos
350 :
351 105382 : IF (ALL(n_slices == 1)) THEN
352 : ! CASE 1 : only one slice: we do not need overlapping regions and special
353 : ! recombination of the total density
354 26232 : desc%border = border_size
355 26232 : IF (border_size == 0) THEN
356 104928 : desc%perd = 1
357 : ELSE
358 0 : desc%perd = 0
359 : END IF
360 26232 : desc%distributed = .FALSE.
361 26232 : desc%parallel = .TRUE.
362 26232 : desc%group_head = pw_grid%para%group_head
363 104928 : desc%group_coor(:) = 0
364 26232 : desc%my_virtual_pos = 0
365 :
366 78696 : ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
367 78696 : ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
368 : ! Start with no reordering
369 78696 : DO i = 0, desc%group_size - 1
370 52464 : desc%virtual2real(i) = i
371 78696 : desc%real2virtual(i) = i
372 : END DO
373 : ELSE
374 : ! CASE 2 : general case
375 : ! periodicity is no longer enforced along the distributed directions
376 158 : IF (border_size == 0) THEN
377 120 : desc%perd = 1
378 120 : DO dir = 1, 3
379 120 : IF (n_slices(dir) > 1) desc%perd(dir) = 0
380 : END DO
381 : ELSE
382 512 : desc%perd(:) = 0
383 : END IF
384 : ! we keep a border of nmin points
385 158 : desc%border = nmin
386 : ! we are going parallel on the real space grid
387 158 : desc%parallel = .TRUE.
388 158 : desc%distributed = .TRUE.
389 158 : desc%group_head = (desc%my_pos == 0)
390 :
391 : ! set up global info about the distribution
392 474 : ALLOCATE (desc%rank2coord(3, 0:desc%group_size - 1))
393 790 : ALLOCATE (desc%coord2rank(0:desc%group_dim(1) - 1, 0:desc%group_dim(2) - 1, 0:desc%group_dim(3) - 1))
394 474 : ALLOCATE (desc%lb_global(3, 0:desc%group_size - 1))
395 474 : ALLOCATE (desc%ub_global(3, 0:desc%group_size - 1))
396 474 : ALLOCATE (desc%x2coord(desc%lb(1):desc%ub(1)))
397 474 : ALLOCATE (desc%y2coord(desc%lb(2):desc%ub(2)))
398 474 : ALLOCATE (desc%z2coord(desc%lb(3):desc%ub(3)))
399 :
400 474 : DO i = 0, desc%group_size - 1
401 : ! Calculate coordinates in a row-major order (to be SMP-friendly)
402 316 : desc%rank2coord(1, i) = i/(desc%group_dim(2)*desc%group_dim(3))
403 : desc%rank2coord(2, i) = MODULO(i, desc%group_dim(2)*desc%group_dim(3)) &
404 316 : /desc%group_dim(3)
405 316 : desc%rank2coord(3, i) = MODULO(i, desc%group_dim(3))
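      : ! e.g. for group_dim = (/2, 2, 1/) this yields rank 0 -> (0,0,0),
      : ! 1 -> (0,1,0), 2 -> (1,0,0), 3 -> (1,1,0): the last coordinate varies fastest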
406 :
407 316 : IF (i == desc%my_pos) THEN
408 632 : desc%group_coor = desc%rank2coord(:, i)
409 : END IF
410 :
411 316 : desc%coord2rank(desc%rank2coord(1, i), desc%rank2coord(2, i), desc%rank2coord(3, i)) = i
412 : ! the lb_global and ub_global correspond to lb_real and ub_real of each task
413 1264 : desc%lb_global(:, i) = desc%lb
414 1264 : desc%ub_global(:, i) = desc%ub
415 1422 : DO dir = 1, 3
416 1264 : IF (desc%group_dim(dir) .GT. 1) THEN
417 316 : lb = get_limit(desc%npts(dir), desc%group_dim(dir), desc%rank2coord(dir, i))
418 316 : desc%lb_global(dir, i) = lb(1) + desc%lb(dir) - 1
419 316 : desc%ub_global(dir, i) = lb(2) + desc%lb(dir) - 1
420 : END IF
421 : END DO
422 : END DO
423 :
424 : ! map a grid point to a CPU coord
425 632 : DO dir = 1, 3
426 1264 : DO l = 0, desc%group_dim(dir) - 1
427 632 : IF (desc%group_dim(dir) .GT. 1) THEN
428 316 : lb = get_limit(desc%npts(dir), desc%group_dim(dir), l)
429 948 : lb = lb + desc%lb(dir) - 1
430 : ELSE
431 316 : lb(1) = desc%lb(dir)
432 316 : lb(2) = desc%ub(dir)
433 : END IF
434 474 : SELECT CASE (dir)
435 : CASE (1)
436 11696 : desc%x2coord(lb(1):lb(2)) = l
437 : CASE (2)
438 12104 : desc%y2coord(lb(1):lb(2)) = l
439 : CASE (3)
440 12200 : desc%z2coord(lb(1):lb(2)) = l
441 : END SELECT
442 : END DO
443 : END DO
444 :
445 : ! an upper bound for the number of neighbours the border is overlapping with
446 632 : DO dir = 1, 3
447 474 : desc%neighbours(dir) = 0
448 632 : IF ((n_slices(dir) > 1) .OR. (border_size > 0)) THEN
449 414 : min_npts_real = HUGE(0)
450 986 : DO l = 0, n_slices(dir) - 1
451 572 : lb = get_limit(desc%npts(dir), n_slices(dir), l)
452 986 : min_npts_real = MIN(lb(2) - lb(1) + 1, min_npts_real)
453 : END DO
454 414 : desc%neighbours(dir) = (desc%border + min_npts_real - 1)/min_npts_real
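      : ! this is the integer ceiling of border/min_npts_real: e.g. a border of
      : ! 20 points over a smallest slab of 12 points overlaps 2 neighbours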
455 : END IF
456 : END DO
457 :
458 474 : ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
459 474 : ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
460 : ! Start with no reordering
461 474 : DO i = 0, desc%group_size - 1
462 316 : desc%virtual2real(i) = i
463 474 : desc%real2virtual(i) = i
464 : END DO
465 :
466 158 : desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
467 632 : desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
468 :
469 : END IF
470 : END IF
471 :
472 30296 : CALL timestop(handle)
473 :
474 30296 : END SUBROUTINE rs_grid_create_descriptor
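      :
      : ! **************************************************************************************************
      : !> \brief Illustrative sketch (editor's addition, not part of the original
      : !>        module): the usual lifecycle of a descriptor and its grid. Assumes
      : !>        pw_grid is fully initialised by the caller; only entities public in
      : !>        this module are used, so treat this as a hedged usage sketch.
      : !> \param pw_grid ...
      : ! **************************************************************************************************
      : SUBROUTINE example_rs_grid_lifecycle(pw_grid)
      :    TYPE(pw_grid_type), INTENT(INOUT), TARGET :: pw_grid
      :
      :    TYPE(realspace_grid_desc_type), POINTER :: desc
      :    TYPE(realspace_grid_input_type) :: settings
      :    TYPE(realspace_grid_type) :: rs
      :
      :    NULLIFY (desc)
      :    settings%distribution_type = rsgrid_automatic ! let the heuristic decide
      :    CALL rs_grid_create_descriptor(desc, pw_grid, settings)
      :    CALL rs_grid_create(rs, desc)
      :    CALL rs_grid_zero(rs)
      :    ! ... collocate data into rs%r here ...
      :    CALL rs_grid_release(rs)
      :    CALL rs_grid_release_descriptor(desc)
      : END SUBROUTINE example_rs_grid_lifecycle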
475 :
476 : ! **************************************************************************************************
477 : !> \brief ...
478 : !> \param rs ...
479 : !> \param desc ...
480 : ! **************************************************************************************************
481 4465440 : SUBROUTINE rs_grid_create(rs, desc)
482 : TYPE(realspace_grid_type), INTENT(OUT) :: rs
483 : TYPE(realspace_grid_desc_type), INTENT(INOUT), &
484 : TARGET :: desc
485 :
486 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_grid_create'
487 :
488 : INTEGER :: handle
489 :
490 223272 : CALL timeset(routineN, handle)
491 :
492 223272 : rs%desc => desc
493 223272 : CALL rs_grid_retain_descriptor(rs%desc)
494 :
495 223272 : IF (desc%pw%para%mode == PW_MODE_LOCAL) THEN
496 : ! The corresponding group has dimension 1
497 : ! All operations will be done locally
498 63224 : rs%lb_real = desc%lb
499 63224 : rs%ub_real = desc%ub
500 63224 : rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
501 63224 : rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
502 63224 : rs%npts_local = rs%ub_local - rs%lb_local + 1
503 63224 : rs%ngpts_local = PRODUCT(rs%npts_local)
504 : END IF
505 :
506 890704 : IF (ALL(rs%desc%group_dim == 1)) THEN
507 : ! CASE 1 : only one slice: we do not need overlapping regions and special
508 : ! recombination of the total density
509 886032 : rs%lb_real = desc%lb
510 886032 : rs%ub_real = desc%ub
511 886032 : rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
512 886032 : rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
513 886032 : rs%npts_local = rs%ub_local - rs%lb_local + 1
514 886032 : rs%ngpts_local = PRODUCT(rs%npts_local)
515 : ELSE
516 : ! CASE 2 : general case
517 : ! extract some more derived quantities about the local grid
518 7056 : rs%lb_real = desc%lb_global(:, desc%my_virtual_pos)
519 7056 : rs%ub_real = desc%ub_global(:, desc%my_virtual_pos)
520 7056 : rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
521 7056 : rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
522 7056 : rs%npts_local = rs%ub_local - rs%lb_local + 1
523 7056 : rs%ngpts_local = PRODUCT(rs%npts_local)
524 : END IF
525 :
526 223272 : CALL offload_create_buffer(rs%ngpts_local, rs%buffer)
527 : rs%r(rs%lb_local(1):rs%ub_local(1), &
528 : rs%lb_local(2):rs%ub_local(2), &
529 223272 : rs%lb_local(3):rs%ub_local(3)) => rs%buffer%host_buffer
530 :
531 669816 : ALLOCATE (rs%px(desc%npts(1)))
532 669816 : ALLOCATE (rs%py(desc%npts(2)))
533 669816 : ALLOCATE (rs%pz(desc%npts(3)))
534 :
535 223272 : CALL timestop(handle)
536 :
537 223272 : END SUBROUTINE rs_grid_create
538 :
539 : ! **************************************************************************************************
540 : !> \brief Defines a new ordering of ranks on this realspace grid, recalculating
541 : !> the data bounds and reallocating the grid. As a result, each MPI process
542 : !> now has a real rank (i.e., its rank in the MPI communicator from the pw grid)
543 : !> and a virtual rank (the rank of the process where the data now owned by this
544 : !> process would reside in an ordinary cartesian distribution).
545 : !> NB. Since the grid size required may change, the caller should be sure to release
546 : !> and recreate the corresponding rs_grids
547 : !> The desc%real2virtual and desc%virtual2real arrays can be used to map
548 : !> a physical rank to the 'rank' of data owned by that process and vice versa
549 : !> \param desc ...
550 : !> \param real2virtual ...
551 : !> \par History
552 : !> 04-2009 created [Iain Bethune]
553 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
554 : ! **************************************************************************************************
555 6 : PURE SUBROUTINE rs_grid_reorder_ranks(desc, real2virtual)
556 :
557 : TYPE(realspace_grid_desc_type), INTENT(INOUT) :: desc
558 : INTEGER, DIMENSION(:), INTENT(IN) :: real2virtual
559 :
560 : INTEGER :: i
561 :
562 18 : desc%real2virtual(:) = real2virtual
563 :
564 18 : DO i = 0, desc%group_size - 1
565 18 : desc%virtual2real(desc%real2virtual(i)) = i
566 : END DO
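      : ! e.g. real2virtual = (/2, 0, 1/) sends real rank 0's data to virtual
      : ! position 2; virtual2real is then the inverse permutation (/1, 2, 0/)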
567 :
568 6 : desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
569 :
570 12 : IF (.NOT. ALL(desc%group_dim == 1)) THEN
571 24 : desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
572 : END IF
573 :
574 6 : END SUBROUTINE
575 :
576 : ! **************************************************************************************************
577 : !> \brief Print information on grids to output
578 : !> \param rs ...
579 : !> \param iounit ...
580 : !> \author JGH (17-May-2007)
581 : ! **************************************************************************************************
582 13434 : SUBROUTINE rs_grid_print(rs, iounit)
583 : TYPE(realspace_grid_type), INTENT(IN) :: rs
584 : INTEGER, INTENT(in) :: iounit
585 :
586 : INTEGER :: dir, i, nn
587 : REAL(KIND=dp) :: pp(3)
588 :
589 13434 : IF (rs%desc%parallel) THEN
590 13152 : IF (iounit > 0) THEN
591 : WRITE (iounit, '(/,A,T71,I10)') &
592 5864 : " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
593 23456 : DO i = 1, 3
594 17592 : WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID| Bounds ", &
595 41048 : i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
596 : END DO
597 5864 : IF (.NOT. rs%desc%distributed) THEN
598 5849 : WRITE (iounit, '(A)') " RS_GRID| Real space fully replicated"
599 : WRITE (iounit, '(A,T71,I10)') &
600 5849 : " RS_GRID| Group size ", rs%desc%group_dim(2)
601 : ELSE
602 60 : DO dir = 1, 3
603 60 : IF (rs%desc%perd(dir) /= 1) THEN
604 : WRITE (iounit, '(A,T71,I3,A)') &
605 15 : " RS_GRID| Real space distribution over ", rs%desc%group_dim(dir), " groups"
606 : WRITE (iounit, '(A,T71,I10)') &
607 15 : " RS_GRID| Real space distribution along direction ", dir
608 : WRITE (iounit, '(A,T71,I10)') &
609 15 : " RS_GRID| Border size ", rs%desc%border
610 : END IF
611 : END DO
612 : END IF
613 : END IF
614 13152 : IF (rs%desc%distributed) THEN
615 120 : DO dir = 1, 3
616 120 : IF (rs%desc%perd(dir) /= 1) THEN
617 30 : nn = rs%npts_local(dir)
618 30 : CALL rs%desc%group%sum(nn)
619 120 : pp(1) = REAL(nn, KIND=dp)/REAL(PRODUCT(rs%desc%group_dim), KIND=dp)
620 30 : nn = rs%npts_local(dir)
621 30 : CALL rs%desc%group%max(nn)
622 30 : pp(2) = REAL(nn, KIND=dp)
623 30 : nn = rs%npts_local(dir)
624 30 : CALL rs%desc%group%min(nn)
625 30 : pp(3) = REAL(nn, KIND=dp)
626 30 : IF (iounit > 0) THEN
627 15 : WRITE (iounit, '(A,T48,A)') " RS_GRID| Distribution", &
628 30 : " Average Max Min"
629 15 : WRITE (iounit, '(A,T45,F12.1,2I12)') " RS_GRID| Planes ", &
630 30 : pp(1), NINT(pp(2)), NINT(pp(3))
631 : END IF
632 : END IF
633 : END DO
634 : ! WRITE ( iounit, '(/)' )
635 : END IF
636 : ELSE
637 282 : IF (iounit > 0) THEN
638 : WRITE (iounit, '(/,A,T71,I10)') &
639 172 : " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
640 688 : DO i = 1, 3
641 516 : WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID| Bounds ", &
642 1204 : i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
643 : END DO
644 : ! WRITE ( iounit, '(/)' )
645 : END IF
646 : END IF
647 :
648 13434 : END SUBROUTINE rs_grid_print
649 :
650 : ! **************************************************************************************************
651 : !> \brief ...
652 : !> \param rs ...
653 : !> \param pw ...
654 : ! **************************************************************************************************
655 1038488 : SUBROUTINE transfer_rs2pw(rs, pw)
656 : TYPE(realspace_grid_type), INTENT(IN) :: rs
657 : TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw
658 :
659 : CHARACTER(len=*), PARAMETER :: routineN = 'transfer_rs2pw'
660 :
661 : INTEGER :: handle, handle2, i
662 :
663 1038488 : CALL timeset(routineN, handle2)
664 1038488 : CALL timeset(routineN//"_"//TRIM(ADJUSTL(cp_to_string(CEILING(pw%pw_grid%cutoff/10)*10))), handle)
665 :
666 1038488 : IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
667 0 : CPABORT("Different rs and pw identifiers")
668 :
669 1038488 : IF (rs%desc%distributed) THEN
670 1840 : CALL transfer_rs2pw_distributed(rs, pw)
671 1036648 : ELSE IF (rs%desc%parallel) THEN
672 835202 : CALL transfer_rs2pw_replicated(rs, pw)
673 : ELSE ! treat simple serial case locally
674 201446 : IF (rs%desc%border == 0) THEN
675 201446 : CALL dcopy(SIZE(rs%r), rs%r, 1, pw%array, 1)
676 : ELSE
677 0 : CPASSERT(LBOUND(pw%array, 3) .EQ. rs%lb_real(3))
678 0 : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(pw,rs)
679 : DO i = rs%lb_real(3), rs%ub_real(3)
680 : pw%array(:, :, i) = rs%r(rs%lb_real(1):rs%ub_real(1), &
681 : rs%lb_real(2):rs%ub_real(2), i)
682 : END DO
683 : !$OMP END PARALLEL DO
684 : END IF
685 : END IF
686 :
687 1038488 : CALL timestop(handle)
688 1038488 : CALL timestop(handle2)
689 :
690 1038488 : END SUBROUTINE transfer_rs2pw
691 :
692 : ! **************************************************************************************************
693 : !> \brief ...
694 : !> \param rs ...
695 : !> \param pw ...
696 : ! **************************************************************************************************
697 1013115 : SUBROUTINE transfer_pw2rs(rs, pw)
698 :
699 : TYPE(realspace_grid_type), INTENT(IN) :: rs
700 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
701 :
702 : CHARACTER(len=*), PARAMETER :: routineN = 'transfer_pw2rs'
703 :
704 : INTEGER :: handle, handle2, i, im, j, jm, k, km
705 :
706 1013115 : CALL timeset(routineN, handle2)
707 1013115 : CALL timeset(routineN//"_"//TRIM(ADJUSTL(cp_to_string(CEILING(pw%pw_grid%cutoff/10)*10))), handle)
708 :
709 1013115 : IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
710 0 : CPABORT("Different rs and pw identifiers")
711 :
712 1013115 : IF (rs%desc%distributed) THEN
713 868 : CALL transfer_pw2rs_distributed(rs, pw)
714 1012247 : ELSE IF (rs%desc%parallel) THEN
715 775858 : CALL transfer_pw2rs_replicated(rs, pw)
716 : ELSE ! treat simple serial case locally
717 236389 : IF (rs%desc%border == 0) THEN
718 236389 : CALL dcopy(SIZE(rs%r), pw%array, 1, rs%r, 1)
719 : ELSE
720 : !$OMP PARALLEL DO DEFAULT(NONE) &
721 : !$OMP PRIVATE(i,im,j,jm,k,km) &
722 0 : !$OMP SHARED(pw,rs)
723 : DO k = rs%lb_local(3), rs%ub_local(3)
724 : IF (k < rs%lb_real(3)) THEN
725 : km = k + rs%desc%npts(3)
726 : ELSE IF (k > rs%ub_real(3)) THEN
727 : km = k - rs%desc%npts(3)
728 : ELSE
729 : km = k
730 : END IF
731 : DO j = rs%lb_local(2), rs%ub_local(2)
732 : IF (j < rs%lb_real(2)) THEN
733 : jm = j + rs%desc%npts(2)
734 : ELSE IF (j > rs%ub_real(2)) THEN
735 : jm = j - rs%desc%npts(2)
736 : ELSE
737 : jm = j
738 : END IF
739 : DO i = rs%lb_local(1), rs%ub_local(1)
740 : IF (i < rs%lb_real(1)) THEN
741 : im = i + rs%desc%npts(1)
742 : ELSE IF (i > rs%ub_real(1)) THEN
743 : im = i - rs%desc%npts(1)
744 : ELSE
745 : im = i
746 : END IF
747 : rs%r(i, j, k) = pw%array(im, jm, km)
748 : END DO
749 : END DO
750 : END DO
751 : !$OMP END PARALLEL DO
752 : END IF
753 : END IF
754 :
755 1013115 : CALL timestop(handle)
756 1013115 : CALL timestop(handle2)
757 :
758 1013115 : END SUBROUTINE transfer_pw2rs
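      :
      : ! **************************************************************************************************
      : !> \brief Illustrative sketch (editor's addition): a round trip between the
      : !>        two representations. transfer_rs2pw sums halo contributions into pw,
      : !>        transfer_pw2rs scatters pw back and refills the halo regions.
      : !> \param rs ...
      : !> \param pw ...
      : ! **************************************************************************************************
      : SUBROUTINE example_rs_pw_round_trip(rs, pw)
      :    TYPE(realspace_grid_type), INTENT(IN) :: rs
      :    TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw
      :
      :    CALL transfer_rs2pw(rs, pw)
      :    CALL transfer_pw2rs(rs, pw)
      : END SUBROUTINE example_rs_pw_round_trip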
759 :
760 : ! **************************************************************************************************
761 : !> \brief transfer from a realspace grid to a planewave grid
762 : !> \param rs ...
763 : !> \param pw ...
764 : ! **************************************************************************************************
765 835202 : SUBROUTINE transfer_rs2pw_replicated(rs, pw)
766 : TYPE(realspace_grid_type), INTENT(IN) :: rs
767 : TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw
768 :
769 : INTEGER :: dest, ii, ip, ix, iy, iz, nma, nn, s(3), &
770 : source
771 835202 : INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount
772 : INTEGER, DIMENSION(3) :: lb, ub
773 835202 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: recvbuf, sendbuf, swaparray
774 :
775 : ASSOCIATE (np => pw%pw_grid%para%group_size, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group_size - 1, 1), &
776 : pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%rs_group, mepos => pw%pw_grid%para%rs_mpo, &
777 : grid => rs%r)
778 2505606 : ALLOCATE (rcount(0:np - 1))
779 2505606 : DO ip = 1, np
780 7516818 : rcount(ip - 1) = PRODUCT(bo(2, :, ip) - bo(1, :, ip) + 1)
781 : END DO
782 2505606 : nma = MAXVAL(rcount(0:np - 1))
783 4176010 : ALLOCATE (sendbuf(nma), recvbuf(nma))
784 31497273648 : sendbuf = 1.0E99_dp; recvbuf = 1.0E99_dp ! init mpi'ed buffers to silence warnings under valgrind
785 :
786 : !sample peak memory
787 835202 : CALL m_memory()
788 :
789 835202 : dest = MODULO(mepos + 1, np)
790 835202 : source = MODULO(mepos - 1, np)
791 15748636824 : sendbuf = 0.0_dp
792 :
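      : ! ring reduction: the buffers circulate once around all np ranks; at each hop
      : ! a rank adds its local contribution for the slab the buffer will end up at,
      : ! so after np-1 hops sendbuf holds the fully summed slab owned by this rank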
793 1670404 : DO ip = 1, np
794 :
795 6681616 : lb = pbo(1, :) + bo(1, :, MODULO(mepos - ip, np) + 1) - 1
796 6681616 : ub = pbo(1, :) + bo(2, :, MODULO(mepos - ip, np) + 1) - 1
797 : ! this loop takes about the same time as the message passing call
798 : ! notice that the range of ix covers only a small fraction of the first dimension of grid
799 : ! therefore it seems faster to have the second index as the innermost loop
800 : ! if this runs on many cpus
801 : ! tested on itanium, pentium4, opteron, ultrasparc...
802 6681616 : s = ub - lb + 1
803 41834180 : DO iz = lb(3), ub(3)
804 741220000 : DO ix = lb(1), ub(1)
805 699385820 : ii = (iz - lb(3))*s(1)*s(2) + (ix - lb(1)) + 1
806 32023406526 : DO iy = lb(2), ub(2)
807 31283856930 : sendbuf(ii) = sendbuf(ii) + grid(ix, iy, iz)
808 31983242750 : ii = ii + s(1)
809 : END DO
810 : END DO
811 : END DO
812 1670404 : IF (ip .EQ. np) EXIT
813 835202 : CALL group%sendrecv(sendbuf, dest, recvbuf, source, 13)
814 835202 : CALL MOVE_ALLOC(sendbuf, swaparray)
815 835202 : CALL MOVE_ALLOC(recvbuf, sendbuf)
816 1670404 : CALL MOVE_ALLOC(swaparray, recvbuf)
817 : END DO
818 835202 : nn = rcount(mepos)
819 : END ASSOCIATE
820 :
821 835202 : CALL dcopy(nn, sendbuf, 1, pw%array, 1)
822 :
823 835202 : DEALLOCATE (rcount)
824 835202 : DEALLOCATE (sendbuf)
825 835202 : DEALLOCATE (recvbuf)
826 :
827 835202 : END SUBROUTINE transfer_rs2pw_replicated
828 :
829 : ! **************************************************************************************************
830 : !> \brief transfer from a planewave grid to a realspace grid
831 : !> \param rs ...
832 : !> \param pw ...
833 : ! **************************************************************************************************
834 775858 : SUBROUTINE transfer_pw2rs_replicated(rs, pw)
835 : TYPE(realspace_grid_type), INTENT(IN) :: rs
836 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
837 :
838 : INTEGER :: dest, i, ii, im, ip, ix, iy, iz, j, jm, &
839 : k, km, nma, nn, source
840 775858 : INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount
841 : INTEGER, DIMENSION(3) :: lb, ub
842 775858 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: recvbuf, sendbuf, swaparray
843 2327574 : TYPE(mp_request_type), DIMENSION(2) :: req
844 :
845 : ASSOCIATE (np => pw%pw_grid%para%group_size, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group_size - 1, 1), &
846 : pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%rs_group, mepos => pw%pw_grid%para%rs_mpo, &
847 : grid => rs%r)
848 2327574 : ALLOCATE (rcount(0:np - 1))
849 2327574 : DO ip = 1, np
850 6982722 : rcount(ip - 1) = PRODUCT(bo(2, :, ip) - bo(1, :, ip) + 1)
851 : END DO
852 2327574 : nma = MAXVAL(rcount(0:np - 1))
853 3879290 : ALLOCATE (sendbuf(nma), recvbuf(nma))
854 30713859552 : sendbuf = 1.0E99_dp; recvbuf = 1.0E99_dp ! init mpi'ed buffers to silence warnings under valgrind
855 :
856 : !sample peak memory
857 775858 : CALL m_memory()
858 :
859 775858 : nn = rcount(mepos)
860 775858 : CALL dcopy(nn, pw%array, 1, sendbuf, 1)
861 :
862 775858 : dest = MODULO(mepos + 1, np)
863 775858 : source = MODULO(mepos - 1, np)
864 :
865 2327574 : DO ip = 0, np - 1
866 : ! we must shift the buffer only np-1 times around
867 1551716 : IF (ip .NE. np - 1) THEN
868 : CALL group%isendrecv(sendbuf, dest, recvbuf, source, &
869 775858 : req(1), req(2), 13)
870 : END IF
871 6206864 : lb = pbo(1, :) + bo(1, :, MODULO(mepos - ip, np) + 1) - 1
872 6206864 : ub = pbo(1, :) + bo(2, :, MODULO(mepos - ip, np) + 1) - 1
873 1551716 : ii = 0
874 : ! this loop takes about the same time as the message passing call
875 : ! copy the slab currently held in the buffer into the replicated grid
876 39896756 : DO iz = lb(3), ub(3)
877 1382097252 : DO iy = lb(2), ub(2)
878 31890640682 : DO ix = lb(1), ub(1)
879 30510095146 : ii = ii + 1
880 31852295642 : grid(ix, iy, iz) = sendbuf(ii)
881 : END DO
882 : END DO
883 : END DO
884 1551716 : IF (ip .NE. np - 1) THEN
885 775858 : CALL mp_waitall(req)
886 : END IF
887 1551716 : CALL MOVE_ALLOC(sendbuf, swaparray)
888 1551716 : CALL MOVE_ALLOC(recvbuf, sendbuf)
889 2327574 : CALL MOVE_ALLOC(swaparray, recvbuf)
890 : END DO
891 1551716 : IF (rs%desc%border > 0) THEN
892 : !$OMP PARALLEL DO DEFAULT(NONE) &
893 : !$OMP PRIVATE(i,im,j,jm,k,km) &
894 0 : !$OMP SHARED(rs)
895 : DO k = rs%lb_local(3), rs%ub_local(3)
896 : IF (k < rs%lb_real(3)) THEN
897 : km = k + rs%desc%npts(3)
898 : ELSE IF (k > rs%ub_real(3)) THEN
899 : km = k - rs%desc%npts(3)
900 : ELSE
901 : km = k
902 : END IF
903 : DO j = rs%lb_local(2), rs%ub_local(2)
904 : IF (j < rs%lb_real(2)) THEN
905 : jm = j + rs%desc%npts(2)
906 : ELSE IF (j > rs%ub_real(2)) THEN
907 : jm = j - rs%desc%npts(2)
908 : ELSE
909 : jm = j
910 : END IF
911 : DO i = rs%lb_local(1), rs%ub_local(1)
912 : IF (i < rs%lb_real(1)) THEN
913 : im = i + rs%desc%npts(1)
914 : ELSE IF (i > rs%ub_real(1)) THEN
915 : im = i - rs%desc%npts(1)
916 : ELSE
917 : im = i
918 : END IF
919 : rs%r(i, j, k) = rs%r(im, jm, km)
920 : END DO
921 : END DO
922 : END DO
923 : !$OMP END PARALLEL DO
924 : END IF
925 : END ASSOCIATE
926 :
927 775858 : DEALLOCATE (rcount)
928 775858 : DEALLOCATE (sendbuf)
929 775858 : DEALLOCATE (recvbuf)
930 :
931 775858 : END SUBROUTINE transfer_pw2rs_replicated
932 :
933 : ! **************************************************************************************************
934 : !> \brief does the rs2pw transfer in the case where the rs grid is
935 : !> distributed (3D domain decomposition)
936 : !> \param rs ...
937 : !> \param pw ...
938 : !> \par History
939 : !> 12.2007 created [Matt Watkins]
940 : !> 9.2008 reduced amount of halo data sent [Iain Bethune]
941 : !> 10.2008 added non-blocking communication [Iain Bethune]
942 : !> 4.2009 added support for rank-reordering on the grid [Iain Bethune]
943 : !> 12.2009 added OMP and sparse alltoall [Iain Bethune]
944 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
945 : !> \note
946 : !> the transfer is a two step procedure. For example, for the rs2pw transfer:
947 : !>
948 : !> 1) Halo-exchange in 3D so that the local part of the rs_grid contains the full data
949 : !> 2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
950 : !>
951 : !> the halo exchange is most expensive on a large number of CPUs. A peculiarity of this halo
952 : !> exchange is that the border region is rather large (e.g. 20 points) and that it might overlap
953 : !> with the central domain of several CPUs (i.e. next nearest neighbors)
954 : ! **************************************************************************************************
955 1840 : SUBROUTINE transfer_rs2pw_distributed(rs, pw)
956 : TYPE(realspace_grid_type), INTENT(IN) :: rs
957 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
958 :
959 : CHARACTER(LEN=200) :: error_string
960 : INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
961 : n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
962 1840 : INTEGER, ALLOCATABLE, DIMENSION(:) :: dshifts, recv_disps, recv_sizes, &
963 1840 : send_disps, send_sizes, ushifts
964 3680 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: bounds, recv_tasks, send_tasks
965 : INTEGER, DIMENSION(2) :: neighbours, pos
966 : INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
967 : lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
968 : LOGICAL, DIMENSION(3) :: halo_swapped
969 : REAL(KIND=dp) :: pw_sum, rs_sum
970 1840 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: recv_buf_3d_down, recv_buf_3d_up, &
971 1840 : send_buf_3d_down, send_buf_3d_up
972 3680 : TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:) :: recv_bufs, send_bufs
973 1840 : TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: recv_reqs, send_reqs
974 9200 : TYPE(mp_request_type), DIMENSION(4) :: req
975 :
976 1840 : num_threads = 1
977 1840 : my_id = 0
978 :
979 : ! safety check, to be removed once we're absolutely sure the routine is correct
980 : IF (debug_this_module) THEN
981 : rs_sum = accurate_sum(rs%r)*ABS(det_3x3(rs%desc%dh))
982 : CALL rs%desc%group%sum(rs_sum)
983 : END IF
984 :
985 1840 : halo_swapped = .FALSE.
986 : ! We don't need to send the 'edges' of the halos that have already been sent
987 : ! Halos are contiguous in memory in z-direction only, so swap these first,
988 : ! and send less data in the y and x directions which are more expensive
989 :
990 7360 : DO idir = 3, 1, -1
991 :
992 5520 : IF (rs%desc%perd(idir) .NE. 1) THEN
993 :
994 14304 : ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
995 14304 : ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
996 :
997 14304 : ushifts = 0
998 14304 : dshifts = 0
999 :
1000 : ! check that we don't try to send data to ourself
1001 6608 : DO n_shifts = 1, MIN(rs%desc%neighbours(idir), rs%desc%group_dim(idir) - 1)
1002 :
1003 : ! need to take into account the possible varying widths of neighbouring cells
1004 : ! ushifts and dshifts accumulate the real sizes of the neighbouring cells
1005 1840 : position = MODULO(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
1006 1840 : neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1007 1840 : dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1008 :
1009 1840 : position = MODULO(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
1010 1840 : neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1011 1840 : ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
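      : ! dshifts(n)/ushifts(n) now hold the cumulative widths (in grid points) of
      : ! the n nearest lower/upper neighbour slabs, used below to clip the halo pieces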
1012 :
1013 : ! The border data has to be sent to/received from the neighbours
1014 : ! First we calculate the source and destination processes for the shift
1015 : ! We do both shifts at once to allow for more overlap of communication and buffer packing/unpacking
1016 :
1017 1840 : CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
1018 :
1019 7360 : lb_send_down(:) = rs%lb_local(:)
1020 7360 : lb_recv_down(:) = rs%lb_local(:)
1021 7360 : ub_recv_down(:) = rs%ub_local(:)
1022 7360 : ub_send_down(:) = rs%ub_local(:)
1023 :
1024 1840 : IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
1025 1840 : ub_send_down(idir) = lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
1026 : lb_send_down(idir) = MAX(lb_send_down(idir), &
1027 1840 : lb_send_down(idir) + rs%desc%border - dshifts(n_shifts))
1028 :
1029 1840 : ub_recv_down(idir) = ub_recv_down(idir) - rs%desc%border
1030 : lb_recv_down(idir) = MAX(lb_recv_down(idir) + rs%desc%border, &
1031 1840 : ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
1032 : ELSE
1033 0 : lb_send_down(idir) = 0
1034 0 : ub_send_down(idir) = -1
1035 0 : lb_recv_down(idir) = 0
1036 0 : ub_recv_down(idir) = -1
1037 : END IF
1038 :
1039 7360 : DO i = 1, 3
1040 7360 : IF (halo_swapped(i)) THEN
1041 554 : lb_send_down(i) = rs%lb_real(i)
1042 554 : ub_send_down(i) = rs%ub_real(i)
1043 554 : lb_recv_down(i) = rs%lb_real(i)
1044 554 : ub_recv_down(i) = rs%ub_real(i)
1045 : END IF
1046 : END DO
1047 :
1048 : ! post the receive
1049 0 : ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
1050 9200 : lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
1051 1840 : CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
1052 :
1053 : ! now allocate, pack and send the send buffer
1054 7360 : nn = PRODUCT(ub_send_down - lb_send_down + 1)
1055 0 : ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
1056 9200 : lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
1057 :
1058 : !$OMP PARALLEL DEFAULT(NONE), &
1059 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1060 1840 : !$OMP SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
1061 : !$ num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
1062 : !$ my_id = omp_get_thread_num()
1063 : IF (my_id < num_threads) THEN
1064 : lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
1065 : ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
1066 :
1067 : send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
1068 : lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
1069 : lb_send_down(2):ub_send_down(2), lb:ub)
1070 : END IF
1071 : !$OMP END PARALLEL
1072 :
1073 1840 : CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
1074 :
1075 : ! Now for the other direction
1076 1840 : CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
1077 :
1078 7360 : lb_send_up(:) = rs%lb_local(:)
1079 7360 : lb_recv_up(:) = rs%lb_local(:)
1080 7360 : ub_recv_up(:) = rs%ub_local(:)
1081 7360 : ub_send_up(:) = rs%ub_local(:)
1082 :
1083 1840 : IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
1084 :
1085 1840 : lb_send_up(idir) = ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
1086 : ub_send_up(idir) = MIN(ub_send_up(idir), &
1087 1840 : ub_send_up(idir) - rs%desc%border + ushifts(n_shifts))
1088 :
1089 1840 : lb_recv_up(idir) = lb_recv_up(idir) + rs%desc%border
1090 : ub_recv_up(idir) = MIN(ub_recv_up(idir) - rs%desc%border, &
1091 1840 : lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
1092 : ELSE
1093 0 : lb_send_up(idir) = 0
1094 0 : ub_send_up(idir) = -1
1095 0 : lb_recv_up(idir) = 0
1096 0 : ub_recv_up(idir) = -1
1097 : END IF
1098 :
1099 7360 : DO i = 1, 3
1100 7360 : IF (halo_swapped(i)) THEN
1101 554 : lb_send_up(i) = rs%lb_real(i)
1102 554 : ub_send_up(i) = rs%ub_real(i)
1103 554 : lb_recv_up(i) = rs%lb_real(i)
1104 554 : ub_recv_up(i) = rs%ub_real(i)
1105 : END IF
1106 : END DO
1107 :
1108 : ! post the receive
1109 0 : ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
1110 9200 : lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
1111 1840 : CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
1112 :
1113 : ! now allocate, pack and send the send buffer
1114 7360 : nn = PRODUCT(ub_send_up - lb_send_up + 1)
1115 0 : ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
1116 9200 : lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
1117 :
1118 : !$OMP PARALLEL DEFAULT(NONE), &
1119 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1120 1840 : !$OMP SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
1121 : !$ num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
1122 : !$ my_id = omp_get_thread_num()
1123 : IF (my_id < num_threads) THEN
1124 : lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
1125 : ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
1126 :
1127 : send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
1128 : lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
1129 : lb_send_up(2):ub_send_up(2), lb:ub)
1130 : END IF
1131 : !$OMP END PARALLEL
1132 :
1133 1840 : CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
1134 :
1135 : ! wait for a recv to complete, then we can unpack
1136 :
1137 5520 : DO i = 1, 2
1138 :
1139 3680 : CALL mp_waitany(req(1:2), completed)
1140 :
1141 5520 : IF (completed .EQ. 1) THEN
1142 :
1143 : ! only some procs may need later shifts
1144 1840 : IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
1145 : ! Sum the data in the RS Grid
1146 : !$OMP PARALLEL DEFAULT(NONE), &
1147 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1148 1840 : !$OMP SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
1149 : !$ num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
1150 : !$ my_id = omp_get_thread_num()
1151 : IF (my_id < num_threads) THEN
1152 : lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
1153 : ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
1154 :
1155 : rs%r(lb_recv_down(1):ub_recv_down(1), &
1156 : lb_recv_down(2):ub_recv_down(2), lb:ub) = &
1157 : rs%r(lb_recv_down(1):ub_recv_down(1), &
1158 : lb_recv_down(2):ub_recv_down(2), lb:ub) + &
1159 : recv_buf_3d_down(:, :, lb:ub)
1160 : END IF
1161 : !$OMP END PARALLEL
1162 : END IF
1163 1840 : DEALLOCATE (recv_buf_3d_down)
1164 : ELSE
1165 :
1166 : ! only some procs may need later shifts
1167 1840 : IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
1168 : ! Sum the data in the RS Grid
1169 : !$OMP PARALLEL DEFAULT(NONE), &
1170 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1171 1840 : !$OMP SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
1172 : !$ num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
1173 : !$ my_id = omp_get_thread_num()
1174 : IF (my_id < num_threads) THEN
1175 : lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
1176 : ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
1177 :
1178 : rs%r(lb_recv_up(1):ub_recv_up(1), &
1179 : lb_recv_up(2):ub_recv_up(2), lb:ub) = &
1180 : rs%r(lb_recv_up(1):ub_recv_up(1), &
1181 : lb_recv_up(2):ub_recv_up(2), lb:ub) + &
1182 : recv_buf_3d_up(:, :, lb:ub)
1183 : END IF
1184 : !$OMP END PARALLEL
1185 : END IF
1186 1840 : DEALLOCATE (recv_buf_3d_up)
1187 : END IF
1188 :
1189 : END DO
1190 :
1191 : ! make sure the sends have completed before we deallocate
1192 :
1193 1840 : CALL mp_waitall(req(3:4))
1194 :
1195 1840 : DEALLOCATE (send_buf_3d_down)
1196 8448 : DEALLOCATE (send_buf_3d_up)
1197 : END DO
1198 :
1199 4768 : DEALLOCATE (dshifts)
1200 4768 : DEALLOCATE (ushifts)
1201 :
1202 : END IF
1203 :
1204 7360 : halo_swapped(idir) = .TRUE.
1205 :
1206 : END DO
1207 :
1208 : ! This is the real redistribution
1209 7360 : ALLOCATE (bounds(0:pw%pw_grid%para%group_size - 1, 1:4))
1210 :
1211 : ! work out the pw grid points each proc holds
1212 5520 : DO i = 0, pw%pw_grid%para%group_size - 1
1213 11040 : bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
1214 11040 : bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
1215 11040 : bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
1216 12880 : bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
1217 : END DO
1218 :
1219 7360 : ALLOCATE (send_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
1220 5520 : ALLOCATE (send_sizes(0:pw%pw_grid%para%group_size - 1))
1221 5520 : ALLOCATE (send_disps(0:pw%pw_grid%para%group_size - 1))
1222 7360 : ALLOCATE (recv_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
1223 5520 : ALLOCATE (recv_sizes(0:pw%pw_grid%para%group_size - 1))
1224 5520 : ALLOCATE (recv_disps(0:pw%pw_grid%para%group_size - 1))
1225 5520 : send_tasks(:, 1) = 1
1226 5520 : send_tasks(:, 2) = 0
1227 5520 : send_tasks(:, 3) = 1
1228 5520 : send_tasks(:, 4) = 0
1229 5520 : send_tasks(:, 5) = 1
1230 5520 : send_tasks(:, 6) = 0
1231 5520 : send_sizes = 0
1232 5520 : recv_sizes = 0
1233 :
1234 1840 : my_rs_rank = rs%desc%my_pos
1235 1840 : my_pw_rank = pw%pw_grid%para%rs_mpo
1236 :
1237 : ! find the processors that should hold our data
1238 : ! should be part of the rs grid type
1239 : ! this is a loop over real ranks (i.e. the in-order cartesian ranks)
1240 : ! do the recv and send tasks in two separate loops which will
1241 : ! load balance better for OpenMP with large numbers of MPI tasks
1242 :
1243 : !$OMP PARALLEL DO DEFAULT(NONE), &
1244 : !$OMP PRIVATE(coords,idir,pos,lb_send,ub_send), &
1245 1840 : !$OMP SHARED(rs,bounds,my_rs_rank,recv_tasks,recv_sizes)
1246 : DO i = 0, rs%desc%group_size - 1
1247 :
1248 : coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
1249 : !calculate the rs grid points on each processor
1250 : !coords is the part of the grid that rank i actually holds
1251 : DO idir = 1, 3
1252 : pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1253 : pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1254 : lb_send(idir) = pos(1)
1255 : ub_send(idir) = pos(2)
1256 : END DO
1257 :
1258 : IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) CYCLE
1259 : IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) CYCLE
1260 : IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) CYCLE
1261 : IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) CYCLE
1262 :
1263 : recv_tasks(i, 1) = MAX(lb_send(1), bounds(my_rs_rank, 1))
1264 : recv_tasks(i, 2) = MIN(ub_send(1), bounds(my_rs_rank, 2))
1265 : recv_tasks(i, 3) = MAX(lb_send(2), bounds(my_rs_rank, 3))
1266 : recv_tasks(i, 4) = MIN(ub_send(2), bounds(my_rs_rank, 4))
1267 : recv_tasks(i, 5) = lb_send(3)
1268 : recv_tasks(i, 6) = ub_send(3)
1269 : recv_sizes(i) = (recv_tasks(i, 2) - recv_tasks(i, 1) + 1)* &
1270 : (recv_tasks(i, 4) - recv_tasks(i, 3) + 1)*(recv_tasks(i, 6) - recv_tasks(i, 5) + 1)
1271 :
1272 : END DO
1273 : !$OMP END PARALLEL DO
1274 :
1275 7360 : coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
1276 7360 : DO idir = 1, 3
1277 5520 : pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1278 16560 : pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1279 5520 : lb_send(idir) = pos(1)
1280 7360 : ub_send(idir) = pos(2)
1281 : END DO
1282 :
1283 1840 : lb_recv(:) = lb_send(:)
1284 1840 : ub_recv(:) = ub_send(:)
1285 : !$OMP PARALLEL DO DEFAULT(NONE), &
1286 1840 : !$OMP SHARED(pw,lb_send,ub_send,bounds,send_tasks,send_sizes)
1287 : DO j = 0, pw%pw_grid%para%group_size - 1
1288 :
1289 : IF (lb_send(1) .GT. bounds(j, 2)) CYCLE
1290 : IF (ub_send(1) .LT. bounds(j, 1)) CYCLE
1291 : IF (lb_send(2) .GT. bounds(j, 4)) CYCLE
1292 : IF (ub_send(2) .LT. bounds(j, 3)) CYCLE
1293 :
1294 : send_tasks(j, 1) = MAX(lb_send(1), bounds(j, 1))
1295 : send_tasks(j, 2) = MIN(ub_send(1), bounds(j, 2))
1296 : send_tasks(j, 3) = MAX(lb_send(2), bounds(j, 3))
1297 : send_tasks(j, 4) = MIN(ub_send(2), bounds(j, 4))
1298 : send_tasks(j, 5) = lb_send(3)
1299 : send_tasks(j, 6) = ub_send(3)
1300 : send_sizes(j) = (send_tasks(j, 2) - send_tasks(j, 1) + 1)* &
1301 : (send_tasks(j, 4) - send_tasks(j, 3) + 1)*(send_tasks(j, 6) - send_tasks(j, 5) + 1)
1302 :
1303 : END DO
1304 : !$OMP END PARALLEL DO
1305 :
1306 1840 : send_disps(0) = 0
1307 1840 : recv_disps(0) = 0
1308 3680 : DO i = 1, pw%pw_grid%para%group_size - 1
1309 1840 : send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1310 3680 : recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1311 : END DO
1312 :
1313 12880 : CPASSERT(SUM(send_sizes) == PRODUCT(ub_recv - lb_recv + 1))
1314 :
1315 9200 : ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
1316 11040 : ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
1317 :
1318 5520 : DO i = 0, rs%desc%group_size - 1
1319 3680 : IF (send_sizes(i) .NE. 0) THEN
1320 10302 : ALLOCATE (send_bufs(i)%array(send_sizes(i)))
1321 : ELSE
1322 246 : NULLIFY (send_bufs(i)%array)
1323 : END IF
1324 5520 : IF (recv_sizes(i) .NE. 0) THEN
1325 10302 : ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
1326 : ELSE
1327 246 : NULLIFY (recv_bufs(i)%array)
1328 : END IF
1329 : END DO
1330 :
1331 9200 : ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
1332 5520 : recv_reqs = mp_request_null
1333 :
1334 5520 : DO i = 0, rs%desc%group_size - 1
1335 5520 : IF (recv_sizes(i) .NE. 0) THEN
1336 3434 : CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
1337 : END IF
1338 : END DO
1339 :
1340 : ! do packing
1341 : !$OMP PARALLEL DO DEFAULT(NONE), &
1342 : !$OMP PRIVATE(k,z,y,x), &
1343 1840 : !$OMP SHARED(rs,send_tasks,send_bufs,send_disps)
1344 : DO i = 0, rs%desc%group_size - 1
1345 : k = 0
1346 : DO z = send_tasks(i, 5), send_tasks(i, 6)
1347 : DO y = send_tasks(i, 3), send_tasks(i, 4)
1348 : DO x = send_tasks(i, 1), send_tasks(i, 2)
1349 : k = k + 1
1350 : send_bufs(i)%array(k) = rs%r(x, y, z)
1351 : END DO
1352 : END DO
1353 : END DO
1354 : END DO
1355 : !$OMP END PARALLEL DO
1356 :
1357 9200 : ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
1358 5520 : send_reqs = mp_request_null
1359 :
1360 5520 : DO i = 0, rs%desc%group_size - 1
1361 5520 : IF (send_sizes(i) .NE. 0) THEN
1362 3434 : CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
1363 : END IF
1364 : END DO
1365 :
1366 : ! do unpacking
1367 : ! no OMP here so we can unpack each message as it arrives
1368 5520 : DO i = 0, rs%desc%group_size - 1
1369 3680 : IF (recv_sizes(i) .EQ. 0) CYCLE
1370 :
1371 3434 : CALL mp_waitany(recv_reqs, completed)
1372 3434 : k = 0
1373 143312 : DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
1374 10769386 : DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
1375 455339438 : DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
1376 444573732 : k = k + 1
1377 455201400 : pw%array(x, y, z) = recv_bufs(completed - 1)%array(k)
1378 : END DO
1379 : END DO
1380 : END DO
1381 : END DO
1382 :
1383 1840 : CALL mp_waitall(send_reqs)
1384 :
1385 1840 : DEALLOCATE (recv_reqs)
1386 1840 : DEALLOCATE (send_reqs)
1387 :
1388 5520 : DO i = 0, rs%desc%group_size - 1
1389 3680 : IF (ASSOCIATED(send_bufs(i)%array)) THEN
1390 3434 : DEALLOCATE (send_bufs(i)%array)
1391 : END IF
1392 5520 : IF (ASSOCIATED(recv_bufs(i)%array)) THEN
1393 3434 : DEALLOCATE (recv_bufs(i)%array)
1394 : END IF
1395 : END DO
1396 :
1397 1840 : DEALLOCATE (send_bufs)
1398 1840 : DEALLOCATE (recv_bufs)
1399 1840 : DEALLOCATE (send_tasks)
1400 1840 : DEALLOCATE (send_sizes)
1401 1840 : DEALLOCATE (send_disps)
1402 1840 : DEALLOCATE (recv_tasks)
1403 1840 : DEALLOCATE (recv_sizes)
1404 1840 : DEALLOCATE (recv_disps)
1405 :
1406 : IF (debug_this_module) THEN
1407 : ! safety check, to be removed once we are absolutely sure the routine is correct
1408 : pw_sum = pw_integrate_function(pw)
1409 : IF (ABS(pw_sum - rs_sum)/MAX(1.0_dp, ABS(pw_sum), ABS(rs_sum)) > EPSILON(rs_sum)*1000) THEN
1410 : WRITE (error_string, '(A,6(1X,I4.4),3F25.16)') "rs_pw_transfer_distributed", &
1411 : rs%desc%npts, rs%desc%group_dim, pw_sum, rs_sum, ABS(pw_sum - rs_sum)
1412 : CALL cp_abort(__LOCATION__, &
1413 : error_string//" Please report this bug ... quick workaround: use "// &
1414 : "DISTRIBUTION_TYPE REPLICATED")
1415 : END IF
1416 : END IF
1417 :
1418 1840 : END SUBROUTINE transfer_rs2pw_distributed
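      :
      : ! **************************************************************************************************
      : !> \brief Illustrative sketch only (name hypothetical, not part of the module API):
      : !>        the scale-invariant comparison used by the debug check above. Dividing by
      : !>        MAX(1, |a|, |b|) makes the test absolute for small sums and relative for
      : !>        large ones; the factor 1000 is the same safety margin as used above.
      : ! **************************************************************************************************
      : PURE LOGICAL FUNCTION example_sums_disagree(a, b) RESULT(res)
      : REAL(KIND=dp), INTENT(IN) :: a, b
      :
      : res = ABS(a - b)/MAX(1.0_dp, ABS(a), ABS(b)) > EPSILON(a)*1000
      : END FUNCTION example_sums_disagree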
1419 :
1420 : ! **************************************************************************************************
1421 : !> \brief does the pw2rs transfer in the case where the rs grid is
1422 : !> distributed (3D domain decomposition)
1423 : !> \param rs ...
1424 : !> \param pw ...
1425 : !> \par History
1426 : !> 12.2007 created [Matt Watkins]
1427 : !> 9.2008 reduced amount of halo data sent [Iain Bethune]
1428 : !> 10.2008 added non-blocking communication [Iain Bethune]
1429 : !> 4.2009 added support for rank-reordering on the grid [Iain Bethune]
1430 : !> 12.2009 added OMP and sparse alltoall [Iain Bethune]
1431 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
1432 : !> \note
1433 : !> the transfer is a two-step procedure. For example, for the rs2pw transfer:
1434 : !>
1435 : !> 1) a halo exchange in 3D, so that the local part of the rs_grid contains the full data
1436 : !> 2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
1437 : !>
1438 : !> (for the pw2rs transfer implemented here, the two steps run in reverse order)
1439 : !> the halo exchange is the most expensive step on large process counts: the border region
1440 : !> is rather large (e.g. 20 points), so it may overlap with the central domains of several processes (i.e. next-nearest neighbours)
1441 : ! **************************************************************************************************
1442 868 : SUBROUTINE transfer_pw2rs_distributed(rs, pw)
1443 : TYPE(realspace_grid_type), INTENT(IN) :: rs
1444 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
1445 :
1446 : INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
1447 : n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
1448 868 : INTEGER, ALLOCATABLE, DIMENSION(:) :: dshifts, recv_disps, recv_sizes, &
1449 868 : send_disps, send_sizes, ushifts
1450 1736 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: bounds, recv_tasks, send_tasks
1451 : INTEGER, DIMENSION(2) :: neighbours, pos
1452 : INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
1453 : lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
1454 : LOGICAL, DIMENSION(3) :: halo_swapped
1455 868 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: recv_buf_3d_down, recv_buf_3d_up, &
1456 868 : send_buf_3d_down, send_buf_3d_up
1457 1736 : TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:) :: recv_bufs, send_bufs
1458 868 : TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: recv_reqs, send_reqs
1459 4340 : TYPE(mp_request_type), DIMENSION(4) :: req
1460 :
1461 868 : num_threads = 1
1462 868 : my_id = 0
1463 :
1464 868 : CALL rs_grid_zero(rs)
1465 :
1466 : ! This is the actual data redistribution (the alltoall step); the halo exchange follows below
1467 :
1468 3472 : ALLOCATE (bounds(0:pw%pw_grid%para%group_size - 1, 1:4))
1469 :
1470 2604 : DO i = 0, pw%pw_grid%para%group_size - 1
1471 5208 : bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
1472 5208 : bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
1473 5208 : bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
1474 6076 : bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
1475 : END DO
1476 :
1477 3472 : ALLOCATE (send_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
1478 2604 : ALLOCATE (send_sizes(0:pw%pw_grid%para%group_size - 1))
1479 2604 : ALLOCATE (send_disps(0:pw%pw_grid%para%group_size - 1))
1480 3472 : ALLOCATE (recv_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
1481 2604 : ALLOCATE (recv_sizes(0:pw%pw_grid%para%group_size - 1))
1482 2604 : ALLOCATE (recv_disps(0:pw%pw_grid%para%group_size - 1))
1483 :
1484 16492 : send_tasks = 0
1485 2604 : send_tasks(:, 1) = 1
1486 2604 : send_tasks(:, 2) = 0
1487 2604 : send_tasks(:, 3) = 1
1488 2604 : send_tasks(:, 4) = 0
1489 2604 : send_tasks(:, 5) = 1
1490 2604 : send_tasks(:, 6) = 0
1491 2604 : send_sizes = 0
1492 :
1493 16492 : recv_tasks = 0
1494 2604 : recv_tasks(:, 1) = 1
1495 2604 : recv_tasks(:, 2) = 0
1496 2604 : recv_tasks(:, 3) = 1
1497 2604 : recv_tasks(:, 4) = 0
1498 2604 : recv_tasks(:, 5) = 1
1499 2604 : recv_tasks(:, 6) = 0
1500 2604 : recv_sizes = 0
1501 :
1502 868 : my_rs_rank = rs%desc%my_pos
1503 868 : my_pw_rank = pw%pw_grid%para%rs_mpo
1504 :
1505 : ! find the processes that should hold our data
1506 : ! (this information should arguably be stored in the rs grid type)
1507 : ! this is a loop over real ranks (i.e. the in-order Cartesian ranks);
1508 : ! the recv and send tasks are built in two separate loops, which
1509 : ! load-balances better for OpenMP with large numbers of MPI tasks
1510 :
1511 : ! this is the reverse of rs2pw: what were the sends are now the recvs
1512 :
1513 : !$OMP PARALLEL DO DEFAULT(NONE), &
1514 : !$OMP PRIVATE(coords,idir,pos,lb_send,ub_send), &
1515 868 : !$OMP SHARED(rs,bounds,my_rs_rank,send_tasks,send_sizes,pw)
1516 : DO i = 0, pw%pw_grid%para%group_size - 1
1517 :
1518 : coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
1519 : ! calculate the real rs grid points on each processor
1520 : ! coords is the part of the grid that rank i actually holds
1521 : DO idir = 1, 3
1522 : pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1523 : pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1524 : lb_send(idir) = pos(1)
1525 : ub_send(idir) = pos(2)
1526 : END DO
1527 :
1528 : IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) CYCLE
1529 : IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) CYCLE
1530 : IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) CYCLE
1531 : IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) CYCLE
1532 :
1533 : send_tasks(i, 1) = MAX(lb_send(1), bounds(my_rs_rank, 1))
1534 : send_tasks(i, 2) = MIN(ub_send(1), bounds(my_rs_rank, 2))
1535 : send_tasks(i, 3) = MAX(lb_send(2), bounds(my_rs_rank, 3))
1536 : send_tasks(i, 4) = MIN(ub_send(2), bounds(my_rs_rank, 4))
1537 : send_tasks(i, 5) = lb_send(3)
1538 : send_tasks(i, 6) = ub_send(3)
1539 : send_sizes(i) = (send_tasks(i, 2) - send_tasks(i, 1) + 1)* &
1540 : (send_tasks(i, 4) - send_tasks(i, 3) + 1)*(send_tasks(i, 6) - send_tasks(i, 5) + 1)
1541 :
1542 : END DO
1543 : !$OMP END PARALLEL DO
1544 :
1545 3472 : coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
1546 3472 : DO idir = 1, 3
1547 2604 : pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1548 7812 : pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1549 2604 : lb_send(idir) = pos(1)
1550 3472 : ub_send(idir) = pos(2)
1551 : END DO
1552 :
1553 868 : lb_recv(:) = lb_send(:)
1554 868 : ub_recv(:) = ub_send(:)
1555 :
1556 : !$OMP PARALLEL DO DEFAULT(NONE), &
1557 868 : !$OMP SHARED(pw,lb_send,ub_send,bounds,recv_tasks,recv_sizes)
1558 : DO j = 0, pw%pw_grid%para%group_size - 1
1559 :
1560 : IF (ub_send(1) .LT. bounds(j, 1)) CYCLE
1561 : IF (lb_send(1) .GT. bounds(j, 2)) CYCLE
1562 : IF (ub_send(2) .LT. bounds(j, 3)) CYCLE
1563 : IF (lb_send(2) .GT. bounds(j, 4)) CYCLE
1564 :
1565 : recv_tasks(j, 1) = MAX(lb_send(1), bounds(j, 1))
1566 : recv_tasks(j, 2) = MIN(ub_send(1), bounds(j, 2))
1567 : recv_tasks(j, 3) = MAX(lb_send(2), bounds(j, 3))
1568 : recv_tasks(j, 4) = MIN(ub_send(2), bounds(j, 4))
1569 : recv_tasks(j, 5) = lb_send(3)
1570 : recv_tasks(j, 6) = ub_send(3)
1571 : recv_sizes(j) = (recv_tasks(j, 2) - recv_tasks(j, 1) + 1)* &
1572 : (recv_tasks(j, 4) - recv_tasks(j, 3) + 1)*(recv_tasks(j, 6) - recv_tasks(j, 5) + 1)
1573 :
1574 : END DO
1575 : !$OMP END PARALLEL DO
1576 :
1577 868 : send_disps(0) = 0
1578 868 : recv_disps(0) = 0
1579 1736 : DO i = 1, pw%pw_grid%para%group_size - 1
1580 868 : send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1581 1736 : recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1582 : END DO
1583 :
1584 6076 : CPASSERT(SUM(recv_sizes) == PRODUCT(ub_recv - lb_recv + 1))
1585 :
1586 4340 : ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
1587 5208 : ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
1588 :
1589 2604 : DO i = 0, rs%desc%group_size - 1
1590 1736 : IF (send_sizes(i) .NE. 0) THEN
1591 4962 : ALLOCATE (send_bufs(i)%array(send_sizes(i)))
1592 : ELSE
1593 82 : NULLIFY (send_bufs(i)%array)
1594 : END IF
1595 2604 : IF (recv_sizes(i) .NE. 0) THEN
1596 4962 : ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
1597 : ELSE
1598 82 : NULLIFY (recv_bufs(i)%array)
1599 : END IF
1600 : END DO
1601 :
1602 4340 : ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
1603 2604 : recv_reqs = mp_request_null
1604 :
1605 2604 : DO i = 0, rs%desc%group_size - 1
1606 2604 : IF (recv_sizes(i) .NE. 0) THEN
1607 1654 : CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
1608 : END IF
1609 : END DO
1610 :
1611 : ! do packing
1612 : !$OMP PARALLEL DO DEFAULT(NONE), &
1613 : !$OMP PRIVATE(k,z,y,x), &
1614 868 : !$OMP SHARED(pw,rs,send_tasks,send_bufs,send_disps)
1615 : DO i = 0, rs%desc%group_size - 1
1616 : k = 0
1617 : DO z = send_tasks(i, 5), send_tasks(i, 6)
1618 : DO y = send_tasks(i, 3), send_tasks(i, 4)
1619 : DO x = send_tasks(i, 1), send_tasks(i, 2)
1620 : k = k + 1
1621 : send_bufs(i)%array(k) = pw%array(x, y, z)
1622 : END DO
1623 : END DO
1624 : END DO
1625 : END DO
1626 : !$OMP END PARALLEL DO
1627 :
1628 4340 : ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
1629 2604 : send_reqs = mp_request_null
1630 :
1631 2604 : DO i = 0, rs%desc%group_size - 1
1632 2604 : IF (send_sizes(i) .NE. 0) THEN
1633 1654 : CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
1634 : END IF
1635 : END DO
1636 :
1637 : ! do unpacking
1638 : ! no OMP here so we can unpack each message as it arrives
1639 :
1640 2604 : DO i = 0, rs%desc%group_size - 1
1641 1736 : IF (recv_sizes(i) .EQ. 0) CYCLE
1642 :
1643 1654 : CALL mp_waitany(recv_reqs, completed)
1644 1654 : k = 0
1645 66400 : DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
1646 4768332 : DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
1647 195756703 : DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
1648 190990107 : k = k + 1
1649 195692825 : rs%r(x, y, z) = recv_bufs(completed - 1)%array(k)
1650 : END DO
1651 : END DO
1652 : END DO
1653 : END DO
1654 :
1655 868 : CALL mp_waitall(send_reqs)
1656 :
1657 868 : DEALLOCATE (recv_reqs)
1658 868 : DEALLOCATE (send_reqs)
1659 :
1660 2604 : DO i = 0, rs%desc%group_size - 1
1661 1736 : IF (ASSOCIATED(send_bufs(i)%array)) THEN
1662 1654 : DEALLOCATE (send_bufs(i)%array)
1663 : END IF
1664 2604 : IF (ASSOCIATED(recv_bufs(i)%array)) THEN
1665 1654 : DEALLOCATE (recv_bufs(i)%array)
1666 : END IF
1667 : END DO
1668 :
1669 868 : DEALLOCATE (send_bufs)
1670 868 : DEALLOCATE (recv_bufs)
1671 868 : DEALLOCATE (send_tasks)
1672 868 : DEALLOCATE (send_sizes)
1673 868 : DEALLOCATE (send_disps)
1674 868 : DEALLOCATE (recv_tasks)
1675 868 : DEALLOCATE (recv_sizes)
1676 868 : DEALLOCATE (recv_disps)
1677 :
1678 : ! now swap the halo ("wing") regions with the neighbours
1679 868 : halo_swapped = .FALSE.
1680 :
1681 3472 : DO idir = 1, 3
1682 :
1683 2604 : IF (rs%desc%perd(idir) /= 1) THEN
1684 :
1685 5532 : ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
1686 5532 : ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
1687 5532 : ushifts = 0
1688 5532 : dshifts = 0
1689 :
1690 3688 : DO n_shifts = 1, rs%desc%neighbours(idir)
1691 :
1692 : ! need to take into account the possible varying widths of neighbouring cells
1693 : ! ushifts and dshifts hold the accumulated widths of the neighbouring cells
1694 :
1695 1844 : position = MODULO(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
1696 1844 : neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1697 1844 : dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1698 :
1699 1844 : position = MODULO(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
1700 1844 : neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1701 1844 : ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1702 :
1703 : ! The border data has to be sent to/received from the neighbours
1704 : ! First we calculate the source and destination processes for the shift
1705 : ! The first shift is "downwards"
1706 :
1707 1844 : CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
1708 :
1709 7376 : lb_send_down(:) = rs%lb_local(:)
1710 7376 : ub_send_down(:) = rs%ub_local(:)
1711 7376 : lb_recv_down(:) = rs%lb_local(:)
1712 7376 : ub_recv_down(:) = rs%ub_local(:)
1713 :
1714 1844 : IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
1715 1844 : lb_send_down(idir) = lb_send_down(idir) + rs%desc%border
1716 : ub_send_down(idir) = MIN(ub_send_down(idir) - rs%desc%border, &
1717 1844 : lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
1718 :
1719 1844 : lb_recv_down(idir) = ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
1720 : ub_recv_down(idir) = MIN(ub_recv_down(idir), &
1721 1844 : ub_recv_down(idir) - rs%desc%border + ushifts(n_shifts))
1722 : ELSE
1723 0 : lb_send_down(idir) = 0
1724 0 : ub_send_down(idir) = -1
1725 0 : lb_recv_down(idir) = 0
1726 0 : ub_recv_down(idir) = -1
1727 : END IF
1728 :
1729 7376 : DO i = 1, 3
1730 7376 : IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
1731 1548 : lb_send_down(i) = rs%lb_real(i)
1732 1548 : ub_send_down(i) = rs%ub_real(i)
1733 1548 : lb_recv_down(i) = rs%lb_real(i)
1734 1548 : ub_recv_down(i) = rs%ub_real(i)
1735 : END IF
1736 : END DO
1737 :
1738 : ! allocate the recv buffer
1739 7376 : nn = PRODUCT(ub_recv_down - lb_recv_down + 1)
1740 0 : ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
1741 9220 : lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
1742 :
1743 : ! recv buffer is now ready, so post the receive
1744 1844 : CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
1745 :
1746 : ! now allocate, pack and send the send buffer
1747 7376 : nn = PRODUCT(ub_send_down - lb_send_down + 1)
1748 0 : ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
1749 9220 : lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
1750 :
1751 : !$OMP PARALLEL DEFAULT(NONE), &
1752 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1753 1844 : !$OMP SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
1754 : !$ num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
1755 : !$ my_id = omp_get_thread_num()
1756 : IF (my_id < num_threads) THEN
1757 : lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
1758 : ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
1759 :
1760 : send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
1761 : lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
1762 : lb_send_down(2):ub_send_down(2), lb:ub)
1763 : END IF
1764 : !$OMP END PARALLEL
1765 :
1766 1844 : CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
1767 :
1768 : ! Now for the other direction
1769 :
1770 1844 : CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
1771 :
1772 7376 : lb_send_up(:) = rs%lb_local(:)
1773 7376 : ub_send_up(:) = rs%ub_local(:)
1774 7376 : lb_recv_up(:) = rs%lb_local(:)
1775 7376 : ub_recv_up(:) = rs%ub_local(:)
1776 :
1777 1844 : IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
1778 1844 : ub_send_up(idir) = ub_send_up(idir) - rs%desc%border
1779 : lb_send_up(idir) = MAX(lb_send_up(idir) + rs%desc%border, &
1780 1844 : ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
1781 :
1782 1844 : ub_recv_up(idir) = lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
1783 : lb_recv_up(idir) = MAX(lb_recv_up(idir), &
1784 1844 : lb_recv_up(idir) + rs%desc%border - dshifts(n_shifts))
1785 : ELSE
1786 0 : lb_send_up(idir) = 0
1787 0 : ub_send_up(idir) = -1
1788 0 : lb_recv_up(idir) = 0
1789 0 : ub_recv_up(idir) = -1
1790 : END IF
1791 :
1792 7376 : DO i = 1, 3
1793 7376 : IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
1794 1548 : lb_send_up(i) = rs%lb_real(i)
1795 1548 : ub_send_up(i) = rs%ub_real(i)
1796 1548 : lb_recv_up(i) = rs%lb_real(i)
1797 1548 : ub_recv_up(i) = rs%ub_real(i)
1798 : END IF
1799 : END DO
1800 :
1801 : ! allocate the recv buffer
1802 7376 : nn = PRODUCT(ub_recv_up - lb_recv_up + 1)
1803 0 : ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
1804 9220 : lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
1805 :
1806 : ! recv buffer is now ready, so post the receive
1807 :
1808 1844 : CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
1809 :
1810 : ! now allocate, pack and send the send buffer
1811 7376 : nn = PRODUCT(ub_send_up - lb_send_up + 1)
1812 0 : ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
1813 9220 : lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
1814 :
1815 : !$OMP PARALLEL DEFAULT(NONE), &
1816 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1817 1844 : !$OMP SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
1818 : !$ num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
1819 : !$ my_id = omp_get_thread_num()
1820 : IF (my_id < num_threads) THEN
1821 : lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
1822 : ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
1823 :
1824 : send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
1825 : lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
1826 : lb_send_up(2):ub_send_up(2), lb:ub)
1827 : END IF
1828 : !$OMP END PARALLEL
1829 :
1830 1844 : CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
1831 :
1832 : ! wait for a recv to complete, then we can unpack
1833 :
1834 5532 : DO i = 1, 2
1835 :
1836 3688 : CALL mp_waitany(req(1:2), completed)
1837 :
1838 5532 : IF (completed .EQ. 1) THEN
1839 :
1840 : ! only some procs may need later shifts
1841 1844 : IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
1842 :
1843 : ! Copy the data into the rs grid (plain assignment; halos are not accumulated here)
1844 : !$OMP PARALLEL DEFAULT(NONE), &
1845 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1846 1844 : !$OMP SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
1847 : !$ num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
1848 : !$ my_id = omp_get_thread_num()
1849 : IF (my_id < num_threads) THEN
1850 : lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
1851 : ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
1852 :
1853 : rs%r(lb_recv_down(1):ub_recv_down(1), lb_recv_down(2):ub_recv_down(2), &
1854 : lb:ub) = recv_buf_3d_down(:, :, lb:ub)
1855 : END IF
1856 : !$OMP END PARALLEL
1857 : END IF
1858 :
1859 1844 : DEALLOCATE (recv_buf_3d_down)
1860 : ELSE
1861 :
1862 : ! only some procs may need later shifts
1863 1844 : IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
1864 :
1865 : ! Copy the data into the rs grid
1866 : !$OMP PARALLEL DEFAULT(NONE), &
1867 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1868 1844 : !$OMP SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
1869 : !$ num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
1870 : !$ my_id = omp_get_thread_num()
1871 : IF (my_id < num_threads) THEN
1872 : lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
1873 : ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
1874 :
1875 : rs%r(lb_recv_up(1):ub_recv_up(1), lb_recv_up(2):ub_recv_up(2), &
1876 : lb:ub) = recv_buf_3d_up(:, :, lb:ub)
1877 : END IF
1878 : !$OMP END PARALLEL
1879 : END IF
1880 :
1881 1844 : DEALLOCATE (recv_buf_3d_up)
1882 : END IF
1883 : END DO
1884 :
1885 1844 : CALL mp_waitall(req(3:4))
1886 :
1887 1844 : DEALLOCATE (send_buf_3d_down)
1888 5532 : DEALLOCATE (send_buf_3d_up)
1889 : END DO
1890 :
1891 1844 : DEALLOCATE (ushifts)
1892 1844 : DEALLOCATE (dshifts)
1893 : END IF
1894 :
1895 3472 : halo_swapped(idir) = .TRUE.
1896 :
1897 : END DO
1898 :
1899 868 : END SUBROUTINE transfer_pw2rs_distributed
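      :
      : ! **************************************************************************************************
      : !> \brief Illustrative sketch only (name hypothetical, not part of the module API):
      : !>        the integer slab partition used by the OMP pack/unpack blocks above.
      : !>        Thread my_id of num_threads gets the sub-range [lb, ub] of [lo, hi];
      : !>        the sub-ranges tile [lo, hi] exactly and never overlap, with any
      : !>        remainder spread across the threads.
      : ! **************************************************************************************************
      : PURE SUBROUTINE example_slab_bounds(lo, hi, my_id, num_threads, lb, ub)
      : INTEGER, INTENT(IN) :: lo, hi, my_id, num_threads
      : INTEGER, INTENT(OUT) :: lb, ub
      :
      : ! same arithmetic as the halo pack/unpack loops above
      : lb = lo + ((hi - lo + 1)*my_id)/num_threads
      : ub = lo + ((hi - lo + 1)*(my_id + 1))/num_threads - 1
      : END SUBROUTINE example_slab_bounds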
1900 :
1901 : ! **************************************************************************************************
1902 : !> \brief Initialize grid to zero
1903 : !> \param rs ...
1904 : !> \par History
1905 : !> none
1906 : !> \author JGH (23-Mar-2002)
1907 : ! **************************************************************************************************
1908 326052 : SUBROUTINE rs_grid_zero(rs)
1909 :
1910 : TYPE(realspace_grid_type), INTENT(IN) :: rs
1911 :
1912 : CHARACTER(len=*), PARAMETER :: routineN = 'rs_grid_zero'
1913 :
1914 : INTEGER :: handle, i, j, k, l(3), u(3)
1915 :
1916 326052 : CALL timeset(routineN, handle)
1917 978156 : l(1) = LBOUND(rs%r, 1); l(2) = LBOUND(rs%r, 2); l(3) = LBOUND(rs%r, 3)
1918 978156 : u(1) = UBOUND(rs%r, 1); u(2) = UBOUND(rs%r, 2); u(3) = UBOUND(rs%r, 3)
1919 : !$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
1920 : !$OMP PRIVATE(i,j,k) &
1921 326052 : !$OMP SHARED(rs,l,u)
1922 : DO k = l(3), u(3)
1923 : DO j = l(2), u(2)
1924 : DO i = l(1), u(1)
1925 : rs%r(i, j, k) = 0.0_dp
1926 : END DO
1927 : END DO
1928 : END DO
1929 : !$OMP END PARALLEL DO
1930 326052 : CALL timestop(handle)
1931 :
1932 326052 : END SUBROUTINE rs_grid_zero
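      :
      : ! **************************************************************************************************
      : !> \brief Illustrative sketch only (name hypothetical, not part of the module API):
      : !>        the COLLAPSE(3) pattern used by rs_grid_zero, shown on a plain array.
      : !>        Collapsing merges the three loops so that all grid points, not just
      : !>        the outer k-planes, are distributed over the threads.
      : ! **************************************************************************************************
      : SUBROUTINE example_zero_3d(a)
      : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(OUT) :: a
      :
      : INTEGER :: i, j, k
      :
      : !$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) PRIVATE(i,j,k) SHARED(a)
      : DO k = 1, SIZE(a, 3)
      : DO j = 1, SIZE(a, 2)
      : DO i = 1, SIZE(a, 1)
      : a(i, j, k) = 0.0_dp
      : END DO
      : END DO
      : END DO
      : !$OMP END PARALLEL DO
      : END SUBROUTINE example_zero_3d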
1933 :
1934 : ! **************************************************************************************************
1935 : !> \brief rs1(i) = rs1(i) + scalar*rs2(i)*rs3(i)
1936 : !> \param rs1 ...
1937 : !> \param rs2 ...
1938 : !> \param rs3 ...
1939 : !> \param scalar ...
1940 : !> \par History
1941 : !> none
1943 : ! **************************************************************************************************
1944 1404 : SUBROUTINE rs_grid_mult_and_add(rs1, rs2, rs3, scalar)
1945 :
1946 : TYPE(realspace_grid_type), INTENT(IN) :: rs1, rs2, rs3
1947 : REAL(dp), INTENT(IN) :: scalar
1948 :
1949 : CHARACTER(len=*), PARAMETER :: routineN = 'rs_grid_mult_and_add'
1950 :
1951 : INTEGER :: handle, i, j, k, l(3), u(3)
1952 :
1953 : !-----------------------------------------------------------------------------!
1954 :
1955 1404 : CALL timeset(routineN, handle)
1956 1404 : IF (scalar .NE. 0.0_dp) THEN
1957 4212 : l(1) = LBOUND(rs1%r, 1); l(2) = LBOUND(rs1%r, 2); l(3) = LBOUND(rs1%r, 3)
1958 4212 : u(1) = UBOUND(rs1%r, 1); u(2) = UBOUND(rs1%r, 2); u(3) = UBOUND(rs1%r, 3)
1959 : !$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
1960 : !$OMP PRIVATE(i,j,k) &
1961 1404 : !$OMP SHARED(rs1,rs2,rs3,scalar,l,u)
1962 : DO k = l(3), u(3)
1963 : DO j = l(2), u(2)
1964 : DO i = l(1), u(1)
1965 : rs1%r(i, j, k) = rs1%r(i, j, k) + scalar*rs2%r(i, j, k)*rs3%r(i, j, k)
1966 : END DO
1967 : END DO
1968 : END DO
1969 : !$OMP END PARALLEL DO
1970 : END IF
1971 1404 : CALL timestop(handle)
1972 1404 : END SUBROUTINE rs_grid_mult_and_add
1973 :
1974 : ! **************************************************************************************************
1975 : !> \brief Set box matrix info for real space grid
1976 : !> This is needed for variable cell simulations
1977 : !> \param pw_grid ...
1978 : !> \param rs ...
1979 : !> \par History
1980 : !> none
1981 : !> \author JGH (15-May-2007)
1982 : ! **************************************************************************************************
1983 166222 : SUBROUTINE rs_grid_set_box(pw_grid, rs)
1984 :
1985 : TYPE(pw_grid_type), INTENT(IN), TARGET :: pw_grid
1986 : TYPE(realspace_grid_type), INTENT(IN) :: rs
1987 :
1988 166222 : CPASSERT(ASSOCIATED(rs%desc%pw, pw_grid))
1989 2160886 : rs%desc%dh = pw_grid%dh
1990 2160886 : rs%desc%dh_inv = pw_grid%dh_inv
1991 :
1992 166222 : END SUBROUTINE rs_grid_set_box
1993 :
1994 : ! **************************************************************************************************
1995 : !> \brief retains the given rs grid descriptor (see doc/ReferenceCounting.html)
1996 : !> \param rs_desc the grid descriptor to retain
1997 : !> \par History
1998 : !> 04.2009 created [Iain Bethune]
1999 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2000 : ! **************************************************************************************************
2001 225574 : SUBROUTINE rs_grid_retain_descriptor(rs_desc)
2002 : TYPE(realspace_grid_desc_type), INTENT(INOUT) :: rs_desc
2003 :
2004 225574 : CPASSERT(rs_desc%ref_count > 0)
2005 225574 : rs_desc%ref_count = rs_desc%ref_count + 1
2006 225574 : END SUBROUTINE rs_grid_retain_descriptor
2007 :
2008 : ! **************************************************************************************************
2009 : !> \brief releases the given rs grid (see doc/ReferenceCounting.html)
2010 : !> \param rs_grid the rs grid to release
2011 : !> \par History
2012 : !> 03.2003 created [fawzi]
2013 : !> \author fawzi
2014 : ! **************************************************************************************************
2015 225024 : SUBROUTINE rs_grid_release(rs_grid)
2016 : TYPE(realspace_grid_type), INTENT(INOUT) :: rs_grid
2017 :
2018 225024 : CALL rs_grid_release_descriptor(rs_grid%desc)
2019 :
2020 225024 : CALL offload_free_buffer(rs_grid%buffer)
2021 225024 : NULLIFY (rs_grid%r)
2022 :
2023 225024 : IF (ALLOCATED(rs_grid%px)) DEALLOCATE (rs_grid%px)
2024 225024 : IF (ALLOCATED(rs_grid%py)) DEALLOCATE (rs_grid%py)
2025 225024 : IF (ALLOCATED(rs_grid%pz)) DEALLOCATE (rs_grid%pz)
2026 225024 : END SUBROUTINE rs_grid_release
2027 :
2028 : ! **************************************************************************************************
2029 : !> \brief releases the given rs grid descriptor (see doc/ReferenceCounting.html)
2030 : !> \param rs_desc the rs grid descriptor to release
2031 : !> \par History
2032 : !> 04.2009 created [Iain Bethune]
2033 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2034 : ! **************************************************************************************************
2035 259489 : SUBROUTINE rs_grid_release_descriptor(rs_desc)
2036 : TYPE(realspace_grid_desc_type), POINTER :: rs_desc
2037 :
2038 259489 : IF (ASSOCIATED(rs_desc)) THEN
2039 255870 : CPASSERT(rs_desc%ref_count > 0)
2040 255870 : rs_desc%ref_count = rs_desc%ref_count - 1
2041 255870 : IF (rs_desc%ref_count == 0) THEN
2042 :
2043 30296 : CALL pw_grid_release(rs_desc%pw)
2044 :
2045 30296 : IF (rs_desc%parallel) THEN
2046 : ! release the group communicator
2047 26390 : CALL rs_desc%group%free()
2048 :
2049 26390 : DEALLOCATE (rs_desc%virtual2real)
2050 26390 : DEALLOCATE (rs_desc%real2virtual)
2051 : END IF
2052 :
2053 30296 : IF (rs_desc%distributed) THEN
2054 158 : DEALLOCATE (rs_desc%rank2coord)
2055 158 : DEALLOCATE (rs_desc%coord2rank)
2056 158 : DEALLOCATE (rs_desc%lb_global)
2057 158 : DEALLOCATE (rs_desc%ub_global)
2058 158 : DEALLOCATE (rs_desc%x2coord)
2059 158 : DEALLOCATE (rs_desc%y2coord)
2060 158 : DEALLOCATE (rs_desc%z2coord)
2061 : END IF
2062 :
2063 30296 : DEALLOCATE (rs_desc)
2064 : END IF
2065 : END IF
2066 259489 : NULLIFY (rs_desc)
2067 259489 : END SUBROUTINE rs_grid_release_descriptor
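      :
      : ! **************************************************************************************************
      : !> \brief Illustrative usage sketch only (name hypothetical, not part of the module
      : !>        API): the reference-counting protocol for descriptors. Every stored copy
      : !>        of a descriptor pointer is matched by one retain, and by one release once
      : !>        the reference is dropped.
      : ! **************************************************************************************************
      : SUBROUTINE example_keep_descriptor(src)
      : TYPE(realspace_grid_desc_type), POINTER :: src
      :
      : TYPE(realspace_grid_desc_type), POINTER :: copy
      :
      : copy => src
      : CALL rs_grid_retain_descriptor(copy) ! we now co-own the descriptor
      : ! ... use copy ...
      : CALL rs_grid_release_descriptor(copy) ! drops our reference and nullifies copy
      : END SUBROUTINE example_keep_descriptor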
2068 :
2069 : ! **************************************************************************************************
2070 : !> \brief emulates the function of an MPI_cart_shift operation, but the shift is
2071 : !> done in virtual coordinates, and the corresponding real ranks are returned
2072 : !> \param rs_grid ...
2073 : !> \param dir ...
2074 : !> \param disp ...
2075 : !> \param source ...
2076 : !> \param dest ...
2077 : !> \par History
2078 : !> 04.2009 created [Iain Bethune]
2079 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2080 : ! **************************************************************************************************
2081 7368 : PURE SUBROUTINE cart_shift(rs_grid, dir, disp, source, dest)
2082 :
2083 : TYPE(realspace_grid_type), INTENT(IN) :: rs_grid
2084 : INTEGER, INTENT(IN) :: dir, disp
2085 : INTEGER, INTENT(OUT) :: source, dest
2086 :
2087 : INTEGER, DIMENSION(3) :: shift_coords
2088 :
2089 29472 : shift_coords = rs_grid%desc%virtual_group_coor
2090 7368 : shift_coords(dir) = MODULO(shift_coords(dir) + disp, rs_grid%desc%group_dim(dir))
2091 7368 : dest = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
2092 29472 : shift_coords = rs_grid%desc%virtual_group_coor
2093 7368 : shift_coords(dir) = MODULO(shift_coords(dir) - disp, rs_grid%desc%group_dim(dir))
2094 7368 : source = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
2095 :
2096 7368 : END SUBROUTINE cart_shift
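      :
      : ! **************************************************************************************************
      : !> \brief Illustrative sketch only (name hypothetical, not part of the module API):
      : !>        the periodic shift arithmetic behind cart_shift, on a 1D ring of size n.
      : !>        MODULO (unlike MOD) stays in [0, n-1] also for negative displacements.
      : ! **************************************************************************************************
      : PURE SUBROUTINE example_ring_shift(coord, disp, n, source, dest)
      : INTEGER, INTENT(IN) :: coord, disp, n
      : INTEGER, INTENT(OUT) :: source, dest
      :
      : dest = MODULO(coord + disp, n) ! the rank we send to
      : source = MODULO(coord - disp, n) ! the rank we receive from
      : END SUBROUTINE example_ring_shift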
2097 :
2098 : ! **************************************************************************************************
2099 : !> \brief returns the maximum number of points in the local grid of any process
2100 : !> to account for the case where the grid may later be reordered
2101 : !> \param desc ...
2102 : !> \return ...
2103 : !> \par History
2104 : !> 10.2011 created [Iain Bethune]
2105 : ! **************************************************************************************************
2106 0 : FUNCTION rs_grid_max_ngpts(desc) RESULT(max_ngpts)
2107 : TYPE(realspace_grid_desc_type), INTENT(IN) :: desc
2108 : INTEGER :: max_ngpts
2109 :
2110 : CHARACTER(len=*), PARAMETER :: routineN = 'rs_grid_max_ngpts'
2111 :
2112 : INTEGER :: handle, i
2113 : INTEGER, DIMENSION(3) :: lb, ub
2114 :
2115 0 : CALL timeset(routineN, handle)
2116 :
2117 0 : max_ngpts = 0
2118 0 : IF ((desc%pw%para%mode == PW_MODE_LOCAL) .OR. &
2119 : (ALL(desc%group_dim == 1))) THEN
2120 0 : CPASSERT(PRODUCT(INT(desc%npts, KIND=int_8)) < HUGE(1))
2121 0 : max_ngpts = PRODUCT(desc%npts)
2122 : ELSE
2123 0 : DO i = 0, desc%group_size - 1
2124 0 : lb = desc%lb_global(:, i)
2125 0 : ub = desc%ub_global(:, i)
2126 0 : lb = lb - desc%border*(1 - desc%perd)
2127 0 : ub = ub + desc%border*(1 - desc%perd)
2128 0 : CPASSERT(PRODUCT(INT(ub - lb + 1, KIND=int_8)) < HUGE(1))
2129 0 : max_ngpts = MAX(max_ngpts, PRODUCT(ub - lb + 1))
2130 : END DO
2131 : END IF
2132 :
2133 0 : CALL timestop(handle)
2134 :
2135 0 : END FUNCTION rs_grid_max_ngpts
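      :
      : ! **************************************************************************************************
      : !> \brief Illustrative sketch only (name hypothetical, not part of the module API):
      : !>        the overflow guard used above. The product of the grid extents is formed
      : !>        in 8-byte integers first, and narrowed back to a default integer only
      : !>        once it is known to fit.
      : ! **************************************************************************************************
      : FUNCTION example_checked_product(npts) RESULT(n)
      : INTEGER, DIMENSION(3), INTENT(IN) :: npts
      : INTEGER :: n
      :
      : CPASSERT(PRODUCT(INT(npts, KIND=int_8)) < HUGE(1))
      : n = PRODUCT(npts)
      : END FUNCTION example_checked_product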
2136 :
2137 : ! **************************************************************************************************
2138 : !> \brief ...
2139 : !> \param rs_grid ...
2140 : !> \param h_inv ...
2141 : !> \param ra ...
2142 : !> \param offset ...
2143 : !> \param group_size ...
2144 : !> \param my_pos ...
2145 : !> \return ...
2146 : ! **************************************************************************************************
2147 1500159 : PURE LOGICAL FUNCTION map_gaussian_here(rs_grid, h_inv, ra, offset, group_size, my_pos) RESULT(res)
2148 : TYPE(realspace_grid_type), INTENT(IN) :: rs_grid
2149 : REAL(KIND=dp), DIMENSION(3, 3), INTENT(IN) :: h_inv
2150 : REAL(KIND=dp), DIMENSION(3), INTENT(IN) :: ra
2151 : INTEGER, INTENT(IN), OPTIONAL :: offset, group_size, my_pos
2152 :
2153 : INTEGER :: dir, lb(3), location(3), tp(3), ub(3)
2154 :
2155 1500159 : res = .FALSE.
2156 :
2157 6000628 : IF (.NOT. ALL(rs_grid%desc%perd == 1)) THEN
2158 32 : DO dir = 1, 3
2159 : ! bounds of local grid (i.e. removing the 'wings'), if periodic
2160 96 : tp(dir) = FLOOR(DOT_PRODUCT(h_inv(dir, :), ra)*rs_grid%desc%npts(dir))
2161 24 : tp(dir) = MODULO(tp(dir), rs_grid%desc%npts(dir))
2162 24 : IF (rs_grid%desc%perd(dir) .NE. 1) THEN
2163 8 : lb(dir) = rs_grid%lb_local(dir) + rs_grid%desc%border
2164 8 : ub(dir) = rs_grid%ub_local(dir) - rs_grid%desc%border
2165 : ELSE
2166 16 : lb(dir) = rs_grid%lb_local(dir)
2167 16 : ub(dir) = rs_grid%ub_local(dir)
2168 : END IF
2169 : ! distributed grid, only map if it is local to the grid
2170 32 : location(dir) = tp(dir) + rs_grid%desc%lb(dir)
2171 : END DO
2172 60 : IF (ALL(lb(:) <= location(:)) .AND. ALL(location(:) <= ub(:))) THEN
2173 4 : res = .TRUE.
2174 : END IF
2175 : ELSE
2176 1500151 : IF (PRESENT(offset) .AND. PRESENT(group_size) .AND. PRESENT(my_pos)) THEN
2177 : ! not distributed, just a round-robin distribution over the full set of CPUs
2178 1500151 : IF (MODULO(offset, group_size) == my_pos) res = .TRUE.
2179 : END IF
2180 : END IF
2181 :
2182 1500159 : END FUNCTION map_gaussian_here
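      :
      : ! **************************************************************************************************
      : !> \brief Illustrative sketch only (name hypothetical, not part of the module API):
      : !>        the round-robin ownership test used above for replicated grids; each
      : !>        task number "offset" is mapped to exactly one of group_size processes.
      : ! **************************************************************************************************
      : PURE LOGICAL FUNCTION example_owns_task(offset, group_size, my_pos) RESULT(res)
      : INTEGER, INTENT(IN) :: offset, group_size, my_pos
      :
      : ! tasks 0, group_size, 2*group_size, ... land on process 0, etc.
      : res = MODULO(offset, group_size) == my_pos
      : END FUNCTION example_owns_task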
2183 :
2184 0 : END MODULE realspace_grid_types