Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2025 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief generate the tasks lists used by collocate and integrate routines
10 : !> \par History
11 : !> 01.2008 [Joost VandeVondele] refactored out of qs_collocate / qs_integrate
12 : !> \author Joost VandeVondele
13 : ! **************************************************************************************************
14 : MODULE task_list_methods
15 : USE offload_api, ONLY: offload_create_buffer, offload_buffer_type
16 : USE grid_api, ONLY: grid_create_basis_set, grid_create_task_list
17 : USE ao_util, ONLY: exp_radius_very_extended
18 : USE basis_set_types, ONLY: get_gto_basis_set, &
19 : gto_basis_set_p_type, &
20 : gto_basis_set_type
21 : USE cell_types, ONLY: cell_type, &
22 : pbc
23 : USE cp_control_types, ONLY: dft_control_type
24 : USE cube_utils, ONLY: compute_cube_center, &
25 : cube_info_type, &
26 : return_cube, &
27 : return_cube_nonortho
28 : USE cp_dbcsr_api, ONLY: dbcsr_convert_sizes_to_offsets, &
29 : dbcsr_finalize, &
30 : dbcsr_get_block_p, &
31 : dbcsr_get_info, &
32 : dbcsr_p_type, &
33 : dbcsr_put_block, &
34 : dbcsr_type, &
35 : dbcsr_work_create
36 : USE gaussian_gridlevels, ONLY: gaussian_gridlevel, &
37 : gridlevel_info_type
38 : USE kinds, ONLY: default_string_length, &
39 : dp, &
40 : int_8
41 : USE kpoint_types, ONLY: get_kpoint_info, &
42 : kpoint_type
43 : USE memory_utilities, ONLY: reallocate
44 : USE message_passing, ONLY: &
45 : mp_comm_type
46 : USE particle_types, ONLY: particle_type
47 : USE particle_methods, ONLY: get_particle_set
48 : USE pw_env_types, ONLY: pw_env_get, &
49 : pw_env_type
50 : USE qs_kind_types, ONLY: get_qs_kind, &
51 : qs_kind_type
52 : USE qs_ks_types, ONLY: get_ks_env, &
53 : qs_ks_env_type
54 : USE qs_neighbor_list_types, ONLY: neighbor_list_set_p_type
55 : USE realspace_grid_types, ONLY: realspace_grid_desc_p_type, &
56 : realspace_grid_desc_type, &
57 : rs_grid_create, &
58 : rs_grid_locate_rank, &
59 : rs_grid_release, &
60 : rs_grid_reorder_ranks, realspace_grid_type
61 : USE task_list_types, ONLY: deserialize_task, &
62 : reallocate_tasks, &
63 : serialize_task, &
64 : task_list_type, &
65 : atom_pair_type, &
66 : task_size_in_int8, &
67 : task_type
68 : USE util, ONLY: sort
69 :
70 : !$ USE OMP_LIB, ONLY: omp_destroy_lock, omp_get_num_threads, omp_init_lock, &
71 : !$ omp_lock_kind, omp_set_lock, omp_unset_lock, omp_get_max_threads
72 : #include "./base/base_uses.f90"
73 :
74 : #:include './common/array_sort.fypp'
75 :
76 : IMPLICIT NONE
77 :
78 : LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .FALSE.
79 :
80 : PRIVATE
81 :
82 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'task_list_methods'
83 :
84 : PUBLIC :: generate_qs_task_list, &
85 : task_list_inner_loop
86 : PUBLIC :: distribute_tasks, &
87 : rs_distribute_matrix, &
88 : rs_scatter_matrices, &
89 : rs_gather_matrices, &
90 : rs_copy_to_buffer, &
91 : rs_copy_to_matrices
92 :
93 : CONTAINS
94 :
95 : ! **************************************************************************************************
96 : !> \brief ...
97 : !> \param ks_env ...
98 : !> \param task_list ...
99 : !> \param basis_type ...
100 : !> \param reorder_rs_grid_ranks Flag that indicates whether this routine should
101 : !> overwrite the rs descriptor (see the note below)
102 : !> \param skip_load_balance_distributed ...
103 : !> \param pw_env_external ...
104 : !> \param sab_orb_external ...
105 : !> \par History
106 : !> 01.2008 factored out of calculate_rho_elec [Joost VandeVondele]
107 : !> 04.2010 divides tasks into grid levels and atom pairs for integrate/collocate [Iain Bethune]
108 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2010 on behalf of the HECToR project
109 : !> 06.2015 adjusted to be used with multiple images (k-points) [JGH]
110 : !> \note If this routine is called several times with different task lists,
111 : !> the default behaviour is to re-optimize the grid ranks and overwrite
112 : !> the rs descriptor and grids. reorder_rs_grid_ranks = .FALSE. prevents the code
113 : !> from performing a new optimization by leaving the rank order in
114 : !> its current state.
115 : ! **************************************************************************************************
116 14110 : SUBROUTINE generate_qs_task_list(ks_env, task_list, basis_type, &
117 : reorder_rs_grid_ranks, skip_load_balance_distributed, &
118 : pw_env_external, sab_orb_external)
119 :
120 : TYPE(qs_ks_env_type), POINTER :: ks_env
121 : TYPE(task_list_type), POINTER :: task_list
122 : CHARACTER(LEN=*), INTENT(IN) :: basis_type
123 : LOGICAL, INTENT(IN) :: reorder_rs_grid_ranks, &
124 : skip_load_balance_distributed
125 : TYPE(pw_env_type), OPTIONAL, POINTER :: pw_env_external
126 : TYPE(neighbor_list_set_p_type), DIMENSION(:), &
127 : OPTIONAL, POINTER :: sab_orb_external
128 :
129 : CHARACTER(LEN=*), PARAMETER :: routineN = 'generate_qs_task_list'
130 : INTEGER, PARAMETER :: max_tasks = 2000
131 :
132 : INTEGER :: cindex, curr_tasks, handle, i, iatom, iatom_old, igrid_level, igrid_level_old, &
133 : ikind, ilevel, img, img_old, ipair, ipgf, iset, itask, jatom, jatom_old, jkind, jpgf, &
134 : jset, maxpgf, maxset, natoms, nimages, nkind, nseta, nsetb, slot
135 14110 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: blocks
136 : INTEGER, DIMENSION(3) :: cellind
137 14110 : INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, npgfa, &
138 14110 : npgfb, nsgf
139 14110 : INTEGER, DIMENSION(:, :, :), POINTER :: cell_to_index
140 : LOGICAL :: dokp
141 : REAL(KIND=dp) :: kind_radius_a, kind_radius_b
142 : REAL(KIND=dp), DIMENSION(3) :: ra, rab
143 14110 : REAL(KIND=dp), DIMENSION(:), POINTER :: set_radius_a, set_radius_b
144 14110 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: rpgfa, rpgfb, zeta, zetb
145 : TYPE(cell_type), POINTER :: cell
146 14110 : TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info
147 : TYPE(dft_control_type), POINTER :: dft_control
148 : TYPE(gridlevel_info_type), POINTER :: gridlevel_info
149 14110 : TYPE(gto_basis_set_p_type), DIMENSION(:), POINTER :: basis_set_list
150 : TYPE(gto_basis_set_type), POINTER :: basis_set_a, basis_set_b, orb_basis_set
151 : TYPE(kpoint_type), POINTER :: kpoints
152 : TYPE(neighbor_list_set_p_type), DIMENSION(:), &
153 14110 : POINTER :: sab_orb
154 14110 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
155 : TYPE(pw_env_type), POINTER :: pw_env
156 14110 : TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
157 : TYPE(qs_kind_type), POINTER :: qs_kind
158 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
159 14110 : POINTER :: rs_descs
160 14110 : TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
161 14110 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
162 :
163 14110 : CALL timeset(routineN, handle)
164 :
165 : CALL get_ks_env(ks_env, &
166 : qs_kind_set=qs_kind_set, &
167 : cell=cell, &
168 : particle_set=particle_set, &
169 14110 : dft_control=dft_control)
170 :
171 14110 : CALL get_ks_env(ks_env, sab_orb=sab_orb)
172 14110 : IF (PRESENT(sab_orb_external)) sab_orb => sab_orb_external
173 :
174 14110 : CALL get_ks_env(ks_env, pw_env=pw_env)
175 14110 : IF (PRESENT(pw_env_external)) pw_env => pw_env_external
176 14110 : CALL pw_env_get(pw_env, rs_descs=rs_descs, rs_grids=rs_grids)
177 :
178 : ! *** assign from pw_env
179 14110 : gridlevel_info => pw_env%gridlevel_info
180 14110 : cube_info => pw_env%cube_info
181 :
182 : ! find maximum numbers
183 14110 : nkind = SIZE(qs_kind_set)
184 14110 : natoms = SIZE(particle_set)
185 14110 : maxset = 0
186 14110 : maxpgf = 0
187 39275 : DO ikind = 1, nkind
188 25165 : qs_kind => qs_kind_set(ikind)
189 : CALL get_qs_kind(qs_kind=qs_kind, &
190 25165 : basis_set=orb_basis_set, basis_type=basis_type)
191 :
192 25165 : IF (.NOT. ASSOCIATED(orb_basis_set)) CYCLE
193 25165 : CALL get_gto_basis_set(gto_basis_set=orb_basis_set, npgf=npgfa, nset=nseta)
194 :
195 25165 : maxset = MAX(nseta, maxset)
196 92336 : maxpgf = MAX(MAXVAL(npgfa), maxpgf)
197 : END DO
198 :
199 : ! kpoint related
200 14110 : nimages = dft_control%nimages
201 14110 : IF (nimages > 1) THEN
202 286 : dokp = .TRUE.
203 286 : NULLIFY (kpoints)
204 286 : CALL get_ks_env(ks_env=ks_env, kpoints=kpoints)
205 286 : CALL get_kpoint_info(kpoint=kpoints, cell_to_index=cell_to_index)
206 : ELSE
207 13824 : dokp = .FALSE.
208 13824 : NULLIFY (cell_to_index)
209 : END IF
210 :
211 : ! free the atom_pair lists if allocated
212 14110 : IF (ASSOCIATED(task_list%atom_pair_send)) DEALLOCATE (task_list%atom_pair_send)
213 14110 : IF (ASSOCIATED(task_list%atom_pair_recv)) DEALLOCATE (task_list%atom_pair_recv)
214 :
215 : ! construct a list of all tasks
216 14110 : IF (.NOT. ASSOCIATED(task_list%tasks)) THEN
217 8432 : CALL reallocate_tasks(task_list%tasks, max_tasks)
218 : END IF
219 14110 : task_list%ntasks = 0
220 14110 : curr_tasks = SIZE(task_list%tasks)
221 :
222 67495 : ALLOCATE (basis_set_list(nkind))
223 39275 : DO ikind = 1, nkind
224 25165 : qs_kind => qs_kind_set(ikind)
225 : CALL get_qs_kind(qs_kind=qs_kind, basis_set=basis_set_a, &
226 25165 : basis_type=basis_type)
227 39275 : IF (ASSOCIATED(basis_set_a)) THEN
228 25165 : basis_set_list(ikind)%gto_basis_set => basis_set_a
229 : ELSE
230 0 : NULLIFY (basis_set_list(ikind)%gto_basis_set)
231 : END IF
232 : END DO
233 : !!$OMP PARALLEL DEFAULT(NONE) &
234 : !!$OMP SHARED (sab_orb, dokp, basis_set_list, task_list, rs_descs, dft_control, cube_info, gridlevel_info, &
235 : !!$OMP curr_tasks, maxpgf, maxset, natoms, nimages, particle_set, cell_to_index, cell) &
236 : !!$OMP PRIVATE (ikind, jkind, iatom, jatom, rab, cellind, basis_set_a, basis_set_b, ra, &
237 : !!$OMP la_max, la_min, npgfa, nseta, rpgfa, set_radius_a, kind_radius_a, zeta, &
238 : !!$OMP lb_max, lb_min, npgfb, nsetb, rpgfb, set_radius_b, kind_radius_b, zetb, &
239 : !!$OMP cindex, slot)
240 : ! Loop over neighbor list
241 : !!$OMP DO SCHEDULE(GUIDED)
242 1381393 : DO slot = 1, sab_orb(1)%nl_size
243 1367283 : ikind = sab_orb(1)%nlist_task(slot)%ikind
244 1367283 : jkind = sab_orb(1)%nlist_task(slot)%jkind
245 1367283 : iatom = sab_orb(1)%nlist_task(slot)%iatom
246 1367283 : jatom = sab_orb(1)%nlist_task(slot)%jatom
247 5469132 : rab(1:3) = sab_orb(1)%nlist_task(slot)%r(1:3)
248 5469132 : cellind(1:3) = sab_orb(1)%nlist_task(slot)%cell(1:3)
249 :
250 1367283 : basis_set_a => basis_set_list(ikind)%gto_basis_set
251 1367283 : IF (.NOT. ASSOCIATED(basis_set_a)) CYCLE
252 1367283 : basis_set_b => basis_set_list(jkind)%gto_basis_set
253 1367283 : IF (.NOT. ASSOCIATED(basis_set_b)) CYCLE
254 1367283 : ra(:) = pbc(particle_set(iatom)%r, cell)
255 : ! basis ikind
256 1367283 : la_max => basis_set_a%lmax
257 1367283 : la_min => basis_set_a%lmin
258 1367283 : npgfa => basis_set_a%npgf
259 1367283 : nseta = basis_set_a%nset
260 1367283 : rpgfa => basis_set_a%pgf_radius
261 1367283 : set_radius_a => basis_set_a%set_radius
262 1367283 : kind_radius_a = basis_set_a%kind_radius
263 1367283 : zeta => basis_set_a%zet
264 : ! basis jkind
265 1367283 : lb_max => basis_set_b%lmax
266 1367283 : lb_min => basis_set_b%lmin
267 1367283 : npgfb => basis_set_b%npgf
268 1367283 : nsetb = basis_set_b%nset
269 1367283 : rpgfb => basis_set_b%pgf_radius
270 1367283 : set_radius_b => basis_set_b%set_radius
271 1367283 : kind_radius_b = basis_set_b%kind_radius
272 1367283 : zetb => basis_set_b%zet
273 :
274 1367283 : IF (dokp) THEN
275 154885 : cindex = cell_to_index(cellind(1), cellind(2), cellind(3))
276 : ELSE
277 1212398 : cindex = 1
278 : END IF
279 :
280 : CALL task_list_inner_loop(task_list%tasks, task_list%ntasks, curr_tasks, &
281 : rs_descs, dft_control, cube_info, gridlevel_info, cindex, &
282 : iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, &
283 : set_radius_a, set_radius_b, ra, rab, &
284 1381393 : la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
285 :
286 : END DO
287 : !!$OMP END PARALLEL
288 :
289 : ! redistribute the task list so that all tasks map onto the local rs grids
290 : CALL distribute_tasks( &
291 : rs_descs=rs_descs, ntasks=task_list%ntasks, natoms=natoms, &
292 : tasks=task_list%tasks, atom_pair_send=task_list%atom_pair_send, &
293 : atom_pair_recv=task_list%atom_pair_recv, symmetric=.TRUE., &
294 : reorder_rs_grid_ranks=reorder_rs_grid_ranks, &
295 14110 : skip_load_balance_distributed=skip_load_balance_distributed)
296 :
297 : ! compute offsets for rs_scatter_matrices / rs_copy_to_matrices
298 42330 : ALLOCATE (nsgf(natoms))
299 14110 : CALL get_particle_set(particle_set, qs_kind_set, basis=basis_set_list, nsgf=nsgf)
300 14110 : IF (ASSOCIATED(task_list%atom_pair_send)) THEN
301 : ! only needed when there is a distributed grid
302 : CALL rs_calc_offsets(pairs=task_list%atom_pair_send, &
303 : nsgf=nsgf, &
304 : group_size=rs_descs(1)%rs_desc%group_size, &
305 : pair_offsets=task_list%pair_offsets_send, &
306 : rank_offsets=task_list%rank_offsets_send, &
307 : rank_sizes=task_list%rank_sizes_send, &
308 48 : buffer_size=task_list%buffer_size_send)
309 : END IF
310 : CALL rs_calc_offsets(pairs=task_list%atom_pair_recv, &
311 : nsgf=nsgf, &
312 : group_size=rs_descs(1)%rs_desc%group_size, &
313 : pair_offsets=task_list%pair_offsets_recv, &
314 : rank_offsets=task_list%rank_offsets_recv, &
315 : rank_sizes=task_list%rank_sizes_recv, &
316 14110 : buffer_size=task_list%buffer_size_recv)
317 14110 : DEALLOCATE (basis_set_list, nsgf)
318 :
319 : ! If the rank order has changed, reallocate any of the distributed rs_grids
320 14110 : IF (reorder_rs_grid_ranks) THEN
321 60040 : DO i = 1, gridlevel_info%ngrid_levels
322 60040 : IF (rs_descs(i)%rs_desc%distributed) THEN
323 54 : CALL rs_grid_release(rs_grids(i))
324 54 : CALL rs_grid_create(rs_grids(i), rs_descs(i)%rs_desc)
325 : END IF
326 : END DO
327 : END IF
328 :
329 : CALL create_grid_task_list(task_list=task_list, &
330 : qs_kind_set=qs_kind_set, &
331 : particle_set=particle_set, &
332 : cell=cell, &
333 : basis_type=basis_type, &
334 14110 : rs_grids=rs_grids)
335 :
336 : ! Now that we have the final list of tasks, set up the task_list with the
337 : ! data needed for the loops in integrate_v/calculate_rho
338 :
339 14110 : IF (ASSOCIATED(task_list%taskstart)) THEN
340 5678 : DEALLOCATE (task_list%taskstart)
341 : END IF
342 14110 : IF (ASSOCIATED(task_list%taskstop)) THEN
343 5678 : DEALLOCATE (task_list%taskstop)
344 : END IF
345 14110 : IF (ASSOCIATED(task_list%npairs)) THEN
346 5678 : DEALLOCATE (task_list%npairs)
347 : END IF
348 :
349 : ! First, count the number of unique atom pairs per grid level
350 :
351 42330 : ALLOCATE (task_list%npairs(SIZE(rs_descs)))
352 :
353 14110 : iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
354 14110 : ipair = 0
355 70090 : task_list%npairs = 0
356 :
357 7827117 : DO i = 1, task_list%ntasks
358 7813007 : igrid_level = task_list%tasks(i)%grid_level
359 7813007 : img = task_list%tasks(i)%image
360 7813007 : iatom = task_list%tasks(i)%iatom
361 7813007 : jatom = task_list%tasks(i)%jatom
362 7813007 : iset = task_list%tasks(i)%iset
363 7813007 : jset = task_list%tasks(i)%jset
364 7813007 : ipgf = task_list%tasks(i)%ipgf
365 7813007 : jpgf = task_list%tasks(i)%jpgf
366 7827117 : IF (igrid_level /= igrid_level_old) THEN
367 38350 : IF (igrid_level_old /= -1) THEN
368 24472 : task_list%npairs(igrid_level_old) = ipair
369 : END IF
370 : ipair = 1
371 : igrid_level_old = igrid_level
372 : iatom_old = iatom
373 : jatom_old = jatom
374 : img_old = img
375 7774657 : ELSE IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
376 513449 : ipair = ipair + 1
377 513449 : iatom_old = iatom
378 513449 : jatom_old = jatom
379 513449 : img_old = img
380 : END IF
381 : END DO
382 : ! Take care of the last iteration
383 14110 : IF (task_list%ntasks /= 0) THEN
384 13878 : task_list%npairs(igrid_level) = ipair
385 : END IF
386 :
387 : ! Second, for each atom pair, find the indices in the task list
388 : ! of the first and last task
389 :
390 : ! Array sized for worst case
391 112188 : ALLOCATE (task_list%taskstart(MAXVAL(task_list%npairs), SIZE(rs_descs)))
392 112188 : ALLOCATE (task_list%taskstop(MAXVAL(task_list%npairs), SIZE(rs_descs)))
393 :
394 14110 : iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
395 14110 : ipair = 0
396 1160804 : task_list%taskstart = 0
397 1160804 : task_list%taskstop = 0
398 :
399 7827117 : DO i = 1, task_list%ntasks
400 7813007 : igrid_level = task_list%tasks(i)%grid_level
401 7813007 : img = task_list%tasks(i)%image
402 7813007 : iatom = task_list%tasks(i)%iatom
403 7813007 : jatom = task_list%tasks(i)%jatom
404 7813007 : iset = task_list%tasks(i)%iset
405 7813007 : jset = task_list%tasks(i)%jset
406 7813007 : ipgf = task_list%tasks(i)%ipgf
407 7813007 : jpgf = task_list%tasks(i)%jpgf
408 7827117 : IF (igrid_level /= igrid_level_old) THEN
409 38350 : IF (igrid_level_old /= -1) THEN
410 24472 : task_list%taskstop(ipair, igrid_level_old) = i - 1
411 : END IF
412 38350 : ipair = 1
413 38350 : task_list%taskstart(ipair, igrid_level) = i
414 38350 : igrid_level_old = igrid_level
415 38350 : iatom_old = iatom
416 38350 : jatom_old = jatom
417 38350 : img_old = img
418 7774657 : ELSE IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
419 513449 : ipair = ipair + 1
420 513449 : task_list%taskstart(ipair, igrid_level) = i
421 513449 : task_list%taskstop(ipair - 1, igrid_level) = i - 1
422 513449 : iatom_old = iatom
423 513449 : jatom_old = jatom
424 513449 : img_old = img
425 : END IF
426 : END DO
427 : ! Take care of the last iteration
428 14110 : IF (task_list%ntasks /= 0) THEN
429 13878 : task_list%taskstop(ipair, igrid_level) = task_list%ntasks
430 : END IF
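           :
           : ! Illustrative sketch: how a consumer such as integrate_v / calculate_rho
           : ! might traverse the pair bookkeeping built above (process_task is a
           : ! hypothetical placeholder, not an actual routine of this module):
           : ! DO igrid_level = 1, SIZE(task_list%npairs)
           : !    DO ipair = 1, task_list%npairs(igrid_level)
           : !       DO itask = task_list%taskstart(ipair, igrid_level), task_list%taskstop(ipair, igrid_level)
           : !          CALL process_task(task_list%tasks(itask))
           : !       END DO
           : !    END DO
           : ! END DO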
431 :
432 : ! Debug task distribution
433 : IF (debug_this_module) THEN
434 : tasks => task_list%tasks
435 : WRITE (6, *)
436 : WRITE (6, *) "Total number of tasks ", task_list%ntasks
437 : DO igrid_level = 1, gridlevel_info%ngrid_levels
438 : WRITE (6, *) "Total number of pairs(grid_level) ", &
439 : igrid_level, task_list%npairs(igrid_level)
440 : END DO
441 : WRITE (6, *)
442 :
443 : DO igrid_level = 1, gridlevel_info%ngrid_levels
444 :
445 : ALLOCATE (blocks(natoms, natoms, nimages))
446 : blocks = -1
447 : DO ipair = 1, task_list%npairs(igrid_level)
448 : itask = task_list%taskstart(ipair, igrid_level)
449 : ilevel = task_list%tasks(itask)%grid_level
450 : img = task_list%tasks(itask)%image
451 : iatom = task_list%tasks(itask)%iatom
452 : jatom = task_list%tasks(itask)%jatom
453 : iset = task_list%tasks(itask)%iset
454 : jset = task_list%tasks(itask)%jset
455 : ipgf = task_list%tasks(itask)%ipgf
456 : jpgf = task_list%tasks(itask)%jpgf
457 : IF (blocks(iatom, jatom, img) == -1 .AND. blocks(jatom, iatom, img) == -1) THEN
458 : blocks(iatom, jatom, img) = 1
459 : blocks(jatom, iatom, img) = 1
460 : ELSE
461 : WRITE (6, *) "TASK LIST CONFLICT IN PAIR ", ipair
462 : WRITE (6, *) "Reuse of iatom, jatom, image ", iatom, jatom, img
463 : END IF
464 :
465 : iatom_old = iatom
466 : jatom_old = jatom
467 : img_old = img
468 : DO itask = task_list%taskstart(ipair, igrid_level), task_list%taskstop(ipair, igrid_level)
469 : ilevel = task_list%tasks(itask)%grid_level
470 : img = task_list%tasks(itask)%image
471 : iatom = task_list%tasks(itask)%iatom
472 : jatom = task_list%tasks(itask)%jatom
473 : iset = task_list%tasks(itask)%iset
474 : jset = task_list%tasks(itask)%jset
475 : ipgf = task_list%tasks(itask)%ipgf
476 : jpgf = task_list%tasks(itask)%jpgf
477 : IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
478 : WRITE (6, *) "TASK LIST CONFLICT IN TASK ", itask
479 : WRITE (6, *) "Inconsistent iatom, jatom, image ", iatom, jatom, img
480 : WRITE (6, *) "Should be iatom, jatom, image ", iatom_old, jatom_old, img_old
481 : END IF
482 :
483 : END DO
484 : END DO
485 : DEALLOCATE (blocks)
486 :
487 : END DO
488 :
489 : END IF
490 :
491 14110 : CALL timestop(handle)
492 :
493 14110 : END SUBROUTINE generate_qs_task_list
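           :
           : ! Illustrative sketch of a typical call of the routine above, assuming ks_env
           : ! and task_list are already set up ("ORB" is an assumed basis_type value):
           : ! CALL generate_qs_task_list(ks_env, task_list, basis_type="ORB", &
           : !                            reorder_rs_grid_ranks=.FALSE., &
           : !                            skip_load_balance_distributed=.FALSE.)
           : ! With reorder_rs_grid_ranks = .FALSE. the current rank order of the
           : ! distributed rs grids is kept, as described in the note above.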
494 :
495 : ! **************************************************************************************************
496 : !> \brief Sends the task list data to the grid API.
497 : !> \author Ole Schuett
498 : ! **************************************************************************************************
499 14110 : SUBROUTINE create_grid_task_list(task_list, qs_kind_set, particle_set, cell, basis_type, rs_grids)
500 : TYPE(task_list_type), POINTER :: task_list
501 : TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
502 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
503 : TYPE(cell_type), POINTER :: cell
504 : CHARACTER(LEN=*) :: basis_type
505 : TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
506 :
507 : TYPE(gto_basis_set_type), POINTER :: orb_basis_set
508 : INTEGER :: nset, natoms, nkinds, ntasks, &
509 : ikind, iatom, itask, nsgf
510 14110 : INTEGER, DIMENSION(:), ALLOCATABLE :: atom_kinds, level_list, iatom_list, jatom_list, &
511 14110 : iset_list, jset_list, ipgf_list, jpgf_list, &
512 14110 : border_mask_list, block_num_list
513 14110 : REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: radius_list
514 14110 : REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE :: rab_list, atom_positions
515 14110 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
516 14110 : INTEGER, DIMENSION(:, :), POINTER :: first_sgf
517 14110 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: sphi, zet
518 14110 : INTEGER, DIMENSION(:), POINTER :: lmax, lmin, npgf, nsgf_set
519 :
520 14110 : nkinds = SIZE(qs_kind_set)
521 14110 : natoms = SIZE(particle_set)
522 14110 : ntasks = task_list%ntasks
523 14110 : tasks => task_list%tasks
524 :
525 14110 : IF (.NOT. ASSOCIATED(task_list%grid_basis_sets)) THEN
526 : ! Basis sets do not change during simulation - only need to create them once.
527 40273 : ALLOCATE (task_list%grid_basis_sets(nkinds))
528 23409 : DO ikind = 1, nkinds
529 14977 : CALL get_qs_kind(qs_kind_set(ikind), basis_type=basis_type, basis_set=orb_basis_set)
530 : CALL get_gto_basis_set(gto_basis_set=orb_basis_set, &
531 : nset=nset, &
532 : nsgf=nsgf, &
533 : nsgf_set=nsgf_set, &
534 : npgf=npgf, &
535 : first_sgf=first_sgf, &
536 : lmax=lmax, &
537 : lmin=lmin, &
538 : sphi=sphi, &
539 14977 : zet=zet)
540 : CALL grid_create_basis_set(nset=nset, &
541 : nsgf=nsgf, &
542 : maxco=SIZE(sphi, 1), &
543 : maxpgf=SIZE(zet, 1), &
544 : lmin=lmin, &
545 : lmax=lmax, &
546 : npgf=npgf, &
547 : nsgf_set=nsgf_set, &
548 : first_sgf=first_sgf, &
549 : sphi=sphi, &
550 : zet=zet, &
551 23409 : basis_set=task_list%grid_basis_sets(ikind))
552 : END DO
553 : END IF
554 :
555 : ! Pack task list infos
556 70550 : ALLOCATE (atom_kinds(natoms), atom_positions(3, natoms))
557 65807 : DO iatom = 1, natoms
558 51697 : atom_kinds(iatom) = particle_set(iatom)%atomic_kind%kind_number
559 65807 : atom_positions(:, iatom) = pbc(particle_set(iatom)%r, cell)
560 : END DO
561 :
562 69854 : ALLOCATE (level_list(ntasks), iatom_list(ntasks), jatom_list(ntasks))
563 69622 : ALLOCATE (iset_list(ntasks), jset_list(ntasks), ipgf_list(ntasks), jpgf_list(ntasks))
564 41866 : ALLOCATE (border_mask_list(ntasks), block_num_list(ntasks))
565 70086 : ALLOCATE (radius_list(ntasks), rab_list(3, ntasks))
566 :
567 7827117 : DO itask = 1, ntasks
568 7813007 : level_list(itask) = tasks(itask)%grid_level
569 7813007 : iatom_list(itask) = tasks(itask)%iatom
570 7813007 : jatom_list(itask) = tasks(itask)%jatom
571 7813007 : iset_list(itask) = tasks(itask)%iset
572 7813007 : jset_list(itask) = tasks(itask)%jset
573 7813007 : ipgf_list(itask) = tasks(itask)%ipgf
574 7813007 : jpgf_list(itask) = tasks(itask)%jpgf
575 7813007 : IF (tasks(itask)%dist_type == 2) THEN
576 0 : border_mask_list(itask) = IAND(63, NOT(tasks(itask)%subpatch_pattern)) ! invert last 6 bits
577 : ELSE
578 7813007 : border_mask_list(itask) = 0 ! no masking
579 : END IF
580 7813007 : block_num_list(itask) = tasks(itask)%pair_index ! change of nomenclature pair_index -> block_num
581 7813007 : radius_list(itask) = tasks(itask)%radius
582 31266138 : rab_list(:, itask) = tasks(itask)%rab(:)
583 : END DO
584 :
585 : CALL grid_create_task_list(ntasks=ntasks, &
586 : natoms=natoms, &
587 : nkinds=nkinds, &
588 : nblocks=SIZE(task_list%pair_offsets_recv), &
589 : block_offsets=task_list%pair_offsets_recv, &
590 : atom_positions=atom_positions, &
591 : atom_kinds=atom_kinds, &
592 : basis_sets=task_list%grid_basis_sets, &
593 : level_list=level_list, &
594 : iatom_list=iatom_list, &
595 : jatom_list=jatom_list, &
596 : iset_list=iset_list, &
597 : jset_list=jset_list, &
598 : ipgf_list=ipgf_list, &
599 : jpgf_list=jpgf_list, &
600 : border_mask_list=border_mask_list, &
601 : block_num_list=block_num_list, &
602 : radius_list=radius_list, &
603 : rab_list=rab_list, &
604 : rs_grids=rs_grids, &
605 14110 : task_list=task_list%grid_task_list)
606 :
607 14110 : CALL offload_create_buffer(task_list%buffer_size_recv, task_list%pab_buffer)
608 14110 : CALL offload_create_buffer(task_list%buffer_size_recv, task_list%hab_buffer)
609 :
610 28220 : END SUBROUTINE create_grid_task_list
611 :
612 : ! **************************************************************************************************
613 : !> \brief ...
614 : !> \param tasks ...
615 : !> \param ntasks ...
616 : !> \param curr_tasks ...
617 : !> \param rs_descs ...
618 : !> \param dft_control ...
619 : !> \param cube_info ...
620 : !> \param gridlevel_info ...
621 : !> \param cindex ...
622 : !> \param iatom ...
623 : !> \param jatom ...
624 : !> \param rpgfa ...
625 : !> \param rpgfb ...
626 : !> \param zeta ...
627 : !> \param zetb ...
628 : !> \param kind_radius_b ...
629 : !> \param set_radius_a ...
630 : !> \param set_radius_b ...
631 : !> \param ra ...
632 : !> \param rab ...
633 : !> \param la_max ...
634 : !> \param la_min ...
635 : !> \param lb_max ...
636 : !> \param lb_min ...
637 : !> \param npgfa ...
638 : !> \param npgfb ...
639 : !> \param nseta ...
640 : !> \param nsetb ...
641 : !> \par History
642 : !> Joost VandeVondele: 10.2008 refactored
643 : ! **************************************************************************************************
644 1378740 : SUBROUTINE task_list_inner_loop(tasks, ntasks, curr_tasks, rs_descs, dft_control, &
645 : cube_info, gridlevel_info, cindex, &
646 : iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, &
647 : la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
648 :
649 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
650 : INTEGER :: ntasks, curr_tasks
651 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
652 : POINTER :: rs_descs
653 : TYPE(dft_control_type), POINTER :: dft_control
654 : TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info
655 : TYPE(gridlevel_info_type), POINTER :: gridlevel_info
656 : INTEGER :: cindex, iatom, jatom
657 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: rpgfa, rpgfb, zeta, zetb
658 : REAL(KIND=dp) :: kind_radius_b
659 : REAL(KIND=dp), DIMENSION(:), POINTER :: set_radius_a, set_radius_b
660 : REAL(KIND=dp), DIMENSION(3) :: ra, rab
661 : INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, npgfa, &
662 : npgfb
663 : INTEGER :: nseta, nsetb
664 :
665 : INTEGER :: cube_center(3), igrid_level, ipgf, iset, &
666 : jpgf, jset, lb_cube(3), ub_cube(3)
667 : REAL(KIND=dp) :: dab, rab2, radius, zetp
668 :
669 1378740 : rab2 = rab(1)*rab(1) + rab(2)*rab(2) + rab(3)*rab(3)
670 1378740 : dab = SQRT(rab2)
671 :
672 4428179 : loop_iset: DO iset = 1, nseta
673 :
674 3049439 : IF (set_radius_a(iset) + kind_radius_b < dab) CYCLE loop_iset
675 :
676 9192002 : loop_jset: DO jset = 1, nsetb
677 :
678 5698482 : IF (set_radius_a(iset) + set_radius_b(jset) < dab) CYCLE loop_jset
679 :
680 14876692 : loop_ipgf: DO ipgf = 1, npgfa(iset)
681 :
682 8330579 : IF (rpgfa(ipgf, iset) + set_radius_b(jset) < dab) CYCLE loop_ipgf
683 :
684 25157284 : loop_jpgf: DO jpgf = 1, npgfb(jset)
685 :
686 14320508 : IF (rpgfa(ipgf, iset) + rpgfb(jpgf, jset) < dab) CYCLE loop_jpgf
687 :
688 7976897 : zetp = zeta(ipgf, iset) + zetb(jpgf, jset)
689 7976897 : igrid_level = gaussian_gridlevel(gridlevel_info, zetp)
690 :
691 : CALL compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
692 : rs_descs(igrid_level)%rs_desc, cube_info(igrid_level), &
693 : la_max(iset), zeta(ipgf, iset), la_min(iset), &
694 : lb_max(jset), zetb(jpgf, jset), lb_min(jset), &
695 7976897 : ra, rab, rab2, dft_control%qs_control%eps_rho_rspace)
696 :
697 : CALL pgf_to_tasks(tasks, ntasks, curr_tasks, &
698 : rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
699 : la_max(iset), lb_max(jset), rs_descs(igrid_level)%rs_desc, &
700 : igrid_level, gridlevel_info%ngrid_levels, cube_center, &
701 22651087 : lb_cube, ub_cube, radius)
702 :
703 : END DO loop_jpgf
704 :
705 : END DO loop_ipgf
706 :
707 : END DO loop_jset
708 :
709 : END DO loop_iset
710 :
711 1378740 : END SUBROUTINE task_list_inner_loop
712 :
713 : ! **************************************************************************************************
714 : !> \brief combines the calculation of several basic properties of a given pgf:
715 : !> its center, the bounding cube, the radius, and the cost; it thereby
716 : !> tries to predict the time needed for processing this task, so that
717 : !> an improved load balance might be obtained
718 : !> \param cube_center ...
719 : !> \param lb_cube ...
720 : !> \param ub_cube ...
721 : !> \param radius ...
722 : !> \param rs_desc ...
723 : !> \param cube_info ...
724 : !> \param la_max ...
725 : !> \param zeta ...
726 : !> \param la_min ...
727 : !> \param lb_max ...
728 : !> \param zetb ...
729 : !> \param lb_min ...
730 : !> \param ra ...
731 : !> \param rab ...
732 : !> \param rab2 ...
733 : !> \param eps ...
734 : !> \par History
735 : !> 10.2008 refactored [Joost VandeVondele]
736 : !> \note
737 : !> -) this requires the radius to be computed in the same way as
738 : !> collocate_pgf_product, we should factor that part into a subroutine
739 : !> -) we're assuming that integrate_pgf and collocate_pgf have the same cost for load balancing;
740 : !> this is more or less true for map_consistent
741 : !> -) in principle, the computed radius could be recycled in integrate_pgf/collocate_pgf if it is certainly
742 : !> the same; this could lead to a small speedup
743 : !> -) the cost function is a fit through the median cost of mapping a pgf with a given l and a given radius (in grid points),
744 : !> obtained by fitting data measured on an opteron/g95 to the expression
745 : !> a*(l+b)*(r+c)**3+d, which is based on the inner loop of the collocating routines
746 : ! **************************************************************************************************
747 7976897 : SUBROUTINE compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
748 : rs_desc, cube_info, la_max, zeta, la_min, lb_max, zetb, lb_min, ra, rab, rab2, eps)
749 :
750 : INTEGER, DIMENSION(3), INTENT(OUT) :: cube_center, lb_cube, ub_cube
751 : REAL(KIND=dp), INTENT(OUT) :: radius
752 : TYPE(realspace_grid_desc_type), POINTER :: rs_desc
753 : TYPE(cube_info_type), INTENT(IN) :: cube_info
754 : INTEGER, INTENT(IN) :: la_max
755 : REAL(KIND=dp), INTENT(IN) :: zeta
756 : INTEGER, INTENT(IN) :: la_min, lb_max
757 : REAL(KIND=dp), INTENT(IN) :: zetb
758 : INTEGER, INTENT(IN) :: lb_min
759 : REAL(KIND=dp), INTENT(IN) :: ra(3), rab(3), rab2, eps
760 :
761 : INTEGER :: extent(3)
762 7976897 : INTEGER, DIMENSION(:), POINTER :: sphere_bounds
763 : REAL(KIND=dp) :: cutoff, f, prefactor, rb(3), zetp
764 : REAL(KIND=dp), DIMENSION(3) :: rp
765 :
766 : ! the radius for this task
767 :
768 7976897 : zetp = zeta + zetb
769 31907588 : rp(:) = ra(:) + zetb/zetp*rab(:)
770 31907588 : rb(:) = ra(:) + rab(:)
771 7976897 : cutoff = 1.0_dp
772 7976897 : f = zetb/zetp
773 7976897 : prefactor = EXP(-zeta*f*rab2)
774 : radius = exp_radius_very_extended(la_min, la_max, lb_min, lb_max, ra=ra, rb=rb, rp=rp, &
775 7976897 : zetp=zetp, eps=eps, prefactor=prefactor, cutoff=cutoff)
776 :
777 7976897 : CALL compute_cube_center(cube_center, rs_desc, zeta, zetb, ra, rab)
778 : ! compute cube_center, the center of the gaussian product to map (folded to within the unit cell)
779 31907588 : cube_center(:) = MODULO(cube_center(:), rs_desc%npts(:))
780 31907588 : cube_center(:) = cube_center(:) + rs_desc%lb(:)
781 :
782 7976897 : IF (rs_desc%orthorhombic) THEN
783 7541105 : CALL return_cube(cube_info, radius, lb_cube, ub_cube, sphere_bounds)
784 : ELSE
785 435792 : CALL return_cube_nonortho(cube_info, radius, lb_cube, ub_cube, rp)
786 : !? unclear if extent is computed correctly.
787 1743168 : extent(:) = ub_cube(:) - lb_cube(:)
788 1743168 : lb_cube(:) = -extent(:)/2 - 1
789 1743168 : ub_cube(:) = extent(:)/2
790 : END IF
791 :
792 7976897 : END SUBROUTINE compute_pgf_properties
793 : ! **************************************************************************************************
794 : !> \brief predicts the cost in kcycles of a given task
795 : !> the model is based on a fit of actual data, and might need updating
796 : !> as collocate_pgf_product changes (or CPUs/compilers change)
797 : !> maybe some dynamic approach, improving the cost model on the fly could
798 : !> work as well
799 : !> the cost model does not yet take into account the fraction of space
800 : !> that is mapped locally for a given cube and rs_grid (generalised tasks)
801 : !> \param lb_cube ...
802 : !> \param ub_cube ...
803 : !> \param fraction ...
804 : !> \param lmax ...
805 : !> \param is_ortho ...
806 : !> \return ...
807 : ! **************************************************************************************************
808 7976897 : INTEGER FUNCTION cost_model(lb_cube, ub_cube, fraction, lmax, is_ortho)
809 : INTEGER, DIMENSION(3), INTENT(IN) :: lb_cube, ub_cube
810 : REAL(KIND=dp), INTENT(IN) :: fraction
811 : INTEGER :: lmax
812 : LOGICAL :: is_ortho
813 :
814 : INTEGER :: cmax
815 : REAL(KIND=dp) :: v1, v2, v3, v4, v5
816 :
817 31907588 : cmax = MAXVAL(((ub_cube - lb_cube) + 1)/2)
818 :
819 7976897 : IF (is_ortho) THEN
820 : v1 = 1.504760E+00_dp
821 : v2 = 3.126770E+00_dp
822 : v3 = 5.074106E+00_dp
823 : v4 = 1.091568E+00_dp
824 : v5 = 1.070187E+00_dp
825 : ELSE
826 435792 : v1 = 7.831105E+00_dp
827 435792 : v2 = 2.675174E+00_dp
828 435792 : v3 = 7.546553E+00_dp
829 435792 : v4 = 6.122446E-01_dp
830 435792 : v5 = 3.886382E+00_dp
831 : END IF
832 7976897 : cost_model = CEILING(((lmax + v1)*(cmax + v2)**3*v3*fraction + v4 + v5*lmax**7)/1000.0_dp)
833 :
834 7976897 : END FUNCTION cost_model
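           :
           : ! Illustrative example (constants rounded): for an orthorhombic task with
           : ! lmax = 2, cmax = 10 and fraction = 1.0 the model above gives roughly
           : ! CEILING(((2 + 1.50)*(10 + 3.13)**3*5.07 + 1.09 + 1.07*2**7)/1000) = 41 kcycles.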
835 : ! **************************************************************************************************
836 : !> \brief pgf_to_tasks converts a given pgf to one or more tasks, in particular
837 : !> this determines by which CPUs a given pgf gets collocated.
838 : !> For each generated task the relevant fields of task_type are filled in:
839 : !> %destination := destination rank, encoded together with the grid level
840 : !> %source := source rank, encoded together with the grid level
841 : !> %iatom, %jatom, %iset, %jset, %ipgf, %jpgf, %image, %rab, %radius := pgf pair data
842 : !> %dist_type := type (0: replicated, 1: distributed local, 2: distributed generalised)
843 : !> %cost := estimated cost of the task
844 : !> %subpatch_pattern := bit pattern used to derive border masks and alternative destinations
845 : !>
846 : !> \param tasks ...
847 : !> \param ntasks ...
848 : !> \param curr_tasks ...
849 : !> \param rab ...
850 : !> \param cindex ...
851 : !> \param iatom ...
852 : !> \param jatom ...
853 : !> \param iset ...
854 : !> \param jset ...
855 : !> \param ipgf ...
856 : !> \param jpgf ...
857 : !> \param la_max ...
858 : !> \param lb_max ...
859 : !> \param rs_desc ...
860 : !> \param igrid_level ...
861 : !> \param n_levels ...
862 : !> \param cube_center ...
863 : !> \param lb_cube ...
864 : !> \param ub_cube ...
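           : !> \param radius ...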
865 : !> \par History
866 : !> 10.2008 Refactored based on earlier routines by MattW [Joost VandeVondele]
867 : ! **************************************************************************************************
868 7976897 : SUBROUTINE pgf_to_tasks(tasks, ntasks, curr_tasks, &
869 : rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
870 : la_max, lb_max, rs_desc, igrid_level, n_levels, &
871 : cube_center, lb_cube, ub_cube, radius)
872 :
873 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
874 : INTEGER, INTENT(INOUT) :: ntasks, curr_tasks
875 : REAL(KIND=dp), DIMENSION(3), INTENT(IN) :: rab
876 : INTEGER, INTENT(IN) :: cindex, iatom, jatom, iset, jset, ipgf, &
877 : jpgf, la_max, lb_max
878 : TYPE(realspace_grid_desc_type), POINTER :: rs_desc
879 : INTEGER, INTENT(IN) :: igrid_level, n_levels
880 : INTEGER, DIMENSION(3), INTENT(IN) :: cube_center, lb_cube, ub_cube
881 : REAL(KIND=dp), INTENT(IN) :: radius
882 :
883 : INTEGER, PARAMETER :: add_tasks = 1000
884 : REAL(kind=dp), PARAMETER :: mult_tasks = 2.0_dp
885 :
886 : INTEGER :: added_tasks, cost, j, lmax
887 : LOGICAL :: is_ortho
888 : REAL(KIND=dp) :: tfraction
889 :
890 : !$OMP SINGLE
891 7976897 : ntasks = ntasks + 1
892 7976897 : IF (ntasks > curr_tasks) THEN
893 990 : curr_tasks = INT((curr_tasks + add_tasks)*mult_tasks)
894 990 : CALL reallocate_tasks(tasks, curr_tasks)
895 : END IF
896 : !$OMP END SINGLE
897 :
898 7976897 : IF (rs_desc%distributed) THEN
899 :
900 : ! finds the node(s) that need to process this task
901 : ! on exit tasks(:)%dist_type is 1 for distributed tasks and 2 for generalised tasks
902 : CALL rs_find_node(rs_desc, igrid_level, n_levels, cube_center, &
903 1380 : ntasks=ntasks, tasks=tasks, lb_cube=lb_cube, ub_cube=ub_cube, added_tasks=added_tasks)
904 :
905 : ELSE
906 7975517 : tasks(ntasks)%destination = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
907 7975517 : tasks(ntasks)%dist_type = 0
908 7975517 : tasks(ntasks)%subpatch_pattern = 0
909 7975517 : added_tasks = 1
910 : END IF
911 :
912 7976897 : lmax = la_max + lb_max
913 7976897 : is_ortho = (tasks(ntasks)%dist_type == 0 .OR. tasks(ntasks)%dist_type == 1) .AND. rs_desc%orthorhombic
914 : ! we assume the load is shared equally between processes dealing with a generalised Gaussian.
915 : ! this could be refined in the future
916 7976897 : tfraction = 1.0_dp/added_tasks
917 :
918 7976897 : cost = cost_model(lb_cube, ub_cube, tfraction, lmax, is_ortho)
919 :
920 15953794 : DO j = 1, added_tasks
921 7976897 : tasks(ntasks - added_tasks + j)%source = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
922 7976897 : tasks(ntasks - added_tasks + j)%cost = cost
923 7976897 : tasks(ntasks - added_tasks + j)%grid_level = igrid_level
924 7976897 : tasks(ntasks - added_tasks + j)%image = cindex
925 7976897 : tasks(ntasks - added_tasks + j)%iatom = iatom
926 7976897 : tasks(ntasks - added_tasks + j)%jatom = jatom
927 7976897 : tasks(ntasks - added_tasks + j)%iset = iset
928 7976897 : tasks(ntasks - added_tasks + j)%jset = jset
929 7976897 : tasks(ntasks - added_tasks + j)%ipgf = ipgf
930 7976897 : tasks(ntasks - added_tasks + j)%jpgf = jpgf
931 31907588 : tasks(ntasks - added_tasks + j)%rab = rab
932 15953794 : tasks(ntasks - added_tasks + j)%radius = radius
933 : END DO
934 :
935 7976897 : END SUBROUTINE pgf_to_tasks
936 :
937 : ! **************************************************************************************************
938 : !> \brief performs load balancing of the tasks on the distributed grids
939 : !> \param tasks ...
940 : !> \param ntasks ...
941 : !> \param rs_descs ...
942 : !> \param grid_level ...
943 : !> \param natoms ...
944 : !> \par History
945 : !> created 2008-10-03 [Joost VandeVondele]
946 : ! **************************************************************************************************
947 54 : SUBROUTINE load_balance_distributed(tasks, ntasks, rs_descs, grid_level, natoms)
948 :
949 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
950 : INTEGER :: ntasks
951 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
952 : POINTER :: rs_descs
953 : INTEGER :: grid_level, natoms
954 :
955 : CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_distributed'
956 :
957 : INTEGER :: handle
958 54 : INTEGER, DIMENSION(:, :, :), POINTER :: list
959 :
960 54 : CALL timeset(routineN, handle)
961 :
962 54 : NULLIFY (list)
963 : ! here we create for each cpu (0:ncpu-1) a list of possible destinations.
964 : ! if a destination would not be in this list, it is a bug
965 54 : CALL create_destination_list(list, rs_descs, grid_level)
966 :
967 : ! now, walk over the tasks, filling in the loads of each destination
968 54 : CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.TRUE.)
969 :
970 : ! optimize loads & fluxes
971 54 : CALL optimize_load_list(list, rs_descs(1)%rs_desc%group, rs_descs(1)%rs_desc%my_pos)
972 :
973 : ! now, walk over the tasks, using the list to set the destinations
974 54 : CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.FALSE.)
975 :
976 54 : DEALLOCATE (list)
977 :
978 54 : CALL timestop(handle)
979 :
980 54 : END SUBROUTINE load_balance_distributed
981 :
982 : ! **************************************************************************************************
983 : !> \brief this serial routine adjusts the fluxes in the global list
984 : !>
985 : !> \param list_global ...
986 : !> \par History
987 : !> created 2008-10-06 [Joost VandeVondele]
988 : ! **************************************************************************************************
989 27 : SUBROUTINE balance_global_list(list_global)
990 : INTEGER, DIMENSION(:, :, 0:) :: list_global
991 :
992 : CHARACTER(LEN=*), PARAMETER :: routineN = 'balance_global_list'
993 : INTEGER, PARAMETER :: Max_Iter = 100
994 : REAL(KIND=dp), PARAMETER :: Tolerance_factor = 0.005_dp
995 :
996 : INTEGER :: dest, handle, icpu, idest, iflux, &
997 : ilocal, k, maxdest, Ncpu, Nflux
998 27 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: flux_connections
999 : LOGICAL :: solution_optimal
1000 : REAL(KIND=dp) :: average, load_shift, max_load_shift, &
1001 : tolerance
1002 27 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: load, optimized_flux, optimized_load
1003 27 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: flux_limits
1004 :
1005 27 : CALL timeset(routineN, handle)
1006 :
1007 27 : Ncpu = SIZE(list_global, 3)
1008 27 : maxdest = SIZE(list_global, 2)
1009 81 : ALLOCATE (load(0:Ncpu - 1))
1010 81 : load = 0.0_dp
1011 54 : ALLOCATE (optimized_load(0:Ncpu - 1))
1012 :
1013 : ! figure out the number of fluxes
1014 : ! we assume that the global_list is symmetric
1015 81 : Nflux = 0
1016 81 : DO icpu = 0, ncpu - 1
1017 189 : DO idest = 1, maxdest
1018 108 : dest = list_global(1, idest, icpu)
1019 162 : IF (dest < ncpu .AND. dest > icpu) Nflux = Nflux + 1
1020 : END DO
1021 : END DO
1022 81 : ALLOCATE (optimized_flux(Nflux))
1023 81 : ALLOCATE (flux_limits(2, Nflux))
1024 81 : ALLOCATE (flux_connections(2, Nflux))
1025 :
1026 : ! reorder data
1027 108 : flux_limits = 0
1028 : Nflux = 0
1029 81 : DO icpu = 0, ncpu - 1
1030 162 : load(icpu) = SUM(list_global(2, :, icpu))
1031 189 : DO idest = 1, maxdest
1032 108 : dest = list_global(1, idest, icpu)
1033 162 : IF (dest < ncpu) THEN
1034 108 : IF (dest /= icpu) THEN
1035 54 : IF (dest > icpu) THEN
1036 27 : Nflux = Nflux + 1
1037 27 : flux_limits(2, Nflux) = list_global(2, idest, icpu)
1038 27 : flux_connections(1, Nflux) = icpu
1039 27 : flux_connections(2, Nflux) = dest
1040 : ELSE
1041 27 : DO iflux = 1, Nflux
1042 27 : IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
1043 27 : flux_limits(1, iflux) = -list_global(2, idest, icpu)
1044 27 : EXIT
1045 : END IF
1046 : END DO
1047 : END IF
1048 : END IF
1049 : END IF
1050 : END DO
1051 : END DO
1052 :
1053 : solution_optimal = .FALSE.
1054 54 : optimized_flux = 0.0_dp
1055 :
1056 : ! an iterative solver, if iterated till convergence the maximum load is minimal
1057 : ! we terminate before things are fully converged, since this does show up in the timings
1058 : ! once the largest shift becomes less than a small fraction of the average load, we're done
1059 : ! we're perfectly happy if the load balance is within 1 percent or so
1060 : ! the maximum load normally converges even faster
1061 81 : average = SUM(load)/SIZE(load)
1062 27 : tolerance = Tolerance_factor*average
1063 :
1064 81 : optimized_load(:) = load
1065 332 : DO k = 1, Max_iter
1066 : max_load_shift = 0.0_dp
1067 658 : DO iflux = 1, Nflux
1068 329 : load_shift = (optimized_load(flux_connections(1, iflux)) - optimized_load(flux_connections(2, iflux)))/2
1069 329 : load_shift = MAX(flux_limits(1, iflux) - optimized_flux(iflux), load_shift)
1070 329 : load_shift = MIN(flux_limits(2, iflux) - optimized_flux(iflux), load_shift)
1071 329 : max_load_shift = MAX(ABS(load_shift), max_load_shift)
1072 329 : optimized_load(flux_connections(1, iflux)) = optimized_load(flux_connections(1, iflux)) - load_shift
1073 329 : optimized_load(flux_connections(2, iflux)) = optimized_load(flux_connections(2, iflux)) + load_shift
1074 658 : optimized_flux(iflux) = optimized_flux(iflux) + load_shift
1075 : END DO
1076 332 : IF (max_load_shift < tolerance) THEN
1077 : solution_optimal = .TRUE.
1078 : EXIT
1079 : END IF
1080 : END DO
1081 :
1082 : ! now adjust the load list to reflect the optimized fluxes
1083 : ! reorder data
1084 : Nflux = 0
1085 81 : DO icpu = 0, ncpu - 1
1086 162 : DO idest = 1, maxdest
1087 162 : IF (list_global(1, idest, icpu) == icpu) ilocal = idest
1088 : END DO
1089 189 : DO idest = 1, maxdest
1090 108 : dest = list_global(1, idest, icpu)
1091 162 : IF (dest < ncpu) THEN
1092 108 : IF (dest /= icpu) THEN
1093 54 : IF (dest > icpu) THEN
1094 27 : Nflux = Nflux + 1
1095 27 : IF (optimized_flux(Nflux) > 0) THEN
1096 : list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1097 0 : list_global(2, idest, icpu) - NINT(optimized_flux(Nflux))
1098 0 : list_global(2, idest, icpu) = NINT(optimized_flux(Nflux))
1099 : ELSE
1100 : list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1101 27 : list_global(2, idest, icpu)
1102 27 : list_global(2, idest, icpu) = 0
1103 : END IF
1104 : ELSE
1105 27 : DO iflux = 1, Nflux
1106 27 : IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
1107 27 : IF (optimized_flux(iflux) > 0) THEN
1108 : list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1109 0 : list_global(2, idest, icpu)
1110 0 : list_global(2, idest, icpu) = 0
1111 : ELSE
1112 : list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1113 27 : list_global(2, idest, icpu) + NINT(optimized_flux(iflux))
1114 27 : list_global(2, idest, icpu) = -NINT(optimized_flux(iflux))
1115 : END IF
1116 : EXIT
1117 : END IF
1118 : END DO
1119 : END IF
1120 : END IF
1121 : END IF
1122 : END DO
1123 : END DO
1124 :
1125 27 : CALL timestop(handle)
1126 :
1127 54 : END SUBROUTINE balance_global_list
1128 :
1129 : ! **************************************************************************************************
1130 : !> \brief this routine gets back optimized loads for all destinations
1131 : !>
1132 : !> \param list ...
1133 : !> \param group ...
1134 : !> \param my_pos ...
1135 : !> \par History
1136 : !> created 2008-10-06 [Joost VandeVondele]
1137 : !> Modified 2016-01 [EPCC] Reduce memory requirements on P processes
1138 : !> from O(P^2) to O(P)
1139 : ! **************************************************************************************************
1140 54 : SUBROUTINE optimize_load_list(list, group, my_pos)
1141 : INTEGER, DIMENSION(:, :, 0:) :: list
1142 : TYPE(mp_comm_type), INTENT(IN) :: group
1143 : INTEGER, INTENT(IN) :: my_pos
1144 :
1145 : CHARACTER(LEN=*), PARAMETER :: routineN = 'optimize_load_list'
1146 : INTEGER, PARAMETER :: rank_of_root = 0
1147 :
1148 : INTEGER :: handle, icpu, idest, maxdest, ncpu
1149 : INTEGER, ALLOCATABLE, DIMENSION(:) :: load_all
1150 54 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: load_partial
1151 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: list_global
1152 :
1153 54 : CALL timeset(routineN, handle)
1154 :
1155 54 : ncpu = SIZE(list, 3)
1156 54 : maxdest = SIZE(list, 2)
1157 :
1158 : !find total workload ...
1159 162 : ALLOCATE (load_all(maxdest*ncpu))
1160 108 : load_all(:) = RESHAPE(list(2, :, :), [maxdest*ncpu])
1161 54 : CALL group%sum(load_all(:), rank_of_root)
1162 :
1163 : ! ... and optimise the work per process
1164 216 : ALLOCATE (list_global(2, maxdest, ncpu))
1165 54 : IF (rank_of_root == my_pos) THEN
1166 189 : list_global(1, :, :) = list(1, :, :)
1167 243 : list_global(2, :, :) = RESHAPE(load_all, [maxdest, ncpu])
1168 27 : CALL balance_global_list(list_global)
1169 : END IF
1170 54 : CALL group%bcast(list_global, rank_of_root)
1171 :
1172 : !figure out how much can be sent to other processes
1173 216 : ALLOCATE (load_partial(maxdest, ncpu))
1174 : ! send 'load_all', which is a copy of 'list' (but without leading dimension/stride)
1175 162 : CALL group%sum_partial(RESHAPE(load_all, [maxdest, ncpu]), load_partial(:, :))
1176 :
1177 162 : DO icpu = 1, ncpu
1178 378 : DO idest = 1, maxdest
1179 :
1180 : !need to deduct 1 because `list' was passed in to this routine as being indexed from zero
1181 324 : IF (load_partial(idest, icpu) > list_global(2, idest, icpu)) THEN
1182 37 : IF (load_partial(idest, icpu) - list(2, idest, icpu - 1) < list_global(2, idest, icpu)) THEN
1183 : list(2, idest, icpu - 1) = list_global(2, idest, icpu) &
1184 10 : - (load_partial(idest, icpu) - list(2, idest, icpu - 1))
1185 : ELSE
1186 27 : list(2, idest, icpu - 1) = 0
1187 : END IF
1188 : END IF
1189 :
1190 : END DO
1191 : END DO
1192 :
1193 : !clean up before leaving
1194 54 : DEALLOCATE (load_all)
1195 54 : DEALLOCATE (list_global)
1196 54 : DEALLOCATE (load_partial)
1197 :
1198 54 : CALL timestop(handle)
1199 54 : END SUBROUTINE optimize_load_list
1200 :
1201 : ! **************************************************************************************************
1202 : !> \brief fill the load list with values derived from the tasks array.
1203 : !> From the alternate locations, we select the alternate location that
1204 : !> can be used without increasing the number of matrix blocks needed to
1205 : !> distribute.
1206 : !> Replicated tasks are not yet considered
1207 : !>
1208 : !> \param list ...
1209 : !> \param rs_descs ...
1210 : !> \param grid_level ...
1211 : !> \param tasks ...
1212 : !> \param ntasks ...
1213 : !> \param natoms ...
1214 : !> \param create_list ...
1215 : !> \par History
1216 : !> created 2008-10-06 [Joost VandeVondele]
1217 : ! **************************************************************************************************
1218 108 : SUBROUTINE compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list)
1219 : INTEGER, DIMENSION(:, :, 0:) :: list
1220 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1221 : POINTER :: rs_descs
1222 : INTEGER :: grid_level
1223 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1224 : INTEGER :: ntasks, natoms
1225 : LOGICAL :: create_list
1226 :
1227 : CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_load_list'
1228 :
1229 : INTEGER :: cost, dest, handle, i, iatom, ilevel, img, img_old, iopt, ipgf, iset, itask, &
1230 : itask_start, itask_stop, jatom, jpgf, jset, li, maxdest, ncpu, ndest_pair, nopt, nshort, &
1231 : rank
1232 : INTEGER(KIND=int_8) :: bit_pattern, ipair, ipair_old, natom8
1233 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: loads
1234 108 : INTEGER, ALLOCATABLE, DIMENSION(:) :: all_dests, index
1235 : INTEGER, DIMENSION(6) :: options
1236 :
1237 108 : CALL timeset(routineN, handle)
1238 :
1239 324 : ALLOCATE (loads(0:rs_descs(grid_level)%rs_desc%group_size - 1))
1240 108 : CALL get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks=.FALSE.)
1241 :
1242 108 : maxdest = SIZE(list, 2)
1243 108 : ncpu = SIZE(list, 3)
1244 108 : natom8 = natoms
1245 :
1246 : ! first find the tasks that deal with the same atom pair
1247 108 : itask_stop = 0
1248 108 : ipair_old = HUGE(ipair_old)
1249 108 : img_old = -1
1250 108 : ALLOCATE (all_dests(0))
1251 108 : ALLOCATE (INDEX(0))
1252 :
1253 : DO
1254 :
1255 : ! first find the range of tasks that deal with the same atom pair
1256 290 : itask_start = itask_stop + 1
1257 290 : itask_stop = itask_start
1258 290 : IF (itask_stop > ntasks) EXIT
1259 182 : ilevel = tasks(itask_stop)%grid_level
1260 182 : img_old = tasks(itask_stop)%image
1261 182 : iatom = tasks(itask_stop)%iatom
1262 182 : jatom = tasks(itask_stop)%jatom
1263 182 : iset = tasks(itask_stop)%iset
1264 182 : jset = tasks(itask_stop)%jset
1265 182 : ipgf = tasks(itask_stop)%ipgf
1266 182 : jpgf = tasks(itask_stop)%jpgf
1267 :
1268 182 : ipair_old = (iatom - 1)*natom8 + (jatom - 1)
1269 : DO
1270 7386 : IF (itask_stop + 1 > ntasks) EXIT
1271 7298 : ilevel = tasks(itask_stop + 1)%grid_level
1272 7298 : img = tasks(itask_stop + 1)%image
1273 7298 : iatom = tasks(itask_stop + 1)%iatom
1274 7298 : jatom = tasks(itask_stop + 1)%jatom
1275 7298 : iset = tasks(itask_stop + 1)%iset
1276 7298 : jset = tasks(itask_stop + 1)%jset
1277 7298 : ipgf = tasks(itask_stop + 1)%ipgf
1278 7298 : jpgf = tasks(itask_stop + 1)%jpgf
1279 :
1280 7298 : ipair = (iatom - 1)*natom8 + (jatom - 1)
1281 7386 : IF (ipair == ipair_old .AND. img == img_old) THEN
1282 : itask_stop = itask_stop + 1
1283 : ELSE
1284 : EXIT
1285 : END IF
1286 : END DO
1287 182 : ipair = ipair_old
1288 182 : nshort = itask_stop - itask_start + 1
1289 :
1290 : ! find the unique list of destinations on this grid level only
1291 182 : DEALLOCATE (all_dests)
1292 546 : ALLOCATE (all_dests(nshort))
1293 182 : DEALLOCATE (index)
1294 364 : ALLOCATE (INDEX(nshort))
1295 7568 : DO i = 1, nshort
1296 7386 : ilevel = tasks(itask_start + i - 1)%grid_level
1297 7386 : img = tasks(itask_start + i - 1)%image
1298 7386 : iatom = tasks(itask_start + i - 1)%iatom
1299 7386 : jatom = tasks(itask_start + i - 1)%jatom
1300 7386 : iset = tasks(itask_start + i - 1)%iset
1301 7386 : jset = tasks(itask_start + i - 1)%jset
1302 7386 : ipgf = tasks(itask_start + i - 1)%ipgf
1303 7386 : jpgf = tasks(itask_start + i - 1)%jpgf
1304 :
1305 7568 : IF (ilevel == grid_level) THEN
1306 2760 : all_dests(i) = decode_rank(tasks(itask_start + i - 1)%destination, SIZE(rs_descs))
1307 : ELSE
1308 4626 : all_dests(i) = HUGE(all_dests(i))
1309 : END IF
1310 : END DO
1311 182 : CALL sort(all_dests, nshort, index)
1312 182 : ndest_pair = 1
1313 7386 : DO i = 2, nshort
1314 7386 : IF ((all_dests(ndest_pair) /= all_dests(i)) .AND. (all_dests(i) /= HUGE(all_dests(i)))) THEN
1315 10 : ndest_pair = ndest_pair + 1
1316 10 : all_dests(ndest_pair) = all_dests(i)
1317 : END IF
1318 : END DO
1319 :
1320 7676 : DO itask = itask_start, itask_stop
1321 :
1322 7386 : dest = decode_rank(tasks(itask)%destination, SIZE(rs_descs)) ! notice that dest can be changed
1323 7386 : ilevel = tasks(itask)%grid_level
1324 7386 : img = tasks(itask)%image
1325 7386 : iatom = tasks(itask)%iatom
1326 7386 : jatom = tasks(itask)%jatom
1327 7386 : iset = tasks(itask)%iset
1328 7386 : jset = tasks(itask)%jset
1329 7386 : ipgf = tasks(itask)%ipgf
1330 7386 : jpgf = tasks(itask)%jpgf
1331 :
1332 : ! Only proceed with tasks which are on this grid level
1333 7386 : IF (ilevel /= grid_level) CYCLE
1334 2760 : ipair = (iatom - 1)*natom8 + (jatom - 1)
1335 2760 : cost = INT(tasks(itask)%cost)
1336 :
1337 182 : SELECT CASE (tasks(itask)%dist_type)
1338 : CASE (1)
1339 2760 : bit_pattern = tasks(itask)%subpatch_pattern
1340 2760 : nopt = 0
1341 2760 : IF (BTEST(bit_pattern, 0)) THEN
1342 0 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [-1, 0, 0])
1343 0 : IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
1344 0 : nopt = nopt + 1
1345 0 : options(nopt) = rank
1346 : END IF
1347 : END IF
1348 2760 : IF (BTEST(bit_pattern, 1)) THEN
1349 0 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [+1, 0, 0])
1350 0 : IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
1351 0 : nopt = nopt + 1
1352 0 : options(nopt) = rank
1353 : END IF
1354 : END IF
1355 2760 : IF (BTEST(bit_pattern, 2)) THEN
1356 48 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, -1, 0])
1357 96 : IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
1358 0 : nopt = nopt + 1
1359 0 : options(nopt) = rank
1360 : END IF
1361 : END IF
1362 2760 : IF (BTEST(bit_pattern, 3)) THEN
1363 0 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, +1, 0])
1364 0 : IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
1365 0 : nopt = nopt + 1
1366 0 : options(nopt) = rank
1367 : END IF
1368 : END IF
1369 2760 : IF (BTEST(bit_pattern, 4)) THEN
1370 1150 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, 0, -1])
1371 2200 : IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
1372 100 : nopt = nopt + 1
1373 100 : options(nopt) = rank
1374 : END IF
1375 : END IF
1376 2760 : IF (BTEST(bit_pattern, 5)) THEN
1377 500 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, 0, +1])
1378 1000 : IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
1379 80 : nopt = nopt + 1
1380 80 : options(nopt) = rank
1381 : END IF
1382 : END IF
1383 2760 : IF (nopt > 0) THEN
1384 : ! set it to the rank with the lowest load
1385 180 : rank = options(1)
1386 180 : DO iopt = 2, nopt
1387 180 : IF (loads(rank) > loads(options(iopt))) rank = options(iopt)
1388 : END DO
1389 : ELSE
1390 2580 : rank = dest
1391 : END IF
1392 2760 : li = list_index(list, rank, dest)
1393 2760 : IF (create_list) THEN
1394 1380 : list(2, li, dest) = list(2, li, dest) + cost
1395 : ELSE
1396 1380 : IF (list(1, li, dest) == dest) THEN
1397 1290 : tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1398 : ELSE
1399 90 : IF (list(2, li, dest) >= cost) THEN
1400 10 : list(2, li, dest) = list(2, li, dest) - cost
1401 10 : tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
1402 : ELSE
1403 80 : tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1404 : END IF
1405 : END IF
1406 : END IF
1407 : CASE (2) ! generalised
1408 0 : li = list_index(list, dest, dest)
1409 0 : IF (create_list) THEN
1410 0 : list(2, li, dest) = list(2, li, dest) + cost
1411 : ELSE
1412 0 : IF (list(1, li, dest) == dest) THEN
1413 0 : tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1414 : ELSE
1415 0 : IF (list(2, li, dest) >= cost) THEN
1416 0 : list(2, li, dest) = list(2, li, dest) - cost
1417 0 : tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
1418 : ELSE
1419 0 : tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1420 : END IF
1421 : END IF
1422 : END IF
1423 : CASE DEFAULT
1424 7386 : CPABORT("")
1425 : END SELECT
1426 :
1427 : END DO
1428 :
1429 : END DO
1430 :
1431 108 : CALL timestop(handle)
1432 :
1433 216 : END SUBROUTINE compute_load_list
1434 : ! **************************************************************************************************
1435 : !> \brief small helper function to return the proper index in the list array
1436 : !>
1437 : !> \param list ...
1438 : !> \param rank ...
1439 : !> \param dest ...
1440 : !> \return ...
1441 : !> \par History
1442 : !> created 2008-10-06 [Joost VandeVondele]
1443 : ! **************************************************************************************************
1444 2760 : INTEGER FUNCTION list_index(list, rank, dest)
1445 : INTEGER, DIMENSION(:, :, 0:), INTENT(IN) :: list
1446 : INTEGER, INTENT(IN) :: rank, dest
1447 :
1448 2760 : list_index = 1
1449 1424 : DO
1450 4184 : IF (list(1, list_index, dest) == rank) EXIT
1451 1424 : list_index = list_index + 1
1452 : END DO
1453 2760 : END FUNCTION list_index
1454 : ! **************************************************************************************************
1455 : !> \brief create a list with possible destinations (i.e. the central cpu and neighbors) for each cpu
1456 : !> note that we allocate it with an additional field to store the load of this destination
1457 : !>
1458 : !> \param list ...
1459 : !> \param rs_descs ...
1460 : !> \param grid_level ...
1461 : !> \par History
1462 : !> created 2008-10-06 [Joost VandeVondele]
1463 : ! **************************************************************************************************
1464 54 : SUBROUTINE create_destination_list(list, rs_descs, grid_level)
1465 : INTEGER, DIMENSION(:, :, :), POINTER :: list
1466 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1467 : POINTER :: rs_descs
1468 : INTEGER, INTENT(IN) :: grid_level
1469 :
1470 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_destination_list'
1471 :
1472 : INTEGER :: handle, i, icpu, j, maxcount, ncpu, &
1473 : ultimate_max
1474 54 : INTEGER, ALLOCATABLE, DIMENSION(:) :: index, sublist
1475 :
1476 54 : CALL timeset(routineN, handle)
1477 :
1478 54 : CPASSERT(.NOT. ASSOCIATED(list))
1479 54 : ncpu = rs_descs(grid_level)%rs_desc%group_size
1480 54 : ultimate_max = 7
1481 :
1482 162 : ALLOCATE (list(2, ultimate_max, 0:ncpu - 1))
1483 :
1484 54 : ALLOCATE (INDEX(ultimate_max))
1485 54 : ALLOCATE (sublist(ultimate_max))
1486 432 : sublist = HUGE(sublist)
1487 :
1488 54 : maxcount = 1
1489 162 : DO icpu = 0, ncpu - 1
1490 108 : sublist(1) = icpu
1491 108 : sublist(2) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [-1, 0, 0])
1492 108 : sublist(3) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [+1, 0, 0])
1493 108 : sublist(4) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, -1, 0])
1494 108 : sublist(5) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, +1, 0])
1495 108 : sublist(6) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, 0, -1])
1496 108 : sublist(7) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, 0, +1])
1497 : ! only retain unique values of the destination
1498 108 : CALL sort(sublist, ultimate_max, index)
1499 108 : j = 1
1500 756 : DO i = 2, 7
1501 756 : IF (sublist(i) /= sublist(j)) THEN
1502 108 : j = j + 1
1503 108 : sublist(j) = sublist(i)
1504 : END IF
1505 : END DO
1506 108 : maxcount = MAX(maxcount, j)
1507 648 : sublist(j + 1:ultimate_max) = HUGE(sublist)
1508 864 : list(1, :, icpu) = sublist
1509 918 : list(2, :, icpu) = 0
1510 : END DO
1511 :
1512 54 : CALL reallocate(list, 1, 2, 1, maxcount, 0, ncpu - 1)
1513 :
1514 54 : CALL timestop(handle)
1515 :
1516 108 : END SUBROUTINE create_destination_list
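To illustrate the sort-and-compact idiom used by create_destination_list above, here is a minimal standalone sketch (not CP2K code; the program and variable names are invented for illustration, and a plain insertion sort stands in for the util module's sort()). It sorts a small array of candidate destination ranks and keeps only the unique entries, padding the tail with HUGE values just as the routine does:

! Standalone sketch: sort candidate destination ranks, keep unique values.
PROGRAM demo_unique_destinations
   IMPLICIT NONE
   INTEGER, PARAMETER :: n = 7
   INTEGER :: sublist(n), i, j, tmp, nunique

   ! hypothetical candidates: the central rank plus its six neighbours,
   ! several of which coincide for small process grids
   sublist = [3, 2, 4, 3, 3, 1, 5]

   ! simple insertion sort (illustrative replacement for the util sort())
   DO i = 2, n
      tmp = sublist(i)
      j = i - 1
      DO WHILE (j >= 1)
         IF (sublist(j) <= tmp) EXIT
         sublist(j + 1) = sublist(j)
         j = j - 1
      END DO
      sublist(j + 1) = tmp
   END DO

   ! compact: retain only unique values, pad the rest with HUGE
   nunique = 1
   DO i = 2, n
      IF (sublist(i) /= sublist(nunique)) THEN
         nunique = nunique + 1
         sublist(nunique) = sublist(i)
      END IF
   END DO
   sublist(nunique + 1:n) = HUGE(sublist(1))

   PRINT '(A,*(I0,1X))', "unique destinations: ", sublist(1:nunique)
END PROGRAM demo_unique_destinations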
1517 :
1518 : ! **************************************************************************************************
1519             : !> \brief given a task list, compute the load of each process everywhere.
1520             : !>        Giving this function the ability to loop over a (sub)set of rs_grids
1521             : !>        and do all the communication in one shot would speed it up.
1522 : !> \param loads ...
1523 : !> \param rs_descs ...
1524 : !> \param grid_level ...
1525 : !> \param ntasks ...
1526 : !> \param tasks ...
1527 : !> \param use_reordered_ranks ...
1528 : !> \par History
1529 : !> none
1530 : !> \author MattW 21/11/2007
1531 : ! **************************************************************************************************
1532 480 : SUBROUTINE get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks)
1533 : INTEGER(KIND=int_8), DIMENSION(:) :: loads
1534 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1535 : POINTER :: rs_descs
1536 : INTEGER :: grid_level, ntasks
1537 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1538 : LOGICAL, INTENT(IN) :: use_reordered_ranks
1539 :
1540 : CHARACTER(LEN=*), PARAMETER :: routineN = 'get_current_loads'
1541 :
1542 : INTEGER :: handle, i, iatom, ilevel, img, ipgf, &
1543 : iset, jatom, jpgf, jset
1544 : INTEGER(KIND=int_8) :: total_cost_local
1545 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: recv_buf_i, send_buf_i
1546 : TYPE(realspace_grid_desc_type), POINTER :: desc
1547 :
1548 480 : CALL timeset(routineN, handle)
1549 :
1550 480 : desc => rs_descs(grid_level)%rs_desc
1551 :
1552 : ! allocate local arrays
1553 1440 : ALLOCATE (send_buf_i(desc%group_size))
1554 960 : ALLOCATE (recv_buf_i(desc%group_size))
1555 :
1556             :       ! communication step 1 : compute the total local cost of the tasks
1557             :       ! each proc needs to know the amount of work it will receive
1558 :
1559 : ! send buffer now contains for each target the cost of the tasks it will receive
1560 1440 : send_buf_i = 0
1561 36792 : DO i = 1, ntasks
1562 36312 : ilevel = tasks(i)%grid_level
1563 36312 : img = tasks(i)%image
1564 36312 : iatom = tasks(i)%iatom
1565 36312 : jatom = tasks(i)%jatom
1566 36312 : iset = tasks(i)%iset
1567 36312 : jset = tasks(i)%jset
1568 36312 : ipgf = tasks(i)%ipgf
1569 36312 : jpgf = tasks(i)%jpgf
1570 36312 : IF (ilevel /= grid_level) CYCLE
1571 10476 : IF (use_reordered_ranks) THEN
1572 : send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) = &
1573 : send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) &
1574 3618 : + tasks(i)%cost
1575 : ELSE
1576 : send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) = &
1577 : send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) &
1578 6378 : + tasks(i)%cost
1579 : END IF
1580 : END DO
1581 480 : CALL desc%group%alltoall(send_buf_i, recv_buf_i, 1)
1582 :
1583 : ! communication step 2 : compute the global cost of the tasks
1584 1440 : total_cost_local = SUM(recv_buf_i)
1585 :
1586             :       ! after this step, loads contains the local cost of each CPU
1587 1440 : CALL desc%group%allgather(total_cost_local, loads)
1588 :
1589 480 : CALL timestop(handle)
1590 :
1591 960 : END SUBROUTINE get_current_loads
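As a conceptual sketch of what get_current_loads computes: if cost(s, d) denotes the cost of the tasks rank s assigns to destination d, then the load of rank d is the column sum over all sources. The routine never builds this matrix; each rank holds one row (send_buf_i), the alltoall effectively transposes it, and the allgather publishes the per-rank totals. A minimal serial illustration with invented numbers (not CP2K code):

! Standalone sketch: per-rank loads as column sums of a source/destination cost matrix.
PROGRAM demo_load_exchange
   IMPLICIT NONE
   INTEGER, PARAMETER :: int_8 = SELECTED_INT_KIND(18)
   INTEGER, PARAMETER :: nranks = 4
   INTEGER(KIND=int_8) :: cost(nranks, nranks), loads(nranks)
   INTEGER :: s, d

   ! hypothetical per-destination task costs held by each source rank
   DO s = 1, nranks
      DO d = 1, nranks
         cost(s, d) = 10*s + d
      END DO
   END DO

   ! load of destination d = sum over all sources s of cost(s, d)
   DO d = 1, nranks
      loads(d) = SUM(cost(:, d))
   END DO

   PRINT '(A,*(I0,1X))', "per-rank loads: ", loads
END PROGRAM demo_load_exchange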
1592 : ! **************************************************************************************************
1593 : !> \brief performs load balancing shifting tasks on the replicated grids
1594 : !> this modifies the destination of some of the tasks on replicated
1595 : !> grids, and in this way balances the load
1596 : !> \param rs_descs ...
1597 : !> \param ntasks ...
1598 : !> \param tasks ...
1599 : !> \par History
1600 : !> none
1601 : !> \author MattW 21/11/2007
1602 : ! **************************************************************************************************
1603 48 : SUBROUTINE load_balance_replicated(rs_descs, ntasks, tasks)
1604 :
1605 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1606 : POINTER :: rs_descs
1607 : INTEGER :: ntasks
1608 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1609 :
1610 : CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_replicated'
1611 :
1612 : INTEGER :: handle, i, iatom, ilevel, img, ipgf, &
1613 : iset, j, jatom, jpgf, jset, &
1614 : no_overloaded, no_underloaded, &
1615 : proc_receiving
1616 : INTEGER(KIND=int_8) :: average_cost, cost_task_rep, count, &
1617 : offset, total_cost_global
1618 48 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: load_imbalance, loads, recv_buf_i
1619 48 : INTEGER, ALLOCATABLE, DIMENSION(:) :: index
1620 : TYPE(realspace_grid_desc_type), POINTER :: desc
1621 :
1622 48 : CALL timeset(routineN, handle)
1623 :
1624 48 : desc => rs_descs(1)%rs_desc
1625 :
1626 : ! allocate local arrays
1627 144 : ALLOCATE (recv_buf_i(desc%group_size))
1628 96 : ALLOCATE (loads(desc%group_size))
1629 :
1630 144 : recv_buf_i = 0
1631 234 : DO i = 1, SIZE(rs_descs)
1632 186 : CALL get_current_loads(loads, rs_descs, i, ntasks, tasks, use_reordered_ranks=.TRUE.)
1633 606 : recv_buf_i(:) = recv_buf_i + loads
1634 : END DO
1635 :
1636 144 : total_cost_global = SUM(recv_buf_i)
1637 48 : average_cost = total_cost_global/desc%group_size
1638 :
1639 : !
1640 : ! compute how to redistribute the replicated tasks so that the average cost is reached
1641 : !
1642 :
1643 : ! load imbalance measures the load of a given CPU relative
1644 : ! to the optimal load distribution (load=average)
1645 144 : ALLOCATE (load_imbalance(desc%group_size))
1646 144 : ALLOCATE (INDEX(desc%group_size))
1647 :
1648 144 : load_imbalance(:) = recv_buf_i - average_cost
1649 48 : no_overloaded = 0
1650 48 : no_underloaded = 0
1651 :
1652 144 : DO i = 1, desc%group_size
1653 96 : IF (load_imbalance(i) > 0) no_overloaded = no_overloaded + 1
1654 144 : IF (load_imbalance(i) < 0) no_underloaded = no_underloaded + 1
1655 : END DO
1656 :
1657             :       ! sort the recv_buffer on load; this gives us index, which provides a
1658             :       ! mapping between processor ranks and how overloaded each processor is
1659 48 : CALL sort(recv_buf_i, SIZE(recv_buf_i), index)
1660 :
1661             :       ! find out the cost of the replicated tasks each proc has,
1662             :       ! counting only those tasks which have not yet been assigned
1663 48 : cost_task_rep = 0
1664 3666 : DO i = 1, ntasks
1665 : IF (tasks(i)%dist_type == 0 &
1666 3666 : .AND. decode_rank(tasks(i)%destination, SIZE(rs_descs)) == decode_rank(tasks(i)%source, SIZE(rs_descs))) THEN
1667 2238 : cost_task_rep = cost_task_rep + tasks(i)%cost
1668 : END IF
1669 : END DO
1670 :
1671 : ! now, correct the load imbalance for the overloaded CPUs
1672 : ! they will send away not more than the total load of replicated tasks
1673 48 : CALL desc%group%allgather(cost_task_rep, recv_buf_i)
1674 :
1675 144 : DO i = 1, desc%group_size
1676 : ! At the moment we can only offload replicated tasks
1677 96 : IF (load_imbalance(i) > 0) &
1678 96 : load_imbalance(i) = MIN(load_imbalance(i), recv_buf_i(i))
1679 : END DO
1680 :
1681             :       ! simplest algorithm I can think of is that the processor with the most
1682             :       ! excess tasks fills up the process needing the most, then moves on to the next most.
1683             :       ! At the moment, if we have fewer replicated tasks than we are overloaded by,
1684             :       ! then task balancing will be incomplete
1685 :
1686             :       ! only need to do anything if this process has excess tasks
1687 48 : IF (load_imbalance(desc%my_pos + 1) > 0) THEN
1688 :
1689 22 : count = 0 ! weighted amount of tasks offloaded
1690 22 : offset = 0 ! no of underloaded processes already filled by other more overloaded procs
1691 :
1692 : ! calculate offset
1693 22 : DO i = desc%group_size, desc%group_size - no_overloaded + 1, -1
1694 22 : IF (INDEX(i) == desc%my_pos + 1) THEN
1695 : EXIT
1696 : ELSE
1697 0 : offset = offset + load_imbalance(INDEX(i))
1698 : END IF
1699 : END DO
1700 :
1701 : ! find my starting processor to send to
1702 22 : proc_receiving = HUGE(proc_receiving)
1703 22 : DO i = 1, no_underloaded
1704 22 : offset = offset + load_imbalance(INDEX(i))
1705 22 : IF (offset <= 0) THEN
1706 : proc_receiving = i
1707 : EXIT
1708 : END IF
1709 : END DO
1710             :          ! offset now contains minus the load that proc_receiving still requires
1711             :          ! we fill this up by adjusting the destination of tasks on the replicated grid,
1712             :          ! then move to the next most underloaded proc
1713 : ! then move to next most underloaded proc
1714 1380 : DO j = 1, ntasks
1715 : IF (tasks(j)%dist_type == 0 &
1716 1380 : .AND. decode_rank(tasks(j)%destination, SIZE(rs_descs)) == decode_rank(tasks(j)%source, SIZE(rs_descs))) THEN
1717             :                ! just avoid sending to non-existent procs due to integer truncation
1718             :                ! in the computation of the average
1719 808 : IF (proc_receiving > no_underloaded) EXIT
1720 : ! set new destination
1721 808 : ilevel = tasks(j)%grid_level
1722 808 : img = tasks(j)%image
1723 808 : iatom = tasks(j)%iatom
1724 808 : jatom = tasks(j)%jatom
1725 808 : iset = tasks(j)%iset
1726 808 : jset = tasks(j)%jset
1727 808 : ipgf = tasks(j)%ipgf
1728 808 : jpgf = tasks(j)%jpgf
1729 808 : tasks(j)%destination = encode_rank(INDEX(proc_receiving) - 1, ilevel, SIZE(rs_descs))
1730 808 : offset = offset + tasks(j)%cost
1731 808 : count = count + tasks(j)%cost
1732 808 : IF (count >= load_imbalance(desc%my_pos + 1)) EXIT
1733 786 : IF (offset > 0) THEN
1734 0 : proc_receiving = proc_receiving + 1
1735             :                   ! just avoid sending to non-existent procs due to integer truncation
1736             :                   ! in the computation of the average
1737 0 : IF (proc_receiving > no_underloaded) EXIT
1738 0 : offset = load_imbalance(INDEX(proc_receiving))
1739 : END IF
1740 : END IF
1741 : END DO
1742 : END IF
1743 :
1744 48 : DEALLOCATE (index)
1745 48 : DEALLOCATE (load_imbalance)
1746 :
1747 48 : CALL timestop(handle)
1748 :
1749 96 : END SUBROUTINE load_balance_replicated
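The capping logic at the start of load_balance_replicated can be summarized as: an overloaded rank may shed at most the cost carried by its still-unassigned replicated tasks. A standalone sketch with made-up numbers (not CP2K code; all names are invented for illustration):

! Standalone sketch: cap the positive load imbalance by the movable (replicated) cost.
PROGRAM demo_imbalance_cap
   IMPLICIT NONE
   INTEGER, PARAMETER :: int_8 = SELECTED_INT_KIND(18)
   INTEGER, PARAMETER :: nranks = 4
   INTEGER(KIND=int_8) :: total_load(nranks), cost_task_rep(nranks), &
                          load_imbalance(nranks), average
   INTEGER :: i

   total_load = [INTEGER(KIND=int_8) :: 120, 80, 60, 140]   ! current load per rank
   cost_task_rep = [INTEGER(KIND=int_8) :: 30, 50, 40, 10]  ! movable replicated cost per rank

   average = SUM(total_load)/nranks
   load_imbalance = total_load - average

   DO i = 1, nranks
      ! overloaded ranks can give away at most their replicated cost
      IF (load_imbalance(i) > 0) load_imbalance(i) = MIN(load_imbalance(i), cost_task_rep(i))
   END DO

   PRINT '(A,I0)', "average load: ", average
   PRINT '(A,*(I0,1X))', "capped imbalance: ", load_imbalance
END PROGRAM demo_imbalance_cap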
1750 :
1751 : ! **************************************************************************************************
1752 : !> \brief given an input task list, redistribute so that all tasks can be processed locally,
1753 : !> i.e. dest equals rank
1754 : !> \param rs_descs ...
1755 : !> \param ntasks ...
1756 : !> \param tasks ...
1757 : !> \param ntasks_recv ...
1758 : !> \param tasks_recv ...
1759 : !> \par History
1760 : !> none
1761 : !> \author MattW 21/11/2007
1762 : ! **************************************************************************************************
1763 48 : SUBROUTINE create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
1764 :
1765 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1766 : POINTER :: rs_descs
1767 : INTEGER :: ntasks
1768 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1769 : INTEGER :: ntasks_recv
1770 : TYPE(task_type), DIMENSION(:), POINTER :: tasks_recv
1771 :
1772 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_local_tasks'
1773 :
1774 : INTEGER :: handle, i, j, k, l, rank
1775 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: recv_buf, send_buf
1776 : INTEGER, ALLOCATABLE, DIMENSION(:) :: recv_disps, recv_sizes, send_disps, &
1777 : send_sizes
1778 : TYPE(realspace_grid_desc_type), POINTER :: desc
1779 :
1780 48 : CALL timeset(routineN, handle)
1781 :
1782 48 : desc => rs_descs(1)%rs_desc
1783 :
1784 : ! allocate local arrays
1785 144 : ALLOCATE (send_sizes(desc%group_size))
1786 96 : ALLOCATE (recv_sizes(desc%group_size))
1787 96 : ALLOCATE (send_disps(desc%group_size))
1788 96 : ALLOCATE (recv_disps(desc%group_size))
1789 144 : ALLOCATE (send_buf(desc%group_size))
1790 96 : ALLOCATE (recv_buf(desc%group_size))
1791 :
1792             :       ! fill send buffer, now counting how many tasks will be sent (stored in an int8 array for convenience only).
1793 144 : send_buf = 0
1794 3666 : DO i = 1, ntasks
1795 : rank = rs_descs(decode_level(tasks(i)%destination, SIZE(rs_descs))) &
1796 3618 : %rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs)))
1797 3666 : send_buf(rank + 1) = send_buf(rank + 1) + 1
1798 : END DO
1799 :
1800 48 : CALL desc%group%alltoall(send_buf, recv_buf, 1)
1801 :
1802 : ! pack the tasks, and send them around
1803 :
1804 144 : send_sizes = 0
1805 144 : send_disps = 0
1806 144 : recv_sizes = 0
1807 144 : recv_disps = 0
1808 :
1809 48 : send_sizes(1) = INT(send_buf(1)*task_size_in_int8)
1810 48 : recv_sizes(1) = INT(recv_buf(1)*task_size_in_int8)
1811 96 : DO i = 2, desc%group_size
1812 48 : send_sizes(i) = INT(send_buf(i)*task_size_in_int8)
1813 48 : recv_sizes(i) = INT(recv_buf(i)*task_size_in_int8)
1814 48 : send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1815 96 : recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1816 : END DO
1817 :
1818 : ! deallocate old send/recv buffers
1819 48 : DEALLOCATE (send_buf)
1820 48 : DEALLOCATE (recv_buf)
1821 :
1822 : ! allocate them with new sizes
1823 233 : ALLOCATE (send_buf(SUM(send_sizes)))
1824 238 : ALLOCATE (recv_buf(SUM(recv_sizes)))
1825 :
1826 : ! do packing
1827 61554 : send_buf = 0
1828 144 : send_sizes = 0
1829 3666 : DO j = 1, ntasks
1830 : i = rs_descs(decode_level(tasks(j)%destination, SIZE(rs_descs))) &
1831 3618 : %rs_desc%virtual2real(decode_rank(tasks(j)%destination, SIZE(rs_descs))) + 1
1832 3618 : l = send_disps(i) + send_sizes(i)
1833 3618 : CALL serialize_task(tasks(j), send_buf(l + 1:l + task_size_in_int8))
1834 3666 : send_sizes(i) = send_sizes(i) + task_size_in_int8
1835 : END DO
1836 :
1837 : ! do communication
1838 48 : CALL desc%group%alltoall(send_buf, send_sizes, send_disps, recv_buf, recv_sizes, recv_disps)
1839 :
1840 48 : DEALLOCATE (send_buf)
1841 :
1842 144 : ntasks_recv = SUM(recv_sizes)/task_size_in_int8
1843 3904 : ALLOCATE (tasks_recv(ntasks_recv))
1844 :
1845 : ! do unpacking
1846 48 : l = 0
1847 144 : DO i = 1, desc%group_size
1848 3762 : DO j = 0, recv_sizes(i)/task_size_in_int8 - 1
1849 3618 : l = l + 1
1850 3618 : k = recv_disps(i) + j*task_size_in_int8
1851 3714 : CALL deserialize_task(tasks_recv(l), recv_buf(k + 1:k + task_size_in_int8))
1852 : END DO
1853 : END DO
1854 :
1855 48 : DEALLOCATE (recv_buf)
1856 48 : DEALLOCATE (send_sizes)
1857 48 : DEALLOCATE (recv_sizes)
1858 48 : DEALLOCATE (send_disps)
1859 48 : DEALLOCATE (recv_disps)
1860 :
1861 48 : CALL timestop(handle)
1862 :
1863 48 : END SUBROUTINE create_local_tasks
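The size/displacement bookkeeping in create_local_tasks follows the usual alltoallv pattern: per-destination task counts are scaled by the serialized task size and turned into displacements by a running sum. A minimal standalone sketch with invented counts and a hypothetical task_size (not CP2K code):

! Standalone sketch: element counts per destination -> sizes and displacements.
PROGRAM demo_displacements
   IMPLICIT NONE
   INTEGER, PARAMETER :: nranks = 4, task_size = 3   ! task_size is illustrative
   INTEGER :: ntasks_per_rank(nranks), send_sizes(nranks), send_disps(nranks), i

   ntasks_per_rank = [2, 0, 3, 1]                    ! hypothetical task counts

   send_sizes = ntasks_per_rank*task_size            ! elements per destination
   send_disps(1) = 0
   DO i = 2, nranks
      send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
   END DO

   ! a task destined for rank i would be serialized into
   ! buffer(send_disps(i)+1 : send_disps(i)+task_size), advancing a per-rank
   ! running counter by task_size after each task, as done in the routine above.
   PRINT '(A,*(I0,1X))', "send_sizes: ", send_sizes
   PRINT '(A,*(I0,1X))', "send_disps: ", send_disps
   PRINT '(A,I0)', "total buffer size: ", SUM(send_sizes)
END PROGRAM demo_displacements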
1864 :
1865 : ! **************************************************************************************************
1866 : !> \brief Assembles tasks to be performed on local grid
1867 : !> \param rs_descs the grids
1868 : !> \param ntasks Number of tasks for local processing
1869 : !> \param natoms ...
1871             : !> \param tasks the task set generated on this processor
1873 : !> \param atom_pair_send ...
1874 : !> \param atom_pair_recv ...
1875 : !> \param symmetric ...
1876 : !> \param reorder_rs_grid_ranks ...
1877 : !> \param skip_load_balance_distributed ...
1878 : !> \par History
1879 : !> none
1880 : !> \author MattW 21/11/2007
1881 : ! **************************************************************************************************
1882 16504 : SUBROUTINE distribute_tasks(rs_descs, ntasks, natoms, &
1883 : tasks, atom_pair_send, atom_pair_recv, &
1884 : symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed)
1885 :
1886 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1887 : POINTER :: rs_descs
1888 : INTEGER :: ntasks, natoms
1889 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1890 : TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair_send, atom_pair_recv
1891 : LOGICAL, INTENT(IN) :: symmetric, reorder_rs_grid_ranks, &
1892 : skip_load_balance_distributed
1893 :
1894 : CHARACTER(LEN=*), PARAMETER :: routineN = 'distribute_tasks'
1895 :
1896 : INTEGER :: handle, igrid_level, irank, ntasks_recv
1897 : INTEGER(KIND=int_8) :: load_gap, max_load, replicated_load
1898 16504 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: total_loads, total_loads_tmp, trial_loads
1899 16504 : INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: loads
1900 16504 : INTEGER, ALLOCATABLE, DIMENSION(:) :: indices, real2virtual, total_index
1901 : LOGICAL :: distributed_grids, fixed_first_grid
1902 : TYPE(realspace_grid_desc_type), POINTER :: desc
1903 16504 : TYPE(task_type), DIMENSION(:), POINTER :: tasks_recv
1904 :
1905 16504 : CALL timeset(routineN, handle)
1906 :
1907 16504 : CPASSERT(ASSOCIATED(tasks))
1908 :
1909 : ! *** figure out if we have distributed grids
1910 16504 : distributed_grids = .FALSE.
1911 82024 : DO igrid_level = 1, SIZE(rs_descs)
1912 82024 : IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1913 54 : distributed_grids = .TRUE.
1914 : END IF
1915 : END DO
1916 16504 : desc => rs_descs(1)%rs_desc
1917 :
1918 16504 : IF (distributed_grids) THEN
1919 :
1920 192 : ALLOCATE (loads(0:desc%group_size - 1, SIZE(rs_descs)))
1921 144 : ALLOCATE (total_loads(0:desc%group_size - 1))
1922 :
1923 144 : total_loads = 0
1924 :
1925 : ! First round of balancing on the distributed grids
1926 : ! we just balance each of the distributed grids independently
1927 234 : DO igrid_level = 1, SIZE(rs_descs)
1928 234 : IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1929 :
1930 54 : IF (.NOT. skip_load_balance_distributed) &
1931 54 : CALL load_balance_distributed(tasks, ntasks, rs_descs, igrid_level, natoms)
1932 :
1933 : CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
1934 54 : tasks, use_reordered_ranks=.FALSE.)
1935 :
1936 162 : total_loads(:) = total_loads + loads(:, igrid_level)
1937 :
1938 : END IF
1939 : END DO
1940 :
1941 : ! calculate the total load of replicated tasks, so we can decide if it is worth
1942 : ! reordering the distributed grid levels
1943 :
1944 48 : replicated_load = 0
1945 234 : DO igrid_level = 1, SIZE(rs_descs)
1946 234 : IF (.NOT. rs_descs(igrid_level)%rs_desc%distributed) THEN
1947 : CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
1948 132 : tasks, use_reordered_ranks=.FALSE.)
1949 396 : replicated_load = replicated_load + SUM(loads(:, igrid_level))
1950 : END IF
1951 : END DO
1952 :
1953 : !IF (desc%my_pos==0) THEN
1954 : ! WRITE(*,*) "Total replicated load is ",replicated_load
1955 : !END IF
1956 :
1957 : ! Now we adjust the rank ordering based on the current loads
1958 : ! we leave the first distributed level and all the replicated levels in the default order
1959 48 : IF (reorder_rs_grid_ranks) THEN
1960 48 : fixed_first_grid = .FALSE.
1961 234 : DO igrid_level = 1, SIZE(rs_descs)
1962 234 : IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1963 54 : IF (fixed_first_grid .EQV. .FALSE.) THEN
1964 144 : total_loads(:) = loads(:, igrid_level)
1965 : fixed_first_grid = .TRUE.
1966 : ELSE
1967 18 : ALLOCATE (trial_loads(0:desc%group_size - 1))
1968 :
1969 18 : trial_loads(:) = total_loads + loads(:, igrid_level)
1970 18 : max_load = MAXVAL(trial_loads)
1971 : load_gap = 0
1972 18 : DO irank = 0, desc%group_size - 1
1973 18 : load_gap = load_gap + max_load - trial_loads(irank)
1974 : END DO
1975 :
1976 : ! If there is not enough replicated load to load balance well enough
1977 : ! then we will reorder this grid level
1978 6 : IF (load_gap > replicated_load*1.05_dp) THEN
1979 :
1980 18 : ALLOCATE (indices(0:desc%group_size - 1))
1981 12 : ALLOCATE (total_index(0:desc%group_size - 1))
1982 12 : ALLOCATE (total_loads_tmp(0:desc%group_size - 1))
1983 12 : ALLOCATE (real2virtual(0:desc%group_size - 1))
1984 :
1985 18 : total_loads_tmp(:) = total_loads
1986 6 : CALL sort(total_loads_tmp, desc%group_size, total_index)
1987 6 : CALL sort(loads(:, igrid_level), desc%group_size, indices)
1988 :
1989 : ! Reorder so that the rank with smallest load on this grid level is paired with
1990 : ! the highest load in total
1991 18 : DO irank = 0, desc%group_size - 1
1992 : total_loads(total_index(irank) - 1) = total_loads(total_index(irank) - 1) + &
1993 12 : loads(desc%group_size - irank - 1, igrid_level)
1994 18 : real2virtual(total_index(irank) - 1) = indices(desc%group_size - irank - 1) - 1
1995 : END DO
1996 :
1997 6 : CALL rs_grid_reorder_ranks(rs_descs(igrid_level)%rs_desc, real2virtual)
1998 :
1999 6 : DEALLOCATE (indices)
2000 6 : DEALLOCATE (total_index)
2001 6 : DEALLOCATE (total_loads_tmp)
2002 6 : DEALLOCATE (real2virtual)
2003 : ELSE
2004 0 : total_loads(:) = trial_loads
2005 : END IF
2006 :
2007 6 : DEALLOCATE (trial_loads)
2008 :
2009 : END IF
2010 : END IF
2011 : END DO
2012 : END IF
2013 :
2014 : ! Now we use the replicated tasks to balance out the rest of the load
2015 48 : CALL load_balance_replicated(rs_descs, ntasks, tasks)
2016 :
2017 : !total_loads = 0
2018 : !DO igrid_level=1,SIZE(rs_descs)
2019 : ! CALL get_current_loads(loads(:,igrid_level), rs_descs, igrid_level, ntasks, &
2020 : ! tasks, use_reordered_ranks=.TRUE.)
2021 : ! total_loads = total_loads + loads(:, igrid_level)
2022 : !END DO
2023 :
2024 : !IF (desc%my_pos==0) THEN
2025 : ! WRITE(*,*) ""
2026 : ! WRITE(*,*) "At the end of the load balancing procedure"
2027 : ! WRITE(*,*) "Maximum load:",MAXVAL(total_loads)
2028 : ! WRITE(*,*) "Average load:",SUM(total_loads)/SIZE(total_loads)
2029 : ! WRITE(*,*) "Minimum load:",MINVAL(total_loads)
2030 : !ENDIF
2031 :
2032 : ! given a list of tasks, this will do the needed reshuffle so that all tasks will be local
2033 48 : CALL create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
2034 :
2035 : !
2036             :          ! task lists are complete; we can compute the list of atomic blocks (atom pairs)
2037             :          ! we will be sending. These lists are needed for rs_distribute_matrix.
2038 : !
2039 48 : CALL get_atom_pair(atom_pair_send, tasks, ntasks=ntasks, send=.TRUE., symmetric=symmetric, rs_descs=rs_descs)
2040 :
2041 : ! natom_send=SIZE(atom_pair_send)
2042 : ! CALL desc%group%sum(natom_send)
2043 : ! IF (desc%my_pos==0) THEN
2044 : ! WRITE(*,*) ""
2045 : ! WRITE(*,*) "Total number of atomic blocks to be send:",natom_send
2046 : ! ENDIF
2047 :
2048 48 : CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.FALSE., symmetric=symmetric, rs_descs=rs_descs)
2049 :
2050 : ! cleanup, at this point we don't need the original tasks anymore
2051 48 : DEALLOCATE (tasks)
2052 48 : DEALLOCATE (loads)
2053 48 : DEALLOCATE (total_loads)
2054 :
2055 : ELSE
2056 16456 : tasks_recv => tasks
2057 16456 : ntasks_recv = ntasks
2058 16456 : CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.FALSE., symmetric=symmetric, rs_descs=rs_descs)
2059 : ! not distributed, hence atom_pair_send not needed
2060 : END IF
2061 :
2062 : ! here we sort the task list we will process locally.
2063 49229 : ALLOCATE (indices(ntasks_recv))
2064 16504 : CALL tasks_sort(tasks_recv, ntasks_recv, indices)
2065 16504 : DEALLOCATE (indices)
2066 :
2067 : !
2068 : ! final lists are ready
2069 : !
2070 :
2071 16504 : tasks => tasks_recv
2072 16504 : ntasks = ntasks_recv
2073 :
2074 16504 : CALL timestop(handle)
2075 :
2076 33008 : END SUBROUTINE distribute_tasks
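The rank-reordering decision in distribute_tasks boils down to comparing the "load gap" below the most loaded rank against the replicated load available for later balancing, with a 5% margin. A standalone sketch with invented numbers (not CP2K code; names are illustrative):

! Standalone sketch: decide whether a distributed grid level should be reordered.
PROGRAM demo_reorder_decision
   IMPLICIT NONE
   INTEGER, PARAMETER :: int_8 = SELECTED_INT_KIND(18)
   INTEGER, PARAMETER :: nranks = 4
   INTEGER(KIND=int_8) :: trial_loads(nranks), load_gap, max_load, replicated_load
   INTEGER :: irank

   trial_loads = [INTEGER(KIND=int_8) :: 90, 70, 120, 60]
   replicated_load = 50

   max_load = MAXVAL(trial_loads)
   load_gap = 0
   DO irank = 1, nranks
      load_gap = load_gap + max_load - trial_loads(irank)
   END DO

   ! if the gap cannot be filled by replicated work (plus a 5% margin), reorder
   IF (REAL(load_gap) > REAL(replicated_load)*1.05) THEN
      PRINT *, "reorder this grid level: gap", load_gap, "> replicated", replicated_load
   ELSE
      PRINT *, "keep the default rank order"
   END IF
END PROGRAM demo_reorder_decision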
2077 :
2078 : ! **************************************************************************************************
2079 : !> \brief ...
2080 : !> \param atom_pair ...
2081             : !> \param tasks ...
2082             : !> \param ntasks ...
2083             : !> \param send ...
2084             : !> \param symmetric ...
2086 : !> \param rs_descs ...
2087 : ! **************************************************************************************************
2088 16552 : SUBROUTINE get_atom_pair(atom_pair, tasks, ntasks, send, symmetric, rs_descs)
2089 :
2090 : TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair
2091 : TYPE(task_type), DIMENSION(:), INTENT(INOUT) :: tasks
2092 : INTEGER, INTENT(IN) :: ntasks
2093 : LOGICAL, INTENT(IN) :: send, symmetric
2094 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), INTENT(IN) :: rs_descs
2095 :
2096 : INTEGER :: i, ilevel, iatom, jatom, npairs, virt_rank
2097 16552 : INTEGER, DIMENSION(:), ALLOCATABLE :: indices
2098 16552 : TYPE(atom_pair_type), DIMENSION(:), ALLOCATABLE :: atom_pair_tmp
2099 :
2100 16552 : CPASSERT(.NOT. ASSOCIATED(atom_pair))
2101 16552 : IF (ntasks == 0) THEN
2102 290 : ALLOCATE (atom_pair(0))
2103 : RETURN
2104 : END IF
2105 :
2106 : ! calculate list of atom pairs
2107 : ! fill pair list taking into account symmetry
2108 8029301 : ALLOCATE (atom_pair_tmp(ntasks))
2109 7996777 : DO i = 1, ntasks
2110 7980515 : atom_pair_tmp(i)%image = tasks(i)%image
2111 7980515 : iatom = tasks(i)%iatom
2112 7980515 : jatom = tasks(i)%jatom
2113 7980515 : IF (symmetric .AND. iatom > jatom) THEN
2114 : ! iatom / jatom swapped
2115 3008071 : atom_pair_tmp(i)%row = jatom
2116 3008071 : atom_pair_tmp(i)%col = iatom
2117 : ELSE
2118 4972444 : atom_pair_tmp(i)%row = iatom
2119 4972444 : atom_pair_tmp(i)%col = jatom
2120 : END IF
2121 :
2122 7996777 : IF (send) THEN
2123 : ! If sending, we need to use the 'real rank' as the pair has to be sent to the process which
2124 : ! actually has the correct part of the rs_grid to do the mapping
2125 3618 : ilevel = tasks(i)%grid_level
2126 3618 : virt_rank = decode_rank(tasks(i)%destination, SIZE(rs_descs))
2127 3618 : atom_pair_tmp(i)%rank = rs_descs(ilevel)%rs_desc%virtual2real(virt_rank)
2128 : ELSE
2129 : ! If we are receiving, then no conversion is needed as the rank is that of the process with the
2130 : ! required matrix block, and the ordering of the rs grid is irrelevant
2131 7976897 : atom_pair_tmp(i)%rank = decode_rank(tasks(i)%source, SIZE(rs_descs))
2132 : END IF
2133 : END DO
2134 :
2135 : ! find unique atom pairs that I'm sending/receiving
2136 48786 : ALLOCATE (indices(ntasks))
2137 16262 : CALL atom_pair_sort(atom_pair_tmp, ntasks, indices)
2138 16262 : npairs = 1
2139 16262 : tasks(indices(1))%pair_index = 1
2140 7980515 : DO i = 2, ntasks
2141 7964253 : IF (atom_pair_less_than(atom_pair_tmp(i - 1), atom_pair_tmp(i))) THEN
2142 267845 : npairs = npairs + 1
2143 267845 : atom_pair_tmp(npairs) = atom_pair_tmp(i)
2144 : END IF
2145 7980515 : tasks(indices(i))%pair_index = npairs
2146 : END DO
2147 16262 : DEALLOCATE (indices)
2148 :
2149 : ! Copy unique pairs to final location.
2150 332893 : ALLOCATE (atom_pair(npairs))
2151 300369 : atom_pair(:) = atom_pair_tmp(:npairs)
2152 16262 : DEALLOCATE (atom_pair_tmp)
2153 :
2154 : END SUBROUTINE get_atom_pair
2155 :
2156 : ! **************************************************************************************************
2157 : !> \brief redistributes the matrix so that it can be used in realspace operations
2158 : !> i.e. according to the task lists for collocate and integrate.
2159 : !> This routine can become a bottleneck in large calculations.
2160 : !> \param rs_descs ...
2161 : !> \param pmats ...
2162 : !> \param atom_pair_send ...
2163 : !> \param atom_pair_recv ...
2165 : !> \param nimages ...
2166 : !> \param scatter ...
2167 : !> \param hmats ...
2168 : ! **************************************************************************************************
2169 0 : SUBROUTINE rs_distribute_matrix(rs_descs, pmats, atom_pair_send, atom_pair_recv, &
2170 : nimages, scatter, hmats)
2171 :
2172 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
2173 : POINTER :: rs_descs
2174 : TYPE(dbcsr_p_type), DIMENSION(:), POINTER :: pmats
2175 : TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair_send, atom_pair_recv
2176 : INTEGER :: nimages
2177 : LOGICAL :: scatter
2178 : TYPE(dbcsr_p_type), DIMENSION(:), OPTIONAL, &
2179 : POINTER :: hmats
2180 :
2181 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_distribute_matrix'
2182 :
2183 : INTEGER :: acol, arow, handle, i, img, j, k, l, me, &
2184 : nblkcols_total, nblkrows_total, ncol, &
2185 : nrow, nthread, nthread_left
2186 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: first_col, first_row, last_col, last_row, recv_disps, &
2187 0 : recv_pair_count, recv_pair_disps, recv_sizes, send_disps, send_pair_count, &
2188 0 : send_pair_disps, send_sizes
2189 0 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, row_blk_size
2190 : LOGICAL :: found
2191 0 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET :: recv_buf_r, send_buf_r
2192 0 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: h_block, p_block
2193 : TYPE(dbcsr_type), POINTER :: hmat, pmat
2194 : TYPE(realspace_grid_desc_type), POINTER :: desc
2195 0 : REAL(kind=dp), DIMENSION(:), POINTER :: vector
2196 :
2197 0 : !$ INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
2198 :
2199 0 : CALL timeset(routineN, handle)
2200 :
2201 0 : IF (.NOT. scatter) THEN
2202 0 : CPASSERT(PRESENT(hmats))
2203 : END IF
2204 :
2205 0 : desc => rs_descs(1)%rs_desc
2206 0 : me = desc%my_pos + 1
2207 :
2208 : ! allocate local arrays
2209 0 : ALLOCATE (send_sizes(desc%group_size))
2210 0 : ALLOCATE (recv_sizes(desc%group_size))
2211 0 : ALLOCATE (send_disps(desc%group_size))
2212 0 : ALLOCATE (recv_disps(desc%group_size))
2213 0 : ALLOCATE (send_pair_count(desc%group_size))
2214 0 : ALLOCATE (recv_pair_count(desc%group_size))
2215 0 : ALLOCATE (send_pair_disps(desc%group_size))
2216 0 : ALLOCATE (recv_pair_disps(desc%group_size))
2217 :
2218 0 : pmat => pmats(1)%matrix
2219 : CALL dbcsr_get_info(pmat, &
2220 : row_blk_size=row_blk_size, &
2221 : col_blk_size=col_blk_size, &
2222 : nblkrows_total=nblkrows_total, &
2223 0 : nblkcols_total=nblkcols_total)
2224 0 : ALLOCATE (first_row(nblkrows_total), last_row(nblkrows_total), &
2225 0 : first_col(nblkcols_total), last_col(nblkcols_total))
2226 0 : CALL dbcsr_convert_sizes_to_offsets(row_blk_size, first_row, last_row)
2227 0 : CALL dbcsr_convert_sizes_to_offsets(col_blk_size, first_col, last_col)
2228 :
2229 : ! set up send buffer sizes
2230 0 : send_sizes = 0
2231 0 : send_pair_count = 0
2232 0 : DO i = 1, SIZE(atom_pair_send)
2233 0 : k = atom_pair_send(i)%rank + 1 ! proc we're sending this block to
2234 0 : arow = atom_pair_send(i)%row
2235 0 : acol = atom_pair_send(i)%col
2236 0 : nrow = last_row(arow) - first_row(arow) + 1
2237 0 : ncol = last_col(acol) - first_col(acol) + 1
2238 0 : send_sizes(k) = send_sizes(k) + nrow*ncol
2239 0 : send_pair_count(k) = send_pair_count(k) + 1
2240 : END DO
2241 :
2242 0 : send_disps = 0
2243 0 : send_pair_disps = 0
2244 0 : DO i = 2, desc%group_size
2245 0 : send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
2246 0 : send_pair_disps(i) = send_pair_disps(i - 1) + send_pair_count(i - 1)
2247 : END DO
2248 :
2249 0 : ALLOCATE (send_buf_r(SUM(send_sizes)))
2250 :
2251 : ! set up recv buffer
2252 :
2253 0 : recv_sizes = 0
2254 0 : recv_pair_count = 0
2255 0 : DO i = 1, SIZE(atom_pair_recv)
2256 0 : k = atom_pair_recv(i)%rank + 1 ! proc we're receiving this data from
2257 0 : arow = atom_pair_recv(i)%row
2258 0 : acol = atom_pair_recv(i)%col
2259 0 : nrow = last_row(arow) - first_row(arow) + 1
2260 0 : ncol = last_col(acol) - first_col(acol) + 1
2261 0 : recv_sizes(k) = recv_sizes(k) + nrow*ncol
2262 0 : recv_pair_count(k) = recv_pair_count(k) + 1
2263 : END DO
2264 :
2265 0 : recv_disps = 0
2266 0 : recv_pair_disps = 0
2267 0 : DO i = 2, desc%group_size
2268 0 : recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
2269 0 : recv_pair_disps(i) = recv_pair_disps(i - 1) + recv_pair_count(i - 1)
2270 : END DO
2271 0 : ALLOCATE (recv_buf_r(SUM(recv_sizes)))
2272 :
2273 : !$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
2274 : !$OMP SHARED(desc,send_pair_count,send_pair_disps,nimages),&
2275 : !$OMP SHARED(last_row,first_row,last_col,first_col),&
2276 : !$OMP SHARED(pmats,send_buf_r,send_disps,send_sizes),&
2277 : !$OMP SHARED(atom_pair_send,me,hmats,nblkrows_total),&
2278 : !$OMP SHARED(atom_pair_recv,recv_buf_r,scatter,recv_pair_disps), &
2279 : !$OMP SHARED(recv_sizes,recv_disps,recv_pair_count,locks), &
2280 : !$OMP PRIVATE(i,img,arow,acol,nrow,ncol,p_block,found,j,k,l),&
2281 0 : !$OMP PRIVATE(nthread,h_block,nthread_left,hmat,pmat,vector)
2282 :
2283 : nthread = 1
2284 : !$ nthread = omp_get_num_threads()
2285 : nthread_left = 1
2286 : !$ nthread_left = MAX(1, nthread - 1)
2287 :
2288 : ! do packing
2289 : !$OMP DO schedule(guided)
2290 : DO l = 1, desc%group_size
2291 : IF (l == me) CYCLE
2292 : send_sizes(l) = 0
2293 : DO i = 1, send_pair_count(l)
2294 : arow = atom_pair_send(send_pair_disps(l) + i)%row
2295 : acol = atom_pair_send(send_pair_disps(l) + i)%col
2296 : img = atom_pair_send(send_pair_disps(l) + i)%image
2297 : nrow = last_row(arow) - first_row(arow) + 1
2298 : ncol = last_col(acol) - first_col(acol) + 1
2299 : pmat => pmats(img)%matrix
2300 : CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
2301 : CPASSERT(found)
2302 :
2303 : DO k = 1, ncol
2304 : DO j = 1, nrow
2305 : send_buf_r(send_disps(l) + send_sizes(l) + j + (k - 1)*nrow) = p_block(j, k)
2306 : END DO
2307 : END DO
2308 : send_sizes(l) = send_sizes(l) + nrow*ncol
2309 : END DO
2310 : END DO
2311 : !$OMP END DO
2312 :
2313 : IF (.NOT. scatter) THEN
2314 : ! We need locks to protect concurrent summation into H
2315 : !$OMP SINGLE
2316 : !$ ALLOCATE (locks(nthread*10))
2317 : !$OMP END SINGLE
2318 :
2319 : !$OMP DO
2320 : !$ do i = 1, nthread*10
2321 : !$ call omp_init_lock(locks(i))
2322 : !$ end do
2323 : !$OMP END DO
2324 : END IF
2325 :
2326 : !$OMP MASTER
2327 : ! do communication
2328 : CALL desc%group%alltoall(send_buf_r, send_sizes, send_disps, &
2329 : recv_buf_r, recv_sizes, recv_disps)
2330 : !$OMP END MASTER
2331 :
2332 : ! If this is a scatter, then no need to copy local blocks,
2333 : ! If not, we sum them directly into H (bypassing the alltoall)
2334 : IF (.NOT. scatter) THEN
2335 :
2336 : ! Distribute work over remaining threads assuming one is still in the alltoall
2337 : !$OMP DO schedule(dynamic,MAX(1,send_pair_count(me)/nthread_left))
2338 : DO i = 1, send_pair_count(me)
2339 : arow = atom_pair_send(send_pair_disps(me) + i)%row
2340 : acol = atom_pair_send(send_pair_disps(me) + i)%col
2341 : img = atom_pair_send(send_pair_disps(me) + i)%image
2342 : nrow = last_row(arow) - first_row(arow) + 1
2343 : ncol = last_col(acol) - first_col(acol) + 1
2344 : hmat => hmats(img)%matrix
2345 : pmat => pmats(img)%matrix
2346 : CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, BLOCK=h_block, found=found)
2347 : CPASSERT(found)
2348 : CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, BLOCK=p_block, found=found)
2349 : CPASSERT(found)
2350 :
2351 : !$ call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2352 : DO k = 1, ncol
2353 : DO j = 1, nrow
2354 : h_block(j, k) = h_block(j, k) + p_block(j, k)
2355 : END DO
2356 : END DO
2357 : !$ call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2358 : END DO
2359 : !$OMP END DO
2360 : ELSE
2361 : ! We will insert new blocks into P, so create mutable work matrices
2362 : DO img = 1, nimages
2363 : pmat => pmats(img)%matrix
2364 : CALL dbcsr_work_create(pmat, work_mutable=.TRUE., &
2365 : nblks_guess=SIZE(atom_pair_recv)/nthread, sizedata_guess=SIZE(recv_buf_r)/nthread, &
2366 : n=nthread)
2367 : END DO
2368 : END IF
2369 :
2370 : ! wait for comm and setup to finish
2371 : !$OMP BARRIER
2372 :
2373 : !do unpacking
2374 : !$OMP DO schedule(guided)
2375 : DO l = 1, desc%group_size
2376 : IF (l == me) CYCLE
2377 : recv_sizes(l) = 0
2378 : DO i = 1, recv_pair_count(l)
2379 : arow = atom_pair_recv(recv_pair_disps(l) + i)%row
2380 : acol = atom_pair_recv(recv_pair_disps(l) + i)%col
2381 : img = atom_pair_recv(recv_pair_disps(l) + i)%image
2382 : nrow = last_row(arow) - first_row(arow) + 1
2383 : ncol = last_col(acol) - first_col(acol) + 1
2384 : pmat => pmats(img)%matrix
2385 : NULLIFY (p_block)
2386 : CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, BLOCK=p_block, found=found)
2387 :
2388 : IF (PRESENT(hmats)) THEN
2389 : hmat => hmats(img)%matrix
2390 : CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, BLOCK=h_block, found=found)
2391 : CPASSERT(found)
2392 : END IF
2393 :
2394 : IF (scatter .AND. .NOT. ASSOCIATED(p_block)) THEN
2395 : vector => recv_buf_r(recv_disps(l) + recv_sizes(l) + 1:recv_disps(l) + recv_sizes(l) + nrow*ncol)
2396 : CALL dbcsr_put_block(pmat, arow, acol, block=RESHAPE(vector, [nrow, ncol]))
2397 : END IF
2398 : IF (.NOT. scatter) THEN
2399 : !$ call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2400 : DO k = 1, ncol
2401 : DO j = 1, nrow
2402 : h_block(j, k) = h_block(j, k) + recv_buf_r(recv_disps(l) + recv_sizes(l) + j + (k - 1)*nrow)
2403 : END DO
2404 : END DO
2405 : !$ call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2406 : END IF
2407 : recv_sizes(l) = recv_sizes(l) + nrow*ncol
2408 : END DO
2409 : END DO
2410 : !$OMP END DO
2411 :
2412 : !$ IF (.not. scatter) THEN
2413 : !$OMP DO
2414 : !$ do i = 1, nthread*10
2415 : !$ call omp_destroy_lock(locks(i))
2416 : !$ end do
2417 : !$OMP END DO
2418 : !$ END IF
2419 :
2420 : !$OMP SINGLE
2421 : !$ IF (.not. scatter) THEN
2422 : !$ DEALLOCATE (locks)
2423 : !$ END IF
2424 : !$OMP END SINGLE NOWAIT
2425 :
2426 : IF (scatter) THEN
2427 : ! Blocks were added to P
2428 : DO img = 1, nimages
2429 : pmat => pmats(img)%matrix
2430 : CALL dbcsr_finalize(pmat)
2431 : END DO
2432 : END IF
2433 : !$OMP END PARALLEL
2434 :
2435 0 : DEALLOCATE (send_buf_r)
2436 0 : DEALLOCATE (recv_buf_r)
2437 :
2438 0 : DEALLOCATE (send_sizes)
2439 0 : DEALLOCATE (recv_sizes)
2440 0 : DEALLOCATE (send_disps)
2441 0 : DEALLOCATE (recv_disps)
2442 0 : DEALLOCATE (send_pair_count)
2443 0 : DEALLOCATE (recv_pair_count)
2444 0 : DEALLOCATE (send_pair_disps)
2445 0 : DEALLOCATE (recv_pair_disps)
2446 :
2447 0 : DEALLOCATE (first_row, last_row, first_col, last_col)
2448 :
2449 0 : CALL timestop(handle)
2450 :
2451 0 : END SUBROUTINE rs_distribute_matrix
2452 :
2453 : ! **************************************************************************************************
2454             : !> \brief Calculates offsets and sizes used by rs_scatter_matrices/rs_gather_matrices and rs_copy_to_buffer/rs_copy_to_matrices.
2455 : !> \author Ole Schuett
2456 : ! **************************************************************************************************
2457 14158 : SUBROUTINE rs_calc_offsets(pairs, nsgf, group_size, &
2458 : pair_offsets, rank_offsets, rank_sizes, buffer_size)
2459 : TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: pairs
2460 : INTEGER, DIMENSION(:), INTENT(IN) :: nsgf
2461 : INTEGER, INTENT(IN) :: group_size
2462 : INTEGER, DIMENSION(:), POINTER :: pair_offsets, rank_offsets, rank_sizes
2463 : INTEGER, INTENT(INOUT) :: buffer_size
2464 :
2465 : INTEGER :: acol, arow, i, block_size, total_size, k, prev_k
2466 :
2467 14158 : IF (ASSOCIATED(pair_offsets)) DEALLOCATE (pair_offsets)
2468 14158 : IF (ASSOCIATED(rank_offsets)) DEALLOCATE (rank_offsets)
2469 14158 : IF (ASSOCIATED(rank_sizes)) DEALLOCATE (rank_sizes)
2470 :
2471 : ! calculate buffer_size and pair_offsets
2472 42235 : ALLOCATE (pair_offsets(SIZE(pairs)))
2473 14158 : total_size = 0
2474 286847 : DO i = 1, SIZE(pairs)
2475 272689 : pair_offsets(i) = total_size
2476 272689 : arow = pairs(i)%row
2477 272689 : acol = pairs(i)%col
2478 272689 : block_size = nsgf(arow)*nsgf(acol)
2479 286847 : total_size = total_size + block_size
2480 : END DO
2481 14158 : buffer_size = total_size
2482 :
2483 : ! calculate rank_offsets and rank_sizes
2484 42474 : ALLOCATE (rank_offsets(group_size))
2485 28316 : ALLOCATE (rank_sizes(group_size))
2486 41254 : rank_offsets = 0
2487 41254 : rank_sizes = 0
2488 14158 : IF (SIZE(pairs) > 0) THEN
2489 13919 : prev_k = pairs(1)%rank + 1
2490 286608 : DO i = 1, SIZE(pairs)
2491 272689 : k = pairs(i)%rank + 1
2492 272689 : CPASSERT(k >= prev_k) ! expecting the pairs to be ordered by rank
2493 286608 : IF (k > prev_k) THEN
2494 61 : rank_offsets(k) = pair_offsets(i)
2495 61 : rank_sizes(prev_k) = rank_offsets(k) - rank_offsets(prev_k)
2496 61 : prev_k = k
2497 : END IF
2498 : END DO
2499 13919 : rank_sizes(k) = buffer_size - rank_offsets(k) ! complete last rank
2500 : END IF
2501 :
2502 14158 : END SUBROUTINE rs_calc_offsets
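The offset computation in rs_calc_offsets is a running sum over block sizes: pair i starts at pair_offsets(i) and occupies nsgf(row)*nsgf(col) elements of the flat exchange buffer. A minimal standalone sketch with made-up sizes (not CP2K code; all values are invented):

! Standalone sketch: per-pair buffer offsets as a running sum of block sizes.
PROGRAM demo_pair_offsets
   IMPLICIT NONE
   INTEGER, PARAMETER :: npairs = 4
   INTEGER :: nsgf(3), rows(npairs), cols(npairs), pair_offsets(npairs)
   INTEGER :: i, total_size

   nsgf = [5, 9, 13]                 ! basis-set block sizes per atom (illustrative)
   rows = [1, 1, 2, 3]
   cols = [1, 2, 3, 3]

   total_size = 0
   DO i = 1, npairs
      pair_offsets(i) = total_size                       ! where this block starts
      total_size = total_size + nsgf(rows(i))*nsgf(cols(i))
   END DO

   PRINT '(A,*(I0,1X))', "pair_offsets: ", pair_offsets
   PRINT '(A,I0)', "buffer_size:  ", total_size
END PROGRAM demo_pair_offsets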
2503 :
2504 : ! **************************************************************************************************
2505 : !> \brief Scatters dbcsr matrix blocks and receives them into a buffer as needed before collocation.
2506 : !> \author Ole Schuett
2507 : ! **************************************************************************************************
2508 266 : SUBROUTINE rs_scatter_matrices(src_matrices, dest_buffer, task_list, group)
2509 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2510 : TYPE(offload_buffer_type), INTENT(INOUT) :: dest_buffer
2511 : TYPE(task_list_type), INTENT(IN) :: task_list
2512 : TYPE(mp_comm_type), INTENT(IN) :: group
2513 :
2514 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_scatter_matrices'
2515 :
2516 : INTEGER :: handle
2517 : REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: buffer_send
2518 :
2519 266 : CALL timeset(routineN, handle)
2520 777 : ALLOCATE (buffer_send(task_list%buffer_size_send))
2521 :
2522 : ! pack dbcsr blocks into send buffer
2523 266 : CPASSERT(ASSOCIATED(task_list%atom_pair_send))
2524 : CALL rs_pack_buffer(src_matrices=src_matrices, &
2525 : dest_buffer=buffer_send, &
2526 : atom_pair=task_list%atom_pair_send, &
2527 266 : pair_offsets=task_list%pair_offsets_send)
2528 :
2529             :       ! mpi all-to-all communication, receiving directly into dest_buffer%host_buffer.
2530 : CALL group%alltoall(buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send, &
2531 : dest_buffer%host_buffer, &
2532 2394 : task_list%rank_sizes_recv, task_list%rank_offsets_recv)
2533 :
2534 266 : DEALLOCATE (buffer_send)
2535 266 : CALL timestop(handle)
2536 :
2537 266 : END SUBROUTINE rs_scatter_matrices
2538 :
2539 : ! **************************************************************************************************
2540             : !> \brief Gathers the matrix blocks from the buffer and adds them into the dbcsr matrices as needed after integration.
2541 : !> \author Ole Schuett
2542 : ! **************************************************************************************************
2543 218 : SUBROUTINE rs_gather_matrices(src_buffer, dest_matrices, task_list, group)
2544 : TYPE(offload_buffer_type), INTENT(IN) :: src_buffer
2545 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2546 : TYPE(task_list_type), INTENT(IN) :: task_list
2547 : TYPE(mp_comm_type), INTENT(IN) :: group
2548 :
2549 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_gather_matrices'
2550 :
2551 : INTEGER :: handle
2552 : REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: buffer_send
2553 :
2554 218 : CALL timeset(routineN, handle)
2555 :
2556             :       ! Caution: The meaning of send and recv is reversed in this routine.
2557         640 :       ALLOCATE (buffer_send(task_list%buffer_size_send)) ! i.e. this is actually used for receiving
2558 :
2559 : ! mpi all-to-all communication
2560 : CALL group%alltoall(src_buffer%host_buffer, task_list%rank_sizes_recv, task_list%rank_offsets_recv, &
2561 1962 : buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send)
2562 :
2563 : ! unpack dbcsr blocks from send buffer
2564 218 : CPASSERT(ASSOCIATED(task_list%atom_pair_send))
2565 : CALL rs_unpack_buffer(src_buffer=buffer_send, &
2566 : dest_matrices=dest_matrices, &
2567 : atom_pair=task_list%atom_pair_send, &
2568 218 : pair_offsets=task_list%pair_offsets_send)
2569 :
2570 218 : DEALLOCATE (buffer_send)
2571 218 : CALL timestop(handle)
2572 :
2573 218 : END SUBROUTINE rs_gather_matrices
2574 :
2575 : ! **************************************************************************************************
2576             : !> \brief Copies the DBCSR blocks into the buffer; replaces rs_scatter_matrices for non-distributed grids.
2577 : !> \author Ole Schuett
2578 : ! **************************************************************************************************
2579 231477 : SUBROUTINE rs_copy_to_buffer(src_matrices, dest_buffer, task_list)
2580 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2581 : TYPE(offload_buffer_type), INTENT(INOUT) :: dest_buffer
2582 : TYPE(task_list_type), INTENT(IN) :: task_list
2583 :
2584 : CALL rs_pack_buffer(src_matrices=src_matrices, &
2585 : dest_buffer=dest_buffer%host_buffer, &
2586 : atom_pair=task_list%atom_pair_recv, &
2587 231477 : pair_offsets=task_list%pair_offsets_recv)
2588 :
2589 231477 : END SUBROUTINE rs_copy_to_buffer
2590 :
2591 : ! **************************************************************************************************
2592             : !> \brief Copies from the buffer into the DBCSR matrices; replaces rs_gather_matrices for non-distributed grids.
2593 : !> \author Ole Schuett
2594 : ! **************************************************************************************************
2595 190600 : SUBROUTINE rs_copy_to_matrices(src_buffer, dest_matrices, task_list)
2596 : TYPE(offload_buffer_type), INTENT(IN) :: src_buffer
2597 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2598 : TYPE(task_list_type), INTENT(IN) :: task_list
2599 :
2600 : CALL rs_unpack_buffer(src_buffer=src_buffer%host_buffer, &
2601 : dest_matrices=dest_matrices, &
2602 : atom_pair=task_list%atom_pair_recv, &
2603 190600 : pair_offsets=task_list%pair_offsets_recv)
2604 :
2605 190600 : END SUBROUTINE rs_copy_to_matrices
2606 :
2607 : ! **************************************************************************************************
2608             : !> \brief Helper routine for rs_scatter_matrices and rs_copy_to_buffer.
2609 : !> \author Ole Schuett
2610 : ! **************************************************************************************************
2611 231743 : SUBROUTINE rs_pack_buffer(src_matrices, dest_buffer, atom_pair, pair_offsets)
2612 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2613 : REAL(KIND=dp), DIMENSION(:), INTENT(INOUT) :: dest_buffer
2614 : TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: atom_pair
2615 : INTEGER, DIMENSION(:), INTENT(IN) :: pair_offsets
2616 :
2617 : INTEGER :: acol, arow, img, i, offset, block_size
2618 : LOGICAL :: found
2619 231743 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: block
2620 :
2621 : !$OMP PARALLEL DEFAULT(NONE), &
2622 : !$OMP SHARED(src_matrices,atom_pair,pair_offsets,dest_buffer), &
2623 231743 : !$OMP PRIVATE(acol,arow,img,i,offset,block_size,found,block)
2624 : !$OMP DO schedule(guided)
2625 : DO i = 1, SIZE(atom_pair)
2626 : arow = atom_pair(i)%row
2627 : acol = atom_pair(i)%col
2628 : img = atom_pair(i)%image
2629 : CALL dbcsr_get_block_p(matrix=src_matrices(img)%matrix, row=arow, col=acol, &
2630 : block=block, found=found)
2631 : CPASSERT(found)
2632 : block_size = SIZE(block)
2633 : offset = pair_offsets(i)
2634 : dest_buffer(offset + 1:offset + block_size) = RESHAPE(block, shape=[block_size])
2635 : END DO
2636 : !$OMP END DO
2637 : !$OMP END PARALLEL
2638 :
2639 231743 : END SUBROUTINE rs_pack_buffer
2640 :
2641 : ! **************************************************************************************************
2642             : !> \brief Helper routine for rs_gather_matrices and rs_copy_to_matrices.
2643 : !> \author Ole Schuett
2644 : ! **************************************************************************************************
2645 190818 : SUBROUTINE rs_unpack_buffer(src_buffer, dest_matrices, atom_pair, pair_offsets)
2646 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: src_buffer
2647 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2648 : TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: atom_pair
2649 : INTEGER, DIMENSION(:), INTENT(IN) :: pair_offsets
2650 :
2651 : INTEGER :: acol, arow, img, i, offset, &
2652 : nrows, ncols, lock_num
2653 : LOGICAL :: found
2654 190818 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: block
2655 190818 : INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
2656 :
2657 : ! initialize locks
2658 572454 : ALLOCATE (locks(10*omp_get_max_threads()))
2659 2098998 : DO i = 1, SIZE(locks)
2660 2098998 : CALL omp_init_lock(locks(i))
2661 : END DO
2662 :
2663 : !$OMP PARALLEL DEFAULT(NONE), &
2664 : !$OMP SHARED(src_buffer,atom_pair,pair_offsets,dest_matrices,locks), &
2665 190818 : !$OMP PRIVATE(acol,arow,img,i,offset,nrows,ncols,lock_num,found,block)
2666 : !$OMP DO schedule(guided)
2667 : DO i = 1, SIZE(atom_pair)
2668 : arow = atom_pair(i)%row
2669 : acol = atom_pair(i)%col
2670 : img = atom_pair(i)%image
2671 : CALL dbcsr_get_block_p(matrix=dest_matrices(img)%matrix, row=arow, col=acol, &
2672 : block=block, found=found)
2673 : CPASSERT(found)
2674 : nrows = SIZE(block, 1)
2675 : ncols = SIZE(block, 2)
2676 : offset = pair_offsets(i)
2677 : lock_num = MODULO(arow, SIZE(locks)) + 1 ! map matrix rows round-robin to available locks
2678 :
2679 : CALL omp_set_lock(locks(lock_num))
2680 : block = block + RESHAPE(src_buffer(offset + 1:offset + nrows*ncols), shape=[nrows, ncols])
2681 : CALL omp_unset_lock(locks(lock_num))
2682 : END DO
2683 : !$OMP END DO
2684 : !$OMP END PARALLEL
2685 :
2686 : ! destroy locks
2687 2098998 : DO i = 1, SIZE(locks)
2688 2098998 : CALL omp_destroy_lock(locks(i))
2689 : END DO
2690 190818 : DEALLOCATE (locks)
2691 :
2692 190818 : END SUBROUTINE rs_unpack_buffer
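rs_unpack_buffer serializes concurrent additions into blocks of the same matrix row by striping a fixed pool of OpenMP locks over the rows. The following standalone sketch (requires OpenMP; not CP2K code, all names invented) shows the same MODULO-based row-to-lock mapping and the set/unset pattern around the protected accumulation:

! Standalone sketch: lock striping for row-wise protected accumulation.
PROGRAM demo_lock_striping
   USE omp_lib
   IMPLICIT NONE
   INTEGER, PARAMETER :: nrows = 100, nupdates = 10000
   INTEGER(KIND=omp_lock_kind), ALLOCATABLE :: locks(:)
   REAL :: row_sum(nrows)
   INTEGER :: i, irow, lock_num

   ALLOCATE (locks(10*omp_get_max_threads()))
   DO i = 1, SIZE(locks)
      CALL omp_init_lock(locks(i))
   END DO

   row_sum = 0.0
!$OMP PARALLEL DO DEFAULT(NONE) SHARED(locks, row_sum) PRIVATE(i, irow, lock_num)
   DO i = 1, nupdates
      irow = MODULO(i, nrows) + 1                 ! the row this update touches
      lock_num = MODULO(irow, SIZE(locks)) + 1    ! map the row onto a lock, round-robin
      CALL omp_set_lock(locks(lock_num))
      row_sum(irow) = row_sum(irow) + 1.0         ! protected accumulation
      CALL omp_unset_lock(locks(lock_num))
   END DO
!$OMP END PARALLEL DO

   DO i = 1, SIZE(locks)
      CALL omp_destroy_lock(locks(i))
   END DO
   DEALLOCATE (locks)

   PRINT *, "total updates:", INT(SUM(row_sum))
END PROGRAM demo_lock_striping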
2693 :
2694 : ! **************************************************************************************************
2695             : !> \brief determines the rank of the processor whose real rs grid contains the point
2696 : !> \param rs_desc ...
2697 : !> \param igrid_level ...
2698 : !> \param n_levels ...
2699 : !> \param cube_center ...
2700 : !> \param ntasks ...
2701 : !> \param tasks ...
2702 : !> \param lb_cube ...
2703 : !> \param ub_cube ...
2704 : !> \param added_tasks ...
2705 : !> \par History
2706 : !> 11.2007 created [MattW]
2707 : !> 10.2008 rewritten [Joost VandeVondele]
2708 : !> \author MattW
2709 : ! **************************************************************************************************
2710 1380 : SUBROUTINE rs_find_node(rs_desc, igrid_level, n_levels, cube_center, ntasks, tasks, &
2711 : lb_cube, ub_cube, added_tasks)
2712 :
2713 : TYPE(realspace_grid_desc_type), POINTER :: rs_desc
2714 : INTEGER, INTENT(IN) :: igrid_level, n_levels
2715 : INTEGER, DIMENSION(3), INTENT(IN) :: cube_center
2716 : INTEGER, INTENT(INOUT) :: ntasks
2717 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
2718 : INTEGER, DIMENSION(3), INTENT(IN) :: lb_cube, ub_cube
2719 : INTEGER, INTENT(OUT) :: added_tasks
2720 :
2721 : INTEGER, PARAMETER :: add_tasks = 1000
2722 : REAL(kind=dp), PARAMETER :: mult_tasks = 2.0_dp
2723 :
2724 : INTEGER :: bit_index, coord(3), curr_tasks, dest, i, icoord(3), idest, itask, ix, iy, iz, &
2725 : lb_coord(3), lb_domain(3), lbc(3), ub_coord(3), ub_domain(3), ubc(3)
2726 : INTEGER :: bit_pattern
2727 : LOGICAL :: dir_periodic(3)
2728 :
2729 1380 : coord(1) = rs_desc%x2coord(cube_center(1))
2730 1380 : coord(2) = rs_desc%y2coord(cube_center(2))
2731 1380 : coord(3) = rs_desc%z2coord(cube_center(3))
2732 1380 : dest = rs_desc%coord2rank(coord(1), coord(2), coord(3))
2733 :
2734 : ! the real cube coordinates
2735 5520 : lbc = lb_cube + cube_center
2736 5520 : ubc = ub_cube + cube_center
2737 :
2738 9660 : IF (ALL((rs_desc%lb_global(:, dest) - rs_desc%border) <= lbc) .AND. &
2739 : ALL((rs_desc%ub_global(:, dest) + rs_desc%border) >= ubc)) THEN
2740 : !standard distributed collocation/integration
2741 1380 : tasks(ntasks)%destination = encode_rank(dest, igrid_level, n_levels)
2742 1380 : tasks(ntasks)%dist_type = 1
2743 1380 : tasks(ntasks)%subpatch_pattern = 0
2744 1380 : added_tasks = 1
2745 :
2746 : ! here we figure out if there is an alternate location for this task
2747 : ! i.e. even though the cube_center is not in the real local domain,
2748 : ! it might fully fit in the halo of the neighbor
2749 : ! if its radius is smaller than the maximum radius
2750 : ! the list of possible neighbors is stored here as a bit pattern
2751 : ! to reconstruct what the rank of the neighbor is,
2752 : ! we can use (note this requires the correct rs_grid)
2753 : ! IF (BTEST(bit_pattern,0)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[-1,0,0])
2754 : ! IF (BTEST(bit_pattern,1)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[+1,0,0])
2755 : ! IF (BTEST(bit_pattern,2)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,-1,0])
2756 : ! IF (BTEST(bit_pattern,3)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,+1,0])
2757 : ! IF (BTEST(bit_pattern,4)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,0,-1])
2758 : ! IF (BTEST(bit_pattern,5)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,0,+1])
2759 1380 : bit_index = 0
2760 1380 : bit_pattern = 0
2761 5520 : DO i = 1, 3
2762 5520 : IF (rs_desc%perd(i) == 1) THEN
2763 2760 : bit_pattern = IBCLR(bit_pattern, bit_index)
2764 2760 : bit_index = bit_index + 1
2765 2760 : bit_pattern = IBCLR(bit_pattern, bit_index)
2766 2760 : bit_index = bit_index + 1
2767 : ELSE
2768 : ! fits the left neighbor ?
2769 1380 : IF (ubc(i) <= rs_desc%lb_global(i, dest) - 1 + rs_desc%border) THEN
2770 599 : bit_pattern = IBSET(bit_pattern, bit_index)
2771 599 : bit_index = bit_index + 1
2772 : ELSE
2773 781 : bit_pattern = IBCLR(bit_pattern, bit_index)
2774 781 : bit_index = bit_index + 1
2775 : END IF
2776 : ! fits the right neighbor ?
2777 1380 : IF (lbc(i) >= rs_desc%ub_global(i, dest) + 1 - rs_desc%border) THEN
2778 250 : bit_pattern = IBSET(bit_pattern, bit_index)
2779 250 : bit_index = bit_index + 1
2780 : ELSE
2781 1130 : bit_pattern = IBCLR(bit_pattern, bit_index)
2782 1130 : bit_index = bit_index + 1
2783 : END IF
2784 : END IF
2785 : END DO
2786 1380 : tasks(ntasks)%subpatch_pattern = bit_pattern
2787 :
2788 : ELSE
2789 : ! generalised collocation/integration needed
2790 : ! first we figure out how many neighbors we have to add to include the lbc/ubc
2791 : ! in the available domains (inclusive of halo points)
2792 : ! first we 'ignore' periodic boundary conditions
2793 : ! i.e. ub_coord-lb_coord+1 might be larger than group_dim
2794 0 : lb_coord = coord
2795 0 : ub_coord = coord
2796 0 : lb_domain = rs_desc%lb_global(:, dest) - rs_desc%border
2797 0 : ub_domain = rs_desc%ub_global(:, dest) + rs_desc%border
2798 0 : DO i = 1, 3
2799 : ! only if the grid is not periodic in this direction do we need to take care of adding neighbors
2800 0 : IF (rs_desc%perd(i) == 0) THEN
2801 : ! if the domain lower bound is greater than the lbc we need to add the size of the neighbor domain
2802 0 : DO
2803 0 : IF (lb_domain(i) > lbc(i)) THEN
2804 0 : lb_coord(i) = lb_coord(i) - 1
2805 0 : icoord = MODULO(lb_coord, rs_desc%group_dim)
2806 0 : idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2807 0 : lb_domain(i) = lb_domain(i) - (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
2808 : ELSE
2809 : EXIT
2810 : END IF
2811 : END DO
2812 : ! same for the upper bound
2813 0 : DO
2814 0 : IF (ub_domain(i) < ubc(i)) THEN
2815 0 : ub_coord(i) = ub_coord(i) + 1
2816 0 : icoord = MODULO(ub_coord, rs_desc%group_dim)
2817 0 : idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2818 0 : ub_domain(i) = ub_domain(i) + (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
2819 : ELSE
2820 : EXIT
2821 : END IF
2822 : END DO
2823 : END IF
2824 : END DO
2825 :
2826 : ! some care is needed for the periodic boundaries
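     : ! if the expanded domain covers the full grid in a direction, treat that direction
     : ! as periodic and include every rank along it exactly once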
2827 0 : DO i = 1, 3
2828 0 : IF (ub_domain(i) - lb_domain(i) + 1 >= rs_desc%npts(i)) THEN
2829 0 : dir_periodic(i) = .TRUE.
2830 0 : lb_coord(i) = 0
2831 0 : ub_coord(i) = rs_desc%group_dim(i) - 1
2832 : ELSE
2833 0 : dir_periodic(i) = .FALSE.
2834 : END IF
2835 : END DO
2836 :
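     : ! replicate the task on every rank in the coordinate box [lb_coord, ub_coord],
     : ! growing the task array first if the additional tasks do not fit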
2837 0 : added_tasks = PRODUCT(ub_coord - lb_coord + 1)
2838 0 : itask = ntasks
2839 0 : ntasks = ntasks + added_tasks - 1
2840 0 : IF (ntasks > SIZE(tasks)) THEN
2841 0 : curr_tasks = INT((SIZE(tasks) + add_tasks)*mult_tasks)
2842 0 : CALL reallocate_tasks(tasks, curr_tasks)
2843 : END IF
2844 0 : DO iz = lb_coord(3), ub_coord(3)
2845 0 : DO iy = lb_coord(2), ub_coord(2)
2846 0 : DO ix = lb_coord(1), ub_coord(1)
2847 0 : icoord = MODULO([ix, iy, iz], rs_desc%group_dim)
2848 0 : idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2849 0 : tasks(itask)%destination = encode_rank(idest, igrid_level, n_levels)
2850 0 : tasks(itask)%dist_type = 2
2851 0 : tasks(itask)%subpatch_pattern = 0
2852 : ! encode the domain size for this task
2853 : ! if the bit is set, we need to add the border in that direction
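     : ! bits 0/1 -> lower/upper x face, bits 2/3 -> lower/upper y face, bits 4/5 -> lower/upper z face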
2854 0 : IF (ix == lb_coord(1) .AND. .NOT. dir_periodic(1)) &
2855 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 0)
2856 0 : IF (ix == ub_coord(1) .AND. .NOT. dir_periodic(1)) &
2857 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 1)
2858 0 : IF (iy == lb_coord(2) .AND. .NOT. dir_periodic(2)) &
2859 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 2)
2860 0 : IF (iy == ub_coord(2) .AND. .NOT. dir_periodic(2)) &
2861 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 3)
2862 0 : IF (iz == lb_coord(3) .AND. .NOT. dir_periodic(3)) &
2863 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 4)
2864 0 : IF (iz == ub_coord(3) .AND. .NOT. dir_periodic(3)) &
2865 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 5)
2866 0 : itask = itask + 1
2867 : END DO
2868 : END DO
2869 : END DO
2870 : END IF
2871 :
2872 1380 : END SUBROUTINE rs_find_node
2873 :
2874 : ! **************************************************************************************************
2875 : !> \brief utility functions for encoding the grid level with a rank, allowing
2876 : !> different grid levels to maintain a different rank ordering without
2877 : !> losing information. These encoded integers are stored in the task list (e.g. tasks(:)%destination)
2878 : !> \param rank MPI rank to encode
2879 : !> \param grid_level grid level (1-based)
2880 : !> \param n_levels total number of grid levels
2881 : !> \return integer combining rank and grid level
2882 : !> \par History
2883 : !> 4.2009 created [Iain Bethune]
2884 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2885 : ! **************************************************************************************************
2886 15955982 : FUNCTION encode_rank(rank, grid_level, n_levels) RESULT(encoded_int)
2887 :
2888 : INTEGER, INTENT(IN) :: rank, grid_level, n_levels
2889 : INTEGER :: encoded_int
2890 :
2891 : ! ordered so that we can still sort by rank
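     : ! e.g. with n_levels = 4, rank = 3 and grid_level = 2: encoded_int = 3*4 + 2 - 1 = 13;
     : ! decode_rank(13, 4) = 3 and decode_level(13, 4) = 2 recover the original values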
2892 :
2893 15955982 : encoded_int = rank*n_levels + grid_level - 1
2894 :
2895 15955982 : END FUNCTION encode_rank
2896 :
2897 : ! **************************************************************************************************
2898 : !> \brief recovers the rank from an integer encoded by encode_rank
2899 : !> \param encoded_int encoded rank/grid-level integer
2900 : !> \param n_levels total number of grid levels
2901 : !> \return the decoded rank
2902 : ! **************************************************************************************************
2903 8023981 : FUNCTION decode_rank(encoded_int, n_levels) RESULT(rank)
2904 :
2905 : INTEGER, INTENT(IN) :: encoded_int
2906 : INTEGER, INTENT(IN) :: n_levels
2907 : INTEGER :: rank
2908 :
2909 8023981 : rank = INT(encoded_int/n_levels)
2910 :
2911 8023981 : END FUNCTION decode_rank
2912 :
2913 : ! **************************************************************************************************
2914 : !> \brief recovers the grid level from an integer encoded by encode_rank
2915 : !> \param encoded_int encoded rank/grid-level integer
2916 : !> \param n_levels total number of grid levels
2917 : !> \return the decoded grid level (1-based)
2918 : ! **************************************************************************************************
2919 7236 : FUNCTION decode_level(encoded_int, n_levels) RESULT(grid_level)
2920 :
2921 : INTEGER, INTENT(IN) :: encoded_int
2922 : INTEGER, INTENT(IN) :: n_levels
2923 : INTEGER :: grid_level
2924 :
2925 7236 : grid_level = INT(MODULO(encoded_int, n_levels)) + 1
2926 :
2927 7236 : END FUNCTION decode_level
2928 :
2929 : ! **************************************************************************************************
2930 : !> \brief Sort pgf index pairs (ipgf,iset,iatom),(jpgf,jset,jatom) such that all atom pairs are
2931 : !> grouped, within each atom pair all set pairs are grouped, and within each set pair
2932 : !> all pgfs are grouped.
2933 : !> This yields the right order of the tasks for collocation after the sort
2934 : !> in distribute_tasks. E.g. for an atom pair, all sets and pgfs are computed in one go.
2935 : !> The exception is the grid level: tasks are first ordered with respect to grid_level. This implies
2936 : !> that a given density matrix block will be decontracted several times,
2937 : !> but cache effects on the grid make up for this.
2938 : !> \param a ...
2939 : !> \param b ...
2940 : !> \return ...
2941 : !> \author Ole Schuett
2942 : ! **************************************************************************************************
2943 75058299 : PURE FUNCTION tasks_less_than(a, b) RESULT(res)
2944 : TYPE(task_type), INTENT(IN) :: a, b
2945 : LOGICAL :: res
2946 :
2947 75058299 : IF (a%grid_level /= b%grid_level) THEN
2948 19126837 : res = a%grid_level < b%grid_level
2949 :
2950 55931462 : ELSE IF (a%image /= b%image) THEN
2951 5606317 : res = a%image < b%image
2952 :
2953 50325145 : ELSE IF (a%iatom /= b%iatom) THEN
2954 6459612 : res = a%iatom < b%iatom
2955 :
2956 43865533 : ELSE IF (a%jatom /= b%jatom) THEN
2957 6984787 : res = a%jatom < b%jatom
2958 :
2959 36880746 : ELSE IF (a%iset /= b%iset) THEN
2960 5908172 : res = a%iset < b%iset
2961 :
2962 30972574 : ELSE IF (a%jset /= b%jset) THEN
2963 6311613 : res = a%jset < b%jset
2964 :
2965 24660961 : ELSE IF (a%ipgf /= b%ipgf) THEN
2966 6299435 : res = a%ipgf < b%ipgf
2967 :
2968 : ELSE
2969 18361526 : res = a%jpgf < b%jpgf
2970 :
2971 : END IF
2972 75058299 : END FUNCTION tasks_less_than
2973 :
2974 178193482 : #:call array_sort(prefix='tasks', type='TYPE(task_type)')
2975 : #:endcall
2976 :
2977 : ! **************************************************************************************************
2978 : !> \brief Order atom pairs to find duplicates.
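     : !> Pairs are compared lexicographically by (rank, row, col, image).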
2979 : !> \param a first atom pair to compare
2980 : !> \param b second atom pair to compare
2981 : !> \return TRUE if pair a orders before pair b
2982 : !> \author Ole Schuett
2983 : ! **************************************************************************************************
2984 32091542 : PURE FUNCTION atom_pair_less_than(a, b) RESULT(res)
2985 : TYPE(atom_pair_type), INTENT(IN) :: a, b
2986 : LOGICAL :: res
2987 :
2988 32091542 : IF (a%rank /= b%rank) THEN
2989 4243 : res = a%rank < b%rank
2990 :
2991 32087299 : ELSE IF (a%row /= b%row) THEN
2992 5223410 : res = a%row < b%row
2993 :
2994 26863889 : ELSE IF (a%col /= b%col) THEN
2995 5214176 : res = a%col < b%col
2996 :
2997 : ELSE
2998 21649713 : res = a%image < b%image
2999 :
3000 : END IF
3001 32091542 : END FUNCTION atom_pair_less_than
3002 :
3003 67944060 : #:call array_sort(prefix='atom_pair', type='TYPE(atom_pair_type)')
3004 : #:endcall
3005 :
3006 : END MODULE task_list_methods