Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2026 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief generate the tasks lists used by collocate and integrate routines
10 : !> \par History
11 : !> 01.2008 [Joost VandeVondele] refactered out of qs_collocate / qs_integrate
12 : !> \author Joost VandeVondele
13 : ! **************************************************************************************************
14 : MODULE task_list_methods
15 : USE offload_api, ONLY: offload_create_buffer, offload_buffer_type
16 : USE grid_api, ONLY: grid_create_basis_set, grid_create_task_list
17 : USE ao_util, ONLY: exp_radius_very_extended
18 : USE basis_set_types, ONLY: get_gto_basis_set, &
19 : gto_basis_set_p_type, &
20 : gto_basis_set_type
21 : USE cell_types, ONLY: cell_type, &
22 : pbc
23 : USE cp_control_types, ONLY: dft_control_type
24 : USE cube_utils, ONLY: compute_cube_center, &
25 : cube_info_type, &
26 : return_cube, &
27 : return_cube_nonortho
28 : USE cp_dbcsr_api, ONLY: dbcsr_convert_sizes_to_offsets, &
29 : dbcsr_finalize, &
30 : dbcsr_get_block_p, &
31 : dbcsr_get_info, &
32 : dbcsr_p_type, &
33 : dbcsr_put_block, &
34 : dbcsr_type, &
35 : dbcsr_work_create
36 : USE gaussian_gridlevels, ONLY: gaussian_gridlevel, &
37 : gridlevel_info_type
38 : USE kinds, ONLY: default_string_length, &
39 : dp, &
40 : int_8
41 : USE kpoint_types, ONLY: get_kpoint_info, &
42 : kpoint_type
43 : USE memory_utilities, ONLY: reallocate
44 : USE message_passing, ONLY: &
45 : mp_comm_type
46 : USE particle_types, ONLY: particle_type
47 : USE particle_methods, ONLY: get_particle_set
48 : USE pw_env_types, ONLY: pw_env_get, &
49 : pw_env_type
50 : USE qs_kind_types, ONLY: get_qs_kind, &
51 : qs_kind_type
52 : USE qs_ks_types, ONLY: get_ks_env, &
53 : qs_ks_env_type
54 : USE qs_neighbor_list_types, ONLY: neighbor_list_set_p_type
55 : USE realspace_grid_types, ONLY: realspace_grid_desc_p_type, &
56 : realspace_grid_desc_type, &
57 : rs_grid_create, &
58 : rs_grid_locate_rank, &
59 : rs_grid_release, &
60 : rs_grid_reorder_ranks, realspace_grid_type
61 : USE task_list_types, ONLY: deserialize_task, &
62 : reallocate_tasks, &
63 : serialize_task, &
64 : task_list_type, &
65 : atom_pair_type, &
66 : task_size_in_int8, &
67 : task_type
68 : USE util, ONLY: sort
69 :
70 : !$ USE OMP_LIB, ONLY: omp_destroy_lock, omp_get_num_threads, omp_init_lock, &
71 : !$ omp_lock_kind, omp_set_lock, omp_unset_lock, omp_get_max_threads
72 : #include "./base/base_uses.f90"
73 :
74 : #:include './common/array_sort.fypp'
75 :
76 : IMPLICIT NONE
77 :
78 : LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .FALSE.
79 :
80 : PRIVATE
81 :
82 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'task_list_methods'
83 :
84 : PUBLIC :: generate_qs_task_list, &
85 : task_list_inner_loop
86 : PUBLIC :: distribute_tasks, &
87 : rs_distribute_matrix, &
88 : rs_scatter_matrices, &
89 : rs_gather_matrices, &
90 : rs_copy_to_buffer, &
91 : rs_copy_to_matrices
92 :
93 : CONTAINS
94 :
95 : ! **************************************************************************************************
96 : !> \brief ...
97 : !> \param ks_env ...
98 : !> \param task_list ...
99 : !> \param basis_type ...
100 : !> \param reorder_rs_grid_ranks Flag that indicates if this routine should
101 : !> or should not overwrite the rs descriptor (see comment below)
102 : !> \param skip_load_balance_distributed ...
103 : !> \param pw_env_external ...
104 : !> \param sab_orb_external ...
105 : !> \param ext_kpoints ...
106 : !> \par History
107 : !> 01.2008 factored out of calculate_rho_elec [Joost VandeVondele]
108 : !> 04.2010 divides tasks into grid levels and atom pairs for integrate/collocate [Iain Bethune]
109 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2010 on behalf of the HECToR project
110 : !> 06.2015 adjusted to be used with multiple images (k-points) [JGH]
111 : !> \note If this routine is called several times with different task lists,
112 : !> the default behaviour is to re-optimize the grid ranks and overwrite
113 : !> the rs descriptor and grids. reorder_rs_grid_ranks = .FALSE. prevents the code
114 : !> of performing a new optimization by leaving the rank order in
115 : !> its current state.
116 : ! **************************************************************************************************
117 15024 : SUBROUTINE generate_qs_task_list(ks_env, task_list, basis_type, &
118 : reorder_rs_grid_ranks, skip_load_balance_distributed, &
119 : pw_env_external, sab_orb_external, ext_kpoints)
120 :
121 : TYPE(qs_ks_env_type), POINTER :: ks_env
122 : TYPE(task_list_type), POINTER :: task_list
123 : CHARACTER(LEN=*), INTENT(IN) :: basis_type
124 : LOGICAL, INTENT(IN) :: reorder_rs_grid_ranks, &
125 : skip_load_balance_distributed
126 : TYPE(pw_env_type), OPTIONAL, POINTER :: pw_env_external
127 : TYPE(neighbor_list_set_p_type), DIMENSION(:), &
128 : OPTIONAL, POINTER :: sab_orb_external
129 : TYPE(kpoint_type), OPTIONAL, POINTER :: ext_kpoints
130 :
131 : CHARACTER(LEN=*), PARAMETER :: routineN = 'generate_qs_task_list'
132 : INTEGER, PARAMETER :: max_tasks = 2000
133 :
134 : INTEGER :: cindex, curr_tasks, handle, i, iatom, iatom_old, igrid_level, igrid_level_old, &
135 : ikind, ilevel, img, img_old, ipair, ipgf, iset, itask, jatom, jatom_old, jkind, jpgf, &
136 : jset, maxpgf, maxset, natoms, nimages, nkind, nseta, nsetb, slot
137 15024 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: blocks
138 : INTEGER, DIMENSION(3) :: cellind
139 15024 : INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, npgfa, &
140 15024 : npgfb, nsgf
141 15024 : INTEGER, DIMENSION(:, :, :), POINTER :: cell_to_index
142 : LOGICAL :: dokp
143 : REAL(KIND=dp) :: kind_radius_a, kind_radius_b
144 : REAL(KIND=dp), DIMENSION(3) :: ra, rab
145 15024 : REAL(KIND=dp), DIMENSION(:), POINTER :: set_radius_a, set_radius_b
146 15024 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: rpgfa, rpgfb, zeta, zetb
147 : TYPE(cell_type), POINTER :: cell
148 15024 : TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info
149 : TYPE(dft_control_type), POINTER :: dft_control
150 : TYPE(gridlevel_info_type), POINTER :: gridlevel_info
151 15024 : TYPE(gto_basis_set_p_type), DIMENSION(:), POINTER :: basis_set_list
152 : TYPE(gto_basis_set_type), POINTER :: basis_set_a, basis_set_b, orb_basis_set
153 : TYPE(kpoint_type), POINTER :: kpoints
154 : TYPE(neighbor_list_set_p_type), DIMENSION(:), &
155 15024 : POINTER :: sab_orb
156 15024 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
157 : TYPE(pw_env_type), POINTER :: pw_env
158 15024 : TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
159 : TYPE(qs_kind_type), POINTER :: qs_kind
160 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
161 15024 : POINTER :: rs_descs
162 15024 : TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
163 15024 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
164 :
165 15024 : CALL timeset(routineN, handle)
166 :
167 : CALL get_ks_env(ks_env, &
168 : qs_kind_set=qs_kind_set, &
169 : cell=cell, &
170 : particle_set=particle_set, &
171 15024 : dft_control=dft_control)
172 :
173 15024 : CALL get_ks_env(ks_env, sab_orb=sab_orb)
174 15024 : IF (PRESENT(sab_orb_external)) sab_orb => sab_orb_external
175 :
176 15024 : CALL get_ks_env(ks_env, pw_env=pw_env)
177 15024 : IF (PRESENT(pw_env_external)) pw_env => pw_env_external
178 15024 : CALL pw_env_get(pw_env, rs_descs=rs_descs, rs_grids=rs_grids)
179 :
180 : ! *** assign from pw_env
181 15024 : gridlevel_info => pw_env%gridlevel_info
182 15024 : cube_info => pw_env%cube_info
183 :
184 : ! find maximum numbers
185 15024 : nkind = SIZE(qs_kind_set)
186 15024 : natoms = SIZE(particle_set)
187 15024 : maxset = 0
188 15024 : maxpgf = 0
189 41965 : DO ikind = 1, nkind
190 26941 : qs_kind => qs_kind_set(ikind)
191 : CALL get_qs_kind(qs_kind=qs_kind, &
192 26941 : basis_set=orb_basis_set, basis_type=basis_type)
193 :
194 26941 : IF (.NOT. ASSOCIATED(orb_basis_set)) CYCLE
195 26941 : CALL get_gto_basis_set(gto_basis_set=orb_basis_set, npgf=npgfa, nset=nseta)
196 :
197 26941 : maxset = MAX(nseta, maxset)
198 98842 : maxpgf = MAX(MAXVAL(npgfa), maxpgf)
199 : END DO
200 :
201 : ! kpoint related
202 15024 : nimages = dft_control%nimages
203 15024 : dokp = .FALSE.
204 15024 : NULLIFY (cell_to_index)
205 15024 : NULLIFY (kpoints)
206 15024 : IF (PRESENT(ext_kpoints)) THEN
207 832 : IF (ASSOCIATED(ext_kpoints)) THEN
208 10 : dokp = .TRUE.
209 10 : CALL get_kpoint_info(kpoint=ext_kpoints, cell_to_index=cell_to_index)
210 10 : nimages = SIZE(ext_kpoints%index_to_cell, 2)
211 : END IF
212 : END IF
213 15024 : IF (.NOT. dokp .AND. nimages > 1) THEN
214 336 : dokp = .TRUE.
215 336 : CALL get_ks_env(ks_env=ks_env, kpoints=kpoints)
216 336 : CALL get_kpoint_info(kpoint=kpoints, cell_to_index=cell_to_index)
217 : END IF
218 :
219 : ! free the atom_pair lists if allocated
220 15024 : IF (ASSOCIATED(task_list%atom_pair_send)) DEALLOCATE (task_list%atom_pair_send)
221 15024 : IF (ASSOCIATED(task_list%atom_pair_recv)) DEALLOCATE (task_list%atom_pair_recv)
222 :
223 : ! construct a list of all tasks
224 15024 : IF (.NOT. ASSOCIATED(task_list%tasks)) THEN
225 8992 : CALL reallocate_tasks(task_list%tasks, max_tasks)
226 : END IF
227 15024 : task_list%ntasks = 0
228 15024 : curr_tasks = SIZE(task_list%tasks)
229 :
230 72013 : ALLOCATE (basis_set_list(nkind))
231 41965 : DO ikind = 1, nkind
232 26941 : qs_kind => qs_kind_set(ikind)
233 : CALL get_qs_kind(qs_kind=qs_kind, basis_set=basis_set_a, &
234 26941 : basis_type=basis_type)
235 41965 : IF (ASSOCIATED(basis_set_a)) THEN
236 26941 : basis_set_list(ikind)%gto_basis_set => basis_set_a
237 : ELSE
238 0 : NULLIFY (basis_set_list(ikind)%gto_basis_set)
239 : END IF
240 : END DO
241 : !!$OMP PARALLEL DEFAULT(NONE) &
242 : !!$OMP SHARED (sab_orb, dokp, basis_set_list, task_list, rs_descs, dft_control, cube_info, gridlevel_info, &
243 : !!$OMP curr_tasks, maxpgf, maxset, natoms, nimages, particle_set, cell_to_index, cell) &
244 : !!$OMP PRIVATE (ikind, jkind, iatom, jatom, rab, cellind, basis_set_a, basis_set_b, ra, &
245 : !!$OMP la_max, la_min, npgfa, nseta, rpgfa, set_radius_a, kind_radius_a, zeta, &
246 : !!$OMP lb_max, lb_min, npgfb, nsetb, rpgfb, set_radius_b, kind_radius_b, zetb, &
247 : !!$OMP cindex, slot)
248 : ! Loop over neighbor list
249 : !!$OMP DO SCHEDULE(GUIDED)
250 1510140 : DO slot = 1, sab_orb(1)%nl_size
251 1495116 : ikind = sab_orb(1)%nlist_task(slot)%ikind
252 1495116 : jkind = sab_orb(1)%nlist_task(slot)%jkind
253 1495116 : iatom = sab_orb(1)%nlist_task(slot)%iatom
254 1495116 : jatom = sab_orb(1)%nlist_task(slot)%jatom
255 5980464 : rab(1:3) = sab_orb(1)%nlist_task(slot)%r(1:3)
256 5980464 : cellind(1:3) = sab_orb(1)%nlist_task(slot)%cell(1:3)
257 :
258 1495116 : basis_set_a => basis_set_list(ikind)%gto_basis_set
259 1495116 : IF (.NOT. ASSOCIATED(basis_set_a)) CYCLE
260 1495116 : basis_set_b => basis_set_list(jkind)%gto_basis_set
261 1495116 : IF (.NOT. ASSOCIATED(basis_set_b)) CYCLE
262 1495116 : ra(:) = pbc(particle_set(iatom)%r, cell)
263 : ! basis ikind
264 1495116 : la_max => basis_set_a%lmax
265 1495116 : la_min => basis_set_a%lmin
266 1495116 : npgfa => basis_set_a%npgf
267 1495116 : nseta = basis_set_a%nset
268 1495116 : rpgfa => basis_set_a%pgf_radius
269 1495116 : set_radius_a => basis_set_a%set_radius
270 1495116 : kind_radius_a = basis_set_a%kind_radius
271 1495116 : zeta => basis_set_a%zet
272 : ! basis jkind
273 1495116 : lb_max => basis_set_b%lmax
274 1495116 : lb_min => basis_set_b%lmin
275 1495116 : npgfb => basis_set_b%npgf
276 1495116 : nsetb = basis_set_b%nset
277 1495116 : rpgfb => basis_set_b%pgf_radius
278 1495116 : set_radius_b => basis_set_b%set_radius
279 1495116 : kind_radius_b = basis_set_b%kind_radius
280 1495116 : zetb => basis_set_b%zet
281 :
282 1495116 : IF (dokp) THEN
283 250624 : cindex = cell_to_index(cellind(1), cellind(2), cellind(3))
284 : ELSE
285 1244492 : cindex = 1
286 : END IF
287 :
288 : CALL task_list_inner_loop(task_list%tasks, task_list%ntasks, curr_tasks, &
289 : rs_descs, dft_control, cube_info, gridlevel_info, cindex, &
290 : iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, &
291 : set_radius_a, set_radius_b, ra, rab, &
292 1510140 : la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
293 :
294 : END DO
295 : !!$OMP END PARALLEL
296 :
297 : ! redistribute the task list so that all tasks map on the local rs grids
298 : CALL distribute_tasks( &
299 : rs_descs=rs_descs, ntasks=task_list%ntasks, natoms=natoms, &
300 : tasks=task_list%tasks, atom_pair_send=task_list%atom_pair_send, &
301 : atom_pair_recv=task_list%atom_pair_recv, symmetric=.TRUE., &
302 : reorder_rs_grid_ranks=reorder_rs_grid_ranks, &
303 15024 : skip_load_balance_distributed=skip_load_balance_distributed)
304 :
305 : ! compute offsets for rs_scatter_matrix / rs_copy_matrix
306 45072 : ALLOCATE (nsgf(natoms))
307 15024 : CALL get_particle_set(particle_set, qs_kind_set, basis=basis_set_list, nsgf=nsgf)
308 15024 : IF (ASSOCIATED(task_list%atom_pair_send)) THEN
309 : ! only needed when there is a distributed grid
310 : CALL rs_calc_offsets(pairs=task_list%atom_pair_send, &
311 : nsgf=nsgf, &
312 : group_size=rs_descs(1)%rs_desc%group_size, &
313 : pair_offsets=task_list%pair_offsets_send, &
314 : rank_offsets=task_list%rank_offsets_send, &
315 : rank_sizes=task_list%rank_sizes_send, &
316 48 : buffer_size=task_list%buffer_size_send)
317 : END IF
318 : CALL rs_calc_offsets(pairs=task_list%atom_pair_recv, &
319 : nsgf=nsgf, &
320 : group_size=rs_descs(1)%rs_desc%group_size, &
321 : pair_offsets=task_list%pair_offsets_recv, &
322 : rank_offsets=task_list%rank_offsets_recv, &
323 : rank_sizes=task_list%rank_sizes_recv, &
324 15024 : buffer_size=task_list%buffer_size_recv)
325 15024 : DEALLOCATE (basis_set_list, nsgf)
326 :
327 : ! If the rank order has changed, reallocate any of the distributed rs_grids
328 15024 : IF (reorder_rs_grid_ranks) THEN
329 63134 : DO i = 1, gridlevel_info%ngrid_levels
330 63134 : IF (rs_descs(i)%rs_desc%distributed) THEN
331 54 : CALL rs_grid_release(rs_grids(i))
332 54 : CALL rs_grid_create(rs_grids(i), rs_descs(i)%rs_desc)
333 : END IF
334 : END DO
335 : END IF
336 :
337 : CALL create_grid_task_list(task_list=task_list, &
338 : qs_kind_set=qs_kind_set, &
339 : particle_set=particle_set, &
340 : cell=cell, &
341 : basis_type=basis_type, &
342 15024 : rs_grids=rs_grids)
343 :
344 : ! Now we have the final list of tasks, setup the task_list with the
345 : ! data needed for the loops in integrate_v/calculate_rho
346 :
347 15024 : IF (ASSOCIATED(task_list%taskstart)) THEN
348 6032 : DEALLOCATE (task_list%taskstart)
349 : END IF
350 15024 : IF (ASSOCIATED(task_list%taskstop)) THEN
351 6032 : DEALLOCATE (task_list%taskstop)
352 : END IF
353 15024 : IF (ASSOCIATED(task_list%npairs)) THEN
354 6032 : DEALLOCATE (task_list%npairs)
355 : END IF
356 :
357 : ! First, count the number of unique atom pairs per grid level
358 :
359 45072 : ALLOCATE (task_list%npairs(SIZE(rs_descs)))
360 :
361 15024 : iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
362 15024 : ipair = 0
363 74604 : task_list%npairs = 0
364 :
365 8446080 : DO i = 1, task_list%ntasks
366 8431056 : igrid_level = task_list%tasks(i)%grid_level
367 8431056 : img = task_list%tasks(i)%image
368 8431056 : iatom = task_list%tasks(i)%iatom
369 8431056 : jatom = task_list%tasks(i)%jatom
370 8431056 : iset = task_list%tasks(i)%iset
371 8431056 : jset = task_list%tasks(i)%jset
372 8431056 : ipgf = task_list%tasks(i)%ipgf
373 8431056 : jpgf = task_list%tasks(i)%jpgf
374 8446080 : IF (igrid_level /= igrid_level_old) THEN
375 41041 : IF (igrid_level_old /= -1) THEN
376 26258 : task_list%npairs(igrid_level_old) = ipair
377 : END IF
378 : ipair = 1
379 : igrid_level_old = igrid_level
380 : iatom_old = iatom
381 : jatom_old = jatom
382 : img_old = img
383 8390015 : ELSE IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
384 684146 : ipair = ipair + 1
385 684146 : iatom_old = iatom
386 684146 : jatom_old = jatom
387 684146 : img_old = img
388 : END IF
389 : END DO
390 : ! Take care of the last iteration
391 15024 : IF (task_list%ntasks /= 0) THEN
392 14783 : task_list%npairs(igrid_level) = ipair
393 : END IF
394 :
395 : ! Second, for each atom pair, find the indices in the task list
396 : ! of the first and last task
397 :
398 : ! Array sized for worst case
399 119435 : ALLOCATE (task_list%taskstart(MAXVAL(task_list%npairs), SIZE(rs_descs)))
400 119435 : ALLOCATE (task_list%taskstop(MAXVAL(task_list%npairs), SIZE(rs_descs)))
401 :
402 15024 : iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
403 15024 : ipair = 0
404 1561340 : task_list%taskstart = 0
405 1561340 : task_list%taskstop = 0
406 :
407 8446080 : DO i = 1, task_list%ntasks
408 8431056 : igrid_level = task_list%tasks(i)%grid_level
409 8431056 : img = task_list%tasks(i)%image
410 8431056 : iatom = task_list%tasks(i)%iatom
411 8431056 : jatom = task_list%tasks(i)%jatom
412 8431056 : iset = task_list%tasks(i)%iset
413 8431056 : jset = task_list%tasks(i)%jset
414 8431056 : ipgf = task_list%tasks(i)%ipgf
415 8431056 : jpgf = task_list%tasks(i)%jpgf
416 8446080 : IF (igrid_level /= igrid_level_old) THEN
417 41041 : IF (igrid_level_old /= -1) THEN
418 26258 : task_list%taskstop(ipair, igrid_level_old) = i - 1
419 : END IF
420 41041 : ipair = 1
421 41041 : task_list%taskstart(ipair, igrid_level) = i
422 41041 : igrid_level_old = igrid_level
423 41041 : iatom_old = iatom
424 41041 : jatom_old = jatom
425 41041 : img_old = img
426 8390015 : ELSE IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
427 684146 : ipair = ipair + 1
428 684146 : task_list%taskstart(ipair, igrid_level) = i
429 684146 : task_list%taskstop(ipair - 1, igrid_level) = i - 1
430 684146 : iatom_old = iatom
431 684146 : jatom_old = jatom
432 684146 : img_old = img
433 : END IF
434 : END DO
435 : ! Take care of the last iteration
436 15024 : IF (task_list%ntasks /= 0) THEN
437 14783 : task_list%taskstop(ipair, igrid_level) = task_list%ntasks
438 : END IF
439 :
440 : ! Debug task destribution
441 : IF (debug_this_module) THEN
442 : tasks => task_list%tasks
443 : WRITE (6, *)
444 : WRITE (6, *) "Total number of tasks ", task_list%ntasks
445 : DO igrid_level = 1, gridlevel_info%ngrid_levels
446 : WRITE (6, *) "Total number of pairs(grid_level) ", &
447 : igrid_level, task_list%npairs(igrid_level)
448 : END DO
449 : WRITE (6, *)
450 :
451 : DO igrid_level = 1, gridlevel_info%ngrid_levels
452 :
453 : ALLOCATE (blocks(natoms, natoms, nimages))
454 : blocks = -1
455 : DO ipair = 1, task_list%npairs(igrid_level)
456 : itask = task_list%taskstart(ipair, igrid_level)
457 : ilevel = task_list%tasks(itask)%grid_level
458 : img = task_list%tasks(itask)%image
459 : iatom = task_list%tasks(itask)%iatom
460 : jatom = task_list%tasks(itask)%jatom
461 : iset = task_list%tasks(itask)%iset
462 : jset = task_list%tasks(itask)%jset
463 : ipgf = task_list%tasks(itask)%ipgf
464 : jpgf = task_list%tasks(itask)%jpgf
465 : IF (blocks(iatom, jatom, img) == -1 .AND. blocks(jatom, iatom, img) == -1) THEN
466 : blocks(iatom, jatom, img) = 1
467 : blocks(jatom, iatom, img) = 1
468 : ELSE
469 : WRITE (6, *) "TASK LIST CONFLICT IN PAIR ", ipair
470 : WRITE (6, *) "Reuse of iatom, jatom, image ", iatom, jatom, img
471 : END IF
472 :
473 : iatom_old = iatom
474 : jatom_old = jatom
475 : img_old = img
476 : DO itask = task_list%taskstart(ipair, igrid_level), task_list%taskstop(ipair, igrid_level)
477 : ilevel = task_list%tasks(itask)%grid_level
478 : img = task_list%tasks(itask)%image
479 : iatom = task_list%tasks(itask)%iatom
480 : jatom = task_list%tasks(itask)%jatom
481 : iset = task_list%tasks(itask)%iset
482 : jset = task_list%tasks(itask)%jset
483 : ipgf = task_list%tasks(itask)%ipgf
484 : jpgf = task_list%tasks(itask)%jpgf
485 : IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
486 : WRITE (6, *) "TASK LIST CONFLICT IN TASK ", itask
487 : WRITE (6, *) "Inconsistent iatom, jatom, image ", iatom, jatom, img
488 : WRITE (6, *) "Should be iatom, jatom, image ", iatom_old, jatom_old, img_old
489 : END IF
490 :
491 : END DO
492 : END DO
493 : DEALLOCATE (blocks)
494 :
495 : END DO
496 :
497 : END IF
498 :
499 15024 : CALL timestop(handle)
500 :
501 15024 : END SUBROUTINE generate_qs_task_list
502 :
503 : ! **************************************************************************************************
504 : !> \brief Sends the task list data to the grid API.
505 : !> \author Ole Schuett
506 : ! **************************************************************************************************
507 15024 : SUBROUTINE create_grid_task_list(task_list, qs_kind_set, particle_set, cell, basis_type, rs_grids)
508 : TYPE(task_list_type), POINTER :: task_list
509 : TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
510 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
511 : TYPE(cell_type), POINTER :: cell
512 : CHARACTER(LEN=*) :: basis_type
513 : TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
514 :
515 : TYPE(gto_basis_set_type), POINTER :: orb_basis_set
516 : INTEGER :: nset, natoms, nkinds, ntasks, &
517 : ikind, iatom, itask, nsgf
518 15024 : INTEGER, DIMENSION(:), ALLOCATABLE :: atom_kinds, level_list, iatom_list, jatom_list, &
519 15024 : iset_list, jset_list, ipgf_list, jpgf_list, &
520 15024 : border_mask_list, block_num_list
521 15024 : REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: radius_list
522 15024 : REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE :: rab_list, atom_positions
523 15024 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
524 15024 : INTEGER, DIMENSION(:, :), POINTER :: first_sgf
525 15024 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: sphi, zet
526 15024 : INTEGER, DIMENSION(:), POINTER :: lmax, lmin, npgf, nsgf_set
527 :
528 15024 : nkinds = SIZE(qs_kind_set)
529 15024 : natoms = SIZE(particle_set)
530 15024 : ntasks = task_list%ntasks
531 15024 : tasks => task_list%tasks
532 :
533 15024 : IF (.NOT. ASSOCIATED(task_list%grid_basis_sets)) THEN
534 : ! Basis sets do not change during simulation - only need to create them once.
535 43009 : ALLOCATE (task_list%grid_basis_sets(nkinds))
536 25025 : DO ikind = 1, nkinds
537 16033 : CALL get_qs_kind(qs_kind_set(ikind), basis_type=basis_type, basis_set=orb_basis_set)
538 : CALL get_gto_basis_set(gto_basis_set=orb_basis_set, &
539 : nset=nset, &
540 : nsgf=nsgf, &
541 : nsgf_set=nsgf_set, &
542 : npgf=npgf, &
543 : first_sgf=first_sgf, &
544 : lmax=lmax, &
545 : lmin=lmin, &
546 : sphi=sphi, &
547 16033 : zet=zet)
548 : CALL grid_create_basis_set(nset=nset, &
549 : nsgf=nsgf, &
550 : maxco=SIZE(sphi, 1), &
551 : maxpgf=SIZE(zet, 1), &
552 : lmin=lmin, &
553 : lmax=lmax, &
554 : npgf=npgf, &
555 : nsgf_set=nsgf_set, &
556 : first_sgf=first_sgf, &
557 : sphi=sphi, &
558 : zet=zet, &
559 25025 : basis_set=task_list%grid_basis_sets(ikind))
560 : END DO
561 : END IF
562 :
563 : ! Pack task list infos
564 75120 : ALLOCATE (atom_kinds(natoms), atom_positions(3, natoms))
565 69545 : DO iatom = 1, natoms
566 54521 : atom_kinds(iatom) = particle_set(iatom)%atomic_kind%kind_number
567 69545 : atom_positions(:, iatom) = pbc(particle_set(iatom)%r, cell)
568 : END DO
569 :
570 74397 : ALLOCATE (level_list(ntasks), iatom_list(ntasks), jatom_list(ntasks))
571 74156 : ALLOCATE (iset_list(ntasks), jset_list(ntasks), ipgf_list(ntasks), jpgf_list(ntasks))
572 44590 : ALLOCATE (border_mask_list(ntasks), block_num_list(ntasks))
573 74638 : ALLOCATE (radius_list(ntasks), rab_list(3, ntasks))
574 :
575 8446080 : DO itask = 1, ntasks
576 8431056 : level_list(itask) = tasks(itask)%grid_level
577 8431056 : iatom_list(itask) = tasks(itask)%iatom
578 8431056 : jatom_list(itask) = tasks(itask)%jatom
579 8431056 : iset_list(itask) = tasks(itask)%iset
580 8431056 : jset_list(itask) = tasks(itask)%jset
581 8431056 : ipgf_list(itask) = tasks(itask)%ipgf
582 8431056 : jpgf_list(itask) = tasks(itask)%jpgf
583 8431056 : IF (tasks(itask)%dist_type == 2) THEN
584 0 : border_mask_list(itask) = IAND(63, NOT(tasks(itask)%subpatch_pattern)) ! invert last 6 bits
585 : ELSE
586 8431056 : border_mask_list(itask) = 0 ! no masking
587 : END IF
588 8431056 : block_num_list(itask) = tasks(itask)%pair_index ! change of nomenclature pair_index -> block_num
589 8431056 : radius_list(itask) = tasks(itask)%radius
590 33739248 : rab_list(:, itask) = tasks(itask)%rab(:)
591 : END DO
592 :
593 : CALL grid_create_task_list(ntasks=ntasks, &
594 : natoms=natoms, &
595 : nkinds=nkinds, &
596 : nblocks=SIZE(task_list%pair_offsets_recv), &
597 : block_offsets=task_list%pair_offsets_recv, &
598 : atom_positions=atom_positions, &
599 : atom_kinds=atom_kinds, &
600 : basis_sets=task_list%grid_basis_sets, &
601 : level_list=level_list, &
602 : iatom_list=iatom_list, &
603 : jatom_list=jatom_list, &
604 : iset_list=iset_list, &
605 : jset_list=jset_list, &
606 : ipgf_list=ipgf_list, &
607 : jpgf_list=jpgf_list, &
608 : border_mask_list=border_mask_list, &
609 : block_num_list=block_num_list, &
610 : radius_list=radius_list, &
611 : rab_list=rab_list, &
612 : rs_grids=rs_grids, &
613 15024 : task_list=task_list%grid_task_list)
614 :
615 15024 : CALL offload_create_buffer(task_list%buffer_size_recv, task_list%pab_buffer)
616 15024 : CALL offload_create_buffer(task_list%buffer_size_recv, task_list%hab_buffer)
617 :
618 30048 : END SUBROUTINE create_grid_task_list
619 :
620 : ! **************************************************************************************************
621 : !> \brief ...
622 : !> \param tasks ...
623 : !> \param ntasks ...
624 : !> \param curr_tasks ...
625 : !> \param rs_descs ...
626 : !> \param dft_control ...
627 : !> \param cube_info ...
628 : !> \param gridlevel_info ...
629 : !> \param cindex ...
630 : !> \param iatom ...
631 : !> \param jatom ...
632 : !> \param rpgfa ...
633 : !> \param rpgfb ...
634 : !> \param zeta ...
635 : !> \param zetb ...
636 : !> \param kind_radius_b ...
637 : !> \param set_radius_a ...
638 : !> \param set_radius_b ...
639 : !> \param ra ...
640 : !> \param rab ...
641 : !> \param la_max ...
642 : !> \param la_min ...
643 : !> \param lb_max ...
644 : !> \param lb_min ...
645 : !> \param npgfa ...
646 : !> \param npgfb ...
647 : !> \param nseta ...
648 : !> \param nsetb ...
649 : !> \par History
650 : !> Joost VandeVondele: 10.2008 refactored
651 : ! **************************************************************************************************
652 1506573 : SUBROUTINE task_list_inner_loop(tasks, ntasks, curr_tasks, rs_descs, dft_control, &
653 : cube_info, gridlevel_info, cindex, &
654 : iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, &
655 : la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
656 :
657 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
658 : INTEGER :: ntasks, curr_tasks
659 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
660 : POINTER :: rs_descs
661 : TYPE(dft_control_type), POINTER :: dft_control
662 : TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info
663 : TYPE(gridlevel_info_type), POINTER :: gridlevel_info
664 : INTEGER :: cindex, iatom, jatom
665 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: rpgfa, rpgfb, zeta, zetb
666 : REAL(KIND=dp) :: kind_radius_b
667 : REAL(KIND=dp), DIMENSION(:), POINTER :: set_radius_a, set_radius_b
668 : REAL(KIND=dp), DIMENSION(3) :: ra, rab
669 : INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, npgfa, &
670 : npgfb
671 : INTEGER :: nseta, nsetb
672 :
673 : INTEGER :: cube_center(3), igrid_level, ipgf, iset, &
674 : jpgf, jset, lb_cube(3), ub_cube(3)
675 : REAL(KIND=dp) :: dab, rab2, radius, zetp
676 :
677 1506573 : rab2 = rab(1)*rab(1) + rab(2)*rab(2) + rab(3)*rab(3)
678 1506573 : dab = SQRT(rab2)
679 :
680 4694580 : loop_iset: DO iset = 1, nseta
681 :
682 3188007 : IF (set_radius_a(iset) + kind_radius_b < dab) CYCLE loop_iset
683 :
684 9619107 : loop_jset: DO jset = 1, nsetb
685 :
686 5861550 : IF (set_radius_a(iset) + set_radius_b(jset) < dab) CYCLE loop_jset
687 :
688 15804494 : loop_ipgf: DO ipgf = 1, npgfa(iset)
689 :
690 8961907 : IF (rpgfa(ipgf, iset) + set_radius_b(jset) < dab) CYCLE loop_ipgf
691 :
692 26968954 : loop_jpgf: DO jpgf = 1, npgfb(jset)
693 :
694 15649003 : IF (rpgfa(ipgf, iset) + rpgfb(jpgf, jset) < dab) CYCLE loop_jpgf
695 :
696 8594946 : zetp = zeta(ipgf, iset) + zetb(jpgf, jset)
697 8594946 : igrid_level = gaussian_gridlevel(gridlevel_info, zetp)
698 :
699 : CALL compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
700 : rs_descs(igrid_level)%rs_desc, cube_info(igrid_level), &
701 : la_max(iset), zeta(ipgf, iset), la_min(iset), &
702 : lb_max(jset), zetb(jpgf, jset), lb_min(jset), &
703 8594946 : ra, rab, rab2, dft_control%qs_control%eps_rho_rspace)
704 :
705 : CALL pgf_to_tasks(tasks, ntasks, curr_tasks, &
706 : rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
707 : la_max(iset), lb_max(jset), rs_descs(igrid_level)%rs_desc, &
708 : igrid_level, gridlevel_info%ngrid_levels, cube_center, &
709 24610910 : lb_cube, ub_cube, radius)
710 :
711 : END DO loop_jpgf
712 :
713 : END DO loop_ipgf
714 :
715 : END DO loop_jset
716 :
717 : END DO loop_iset
718 :
719 1506573 : END SUBROUTINE task_list_inner_loop
720 :
721 : ! **************************************************************************************************
722 : !> \brief combines the calculation of several basic properties of a given pgf:
723 : !> its center, the bounding cube, the radius, the cost,
724 : !> tries to predict the time needed for processing this task
725 : !> in this way an improved load balance might be obtained
726 : !> \param cube_center ...
727 : !> \param lb_cube ...
728 : !> \param ub_cube ...
729 : !> \param radius ...
730 : !> \param rs_desc ...
731 : !> \param cube_info ...
732 : !> \param la_max ...
733 : !> \param zeta ...
734 : !> \param la_min ...
735 : !> \param lb_max ...
736 : !> \param zetb ...
737 : !> \param lb_min ...
738 : !> \param ra ...
739 : !> \param rab ...
740 : !> \param rab2 ...
741 : !> \param eps ...
742 : !> \par History
743 : !> 10.2008 refactored [Joost VandeVondele]
744 : !> \note
745 : !> -) this requires the radius to be computed in the same way as
746 : !> collocate_pgf_product, we should factor that part into a subroutine
747 : !> -) we're assuming that integrate_pgf and collocate_pgf are the same cost for load balancing
748 : !> this is more or less true for map_consistent
749 : !> -) in principle, the computed radius could be recycled in integrate_pgf/collocate_pgf if it is certainly
750 : !> the same, this could lead to a small speedup
751 : !> -) the cost function is a fit through the median cost of mapping a pgf with a given l and a given radius (in grid points)
752 : !> fitting the measured data on an opteron/g95 using the expression
753 : !> a*(l+b)(r+c)**3+d which is based on the innerloop of the collocating routines
754 : ! **************************************************************************************************
755 8594946 : SUBROUTINE compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
756 : rs_desc, cube_info, la_max, zeta, la_min, lb_max, zetb, lb_min, ra, rab, rab2, eps)
757 :
758 : INTEGER, DIMENSION(3), INTENT(OUT) :: cube_center, lb_cube, ub_cube
759 : REAL(KIND=dp), INTENT(OUT) :: radius
760 : TYPE(realspace_grid_desc_type), POINTER :: rs_desc
761 : TYPE(cube_info_type), INTENT(IN) :: cube_info
762 : INTEGER, INTENT(IN) :: la_max
763 : REAL(KIND=dp), INTENT(IN) :: zeta
764 : INTEGER, INTENT(IN) :: la_min, lb_max
765 : REAL(KIND=dp), INTENT(IN) :: zetb
766 : INTEGER, INTENT(IN) :: lb_min
767 : REAL(KIND=dp), INTENT(IN) :: ra(3), rab(3), rab2, eps
768 :
769 : INTEGER :: extent(3)
770 8594946 : INTEGER, DIMENSION(:), POINTER :: sphere_bounds
771 : REAL(KIND=dp) :: cutoff, f, prefactor, rb(3), zetp
772 : REAL(KIND=dp), DIMENSION(3) :: rp
773 :
774 : ! the radius for this task
775 :
776 8594946 : zetp = zeta + zetb
777 34379784 : rp(:) = ra(:) + zetb/zetp*rab(:)
778 34379784 : rb(:) = ra(:) + rab(:)
779 8594946 : cutoff = 1.0_dp
780 8594946 : f = zetb/zetp
781 8594946 : prefactor = EXP(-zeta*f*rab2)
782 : radius = exp_radius_very_extended(la_min, la_max, lb_min, lb_max, ra=ra, rb=rb, rp=rp, &
783 8594946 : zetp=zetp, eps=eps, prefactor=prefactor, cutoff=cutoff)
784 :
785 8594946 : CALL compute_cube_center(cube_center, rs_desc, zeta, zetb, ra, rab)
786 : ! compute cube_center, the center of the gaussian product to map (folded to within the unit cell)
787 34379784 : cube_center(:) = MODULO(cube_center(:), rs_desc%npts(:))
788 34379784 : cube_center(:) = cube_center(:) + rs_desc%lb(:)
789 :
790 8594946 : IF (rs_desc%orthorhombic) THEN
791 8144401 : CALL return_cube(cube_info, radius, lb_cube, ub_cube, sphere_bounds)
792 : ELSE
793 450545 : CALL return_cube_nonortho(cube_info, radius, lb_cube, ub_cube, rp)
794 : !? unclear if extent is computed correctly.
795 1802180 : extent(:) = ub_cube(:) - lb_cube(:)
796 1802180 : lb_cube(:) = -extent(:)/2 - 1
797 1802180 : ub_cube(:) = extent(:)/2
798 : END IF
799 :
800 8594946 : END SUBROUTINE compute_pgf_properties
801 : ! **************************************************************************************************
802 : !> \brief predicts the cost of a task in kcycles for a given task
803 : !> the model is based on a fit of actual data, and might need updating
804 : !> as collocate_pgf_product changes (or CPUs/compilers change)
805 : !> maybe some dynamic approach, improving the cost model on the fly could
806 : !> work as well
807 : !> the cost model does not yet take into account the fraction of space
808 : !> that is mapped locally for a given cube and rs_grid (generalised tasks)
809 : !> \param lb_cube ...
810 : !> \param ub_cube ...
811 : !> \param fraction ...
812 : !> \param lmax ...
813 : !> \param is_ortho ...
814 : !> \return ...
815 : ! **************************************************************************************************
816 8594946 : INTEGER FUNCTION cost_model(lb_cube, ub_cube, fraction, lmax, is_ortho)
817 : INTEGER, DIMENSION(3), INTENT(IN) :: lb_cube, ub_cube
818 : REAL(KIND=dp), INTENT(IN) :: fraction
819 : INTEGER :: lmax
820 : LOGICAL :: is_ortho
821 :
822 : INTEGER :: cmax
823 : REAL(KIND=dp) :: v1, v2, v3, v4, v5
824 :
825 34379784 : cmax = MAXVAL(((ub_cube - lb_cube) + 1)/2)
826 :
827 8594946 : IF (is_ortho) THEN
828 : v1 = 1.504760E+00_dp
829 : v2 = 3.126770E+00_dp
830 : v3 = 5.074106E+00_dp
831 : v4 = 1.091568E+00_dp
832 : v5 = 1.070187E+00_dp
833 : ELSE
834 450545 : v1 = 7.831105E+00_dp
835 450545 : v2 = 2.675174E+00_dp
836 450545 : v3 = 7.546553E+00_dp
837 450545 : v4 = 6.122446E-01_dp
838 450545 : v5 = 3.886382E+00_dp
839 : END IF
840 8594946 : cost_model = CEILING(((lmax + v1)*(cmax + v2)**3*v3*fraction + v4 + v5*lmax**7)/1000.0_dp)
841 :
842 8594946 : END FUNCTION cost_model
843 : ! **************************************************************************************************
844 : !> \brief pgf_to_tasks converts a given pgf to one or more tasks, in particular
845 : !> this determines by which CPUs a given pgf gets collocated
846 : !> the format of the task array is as follows
847 : !> tasks(1,i) := destination
848 : !> tasks(2,i) := source
849 : !> tasks(3,i) := compressed type (iatom, jatom, ....)
850 : !> tasks(4,i) := type (0: replicated, 1: distributed local, 2: distributed generalised)
851 : !> tasks(5,i) := cost
852 : !> tasks(6,i) := alternate destination code (0 if none available)
853 : !>
854 : !> \param tasks ...
855 : !> \param ntasks ...
856 : !> \param curr_tasks ...
857 : !> \param rab ...
858 : !> \param cindex ...
859 : !> \param iatom ...
860 : !> \param jatom ...
861 : !> \param iset ...
862 : !> \param jset ...
863 : !> \param ipgf ...
864 : !> \param jpgf ...
865 : !> \param la_max ...
866 : !> \param lb_max ...
867 : !> \param rs_desc ...
868 : !> \param igrid_level ...
869 : !> \param n_levels ...
870 : !> \param cube_center ...
871 : !> \param lb_cube ...
872 : !> \param ub_cube ...
873 : !> \par History
874 : !> 10.2008 Refactored based on earlier routines by MattW [Joost VandeVondele]
875 : ! **************************************************************************************************
876 8594946 : SUBROUTINE pgf_to_tasks(tasks, ntasks, curr_tasks, &
877 : rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
878 : la_max, lb_max, rs_desc, igrid_level, n_levels, &
879 : cube_center, lb_cube, ub_cube, radius)
880 :
881 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
882 : INTEGER, INTENT(INOUT) :: ntasks, curr_tasks
883 : REAL(KIND=dp), DIMENSION(3), INTENT(IN) :: rab
884 : INTEGER, INTENT(IN) :: cindex, iatom, jatom, iset, jset, ipgf, &
885 : jpgf, la_max, lb_max
886 : TYPE(realspace_grid_desc_type), POINTER :: rs_desc
887 : INTEGER, INTENT(IN) :: igrid_level, n_levels
888 : INTEGER, DIMENSION(3), INTENT(IN) :: cube_center, lb_cube, ub_cube
889 : REAL(KIND=dp), INTENT(IN) :: radius
890 :
891 : INTEGER, PARAMETER :: add_tasks = 1000
892 : REAL(kind=dp), PARAMETER :: mult_tasks = 2.0_dp
893 :
894 : INTEGER :: added_tasks, cost, j, lmax
895 : LOGICAL :: is_ortho
896 : REAL(KIND=dp) :: tfraction
897 :
898 : !$OMP SINGLE
899 8594946 : ntasks = ntasks + 1
900 8594946 : IF (ntasks > curr_tasks) THEN
901 1058 : curr_tasks = INT((curr_tasks + add_tasks)*mult_tasks)
902 1058 : CALL reallocate_tasks(tasks, curr_tasks)
903 : END IF
904 : !$OMP END SINGLE
905 :
906 8594946 : IF (rs_desc%distributed) THEN
907 :
908 : ! finds the node(s) that need to process this task
909 : ! on exit tasks(:)%dist_type is 1 for distributed tasks and 2 for generalised tasks
910 : CALL rs_find_node(rs_desc, igrid_level, n_levels, cube_center, &
911 1380 : ntasks=ntasks, tasks=tasks, lb_cube=lb_cube, ub_cube=ub_cube, added_tasks=added_tasks)
912 :
913 : ELSE
914 8593566 : tasks(ntasks)%destination = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
915 8593566 : tasks(ntasks)%dist_type = 0
916 8593566 : tasks(ntasks)%subpatch_pattern = 0
917 8593566 : added_tasks = 1
918 : END IF
919 :
920 8594946 : lmax = la_max + lb_max
921 8594946 : is_ortho = (tasks(ntasks)%dist_type == 0 .OR. tasks(ntasks)%dist_type == 1) .AND. rs_desc%orthorhombic
922 : ! we assume the load is shared equally between processes dealing with a generalised Gaussian.
923 : ! this could be refined in the future
924 8594946 : tfraction = 1.0_dp/added_tasks
925 :
926 8594946 : cost = cost_model(lb_cube, ub_cube, tfraction, lmax, is_ortho)
927 :
928 17189892 : DO j = 1, added_tasks
929 8594946 : tasks(ntasks - added_tasks + j)%source = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
930 8594946 : tasks(ntasks - added_tasks + j)%cost = cost
931 8594946 : tasks(ntasks - added_tasks + j)%grid_level = igrid_level
932 8594946 : tasks(ntasks - added_tasks + j)%image = cindex
933 8594946 : tasks(ntasks - added_tasks + j)%iatom = iatom
934 8594946 : tasks(ntasks - added_tasks + j)%jatom = jatom
935 8594946 : tasks(ntasks - added_tasks + j)%iset = iset
936 8594946 : tasks(ntasks - added_tasks + j)%jset = jset
937 8594946 : tasks(ntasks - added_tasks + j)%ipgf = ipgf
938 8594946 : tasks(ntasks - added_tasks + j)%jpgf = jpgf
939 34379784 : tasks(ntasks - added_tasks + j)%rab = rab
940 17189892 : tasks(ntasks - added_tasks + j)%radius = radius
941 : END DO
942 :
943 8594946 : END SUBROUTINE pgf_to_tasks
944 :
945 : ! **************************************************************************************************
946 : !> \brief performs load balancing of the tasks on the distributed grids
947 : !> \param tasks ...
948 : !> \param ntasks ...
949 : !> \param rs_descs ...
950 : !> \param grid_level ...
951 : !> \param natoms ...
952 : !> \par History
953 : !> created 2008-10-03 [Joost VandeVondele]
954 : ! **************************************************************************************************
955 54 : SUBROUTINE load_balance_distributed(tasks, ntasks, rs_descs, grid_level, natoms)
956 :
957 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
958 : INTEGER :: ntasks
959 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
960 : POINTER :: rs_descs
961 : INTEGER :: grid_level, natoms
962 :
963 : CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_distributed'
964 :
965 : INTEGER :: handle
966 54 : INTEGER, DIMENSION(:, :, :), POINTER :: list
967 :
968 54 : CALL timeset(routineN, handle)
969 :
970 54 : NULLIFY (list)
971 : ! here we create for each cpu (0:ncpu-1) a list of possible destinations.
972 : ! if a destination would not be in this list, it is a bug
973 54 : CALL create_destination_list(list, rs_descs, grid_level)
974 :
975 : ! now, walk over the tasks, filling in the loads of each destination
976 54 : CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.TRUE.)
977 :
978 : ! optimize loads & fluxes
979 54 : CALL optimize_load_list(list, rs_descs(1)%rs_desc%group, rs_descs(1)%rs_desc%my_pos)
980 :
981 : ! now, walk over the tasks, using the list to set the destinations
982 54 : CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.FALSE.)
983 :
984 54 : DEALLOCATE (list)
985 :
986 54 : CALL timestop(handle)
987 :
988 54 : END SUBROUTINE load_balance_distributed
989 :
990 : ! **************************************************************************************************
991 : !> \brief this serial routine adjusts the fluxes in the global list
992 : !>
993 : !> \param list_global ...
994 : !> \par History
995 : !> created 2008-10-06 [Joost VandeVondele]
996 : ! **************************************************************************************************
997 27 : SUBROUTINE balance_global_list(list_global)
998 : INTEGER, DIMENSION(:, :, 0:) :: list_global
999 :
1000 : CHARACTER(LEN=*), PARAMETER :: routineN = 'balance_global_list'
1001 : INTEGER, PARAMETER :: Max_Iter = 100
1002 : REAL(KIND=dp), PARAMETER :: Tolerance_factor = 0.005_dp
1003 :
1004 : INTEGER :: dest, handle, icpu, idest, iflux, &
1005 : ilocal, k, maxdest, Ncpu, Nflux
1006 27 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: flux_connections
1007 : LOGICAL :: solution_optimal
1008 : REAL(KIND=dp) :: average, load_shift, max_load_shift, &
1009 : tolerance
1010 27 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: load, optimized_flux, optimized_load
1011 27 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: flux_limits
1012 :
1013 27 : CALL timeset(routineN, handle)
1014 :
1015 27 : Ncpu = SIZE(list_global, 3)
1016 27 : maxdest = SIZE(list_global, 2)
1017 81 : ALLOCATE (load(0:Ncpu - 1))
1018 81 : load = 0.0_dp
1019 54 : ALLOCATE (optimized_load(0:Ncpu - 1))
1020 :
1021 : ! figure out the number of fluxes
1022 : ! we assume that the global_list is symmetric
1023 81 : Nflux = 0
1024 81 : DO icpu = 0, ncpu - 1
1025 189 : DO idest = 1, maxdest
1026 108 : dest = list_global(1, idest, icpu)
1027 162 : IF (dest < ncpu .AND. dest > icpu) Nflux = Nflux + 1
1028 : END DO
1029 : END DO
1030 81 : ALLOCATE (optimized_flux(Nflux))
1031 81 : ALLOCATE (flux_limits(2, Nflux))
1032 81 : ALLOCATE (flux_connections(2, Nflux))
1033 :
1034 : ! reorder data
1035 108 : flux_limits = 0
1036 : Nflux = 0
1037 81 : DO icpu = 0, ncpu - 1
1038 162 : load(icpu) = SUM(list_global(2, :, icpu))
1039 189 : DO idest = 1, maxdest
1040 108 : dest = list_global(1, idest, icpu)
1041 162 : IF (dest < ncpu) THEN
1042 108 : IF (dest /= icpu) THEN
1043 54 : IF (dest > icpu) THEN
1044 27 : Nflux = Nflux + 1
1045 27 : flux_limits(2, Nflux) = list_global(2, idest, icpu)
1046 27 : flux_connections(1, Nflux) = icpu
1047 27 : flux_connections(2, Nflux) = dest
1048 : ELSE
1049 27 : DO iflux = 1, Nflux
1050 27 : IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
1051 27 : flux_limits(1, iflux) = -list_global(2, idest, icpu)
1052 27 : EXIT
1053 : END IF
1054 : END DO
1055 : END IF
1056 : END IF
1057 : END IF
1058 : END DO
1059 : END DO
1060 :
1061 : solution_optimal = .FALSE.
1062 54 : optimized_flux = 0.0_dp
1063 :
1064 : ! an iterative solver, if iterated till convergence the maximum load is minimal
1065 : ! we terminate before things are fully converged, since this does show up in the timings
1066 : ! once the largest shift becomes less than a small fraction of the average load, we're done
1067 : ! we're perfectly happy if the load balance is within 1 percent or so
1068 : ! the maximum load normally converges even faster
1069 81 : average = SUM(load)/SIZE(load)
1070 27 : tolerance = Tolerance_factor*average
1071 :
1072 81 : optimized_load(:) = load
1073 332 : DO k = 1, Max_iter
1074 : max_load_shift = 0.0_dp
1075 658 : DO iflux = 1, Nflux
1076 329 : load_shift = (optimized_load(flux_connections(1, iflux)) - optimized_load(flux_connections(2, iflux)))/2
1077 329 : load_shift = MAX(flux_limits(1, iflux) - optimized_flux(iflux), load_shift)
1078 329 : load_shift = MIN(flux_limits(2, iflux) - optimized_flux(iflux), load_shift)
1079 329 : max_load_shift = MAX(ABS(load_shift), max_load_shift)
1080 329 : optimized_load(flux_connections(1, iflux)) = optimized_load(flux_connections(1, iflux)) - load_shift
1081 329 : optimized_load(flux_connections(2, iflux)) = optimized_load(flux_connections(2, iflux)) + load_shift
1082 658 : optimized_flux(iflux) = optimized_flux(iflux) + load_shift
1083 : END DO
1084 332 : IF (max_load_shift < tolerance) THEN
1085 : solution_optimal = .TRUE.
1086 : EXIT
1087 : END IF
1088 : END DO
1089 :
1090 : ! now adjust the load list to reflect the optimized fluxes
1091 : ! reorder data
1092 : Nflux = 0
1093 81 : DO icpu = 0, ncpu - 1
1094 162 : DO idest = 1, maxdest
1095 162 : IF (list_global(1, idest, icpu) == icpu) ilocal = idest
1096 : END DO
1097 189 : DO idest = 1, maxdest
1098 108 : dest = list_global(1, idest, icpu)
1099 162 : IF (dest < ncpu) THEN
1100 108 : IF (dest /= icpu) THEN
1101 54 : IF (dest > icpu) THEN
1102 27 : Nflux = Nflux + 1
1103 27 : IF (optimized_flux(Nflux) > 0) THEN
1104 : list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1105 0 : list_global(2, idest, icpu) - NINT(optimized_flux(Nflux))
1106 0 : list_global(2, idest, icpu) = NINT(optimized_flux(Nflux))
1107 : ELSE
1108 : list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1109 27 : list_global(2, idest, icpu)
1110 27 : list_global(2, idest, icpu) = 0
1111 : END IF
1112 : ELSE
1113 27 : DO iflux = 1, Nflux
1114 27 : IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
1115 27 : IF (optimized_flux(iflux) > 0) THEN
1116 : list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1117 0 : list_global(2, idest, icpu)
1118 0 : list_global(2, idest, icpu) = 0
1119 : ELSE
1120 : list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1121 27 : list_global(2, idest, icpu) + NINT(optimized_flux(iflux))
1122 27 : list_global(2, idest, icpu) = -NINT(optimized_flux(iflux))
1123 : END IF
1124 : EXIT
1125 : END IF
1126 : END DO
1127 : END IF
1128 : END IF
1129 : END IF
1130 : END DO
1131 : END DO
1132 :
1133 27 : CALL timestop(handle)
1134 :
1135 54 : END SUBROUTINE balance_global_list
1136 :
1137 : ! **************************************************************************************************
1138 : !> \brief this routine gets back optimized loads for all destinations
1139 : !>
1140 : !> \param list ...
1141 : !> \param group ...
1142 : !> \param my_pos ...
1143 : !> \par History
1144 : !> created 2008-10-06 [Joost VandeVondele]
1145 : !> Modified 2016-01 [EPCC] Reduce memory requirements on P processes
1146 : !> from O(P^2) to O(P)
1147 : ! **************************************************************************************************
1148 54 : SUBROUTINE optimize_load_list(list, group, my_pos)
1149 : INTEGER, DIMENSION(:, :, 0:) :: list
1150 : TYPE(mp_comm_type), INTENT(IN) :: group
1151 : INTEGER, INTENT(IN) :: my_pos
1152 :
1153 : CHARACTER(LEN=*), PARAMETER :: routineN = 'optimize_load_list'
1154 : INTEGER, PARAMETER :: rank_of_root = 0
1155 :
1156 : INTEGER :: handle, icpu, idest, maxdest, ncpu
1157 : INTEGER, ALLOCATABLE, DIMENSION(:) :: load_all
1158 54 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: load_partial
1159 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: list_global
1160 :
1161 54 : CALL timeset(routineN, handle)
1162 :
1163 54 : ncpu = SIZE(list, 3)
1164 54 : maxdest = SIZE(list, 2)
1165 :
1166 : !find total workload ...
1167 162 : ALLOCATE (load_all(maxdest*ncpu))
1168 108 : load_all(:) = RESHAPE(list(2, :, :), [maxdest*ncpu])
1169 54 : CALL group%sum(load_all(:), rank_of_root)
1170 :
1171 : ! ... and optimise the work per process
1172 216 : ALLOCATE (list_global(2, maxdest, ncpu))
1173 54 : IF (rank_of_root == my_pos) THEN
1174 189 : list_global(1, :, :) = list(1, :, :)
1175 243 : list_global(2, :, :) = RESHAPE(load_all, [maxdest, ncpu])
1176 27 : CALL balance_global_list(list_global)
1177 : END IF
1178 54 : CALL group%bcast(list_global, rank_of_root)
1179 :
1180 : !figure out how much can be sent to other processes
1181 216 : ALLOCATE (load_partial(maxdest, ncpu))
1182 : ! send 'load_all', which is a copy of 'list' (but without leading dimension/stride)
1183 162 : CALL group%sum_partial(RESHAPE(load_all, [maxdest, ncpu]), load_partial(:, :))
1184 :
1185 162 : DO icpu = 1, ncpu
1186 378 : DO idest = 1, maxdest
1187 :
1188 : !need to deduct 1 because `list' was passed in to this routine as being indexed from zero
1189 324 : IF (load_partial(idest, icpu) > list_global(2, idest, icpu)) THEN
1190 37 : IF (load_partial(idest, icpu) - list(2, idest, icpu - 1) < list_global(2, idest, icpu)) THEN
1191 : list(2, idest, icpu - 1) = list_global(2, idest, icpu) &
1192 10 : - (load_partial(idest, icpu) - list(2, idest, icpu - 1))
1193 : ELSE
1194 27 : list(2, idest, icpu - 1) = 0
1195 : END IF
1196 : END IF
1197 :
1198 : END DO
1199 : END DO
1200 :
1201 : !clean up before leaving
1202 54 : DEALLOCATE (load_all)
1203 54 : DEALLOCATE (list_global)
1204 54 : DEALLOCATE (load_partial)
1205 :
1206 54 : CALL timestop(handle)
1207 54 : END SUBROUTINE optimize_load_list
1208 :
1209 : ! **************************************************************************************************
1210 : !> \brief fill the load list with values derived from the tasks array
1211 : !> from the alternate locations, we select the alternate location that
1212 : !> can be used without increasing the number of matrix blocks needed to
1213 : !> distribute.
1214 : !> Replicated tasks are not yet considered
1215 : !>
1216 : !> \param list ...
1217 : !> \param rs_descs ...
1218 : !> \param grid_level ...
1219 : !> \param tasks ...
1220 : !> \param ntasks ...
1221 : !> \param natoms ...
1222 : !> \param create_list ...
1223 : !> \par History
1224 : !> created 2008-10-06 [Joost VandeVondele]
1225 : ! **************************************************************************************************
1226 108 : SUBROUTINE compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list)
1227 : INTEGER, DIMENSION(:, :, 0:) :: list
1228 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1229 : POINTER :: rs_descs
1230 : INTEGER :: grid_level
1231 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1232 : INTEGER :: ntasks, natoms
1233 : LOGICAL :: create_list
1234 :
1235 : CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_load_list'
1236 :
1237 : INTEGER :: cost, dest, handle, i, iatom, ilevel, img, img_old, iopt, ipgf, iset, itask, &
1238 : itask_start, itask_stop, jatom, jpgf, jset, li, maxdest, ncpu, ndest_pair, nopt, nshort, &
1239 : rank
1240 : INTEGER(KIND=int_8) :: bit_pattern, ipair, ipair_old, natom8
1241 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: loads
1242 108 : INTEGER, ALLOCATABLE, DIMENSION(:) :: all_dests, index
1243 : INTEGER, DIMENSION(6) :: options
1244 :
1245 108 : CALL timeset(routineN, handle)
1246 :
1247 324 : ALLOCATE (loads(0:rs_descs(grid_level)%rs_desc%group_size - 1))
1248 108 : CALL get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks=.FALSE.)
1249 :
1250 108 : maxdest = SIZE(list, 2)
1251 108 : ncpu = SIZE(list, 3)
1252 108 : natom8 = natoms
1253 :
1254 : ! first find the tasks that deal with the same atom pair
1255 108 : itask_stop = 0
1256 108 : ipair_old = HUGE(ipair_old)
1257 108 : img_old = -1
1258 108 : ALLOCATE (all_dests(0))
1259 108 : ALLOCATE (INDEX(0))
1260 :
1261 : DO
1262 :
1263 : ! first find the range of tasks that deal with the same atom pair
1264 290 : itask_start = itask_stop + 1
1265 290 : itask_stop = itask_start
1266 290 : IF (itask_stop > ntasks) EXIT
1267 182 : ilevel = tasks(itask_stop)%grid_level
1268 182 : img_old = tasks(itask_stop)%image
1269 182 : iatom = tasks(itask_stop)%iatom
1270 182 : jatom = tasks(itask_stop)%jatom
1271 182 : iset = tasks(itask_stop)%iset
1272 182 : jset = tasks(itask_stop)%jset
1273 182 : ipgf = tasks(itask_stop)%ipgf
1274 182 : jpgf = tasks(itask_stop)%jpgf
1275 :
1276 182 : ipair_old = (iatom - 1)*natom8 + (jatom - 1)
1277 : DO
1278 7386 : IF (itask_stop + 1 > ntasks) EXIT
1279 7298 : ilevel = tasks(itask_stop + 1)%grid_level
1280 7298 : img = tasks(itask_stop + 1)%image
1281 7298 : iatom = tasks(itask_stop + 1)%iatom
1282 7298 : jatom = tasks(itask_stop + 1)%jatom
1283 7298 : iset = tasks(itask_stop + 1)%iset
1284 7298 : jset = tasks(itask_stop + 1)%jset
1285 7298 : ipgf = tasks(itask_stop + 1)%ipgf
1286 7298 : jpgf = tasks(itask_stop + 1)%jpgf
1287 :
1288 7298 : ipair = (iatom - 1)*natom8 + (jatom - 1)
1289 7386 : IF (ipair == ipair_old .AND. img == img_old) THEN
1290 : itask_stop = itask_stop + 1
1291 : ELSE
1292 : EXIT
1293 : END IF
1294 : END DO
1295 182 : ipair = ipair_old
1296 182 : nshort = itask_stop - itask_start + 1
1297 :
1298 : ! find the unique list of destinations on this grid level only
1299 182 : DEALLOCATE (all_dests)
1300 546 : ALLOCATE (all_dests(nshort))
1301 182 : DEALLOCATE (index)
1302 364 : ALLOCATE (INDEX(nshort))
1303 7568 : DO i = 1, nshort
1304 7386 : ilevel = tasks(itask_start + i - 1)%grid_level
1305 7386 : img = tasks(itask_start + i - 1)%image
1306 7386 : iatom = tasks(itask_start + i - 1)%iatom
1307 7386 : jatom = tasks(itask_start + i - 1)%jatom
1308 7386 : iset = tasks(itask_start + i - 1)%iset
1309 7386 : jset = tasks(itask_start + i - 1)%jset
1310 7386 : ipgf = tasks(itask_start + i - 1)%ipgf
1311 7386 : jpgf = tasks(itask_start + i - 1)%jpgf
1312 :
1313 7568 : IF (ilevel == grid_level) THEN
1314 2760 : all_dests(i) = decode_rank(tasks(itask_start + i - 1)%destination, SIZE(rs_descs))
1315 : ELSE
1316 4626 : all_dests(i) = HUGE(all_dests(i))
1317 : END IF
1318 : END DO
1319 182 : CALL sort(all_dests, nshort, index)
1320 182 : ndest_pair = 1
1321 7386 : DO i = 2, nshort
1322 7386 : IF ((all_dests(ndest_pair) /= all_dests(i)) .AND. (all_dests(i) /= HUGE(all_dests(i)))) THEN
1323 10 : ndest_pair = ndest_pair + 1
1324 10 : all_dests(ndest_pair) = all_dests(i)
1325 : END IF
1326 : END DO
1327 :
1328 7676 : DO itask = itask_start, itask_stop
1329 :
1330 7386 : dest = decode_rank(tasks(itask)%destination, SIZE(rs_descs)) ! notice that dest can be changed
1331 7386 : ilevel = tasks(itask)%grid_level
1332 7386 : img = tasks(itask)%image
1333 7386 : iatom = tasks(itask)%iatom
1334 7386 : jatom = tasks(itask)%jatom
1335 7386 : iset = tasks(itask)%iset
1336 7386 : jset = tasks(itask)%jset
1337 7386 : ipgf = tasks(itask)%ipgf
1338 7386 : jpgf = tasks(itask)%jpgf
1339 :
1340 : ! Only proceed with tasks which are on this grid level
1341 7386 : IF (ilevel /= grid_level) CYCLE
1342 2760 : ipair = (iatom - 1)*natom8 + (jatom - 1)
1343 2760 : cost = INT(tasks(itask)%cost)
1344 :
1345 182 : SELECT CASE (tasks(itask)%dist_type)
1346 : CASE (1)
1347 2760 : bit_pattern = tasks(itask)%subpatch_pattern
1348 2760 : nopt = 0
1349 2760 : IF (BTEST(bit_pattern, 0)) THEN
1350 0 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [-1, 0, 0])
1351 0 : IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
1352 0 : nopt = nopt + 1
1353 0 : options(nopt) = rank
1354 : END IF
1355 : END IF
1356 2760 : IF (BTEST(bit_pattern, 1)) THEN
1357 0 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [+1, 0, 0])
1358 0 : IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
1359 0 : nopt = nopt + 1
1360 0 : options(nopt) = rank
1361 : END IF
1362 : END IF
1363 2760 : IF (BTEST(bit_pattern, 2)) THEN
1364 48 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, -1, 0])
1365 96 : IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
1366 0 : nopt = nopt + 1
1367 0 : options(nopt) = rank
1368 : END IF
1369 : END IF
1370 2760 : IF (BTEST(bit_pattern, 3)) THEN
1371 0 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, +1, 0])
1372 0 : IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
1373 0 : nopt = nopt + 1
1374 0 : options(nopt) = rank
1375 : END IF
1376 : END IF
1377 2760 : IF (BTEST(bit_pattern, 4)) THEN
1378 1150 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, 0, -1])
1379 2200 : IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
1380 100 : nopt = nopt + 1
1381 100 : options(nopt) = rank
1382 : END IF
1383 : END IF
1384 2760 : IF (BTEST(bit_pattern, 5)) THEN
1385 500 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, 0, +1])
1386 1000 : IF (ANY(all_dests(1:ndest_pair) == rank)) THEN
1387 80 : nopt = nopt + 1
1388 80 : options(nopt) = rank
1389 : END IF
1390 : END IF
1391 2760 : IF (nopt > 0) THEN
1392 : ! set it to the rank with the lowest load
1393 180 : rank = options(1)
1394 180 : DO iopt = 2, nopt
1395 180 : IF (loads(rank) > loads(options(iopt))) rank = options(iopt)
1396 : END DO
1397 : ELSE
1398 2580 : rank = dest
1399 : END IF
1400 2760 : li = list_index(list, rank, dest)
1401 2760 : IF (create_list) THEN
1402 1380 : list(2, li, dest) = list(2, li, dest) + cost
1403 : ELSE
1404 1380 : IF (list(1, li, dest) == dest) THEN
1405 1290 : tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1406 : ELSE
1407 90 : IF (list(2, li, dest) >= cost) THEN
1408 10 : list(2, li, dest) = list(2, li, dest) - cost
1409 10 : tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
1410 : ELSE
1411 80 : tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1412 : END IF
1413 : END IF
1414 : END IF
1415 : CASE (2) ! generalised
1416 0 : li = list_index(list, dest, dest)
1417 0 : IF (create_list) THEN
1418 0 : list(2, li, dest) = list(2, li, dest) + cost
1419 : ELSE
1420 0 : IF (list(1, li, dest) == dest) THEN
1421 0 : tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1422 : ELSE
1423 0 : IF (list(2, li, dest) >= cost) THEN
1424 0 : list(2, li, dest) = list(2, li, dest) - cost
1425 0 : tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
1426 : ELSE
1427 0 : tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1428 : END IF
1429 : END IF
1430 : END IF
1431 : CASE DEFAULT
1432 7386 : CPABORT("")
1433 : END SELECT
1434 :
1435 : END DO
1436 :
1437 : END DO
1438 :
1439 108 : CALL timestop(handle)
1440 :
1441 216 : END SUBROUTINE compute_load_list
1442 : ! **************************************************************************************************
1443 : !> \brief small helper function to return the proper index in the list array
1444 : !>
1445 : !> \param list ...
1446 : !> \param rank ...
1447 : !> \param dest ...
1448 : !> \return ...
1449 : !> \par History
1450 : !> created 2008-10-06 [Joost VandeVondele]
1451 : ! **************************************************************************************************
1452 2760 : INTEGER FUNCTION list_index(list, rank, dest)
1453 : INTEGER, DIMENSION(:, :, 0:), INTENT(IN) :: list
1454 : INTEGER, INTENT(IN) :: rank, dest
1455 :
1456 2760 : list_index = 1
1457 1424 : DO
1458 4184 : IF (list(1, list_index, dest) == rank) EXIT
1459 1424 : list_index = list_index + 1
1460 : END DO
1461 2760 : END FUNCTION list_index
1462 : ! **************************************************************************************************
1463 : !> \brief create a list with possible destinations (i.e. the central cpu and neighbors) for each cpu
1464 : !> note that we allocate it with an additional field to store the load of this destination
1465 : !>
1466 : !> \param list ...
1467 : !> \param rs_descs ...
1468 : !> \param grid_level ...
1469 : !> \par History
1470 : !> created 2008-10-06 [Joost VandeVondele]
1471 : ! **************************************************************************************************
1472 54 : SUBROUTINE create_destination_list(list, rs_descs, grid_level)
1473 : INTEGER, DIMENSION(:, :, :), POINTER :: list
1474 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1475 : POINTER :: rs_descs
1476 : INTEGER, INTENT(IN) :: grid_level
1477 :
1478 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_destination_list'
1479 :
1480 : INTEGER :: handle, i, icpu, j, maxcount, ncpu, &
1481 : ultimate_max
1482 54 : INTEGER, ALLOCATABLE, DIMENSION(:) :: index, sublist
1483 :
1484 54 : CALL timeset(routineN, handle)
1485 :
1486 54 : CPASSERT(.NOT. ASSOCIATED(list))
1487 54 : ncpu = rs_descs(grid_level)%rs_desc%group_size
1488 54 : ultimate_max = 7
1489 :
1490 162 : ALLOCATE (list(2, ultimate_max, 0:ncpu - 1))
1491 :
1492 54 : ALLOCATE (INDEX(ultimate_max))
1493 54 : ALLOCATE (sublist(ultimate_max))
1494 432 : sublist = HUGE(sublist)
1495 :
1496 54 : maxcount = 1
1497 162 : DO icpu = 0, ncpu - 1
1498 108 : sublist(1) = icpu
1499 108 : sublist(2) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [-1, 0, 0])
1500 108 : sublist(3) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [+1, 0, 0])
1501 108 : sublist(4) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, -1, 0])
1502 108 : sublist(5) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, +1, 0])
1503 108 : sublist(6) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, 0, -1])
1504 108 : sublist(7) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, 0, +1])
1505 : ! only retain unique values of the destination
1506 108 : CALL sort(sublist, ultimate_max, index)
1507 108 : j = 1
1508 756 : DO i = 2, 7
1509 756 : IF (sublist(i) /= sublist(j)) THEN
1510 108 : j = j + 1
1511 108 : sublist(j) = sublist(i)
1512 : END IF
1513 : END DO
1514 108 : maxcount = MAX(maxcount, j)
1515 648 : sublist(j + 1:ultimate_max) = HUGE(sublist)
1516 864 : list(1, :, icpu) = sublist
1517 918 : list(2, :, icpu) = 0
1518 : END DO
1519 :
1520 54 : CALL reallocate(list, 1, 2, 1, maxcount, 0, ncpu - 1)
1521 :
1522 54 : CALL timestop(handle)
1523 :
1524 108 : END SUBROUTINE create_destination_list
1525 :
1526 : ! **************************************************************************************************
1527 : !> \brief given a task list, compute the load of each process everywhere
1528 : !> giving this function the ability to loop over a (sub)set of rs_grids,
1529 : !> and do all the communication in one shot, would speed it up
1530 : !> \param loads ...
1531 : !> \param rs_descs ...
1532 : !> \param grid_level ...
1533 : !> \param ntasks ...
1534 : !> \param tasks ...
1535 : !> \param use_reordered_ranks ...
1536 : !> \par History
1537 : !> none
1538 : !> \author MattW 21/11/2007
1539 : ! **************************************************************************************************
1540 480 : SUBROUTINE get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks)
1541 : INTEGER(KIND=int_8), DIMENSION(:) :: loads
1542 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1543 : POINTER :: rs_descs
1544 : INTEGER :: grid_level, ntasks
1545 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1546 : LOGICAL, INTENT(IN) :: use_reordered_ranks
1547 :
1548 : CHARACTER(LEN=*), PARAMETER :: routineN = 'get_current_loads'
1549 :
1550 : INTEGER :: handle, i, iatom, ilevel, img, ipgf, &
1551 : iset, jatom, jpgf, jset
1552 : INTEGER(KIND=int_8) :: total_cost_local
1553 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: recv_buf_i, send_buf_i
1554 : TYPE(realspace_grid_desc_type), POINTER :: desc
1555 :
1556 480 : CALL timeset(routineN, handle)
1557 :
1558 480 : desc => rs_descs(grid_level)%rs_desc
1559 :
1560 : ! allocate local arrays
1561 1440 : ALLOCATE (send_buf_i(desc%group_size))
1562 960 : ALLOCATE (recv_buf_i(desc%group_size))
1563 :
1564 : ! communication step 1 : compute the total local cost of the tasks
1565 : ! each proc needs to know the amount of work he will receive
1566 :
1567 : ! send buffer now contains for each target the cost of the tasks it will receive
1568 1440 : send_buf_i = 0
1569 36792 : DO i = 1, ntasks
1570 36312 : ilevel = tasks(i)%grid_level
1571 36312 : img = tasks(i)%image
1572 36312 : iatom = tasks(i)%iatom
1573 36312 : jatom = tasks(i)%jatom
1574 36312 : iset = tasks(i)%iset
1575 36312 : jset = tasks(i)%jset
1576 36312 : ipgf = tasks(i)%ipgf
1577 36312 : jpgf = tasks(i)%jpgf
1578 36312 : IF (ilevel /= grid_level) CYCLE
1579 10476 : IF (use_reordered_ranks) THEN
1580 : send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) = &
1581 : send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) &
1582 3618 : + tasks(i)%cost
1583 : ELSE
1584 : send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) = &
1585 : send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) &
1586 6378 : + tasks(i)%cost
1587 : END IF
1588 : END DO
1589 480 : CALL desc%group%alltoall(send_buf_i, recv_buf_i, 1)
1590 :
1591 : ! communication step 2 : compute the global cost of the tasks
1592 1440 : total_cost_local = SUM(recv_buf_i)
1593 :
1594 : ! after this step, the recv buffer contains the local cost for each CPU
1595 1440 : CALL desc%group%allgather(total_cost_local, loads)
1596 :
1597 480 : CALL timestop(handle)
1598 :
1599 960 : END SUBROUTINE get_current_loads
1600 : ! **************************************************************************************************
1601 : !> \brief performs load balancing shifting tasks on the replicated grids
1602 : !> this modifies the destination of some of the tasks on replicated
1603 : !> grids, and in this way balances the load
1604 : !> \param rs_descs ...
1605 : !> \param ntasks ...
1606 : !> \param tasks ...
1607 : !> \par History
1608 : !> none
1609 : !> \author MattW 21/11/2007
1610 : ! **************************************************************************************************
1611 48 : SUBROUTINE load_balance_replicated(rs_descs, ntasks, tasks)
1612 :
1613 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1614 : POINTER :: rs_descs
1615 : INTEGER :: ntasks
1616 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1617 :
1618 : CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_replicated'
1619 :
1620 : INTEGER :: handle, i, iatom, ilevel, img, ipgf, &
1621 : iset, j, jatom, jpgf, jset, &
1622 : no_overloaded, no_underloaded, &
1623 : proc_receiving
1624 : INTEGER(KIND=int_8) :: average_cost, cost_task_rep, count, &
1625 : offset, total_cost_global
1626 48 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: load_imbalance, loads, recv_buf_i
1627 48 : INTEGER, ALLOCATABLE, DIMENSION(:) :: index
1628 : TYPE(realspace_grid_desc_type), POINTER :: desc
1629 :
1630 48 : CALL timeset(routineN, handle)
1631 :
1632 48 : desc => rs_descs(1)%rs_desc
1633 :
1634 : ! allocate local arrays
1635 144 : ALLOCATE (recv_buf_i(desc%group_size))
1636 96 : ALLOCATE (loads(desc%group_size))
1637 :
1638 144 : recv_buf_i = 0
1639 234 : DO i = 1, SIZE(rs_descs)
1640 186 : CALL get_current_loads(loads, rs_descs, i, ntasks, tasks, use_reordered_ranks=.TRUE.)
1641 606 : recv_buf_i(:) = recv_buf_i + loads
1642 : END DO
1643 :
1644 144 : total_cost_global = SUM(recv_buf_i)
1645 48 : average_cost = total_cost_global/desc%group_size
1646 :
1647 : !
1648 : ! compute how to redistribute the replicated tasks so that the average cost is reached
1649 : !
1650 :
1651 : ! load imbalance measures the load of a given CPU relative
1652 : ! to the optimal load distribution (load=average)
1653 144 : ALLOCATE (load_imbalance(desc%group_size))
1654 144 : ALLOCATE (INDEX(desc%group_size))
1655 :
1656 144 : load_imbalance(:) = recv_buf_i - average_cost
1657 48 : no_overloaded = 0
1658 48 : no_underloaded = 0
1659 :
1660 144 : DO i = 1, desc%group_size
1661 96 : IF (load_imbalance(i) > 0) no_overloaded = no_overloaded + 1
1662 144 : IF (load_imbalance(i) < 0) no_underloaded = no_underloaded + 1
1663 : END DO
1664 :
1665 : ! sort the recv_buffer on number of tasks, gives us index which provides a
1666 : ! mapping between processor ranks and how overloaded the processor
1667 48 : CALL sort(recv_buf_i, SIZE(recv_buf_i), index)
1668 :
1669 : ! find out the number of replicated tasks each proc has
1670 : ! but only those tasks which have not yet been assigned
1671 48 : cost_task_rep = 0
1672 3666 : DO i = 1, ntasks
1673 : IF (tasks(i)%dist_type == 0 &
1674 3666 : .AND. decode_rank(tasks(i)%destination, SIZE(rs_descs)) == decode_rank(tasks(i)%source, SIZE(rs_descs))) THEN
1675 2238 : cost_task_rep = cost_task_rep + tasks(i)%cost
1676 : END IF
1677 : END DO
1678 :
1679 : ! now, correct the load imbalance for the overloaded CPUs
1680 : ! they will send away not more than the total load of replicated tasks
1681 48 : CALL desc%group%allgather(cost_task_rep, recv_buf_i)
1682 :
1683 144 : DO i = 1, desc%group_size
1684 : ! At the moment we can only offload replicated tasks
1685 96 : IF (load_imbalance(i) > 0) &
1686 96 : load_imbalance(i) = MIN(load_imbalance(i), recv_buf_i(i))
1687 : END DO
1688 :
1689 : ! simplest algorithm I can think of of is that the processor with the most
1690 : ! excess tasks fills up the process needing most, then moves on to next most.
1691 : ! At the moment if we've got less replicated tasks than we're overloaded then
1692 : ! task balancing will be incomplete
1693 :
1694 : ! only need to do anything if I've excess tasks
1695 48 : IF (load_imbalance(desc%my_pos + 1) > 0) THEN
1696 :
1697 22 : count = 0 ! weighted amount of tasks offloaded
1698 22 : offset = 0 ! no of underloaded processes already filled by other more overloaded procs
1699 :
1700 : ! calculate offset
1701 22 : DO i = desc%group_size, desc%group_size - no_overloaded + 1, -1
1702 22 : IF (INDEX(i) == desc%my_pos + 1) THEN
1703 : EXIT
1704 : ELSE
1705 0 : offset = offset + load_imbalance(INDEX(i))
1706 : END IF
1707 : END DO
1708 :
1709 : ! find my starting processor to send to
1710 22 : proc_receiving = HUGE(proc_receiving)
1711 22 : DO i = 1, no_underloaded
1712 22 : offset = offset + load_imbalance(INDEX(i))
1713 22 : IF (offset <= 0) THEN
1714 : proc_receiving = i
1715 : EXIT
1716 : END IF
1717 : END DO
1718 :
1719 : ! offset now contains minus the number of tasks proc_receiving requires
1720 : ! we fill this up by adjusting the destination of tasks on the replicated grid,
1721 : ! then move to next most underloaded proc
1722 1380 : DO j = 1, ntasks
1723 : IF (tasks(j)%dist_type == 0 &
1724 1380 : .AND. decode_rank(tasks(j)%destination, SIZE(rs_descs)) == decode_rank(tasks(j)%source, SIZE(rs_descs))) THEN
1725 : ! just avoid sending to non existing procs due to integer truncation
1726 : ! in the computation of the average
1727 808 : IF (proc_receiving > no_underloaded) EXIT
1728 : ! set new destination
1729 808 : ilevel = tasks(j)%grid_level
1730 808 : img = tasks(j)%image
1731 808 : iatom = tasks(j)%iatom
1732 808 : jatom = tasks(j)%jatom
1733 808 : iset = tasks(j)%iset
1734 808 : jset = tasks(j)%jset
1735 808 : ipgf = tasks(j)%ipgf
1736 808 : jpgf = tasks(j)%jpgf
1737 808 : tasks(j)%destination = encode_rank(INDEX(proc_receiving) - 1, ilevel, SIZE(rs_descs))
1738 808 : offset = offset + tasks(j)%cost
1739 808 : count = count + tasks(j)%cost
1740 808 : IF (count >= load_imbalance(desc%my_pos + 1)) EXIT
1741 786 : IF (offset > 0) THEN
1742 0 : proc_receiving = proc_receiving + 1
1743 : ! just avoid sending to non existing procs due to integer truncation
1744 : ! in the computation of the average
1745 0 : IF (proc_receiving > no_underloaded) EXIT
1746 0 : offset = load_imbalance(INDEX(proc_receiving))
1747 : END IF
1748 : END IF
1749 : END DO
1750 : END IF
1751 :
1752 48 : DEALLOCATE (index)
1753 48 : DEALLOCATE (load_imbalance)
1754 :
1755 48 : CALL timestop(handle)
1756 :
1757 96 : END SUBROUTINE load_balance_replicated
1758 :
1759 : ! **************************************************************************************************
1760 : !> \brief given an input task list, redistribute so that all tasks can be processed locally,
1761 : !> i.e. dest equals rank
1762 : !> \param rs_descs ...
1763 : !> \param ntasks ...
1764 : !> \param tasks ...
1765 : !> \param ntasks_recv ...
1766 : !> \param tasks_recv ...
1767 : !> \par History
1768 : !> none
1769 : !> \author MattW 21/11/2007
1770 : ! **************************************************************************************************
1771 48 : SUBROUTINE create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
1772 :
1773 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1774 : POINTER :: rs_descs
1775 : INTEGER :: ntasks
1776 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1777 : INTEGER :: ntasks_recv
1778 : TYPE(task_type), DIMENSION(:), POINTER :: tasks_recv
1779 :
1780 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_local_tasks'
1781 :
1782 : INTEGER :: handle, i, j, k, l, rank
1783 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: recv_buf, send_buf
1784 : INTEGER, ALLOCATABLE, DIMENSION(:) :: recv_disps, recv_sizes, send_disps, &
1785 : send_sizes
1786 : TYPE(realspace_grid_desc_type), POINTER :: desc
1787 :
1788 48 : CALL timeset(routineN, handle)
1789 :
1790 48 : desc => rs_descs(1)%rs_desc
1791 :
1792 : ! allocate local arrays
1793 144 : ALLOCATE (send_sizes(desc%group_size))
1794 96 : ALLOCATE (recv_sizes(desc%group_size))
1795 96 : ALLOCATE (send_disps(desc%group_size))
1796 96 : ALLOCATE (recv_disps(desc%group_size))
1797 144 : ALLOCATE (send_buf(desc%group_size))
1798 96 : ALLOCATE (recv_buf(desc%group_size))
1799 :
1800 : ! fill send buffer, now counting how many tasks will be send (stored in an int8 array for convenience only).
1801 144 : send_buf = 0
1802 3666 : DO i = 1, ntasks
1803 : rank = rs_descs(decode_level(tasks(i)%destination, SIZE(rs_descs))) &
1804 3618 : %rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs)))
1805 3666 : send_buf(rank + 1) = send_buf(rank + 1) + 1
1806 : END DO
1807 :
1808 48 : CALL desc%group%alltoall(send_buf, recv_buf, 1)
1809 :
1810 : ! pack the tasks, and send them around
1811 :
1812 144 : send_sizes = 0
1813 144 : send_disps = 0
1814 144 : recv_sizes = 0
1815 144 : recv_disps = 0
1816 :
1817 48 : send_sizes(1) = INT(send_buf(1)*task_size_in_int8)
1818 48 : recv_sizes(1) = INT(recv_buf(1)*task_size_in_int8)
1819 96 : DO i = 2, desc%group_size
1820 48 : send_sizes(i) = INT(send_buf(i)*task_size_in_int8)
1821 48 : recv_sizes(i) = INT(recv_buf(i)*task_size_in_int8)
1822 48 : send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1823 96 : recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1824 : END DO
1825 :
1826 : ! deallocate old send/recv buffers
1827 48 : DEALLOCATE (send_buf)
1828 48 : DEALLOCATE (recv_buf)
1829 :
1830 : ! allocate them with new sizes
1831 233 : ALLOCATE (send_buf(SUM(send_sizes)))
1832 238 : ALLOCATE (recv_buf(SUM(recv_sizes)))
1833 :
1834 : ! do packing
1835 61554 : send_buf = 0
1836 144 : send_sizes = 0
1837 3666 : DO j = 1, ntasks
1838 : i = rs_descs(decode_level(tasks(j)%destination, SIZE(rs_descs))) &
1839 3618 : %rs_desc%virtual2real(decode_rank(tasks(j)%destination, SIZE(rs_descs))) + 1
1840 3618 : l = send_disps(i) + send_sizes(i)
1841 3618 : CALL serialize_task(tasks(j), send_buf(l + 1:l + task_size_in_int8))
1842 3666 : send_sizes(i) = send_sizes(i) + task_size_in_int8
1843 : END DO
1844 :
1845 : ! do communication
1846 48 : CALL desc%group%alltoall(send_buf, send_sizes, send_disps, recv_buf, recv_sizes, recv_disps)
1847 :
1848 48 : DEALLOCATE (send_buf)
1849 :
1850 144 : ntasks_recv = SUM(recv_sizes)/task_size_in_int8
1851 3904 : ALLOCATE (tasks_recv(ntasks_recv))
1852 :
1853 : ! do unpacking
1854 48 : l = 0
1855 144 : DO i = 1, desc%group_size
1856 3762 : DO j = 0, recv_sizes(i)/task_size_in_int8 - 1
1857 3618 : l = l + 1
1858 3618 : k = recv_disps(i) + j*task_size_in_int8
1859 3714 : CALL deserialize_task(tasks_recv(l), recv_buf(k + 1:k + task_size_in_int8))
1860 : END DO
1861 : END DO
1862 :
1863 48 : DEALLOCATE (recv_buf)
1864 48 : DEALLOCATE (send_sizes)
1865 48 : DEALLOCATE (recv_sizes)
1866 48 : DEALLOCATE (send_disps)
1867 48 : DEALLOCATE (recv_disps)
1868 :
1869 48 : CALL timestop(handle)
1870 :
1871 48 : END SUBROUTINE create_local_tasks
1872 :
1873 : ! **************************************************************************************************
1874 : !> \brief Assembles tasks to be performed on local grid
1875 : !> \param rs_descs the grids
1876 : !> \param ntasks Number of tasks for local processing
1877 : !> \param natoms ...
1878 : !> \param nimages ...
1879 : !> \param tasks the task set generated on this processor
1880 : !> \param rval ...
1881 : !> \param atom_pair_send ...
1882 : !> \param atom_pair_recv ...
1883 : !> \param symmetric ...
1884 : !> \param reorder_rs_grid_ranks ...
1885 : !> \param skip_load_balance_distributed ...
1886 : !> \par History
1887 : !> none
1888 : !> \author MattW 21/11/2007
1889 : ! **************************************************************************************************
1890 17418 : SUBROUTINE distribute_tasks(rs_descs, ntasks, natoms, &
1891 : tasks, atom_pair_send, atom_pair_recv, &
1892 : symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed)
1893 :
1894 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1895 : POINTER :: rs_descs
1896 : INTEGER :: ntasks, natoms
1897 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1898 : TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair_send, atom_pair_recv
1899 : LOGICAL, INTENT(IN) :: symmetric, reorder_rs_grid_ranks, &
1900 : skip_load_balance_distributed
1901 :
1902 : CHARACTER(LEN=*), PARAMETER :: routineN = 'distribute_tasks'
1903 :
1904 : INTEGER :: handle, igrid_level, irank, ntasks_recv
1905 : INTEGER(KIND=int_8) :: load_gap, max_load, replicated_load
1906 17418 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: total_loads, total_loads_tmp, trial_loads
1907 17418 : INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: loads
1908 17418 : INTEGER, ALLOCATABLE, DIMENSION(:) :: indices, real2virtual, total_index
1909 : LOGICAL :: distributed_grids, fixed_first_grid
1910 : TYPE(realspace_grid_desc_type), POINTER :: desc
1911 17418 : TYPE(task_type), DIMENSION(:), POINTER :: tasks_recv
1912 :
1913 17418 : CALL timeset(routineN, handle)
1914 :
1915 17418 : CPASSERT(ASSOCIATED(tasks))
1916 :
1917 : ! *** figure out if we have distributed grids
1918 17418 : distributed_grids = .FALSE.
1919 86538 : DO igrid_level = 1, SIZE(rs_descs)
1920 86538 : IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1921 54 : distributed_grids = .TRUE.
1922 : END IF
1923 : END DO
1924 17418 : desc => rs_descs(1)%rs_desc
1925 :
1926 17418 : IF (distributed_grids) THEN
1927 :
1928 192 : ALLOCATE (loads(0:desc%group_size - 1, SIZE(rs_descs)))
1929 144 : ALLOCATE (total_loads(0:desc%group_size - 1))
1930 :
1931 144 : total_loads = 0
1932 :
1933 : ! First round of balancing on the distributed grids
1934 : ! we just balance each of the distributed grids independently
1935 234 : DO igrid_level = 1, SIZE(rs_descs)
1936 234 : IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1937 :
1938 54 : IF (.NOT. skip_load_balance_distributed) &
1939 54 : CALL load_balance_distributed(tasks, ntasks, rs_descs, igrid_level, natoms)
1940 :
1941 : CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
1942 54 : tasks, use_reordered_ranks=.FALSE.)
1943 :
1944 162 : total_loads(:) = total_loads + loads(:, igrid_level)
1945 :
1946 : END IF
1947 : END DO
1948 :
1949 : ! calculate the total load of replicated tasks, so we can decide if it is worth
1950 : ! reordering the distributed grid levels
1951 :
1952 48 : replicated_load = 0
1953 234 : DO igrid_level = 1, SIZE(rs_descs)
1954 234 : IF (.NOT. rs_descs(igrid_level)%rs_desc%distributed) THEN
1955 : CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
1956 132 : tasks, use_reordered_ranks=.FALSE.)
1957 396 : replicated_load = replicated_load + SUM(loads(:, igrid_level))
1958 : END IF
1959 : END DO
1960 :
1961 : !IF (desc%my_pos==0) THEN
1962 : ! WRITE(*,*) "Total replicated load is ",replicated_load
1963 : !END IF
1964 :
1965 : ! Now we adjust the rank ordering based on the current loads
1966 : ! we leave the first distributed level and all the replicated levels in the default order
1967 48 : IF (reorder_rs_grid_ranks) THEN
1968 48 : fixed_first_grid = .FALSE.
1969 234 : DO igrid_level = 1, SIZE(rs_descs)
1970 234 : IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1971 54 : IF (fixed_first_grid .EQV. .FALSE.) THEN
1972 144 : total_loads(:) = loads(:, igrid_level)
1973 : fixed_first_grid = .TRUE.
1974 : ELSE
1975 18 : ALLOCATE (trial_loads(0:desc%group_size - 1))
1976 :
1977 18 : trial_loads(:) = total_loads + loads(:, igrid_level)
1978 18 : max_load = MAXVAL(trial_loads)
1979 : load_gap = 0
1980 18 : DO irank = 0, desc%group_size - 1
1981 18 : load_gap = load_gap + max_load - trial_loads(irank)
1982 : END DO
1983 :
1984 : ! If there is not enough replicated load to load balance well enough
1985 : ! then we will reorder this grid level
1986 6 : IF (load_gap > replicated_load*1.05_dp) THEN
1987 :
1988 18 : ALLOCATE (indices(0:desc%group_size - 1))
1989 12 : ALLOCATE (total_index(0:desc%group_size - 1))
1990 12 : ALLOCATE (total_loads_tmp(0:desc%group_size - 1))
1991 12 : ALLOCATE (real2virtual(0:desc%group_size - 1))
1992 :
1993 18 : total_loads_tmp(:) = total_loads
1994 6 : CALL sort(total_loads_tmp, desc%group_size, total_index)
1995 6 : CALL sort(loads(:, igrid_level), desc%group_size, indices)
1996 :
1997 : ! Reorder so that the rank with smallest load on this grid level is paired with
1998 : ! the highest load in total
1999 18 : DO irank = 0, desc%group_size - 1
2000 : total_loads(total_index(irank) - 1) = total_loads(total_index(irank) - 1) + &
2001 12 : loads(desc%group_size - irank - 1, igrid_level)
2002 18 : real2virtual(total_index(irank) - 1) = indices(desc%group_size - irank - 1) - 1
2003 : END DO
2004 :
2005 6 : CALL rs_grid_reorder_ranks(rs_descs(igrid_level)%rs_desc, real2virtual)
2006 :
2007 6 : DEALLOCATE (indices)
2008 6 : DEALLOCATE (total_index)
2009 6 : DEALLOCATE (total_loads_tmp)
2010 6 : DEALLOCATE (real2virtual)
2011 : ELSE
2012 0 : total_loads(:) = trial_loads
2013 : END IF
2014 :
2015 6 : DEALLOCATE (trial_loads)
2016 :
2017 : END IF
2018 : END IF
2019 : END DO
2020 : END IF
2021 :
2022 : ! Now we use the replicated tasks to balance out the rest of the load
2023 48 : CALL load_balance_replicated(rs_descs, ntasks, tasks)
2024 :
2025 : !total_loads = 0
2026 : !DO igrid_level=1,SIZE(rs_descs)
2027 : ! CALL get_current_loads(loads(:,igrid_level), rs_descs, igrid_level, ntasks, &
2028 : ! tasks, use_reordered_ranks=.TRUE.)
2029 : ! total_loads = total_loads + loads(:, igrid_level)
2030 : !END DO
2031 :
2032 : !IF (desc%my_pos==0) THEN
2033 : ! WRITE(*,*) ""
2034 : ! WRITE(*,*) "At the end of the load balancing procedure"
2035 : ! WRITE(*,*) "Maximum load:",MAXVAL(total_loads)
2036 : ! WRITE(*,*) "Average load:",SUM(total_loads)/SIZE(total_loads)
2037 : ! WRITE(*,*) "Minimum load:",MINVAL(total_loads)
2038 : !ENDIF
2039 :
2040 : ! given a list of tasks, this will do the needed reshuffle so that all tasks will be local
2041 48 : CALL create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
2042 :
2043 : !
2044 : ! tasks list are complete, we can compute the list of atomic blocks (atom pairs)
2045 : ! we will be sending. These lists are needed for redistribute_matrix.
2046 : !
2047 48 : CALL get_atom_pair(atom_pair_send, tasks, ntasks=ntasks, send=.TRUE., symmetric=symmetric, rs_descs=rs_descs)
2048 :
2049 : ! natom_send=SIZE(atom_pair_send)
2050 : ! CALL desc%group%sum(natom_send)
2051 : ! IF (desc%my_pos==0) THEN
2052 : ! WRITE(*,*) ""
2053 : ! WRITE(*,*) "Total number of atomic blocks to be send:",natom_send
2054 : ! ENDIF
2055 :
2056 48 : CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.FALSE., symmetric=symmetric, rs_descs=rs_descs)
2057 :
2058 : ! cleanup, at this point we don't need the original tasks anymore
2059 48 : DEALLOCATE (tasks)
2060 48 : DEALLOCATE (loads)
2061 48 : DEALLOCATE (total_loads)
2062 :
2063 : ELSE
2064 17370 : tasks_recv => tasks
2065 17370 : ntasks_recv = ntasks
2066 17370 : CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.FALSE., symmetric=symmetric, rs_descs=rs_descs)
2067 : ! not distributed, hence atom_pair_send not needed
2068 : END IF
2069 :
2070 : ! here we sort the task list we will process locally.
2071 51962 : ALLOCATE (indices(ntasks_recv))
2072 17418 : CALL tasks_sort(tasks_recv, ntasks_recv, indices)
2073 17418 : DEALLOCATE (indices)
2074 :
2075 : !
2076 : ! final lists are ready
2077 : !
2078 :
2079 17418 : tasks => tasks_recv
2080 17418 : ntasks = ntasks_recv
2081 :
2082 17418 : CALL timestop(handle)
2083 :
2084 34836 : END SUBROUTINE distribute_tasks
2085 :
2086 : ! **************************************************************************************************
2087 : !> \brief ...
2088 : !> \param atom_pair ...
2089 : !> \param my_tasks ...
2090 : !> \param send ...
2091 : !> \param symmetric ...
2092 : !> \param natoms ...
2093 : !> \param nimages ...
2094 : !> \param rs_descs ...
2095 : ! **************************************************************************************************
2096 17466 : SUBROUTINE get_atom_pair(atom_pair, tasks, ntasks, send, symmetric, rs_descs)
2097 :
2098 : TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair
2099 : TYPE(task_type), DIMENSION(:), INTENT(INOUT) :: tasks
2100 : INTEGER, INTENT(IN) :: ntasks
2101 : LOGICAL, INTENT(IN) :: send, symmetric
2102 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), INTENT(IN) :: rs_descs
2103 :
2104 : INTEGER :: i, ilevel, iatom, jatom, npairs, virt_rank
2105 17466 : INTEGER, DIMENSION(:), ALLOCATABLE :: indices
2106 17466 : TYPE(atom_pair_type), DIMENSION(:), ALLOCATABLE :: atom_pair_tmp
2107 :
2108 17466 : CPASSERT(.NOT. ASSOCIATED(atom_pair))
2109 17466 : IF (ntasks == 0) THEN
2110 299 : ALLOCATE (atom_pair(0))
2111 : RETURN
2112 : END IF
2113 :
2114 : ! calculate list of atom pairs
2115 : ! fill pair list taking into account symmetry
2116 8650065 : ALLOCATE (atom_pair_tmp(ntasks))
2117 8615731 : DO i = 1, ntasks
2118 8598564 : atom_pair_tmp(i)%image = tasks(i)%image
2119 8598564 : iatom = tasks(i)%iatom
2120 8598564 : jatom = tasks(i)%jatom
2121 8598564 : IF (symmetric .AND. iatom > jatom) THEN
2122 : ! iatom / jatom swapped
2123 3272316 : atom_pair_tmp(i)%row = jatom
2124 3272316 : atom_pair_tmp(i)%col = iatom
2125 : ELSE
2126 5326248 : atom_pair_tmp(i)%row = iatom
2127 5326248 : atom_pair_tmp(i)%col = jatom
2128 : END IF
2129 :
2130 8615731 : IF (send) THEN
2131 : ! If sending, we need to use the 'real rank' as the pair has to be sent to the process which
2132 : ! actually has the correct part of the rs_grid to do the mapping
2133 3618 : ilevel = tasks(i)%grid_level
2134 3618 : virt_rank = decode_rank(tasks(i)%destination, SIZE(rs_descs))
2135 3618 : atom_pair_tmp(i)%rank = rs_descs(ilevel)%rs_desc%virtual2real(virt_rank)
2136 : ELSE
2137 : ! If we are receiving, then no conversion is needed as the rank is that of the process with the
2138 : ! required matrix block, and the ordering of the rs grid is irrelevant
2139 8594946 : atom_pair_tmp(i)%rank = decode_rank(tasks(i)%source, SIZE(rs_descs))
2140 : END IF
2141 : END DO
2142 :
2143 : ! find unique atom pairs that I'm sending/receiving
2144 51501 : ALLOCATE (indices(ntasks))
2145 17167 : CALL atom_pair_sort(atom_pair_tmp, ntasks, indices)
2146 17167 : npairs = 1
2147 17167 : tasks(indices(1))%pair_index = 1
2148 8598564 : DO i = 2, ntasks
2149 8581397 : IF (atom_pair_less_than(atom_pair_tmp(i - 1), atom_pair_tmp(i))) THEN
2150 365224 : npairs = npairs + 1
2151 365224 : atom_pair_tmp(npairs) = atom_pair_tmp(i)
2152 : END IF
2153 8598564 : tasks(indices(i))%pair_index = npairs
2154 : END DO
2155 17167 : DEALLOCATE (indices)
2156 :
2157 : ! Copy unique pairs to final location.
2158 433892 : ALLOCATE (atom_pair(npairs))
2159 399558 : atom_pair(:) = atom_pair_tmp(:npairs)
2160 17167 : DEALLOCATE (atom_pair_tmp)
2161 :
2162 : END SUBROUTINE get_atom_pair
2163 :
2164 : ! **************************************************************************************************
2165 : !> \brief redistributes the matrix so that it can be used in realspace operations
2166 : !> i.e. according to the task lists for collocate and integrate.
2167 : !> This routine can become a bottleneck in large calculations.
2168 : !> \param rs_descs ...
2169 : !> \param pmats ...
2170 : !> \param atom_pair_send ...
2171 : !> \param atom_pair_recv ...
2172 : !> \param natoms ...
2173 : !> \param nimages ...
2174 : !> \param scatter ...
2175 : !> \param hmats ...
2176 : ! **************************************************************************************************
2177 0 : SUBROUTINE rs_distribute_matrix(rs_descs, pmats, atom_pair_send, atom_pair_recv, &
2178 : nimages, scatter, hmats)
2179 :
2180 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
2181 : POINTER :: rs_descs
2182 : TYPE(dbcsr_p_type), DIMENSION(:), POINTER :: pmats
2183 : TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair_send, atom_pair_recv
2184 : INTEGER :: nimages
2185 : LOGICAL :: scatter
2186 : TYPE(dbcsr_p_type), DIMENSION(:), OPTIONAL, &
2187 : POINTER :: hmats
2188 :
2189 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_distribute_matrix'
2190 :
2191 : INTEGER :: acol, arow, handle, i, img, j, k, l, me, &
2192 : nblkcols_total, nblkrows_total, ncol, &
2193 : nrow, nthread, nthread_left
2194 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: first_col, first_row, last_col, last_row, recv_disps, &
2195 0 : recv_pair_count, recv_pair_disps, recv_sizes, send_disps, send_pair_count, &
2196 0 : send_pair_disps, send_sizes
2197 0 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, row_blk_size
2198 : LOGICAL :: found
2199 0 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET :: recv_buf_r, send_buf_r
2200 0 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: h_block, p_block
2201 : TYPE(dbcsr_type), POINTER :: hmat, pmat
2202 : TYPE(realspace_grid_desc_type), POINTER :: desc
2203 0 : REAL(kind=dp), DIMENSION(:), POINTER :: vector
2204 :
2205 0 : !$ INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
2206 :
2207 0 : CALL timeset(routineN, handle)
2208 :
2209 0 : IF (.NOT. scatter) THEN
2210 0 : CPASSERT(PRESENT(hmats))
2211 : END IF
2212 :
2213 0 : desc => rs_descs(1)%rs_desc
2214 0 : me = desc%my_pos + 1
2215 :
2216 : ! allocate local arrays
2217 0 : ALLOCATE (send_sizes(desc%group_size))
2218 0 : ALLOCATE (recv_sizes(desc%group_size))
2219 0 : ALLOCATE (send_disps(desc%group_size))
2220 0 : ALLOCATE (recv_disps(desc%group_size))
2221 0 : ALLOCATE (send_pair_count(desc%group_size))
2222 0 : ALLOCATE (recv_pair_count(desc%group_size))
2223 0 : ALLOCATE (send_pair_disps(desc%group_size))
2224 0 : ALLOCATE (recv_pair_disps(desc%group_size))
2225 :
2226 0 : pmat => pmats(1)%matrix
2227 : CALL dbcsr_get_info(pmat, &
2228 : row_blk_size=row_blk_size, &
2229 : col_blk_size=col_blk_size, &
2230 : nblkrows_total=nblkrows_total, &
2231 0 : nblkcols_total=nblkcols_total)
2232 0 : ALLOCATE (first_row(nblkrows_total), last_row(nblkrows_total), &
2233 0 : first_col(nblkcols_total), last_col(nblkcols_total))
2234 0 : CALL dbcsr_convert_sizes_to_offsets(row_blk_size, first_row, last_row)
2235 0 : CALL dbcsr_convert_sizes_to_offsets(col_blk_size, first_col, last_col)
2236 :
2237 : ! set up send buffer sizes
2238 0 : send_sizes = 0
2239 0 : send_pair_count = 0
2240 0 : DO i = 1, SIZE(atom_pair_send)
2241 0 : k = atom_pair_send(i)%rank + 1 ! proc we're sending this block to
2242 0 : arow = atom_pair_send(i)%row
2243 0 : acol = atom_pair_send(i)%col
2244 0 : nrow = last_row(arow) - first_row(arow) + 1
2245 0 : ncol = last_col(acol) - first_col(acol) + 1
2246 0 : send_sizes(k) = send_sizes(k) + nrow*ncol
2247 0 : send_pair_count(k) = send_pair_count(k) + 1
2248 : END DO
2249 :
2250 0 : send_disps = 0
2251 0 : send_pair_disps = 0
2252 0 : DO i = 2, desc%group_size
2253 0 : send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
2254 0 : send_pair_disps(i) = send_pair_disps(i - 1) + send_pair_count(i - 1)
2255 : END DO
2256 :
2257 0 : ALLOCATE (send_buf_r(SUM(send_sizes)))
2258 :
2259 : ! set up recv buffer
2260 :
2261 0 : recv_sizes = 0
2262 0 : recv_pair_count = 0
2263 0 : DO i = 1, SIZE(atom_pair_recv)
2264 0 : k = atom_pair_recv(i)%rank + 1 ! proc we're receiving this data from
2265 0 : arow = atom_pair_recv(i)%row
2266 0 : acol = atom_pair_recv(i)%col
2267 0 : nrow = last_row(arow) - first_row(arow) + 1
2268 0 : ncol = last_col(acol) - first_col(acol) + 1
2269 0 : recv_sizes(k) = recv_sizes(k) + nrow*ncol
2270 0 : recv_pair_count(k) = recv_pair_count(k) + 1
2271 : END DO
2272 :
2273 0 : recv_disps = 0
2274 0 : recv_pair_disps = 0
2275 0 : DO i = 2, desc%group_size
2276 0 : recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
2277 0 : recv_pair_disps(i) = recv_pair_disps(i - 1) + recv_pair_count(i - 1)
2278 : END DO
2279 0 : ALLOCATE (recv_buf_r(SUM(recv_sizes)))
2280 :
2281 : !$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
2282 : !$OMP SHARED(desc,send_pair_count,send_pair_disps,nimages),&
2283 : !$OMP SHARED(last_row,first_row,last_col,first_col),&
2284 : !$OMP SHARED(pmats,send_buf_r,send_disps,send_sizes),&
2285 : !$OMP SHARED(atom_pair_send,me,hmats,nblkrows_total),&
2286 : !$OMP SHARED(atom_pair_recv,recv_buf_r,scatter,recv_pair_disps), &
2287 : !$OMP SHARED(recv_sizes,recv_disps,recv_pair_count,locks), &
2288 : !$OMP PRIVATE(i,img,arow,acol,nrow,ncol,p_block,found,j,k,l),&
2289 0 : !$OMP PRIVATE(nthread,h_block,nthread_left,hmat,pmat,vector)
2290 :
2291 : nthread = 1
2292 : !$ nthread = omp_get_num_threads()
2293 : nthread_left = 1
2294 : !$ nthread_left = MAX(1, nthread - 1)
2295 :
2296 : ! do packing
2297 : !$OMP DO schedule(guided)
2298 : DO l = 1, desc%group_size
2299 : IF (l == me) CYCLE
2300 : send_sizes(l) = 0
2301 : DO i = 1, send_pair_count(l)
2302 : arow = atom_pair_send(send_pair_disps(l) + i)%row
2303 : acol = atom_pair_send(send_pair_disps(l) + i)%col
2304 : img = atom_pair_send(send_pair_disps(l) + i)%image
2305 : nrow = last_row(arow) - first_row(arow) + 1
2306 : ncol = last_col(acol) - first_col(acol) + 1
2307 : pmat => pmats(img)%matrix
2308 : CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
2309 : CPASSERT(found)
2310 :
2311 : DO k = 1, ncol
2312 : DO j = 1, nrow
2313 : send_buf_r(send_disps(l) + send_sizes(l) + j + (k - 1)*nrow) = p_block(j, k)
2314 : END DO
2315 : END DO
2316 : send_sizes(l) = send_sizes(l) + nrow*ncol
2317 : END DO
2318 : END DO
2319 : !$OMP END DO
2320 :
2321 : IF (.NOT. scatter) THEN
2322 : ! We need locks to protect concurrent summation into H
2323 : !$OMP SINGLE
2324 : !$ ALLOCATE (locks(nthread*10))
2325 : !$OMP END SINGLE
2326 :
2327 : !$OMP DO
2328 : !$ do i = 1, nthread*10
2329 : !$ call omp_init_lock(locks(i))
2330 : !$ end do
2331 : !$OMP END DO
2332 : END IF
2333 :
2334 : !$OMP MASTER
2335 : ! do communication
2336 : CALL desc%group%alltoall(send_buf_r, send_sizes, send_disps, &
2337 : recv_buf_r, recv_sizes, recv_disps)
2338 : !$OMP END MASTER
2339 :
2340 : ! If this is a scatter, then no need to copy local blocks,
2341 : ! If not, we sum them directly into H (bypassing the alltoall)
2342 : IF (.NOT. scatter) THEN
2343 :
2344 : ! Distribute work over remaining threads assuming one is still in the alltoall
2345 : !$OMP DO schedule(dynamic,MAX(1,send_pair_count(me)/nthread_left))
2346 : DO i = 1, send_pair_count(me)
2347 : arow = atom_pair_send(send_pair_disps(me) + i)%row
2348 : acol = atom_pair_send(send_pair_disps(me) + i)%col
2349 : img = atom_pair_send(send_pair_disps(me) + i)%image
2350 : nrow = last_row(arow) - first_row(arow) + 1
2351 : ncol = last_col(acol) - first_col(acol) + 1
2352 : hmat => hmats(img)%matrix
2353 : pmat => pmats(img)%matrix
2354 : CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, BLOCK=h_block, found=found)
2355 : CPASSERT(found)
2356 : CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, BLOCK=p_block, found=found)
2357 : CPASSERT(found)
2358 :
2359 : !$ call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2360 : DO k = 1, ncol
2361 : DO j = 1, nrow
2362 : h_block(j, k) = h_block(j, k) + p_block(j, k)
2363 : END DO
2364 : END DO
2365 : !$ call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2366 : END DO
2367 : !$OMP END DO
2368 : ELSE
2369 : ! We will insert new blocks into P, so create mutable work matrices
2370 : DO img = 1, nimages
2371 : pmat => pmats(img)%matrix
2372 : CALL dbcsr_work_create(pmat, work_mutable=.TRUE., &
2373 : nblks_guess=SIZE(atom_pair_recv)/nthread, sizedata_guess=SIZE(recv_buf_r)/nthread, &
2374 : n=nthread)
2375 : END DO
2376 : END IF
2377 :
2378 : ! wait for comm and setup to finish
2379 : !$OMP BARRIER
2380 :
2381 : !do unpacking
2382 : !$OMP DO schedule(guided)
2383 : DO l = 1, desc%group_size
2384 : IF (l == me) CYCLE
2385 : recv_sizes(l) = 0
2386 : DO i = 1, recv_pair_count(l)
2387 : arow = atom_pair_recv(recv_pair_disps(l) + i)%row
2388 : acol = atom_pair_recv(recv_pair_disps(l) + i)%col
2389 : img = atom_pair_recv(recv_pair_disps(l) + i)%image
2390 : nrow = last_row(arow) - first_row(arow) + 1
2391 : ncol = last_col(acol) - first_col(acol) + 1
2392 : pmat => pmats(img)%matrix
2393 : NULLIFY (p_block)
2394 : CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, BLOCK=p_block, found=found)
2395 :
2396 : IF (PRESENT(hmats)) THEN
2397 : hmat => hmats(img)%matrix
2398 : CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, BLOCK=h_block, found=found)
2399 : CPASSERT(found)
2400 : END IF
2401 :
2402 : IF (scatter .AND. .NOT. ASSOCIATED(p_block)) THEN
2403 : vector => recv_buf_r(recv_disps(l) + recv_sizes(l) + 1:recv_disps(l) + recv_sizes(l) + nrow*ncol)
2404 : CALL dbcsr_put_block(pmat, arow, acol, block=RESHAPE(vector, [nrow, ncol]))
2405 : END IF
2406 : IF (.NOT. scatter) THEN
2407 : !$ call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2408 : DO k = 1, ncol
2409 : DO j = 1, nrow
2410 : h_block(j, k) = h_block(j, k) + recv_buf_r(recv_disps(l) + recv_sizes(l) + j + (k - 1)*nrow)
2411 : END DO
2412 : END DO
2413 : !$ call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2414 : END IF
2415 : recv_sizes(l) = recv_sizes(l) + nrow*ncol
2416 : END DO
2417 : END DO
2418 : !$OMP END DO
2419 :
2420 : !$ IF (.not. scatter) THEN
2421 : !$OMP DO
2422 : !$ do i = 1, nthread*10
2423 : !$ call omp_destroy_lock(locks(i))
2424 : !$ end do
2425 : !$OMP END DO
2426 : !$ END IF
2427 :
2428 : !$OMP SINGLE
2429 : !$ IF (.not. scatter) THEN
2430 : !$ DEALLOCATE (locks)
2431 : !$ END IF
2432 : !$OMP END SINGLE NOWAIT
2433 :
2434 : IF (scatter) THEN
2435 : ! Blocks were added to P
2436 : DO img = 1, nimages
2437 : pmat => pmats(img)%matrix
2438 : CALL dbcsr_finalize(pmat)
2439 : END DO
2440 : END IF
2441 : !$OMP END PARALLEL
2442 :
2443 0 : DEALLOCATE (send_buf_r)
2444 0 : DEALLOCATE (recv_buf_r)
2445 :
2446 0 : DEALLOCATE (send_sizes)
2447 0 : DEALLOCATE (recv_sizes)
2448 0 : DEALLOCATE (send_disps)
2449 0 : DEALLOCATE (recv_disps)
2450 0 : DEALLOCATE (send_pair_count)
2451 0 : DEALLOCATE (recv_pair_count)
2452 0 : DEALLOCATE (send_pair_disps)
2453 0 : DEALLOCATE (recv_pair_disps)
2454 :
2455 0 : DEALLOCATE (first_row, last_row, first_col, last_col)
2456 :
2457 0 : CALL timestop(handle)
2458 :
2459 0 : END SUBROUTINE rs_distribute_matrix
2460 :
2461 : ! **************************************************************************************************
2462 : !> \brief Calculates offsets and sizes for rs_scatter_matrix and rs_copy_matrix.
2463 : !> \author Ole Schuett
2464 : ! **************************************************************************************************
2465 15072 : SUBROUTINE rs_calc_offsets(pairs, nsgf, group_size, &
2466 : pair_offsets, rank_offsets, rank_sizes, buffer_size)
2467 : TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: pairs
2468 : INTEGER, DIMENSION(:), INTENT(IN) :: nsgf
2469 : INTEGER, INTENT(IN) :: group_size
2470 : INTEGER, DIMENSION(:), POINTER :: pair_offsets, rank_offsets, rank_sizes
2471 : INTEGER, INTENT(INOUT) :: buffer_size
2472 :
2473 : INTEGER :: acol, arow, i, block_size, total_size, k, prev_k
2474 :
2475 15072 : IF (ASSOCIATED(pair_offsets)) DEALLOCATE (pair_offsets)
2476 15072 : IF (ASSOCIATED(rank_offsets)) DEALLOCATE (rank_offsets)
2477 15072 : IF (ASSOCIATED(rank_sizes)) DEALLOCATE (rank_sizes)
2478 :
2479 : ! calculate buffer_size and pair_offsets
2480 44968 : ALLOCATE (pair_offsets(SIZE(pairs)))
2481 15072 : total_size = 0
2482 386045 : DO i = 1, SIZE(pairs)
2483 370973 : pair_offsets(i) = total_size
2484 370973 : arow = pairs(i)%row
2485 370973 : acol = pairs(i)%col
2486 370973 : block_size = nsgf(arow)*nsgf(acol)
2487 386045 : total_size = total_size + block_size
2488 : END DO
2489 15072 : buffer_size = total_size
2490 :
2491 : ! calculate rank_offsets and rank_sizes
2492 45216 : ALLOCATE (rank_offsets(group_size))
2493 30144 : ALLOCATE (rank_sizes(group_size))
2494 43992 : rank_offsets = 0
2495 43992 : rank_sizes = 0
2496 15072 : IF (SIZE(pairs) > 0) THEN
2497 14824 : prev_k = pairs(1)%rank + 1
2498 385797 : DO i = 1, SIZE(pairs)
2499 370973 : k = pairs(i)%rank + 1
2500 370973 : CPASSERT(k >= prev_k) ! expecting the pairs to be ordered by rank
2501 385797 : IF (k > prev_k) THEN
2502 61 : rank_offsets(k) = pair_offsets(i)
2503 61 : rank_sizes(prev_k) = rank_offsets(k) - rank_offsets(prev_k)
2504 61 : prev_k = k
2505 : END IF
2506 : END DO
2507 14824 : rank_sizes(k) = buffer_size - rank_offsets(k) ! complete last rank
2508 : END IF
2509 :
2510 15072 : END SUBROUTINE rs_calc_offsets
2511 :
2512 : ! **************************************************************************************************
2513 : !> \brief Scatters dbcsr matrix blocks and receives them into a buffer as needed before collocation.
2514 : !> \author Ole Schuett
2515 : ! **************************************************************************************************
2516 266 : SUBROUTINE rs_scatter_matrices(src_matrices, dest_buffer, task_list, group)
2517 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2518 : TYPE(offload_buffer_type), INTENT(INOUT) :: dest_buffer
2519 : TYPE(task_list_type), INTENT(IN) :: task_list
2520 : TYPE(mp_comm_type), INTENT(IN) :: group
2521 :
2522 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_scatter_matrices'
2523 :
2524 : INTEGER :: handle
2525 : REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: buffer_send
2526 :
2527 266 : CALL timeset(routineN, handle)
2528 777 : ALLOCATE (buffer_send(task_list%buffer_size_send))
2529 :
2530 : ! pack dbcsr blocks into send buffer
2531 266 : CPASSERT(ASSOCIATED(task_list%atom_pair_send))
2532 : CALL rs_pack_buffer(src_matrices=src_matrices, &
2533 : dest_buffer=buffer_send, &
2534 : atom_pair=task_list%atom_pair_send, &
2535 266 : pair_offsets=task_list%pair_offsets_send)
2536 :
2537 : ! mpi all-to-all communication, receiving directly into blocks_recv%buffer.
2538 : CALL group%alltoall(buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send, &
2539 : dest_buffer%host_buffer, &
2540 2394 : task_list%rank_sizes_recv, task_list%rank_offsets_recv)
2541 :
2542 266 : DEALLOCATE (buffer_send)
2543 266 : CALL timestop(handle)
2544 :
2545 266 : END SUBROUTINE rs_scatter_matrices
2546 :
2547 : ! **************************************************************************************************
2548 : !> \brief Gather the dbcsr matrix blocks and receives them into a buffer as needed after integration.
2549 : !> \author Ole Schuett
2550 : ! **************************************************************************************************
2551 218 : SUBROUTINE rs_gather_matrices(src_buffer, dest_matrices, task_list, group)
2552 : TYPE(offload_buffer_type), INTENT(IN) :: src_buffer
2553 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2554 : TYPE(task_list_type), INTENT(IN) :: task_list
2555 : TYPE(mp_comm_type), INTENT(IN) :: group
2556 :
2557 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_gather_matrices'
2558 :
2559 : INTEGER :: handle
2560 : REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: buffer_send
2561 :
2562 218 : CALL timeset(routineN, handle)
2563 :
2564 : ! Caution: The meaning of send and recv are reversed in this routine.
2565 640 : ALLOCATE (buffer_send(task_list%buffer_size_send)) ! e.g. this is actually used for receiving
2566 :
2567 : ! mpi all-to-all communication
2568 : CALL group%alltoall(src_buffer%host_buffer, task_list%rank_sizes_recv, task_list%rank_offsets_recv, &
2569 1962 : buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send)
2570 :
2571 : ! unpack dbcsr blocks from send buffer
2572 218 : CPASSERT(ASSOCIATED(task_list%atom_pair_send))
2573 : CALL rs_unpack_buffer(src_buffer=buffer_send, &
2574 : dest_matrices=dest_matrices, &
2575 : atom_pair=task_list%atom_pair_send, &
2576 218 : pair_offsets=task_list%pair_offsets_send)
2577 :
2578 218 : DEALLOCATE (buffer_send)
2579 218 : CALL timestop(handle)
2580 :
2581 218 : END SUBROUTINE rs_gather_matrices
2582 :
2583 : ! **************************************************************************************************
2584 : !> \brief Copies the DBCSR blocks into buffer, replaces rs_scatter_matrix for non-distributed grids.
2585 : !> \author Ole Schuett
2586 : ! **************************************************************************************************
2587 245735 : SUBROUTINE rs_copy_to_buffer(src_matrices, dest_buffer, task_list)
2588 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2589 : TYPE(offload_buffer_type), INTENT(INOUT) :: dest_buffer
2590 : TYPE(task_list_type), INTENT(IN) :: task_list
2591 :
2592 : CALL rs_pack_buffer(src_matrices=src_matrices, &
2593 : dest_buffer=dest_buffer%host_buffer, &
2594 : atom_pair=task_list%atom_pair_recv, &
2595 245735 : pair_offsets=task_list%pair_offsets_recv)
2596 :
2597 245735 : END SUBROUTINE rs_copy_to_buffer
2598 :
2599 : ! **************************************************************************************************
2600 : !> \brief Copies from buffer into DBCSR matrics, replaces rs_gather_matrix for non-distributed grids.
2601 : !> \author Ole Schuett
2602 : ! **************************************************************************************************
2603 201630 : SUBROUTINE rs_copy_to_matrices(src_buffer, dest_matrices, task_list)
2604 : TYPE(offload_buffer_type), INTENT(IN) :: src_buffer
2605 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2606 : TYPE(task_list_type), INTENT(IN) :: task_list
2607 :
2608 : CALL rs_unpack_buffer(src_buffer=src_buffer%host_buffer, &
2609 : dest_matrices=dest_matrices, &
2610 : atom_pair=task_list%atom_pair_recv, &
2611 201630 : pair_offsets=task_list%pair_offsets_recv)
2612 :
2613 201630 : END SUBROUTINE rs_copy_to_matrices
2614 :
2615 : ! **************************************************************************************************
2616 : !> \brief Helper routine for rs_scatter_matrix and rs_copy_to_buffer.
2617 : !> \author Ole Schuett
2618 : ! **************************************************************************************************
2619 246001 : SUBROUTINE rs_pack_buffer(src_matrices, dest_buffer, atom_pair, pair_offsets)
2620 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2621 : REAL(KIND=dp), DIMENSION(:), INTENT(INOUT) :: dest_buffer
2622 : TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: atom_pair
2623 : INTEGER, DIMENSION(:), INTENT(IN) :: pair_offsets
2624 :
2625 : INTEGER :: acol, arow, img, i, offset, block_size
2626 : LOGICAL :: found
2627 246001 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: block
2628 :
2629 : !$OMP PARALLEL DEFAULT(NONE), &
2630 : !$OMP SHARED(src_matrices,atom_pair,pair_offsets,dest_buffer), &
2631 246001 : !$OMP PRIVATE(acol,arow,img,i,offset,block_size,found,block)
2632 : !$OMP DO schedule(guided)
2633 : DO i = 1, SIZE(atom_pair)
2634 : arow = atom_pair(i)%row
2635 : acol = atom_pair(i)%col
2636 : img = atom_pair(i)%image
2637 : CALL dbcsr_get_block_p(matrix=src_matrices(img)%matrix, row=arow, col=acol, &
2638 : block=block, found=found)
2639 : CPASSERT(found)
2640 : block_size = SIZE(block)
2641 : offset = pair_offsets(i)
2642 : dest_buffer(offset + 1:offset + block_size) = RESHAPE(block, shape=[block_size])
2643 : END DO
2644 : !$OMP END DO
2645 : !$OMP END PARALLEL
2646 :
2647 246001 : END SUBROUTINE rs_pack_buffer
2648 :
2649 : ! **************************************************************************************************
2650 : !> \brief Helper routine for rs_gather_matrix and rs_copy_to_matrices.
2651 : !> \author Ole Schuett
2652 : ! **************************************************************************************************
2653 201848 : SUBROUTINE rs_unpack_buffer(src_buffer, dest_matrices, atom_pair, pair_offsets)
2654 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: src_buffer
2655 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2656 : TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: atom_pair
2657 : INTEGER, DIMENSION(:), INTENT(IN) :: pair_offsets
2658 :
2659 : INTEGER :: acol, arow, img, i, offset, &
2660 : nrows, ncols, lock_num
2661 : LOGICAL :: found
2662 201848 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: block
2663 201848 : INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
2664 :
2665 : ! initialize locks
2666 605544 : ALLOCATE (locks(10*omp_get_max_threads()))
2667 2220328 : DO i = 1, SIZE(locks)
2668 2220328 : CALL omp_init_lock(locks(i))
2669 : END DO
2670 :
2671 : !$OMP PARALLEL DEFAULT(NONE), &
2672 : !$OMP SHARED(src_buffer,atom_pair,pair_offsets,dest_matrices,locks), &
2673 201848 : !$OMP PRIVATE(acol,arow,img,i,offset,nrows,ncols,lock_num,found,block)
2674 : !$OMP DO schedule(guided)
2675 : DO i = 1, SIZE(atom_pair)
2676 : arow = atom_pair(i)%row
2677 : acol = atom_pair(i)%col
2678 : img = atom_pair(i)%image
2679 : CALL dbcsr_get_block_p(matrix=dest_matrices(img)%matrix, row=arow, col=acol, &
2680 : block=block, found=found)
2681 : CPASSERT(found)
2682 : nrows = SIZE(block, 1)
2683 : ncols = SIZE(block, 2)
2684 : offset = pair_offsets(i)
2685 : lock_num = MODULO(arow, SIZE(locks)) + 1 ! map matrix rows round-robin to available locks
2686 :
2687 : CALL omp_set_lock(locks(lock_num))
2688 : block = block + RESHAPE(src_buffer(offset + 1:offset + nrows*ncols), shape=[nrows, ncols])
2689 : CALL omp_unset_lock(locks(lock_num))
2690 : END DO
2691 : !$OMP END DO
2692 : !$OMP END PARALLEL
2693 :
2694 : ! destroy locks
2695 2220328 : DO i = 1, SIZE(locks)
2696 2220328 : CALL omp_destroy_lock(locks(i))
2697 : END DO
2698 201848 : DEALLOCATE (locks)
2699 :
2700 201848 : END SUBROUTINE rs_unpack_buffer
2701 :
2702 : ! **************************************************************************************************
2703 : !> \brief determines the rank of the processor who's real rs grid contains point
2704 : !> \param rs_desc ...
2705 : !> \param igrid_level ...
2706 : !> \param n_levels ...
2707 : !> \param cube_center ...
2708 : !> \param ntasks ...
2709 : !> \param tasks ...
2710 : !> \param lb_cube ...
2711 : !> \param ub_cube ...
2712 : !> \param added_tasks ...
2713 : !> \par History
2714 : !> 11.2007 created [MattW]
2715 : !> 10.2008 rewritten [Joost VandeVondele]
2716 : !> \author MattW
2717 : ! **************************************************************************************************
2718 1380 : SUBROUTINE rs_find_node(rs_desc, igrid_level, n_levels, cube_center, ntasks, tasks, &
2719 : lb_cube, ub_cube, added_tasks)
2720 :
2721 : TYPE(realspace_grid_desc_type), POINTER :: rs_desc
2722 : INTEGER, INTENT(IN) :: igrid_level, n_levels
2723 : INTEGER, DIMENSION(3), INTENT(IN) :: cube_center
2724 : INTEGER, INTENT(INOUT) :: ntasks
2725 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
2726 : INTEGER, DIMENSION(3), INTENT(IN) :: lb_cube, ub_cube
2727 : INTEGER, INTENT(OUT) :: added_tasks
2728 :
2729 : INTEGER, PARAMETER :: add_tasks = 1000
2730 : REAL(kind=dp), PARAMETER :: mult_tasks = 2.0_dp
2731 :
2732 : INTEGER :: bit_index, coord(3), curr_tasks, dest, i, icoord(3), idest, itask, ix, iy, iz, &
2733 : lb_coord(3), lb_domain(3), lbc(3), ub_coord(3), ub_domain(3), ubc(3)
2734 : INTEGER :: bit_pattern
2735 : LOGICAL :: dir_periodic(3)
2736 :
2737 1380 : coord(1) = rs_desc%x2coord(cube_center(1))
2738 1380 : coord(2) = rs_desc%y2coord(cube_center(2))
2739 1380 : coord(3) = rs_desc%z2coord(cube_center(3))
2740 1380 : dest = rs_desc%coord2rank(coord(1), coord(2), coord(3))
2741 :
2742 : ! the real cube coordinates
2743 5520 : lbc = lb_cube + cube_center
2744 5520 : ubc = ub_cube + cube_center
2745 :
2746 9660 : IF (ALL((rs_desc%lb_global(:, dest) - rs_desc%border) <= lbc) .AND. &
2747 : ALL((rs_desc%ub_global(:, dest) + rs_desc%border) >= ubc)) THEN
2748 : !standard distributed collocation/integration
2749 1380 : tasks(ntasks)%destination = encode_rank(dest, igrid_level, n_levels)
2750 1380 : tasks(ntasks)%dist_type = 1
2751 1380 : tasks(ntasks)%subpatch_pattern = 0
2752 1380 : added_tasks = 1
2753 :
2754 : ! here we figure out if there is an alternate location for this task
2755 : ! i.e. even though the cube_center is not in the real local domain,
2756 : ! it might fully fit in the halo of the neighbor
2757 : ! if its radius is smaller than the maximum radius
2758 : ! the list of possible neighbors is stored here as a bit pattern
2759 : ! to reconstruct what the rank of the neigbor is,
2760 : ! we can use (note this requires the correct rs_grid)
2761 : ! IF (BTEST(bit_pattern,0)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[-1,0,0])
2762 : ! IF (BTEST(bit_pattern,1)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[+1,0,0])
2763 : ! IF (BTEST(bit_pattern,2)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,-1,0])
2764 : ! IF (BTEST(bit_pattern,3)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,+1,0])
2765 : ! IF (BTEST(bit_pattern,4)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,0,-1])
2766 : ! IF (BTEST(bit_pattern,5)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,0,+1])
2767 1380 : bit_index = 0
2768 1380 : bit_pattern = 0
2769 5520 : DO i = 1, 3
2770 5520 : IF (rs_desc%perd(i) == 1) THEN
2771 2760 : bit_pattern = IBCLR(bit_pattern, bit_index)
2772 2760 : bit_index = bit_index + 1
2773 2760 : bit_pattern = IBCLR(bit_pattern, bit_index)
2774 2760 : bit_index = bit_index + 1
2775 : ELSE
2776 : ! fits the left neighbor ?
2777 1380 : IF (ubc(i) <= rs_desc%lb_global(i, dest) - 1 + rs_desc%border) THEN
2778 599 : bit_pattern = IBSET(bit_pattern, bit_index)
2779 599 : bit_index = bit_index + 1
2780 : ELSE
2781 781 : bit_pattern = IBCLR(bit_pattern, bit_index)
2782 781 : bit_index = bit_index + 1
2783 : END IF
2784 : ! fits the right neighbor ?
2785 1380 : IF (lbc(i) >= rs_desc%ub_global(i, dest) + 1 - rs_desc%border) THEN
2786 250 : bit_pattern = IBSET(bit_pattern, bit_index)
2787 250 : bit_index = bit_index + 1
2788 : ELSE
2789 1130 : bit_pattern = IBCLR(bit_pattern, bit_index)
2790 1130 : bit_index = bit_index + 1
2791 : END IF
2792 : END IF
2793 : END DO
2794 1380 : tasks(ntasks)%subpatch_pattern = bit_pattern
2795 :
2796 : ELSE
2797 : ! generalised collocation/integration needed
2798 : ! first we figure out how many neighbors we have to add to include the lbc/ubc
2799 : ! in the available domains (inclusive of halo points)
2800 : ! first we 'ignore' periodic boundary conditions
2801 : ! i.e. ub_coord-lb_coord+1 might be larger than group_dim
2802 0 : lb_coord = coord
2803 0 : ub_coord = coord
2804 0 : lb_domain = rs_desc%lb_global(:, dest) - rs_desc%border
2805 0 : ub_domain = rs_desc%ub_global(:, dest) + rs_desc%border
2806 0 : DO i = 1, 3
2807 : ! only if the grid is not periodic in this direction we need to take care of adding neighbors
2808 0 : IF (rs_desc%perd(i) == 0) THEN
2809 : ! if the domain lower bound is greater than the lbc we need to add the size of the neighbor domain
2810 0 : DO
2811 0 : IF (lb_domain(i) > lbc(i)) THEN
2812 0 : lb_coord(i) = lb_coord(i) - 1
2813 0 : icoord = MODULO(lb_coord, rs_desc%group_dim)
2814 0 : idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2815 0 : lb_domain(i) = lb_domain(i) - (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
2816 : ELSE
2817 : EXIT
2818 : END IF
2819 : END DO
2820 : ! same for the upper bound
2821 0 : DO
2822 0 : IF (ub_domain(i) < ubc(i)) THEN
2823 0 : ub_coord(i) = ub_coord(i) + 1
2824 0 : icoord = MODULO(ub_coord, rs_desc%group_dim)
2825 0 : idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2826 0 : ub_domain(i) = ub_domain(i) + (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
2827 : ELSE
2828 : EXIT
2829 : END IF
2830 : END DO
2831 : END IF
2832 : END DO
2833 :
2834 : ! some care is needed for the periodic boundaries
2835 0 : DO i = 1, 3
2836 0 : IF (ub_domain(i) - lb_domain(i) + 1 >= rs_desc%npts(i)) THEN
2837 0 : dir_periodic(i) = .TRUE.
2838 0 : lb_coord(i) = 0
2839 0 : ub_coord(i) = rs_desc%group_dim(i) - 1
2840 : ELSE
2841 0 : dir_periodic(i) = .FALSE.
2842 : END IF
2843 : END DO
2844 :
2845 0 : added_tasks = PRODUCT(ub_coord - lb_coord + 1)
2846 0 : itask = ntasks
2847 0 : ntasks = ntasks + added_tasks - 1
2848 0 : IF (ntasks > SIZE(tasks)) THEN
2849 0 : curr_tasks = INT((SIZE(tasks) + add_tasks)*mult_tasks)
2850 0 : CALL reallocate_tasks(tasks, curr_tasks)
2851 : END IF
2852 0 : DO iz = lb_coord(3), ub_coord(3)
2853 0 : DO iy = lb_coord(2), ub_coord(2)
2854 0 : DO ix = lb_coord(1), ub_coord(1)
2855 0 : icoord = MODULO([ix, iy, iz], rs_desc%group_dim)
2856 0 : idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2857 0 : tasks(itask)%destination = encode_rank(idest, igrid_level, n_levels)
2858 0 : tasks(itask)%dist_type = 2
2859 0 : tasks(itask)%subpatch_pattern = 0
2860 : ! encode the domain size for this task
2861 : ! if the bit is set, we need to add the border in that direction
2862 0 : IF (ix == lb_coord(1) .AND. .NOT. dir_periodic(1)) &
2863 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 0)
2864 0 : IF (ix == ub_coord(1) .AND. .NOT. dir_periodic(1)) &
2865 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 1)
2866 0 : IF (iy == lb_coord(2) .AND. .NOT. dir_periodic(2)) &
2867 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 2)
2868 0 : IF (iy == ub_coord(2) .AND. .NOT. dir_periodic(2)) &
2869 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 3)
2870 0 : IF (iz == lb_coord(3) .AND. .NOT. dir_periodic(3)) &
2871 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 4)
2872 0 : IF (iz == ub_coord(3) .AND. .NOT. dir_periodic(3)) &
2873 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 5)
2874 0 : itask = itask + 1
2875 : END DO
2876 : END DO
2877 : END DO
2878 : END IF
2879 :
2880 1380 : END SUBROUTINE rs_find_node
2881 :
2882 : ! **************************************************************************************************
2883 : !> \brief utility functions for encoding the grid level with a rank, allowing
2884 : !> different grid levels to maintain a different rank ordering without
2885 : !> losing information. These encoded_ints are stored in the tasks(1:2,:) array
2886 : !> \param rank ...
2887 : !> \param grid_level ...
2888 : !> \param n_levels ...
2889 : !> \return ...
2890 : !> \par History
2891 : !> 4.2009 created [Iain Bethune]
2892 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2893 : ! **************************************************************************************************
2894 17192080 : FUNCTION encode_rank(rank, grid_level, n_levels) RESULT(encoded_int)
2895 :
2896 : INTEGER, INTENT(IN) :: rank, grid_level, n_levels
2897 : INTEGER :: encoded_int
2898 :
2899 : ! ordered so can still sort by rank
2900 :
2901 17192080 : encoded_int = rank*n_levels + grid_level - 1
2902 :
2903 17192080 : END FUNCTION encode_rank
2904 :
2905 : ! **************************************************************************************************
2906 : !> \brief ...
2907 : !> \param encoded_int ...
2908 : !> \param n_levels ...
2909 : !> \return ...
2910 : ! **************************************************************************************************
2911 8642030 : FUNCTION decode_rank(encoded_int, n_levels) RESULT(rank)
2912 :
2913 : INTEGER, INTENT(IN) :: encoded_int
2914 : INTEGER, INTENT(IN) :: n_levels
2915 : INTEGER :: rank
2916 :
2917 8642030 : rank = INT(encoded_int/n_levels)
2918 :
2919 8642030 : END FUNCTION decode_rank
2920 :
2921 : ! **************************************************************************************************
2922 : !> \brief ...
2923 : !> \param encoded_int ...
2924 : !> \param n_levels ...
2925 : !> \return ...
2926 : ! **************************************************************************************************
2927 7236 : FUNCTION decode_level(encoded_int, n_levels) RESULT(grid_level)
2928 :
2929 : INTEGER, INTENT(IN) :: encoded_int
2930 : INTEGER, INTENT(IN) :: n_levels
2931 : INTEGER :: grid_level
2932 :
2933 7236 : grid_level = INT(MODULO(encoded_int, n_levels)) + 1
2934 :
2935 7236 : END FUNCTION decode_level
2936 :
2937 : ! **************************************************************************************************
2938 : !> \brief Sort pgf index pair (ipgf,iset,iatom),(jpgf,jset,jatom) for which all atom pairs are
2939 : !> grouped, and for each atom pair all set pairs are grouped and for each set pair,
2940 : !> all pgfs are grouped.
2941 : !> This yields the right order of the tasks for collocation after the sort
2942 : !> in distribute_tasks. E.g. for a atom pair, all sets and pgfs are computed in one go.
2943 : !> The exception is the gridlevel. Tasks are first ordered wrt to grid_level. This implies
2944 : !> that a given density matrix block will be decontracted several times,
2945 : !> but cache effects on the grid make up for this.
2946 : !> \param a ...
2947 : !> \param b ...
2948 : !> \return ...
2949 : !> \author Ole Schuett
2950 : ! **************************************************************************************************
2951 81441977 : PURE FUNCTION tasks_less_than(a, b) RESULT(res)
2952 : TYPE(task_type), INTENT(IN) :: a, b
2953 : LOGICAL :: res
2954 :
2955 81441977 : IF (a%grid_level /= b%grid_level) THEN
2956 20279384 : res = a%grid_level < b%grid_level
2957 :
2958 61162593 : ELSE IF (a%image /= b%image) THEN
2959 7834043 : res = a%image < b%image
2960 :
2961 53328550 : ELSE IF (a%iatom /= b%iatom) THEN
2962 6934692 : res = a%iatom < b%iatom
2963 :
2964 46393858 : ELSE IF (a%jatom /= b%jatom) THEN
2965 7562331 : res = a%jatom < b%jatom
2966 :
2967 38831527 : ELSE IF (a%iset /= b%iset) THEN
2968 6046190 : res = a%iset < b%iset
2969 :
2970 32785337 : ELSE IF (a%jset /= b%jset) THEN
2971 6427889 : res = a%jset < b%jset
2972 :
2973 26357448 : ELSE IF (a%ipgf /= b%ipgf) THEN
2974 6848925 : res = a%ipgf < b%ipgf
2975 :
2976 : ELSE
2977 19508523 : res = a%jpgf < b%jpgf
2978 :
2979 : END IF
2980 81441977 : END FUNCTION tasks_less_than
2981 :
2982 193275283 : #:call array_sort(prefix='tasks', type='TYPE(task_type)')
2983 : #:endcall
2984 :
2985 : ! **************************************************************************************************
2986 : !> \brief Order atom pairs to find duplicates.
2987 : !> \param a ...
2988 : !> \param b ...
2989 : !> \return ...
2990 : !> \author Ole Schuett
2991 : ! **************************************************************************************************
2992 36460913 : PURE FUNCTION atom_pair_less_than(a, b) RESULT(res)
2993 : TYPE(atom_pair_type), INTENT(IN) :: a, b
2994 : LOGICAL :: res
2995 :
2996 36460913 : IF (a%rank /= b%rank) THEN
2997 4243 : res = a%rank < b%rank
2998 :
2999 36456670 : ELSE IF (a%row /= b%row) THEN
3000 5764761 : res = a%row < b%row
3001 :
3002 30691909 : ELSE IF (a%col /= b%col) THEN
3003 5784008 : res = a%col < b%col
3004 :
3005 : ELSE
3006 24907901 : res = a%image < b%image
3007 :
3008 : END IF
3009 36460913 : END FUNCTION atom_pair_less_than
3010 :
3011 77467935 : #:call array_sort(prefix='atom_pair', type='TYPE(atom_pair_type)')
3012 : #:endcall
3013 :
3014 : END MODULE task_list_methods
|