Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief Routines for optimizing load balance between processes in HFX calculations
10 : !> \par History
11 : !> 04.2008 created [Manuel Guidon]
12 : !> \author Manuel Guidon
13 : ! **************************************************************************************************
14 : MODULE hfx_load_balance_methods
15 : USE cell_types, ONLY: cell_type
16 : USE cp_files, ONLY: close_file, &
17 : open_file
18 : USE message_passing, ONLY: mp_para_env_type
19 : USE hfx_pair_list_methods, ONLY: build_atomic_pair_list, &
20 : build_pair_list
21 : USE hfx_types, ONLY: &
22 : hfx_basis_type, hfx_block_range_type, hfx_distribution, hfx_load_balance_type, hfx_p_kind, &
23 : hfx_screen_coeff_type, hfx_set_distr_energy, hfx_set_distr_forces, hfx_type, &
24 : pair_list_type, pair_set_list_type
25 : USE input_constants, ONLY: hfx_do_eval_energy, &
26 : hfx_do_eval_forces
27 : USE kinds, ONLY: dp, &
28 : int_8
29 : USE message_passing, ONLY: mp_waitall, mp_request_type
30 : USE parallel_rng_types, ONLY: UNIFORM, &
31 : rng_stream_type
32 : USE particle_types, ONLY: particle_type
33 : USE util, ONLY: sort
34 : #include "./base/base_uses.f90"
35 :
36 : IMPLICIT NONE
37 : PRIVATE
38 :
39 : PUBLIC :: hfx_load_balance, &
40 : hfx_update_load_balance, &
41 : collect_load_balance_info, cost_model, p1_energy, p2_energy, p3_energy
42 :
43 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'hfx_load_balance_methods'
44 :
45 : REAL(KIND=dp), PARAMETER :: p1_energy(12) = (/2.9461408209700424_dp, 1.0624718662999657_dp, &
46 : -1.91570128356921242E-002_dp, -1.6668495454436603_dp, &
47 : 1.7512639006523709_dp, -9.76074323945336081E-002_dp, &
48 : 2.6230786127311889_dp, -0.31870737623014189_dp, &
49 : 7.9588203912690973_dp, 1.8331423413134813_dp, &
50 : -0.15427618665346299_dp, 0.19749436090711650_dp/)
51 : REAL(KIND=dp), PARAMETER :: p2_energy(12) = (/2.3104682960662593_dp, 1.8744052737304417_dp, &
52 : -9.36564055598656797E-002_dp, 0.64284973765086939_dp, &
53 : 1.0137565430060556_dp, -6.80088178288954567E-003_dp, &
54 : 1.1692629207374552_dp, -2.6314710080507573_dp, &
55 : 19.237814781880786_dp, 1.0505934173661349_dp, &
56 : 0.80382371955699250_dp, 0.49903401991818103_dp/)
57 : REAL(KIND=dp), PARAMETER :: p3_energy(2) = (/7.82336287670072350E-002_dp, 0.38073304105744837_dp/)
58 : REAL(KIND=dp), PARAMETER :: p1_forces(12) = (/2.5746279948798874_dp, 1.3420575378609276_dp, &
59 : -9.41673106447732111E-002_dp, 0.94568006899317825_dp, &
60 : -1.4511897117448544_dp, 0.59178934677316952_dp, &
61 : 2.7291149361757236_dp, -0.50555512044800210_dp, &
62 : 8.3508180969609871_dp, 1.6829982496141809_dp, &
63 : -0.74895370472152600_dp, 0.43801726744197500_dp/)
64 : REAL(KIND=dp), PARAMETER :: p2_forces(12) = (/2.6398568961569020_dp, 2.3024918834564101_dp, &
65 : 5.33216585432061581E-003_dp, 0.45572145697283628_dp, &
66 : 1.8119743851500618_dp, -0.12533918548421166_dp, &
67 : -1.4040312084552751_dp, -4.5331650463917859_dp, &
68 : 12.593431549069477_dp, 1.1311978374487595_dp, &
69 : 1.4245996087624646_dp, 1.1425350529853495_dp/)
70 : REAL(KIND=dp), PARAMETER :: p3_forces(2) = (/0.12051930516830946_dp, 1.3828051586144336_dp/)
71 :
72 : !***
73 :
74 : CONTAINS
75 :
76 : ! **************************************************************************************************
77 : !> \brief Distributes the computation of eri's to all available processes.
78 : !> \param x_data Object that stores the indices array
79 : !> \param eps_schwarz screening parameter
80 : !> \param particle_set , atomic_kind_set, para_env ...
81 : !> \param max_set Maximum number of set to be considered
82 : !> \param para_env para_env
83 : !> \param coeffs_set screening functions
84 : !> \param coeffs_kind screening functions
85 : !> \param is_assoc_atomic_block_global KS-matrix sparsity
86 : !> \param do_periodic flag for periodicity
87 : !> \param load_balance_parameter Parameters for Monte-Carlo routines
88 : !> \param kind_of helper array for mapping
89 : !> \param basis_parameter Basis set parameters
90 : !> \param pmax_set Initial screening matrix
91 : !> \param pmax_atom ...
92 : !> \param i_thread Process ID of current Thread
93 : !> \param n_threads Total Number of Threads
94 : !> \param cell cell
95 : !> \param do_p_screening Flag for initial p screening
96 : !> \param map_atom_to_kind_atom ...
97 : !> \param nkind ...
98 : !> \param eval_type ...
99 : !> \param pmax_block ...
100 : !> \param use_virial ...
101 : !> \par History
102 : !> 06.2007 created [Manuel Guidon]
103 : !> 08.2007 new parallel scheme [Manuel Guidon]
104 : !> 09.2007 new 'modulo' parellel scheme and Monte Carlo step [Manuel Guidon]
105 : !> 11.2007 parallelize load balance on box_idx1 [Manuel Guidon]
106 : !> 02.2009 completely refactored [Manuel Guidon]
107 : !> \author Manuel Guidon
108 : !> \note
109 : !> The optimization is done via a binning procedure followed by simple
110 : !> Monte Carlo procedure:
111 : !> In a first step the total amount of integrals in the system is calculated,
112 : !> taking into account the sparsity of the KS-matrix , the screening based
113 : !> on near/farfield approximations and if desired the screening on an initial
114 : !> density matrix.
115 : !> In a second step, bins are generate that contain approximately the same number
116 : !> of integrals, and a cost for these bins is estimated (currently the number of integrals)
117 : !> In a third step, a Monte Carlo procedure optimizes the assignment
118 : !> of the different loads to each process
119 : !> At the end each process owns an unique array of *atomic* indices-ranges
120 : !> that are used to decide whether a process has to calculate a certain
121 : !> bunch of integrals or not
122 : ! **************************************************************************************************
123 1744 : SUBROUTINE hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, &
124 : coeffs_set, coeffs_kind, &
125 1744 : is_assoc_atomic_block_global, do_periodic, &
126 : load_balance_parameter, kind_of, basis_parameter, pmax_set, &
127 : pmax_atom, i_thread, n_threads, cell, &
128 : do_p_screening, map_atom_to_kind_atom, nkind, eval_type, &
129 : pmax_block, use_virial)
130 : TYPE(hfx_type), POINTER :: x_data
131 : REAL(dp), INTENT(IN) :: eps_schwarz
132 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
133 : INTEGER, INTENT(IN) :: max_set
134 : TYPE(mp_para_env_type), POINTER :: para_env
135 : TYPE(hfx_screen_coeff_type), &
136 : DIMENSION(:, :, :, :), POINTER :: coeffs_set
137 : TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
138 : POINTER :: coeffs_kind
139 : INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
140 : LOGICAL :: do_periodic
141 : TYPE(hfx_load_balance_type), POINTER :: load_balance_parameter
142 : INTEGER :: kind_of(*)
143 : TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
144 : TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
145 : REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
146 : INTEGER, INTENT(IN) :: i_thread, n_threads
147 : TYPE(cell_type), POINTER :: cell
148 : LOGICAL, INTENT(IN) :: do_p_screening
149 : INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
150 : INTEGER, INTENT(IN) :: nkind, eval_type
151 : REAL(dp), DIMENSION(:, :), POINTER :: pmax_block
152 : LOGICAL, INTENT(IN) :: use_virial
153 :
154 : CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_load_balance'
155 :
156 : CHARACTER(LEN=512) :: error_msg
157 : INTEGER :: block_size, current_block_id, data_from, dest, handle, handle_inner, &
158 : handle_range, i, iatom_block, iatom_end, iatom_start, ibin, icpu, j, jatom_block, &
159 : jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, &
160 : latom_start, mepos, my_process_id, n_processes, natom, nbins, nblocks, ncpu, &
161 : new_iatom_end, new_iatom_start, new_jatom_end, new_jatom_start, non_empty_blocks, &
162 : objective_block_size, objective_nblocks, source, total_blocks
163 5232 : TYPE(mp_request_type), DIMENSION(2) :: req
164 : INTEGER(int_8) :: atom_block, cost_per_bin, cost_per_core, current_cost, &
165 : distribution_counter_end, distribution_counter_start, global_quartet_counter, &
166 : local_quartet_counter, self_cost_per_block, tmp_block, total_block_self_cost
167 1744 : INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: buffer_in, buffer_out
168 1744 : INTEGER(int_8), DIMENSION(:), POINTER :: local_cost_matrix, recbuffer, &
169 1744 : sendbuffer, swapbuffer
170 : INTEGER(int_8), DIMENSION(:), POINTER, SAVE :: cost_matrix
171 : INTEGER(int_8), SAVE :: shm_global_quartet_counter, &
172 : shm_local_quartet_counter
173 3488 : INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount, rdispl, tmp_index, tmp_pos, &
174 1744 : to_be_sorted
175 : INTEGER, DIMENSION(:), POINTER, SAVE :: shm_distribution_vector
176 : INTEGER, SAVE :: shm_nblocks
177 : LOGICAL :: changed, last_bin_needs_to_be_filled, &
178 : optimized
179 : LOGICAL, DIMENSION(:, :), POINTER, SAVE :: atomic_pair_list
180 : REAL(dp) :: coeffs_kind_max0, log10_eps_schwarz, &
181 : log_2, pmax_blocks
182 3488 : TYPE(hfx_block_range_type), DIMENSION(:), POINTER :: blocks_guess, tmp_blocks, tmp_blocks2
183 : TYPE(hfx_block_range_type), DIMENSION(:), &
184 : POINTER, SAVE :: shm_blocks
185 5232 : TYPE(hfx_distribution), DIMENSION(:), POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist
186 : TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
187 : SAVE :: full_dist
188 : TYPE(pair_list_type) :: list_ij, list_kl
189 : TYPE(pair_set_list_type), ALLOCATABLE, &
190 1744 : DIMENSION(:) :: set_list_ij, set_list_kl
191 :
192 1744 : !$OMP BARRIER
193 3488 : !$OMP MASTER
194 1744 : CALL timeset(routineN, handle)
195 : !$OMP END MASTER
196 1744 : !$OMP BARRIER
197 :
198 1744 : log10_eps_schwarz = LOG10(eps_schwarz)
199 1744 : log_2 = LOG10(2.0_dp)
200 12020 : coeffs_kind_max0 = MAXVAL(coeffs_kind(:, :)%x(2))
201 1744 : ncpu = para_env%num_pe
202 1744 : n_processes = ncpu*n_threads
203 1744 : natom = SIZE(particle_set)
204 :
205 1744 : block_size = load_balance_parameter%block_size
206 33348 : ALLOCATE (set_list_ij((max_set*block_size)**2))
207 33348 : ALLOCATE (set_list_kl((max_set*block_size)**2))
208 :
209 1744 : IF (.NOT. load_balance_parameter%blocks_initialized) THEN
210 1216 : !$OMP BARRIER
211 1216 : !$OMP MASTER
212 1216 : CALL timeset(routineN//"_range", handle_range)
213 :
214 1216 : nblocks = MAX((natom + block_size - 1)/block_size, 1)
215 7478 : ALLOCATE (blocks_guess(nblocks))
216 7478 : ALLOCATE (tmp_blocks(natom))
217 7478 : ALLOCATE (tmp_blocks2(natom))
218 :
219 1216 : pmax_blocks = 0.0_dp
220 2432 : SELECT CASE (eval_type)
221 : CASE (hfx_do_eval_energy)
222 1216 : atomic_pair_list => x_data%atomic_pair_list
223 : CASE (hfx_do_eval_forces)
224 1216 : atomic_pair_list => x_data%atomic_pair_list_forces
225 : END SELECT
226 20716 : atomic_pair_list = .TRUE.
227 : CALL init_blocks(nkind, para_env, natom, block_size, nblocks, blocks_guess, &
228 : list_ij, list_kl, set_list_ij, set_list_kl, &
229 : particle_set, &
230 : coeffs_set, coeffs_kind, &
231 : is_assoc_atomic_block_global, do_periodic, &
232 : kind_of, basis_parameter, pmax_set, pmax_atom, &
233 : pmax_blocks, cell, &
234 : do_p_screening, map_atom_to_kind_atom, eval_type, &
235 1216 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
236 :
237 1216 : total_block_self_cost = 0
238 :
239 5046 : DO i = 1, nblocks
240 5046 : total_block_self_cost = total_block_self_cost + blocks_guess(i)%cost
241 : END DO
242 :
243 1216 : CALL para_env%sum(total_block_self_cost)
244 :
245 1216 : objective_block_size = load_balance_parameter%block_size
246 1216 : objective_nblocks = MAX((natom + objective_block_size - 1)/objective_block_size, 1)
247 :
248 1216 : self_cost_per_block = (total_block_self_cost + objective_nblocks - 1)/(objective_nblocks)
249 :
250 5046 : DO i = 1, nblocks
251 5046 : tmp_blocks2(i) = blocks_guess(i)
252 : END DO
253 :
254 : optimized = .FALSE.
255 : i = 0
256 2432 : DO WHILE (.NOT. optimized)
257 1216 : i = i + 1
258 1216 : current_block_id = 0
259 1216 : changed = .FALSE.
260 5046 : DO atom_block = 1, nblocks
261 3830 : current_block_id = current_block_id + 1
262 3830 : iatom_start = tmp_blocks2(atom_block)%istart
263 3830 : iatom_end = tmp_blocks2(atom_block)%iend
264 5046 : IF (tmp_blocks2(atom_block)%cost > 1.5_dp*self_cost_per_block .AND. iatom_end - iatom_start > 0) THEN
265 0 : changed = .TRUE.
266 0 : new_iatom_start = iatom_start
267 0 : new_iatom_end = (iatom_end - iatom_start + 1)/2 + iatom_start - 1
268 0 : new_jatom_start = new_iatom_end + 1
269 0 : new_jatom_end = iatom_end
270 0 : tmp_blocks(current_block_id)%istart = new_iatom_start
271 0 : tmp_blocks(current_block_id)%iend = new_iatom_end
272 : tmp_blocks(current_block_id)%cost = estimate_block_cost( &
273 : natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
274 : new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
275 : new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
276 : particle_set, &
277 : coeffs_set, coeffs_kind, &
278 : is_assoc_atomic_block_global, do_periodic, &
279 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
280 : cell, &
281 : do_p_screening, map_atom_to_kind_atom, eval_type, &
282 0 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
283 0 : current_block_id = current_block_id + 1
284 0 : tmp_blocks(current_block_id)%istart = new_jatom_start
285 0 : tmp_blocks(current_block_id)%iend = new_jatom_end
286 : tmp_blocks(current_block_id)%cost = estimate_block_cost( &
287 : natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
288 : new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
289 : new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
290 : particle_set, &
291 : coeffs_set, coeffs_kind, &
292 : is_assoc_atomic_block_global, do_periodic, &
293 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
294 : cell, &
295 : do_p_screening, map_atom_to_kind_atom, eval_type, &
296 0 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
297 : ELSE
298 3830 : tmp_blocks(current_block_id)%istart = iatom_start
299 3830 : tmp_blocks(current_block_id)%iend = iatom_end
300 3830 : tmp_blocks(current_block_id)%cost = tmp_blocks2(atom_block)%cost
301 : END IF
302 : END DO
303 1216 : IF (.NOT. changed) optimized = .TRUE.
304 1216 : IF (i > 20) optimized = .TRUE.
305 1216 : nblocks = current_block_id
306 6262 : DO atom_block = 1, nblocks
307 5046 : tmp_blocks2(atom_block) = tmp_blocks(atom_block)
308 : END DO
309 : END DO
310 :
311 1216 : DEALLOCATE (tmp_blocks2)
312 :
313 : ! ** count number of non empty blocks on each node
314 1216 : non_empty_blocks = 0
315 5046 : DO atom_block = 1, nblocks
316 3830 : IF (tmp_blocks(atom_block)%istart == 0) CYCLE
317 5046 : non_empty_blocks = non_empty_blocks + 1
318 : END DO
319 :
320 3648 : ALLOCATE (rcount(ncpu))
321 3634 : rcount = 0
322 1216 : rcount(para_env%mepos + 1) = non_empty_blocks
323 1216 : CALL para_env%sum(rcount)
324 :
325 : ! ** sum all non_empty_blocks
326 1216 : total_blocks = 0
327 3634 : DO i = 1, ncpu
328 3634 : total_blocks = total_blocks + rcount(i)
329 : END DO
330 :
331 : ! ** calculate offsets
332 3648 : ALLOCATE (rdispl(ncpu))
333 3634 : rcount(:) = rcount(:)*3
334 1216 : rdispl(1) = 0
335 2418 : DO i = 2, ncpu
336 2418 : rdispl(i) = rdispl(i - 1) + rcount(i - 1)
337 : END DO
338 :
339 3620 : ALLOCATE (buffer_in(3*non_empty_blocks))
340 :
341 1216 : non_empty_blocks = 0
342 5046 : DO atom_block = 1, nblocks
343 3830 : IF (tmp_blocks(atom_block)%istart == 0) CYCLE
344 1936 : buffer_in(non_empty_blocks*3 + 1) = tmp_blocks(atom_block)%istart
345 1936 : buffer_in(non_empty_blocks*3 + 2) = tmp_blocks(atom_block)%iend
346 1936 : buffer_in(non_empty_blocks*3 + 3) = tmp_blocks(atom_block)%cost
347 5046 : non_empty_blocks = non_empty_blocks + 1
348 : END DO
349 :
350 1216 : nblocks = total_blocks
351 :
352 7478 : ALLOCATE (tmp_blocks2(nblocks))
353 :
354 3648 : ALLOCATE (buffer_out(3*nblocks))
355 :
356 : ! ** Gather all three arrays
357 1216 : CALL para_env%allgatherv(buffer_in, buffer_out, rcount, rdispl)
358 :
359 5046 : DO i = 1, nblocks
360 3830 : tmp_blocks2(i)%istart = INT(buffer_out((i - 1)*3 + 1))
361 3830 : tmp_blocks2(i)%iend = INT(buffer_out((i - 1)*3 + 2))
362 5046 : tmp_blocks2(i)%cost = buffer_out((i - 1)*3 + 3)
363 : END DO
364 :
365 : ! ** Now we sort the blocks
366 3648 : ALLOCATE (to_be_sorted(nblocks))
367 3648 : ALLOCATE (tmp_index(nblocks))
368 :
369 5046 : DO atom_block = 1, nblocks
370 5046 : to_be_sorted(atom_block) = tmp_blocks2(atom_block)%istart
371 : END DO
372 :
373 1216 : CALL sort(to_be_sorted, nblocks, tmp_index)
374 :
375 7478 : ALLOCATE (x_data%blocks(nblocks))
376 :
377 5046 : DO atom_block = 1, nblocks
378 5046 : x_data%blocks(atom_block) = tmp_blocks2(tmp_index(atom_block))
379 : END DO
380 :
381 1216 : shm_blocks => x_data%blocks
382 1216 : shm_nblocks = nblocks
383 :
384 : ! ** Set nblocks in structure
385 1216 : load_balance_parameter%nblocks = nblocks
386 :
387 1216 : DEALLOCATE (blocks_guess, tmp_blocks, tmp_blocks2)
388 :
389 1216 : DEALLOCATE (rcount, rdispl, buffer_in, buffer_out, to_be_sorted, tmp_index)
390 :
391 1216 : load_balance_parameter%blocks_initialized = .TRUE.
392 :
393 8876 : x_data%blocks = shm_blocks
394 1216 : load_balance_parameter%nblocks = shm_nblocks
395 1216 : load_balance_parameter%blocks_initialized = .TRUE.
396 :
397 4864 : ALLOCATE (x_data%pmax_block(shm_nblocks, shm_nblocks))
398 20716 : x_data%pmax_block = 0.0_dp
399 1216 : pmax_block => x_data%pmax_block
400 2432 : CALL timestop(handle_range)
401 : !$OMP END MASTER
402 1216 : !$OMP BARRIER
403 :
404 1216 : IF (.NOT. load_balance_parameter%blocks_initialized) THEN
405 0 : ALLOCATE (x_data%blocks(shm_nblocks))
406 0 : x_data%blocks = shm_blocks
407 0 : load_balance_parameter%nblocks = shm_nblocks
408 0 : load_balance_parameter%blocks_initialized = .TRUE.
409 : END IF
410 : !! ** precalculate maximum density matrix elements in blocks
411 1216 : !$OMP BARRIER
412 : END IF
413 :
414 1744 : !$OMP BARRIER
415 1744 : !$OMP MASTER
416 1744 : pmax_block => x_data%pmax_block
417 28132 : pmax_block = 0.0_dp
418 1744 : IF (do_p_screening) THEN
419 1454 : DO iatom_block = 1, shm_nblocks
420 1118 : iatom_start = x_data%blocks(iatom_block)%istart
421 1118 : iatom_end = x_data%blocks(iatom_block)%iend
422 5648 : DO jatom_block = 1, shm_nblocks
423 4194 : jatom_start = x_data%blocks(jatom_block)%istart
424 4194 : jatom_end = x_data%blocks(jatom_block)%iend
425 13700 : pmax_block(iatom_block, jatom_block) = MAXVAL(pmax_atom(iatom_start:iatom_end, jatom_start:jatom_end))
426 : END DO
427 : END DO
428 : END IF
429 :
430 2960 : SELECT CASE (eval_type)
431 : CASE (hfx_do_eval_energy)
432 1216 : atomic_pair_list => x_data%atomic_pair_list
433 : CASE (hfx_do_eval_forces)
434 1744 : atomic_pair_list => x_data%atomic_pair_list_forces
435 : END SELECT
436 : CALL build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, &
437 : do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, &
438 1744 : x_data%blocks)
439 :
440 : !$OMP END MASTER
441 1744 : !$OMP BARRIER
442 :
443 : !! If there is only 1 cpu skip the binning
444 1744 : IF (n_processes == 1) THEN
445 28 : ALLOCATE (tmp_dist(1))
446 14 : tmp_dist(1)%number_of_atom_quartets = HUGE(tmp_dist(1)%number_of_atom_quartets)
447 14 : tmp_dist(1)%istart = 0_int_8
448 14 : ptr_to_tmp_dist => tmp_dist(:)
449 28 : SELECT CASE (eval_type)
450 : CASE (hfx_do_eval_energy)
451 14 : CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
452 : CASE (hfx_do_eval_forces)
453 14 : CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
454 : END SELECT
455 14 : DEALLOCATE (tmp_dist)
456 : ELSE
457 : !! Calculate total numbers of integrals that have to be calculated (wrt screening and symmetry)
458 1730 : !$OMP BARRIER
459 1730 : !$OMP MASTER
460 1730 : CALL timeset(routineN//"_count", handle_inner)
461 : !$OMP END MASTER
462 1730 : !$OMP BARRIER
463 :
464 1730 : cost_per_core = 0_int_8
465 1730 : my_process_id = para_env%mepos*n_threads + i_thread
466 1730 : nblocks = load_balance_parameter%nblocks
467 :
468 481024 : DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
469 :
470 479294 : latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
471 479294 : tmp_block = atom_block/nblocks
472 479294 : katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
473 479294 : IF (latom_block < katom_block) CYCLE
474 268410 : tmp_block = tmp_block/nblocks
475 268410 : jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
476 268410 : tmp_block = tmp_block/nblocks
477 268410 : iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
478 268410 : IF (jatom_block < iatom_block) CYCLE
479 :
480 151191 : iatom_start = x_data%blocks(iatom_block)%istart
481 151191 : iatom_end = x_data%blocks(iatom_block)%iend
482 151191 : jatom_start = x_data%blocks(jatom_block)%istart
483 151191 : jatom_end = x_data%blocks(jatom_block)%iend
484 151191 : katom_start = x_data%blocks(katom_block)%istart
485 151191 : katom_end = x_data%blocks(katom_block)%iend
486 151191 : latom_start = x_data%blocks(latom_block)%istart
487 151191 : latom_end = x_data%blocks(latom_block)%iend
488 :
489 286182 : SELECT CASE (eval_type)
490 : CASE (hfx_do_eval_energy)
491 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
492 : pmax_block(latom_block, jatom_block), &
493 : pmax_block(latom_block, iatom_block), &
494 134991 : pmax_block(katom_block, jatom_block))
495 : CASE (hfx_do_eval_forces)
496 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
497 : pmax_block(latom_block, jatom_block), &
498 : pmax_block(latom_block, iatom_block) + &
499 151191 : pmax_block(katom_block, jatom_block))
500 : END SELECT
501 :
502 151191 : IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
503 :
504 : cost_per_core = cost_per_core &
505 : + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
506 : iatom_start, iatom_end, jatom_start, jatom_end, &
507 : katom_start, katom_end, latom_start, latom_end, &
508 : particle_set, &
509 : coeffs_set, coeffs_kind, &
510 : is_assoc_atomic_block_global, do_periodic, &
511 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
512 : cell, &
513 : do_p_screening, map_atom_to_kind_atom, eval_type, &
514 481024 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
515 :
516 : END DO ! atom_block
517 :
518 1730 : nbins = load_balance_parameter%nbins
519 1730 : cost_per_bin = (cost_per_core + nbins - 1)/(nbins)
520 :
521 1730 : !$OMP BARRIER
522 1730 : !$OMP MASTER
523 1730 : CALL timestop(handle_inner)
524 : !$OMP END MASTER
525 1730 : !$OMP BARRIER
526 :
527 : ! new load balancing test
528 : IF (.FALSE.) THEN
529 : CALL hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
530 : natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
531 : particle_set, &
532 : coeffs_set, coeffs_kind, &
533 : is_assoc_atomic_block_global, do_periodic, &
534 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
535 : cell, x_data, para_env, pmax_block, &
536 : do_p_screening, map_atom_to_kind_atom, eval_type, &
537 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
538 : END IF
539 :
540 1730 : !$OMP BARRIER
541 1730 : !$OMP MASTER
542 1730 : CALL timeset(routineN//"_bin", handle_inner)
543 : !$OMP END MASTER
544 1730 : !$OMP BARRIER
545 :
546 115910 : ALLOCATE (binned_dist(nbins))
547 112450 : binned_dist(:)%istart = -1_int_8
548 112450 : binned_dist(:)%number_of_atom_quartets = 0_int_8
549 112450 : binned_dist(:)%cost = 0_int_8
550 112450 : binned_dist(:)%time_first_scf = 0.0_dp
551 112450 : binned_dist(:)%time_other_scf = 0.0_dp
552 112450 : binned_dist(:)%time_forces = 0.0_dp
553 :
554 1730 : current_cost = 0
555 1730 : mepos = 1
556 1730 : distribution_counter_start = 1
557 1730 : distribution_counter_end = 0
558 1730 : ibin = 1
559 :
560 1730 : global_quartet_counter = 0
561 1730 : local_quartet_counter = 0
562 1730 : last_bin_needs_to_be_filled = .FALSE.
563 481024 : DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
564 479294 : latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
565 479294 : tmp_block = atom_block/nblocks
566 479294 : katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
567 479294 : IF (latom_block < katom_block) CYCLE
568 268410 : tmp_block = tmp_block/nblocks
569 268410 : jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
570 268410 : tmp_block = tmp_block/nblocks
571 268410 : iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
572 268410 : IF (jatom_block < iatom_block) CYCLE
573 :
574 151191 : distribution_counter_end = distribution_counter_end + 1
575 151191 : global_quartet_counter = global_quartet_counter + 1
576 151191 : last_bin_needs_to_be_filled = .TRUE.
577 :
578 151191 : IF (binned_dist(ibin)%istart == -1_int_8) binned_dist(ibin)%istart = atom_block
579 :
580 151191 : iatom_start = x_data%blocks(iatom_block)%istart
581 151191 : iatom_end = x_data%blocks(iatom_block)%iend
582 151191 : jatom_start = x_data%blocks(jatom_block)%istart
583 151191 : jatom_end = x_data%blocks(jatom_block)%iend
584 151191 : katom_start = x_data%blocks(katom_block)%istart
585 151191 : katom_end = x_data%blocks(katom_block)%iend
586 151191 : latom_start = x_data%blocks(latom_block)%istart
587 151191 : latom_end = x_data%blocks(latom_block)%iend
588 :
589 286182 : SELECT CASE (eval_type)
590 : CASE (hfx_do_eval_energy)
591 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
592 : pmax_block(latom_block, jatom_block), &
593 : pmax_block(latom_block, iatom_block), &
594 134991 : pmax_block(katom_block, jatom_block))
595 : CASE (hfx_do_eval_forces)
596 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
597 : pmax_block(latom_block, jatom_block), &
598 : pmax_block(latom_block, iatom_block) + &
599 151191 : pmax_block(katom_block, jatom_block))
600 : END SELECT
601 :
602 151191 : IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
603 :
604 : current_cost = current_cost &
605 : + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
606 : iatom_start, iatom_end, jatom_start, jatom_end, &
607 : katom_start, katom_end, latom_start, latom_end, &
608 : particle_set, &
609 : coeffs_set, coeffs_kind, &
610 : is_assoc_atomic_block_global, do_periodic, &
611 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
612 : cell, &
613 : do_p_screening, map_atom_to_kind_atom, eval_type, &
614 150565 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
615 :
616 152295 : IF (current_cost >= cost_per_bin) THEN
617 18688 : IF (ibin == nbins) THEN
618 : binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
619 0 : distribution_counter_end - distribution_counter_start + 1
620 : ELSE
621 18688 : binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
622 : END IF
623 18688 : binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
624 18688 : ibin = MIN(ibin + 1, nbins)
625 18688 : distribution_counter_start = distribution_counter_end + 1
626 18688 : current_cost = 0
627 18688 : last_bin_needs_to_be_filled = .FALSE.
628 : END IF
629 : END DO
630 :
631 1730 : !$OMP BARRIER
632 1730 : !$OMP MASTER
633 1730 : CALL timestop(handle_inner)
634 1730 : CALL timeset(routineN//"_dist", handle_inner)
635 : !$OMP END MASTER
636 1730 : !$OMP BARRIER
637 : !! Fill the last bin if necessary
638 1730 : IF (last_bin_needs_to_be_filled) THEN
639 510 : binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
640 510 : IF (ibin == nbins) THEN
641 : binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
642 0 : distribution_counter_end - distribution_counter_start + 1
643 : ELSE
644 510 : binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
645 : END IF
646 : END IF
647 :
648 : !! Sanity-Check
649 112450 : DO ibin = 1, nbins
650 112450 : local_quartet_counter = local_quartet_counter + binned_dist(ibin)%number_of_atom_quartets
651 : END DO
652 1730 : !$OMP BARRIER
653 1730 : !$OMP MASTER
654 1730 : shm_local_quartet_counter = 0
655 1730 : shm_global_quartet_counter = 0
656 : !$OMP END MASTER
657 1730 : !$OMP BARRIER
658 1730 : !$OMP ATOMIC
659 : shm_local_quartet_counter = shm_local_quartet_counter + local_quartet_counter
660 1730 : !$OMP ATOMIC
661 : shm_global_quartet_counter = shm_global_quartet_counter + global_quartet_counter
662 :
663 1730 : !$OMP BARRIER
664 1730 : !$OMP MASTER
665 1730 : CALL para_env%sum(shm_local_quartet_counter)
666 1730 : CALL para_env%sum(shm_global_quartet_counter)
667 1730 : IF (para_env%is_source()) THEN
668 865 : IF (shm_local_quartet_counter /= shm_global_quartet_counter) THEN
669 : WRITE (error_msg, '(A,I0,A,I0,A)') "HFX Sanity check for parallel distribution failed. "// &
670 0 : "Number of local quartets (", shm_local_quartet_counter, &
671 0 : ") and number of global quartets (", shm_global_quartet_counter, &
672 0 : ") are different. Please send in a bug report."
673 0 : CPABORT(error_msg)
674 : END IF
675 : END IF
676 : !$OMP END MASTER
677 :
678 1730 : !$OMP BARRIER
679 1730 : !$OMP MASTER
680 5190 : ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
681 223170 : cost_matrix = 0
682 : !$OMP END MASTER
683 1730 : !$OMP BARRIER
684 1730 : icpu = para_env%mepos + 1
685 112450 : DO i = 1, nbins
686 112450 : cost_matrix((icpu - 1)*nbins*n_threads + i_thread*nbins + i) = binned_dist(i)%cost
687 : END DO
688 1730 : mepos = para_env%mepos
689 1730 : !$OMP BARRIER
690 :
691 1730 : !$OMP MASTER
692 : ! sync before/after ring of isendrecv
693 1730 : CALL para_env%sync()
694 :
695 5190 : ALLOCATE (sendbuffer(nbins*n_threads))
696 5190 : ALLOCATE (recbuffer(nbins*n_threads))
697 :
698 223170 : sendbuffer = cost_matrix(mepos*nbins*n_threads + 1:mepos*nbins*n_threads + nbins*n_threads)
699 :
700 1730 : dest = MODULO(mepos + 1, ncpu)
701 1730 : source = MODULO(mepos - 1, ncpu)
702 5190 : DO icpu = 0, ncpu - 1
703 3460 : IF (icpu .NE. ncpu - 1) THEN
704 : CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
705 1730 : req(1), req(2), 13)
706 : END IF
707 3460 : data_from = MODULO(mepos - icpu, ncpu)
708 446340 : cost_matrix(data_from*nbins*n_threads + 1:data_from*nbins*n_threads + nbins*n_threads) = sendbuffer
709 3460 : IF (icpu .NE. ncpu - 1) THEN
710 1730 : CALL mp_waitall(req)
711 : END IF
712 3460 : swapbuffer => sendbuffer
713 3460 : sendbuffer => recbuffer
714 5190 : recbuffer => swapbuffer
715 : END DO
716 1730 : DEALLOCATE (recbuffer, sendbuffer)
717 : !$OMP END MASTER
718 1730 : !$OMP BARRIER
719 :
720 1730 : !$OMP BARRIER
721 1730 : !$OMP MASTER
722 1730 : CALL timestop(handle_inner)
723 1730 : CALL timeset(routineN//"_opt", handle_inner)
724 : !$OMP END MASTER
725 1730 : !$OMP BARRIER
726 :
727 : !! Find an optimal distribution i.e. assign each element of the cost matrix to a certain process
728 1730 : !$OMP BARRIER
729 5190 : ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
730 444610 : local_cost_matrix = cost_matrix
731 1730 : !$OMP MASTER
732 5190 : ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
733 :
734 : CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
735 1730 : shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
736 :
737 1730 : CALL timestop(handle_inner)
738 1730 : CALL timeset(routineN//"_redist", handle_inner)
739 : !! Collect local data to global array
740 339080 : ALLOCATE (full_dist(ncpu*n_threads, nbins))
741 :
742 333890 : full_dist(:, :)%istart = 0_int_8
743 333890 : full_dist(:, :)%number_of_atom_quartets = 0_int_8
744 333890 : full_dist(:, :)%cost = 0_int_8
745 333890 : full_dist(:, :)%time_first_scf = 0.0_dp
746 333890 : full_dist(:, :)%time_other_scf = 0.0_dp
747 335620 : full_dist(:, :)%time_forces = 0.0_dp
748 : !$OMP END MASTER
749 1730 : !$OMP BARRIER
750 1730 : mepos = para_env%mepos + 1
751 223170 : full_dist((mepos - 1)*n_threads + i_thread + 1, :) = binned_dist(:)
752 :
753 1730 : !$OMP BARRIER
754 1730 : !$OMP MASTER
755 5190 : ALLOCATE (sendbuffer(3*nbins*n_threads))
756 5190 : ALLOCATE (recbuffer(3*nbins*n_threads))
757 1730 : mepos = para_env%mepos
758 3460 : DO j = 1, n_threads
759 114180 : DO i = 1, nbins
760 110720 : sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
761 110720 : sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
762 112450 : sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
763 : END DO
764 : END DO
765 :
766 : ! sync before/after ring of isendrecv
767 1730 : CALL para_env%sync()
768 1730 : dest = MODULO(mepos + 1, ncpu)
769 1730 : source = MODULO(mepos - 1, ncpu)
770 5190 : DO icpu = 0, ncpu - 1
771 3460 : IF (icpu .NE. ncpu - 1) THEN
772 : CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
773 1730 : req(1), req(2), 13)
774 : END IF
775 3460 : data_from = MODULO(mepos - icpu, ncpu)
776 6920 : DO j = 1, n_threads
777 228360 : DO i = 1, nbins
778 221440 : full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1)
779 221440 : full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2)
780 224900 : full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3)
781 : END DO
782 : END DO
783 :
784 3460 : IF (icpu .NE. ncpu - 1) THEN
785 1730 : CALL mp_waitall(req)
786 : END IF
787 3460 : swapbuffer => sendbuffer
788 3460 : sendbuffer => recbuffer
789 5190 : recbuffer => swapbuffer
790 : END DO
791 1730 : DEALLOCATE (recbuffer, sendbuffer)
792 :
793 : ! sync before/after ring of isendrecv
794 1730 : CALL para_env%sync()
795 : !$OMP END MASTER
796 1730 : !$OMP BARRIER
797 : !! reorder the distribution according to the distribution vector
798 5190 : ALLOCATE (tmp_pos(ncpu*n_threads))
799 5190 : tmp_pos = 1
800 226630 : ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
801 :
802 223170 : tmp_dist(:)%istart = 0_int_8
803 223170 : tmp_dist(:)%number_of_atom_quartets = 0_int_8
804 223170 : tmp_dist(:)%cost = 0_int_8
805 223170 : tmp_dist(:)%time_first_scf = 0.0_dp
806 223170 : tmp_dist(:)%time_other_scf = 0.0_dp
807 223170 : tmp_dist(:)%time_forces = 0.0_dp
808 :
809 5190 : DO icpu = 1, n_processes
810 226630 : DO i = 1, nbins
811 221440 : mepos = my_process_id + 1
812 224900 : IF (shm_distribution_vector((icpu - 1)*nbins + i) == mepos) THEN
813 110720 : tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
814 110720 : tmp_pos(mepos) = tmp_pos(mepos) + 1
815 : END IF
816 : END DO
817 : END DO
818 :
819 : !! Assign the load to each process
820 : NULLIFY (ptr_to_tmp_dist)
821 1730 : mepos = my_process_id + 1
822 1730 : ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
823 2932 : SELECT CASE (eval_type)
824 : CASE (hfx_do_eval_energy)
825 1202 : CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
826 : CASE (hfx_do_eval_forces)
827 1730 : CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
828 : END SELECT
829 :
830 1730 : !$OMP BARRIER
831 1730 : !$OMP MASTER
832 1730 : DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
833 : !$OMP END MASTER
834 1730 : !$OMP BARRIER
835 1730 : DEALLOCATE (tmp_dist, tmp_pos)
836 1730 : DEALLOCATE (binned_dist, local_cost_matrix)
837 1730 : DEALLOCATE (set_list_ij, set_list_kl)
838 :
839 1730 : !$OMP BARRIER
840 1730 : !$OMP MASTER
841 1730 : CALL timestop(handle_inner)
842 : !$OMP END MASTER
843 1730 : !$OMP BARRIER
844 : END IF
845 1744 : !$OMP BARRIER
846 1744 : !$OMP MASTER
847 1744 : CALL timestop(handle)
848 : !$OMP END MASTER
849 1744 : !$OMP BARRIER
850 3618800 : END SUBROUTINE hfx_load_balance
851 :
852 : ! **************************************************************************************************
853 : !> \brief Reference implementation of new recursive load balancing routine
854 : !> Computes a local list of atom_blocks (p_atom_blocks,q_atom_blocks) for
855 : !> each process in a P-Q grid such that every process has more or less the
856 : !> same amount of work. Has no output at the moment (not used) but writes
857 : !> its computed load balance values into a file. Possible output is ready
858 : !> to use in the two arrays p_atom_blocks & q_atom_blocks
859 : !> \param n_processes ...
860 : !> \param my_process_id ...
861 : !> \param nblocks ...
862 : !> \param natom ...
863 : !> \param nkind ...
864 : !> \param list_ij ...
865 : !> \param list_kl ...
866 : !> \param set_list_ij ...
867 : !> \param set_list_kl ...
868 : !> \param particle_set ...
869 : !> \param coeffs_set ...
870 : !> \param coeffs_kind ...
871 : !> \param is_assoc_atomic_block_global ...
872 : !> \param do_periodic ...
873 : !> \param kind_of ...
874 : !> \param basis_parameter ...
875 : !> \param pmax_set ...
876 : !> \param pmax_atom ...
877 : !> \param pmax_blocks ...
878 : !> \param cell ...
879 : !> \param x_data ...
880 : !> \param para_env ...
881 : !> \param pmax_block ...
882 : !> \param do_p_screening ...
883 : !> \param map_atom_to_kind_atom ...
884 : !> \param eval_type ...
885 : !> \param log10_eps_schwarz ...
886 : !> \param log_2 ...
887 : !> \param coeffs_kind_max0 ...
888 : !> \param use_virial ...
889 : !> \param atomic_pair_list ...
890 : !> \par History
891 : !> 03.2011 created [Michael Steinlechner]
892 : !> \author Michael Steinlechner
893 : ! **************************************************************************************************
894 :
895 0 : SUBROUTINE hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
896 : natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
897 : particle_set, &
898 : coeffs_set, coeffs_kind, &
899 0 : is_assoc_atomic_block_global, do_periodic, &
900 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
901 : cell, x_data, para_env, pmax_block, &
902 : do_p_screening, map_atom_to_kind_atom, eval_type, &
903 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
904 :
905 : ! input variables:
906 : INTEGER, INTENT(IN) :: n_processes, my_process_id, nblocks, &
907 : natom, nkind
908 : TYPE(pair_list_type), INTENT(IN) :: list_ij, list_kl
909 : TYPE(pair_set_list_type), ALLOCATABLE, &
910 : DIMENSION(:), INTENT(IN) :: set_list_ij, set_list_kl
911 : TYPE(particle_type), DIMENSION(:), INTENT(IN), &
912 : POINTER :: particle_set
913 : TYPE(hfx_screen_coeff_type), &
914 : DIMENSION(:, :, :, :), INTENT(IN), POINTER :: coeffs_set
915 : TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
916 : INTENT(IN), POINTER :: coeffs_kind
917 : INTEGER, DIMENSION(:, :), INTENT(IN) :: is_assoc_atomic_block_global
918 : LOGICAL, INTENT(IN) :: do_periodic
919 : INTEGER, INTENT(IN) :: kind_of(*)
920 : TYPE(hfx_basis_type), DIMENSION(:), INTENT(IN), &
921 : POINTER :: basis_parameter
922 : TYPE(hfx_p_kind), DIMENSION(:), INTENT(IN), &
923 : POINTER :: pmax_set
924 : REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER :: pmax_atom
925 : REAL(dp) :: pmax_blocks
926 : TYPE(cell_type), INTENT(IN), POINTER :: cell
927 : TYPE(hfx_type), INTENT(IN), POINTER :: x_data
928 : TYPE(mp_para_env_type), INTENT(IN) :: para_env
929 : REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER :: pmax_block
930 : LOGICAL, INTENT(IN) :: do_p_screening
931 : INTEGER, DIMENSION(:), INTENT(IN), POINTER :: map_atom_to_kind_atom
932 : INTEGER, INTENT(IN) :: eval_type
933 : REAL(dp), INTENT(IN) :: log10_eps_schwarz, log_2, &
934 : coeffs_kind_max0
935 : LOGICAL, INTENT(IN) :: use_virial
936 : LOGICAL, DIMENSION(:, :), INTENT(IN), POINTER :: atomic_pair_list
937 :
938 : CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_recursive_load_balance'
939 :
940 : INTEGER :: handle, i, iatom_block, iatom_end, iatom_start, j, jatom_block, jatom_end, &
941 : jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, latom_start, &
942 : nP, nQ, numBins, p, q, sizeP, sizeQ, unit_nr
943 : INTEGER(int_8) :: local_cost, pidx, qidx, sumP, sumQ
944 0 : INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: local_cost_vector
945 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: blocksize, p_atom_blocks, permute, &
946 0 : q_atom_blocks
947 : REAL(dp) :: maximum, mean
948 :
949 : ! internal variables:
950 :
951 0 : !$OMP BARRIER
952 0 : !$OMP MASTER
953 0 : CALL timeset(routineN, handle)
954 : !$OMP END MASTER
955 0 : !$OMP BARRIER
956 :
957 : ! calculate best p/q distribution grid for the n_processes
958 0 : CALL hfx_calculate_PQ(p, q, numBins, n_processes)
959 :
960 0 : ALLOCATE (blocksize(numBins))
961 0 : ALLOCATE (permute(nblocks**2))
962 0 : DO i = 1, nblocks**2
963 0 : permute(i) = i
964 : END DO
965 :
966 : ! call the main recursive permutation routine.
967 : ! Output:
968 : ! blocksize :: vector (size numBins) with the sizes for each column/row block
969 : ! permute :: permutation vector
970 : CALL hfx_recursive_permute(blocksize, 1, nblocks**2, numBins, &
971 : permute, 1, &
972 : my_process_id, n_processes, nblocks, &
973 : natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
974 : particle_set, &
975 : coeffs_set, coeffs_kind, &
976 : is_assoc_atomic_block_global, do_periodic, &
977 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
978 : cell, x_data, para_env, pmax_block, &
979 : do_p_screening, map_atom_to_kind_atom, eval_type, &
980 0 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
981 :
982 : ! number of blocks per processor in p-direction (vertical)
983 0 : nP = numBins/p
984 : ! number of blocks per processor in q-direction (horizontal)
985 0 : nQ = numBins/q
986 :
987 : ! calc own position in P-Q-processor grid (PQ-grid is column-major)
988 0 : pidx = MODULO(INT(my_process_id), INT(p)) + 1
989 0 : qidx = my_process_id/p + 1
990 :
991 0 : sizeP = SUM(blocksize((nP*(pidx - 1) + 1):(nP*pidx)))
992 0 : sizeQ = SUM(blocksize((nQ*(qidx - 1) + 1):(nQ*qidx)))
993 :
994 0 : sumP = SUM(blocksize(1:(nP*(pidx - 1))))
995 0 : sumQ = SUM(blocksize(1:(nQ*(qidx - 1))))
996 :
997 0 : ALLOCATE (p_atom_blocks(sizeP))
998 0 : ALLOCATE (q_atom_blocks(sizeQ))
999 :
1000 0 : p_atom_blocks(:) = permute((sumP + 1):(sumP + sizeP))
1001 0 : q_atom_blocks(:) = permute((sumQ + 1):(sumQ + sizeQ))
1002 :
1003 : ! from here on, we are actually finished, each process has been
1004 : ! assigned a (p_atom_blocks,q_atom_blocks) pair list.
1005 : ! what follows is just a small routine to calculate the local cost
1006 : ! for each processor which is then written to a file.
1007 :
1008 : ! calculate local cost for each processor!
1009 : ! ****************************************
1010 : local_cost = 0
1011 0 : DO i = 1, sizeP
1012 0 : DO j = 1, sizeQ
1013 :
1014 : ! get corresponding 4D block indices out of our own P-Q-block
1015 0 : latom_block = MODULO(q_atom_blocks(j), nblocks)
1016 0 : iatom_block = q_atom_blocks(j)/nblocks + 1
1017 0 : jatom_block = MODULO(p_atom_blocks(i), nblocks)
1018 0 : katom_block = p_atom_blocks(i)/nblocks + 1
1019 :
1020 : ! symmetry checks.
1021 0 : IF (latom_block < katom_block) CYCLE
1022 0 : IF (jatom_block < iatom_block) CYCLE
1023 :
1024 0 : iatom_start = x_data%blocks(iatom_block)%istart
1025 0 : iatom_end = x_data%blocks(iatom_block)%iend
1026 0 : jatom_start = x_data%blocks(jatom_block)%istart
1027 0 : jatom_end = x_data%blocks(jatom_block)%iend
1028 0 : katom_start = x_data%blocks(katom_block)%istart
1029 0 : katom_end = x_data%blocks(katom_block)%iend
1030 0 : latom_start = x_data%blocks(latom_block)%istart
1031 0 : latom_end = x_data%blocks(latom_block)%iend
1032 :
1033 : ! whatever.
1034 0 : SELECT CASE (eval_type)
1035 : CASE (hfx_do_eval_energy)
1036 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
1037 : pmax_block(latom_block, jatom_block), &
1038 : pmax_block(latom_block, iatom_block), &
1039 0 : pmax_block(katom_block, jatom_block))
1040 : CASE (hfx_do_eval_forces)
1041 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
1042 : pmax_block(latom_block, jatom_block), &
1043 : pmax_block(latom_block, iatom_block) + &
1044 0 : pmax_block(katom_block, jatom_block))
1045 : END SELECT
1046 :
1047 : ! screening.
1048 0 : IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
1049 :
1050 : ! estimate the cost of this atom_block.
1051 : local_cost = local_cost + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1052 : set_list_kl, &
1053 : iatom_start, iatom_end, jatom_start, jatom_end, &
1054 : katom_start, katom_end, latom_start, latom_end, &
1055 : particle_set, &
1056 : coeffs_set, coeffs_kind, &
1057 : is_assoc_atomic_block_global, do_periodic, &
1058 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1059 : cell, &
1060 : do_p_screening, map_atom_to_kind_atom, eval_type, &
1061 0 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1062 : END DO
1063 : END DO
1064 :
1065 0 : ALLOCATE (local_cost_vector(n_processes))
1066 0 : local_cost_vector = 0
1067 0 : local_cost_vector(my_process_id + 1) = local_cost
1068 0 : CALL para_env%sum(local_cost_vector)
1069 :
1070 0 : mean = SUM(local_cost_vector)/n_processes
1071 0 : maximum = MAXVAL(local_cost_vector)
1072 :
1073 0 : !$OMP BARRIER
1074 0 : !$OMP MASTER
1075 : ! only output once
1076 0 : IF (my_process_id == 0) THEN
1077 0 : CALL open_file(unit_number=unit_nr, file_name="loads.dat")
1078 0 : WRITE (unit_nr, *) 'maximum cost:', maximum
1079 0 : WRITE (unit_nr, *) 'mean cost:', mean
1080 0 : WRITE (unit_nr, *) 'load balance ratio max/mean: ', maximum/mean
1081 0 : WRITE (unit_nr, *) '-------- detailed per-process costs ---------'
1082 0 : DO i = 1, n_processes
1083 0 : WRITE (unit_nr, *) local_cost_vector(i)
1084 : END DO
1085 0 : CALL close_file(unit_nr)
1086 : END IF
1087 : !$OMP END MASTER
1088 0 : !$OMP BARRIER
1089 :
1090 0 : DEALLOCATE (local_cost_vector)
1091 0 : DEALLOCATE (p_atom_blocks, q_atom_blocks)
1092 0 : DEALLOCATE (blocksize, permute)
1093 :
1094 0 : !$OMP BARRIER
1095 0 : !$OMP MASTER
1096 0 : CALL timestop(handle)
1097 : !$OMP END MASTER
1098 0 : !$OMP BARRIER
1099 :
1100 0 : END SUBROUTINE hfx_recursive_load_balance
1101 :
1102 : ! **************************************************************************************************
1103 : !> \brief Small routine to calculate the optimal P-Q-processor grid distribution
1104 : !> for a given number of processors N
1105 : !> and the corresponding number of Bins for the load balancing routine
1106 : !> \param p number of rows on P-Q process grid (output)
1107 : !> \param q number of columns on P-Q process grid (output)
1108 : !> \param nBins number of Bins (output)
1109 : !> \param N number of processes (input)
1110 : !> \par History
1111 : !> 03.2011 created [Michael Steinlechner]
1112 : !> \author Michael Steinlechner
1113 : ! **************************************************************************************************
1114 0 : SUBROUTINE hfx_calculate_PQ(p, q, nBins, N)
1115 :
1116 : INTEGER, INTENT(OUT) :: p, q, nBins
1117 : INTEGER, INTENT(IN) :: N
1118 :
1119 : INTEGER :: a, b, k
1120 : REAL(dp) :: sqN
1121 :
1122 0 : k = 2
1123 0 : sqN = SQRT(REAL(N, KIND=dp))
1124 0 : p = 1
1125 :
1126 0 : DO WHILE (REAL(k, KIND=dp) <= sqN)
1127 0 : IF (MODULO(N, k) == 0) THEN
1128 0 : p = k
1129 : END IF
1130 0 : k = k + 1
1131 : END DO
1132 0 : q = N/p
1133 :
1134 : ! now compute the least common multiple of p & q to get the number of necessary bins
1135 : ! compute using the relation LCM(p,q) = abs(p*q) / GCD(p,q)
1136 : ! and use euclid's algorithm for GCD computation.
1137 0 : a = p
1138 0 : b = q
1139 :
1140 0 : DO WHILE (b .NE. 0)
1141 0 : IF (a > b) THEN
1142 0 : a = a - b
1143 : ELSE
1144 0 : b = b - a
1145 : END IF
1146 : END DO
1147 : ! gcd(p,q) is now saved in a
1148 :
1149 0 : nBins = p*q/a
1150 :
1151 0 : END SUBROUTINE
1152 :
1153 : ! **************************************************************************************************
1154 : !> \brief Recursive permutation routine for the load balancing of the integral
1155 : !> computation
1156 : !> \param blocksize vector of blocksizes, size(nProc), which contains for
1157 : !> each process the local blocksize (OUTPUT)
1158 : !> \param blockstart starting row/column idx of the block which is to be examined
1159 : !> at this point (INPUT)
1160 : !> \param blockend ending row/column idx of the block which is to be examined
1161 : !> (INPUT)
1162 : !> \param nProc_in number of bins into which the current block has to be divided
1163 : !> (INPUT)
1164 : !> \param permute permutation vector which balances column/row cost
1165 : !> size(nblocks^2). (OUTPUT)
1166 : !> \param step ...
1167 : !> \param my_process_id ...
1168 : !> \param n_processes ...
1169 : !> \param nblocks ...
1170 : !> \param natom ...
1171 : !> \param nkind ...
1172 : !> \param list_ij ...
1173 : !> \param list_kl ...
1174 : !> \param set_list_ij ...
1175 : !> \param set_list_kl ...
1176 : !> \param particle_set ...
1177 : !> \param coeffs_set ...
1178 : !> \param coeffs_kind ...
1179 : !> \param is_assoc_atomic_block_global ...
1180 : !> \param do_periodic ...
1181 : !> \param kind_of ...
1182 : !> \param basis_parameter ...
1183 : !> \param pmax_set ...
1184 : !> \param pmax_atom ...
1185 : !> \param pmax_blocks ...
1186 : !> \param cell ...
1187 : !> \param x_data ...
1188 : !> \param para_env ...
1189 : !> \param pmax_block ...
1190 : !> \param do_p_screening ...
1191 : !> \param map_atom_to_kind_atom ...
1192 : !> \param eval_type ...
1193 : !> \param log10_eps_schwarz ...
1194 : !> \param log_2 ...
1195 : !> \param coeffs_kind_max0 ...
1196 : !> \param use_virial ...
1197 : !> \param atomic_pair_list ...
1198 : !> \par History
1199 : !> 03.2011 created [Michael Steinlechner]
1200 : !> \author Michael Steinlechner
1201 : ! **************************************************************************************************
1202 0 : RECURSIVE SUBROUTINE hfx_recursive_permute(blocksize, blockstart, blockend, nProc_in, &
1203 0 : permute, step, &
1204 : my_process_id, n_processes, nblocks, &
1205 : natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1206 : particle_set, &
1207 : coeffs_set, coeffs_kind, &
1208 0 : is_assoc_atomic_block_global, do_periodic, &
1209 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1210 : cell, x_data, para_env, pmax_block, &
1211 : do_p_screening, map_atom_to_kind_atom, eval_type, &
1212 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1213 :
1214 : INTEGER :: nProc_in, blockend, blockstart
1215 : INTEGER, DIMENSION(nProc_in) :: blocksize
1216 : INTEGER :: nblocks, n_processes, my_process_id
1217 : INTEGER, INTENT(IN) :: step
1218 : INTEGER, DIMENSION(nblocks*nblocks) :: permute
1219 : INTEGER :: natom
1220 : INTEGER, INTENT(IN) :: nkind
1221 : TYPE(pair_list_type) :: list_ij, list_kl
1222 : TYPE(pair_set_list_type), ALLOCATABLE, &
1223 : DIMENSION(:) :: set_list_ij, set_list_kl
1224 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
1225 : TYPE(hfx_screen_coeff_type), &
1226 : DIMENSION(:, :, :, :), POINTER :: coeffs_set
1227 : TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
1228 : POINTER :: coeffs_kind
1229 : INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
1230 : LOGICAL :: do_periodic
1231 : INTEGER :: kind_of(*)
1232 : TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
1233 : TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
1234 : REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
1235 : REAL(dp) :: pmax_blocks
1236 : TYPE(cell_type), POINTER :: cell
1237 : TYPE(hfx_type), POINTER :: x_data
1238 : TYPE(mp_para_env_type), INTENT(IN) :: para_env
1239 : REAL(dp), DIMENSION(:, :), POINTER :: pmax_block
1240 : LOGICAL, INTENT(IN) :: do_p_screening
1241 : INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
1242 : INTEGER, INTENT(IN) :: eval_type
1243 : REAL(dp) :: log10_eps_schwarz, log_2, &
1244 : coeffs_kind_max0
1245 : LOGICAL, INTENT(IN) :: use_virial
1246 : LOGICAL, DIMENSION(:, :), POINTER :: atomic_pair_list
1247 :
1248 : INTEGER :: col, endoffset, i, iatom_block, iatom_end, iatom_start, idx, inv_perm, &
1249 : jatom_block, jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, &
1250 : latom_end, latom_start, nbins, nProc, row, startoffset
1251 : INTEGER(int_8) :: atom_block, tmp_block
1252 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: ithblocksize, localblocksize
1253 0 : INTEGER, DIMENSION(blockend - blockstart + 1) :: bin_perm, tmp_perm
1254 : REAL(dp) :: partialcost
1255 0 : REAL(dp), DIMENSION(nblocks*nblocks) :: cost_vector
1256 :
1257 0 : nProc = nProc_in
1258 0 : cost_vector = 0.0_dp
1259 :
1260 : ! loop over local atom_blocks.
1261 0 : DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
1262 :
1263 : ! get corresponding 4D block indices
1264 0 : latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
1265 0 : tmp_block = atom_block/nblocks
1266 0 : katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
1267 0 : IF (latom_block < katom_block) CYCLE
1268 0 : tmp_block = tmp_block/nblocks
1269 0 : jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
1270 0 : tmp_block = tmp_block/nblocks
1271 0 : iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
1272 0 : IF (jatom_block < iatom_block) CYCLE
1273 :
1274 : ! get 2D indices of this atom_block (with permutation applied)
1275 : ! for this, we need to invert the permutation, this means
1276 : ! find position in permutation vector where value==idx
1277 :
1278 0 : row = (katom_block - 1)*nblocks + jatom_block
1279 0 : inv_perm = 1
1280 0 : DO WHILE (permute(inv_perm) .NE. row)
1281 0 : inv_perm = inv_perm + 1
1282 : END DO
1283 0 : row = inv_perm
1284 :
1285 0 : col = (iatom_block - 1)*nblocks + latom_block
1286 0 : inv_perm = 1
1287 0 : DO WHILE (permute(inv_perm) .NE. col)
1288 0 : inv_perm = inv_perm + 1
1289 : END DO
1290 0 : col = inv_perm
1291 :
1292 : ! if row/col outside our current diagonal block, skip calculation.
1293 0 : IF (col < blockstart .OR. col > blockend) CYCLE
1294 0 : IF (row < blockstart .OR. row > blockend) CYCLE
1295 :
1296 0 : iatom_start = x_data%blocks(iatom_block)%istart
1297 0 : iatom_end = x_data%blocks(iatom_block)%iend
1298 0 : jatom_start = x_data%blocks(jatom_block)%istart
1299 0 : jatom_end = x_data%blocks(jatom_block)%iend
1300 0 : katom_start = x_data%blocks(katom_block)%istart
1301 0 : katom_end = x_data%blocks(katom_block)%iend
1302 0 : latom_start = x_data%blocks(latom_block)%istart
1303 0 : latom_end = x_data%blocks(latom_block)%iend
1304 :
1305 : ! whatever.
1306 0 : SELECT CASE (eval_type)
1307 : CASE (hfx_do_eval_energy)
1308 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
1309 : pmax_block(latom_block, jatom_block), &
1310 : pmax_block(latom_block, iatom_block), &
1311 0 : pmax_block(katom_block, jatom_block))
1312 : CASE (hfx_do_eval_forces)
1313 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
1314 : pmax_block(latom_block, jatom_block), &
1315 : pmax_block(latom_block, iatom_block) + &
1316 0 : pmax_block(katom_block, jatom_block))
1317 : END SELECT
1318 :
1319 : ! screening.
1320 0 : IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
1321 :
1322 : ! every second recursion step, compute row sum instead of column sum
1323 :
1324 0 : IF (MODULO(step, 2) .EQ. 0) THEN
1325 : idx = row
1326 : ELSE
1327 0 : idx = col
1328 : END IF
1329 :
1330 : ! estimate the cost of this atom_block.
1331 : partialcost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1332 : set_list_kl, &
1333 : iatom_start, iatom_end, jatom_start, jatom_end, &
1334 : katom_start, katom_end, latom_start, latom_end, &
1335 : particle_set, &
1336 : coeffs_set, coeffs_kind, &
1337 : is_assoc_atomic_block_global, do_periodic, &
1338 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1339 : cell, &
1340 : do_p_screening, map_atom_to_kind_atom, eval_type, &
1341 0 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1342 :
1343 0 : cost_vector(idx) = cost_vector(idx) + partialcost
1344 : END DO ! atom_block
1345 :
1346 : ! sum costvector over all processes
1347 0 : CALL para_env%sum(cost_vector)
1348 :
1349 : ! calculate next prime factor of nProc
1350 0 : nBins = 2
1351 0 : DO WHILE (MODULO(INT(nProc), INT(nBins)) .NE. 0)
1352 0 : nBins = nBins + 1
1353 : END DO
1354 :
1355 0 : nProc = nProc/nBins
1356 :
1357 : ! ... do the binning...
1358 :
1359 0 : ALLOCATE (localblocksize(nBins))
1360 0 : CALL hfx_permute_binning(nBins, cost_vector(blockstart:blockend), blockend - blockstart + 1, bin_perm, localblocksize)
1361 :
1362 : !... and update the permutation vector
1363 :
1364 0 : tmp_perm = permute(blockstart:blockend)
1365 0 : permute(blockstart:blockend) = tmp_perm(bin_perm)
1366 :
1367 : ! split recursion into the nBins Bins
1368 0 : IF (nProc > 1) THEN
1369 0 : ALLOCATE (ithblocksize(nProc))
1370 0 : DO i = 1, nBins
1371 0 : startoffset = SUM(localblocksize(1:(i - 1)))
1372 0 : endoffset = SUM(localblocksize(1:i)) - 1
1373 :
1374 : CALL hfx_recursive_permute(ithblocksize, blockstart + startoffset, blockstart + endoffset, nProc, &
1375 : permute, step + 1, &
1376 : my_process_id, n_processes, nblocks, &
1377 : natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1378 : particle_set, &
1379 : coeffs_set, coeffs_kind, &
1380 : is_assoc_atomic_block_global, do_periodic, &
1381 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1382 : cell, x_data, para_env, pmax_block, &
1383 : do_p_screening, map_atom_to_kind_atom, eval_type, &
1384 0 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1385 0 : blocksize(((i - 1)*nProc + 1):(i*nProc)) = ithblocksize
1386 : END DO
1387 0 : DEALLOCATE (ithblocksize)
1388 : ELSE
1389 0 : DO i = 1, nBins
1390 0 : blocksize(i) = localblocksize(i)
1391 : END DO
1392 : END IF
1393 :
1394 0 : DEALLOCATE (localblocksize)
1395 :
1396 0 : END SUBROUTINE hfx_recursive_permute
1397 :
1398 : ! **************************************************************************************************
1399 : !> \brief small binning routine for the recursive load balancing
1400 : !>
1401 : !> \param nBins number of Bins (INPUT)
1402 : !> \param costvector vector of current row/column costs which have to be binned (INPUT)
1403 : !> \param maxbinsize upper bound for bin size (INPUT)
1404 : !> \param perm resulting permutation due to be binning routine (OUTPUT)
1405 : !> \param block_count vector of size(nbins) which contains the size of each bin (OUTPUT)
1406 : !> \par History
1407 : !> 03.2011 created [Michael Steinlechner]
1408 : !> \author Michael Steinlechner
1409 : ! **************************************************************************************************
1410 0 : SUBROUTINE hfx_permute_binning(nBins, costvector, maxbinsize, perm, block_count)
1411 :
1412 : INTEGER, INTENT(IN) :: nBins, maxbinsize
1413 : REAL(dp), DIMENSION(maxbinsize), INTENT(IN) :: costvector
1414 : INTEGER, DIMENSION(maxbinsize), INTENT(OUT) :: perm
1415 : INTEGER, DIMENSION(nBins), INTENT(OUT) :: block_count
1416 :
1417 : INTEGER :: i, j, mod_idx, offset
1418 0 : INTEGER, DIMENSION(nBins, maxbinsize) :: bin
1419 0 : INTEGER, DIMENSION(nBins) :: bin_idx
1420 0 : INTEGER, DIMENSION(maxbinsize) :: idx
1421 0 : REAL(dp), DIMENSION(maxbinsize) :: vec
1422 0 : REAL(dp), DIMENSION(nBins) :: bincosts
1423 :
1424 : ! be careful not to change costvector (copy it!)
1425 :
1426 0 : vec = costvector
1427 0 : block_count = 0
1428 0 : bincosts = 0
1429 :
1430 : !sort the array (ascending)
1431 0 : CALL sort(vec, maxbinsize, idx)
1432 :
1433 : ! count the loop down to distribute the largest cols/rows first
1434 0 : DO i = maxbinsize, 1, -1
1435 0 : IF (vec(i) == 0) THEN
1436 : ! spread zero-cost col/rows evenly among procs
1437 0 : mod_idx = MODULO(i, nBins) + 1 !(note the fortran offset by one!)
1438 0 : block_count(mod_idx) = block_count(mod_idx) + 1
1439 0 : bin(mod_idx, block_count(mod_idx)) = idx(i)
1440 : ELSE
1441 : ! sort the bins so that the one with the lowest cost is at the
1442 : ! first place, where we then assign the current col/row
1443 0 : CALL sort(bincosts, nBins, bin_idx)
1444 0 : block_count = block_count(bin_idx)
1445 0 : bin = bin(bin_idx, :)
1446 :
1447 0 : bincosts(1) = bincosts(1) + vec(i)
1448 0 : block_count(1) = block_count(1) + 1
1449 0 : bin(1, block_count(1)) = idx(i)
1450 : END IF
1451 : END DO
1452 :
1453 : ! construct permutation vector from the binning
1454 : offset = 0
1455 0 : DO i = 1, nBins
1456 0 : DO j = 1, block_count(i)
1457 0 : perm(offset + j) = bin(i, j)
1458 : END DO
1459 0 : offset = offset + block_count(i)
1460 : END DO
1461 :
1462 0 : END SUBROUTINE hfx_permute_binning
1463 :
1464 : ! **************************************************************************************************
1465 : !> \brief Cheap way of redistributing the eri's
1466 : !> \param x_data Object that stores the indices array
1467 : !> \param para_env para_env
1468 : !> \param load_balance_parameter contains parmameter for Monte-Carlo routines
1469 : !> \param i_thread current thread ID
1470 : !> \param n_threads Total Number of threads
1471 : !> \param eval_type ...
1472 : !> \par History
1473 : !> 12.2007 created [Manuel Guidon]
1474 : !> 02.2009 optimize Memory Usage [Manuel Guidon]
1475 : !> \author Manuel Guidon
1476 : !> \note
1477 : !> The cost matrix is given by the walltime for each bin that is measured
1478 : !> during the calculation
1479 : ! **************************************************************************************************
1480 1712 : SUBROUTINE hfx_update_load_balance(x_data, para_env, &
1481 : load_balance_parameter, &
1482 : i_thread, n_threads, eval_type)
1483 :
1484 : TYPE(hfx_type), POINTER :: x_data
1485 : TYPE(mp_para_env_type), INTENT(IN) :: para_env
1486 : TYPE(hfx_load_balance_type) :: load_balance_parameter
1487 : INTEGER, INTENT(IN) :: i_thread, n_threads, eval_type
1488 :
1489 : CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_update_load_balance'
1490 :
1491 : INTEGER :: data_from, dest, end_idx, handle, i, ibin, icpu, iprocess, j, mepos, my_bin_size, &
1492 : my_global_start_idx, my_process_id, n_processes, nbins, ncpu, source, start_idx
1493 5136 : TYPE(mp_request_type), DIMENSION(2) :: req
1494 1712 : INTEGER(int_8), DIMENSION(:), POINTER :: local_cost_matrix, recbuffer, &
1495 1712 : sendbuffer, swapbuffer
1496 : INTEGER(int_8), DIMENSION(:), POINTER, SAVE :: cost_matrix
1497 1712 : INTEGER, ALLOCATABLE, DIMENSION(:) :: tmp_pos
1498 : INTEGER, ALLOCATABLE, DIMENSION(:), SAVE :: bins_per_rank
1499 : INTEGER, ALLOCATABLE, DIMENSION(:, :), SAVE :: bin_histogram
1500 : INTEGER, DIMENSION(:), POINTER, SAVE :: shm_distribution_vector
1501 : INTEGER, SAVE :: max_bin_size
1502 3424 : TYPE(hfx_distribution), DIMENSION(:), POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist
1503 : TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
1504 : SAVE :: full_dist
1505 :
1506 1712 : !$OMP BARRIER
1507 3424 : !$OMP MASTER
1508 1712 : CALL timeset(routineN, handle)
1509 : !$OMP END MASTER
1510 1712 : !$OMP BARRIER
1511 :
1512 1712 : ncpu = para_env%num_pe
1513 1712 : n_processes = ncpu*n_threads
1514 : !! If there is only 1 cpu skip the binning
1515 1712 : IF (n_processes == 1) THEN
1516 0 : ALLOCATE (tmp_dist(1))
1517 0 : tmp_dist(1)%number_of_atom_quartets = HUGE(tmp_dist(1)%number_of_atom_quartets)
1518 0 : tmp_dist(1)%istart = 0_int_8
1519 0 : ptr_to_tmp_dist => tmp_dist(:)
1520 0 : SELECT CASE (eval_type)
1521 : CASE (hfx_do_eval_energy)
1522 0 : CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
1523 : CASE (hfx_do_eval_forces)
1524 0 : CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
1525 : END SELECT
1526 0 : DEALLOCATE (tmp_dist)
1527 : ELSE
1528 1712 : mepos = para_env%mepos
1529 1712 : my_process_id = para_env%mepos*n_threads + i_thread
1530 1712 : nbins = load_balance_parameter%nbins
1531 1712 : !$OMP MASTER
1532 5136 : ALLOCATE (bin_histogram(n_processes, 2))
1533 11984 : bin_histogram = 0
1534 : !$OMP END MASTER
1535 1712 : !$OMP BARRIER
1536 2622 : SELECT CASE (eval_type)
1537 : CASE (hfx_do_eval_energy)
1538 910 : my_bin_size = SIZE(x_data%distribution_energy)
1539 : CASE (hfx_do_eval_forces)
1540 1712 : my_bin_size = SIZE(x_data%distribution_forces)
1541 : END SELECT
1542 1712 : bin_histogram(my_process_id + 1, 1) = my_bin_size
1543 1712 : !$OMP BARRIER
1544 1712 : !$OMP MASTER
1545 1712 : CALL para_env%sum(bin_histogram(:, 1))
1546 1712 : bin_histogram(1, 2) = bin_histogram(1, 1)
1547 3424 : DO iprocess = 2, n_processes
1548 3424 : bin_histogram(iprocess, 2) = bin_histogram(iprocess - 1, 2) + bin_histogram(iprocess, 1)
1549 : END DO
1550 :
1551 3424 : max_bin_size = MAXVAL(bin_histogram(para_env%mepos*n_threads + 1:para_env%mepos*n_threads + n_threads, 1))
1552 1712 : CALL para_env%max(max_bin_size)
1553 : !$OMP END MASTER
1554 1712 : !$OMP BARRIER
1555 114698 : ALLOCATE (binned_dist(my_bin_size))
1556 : !! Use old binned_dist, but with timings cost
1557 910 : SELECT CASE (eval_type)
1558 : CASE (hfx_do_eval_energy)
1559 117390 : binned_dist = x_data%distribution_energy
1560 : CASE (hfx_do_eval_forces)
1561 104368 : binned_dist = x_data%distribution_forces
1562 : END SELECT
1563 :
1564 111280 : DO ibin = 1, my_bin_size
1565 111280 : IF (binned_dist(ibin)%number_of_atom_quartets == 0) THEN
1566 94384 : binned_dist(ibin)%cost = 0
1567 : ELSE
1568 8808 : SELECT CASE (eval_type)
1569 : CASE (hfx_do_eval_energy)
1570 8808 : IF (.NOT. load_balance_parameter%rtp_redistribute) THEN
1571 : binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_first_scf + &
1572 8808 : binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
1573 : ELSE
1574 0 : binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
1575 : END IF
1576 : CASE (hfx_do_eval_forces)
1577 15184 : binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_forces)*10000.0_dp, int_8)
1578 : END SELECT
1579 : END IF
1580 : END DO
1581 1712 : !$OMP BARRIER
1582 1712 : !$OMP MASTER
1583 : !! store all local results in a big cost matrix
1584 5136 : ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
1585 220848 : cost_matrix = 0
1586 5136 : ALLOCATE (sendbuffer(max_bin_size*n_threads))
1587 5136 : ALLOCATE (recbuffer(max_bin_size*n_threads))
1588 : !$OMP END MASTER
1589 1712 : !$OMP BARRIER
1590 1712 : my_global_start_idx = bin_histogram(my_process_id + 1, 2) - my_bin_size
1591 1712 : icpu = para_env%mepos + 1
1592 111280 : DO i = 1, my_bin_size
1593 111280 : cost_matrix(my_global_start_idx + i) = binned_dist(i)%cost
1594 : END DO
1595 :
1596 1712 : mepos = para_env%mepos
1597 1712 : !$OMP BARRIER
1598 1712 : !$OMP MASTER
1599 5136 : ALLOCATE (bins_per_rank(ncpu))
1600 5136 : bins_per_rank = 0
1601 5136 : DO icpu = 1, ncpu
1602 8560 : bins_per_rank(icpu) = SUM(bin_histogram((icpu - 1)*n_threads + 1:(icpu - 1)*n_threads + n_threads, 1))
1603 : END DO
1604 : sendbuffer(1:bins_per_rank(para_env%mepos + 1)) = &
1605 220848 : cost_matrix(my_global_start_idx + 1:my_global_start_idx + bins_per_rank(para_env%mepos + 1))
1606 :
1607 1712 : dest = MODULO(mepos + 1, ncpu)
1608 1712 : source = MODULO(mepos - 1, ncpu)
1609 : ! sync before/after ring of isendrecv
1610 1712 : CALL para_env%sync()
1611 5136 : DO icpu = 0, ncpu - 1
1612 3424 : IF (icpu .NE. ncpu - 1) THEN
1613 : CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
1614 1712 : req(1), req(2), 13)
1615 : END IF
1616 3424 : data_from = MODULO(mepos - icpu, ncpu)
1617 8560 : start_idx = SUM(bins_per_rank(1:data_from + 1)) - bins_per_rank(data_from + 1) + 1
1618 3424 : end_idx = start_idx + bins_per_rank(data_from + 1) - 1
1619 441696 : cost_matrix(start_idx:end_idx) = sendbuffer(1:end_idx - start_idx + 1)
1620 :
1621 3424 : IF (icpu .NE. ncpu - 1) THEN
1622 1712 : CALL mp_waitall(req)
1623 : END IF
1624 3424 : swapbuffer => sendbuffer
1625 3424 : sendbuffer => recbuffer
1626 5136 : recbuffer => swapbuffer
1627 : END DO
1628 1712 : DEALLOCATE (recbuffer, sendbuffer)
1629 : ! sync before/after ring of isendrecv
1630 1712 : CALL para_env%sync()
1631 : !$OMP END MASTER
1632 1712 : !$OMP BARRIER
1633 5136 : ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
1634 439984 : local_cost_matrix = cost_matrix
1635 1712 : !$OMP MASTER
1636 5136 : ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
1637 : CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
1638 1712 : shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
1639 :
1640 620168 : ALLOCATE (full_dist(ncpu*n_threads, max_bin_size))
1641 :
1642 615032 : full_dist(:, :)%istart = 0_int_8
1643 615032 : full_dist(:, :)%number_of_atom_quartets = 0_int_8
1644 615032 : full_dist(:, :)%cost = 0_int_8
1645 615032 : full_dist(:, :)%time_first_scf = 0.0_dp
1646 615032 : full_dist(:, :)%time_other_scf = 0.0_dp
1647 615032 : full_dist(:, :)%time_forces = 0.0_dp
1648 : !$OMP END MASTER
1649 :
1650 1712 : !$OMP BARRIER
1651 1712 : mepos = para_env%mepos + 1
1652 220848 : full_dist((mepos - 1)*n_threads + i_thread + 1, 1:my_bin_size) = binned_dist(1:my_bin_size)
1653 1712 : !$OMP BARRIER
1654 1712 : !$OMP MASTER
1655 5136 : ALLOCATE (sendbuffer(3*max_bin_size*n_threads))
1656 5136 : ALLOCATE (recbuffer(3*max_bin_size*n_threads))
1657 1712 : mepos = para_env%mepos
1658 3424 : DO j = 1, n_threads
1659 207864 : DO i = 1, max_bin_size
1660 204440 : sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
1661 204440 : sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
1662 206152 : sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
1663 : END DO
1664 : END DO
1665 1712 : dest = MODULO(mepos + 1, ncpu)
1666 1712 : source = MODULO(mepos - 1, ncpu)
1667 : ! sync before/after ring of isendrecv
1668 1712 : CALL para_env%sync()
1669 5136 : DO icpu = 0, ncpu - 1
1670 3424 : IF (icpu .NE. ncpu - 1) THEN
1671 : CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
1672 1712 : req(1), req(2), 13)
1673 : END IF
1674 3424 : data_from = MODULO(mepos - icpu, ncpu)
1675 6848 : DO j = 1, n_threads
1676 415728 : DO i = 1, max_bin_size
1677 408880 : full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1)
1678 408880 : full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2)
1679 412304 : full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3)
1680 : END DO
1681 : END DO
1682 :
1683 3424 : IF (icpu .NE. ncpu - 1) THEN
1684 1712 : CALL mp_waitall(req)
1685 : END IF
1686 3424 : swapbuffer => sendbuffer
1687 3424 : sendbuffer => recbuffer
1688 5136 : recbuffer => swapbuffer
1689 : END DO
1690 : ! sync before/after ring of isendrecv
1691 1712 : DEALLOCATE (recbuffer, sendbuffer)
1692 1712 : CALL para_env%sync()
1693 : !$OMP END MASTER
1694 1712 : !$OMP BARRIER
1695 : !! reorder the distribution according to the distribution vector
1696 5136 : ALLOCATE (tmp_pos(ncpu*n_threads))
1697 5136 : tmp_pos = 1
1698 224272 : ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
1699 :
1700 220848 : tmp_dist(:)%istart = 0_int_8
1701 220848 : tmp_dist(:)%number_of_atom_quartets = 0_int_8
1702 220848 : tmp_dist(:)%cost = 0_int_8
1703 220848 : tmp_dist(:)%time_first_scf = 0.0_dp
1704 220848 : tmp_dist(:)%time_other_scf = 0.0_dp
1705 220848 : tmp_dist(:)%time_forces = 0.0_dp
1706 :
1707 5136 : mepos = my_process_id + 1
1708 5136 : DO icpu = 1, n_processes
1709 224272 : DO i = 1, bin_histogram(icpu, 1)
1710 222560 : IF (shm_distribution_vector(bin_histogram(icpu, 2) - bin_histogram(icpu, 1) + i) == mepos) THEN
1711 109568 : tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
1712 109568 : tmp_pos(mepos) = tmp_pos(mepos) + 1
1713 : END IF
1714 : END DO
1715 : END DO
1716 :
1717 : !! Assign the load to each process
1718 : NULLIFY (ptr_to_tmp_dist)
1719 1712 : mepos = my_process_id + 1
1720 1712 : ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
1721 910 : SELECT CASE (eval_type)
1722 : CASE (hfx_do_eval_energy)
1723 910 : CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
1724 : CASE (hfx_do_eval_forces)
1725 1712 : CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
1726 : END SELECT
1727 :
1728 1712 : !$OMP BARRIER
1729 1712 : !$OMP MASTER
1730 1712 : DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
1731 1712 : DEALLOCATE (bins_per_rank, bin_histogram)
1732 : !$OMP END MASTER
1733 1712 : !$OMP BARRIER
1734 1712 : DEALLOCATE (tmp_dist, tmp_pos)
1735 1712 : DEALLOCATE (binned_dist, local_cost_matrix)
1736 : END IF
1737 1712 : !$OMP BARRIER
1738 1712 : !$OMP MASTER
1739 1712 : CALL timestop(handle)
1740 : !$OMP END MASTER
1741 1712 : !$OMP BARRIER
1742 :
1743 3424 : END SUBROUTINE hfx_update_load_balance
1744 :
1745 : ! **************************************************************************************************
1746 : !> \brief estimates the cost of a set quartet with info available at load balance time
1747 : !> i.e. without much info on the primitives primitives
1748 : !> \param nsa ...
1749 : !> \param nsb ...
1750 : !> \param nsc ...
1751 : !> \param nsd ...
1752 : !> \param npgfa ...
1753 : !> \param npgfb ...
1754 : !> \param npgfc ...
1755 : !> \param npgfd ...
1756 : !> \param ratio ...
1757 : !> \param p1 ...
1758 : !> \param p2 ...
1759 : !> \param p3 ...
1760 : !> \return ...
1761 : !> \par History
1762 : !> 08.2009 created Joost VandeVondele
1763 : !> \author Joost VandeVondele
1764 : ! **************************************************************************************************
1765 8831621 : FUNCTION cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3) RESULT(res)
1766 : IMPLICIT NONE
1767 : REAL(KIND=dp) :: estimate1, estimate2, estimate, ratio, switch, mu, sigma
1768 : INTEGER(KIND=int_8) :: res
1769 : REAL(KIND=dp), INTENT(IN) :: p1(12), p2(12), p3(2)
1770 :
1771 : INTEGER :: nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd
1772 :
1773 8831621 : estimate1 = estimate_basic(p1)
1774 8831621 : estimate2 = estimate_basic(p2)
1775 8831621 : mu = LOG(ABS(1.0E6_dp*p3(1)) + 1)
1776 8831621 : sigma = p3(2)*0.1_dp*mu
1777 8831621 : switch = 1.0_dp/(1.0_dp + EXP((LOG(estimate1) - mu)/sigma))
1778 8831621 : estimate = estimate1*(1.0_dp - switch) + estimate2*switch
1779 8831621 : res = INT(estimate*0.001_dp, KIND=int_8) + 1
1780 :
1781 : CONTAINS
1782 :
1783 : ! **************************************************************************************************
1784 : !> \brief ...
1785 : !> \param p ...
1786 : !> \return ...
1787 : ! **************************************************************************************************
1788 17663242 : REAL(KIND=dp) FUNCTION estimate_basic(p) RESULT(res)
1789 : REAL(KIND=dp) :: p(12)
1790 :
1791 : REAL(KIND=dp) :: p1, p10, p11, p12, p2, p3, p4, p5, p6, &
1792 : p7, p8, p9
1793 :
1794 17663242 : p1 = p(1); p2 = p(2); p3 = p(3); p4 = p(4)
1795 17663242 : p5 = p(5); p6 = p(6); p7 = p(7); p8 = p(8)
1796 17663242 : p9 = p(9); p10 = p(10); p11 = p(11); p12 = p(12)
1797 : res = poly2(nsa, p1, p2, p3)*poly2(nsb, p1, p2, p3)*poly2(nsc, p1, p2, p3)*poly2(nsd, p1, p2, p3)* &
1798 : poly2(npgfa, p4, p5, p6)*poly2(npgfb, p4, p5, p6)*poly2(npgfc, p4, p5, p6)* &
1799 : poly2(npgfd, p4, p5, p6)*EXP(-p7*ratio + p8*ratio**2) + &
1800 17663242 : 1000.0_dp*p9 + poly2(nsa, p10, p11, p12)*poly2(nsb, p10, p11, p12)*poly2(nsc, p10, p11, p12)*poly2(nsd, p10, p11, p12)
1801 17663242 : res = 1 + ABS(res)
1802 17663242 : END FUNCTION estimate_basic
1803 :
1804 : ! **************************************************************************************************
1805 : !> \brief ...
1806 : !> \param x ...
1807 : !> \param a0 ...
1808 : !> \param a1 ...
1809 : !> \param a2 ...
1810 : !> \return ...
1811 : ! **************************************************************************************************
1812 211958904 : REAL(KIND=dp) FUNCTION poly2(x, a0, a1, a2)
1813 : INTEGER :: x
1814 : REAL(KIND=dp) :: a0, a1, a2
1815 :
1816 211958904 : poly2 = a0 + a1*x + a2*x*x
1817 211958904 : END FUNCTION poly2
1818 :
1819 : END FUNCTION cost_model
1820 : ! **************************************************************************************************
1821 : !> \brief Minimizes the maximum cost per cpu by shuffling around all bins
1822 : !> \param total_number_of_bins ...
1823 : !> \param number_of_processes ...
1824 : !> \param bin_costs costs per bin
1825 : !> \param distribution_vector will contain the final distribution
1826 : !> \param do_randomize ...
1827 : !> \par History
1828 : !> 03.2009 created from a hack by Joost [Manuel Guidon]
1829 : !> \author Manuel Guidon
1830 : ! **************************************************************************************************
1831 3442 : SUBROUTINE optimize_distribution(total_number_of_bins, number_of_processes, bin_costs, &
1832 : distribution_vector, do_randomize)
1833 : INTEGER :: total_number_of_bins, number_of_processes
1834 : INTEGER(int_8), DIMENSION(:), POINTER :: bin_costs
1835 : INTEGER, DIMENSION(:), POINTER :: distribution_vector
1836 : LOGICAL, INTENT(IN) :: do_randomize
1837 :
1838 : INTEGER :: i, itmp, j, nstep
1839 3442 : INTEGER(int_8), DIMENSION(:), POINTER :: my_cost_cpu, tmp_cost, tmp_cpu_cost
1840 : INTEGER, DIMENSION(:), POINTER :: tmp_cpu_index, tmp_index
1841 3442 : TYPE(rng_stream_type), ALLOCATABLE :: rng_stream
1842 :
1843 3442 : nstep = MAX(1, INT(number_of_processes)/2)
1844 :
1845 10326 : ALLOCATE (tmp_cost(total_number_of_bins))
1846 10326 : ALLOCATE (tmp_index(total_number_of_bins))
1847 10326 : ALLOCATE (tmp_cpu_cost(number_of_processes))
1848 10326 : ALLOCATE (tmp_cpu_index(number_of_processes))
1849 10326 : ALLOCATE (my_cost_cpu(number_of_processes))
1850 888036 : tmp_cost = bin_costs
1851 :
1852 3442 : CALL sort(tmp_cost, total_number_of_bins, tmp_index)
1853 10326 : my_cost_cpu = 0
1854 : !
1855 : ! assign the largest remaining bin to the CPU with the smallest load
1856 : ! gives near perfect distributions for a sufficient number of bins ...
1857 : ! doing this in chunks of nstep (where nstep ~ number_of_processes) makes this n log n and gives
1858 : ! each cpu a similar number of tasks.
1859 : ! it also avoids degenerate cases where thousands of zero sized tasks
1860 : ! are assigned to the same (least loaded) cpu
1861 : !
1862 3442 : IF (do_randomize) &
1863 : rng_stream = rng_stream_type(name="uniform_rng", &
1864 0 : distribution_type=UNIFORM)
1865 :
1866 444018 : DO i = total_number_of_bins, 1, -nstep
1867 2643456 : tmp_cpu_cost = my_cost_cpu
1868 440576 : CALL sort(tmp_cpu_cost, INT(number_of_processes), tmp_cpu_index)
1869 440576 : IF (do_randomize) THEN
1870 0 : CALL rng_stream%shuffle(tmp_cpu_index(1:MIN(i, nstep)))
1871 : END IF
1872 884594 : DO j = 1, MIN(i, nstep)
1873 440576 : itmp = tmp_cpu_index(j)
1874 440576 : distribution_vector(tmp_index(i - j + 1)) = itmp
1875 881152 : my_cost_cpu(itmp) = my_cost_cpu(itmp) + bin_costs(tmp_index(i - j + 1))
1876 : END DO
1877 : END DO
1878 :
1879 3442 : DEALLOCATE (tmp_cost, tmp_index, tmp_cpu_cost)
1880 3442 : DEALLOCATE (tmp_cpu_index, my_cost_cpu)
1881 6884 : END SUBROUTINE optimize_distribution
1882 :
1883 : ! **************************************************************************************************
1884 : !> \brief Given a 2d index pair, this function returns a 1d index pair for
1885 : !> a symmetric upper triangle NxN matrix
1886 : !> The compiler should inline this function, therefore it appears in
1887 : !> several modules
1888 : !> \param i 2d index
1889 : !> \param j 2d index
1890 : !> \param N matrix size
1891 : !> \return ...
1892 : !> \par History
1893 : !> 03.2009 created [Manuel Guidon]
1894 : !> \author Manuel Guidon
1895 : ! **************************************************************************************************
1896 64808 : PURE FUNCTION get_1D_idx(i, j, N)
1897 : INTEGER, INTENT(IN) :: i, j
1898 : INTEGER(int_8), INTENT(IN) :: N
1899 : INTEGER(int_8) :: get_1D_idx
1900 :
1901 : INTEGER(int_8) :: min_ij
1902 :
1903 64808 : min_ij = MIN(i, j)
1904 64808 : get_1D_idx = min_ij*N + MAX(i, j) - (min_ij - 1)*min_ij/2 - N
1905 :
1906 64808 : END FUNCTION get_1D_idx
1907 :
1908 : ! **************************************************************************************************
1909 : !> \brief ...
1910 : !> \param natom ...
1911 : !> \param nkind ...
1912 : !> \param list_ij ...
1913 : !> \param list_kl ...
1914 : !> \param set_list_ij ...
1915 : !> \param set_list_kl ...
1916 : !> \param iatom_start ...
1917 : !> \param iatom_end ...
1918 : !> \param jatom_start ...
1919 : !> \param jatom_end ...
1920 : !> \param katom_start ...
1921 : !> \param katom_end ...
1922 : !> \param latom_start ...
1923 : !> \param latom_end ...
1924 : !> \param particle_set ...
1925 : !> \param coeffs_set ...
1926 : !> \param coeffs_kind ...
1927 : !> \param is_assoc_atomic_block_global ...
1928 : !> \param do_periodic ...
1929 : !> \param kind_of ...
1930 : !> \param basis_parameter ...
1931 : !> \param pmax_set ...
1932 : !> \param pmax_atom ...
1933 : !> \param pmax_blocks ...
1934 : !> \param cell ...
1935 : !> \param do_p_screening ...
1936 : !> \param map_atom_to_kind_atom ...
1937 : !> \param eval_type ...
1938 : !> \param log10_eps_schwarz ...
1939 : !> \param log_2 ...
1940 : !> \param coeffs_kind_max0 ...
1941 : !> \param use_virial ...
1942 : !> \param atomic_pair_list ...
1943 : !> \return ...
1944 : ! **************************************************************************************************
1945 303066 : FUNCTION estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1946 : iatom_start, iatom_end, jatom_start, jatom_end, &
1947 : katom_start, katom_end, latom_start, latom_end, &
1948 : particle_set, &
1949 303066 : coeffs_set, coeffs_kind, &
1950 303066 : is_assoc_atomic_block_global, do_periodic, &
1951 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1952 : cell, &
1953 : do_p_screening, map_atom_to_kind_atom, eval_type, &
1954 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
1955 303066 : atomic_pair_list)
1956 :
1957 : INTEGER, INTENT(IN) :: natom, nkind
1958 : TYPE(pair_list_type) :: list_ij, list_kl
1959 : TYPE(pair_set_list_type), DIMENSION(:) :: set_list_ij, set_list_kl
1960 : INTEGER, INTENT(IN) :: iatom_start, iatom_end, jatom_start, &
1961 : jatom_end, katom_start, katom_end, &
1962 : latom_start, latom_end
1963 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
1964 : TYPE(hfx_screen_coeff_type), &
1965 : DIMENSION(:, :, :, :), POINTER :: coeffs_set
1966 : TYPE(hfx_screen_coeff_type), &
1967 : DIMENSION(nkind, nkind) :: coeffs_kind
1968 : INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
1969 : LOGICAL :: do_periodic
1970 : INTEGER :: kind_of(*)
1971 : TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
1972 : TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
1973 : REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
1974 : REAL(dp) :: pmax_blocks
1975 : TYPE(cell_type), POINTER :: cell
1976 : LOGICAL, INTENT(IN) :: do_p_screening
1977 : INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
1978 : INTEGER, INTENT(IN) :: eval_type
1979 : REAL(dp) :: log10_eps_schwarz, log_2, &
1980 : coeffs_kind_max0
1981 : LOGICAL, INTENT(IN) :: use_virial
1982 : LOGICAL, DIMENSION(natom, natom) :: atomic_pair_list
1983 : INTEGER(int_8) :: estimate_block_cost
1984 :
1985 : INTEGER :: i_list_ij, i_list_kl, i_set_list_ij, i_set_list_ij_start, i_set_list_ij_stop, &
1986 : i_set_list_kl, i_set_list_kl_start, i_set_list_kl_stop, iatom, ikind, iset, jatom, jkind, &
1987 : jset, katom, kind_kind_idx, kkind, kset, latom, lkind, lset, swap_id
1988 303066 : INTEGER, DIMENSION(:), POINTER :: npgfa, npgfb, npgfc, npgfd, nsgfa, &
1989 303066 : nsgfb, nsgfc, nsgfd
1990 : REAL(dp) :: actual_pmax_atom, cost_tmp, max_val1, &
1991 : max_val2, pmax_entry, rab2, rcd2, &
1992 : screen_kind_ij, screen_kind_kl
1993 303066 : REAL(dp), DIMENSION(:, :), POINTER :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
1994 :
1995 303066 : estimate_block_cost = 0_int_8
1996 :
1997 : CALL build_pair_list(natom, list_ij, set_list_ij, iatom_start, iatom_end, jatom_start, jatom_end, &
1998 : kind_of, basis_parameter, particle_set, &
1999 : do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2000 303066 : log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2001 :
2002 : CALL build_pair_list(natom, list_kl, set_list_kl, katom_start, katom_end, latom_start, latom_end, &
2003 : kind_of, basis_parameter, particle_set, &
2004 : do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2005 303066 : log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2006 :
2007 575868 : DO i_list_ij = 1, list_ij%n_element
2008 272802 : iatom = list_ij%elements(i_list_ij)%pair(1)
2009 272802 : jatom = list_ij%elements(i_list_ij)%pair(2)
2010 272802 : i_set_list_ij_start = list_ij%elements(i_list_ij)%set_bounds(1)
2011 272802 : i_set_list_ij_stop = list_ij%elements(i_list_ij)%set_bounds(2)
2012 272802 : ikind = list_ij%elements(i_list_ij)%kind_pair(1)
2013 272802 : jkind = list_ij%elements(i_list_ij)%kind_pair(2)
2014 272802 : rab2 = list_ij%elements(i_list_ij)%dist2
2015 :
2016 272802 : nsgfa => basis_parameter(ikind)%nsgf
2017 272802 : nsgfb => basis_parameter(jkind)%nsgf
2018 272802 : npgfa => basis_parameter(ikind)%npgf
2019 272802 : npgfb => basis_parameter(jkind)%npgf
2020 :
2021 830352 : DO i_list_kl = 1, list_kl%n_element
2022 :
2023 254484 : katom = list_kl%elements(i_list_kl)%pair(1)
2024 254484 : latom = list_kl%elements(i_list_kl)%pair(2)
2025 :
2026 254484 : IF (.NOT. (katom + latom <= iatom + jatom)) CYCLE
2027 140198 : IF (((iatom + jatom) .EQ. (katom + latom)) .AND. (katom < iatom)) CYCLE
2028 :
2029 134582 : IF (eval_type == hfx_do_eval_forces) THEN
2030 14558 : IF (.NOT. use_virial) THEN
2031 13176 : IF ((iatom == jatom .AND. iatom == katom .AND. iatom == latom)) CYCLE
2032 : END IF
2033 : END IF
2034 :
2035 133266 : i_set_list_kl_start = list_kl%elements(i_list_kl)%set_bounds(1)
2036 133266 : i_set_list_kl_stop = list_kl%elements(i_list_kl)%set_bounds(2)
2037 133266 : kkind = list_kl%elements(i_list_kl)%kind_pair(1)
2038 133266 : lkind = list_kl%elements(i_list_kl)%kind_pair(2)
2039 133266 : rcd2 = list_kl%elements(i_list_kl)%dist2
2040 :
2041 133266 : nsgfc => basis_parameter(kkind)%nsgf
2042 133266 : nsgfd => basis_parameter(lkind)%nsgf
2043 133266 : npgfc => basis_parameter(kkind)%npgf
2044 133266 : npgfd => basis_parameter(lkind)%npgf
2045 :
2046 133266 : IF (do_p_screening) THEN
2047 : actual_pmax_atom = MAX(pmax_atom(katom, iatom), &
2048 : pmax_atom(latom, jatom), &
2049 : pmax_atom(latom, iatom), &
2050 16562 : pmax_atom(katom, jatom))
2051 : ELSE
2052 : actual_pmax_atom = 0.0_dp
2053 : END IF
2054 :
2055 : screen_kind_ij = coeffs_kind(jkind, ikind)%x(1)*rab2 + &
2056 133266 : coeffs_kind(jkind, ikind)%x(2)
2057 : screen_kind_kl = coeffs_kind(lkind, kkind)%x(1)*rcd2 + &
2058 133266 : coeffs_kind(lkind, kkind)%x(2)
2059 133266 : IF (screen_kind_ij + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) CYCLE
2060 :
2061 126088 : IF (.NOT. (is_assoc_atomic_block_global(latom, iatom) >= 1 .AND. &
2062 : is_assoc_atomic_block_global(katom, iatom) >= 1 .AND. &
2063 : is_assoc_atomic_block_global(katom, jatom) >= 1 .AND. &
2064 : is_assoc_atomic_block_global(latom, jatom) >= 1)) CYCLE
2065 :
2066 122904 : IF (do_p_screening) THEN
2067 6378 : SELECT CASE (eval_type)
2068 : CASE (hfx_do_eval_energy)
2069 6378 : swap_id = 0
2070 6378 : kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8)))
2071 6378 : IF (ikind >= kkind) THEN
2072 : ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2073 : map_atom_to_kind_atom(katom), &
2074 5746 : map_atom_to_kind_atom(iatom))
2075 : ELSE
2076 : ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2077 : map_atom_to_kind_atom(iatom), &
2078 632 : map_atom_to_kind_atom(katom))
2079 632 : swap_id = swap_id + 1
2080 : END IF
2081 6378 : kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8)))
2082 6378 : IF (jkind >= lkind) THEN
2083 : ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2084 : map_atom_to_kind_atom(latom), &
2085 5978 : map_atom_to_kind_atom(jatom))
2086 : ELSE
2087 : ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2088 : map_atom_to_kind_atom(jatom), &
2089 400 : map_atom_to_kind_atom(latom))
2090 400 : swap_id = swap_id + 2
2091 : END IF
2092 6378 : kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8)))
2093 6378 : IF (ikind >= lkind) THEN
2094 : ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2095 : map_atom_to_kind_atom(latom), &
2096 3594 : map_atom_to_kind_atom(iatom))
2097 : ELSE
2098 : ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2099 : map_atom_to_kind_atom(iatom), &
2100 2784 : map_atom_to_kind_atom(latom))
2101 2784 : swap_id = swap_id + 4
2102 : END IF
2103 6378 : kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8)))
2104 6378 : IF (jkind >= kkind) THEN
2105 : ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2106 : map_atom_to_kind_atom(katom), &
2107 6348 : map_atom_to_kind_atom(jatom))
2108 : ELSE
2109 : ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2110 : map_atom_to_kind_atom(jatom), &
2111 30 : map_atom_to_kind_atom(katom))
2112 30 : swap_id = swap_id + 8
2113 : END IF
2114 : CASE (hfx_do_eval_forces)
2115 9824 : swap_id = 16
2116 9824 : kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8)))
2117 9824 : IF (ikind >= kkind) THEN
2118 : ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2119 : map_atom_to_kind_atom(katom), &
2120 9290 : map_atom_to_kind_atom(iatom))
2121 : ELSE
2122 : ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2123 : map_atom_to_kind_atom(iatom), &
2124 534 : map_atom_to_kind_atom(katom))
2125 534 : swap_id = swap_id + 1
2126 : END IF
2127 9824 : kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8)))
2128 9824 : IF (jkind >= lkind) THEN
2129 : ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2130 : map_atom_to_kind_atom(latom), &
2131 9824 : map_atom_to_kind_atom(jatom))
2132 : ELSE
2133 : ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2134 : map_atom_to_kind_atom(jatom), &
2135 0 : map_atom_to_kind_atom(latom))
2136 0 : swap_id = swap_id + 2
2137 : END IF
2138 9824 : kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8)))
2139 9824 : IF (ikind >= lkind) THEN
2140 : ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2141 : map_atom_to_kind_atom(latom), &
2142 8074 : map_atom_to_kind_atom(iatom))
2143 : ELSE
2144 : ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2145 : map_atom_to_kind_atom(iatom), &
2146 1750 : map_atom_to_kind_atom(latom))
2147 1750 : swap_id = swap_id + 4
2148 : END IF
2149 9824 : kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8)))
2150 26026 : IF (jkind >= kkind) THEN
2151 : ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2152 : map_atom_to_kind_atom(katom), &
2153 9824 : map_atom_to_kind_atom(jatom))
2154 : ELSE
2155 : ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2156 : map_atom_to_kind_atom(jatom), &
2157 0 : map_atom_to_kind_atom(katom))
2158 0 : swap_id = swap_id + 8
2159 : END IF
2160 : END SELECT
2161 : END IF
2162 :
2163 1080388 : DO i_set_list_ij = i_set_list_ij_start, i_set_list_ij_stop
2164 684682 : iset = set_list_ij(i_set_list_ij)%pair(1)
2165 684682 : jset = set_list_ij(i_set_list_ij)%pair(2)
2166 :
2167 : max_val1 = coeffs_set(jset, iset, jkind, ikind)%x(1)*rab2 + &
2168 684682 : coeffs_set(jset, iset, jkind, ikind)%x(2)
2169 :
2170 684682 : IF (max_val1 + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) CYCLE
2171 9928560 : DO i_set_list_kl = i_set_list_kl_start, i_set_list_kl_stop
2172 9005754 : kset = set_list_kl(i_set_list_kl)%pair(1)
2173 9005754 : lset = set_list_kl(i_set_list_kl)%pair(2)
2174 :
2175 : max_val2 = max_val1 + (coeffs_set(lset, kset, lkind, kkind)%x(1)*rcd2 + &
2176 9005754 : coeffs_set(lset, kset, lkind, kkind)%x(2))
2177 :
2178 9005754 : IF (max_val2 + actual_pmax_atom < log10_eps_schwarz) CYCLE
2179 8001728 : IF (do_p_screening) THEN
2180 : CALL get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, &
2181 : iset, jset, kset, lset, &
2182 995549 : pmax_entry, swap_id)
2183 995549 : IF (eval_type == hfx_do_eval_forces) THEN
2184 578154 : pmax_entry = log_2 + pmax_entry
2185 : END IF
2186 : ELSE
2187 7006179 : pmax_entry = 0.0_dp
2188 : END IF
2189 8001728 : max_val2 = max_val2 + pmax_entry
2190 8001728 : IF (max_val2 < log10_eps_schwarz) CYCLE
2191 684682 : SELECT CASE (eval_type)
2192 : CASE (hfx_do_eval_energy)
2193 : cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2194 : npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2195 : max_val2/log10_eps_schwarz, &
2196 6905536 : p1_energy, p2_energy, p3_energy)
2197 6905536 : estimate_block_cost = estimate_block_cost + INT(cost_tmp, KIND=int_8)
2198 : CASE (hfx_do_eval_forces)
2199 : cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2200 : npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2201 : max_val2/log10_eps_schwarz, &
2202 914300 : p1_forces, p2_forces, p3_forces)
2203 8734136 : estimate_block_cost = estimate_block_cost + INT(cost_tmp, KIND=int_8)
2204 : END SELECT
2205 : END DO ! i_set_list_kl
2206 : END DO ! i_set_list_ij
2207 : END DO ! i_list_kl
2208 : END DO ! i_list_ij
2209 :
2210 303066 : END FUNCTION estimate_block_cost
2211 :
2212 : ! **************************************************************************************************
2213 : !> \brief ...
2214 : !> \param nkind ...
2215 : !> \param para_env ...
2216 : !> \param natom ...
2217 : !> \param block_size ...
2218 : !> \param nblock ...
2219 : !> \param blocks ...
2220 : !> \param list_ij ...
2221 : !> \param list_kl ...
2222 : !> \param set_list_ij ...
2223 : !> \param set_list_kl ...
2224 : !> \param particle_set ...
2225 : !> \param coeffs_set ...
2226 : !> \param coeffs_kind ...
2227 : !> \param is_assoc_atomic_block_global ...
2228 : !> \param do_periodic ...
2229 : !> \param kind_of ...
2230 : !> \param basis_parameter ...
2231 : !> \param pmax_set ...
2232 : !> \param pmax_atom ...
2233 : !> \param pmax_blocks ...
2234 : !> \param cell ...
2235 : !> \param do_p_screening ...
2236 : !> \param map_atom_to_kind_atom ...
2237 : !> \param eval_type ...
2238 : !> \param log10_eps_schwarz ...
2239 : !> \param log_2 ...
2240 : !> \param coeffs_kind_max0 ...
2241 : !> \param use_virial ...
2242 : !> \param atomic_pair_list ...
2243 : ! **************************************************************************************************
2244 1216 : SUBROUTINE init_blocks(nkind, para_env, natom, block_size, nblock, blocks, &
2245 1216 : list_ij, list_kl, set_list_ij, set_list_kl, &
2246 : particle_set, &
2247 : coeffs_set, coeffs_kind, &
2248 1216 : is_assoc_atomic_block_global, do_periodic, &
2249 : kind_of, basis_parameter, pmax_set, pmax_atom, &
2250 : pmax_blocks, cell, &
2251 : do_p_screening, map_atom_to_kind_atom, eval_type, &
2252 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
2253 1216 : atomic_pair_list)
2254 :
2255 : INTEGER, INTENT(IN) :: nkind
2256 : TYPE(mp_para_env_type), INTENT(IN) :: para_env
2257 : INTEGER :: natom, block_size, nblock
2258 : TYPE(hfx_block_range_type), DIMENSION(1:nblock) :: blocks
2259 : TYPE(pair_list_type) :: list_ij, list_kl
2260 : TYPE(pair_set_list_type), DIMENSION(:) :: set_list_ij, set_list_kl
2261 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
2262 : TYPE(hfx_screen_coeff_type), &
2263 : DIMENSION(:, :, :, :), POINTER :: coeffs_set
2264 : TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
2265 : POINTER :: coeffs_kind
2266 : INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
2267 : LOGICAL :: do_periodic
2268 : INTEGER :: kind_of(*)
2269 : TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
2270 : TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
2271 : REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
2272 : REAL(dp) :: pmax_blocks
2273 : TYPE(cell_type), POINTER :: cell
2274 : LOGICAL, INTENT(IN) :: do_p_screening
2275 : INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
2276 : INTEGER, INTENT(IN) :: eval_type
2277 : REAL(dp) :: log10_eps_schwarz, log_2, &
2278 : coeffs_kind_max0
2279 : LOGICAL, INTENT(IN) :: use_virial
2280 : LOGICAL, DIMENSION(natom, natom) :: atomic_pair_list
2281 :
2282 : INTEGER :: atom_block, i, iatom_block, iatom_end, &
2283 : iatom_start, my_cpu_rank, ncpus
2284 :
2285 5046 : DO atom_block = 0, nblock - 1
2286 3830 : iatom_block = MODULO(atom_block, nblock) + 1
2287 3830 : iatom_start = (iatom_block - 1)*block_size + 1
2288 3830 : iatom_end = MIN(iatom_block*block_size, natom)
2289 3830 : blocks(atom_block + 1)%istart = iatom_start
2290 3830 : blocks(atom_block + 1)%iend = iatom_end
2291 5046 : blocks(atom_block + 1)%cost = 0_int_8
2292 : END DO
2293 :
2294 1216 : ncpus = para_env%num_pe
2295 1216 : my_cpu_rank = para_env%mepos
2296 5046 : DO i = 1, nblock
2297 3830 : IF (MODULO(i, ncpus) /= my_cpu_rank) THEN
2298 1894 : blocks(i)%istart = 0
2299 1894 : blocks(i)%iend = 0
2300 1894 : CYCLE
2301 : END IF
2302 1936 : iatom_start = blocks(i)%istart
2303 1936 : iatom_end = blocks(i)%iend
2304 : blocks(i)%cost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
2305 : iatom_start, iatom_end, iatom_start, iatom_end, &
2306 : iatom_start, iatom_end, iatom_start, iatom_end, &
2307 : particle_set, &
2308 : coeffs_set, coeffs_kind, &
2309 : is_assoc_atomic_block_global, do_periodic, &
2310 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
2311 : cell, &
2312 : do_p_screening, map_atom_to_kind_atom, eval_type, &
2313 3152 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
2314 :
2315 : END DO
2316 1216 : END SUBROUTINE init_blocks
2317 :
2318 : ! **************************************************************************************************
2319 : !> \brief ...
2320 : !> \param para_env ...
2321 : !> \param x_data ...
2322 : !> \param iw ...
2323 : !> \param n_threads ...
2324 : !> \param i_thread ...
2325 : !> \param eval_type ...
2326 : ! **************************************************************************************************
2327 0 : SUBROUTINE collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, &
2328 : eval_type)
2329 :
2330 : TYPE(mp_para_env_type), INTENT(IN) :: para_env
2331 : TYPE(hfx_type), POINTER :: x_data
2332 : INTEGER, INTENT(IN) :: iw, n_threads, i_thread, eval_type
2333 :
2334 : INTEGER :: i, j, k, my_rank, nbins, nranks, &
2335 : total_bins
2336 : INTEGER(int_8) :: avg_bin, avg_rank, max_bin, max_rank, &
2337 : min_bin, min_rank, sum_bin, sum_rank
2338 0 : INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: buffer, buffer_in, buffer_out, summary
2339 : INTEGER(int_8), ALLOCATABLE, DIMENSION(:), SAVE :: shm_cost_vector
2340 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: bins_per_rank, rdispl, sort_idx
2341 : INTEGER, ALLOCATABLE, DIMENSION(:), SAVE :: shm_bins_per_rank, shm_displ
2342 :
2343 0 : SELECT CASE (eval_type)
2344 : CASE (hfx_do_eval_energy)
2345 0 : nbins = SIZE(x_data%distribution_energy)
2346 : CASE (hfx_do_eval_forces)
2347 0 : nbins = SIZE(x_data%distribution_forces)
2348 : END SELECT
2349 :
2350 0 : !$OMP MASTER
2351 0 : ALLOCATE (shm_bins_per_rank(n_threads))
2352 0 : ALLOCATE (shm_displ(n_threads + 1))
2353 : !$OMP END MASTER
2354 0 : !$OMP BARRIER
2355 :
2356 0 : shm_bins_per_rank(i_thread + 1) = nbins
2357 0 : !$OMP BARRIER
2358 0 : nbins = 0
2359 0 : DO i = 1, n_threads
2360 0 : nbins = nbins + shm_bins_per_rank(i)
2361 : END DO
2362 0 : my_rank = para_env%mepos
2363 0 : nranks = para_env%num_pe
2364 :
2365 0 : !$OMP BARRIER
2366 0 : !$OMP MASTER
2367 0 : ALLOCATE (bins_per_rank(nranks))
2368 0 : bins_per_rank = 0
2369 :
2370 0 : bins_per_rank(my_rank + 1) = nbins
2371 :
2372 0 : CALL para_env%sum(bins_per_rank)
2373 :
2374 0 : total_bins = 0
2375 0 : DO i = 1, nranks
2376 0 : total_bins = total_bins + bins_per_rank(i)
2377 : END DO
2378 :
2379 0 : ALLOCATE (shm_cost_vector(2*total_bins))
2380 0 : shm_cost_vector = -1_int_8
2381 0 : shm_displ(1) = 1
2382 0 : DO i = 2, n_threads
2383 0 : shm_displ(i) = shm_displ(i - 1) + shm_bins_per_rank(i - 1)
2384 : END DO
2385 0 : shm_displ(n_threads + 1) = nbins + 1
2386 : !$OMP END MASTER
2387 0 : !$OMP BARRIER
2388 0 : j = 0
2389 0 : SELECT CASE (eval_type)
2390 : CASE (hfx_do_eval_energy)
2391 0 : DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2392 0 : j = j + 1
2393 0 : shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_energy(j)%cost
2394 0 : shm_cost_vector(2*i) = INT(x_data%distribution_energy(j)%time_first_scf*10000.0_dp, KIND=int_8)
2395 : END DO
2396 : CASE (hfx_do_eval_forces)
2397 0 : DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2398 0 : j = j + 1
2399 0 : shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_forces(j)%cost
2400 0 : shm_cost_vector(2*i) = INT(x_data%distribution_forces(j)%time_forces*10000.0_dp, KIND=int_8)
2401 : END DO
2402 : END SELECT
2403 0 : !$OMP BARRIER
2404 0 : !$OMP MASTER
2405 : ! ** calculate offsets
2406 0 : ALLOCATE (rdispl(nranks))
2407 0 : bins_per_rank(:) = bins_per_rank(:)*2
2408 0 : rdispl(1) = 0
2409 0 : DO i = 2, nranks
2410 0 : rdispl(i) = rdispl(i - 1) + bins_per_rank(i - 1)
2411 : END DO
2412 :
2413 0 : ALLOCATE (buffer_in(2*nbins))
2414 0 : ALLOCATE (buffer_out(2*total_bins))
2415 :
2416 0 : DO i = 1, nbins
2417 0 : buffer_in(2*(i - 1) + 1) = shm_cost_vector(2*(i - 1) + 1)
2418 0 : buffer_in(2*i) = shm_cost_vector(2*i)
2419 : END DO
2420 :
2421 0 : CALL para_env%gatherv(buffer_in, buffer_out, bins_per_rank, rdispl)
2422 :
2423 0 : IF (iw > 0) THEN
2424 :
2425 0 : ALLOCATE (summary(2*nranks))
2426 0 : summary = 0_int_8
2427 :
2428 0 : WRITE (iw, '( /, 1X, 79("-") )')
2429 0 : WRITE (iw, '( " -", 77X, "-" )')
2430 0 : SELECT CASE (eval_type)
2431 : CASE (hfx_do_eval_energy)
2432 0 : WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - ENERGY '
2433 : CASE (hfx_do_eval_forces)
2434 0 : WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - FORCES '
2435 : END SELECT
2436 0 : WRITE (iw, '( " -", 77X, "-" )')
2437 0 : WRITE (iw, '( 1X, 79("-") )')
2438 :
2439 0 : WRITE (iw, FMT="(T3,A,T15,A,T35,A,T55,A)") "MPI RANK", "BIN #", "EST cost", "Processing time [s]"
2440 0 : WRITE (iw, '( 1X, 79("-"), / )')
2441 0 : k = 0
2442 0 : DO i = 1, nranks
2443 0 : DO j = 1, bins_per_rank(i)/2
2444 0 : k = k + 1
2445 : WRITE (iw, FMT="(T6,I5,T15,I5,T27,I16,T55,F19.8)") &
2446 0 : i - 1, j, buffer_out(2*(k - 1) + 1), REAL(buffer_out(2*k), dp)/10000.0_dp
2447 0 : summary(2*(i - 1) + 1) = summary(2*(i - 1) + 1) + buffer_out(2*(k - 1) + 1)
2448 0 : summary(2*i) = summary(2*i) + buffer_out(2*k)
2449 : END DO
2450 : END DO
2451 :
2452 : !** Summary
2453 : max_bin = 0_int_8
2454 : min_bin = HUGE(min_bin)
2455 : sum_bin = 0_int_8
2456 0 : DO i = 1, total_bins
2457 0 : sum_bin = sum_bin + buffer_out(2*i)
2458 0 : max_bin = MAX(max_bin, buffer_out(2*i))
2459 0 : min_bin = MIN(min_bin, buffer_out(2*i))
2460 : END DO
2461 0 : avg_bin = sum_bin/total_bins
2462 :
2463 0 : max_rank = 0_int_8
2464 0 : min_rank = HUGE(min_rank)
2465 0 : sum_rank = 0_int_8
2466 0 : DO i = 1, nranks
2467 0 : sum_rank = sum_rank + summary(2*i)
2468 0 : max_rank = MAX(max_rank, summary(2*i))
2469 0 : min_rank = MIN(min_rank, summary(2*i))
2470 : END DO
2471 0 : avg_rank = sum_rank/nranks
2472 :
2473 0 : WRITE (iw, FMT='(/,T3,A,/)') "SUMMARY:"
2474 0 : WRITE (iw, FMT="(T3,A,T35,F19.8)") "Max bin", REAL(max_bin, dp)/10000.0_dp
2475 0 : WRITE (iw, FMT="(T3,A,T35,F19.8)") "Min bin", REAL(min_bin, dp)/10000.0_dp
2476 0 : WRITE (iw, FMT="(T3,A,T35,F19.8)") "Sum bin", REAL(sum_bin, dp)/10000.0_dp
2477 0 : WRITE (iw, FMT="(T3,A,T35,F19.8,/)") "Avg bin", REAL(avg_bin, dp)/10000.0_dp
2478 0 : WRITE (iw, FMT="(T3,A,T35,F19.8)") "Max rank", REAL(max_rank, dp)/10000.0_dp
2479 0 : WRITE (iw, FMT="(T3,A,T35,F19.8)") "Min rank", REAL(min_rank, dp)/10000.0_dp
2480 0 : WRITE (iw, FMT="(T3,A,T35,F19.8)") "Sum rank", REAL(sum_rank, dp)/10000.0_dp
2481 0 : WRITE (iw, FMT="(T3,A,T35,F19.8,/)") "Avg rank", REAL(avg_rank, dp)/10000.0_dp
2482 :
2483 0 : ALLOCATE (buffer(nranks))
2484 0 : ALLOCATE (sort_idx(nranks))
2485 :
2486 0 : DO i = 1, nranks
2487 0 : buffer(i) = summary(2*i)
2488 : END DO
2489 :
2490 0 : CALL sort(buffer, nranks, sort_idx)
2491 :
2492 0 : WRITE (iw, FMT="(T3,A,T35,A,T55,A,/)") "MPI RANK", "EST cost", "Processing time [s]"
2493 0 : DO i = nranks, 1, -1
2494 0 : WRITE (iw, FMT="(T6,I5,T27,I16,T55,F19.8)") sort_idx(i) - 1, summary(2*(sort_idx(i) - 1) + 1), REAL(buffer(i), dp)/10000.0_dp
2495 : END DO
2496 :
2497 0 : DEALLOCATE (summary, buffer, sort_idx)
2498 :
2499 : END IF
2500 :
2501 0 : DEALLOCATE (buffer_in, buffer_out, rdispl)
2502 :
2503 0 : CALL para_env%sync()
2504 :
2505 0 : DEALLOCATE (shm_bins_per_rank, shm_displ, shm_cost_vector)
2506 : !$OMP END MASTER
2507 0 : !$OMP BARRIER
2508 :
2509 0 : END SUBROUTINE collect_load_balance_info
2510 :
2511 : #:include 'hfx_get_pmax_val.fypp'
2512 :
2513 : END MODULE hfx_load_balance_methods
|