Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief Routines to calculate RI-RPA and SOS-MP2 gradients
10 : !> \par History
11 : !> 10.2021 created [Frederick Stein]
12 : ! **************************************************************************************************
13 : MODULE rpa_grad
14 : USE ISO_C_BINDING, ONLY: C_NULL_PTR,&
15 : C_PTR,&
16 : c_associated
17 : USE cp_array_utils, ONLY: cp_1d_r_cp_type,&
18 : cp_3d_r_cp_type
19 : USE cp_blacs_env, ONLY: cp_blacs_env_type
20 : USE cp_fm_basic_linalg, ONLY: cp_fm_geadd,&
21 : cp_fm_scale_and_add,&
22 : cp_fm_upper_to_full
23 : USE cp_fm_cholesky, ONLY: cp_fm_cholesky_invert
24 : USE cp_fm_struct, ONLY: cp_fm_struct_create,&
25 : cp_fm_struct_get,&
26 : cp_fm_struct_release,&
27 : cp_fm_struct_type
28 : USE cp_fm_types, ONLY: &
29 : cp_fm_create, cp_fm_get_info, cp_fm_indxg2l, cp_fm_indxg2p, cp_fm_release, cp_fm_set_all, &
30 : cp_fm_to_fm, cp_fm_to_fm_submat_general, cp_fm_type
31 : USE dgemm_counter_types, ONLY: dgemm_counter_start,&
32 : dgemm_counter_stop,&
33 : dgemm_counter_type
34 : USE group_dist_types, ONLY: create_group_dist,&
35 : get_group_dist,&
36 : group_dist_d1_type,&
37 : group_dist_proc,&
38 : maxsize,&
39 : release_group_dist
40 : USE kahan_sum, ONLY: accurate_dot_product,&
41 : accurate_dot_product_2
42 : USE kinds, ONLY: dp,&
43 : int_8
44 : USE libint_2c_3c, ONLY: compare_potential_types
45 : USE local_gemm_api, ONLY: LOCAL_GEMM_PU_GPU,&
46 : local_gemm,&
47 : local_gemm_create,&
48 : local_gemm_destroy,&
49 : local_gemm_set_op_threshold_gpu
50 : USE machine, ONLY: m_flush,&
51 : m_memory
52 : USE mathconstants, ONLY: pi
53 : USE message_passing, ONLY: mp_comm_type,&
54 : mp_para_env_type,&
55 : mp_request_null,&
56 : mp_request_type,&
57 : mp_waitall,&
58 : mp_waitany
59 : USE mp2_laplace, ONLY: calc_fm_mat_s_laplace
60 : USE mp2_ri_grad_util, ONLY: array2fm,&
61 : create_dbcsr_gamma,&
62 : fm2array,&
63 : prepare_redistribution
64 : USE mp2_types, ONLY: mp2_type,&
65 : one_dim_int_array,&
66 : two_dim_int_array,&
67 : two_dim_real_array
68 : USE parallel_gemm_api, ONLY: parallel_gemm
69 : USE qs_environment_types, ONLY: get_qs_env,&
70 : qs_environment_type
71 : USE rpa_util, ONLY: calc_fm_mat_S_rpa,&
72 : remove_scaling_factor_rpa
73 : USE util, ONLY: get_limit
74 : #include "./base/base_uses.f90"
75 :
76 : IMPLICIT NONE
77 :
78 : PRIVATE
79 :
80 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'rpa_grad'
81 :
82 : PUBLIC :: rpa_grad_needed_mem, rpa_grad_type, rpa_grad_create, rpa_grad_finalize, rpa_grad_matrix_operations, rpa_grad_copy_Q
83 :
   ! Per-spin work data for the SOS-MP2 gradient: the list of nearly degenerate
   ! orbital pairs, the communication index buffers used to exchange matrix
   ! columns, and the packed density-matrix contributions.
   TYPE sos_mp2_grad_work_type
      PRIVATE
      ! 2 x npairs list of nearly degenerate orbital index pairs (see create_list_nearly_degen_pairs)
      INTEGER, DIMENSION(:, :), ALLOCATABLE :: pair_list
      ! per-process-column local/global column indices to send/receive (see prepare_comm_Pij/Pab)
      TYPE(one_dim_int_array), DIMENSION(:), ALLOCATABLE :: index2send, index2recv
      ! packed density contributions: diagonal entries plus one entry per degenerate pair
      REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: P
   END TYPE
90 :
   ! Work data for the RPA gradient: a copy of the Q matrix, the redistribution
   ! plan from the BLACS column distribution onto a (virtual x occupied) process
   ! grid, and the local slabs of the occupied/virtual density matrices.
   TYPE rpa_grad_work_type
      TYPE(cp_fm_type) :: fm_mat_Q_copy = cp_fm_type()
      ! per (process column, spin): local column indices to send
      TYPE(one_dim_int_array), DIMENSION(:, :), ALLOCATABLE :: index2send
      ! per (process column, spin): local (a, i) coordinates of received elements
      TYPE(two_dim_int_array), DIMENSION(:, :), ALLOCATABLE :: index2recv
      ! distributions of the occupied/virtual orbitals over the process grid
      TYPE(group_dist_d1_type), DIMENSION(:), ALLOCATABLE :: gd_homo, gd_virtual
      ! grid = (proc_virtual, proc_homo) dimensions; mepos = this rank's position in the grid
      INTEGER, DIMENSION(2) :: grid = -1, mepos = -1
      ! local slabs of the ij and ab density matrices, one per spin
      TYPE(two_dim_real_array), DIMENSION(:), ALLOCATABLE :: P_ij, P_ab
   END TYPE
99 :
   ! Top-level container for RI-RPA / SOS-MP2 gradient intermediates; exactly one
   ! of sos_mp2_work_* (SOS-Laplace-MP2) or rpa_work (RPA) is set up by rpa_grad_create.
   TYPE rpa_grad_type
      PRIVATE
      ! accumulated two-center intermediate Gamma_PQ
      TYPE(cp_fm_type) :: fm_Gamma_PQ = cp_fm_type()
      ! per-spin three-center intermediate Y (distributed like S)
      TYPE(cp_fm_type), DIMENSION(:), ALLOCATABLE :: fm_Y
      ! SOS-MP2 work data for occupied-occupied and virtual-virtual contributions
      TYPE(sos_mp2_grad_work_type), ALLOCATABLE, DIMENSION(:) :: sos_mp2_work_occ, sos_mp2_work_virt
      ! RPA work data (redistribution plan and density slabs)
      TYPE(rpa_grad_work_type) :: rpa_work
   END TYPE
107 :
   ! NOTE(review): presumably the minimum product size (FLOP count ~ 2*128^3) above
   ! which the SpLA/GPU gemm path is used - confirm against the usage sites below
   INTEGER, PARAMETER :: spla_threshold = 128*128*128*2
   ! NOTE(review): presumably the minimum block size for the blocked contraction - confirm
   INTEGER, PARAMETER :: blksize_threshold = 4
110 :
111 : CONTAINS
112 :
! **************************************************************************************************
!> \brief Calculates the necessary minimum memory for the gradient code in MiB
!> \param homo number of occupied orbitals per spin
!> \param virtual number of virtual orbitals per spin
!> \param dimen_RI size of the RI basis
!> \param mem_per_rank accumulated memory required per MPI rank (MiB)
!> \param mem_per_repl accumulated memory required per replication group (MiB)
!> \param do_ri_sos_laplace_mp2 whether SOS-Laplace-MP2 is performed instead of RPA
!> \return ...
! **************************************************************************************************
   PURE SUBROUTINE rpa_grad_needed_mem(homo, virtual, dimen_RI, mem_per_rank, mem_per_repl, do_ri_sos_laplace_mp2)
      INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
      INTEGER, INTENT(IN) :: dimen_RI
      REAL(KIND=dp), INTENT(INOUT) :: mem_per_rank, mem_per_repl
      LOGICAL, INTENT(IN) :: do_ri_sos_laplace_mp2

      REAL(KIND=dp) :: mem_iaK, mem_KL, mem_pab, mem_pij

      ! element counts of the basic objects (as reals to avoid integer overflow)
      mem_iaK = SUM(REAL(virtual, KIND=dp)*homo)*dimen_RI
      mem_pij = SUM(REAL(homo, KIND=dp)**2)
      mem_pab = SUM(REAL(virtual, KIND=dp)**2)
      mem_KL = REAL(dimen_RI, KIND=dp)*dimen_RI

      ! Required matrices iaK
      ! Ytot_iaP = sum_tau Y_iaP(tau)
      ! Y_iaP(tau) = S_iaP(tau)*Q_PQ(tau) (work array)
      ! Required density matrices
      ! Pij (local)
      ! Pab (local)
      ! Additionally with SOS-MP2
      ! Send and receive buffers for degenerate orbital pairs (rough estimate: everything)
      ! Additionally with RPA
      ! copy of work matrix
      ! receive buffer for calculation of density matrix
      ! copy of matrix Q
      mem_per_rank = mem_per_rank + (mem_pij + mem_pab)*8.0_dp/(1024**2)
      mem_per_repl = mem_per_repl + (mem_iaK + 2.0_dp*mem_iaK/SIZE(homo) + mem_KL)*8.0_dp/(1024**2)
      IF (.NOT. do_ri_sos_laplace_mp2) THEN
         ! RPA additionally keeps a work-matrix buffer and a copy of Q.
         ! Accumulate onto mem_per_repl; the previous version erroneously
         ! overwrote it starting from mem_per_rank, discarding the terms above.
         mem_per_repl = mem_per_repl + (mem_iaK/SIZE(homo) + mem_KL)*8.0_dp/(1024**2)
      END IF

   END SUBROUTINE rpa_grad_needed_mem
155 :
156 : ! **************************************************************************************************
157 : !> \brief Creates the arrays of a rpa_grad_type
158 : !> \param rpa_grad ...
159 : !> \param fm_mat_Q ...
160 : !> \param fm_mat_S ...
161 : !> \param homo ...
162 : !> \param virtual ...
163 : !> \param mp2_env ...
164 : !> \param Eigenval ...
165 : !> \param unit_nr ...
166 : !> \param do_ri_sos_laplace_mp2 ...
167 : ! **************************************************************************************************
   SUBROUTINE rpa_grad_create(rpa_grad, fm_mat_Q, fm_mat_S, &
                              homo, virtual, mp2_env, Eigenval, unit_nr, do_ri_sos_laplace_mp2)
      TYPE(rpa_grad_type), INTENT(OUT) :: rpa_grad
      TYPE(cp_fm_type), INTENT(IN) :: fm_mat_Q
      TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_S
      INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
      TYPE(mp2_type), INTENT(INOUT) :: mp2_env
      REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
      INTEGER, INTENT(IN) :: unit_nr
      LOGICAL, INTENT(IN) :: do_ri_sos_laplace_mp2

      CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_grad_create'

      INTEGER :: handle, ispin, nrow_local, nspins

      CALL timeset(routineN, handle)

      ! Gamma_PQ accumulator: same distribution as Q, initialized to zero
      CALL cp_fm_create(rpa_grad%fm_Gamma_PQ, matrix_struct=fm_mat_Q%matrix_struct)
      CALL cp_fm_set_all(rpa_grad%fm_Gamma_PQ, 0.0_dp)

      nspins = SIZE(fm_mat_S)

      ! One Y matrix per spin, distributed like the corresponding S matrix
      ALLOCATE (rpa_grad%fm_Y(nspins))
      DO ispin = 1, nspins
         CALL cp_fm_create(rpa_grad%fm_Y(ispin), fm_mat_S(ispin)%matrix_struct)
      END DO

      ! Set up the method-specific work data (pair lists and communication plans)
      IF (do_ri_sos_laplace_mp2) THEN
         CALL sos_mp2_work_type_create(rpa_grad%sos_mp2_work_occ, rpa_grad%sos_mp2_work_virt, &
                                       unit_nr, Eigenval, homo, virtual, mp2_env%ri_grad%eps_canonical, fm_mat_S)
      ELSE
         CALL rpa_work_type_create(rpa_grad%rpa_work, fm_mat_Q, fm_mat_S, homo, virtual)
      END IF

      ! Set blocksize: a non-positive request means "use all local rows";
      ! in any case the block size is capped at the local row count
      CALL cp_fm_struct_get(fm_mat_S(1)%matrix_struct, nrow_local=nrow_local)
      IF (mp2_env%ri_grad%dot_blksize < 1) mp2_env%ri_grad%dot_blksize = nrow_local
      mp2_env%ri_grad%dot_blksize = MIN(mp2_env%ri_grad%dot_blksize, nrow_local)
      IF (unit_nr > 0) THEN
         WRITE (unit_nr, '(T3,A,T75,I6)') 'GRAD_INFO| Block size for the contraction:', mp2_env%ri_grad%dot_blksize
         CALL m_flush(unit_nr)
      END IF
      ! Keep all ranks in step before returning
      CALL fm_mat_S(1)%matrix_struct%para_env%sync()

      CALL timestop(handle)

   END SUBROUTINE rpa_grad_create
215 :
216 : ! **************************************************************************************************
217 : !> \brief ...
218 : !> \param sos_mp2_work_occ ...
219 : !> \param sos_mp2_work_virt ...
220 : !> \param unit_nr ...
221 : !> \param Eigenval ...
222 : !> \param homo ...
223 : !> \param virtual ...
224 : !> \param eps_degenerate ...
225 : !> \param fm_mat_S ...
226 : ! **************************************************************************************************
   SUBROUTINE sos_mp2_work_type_create(sos_mp2_work_occ, sos_mp2_work_virt, unit_nr, &
                                       Eigenval, homo, virtual, eps_degenerate, fm_mat_S)
      TYPE(sos_mp2_grad_work_type), ALLOCATABLE, &
         DIMENSION(:), INTENT(OUT) :: sos_mp2_work_occ, sos_mp2_work_virt
      INTEGER, INTENT(IN) :: unit_nr
      REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
      INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
      REAL(KIND=dp), INTENT(IN) :: eps_degenerate
      TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_S

      CHARACTER(LEN=*), PARAMETER :: routineN = 'sos_mp2_work_type_create'

      INTEGER :: handle, ispin, nspins

      CALL timeset(routineN, handle)

      nspins = SIZE(fm_mat_S)
      ALLOCATE (sos_mp2_work_occ(nspins), sos_mp2_work_virt(nspins))
      DO ispin = 1, nspins

         ! Occupied-occupied part: find nearly degenerate ij pairs among the occupied
         ! eigenvalues, size the packed density buffer (one entry per diagonal element
         ! plus one per degenerate pair), and set up the column exchange plan
         CALL create_list_nearly_degen_pairs(Eigenval(1:homo(ispin), ispin), &
                                             eps_degenerate, sos_mp2_work_occ(ispin)%pair_list)
         IF (unit_nr > 0) WRITE (unit_nr, "(T3,A,T75,i6)") &
            "MO_INFO| Number of ij pairs below EPS_CANONICAL:", SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)
         ALLOCATE (sos_mp2_work_occ(ispin)%P(homo(ispin) + SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)))
         sos_mp2_work_occ(ispin)%P = 0.0_dp
         CALL prepare_comm_Pij(sos_mp2_work_occ(ispin), virtual(ispin), fm_mat_S(ispin))

         ! Virtual-virtual part: same procedure for the ab pairs among the virtual eigenvalues
         CALL create_list_nearly_degen_pairs(Eigenval(homo(ispin) + 1:, ispin), &
                                             eps_degenerate, sos_mp2_work_virt(ispin)%pair_list)
         IF (unit_nr > 0) WRITE (unit_nr, "(T3,A,T75,i6)") &
            "MO_INFO| Number of ab pairs below EPS_CANONICAL:", SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)
         ALLOCATE (sos_mp2_work_virt(ispin)%P(virtual(ispin) + SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)))
         sos_mp2_work_virt(ispin)%P = 0.0_dp
         CALL prepare_comm_Pab(sos_mp2_work_virt(ispin), virtual(ispin), fm_mat_S(ispin))
      END DO

      CALL timestop(handle)

   END SUBROUTINE
267 :
268 : ! **************************************************************************************************
269 : !> \brief ...
270 : !> \param rpa_work ...
271 : !> \param fm_mat_Q ...
272 : !> \param fm_mat_S ...
273 : !> \param homo ...
274 : !> \param virtual ...
275 : ! **************************************************************************************************
   SUBROUTINE rpa_work_type_create(rpa_work, fm_mat_Q, fm_mat_S, homo, virtual)
      TYPE(rpa_grad_work_type), INTENT(OUT) :: rpa_work
      TYPE(cp_fm_type), INTENT(IN) :: fm_mat_Q
      TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_S
      INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual

      CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_work_type_create'

      INTEGER :: avirt, col_global, col_local, first_p_pos(2), first_p_pos_col, handle, iocc, &
                 ispin, my_a, my_a_end, my_a_size, my_a_start, my_i, my_i_end, my_i_size, my_i_start, &
                 my_pcol, ncol_block, ncol_local, nspins, num_pe_col, proc_homo, proc_homo_send, &
                 proc_recv, proc_send, proc_virtual, proc_virtual_send
      INTEGER, ALLOCATABLE, DIMENSION(:) :: data2recv, data2send
      INTEGER, DIMENSION(:), POINTER :: col_indices

      CALL timeset(routineN, handle)

      ! Q is overwritten later in the calculation, so keep a copy with the same distribution
      CALL cp_fm_create(rpa_work%fm_mat_Q_copy, matrix_struct=fm_mat_Q%matrix_struct)

      CALL fm_mat_S(1)%matrix_struct%context%get(number_of_process_columns=num_pe_col, my_process_column=my_pcol)

      nspins = SIZE(fm_mat_S)

      ALLOCATE (rpa_work%index2send(0:num_pe_col - 1, nspins), &
                rpa_work%index2recv(0:num_pe_col - 1, nspins), &
                rpa_work%gd_homo(nspins), rpa_work%gd_virtual(nspins), &
                data2send(0:num_pe_col - 1), data2recv(0:num_pe_col - 1), &
                rpa_work%P_ij(nspins), rpa_work%P_ab(nspins))

      ! Determine new process grid: factor the process columns into a
      ! (proc_virtual x proc_homo) grid, with proc_homo the largest divisor
      ! of num_pe_col that is <= ceiling(sqrt(num_pe_col))
      proc_homo = MAX(1, CEILING(SQRT(REAL(num_pe_col, KIND=dp))))
      DO WHILE (MOD(num_pe_col, proc_homo) /= 0)
         proc_homo = proc_homo - 1
      END DO
      proc_virtual = num_pe_col/proc_homo

      rpa_work%grid(1) = proc_virtual
      rpa_work%grid(2) = proc_homo

      ! This rank's coordinates in the new grid (virtual index fastest)
      rpa_work%mepos(1) = MOD(my_pcol, proc_virtual)
      rpa_work%mepos(2) = my_pcol/proc_virtual

      DO ispin = 1, nspins

         ! Determine distributions of the orbitals
         CALL create_group_dist(rpa_work%gd_homo(ispin), proc_homo, homo(ispin))
         CALL create_group_dist(rpa_work%gd_virtual(ispin), proc_virtual, virtual(ispin))

         CALL cp_fm_struct_get(fm_mat_S(ispin)%matrix_struct, ncol_local=ncol_local, col_indices=col_indices, &
                               first_p_pos=first_p_pos, ncol_block=ncol_block)
         first_p_pos_col = first_p_pos(2)

         data2send = 0
         ! Count the amount of data2send to each process
         DO col_local = 1, ncol_local
            col_global = col_indices(col_local)

            ! Decompose the composite ia column index into (iocc, avirt)
            iocc = (col_global - 1)/virtual(ispin) + 1
            avirt = col_global - (iocc - 1)*virtual(ispin)

            ! Target position in the (virtual x occupied) grid of this column
            proc_homo_send = group_dist_proc(rpa_work%gd_homo(ispin), iocc)
            proc_virtual_send = group_dist_proc(rpa_work%gd_virtual(ispin), avirt)

            proc_send = proc_homo_send*proc_virtual + proc_virtual_send

            data2send(proc_send) = data2send(proc_send) + 1
         END DO

         DO proc_send = 0, num_pe_col - 1
            ALLOCATE (rpa_work%index2send(proc_send, ispin)%array(data2send(proc_send)))
         END DO

         ! Prepare the indices: second pass fills what the first pass counted,
         ! storing the local column index of every element to be sent
         data2send = 0
         DO col_local = 1, ncol_local
            col_global = col_indices(col_local)

            iocc = (col_global - 1)/virtual(ispin) + 1
            avirt = col_global - (iocc - 1)*virtual(ispin)

            proc_homo_send = group_dist_proc(rpa_work%gd_homo(ispin), iocc)
            proc_virtual_send = group_dist_proc(rpa_work%gd_virtual(ispin), avirt)

            proc_send = proc_homo_send*proc_virtual + proc_virtual_send

            data2send(proc_send) = data2send(proc_send) + 1

            rpa_work%index2send(proc_send, ispin)%array(data2send(proc_send)) = col_local
         END DO

         ! Count the amount of data2recv from each process: this rank owns the
         ! (my_i_start:my_i_end, my_a_start:my_a_end) patch in the new grid
         CALL get_group_dist(rpa_work%gd_homo(ispin), my_pcol/proc_virtual, my_i_start, my_i_end, my_i_size)
         CALL get_group_dist(rpa_work%gd_virtual(ispin), MOD(my_pcol, proc_virtual), my_a_start, my_a_end, my_a_size)

         data2recv = 0
         DO my_i = my_i_start, my_i_end
            DO my_a = my_a_start, my_a_end
               ! Owner of the (i,a) column in the original block-cyclic distribution
               proc_recv = cp_fm_indxg2p((my_i - 1)*virtual(ispin) + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
               data2recv(proc_recv) = data2recv(proc_recv) + 1
            END DO
         END DO

         DO proc_recv = 0, num_pe_col - 1
            ALLOCATE (rpa_work%index2recv(proc_recv, ispin)%array(2, data2recv(proc_recv)))
         END DO

         data2recv = 0
         DO my_i = my_i_start, my_i_end
            DO my_a = my_a_start, my_a_end
               proc_recv = cp_fm_indxg2p((my_i - 1)*virtual(ispin) + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
               data2recv(proc_recv) = data2recv(proc_recv) + 1

               ! Store the local (a, i) coordinates of each element to be received
               rpa_work%index2recv(proc_recv, ispin)%array(2, data2recv(proc_recv)) = my_i - my_i_start + 1
               rpa_work%index2recv(proc_recv, ispin)%array(1, data2recv(proc_recv)) = my_a - my_a_start + 1
            END DO
         END DO

         ! Local slabs of the ij and ab density matrices, zero-initialized
         ALLOCATE (rpa_work%P_ij(ispin)%array(my_i_size, homo(ispin)), &
                   rpa_work%P_ab(ispin)%array(my_a_size, virtual(ispin)))
         rpa_work%P_ij(ispin)%array(:, :) = 0.0_dp
         rpa_work%P_ab(ispin)%array(:, :) = 0.0_dp

      END DO

      DEALLOCATE (data2send, data2recv)

      CALL timestop(handle)

   END SUBROUTINE
405 :
406 : ! **************************************************************************************************
407 : !> \brief ...
408 : !> \param Eigenval ...
409 : !> \param eps_degen ...
410 : !> \param pair_list ...
411 : ! **************************************************************************************************
412 48 : SUBROUTINE create_list_nearly_degen_pairs(Eigenval, eps_degen, pair_list)
413 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
414 : REAL(KIND=dp), INTENT(IN) :: eps_degen
415 : INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: pair_list
416 :
417 : INTEGER :: my_i, my_j, num_orbitals, num_pairs, &
418 : pair_counter
419 :
420 48 : num_orbitals = SIZE(Eigenval)
421 :
422 : ! Determine number of nearly degenerate orbital pairs
423 : ! Trivial cases: diagonal elements
424 48 : num_pairs = 0
425 640 : DO my_i = 1, num_orbitals
426 11576 : DO my_j = 1, num_orbitals
427 10936 : IF (my_i == my_j) CYCLE
428 10936 : IF (ABS(Eigenval(my_i) - Eigenval(my_j)) < eps_degen) num_pairs = num_pairs + 1
429 : END DO
430 : END DO
431 104 : ALLOCATE (pair_list(2, num_pairs))
432 :
433 : ! Print the required pairs
434 48 : pair_counter = 1
435 640 : DO my_i = 1, num_orbitals
436 11576 : DO my_j = 1, num_orbitals
437 10936 : IF (my_i == my_j) CYCLE
438 10936 : IF (ABS(Eigenval(my_i) - Eigenval(my_j)) < eps_degen) THEN
439 660 : pair_list(1, pair_counter) = my_i
440 660 : pair_list(2, pair_counter) = my_j
441 660 : pair_counter = pair_counter + 1
442 : END IF
443 : END DO
444 : END DO
445 :
446 48 : END SUBROUTINE create_list_nearly_degen_pairs
447 :
448 : ! **************************************************************************************************
449 : !> \brief ...
450 : !> \param sos_mp2_work ...
451 : !> \param virtual ...
452 : !> \param fm_mat_S ...
453 : ! **************************************************************************************************
   SUBROUTINE prepare_comm_Pij(sos_mp2_work, virtual, fm_mat_S)
      TYPE(sos_mp2_grad_work_type), INTENT(INOUT) :: sos_mp2_work
      INTEGER, INTENT(IN) :: virtual
      TYPE(cp_fm_type), INTENT(IN) :: fm_mat_S

      CHARACTER(LEN=*), PARAMETER :: routineN = 'prepare_comm_Pij'

      INTEGER :: avirt, col_global, col_local, counter, first_p_pos(2), first_p_pos_col, handle, &
                 ij_counter, iocc, my_i, my_j, my_pcol, my_prow, ncol_block, ncol_local, nrow_local, &
                 num_ij_pairs, num_pe_col, pcol, pcol_recv, pcol_send, proc_shift, tag
      INTEGER, ALLOCATABLE, DIMENSION(:) :: data2recv, data2send
      INTEGER, DIMENSION(:), POINTER :: col_indices, ncol_locals
      INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
      TYPE(cp_blacs_env_type), POINTER :: context
      TYPE(mp_comm_type) :: comm_exchange
      TYPE(mp_para_env_type), POINTER :: para_env

      CALL timeset(routineN, handle)

      ! message tag for the point-to-point exchanges below
      tag = 44

      CALL fm_mat_S%matrix_struct%context%get(number_of_process_columns=num_pe_col)
      ALLOCATE (sos_mp2_work%index2send(0:num_pe_col - 1), &
                sos_mp2_work%index2recv(0:num_pe_col - 1))

      ALLOCATE (data2send(0:num_pe_col - 1))
      ALLOCATE (data2recv(0:num_pe_col - 1))

      CALL cp_fm_struct_get(fm_mat_S%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
                            ncol_local=ncol_local, col_indices=col_indices, &
                            context=context, ncol_block=ncol_block, first_p_pos=first_p_pos, nrow_local=nrow_local)
      CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
                       blacs2mpi=blacs2mpi)
      first_p_pos_col = first_p_pos(2)

      num_ij_pairs = SIZE(sos_mp2_work%pair_list, 2)

      ! Nothing to exchange if there are no nearly degenerate ij pairs
      IF (num_ij_pairs > 0) THEN

         ! The exchange happens only within this process row
         CALL comm_exchange%from_split(para_env, my_prow)

         data2send = 0
         data2recv = 0

         ! First pass: count the local columns to be sent to each process column
         DO proc_shift = 0, num_pe_col - 1
            pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)

            counter = 0
            DO col_local = 1, ncol_local
               col_global = col_indices(col_local)

               ! Decompose the composite ia column index into (iocc, avirt).
               ! NOTE(review): MAX binds before the division here, i.e.
               ! MAX(1, col_global - 1)/virtual + 1, unlike the plain
               ! (col_global - 1)/virtual + 1 used in rpa_work_type_create;
               ! the two differ only for col_global == 1 with virtual == 1 - confirm intended
               iocc = MAX(1, col_global - 1)/virtual + 1
               avirt = col_global - (iocc - 1)*virtual

               DO ij_counter = 1, num_ij_pairs

                  my_i = sos_mp2_work%pair_list(1, ij_counter)
                  my_j = sos_mp2_work%pair_list(2, ij_counter)

                  ! Column (j,a) is needed for pair (i,j) on whichever process
                  ! column owns the partner column (i,a)
                  IF (iocc /= my_j) CYCLE
                  pcol = cp_fm_indxg2p((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
                  IF (pcol /= pcol_send) CYCLE

                  counter = counter + 1

                  ! Each column is sent at most once per target process
                  EXIT

               END DO
            END DO
            data2send(pcol_send) = counter
         END DO

         ! Let every process know how much it will receive
         CALL comm_exchange%alltoall(data2send, data2recv, 1)

         DO proc_shift = 0, num_pe_col - 1
            pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
            pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)

            ! Collect indices and exchange
            ALLOCATE (sos_mp2_work%index2send(pcol_send)%array(data2send(pcol_send)))

            ! Second pass: record the global column index of every element counted above
            counter = 0
            DO col_local = 1, ncol_local
               col_global = col_indices(col_local)

               iocc = MAX(1, col_global - 1)/virtual + 1
               avirt = col_global - (iocc - 1)*virtual

               DO ij_counter = 1, num_ij_pairs

                  my_i = sos_mp2_work%pair_list(1, ij_counter)
                  my_j = sos_mp2_work%pair_list(2, ij_counter)

                  IF (iocc /= my_j) CYCLE
                  pcol = cp_fm_indxg2p((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
                  IF (pcol /= pcol_send) CYCLE

                  counter = counter + 1

                  sos_mp2_work%index2send(pcol_send)%array(counter) = col_global

                  EXIT

               END DO
            END DO

            ALLOCATE (sos_mp2_work%index2recv(pcol_recv)%array(data2recv(pcol_recv)))
            ! Exchange the global column indices with the partner processes
            CALL para_env%sendrecv(sos_mp2_work%index2send(pcol_send)%array, blacs2mpi(my_prow, pcol_send), &
                                   sos_mp2_work%index2recv(pcol_recv)%array, blacs2mpi(my_prow, pcol_recv), tag)

            ! Convert the global column indices to local indices as we always work with them
            DO counter = 1, data2send(pcol_send)
               sos_mp2_work%index2send(pcol_send)%array(counter) = &
                  cp_fm_indxg2l(sos_mp2_work%index2send(pcol_send)%array(counter), &
                                ncol_block, 0, first_p_pos_col, num_pe_col)
            END DO
         END DO

         CALL comm_exchange%free()
      END IF

      DEALLOCATE (data2send, data2recv)

      CALL timestop(handle)

   END SUBROUTINE
581 :
582 : ! **************************************************************************************************
583 : !> \brief ...
584 : !> \param sos_mp2_work ...
585 : !> \param virtual ...
586 : !> \param fm_mat_S ...
587 : ! **************************************************************************************************
   SUBROUTINE prepare_comm_Pab(sos_mp2_work, virtual, fm_mat_S)
      TYPE(sos_mp2_grad_work_type), INTENT(INOUT) :: sos_mp2_work
      INTEGER, INTENT(IN) :: virtual
      TYPE(cp_fm_type), INTENT(IN) :: fm_mat_S

      CHARACTER(LEN=*), PARAMETER :: routineN = 'prepare_comm_Pab'

      INTEGER :: ab_counter, avirt, col_global, col_local, counter, first_p_pos(2), &
                 first_p_pos_col, handle, iocc, my_a, my_b, my_pcol, my_prow, ncol_block, ncol_local, &
                 nrow_local, num_ab_pairs, num_pe_col, pcol, pcol_recv, pcol_send, proc_shift, tag
      INTEGER, ALLOCATABLE, DIMENSION(:) :: data2recv, data2send
      INTEGER, DIMENSION(:), POINTER :: col_indices, ncol_locals
      INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
      TYPE(cp_blacs_env_type), POINTER :: context
      TYPE(mp_comm_type) :: comm_exchange
      TYPE(mp_para_env_type), POINTER :: para_env

      CALL timeset(routineN, handle)

      ! message tag for the point-to-point exchanges below
      tag = 44

      CALL fm_mat_S%matrix_struct%context%get(number_of_process_columns=num_pe_col)
      ALLOCATE (sos_mp2_work%index2send(0:num_pe_col - 1), &
                sos_mp2_work%index2recv(0:num_pe_col - 1))

      ! Nothing to exchange if there are no nearly degenerate ab pairs
      num_ab_pairs = SIZE(sos_mp2_work%pair_list, 2)
      IF (num_ab_pairs > 0) THEN

         CALL cp_fm_struct_get(fm_mat_S%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
                               ncol_local=ncol_local, col_indices=col_indices, &
                               context=context, ncol_block=ncol_block, first_p_pos=first_p_pos, nrow_local=nrow_local)
         CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
                          blacs2mpi=blacs2mpi)
         first_p_pos_col = first_p_pos(2)

         ! The exchange happens only within this process row
         CALL comm_exchange%from_split(para_env, my_prow)

         ALLOCATE (data2send(0:num_pe_col - 1))
         ALLOCATE (data2recv(0:num_pe_col - 1))

         data2send = 0
         data2recv = 0
         ! First pass: count the local columns to be sent to each process column
         DO proc_shift = 0, num_pe_col - 1
            pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
            pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)

            counter = 0
            DO col_local = 1, ncol_local
               col_global = col_indices(col_local)

               ! Decompose the composite ia column index into (iocc, avirt).
               ! NOTE(review): MAX binds before the division (same pattern as in
               ! prepare_comm_Pij); differs from (col_global - 1)/virtual + 1 only
               ! for col_global == 1 with virtual == 1 - confirm intended
               iocc = MAX(1, col_global - 1)/virtual + 1
               avirt = col_global - (iocc - 1)*virtual

               DO ab_counter = 1, num_ab_pairs

                  my_a = sos_mp2_work%pair_list(1, ab_counter)
                  my_b = sos_mp2_work%pair_list(2, ab_counter)

                  ! Column (i,b) is needed for pair (a,b) on whichever process
                  ! column owns the partner column (i,a)
                  IF (avirt /= my_b) CYCLE
                  pcol = cp_fm_indxg2p((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
                  IF (pcol /= pcol_send) CYCLE

                  counter = counter + 1

                  ! Each column is sent at most once per target process
                  EXIT

               END DO
            END DO
            data2send(pcol_send) = counter
         END DO

         ! Let every process know how much it will receive
         CALL comm_exchange%alltoall(data2send, data2recv, 1)

         DO proc_shift = 0, num_pe_col - 1
            pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
            pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)

            ! Collect indices and exchange
            ALLOCATE (sos_mp2_work%index2send(pcol_send)%array(data2send(pcol_send)))

            ! Second pass: record the global column index of every element counted above
            counter = 0
            DO col_local = 1, ncol_local
               col_global = col_indices(col_local)

               iocc = MAX(1, col_global - 1)/virtual + 1
               avirt = col_global - (iocc - 1)*virtual

               DO ab_counter = 1, num_ab_pairs

                  my_a = sos_mp2_work%pair_list(1, ab_counter)
                  my_b = sos_mp2_work%pair_list(2, ab_counter)

                  IF (avirt /= my_b) CYCLE
                  pcol = cp_fm_indxg2p((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
                  IF (pcol /= pcol_send) CYCLE

                  counter = counter + 1

                  sos_mp2_work%index2send(pcol_send)%array(counter) = col_global

                  EXIT

               END DO
            END DO

            ALLOCATE (sos_mp2_work%index2recv(pcol_recv)%array(data2recv(pcol_recv)))
            ! Exchange the global column indices with the partner processes
            CALL para_env%sendrecv(sos_mp2_work%index2send(pcol_send)%array, blacs2mpi(my_prow, pcol_send), &
                                   sos_mp2_work%index2recv(pcol_recv)%array, blacs2mpi(my_prow, pcol_recv), tag)

            ! Convert the global column indices to local indices as we always work with them
            DO counter = 1, data2send(pcol_send)
               sos_mp2_work%index2send(pcol_send)%array(counter) = &
                  cp_fm_indxg2l(sos_mp2_work%index2send(pcol_send)%array(counter), &
                                ncol_block, 0, first_p_pos_col, num_pe_col)
            END DO
         END DO

         CALL comm_exchange%free()
         DEALLOCATE (data2send, data2recv)

      END IF

      CALL timestop(handle)

   END SUBROUTINE
714 :
715 : ! **************************************************************************************************
716 : !> \brief ...
717 : !> \param fm_mat_Q ...
718 : !> \param rpa_grad ...
719 : ! **************************************************************************************************
   SUBROUTINE rpa_grad_copy_Q(fm_mat_Q, rpa_grad)
      TYPE(cp_fm_type), INTENT(IN) :: fm_mat_Q
      TYPE(rpa_grad_type), INTENT(INOUT) :: rpa_grad

      ! Keep a copy of Q before it is overwritten by the matrix operations
      ! (see the Cholesky inversion in rpa_grad_matrix_operations)
      CALL cp_fm_to_fm(fm_mat_Q, rpa_grad%rpa_work%fm_mat_Q_copy)

   END SUBROUTINE
727 :
728 : ! **************************************************************************************************
729 : !> \brief ...
730 : !> \param mp2_env ...
731 : !> \param rpa_grad ...
732 : !> \param do_ri_sos_laplace_mp2 ...
733 : !> \param fm_mat_Q ...
734 : !> \param fm_mat_Q_gemm ...
735 : !> \param dgemm_counter ...
736 : !> \param fm_mat_S ...
737 : !> \param omega ...
738 : !> \param homo ...
739 : !> \param virtual ...
740 : !> \param Eigenval ...
741 : !> \param weight ...
742 : !> \param unit_nr ...
743 : ! **************************************************************************************************
   ! Performs the per-frequency/per-Laplace-point matrix operations of the RPA / SOS-MP2
   ! gradient: accumulates the two-center intermediate Gamma_PQ and the three-center
   ! intermediate Y, and (RPA only) collects the occ-occ and virt-virt density blocks.
   ! NOTE: fm_mat_Q(1) is modified in place in the RPA branch (inverted, symmetrized,
   ! identity subtracted); fm_mat_S is rescaled and later restored. Statement order matters.
   SUBROUTINE rpa_grad_matrix_operations(mp2_env, rpa_grad, do_ri_sos_laplace_mp2, fm_mat_Q, fm_mat_Q_gemm, &
                                         dgemm_counter, fm_mat_S, omega, homo, virtual, Eigenval, weight, unit_nr)
      TYPE(mp2_type), INTENT(INOUT) :: mp2_env
      TYPE(rpa_grad_type), INTENT(INOUT) :: rpa_grad
      LOGICAL, INTENT(IN) :: do_ri_sos_laplace_mp2
      TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_Q, fm_mat_Q_gemm
      TYPE(dgemm_counter_type), INTENT(INOUT) :: dgemm_counter
      TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_S
      REAL(KIND=dp), INTENT(IN) :: omega
      INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
      REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
      REAL(KIND=dp), INTENT(IN) :: weight
      INTEGER, INTENT(IN) :: unit_nr

      CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_grad_matrix_operations'

      INTEGER :: col_global, col_local, dimen_ia, &
                 dimen_RI, handle, handle2, ispin, &
                 jspin, ncol_local, nrow_local, nspins, &
                 row_local
      INTEGER, DIMENSION(:), POINTER :: col_indices, row_indices
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
         TARGET :: mat_S_3D, mat_work_iaP_3D
      TYPE(cp_fm_type) :: fm_work_iaP, fm_work_PQ

      CALL timeset(routineN, handle)

      nspins = SIZE(fm_mat_Q)

      ! All Q matrices share the same (square, dimen_RI x dimen_RI) distribution
      CALL cp_fm_get_info(fm_mat_Q(1), nrow_global=dimen_RI, nrow_local=nrow_local, ncol_local=ncol_local, &
                          col_indices=col_indices, row_indices=row_indices)

      IF (.NOT. do_ri_sos_laplace_mp2) THEN
         CALL cp_fm_create(fm_work_PQ, fm_mat_Q(1)%matrix_struct)

         ! calculate [1+Q(iw')]^-1
         CALL cp_fm_cholesky_invert(fm_mat_Q(1))
         ! symmetrize the result, fm_work_PQ is only a work matrix
         CALL cp_fm_upper_to_full(fm_mat_Q(1), fm_work_PQ)

         CALL cp_fm_release(fm_work_PQ)

         ! Subtract the identity in place: fm_mat_Q(1) := [1+Q]^-1 - 1.
         ! Only locally owned diagonal elements exist on this rank; EXIT once the
         ! (unique) matching row of this column has been found.
         DO col_local = 1, ncol_local
            col_global = col_indices(col_local)
            DO row_local = 1, nrow_local
               IF (col_global == row_indices(row_local)) THEN
                  fm_mat_Q(1)%local_data(row_local, col_local) = fm_mat_Q(1)%local_data(row_local, col_local) - 1.0_dp
                  EXIT
               END IF
            END DO
         END DO

         ! Gamma_PQ += weight * Q_copy * ([1+Q]^-1 - 1)
         CALL timeset(routineN//"_PQ", handle2)
         CALL dgemm_counter_start(dgemm_counter)
         CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=weight, &
                            matrix_a=rpa_grad%rpa_work%fm_mat_Q_copy, matrix_b=fm_mat_Q(1), beta=1.0_dp, &
                            matrix_c=rpa_grad%fm_Gamma_PQ)
         CALL dgemm_counter_stop(dgemm_counter, dimen_RI, dimen_RI, dimen_RI)
         CALL timestop(handle2)

         ! Redistribute to the gemm-friendly layout used in the S contraction below
         CALL cp_fm_to_fm_submat_general(fm_mat_Q(1), fm_mat_Q_gemm(1), dimen_RI, dimen_RI, 1, 1, 1, 1, &
                                         fm_mat_Q_gemm(1)%matrix_struct%context)
      END IF

      DO ispin = 1, nspins
         IF (do_ri_sos_laplace_mp2) THEN
            ! The spin of the other Q matrix is always the other spin
            jspin = nspins - ispin + 1
         ELSE
            ! or the first matrix in the case of RPA
            jspin = 1
         END IF

         IF (do_ri_sos_laplace_mp2) THEN
            ! SOS-MP2: Gamma_PQ += weight * Q(ispin) * Q(jspin) (opposite-spin pairing)
            CALL timeset(routineN//"_PQ", handle2)
            CALL dgemm_counter_start(dgemm_counter)
            CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=weight, &
                               matrix_a=fm_mat_Q(ispin), matrix_b=fm_mat_Q(jspin), beta=1.0_dp, &
                               matrix_c=rpa_grad%fm_Gamma_PQ)
            CALL dgemm_counter_stop(dgemm_counter, dimen_RI, dimen_RI, dimen_RI)
            CALL timestop(handle2)

            CALL cp_fm_to_fm_submat_general(fm_mat_Q(jspin), fm_mat_Q_gemm(jspin), dimen_RI, dimen_RI, 1, 1, 1, 1, &
                                            fm_mat_Q_gemm(jspin)%matrix_struct%context)
         ELSE
            ! RPA: apply the frequency-dependent scaling to S (undone further below
            ! by remove_scaling_factor_rpa)
            CALL calc_fm_mat_S_rpa(fm_mat_S(ispin), .TRUE., virtual(ispin), Eigenval(:, ispin), &
                                   homo(ispin), omega, 0.0_dp)
         END IF

         ! Contract Q with S: work_iaP = Q_gemm(jspin) * S(ispin)
         CALL timeset(routineN//"_contr_S", handle2)
         CALL cp_fm_create(fm_work_iaP, rpa_grad%fm_Y(ispin)%matrix_struct)

         CALL cp_fm_get_info(fm_mat_S(ispin), ncol_global=dimen_ia)

         CALL dgemm_counter_start(dgemm_counter)
         CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_ia, k=dimen_RI, alpha=1.0_dp, &
                            matrix_a=fm_mat_Q_gemm(jspin), matrix_b=fm_mat_S(ispin), beta=0.0_dp, &
                            matrix_c=fm_work_iaP)
         CALL dgemm_counter_stop(dgemm_counter, dimen_ia, dimen_RI, dimen_RI)
         CALL timestop(handle2)

         IF (do_ri_sos_laplace_mp2) THEN
            ! Density matrix contributions must be collected BEFORE the Laplace
            ! rescaling of fm_work_iaP just below
            CALL calc_P_sos_mp2(homo(ispin), fm_mat_S(ispin), fm_work_iaP, &
                                rpa_grad%sos_mp2_work_occ(ispin), rpa_grad%sos_mp2_work_virt(ispin), &
                                omega, weight, virtual(ispin), Eigenval(:, ispin), mp2_env%ri_grad%dot_blksize)

            CALL calc_fm_mat_S_laplace(fm_work_iaP, homo(ispin), virtual(ispin), Eigenval(:, ispin), omega)

            CALL cp_fm_scale_and_add(1.0_dp, rpa_grad%fm_Y(ispin), -weight, fm_work_iaP)

            CALL cp_fm_release(fm_work_iaP)
         ELSE
            ! To save memory, we add it now
            CALL cp_fm_scale_and_add(1.0_dp, rpa_grad%fm_Y(ispin), -weight, fm_work_iaP)

            ! Redistribute both matrices and deallocate fm_work_iaP
            CALL redistribute_fm_mat_S(rpa_grad%rpa_work%index2send(:, ispin), rpa_grad%rpa_work%index2recv(:, ispin), &
                                       fm_work_iaP, mat_work_iaP_3D, &
                                       rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
                                       rpa_grad%rpa_work%mepos)
            CALL cp_fm_release(fm_work_iaP)

            CALL redistribute_fm_mat_S(rpa_grad%rpa_work%index2send(:, ispin), rpa_grad%rpa_work%index2recv(:, ispin), &
                                       fm_mat_S(ispin), mat_S_3D, &
                                       rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
                                       rpa_grad%rpa_work%mepos)

            ! Now collect the density matrix
            CALL calc_P_rpa(mat_S_3D, mat_work_iaP_3D, rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
                            rpa_grad%rpa_work%grid, rpa_grad%rpa_work%mepos, &
                            fm_mat_S(ispin)%matrix_struct, &
                            rpa_grad%rpa_work%P_ij(ispin)%array, rpa_grad%rpa_work%P_ab(ispin)%array, &
                            weight, omega, Eigenval(:, ispin), homo(ispin), unit_nr, mp2_env)

            DEALLOCATE (mat_work_iaP_3D, mat_S_3D)

            ! Restore fm_mat_S for the next quadrature point
            CALL remove_scaling_factor_rpa(fm_mat_S(ispin), virtual(ispin), Eigenval(:, ispin), homo(ispin), omega)

         END IF

      END DO

      CALL timestop(handle)

   END SUBROUTINE
889 :
890 : ! **************************************************************************************************
891 : !> \brief ...
892 : !> \param homo ...
893 : !> \param fm_mat_S ...
894 : !> \param fm_work_iaP ...
895 : !> \param sos_mp2_work_occ ...
896 : !> \param sos_mp2_work_virt ...
897 : !> \param omega ...
898 : !> \param weight ...
899 : !> \param virtual ...
900 : !> \param Eigenval ...
901 : !> \param dot_blksize ...
902 : ! **************************************************************************************************
903 340 : SUBROUTINE calc_P_sos_mp2(homo, fm_mat_S, fm_work_iaP, sos_mp2_work_occ, sos_mp2_work_virt, &
904 68 : omega, weight, virtual, Eigenval, dot_blksize)
905 : INTEGER, INTENT(IN) :: homo
906 : TYPE(cp_fm_type), INTENT(IN) :: fm_mat_S, fm_work_iaP
907 : TYPE(sos_mp2_grad_work_type), INTENT(INOUT) :: sos_mp2_work_occ, sos_mp2_work_virt
908 : REAL(KIND=dp), INTENT(IN) :: omega, weight
909 : INTEGER, INTENT(IN) :: virtual
910 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
911 : INTEGER, INTENT(IN) :: dot_blksize
912 :
913 : CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_P_sos_mp2'
914 :
915 : INTEGER :: avirt, col_global, col_local, handle, &
916 : handle2, iocc, my_a, my_i, ncol_local, &
917 : nrow_local, num_ab_pairs, num_ij_pairs
918 68 : INTEGER, DIMENSION(:), POINTER :: col_indices
919 : REAL(KIND=dp) :: ddot, trace
920 :
921 68 : CALL timeset(routineN, handle)
922 :
923 68 : CALL cp_fm_get_info(fm_mat_S, col_indices=col_indices, ncol_local=ncol_local, nrow_local=nrow_local)
924 :
925 68 : CALL timeset(routineN//"_Pij_diag", handle2)
926 332 : DO my_i = 1, homo
927 : ! Collect the contributions of the matrix elements
928 :
929 264 : trace = 0.0_dp
930 :
931 20944 : DO col_local = 1, ncol_local
932 20680 : col_global = col_indices(col_local)
933 :
934 20680 : iocc = MAX(1, col_global - 1)/virtual + 1
935 20680 : avirt = col_global - (iocc - 1)*virtual
936 :
937 20680 : IF (iocc == my_i) trace = trace + &
938 5584 : ddot(nrow_local, fm_mat_S%local_data(:, col_local), 1, fm_work_iaP%local_data(:, col_local), 1)
939 : END DO
940 :
941 332 : sos_mp2_work_occ%P(my_i) = sos_mp2_work_occ%P(my_i) - trace*omega*weight
942 :
943 : END DO
944 68 : CALL timestop(handle2)
945 :
946 68 : CALL timeset(routineN//"_Pab_diag", handle2)
947 1448 : DO my_a = 1, virtual
948 : ! Collect the contributions of the matrix elements
949 :
950 1380 : trace = 0.0_dp
951 :
952 109900 : DO col_local = 1, ncol_local
953 108520 : col_global = col_indices(col_local)
954 :
955 108520 : iocc = MAX(1, col_global - 1)/virtual + 1
956 108520 : avirt = col_global - (iocc - 1)*virtual
957 :
958 108520 : IF (avirt == my_a) trace = trace + &
959 6700 : ddot(nrow_local, fm_mat_S%local_data(:, col_local), 1, fm_work_iaP%local_data(:, col_local), 1)
960 : END DO
961 :
962 1448 : sos_mp2_work_virt%P(my_a) = sos_mp2_work_virt%P(my_a) + trace*omega*weight
963 :
964 : END DO
965 68 : CALL timestop(handle2)
966 :
967 : ! Loop over list and carry out operations
968 68 : num_ij_pairs = SIZE(sos_mp2_work_occ%pair_list, 2)
969 68 : num_ab_pairs = SIZE(sos_mp2_work_virt%pair_list, 2)
970 68 : IF (num_ij_pairs > 0) THEN
971 : CALL calc_Pij_degen(fm_work_iaP, fm_mat_S, sos_mp2_work_occ%pair_list, &
972 : virtual, sos_mp2_work_occ%P(homo + 1:), Eigenval(:homo), omega, weight, &
973 8 : sos_mp2_work_occ%index2send, sos_mp2_work_occ%index2recv, dot_blksize)
974 : END IF
975 68 : IF (num_ab_pairs > 0) THEN
976 : CALL calc_Pab_degen(fm_work_iaP, fm_mat_S, sos_mp2_work_virt%pair_list, &
977 : virtual, sos_mp2_work_virt%P(virtual + 1:), Eigenval(homo + 1:), omega, weight, &
978 8 : sos_mp2_work_virt%index2send, sos_mp2_work_virt%index2recv, dot_blksize)
979 : END IF
980 :
981 68 : CALL timestop(handle)
982 :
983 68 : END SUBROUTINE
984 :
985 : ! **************************************************************************************************
986 : !> \brief ...
987 : !> \param mat_S_1D ...
988 : !> \param mat_work_iaP_3D ...
989 : !> \param gd_homo ...
990 : !> \param gd_virtual ...
991 : !> \param grid ...
992 : !> \param mepos ...
993 : !> \param fm_struct_S ...
994 : !> \param P_ij ...
995 : !> \param P_ab ...
996 : !> \param weight ...
997 : !> \param omega ...
998 : !> \param Eigenval ...
999 : !> \param homo ...
1000 : !> \param unit_nr ...
1001 : !> \param mp2_env ...
1002 : ! **************************************************************************************************
1003 56 : SUBROUTINE calc_P_rpa(mat_S_1D, mat_work_iaP_3D, gd_homo, gd_virtual, grid, mepos, &
1004 56 : fm_struct_S, P_ij, P_ab, weight, omega, Eigenval, homo, unit_nr, mp2_env)
1005 : REAL(KIND=dp), DIMENSION(*), INTENT(INOUT), TARGET :: mat_S_1D
1006 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT) :: mat_work_iaP_3D
1007 : TYPE(group_dist_d1_type), INTENT(IN) :: gd_homo, gd_virtual
1008 : INTEGER, DIMENSION(2), INTENT(IN) :: grid, mepos
1009 : TYPE(cp_fm_struct_type), INTENT(IN), POINTER :: fm_struct_S
1010 : REAL(KIND=dp), DIMENSION(:, :) :: P_ij, P_ab
1011 : REAL(KIND=dp), INTENT(IN) :: weight, omega
1012 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
1013 : INTEGER, INTENT(IN) :: homo, unit_nr
1014 : TYPE(mp2_type), INTENT(INOUT) :: mp2_env
1015 :
1016 : CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_P_rpa'
1017 :
1018 : INTEGER :: completed, handle, handle2, my_a_end, my_a_size, my_a_start, my_i_end, my_i_size, &
1019 : my_i_start, my_P_size, my_prow, number_of_parallel_channels, proc_a_recv, proc_a_send, &
1020 : proc_i_recv, proc_i_send, proc_recv, proc_send, proc_shift, recv_a_end, recv_a_size, &
1021 : recv_a_start, recv_i_end, recv_i_size, recv_i_start, tag
1022 : INTEGER(KIND=int_8) :: mem, number_of_elements_per_blk
1023 56 : INTEGER, ALLOCATABLE, DIMENSION(:) :: procs_recv
1024 56 : INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
1025 : REAL(KIND=dp) :: mem_per_block, mem_real
1026 56 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET :: buffer_compens_1D
1027 56 : REAL(KIND=dp), DIMENSION(:, :, :), POINTER :: mat_S_3D
1028 56 : TYPE(cp_1d_r_cp_type), ALLOCATABLE, DIMENSION(:) :: buffer_1D
1029 56 : TYPE(cp_3d_r_cp_type), ALLOCATABLE, DIMENSION(:) :: buffer_3D
1030 : TYPE(mp_para_env_type), POINTER :: para_env
1031 56 : TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: recv_requests, send_requests
1032 :
1033 56 : CALL timeset(routineN, handle)
1034 :
1035 : ! We allocate it at every step to reduce potential memory conflicts with COSMA
1036 56 : IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
1037 48 : IF (.NOT. c_associated(mp2_env%local_gemm_ctx)) THEN
1038 48 : CALL local_gemm_create(mp2_env%local_gemm_ctx, LOCAL_GEMM_PU_GPU)
1039 48 : CALL local_gemm_set_op_threshold_gpu(mp2_env%local_gemm_ctx, spla_threshold)
1040 : END IF
1041 : END IF
1042 :
1043 56 : tag = 47
1044 :
1045 56 : my_P_size = SIZE(mat_work_iaP_3D, 1)
1046 :
1047 56 : CALL cp_fm_struct_get(fm_struct_S, para_env=para_env)
1048 56 : CALL fm_struct_S%context%get(my_process_row=my_prow, blacs2mpi=blacs2mpi, para_env=para_env)
1049 :
1050 56 : CALL get_group_dist(gd_virtual, mepos(1), my_a_start, my_a_end, my_a_size)
1051 56 : CALL get_group_dist(gd_homo, mepos(2), my_i_start, my_i_end, my_i_size)
1052 :
1053 : ! We have to remap the indices because mp_sendrecv requires a 3D array (because of mat_work_iaP_3D)
1054 : ! and dgemm requires 2D arrays
1055 : ! Fortran 2008 does allow pointer remapping independently of the ranks but GCC 7 does not properly support it
1056 56 : mat_S_3D(1:my_P_size, 1:my_a_size, 1:my_i_size) => mat_S_1D(1:INT(my_P_size, int_8)*my_a_size*my_i_size)
1057 :
1058 : number_of_elements_per_blk = MAX(INT(maxsize(gd_homo), KIND=int_8)*my_a_size, &
1059 56 : INT(maxsize(gd_virtual), KIND=int_8)*my_i_size)*my_P_size
1060 :
1061 : ! Determine the available memory and estimate the number of possible parallel communication channels
1062 56 : CALL m_memory(mem)
1063 56 : mem_real = REAL(mem, KIND=dp)
1064 56 : mem_per_block = REAL(number_of_elements_per_blk, KIND=dp)*8.0_dp
1065 168 : number_of_parallel_channels = MAX(1, MIN(MAXVAL(grid) - 1, FLOOR(mem_real/mem_per_block)))
1066 56 : CALL para_env%min(number_of_parallel_channels)
1067 56 : IF (mp2_env%ri_grad%max_parallel_comm > 0) &
1068 56 : number_of_parallel_channels = MIN(number_of_parallel_channels, mp2_env%ri_grad%max_parallel_comm)
1069 :
1070 56 : IF (unit_nr > 0) THEN
1071 28 : WRITE (unit_nr, '(T3,A,T75,I6)') 'GRAD_INFO| Number of parallel communication channels:', number_of_parallel_channels
1072 28 : CALL m_flush(unit_nr)
1073 : END IF
1074 56 : CALL para_env%sync()
1075 :
1076 224 : ALLOCATE (buffer_1D(number_of_parallel_channels))
1077 112 : DO proc_shift = 1, number_of_parallel_channels
1078 224 : ALLOCATE (buffer_1D(proc_shift)%array(number_of_elements_per_blk))
1079 : END DO
1080 :
1081 224 : ALLOCATE (buffer_3D(number_of_parallel_channels))
1082 :
1083 : ! Allocate buffers for vector version of kahan summation
1084 56 : IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
1085 144 : ALLOCATE (buffer_compens_1D(2*MAX(my_a_size*maxsize(gd_virtual), my_i_size*maxsize(gd_homo))))
1086 : END IF
1087 :
1088 56 : IF (number_of_parallel_channels > 1) THEN
1089 0 : ALLOCATE (procs_recv(number_of_parallel_channels))
1090 0 : ALLOCATE (recv_requests(number_of_parallel_channels))
1091 0 : ALLOCATE (send_requests(MAXVAL(grid) - 1))
1092 : END IF
1093 :
1094 56 : IF (number_of_parallel_channels > 1 .AND. grid(1) > 1) THEN
1095 0 : CALL timeset(routineN//"_comm_a", handle2)
1096 0 : recv_requests(:) = mp_request_null
1097 0 : procs_recv(:) = -1
1098 0 : DO proc_shift = 1, MIN(grid(1) - 1, number_of_parallel_channels)
1099 0 : proc_a_recv = MODULO(mepos(1) - proc_shift, grid(1))
1100 0 : proc_recv = mepos(2)*grid(1) + proc_a_recv
1101 :
1102 0 : CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)
1103 :
1104 : buffer_3D(proc_shift)%array(1:my_P_size, 1:recv_a_size, 1:my_i_size) => &
1105 0 : buffer_1D(proc_shift)%array(1:INT(my_P_size, KIND=int_8)*recv_a_size*my_i_size)
1106 :
1107 : CALL para_env%irecv(buffer_3D(proc_shift)%array, blacs2mpi(my_prow, proc_recv), &
1108 0 : recv_requests(proc_shift), tag)
1109 :
1110 0 : procs_recv(proc_shift) = proc_a_recv
1111 : END DO
1112 :
1113 0 : send_requests(:) = mp_request_null
1114 0 : DO proc_shift = 1, grid(1) - 1
1115 0 : proc_a_send = MODULO(mepos(1) + proc_shift, grid(1))
1116 0 : proc_send = mepos(2)*grid(1) + proc_a_send
1117 :
1118 : CALL para_env%isend(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
1119 0 : send_requests(proc_shift), tag)
1120 : END DO
1121 0 : CALL timestop(handle2)
1122 : END IF
1123 :
1124 : CALL calc_P_rpa_a(P_ab(:, my_a_start:my_a_end), &
1125 : mat_S_3D, mat_work_iaP_3D, &
1126 : mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
1127 : Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
1128 56 : Eigenval(homo + my_a_start:homo + my_a_end), omega, weight)
1129 :
1130 56 : DO proc_shift = 1, grid(1) - 1
1131 0 : CALL timeset(routineN//"_comm_a", handle2)
1132 0 : IF (number_of_parallel_channels > 1) THEN
1133 0 : CALL mp_waitany(recv_requests, completed)
1134 :
1135 0 : CALL get_group_dist(gd_virtual, procs_recv(completed), recv_a_start, recv_a_end, recv_a_size)
1136 : ELSE
1137 0 : proc_a_send = MODULO(mepos(1) + proc_shift, grid(1))
1138 0 : proc_a_recv = MODULO(mepos(1) - proc_shift, grid(1))
1139 :
1140 0 : proc_send = mepos(2)*grid(1) + proc_a_send
1141 0 : proc_recv = mepos(2)*grid(1) + proc_a_recv
1142 :
1143 0 : CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)
1144 :
1145 : buffer_3D(1)%array(1:my_P_size, 1:recv_a_size, 1:my_i_size) => &
1146 0 : buffer_1D(1)%array(1:INT(my_P_size, KIND=int_8)*recv_a_size*my_i_size)
1147 :
1148 : CALL para_env%sendrecv(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
1149 0 : buffer_3D(1)%array, blacs2mpi(my_prow, proc_recv), tag)
1150 0 : completed = 1
1151 : END IF
1152 0 : CALL timestop(handle2)
1153 :
1154 : CALL calc_P_rpa_a(P_ab(:, recv_a_start:recv_a_end), &
1155 : mat_S_3D, buffer_3D(completed)%array, &
1156 : mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
1157 : Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
1158 0 : Eigenval(homo + recv_a_start:homo + recv_a_end), omega, weight)
1159 :
1160 56 : IF (number_of_parallel_channels > 1 .AND. number_of_parallel_channels + proc_shift < grid(1)) THEN
1161 0 : proc_a_recv = MODULO(mepos(1) - proc_shift - number_of_parallel_channels, grid(1))
1162 0 : proc_recv = mepos(2)*grid(1) + proc_a_recv
1163 :
1164 0 : CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)
1165 :
1166 : buffer_3D(completed)%array(1:my_P_size, 1:recv_a_size, 1:my_i_size) => &
1167 0 : buffer_1D(completed)%array(1:INT(my_P_size, KIND=int_8)*recv_a_size*my_i_size)
1168 :
1169 : CALL para_env%irecv(buffer_3D(completed)%array, blacs2mpi(my_prow, proc_recv), &
1170 0 : recv_requests(completed), tag)
1171 :
1172 0 : procs_recv(completed) = proc_a_recv
1173 : END IF
1174 : END DO
1175 :
1176 56 : IF (number_of_parallel_channels > 1 .AND. grid(1) > 1) THEN
1177 0 : CALL mp_waitall(send_requests)
1178 : END IF
1179 :
1180 56 : IF (number_of_parallel_channels > 1 .AND. grid(2) > 1) THEN
1181 0 : recv_requests(:) = mp_request_null
1182 0 : procs_recv(:) = -1
1183 0 : DO proc_shift = 1, MIN(grid(2) - 1, number_of_parallel_channels)
1184 0 : proc_i_recv = MODULO(mepos(2) - proc_shift, grid(2))
1185 0 : proc_recv = proc_i_recv*grid(1) + mepos(1)
1186 :
1187 0 : CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_i_end, recv_i_size)
1188 :
1189 : buffer_3D(proc_shift)%array(1:my_P_size, 1:my_a_size, 1:recv_i_size) => &
1190 0 : buffer_1D(proc_shift)%array(1:INT(my_P_size, KIND=int_8)*my_a_size*recv_i_size)
1191 :
1192 : CALL para_env%irecv(buffer_3D(proc_shift)%array, blacs2mpi(my_prow, proc_recv), &
1193 0 : recv_requests(proc_shift), tag)
1194 :
1195 0 : procs_recv(proc_shift) = proc_i_recv
1196 : END DO
1197 :
1198 0 : send_requests(:) = mp_request_null
1199 0 : DO proc_shift = 1, grid(2) - 1
1200 0 : proc_i_send = MODULO(mepos(2) + proc_shift, grid(2))
1201 0 : proc_send = proc_i_send*grid(1) + mepos(1)
1202 :
1203 : CALL para_env%isend(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
1204 0 : send_requests(proc_shift), tag)
1205 : END DO
1206 : END IF
1207 :
1208 : CALL calc_P_rpa_i(P_ij(:, my_i_start:my_i_end), &
1209 : mat_S_3D, mat_work_iaP_3D, &
1210 : mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
1211 : Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
1212 56 : Eigenval(my_i_start:my_i_end), omega, weight)
1213 :
1214 56 : DO proc_shift = 1, grid(2) - 1
1215 0 : CALL timeset(routineN//"_comm_i", handle2)
1216 0 : IF (number_of_parallel_channels > 1) THEN
1217 0 : CALL mp_waitany(recv_requests, completed)
1218 :
1219 0 : CALL get_group_dist(gd_homo, procs_recv(completed), recv_i_start, recv_i_end, recv_i_size)
1220 : ELSE
1221 0 : proc_i_send = MODULO(mepos(2) + proc_shift, grid(2))
1222 0 : proc_i_recv = MODULO(mepos(2) - proc_shift, grid(2))
1223 :
1224 0 : proc_send = proc_i_send*grid(1) + mepos(1)
1225 0 : proc_recv = proc_i_recv*grid(1) + mepos(1)
1226 :
1227 0 : CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_i_end, recv_i_size)
1228 :
1229 : buffer_3D(1)%array(1:my_P_size, 1:my_a_size, 1:recv_i_size) => &
1230 0 : buffer_1D(1)%array(1:INT(my_P_size, KIND=int_8)*my_a_size*recv_i_size)
1231 :
1232 : CALL para_env%sendrecv(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
1233 0 : buffer_3D(1)%array, blacs2mpi(my_prow, proc_recv), tag)
1234 0 : completed = 1
1235 : END IF
1236 0 : CALL timestop(handle2)
1237 :
1238 : CALL calc_P_rpa_i(P_ij(:, recv_i_start:recv_i_end), &
1239 : mat_S_3D, buffer_3D(completed)%array, &
1240 : mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
1241 : Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
1242 0 : Eigenval(recv_i_start:recv_i_end), omega, weight)
1243 :
1244 56 : IF (number_of_parallel_channels > 1 .AND. number_of_parallel_channels + proc_shift < grid(2)) THEN
1245 0 : proc_i_recv = MODULO(mepos(2) - proc_shift - number_of_parallel_channels, grid(2))
1246 0 : proc_recv = proc_i_recv*grid(1) + mepos(1)
1247 :
1248 0 : CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_a_end, recv_i_size)
1249 :
1250 : buffer_3D(completed)%array(1:my_P_size, 1:my_a_size, 1:recv_i_size) => &
1251 0 : buffer_1D(completed)%array(1:INT(my_P_size, KIND=int_8)*my_a_size*recv_i_size)
1252 :
1253 : CALL para_env%irecv(buffer_3D(completed)%array, blacs2mpi(my_prow, proc_recv), &
1254 0 : recv_requests(completed), tag)
1255 :
1256 0 : procs_recv(completed) = proc_i_recv
1257 : END IF
1258 : END DO
1259 :
1260 56 : IF (number_of_parallel_channels > 1 .AND. grid(2) > 1) THEN
1261 0 : CALL mp_waitall(send_requests)
1262 : END IF
1263 :
1264 56 : IF (number_of_parallel_channels > 1) THEN
1265 0 : DEALLOCATE (procs_recv)
1266 0 : DEALLOCATE (recv_requests)
1267 0 : DEALLOCATE (send_requests)
1268 : END IF
1269 :
1270 56 : IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
1271 : ! release memory allocated by local_gemm when run on GPU. local_gemm_ctx is null on cpu only runs
1272 48 : CALL local_gemm_destroy(mp2_env%local_gemm_ctx)
1273 48 : mp2_env%local_gemm_ctx = C_NULL_PTR
1274 48 : DEALLOCATE (buffer_compens_1D)
1275 : END IF
1276 :
1277 112 : DO proc_shift = 1, number_of_parallel_channels
1278 56 : NULLIFY (buffer_3D(proc_shift)%array)
1279 112 : DEALLOCATE (buffer_1D(proc_shift)%array)
1280 : END DO
1281 56 : DEALLOCATE (buffer_3D, buffer_1D)
1282 :
1283 56 : CALL timestop(handle)
1284 :
1285 168 : END SUBROUTINE
1286 :
1287 : ! **************************************************************************************************
1288 : !> \brief ...
1289 : !> \param P_ab ...
1290 : !> \param mat_S ...
1291 : !> \param mat_work ...
1292 : !> \param dot_blksize ...
1293 : !> \param buffer_1D ...
1294 : !> \param local_gemm_ctx ...
1295 : !> \param my_eval_virt ...
1296 : !> \param my_eval_occ ...
1297 : !> \param recv_eval_virt ...
1298 : !> \param omega ...
1299 : !> \param weight ...
1300 : ! **************************************************************************************************
   ! Accumulates the virt-virt density block:
   !   P_ab(a,b) -= weight * sum_{i,P} S(P,a,i)*W(P,b,i) * (1 - omega^2/((e_a-e_i)*(e_b-e_i)))
   ! using Kahan-compensated summation in both branches. Do NOT reorder statements:
   ! the compensation arithmetic depends on the exact evaluation order.
   SUBROUTINE calc_P_rpa_a(P_ab, mat_S, mat_work, dot_blksize, buffer_1D, local_gemm_ctx, &
                           my_eval_virt, my_eval_occ, recv_eval_virt, omega, weight)
      REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: P_ab
      REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN) :: mat_S, mat_work
      INTEGER, INTENT(IN) :: dot_blksize
      ! scratch holding both the compensation and the unscaled gemm result (2 panels)
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
         INTENT(INOUT), TARGET :: buffer_1D
      TYPE(C_PTR), INTENT(INOUT) :: local_gemm_ctx
      REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: my_eval_virt, my_eval_occ, recv_eval_virt
      REAL(KIND=dp), INTENT(IN) :: omega, weight

      CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_P_rpa_a'

      INTEGER :: handle, my_a, my_a_size, my_i, &
                 my_i_size, my_P_size, P_end, P_start, &
                 recv_a_size, stripesize
      REAL(KIND=dp), DIMENSION(:, :), POINTER :: buffer_compens, buffer_unscaled

      CALL timeset(routineN, handle)

      my_i_size = SIZE(mat_S, 3)
      recv_a_size = SIZE(mat_work, 2)
      my_a_size = SIZE(mat_S, 2)
      my_P_size = SIZE(mat_S, 1)

      IF (dot_blksize >= blksize_threshold) THEN
         ! Blocked GEMM path: partition buffer_1D into compensation + unscaled panels
         buffer_compens(1:my_a_size, 1:recv_a_size) => buffer_1D(1:my_a_size*recv_a_size)
         buffer_compens = 0.0_dp
         buffer_unscaled(1:my_a_size, 1:recv_a_size) => buffer_1D(my_a_size*recv_a_size + 1:2*my_a_size*recv_a_size)

         ! This loop imitates the actual tensor contraction
         DO my_i = 1, my_i_size
            DO P_start = 1, my_P_size, dot_blksize
               stripesize = MIN(dot_blksize, my_P_size - P_start + 1)
               P_end = P_start + stripesize - 1

               ! unscaled partial contraction over the current P stripe (note -weight)
               CALL local_gemm("T", "N", my_a_size, recv_a_size, stripesize, &
                               -weight, mat_S(P_start:P_end, :, my_i), stripesize, &
                               mat_work(P_start:P_end, :, my_i), stripesize, &
                               0.0_dp, buffer_unscaled, my_a_size, local_gemm_ctx)

               ! apply the frequency factor and fold in the running compensation
               CALL scale_buffer_and_add_compens_virt(buffer_unscaled, buffer_compens, omega, &
                                                      my_eval_virt, recv_eval_virt, my_eval_occ(my_i))

               ! compensated accumulation into P_ab
               CALL kahan_step(buffer_compens, P_ab)
            END DO
         END DO
      ELSE
         ! Dot-product fallback: per-element Kahan summation over the occupied index
         BLOCK
            INTEGER :: recv_a
            REAL(KIND=dp) :: tmp, e_i, e_a, e_b, omega2, my_compens, my_p, s
            omega2 = -omega**2
            !$OMP PARALLEL DO COLLAPSE(2) DEFAULT(NONE)&
            !$OMP SHARED(my_a_size,recv_a_size,my_i_size,mat_S,my_eval_virt,recv_eval_virt,my_eval_occ,omega2,&
            !$OMP P_ab,weight,mat_work)&
            !$OMP PRIVATE(tmp,my_a,recv_a,my_i,e_a,e_b,e_i,my_compens,my_p,s)
            DO my_a = 1, my_a_size
               DO recv_a = 1, recv_a_size
                  e_a = my_eval_virt(my_a)
                  e_b = recv_eval_virt(recv_a)
                  my_p = P_ab(my_a, recv_a)
                  my_compens = 0.0_dp
                  DO my_i = 1, my_i_size
                     ! e_i carries the minus sign, so (e_a + e_i) = eps_a - eps_i
                     e_i = -my_eval_occ(my_i)
                     tmp = -weight*accurate_dot_product(mat_S(:, my_a, my_i), mat_work(:, recv_a, my_i)) &
                           *(1.0_dp + omega2/((e_a + e_i)*(e_b + e_i))) - my_compens
                     ! classic Kahan update: s, then new compensation, then carry
                     s = my_p + tmp
                     my_compens = (s - my_p) - tmp
                     my_p = s
                  END DO
                  P_ab(my_a, recv_a) = my_p
               END DO
            END DO
         END BLOCK
      END IF

      CALL timestop(handle)

   END SUBROUTINE calc_P_rpa_a
1380 :
1381 : ! **************************************************************************************************
1382 : !> \brief ...
1383 : !> \param P_ij ...
1384 : !> \param mat_S ...
1385 : !> \param mat_work ...
1386 : !> \param dot_blksize ...
1387 : !> \param buffer_1D ...
1388 : !> \param local_gemm_ctx ...
1389 : !> \param my_eval_virt ...
1390 : !> \param my_eval_occ ...
1391 : !> \param recv_eval_occ ...
1392 : !> \param omega ...
1393 : !> \param weight ...
1394 : ! **************************************************************************************************
   ! Accumulates the occ-occ density block:
   !   P_ij(i,j) += weight * sum_{a,P} S(P,a,i)*W(P,a,j) * (1 - omega^2/((e_a-e_i)*(e_a-e_j)))
   ! Mirror image of calc_P_rpa_a (opposite overall sign, contraction over the virtual
   ! index). Kahan-compensated in both branches; do not reorder statements.
   SUBROUTINE calc_P_rpa_i(P_ij, mat_S, mat_work, dot_blksize, buffer_1D, local_gemm_ctx, &
                           my_eval_virt, my_eval_occ, recv_eval_occ, omega, weight)
      REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: P_ij
      ! NOTE(review): declared INTENT(INOUT) here but INTENT(IN) in calc_P_rpa_a; the
      ! visible code only reads them — presumably needed for passing sections to
      ! local_gemm, confirm against the local_gemm interface before tightening.
      REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT) :: mat_S, mat_work
      INTEGER, INTENT(IN) :: dot_blksize
      ! scratch holding both the compensation and the unscaled gemm result (2 panels)
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
         INTENT(INOUT), TARGET :: buffer_1D
      TYPE(C_PTR), INTENT(INOUT) :: local_gemm_ctx
      REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: my_eval_virt, my_eval_occ, recv_eval_occ
      REAL(KIND=dp), INTENT(IN) :: omega, weight

      CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_P_rpa_i'

      INTEGER :: handle, my_a, my_a_size, my_i, &
                 my_i_size, my_P_size, P_end, P_start, &
                 recv_i_size, stripesize
      REAL(KIND=dp), DIMENSION(:, :), POINTER :: buffer_compens, buffer_unscaled

      CALL timeset(routineN, handle)

      my_i_size = SIZE(mat_S, 3)
      recv_i_size = SIZE(mat_work, 3)
      my_a_size = SIZE(mat_S, 2)
      my_P_size = SIZE(mat_S, 1)

      IF (dot_blksize >= blksize_threshold) THEN
         ! Blocked GEMM path: partition buffer_1D into compensation + unscaled panels
         buffer_compens(1:my_i_size, 1:recv_i_size) => buffer_1D(1:my_i_size*recv_i_size)
         buffer_compens = 0.0_dp
         buffer_unscaled(1:my_i_size, 1:recv_i_size) => buffer_1D(my_i_size*recv_i_size + 1:2*my_i_size*recv_i_size)

         ! This loop imitates the actual tensor contraction
         DO my_a = 1, my_a_size
            DO P_start = 1, my_P_size, dot_blksize
               stripesize = MIN(dot_blksize, my_P_size - P_start + 1)
               P_end = P_start + stripesize - 1

               ! unscaled partial contraction over the current P stripe (note +weight)
               CALL local_gemm("T", "N", my_i_size, recv_i_size, stripesize, &
                               weight, mat_S(P_start:P_end, my_a, :), stripesize, &
                               mat_work(P_start:P_end, my_a, :), stripesize, &
                               0.0_dp, buffer_unscaled, my_i_size, local_gemm_ctx)

               ! apply the frequency factor and fold in the running compensation
               CALL scale_buffer_and_add_compens_occ(buffer_unscaled, buffer_compens, omega, &
                                                     my_eval_occ, recv_eval_occ, my_eval_virt(my_a))

               ! compensated accumulation into P_ij
               CALL kahan_step(buffer_compens, P_ij)
            END DO
         END DO
      ELSE
         ! Dot-product fallback: per-element Kahan summation over the virtual index
         BLOCK
            REAL(KIND=dp) :: tmp, e_i, e_a, e_j, omega2, my_compens, my_p, s
            INTEGER :: recv_i
            omega2 = -omega**2
            !$OMP PARALLEL DO COLLAPSE(2) DEFAULT(NONE)&
            !$OMP SHARED(my_a_size,recv_i_size,my_i_size,mat_S,my_eval_occ,my_eval_virt,omega2,&
            !$OMP recv_eval_occ,P_ij,weight,mat_work)&
            !$OMP PRIVATE(tmp,my_a,recv_i,my_i,e_i,e_j,e_a,my_compens,my_p,s)
            DO my_i = 1, my_i_size
               DO recv_i = 1, recv_i_size
                  e_i = my_eval_occ(my_i)
                  e_j = recv_eval_occ(recv_i)
                  my_p = P_ij(my_i, recv_i)
                  my_compens = 0.0_dp
                  DO my_a = 1, my_a_size
                     e_a = my_eval_virt(my_a)
                     tmp = weight*accurate_dot_product(mat_S(:, my_a, my_i), mat_work(:, my_a, recv_i)) &
                           *(1.0_dp + omega2/((e_a - e_i)*(e_a - e_j))) - my_compens
                     ! classic Kahan update: s, then new compensation, then carry
                     s = my_p + tmp
                     my_compens = (s - my_p) - tmp
                     my_p = s
                  END DO
                  P_ij(my_i, recv_i) = my_p
               END DO
            END DO
         END BLOCK
      END IF

      CALL timestop(handle)

   END SUBROUTINE calc_P_rpa_i
1474 :
1475 : ! **************************************************************************************************
1476 : !> \brief ...
1477 : !> \param compens ...
1478 : !> \param P ...
1479 : ! **************************************************************************************************
1480 1184 : SUBROUTINE kahan_step(compens, P)
1481 : REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: compens, P
1482 :
1483 : CHARACTER(LEN=*), PARAMETER :: routineN = 'kahan_step'
1484 :
1485 : INTEGER :: handle, i, j
1486 : REAL(KIND=dp) :: my_compens, my_p, s
1487 :
1488 1184 : CALL timeset(routineN, handle)
1489 :
1490 1184 : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(P,compens) PRIVATE(i,my_p,my_compens,s, j) COLLAPSE(2)
1491 : DO j = 1, SIZE(compens, 2)
1492 : DO i = 1, SIZE(compens, 1)
1493 : my_p = P(i, j)
1494 : my_compens = compens(i, j)
1495 : s = my_p + my_compens
1496 : compens(i, j) = (s - my_p) - my_compens
1497 : P(i, j) = s
1498 : END DO
1499 : END DO
1500 : !$OMP END PARALLEL DO
1501 :
1502 1184 : CALL timestop(handle)
1503 :
1504 1184 : END SUBROUTINE
1505 :
1506 : ! **************************************************************************************************
1507 : !> \brief ...
1508 : !> \param buffer_unscaled ...
1509 : !> \param buffer_compens ...
1510 : !> \param omega ...
1511 : !> \param my_eval_virt ...
1512 : !> \param recv_eval_virt ...
1513 : !> \param my_eval_occ ...
1514 : ! **************************************************************************************************
1515 184 : SUBROUTINE scale_buffer_and_add_compens_virt(buffer_unscaled, buffer_compens, omega, &
1516 184 : my_eval_virt, recv_eval_virt, my_eval_occ)
1517 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: buffer_unscaled
1518 : REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: buffer_compens
1519 : REAL(KIND=dp), INTENT(IN) :: omega
1520 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: my_eval_virt, recv_eval_virt
1521 : REAL(KIND=dp), INTENT(IN) :: my_eval_occ
1522 :
1523 : CHARACTER(LEN=*), PARAMETER :: routineN = 'scale_buffer_and_add_compens_virt'
1524 :
1525 : INTEGER :: handle, my_a, my_b
1526 :
1527 184 : CALL timeset(routineN, handle)
1528 :
1529 : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(buffer_unscaled,buffer_compens,omega,&
1530 184 : !$OMP my_eval_virt,recv_eval_virt,my_eval_occ) PRIVATE(my_a,my_b)
1531 : DO my_b = 1, SIZE(buffer_compens, 2)
1532 : DO my_a = 1, SIZE(buffer_compens, 1)
1533 : buffer_compens(my_a, my_b) = buffer_unscaled(my_a, my_b) &
1534 : *(1.0_dp - omega**2/((my_eval_virt(my_a) - my_eval_occ)*(recv_eval_virt(my_b) - my_eval_occ))) &
1535 : - buffer_compens(my_a, my_b)
1536 : END DO
1537 : END DO
1538 : !$OMP END PARALLEL DO
1539 :
1540 184 : CALL timestop(handle)
1541 :
1542 184 : END SUBROUTINE scale_buffer_and_add_compens_virt
1543 :
1544 : ! **************************************************************************************************
1545 : !> \brief ...
1546 : !> \param buffer_unscaled ...
1547 : !> \param buffer_compens ...
1548 : !> \param omega ...
1549 : !> \param my_eval_occ ...
1550 : !> \param recv_eval_occ ...
1551 : !> \param my_eval_virt ...
1552 : ! **************************************************************************************************
1553 1000 : SUBROUTINE scale_buffer_and_add_compens_occ(buffer_unscaled, buffer_compens, omega, &
1554 1000 : my_eval_occ, recv_eval_occ, my_eval_virt)
1555 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: buffer_unscaled
1556 : REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: buffer_compens
1557 : REAL(KIND=dp), INTENT(IN) :: omega
1558 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: my_eval_occ, recv_eval_occ
1559 : REAL(KIND=dp), INTENT(IN) :: my_eval_virt
1560 :
1561 : CHARACTER(LEN=*), PARAMETER :: routineN = 'scale_buffer_and_add_compens_occ'
1562 :
1563 : INTEGER :: handle, my_i, my_j
1564 :
1565 1000 : CALL timeset(routineN, handle)
1566 :
1567 : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(buffer_compens,buffer_unscaled,omega,&
1568 1000 : !$OMP my_eval_virt,my_eval_occ,recv_eval_occ) PRIVATE(my_i,my_j)
1569 : DO my_j = 1, SIZE(buffer_compens, 2)
1570 : DO my_i = 1, SIZE(buffer_compens, 1)
1571 : buffer_compens(my_i, my_j) = buffer_unscaled(my_i, my_j) &
1572 : *(1.0_dp - omega**2/((my_eval_virt - my_eval_occ(my_i))*(my_eval_virt - recv_eval_occ(my_j)))) &
1573 : - buffer_compens(my_i, my_j)
1574 : END DO
1575 : END DO
1576 : !$OMP END PARALLEL DO
1577 :
1578 1000 : CALL timestop(handle)
1579 :
1580 1000 : END SUBROUTINE scale_buffer_and_add_compens_occ
1581 :
1582 : ! **************************************************************************************************
1583 : !> \brief ...
1584 : !> \param x ...
1585 : !> \return ...
1586 : ! **************************************************************************************************
1587 1320 : ELEMENTAL FUNCTION sinh_over_x(x) RESULT(res)
1588 : REAL(KIND=dp), INTENT(IN) :: x
1589 : REAL(KIND=dp) :: res
1590 :
1591 : ! Calculate sinh(x)/x
1592 : ! Split the intervall to prevent numerical instabilities
1593 1320 : IF (ABS(x) > 3.0e-4_dp) THEN
1594 1318 : res = SINH(x)/x
1595 : ELSE
1596 2 : res = 1.0_dp + x**2/6.0_dp
1597 : END IF
1598 :
1599 1320 : END FUNCTION sinh_over_x
1600 :
1601 : ! **************************************************************************************************
1602 : !> \brief ...
1603 : !> \param fm_work_iaP ...
1604 : !> \param fm_mat_S ...
1605 : !> \param pair_list ...
1606 : !> \param virtual ...
1607 : !> \param P_ij ...
1608 : !> \param Eigenval ...
1609 : !> \param omega ...
1610 : !> \param weight ...
1611 : !> \param index2send ...
1612 : !> \param index2recv ...
1613 : !> \param dot_blksize ...
1614 : ! **************************************************************************************************
1615 8 : SUBROUTINE calc_Pij_degen(fm_work_iaP, fm_mat_S, pair_list, virtual, P_ij, Eigenval, &
1616 8 : omega, weight, index2send, index2recv, dot_blksize)
1617 : TYPE(cp_fm_type), INTENT(IN) :: fm_work_iaP, fm_mat_S
1618 : INTEGER, DIMENSION(:, :), INTENT(IN) :: pair_list
1619 : INTEGER, INTENT(IN) :: virtual
1620 : REAL(KIND=dp), DIMENSION(:), INTENT(INOUT) :: P_ij
1621 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
1622 : REAL(KIND=dp), INTENT(IN) :: omega, weight
1623 : TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send, index2recv
1624 : INTEGER, INTENT(IN) :: dot_blksize
1625 :
1626 : CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_Pij_degen'
1627 :
1628 : INTEGER :: avirt, col_global, col_local, counter, first_p_pos(2), first_p_pos_col, handle, &
1629 : handle2, ij_counter, iocc, my_col_local, my_i, my_j, my_pcol, my_prow, ncol_block, &
1630 : ncol_local, nrow_local, num_ij_pairs, num_pe_col, pcol, pcol_recv, pcol_send, proc_shift, &
1631 : recv_size, send_size, size_recv_buffer, size_send_buffer, tag
1632 8 : INTEGER, DIMENSION(:), POINTER :: col_indices, ncol_locals
1633 8 : INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
1634 : REAL(KIND=dp) :: trace
1635 8 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: buffer_recv, buffer_send
1636 : TYPE(cp_blacs_env_type), POINTER :: context
1637 : TYPE(mp_para_env_type), POINTER :: para_env
1638 :
1639 8 : CALL timeset(routineN, handle)
1640 :
1641 : CALL cp_fm_struct_get(fm_work_iaP%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
1642 : ncol_local=ncol_local, col_indices=col_indices, &
1643 8 : context=context, ncol_block=ncol_block, first_p_pos=first_p_pos, nrow_local=nrow_local)
1644 : CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
1645 8 : number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
1646 8 : first_p_pos_col = first_p_pos(2)
1647 :
1648 8 : num_ij_pairs = SIZE(pair_list, 2)
1649 :
1650 8 : tag = 42
1651 :
1652 104 : DO ij_counter = 1, num_ij_pairs
1653 :
1654 96 : my_i = pair_list(1, ij_counter)
1655 96 : my_j = pair_list(2, ij_counter)
1656 :
1657 96 : trace = 0.0_dp
1658 :
1659 7392 : DO col_local = 1, ncol_local
1660 7296 : col_global = col_indices(col_local)
1661 :
1662 7296 : iocc = MAX(1, col_global - 1)/virtual + 1
1663 7296 : avirt = col_global - (iocc - 1)*virtual
1664 :
1665 7296 : IF (iocc /= my_j) CYCLE
1666 1824 : pcol = cp_fm_indxg2p((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
1667 1824 : IF (pcol /= my_pcol) CYCLE
1668 :
1669 1824 : my_col_local = cp_fm_indxg2l((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
1670 :
1671 : trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), fm_work_iaP%local_data(:, col_local), &
1672 7392 : dot_blksize)
1673 : END DO
1674 :
1675 104 : P_ij(ij_counter) = P_ij(ij_counter) - trace*sinh_over_x(0.5_dp*(Eigenval(my_i) - Eigenval(my_j))*omega)*omega*weight
1676 :
1677 : END DO
1678 :
1679 8 : IF (num_pe_col > 1) THEN
1680 : size_send_buffer = 0
1681 : size_recv_buffer = 0
1682 0 : DO proc_shift = 1, num_pe_col - 1
1683 0 : pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
1684 0 : pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
1685 :
1686 0 : IF (ALLOCATED(index2send(pcol_send)%array)) &
1687 0 : size_send_buffer = MAX(size_send_buffer, SIZE(index2send(pcol_send)%array))
1688 :
1689 0 : IF (ALLOCATED(index2recv(pcol_recv)%array)) &
1690 0 : size_recv_buffer = MAX(size_recv_buffer, SIZE(index2recv(pcol_recv)%array))
1691 : END DO
1692 :
1693 0 : ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
1694 :
1695 0 : DO proc_shift = 1, num_pe_col - 1
1696 0 : pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
1697 0 : pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
1698 :
1699 : ! Collect data and exchange
1700 0 : send_size = 0
1701 0 : IF (ALLOCATED(index2send(pcol_send)%array)) send_size = SIZE(index2send(pcol_send)%array)
1702 :
1703 0 : DO counter = 1, send_size
1704 0 : buffer_send(:, counter) = fm_work_iaP%local_data(:, index2send(pcol_send)%array(counter))
1705 : END DO
1706 :
1707 0 : recv_size = 0
1708 0 : IF (ALLOCATED(index2recv(pcol_recv)%array)) recv_size = SIZE(index2recv(pcol_recv)%array)
1709 0 : IF (recv_size > 0) THEN
1710 0 : CALL timeset(routineN//"_send", handle2)
1711 0 : IF (send_size > 0) THEN
1712 : CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), &
1713 0 : buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
1714 : ELSE
1715 0 : CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
1716 : END IF
1717 0 : CALL timestop(handle2)
1718 :
1719 0 : DO ij_counter = 1, num_ij_pairs
1720 : ! Collect the contributions of the matrix elements
1721 :
1722 0 : my_i = pair_list(1, ij_counter)
1723 0 : my_j = pair_list(2, ij_counter)
1724 :
1725 0 : trace = 0.0_dp
1726 :
1727 0 : DO col_local = 1, recv_size
1728 0 : col_global = index2recv(pcol_recv)%array(col_local)
1729 :
1730 0 : iocc = MAX(1, col_global - 1)/virtual + 1
1731 0 : IF (iocc /= my_j) CYCLE
1732 0 : avirt = col_global - (iocc - 1)*virtual
1733 0 : pcol = cp_fm_indxg2p((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
1734 0 : IF (pcol /= my_pcol) CYCLE
1735 :
1736 0 : my_col_local = cp_fm_indxg2l((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
1737 :
1738 : trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), buffer_recv(:, col_local), &
1739 0 : dot_blksize)
1740 : END DO
1741 :
1742 : P_ij(ij_counter) = P_ij(ij_counter) &
1743 0 : - trace*sinh_over_x(0.5_dp*(Eigenval(my_i) - Eigenval(my_j))*omega)*omega*weight
1744 : END DO
1745 0 : ELSE IF (send_size > 0) THEN
1746 0 : CALL timeset(routineN//"_send", handle2)
1747 0 : CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), tag)
1748 0 : CALL timestop(handle2)
1749 : END IF
1750 : END DO
1751 0 : IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
1752 0 : IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
1753 : END IF
1754 :
1755 8 : CALL timestop(handle)
1756 :
1757 16 : END SUBROUTINE
1758 :
1759 : ! **************************************************************************************************
1760 : !> \brief ...
1761 : !> \param fm_work_iaP ...
1762 : !> \param fm_mat_S ...
1763 : !> \param pair_list ...
1764 : !> \param virtual ...
1765 : !> \param P_ab ...
1766 : !> \param Eigenval ...
1767 : !> \param omega ...
1768 : !> \param weight ...
1769 : !> \param index2send ...
1770 : !> \param index2recv ...
1771 : !> \param dot_blksize ...
1772 : ! **************************************************************************************************
1773 8 : SUBROUTINE calc_Pab_degen(fm_work_iaP, fm_mat_S, pair_list, virtual, P_ab, Eigenval, &
1774 8 : omega, weight, index2send, index2recv, dot_blksize)
1775 : TYPE(cp_fm_type), INTENT(IN) :: fm_work_iaP, fm_mat_S
1776 : INTEGER, DIMENSION(:, :), INTENT(IN) :: pair_list
1777 : INTEGER, INTENT(IN) :: virtual
1778 : REAL(KIND=dp), DIMENSION(:), INTENT(INOUT) :: P_ab
1779 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
1780 : REAL(KIND=dp), INTENT(IN) :: omega, weight
1781 : TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send, index2recv
1782 : INTEGER, INTENT(IN) :: dot_blksize
1783 :
1784 : CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_Pab_degen'
1785 :
1786 : INTEGER :: ab_counter, avirt, col_global, col_local, counter, first_p_pos(2), &
1787 : first_p_pos_col, handle, handle2, iocc, my_a, my_b, my_col_local, my_pcol, my_prow, &
1788 : ncol_block, ncol_local, nrow_local, num_ab_pairs, num_pe_col, pcol, pcol_recv, pcol_send, &
1789 : proc_shift, recv_size, send_size, size_recv_buffer, size_send_buffer, tag
1790 8 : INTEGER, DIMENSION(:), POINTER :: col_indices, ncol_locals
1791 8 : INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
1792 : REAL(KIND=dp) :: trace
1793 8 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: buffer_recv, buffer_send
1794 : TYPE(cp_blacs_env_type), POINTER :: context
1795 : TYPE(mp_para_env_type), POINTER :: para_env
1796 :
1797 8 : CALL timeset(routineN, handle)
1798 :
1799 : CALL cp_fm_struct_get(fm_work_iaP%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
1800 : ncol_local=ncol_local, col_indices=col_indices, &
1801 8 : context=context, ncol_block=ncol_block, first_p_pos=first_p_pos, nrow_local=nrow_local)
1802 : CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
1803 8 : number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
1804 8 : first_p_pos_col = first_p_pos(2)
1805 :
1806 8 : num_ab_pairs = SIZE(pair_list, 2)
1807 :
1808 8 : tag = 43
1809 :
1810 1232 : DO ab_counter = 1, num_ab_pairs
1811 :
1812 1224 : my_a = pair_list(1, ab_counter)
1813 1224 : my_b = pair_list(2, ab_counter)
1814 :
1815 1224 : trace = 0.0_dp
1816 :
1817 94248 : DO col_local = 1, ncol_local
1818 93024 : col_global = col_indices(col_local)
1819 :
1820 93024 : iocc = MAX(1, col_global - 1)/virtual + 1
1821 93024 : avirt = col_global - (iocc - 1)*virtual
1822 :
1823 93024 : IF (avirt /= my_b) CYCLE
1824 4896 : pcol = cp_fm_indxg2p((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
1825 4896 : IF (pcol /= my_pcol) CYCLE
1826 4896 : my_col_local = cp_fm_indxg2l((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
1827 :
1828 : trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), fm_work_iaP%local_data(:, col_local), &
1829 94248 : dot_blksize)
1830 :
1831 : END DO
1832 :
1833 : P_ab(ab_counter) = P_ab(ab_counter) &
1834 1232 : + trace*sinh_over_x(0.5_dp*(Eigenval(my_a) - Eigenval(my_b))*omega)*omega*weight
1835 :
1836 : END DO
1837 :
1838 8 : IF (num_pe_col > 1) THEN
1839 : size_send_buffer = 0
1840 : size_recv_buffer = 0
1841 0 : DO proc_shift = 1, num_pe_col - 1
1842 0 : pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
1843 0 : pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
1844 :
1845 0 : IF (ALLOCATED(index2send(pcol_send)%array)) &
1846 0 : size_send_buffer = MAX(size_send_buffer, SIZE(index2send(pcol_send)%array))
1847 :
1848 0 : IF (ALLOCATED(index2recv(pcol_recv)%array)) &
1849 0 : size_recv_buffer = MAX(size_recv_buffer, SIZE(index2recv(pcol_recv)%array))
1850 : END DO
1851 :
1852 0 : ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
1853 :
1854 0 : DO proc_shift = 1, num_pe_col - 1
1855 0 : pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
1856 0 : pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
1857 :
1858 : ! Collect data and exchange
1859 0 : send_size = 0
1860 0 : IF (ALLOCATED(index2send(pcol_send)%array)) send_size = SIZE(index2send(pcol_send)%array)
1861 :
1862 0 : DO counter = 1, send_size
1863 0 : buffer_send(:, counter) = fm_work_iaP%local_data(:, index2send(pcol_send)%array(counter))
1864 : END DO
1865 :
1866 0 : recv_size = 0
1867 0 : IF (ALLOCATED(index2recv(pcol_recv)%array)) recv_size = SIZE(index2recv(pcol_recv)%array)
1868 0 : IF (recv_size > 0) THEN
1869 0 : CALL timeset(routineN//"_send", handle2)
1870 0 : IF (send_size > 0) THEN
1871 : CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), &
1872 0 : buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
1873 : ELSE
1874 0 : CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
1875 : END IF
1876 0 : CALL timestop(handle2)
1877 :
1878 0 : DO ab_counter = 1, num_ab_pairs
1879 : ! Collect the contributions of the matrix elements
1880 :
1881 0 : my_a = pair_list(1, ab_counter)
1882 0 : my_b = pair_list(2, ab_counter)
1883 :
1884 0 : trace = 0.0_dp
1885 :
1886 0 : DO col_local = 1, SIZE(index2recv(pcol_recv)%array)
1887 0 : col_global = index2recv(pcol_recv)%array(col_local)
1888 :
1889 0 : iocc = MAX(1, col_global - 1)/virtual + 1
1890 0 : avirt = col_global - (iocc - 1)*virtual
1891 0 : IF (avirt /= my_b) CYCLE
1892 0 : pcol = cp_fm_indxg2p((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
1893 0 : IF (pcol /= my_pcol) CYCLE
1894 :
1895 0 : my_col_local = cp_fm_indxg2l((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
1896 :
1897 : trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), buffer_recv(:, col_local), &
1898 0 : dot_blksize)
1899 : END DO
1900 :
1901 : P_ab(ab_counter) = P_ab(ab_counter) &
1902 0 : + trace*sinh_over_x(0.5_dp*(Eigenval(my_a) - Eigenval(my_b))*omega)*omega*weight
1903 :
1904 : END DO
1905 0 : ELSE IF (send_size > 0) THEN
1906 0 : CALL timeset(routineN//"_send", handle2)
1907 0 : CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), tag)
1908 0 : CALL timestop(handle2)
1909 : END IF
1910 : END DO
1911 0 : IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
1912 0 : IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
1913 : END IF
1914 :
1915 8 : CALL timestop(handle)
1916 :
1917 16 : END SUBROUTINE
1918 :
1919 : ! **************************************************************************************************
1920 : !> \brief ...
1921 : !> \param index2send ...
1922 : !> \param index2recv ...
1923 : !> \param fm_mat_S ...
1924 : !> \param mat_S_3D ...
1925 : !> \param gd_homo ...
1926 : !> \param gd_virtual ...
1927 : !> \param mepos ...
1928 : ! **************************************************************************************************
1929 112 : SUBROUTINE redistribute_fm_mat_S(index2send, index2recv, fm_mat_S, mat_S_3D, gd_homo, gd_virtual, mepos)
1930 : TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send
1931 : TYPE(two_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2recv
1932 : TYPE(cp_fm_type), INTENT(IN) :: fm_mat_S
1933 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
1934 : INTENT(OUT) :: mat_S_3D
1935 : TYPE(group_dist_d1_type), INTENT(IN) :: gd_homo, gd_virtual
1936 : INTEGER, DIMENSION(2), INTENT(IN) :: mepos
1937 :
1938 : CHARACTER(LEN=*), PARAMETER :: routineN = 'redistribute_fm_mat_S'
1939 :
1940 : INTEGER :: col_local, handle, my_a, my_homo, my_i, my_pcol, my_prow, my_virtual, nrow_local, &
1941 : num_pe_col, proc_recv, proc_send, proc_shift, recv_size, send_size, size_recv_buffer, &
1942 : size_send_buffer, tag
1943 112 : INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
1944 112 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: buffer_recv, buffer_send
1945 : TYPE(mp_para_env_type), POINTER :: para_env
1946 :
1947 112 : CALL timeset(routineN, handle)
1948 :
1949 112 : tag = 46
1950 :
1951 : CALL fm_mat_S%matrix_struct%context%get(my_process_row=my_prow, my_process_column=my_pcol, &
1952 112 : number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
1953 :
1954 112 : CALL cp_fm_struct_get(fm_mat_S%matrix_struct, nrow_local=nrow_local, para_env=para_env)
1955 :
1956 112 : CALL get_group_dist(gd_homo, mepos(2), sizes=my_homo)
1957 112 : CALL get_group_dist(gd_virtual, mepos(1), sizes=my_virtual)
1958 :
1959 560 : ALLOCATE (mat_S_3D(nrow_local, my_virtual, my_homo))
1960 :
1961 112 : IF (ALLOCATED(index2send(my_pcol)%array)) THEN
1962 8928 : DO col_local = 1, SIZE(index2send(my_pcol)%array)
1963 8816 : my_a = index2recv(my_pcol)%array(1, col_local)
1964 8816 : my_i = index2recv(my_pcol)%array(2, col_local)
1965 678032 : mat_S_3D(:, my_a, my_i) = fm_mat_S%local_data(:, index2send(my_pcol)%array(col_local))
1966 : END DO
1967 : END IF
1968 :
1969 112 : IF (num_pe_col > 1) THEN
1970 : size_send_buffer = 0
1971 : size_recv_buffer = 0
1972 0 : DO proc_shift = 1, num_pe_col - 1
1973 0 : proc_send = MODULO(my_pcol + proc_shift, num_pe_col)
1974 0 : proc_recv = MODULO(my_pcol - proc_shift, num_pe_col)
1975 :
1976 0 : send_size = 0
1977 0 : IF (ALLOCATED(index2send(proc_send)%array)) send_size = SIZE(index2send(proc_send)%array)
1978 0 : size_send_buffer = MAX(size_send_buffer, send_size)
1979 :
1980 0 : recv_size = 0
1981 0 : IF (ALLOCATED(index2recv(proc_recv)%array)) recv_size = SIZE(index2recv(proc_recv)%array)
1982 0 : size_recv_buffer = MAX(size_recv_buffer, recv_size)
1983 :
1984 : END DO
1985 :
1986 0 : ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
1987 :
1988 0 : DO proc_shift = 1, num_pe_col - 1
1989 0 : proc_send = MODULO(my_pcol + proc_shift, num_pe_col)
1990 0 : proc_recv = MODULO(my_pcol - proc_shift, num_pe_col)
1991 :
1992 0 : send_size = 0
1993 0 : IF (ALLOCATED(index2send(proc_send)%array)) send_size = SIZE(index2send(proc_send)%array)
1994 0 : DO col_local = 1, send_size
1995 0 : buffer_send(:, col_local) = fm_mat_S%local_data(:, index2send(proc_send)%array(col_local))
1996 : END DO
1997 :
1998 0 : recv_size = 0
1999 0 : IF (ALLOCATED(index2recv(proc_recv)%array)) recv_size = SIZE(index2recv(proc_recv)%array, 2)
2000 0 : IF (recv_size > 0) THEN
2001 0 : IF (send_size > 0) THEN
2002 : CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, proc_send), &
2003 0 : buffer_recv(:, :recv_size), blacs2mpi(my_prow, proc_recv), tag)
2004 : ELSE
2005 0 : CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, proc_recv), tag)
2006 : END IF
2007 :
2008 0 : DO col_local = 1, recv_size
2009 0 : my_a = index2recv(proc_recv)%array(1, col_local)
2010 0 : my_i = index2recv(proc_recv)%array(2, col_local)
2011 0 : mat_S_3D(:, my_a, my_i) = buffer_recv(:, col_local)
2012 : END DO
2013 0 : ELSE IF (send_size > 0) THEN
2014 0 : CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, proc_send), tag)
2015 : END IF
2016 :
2017 : END DO
2018 :
2019 0 : IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
2020 0 : IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
2021 : END IF
2022 :
2023 112 : CALL timestop(handle)
2024 :
2025 336 : END SUBROUTINE
2026 :
2027 : ! **************************************************************************************************
2028 : !> \brief ...
2029 : !> \param rpa_grad ...
2030 : !> \param mp2_env ...
2031 : !> \param para_env_sub ...
2032 : !> \param para_env ...
2033 : !> \param qs_env ...
2034 : !> \param gd_array ...
2035 : !> \param color_sub ...
2036 : !> \param do_ri_sos_laplace_mp2 ...
2037 : !> \param homo ...
2038 : !> \param virtual ...
2039 : ! **************************************************************************************************
2040 40 : SUBROUTINE rpa_grad_finalize(rpa_grad, mp2_env, para_env_sub, para_env, qs_env, gd_array, &
2041 40 : color_sub, do_ri_sos_laplace_mp2, homo, virtual)
2042 : TYPE(rpa_grad_type), INTENT(INOUT) :: rpa_grad
2043 : TYPE(mp2_type), INTENT(INOUT) :: mp2_env
2044 : TYPE(mp_para_env_type), INTENT(IN), POINTER :: para_env_sub, para_env
2045 : TYPE(qs_environment_type), INTENT(IN), POINTER :: qs_env
2046 : TYPE(group_dist_d1_type) :: gd_array
2047 : INTEGER, INTENT(IN) :: color_sub
2048 : LOGICAL, INTENT(IN) :: do_ri_sos_laplace_mp2
2049 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
2050 :
2051 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_grad_finalize'
2052 :
2053 : INTEGER :: dimen_ia, dimen_RI, handle, iiB, ispin, my_group_L_end, my_group_L_size, &
2054 : my_group_L_start, my_ia_end, my_ia_size, my_ia_start, my_P_end, my_P_size, my_P_start, &
2055 : ngroup, nspins, pos_group, pos_sub, proc
2056 40 : INTEGER, ALLOCATABLE, DIMENSION(:) :: pos_info
2057 40 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: group_grid_2_mepos, mepos_2_grid_group
2058 : REAL(KIND=dp) :: my_scale
2059 40 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: Gamma_2D
2060 : TYPE(cp_blacs_env_type), POINTER :: blacs_env
2061 : TYPE(cp_fm_struct_type), POINTER :: fm_struct
2062 : TYPE(cp_fm_type) :: fm_G_P_ia, fm_PQ, fm_PQ_2, fm_PQ_half, &
2063 : fm_work_PQ, fm_work_PQ_2, fm_Y, &
2064 : operator_half
2065 40 : TYPE(group_dist_d1_type) :: gd_array_new, gd_ia, gd_P, gd_P_new
2066 :
2067 40 : CALL timeset(routineN, handle)
2068 :
2069 : ! Release unnecessary matrices to save memory for next steps
2070 :
2071 40 : nspins = SIZE(rpa_grad%fm_Y)
2072 :
2073 : ! Scaling factor is required to scale the density matrices and the Gamma matrices later
2074 40 : IF (do_ri_sos_laplace_mp2) THEN
2075 20 : my_scale = mp2_env%scale_s
2076 : ELSE
2077 20 : my_scale = -mp2_env%ri_rpa%scale_rpa/(2.0_dp*pi)
2078 20 : IF (mp2_env%ri_rpa%minimax_quad) my_scale = my_scale/2.0_dp
2079 : END IF
2080 :
2081 40 : IF (do_ri_sos_laplace_mp2) THEN
2082 : CALL sos_mp2_grad_finalize(rpa_grad%sos_mp2_work_occ, rpa_grad%sos_mp2_work_virt, &
2083 20 : para_env, para_env_sub, homo, virtual, mp2_env)
2084 : ELSE
2085 : CALL rpa_grad_work_finalize(rpa_grad%rpa_work, mp2_env, homo, &
2086 20 : virtual, para_env, para_env_sub)
2087 : END IF
2088 :
2089 40 : CALL get_qs_env(qs_env, blacs_env=blacs_env)
2090 :
2091 40 : CALL cp_fm_get_info(rpa_grad%fm_Gamma_PQ, ncol_global=dimen_RI)
2092 :
2093 40 : NULLIFY (fm_struct)
2094 : CALL cp_fm_struct_create(fm_struct, context=blacs_env, nrow_global=dimen_RI, &
2095 40 : ncol_global=dimen_RI, para_env=para_env)
2096 40 : CALL cp_fm_create(fm_PQ, fm_struct)
2097 40 : CALL cp_fm_create(fm_work_PQ, fm_struct)
2098 40 : IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
2099 4 : CALL cp_fm_create(fm_PQ_2, fm_struct)
2100 : END IF
2101 40 : CALL cp_fm_struct_release(fm_struct)
2102 40 : CALL cp_fm_set_all(fm_PQ, 0.0_dp)
2103 :
2104 : ! We still have to left- and right multiply it with PQhalf
2105 40 : CALL dereplicate_and_sum_fm(rpa_grad%fm_Gamma_PQ, fm_PQ)
2106 :
2107 40 : ngroup = para_env%num_pe/para_env_sub%num_pe
2108 :
2109 : CALL prepare_redistribution(para_env, para_env_sub, ngroup, &
2110 : group_grid_2_mepos, mepos_2_grid_group, pos_info=pos_info)
2111 :
2112 : ! Create fm_PQ_half
2113 40 : CALL create_group_dist(gd_P, para_env_sub%num_pe, dimen_RI)
2114 40 : CALL get_group_dist(gd_P, para_env_sub%mepos, my_P_start, my_P_end, my_P_size)
2115 :
2116 40 : CALL get_group_dist(gd_array, color_sub, my_group_L_start, my_group_L_end, my_group_L_size)
2117 :
2118 40 : CALL create_group_dist(gd_P_new, para_env%num_pe)
2119 40 : CALL create_group_dist(gd_array_new, para_env%num_pe)
2120 :
2121 120 : DO proc = 0, para_env%num_pe - 1
2122 : ! calculate position of the group
2123 80 : pos_group = proc/para_env_sub%num_pe
2124 : ! calculate position in the subgroup
2125 80 : pos_sub = pos_info(proc)
2126 : ! 1 -> rows, 2 -> cols
2127 80 : CALL get_group_dist(gd_array, pos_group, gd_array_new, proc)
2128 120 : CALL get_group_dist(gd_P, pos_sub, gd_P_new, proc)
2129 : END DO
2130 :
2131 40 : DEALLOCATE (pos_info)
2132 40 : CALL release_group_dist(gd_P)
2133 :
2134 : CALL array2fm(mp2_env%ri_grad%PQ_half, fm_PQ%matrix_struct, &
2135 : my_P_start, my_P_end, &
2136 : my_group_L_start, my_group_L_end, &
2137 : gd_P_new, gd_array_new, &
2138 : group_grid_2_mepos, para_env_sub%num_pe, ngroup, &
2139 40 : fm_PQ_half)
2140 :
2141 40 : IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
2142 : CALL array2fm(mp2_env%ri_grad%operator_half, fm_PQ%matrix_struct, my_P_start, my_P_end, &
2143 : my_group_L_start, my_group_L_end, &
2144 : gd_P_new, gd_array_new, &
2145 : group_grid_2_mepos, para_env_sub%num_pe, ngroup, &
2146 4 : operator_half)
2147 : END IF
2148 :
2149 : ! deallocate the info array
2150 40 : CALL release_group_dist(gd_P_new)
2151 40 : CALL release_group_dist(gd_array_new)
2152 :
2153 40 : IF (compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
2154 : ! Finish Gamma_PQ
2155 : CALL parallel_gemm(transa="N", transb="T", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=1.0_dp, &
2156 : matrix_a=fm_PQ, matrix_b=fm_PQ_half, beta=0.0_dp, &
2157 36 : matrix_c=fm_work_PQ)
2158 :
2159 : CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=-my_scale, &
2160 : matrix_a=fm_PQ_half, matrix_b=fm_work_PQ, beta=0.0_dp, &
2161 36 : matrix_c=fm_PQ)
2162 :
2163 36 : CALL cp_fm_release(fm_work_PQ)
2164 : ELSE
2165 : CALL parallel_gemm(transa="N", transb="T", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=1.0_dp, &
2166 : matrix_a=fm_PQ, matrix_b=operator_half, beta=0.0_dp, &
2167 4 : matrix_c=fm_work_PQ)
2168 :
2169 : CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=my_scale, &
2170 : matrix_a=operator_half, matrix_b=fm_work_PQ, beta=0.0_dp, &
2171 4 : matrix_c=fm_PQ)
2172 4 : CALL cp_fm_release(operator_half)
2173 :
2174 4 : CALL cp_fm_create(fm_work_PQ_2, fm_PQ%matrix_struct, name="fm_Gamma_PQ_2")
2175 : CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=-my_scale, &
2176 : matrix_a=fm_PQ_half, matrix_b=fm_work_PQ, beta=0.0_dp, &
2177 4 : matrix_c=fm_work_PQ_2)
2178 4 : CALL cp_fm_to_fm(fm_work_PQ_2, fm_PQ_2)
2179 4 : CALL cp_fm_geadd(1.0_dp, "T", fm_work_PQ_2, 1.0_dp, fm_PQ_2)
2180 4 : CALL cp_fm_release(fm_work_PQ_2)
2181 4 : CALL cp_fm_release(fm_work_PQ)
2182 : END IF
2183 :
2184 160 : ALLOCATE (mp2_env%ri_grad%Gamma_PQ(my_P_size, my_group_L_size))
2185 : CALL fm2array(mp2_env%ri_grad%Gamma_PQ, &
2186 : my_P_size, my_P_start, my_P_end, &
2187 : my_group_L_size, my_group_L_start, my_group_L_end, &
2188 : group_grid_2_mepos, mepos_2_grid_group, &
2189 : para_env_sub%num_pe, ngroup, &
2190 40 : fm_PQ)
2191 :
2192 40 : IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
2193 12 : ALLOCATE (mp2_env%ri_grad%Gamma_PQ_2(my_P_size, my_group_L_size))
2194 : CALL fm2array(mp2_env%ri_grad%Gamma_PQ_2, my_P_size, my_P_start, my_P_end, &
2195 : my_group_L_size, my_group_L_start, my_group_L_end, &
2196 : group_grid_2_mepos, mepos_2_grid_group, &
2197 : para_env_sub%num_pe, ngroup, &
2198 4 : fm_PQ_2)
2199 : END IF
2200 :
2201 : ! Now, Gamma_Pia
2202 2644 : ALLOCATE (mp2_env%ri_grad%G_P_ia(my_group_L_size, nspins))
2203 88 : DO ispin = 1, nspins
2204 2524 : DO iiB = 1, my_group_L_size
2205 2484 : NULLIFY (mp2_env%ri_grad%G_P_ia(iiB, ispin)%matrix)
2206 : END DO
2207 : END DO
2208 :
2209 : ! Redistribute the Y matrix
2210 88 : DO ispin = 1, nspins
2211 : ! Collect all data of columns for the own sub group locally
2212 48 : CALL cp_fm_get_info(rpa_grad%fm_Y(ispin), ncol_global=dimen_ia)
2213 :
2214 48 : CALL get_qs_env(qs_env, blacs_env=blacs_env)
2215 :
2216 48 : NULLIFY (fm_struct)
2217 48 : CALL cp_fm_struct_create(fm_struct, template_fmstruct=fm_PQ_half%matrix_struct, nrow_global=dimen_ia)
2218 48 : CALL cp_fm_create(fm_Y, fm_struct)
2219 48 : CALL cp_fm_struct_release(fm_struct)
2220 48 : CALL cp_fm_set_all(fm_Y, 0.0_dp)
2221 :
2222 48 : CALL dereplicate_and_sum_fm(rpa_grad%fm_Y(ispin), fm_Y)
2223 :
2224 48 : CALL cp_fm_create(fm_G_P_ia, fm_Y%matrix_struct)
2225 48 : CALL cp_fm_set_all(fm_G_P_ia, 0.0_dp)
2226 :
2227 : CALL parallel_gemm(transa="N", transb="T", m=dimen_ia, n=dimen_RI, k=dimen_RI, alpha=my_scale, &
2228 : matrix_a=fm_Y, matrix_b=fm_PQ_half, beta=0.0_dp, &
2229 48 : matrix_c=fm_G_P_ia)
2230 :
2231 48 : CALL cp_fm_release(fm_Y)
2232 :
2233 48 : CALL create_group_dist(gd_ia, para_env_sub%num_pe, dimen_ia)
2234 48 : CALL get_group_dist(gd_ia, para_env_sub%mepos, my_ia_start, my_ia_end, my_ia_size)
2235 :
2236 : CALL fm2array(Gamma_2D, my_ia_size, my_ia_start, my_ia_end, &
2237 : my_group_L_size, my_group_L_start, my_group_L_end, &
2238 : group_grid_2_mepos, mepos_2_grid_group, &
2239 : para_env_sub%num_pe, ngroup, &
2240 48 : fm_G_P_ia)
2241 :
2242 : ! create the Gamma_ia_P in DBCSR style
2243 : CALL create_dbcsr_gamma(Gamma_2D, homo(ispin), virtual(ispin), dimen_ia, para_env_sub, &
2244 : my_ia_start, my_ia_end, my_group_L_size, gd_ia, &
2245 48 : mp2_env%ri_grad%G_P_ia(:, ispin), mp2_env%ri_grad%mo_coeff_o(ispin)%matrix)
2246 :
2247 328 : CALL release_group_dist(gd_ia)
2248 :
2249 : END DO
2250 40 : DEALLOCATE (rpa_grad%fm_Y)
2251 40 : CALL cp_fm_release(fm_PQ_half)
2252 :
2253 40 : CALL timestop(handle)
2254 :
2255 320 : END SUBROUTINE rpa_grad_finalize
2256 :
2257 : ! **************************************************************************************************
2258 : !> \brief ...
2259 : !> \param sos_mp2_work_occ ...
2260 : !> \param sos_mp2_work_virt ...
2261 : !> \param para_env ...
2262 : !> \param para_env_sub ...
2263 : !> \param homo ...
2264 : !> \param virtual ...
2265 : !> \param mp2_env ...
2266 : ! **************************************************************************************************
2267 20 : SUBROUTINE sos_mp2_grad_finalize(sos_mp2_work_occ, sos_mp2_work_virt, para_env, para_env_sub, homo, virtual, mp2_env)
2268 : TYPE(sos_mp2_grad_work_type), ALLOCATABLE, &
2269 : DIMENSION(:), INTENT(INOUT) :: sos_mp2_work_occ, sos_mp2_work_virt
2270 : TYPE(mp_para_env_type), INTENT(IN), POINTER :: para_env, para_env_sub
2271 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
2272 : TYPE(mp2_type), INTENT(INOUT) :: mp2_env
2273 :
2274 : CHARACTER(LEN=*), PARAMETER :: routineN = 'sos_mp2_grad_finalize'
2275 :
2276 : INTEGER :: ab_counter, handle, ij_counter, ispin, &
2277 : itmp(2), my_a, my_b, my_B_end, &
2278 : my_B_size, my_B_start, my_i, my_j, &
2279 : nspins, pcol
2280 : REAL(KIND=dp) :: my_scale
2281 :
2282 20 : CALL timeset(routineN, handle)
2283 :
2284 20 : nspins = SIZE(sos_mp2_work_occ)
2285 20 : my_scale = mp2_env%scale_s
2286 :
2287 44 : DO ispin = 1, nspins
2288 48 : DO pcol = 0, SIZE(sos_mp2_work_occ(ispin)%index2send, 1) - 1
2289 24 : IF (ALLOCATED(sos_mp2_work_occ(ispin)%index2send(pcol)%array)) &
2290 4 : DEALLOCATE (sos_mp2_work_occ(ispin)%index2send(pcol)%array)
2291 24 : IF (ALLOCATED(sos_mp2_work_occ(ispin)%index2send(pcol)%array)) &
2292 0 : DEALLOCATE (sos_mp2_work_occ(ispin)%index2send(pcol)%array)
2293 24 : IF (ALLOCATED(sos_mp2_work_virt(ispin)%index2recv(pcol)%array)) &
2294 4 : DEALLOCATE (sos_mp2_work_virt(ispin)%index2recv(pcol)%array)
2295 24 : IF (ALLOCATED(sos_mp2_work_virt(ispin)%index2recv(pcol)%array)) &
2296 24 : DEALLOCATE (sos_mp2_work_virt(ispin)%index2recv(pcol)%array)
2297 : END DO
2298 0 : DEALLOCATE (sos_mp2_work_occ(ispin)%index2send, &
2299 0 : sos_mp2_work_occ(ispin)%index2recv, &
2300 0 : sos_mp2_work_virt(ispin)%index2send, &
2301 140 : sos_mp2_work_virt(ispin)%index2recv)
2302 : END DO
2303 :
2304 : ! Sum P_ij and P_ab and redistribute them
2305 44 : DO ispin = 1, nspins
2306 24 : CALL para_env%sum(sos_mp2_work_occ(ispin)%P)
2307 :
2308 96 : ALLOCATE (mp2_env%ri_grad%P_ij(ispin)%array(homo(ispin), homo(ispin)))
2309 472 : mp2_env%ri_grad%P_ij(ispin)%array = 0.0_dp
2310 116 : DO my_i = 1, homo(ispin)
2311 116 : mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_i) = my_scale*sos_mp2_work_occ(ispin)%P(my_i)
2312 : END DO
2313 72 : DO ij_counter = 1, SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)
2314 48 : my_i = sos_mp2_work_occ(ispin)%pair_list(1, ij_counter)
2315 48 : my_j = sos_mp2_work_occ(ispin)%pair_list(2, ij_counter)
2316 :
2317 72 : mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) = my_scale*sos_mp2_work_occ(ispin)%P(homo(ispin) + ij_counter)
2318 : END DO
2319 24 : DEALLOCATE (sos_mp2_work_occ(ispin)%P, sos_mp2_work_occ(ispin)%pair_list)
2320 :
2321 : ! Symmetrize P_ij
2322 : mp2_env%ri_grad%P_ij(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ij(ispin)%array + &
2323 920 : TRANSPOSE(mp2_env%ri_grad%P_ij(ispin)%array))
2324 :
2325 : ! The first index of P_ab has to be distributed within the subgroups, so sum it up first and add the required elements later
2326 24 : CALL para_env%sum(sos_mp2_work_virt(ispin)%P)
2327 :
2328 24 : itmp = get_limit(virtual(ispin), para_env_sub%num_pe, para_env_sub%mepos)
2329 24 : my_B_size = itmp(2) - itmp(1) + 1
2330 24 : my_B_start = itmp(1)
2331 24 : my_B_end = itmp(2)
2332 :
2333 96 : ALLOCATE (mp2_env%ri_grad%P_ab(ispin)%array(my_B_size, virtual(ispin)))
2334 10382 : mp2_env%ri_grad%P_ab(ispin)%array = 0.0_dp
2335 486 : DO my_a = itmp(1), itmp(2)
2336 486 : mp2_env%ri_grad%P_ab(ispin)%array(my_a - itmp(1) + 1, my_a) = my_scale*sos_mp2_work_virt(ispin)%P(my_a)
2337 : END DO
2338 636 : DO ab_counter = 1, SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)
2339 612 : my_a = sos_mp2_work_virt(ispin)%pair_list(1, ab_counter)
2340 612 : my_b = sos_mp2_work_virt(ispin)%pair_list(2, ab_counter)
2341 :
2342 612 : IF (my_a >= itmp(1) .AND. my_a <= itmp(2)) mp2_env%ri_grad%P_ab(ispin)%array(my_a - itmp(1) + 1, my_b) = &
2343 636 : my_scale*sos_mp2_work_virt(ispin)%P(virtual(ispin) + ab_counter)
2344 : END DO
2345 :
2346 24 : DEALLOCATE (sos_mp2_work_virt(ispin)%P, sos_mp2_work_virt(ispin)%pair_list)
2347 :
2348 : ! Symmetrize P_ab
2349 44 : IF (para_env_sub%num_pe > 1) THEN
2350 12 : BLOCK
2351 : INTEGER :: send_a_start, send_a_end, send_a_size, &
2352 : recv_a_start, recv_a_end, recv_a_size, proc_shift, proc_send, proc_recv
2353 4 : REAL(KIND=dp), DIMENSION(:), ALLOCATABLE, TARGET :: buffer_send_1D
2354 4 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: buffer_send
2355 4 : REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE :: buffer_recv
2356 4 : TYPE(group_dist_d1_type) :: gd_virtual_sub
2357 :
2358 4 : CALL create_group_dist(gd_virtual_sub, para_env_sub%num_pe, virtual(ispin))
2359 :
2360 : mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) = &
2361 : 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) &
2362 804 : + TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end)))
2363 :
2364 12 : ALLOCATE (buffer_send_1D(my_B_size*maxsize(gd_virtual_sub)))
2365 16 : ALLOCATE (buffer_recv(my_B_size, maxsize(gd_virtual_sub)))
2366 :
2367 8 : DO proc_shift = 1, para_env_sub%num_pe - 1
2368 :
2369 4 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2370 4 : proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2371 :
2372 4 : CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end, send_a_size)
2373 4 : CALL get_group_dist(gd_virtual_sub, proc_recv, recv_a_start, recv_a_end, recv_a_size)
2374 :
2375 4 : buffer_send(1:send_a_size, 1:my_B_size) => buffer_send_1D(1:my_B_size*send_a_size)
2376 :
2377 402 : buffer_send(:send_a_size, :) = TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, send_a_start:send_a_end))
2378 : CALL para_env_sub%sendrecv(buffer_send(:send_a_size, :), proc_send, &
2379 402 : buffer_recv(:, :recv_a_size), proc_recv)
2380 :
2381 : mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) = &
2382 410 : 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) + buffer_recv(:, 1:recv_a_size))
2383 :
2384 : END DO
2385 :
2386 4 : DEALLOCATE (buffer_send_1D, buffer_recv)
2387 :
2388 16 : CALL release_group_dist(gd_virtual_sub)
2389 : END BLOCK
2390 : ELSE
2391 : mp2_env%ri_grad%P_ab(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array + &
2392 19140 : TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array))
2393 : END IF
2394 :
2395 : END DO
2396 68 : DEALLOCATE (sos_mp2_work_occ, sos_mp2_work_virt)
2397 20 : IF (nspins == 1) THEN
2398 336 : mp2_env%ri_grad%P_ij(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ij(1)%array
2399 5374 : mp2_env%ri_grad%P_ab(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ab(1)%array
2400 : END IF
2401 :
2402 20 : CALL timestop(handle)
2403 :
2404 20 : END SUBROUTINE
2405 :
2406 : ! **************************************************************************************************
!> \brief Collect, redistribute and symmetrize the RPA density matrix contributions P_ij and P_ab
!> \param rpa_work RPA gradient work data (released on output)
!> \param mp2_env MP2 environment receiving the final P_ij and P_ab matrices
!> \param homo number of occupied orbitals per spin channel
!> \param virtual number of virtual orbitals per spin channel
!> \param para_env global MPI environment
!> \param para_env_sub subgroup MPI environment over which the first index of P_ab is distributed
2414 : ! **************************************************************************************************
   SUBROUTINE rpa_grad_work_finalize(rpa_work, mp2_env, homo, virtual, para_env, para_env_sub)
      TYPE(rpa_grad_work_type), INTENT(INOUT)            :: rpa_work
      TYPE(mp2_type), INTENT(INOUT)                      :: mp2_env
      INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
      TYPE(mp_para_env_type), INTENT(IN), POINTER        :: para_env, para_env_sub

      CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_grad_work_finalize'

      INTEGER :: handle, ispin, itmp(2), my_a_end, my_a_size, my_a_start, my_B_end, my_B_size, &
         my_B_start, my_i_end, my_i_size, my_i_start, nspins, proc, proc_recv, proc_send, &
         proc_shift, recv_a_end, recv_a_size, recv_a_start, recv_end, recv_start, send_a_end, &
         send_a_size, send_a_start, send_end, send_start, size_recv_buffer, size_send_buffer
      REAL(KIND=dp)                                      :: my_scale
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: buffer_recv, buffer_send
      TYPE(group_dist_d1_type)                           :: gd_a_sub, gd_virtual_sub

      CALL timeset(routineN, handle)

      nspins = SIZE(homo)
      ! Frequency-integration prefactor 1/(2*pi); minimax quadrature carries an extra 1/2
      my_scale = mp2_env%ri_rpa%scale_rpa/(2.0_dp*pi)
      IF (mp2_env%ri_rpa%minimax_quad) my_scale = my_scale/2.0_dp

      CALL cp_fm_release(rpa_work%fm_mat_Q_copy)

      ! Release the per-process communication index buffers
      DO ispin = 1, nspins
         DO proc = 0, SIZE(rpa_work%index2send, 1) - 1
            IF (ALLOCATED(rpa_work%index2send(proc, ispin)%array)) DEALLOCATE (rpa_work%index2send(proc, ispin)%array)
            IF (ALLOCATED(rpa_work%index2recv(proc, ispin)%array)) DEALLOCATE (rpa_work%index2recv(proc, ispin)%array)
         END DO
      END DO
      DEALLOCATE (rpa_work%index2send, rpa_work%index2recv)

      DO ispin = 1, nspins
         ! --- P_ij: each process owns the row slab my_i_start..my_i_end of the local P_ij;
         !     scatter it into the full matrix, then sum over all processes
         CALL get_group_dist(rpa_work%gd_homo(ispin), rpa_work%mepos(2), my_i_start, my_i_end, my_i_size)
         CALL release_group_dist(rpa_work%gd_homo(ispin))

         ALLOCATE (mp2_env%ri_grad%P_ij(ispin)%array(homo(ispin), homo(ispin)))
         mp2_env%ri_grad%P_ij(ispin)%array = 0.0_dp
         mp2_env%ri_grad%P_ij(ispin)%array(my_i_start:my_i_end, :) = my_scale*rpa_work%P_ij(ispin)%array
         DEALLOCATE (rpa_work%P_ij(ispin)%array)
         CALL para_env%sum(mp2_env%ri_grad%P_ij(ispin)%array)

         ! Symmetrize P_ij
         mp2_env%ri_grad%P_ij(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ij(ispin)%array + &
                                                           TRANSPOSE(mp2_env%ri_grad%P_ij(ispin)%array))

         ! --- P_ab: target row distribution within the subgroup (rows my_B_start..my_B_end)
         itmp = get_limit(virtual(ispin), para_env_sub%num_pe, para_env_sub%mepos)
         my_B_start = itmp(1)
         my_B_end = itmp(2)
         my_B_size = my_B_end - my_B_start + 1

         ALLOCATE (mp2_env%ri_grad%P_ab(ispin)%array(my_B_size, virtual(ispin)))
         mp2_env%ri_grad%P_ab(ispin)%array = 0.0_dp

         CALL get_group_dist(rpa_work%gd_virtual(ispin), rpa_work%mepos(1), my_a_start, my_a_end, my_a_size)
         CALL release_group_dist(rpa_work%gd_virtual(ispin))
         ! This group dist contains the info which parts of Pab a process currently owns
         CALL create_group_dist(gd_a_sub, my_a_start, my_a_end, my_a_size, para_env_sub)
         ! This group dist contains the info which parts of Pab a process is supposed to own later
         CALL create_group_dist(gd_virtual_sub, para_env_sub%num_pe, virtual(ispin))

         ! Calculate local indices of the common range of own matrix and send process
         send_start = MAX(1, my_B_start - my_a_start + 1)
         send_end = MIN(my_a_size, my_B_end - my_a_start + 1)

         ! Same for recv process but with reverse positions
         recv_start = MAX(1, my_a_start - my_B_start + 1)
         recv_end = MIN(my_B_size, my_a_end - my_B_start + 1)

         ! Copy the part of P_ab already local to this process (overlap of old and new slab)
         mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) = &
            my_scale*rpa_work%P_ab(ispin)%array(send_start:send_end, :)

         IF (para_env_sub%num_pe > 1) THEN
            ! First pass: determine the maximum buffer sizes over all exchange partners
            size_send_buffer = 0
            size_recv_buffer = 0
            DO proc_shift = 1, para_env_sub%num_pe - 1
               proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
               proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

               CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end)
               CALL get_group_dist(gd_a_sub, proc_recv, recv_a_start, recv_a_end)

               ! Calculate local indices of the common range of own matrix and send process
               send_start = MAX(1, send_a_start - my_a_start + 1)
               send_end = MIN(my_a_size, send_a_end - my_a_start + 1)

               size_send_buffer = MAX(size_send_buffer, MAX(send_end - send_start + 1, 0))

               ! Same for recv process but with reverse positions
               recv_start = MAX(1, recv_a_start - my_B_start + 1)
               recv_end = MIN(my_B_size, recv_a_end - my_B_start + 1)

               size_recv_buffer = MAX(size_recv_buffer, MAX(recv_end - recv_start + 1, 0))
            END DO
            ALLOCATE (buffer_send(size_send_buffer, virtual(ispin)), buffer_recv(size_recv_buffer, virtual(ispin)))

            ! Second pass: pairwise exchange of the row slabs with every other subgroup rank
            DO proc_shift = 1, para_env_sub%num_pe - 1
               proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
               proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

               CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end)
               CALL get_group_dist(gd_a_sub, proc_recv, recv_a_start, recv_a_end)

               ! Calculate local indices of the common range of own matrix and send process
               send_start = MAX(1, send_a_start - my_a_start + 1)
               send_end = MIN(my_a_size, send_a_end - my_a_start + 1)
               buffer_send(1:MAX(send_end - send_start + 1, 0), :) = rpa_work%P_ab(ispin)%array(send_start:send_end, :)

               ! Same for recv process but with reverse positions
               recv_start = MAX(1, recv_a_start - my_B_start + 1)
               recv_end = MIN(my_B_size, recv_a_end - my_B_start + 1)

               CALL para_env_sub%sendrecv(buffer_send(1:MAX(send_end - send_start + 1, 0), :), proc_send, &
                                          buffer_recv(1:MAX(recv_end - recv_start + 1, 0), :), proc_recv)

               ! Accumulate the received rows (already scaled sender-side data gets my_scale here)
               mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) = &
                  mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) + &
                  my_scale*buffer_recv(1:MAX(recv_end - recv_start + 1, 0), :)

            END DO

            IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
            IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
         END IF
         DEALLOCATE (rpa_work%P_ab(ispin)%array)

         CALL release_group_dist(gd_a_sub)

         ! Sum contributions of ranks that share the same subgroup position
         BLOCK
            TYPE(mp_comm_type) :: comm_exchange
            CALL comm_exchange%from_split(para_env, para_env_sub%mepos)
            CALL comm_exchange%sum(mp2_env%ri_grad%P_ab(ispin)%array)
            CALL comm_exchange%free()
         END BLOCK

         ! Symmetrize P_ab
         IF (para_env_sub%num_pe > 1) THEN
            BLOCK
               REAL(KIND=dp), DIMENSION(:), ALLOCATABLE, TARGET :: buffer_send_1D
               REAL(KIND=dp), DIMENSION(:, :), POINTER :: buffer_send
               REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE :: buffer_recv

               ! Diagonal block is local: symmetrize directly
               mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) = &
                  0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) &
                          + TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end)))

               ! 1D send buffer remapped per partner to the actual transposed block shape
               ALLOCATE (buffer_send_1D(my_B_size*maxsize(gd_virtual_sub)))
               ALLOCATE (buffer_recv(my_B_size, maxsize(gd_virtual_sub)))

               DO proc_shift = 1, para_env_sub%num_pe - 1

                  proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
                  proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

                  CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end, send_a_size)
                  CALL get_group_dist(gd_virtual_sub, proc_recv, recv_a_start, recv_a_end, recv_a_size)

                  buffer_send(1:send_a_size, 1:my_B_size) => buffer_send_1D(1:my_B_size*send_a_size)

                  ! Send the transposed off-diagonal block to its mirror owner
                  buffer_send(:send_a_size, :) = TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, send_a_start:send_a_end))
                  CALL para_env_sub%sendrecv(buffer_send(:send_a_size, :), proc_send, &
                                             buffer_recv(:, :recv_a_size), proc_recv)

                  mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) = &
                     0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) + buffer_recv(:, 1:recv_a_size))

               END DO

               DEALLOCATE (buffer_send_1D, buffer_recv)
            END BLOCK
         ELSE
            mp2_env%ri_grad%P_ab(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array + &
                                                              TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array))
         END IF

         CALL release_group_dist(gd_virtual_sub)

      END DO
      DEALLOCATE (rpa_work%gd_homo, rpa_work%gd_virtual, rpa_work%P_ij, rpa_work%P_ab)
      ! Closed-shell case: factor 2 accounts for double orbital occupancy
      IF (nspins == 1) THEN
         mp2_env%ri_grad%P_ij(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ij(1)%array
         mp2_env%ri_grad%P_ab(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ab(1)%array
      END IF

      CALL timestop(handle)
   END SUBROUTINE
2601 :
2602 : ! **************************************************************************************************
2603 : !> \brief Dereplicate data from fm_sub and collect in fm_global, overlapping data will be added
2604 : !> \param fm_sub replicated matrix, all subgroups have the same size, will be release on output
2605 : !> \param fm_global global matrix, on output it will contain the sum of the replicated matrices redistributed
2606 : ! **************************************************************************************************
   SUBROUTINE dereplicate_and_sum_fm(fm_sub, fm_global)
      TYPE(cp_fm_type), INTENT(INOUT)                    :: fm_sub, fm_global

      CHARACTER(LEN=*), PARAMETER :: routineN = 'dereplicate_and_sum_fm'

      INTEGER :: col_local, elements2recv_col, elements2recv_row, elements2send_col, &
         elements2send_row, first_p_pos_global(2), first_p_pos_sub(2), handle, handle2, &
         mypcol_global, myprow_global, ncol_block_global, ncol_block_sub, ncol_local_global, &
         ncol_local_sub, npcol_global, npcol_sub, nprow_global, nprow_sub, nrow_block_global, &
         nrow_block_sub, nrow_local_global, nrow_local_sub, pcol_recv, pcol_send, proc_recv, &
         proc_send, proc_send_global, proc_shift, prow_recv, prow_send, row_local, tag
      INTEGER(int_8) :: size_recv_buffer, size_send_buffer
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: data2recv_col, data2recv_row, &
                                                            data2send_col, data2send_row, &
                                                            subgroup2mepos
      INTEGER, DIMENSION(:), POINTER                     :: col_indices_global, col_indices_sub, &
                                                            row_indices_global, row_indices_sub
      INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi_global, blacs2mpi_sub, &
                                                            mpi2blacs_global, mpi2blacs_sub
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET   :: recv_buffer_1D, send_buffer_1D
      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: recv_buffer, send_buffer
      TYPE(mp_para_env_type), POINTER                    :: para_env, para_env_sub
      TYPE(one_dim_int_array), ALLOCATABLE, DIMENSION(:) :: index2recv_col, index2recv_row, &
                                                            index2send_col, index2send_row

      CALL timeset(routineN, handle)

      tag = 1

      ! Process-grid geometry of the replicated (sub) and the target (global) matrices
      nprow_sub = fm_sub%matrix_struct%context%num_pe(1)
      npcol_sub = fm_sub%matrix_struct%context%num_pe(2)

      myprow_global = fm_global%matrix_struct%context%mepos(1)
      mypcol_global = fm_global%matrix_struct%context%mepos(2)
      nprow_global = fm_global%matrix_struct%context%num_pe(1)
      npcol_global = fm_global%matrix_struct%context%num_pe(2)

      ! Local index maps and block sizes of both distributions
      CALL cp_fm_get_info(fm_sub, col_indices=col_indices_sub, row_indices=row_indices_sub, &
                          nrow_local=nrow_local_sub, ncol_local=ncol_local_sub)
      CALL cp_fm_struct_get(fm_sub%matrix_struct, para_env=para_env_sub, first_p_pos=first_p_pos_sub, &
                            nrow_block=nrow_block_sub, ncol_block=ncol_block_sub)
      CALL cp_fm_struct_get(fm_global%matrix_struct, first_p_pos=first_p_pos_global, nrow_block=nrow_block_global, &
                            ncol_block=ncol_block_global, para_env=para_env, &
                            col_indices=col_indices_global, row_indices=row_indices_global, &
                            nrow_local=nrow_local_global, ncol_local=ncol_local_global)
      CALL fm_sub%matrix_struct%context%get(blacs2mpi=blacs2mpi_sub, mpi2blacs=mpi2blacs_sub)
      CALL fm_global%matrix_struct%context%get(blacs2mpi=blacs2mpi_global, mpi2blacs=mpi2blacs_global)

      ! If several subgroups exist, sum the replicated local data over the subgroups
      ! that share the same subgroup rank before redistributing
      IF (para_env%num_pe /= para_env_sub%num_pe) THEN
         BLOCK
            TYPE(mp_comm_type) :: comm_exchange
            comm_exchange = fm_sub%matrix_struct%context%interconnect(para_env)
            CALL comm_exchange%sum(fm_sub%local_data)
            CALL comm_exchange%free()
         END BLOCK
      END IF

      ! Map subgroup ranks to global ranks
      ALLOCATE (subgroup2mepos(0:para_env_sub%num_pe - 1))
      CALL para_env_sub%allgather(para_env%mepos, subgroup2mepos)

      CALL timeset(routineN//"_data2", handle2)
      ! Create a map how much data has to be sent to what process coordinate, interchange rows and columns to transpose the matrices
      CALL get_elements2send(data2send_col, npcol_global, row_indices_sub, ncol_block_global, first_p_pos_global(2), index2send_col)
      CALL get_elements2send(data2send_row, nprow_global, col_indices_sub, nrow_block_global, first_p_pos_global(1), index2send_row)

      ! Create a map how much data has to be sent to what process coordinate, interchange rows and columns to transpose the matrices
      ! Do the reverse for the recieve processes
      CALL get_elements2send(data2recv_col, npcol_sub, row_indices_global, ncol_block_sub, first_p_pos_sub(2), index2recv_col)
      CALL get_elements2send(data2recv_row, nprow_sub, col_indices_global, nrow_block_sub, first_p_pos_sub(1), index2recv_row)
      CALL timestop(handle2)

      CALL timeset(routineN//"_local", handle2)
      ! Loop over local data and transpose
      prow_send = mpi2blacs_global(1, para_env%mepos)
      pcol_send = mpi2blacs_global(2, para_env%mepos)
      prow_recv = mpi2blacs_sub(1, para_env_sub%mepos)
      pcol_recv = mpi2blacs_sub(2, para_env_sub%mepos)
      elements2recv_col = data2recv_col(pcol_recv)
      elements2recv_row = data2recv_row(prow_recv)

      ! Copy the data this process keeps for itself; note the row/col swap of the
      ! send indices, which realizes the transposition without an explicit buffer
      !$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
      !$OMP SHARED(elements2recv_col,elements2recv_row,recv_buffer,fm_global,&
      !$OMP index2recv_col,index2recv_row,pcol_recv,prow_recv, &
      !$OMP fm_sub,index2send_col,index2send_row,pcol_send,prow_send)
      DO col_local = 1, elements2recv_col
         DO row_local = 1, elements2recv_row
            fm_global%local_data(index2recv_col(pcol_recv)%array(col_local), &
                                 index2recv_row(prow_recv)%array(row_local)) &
               = fm_sub%local_data(index2send_col(pcol_send)%array(row_local), &
                                   index2send_row(prow_send)%array(col_local))
         END DO
      END DO
      !$OMP END PARALLEL DO
      CALL timestop(handle2)

      IF (para_env_sub%num_pe > 1) THEN
         ! First pass: determine the maximum buffer sizes over all exchange partners
         size_send_buffer = 0_int_8
         size_recv_buffer = 0_int_8
         ! Loop over all processes in para_env_sub
         DO proc_shift = 1, para_env_sub%num_pe - 1
            proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
            proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

            proc_send_global = subgroup2mepos(proc_send)
            prow_send = mpi2blacs_global(1, proc_send_global)
            pcol_send = mpi2blacs_global(2, proc_send_global)
            elements2send_col = data2send_col(pcol_send)
            elements2send_row = data2send_row(prow_send)

            size_send_buffer = MAX(size_send_buffer, INT(elements2send_col, int_8)*elements2send_row)

            prow_recv = mpi2blacs_sub(1, proc_recv)
            pcol_recv = mpi2blacs_sub(2, proc_recv)
            elements2recv_col = data2recv_col(pcol_recv)
            elements2recv_row = data2recv_row(prow_recv)

            size_recv_buffer = MAX(size_recv_buffer, INT(elements2recv_col, int_8)*elements2recv_row)
         END DO
         ALLOCATE (send_buffer_1D(size_send_buffer), recv_buffer_1D(size_recv_buffer))

         ! Loop over all processes in para_env_sub
         DO proc_shift = 1, para_env_sub%num_pe - 1
            proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
            proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

            proc_send_global = subgroup2mepos(proc_send)
            prow_send = mpi2blacs_global(1, proc_send_global)
            pcol_send = mpi2blacs_global(2, proc_send_global)
            elements2send_col = data2send_col(pcol_send)
            elements2send_row = data2send_row(prow_send)

            CALL timeset(routineN//"_pack", handle2)
            ! Loop over local data and pack the buffer
            ! Transpose the matrix already
            send_buffer(1:elements2send_row, 1:elements2send_col) => send_buffer_1D(1:INT(elements2send_row, int_8)*elements2send_col)
            !$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
            !$OMP SHARED(elements2send_col,elements2send_row,send_buffer,fm_sub,&
            !$OMP index2send_col,index2send_row,pcol_send,prow_send)
            DO row_local = 1, elements2send_col
               DO col_local = 1, elements2send_row
                  send_buffer(col_local, row_local) = &
                     fm_sub%local_data(index2send_col(pcol_send)%array(row_local), &
                                       index2send_row(prow_send)%array(col_local))
               END DO
            END DO
            !$OMP END PARALLEL DO
            CALL timestop(handle2)

            prow_recv = mpi2blacs_sub(1, proc_recv)
            pcol_recv = mpi2blacs_sub(2, proc_recv)
            elements2recv_col = data2recv_col(pcol_recv)
            elements2recv_row = data2recv_row(prow_recv)

            ! Send data
            recv_buffer(1:elements2recv_col, 1:elements2recv_row) => recv_buffer_1D(1:INT(elements2recv_row, int_8)*elements2recv_col)
            ! Empty buffers must not enter sendrecv; fall back to one-sided send/recv
            IF (SIZE(recv_buffer) > 0_int_8) THEN
               IF (SIZE(send_buffer) > 0_int_8) THEN
                  CALL para_env_sub%sendrecv(send_buffer, proc_send, recv_buffer, proc_recv, tag)
               ELSE
                  CALL para_env_sub%recv(recv_buffer, proc_recv, tag)
               END IF

               CALL timeset(routineN//"_unpack", handle2)
               ! Scatter the received (already transposed) block into the global matrix
               !$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
               !$OMP SHARED(elements2recv_col,elements2recv_row,recv_buffer,fm_global,&
               !$OMP index2recv_col,index2recv_row,pcol_recv,prow_recv)
               DO col_local = 1, elements2recv_col
                  DO row_local = 1, elements2recv_row
                     fm_global%local_data(index2recv_col(pcol_recv)%array(col_local), &
                                          index2recv_row(prow_recv)%array(row_local)) &
                        = recv_buffer(col_local, row_local)
                  END DO
               END DO
               !$OMP END PARALLEL DO
               CALL timestop(handle2)
            ELSE IF (SIZE(send_buffer) > 0_int_8) THEN
               CALL para_env_sub%send(send_buffer, proc_send, tag)
            END IF
         END DO
      END IF

      ! Release the communication index maps
      DEALLOCATE (data2send_col, data2send_row, data2recv_col, data2recv_row)
      DO proc_shift = 0, npcol_global - 1
         DEALLOCATE (index2send_col(proc_shift)%array)
      END DO
      DO proc_shift = 0, npcol_sub - 1
         DEALLOCATE (index2recv_col(proc_shift)%array)
      END DO
      DO proc_shift = 0, nprow_global - 1
         DEALLOCATE (index2send_row(proc_shift)%array)
      END DO
      DO proc_shift = 0, nprow_sub - 1
         DEALLOCATE (index2recv_row(proc_shift)%array)
      END DO
      DEALLOCATE (index2send_col, index2recv_col, index2send_row, index2recv_row)

      CALL cp_fm_release(fm_sub)

      CALL timestop(handle)

   END SUBROUTINE dereplicate_and_sum_fm
2808 :
2809 : ! **************************************************************************************************
!> \brief Determine, for every process along one grid dimension, how many and which local elements must be sent to it
!> \param data2send number of elements to be sent to each process (indexed 0:np_global-1)
!> \param np_global number of processes along the considered grid dimension
!> \param indices_sub local-to-global index map of the source matrix
!> \param n_block_global block size of the target matrix distribution
!> \param first_p_pos_global process coordinate owning the first block of the target distribution
!> \param index2send for each process, the list of local indices to be sent to it
2817 : ! **************************************************************************************************
2818 352 : SUBROUTINE get_elements2send(data2send, np_global, indices_sub, n_block_global, first_p_pos_global, index2send)
2819 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT) :: data2send
2820 : INTEGER, INTENT(IN) :: np_global
2821 : INTEGER, DIMENSION(:), INTENT(IN) :: indices_sub
2822 : INTEGER, INTENT(IN) :: n_block_global, first_p_pos_global
2823 : TYPE(one_dim_int_array), ALLOCATABLE, &
2824 : DIMENSION(:), INTENT(OUT) :: index2send
2825 :
2826 : INTEGER :: i_global, i_local, proc
2827 :
2828 1056 : ALLOCATE (data2send(0:np_global - 1))
2829 816 : data2send = 0
2830 25484 : DO i_local = 1, SIZE(indices_sub)
2831 25132 : i_global = indices_sub(i_local)
2832 25132 : proc = cp_fm_indxg2p(i_global, n_block_global, 0, first_p_pos_global, np_global)
2833 25484 : data2send(proc) = data2send(proc) + 1
2834 : END DO
2835 :
2836 1520 : ALLOCATE (index2send(0:np_global - 1))
2837 816 : DO proc = 0, np_global - 1
2838 1392 : ALLOCATE (index2send(proc)%array(data2send(proc)))
2839 : ! We want to crash if there is an error
2840 25948 : index2send(proc)%array = -1
2841 : END DO
2842 :
2843 816 : data2send = 0
2844 25484 : DO i_local = 1, SIZE(indices_sub)
2845 25132 : i_global = indices_sub(i_local)
2846 25132 : proc = cp_fm_indxg2p(i_global, n_block_global, 0, first_p_pos_global, np_global)
2847 25132 : data2send(proc) = data2send(proc) + 1
2848 25484 : index2send(proc)%array(data2send(proc)) = i_local
2849 : END DO
2850 :
2851 352 : END SUBROUTINE
2852 :
2853 0 : END MODULE rpa_grad
|