Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2026 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief Routines to calculate RI-GPW-MP2 energy using pw
10 : !> \par History
11 : !> 06.2012 created [Mauro Del Ben]
12 : !> 03.2019 Refactored from mp2_ri_gpw [Frederick Stein]
13 : ! **************************************************************************************************
14 : MODULE mp2_ri_gpw
15 : USE cp_log_handling, ONLY: cp_to_string
16 : USE dgemm_counter_types, ONLY: dgemm_counter_init,&
17 : dgemm_counter_start,&
18 : dgemm_counter_stop,&
19 : dgemm_counter_type,&
20 : dgemm_counter_write
21 : USE group_dist_types, ONLY: get_group_dist,&
22 : group_dist_d1_type,&
23 : maxsize,&
24 : release_group_dist
25 : USE kinds, ONLY: dp,&
26 : int_8
27 : USE libint_2c_3c, ONLY: compare_potential_types
28 : USE local_gemm_api, ONLY: LOCAL_GEMM_PU_GPU
29 : USE machine, ONLY: m_flush,&
30 : m_memory,&
31 : m_walltime
32 : USE message_passing, ONLY: mp_comm_type,&
33 : mp_para_env_type
34 : USE mp2_ri_grad_util, ONLY: complete_gamma
35 : USE mp2_types, ONLY: mp2_type,&
36 : three_dim_real_array
37 :
38 : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num
39 : #include "./base/base_uses.f90"
40 :
41 : IMPLICIT NONE
42 :
43 : PRIVATE
44 :
45 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'mp2_ri_gpw'
46 :
47 : PUBLIC :: mp2_ri_gpw_compute_en
48 :
49 : CONTAINS
50 :
51 : ! **************************************************************************************************
52 : !> \brief ...
53 : !> \param Emp2_Cou ...
54 : !> \param Emp2_EX ...
55 : !> \param Emp2_S ...
56 : !> \param Emp2_T ...
57 : !> \param BIb_C ...
58 : !> \param mp2_env ...
59 : !> \param para_env ...
60 : !> \param para_env_sub ...
61 : !> \param color_sub ...
62 : !> \param gd_array ...
63 : !> \param gd_B_virtual ...
64 : !> \param Eigenval ...
65 : !> \param nmo ...
66 : !> \param homo ...
67 : !> \param dimen_RI ...
68 : !> \param unit_nr ...
69 : !> \param calc_forces ...
70 : !> \param calc_ex ...
71 : ! **************************************************************************************************
72 1062 : SUBROUTINE mp2_ri_gpw_compute_en(Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T, BIb_C, mp2_env, para_env, para_env_sub, color_sub, &
73 354 : gd_array, gd_B_virtual, &
74 354 : Eigenval, nmo, homo, dimen_RI, unit_nr, calc_forces, calc_ex)
75 : REAL(KIND=dp), INTENT(INOUT) :: Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T
76 : TYPE(three_dim_real_array), DIMENSION(:), &
77 : INTENT(INOUT) :: BIb_C
78 : TYPE(mp2_type) :: mp2_env
79 : TYPE(mp_para_env_type), INTENT(IN), POINTER :: para_env, para_env_sub
80 : INTEGER, INTENT(IN) :: color_sub
81 : TYPE(group_dist_d1_type), INTENT(INOUT) :: gd_array
82 : INTEGER, DIMENSION(:), INTENT(IN) :: homo
83 : INTEGER, INTENT(IN) :: nmo
84 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
85 : TYPE(group_dist_d1_type), DIMENSION(SIZE(homo)), &
86 : INTENT(INOUT) :: gd_B_virtual
87 : INTEGER, INTENT(IN) :: dimen_RI, unit_nr
88 : LOGICAL, INTENT(IN) :: calc_forces, calc_ex
89 :
90 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_gpw_compute_en'
91 :
92 : INTEGER :: a, a_global, b, b_global, block_size, decil, end_point, handle, handle2, handle3, &
93 : iiB, ij_counter, ij_counter_send, ij_index, integ_group_size, ispin, jjB, jspin, &
94 : max_ij_pairs, my_block_size, my_group_L_end, my_group_L_size, my_group_L_size_orig, &
95 : my_group_L_start, my_i, my_ij_pairs, my_j, my_new_group_L_size, ngroup, nspins, &
96 : num_integ_group, proc_receive, proc_send, proc_shift, rec_B_size, rec_B_virtual_end, &
97 : rec_B_virtual_start, rec_L_size, send_B_size, send_B_virtual_end, send_B_virtual_start, &
98 : send_i, send_ij_index, send_j, start_point, tag, total_ij_pairs
99 354 : INTEGER, ALLOCATABLE, DIMENSION(:) :: integ_group_pos2color_sub, my_B_size, &
100 354 : my_B_virtual_end, my_B_virtual_start, num_ij_pairs, sizes_array_orig, virtual
101 354 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: ij_map
102 354 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: ranges_info_array
103 : LOGICAL :: my_alpha_beta_case, my_beta_beta_case, &
104 : my_open_shell_SS
105 : REAL(KIND=dp) :: amp_fac, my_Emp2_Cou, my_Emp2_EX, &
106 : sym_fac, t_new, t_start
107 354 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET :: buffer_1D
108 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
109 354 : TARGET :: local_ab, local_ba, t_ab
110 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
111 354 : TARGET :: local_i_aL, local_j_aL, Y_i_aP, Y_j_aP
112 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
113 354 : POINTER :: external_ab, external_i_aL
114 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :, :), &
115 354 : POINTER :: BI_C_rec
116 : TYPE(dgemm_counter_type) :: dgemm_counter
117 : TYPE(mp_comm_type) :: comm_exchange, comm_rep
118 : TYPE(three_dim_real_array), ALLOCATABLE, &
119 354 : DIMENSION(:) :: B_ia_Q
120 :
121 354 : CALL timeset(routineN, handle)
122 :
123 354 : nspins = SIZE(homo)
124 :
125 1062 : ALLOCATE (virtual(nspins))
126 796 : virtual(:) = nmo - homo(:)
127 :
128 1416 : ALLOCATE (my_B_size(nspins), my_B_virtual_start(nspins), my_B_virtual_end(nspins))
129 796 : DO ispin = 1, nspins
130 : CALL get_group_dist(gd_B_virtual(ispin), para_env_sub%mepos, &
131 796 : my_B_virtual_start(ispin), my_B_virtual_end(ispin), my_B_size(ispin))
132 : END DO
133 :
134 354 : CALL get_group_dist(gd_array, color_sub, my_group_L_start, my_group_L_end, my_group_L_size)
135 :
136 354 : CALL dgemm_counter_init(dgemm_counter, unit_nr, mp2_env%ri_mp2%print_dgemm_info)
137 :
138 : ! local_gemm_ctx has a very footprint the first time this routine is
139 : ! called.
140 354 : CALL mp2_env%local_gemm_ctx%create(LOCAL_GEMM_PU_GPU)
141 354 : CALL mp2_env%local_gemm_ctx%set_op_threshold_gpu(128*128*128*2)
142 :
143 : CALL mp2_ri_get_integ_group_size( &
144 : mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
145 : homo, dimen_RI, unit_nr, &
146 : integ_group_size, ngroup, &
147 354 : num_integ_group, virtual, calc_forces)
148 :
149 : ! now create a group that contains all the proc that have the same virtual starting point
150 : ! in the integ group
151 : CALL mp2_ri_create_group( &
152 : para_env, para_env_sub, color_sub, &
153 : gd_array%sizes, calc_forces, &
154 : integ_group_size, my_group_L_end, &
155 : my_group_L_size, my_group_L_size_orig, my_group_L_start, my_new_group_L_size, &
156 : integ_group_pos2color_sub, sizes_array_orig, &
157 354 : ranges_info_array, comm_exchange, comm_rep, num_integ_group)
158 :
159 : ! We cannot fix the tag because of the recv routine
160 354 : tag = 42
161 :
162 796 : DO jspin = 1, nspins
163 :
164 : CALL replicate_iaK_2intgroup(BIb_C(jspin)%array, comm_exchange, comm_rep, &
165 : homo(jspin), gd_array%sizes, my_B_size(jspin), &
166 442 : my_group_L_size, ranges_info_array)
167 :
168 1326 : DO ispin = 1, jspin
169 :
170 530 : IF (unit_nr > 0) THEN
171 265 : IF (nspins == 1) THEN
172 133 : WRITE (unit_nr, *) "Start loop run"
173 132 : ELSE IF (ispin == 1 .AND. jspin == 1) THEN
174 44 : WRITE (unit_nr, *) "Start loop run alpha-alpha"
175 88 : ELSE IF (ispin == 1 .AND. jspin == 2) THEN
176 44 : WRITE (unit_nr, *) "Start loop run alpha-beta"
177 44 : ELSE IF (ispin == 2 .AND. jspin == 2) THEN
178 44 : WRITE (unit_nr, *) "Start loop run beta-beta"
179 : END IF
180 265 : CALL m_flush(unit_nr)
181 : END IF
182 :
183 530 : my_open_shell_SS = (nspins == 2) .AND. (ispin == jspin)
184 :
185 : ! t_ab = amp_fac*(:,a|:,b)-(:,b|:,a)
186 : ! If we calculate the gradient we need to distinguish
187 : ! between alpha-alpha and beta-beta cases for UMP2
188 :
189 530 : my_beta_beta_case = .FALSE.
190 530 : my_alpha_beta_case = .FALSE.
191 530 : IF (ispin /= jspin) THEN
192 88 : my_alpha_beta_case = .TRUE.
193 442 : ELSE IF (my_open_shell_SS) THEN
194 176 : IF (ispin == 2) my_beta_beta_case = .TRUE.
195 : END IF
196 :
197 530 : amp_fac = mp2_env%scale_S + mp2_env%scale_T
198 530 : IF (my_alpha_beta_case .OR. my_open_shell_SS) amp_fac = mp2_env%scale_T
199 :
200 : CALL mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_B_size, &
201 530 : my_group_L_size, calc_forces, ispin, jspin, local_ba)
202 :
203 : CALL mp2_ri_get_block_size( &
204 : mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual(ispin:jspin), &
205 : homo(ispin:jspin), virtual(ispin:jspin), dimen_RI, unit_nr, block_size, &
206 530 : ngroup, num_integ_group, my_open_shell_ss, calc_forces, buffer_1D)
207 :
208 : ! *****************************************************************
209 : ! ********** REPLICATION-BLOCKED COMMUNICATION SCHEME ***********
210 : ! *****************************************************************
211 : ! introduce block size, the number of occupied orbitals has to be a
212 : ! multiple of the block size
213 :
214 : ! Calculate the maximum number of ij pairs that have to be computed
215 : ! among groups
216 : CALL mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo(ispin), homo(jspin), &
217 530 : block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_SS, unit_nr)
218 :
219 1590 : ALLOCATE (num_ij_pairs(0:comm_exchange%num_pe - 1))
220 530 : CALL comm_exchange%allgather(my_ij_pairs, num_ij_pairs)
221 :
222 1170 : max_ij_pairs = MAXVAL(num_ij_pairs)
223 :
224 : ! start real stuff
225 : CALL mp2_ri_allocate_blk(dimen_RI, my_B_size, block_size, local_i_aL, &
226 530 : local_j_aL, calc_forces, Y_i_aP, Y_j_aP, ispin, jspin)
227 :
228 530 : CALL timeset(routineN//"_RI_loop", handle2)
229 530 : my_Emp2_Cou = 0.0_dp
230 530 : my_Emp2_EX = 0.0_dp
231 530 : t_start = m_walltime()
232 2572 : DO ij_index = 1, max_ij_pairs
233 :
234 : ! Prediction is unreliable if we are in the first step of the loop
235 2042 : IF (unit_nr > 0 .AND. ij_index > 1) THEN
236 742 : decil = ij_index*10/max_ij_pairs
237 742 : IF (decil /= (ij_index - 1)*10/max_ij_pairs) THEN
238 701 : t_new = m_walltime()
239 701 : t_new = (t_new - t_start)/60.0_dp*(max_ij_pairs - ij_index + 1)/(ij_index - 1)
240 : WRITE (unit_nr, FMT="(T3,A)") "Percentage of finished loop: "// &
241 701 : cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
242 701 : CALL m_flush(unit_nr)
243 : END IF
244 : END IF
245 :
246 2042 : IF (calc_forces) THEN
247 2837553 : Y_i_aP = 0.0_dp
248 2939903 : Y_j_aP = 0.0_dp
249 : END IF
250 :
251 2042 : IF (ij_index <= my_ij_pairs) THEN
252 : ! We have work to do
253 1993 : ij_counter = (ij_index - MIN(1, color_sub))*ngroup + color_sub
254 1993 : my_i = ij_map(1, ij_counter)
255 1993 : my_j = ij_map(2, ij_counter)
256 1993 : my_block_size = ij_map(3, ij_counter)
257 :
258 3318845 : local_i_aL = 0.0_dp
259 : CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
260 1993 : BIb_C(ispin)%array(:, :, my_i:my_i + my_block_size - 1))
261 :
262 3434615 : local_j_aL = 0.0_dp
263 : CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
264 1993 : BIb_C(jspin)%array(:, :, my_j:my_j + my_block_size - 1))
265 :
266 : ! collect data from other proc
267 1993 : CALL timeset(routineN//"_comm", handle3)
268 2102 : DO proc_shift = 1, comm_exchange%num_pe - 1
269 109 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
270 109 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
271 :
272 109 : send_ij_index = num_ij_pairs(proc_send)
273 :
274 109 : CALL get_group_dist(gd_array, proc_receive, sizes=rec_L_size)
275 :
276 2102 : IF (ij_index <= send_ij_index) THEN
277 : ij_counter_send = (ij_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
278 60 : integ_group_pos2color_sub(proc_send)
279 60 : send_i = ij_map(1, ij_counter_send)
280 60 : send_j = ij_map(2, ij_counter_send)
281 :
282 : ! occupied i
283 : BI_C_rec(1:rec_L_size, 1:my_B_size(ispin), 1:my_block_size) => &
284 60 : buffer_1D(1:rec_L_size*my_B_size(ispin)*my_block_size)
285 43353 : BI_C_rec = 0.0_dp
286 : CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
287 60 : proc_send, BI_C_rec, proc_receive, tag)
288 :
289 : CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
290 60 : BI_C_rec(:, 1:my_B_size(ispin), :))
291 :
292 : ! occupied j
293 : BI_C_rec(1:rec_L_size, 1:my_B_size(jspin), 1:my_block_size) => &
294 60 : buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(jspin)*my_block_size)
295 44373 : BI_C_rec = 0.0_dp
296 : CALL comm_exchange%sendrecv(BIb_C(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
297 60 : proc_send, BI_C_rec, proc_receive, tag)
298 :
299 : CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
300 60 : BI_C_rec(:, 1:my_B_size(jspin), :))
301 :
302 : ELSE
303 : ! we send nothing while we know that we have to receive something
304 :
305 : ! occupied i
306 : BI_C_rec(1:rec_L_size, 1:my_B_size(ispin), 1:my_block_size) => &
307 49 : buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(ispin)*my_block_size)
308 9124 : BI_C_rec = 0.0_dp
309 49 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
310 :
311 : CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
312 49 : BI_C_rec(:, 1:my_B_size(ispin), 1:my_block_size))
313 :
314 : ! occupied j
315 : BI_C_rec(1:rec_L_size, 1:my_B_size(jspin), 1:my_block_size) => &
316 49 : buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(jspin)*my_block_size)
317 9124 : BI_C_rec = 0.0_dp
318 49 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
319 :
320 : CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
321 49 : BI_C_rec(:, 1:my_B_size(jspin), 1:my_block_size))
322 :
323 : END IF
324 :
325 : END DO
326 :
327 1993 : CALL timestop(handle3)
328 :
329 : ! loop over the block elements
330 3990 : DO iiB = 1, my_block_size
331 5995 : DO jjB = 1, my_block_size
332 2005 : CALL timeset(routineN//"_expansion", handle3)
333 4002 : ASSOCIATE (my_local_i_aL => local_i_aL(:, :, iiB), my_local_j_aL => local_j_aL(:, :, jjB))
334 :
335 : ! calculate the integrals (ia|jb) strating from my local data ...
336 585679 : local_ab = 0.0_dp
337 2005 : IF ((my_alpha_beta_case) .AND. (calc_forces)) THEN
338 140066 : local_ba = 0.0_dp
339 : END IF
340 2005 : CALL dgemm_counter_start(dgemm_counter)
341 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), my_B_size(jspin), dimen_RI, 1.0_dp, &
342 : my_local_i_aL, dimen_RI, my_local_j_aL, dimen_RI, &
343 : 0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), &
344 2005 : my_B_size(ispin))
345 : ! Additional integrals only for alpha_beta case and forces
346 2005 : IF (my_alpha_beta_case .AND. calc_forces) THEN
347 : local_ba(my_B_virtual_start(jspin):my_B_virtual_end(jspin), :) = &
348 133101 : TRANSPOSE(local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :))
349 : END IF
350 : ! ... and from the other of my subgroup
351 2279 : DO proc_shift = 1, para_env_sub%num_pe - 1
352 274 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
353 274 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
354 :
355 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, &
356 274 : rec_B_virtual_end, rec_B_size)
357 :
358 274 : external_i_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
359 267944 : external_i_aL = 0.0_dp
360 :
361 : CALL para_env_sub%sendrecv(my_local_i_aL, proc_send, &
362 274 : external_i_aL, proc_receive, tag)
363 :
364 : CALL mp2_env%local_gemm_ctx%gemm( &
365 : 'T', 'N', rec_B_size, my_B_size(jspin), dimen_RI, 1.0_dp, &
366 : external_i_aL, dimen_RI, my_local_j_aL, dimen_RI, &
367 274 : 0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:my_B_size(jspin)), rec_B_size)
368 :
369 : ! Additional integrals only for alpha_beta case and forces
370 2553 : IF (my_alpha_beta_case .AND. calc_forces) THEN
371 :
372 : CALL get_group_dist(gd_B_virtual(jspin), proc_receive, rec_B_virtual_start, &
373 70 : rec_B_virtual_end, rec_B_size)
374 :
375 70 : external_i_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
376 81655 : external_i_aL = 0.0_dp
377 :
378 : CALL para_env_sub%sendrecv(my_local_j_aL, proc_send, &
379 70 : external_i_aL, proc_receive, tag)
380 :
381 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_B_size, my_B_size(ispin), dimen_RI, 1.0_dp, &
382 : external_i_aL, dimen_RI, my_local_i_aL, dimen_RI, &
383 70 : 0.0_dp, local_ba(rec_B_virtual_start:rec_B_virtual_end, 1:my_B_size(ispin)), rec_B_size)
384 : END IF
385 : END DO
386 2005 : IF (my_alpha_beta_case .AND. calc_forces) THEN
387 : ! Is just an approximation, but the call does not allow it, it ought to be (virtual_i*B_size_j+virtual_j*B_size_i)*dimen_RI
388 502 : CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_B_size(ispin) + my_B_size(jspin), dimen_RI)
389 : ELSE
390 1503 : CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_B_size(jspin), dimen_RI)
391 : END IF
392 2005 : CALL timestop(handle3)
393 :
394 : !sample peak memory
395 2005 : CALL m_memory()
396 :
397 2005 : CALL timeset(routineN//"_ener", handle3)
398 : ! calculate coulomb only MP2
399 2005 : sym_fac = 2.0_dp
400 2005 : IF (my_i == my_j) sym_fac = 1.0_dp
401 2005 : IF (my_alpha_beta_case) sym_fac = 0.5_dp
402 30928 : DO b = 1, my_B_size(jspin)
403 28923 : b_global = b + my_B_virtual_start(jspin) - 1
404 585679 : DO a = 1, virtual(ispin)
405 : my_Emp2_Cou = my_Emp2_Cou - sym_fac*2.0_dp*local_ab(a, b)**2/ &
406 : (Eigenval(homo(ispin) + a, ispin) + Eigenval(homo(jspin) + b_global, jspin) - &
407 583674 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
408 : END DO
409 : END DO
410 :
411 2005 : IF (calc_ex) THEN
412 : ! contract integrals with orbital energies for exchange MP2 energy
413 : ! starting with local ...
414 330129 : IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) t_ab = 0.0_dp
415 30328 : DO b = 1, my_B_size(ispin)
416 28323 : b_global = b + my_B_virtual_start(ispin) - 1
417 548367 : DO a = 1, my_B_size(ispin)
418 518039 : a_global = a + my_B_virtual_start(ispin) - 1
419 : my_Emp2_Ex = my_Emp2_Ex + sym_fac*local_ab(a_global, b)*local_ab(b_global, a)/ &
420 : (Eigenval(homo(ispin) + a_global, ispin) + Eigenval(homo(ispin) + b_global, ispin) - &
421 518039 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
422 546362 : IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) THEN
423 : t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
424 : (Eigenval(homo(ispin) + a_global, ispin) + &
425 : Eigenval(homo(ispin) + b_global, ispin) - &
426 302030 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
427 : END IF
428 : END DO
429 : END DO
430 : ! ... and then with external data
431 2279 : DO proc_shift = 1, para_env_sub%num_pe - 1
432 274 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
433 274 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
434 :
435 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, &
436 274 : rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
437 : CALL get_group_dist(gd_B_virtual(ispin), proc_send, &
438 274 : send_B_virtual_start, send_B_virtual_end, send_B_size)
439 :
440 : external_ab(1:my_B_size(ispin), 1:rec_B_size) => &
441 274 : buffer_1D(1:INT(rec_B_size, int_8)*my_B_size(ispin))
442 30405 : external_ab = 0.0_dp
443 :
444 : CALL para_env_sub%sendrecv(local_ab(send_B_virtual_start:send_B_virtual_end, 1:my_B_size(ispin)), proc_send, &
445 60536 : external_ab(1:my_B_size(ispin), 1:rec_B_size), proc_receive, tag)
446 :
447 5284 : DO b = 1, my_B_size(ispin)
448 2731 : b_global = b + my_B_virtual_start(ispin) - 1
449 30405 : DO a = 1, rec_B_size
450 27400 : a_global = a + rec_B_virtual_start - 1
451 : my_Emp2_Ex = my_Emp2_Ex + sym_fac*local_ab(a_global, b)*external_ab(b, a)/ &
452 : (Eigenval(homo(ispin) + a_global, ispin) + Eigenval(homo(ispin) + b_global, ispin) - &
453 27400 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
454 27400 : IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) &
455 : t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
456 : (Eigenval(homo(ispin) + a_global, ispin) + &
457 : Eigenval(homo(ispin) + b_global, ispin) - &
458 12311 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
459 : END DO
460 : END DO
461 : END DO
462 : END IF
463 2005 : CALL timestop(handle3)
464 :
465 4010 : IF (calc_forces) THEN
466 : ! update P_ab, Gamma_P_ia
467 : CALL mp2_update_P_gamma(mp2_env, para_env_sub, gd_B_virtual, &
468 : Eigenval, homo, dimen_RI, iiB, jjB, my_B_size, &
469 : my_B_virtual_end, my_B_virtual_start, my_i, my_j, virtual, &
470 : local_ab, t_ab, my_local_i_aL, my_local_j_aL, &
471 : my_open_shell_ss, Y_i_aP(:, :, iiB), Y_j_aP(:, :, jjB), local_ba, &
472 1620 : ispin, jspin, dgemm_counter, buffer_1D)
473 :
474 : END IF
475 :
476 : END ASSOCIATE
477 :
478 : END DO ! jjB
479 : END DO ! iiB
480 :
481 : ELSE
482 : ! We need it later in case of gradients
483 49 : my_block_size = 1
484 :
485 49 : CALL timeset(routineN//"_comm", handle3)
486 : ! No work to do and we know that we have to receive nothing, but send something
487 : ! send data to other proc
488 98 : DO proc_shift = 1, comm_exchange%num_pe - 1
489 49 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
490 49 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
491 :
492 49 : send_ij_index = num_ij_pairs(proc_send)
493 :
494 98 : IF (ij_index <= send_ij_index) THEN
495 : ! something to send
496 : ij_counter_send = (ij_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
497 49 : integ_group_pos2color_sub(proc_send)
498 49 : send_i = ij_map(1, ij_counter_send)
499 49 : send_j = ij_map(2, ij_counter_send)
500 :
501 : ! occupied i
502 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
503 49 : proc_send, tag)
504 : ! occupied j
505 : CALL comm_exchange%send(BIb_C(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
506 49 : proc_send, tag)
507 : END IF
508 : END DO
509 49 : CALL timestop(handle3)
510 : END IF
511 :
512 : ! redistribute gamma
513 2572 : IF (calc_forces) THEN
514 : CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(ispin)%array, ij_index, my_B_size(ispin), &
515 : my_block_size, my_group_L_size, my_i, my_ij_pairs, ngroup, &
516 : num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
517 : ij_map, ranges_info_array, Y_i_aP(:, :, 1:my_block_size), comm_exchange, &
518 1617 : gd_array%sizes, 1, buffer_1D)
519 : CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(jspin)%array, ij_index, my_B_size(jspin), &
520 : my_block_size, my_group_L_size, my_j, my_ij_pairs, ngroup, &
521 : num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
522 : ij_map, ranges_info_array, Y_j_aP(:, :, 1:my_block_size), comm_exchange, &
523 1617 : gd_array%sizes, 2, buffer_1D)
524 : END IF
525 :
526 : END DO
527 530 : CALL timestop(handle2)
528 :
529 530 : DEALLOCATE (local_i_aL)
530 530 : DEALLOCATE (local_j_aL)
531 530 : DEALLOCATE (ij_map)
532 530 : DEALLOCATE (num_ij_pairs)
533 530 : DEALLOCATE (local_ab)
534 :
535 530 : IF (calc_forces) THEN
536 384 : DEALLOCATE (Y_i_aP)
537 384 : DEALLOCATE (Y_j_aP)
538 384 : IF (ALLOCATED(t_ab)) THEN
539 306 : DEALLOCATE (t_ab)
540 : END IF
541 384 : DEALLOCATE (local_ba)
542 :
543 : ! here we check if there are almost degenerate ij
544 : ! pairs and we update P_ij with these contribution.
545 : ! If all pairs are degenerate with each other this step will scale O(N^6),
546 : ! if the number of degenerate pairs scales linearly with the system size
547 : ! this step will scale O(N^5).
548 : ! Start counting the number of almost degenerate ij pairs according
549 : ! to eps_canonical
550 : CALL quasi_degenerate_P_ij( &
551 : mp2_env, Eigenval(:, ispin:jspin), homo(ispin:jspin), virtual(ispin:jspin), my_open_shell_ss, &
552 : my_beta_beta_case, Bib_C(ispin:jspin), unit_nr, dimen_RI, &
553 : my_B_size(ispin:jspin), ngroup, my_group_L_size, &
554 : color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
555 : my_B_virtual_start(ispin:jspin), my_B_virtual_end(ispin:jspin), gd_array%sizes, gd_B_virtual(ispin:jspin), &
556 384 : integ_group_pos2color_sub, dgemm_counter, buffer_1D)
557 :
558 : END IF
559 :
560 530 : DEALLOCATE (buffer_1D)
561 :
562 : ! Dereplicate BIb_C and Gamma_P_ia to save memory
563 : ! These matrices will not be needed in that fashion anymore
564 : ! B_ia_Q will needed later
565 530 : IF (calc_forces .AND. jspin == nspins) THEN
566 1068 : IF (.NOT. ALLOCATED(B_ia_Q)) ALLOCATE (B_ia_Q(nspins))
567 1530 : ALLOCATE (B_ia_Q(ispin)%array(homo(ispin), my_B_size(ispin), my_group_L_size_orig))
568 907278 : B_ia_Q(ispin)%array = 0.0_dp
569 1408 : DO jjB = 1, homo(ispin)
570 17332 : DO iiB = 1, my_B_size(ispin)
571 : B_ia_Q(ispin)%array(jjB, iiB, 1:my_group_L_size_orig) = &
572 724326 : BIb_C(ispin)%array(1:my_group_L_size_orig, iiB, jjB)
573 : END DO
574 : END DO
575 306 : DEALLOCATE (BIb_C(ispin)%array)
576 :
577 : ! sum Gamma and dereplicate
578 1530 : ALLOCATE (BIb_C(ispin)%array(my_B_size(ispin), homo(ispin), my_group_L_size_orig))
579 582 : DO proc_shift = 1, comm_rep%num_pe - 1
580 : ! invert order
581 276 : proc_send = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
582 276 : proc_receive = MODULO(comm_rep%mepos + proc_shift, comm_rep%num_pe)
583 :
584 276 : start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
585 276 : end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)
586 :
587 : CALL comm_rep%sendrecv(mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, start_point:end_point), &
588 276 : proc_send, BIb_C(ispin)%array, proc_receive, tag)
589 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
590 582 : !$OMP SHARED(mp2_env,BIb_C,ispin,homo,my_B_size,my_group_L_size_orig)
591 : mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig) = &
592 : mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig) &
593 : + BIb_C(ispin)%array(:, :, :)
594 : !$OMP END PARALLEL WORKSHARE
595 : END DO
596 :
597 770352 : BIb_C(ispin)%array(:, :, :) = mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig)
598 306 : DEALLOCATE (mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
599 306 : CALL MOVE_ALLOC(BIb_C(ispin)%array, mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
600 224 : ELSE IF (jspin == nspins) THEN
601 136 : DEALLOCATE (BIb_C(ispin)%array)
602 : END IF
603 :
604 530 : CALL para_env%sum(my_Emp2_Cou)
605 530 : CALL para_env%sum(my_Emp2_Ex)
606 :
607 2032 : IF (my_open_shell_SS .OR. my_alpha_beta_case) THEN
608 264 : IF (my_alpha_beta_case) THEN
609 88 : Emp2_S = Emp2_S + my_Emp2_Cou
610 88 : Emp2_Cou = Emp2_Cou + my_Emp2_Cou
611 : ELSE
612 176 : my_Emp2_Cou = my_Emp2_Cou*0.25_dp
613 176 : my_Emp2_EX = my_Emp2_EX*0.5_dp
614 176 : Emp2_T = Emp2_T + my_Emp2_Cou + my_Emp2_EX
615 176 : Emp2_Cou = Emp2_Cou + my_Emp2_Cou
616 176 : Emp2_EX = Emp2_EX + my_Emp2_EX
617 : END IF
618 : ELSE
619 266 : Emp2_Cou = Emp2_Cou + my_Emp2_Cou
620 266 : Emp2_EX = Emp2_EX + my_Emp2_EX
621 : END IF
622 : END DO
623 :
624 : END DO
625 :
626 354 : DEALLOCATE (integ_group_pos2color_sub)
627 354 : DEALLOCATE (ranges_info_array)
628 :
629 354 : CALL comm_exchange%free()
630 354 : CALL comm_rep%free()
631 :
632 354 : IF (calc_forces) THEN
633 : ! recover original information (before replication)
634 228 : DEALLOCATE (gd_array%sizes)
635 228 : iiB = SIZE(sizes_array_orig)
636 684 : ALLOCATE (gd_array%sizes(0:iiB - 1))
637 678 : gd_array%sizes(:) = sizes_array_orig
638 228 : DEALLOCATE (sizes_array_orig)
639 :
640 : ! Remove replication from BIb_C and reorder the matrix
641 228 : my_group_L_size = my_group_L_size_orig
642 :
643 : ! B_ia_Q(ispin)%array will be deallocated inside of complete_gamma
644 534 : DO ispin = 1, nspins
645 : CALL complete_gamma(mp2_env, B_ia_Q(ispin)%array, dimen_RI, homo(ispin), &
646 : virtual(ispin), para_env, para_env_sub, ngroup, &
647 : my_group_L_size, my_group_L_start, my_group_L_end, &
648 : my_B_size(ispin), my_B_virtual_start(ispin), &
649 : gd_array, gd_B_virtual(ispin), &
650 534 : ispin)
651 : END DO
652 534 : DEALLOCATE (B_ia_Q)
653 :
654 45074 : IF (nspins == 1) mp2_env%ri_grad%P_ab(1)%array(:, :) = mp2_env%ri_grad%P_ab(1)%array(:, :)*2.0_dp
655 : BLOCK
656 : TYPE(mp_comm_type) :: comm
657 228 : CALL comm%from_split(para_env, para_env_sub%mepos)
658 534 : DO ispin = 1, nspins
659 : ! P_ab is only replicated over all subgroups
660 306 : CALL comm%sum(mp2_env%ri_grad%P_ab(ispin)%array)
661 : ! P_ij is replicated over all processes
662 534 : CALL para_env%sum(mp2_env%ri_grad%P_ij(ispin)%array)
663 : END DO
664 456 : CALL comm%free()
665 : END BLOCK
666 : END IF
667 :
668 354 : CALL release_group_dist(gd_array)
669 796 : DO ispin = 1, nspins
670 442 : IF (ALLOCATED(BIb_C(ispin)%array)) DEALLOCATE (BIb_C(ispin)%array)
671 796 : CALL release_group_dist(gd_B_virtual(ispin))
672 : END DO
673 :
674 : ! We do not need this matrix later, so deallocate it here to safe memory
675 354 : IF (calc_forces) DEALLOCATE (mp2_env%ri_grad%PQ_half)
676 354 : IF (calc_forces .AND. .NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) &
677 8 : DEALLOCATE (mp2_env%ri_grad%operator_half)
678 :
679 354 : CALL dgemm_counter_write(dgemm_counter, para_env)
680 :
681 : ! release memory allocated by local_gemm when run on GPU. local_gemm_ctx is null on cpu only runs
682 354 : CALL mp2_env%local_gemm_ctx%destroy()
683 354 : CALL timestop(handle)
684 :
685 1416 : END SUBROUTINE mp2_ri_gpw_compute_en
686 :
687 : ! **************************************************************************************************
688 : !> \brief ...
689 : !> \param local_i_aL ...
690 : !> \param ranges_info_array ...
691 : !> \param BIb_C_rec ...
692 : ! **************************************************************************************************
693 4360 : SUBROUTINE fill_local_i_aL(local_i_aL, ranges_info_array, BIb_C_rec)
694 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT) :: local_i_aL
695 : INTEGER, DIMENSION(:, :), INTENT(IN) :: ranges_info_array
696 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN) :: BIb_C_rec
697 :
698 : CHARACTER(LEN=*), PARAMETER :: routineN = 'fill_local_i_aL'
699 :
700 : INTEGER :: end_point, handle, irep, Lend_pos, &
701 : Lstart_pos, start_point
702 :
703 4360 : CALL timeset(routineN, handle)
704 :
705 12100 : DO irep = 1, SIZE(ranges_info_array, 2)
706 7740 : Lstart_pos = ranges_info_array(1, irep)
707 7740 : Lend_pos = ranges_info_array(2, irep)
708 7740 : start_point = ranges_info_array(3, irep)
709 7740 : end_point = ranges_info_array(4, irep)
710 :
711 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
712 12100 : !$OMP SHARED(BIb_C_rec,local_i_aL,Lstart_pos,Lend_pos,start_point,end_point)
713 : local_i_aL(Lstart_pos:Lend_pos, :, :) = BIb_C_rec(start_point:end_point, :, :)
714 : !$OMP END PARALLEL WORKSHARE
715 : END DO
716 :
717 4360 : CALL timestop(handle)
718 :
719 4360 : END SUBROUTINE fill_local_i_aL
720 :
721 : ! **************************************************************************************************
722 : !> \brief ...
723 : !> \param local_i_aL ...
724 : !> \param ranges_info_array ...
725 : !> \param BIb_C_rec ...
726 : ! **************************************************************************************************
727 266 : SUBROUTINE fill_local_i_aL_2D(local_i_aL, ranges_info_array, BIb_C_rec)
728 : REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: local_i_aL
729 : INTEGER, DIMENSION(:, :), INTENT(IN) :: ranges_info_array
730 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: BIb_C_rec
731 :
732 : CHARACTER(LEN=*), PARAMETER :: routineN = 'fill_local_i_aL_2D'
733 :
734 : INTEGER :: end_point, handle, irep, Lend_pos, &
735 : Lstart_pos, start_point
736 :
737 266 : CALL timeset(routineN, handle)
738 :
739 766 : DO irep = 1, SIZE(ranges_info_array, 2)
740 500 : Lstart_pos = ranges_info_array(1, irep)
741 500 : Lend_pos = ranges_info_array(2, irep)
742 500 : start_point = ranges_info_array(3, irep)
743 500 : end_point = ranges_info_array(4, irep)
744 :
745 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
746 766 : !$OMP SHARED(BIb_C_rec,local_i_aL,Lstart_pos,Lend_pos,start_point,end_point)
747 : local_i_aL(Lstart_pos:Lend_pos, :) = BIb_C_rec(start_point:end_point, :)
748 : !$OMP END PARALLEL WORKSHARE
749 : END DO
750 :
751 266 : CALL timestop(handle)
752 :
753 266 : END SUBROUTINE fill_local_i_aL_2D
754 :
755 : ! **************************************************************************************************
756 : !> \brief ...
757 : !> \param BIb_C ...
758 : !> \param comm_exchange ...
759 : !> \param comm_rep ...
760 : !> \param homo ...
761 : !> \param sizes_array ...
762 : !> \param my_B_size ...
763 : !> \param my_group_L_size ...
764 : !> \param ranges_info_array ...
765 : ! **************************************************************************************************
766 442 : SUBROUTINE replicate_iaK_2intgroup(BIb_C, comm_exchange, comm_rep, homo, sizes_array, my_B_size, &
767 442 : my_group_L_size, ranges_info_array)
768 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
769 : INTENT(INOUT) :: BIb_C
770 : TYPE(mp_comm_type), INTENT(IN) :: comm_exchange, comm_rep
771 : INTEGER, INTENT(IN) :: homo
772 : INTEGER, DIMENSION(:), INTENT(IN) :: sizes_array
773 : INTEGER, INTENT(IN) :: my_B_size, my_group_L_size
774 : INTEGER, DIMENSION(:, 0:, 0:), INTENT(IN) :: ranges_info_array
775 :
776 : CHARACTER(LEN=*), PARAMETER :: routineN = 'replicate_iaK_2intgroup'
777 :
778 : INTEGER :: end_point, handle, max_L_size, &
779 : proc_receive, proc_shift, start_point
780 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: BIb_C_copy
781 442 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :) :: BIb_C_gather
782 :
783 442 : CALL timeset(routineN, handle)
784 :
785 : ! replication scheme using mpi_allgather
786 : ! get the max L size of the
787 988 : max_L_size = MAXVAL(sizes_array)
788 :
789 2210 : ALLOCATE (BIb_C_copy(max_L_size, my_B_size, homo))
790 1655684 : BIb_C_copy = 0.0_dp
791 898474 : BIb_C_copy(1:SIZE(BIb_C, 1), 1:my_B_size, 1:homo) = BIb_C
792 :
793 442 : DEALLOCATE (BIb_C)
794 :
795 2652 : ALLOCATE (BIb_C_gather(max_L_size, my_B_size, homo, 0:comm_rep%num_pe - 1))
796 3188844 : BIb_C_gather = 0.0_dp
797 :
798 442 : CALL comm_rep%allgather(BIb_C_copy, BIb_C_gather)
799 :
800 442 : DEALLOCATE (BIb_C_copy)
801 :
802 2210 : ALLOCATE (BIb_C(my_group_L_size, my_B_size, homo))
803 1655337 : BIb_C = 0.0_dp
804 :
805 : ! reorder data
806 1204 : DO proc_shift = 0, comm_rep%num_pe - 1
807 762 : proc_receive = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
808 :
809 762 : start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
810 762 : end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)
811 :
812 : BIb_C(start_point:end_point, 1:my_B_size, 1:homo) = &
813 1674771 : BIb_C_gather(1:end_point - start_point + 1, 1:my_B_size, 1:homo, proc_receive)
814 :
815 : END DO
816 :
817 442 : DEALLOCATE (BIb_C_gather)
818 :
819 442 : CALL timestop(handle)
820 :
821 442 : END SUBROUTINE replicate_iaK_2intgroup
822 :
823 : ! **************************************************************************************************
824 : !> \brief ...
825 : !> \param local_ab ...
826 : !> \param t_ab ...
827 : !> \param mp2_env ...
828 : !> \param homo ...
829 : !> \param virtual ...
830 : !> \param my_B_size ...
831 : !> \param my_group_L_size ...
832 : !> \param calc_forces ...
833 : !> \param ispin ...
834 : !> \param jspin ...
835 : !> \param local_ba ...
836 : ! **************************************************************************************************
837 530 : SUBROUTINE mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_B_size, &
838 : my_group_L_size, calc_forces, ispin, jspin, local_ba)
839 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
840 : INTENT(OUT) :: local_ab, t_ab
841 : TYPE(mp2_type) :: mp2_env
842 : INTEGER, INTENT(IN) :: homo(2), virtual(2), my_B_size(2), &
843 : my_group_L_size
844 : LOGICAL, INTENT(IN) :: calc_forces
845 : INTEGER, INTENT(IN) :: ispin, jspin
846 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
847 : INTENT(OUT) :: local_ba
848 :
849 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_allocate_no_blk'
850 :
851 : INTEGER :: handle
852 :
853 530 : CALL timeset(routineN, handle)
854 :
855 2120 : ALLOCATE (local_ab(virtual(ispin), my_B_size(jspin)))
856 138441 : local_ab = 0.0_dp
857 :
858 530 : IF (calc_forces) THEN
859 384 : IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ij(jspin)%array)) THEN
860 1224 : ALLOCATE (mp2_env%ri_grad%P_ij(jspin)%array(homo(ispin), homo(ispin)))
861 6278 : mp2_env%ri_grad%P_ij(jspin)%array = 0.0_dp
862 : END IF
863 384 : IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ab(jspin)%array)) THEN
864 1224 : ALLOCATE (mp2_env%ri_grad%P_ab(jspin)%array(my_B_size(jspin), virtual(jspin)))
865 85070 : mp2_env%ri_grad%P_ab(jspin)%array = 0.0_dp
866 : END IF
867 384 : IF (.NOT. ALLOCATED(mp2_env%ri_grad%Gamma_P_ia(jspin)%array)) THEN
868 1530 : ALLOCATE (mp2_env%ri_grad%Gamma_P_ia(jspin)%array(my_B_size(jspin), homo(jspin), my_group_L_size))
869 1475502 : mp2_env%ri_grad%Gamma_P_ia(jspin)%array = 0.0_dp
870 : END IF
871 :
872 384 : IF (ispin == jspin) THEN
873 : ! For non-alpha-beta case we need amplitudes
874 1224 : ALLOCATE (t_ab(virtual(ispin), my_B_size(jspin)))
875 :
876 : ! That is just a dummy. In that way, we can pass it as array to other routines w/o requirement for allocatable array
877 306 : ALLOCATE (local_ba(1, 1))
878 : ELSE
879 : ! We need more integrals
880 312 : ALLOCATE (local_ba(virtual(jspin), my_B_size(ispin)))
881 : END IF
882 : END IF
883 : !
884 :
885 530 : CALL timestop(handle)
886 :
887 530 : END SUBROUTINE mp2_ri_allocate_no_blk
888 :
889 : ! **************************************************************************************************
890 : !> \brief ...
891 : !> \param dimen_RI ...
892 : !> \param my_B_size ...
893 : !> \param block_size ...
894 : !> \param local_i_aL ...
895 : !> \param local_j_aL ...
896 : !> \param calc_forces ...
897 : !> \param Y_i_aP ...
898 : !> \param Y_j_aP ...
899 : !> \param ispin ...
900 : !> \param jspin ...
901 : ! **************************************************************************************************
902 530 : SUBROUTINE mp2_ri_allocate_blk(dimen_RI, my_B_size, block_size, &
903 : local_i_aL, local_j_aL, calc_forces, &
904 : Y_i_aP, Y_j_aP, ispin, jspin)
905 : INTEGER, INTENT(IN) :: dimen_RI, my_B_size(2), block_size
906 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
907 : INTENT(OUT) :: local_i_aL, local_j_aL
908 : LOGICAL, INTENT(IN) :: calc_forces
909 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
910 : INTENT(OUT) :: Y_i_aP, Y_j_aP
911 : INTEGER, INTENT(IN) :: ispin, jspin
912 :
913 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_allocate_blk'
914 :
915 : INTEGER :: handle
916 :
917 530 : CALL timeset(routineN, handle)
918 :
919 2650 : ALLOCATE (local_i_aL(dimen_RI, my_B_size(ispin), block_size))
920 657247 : local_i_aL = 0.0_dp
921 2650 : ALLOCATE (local_j_aL(dimen_RI, my_B_size(jspin), block_size))
922 670091 : local_j_aL = 0.0_dp
923 :
924 530 : IF (calc_forces) THEN
925 1920 : ALLOCATE (Y_i_aP(my_B_size(ispin), dimen_RI, block_size))
926 541654 : Y_i_aP = 0.0_dp
927 : ! For closed-shell, alpha-alpha and beta-beta my_B_size_beta=my_b_size
928 : ! Not for alpha-beta case: Y_j_aP_beta is sent and received as Y_j_aP
929 1920 : ALLOCATE (Y_j_aP(my_B_size(jspin), dimen_RI, block_size))
930 552606 : Y_j_aP = 0.0_dp
931 : END IF
932 : !
933 :
934 530 : CALL timestop(handle)
935 :
936 530 : END SUBROUTINE mp2_ri_allocate_blk
937 :
938 : ! **************************************************************************************************
939 : !> \brief ...
940 : !> \param my_alpha_beta_case ...
941 : !> \param total_ij_pairs ...
942 : !> \param homo ...
943 : !> \param homo_beta ...
944 : !> \param block_size ...
945 : !> \param ngroup ...
946 : !> \param ij_map ...
947 : !> \param color_sub ...
948 : !> \param my_ij_pairs ...
949 : !> \param my_open_shell_SS ...
950 : !> \param unit_nr ...
951 : ! **************************************************************************************************
952 530 : SUBROUTINE mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo, homo_beta, &
953 : block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_SS, unit_nr)
954 : LOGICAL, INTENT(IN) :: my_alpha_beta_case
955 : INTEGER, INTENT(OUT) :: total_ij_pairs
956 : INTEGER, INTENT(IN) :: homo, homo_beta, block_size, ngroup
957 : INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ij_map
958 : INTEGER, INTENT(IN) :: color_sub
959 : INTEGER, INTENT(OUT) :: my_ij_pairs
960 : LOGICAL, INTENT(IN) :: my_open_shell_SS
961 : INTEGER, INTENT(IN) :: unit_nr
962 :
963 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_communication'
964 :
965 : INTEGER :: assigned_blocks, first_I_block, first_J_block, handle, iiB, ij_block_counter, &
966 : ij_counter, jjB, last_i_block, last_J_block, num_block_per_group, num_IJ_blocks, &
967 : num_IJ_blocks_beta, total_ij_block, total_ij_pairs_blocks
968 530 : LOGICAL, ALLOCATABLE, DIMENSION(:, :) :: ij_marker
969 :
970 : ! Calculate the maximum number of ij pairs that have to be computed
971 : ! among groups
972 :
973 530 : CALL timeset(routineN, handle)
974 :
975 530 : IF (.NOT. my_open_shell_ss .AND. .NOT. my_alpha_beta_case) THEN
976 266 : total_ij_pairs = homo*(1 + homo)/2
977 266 : num_IJ_blocks = homo/block_size - 1
978 :
979 266 : first_I_block = 1
980 266 : last_i_block = block_size*(num_IJ_blocks - 1)
981 :
982 266 : first_J_block = block_size + 1
983 266 : last_J_block = block_size*(num_IJ_blocks + 1)
984 :
985 266 : ij_block_counter = 0
986 602 : DO iiB = first_I_block, last_i_block, block_size
987 602 : DO jjB = iiB + block_size, last_J_block, block_size
988 840 : ij_block_counter = ij_block_counter + 1
989 : END DO
990 : END DO
991 :
992 266 : total_ij_block = ij_block_counter
993 266 : num_block_per_group = total_ij_block/ngroup
994 266 : assigned_blocks = num_block_per_group*ngroup
995 :
996 266 : total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
997 :
998 1064 : ALLOCATE (ij_marker(homo, homo))
999 4018 : ij_marker = .TRUE.
1000 798 : ALLOCATE (ij_map(3, total_ij_pairs_blocks))
1001 7770 : ij_map = 0
1002 266 : ij_counter = 0
1003 266 : my_ij_pairs = 0
1004 602 : DO iiB = first_I_block, last_i_block, block_size
1005 1278 : DO jjB = iiB + block_size, last_J_block, block_size
1006 840 : IF (ij_counter + 1 > assigned_blocks) EXIT
1007 676 : ij_counter = ij_counter + 1
1008 2028 : ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
1009 676 : ij_map(1, ij_counter) = iiB
1010 676 : ij_map(2, ij_counter) = jjB
1011 676 : ij_map(3, ij_counter) = block_size
1012 1012 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1013 : END DO
1014 : END DO
1015 1070 : DO iiB = 1, homo
1016 2946 : DO jjB = iiB, homo
1017 2680 : IF (ij_marker(iiB, jjB)) THEN
1018 1200 : ij_counter = ij_counter + 1
1019 1200 : ij_map(1, ij_counter) = iiB
1020 1200 : ij_map(2, ij_counter) = jjB
1021 1200 : ij_map(3, ij_counter) = 1
1022 1200 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1023 : END IF
1024 : END DO
1025 : END DO
1026 266 : DEALLOCATE (ij_marker)
1027 :
1028 264 : ELSE IF (.NOT. my_alpha_beta_case) THEN
1029 : ! THese are the cases alpha/alpha and beta/beta
1030 : ! We do not have to consider the diagonal elements
1031 176 : total_ij_pairs = homo*(homo - 1)/2
1032 176 : num_IJ_blocks = (homo - 1)/block_size - 1
1033 :
1034 176 : first_I_block = 1
1035 176 : last_i_block = block_size*(num_IJ_blocks - 1)
1036 :
1037 : ! We shift the blocks to prevent the calculation of the diagonal elements which always give zero
1038 176 : first_J_block = block_size + 2
1039 176 : last_J_block = block_size*(num_IJ_blocks + 1) + 1
1040 :
1041 176 : ij_block_counter = 0
1042 262 : DO iiB = first_I_block, last_i_block, block_size
1043 262 : DO jjB = iiB + block_size + 1, last_J_block, block_size
1044 200 : ij_block_counter = ij_block_counter + 1
1045 : END DO
1046 : END DO
1047 :
1048 176 : total_ij_block = ij_block_counter
1049 176 : num_block_per_group = total_ij_block/ngroup
1050 176 : assigned_blocks = num_block_per_group*ngroup
1051 :
1052 176 : total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
1053 :
1054 704 : ALLOCATE (ij_marker(homo, homo))
1055 2984 : ij_marker = .TRUE.
1056 528 : ALLOCATE (ij_map(3, total_ij_pairs_blocks))
1057 3352 : ij_map = 0
1058 176 : ij_counter = 0
1059 176 : my_ij_pairs = 0
1060 262 : DO iiB = first_I_block, last_i_block, block_size
1061 458 : DO jjB = iiB + block_size + 1, last_J_block, block_size
1062 200 : IF (ij_counter + 1 > assigned_blocks) EXIT
1063 196 : ij_counter = ij_counter + 1
1064 604 : ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
1065 196 : ij_map(1, ij_counter) = iiB
1066 196 : ij_map(2, ij_counter) = jjB
1067 196 : ij_map(3, ij_counter) = block_size
1068 282 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1069 : END DO
1070 : END DO
1071 774 : DO iiB = 1, homo
1072 1580 : DO jjB = iiB + 1, homo
1073 1404 : IF (ij_marker(iiB, jjB)) THEN
1074 598 : ij_counter = ij_counter + 1
1075 598 : ij_map(1, ij_counter) = iiB
1076 598 : ij_map(2, ij_counter) = jjB
1077 598 : ij_map(3, ij_counter) = 1
1078 598 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1079 : END IF
1080 : END DO
1081 : END DO
1082 176 : DEALLOCATE (ij_marker)
1083 :
1084 : ELSE
1085 88 : total_ij_pairs = homo*homo_beta
1086 88 : num_IJ_blocks = homo/block_size
1087 88 : num_IJ_blocks_beta = homo_beta/block_size
1088 :
1089 88 : first_I_block = 1
1090 88 : last_i_block = block_size*(num_IJ_blocks - 1)
1091 :
1092 88 : first_J_block = 1
1093 88 : last_J_block = block_size*(num_IJ_blocks_beta - 1)
1094 :
1095 88 : ij_block_counter = 0
1096 246 : DO iiB = first_I_block, last_i_block, block_size
1097 246 : DO jjB = first_J_block, last_J_block, block_size
1098 284 : ij_block_counter = ij_block_counter + 1
1099 : END DO
1100 : END DO
1101 :
1102 88 : total_ij_block = ij_block_counter
1103 88 : num_block_per_group = total_ij_block/ngroup
1104 88 : assigned_blocks = num_block_per_group*ngroup
1105 :
1106 88 : total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
1107 :
1108 352 : ALLOCATE (ij_marker(homo, homo_beta))
1109 1396 : ij_marker = .TRUE.
1110 264 : ALLOCATE (ij_map(3, total_ij_pairs_blocks))
1111 4304 : ij_map = 0
1112 88 : ij_counter = 0
1113 88 : my_ij_pairs = 0
1114 246 : DO iiB = first_I_block, last_i_block, block_size
1115 486 : DO jjB = first_J_block, last_J_block, block_size
1116 244 : IF (ij_counter + 1 > assigned_blocks) EXIT
1117 240 : ij_counter = ij_counter + 1
1118 720 : ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
1119 240 : ij_map(1, ij_counter) = iiB
1120 240 : ij_map(2, ij_counter) = jjB
1121 240 : ij_map(3, ij_counter) = block_size
1122 398 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1123 : END DO
1124 : END DO
1125 432 : DO iiB = 1, homo
1126 1486 : DO jjB = 1, homo_beta
1127 1398 : IF (ij_marker(iiB, jjB)) THEN
1128 814 : ij_counter = ij_counter + 1
1129 814 : ij_map(1, ij_counter) = iiB
1130 814 : ij_map(2, ij_counter) = jjB
1131 814 : ij_map(3, ij_counter) = 1
1132 814 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1133 : END IF
1134 : END DO
1135 : END DO
1136 88 : DEALLOCATE (ij_marker)
1137 : END IF
1138 :
1139 530 : IF (unit_nr > 0) THEN
1140 265 : IF (block_size == 1) THEN
1141 : WRITE (UNIT=unit_nr, FMT="(T3,A,T66,F15.1)") &
1142 236 : "RI_INFO| Percentage of ij pairs communicated with block size 1:", 100.0_dp
1143 : ELSE
1144 : WRITE (UNIT=unit_nr, FMT="(T3,A,T66,F15.1)") &
1145 29 : "RI_INFO| Percentage of ij pairs communicated with block size 1:", &
1146 58 : 100.0_dp*REAL((total_ij_pairs - assigned_blocks*(block_size**2)), KIND=dp)/REAL(total_ij_pairs, KIND=dp)
1147 : END IF
1148 265 : CALL m_flush(unit_nr)
1149 : END IF
1150 :
1151 530 : CALL timestop(handle)
1152 :
1153 530 : END SUBROUTINE mp2_ri_communication
1154 :
1155 : ! **************************************************************************************************
1156 : !> \brief ...
1157 : !> \param para_env ...
1158 : !> \param para_env_sub ...
1159 : !> \param color_sub ...
1160 : !> \param sizes_array ...
1161 : !> \param calc_forces ...
1162 : !> \param integ_group_size ...
1163 : !> \param my_group_L_end ...
1164 : !> \param my_group_L_size ...
1165 : !> \param my_group_L_size_orig ...
1166 : !> \param my_group_L_start ...
1167 : !> \param my_new_group_L_size ...
1168 : !> \param integ_group_pos2color_sub ...
1169 : !> \param sizes_array_orig ...
1170 : !> \param ranges_info_array ...
1171 : !> \param comm_exchange ...
1172 : !> \param comm_rep ...
1173 : !> \param num_integ_group ...
1174 : ! **************************************************************************************************
1175 354 : SUBROUTINE mp2_ri_create_group(para_env, para_env_sub, color_sub, &
1176 : sizes_array, calc_forces, &
1177 : integ_group_size, my_group_L_end, &
1178 : my_group_L_size, my_group_L_size_orig, my_group_L_start, my_new_group_L_size, &
1179 : integ_group_pos2color_sub, &
1180 : sizes_array_orig, ranges_info_array, comm_exchange, comm_rep, num_integ_group)
1181 : TYPE(mp_para_env_type), INTENT(IN) :: para_env, para_env_sub
1182 : INTEGER, INTENT(IN) :: color_sub
1183 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(INOUT) :: sizes_array
1184 : LOGICAL, INTENT(IN) :: calc_forces
1185 : INTEGER, INTENT(IN) :: integ_group_size, my_group_L_end
1186 : INTEGER, INTENT(INOUT) :: my_group_L_size
1187 : INTEGER, INTENT(OUT) :: my_group_L_size_orig
1188 : INTEGER, INTENT(IN) :: my_group_L_start
1189 : INTEGER, INTENT(INOUT) :: my_new_group_L_size
1190 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT) :: integ_group_pos2color_sub, &
1191 : sizes_array_orig
1192 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
1193 : INTENT(OUT) :: ranges_info_array
1194 : TYPE(mp_comm_type), INTENT(OUT) :: comm_exchange, comm_rep
1195 : INTEGER, INTENT(IN) :: num_integ_group
1196 :
1197 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_create_group'
1198 :
1199 : INTEGER :: handle, iiB, proc_receive, proc_shift, &
1200 : sub_sub_color
1201 354 : INTEGER, ALLOCATABLE, DIMENSION(:) :: new_sizes_array, rep_ends_array, &
1202 : rep_sizes_array, rep_starts_array
1203 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: my_info
1204 :
1205 354 : CALL timeset(routineN, handle)
1206 : !
1207 354 : sub_sub_color = para_env_sub%mepos*num_integ_group + color_sub/integ_group_size
1208 354 : CALL comm_exchange%from_split(para_env, sub_sub_color)
1209 :
1210 : ! create the replication group
1211 354 : sub_sub_color = para_env_sub%mepos*comm_exchange%num_pe + comm_exchange%mepos
1212 354 : CALL comm_rep%from_split(para_env, sub_sub_color)
1213 :
1214 : ! create the new limits for K according to the size
1215 : ! of the integral group
1216 :
1217 : ! info array for replication
1218 1062 : ALLOCATE (rep_ends_array(0:comm_rep%num_pe - 1))
1219 1062 : ALLOCATE (rep_starts_array(0:comm_rep%num_pe - 1))
1220 1062 : ALLOCATE (rep_sizes_array(0:comm_rep%num_pe - 1))
1221 :
1222 354 : CALL comm_rep%allgather(my_group_L_size, rep_sizes_array)
1223 354 : CALL comm_rep%allgather(my_group_L_start, rep_starts_array)
1224 354 : CALL comm_rep%allgather(my_group_L_end, rep_ends_array)
1225 :
1226 : ! calculate my_new_group_L_size according to sizes_array
1227 354 : my_new_group_L_size = my_group_L_size
1228 :
1229 : ! Info of this process
1230 1062 : ALLOCATE (my_info(4, 0:comm_rep%num_pe - 1))
1231 354 : my_info(1, 0) = my_group_L_start
1232 354 : my_info(2, 0) = my_group_L_end
1233 354 : my_info(3, 0) = 1
1234 354 : my_info(4, 0) = my_group_L_size
1235 :
1236 596 : DO proc_shift = 1, comm_rep%num_pe - 1
1237 242 : proc_receive = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
1238 :
1239 242 : my_new_group_L_size = my_new_group_L_size + rep_sizes_array(proc_receive)
1240 :
1241 242 : my_info(1, proc_shift) = rep_starts_array(proc_receive)
1242 242 : my_info(2, proc_shift) = rep_ends_array(proc_receive)
1243 242 : my_info(3, proc_shift) = my_info(4, proc_shift - 1) + 1
1244 596 : my_info(4, proc_shift) = my_new_group_L_size
1245 :
1246 : END DO
1247 :
1248 1062 : ALLOCATE (new_sizes_array(0:comm_exchange%num_pe - 1))
1249 1416 : ALLOCATE (ranges_info_array(4, 0:comm_rep%num_pe - 1, 0:comm_exchange%num_pe - 1))
1250 354 : CALL comm_exchange%allgather(my_new_group_L_size, new_sizes_array)
1251 354 : CALL comm_exchange%allgather(my_info, ranges_info_array)
1252 :
1253 354 : DEALLOCATE (rep_sizes_array)
1254 354 : DEALLOCATE (rep_starts_array)
1255 354 : DEALLOCATE (rep_ends_array)
1256 :
1257 1062 : ALLOCATE (integ_group_pos2color_sub(0:comm_exchange%num_pe - 1))
1258 354 : CALL comm_exchange%allgather(color_sub, integ_group_pos2color_sub)
1259 :
1260 354 : IF (calc_forces) THEN
1261 228 : iiB = SIZE(sizes_array)
1262 684 : ALLOCATE (sizes_array_orig(0:iiB - 1))
1263 678 : sizes_array_orig(:) = sizes_array
1264 : END IF
1265 :
1266 354 : my_group_L_size_orig = my_group_L_size
1267 354 : my_group_L_size = my_new_group_L_size
1268 354 : DEALLOCATE (sizes_array)
1269 :
1270 1062 : ALLOCATE (sizes_array(0:integ_group_size - 1))
1271 806 : sizes_array(:) = new_sizes_array
1272 :
1273 354 : DEALLOCATE (new_sizes_array)
1274 : !
1275 354 : CALL timestop(handle)
1276 :
1277 708 : END SUBROUTINE mp2_ri_create_group
1278 :
1279 : ! **************************************************************************************************
1280 : !> \brief ...
1281 : !> \param mp2_env ...
1282 : !> \param para_env ...
1283 : !> \param para_env_sub ...
1284 : !> \param gd_array ...
1285 : !> \param gd_B_virtual ...
1286 : !> \param homo ...
1287 : !> \param dimen_RI ...
1288 : !> \param unit_nr ...
1289 : !> \param integ_group_size ...
1290 : !> \param ngroup ...
1291 : !> \param num_integ_group ...
1292 : !> \param virtual ...
1293 : !> \param calc_forces ...
1294 : ! **************************************************************************************************
1295 1416 : SUBROUTINE mp2_ri_get_integ_group_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
1296 354 : homo, dimen_RI, unit_nr, &
1297 : integ_group_size, &
1298 : ngroup, num_integ_group, &
1299 354 : virtual, calc_forces)
1300 : TYPE(mp2_type) :: mp2_env
1301 : TYPE(mp_para_env_type), INTENT(IN) :: para_env, para_env_sub
1302 : TYPE(group_dist_d1_type), INTENT(IN) :: gd_array
1303 : TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
1304 : INTEGER, DIMENSION(:), INTENT(IN) :: homo
1305 : INTEGER, INTENT(IN) :: dimen_RI, unit_nr
1306 : INTEGER, INTENT(OUT) :: integ_group_size, ngroup, num_integ_group
1307 : INTEGER, DIMENSION(:), INTENT(IN) :: virtual
1308 : LOGICAL, INTENT(IN) :: calc_forces
1309 :
1310 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_get_integ_group_size'
1311 :
1312 : INTEGER :: block_size, handle, iiB, &
1313 : max_repl_group_size, &
1314 : min_integ_group_size
1315 : INTEGER(KIND=int_8) :: mem
1316 : LOGICAL :: calc_group_size
1317 : REAL(KIND=dp) :: factor, mem_base, mem_min, mem_per_blk, &
1318 : mem_per_repl, mem_per_repl_blk, &
1319 : mem_real
1320 :
1321 354 : CALL timeset(routineN, handle)
1322 :
1323 354 : ngroup = para_env%num_pe/para_env_sub%num_pe
1324 :
1325 354 : calc_group_size = mp2_env%ri_mp2%number_integration_groups <= 0
1326 354 : IF (.NOT. calc_group_size) THEN
1327 10 : IF (MOD(ngroup, mp2_env%ri_mp2%number_integration_groups) /= 0) calc_group_size = .TRUE.
1328 : END IF
1329 :
1330 354 : IF (calc_group_size) THEN
1331 344 : CALL m_memory(mem)
1332 344 : mem_real = (mem + 1024*1024 - 1)/(1024*1024)
1333 344 : CALL para_env%min(mem_real)
1334 344 : mem_real = MIN(mem_real, mp2_env%mp2_memory)
1335 :
1336 344 : mem_base = 0.0_dp
1337 344 : mem_per_blk = 0.0_dp
1338 344 : mem_per_repl = 0.0_dp
1339 344 : mem_per_repl_blk = 0.0_dp
1340 :
1341 : ! BIB_C_copy
1342 : mem_per_repl = mem_per_repl + MAXVAL(MAX(REAL(homo, KIND=dp)*maxsize(gd_array), REAL(dimen_RI, KIND=dp))* &
1343 1114 : maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1344 : ! BIB_C
1345 770 : mem_per_repl = mem_per_repl + SUM(REAL(homo, KIND=dp)*maxsize(gd_B_virtual))*maxsize(gd_array)*8.0_dp/(1024**2)
1346 : ! BIB_C_rec
1347 770 : mem_per_repl_blk = mem_per_repl_blk + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
1348 : ! local_i_aL+local_j_aL
1349 770 : mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*REAL(dimen_RI, KIND=dp)*8.0_dp/(1024**2)
1350 : ! local_ab
1351 1114 : mem_base = mem_base + MAXVAL(REAL(virtual, KIND=dp)*maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1352 : ! external_ab/external_i_aL
1353 1196 : mem_base = mem_base + REAL(MAX(dimen_RI, MAXVAL(virtual)), KIND=dp)*MAXVAL(maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1354 :
1355 344 : IF (calc_forces) THEN
1356 : ! Gamma_P_ia
1357 : mem_per_repl = mem_per_repl + SUM(REAL(homo, KIND=dp)*maxsize(gd_array)* &
1358 514 : maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1359 : ! Y_i_aP+Y_j_aP
1360 514 : mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*dimen_RI*8.0_dp/(1024**2)
1361 : ! local_ba/t_ab
1362 808 : mem_base = mem_base + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
1363 : ! P_ij
1364 514 : mem_base = mem_base + SUM(REAL(homo, KIND=dp)*homo)*8.0_dp/(1024**2)
1365 : ! P_ab
1366 514 : mem_base = mem_base + SUM(REAL(virtual, KIND=dp)*maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1367 : ! send_ab/send_i_aL
1368 808 : mem_base = mem_base + REAL(MAX(dimen_RI, MAXVAL(virtual)), KIND=dp)*MAXVAL(maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1369 : END IF
1370 :
1371 : ! This a first guess based on the assumption of optimal block sizes
1372 770 : block_size = MAX(1, MIN(FLOOR(SQRT(REAL(MINVAL(homo), KIND=dp))), FLOOR(MINVAL(homo)/SQRT(2.0_dp*ngroup))))
1373 344 : IF (mp2_env%ri_mp2%block_size > 0) block_size = mp2_env%ri_mp2%block_size
1374 :
1375 344 : mem_min = mem_base + mem_per_repl + (mem_per_blk + mem_per_repl_blk)*block_size
1376 :
1377 516 : IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum available memory per MPI process:', &
1378 344 : mem_real, ' MiB'
1379 516 : IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum required memory per MPI process:', &
1380 344 : mem_min, ' MiB'
1381 :
1382 : ! We use the following communication model
1383 : ! Comm(replication)+Comm(collection of data for ij pair)+Comm(contraction)
1384 : ! One can show that the costs of the contraction step are independent of the block size and the replication group size
1385 : ! With gradients, the other two steps are carried out twice (Y_i_aP -> Gamma_i_aP, and dereplication)
1386 : ! NL ... number of RI basis functions
1387 : ! NR ... replication group size
1388 : ! NG ... number of sub groups
1389 : ! NB ... Block size
1390 : ! o ... number of occupied orbitals
1391 : ! Then, we have the communication costs (in multiples of the original BIb_C matrix)
1392 : ! (NR/NG)+(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
1393 : ! and with gradients
1394 : ! 2*(NR/NG)+2*(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
1395 : ! We are looking for the minimum of the communication volume,
1396 : ! thus, if the prefactor of (NR/NG) is smaller than zero, use the largest possible replication group size.
1397 : ! If the factor is larger than zero, set the replication group size to 1. (For small systems and a large number of subgroups)
1398 : ! Replication group size = 1 implies that the integration group size equals the number of subgroups
1399 :
1400 344 : integ_group_size = ngroup
1401 :
1402 : ! Multiply everything by homo*virtual to consider differences between spin channels in case of open-shell calculations
1403 : factor = REAL(SUM(homo*virtual), KIND=dp) &
1404 1622 : - SUM((REAL(MAXVAL(homo), KIND=dp)/block_size + block_size - 2.0_dp)*homo*virtual)/ngroup
1405 672 : IF (SIZE(homo) == 2) factor = factor - 2.0_dp*PRODUCT(homo)/block_size/ngroup*SUM(homo*virtual)
1406 :
1407 688 : IF (factor <= 0.0_dp) THEN
1408 : ! Remove the fixed memory and divide by the memory per replication group size
1409 : max_repl_group_size = MIN(MAX(FLOOR((mem_real - mem_base - mem_per_blk*block_size)/ &
1410 256 : (mem_per_repl + mem_per_repl_blk*block_size)), 1), ngroup)
1411 : ! Convert to an integration group size
1412 256 : min_integ_group_size = ngroup/max_repl_group_size
1413 :
1414 : ! Ensure that the integration group size is a divisor of the number of sub groups
1415 256 : DO iiB = MAX(MIN(min_integ_group_size, ngroup), 1), ngroup
1416 : ! check that the ngroup is a multiple of integ_group_size
1417 256 : IF (MOD(ngroup, iiB) == 0) THEN
1418 256 : integ_group_size = iiB
1419 256 : EXIT
1420 : END IF
1421 0 : integ_group_size = integ_group_size + 1
1422 : END DO
1423 : END IF
1424 : ELSE ! We take the user provided group size
1425 10 : integ_group_size = ngroup/mp2_env%ri_mp2%number_integration_groups
1426 : END IF
1427 :
1428 354 : IF (unit_nr > 0) THEN
1429 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
1430 177 : "RI_INFO| Group size for integral replication:", integ_group_size*para_env_sub%num_pe
1431 177 : CALL m_flush(unit_nr)
1432 : END IF
1433 :
1434 354 : num_integ_group = ngroup/integ_group_size
1435 :
1436 354 : CALL timestop(handle)
1437 :
1438 354 : END SUBROUTINE mp2_ri_get_integ_group_size
1439 :
1440 : ! **************************************************************************************************
1441 : !> \brief ...
1442 : !> \param mp2_env ...
1443 : !> \param para_env ...
1444 : !> \param para_env_sub ...
1445 : !> \param gd_array ...
1446 : !> \param gd_B_virtual ...
1447 : !> \param homo ...
1448 : !> \param virtual ...
1449 : !> \param dimen_RI ...
1450 : !> \param unit_nr ...
1451 : !> \param block_size ...
1452 : !> \param ngroup ...
1453 : !> \param num_integ_group ...
1454 : !> \param my_open_shell_ss ...
1455 : !> \param calc_forces ...
1456 : !> \param buffer_1D ...
1457 : ! **************************************************************************************************
1458 530 : SUBROUTINE mp2_ri_get_block_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
1459 530 : homo, virtual, dimen_RI, unit_nr, &
1460 : block_size, ngroup, num_integ_group, &
1461 : my_open_shell_ss, calc_forces, buffer_1D)
1462 : TYPE(mp2_type) :: mp2_env
1463 : TYPE(mp_para_env_type), INTENT(IN) :: para_env, para_env_sub
1464 : TYPE(group_dist_d1_type), INTENT(IN) :: gd_array
1465 : TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
1466 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
1467 : INTEGER, INTENT(IN) :: dimen_RI, unit_nr
1468 : INTEGER, INTENT(OUT) :: block_size, ngroup
1469 : INTEGER, INTENT(IN) :: num_integ_group
1470 : LOGICAL, INTENT(IN) :: my_open_shell_ss, calc_forces
1471 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
1472 : INTENT(OUT) :: buffer_1D
1473 :
1474 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_get_block_size'
1475 :
1476 : INTEGER :: best_block_size, handle, num_IJ_blocks
1477 : INTEGER(KIND=int_8) :: buffer_size, mem
1478 : REAL(KIND=dp) :: mem_base, mem_per_blk, mem_per_repl_blk, &
1479 : mem_real
1480 :
1481 530 : CALL timeset(routineN, handle)
1482 :
1483 530 : ngroup = para_env%num_pe/para_env_sub%num_pe
1484 :
1485 530 : CALL m_memory(mem)
1486 530 : mem_real = (mem + 1024*1024 - 1)/(1024*1024)
1487 530 : CALL para_env%min(mem_real)
1488 :
1489 530 : mem_base = 0.0_dp
1490 530 : mem_per_blk = 0.0_dp
1491 530 : mem_per_repl_blk = 0.0_dp
1492 :
1493 : ! external_ab
1494 1766 : mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
1495 : ! BIB_C_rec
1496 1148 : mem_per_repl_blk = mem_per_repl_blk + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
1497 : ! local_i_aL+local_j_aL
1498 1148 : mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*REAL(dimen_RI, KIND=dp)*8.0_dp/(1024**2)
1499 : ! Copy to keep arrays contiguous
1500 1766 : mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
1501 :
1502 530 : IF (calc_forces) THEN
1503 : ! Y_i_aP+Y_j_aP+BIb_C_send
1504 846 : mem_per_blk = mem_per_blk + 3.0_dp*MAXVAL(maxsize(gd_B_virtual))*dimen_RI*8.0_dp/(1024**2)
1505 : ! send_ab
1506 1308 : mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
1507 : END IF
1508 :
1509 530 : best_block_size = 1
1510 :
1511 : ! Here we split the memory half for the communication, half for replication
1512 530 : IF (mp2_env%ri_mp2%block_size > 0) THEN
1513 : best_block_size = mp2_env%ri_mp2%block_size
1514 : ELSE
1515 294 : best_block_size = MAX(FLOOR((mem_real - mem_base)/(mem_per_blk + mem_per_repl_blk*ngroup/num_integ_group)), 1)
1516 :
1517 2976800 : DO
1518 2977094 : IF (SIZE(homo) == 1) THEN
1519 2395652 : IF (.NOT. my_open_shell_ss) THEN
1520 1103100 : num_IJ_blocks = (homo(1)/best_block_size)
1521 1103100 : num_IJ_blocks = (num_IJ_blocks*num_IJ_blocks - num_IJ_blocks)/2
1522 : ELSE
1523 1292552 : num_IJ_blocks = ((homo(1) - 1)/best_block_size)
1524 1292552 : num_IJ_blocks = (num_IJ_blocks*num_IJ_blocks - num_IJ_blocks)/2
1525 : END IF
1526 : ELSE
1527 1744326 : num_ij_blocks = PRODUCT(homo/best_block_size)
1528 : END IF
1529 : ! Enforce at least one large block for each subgroup
1530 2977094 : IF ((num_IJ_blocks >= ngroup .AND. num_IJ_blocks > 0) .OR. best_block_size == 1) THEN
1531 : EXIT
1532 : ELSE
1533 2976800 : best_block_size = best_block_size - 1
1534 : END IF
1535 : END DO
1536 :
1537 294 : IF (SIZE(homo) == 1) THEN
1538 232 : IF (my_open_shell_ss) THEN
1539 : ! check that best_block_size is not bigger than sqrt(homo-1)
1540 : ! Diagonal elements do not have to be considered
1541 124 : best_block_size = MIN(FLOOR(SQRT(REAL(homo(1) - 1, KIND=dp))), best_block_size)
1542 : ELSE
1543 : ! check that best_block_size is not bigger than sqrt(homo)
1544 108 : best_block_size = MIN(FLOOR(SQRT(REAL(homo(1), KIND=dp))), best_block_size)
1545 : END IF
1546 : END IF
1547 : END IF
1548 530 : block_size = MAX(1, best_block_size)
1549 :
1550 530 : IF (unit_nr > 0) THEN
1551 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
1552 265 : "RI_INFO| Block size:", block_size
1553 265 : CALL m_flush(unit_nr)
1554 : END IF
1555 :
1556 : ! Determine recv buffer size (BI_C_recv, external_i_aL, external_ab)
1557 : buffer_size = MAX(INT(maxsize(gd_array), KIND=int_8)*block_size, INT(MAX(dimen_RI, MAXVAL(virtual)), KIND=int_8)) &
1558 1766 : *MAXVAL(maxsize(gd_B_virtual))
1559 : ! The send buffer has the same size as the recv buffer
1560 530 : IF (calc_forces) buffer_size = buffer_size*2
1561 1590 : ALLOCATE (buffer_1D(buffer_size))
1562 :
1563 530 : CALL timestop(handle)
1564 :
1565 530 : END SUBROUTINE mp2_ri_get_block_size
1566 :
1567 : ! **************************************************************************************************
1568 : !> \brief ...
1569 : !> \param mp2_env ...
1570 : !> \param para_env_sub ...
1571 : !> \param gd_B_virtual ...
1572 : !> \param Eigenval ...
1573 : !> \param homo ...
1574 : !> \param dimen_RI ...
1575 : !> \param iiB ...
1576 : !> \param jjB ...
1577 : !> \param my_B_size ...
1578 : !> \param my_B_virtual_end ...
1579 : !> \param my_B_virtual_start ...
1580 : !> \param my_i ...
1581 : !> \param my_j ...
1582 : !> \param virtual ...
1583 : !> \param local_ab ...
1584 : !> \param t_ab ...
1585 : !> \param my_local_i_aL ...
1586 : !> \param my_local_j_aL ...
1587 : !> \param open_ss ...
1588 : !> \param Y_i_aP ...
1589 : !> \param Y_j_aP ...
1590 : !> \param local_ba ...
1591 : !> \param ispin ...
1592 : !> \param jspin ...
1593 : !> \param dgemm_counter ...
1594 : !> \param buffer_1D ...
1595 : ! **************************************************************************************************
1596 6480 : SUBROUTINE mp2_update_P_gamma(mp2_env, para_env_sub, gd_B_virtual, &
1597 3240 : Eigenval, homo, dimen_RI, iiB, jjB, my_B_size, &
1598 3240 : my_B_virtual_end, my_B_virtual_start, my_i, my_j, virtual, local_ab, &
1599 3240 : t_ab, my_local_i_aL, my_local_j_aL, open_ss, Y_i_aP, Y_j_aP, &
1600 1620 : local_ba, ispin, jspin, dgemm_counter, buffer_1D)
1601 : TYPE(mp2_type) :: mp2_env
1602 : TYPE(mp_para_env_type), INTENT(IN) :: para_env_sub
1603 : TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
1604 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
1605 : INTEGER, DIMENSION(:), INTENT(IN) :: homo
1606 : INTEGER, INTENT(IN) :: dimen_RI, iiB, jjB
1607 : INTEGER, DIMENSION(:), INTENT(IN) :: my_B_size, my_B_virtual_end, &
1608 : my_B_virtual_start
1609 : INTEGER, INTENT(IN) :: my_i, my_j
1610 : INTEGER, DIMENSION(:), INTENT(IN) :: virtual
1611 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
1612 : INTENT(INOUT), TARGET :: local_ab
1613 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
1614 : INTENT(IN), TARGET :: t_ab, my_local_i_aL, my_local_j_aL
1615 : LOGICAL, INTENT(IN) :: open_ss
1616 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
1617 : INTENT(INOUT), TARGET :: Y_i_aP, Y_j_aP, local_ba
1618 : INTEGER, INTENT(IN) :: ispin, jspin
1619 : TYPE(dgemm_counter_type), INTENT(INOUT) :: dgemm_counter
1620 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1D
1621 :
1622 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_update_P_gamma'
1623 :
1624 : INTEGER :: a, b, b_global, handle, proc_receive, proc_send, proc_shift, rec_B_size, &
1625 : rec_B_virtual_end, rec_B_virtual_start, send_B_size, send_B_virtual_end, &
1626 : send_B_virtual_start
1627 : INTEGER(KIND=int_8) :: offset
1628 : LOGICAL :: alpha_beta
1629 : REAL(KIND=dp) :: factor, P_ij_diag
1630 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
1631 1620 : POINTER :: external_ab, send_ab
1632 :
1633 1620 : CALL timeset(routineN//"_Pia", handle)
1634 :
1635 1620 : alpha_beta = .NOT. (ispin == jspin)
1636 1620 : IF (open_ss) THEN
1637 : factor = 1.0_dp
1638 : ELSE
1639 1231 : factor = 2.0_dp
1640 : END IF
1641 : ! divide the (ia|jb) integrals by Delta_ij^ab
1642 25159 : DO b = 1, my_B_size(jspin)
1643 23539 : b_global = b + my_B_virtual_start(jspin) - 1
1644 469848 : DO a = 1, virtual(ispin)
1645 : local_ab(a, b) = -local_ab(a, b)/ &
1646 : (Eigenval(homo(ispin) + a, ispin) + Eigenval(homo(jspin) + b_global, jspin) - &
1647 468228 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
1648 : END DO
1649 : END DO
1650 1620 : IF (.NOT. (alpha_beta)) THEN
1651 329242 : P_ij_diag = -SUM(local_ab*t_ab)*factor
1652 : ELSE
1653 : ! update diagonal part of P_ij
1654 140606 : P_ij_diag = -SUM(local_ab*local_ab)*mp2_env%scale_S
1655 : ! More integrals needed only for alpha-beta case: local_ba
1656 6987 : DO b = 1, my_B_size(ispin)
1657 6485 : b_global = b + my_B_virtual_start(ispin) - 1
1658 140066 : DO a = 1, virtual(jspin)
1659 : local_ba(a, b) = -local_ba(a, b)/ &
1660 : (Eigenval(homo(jspin) + a, jspin) + Eigenval(homo(ispin) + b_global, ispin) - &
1661 139564 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
1662 : END DO
1663 : END DO
1664 : END IF
1665 :
1666 : ! P_ab and add diagonal part of P_ij
1667 :
1668 1620 : CALL dgemm_counter_start(dgemm_counter)
1669 1620 : IF (.NOT. (alpha_beta)) THEN
1670 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), my_B_size(ispin), virtual(ispin), 1.0_dp, &
1671 : t_ab, virtual(ispin), local_ab, virtual(ispin), &
1672 : 1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, &
1673 1118 : my_B_virtual_start(ispin):my_B_virtual_end(ispin)), my_B_size(ispin))
1674 : mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) = &
1675 1118 : mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) + P_ij_diag
1676 : ELSE
1677 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), my_B_size(ispin), virtual(jspin), mp2_env%scale_S, &
1678 : local_ba, virtual(jspin), local_ba, virtual(jspin), 1.0_dp, &
1679 502 : mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_virtual_start(ispin):my_B_virtual_end(ispin)), my_B_size(ispin))
1680 :
1681 : mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) = &
1682 502 : mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) + P_ij_diag
1683 :
1684 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(jspin), my_B_size(jspin), virtual(ispin), mp2_env%scale_S, &
1685 : local_ab, virtual(ispin), local_ab, virtual(ispin), 1.0_dp, &
1686 502 : mp2_env%ri_grad%P_ab(jspin)%array(:, my_B_virtual_start(jspin):my_B_virtual_end(jspin)), my_B_size(jspin))
1687 :
1688 : mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjB - 1, my_j + jjB - 1) = &
1689 502 : mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjB - 1, my_j + jjB - 1) + P_ij_diag
1690 : END IF
1691 : ! The summation is over unique pairs. In alpha-beta case, all pairs are unique: subroutine is called for
1692 : ! both i^alpha,j^beta and i^beta,j^alpha. Formally, my_i can be equal to my_j, but they are different
1693 : ! due to spin in alpha-beta case.
1694 1620 : IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
1695 :
1696 : CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_B_size(ispin), virtual(ispin), my_B_size(ispin), 1.0_dp, &
1697 : t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1698 : local_ab, virtual(ispin), &
1699 823 : 1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array, my_B_size(ispin))
1700 :
1701 : mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjB - 1, my_j + jjB - 1) = &
1702 823 : mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjB - 1, my_j + jjB - 1) + P_ij_diag
1703 : END IF
1704 1792 : DO proc_shift = 1, para_env_sub%num_pe - 1
1705 172 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
1706 172 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
1707 :
1708 172 : CALL get_group_dist(gd_B_virtual(jspin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
1709 172 : CALL get_group_dist(gd_B_virtual(jspin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
1710 :
1711 172 : external_ab(1:virtual(ispin), 1:rec_B_size) => buffer_1D(1:INT(virtual(ispin), int_8)*rec_B_size)
1712 35072 : external_ab = 0.0_dp
1713 :
1714 : CALL para_env_sub%sendrecv(local_ab, proc_send, &
1715 172 : external_ab, proc_receive)
1716 :
1717 172 : IF (.NOT. (alpha_beta)) THEN
1718 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), rec_B_size, virtual(ispin), 1.0_dp, &
1719 : t_ab, virtual(ispin), external_ab, virtual(ispin), &
1720 102 : 1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_B_virtual_start:rec_B_virtual_end), my_B_size(ispin))
1721 : ELSE
1722 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(jspin), rec_B_size, virtual(ispin), mp2_env%scale_S, &
1723 : local_ab, virtual(ispin), external_ab, virtual(ispin), &
1724 : 1.0_dp, mp2_env%ri_grad%P_ab(jspin)%array(:, rec_B_virtual_start:rec_B_virtual_end), &
1725 70 : my_B_size(jspin))
1726 :
1727 : ! For alpha-beta part of alpha-density we need a new parallel code
1728 : ! And new external_ab (of a different size)
1729 70 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
1730 70 : CALL get_group_dist(gd_B_virtual(ispin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
1731 70 : external_ab(1:virtual(jspin), 1:rec_B_size) => buffer_1D(1:INT(virtual(jspin), int_8)*rec_B_size)
1732 14700 : external_ab = 0.0_dp
1733 : CALL para_env_sub%sendrecv(local_ba, proc_send, &
1734 70 : external_ab, proc_receive)
1735 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), rec_B_size, virtual(jspin), mp2_env%scale_S, &
1736 : local_ba, virtual(jspin), external_ab, virtual(jspin), &
1737 70 : 1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_B_virtual_start:rec_B_virtual_end), my_B_size(ispin))
1738 : END IF
1739 :
1740 1964 : IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
1741 : external_ab(1:my_B_size(ispin), 1:virtual(ispin)) => &
1742 86 : buffer_1D(1:INT(virtual(ispin), int_8)*my_B_size(ispin))
1743 18083 : external_ab = 0.0_dp
1744 :
1745 86 : offset = INT(virtual(ispin), int_8)*my_B_size(ispin)
1746 :
1747 86 : send_ab(1:send_B_size, 1:virtual(ispin)) => buffer_1D(offset + 1:offset + INT(send_B_size, int_8)*virtual(ispin))
1748 18083 : send_ab = 0.0_dp
1749 :
1750 : CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, virtual(ispin), my_B_size(ispin), 1.0_dp, &
1751 : t_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
1752 86 : local_ab, virtual(ispin), 0.0_dp, send_ab, send_B_size)
1753 : CALL para_env_sub%sendrecv(send_ab, proc_send, &
1754 86 : external_ab, proc_receive)
1755 :
1756 18083 : mp2_env%ri_grad%P_ab(ispin)%array(:, :) = mp2_env%ri_grad%P_ab(ispin)%array + external_ab
1757 : END IF
1758 :
1759 : END DO
1760 1620 : IF (.NOT. alpha_beta) THEN
1761 1118 : IF (my_i /= my_j) THEN
1762 823 : CALL dgemm_counter_stop(dgemm_counter, 2*my_B_size(ispin), virtual(ispin), virtual(ispin))
1763 : ELSE
1764 295 : CALL dgemm_counter_stop(dgemm_counter, my_B_size(ispin), virtual(ispin), virtual(ispin))
1765 : END IF
1766 : ELSE
1767 1506 : CALL dgemm_counter_stop(dgemm_counter, SUM(my_B_size), virtual(ispin), virtual(jspin))
1768 : END IF
1769 1620 : CALL timestop(handle)
1770 :
1771 : ! Now, Gamma_P_ia (made of Y_ia_P)
1772 :
1773 1620 : CALL timeset(routineN//"_Gamma", handle)
1774 1620 : CALL dgemm_counter_start(dgemm_counter)
1775 1620 : IF (.NOT. alpha_beta) THEN
1776 : ! Alpha-alpha, beta-beta and closed shell
1777 : CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_B_size(ispin), dimen_RI, my_B_size(ispin), 1.0_dp, &
1778 : t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1779 1118 : my_local_j_aL, dimen_RI, 1.0_dp, Y_i_aP, my_B_size(ispin))
1780 : ELSE ! Alpha-beta
1781 : CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_B_size(ispin), dimen_RI, my_B_size(jspin), mp2_env%scale_S, &
1782 : local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1783 502 : my_local_j_aL, dimen_RI, 1.0_dp, Y_i_aP, my_B_size(ispin))
1784 : CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_B_size(jspin), dimen_RI, my_B_size(ispin), mp2_env%scale_S, &
1785 : local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1786 502 : my_local_i_aL, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(jspin))
1787 : END IF
1788 :
1789 1620 : IF (para_env_sub%num_pe > 1) THEN
1790 172 : external_ab(1:my_B_size(ispin), 1:dimen_RI) => buffer_1D(1:INT(my_B_size(ispin), int_8)*dimen_RI)
1791 189692 : external_ab = 0.0_dp
1792 :
1793 172 : offset = INT(my_B_size(ispin), int_8)*dimen_RI
1794 : END IF
1795 : !
1796 1792 : DO proc_shift = 1, para_env_sub%num_pe - 1
1797 172 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
1798 172 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
1799 :
1800 172 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
1801 172 : CALL get_group_dist(gd_B_virtual(ispin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
1802 :
1803 172 : send_ab(1:send_B_size, 1:dimen_RI) => buffer_1D(offset + 1:offset + INT(dimen_RI, int_8)*send_B_size)
1804 189692 : send_ab = 0.0_dp
1805 1964 : IF (.NOT. alpha_beta) THEN
1806 : CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, dimen_RI, my_B_size(ispin), 1.0_dp, &
1807 : t_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
1808 102 : my_local_j_aL, dimen_RI, 0.0_dp, send_ab, send_B_size)
1809 102 : CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
1810 :
1811 217544 : Y_i_aP(:, :) = Y_i_aP + external_ab
1812 :
1813 : ELSE ! Alpha-beta case
1814 : ! Alpha-alpha part
1815 : CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, dimen_RI, my_B_size(jspin), mp2_env%scale_S, &
1816 : local_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
1817 70 : my_local_j_aL, dimen_RI, 0.0_dp, send_ab, send_B_size)
1818 70 : CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
1819 161840 : Y_i_aP(:, :) = Y_i_aP + external_ab
1820 : END IF
1821 : END DO
1822 :
1823 1620 : IF (alpha_beta) THEN
1824 : ! For beta-beta part (in alpha-beta case) we need a new parallel code
1825 502 : IF (para_env_sub%num_pe > 1) THEN
1826 70 : external_ab(1:my_B_size(jspin), 1:dimen_RI) => buffer_1D(1:INT(my_B_size(jspin), int_8)*dimen_RI)
1827 88620 : external_ab = 0.0_dp
1828 :
1829 70 : offset = INT(my_B_size(jspin), int_8)*dimen_RI
1830 : END IF
1831 572 : DO proc_shift = 1, para_env_sub%num_pe - 1
1832 70 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
1833 70 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
1834 :
1835 70 : CALL get_group_dist(gd_B_virtual(jspin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
1836 70 : send_ab(1:send_B_size, 1:dimen_RI) => buffer_1D(offset + 1:offset + INT(dimen_RI, int_8)*send_B_size)
1837 88620 : send_ab = 0.0_dp
1838 : CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, dimen_RI, my_B_size(ispin), mp2_env%scale_S, &
1839 : local_ba(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
1840 70 : my_local_i_aL, dimen_RI, 0.0_dp, send_ab, send_B_size)
1841 70 : CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
1842 177812 : Y_j_aP(:, :) = Y_j_aP + external_ab
1843 :
1844 : END DO
1845 :
1846 : ! Here, we just use approximate bounds. For large systems virtual(ispin) is approx virtual(jspin), same for B_size
1847 502 : CALL dgemm_counter_stop(dgemm_counter, 3*virtual(ispin), dimen_RI, my_B_size(jspin))
1848 : ELSE
1849 1118 : CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), dimen_RI, my_B_size(ispin))
1850 : END IF
1851 :
1852 1620 : IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
1853 : ! Alpha-alpha, beta-beta and closed shell
1854 823 : CALL dgemm_counter_start(dgemm_counter)
1855 : CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_B_size(ispin), dimen_RI, my_B_size(ispin), 1.0_dp, &
1856 : t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1857 823 : my_local_i_aL, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(ispin))
1858 909 : DO proc_shift = 1, para_env_sub%num_pe - 1
1859 86 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
1860 86 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
1861 :
1862 86 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
1863 :
1864 86 : external_ab(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
1865 86837 : external_ab = 0.0_dp
1866 :
1867 : CALL para_env_sub%sendrecv(my_local_i_aL, proc_send, &
1868 86 : external_ab, proc_receive)
1869 :
1870 : ! Alpha-alpha, beta-beta and closed shell
1871 : CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_B_size(ispin), dimen_RI, rec_B_size, 1.0_dp, &
1872 : t_ab(rec_B_virtual_start:rec_B_virtual_end, :), rec_B_size, &
1873 995 : external_ab, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(ispin))
1874 : END DO
1875 :
1876 823 : CALL dgemm_counter_stop(dgemm_counter, my_B_size(ispin), dimen_RI, virtual(ispin))
1877 : END IF
1878 :
1879 1620 : CALL timestop(handle)
1880 1620 : END SUBROUTINE mp2_update_P_gamma
1881 :
1882 : ! **************************************************************************************************
1883 : !> \brief ...
1884 : !> \param Gamma_P_ia ...
1885 : !> \param ij_index ...
1886 : !> \param my_B_size ...
1887 : !> \param my_block_size ...
1888 : !> \param my_group_L_size ...
1889 : !> \param my_i ...
1890 : !> \param my_ij_pairs ...
1891 : !> \param ngroup ...
1892 : !> \param num_integ_group ...
1893 : !> \param integ_group_pos2color_sub ...
1894 : !> \param num_ij_pairs ...
1895 : !> \param ij_map ...
1896 : !> \param ranges_info_array ...
1897 : !> \param Y_i_aP ...
1898 : !> \param comm_exchange ...
1899 : !> \param sizes_array ...
1900 : !> \param spin ...
1901 : !> \param buffer_1D ...
1902 : ! **************************************************************************************************
1903 3234 : SUBROUTINE mp2_redistribute_gamma(Gamma_P_ia, ij_index, my_B_size, &
1904 : my_block_size, my_group_L_size, my_i, my_ij_pairs, ngroup, &
1905 : num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
1906 3234 : ij_map, ranges_info_array, Y_i_aP, comm_exchange, &
1907 3234 : sizes_array, spin, buffer_1D)
1908 :
1909 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT) :: Gamma_P_ia
1910 : INTEGER, INTENT(IN) :: ij_index, my_B_size, my_block_size, &
1911 : my_group_L_size, my_i, my_ij_pairs, &
1912 : ngroup, num_integ_group
1913 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: integ_group_pos2color_sub, num_ij_pairs
1914 : INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(IN) :: ij_map
1915 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
1916 : INTENT(IN) :: ranges_info_array
1917 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN) :: Y_i_aP
1918 : TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
1919 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: sizes_array
1920 : INTEGER, INTENT(IN) :: spin
1921 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1D
1922 :
1923 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_redistribute_gamma'
1924 :
1925 : INTEGER :: end_point, handle, handle2, iiB, ij_counter_rec, irep, kkk, lll, Lstart_pos, &
1926 : proc_receive, proc_send, proc_shift, rec_i, rec_ij_index, send_L_size, start_point, tag
1927 : INTEGER(KIND=int_8) :: offset
1928 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :, :), &
1929 3234 : POINTER :: BI_C_rec, BI_C_send
1930 :
1931 : ! In alpha-beta case Y_i_aP_beta is sent as Y_j_aP
1932 :
1933 3234 : CALL timeset(routineN//"_comm2", handle)
1934 :
1935 3234 : tag = 43
1936 :
1937 3234 : IF (ij_index <= my_ij_pairs) THEN
1938 : ! somethig to send
1939 : ! start with myself
1940 3216 : CALL timeset(routineN//"_comm2_w", handle2)
1941 9230 : DO irep = 0, num_integ_group - 1
1942 6014 : Lstart_pos = ranges_info_array(1, irep, comm_exchange%mepos)
1943 6014 : start_point = ranges_info_array(3, irep, comm_exchange%mepos)
1944 6014 : end_point = ranges_info_array(4, irep, comm_exchange%mepos)
1945 : !$OMP PARALLEL DO DEFAULT(NONE) &
1946 : !$OMP PRIVATE(kkk,lll,iiB) &
1947 : !$OMP SHARED(start_point,end_point,Lstart_pos,my_block_size,&
1948 9230 : !$OMP Gamma_P_ia,my_i,my_B_size,Y_i_aP)
1949 : DO kkk = start_point, end_point
1950 : lll = kkk - start_point + Lstart_pos
1951 : DO iiB = 1, my_block_size
1952 : Gamma_P_ia(1:my_B_size, my_i + iiB - 1, kkk) = &
1953 : Gamma_P_ia(1:my_B_size, my_i + iiB - 1, kkk) + &
1954 : Y_i_aP(1:my_B_size, lll, iiB)
1955 : END DO
1956 : END DO
1957 : !$OMP END PARALLEL DO
1958 : END DO
1959 3216 : CALL timestop(handle2)
1960 :
1961 : ! Y_i_aP(my_B_size,dimen_RI,block_size)
1962 :
1963 3314 : DO proc_shift = 1, comm_exchange%num_pe - 1
1964 98 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
1965 98 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
1966 :
1967 98 : send_L_size = sizes_array(proc_send)
1968 : BI_C_send(1:my_B_size, 1:my_block_size, 1:send_L_size) => &
1969 98 : buffer_1D(1:INT(my_B_size, int_8)*my_block_size*send_L_size)
1970 :
1971 98 : offset = INT(my_B_size, int_8)*my_block_size*send_L_size
1972 :
1973 98 : CALL timeset(routineN//"_comm2_w", handle2)
1974 48692 : BI_C_send = 0.0_dp
1975 196 : DO irep = 0, num_integ_group - 1
1976 98 : Lstart_pos = ranges_info_array(1, irep, proc_send)
1977 98 : start_point = ranges_info_array(3, irep, proc_send)
1978 98 : end_point = ranges_info_array(4, irep, proc_send)
1979 : !$OMP PARALLEL DO DEFAULT(NONE) &
1980 : !$OMP PRIVATE(kkk,lll,iiB) &
1981 : !$OMP SHARED(start_point,end_point,Lstart_pos,my_block_size,&
1982 196 : !$OMP BI_C_send,my_B_size,Y_i_aP)
1983 : DO kkk = start_point, end_point
1984 : lll = kkk - start_point + Lstart_pos
1985 : DO iiB = 1, my_block_size
1986 : BI_C_send(1:my_B_size, iiB, kkk) = Y_i_aP(1:my_B_size, lll, iiB)
1987 : END DO
1988 : END DO
1989 : !$OMP END PARALLEL DO
1990 : END DO
1991 98 : CALL timestop(handle2)
1992 :
1993 98 : rec_ij_index = num_ij_pairs(proc_receive)
1994 :
1995 3412 : IF (ij_index <= rec_ij_index) THEN
1996 : ! we know that proc_receive has something to send for us, let's see what
1997 : ij_counter_rec = &
1998 80 : (ij_index - MIN(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)
1999 :
2000 80 : rec_i = ij_map(spin, ij_counter_rec)
2001 :
2002 : BI_C_rec(1:my_B_size, 1:my_block_size, 1:my_group_L_size) => &
2003 80 : buffer_1D(offset + 1:offset + INT(my_B_size, int_8)*my_block_size*my_group_L_size)
2004 44250 : BI_C_rec = 0.0_dp
2005 :
2006 : CALL comm_exchange%sendrecv(BI_C_send, proc_send, &
2007 80 : BI_C_rec, proc_receive, tag)
2008 :
2009 80 : CALL timeset(routineN//"_comm2_w", handle2)
2010 160 : DO irep = 0, num_integ_group - 1
2011 80 : start_point = ranges_info_array(3, irep, comm_exchange%mepos)
2012 80 : end_point = ranges_info_array(4, irep, comm_exchange%mepos)
2013 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
2014 : !$OMP SHARED(start_point,end_point,my_block_size,&
2015 160 : !$OMP Gamma_P_ia,rec_i,iiB,my_B_size,BI_C_rec)
2016 : Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
2017 : Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
2018 : BI_C_rec(1:my_B_size, :, start_point:end_point)
2019 : !$OMP END PARALLEL WORKSHARE
2020 : END DO
2021 80 : CALL timestop(handle2)
2022 :
2023 : ELSE
2024 : ! we have something to send but nothing to receive
2025 18 : CALL comm_exchange%send(BI_C_send, proc_send, tag)
2026 :
2027 : END IF
2028 :
2029 : END DO
2030 :
2031 : ELSE
2032 : ! noting to send check if we have to receive
2033 36 : DO proc_shift = 1, comm_exchange%num_pe - 1
2034 18 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
2035 18 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
2036 18 : rec_ij_index = num_ij_pairs(proc_receive)
2037 :
2038 36 : IF (ij_index <= rec_ij_index) THEN
2039 : ! we know that proc_receive has something to send for us, let's see what
2040 : ij_counter_rec = &
2041 18 : (ij_index - MIN(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)
2042 :
2043 18 : rec_i = ij_map(spin, ij_counter_rec)
2044 :
2045 : BI_C_rec(1:my_B_size, 1:my_block_size, 1:my_group_L_size) => &
2046 18 : buffer_1D(1:INT(my_B_size, int_8)*my_block_size*my_group_L_size)
2047 :
2048 4442 : BI_C_rec = 0.0_dp
2049 :
2050 18 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
2051 :
2052 18 : CALL timeset(routineN//"_comm2_w", handle2)
2053 36 : DO irep = 0, num_integ_group - 1
2054 18 : start_point = ranges_info_array(3, irep, comm_exchange%mepos)
2055 18 : end_point = ranges_info_array(4, irep, comm_exchange%mepos)
2056 : #if !defined(__INTEL_LLVM_COMPILER) || (20250000 <= __INTEL_LLVM_COMPILER)
2057 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
2058 : !$OMP SHARED(start_point,end_point,my_block_size,&
2059 36 : !$OMP Gamma_P_ia,rec_i,my_B_size,BI_C_rec)
2060 : #endif
2061 : Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
2062 : Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
2063 : BI_C_rec(1:my_B_size, :, start_point:end_point)
2064 : #if !defined(__INTEL_LLVM_COMPILER) || (20250000 <= __INTEL_LLVM_COMPILER)
2065 : !$OMP END PARALLEL WORKSHARE
2066 : #endif
2067 : END DO
2068 18 : CALL timestop(handle2)
2069 :
2070 : END IF
2071 : END DO
2072 :
2073 : END IF
2074 3234 : CALL timestop(handle)
2075 :
2076 3234 : END SUBROUTINE mp2_redistribute_gamma
2077 :
2078 : ! **************************************************************************************************
2079 : !> \brief ...
2080 : !> \param mp2_env ...
2081 : !> \param Eigenval ...
2082 : !> \param homo ...
2083 : !> \param virtual ...
2084 : !> \param open_shell ...
2085 : !> \param beta_beta ...
2086 : !> \param Bib_C ...
2087 : !> \param unit_nr ...
2088 : !> \param dimen_RI ...
2089 : !> \param my_B_size ...
2090 : !> \param ngroup ...
2091 : !> \param my_group_L_size ...
2092 : !> \param color_sub ...
2093 : !> \param ranges_info_array ...
2094 : !> \param comm_exchange ...
2095 : !> \param para_env_sub ...
2096 : !> \param para_env ...
2097 : !> \param my_B_virtual_start ...
2098 : !> \param my_B_virtual_end ...
2099 : !> \param sizes_array ...
2100 : !> \param gd_B_virtual ...
2101 : !> \param integ_group_pos2color_sub ...
2102 : !> \param dgemm_counter ...
2103 : !> \param buffer_1D ...
2104 : ! **************************************************************************************************
2105 384 : SUBROUTINE quasi_degenerate_P_ij(mp2_env, Eigenval, homo, virtual, open_shell, &
2106 384 : beta_beta, Bib_C, unit_nr, dimen_RI, &
2107 384 : my_B_size, ngroup, my_group_L_size, &
2108 : color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
2109 384 : my_B_virtual_start, my_B_virtual_end, sizes_array, gd_B_virtual, &
2110 384 : integ_group_pos2color_sub, dgemm_counter, buffer_1D)
2111 : TYPE(mp2_type) :: mp2_env
2112 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
2113 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
2114 : LOGICAL, INTENT(IN) :: open_shell, beta_beta
2115 : TYPE(three_dim_real_array), DIMENSION(:), &
2116 : INTENT(IN) :: BIb_C
2117 : INTEGER, INTENT(IN) :: unit_nr, dimen_RI
2118 : INTEGER, DIMENSION(:), INTENT(IN) :: my_B_size
2119 : INTEGER, INTENT(IN) :: ngroup, my_group_L_size, color_sub
2120 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
2121 : INTENT(IN) :: ranges_info_array
2122 : TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
2123 : TYPE(mp_para_env_type), INTENT(IN) :: para_env_sub, para_env
2124 : INTEGER, DIMENSION(:), INTENT(IN) :: my_B_virtual_start, my_B_virtual_end
2125 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: sizes_array
2126 : TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
2127 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: integ_group_pos2color_sub
2128 : TYPE(dgemm_counter_type), INTENT(INOUT) :: dgemm_counter
2129 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1D
2130 :
2131 : CHARACTER(LEN=*), PARAMETER :: routineN = 'quasi_degenerate_P_ij'
2132 :
2133 : INTEGER :: a, a_global, b, b_global, block_size, decil, handle, handle2, ijk_counter, &
2134 : ijk_counter_send, ijk_index, ispin, kkB, kspin, max_block_size, max_ijk, my_i, my_ijk, &
2135 : my_j, my_k, my_last_k(2), my_virtual, nspins, proc_receive, proc_send, proc_shift, &
2136 : rec_B_size, rec_B_virtual_end, rec_B_virtual_start, rec_L_size, send_B_size, &
2137 : send_B_virtual_end, send_B_virtual_start, send_i, send_ijk_index, send_j, send_k, &
2138 : size_B_i, size_B_k, tag, tag2
2139 384 : INTEGER, ALLOCATABLE, DIMENSION(:) :: num_ijk
2140 384 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: ijk_map, send_last_k
2141 : LOGICAL :: alpha_beta, do_recv_i, do_recv_j, &
2142 : do_recv_k, do_send_i, do_send_j, &
2143 : do_send_k
2144 : REAL(KIND=dp) :: amp_fac, P_ij_elem, t_new, t_start
2145 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
2146 384 : TARGET :: local_ab, local_aL_i, local_aL_j, t_ab
2147 384 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: local_aL_k
2148 384 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: BI_C_rec, external_ab, external_aL
2149 384 : REAL(KIND=dp), DIMENSION(:, :, :), POINTER :: BI_C_rec_3D
2150 :
2151 384 : CALL timeset(routineN//"_ij_sing", handle)
2152 :
2153 384 : tag = 44
2154 384 : tag2 = 45
2155 :
2156 384 : nspins = SIZE(BIb_C)
2157 384 : alpha_beta = (nspins == 2)
2158 :
2159 : ! Set amplitude factor
2160 384 : amp_fac = mp2_env%scale_S + mp2_env%scale_T
2161 384 : IF (open_shell) amp_fac = mp2_env%scale_T
2162 :
2163 794 : ALLOCATE (send_last_k(2, comm_exchange%num_pe - 1))
2164 :
2165 : ! Loop(s) over orbital triplets
2166 846 : DO ispin = 1, nspins
2167 462 : size_B_i = my_B_size(ispin)
2168 462 : IF (ispin == 1 .AND. alpha_beta) THEN
2169 : kspin = 2
2170 : ELSE
2171 384 : kspin = 1
2172 : END IF
2173 462 : size_B_k = my_B_size(kspin)
2174 :
2175 : ! Find the number of quasi-degenerate orbitals and orbital triplets
2176 :
2177 : CALL Find_quasi_degenerate_ij(my_ijk, homo(ispin), homo(kspin), Eigenval(:, ispin), mp2_env, ijk_map, unit_nr, ngroup, &
2178 : .NOT. beta_beta .AND. ispin /= 2, comm_exchange, num_ijk, max_ijk, color_sub, &
2179 618 : SIZE(buffer_1D), my_group_L_size, size_B_k, para_env, virtual(ispin), size_B_i)
2180 :
2181 462 : my_virtual = virtual(ispin)
2182 462 : IF (SIZE(ijk_map, 2) > 0) THEN
2183 98 : max_block_size = ijk_map(4, 1)
2184 : ELSE
2185 : max_block_size = 1
2186 : END IF
2187 :
2188 1848 : ALLOCATE (local_aL_i(dimen_RI, size_B_i))
2189 1386 : ALLOCATE (local_aL_j(dimen_RI, size_B_i))
2190 2310 : ALLOCATE (local_aL_k(dimen_RI, size_B_k, max_block_size))
2191 1848 : ALLOCATE (t_ab(my_virtual, size_B_k))
2192 :
2193 1386 : my_last_k = -1
2194 552 : send_last_k = -1
2195 :
2196 462 : t_start = m_walltime()
2197 618 : DO ijk_index = 1, max_ijk
2198 :
2199 : ! Prediction is unreliable if we are in the first step of the loop
2200 156 : IF (unit_nr > 0 .AND. ijk_index > 1) THEN
2201 18 : decil = ijk_index*10/max_ijk
2202 18 : IF (decil /= (ijk_index - 1)*10/max_ijk) THEN
2203 18 : t_new = m_walltime()
2204 18 : t_new = (t_new - t_start)/60.0_dp*(max_ijk - ijk_index + 1)/(ijk_index - 1)
2205 : WRITE (unit_nr, FMT="(T3,A)") "Percentage of finished loop: "// &
2206 18 : cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
2207 18 : CALL m_flush(unit_nr)
2208 : END IF
2209 : END IF
2210 :
2211 618 : IF (ijk_index <= my_ijk) THEN
2212 : ! work to be done
2213 154 : ijk_counter = (ijk_index - MIN(1, color_sub))*ngroup + color_sub
2214 154 : my_i = ijk_map(1, ijk_counter)
2215 154 : my_j = ijk_map(2, ijk_counter)
2216 154 : my_k = ijk_map(3, ijk_counter)
2217 154 : block_size = ijk_map(4, ijk_counter)
2218 :
2219 154 : do_recv_i = (ispin /= kspin) .OR. my_i < my_k .OR. my_i > my_k + block_size - 1
2220 154 : do_recv_j = (ispin /= kspin) .OR. my_j < my_k .OR. my_j > my_k + block_size - 1
2221 154 : do_recv_k = my_k /= my_last_k(1) .OR. my_k + block_size - 1 /= my_last_k(2)
2222 154 : my_last_k(1) = my_k
2223 154 : my_last_k(2) = my_k + block_size - 1
2224 :
2225 131054 : local_aL_i = 0.0_dp
2226 154 : IF (do_recv_i) THEN
2227 : CALL fill_local_i_aL_2D(local_al_i, ranges_info_array(:, :, comm_exchange%mepos), &
2228 125 : BIb_C(ispin)%array(:, :, my_i))
2229 : END IF
2230 :
2231 131054 : local_aL_j = 0.0_dp
2232 154 : IF (do_recv_j) THEN
2233 : CALL fill_local_i_aL_2D(local_al_j, ranges_info_array(:, :, comm_exchange%mepos), &
2234 125 : BIb_C(ispin)%array(:, :, my_j))
2235 : END IF
2236 :
2237 154 : IF (do_recv_k) THEN
2238 224533 : local_aL_k = 0.0_dp
2239 : CALL fill_local_i_aL(local_aL_k(:, :, 1:block_size), ranges_info_array(:, :, comm_exchange%mepos), &
2240 146 : BIb_C(kspin)%array(:, :, my_k:my_k + block_size - 1))
2241 : END IF
2242 :
2243 154 : CALL timeset(routineN//"_comm", handle2)
2244 164 : DO proc_shift = 1, comm_exchange%num_pe - 1
2245 10 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
2246 10 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
2247 :
2248 10 : send_ijk_index = num_ijk(proc_send)
2249 :
2250 10 : rec_L_size = sizes_array(proc_receive)
2251 10 : BI_C_rec(1:rec_L_size, 1:size_B_i) => buffer_1D(1:INT(rec_L_size, KIND=int_8)*size_B_i)
2252 :
2253 10 : do_send_i = .FALSE.
2254 10 : do_send_j = .FALSE.
2255 10 : do_send_k = .FALSE.
2256 10 : IF (ijk_index <= send_ijk_index) THEN
2257 : ! something to send
2258 : ijk_counter_send = (ijk_index - MIN(1, integ_group_pos2color_sub(proc_send)))* &
2259 8 : ngroup + integ_group_pos2color_sub(proc_send)
2260 8 : send_i = ijk_map(1, ijk_counter_send)
2261 8 : send_j = ijk_map(2, ijk_counter_send)
2262 8 : send_k = ijk_map(3, ijk_counter_send)
2263 :
2264 8 : do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
2265 8 : do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
2266 8 : do_send_k = send_k /= send_last_k(1, proc_shift) .OR. send_k + block_size - 1 /= send_last_k(2, proc_shift)
2267 8 : send_last_k(1, proc_shift) = send_k
2268 8 : send_last_k(2, proc_shift) = send_k + block_size - 1
2269 : END IF
2270 :
2271 : ! occupied i
2272 722 : BI_C_rec = 0.0_dp
2273 10 : IF (do_send_i) THEN
2274 6 : IF (do_recv_i) THEN
2275 : CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_i), proc_send, &
2276 288 : BI_C_rec, proc_receive, tag)
2277 : ELSE
2278 2 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i), proc_send, tag)
2279 : END IF
2280 4 : ELSE IF (do_recv_i) THEN
2281 580 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
2282 : END IF
2283 10 : IF (do_recv_i) THEN
2284 8 : CALL fill_local_i_aL_2D(local_al_i, ranges_info_array(:, :, proc_receive), BI_C_rec)
2285 : END IF
2286 :
2287 : ! occupied j
2288 722 : BI_C_rec = 0.0_dp
2289 10 : IF (do_send_j) THEN
2290 8 : IF (do_recv_j) THEN
2291 : CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_j), proc_send, &
2292 576 : BI_C_rec, proc_receive, tag)
2293 : ELSE
2294 0 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_j), proc_send, tag)
2295 : END IF
2296 2 : ELSE IF (do_recv_j) THEN
2297 0 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
2298 : END IF
2299 8 : IF (do_recv_j) THEN
2300 8 : CALL fill_local_i_aL_2D(local_al_j, ranges_info_array(:, :, proc_receive), BI_C_rec)
2301 : END IF
2302 :
2303 : ! occupied k
2304 : BI_C_rec_3D(1:rec_L_size, 1:size_B_k, 1:block_size) => &
2305 10 : buffer_1D(1:INT(rec_L_size, KIND=int_8)*size_B_k*block_size)
2306 10 : IF (do_send_k) THEN
2307 8 : IF (do_recv_k) THEN
2308 : CALL comm_exchange%sendrecv(BIb_C(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, &
2309 726 : BI_C_rec_3D, proc_receive, tag)
2310 : ELSE
2311 0 : CALL comm_exchange%send(BI_C_rec, proc_receive, tag)
2312 : END IF
2313 2 : ELSE IF (do_recv_k) THEN
2314 294 : CALL comm_exchange%recv(BI_C_rec_3D, proc_receive, tag)
2315 : END IF
2316 164 : IF (do_recv_k) THEN
2317 10 : CALL fill_local_i_aL(local_al_k(:, :, 1:block_size), ranges_info_array(:, :, proc_receive), BI_C_rec_3D)
2318 : END IF
2319 : END DO
2320 :
2321 20868 : IF (.NOT. do_recv_i) local_aL_i(:, :) = local_aL_k(:, :, my_i - my_k + 1)
2322 20868 : IF (.NOT. do_recv_j) local_aL_j(:, :) = local_aL_k(:, :, my_j - my_k + 1)
2323 154 : CALL timestop(handle2)
2324 :
2325 : ! expand integrals
2326 376 : DO kkB = 1, block_size
2327 222 : CALL timeset(routineN//"_exp_ik", handle2)
2328 222 : CALL dgemm_counter_start(dgemm_counter)
2329 666 : ALLOCATE (local_ab(my_virtual, size_B_k))
2330 32638 : local_ab = 0.0_dp
2331 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', size_B_i, size_B_k, dimen_RI, 1.0_dp, &
2332 : local_aL_i, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
2333 222 : 0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), 1:size_B_k), size_B_i)
2334 222 : DO proc_shift = 1, para_env_sub%num_pe - 1
2335 0 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2336 0 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2337 :
2338 0 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
2339 :
2340 0 : external_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, KIND=int_8)*rec_B_size)
2341 :
2342 : CALL comm_exchange%sendrecv(local_aL_i, proc_send, &
2343 0 : external_aL, proc_receive, tag)
2344 :
2345 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_B_size, size_B_k, dimen_RI, 1.0_dp, &
2346 : external_aL, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
2347 222 : 0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:size_B_k), rec_B_size)
2348 : END DO
2349 222 : CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_B_k, dimen_RI)
2350 222 : CALL timestop(handle2)
2351 :
2352 : ! Amplitudes
2353 222 : CALL timeset(routineN//"_tab", handle2)
2354 32638 : t_ab = 0.0_dp
2355 : ! Alpha-alpha, beta-beta and closed shell
2356 222 : IF (.NOT. alpha_beta) THEN
2357 1156 : DO b = 1, size_B_k
2358 1038 : b_global = b + my_B_virtual_start(1) - 1
2359 17250 : DO a = 1, my_B_size(1)
2360 16094 : a_global = a + my_B_virtual_start(1) - 1
2361 : t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
2362 : (Eigenval(my_i, 1) + Eigenval(my_k + kkB - 1, 1) &
2363 17132 : - Eigenval(homo(1) + a_global, 1) - Eigenval(homo(1) + b_global, 1))
2364 : END DO
2365 : END DO
2366 : ELSE
2367 1030 : DO b = 1, size_B_k
2368 926 : b_global = b + my_B_virtual_start(kspin) - 1
2369 15388 : DO a = 1, my_B_size(ispin)
2370 14358 : a_global = a + my_B_virtual_start(ispin) - 1
2371 : t_ab(a_global, b) = mp2_env%scale_S*local_ab(a_global, b)/ &
2372 : (Eigenval(my_i, ispin) + Eigenval(my_k + kkB - 1, kspin) &
2373 15284 : - Eigenval(homo(ispin) + a_global, ispin) - Eigenval(homo(kspin) + b_global, kspin))
2374 : END DO
2375 : END DO
2376 : END IF
2377 :
2378 222 : IF (.NOT. alpha_beta) THEN
2379 118 : DO proc_shift = 1, para_env_sub%num_pe - 1
2380 0 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2381 0 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2382 0 : CALL get_group_dist(gd_B_virtual(1), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
2383 0 : CALL get_group_dist(gd_B_virtual(1), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
2384 :
2385 0 : external_ab(1:size_B_i, 1:rec_B_size) => buffer_1D(1:INT(size_B_i, KIND=int_8)*rec_B_size)
2386 : CALL para_env_sub%sendrecv(local_ab(send_B_virtual_start:send_B_virtual_end, 1:size_B_k), proc_send, &
2387 0 : external_ab(1:size_B_i, 1:rec_B_size), proc_receive, tag)
2388 :
2389 118 : DO b = 1, my_B_size(1)
2390 0 : b_global = b + my_B_virtual_start(1) - 1
2391 0 : DO a = 1, rec_B_size
2392 0 : a_global = a + rec_B_virtual_start - 1
2393 : t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
2394 : (Eigenval(my_i, 1) + Eigenval(my_k + kkB - 1, 1) &
2395 0 : - Eigenval(homo(1) + a_global, 1) - Eigenval(homo(1) + b_global, 1))
2396 : END DO
2397 : END DO
2398 : END DO
2399 : END IF
2400 222 : CALL timestop(handle2)
2401 :
2402 : ! Expand the second set of integrals
2403 222 : CALL timeset(routineN//"_exp_jk", handle2)
2404 32638 : local_ab = 0.0_dp
2405 222 : CALL dgemm_counter_start(dgemm_counter)
2406 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', size_B_i, size_B_k, dimen_RI, 1.0_dp, &
2407 : local_aL_j, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
2408 222 : 0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), 1:size_B_k), size_B_i)
2409 222 : DO proc_shift = 1, para_env_sub%num_pe - 1
2410 0 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2411 0 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2412 :
2413 0 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
2414 :
2415 0 : external_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, KIND=int_8)*rec_B_size)
2416 :
2417 : CALL comm_exchange%sendrecv(local_aL_j, proc_send, &
2418 0 : external_aL, proc_receive, tag)
2419 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_B_size, size_B_k, dimen_RI, 1.0_dp, &
2420 : external_aL, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
2421 222 : 0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:size_B_k), rec_B_size)
2422 : END DO
2423 222 : CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_B_k, dimen_RI)
2424 222 : CALL timestop(handle2)
2425 :
2426 222 : CALL timeset(routineN//"_Pij", handle2)
2427 2186 : DO b = 1, size_B_k
2428 1964 : b_global = b + my_B_virtual_start(kspin) - 1
2429 32638 : DO a = 1, my_B_size(ispin)
2430 30452 : a_global = a + my_B_virtual_start(ispin) - 1
2431 : local_ab(a_global, b) = &
2432 : local_ab(a_global, b)/(Eigenval(my_j, ispin) + Eigenval(my_k + kkB - 1, kspin) &
2433 32416 : - Eigenval(homo(ispin) + a_global, ispin) - Eigenval(homo(kspin) + b_global, kspin))
2434 : END DO
2435 : END DO
2436 : !
2437 32638 : P_ij_elem = SUM(local_ab*t_ab)
2438 222 : DEALLOCATE (local_ab)
2439 222 : IF ((.NOT. open_shell) .AND. (.NOT. alpha_beta)) THEN
2440 4 : P_ij_elem = P_ij_elem*2.0_dp
2441 : END IF
2442 222 : IF (beta_beta) THEN
2443 34 : mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) - P_ij_elem
2444 34 : mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) - P_ij_elem
2445 : ELSE
2446 188 : mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) - P_ij_elem
2447 188 : mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) - P_ij_elem
2448 : END IF
2449 1264 : CALL timestop(handle2)
2450 : END DO
2451 : ELSE
2452 2 : CALL timeset(routineN//"_comm", handle2)
2453 : ! no work to be done, possible messeges to be exchanged
2454 4 : DO proc_shift = 1, comm_exchange%num_pe - 1
2455 2 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
2456 2 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
2457 :
2458 2 : send_ijk_index = num_ijk(proc_send)
2459 :
2460 4 : IF (ijk_index <= send_ijk_index) THEN
2461 : ! somethig to send
2462 : ijk_counter_send = (ijk_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
2463 2 : integ_group_pos2color_sub(proc_send)
2464 2 : send_i = ijk_map(1, ijk_counter_send)
2465 2 : send_j = ijk_map(2, ijk_counter_send)
2466 2 : send_k = ijk_map(3, ijk_counter_send)
2467 2 : block_size = ijk_map(4, ijk_counter_send)
2468 :
2469 2 : do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
2470 2 : do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
2471 : ! occupied i
2472 2 : IF (do_send_i) THEN
2473 2 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i), proc_send, tag)
2474 : END IF
2475 : ! occupied j
2476 2 : IF (do_send_j) THEN
2477 0 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_j), proc_send, tag)
2478 : END IF
2479 : ! occupied k
2480 2 : CALL comm_exchange%send(BIb_C(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, tag)
2481 : END IF
2482 :
2483 : END DO ! proc loop
2484 2 : CALL timestop(handle2)
2485 : END IF
2486 : END DO ! ijk_index loop
2487 462 : DEALLOCATE (local_aL_i)
2488 462 : DEALLOCATE (local_aL_j)
2489 462 : DEALLOCATE (local_aL_k)
2490 462 : DEALLOCATE (t_ab)
2491 846 : DEALLOCATE (ijk_map)
2492 : END DO ! over number of loops (ispin)
2493 384 : CALL timestop(handle)
2494 :
2495 768 : END SUBROUTINE Quasi_degenerate_P_ij
2496 :
2497 : ! **************************************************************************************************
2498 : !> \brief ...
2499 : !> \param my_ijk ...
2500 : !> \param homo ...
2501 : !> \param homo_beta ...
2502 : !> \param Eigenval ...
2503 : !> \param mp2_env ...
2504 : !> \param ijk_map ...
2505 : !> \param unit_nr ...
2506 : !> \param ngroup ...
2507 : !> \param do_print_alpha ...
2508 : !> \param comm_exchange ...
2509 : !> \param num_ijk ...
2510 : !> \param max_ijk ...
2511 : !> \param color_sub ...
2512 : !> \param buffer_size ...
2513 : !> \param my_group_L_size ...
2514 : !> \param B_size_k ...
2515 : !> \param para_env ...
2516 : !> \param virtual ...
2517 : !> \param B_size_i ...
2518 : ! **************************************************************************************************
2519 462 : SUBROUTINE Find_quasi_degenerate_ij(my_ijk, homo, homo_beta, Eigenval, mp2_env, ijk_map, unit_nr, ngroup, &
2520 : do_print_alpha, comm_exchange, num_ijk, max_ijk, color_sub, &
2521 : buffer_size, my_group_L_size, B_size_k, para_env, virtual, B_size_i)
2522 :
2523 : INTEGER, INTENT(OUT) :: my_ijk
2524 : INTEGER, INTENT(IN) :: homo, homo_beta
2525 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
2526 : TYPE(mp2_type), INTENT(IN) :: mp2_env
2527 : INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ijk_map
2528 : INTEGER, INTENT(IN) :: unit_nr, ngroup
2529 : LOGICAL, INTENT(IN) :: do_print_alpha
2530 : TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
2531 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT) :: num_ijk
2532 : INTEGER, INTENT(OUT) :: max_ijk
2533 : INTEGER, INTENT(IN) :: color_sub, buffer_size, my_group_L_size, &
2534 : B_size_k
2535 : TYPE(mp_para_env_type), INTENT(IN) :: para_env
2536 : INTEGER, INTENT(IN) :: virtual, B_size_i
2537 :
2538 : INTEGER :: block_size, communication_steps, communication_volume, iib, ij_counter, &
2539 : ijk_counter, jjb, kkb, max_block_size, max_num_k_blocks, min_communication_volume, &
2540 : my_steps, num_k_blocks, num_sing_ij, total_ijk
2541 : INTEGER(KIND=int_8) :: mem
2542 462 : LOGICAL, ALLOCATABLE, DIMENSION(:, :) :: ijk_marker
2543 :
2544 1386 : ALLOCATE (num_ijk(0:comm_exchange%num_pe - 1))
2545 :
2546 462 : num_sing_ij = 0
2547 2092 : DO iiB = 1, homo
2548 : ! diagonal elements already updated
2549 4368 : DO jjB = iiB + 1, homo
2550 2276 : IF (ABS(Eigenval(jjB) - Eigenval(iiB)) < mp2_env%ri_grad%eps_canonical) &
2551 1744 : num_sing_ij = num_sing_ij + 1
2552 : END DO
2553 : END DO
2554 :
2555 462 : IF (unit_nr > 0) THEN
2556 231 : IF (do_print_alpha) THEN
2557 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
2558 153 : "MO_INFO| Number of ij pairs below EPS_CANONICAL:", num_sing_ij
2559 : ELSE
2560 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
2561 78 : "MO_INFO| Number of ij pairs (spin beta) below EPS_CANONICAL:", num_sing_ij
2562 : END IF
2563 : END IF
2564 :
2565 : ! Determine the block size, first guess: use available buffer
2566 462 : max_block_size = buffer_size/(my_group_L_size*B_size_k)
2567 :
2568 : ! Second limit: memory
2569 462 : CALL m_memory(mem)
2570 : ! Convert to number of doubles
2571 462 : mem = mem/8
2572 : ! Remove local_ab (2x) and local_aL_i (2x)
2573 462 : mem = mem - 2_int_8*(virtual*B_size_k + B_size_i*my_group_L_size)
2574 462 : max_block_size = MIN(max_block_size, MAX(1, INT(mem/(my_group_L_size*B_size_k), KIND(max_block_size))))
2575 :
2576 : ! Exchange the limit
2577 462 : CALL para_env%min(max_block_size)
2578 :
2579 : ! Find now the block size which minimizes the communication volume and then the number of communication steps
2580 462 : block_size = 1
2581 462 : min_communication_volume = 3*homo_beta*num_sing_ij
2582 462 : communication_steps = 3*homo_beta*num_sing_ij
2583 1174 : DO iiB = max_block_size, 2, -1
2584 712 : max_num_k_blocks = homo_beta/iiB*num_sing_ij
2585 712 : num_k_blocks = max_num_k_blocks - MOD(max_num_k_blocks, ngroup)
2586 712 : communication_volume = num_k_blocks*(2 + iiB) + 3*(homo_beta*num_sing_ij - iiB*num_k_blocks)
2587 712 : my_steps = num_k_blocks + homo_beta*num_sing_ij - iiB*num_k_blocks
2588 1174 : IF (communication_volume < min_communication_volume) THEN
2589 52 : block_size = iiB
2590 52 : min_communication_volume = communication_volume
2591 52 : communication_steps = my_steps
2592 660 : ELSE IF (communication_volume == min_communication_volume .AND. my_steps < communication_steps) THEN
2593 58 : block_size = iiB
2594 58 : communication_steps = my_steps
2595 : END IF
2596 : END DO
2597 :
2598 462 : IF (unit_nr > 0) THEN
2599 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
2600 231 : "MO_INFO| Block size:", block_size
2601 231 : CALL m_flush(unit_nr)
2602 : END IF
2603 :
2604 : ! Calculate number of large blocks
2605 462 : max_num_k_blocks = homo_beta/block_size*num_sing_ij
2606 462 : num_k_blocks = max_num_k_blocks - MOD(max_num_k_blocks, ngroup)
2607 :
2608 462 : total_ijk = num_k_blocks + homo_beta*num_sing_ij - num_k_blocks*block_size
2609 1022 : ALLOCATE (ijk_map(4, total_ijk))
2610 2002 : ijk_map = 0
2611 1484 : ALLOCATE (ijk_marker(homo_beta, num_sing_ij))
2612 1020 : ijk_marker = .TRUE.
2613 :
2614 462 : my_ijk = 0
2615 462 : ijk_counter = 0
2616 462 : ij_counter = 0
2617 2092 : DO iiB = 1, homo
2618 : ! diagonal elements already updated
2619 4368 : DO jjB = iiB + 1, homo
2620 2276 : IF (ABS(Eigenval(jjB) - Eigenval(iiB)) >= mp2_env%ri_grad%eps_canonical) CYCLE
2621 114 : ij_counter = ij_counter + 1
2622 1880 : DO kkB = 1, homo_beta - MOD(homo_beta, block_size), block_size
2623 172 : IF (ijk_counter + 1 > num_k_blocks) EXIT
2624 136 : ijk_counter = ijk_counter + 1
2625 408 : ijk_marker(kkB:kkB + block_size - 1, ij_counter) = .FALSE.
2626 136 : ijk_map(1, ijk_counter) = iiB
2627 136 : ijk_map(2, ijk_counter) = jjB
2628 136 : ijk_map(3, ijk_counter) = kkB
2629 136 : ijk_map(4, ijk_counter) = block_size
2630 2412 : IF (MOD(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
2631 : END DO
2632 : END DO
2633 : END DO
2634 : ij_counter = 0
2635 2092 : DO iiB = 1, homo
2636 : ! diagonal elements already updated
2637 4368 : DO jjB = iiB + 1, homo
2638 2276 : IF (ABS(Eigenval(jjB) - Eigenval(iiB)) >= mp2_env%ri_grad%eps_canonical) CYCLE
2639 114 : ij_counter = ij_counter + 1
2640 2188 : DO kkB = 1, homo_beta
2641 2720 : IF (ijk_marker(kkB, ij_counter)) THEN
2642 172 : ijk_counter = ijk_counter + 1
2643 172 : ijk_map(1, ijk_counter) = iiB
2644 172 : ijk_map(2, ijk_counter) = jjB
2645 172 : ijk_map(3, ijk_counter) = kkB
2646 172 : ijk_map(4, ijk_counter) = 1
2647 172 : IF (MOD(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
2648 : END IF
2649 : END DO
2650 : END DO
2651 : END DO
2652 :
2653 462 : DEALLOCATE (ijk_marker)
2654 :
2655 462 : CALL comm_exchange%allgather(my_ijk, num_ijk)
2656 954 : max_ijk = MAXVAL(num_ijk)
2657 :
2658 462 : END SUBROUTINE Find_quasi_degenerate_ij
2659 :
2660 : END MODULE mp2_ri_gpw
|