Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief Routines to calculate RI-GPW-MP2 energy using pw
10 : !> \par History
11 : !> 06.2012 created [Mauro Del Ben]
12 : !> 03.2019 Refactored from mp2_ri_gpw [Frederick Stein]
13 : ! **************************************************************************************************
14 : MODULE mp2_ri_gpw
15 : USE ISO_C_BINDING, ONLY: C_NULL_PTR,&
16 : c_associated
17 : USE cp_log_handling, ONLY: cp_to_string
18 : USE dgemm_counter_types, ONLY: dgemm_counter_init,&
19 : dgemm_counter_start,&
20 : dgemm_counter_stop,&
21 : dgemm_counter_type,&
22 : dgemm_counter_write
23 : USE group_dist_types, ONLY: get_group_dist,&
24 : group_dist_d1_type,&
25 : maxsize,&
26 : release_group_dist
27 : USE kinds, ONLY: dp,&
28 : int_8
29 : USE libint_2c_3c, ONLY: compare_potential_types
30 : USE local_gemm_api, ONLY: LOCAL_GEMM_PU_GPU,&
31 : local_gemm,&
32 : local_gemm_create,&
33 : local_gemm_destroy,&
34 : local_gemm_set_op_threshold_gpu
35 : USE machine, ONLY: m_flush,&
36 : m_memory,&
37 : m_walltime
38 : USE message_passing, ONLY: mp_comm_type,&
39 : mp_para_env_type
40 : USE mp2_ri_grad_util, ONLY: complete_gamma
41 : USE mp2_types, ONLY: mp2_type,&
42 : three_dim_real_array
43 :
44 : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num
45 : #include "./base/base_uses.f90"
46 :
47 : IMPLICIT NONE
48 :
49 : PRIVATE
50 :
51 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'mp2_ri_gpw'
52 :
53 : PUBLIC :: mp2_ri_gpw_compute_en
54 :
55 : CONTAINS
56 :
57 : ! **************************************************************************************************
58 : !> \brief ...
59 : !> \param Emp2_Cou ...
60 : !> \param Emp2_EX ...
61 : !> \param Emp2_S ...
62 : !> \param Emp2_T ...
63 : !> \param BIb_C ...
64 : !> \param mp2_env ...
65 : !> \param para_env ...
66 : !> \param para_env_sub ...
67 : !> \param color_sub ...
68 : !> \param gd_array ...
69 : !> \param gd_B_virtual ...
70 : !> \param Eigenval ...
71 : !> \param nmo ...
72 : !> \param homo ...
73 : !> \param dimen_RI ...
74 : !> \param unit_nr ...
75 : !> \param calc_forces ...
76 : !> \param calc_ex ...
77 : ! **************************************************************************************************
78 1038 : SUBROUTINE mp2_ri_gpw_compute_en(Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T, BIb_C, mp2_env, para_env, para_env_sub, color_sub, &
79 346 : gd_array, gd_B_virtual, &
80 346 : Eigenval, nmo, homo, dimen_RI, unit_nr, calc_forces, calc_ex)
81 : REAL(KIND=dp), INTENT(INOUT) :: Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T
82 : TYPE(three_dim_real_array), DIMENSION(:), &
83 : INTENT(INOUT) :: BIb_C
84 : TYPE(mp2_type) :: mp2_env
85 : TYPE(mp_para_env_type), INTENT(IN), POINTER :: para_env, para_env_sub
86 : INTEGER, INTENT(IN) :: color_sub
87 : TYPE(group_dist_d1_type), INTENT(INOUT) :: gd_array
88 : INTEGER, DIMENSION(:), INTENT(IN) :: homo
89 : INTEGER, INTENT(IN) :: nmo
90 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
91 : TYPE(group_dist_d1_type), DIMENSION(SIZE(homo)), &
92 : INTENT(INOUT) :: gd_B_virtual
93 : INTEGER, INTENT(IN) :: dimen_RI, unit_nr
94 : LOGICAL, INTENT(IN) :: calc_forces, calc_ex
95 :
96 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_gpw_compute_en'
97 :
98 : INTEGER :: a, a_global, b, b_global, block_size, decil, end_point, handle, handle2, handle3, &
99 : iiB, ij_counter, ij_counter_send, ij_index, integ_group_size, ispin, jjB, jspin, &
100 : max_ij_pairs, my_block_size, my_group_L_end, my_group_L_size, my_group_L_size_orig, &
101 : my_group_L_start, my_i, my_ij_pairs, my_j, my_new_group_L_size, ngroup, nspins, &
102 : num_integ_group, proc_receive, proc_send, proc_shift, rec_B_size, rec_B_virtual_end, &
103 : rec_B_virtual_start, rec_L_size, send_B_size, send_B_virtual_end, send_B_virtual_start, &
104 : send_i, send_ij_index, send_j, start_point, tag, total_ij_pairs
105 346 : INTEGER, ALLOCATABLE, DIMENSION(:) :: integ_group_pos2color_sub, my_B_size, &
106 346 : my_B_virtual_end, my_B_virtual_start, num_ij_pairs, sizes_array_orig, virtual
107 346 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: ij_map
108 346 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: ranges_info_array
109 : LOGICAL :: my_alpha_beta_case, my_beta_beta_case, &
110 : my_open_shell_SS
111 : REAL(KIND=dp) :: amp_fac, my_Emp2_Cou, my_Emp2_EX, &
112 : sym_fac, t_new, t_start
113 346 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET :: buffer_1D
114 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
115 346 : TARGET :: local_ab, local_ba, t_ab
116 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
117 346 : TARGET :: local_i_aL, local_j_aL, Y_i_aP, Y_j_aP
118 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
119 346 : POINTER :: external_ab, external_i_aL
120 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :, :), &
121 346 : POINTER :: BI_C_rec
122 : TYPE(dgemm_counter_type) :: dgemm_counter
123 : TYPE(mp_comm_type) :: comm_exchange, comm_rep
124 : TYPE(three_dim_real_array), ALLOCATABLE, &
125 346 : DIMENSION(:) :: B_ia_Q
126 :
127 346 : CALL timeset(routineN, handle)
128 :
129 346 : nspins = SIZE(homo)
130 :
131 1038 : ALLOCATE (virtual(nspins))
132 778 : virtual(:) = nmo - homo(:)
133 :
134 2422 : ALLOCATE (my_B_size(nspins), my_B_virtual_start(nspins), my_B_virtual_end(nspins))
135 778 : DO ispin = 1, nspins
136 : CALL get_group_dist(gd_B_virtual(ispin), para_env_sub%mepos, &
137 778 : my_B_virtual_start(ispin), my_B_virtual_end(ispin), my_B_size(ispin))
138 : END DO
139 :
140 346 : CALL get_group_dist(gd_array, color_sub, my_group_L_start, my_group_L_end, my_group_L_size)
141 :
142 346 : CALL dgemm_counter_init(dgemm_counter, unit_nr, mp2_env%ri_mp2%print_dgemm_info)
143 :
144 : ! local_gemm_ctx has a very footprint the first time this routine is
145 : ! called.
146 346 : IF (.NOT. c_associated(mp2_env%local_gemm_ctx)) THEN
147 346 : CALL local_gemm_create(mp2_env%local_gemm_ctx, LOCAL_GEMM_PU_GPU)
148 346 : CALL local_gemm_set_op_threshold_gpu(mp2_env%local_gemm_ctx, 128*128*128*2)
149 : END IF
150 :
151 : CALL mp2_ri_get_integ_group_size( &
152 : mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
153 : homo, dimen_RI, unit_nr, &
154 : integ_group_size, ngroup, &
155 346 : num_integ_group, virtual, calc_forces)
156 :
157 : ! now create a group that contains all the proc that have the same virtual starting point
158 : ! in the integ group
159 : CALL mp2_ri_create_group( &
160 : para_env, para_env_sub, color_sub, &
161 : gd_array%sizes, calc_forces, &
162 : integ_group_size, my_group_L_end, &
163 : my_group_L_size, my_group_L_size_orig, my_group_L_start, my_new_group_L_size, &
164 : integ_group_pos2color_sub, sizes_array_orig, &
165 346 : ranges_info_array, comm_exchange, comm_rep, num_integ_group)
166 :
167 : ! We cannot fix the tag because of the recv routine
168 346 : tag = 42
169 :
170 778 : DO jspin = 1, nspins
171 :
172 : CALL replicate_iaK_2intgroup(BIb_C(jspin)%array, comm_exchange, comm_rep, &
173 : homo(jspin), gd_array%sizes, my_B_size(jspin), &
174 432 : my_group_L_size, ranges_info_array)
175 :
176 1296 : DO ispin = 1, jspin
177 :
178 518 : IF (unit_nr > 0) THEN
179 259 : IF (nspins == 1) THEN
180 130 : WRITE (unit_nr, *) "Start loop run"
181 129 : ELSE IF (ispin == 1 .AND. jspin == 1) THEN
182 43 : WRITE (unit_nr, *) "Start loop run alpha-alpha"
183 86 : ELSE IF (ispin == 1 .AND. jspin == 2) THEN
184 43 : WRITE (unit_nr, *) "Start loop run alpha-beta"
185 43 : ELSE IF (ispin == 2 .AND. jspin == 2) THEN
186 43 : WRITE (unit_nr, *) "Start loop run beta-beta"
187 : END IF
188 259 : CALL m_flush(unit_nr)
189 : END IF
190 :
191 518 : my_open_shell_SS = (nspins == 2) .AND. (ispin == jspin)
192 :
193 : ! t_ab = amp_fac*(:,a|:,b)-(:,b|:,a)
194 : ! If we calculate the gradient we need to distinguish
195 : ! between alpha-alpha and beta-beta cases for UMP2
196 :
197 518 : my_beta_beta_case = .FALSE.
198 518 : my_alpha_beta_case = .FALSE.
199 518 : IF (ispin /= jspin) THEN
200 86 : my_alpha_beta_case = .TRUE.
201 432 : ELSE IF (my_open_shell_SS) THEN
202 172 : IF (ispin == 2) my_beta_beta_case = .TRUE.
203 : END IF
204 :
205 518 : amp_fac = mp2_env%scale_S + mp2_env%scale_T
206 518 : IF (my_alpha_beta_case .OR. my_open_shell_SS) amp_fac = mp2_env%scale_T
207 :
208 : CALL mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_B_size, &
209 518 : my_group_L_size, calc_forces, ispin, jspin, local_ba)
210 :
211 : CALL mp2_ri_get_block_size( &
212 : mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual(ispin:jspin), &
213 : homo(ispin:jspin), virtual(ispin:jspin), dimen_RI, unit_nr, block_size, &
214 518 : ngroup, num_integ_group, my_open_shell_ss, calc_forces, buffer_1D)
215 :
216 : ! *****************************************************************
217 : ! ********** REPLICATION-BLOCKED COMMUNICATION SCHEME ***********
218 : ! *****************************************************************
219 : ! introduce block size, the number of occupied orbitals has to be a
220 : ! multiple of the block size
221 :
222 : ! Calculate the maximum number of ij pairs that have to be computed
223 : ! among groups
224 : CALL mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo(ispin), homo(jspin), &
225 518 : block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_SS, unit_nr)
226 :
227 1554 : ALLOCATE (num_ij_pairs(0:comm_exchange%num_pe - 1))
228 518 : CALL comm_exchange%allgather(my_ij_pairs, num_ij_pairs)
229 :
230 1146 : max_ij_pairs = MAXVAL(num_ij_pairs)
231 :
232 : ! start real stuff
233 : CALL mp2_ri_allocate_blk(dimen_RI, my_B_size, block_size, local_i_aL, &
234 518 : local_j_aL, calc_forces, Y_i_aP, Y_j_aP, ispin, jspin)
235 :
236 518 : CALL timeset(routineN//"_RI_loop", handle2)
237 518 : my_Emp2_Cou = 0.0_dp
238 518 : my_Emp2_EX = 0.0_dp
239 518 : t_start = m_walltime()
240 2509 : DO ij_index = 1, max_ij_pairs
241 :
242 : ! Prediction is unreliable if we are in the first step of the loop
243 1991 : IF (unit_nr > 0 .AND. ij_index > 1) THEN
244 723 : decil = ij_index*10/max_ij_pairs
245 723 : IF (decil /= (ij_index - 1)*10/max_ij_pairs) THEN
246 682 : t_new = m_walltime()
247 682 : t_new = (t_new - t_start)/60.0_dp*(max_ij_pairs - ij_index + 1)/(ij_index - 1)
248 : WRITE (unit_nr, FMT="(T3,A)") "Percentage of finished loop: "// &
249 682 : cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
250 682 : CALL m_flush(unit_nr)
251 : END IF
252 : END IF
253 :
254 1991 : IF (calc_forces) THEN
255 2791144 : Y_i_aP = 0.0_dp
256 2891166 : Y_j_aP = 0.0_dp
257 : END IF
258 :
259 1991 : IF (ij_index <= my_ij_pairs) THEN
260 : ! We have work to do
261 1942 : ij_counter = (ij_index - MIN(1, color_sub))*ngroup + color_sub
262 1942 : my_i = ij_map(1, ij_counter)
263 1942 : my_j = ij_map(2, ij_counter)
264 1942 : my_block_size = ij_map(3, ij_counter)
265 :
266 3277275 : local_i_aL = 0.0_dp
267 : CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
268 1942 : BIb_C(ispin)%array(:, :, my_i:my_i + my_block_size - 1))
269 :
270 3390693 : local_j_aL = 0.0_dp
271 : CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
272 1942 : BIb_C(jspin)%array(:, :, my_j:my_j + my_block_size - 1))
273 :
274 : ! collect data from other proc
275 1942 : CALL timeset(routineN//"_comm", handle3)
276 2051 : DO proc_shift = 1, comm_exchange%num_pe - 1
277 109 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
278 109 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
279 :
280 109 : send_ij_index = num_ij_pairs(proc_send)
281 :
282 109 : CALL get_group_dist(gd_array, proc_receive, sizes=rec_L_size)
283 :
284 2051 : IF (ij_index <= send_ij_index) THEN
285 : ij_counter_send = (ij_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
286 60 : integ_group_pos2color_sub(proc_send)
287 60 : send_i = ij_map(1, ij_counter_send)
288 60 : send_j = ij_map(2, ij_counter_send)
289 :
290 : ! occupied i
291 : BI_C_rec(1:rec_L_size, 1:my_B_size(ispin), 1:my_block_size) => &
292 60 : buffer_1D(1:rec_L_size*my_B_size(ispin)*my_block_size)
293 43353 : BI_C_rec = 0.0_dp
294 : CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
295 60 : proc_send, BI_C_rec, proc_receive, tag)
296 :
297 : CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
298 60 : BI_C_rec(:, 1:my_B_size(ispin), :))
299 :
300 : ! occupied j
301 : BI_C_rec(1:rec_L_size, 1:my_B_size(jspin), 1:my_block_size) => &
302 60 : buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(jspin)*my_block_size)
303 44373 : BI_C_rec = 0.0_dp
304 : CALL comm_exchange%sendrecv(BIb_C(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
305 60 : proc_send, BI_C_rec, proc_receive, tag)
306 :
307 : CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
308 60 : BI_C_rec(:, 1:my_B_size(jspin), :))
309 :
310 : ELSE
311 : ! we send nothing while we know that we have to receive something
312 :
313 : ! occupied i
314 : BI_C_rec(1:rec_L_size, 1:my_B_size(ispin), 1:my_block_size) => &
315 49 : buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(ispin)*my_block_size)
316 9124 : BI_C_rec = 0.0_dp
317 49 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
318 :
319 : CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
320 49 : BI_C_rec(:, 1:my_B_size(ispin), 1:my_block_size))
321 :
322 : ! occupied j
323 : BI_C_rec(1:rec_L_size, 1:my_B_size(jspin), 1:my_block_size) => &
324 49 : buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(jspin)*my_block_size)
325 9124 : BI_C_rec = 0.0_dp
326 49 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
327 :
328 : CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
329 49 : BI_C_rec(:, 1:my_B_size(jspin), 1:my_block_size))
330 :
331 : END IF
332 :
333 : END DO
334 :
335 1942 : CALL timestop(handle3)
336 :
337 : ! loop over the block elements
338 3888 : DO iiB = 1, my_block_size
339 5842 : DO jjB = 1, my_block_size
340 1954 : CALL timeset(routineN//"_expansion", handle3)
341 3900 : ASSOCIATE (my_local_i_aL => local_i_aL(:, :, iiB), my_local_j_aL => local_j_aL(:, :, jjB))
342 :
343 : ! calculate the integrals (ia|jb) strating from my local data ...
344 578764 : local_ab = 0.0_dp
345 1954 : IF ((my_alpha_beta_case) .AND. (calc_forces)) THEN
346 139874 : local_ba = 0.0_dp
347 : END IF
348 1954 : CALL dgemm_counter_start(dgemm_counter)
349 : CALL local_gemm('T', 'N', my_B_size(ispin), my_B_size(jspin), dimen_RI, 1.0_dp, &
350 : my_local_i_aL, dimen_RI, my_local_j_aL, dimen_RI, &
351 : 0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), &
352 1954 : my_B_size(ispin), mp2_env%local_gemm_ctx)
353 : ! Additional integrals only for alpha_beta case and forces
354 1954 : IF (my_alpha_beta_case .AND. calc_forces) THEN
355 : local_ba(my_B_virtual_start(jspin):my_B_virtual_end(jspin), :) = &
356 132909 : TRANSPOSE(local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :))
357 : END IF
358 : ! ... and from the other of my subgroup
359 2228 : DO proc_shift = 1, para_env_sub%num_pe - 1
360 274 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
361 274 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
362 :
363 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, &
364 274 : rec_B_virtual_end, rec_B_size)
365 :
366 274 : external_i_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
367 267944 : external_i_aL = 0.0_dp
368 :
369 : CALL para_env_sub%sendrecv(my_local_i_aL, proc_send, &
370 274 : external_i_aL, proc_receive, tag)
371 :
372 : CALL local_gemm('T', 'N', rec_B_size, my_B_size(jspin), dimen_RI, 1.0_dp, &
373 : external_i_aL, dimen_RI, my_local_j_aL, dimen_RI, &
374 : 0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:my_B_size(jspin)), rec_B_size, &
375 274 : mp2_env%local_gemm_ctx)
376 :
377 : ! Additional integrals only for alpha_beta case and forces
378 2502 : IF (my_alpha_beta_case .AND. calc_forces) THEN
379 :
380 : CALL get_group_dist(gd_B_virtual(jspin), proc_receive, rec_B_virtual_start, &
381 70 : rec_B_virtual_end, rec_B_size)
382 :
383 70 : external_i_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
384 81655 : external_i_aL = 0.0_dp
385 :
386 : CALL para_env_sub%sendrecv(my_local_j_aL, proc_send, &
387 70 : external_i_aL, proc_receive, tag)
388 :
389 : CALL local_gemm('T', 'N', rec_B_size, my_B_size(ispin), dimen_RI, 1.0_dp, &
390 : external_i_aL, dimen_RI, my_local_i_aL, dimen_RI, &
391 : 0.0_dp, local_ba(rec_B_virtual_start:rec_B_virtual_end, 1:my_B_size(ispin)), rec_B_size, &
392 70 : mp2_env%local_gemm_ctx)
393 : END IF
394 : END DO
395 1954 : IF (my_alpha_beta_case .AND. calc_forces) THEN
396 : ! Is just an approximation, but the call does not allow it, it ought to be (virtual_i*B_size_j+virtual_j*B_size_i)*dimen_RI
397 490 : CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_B_size(ispin) + my_B_size(jspin), dimen_RI)
398 : ELSE
399 1464 : CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_B_size(jspin), dimen_RI)
400 : END IF
401 1954 : CALL timestop(handle3)
402 :
403 : !sample peak memory
404 1954 : CALL m_memory()
405 :
406 1954 : CALL timeset(routineN//"_ener", handle3)
407 : ! calculate coulomb only MP2
408 1954 : sym_fac = 2.0_dp
409 1954 : IF (my_i == my_j) sym_fac = 1.0_dp
410 1954 : IF (my_alpha_beta_case) sym_fac = 0.5_dp
411 30429 : DO b = 1, my_B_size(jspin)
412 28475 : b_global = b + my_B_virtual_start(jspin) - 1
413 578764 : DO a = 1, virtual(ispin)
414 : my_Emp2_Cou = my_Emp2_Cou - sym_fac*2.0_dp*local_ab(a, b)**2/ &
415 : (Eigenval(homo(ispin) + a, ispin) + Eigenval(homo(jspin) + b_global, jspin) - &
416 576810 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
417 : END DO
418 : END DO
419 :
420 1954 : IF (calc_ex) THEN
421 : ! contract integrals with orbital energies for exchange MP2 energy
422 : ! starting with local ...
423 323406 : IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) t_ab = 0.0_dp
424 29841 : DO b = 1, my_B_size(ispin)
425 27887 : b_global = b + my_B_virtual_start(ispin) - 1
426 541500 : DO a = 1, my_B_size(ispin)
427 511659 : a_global = a + my_B_virtual_start(ispin) - 1
428 : my_Emp2_Ex = my_Emp2_Ex + sym_fac*local_ab(a_global, b)*local_ab(b_global, a)/ &
429 : (Eigenval(homo(ispin) + a_global, ispin) + Eigenval(homo(ispin) + b_global, ispin) - &
430 511659 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
431 539546 : IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) THEN
432 : t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
433 : (Eigenval(homo(ispin) + a_global, ispin) + &
434 : Eigenval(homo(ispin) + b_global, ispin) - &
435 295758 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
436 : END IF
437 : END DO
438 : END DO
439 : ! ... and then with external data
440 2228 : DO proc_shift = 1, para_env_sub%num_pe - 1
441 274 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
442 274 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
443 :
444 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, &
445 274 : rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
446 : CALL get_group_dist(gd_B_virtual(ispin), proc_send, &
447 274 : send_B_virtual_start, send_B_virtual_end, send_B_size)
448 :
449 : external_ab(1:my_B_size(ispin), 1:rec_B_size) => &
450 274 : buffer_1D(1:INT(rec_B_size, int_8)*my_B_size(ispin))
451 30405 : external_ab = 0.0_dp
452 :
453 : CALL para_env_sub%sendrecv(local_ab(send_B_virtual_start:send_B_virtual_end, 1:my_B_size(ispin)), proc_send, &
454 60536 : external_ab(1:my_B_size(ispin), 1:rec_B_size), proc_receive, tag)
455 :
456 5233 : DO b = 1, my_B_size(ispin)
457 2731 : b_global = b + my_B_virtual_start(ispin) - 1
458 30405 : DO a = 1, rec_B_size
459 27400 : a_global = a + rec_B_virtual_start - 1
460 : my_Emp2_Ex = my_Emp2_Ex + sym_fac*local_ab(a_global, b)*external_ab(b, a)/ &
461 : (Eigenval(homo(ispin) + a_global, ispin) + Eigenval(homo(ispin) + b_global, ispin) - &
462 27400 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
463 27400 : IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) &
464 : t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
465 : (Eigenval(homo(ispin) + a_global, ispin) + &
466 : Eigenval(homo(ispin) + b_global, ispin) - &
467 12311 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
468 : END DO
469 : END DO
470 : END DO
471 : END IF
472 1954 : CALL timestop(handle3)
473 :
474 3908 : IF (calc_forces) THEN
475 : ! update P_ab, Gamma_P_ia
476 : CALL mp2_update_P_gamma(mp2_env, para_env_sub, gd_B_virtual, &
477 : Eigenval, homo, dimen_RI, iiB, jjB, my_B_size, &
478 : my_B_virtual_end, my_B_virtual_start, my_i, my_j, virtual, &
479 : local_ab, t_ab, my_local_i_aL, my_local_j_aL, &
480 : my_open_shell_ss, Y_i_aP(:, :, iiB), Y_j_aP(:, :, jjB), local_ba, &
481 1569 : ispin, jspin, dgemm_counter, buffer_1D)
482 :
483 : END IF
484 :
485 : END ASSOCIATE
486 :
487 : END DO ! jjB
488 : END DO ! iiB
489 :
490 : ELSE
491 : ! We need it later in case of gradients
492 49 : my_block_size = 1
493 :
494 49 : CALL timeset(routineN//"_comm", handle3)
495 : ! No work to do and we know that we have to receive nothing, but send something
496 : ! send data to other proc
497 98 : DO proc_shift = 1, comm_exchange%num_pe - 1
498 49 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
499 49 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
500 :
501 49 : send_ij_index = num_ij_pairs(proc_send)
502 :
503 98 : IF (ij_index <= send_ij_index) THEN
504 : ! something to send
505 : ij_counter_send = (ij_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
506 49 : integ_group_pos2color_sub(proc_send)
507 49 : send_i = ij_map(1, ij_counter_send)
508 49 : send_j = ij_map(2, ij_counter_send)
509 :
510 : ! occupied i
511 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
512 49 : proc_send, tag)
513 : ! occupied j
514 : CALL comm_exchange%send(BIb_C(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
515 49 : proc_send, tag)
516 : END IF
517 : END DO
518 49 : CALL timestop(handle3)
519 : END IF
520 :
521 : ! redistribute gamma
522 2509 : IF (calc_forces) THEN
523 : CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(ispin)%array, ij_index, my_B_size(ispin), &
524 : my_block_size, my_group_L_size, my_i, my_ij_pairs, ngroup, &
525 : num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
526 : ij_map, ranges_info_array, Y_i_aP(:, :, 1:my_block_size), comm_exchange, &
527 1566 : gd_array%sizes, 1, buffer_1D)
528 : CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(jspin)%array, ij_index, my_B_size(jspin), &
529 : my_block_size, my_group_L_size, my_j, my_ij_pairs, ngroup, &
530 : num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
531 : ij_map, ranges_info_array, Y_j_aP(:, :, 1:my_block_size), comm_exchange, &
532 1566 : gd_array%sizes, 2, buffer_1D)
533 : END IF
534 :
535 : END DO
536 518 : CALL timestop(handle2)
537 :
538 518 : DEALLOCATE (local_i_aL)
539 518 : DEALLOCATE (local_j_aL)
540 518 : DEALLOCATE (ij_map)
541 518 : DEALLOCATE (num_ij_pairs)
542 518 : DEALLOCATE (local_ab)
543 :
544 518 : IF (calc_forces) THEN
545 372 : DEALLOCATE (Y_i_aP)
546 372 : DEALLOCATE (Y_j_aP)
547 372 : IF (ALLOCATED(t_ab)) THEN
548 296 : DEALLOCATE (t_ab)
549 : END IF
550 372 : DEALLOCATE (local_ba)
551 :
552 : ! here we check if there are almost degenerate ij
553 : ! pairs and we update P_ij with these contribution.
554 : ! If all pairs are degenerate with each other this step will scale O(N^6),
555 : ! if the number of degenerate pairs scales linearly with the system size
556 : ! this step will scale O(N^5).
557 : ! Start counting the number of almost degenerate ij pairs according
558 : ! to eps_canonical
559 : CALL quasi_degenerate_P_ij( &
560 : mp2_env, Eigenval(:, ispin:jspin), homo(ispin:jspin), virtual(ispin:jspin), my_open_shell_ss, &
561 : my_beta_beta_case, Bib_C(ispin:jspin), unit_nr, dimen_RI, &
562 : my_B_size(ispin:jspin), ngroup, my_group_L_size, &
563 : color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
564 : my_B_virtual_start(ispin:jspin), my_B_virtual_end(ispin:jspin), gd_array%sizes, gd_B_virtual(ispin:jspin), &
565 372 : integ_group_pos2color_sub, dgemm_counter, buffer_1D)
566 :
567 : END IF
568 :
569 518 : DEALLOCATE (buffer_1D)
570 :
571 : ! Dereplicate BIb_C and Gamma_P_ia to save memory
572 : ! These matrices will not be needed in that fashion anymore
573 : ! B_ia_Q will needed later
574 518 : IF (calc_forces .AND. jspin == nspins) THEN
575 1032 : IF (.NOT. ALLOCATED(B_ia_Q)) ALLOCATE (B_ia_Q(nspins))
576 1480 : ALLOCATE (B_ia_Q(ispin)%array(homo(ispin), my_B_size(ispin), my_group_L_size_orig))
577 888463 : B_ia_Q(ispin)%array = 0.0_dp
578 1360 : DO jjB = 1, homo(ispin)
579 16940 : DO iiB = 1, my_B_size(ispin)
580 : B_ia_Q(ispin)%array(jjB, iiB, 1:my_group_L_size_orig) = &
581 709332 : BIb_C(ispin)%array(1:my_group_L_size_orig, iiB, jjB)
582 : END DO
583 : END DO
584 296 : DEALLOCATE (BIb_C(ispin)%array)
585 :
586 : ! sum Gamma and dereplicate
587 1480 : ALLOCATE (BIb_C(ispin)%array(my_B_size(ispin), homo(ispin), my_group_L_size_orig))
588 562 : DO proc_shift = 1, comm_rep%num_pe - 1
589 : ! invert order
590 266 : proc_send = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
591 266 : proc_receive = MODULO(comm_rep%mepos + proc_shift, comm_rep%num_pe)
592 :
593 266 : start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
594 266 : end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)
595 :
596 : CALL comm_rep%sendrecv(mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, start_point:end_point), &
597 266 : proc_send, BIb_C(ispin)%array, proc_receive, tag)
598 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
599 562 : !$OMP SHARED(mp2_env,BIb_C,ispin,homo,my_B_size,my_group_L_size_orig)
600 : mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig) = &
601 : mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig) &
602 : + BIb_C(ispin)%array(:, :, :)
603 : !$OMP END PARALLEL WORKSHARE
604 : END DO
605 :
606 753612 : BIb_C(ispin)%array(:, :, :) = mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig)
607 296 : DEALLOCATE (mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
608 296 : CALL MOVE_ALLOC(BIb_C(ispin)%array, mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
609 222 : ELSE IF (jspin == nspins) THEN
610 136 : DEALLOCATE (BIb_C(ispin)%array)
611 : END IF
612 :
613 518 : CALL para_env%sum(my_Emp2_Cou)
614 518 : CALL para_env%sum(my_Emp2_Ex)
615 :
616 1986 : IF (my_open_shell_SS .OR. my_alpha_beta_case) THEN
617 258 : IF (my_alpha_beta_case) THEN
618 86 : Emp2_S = Emp2_S + my_Emp2_Cou
619 86 : Emp2_Cou = Emp2_Cou + my_Emp2_Cou
620 : ELSE
621 172 : my_Emp2_Cou = my_Emp2_Cou*0.25_dp
622 172 : my_Emp2_EX = my_Emp2_EX*0.5_dp
623 172 : Emp2_T = Emp2_T + my_Emp2_Cou + my_Emp2_EX
624 172 : Emp2_Cou = Emp2_Cou + my_Emp2_Cou
625 172 : Emp2_EX = Emp2_EX + my_Emp2_EX
626 : END IF
627 : ELSE
628 260 : Emp2_Cou = Emp2_Cou + my_Emp2_Cou
629 260 : Emp2_EX = Emp2_EX + my_Emp2_EX
630 : END IF
631 : END DO
632 :
633 : END DO
634 :
635 346 : DEALLOCATE (integ_group_pos2color_sub)
636 346 : DEALLOCATE (ranges_info_array)
637 :
638 346 : CALL comm_exchange%free()
639 346 : CALL comm_rep%free()
640 :
641 346 : IF (calc_forces) THEN
642 : ! recover original information (before replication)
643 220 : DEALLOCATE (gd_array%sizes)
644 220 : iiB = SIZE(sizes_array_orig)
645 660 : ALLOCATE (gd_array%sizes(0:iiB - 1))
646 654 : gd_array%sizes(:) = sizes_array_orig
647 220 : DEALLOCATE (sizes_array_orig)
648 :
649 : ! Remove replication from BIb_C and reorder the matrix
650 220 : my_group_L_size = my_group_L_size_orig
651 :
652 : ! B_ia_Q(ispin)%array will be deallocated inside of complete_gamma
653 516 : DO ispin = 1, nspins
654 : CALL complete_gamma(mp2_env, B_ia_Q(ispin)%array, dimen_RI, homo(ispin), &
655 : virtual(ispin), para_env, para_env_sub, ngroup, &
656 : my_group_L_size, my_group_L_start, my_group_L_end, &
657 : my_B_size(ispin), my_B_virtual_start(ispin), &
658 : gd_array, gd_B_virtual(ispin), &
659 516 : ispin)
660 : END DO
661 516 : DEALLOCATE (B_ia_Q)
662 :
663 43758 : IF (nspins == 1) mp2_env%ri_grad%P_ab(1)%array(:, :) = mp2_env%ri_grad%P_ab(1)%array(:, :)*2.0_dp
664 : BLOCK
665 : TYPE(mp_comm_type) :: comm
666 220 : CALL comm%from_split(para_env, para_env_sub%mepos)
667 516 : DO ispin = 1, nspins
668 : ! P_ab is only replicated over all subgroups
669 296 : CALL comm%sum(mp2_env%ri_grad%P_ab(ispin)%array)
670 : ! P_ij is replicated over all processes
671 516 : CALL para_env%sum(mp2_env%ri_grad%P_ij(ispin)%array)
672 : END DO
673 440 : CALL comm%free()
674 : END BLOCK
675 : END IF
676 :
677 346 : CALL release_group_dist(gd_array)
678 778 : DO ispin = 1, nspins
679 432 : IF (ALLOCATED(BIb_C(ispin)%array)) DEALLOCATE (BIb_C(ispin)%array)
680 778 : CALL release_group_dist(gd_B_virtual(ispin))
681 : END DO
682 :
683 : ! We do not need this matrix later, so deallocate it here to safe memory
684 346 : IF (calc_forces) DEALLOCATE (mp2_env%ri_grad%PQ_half)
685 346 : IF (calc_forces .AND. .NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) &
686 8 : DEALLOCATE (mp2_env%ri_grad%operator_half)
687 :
688 346 : CALL dgemm_counter_write(dgemm_counter, para_env)
689 :
690 : ! release memory allocated by local_gemm when run on GPU. local_gemm_ctx is null on cpu only runs
691 346 : CALL local_gemm_destroy(mp2_env%local_gemm_ctx)
692 346 : mp2_env%local_gemm_ctx = C_NULL_PTR
693 346 : CALL timestop(handle)
694 :
695 1384 : END SUBROUTINE mp2_ri_gpw_compute_en
696 :
697 : ! **************************************************************************************************
698 : !> \brief ...
699 : !> \param local_i_aL ...
700 : !> \param ranges_info_array ...
701 : !> \param BIb_C_rec ...
702 : ! **************************************************************************************************
703 4248 : SUBROUTINE fill_local_i_aL(local_i_aL, ranges_info_array, BIb_C_rec)
704 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT) :: local_i_aL
705 : INTEGER, DIMENSION(:, :), INTENT(IN) :: ranges_info_array
706 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN) :: BIb_C_rec
707 :
708 : CHARACTER(LEN=*), PARAMETER :: routineN = 'fill_local_i_aL'
709 :
710 : INTEGER :: end_point, handle, irep, Lend_pos, &
711 : Lstart_pos, start_point
712 :
713 4248 : CALL timeset(routineN, handle)
714 :
715 11764 : DO irep = 1, SIZE(ranges_info_array, 2)
716 7516 : Lstart_pos = ranges_info_array(1, irep)
717 7516 : Lend_pos = ranges_info_array(2, irep)
718 7516 : start_point = ranges_info_array(3, irep)
719 7516 : end_point = ranges_info_array(4, irep)
720 :
721 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
722 11764 : !$OMP SHARED(BIb_C_rec,local_i_aL,Lstart_pos,Lend_pos,start_point,end_point)
723 : local_i_aL(Lstart_pos:Lend_pos, :, :) = BIb_C_rec(start_point:end_point, :, :)
724 : !$OMP END PARALLEL WORKSHARE
725 : END DO
726 :
727 4248 : CALL timestop(handle)
728 :
729 4248 : END SUBROUTINE fill_local_i_aL
730 :
731 : ! **************************************************************************************************
732 : !> \brief ...
733 : !> \param local_i_aL ...
734 : !> \param ranges_info_array ...
735 : !> \param BIb_C_rec ...
736 : ! **************************************************************************************************
737 250 : SUBROUTINE fill_local_i_aL_2D(local_i_aL, ranges_info_array, BIb_C_rec)
738 : REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: local_i_aL
739 : INTEGER, DIMENSION(:, :), INTENT(IN) :: ranges_info_array
740 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: BIb_C_rec
741 :
742 : CHARACTER(LEN=*), PARAMETER :: routineN = 'fill_local_i_aL_2D'
743 :
744 : INTEGER :: end_point, handle, irep, Lend_pos, &
745 : Lstart_pos, start_point
746 :
747 250 : CALL timeset(routineN, handle)
748 :
749 718 : DO irep = 1, SIZE(ranges_info_array, 2)
750 468 : Lstart_pos = ranges_info_array(1, irep)
751 468 : Lend_pos = ranges_info_array(2, irep)
752 468 : start_point = ranges_info_array(3, irep)
753 468 : end_point = ranges_info_array(4, irep)
754 :
755 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
756 718 : !$OMP SHARED(BIb_C_rec,local_i_aL,Lstart_pos,Lend_pos,start_point,end_point)
757 : local_i_aL(Lstart_pos:Lend_pos, :) = BIb_C_rec(start_point:end_point, :)
758 : !$OMP END PARALLEL WORKSHARE
759 : END DO
760 :
761 250 : CALL timestop(handle)
762 :
763 250 : END SUBROUTINE fill_local_i_aL_2D
764 :
765 : ! **************************************************************************************************
766 : !> \brief ...
767 : !> \param BIb_C ...
768 : !> \param comm_exchange ...
769 : !> \param comm_rep ...
770 : !> \param homo ...
771 : !> \param sizes_array ...
772 : !> \param my_B_size ...
773 : !> \param my_group_L_size ...
774 : !> \param ranges_info_array ...
775 : ! **************************************************************************************************
776 432 : SUBROUTINE replicate_iaK_2intgroup(BIb_C, comm_exchange, comm_rep, homo, sizes_array, my_B_size, &
777 432 : my_group_L_size, ranges_info_array)
778 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
779 : INTENT(INOUT) :: BIb_C
780 : TYPE(mp_comm_type), INTENT(IN) :: comm_exchange, comm_rep
781 : INTEGER, INTENT(IN) :: homo
782 : INTEGER, DIMENSION(:), INTENT(IN) :: sizes_array
783 : INTEGER, INTENT(IN) :: my_B_size, my_group_L_size
784 : INTEGER, DIMENSION(:, 0:, 0:), INTENT(IN) :: ranges_info_array
785 :
786 : CHARACTER(LEN=*), PARAMETER :: routineN = 'replicate_iaK_2intgroup'
787 :
788 : INTEGER :: end_point, handle, max_L_size, &
789 : proc_receive, proc_shift, start_point
790 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: BIb_C_copy
791 432 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :) :: BIb_C_gather
792 :
793 432 : CALL timeset(routineN, handle)
794 :
795 : ! replication scheme using mpi_allgather
796 : ! get the max L size of the
797 968 : max_L_size = MAXVAL(sizes_array)
798 :
799 2160 : ALLOCATE (BIb_C_copy(max_L_size, my_B_size, homo))
800 1625764 : BIb_C_copy = 0.0_dp
801 883318 : BIb_C_copy(1:SIZE(BIb_C, 1), 1:my_B_size, 1:homo) = BIb_C
802 :
803 432 : DEALLOCATE (BIb_C)
804 :
805 2592 : ALLOCATE (BIb_C_gather(max_L_size, my_B_size, homo, 0:comm_rep%num_pe - 1))
806 3128994 : BIb_C_gather = 0.0_dp
807 :
808 432 : CALL comm_rep%allgather(BIb_C_copy, BIb_C_gather)
809 :
810 432 : DEALLOCATE (BIb_C_copy)
811 :
812 2160 : ALLOCATE (BIb_C(my_group_L_size, my_B_size, homo))
813 1625417 : BIb_C = 0.0_dp
814 :
815 : ! reorder data
816 1174 : DO proc_shift = 0, comm_rep%num_pe - 1
817 742 : proc_receive = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
818 :
819 742 : start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
820 742 : end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)
821 :
822 : BIb_C(start_point:end_point, 1:my_B_size, 1:homo) = &
823 1644449 : BIb_C_gather(1:end_point - start_point + 1, 1:my_B_size, 1:homo, proc_receive)
824 :
825 : END DO
826 :
827 432 : DEALLOCATE (BIb_C_gather)
828 :
829 432 : CALL timestop(handle)
830 :
831 432 : END SUBROUTINE replicate_iaK_2intgroup
832 :
833 : ! **************************************************************************************************
834 : !> \brief ...
835 : !> \param local_ab ...
836 : !> \param t_ab ...
837 : !> \param mp2_env ...
838 : !> \param homo ...
839 : !> \param virtual ...
840 : !> \param my_B_size ...
841 : !> \param my_group_L_size ...
842 : !> \param calc_forces ...
843 : !> \param ispin ...
844 : !> \param jspin ...
845 : !> \param local_ba ...
846 : ! **************************************************************************************************
847 518 : SUBROUTINE mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_B_size, &
848 : my_group_L_size, calc_forces, ispin, jspin, local_ba)
849 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
850 : INTENT(OUT) :: local_ab, t_ab
851 : TYPE(mp2_type) :: mp2_env
852 : INTEGER, INTENT(IN) :: homo(2), virtual(2), my_B_size(2), &
853 : my_group_L_size
854 : LOGICAL, INTENT(IN) :: calc_forces
855 : INTEGER, INTENT(IN) :: ispin, jspin
856 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
857 : INTENT(OUT) :: local_ba
858 :
859 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_allocate_no_blk'
860 :
861 : INTEGER :: handle
862 :
863 518 : CALL timeset(routineN, handle)
864 :
865 2072 : ALLOCATE (local_ab(virtual(ispin), my_B_size(jspin)))
866 137025 : local_ab = 0.0_dp
867 :
868 518 : IF (calc_forces) THEN
869 372 : IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ij(jspin)%array)) THEN
870 1184 : ALLOCATE (mp2_env%ri_grad%P_ij(jspin)%array(homo(ispin), homo(ispin)))
871 6068 : mp2_env%ri_grad%P_ij(jspin)%array = 0.0_dp
872 : END IF
873 372 : IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ab(jspin)%array)) THEN
874 1184 : ALLOCATE (mp2_env%ri_grad%P_ab(jspin)%array(my_B_size(jspin), virtual(jspin)))
875 83688 : mp2_env%ri_grad%P_ab(jspin)%array = 0.0_dp
876 : END IF
877 372 : IF (.NOT. ALLOCATED(mp2_env%ri_grad%Gamma_P_ia(jspin)%array)) THEN
878 1480 : ALLOCATE (mp2_env%ri_grad%Gamma_P_ia(jspin)%array(my_B_size(jspin), homo(jspin), my_group_L_size))
879 1442032 : mp2_env%ri_grad%Gamma_P_ia(jspin)%array = 0.0_dp
880 : END IF
881 :
882 372 : IF (ispin == jspin) THEN
883 : ! For non-alpha-beta case we need amplitudes
884 1184 : ALLOCATE (t_ab(virtual(ispin), my_B_size(jspin)))
885 :
886 : ! That is just a dummy. In that way, we can pass it as array to other routines w/o requirement for allocatable array
887 296 : ALLOCATE (local_ba(1, 1))
888 : ELSE
889 : ! We need more integrals
890 304 : ALLOCATE (local_ba(virtual(jspin), my_B_size(ispin)))
891 : END IF
892 : END IF
893 : !
894 :
895 518 : CALL timestop(handle)
896 :
897 518 : END SUBROUTINE mp2_ri_allocate_no_blk
898 :
899 : ! **************************************************************************************************
900 : !> \brief ...
901 : !> \param dimen_RI ...
902 : !> \param my_B_size ...
903 : !> \param block_size ...
904 : !> \param local_i_aL ...
905 : !> \param local_j_aL ...
906 : !> \param calc_forces ...
907 : !> \param Y_i_aP ...
908 : !> \param Y_j_aP ...
909 : !> \param ispin ...
910 : !> \param jspin ...
911 : ! **************************************************************************************************
912 518 : SUBROUTINE mp2_ri_allocate_blk(dimen_RI, my_B_size, block_size, &
913 : local_i_aL, local_j_aL, calc_forces, &
914 : Y_i_aP, Y_j_aP, ispin, jspin)
915 : INTEGER, INTENT(IN) :: dimen_RI, my_B_size(2), block_size
916 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
917 : INTENT(OUT) :: local_i_aL, local_j_aL
918 : LOGICAL, INTENT(IN) :: calc_forces
919 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
920 : INTENT(OUT) :: Y_i_aP, Y_j_aP
921 : INTEGER, INTENT(IN) :: ispin, jspin
922 :
923 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_allocate_blk'
924 :
925 : INTEGER :: handle
926 :
927 518 : CALL timeset(routineN, handle)
928 :
929 2590 : ALLOCATE (local_i_aL(dimen_RI, my_B_size(ispin), block_size))
930 648381 : local_i_aL = 0.0_dp
931 2590 : ALLOCATE (local_j_aL(dimen_RI, my_B_size(jspin), block_size))
932 660833 : local_j_aL = 0.0_dp
933 :
934 518 : IF (calc_forces) THEN
935 1860 : ALLOCATE (Y_i_aP(my_B_size(ispin), dimen_RI, block_size))
936 531690 : Y_i_aP = 0.0_dp
937 : ! For closed-shell, alpha-alpha and beta-beta my_B_size_beta=my_b_size
938 : ! Not for alpha-beta case: Y_j_aP_beta is sent and received as Y_j_aP
939 1860 : ALLOCATE (Y_j_aP(my_B_size(jspin), dimen_RI, block_size))
940 542254 : Y_j_aP = 0.0_dp
941 : END IF
942 : !
943 :
944 518 : CALL timestop(handle)
945 :
946 518 : END SUBROUTINE mp2_ri_allocate_blk
947 :
948 : ! **************************************************************************************************
949 : !> \brief ...
950 : !> \param my_alpha_beta_case ...
951 : !> \param total_ij_pairs ...
952 : !> \param homo ...
953 : !> \param homo_beta ...
954 : !> \param block_size ...
955 : !> \param ngroup ...
956 : !> \param ij_map ...
957 : !> \param color_sub ...
958 : !> \param my_ij_pairs ...
959 : !> \param my_open_shell_SS ...
960 : !> \param unit_nr ...
961 : ! **************************************************************************************************
962 518 : SUBROUTINE mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo, homo_beta, &
963 : block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_SS, unit_nr)
964 : LOGICAL, INTENT(IN) :: my_alpha_beta_case
965 : INTEGER, INTENT(OUT) :: total_ij_pairs
966 : INTEGER, INTENT(IN) :: homo, homo_beta, block_size, ngroup
967 : INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ij_map
968 : INTEGER, INTENT(IN) :: color_sub
969 : INTEGER, INTENT(OUT) :: my_ij_pairs
970 : LOGICAL, INTENT(IN) :: my_open_shell_SS
971 : INTEGER, INTENT(IN) :: unit_nr
972 :
973 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_communication'
974 :
975 : INTEGER :: assigned_blocks, first_I_block, first_J_block, handle, iiB, ij_block_counter, &
976 : ij_counter, jjB, last_i_block, last_J_block, num_block_per_group, num_IJ_blocks, &
977 : num_IJ_blocks_beta, total_ij_block, total_ij_pairs_blocks
978 518 : LOGICAL, ALLOCATABLE, DIMENSION(:, :) :: ij_marker
979 :
980 : ! Calculate the maximum number of ij pairs that have to be computed
981 : ! among groups
982 :
983 518 : CALL timeset(routineN, handle)
984 :
985 518 : IF (.NOT. my_open_shell_ss .AND. .NOT. my_alpha_beta_case) THEN
986 260 : total_ij_pairs = homo*(1 + homo)/2
987 260 : num_IJ_blocks = homo/block_size - 1
988 :
989 260 : first_I_block = 1
990 260 : last_i_block = block_size*(num_IJ_blocks - 1)
991 :
992 260 : first_J_block = block_size + 1
993 260 : last_J_block = block_size*(num_IJ_blocks + 1)
994 :
995 260 : ij_block_counter = 0
996 584 : DO iiB = first_I_block, last_i_block, block_size
997 584 : DO jjB = iiB + block_size, last_J_block, block_size
998 810 : ij_block_counter = ij_block_counter + 1
999 : END DO
1000 : END DO
1001 :
1002 260 : total_ij_block = ij_block_counter
1003 260 : num_block_per_group = total_ij_block/ngroup
1004 260 : assigned_blocks = num_block_per_group*ngroup
1005 :
1006 260 : total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
1007 :
1008 1040 : ALLOCATE (ij_marker(homo, homo))
1009 3892 : ij_marker = .TRUE.
1010 780 : ALLOCATE (ij_map(3, total_ij_pairs_blocks))
1011 7524 : ij_map = 0
1012 260 : ij_counter = 0
1013 260 : my_ij_pairs = 0
1014 584 : DO iiB = first_I_block, last_i_block, block_size
1015 1236 : DO jjB = iiB + block_size, last_J_block, block_size
1016 810 : IF (ij_counter + 1 > assigned_blocks) EXIT
1017 652 : ij_counter = ij_counter + 1
1018 1956 : ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
1019 652 : ij_map(1, ij_counter) = iiB
1020 652 : ij_map(2, ij_counter) = jjB
1021 652 : ij_map(3, ij_counter) = block_size
1022 976 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1023 : END DO
1024 : END DO
1025 1040 : DO iiB = 1, homo
1026 2856 : DO jjB = iiB, homo
1027 2596 : IF (ij_marker(iiB, jjB)) THEN
1028 1164 : ij_counter = ij_counter + 1
1029 1164 : ij_map(1, ij_counter) = iiB
1030 1164 : ij_map(2, ij_counter) = jjB
1031 1164 : ij_map(3, ij_counter) = 1
1032 1164 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1033 : END IF
1034 : END DO
1035 : END DO
1036 260 : DEALLOCATE (ij_marker)
1037 :
1038 258 : ELSE IF (.NOT. my_alpha_beta_case) THEN
1039 : ! THese are the cases alpha/alpha and beta/beta
1040 : ! We do not have to consider the diagonal elements
1041 172 : total_ij_pairs = homo*(homo - 1)/2
1042 172 : num_IJ_blocks = (homo - 1)/block_size - 1
1043 :
1044 172 : first_I_block = 1
1045 172 : last_i_block = block_size*(num_IJ_blocks - 1)
1046 :
1047 : ! We shift the blocks to prevent the calculation of the diagonal elements which always give zero
1048 172 : first_J_block = block_size + 2
1049 172 : last_J_block = block_size*(num_IJ_blocks + 1) + 1
1050 :
1051 172 : ij_block_counter = 0
1052 256 : DO iiB = first_I_block, last_i_block, block_size
1053 256 : DO jjB = iiB + block_size + 1, last_J_block, block_size
1054 196 : ij_block_counter = ij_block_counter + 1
1055 : END DO
1056 : END DO
1057 :
1058 172 : total_ij_block = ij_block_counter
1059 172 : num_block_per_group = total_ij_block/ngroup
1060 172 : assigned_blocks = num_block_per_group*ngroup
1061 :
1062 172 : total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
1063 :
1064 688 : ALLOCATE (ij_marker(homo, homo))
1065 2916 : ij_marker = .TRUE.
1066 516 : ALLOCATE (ij_map(3, total_ij_pairs_blocks))
1067 3276 : ij_map = 0
1068 172 : ij_counter = 0
1069 172 : my_ij_pairs = 0
1070 256 : DO iiB = first_I_block, last_i_block, block_size
1071 448 : DO jjB = iiB + block_size + 1, last_J_block, block_size
1072 196 : IF (ij_counter + 1 > assigned_blocks) EXIT
1073 192 : ij_counter = ij_counter + 1
1074 592 : ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
1075 192 : ij_map(1, ij_counter) = iiB
1076 192 : ij_map(2, ij_counter) = jjB
1077 192 : ij_map(3, ij_counter) = block_size
1078 276 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1079 : END DO
1080 : END DO
1081 756 : DO iiB = 1, homo
1082 1544 : DO jjB = iiB + 1, homo
1083 1372 : IF (ij_marker(iiB, jjB)) THEN
1084 584 : ij_counter = ij_counter + 1
1085 584 : ij_map(1, ij_counter) = iiB
1086 584 : ij_map(2, ij_counter) = jjB
1087 584 : ij_map(3, ij_counter) = 1
1088 584 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1089 : END IF
1090 : END DO
1091 : END DO
1092 172 : DEALLOCATE (ij_marker)
1093 :
1094 : ELSE
1095 86 : total_ij_pairs = homo*homo_beta
1096 86 : num_IJ_blocks = homo/block_size
1097 86 : num_IJ_blocks_beta = homo_beta/block_size
1098 :
1099 86 : first_I_block = 1
1100 86 : last_i_block = block_size*(num_IJ_blocks - 1)
1101 :
1102 86 : first_J_block = 1
1103 86 : last_J_block = block_size*(num_IJ_blocks_beta - 1)
1104 :
1105 86 : ij_block_counter = 0
1106 242 : DO iiB = first_I_block, last_i_block, block_size
1107 242 : DO jjB = first_J_block, last_J_block, block_size
1108 282 : ij_block_counter = ij_block_counter + 1
1109 : END DO
1110 : END DO
1111 :
1112 86 : total_ij_block = ij_block_counter
1113 86 : num_block_per_group = total_ij_block/ngroup
1114 86 : assigned_blocks = num_block_per_group*ngroup
1115 :
1116 86 : total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
1117 :
1118 344 : ALLOCATE (ij_marker(homo, homo_beta))
1119 1364 : ij_marker = .TRUE.
1120 258 : ALLOCATE (ij_map(3, total_ij_pairs_blocks))
1121 4206 : ij_map = 0
1122 86 : ij_counter = 0
1123 86 : my_ij_pairs = 0
1124 242 : DO iiB = first_I_block, last_i_block, block_size
1125 482 : DO jjB = first_J_block, last_J_block, block_size
1126 244 : IF (ij_counter + 1 > assigned_blocks) EXIT
1127 240 : ij_counter = ij_counter + 1
1128 720 : ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
1129 240 : ij_map(1, ij_counter) = iiB
1130 240 : ij_map(2, ij_counter) = jjB
1131 240 : ij_map(3, ij_counter) = block_size
1132 396 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1133 : END DO
1134 : END DO
1135 422 : DO iiB = 1, homo
1136 1452 : DO jjB = 1, homo_beta
1137 1366 : IF (ij_marker(iiB, jjB)) THEN
1138 790 : ij_counter = ij_counter + 1
1139 790 : ij_map(1, ij_counter) = iiB
1140 790 : ij_map(2, ij_counter) = jjB
1141 790 : ij_map(3, ij_counter) = 1
1142 790 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1143 : END IF
1144 : END DO
1145 : END DO
1146 86 : DEALLOCATE (ij_marker)
1147 : END IF
1148 :
1149 518 : IF (unit_nr > 0) THEN
1150 259 : IF (block_size == 1) THEN
1151 : WRITE (UNIT=unit_nr, FMT="(T3,A,T66,F15.1)") &
1152 231 : "RI_INFO| Percentage of ij pairs communicated with block size 1:", 100.0_dp
1153 : ELSE
1154 : WRITE (UNIT=unit_nr, FMT="(T3,A,T66,F15.1)") &
1155 28 : "RI_INFO| Percentage of ij pairs communicated with block size 1:", &
1156 56 : 100.0_dp*REAL((total_ij_pairs - assigned_blocks*(block_size**2)), KIND=dp)/REAL(total_ij_pairs, KIND=dp)
1157 : END IF
1158 259 : CALL m_flush(unit_nr)
1159 : END IF
1160 :
1161 518 : CALL timestop(handle)
1162 :
1163 518 : END SUBROUTINE mp2_ri_communication
1164 :
1165 : ! **************************************************************************************************
1166 : !> \brief ...
1167 : !> \param para_env ...
1168 : !> \param para_env_sub ...
1169 : !> \param color_sub ...
1170 : !> \param sizes_array ...
1171 : !> \param calc_forces ...
1172 : !> \param integ_group_size ...
1173 : !> \param my_group_L_end ...
1174 : !> \param my_group_L_size ...
1175 : !> \param my_group_L_size_orig ...
1176 : !> \param my_group_L_start ...
1177 : !> \param my_new_group_L_size ...
1178 : !> \param integ_group_pos2color_sub ...
1179 : !> \param sizes_array_orig ...
1180 : !> \param ranges_info_array ...
1181 : !> \param comm_exchange ...
1182 : !> \param comm_rep ...
1183 : !> \param num_integ_group ...
1184 : ! **************************************************************************************************
1185 346 : SUBROUTINE mp2_ri_create_group(para_env, para_env_sub, color_sub, &
1186 : sizes_array, calc_forces, &
1187 : integ_group_size, my_group_L_end, &
1188 : my_group_L_size, my_group_L_size_orig, my_group_L_start, my_new_group_L_size, &
1189 : integ_group_pos2color_sub, &
1190 : sizes_array_orig, ranges_info_array, comm_exchange, comm_rep, num_integ_group)
1191 : TYPE(mp_para_env_type), INTENT(IN) :: para_env, para_env_sub
1192 : INTEGER, INTENT(IN) :: color_sub
1193 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(INOUT) :: sizes_array
1194 : LOGICAL, INTENT(IN) :: calc_forces
1195 : INTEGER, INTENT(IN) :: integ_group_size, my_group_L_end
1196 : INTEGER, INTENT(INOUT) :: my_group_L_size
1197 : INTEGER, INTENT(OUT) :: my_group_L_size_orig
1198 : INTEGER, INTENT(IN) :: my_group_L_start
1199 : INTEGER, INTENT(INOUT) :: my_new_group_L_size
1200 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT) :: integ_group_pos2color_sub, &
1201 : sizes_array_orig
1202 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
1203 : INTENT(OUT) :: ranges_info_array
1204 : TYPE(mp_comm_type), INTENT(OUT) :: comm_exchange, comm_rep
1205 : INTEGER, INTENT(IN) :: num_integ_group
1206 :
1207 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_create_group'
1208 :
1209 : INTEGER :: handle, iiB, proc_receive, proc_shift, &
1210 : sub_sub_color
1211 346 : INTEGER, ALLOCATABLE, DIMENSION(:) :: new_sizes_array, rep_ends_array, &
1212 : rep_sizes_array, rep_starts_array
1213 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: my_info
1214 :
1215 346 : CALL timeset(routineN, handle)
1216 : !
1217 346 : sub_sub_color = para_env_sub%mepos*num_integ_group + color_sub/integ_group_size
1218 346 : CALL comm_exchange%from_split(para_env, sub_sub_color)
1219 :
1220 : ! create the replication group
1221 346 : sub_sub_color = para_env_sub%mepos*comm_exchange%num_pe + comm_exchange%mepos
1222 346 : CALL comm_rep%from_split(para_env, sub_sub_color)
1223 :
1224 : ! create the new limits for K according to the size
1225 : ! of the integral group
1226 :
1227 : ! info array for replication
1228 1038 : ALLOCATE (rep_ends_array(0:comm_rep%num_pe - 1))
1229 1038 : ALLOCATE (rep_starts_array(0:comm_rep%num_pe - 1))
1230 1038 : ALLOCATE (rep_sizes_array(0:comm_rep%num_pe - 1))
1231 :
1232 346 : CALL comm_rep%allgather(my_group_L_size, rep_sizes_array)
1233 346 : CALL comm_rep%allgather(my_group_L_start, rep_starts_array)
1234 346 : CALL comm_rep%allgather(my_group_L_end, rep_ends_array)
1235 :
1236 : ! calculate my_new_group_L_size according to sizes_array
1237 346 : my_new_group_L_size = my_group_L_size
1238 :
1239 : ! Info of this process
1240 1038 : ALLOCATE (my_info(4, 0:comm_rep%num_pe - 1))
1241 346 : my_info(1, 0) = my_group_L_start
1242 346 : my_info(2, 0) = my_group_L_end
1243 346 : my_info(3, 0) = 1
1244 346 : my_info(4, 0) = my_group_L_size
1245 :
1246 580 : DO proc_shift = 1, comm_rep%num_pe - 1
1247 234 : proc_receive = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
1248 :
1249 234 : my_new_group_L_size = my_new_group_L_size + rep_sizes_array(proc_receive)
1250 :
1251 234 : my_info(1, proc_shift) = rep_starts_array(proc_receive)
1252 234 : my_info(2, proc_shift) = rep_ends_array(proc_receive)
1253 234 : my_info(3, proc_shift) = my_info(4, proc_shift - 1) + 1
1254 580 : my_info(4, proc_shift) = my_new_group_L_size
1255 :
1256 : END DO
1257 :
1258 1038 : ALLOCATE (new_sizes_array(0:comm_exchange%num_pe - 1))
1259 1384 : ALLOCATE (ranges_info_array(4, 0:comm_rep%num_pe - 1, 0:comm_exchange%num_pe - 1))
1260 346 : CALL comm_exchange%allgather(my_new_group_L_size, new_sizes_array)
1261 346 : CALL comm_exchange%allgather(my_info, ranges_info_array)
1262 :
1263 346 : DEALLOCATE (rep_sizes_array)
1264 346 : DEALLOCATE (rep_starts_array)
1265 346 : DEALLOCATE (rep_ends_array)
1266 :
1267 1038 : ALLOCATE (integ_group_pos2color_sub(0:comm_exchange%num_pe - 1))
1268 346 : CALL comm_exchange%allgather(color_sub, integ_group_pos2color_sub)
1269 :
1270 346 : IF (calc_forces) THEN
1271 220 : iiB = SIZE(sizes_array)
1272 660 : ALLOCATE (sizes_array_orig(0:iiB - 1))
1273 654 : sizes_array_orig(:) = sizes_array
1274 : END IF
1275 :
1276 346 : my_group_L_size_orig = my_group_L_size
1277 346 : my_group_L_size = my_new_group_L_size
1278 346 : DEALLOCATE (sizes_array)
1279 :
1280 1038 : ALLOCATE (sizes_array(0:integ_group_size - 1))
1281 790 : sizes_array(:) = new_sizes_array
1282 :
1283 346 : DEALLOCATE (new_sizes_array)
1284 : !
1285 346 : CALL timestop(handle)
1286 :
1287 692 : END SUBROUTINE mp2_ri_create_group
1288 :
1289 : ! **************************************************************************************************
1290 : !> \brief ...
1291 : !> \param mp2_env ...
1292 : !> \param para_env ...
1293 : !> \param para_env_sub ...
1294 : !> \param gd_array ...
1295 : !> \param gd_B_virtual ...
1296 : !> \param homo ...
1297 : !> \param dimen_RI ...
1298 : !> \param unit_nr ...
1299 : !> \param integ_group_size ...
1300 : !> \param ngroup ...
1301 : !> \param num_integ_group ...
1302 : !> \param virtual ...
1303 : !> \param calc_forces ...
1304 : ! **************************************************************************************************
1305 1384 : SUBROUTINE mp2_ri_get_integ_group_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
1306 346 : homo, dimen_RI, unit_nr, &
1307 : integ_group_size, &
1308 : ngroup, num_integ_group, &
1309 346 : virtual, calc_forces)
1310 : TYPE(mp2_type) :: mp2_env
1311 : TYPE(mp_para_env_type), INTENT(IN) :: para_env, para_env_sub
1312 : TYPE(group_dist_d1_type), INTENT(IN) :: gd_array
1313 : TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
1314 : INTEGER, DIMENSION(:), INTENT(IN) :: homo
1315 : INTEGER, INTENT(IN) :: dimen_RI, unit_nr
1316 : INTEGER, INTENT(OUT) :: integ_group_size, ngroup, num_integ_group
1317 : INTEGER, DIMENSION(:), INTENT(IN) :: virtual
1318 : LOGICAL, INTENT(IN) :: calc_forces
1319 :
1320 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_get_integ_group_size'
1321 :
1322 : INTEGER :: block_size, handle, iiB, &
1323 : max_repl_group_size, &
1324 : min_integ_group_size
1325 : INTEGER(KIND=int_8) :: mem
1326 : LOGICAL :: calc_group_size
1327 : REAL(KIND=dp) :: factor, mem_base, mem_min, mem_per_blk, &
1328 : mem_per_repl, mem_per_repl_blk, &
1329 : mem_real
1330 :
1331 346 : CALL timeset(routineN, handle)
1332 :
1333 346 : ngroup = para_env%num_pe/para_env_sub%num_pe
1334 :
1335 346 : calc_group_size = mp2_env%ri_mp2%number_integration_groups <= 0
1336 346 : IF (.NOT. calc_group_size) THEN
1337 10 : IF (MOD(ngroup, mp2_env%ri_mp2%number_integration_groups) /= 0) calc_group_size = .TRUE.
1338 : END IF
1339 :
1340 346 : IF (calc_group_size) THEN
1341 336 : CALL m_memory(mem)
1342 336 : mem_real = (mem + 1024*1024 - 1)/(1024*1024)
1343 336 : CALL para_env%min(mem_real)
1344 :
1345 336 : mem_base = 0.0_dp
1346 336 : mem_per_blk = 0.0_dp
1347 336 : mem_per_repl = 0.0_dp
1348 336 : mem_per_repl_blk = 0.0_dp
1349 :
1350 : ! BIB_C_copy
1351 : mem_per_repl = mem_per_repl + MAXVAL(MAX(REAL(homo, KIND=dp)*maxsize(gd_array), REAL(dimen_RI, KIND=dp))* &
1352 1088 : maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1353 : ! BIB_C
1354 752 : mem_per_repl = mem_per_repl + SUM(REAL(homo, KIND=dp)*maxsize(gd_B_virtual))*maxsize(gd_array)*8.0_dp/(1024**2)
1355 : ! BIB_C_rec
1356 752 : mem_per_repl_blk = mem_per_repl_blk + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
1357 : ! local_i_aL+local_j_aL
1358 752 : mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*REAL(dimen_RI, KIND=dp)*8.0_dp/(1024**2)
1359 : ! local_ab
1360 1088 : mem_base = mem_base + MAXVAL(REAL(virtual, KIND=dp)*maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1361 : ! external_ab/external_i_aL
1362 1168 : mem_base = mem_base + REAL(MAX(dimen_RI, MAXVAL(virtual)), KIND=dp)*MAXVAL(maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1363 :
1364 336 : IF (calc_forces) THEN
1365 : ! Gamma_P_ia
1366 : mem_per_repl = mem_per_repl + SUM(REAL(homo, KIND=dp)*maxsize(gd_array)* &
1367 496 : maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1368 : ! Y_i_aP+Y_j_aP
1369 496 : mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*dimen_RI*8.0_dp/(1024**2)
1370 : ! local_ba/t_ab
1371 780 : mem_base = mem_base + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
1372 : ! P_ij
1373 496 : mem_base = mem_base + SUM(REAL(homo, KIND=dp)*homo)*8.0_dp/(1024**2)
1374 : ! P_ab
1375 496 : mem_base = mem_base + SUM(REAL(virtual, KIND=dp)*maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1376 : ! send_ab/send_i_aL
1377 780 : mem_base = mem_base + REAL(MAX(dimen_RI, MAXVAL(virtual)), KIND=dp)*MAXVAL(maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1378 : END IF
1379 :
1380 : ! This a first guess based on the assumption of optimal block sizes
1381 752 : block_size = MAX(1, MIN(FLOOR(SQRT(REAL(MINVAL(homo), KIND=dp))), FLOOR(MINVAL(homo)/SQRT(2.0_dp*ngroup))))
1382 336 : IF (mp2_env%ri_mp2%block_size > 0) block_size = mp2_env%ri_mp2%block_size
1383 :
1384 336 : mem_min = mem_base + mem_per_repl + (mem_per_blk + mem_per_repl_blk)*block_size
1385 :
1386 504 : IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum available memory per MPI process:', &
1387 336 : mem_real, ' MiB'
1388 504 : IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum required memory per MPI process:', &
1389 336 : mem_min, ' MiB'
1390 :
1391 : ! We use the following communication model
1392 : ! Comm(replication)+Comm(collection of data for ij pair)+Comm(contraction)
1393 : ! One can show that the costs of the contraction step are independent of the block size and the replication group size
1394 : ! With gradients, the other two steps are carried out twice (Y_i_aP -> Gamma_i_aP, and dereplication)
1395 : ! NL ... number of RI basis functions
1396 : ! NR ... replication group size
1397 : ! NG ... number of sub groups
1398 : ! NB ... Block size
1399 : ! o ... number of occupied orbitals
1400 : ! Then, we have the communication costs (in multiples of the original BIb_C matrix)
1401 : ! (NR/NG)+(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
1402 : ! and with gradients
1403 : ! 2*(NR/NG)+2*(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
1404 : ! We are looking for the minimum of the communication volume,
1405 : ! thus, if the prefactor of (NR/NG) is smaller than zero, use the largest possible replication group size.
1406 : ! If the factor is larger than zero, set the replication group size to 1. (For small systems and a large number of subgroups)
1407 : ! Replication group size = 1 implies that the integration group size equals the number of subgroups
1408 :
1409 336 : integ_group_size = ngroup
1410 :
1411 : ! Multiply everything by homo*virtual to consider differences between spin channels in case of open-shell calculations
1412 : factor = REAL(SUM(homo*virtual), KIND=dp) &
1413 1584 : - SUM((REAL(MAXVAL(homo), KIND=dp)/block_size + block_size - 2.0_dp)*homo*virtual)/ngroup
1414 656 : IF (SIZE(homo) == 2) factor = factor - 2.0_dp*PRODUCT(homo)/block_size/ngroup*SUM(homo*virtual)
1415 :
1416 672 : IF (factor <= 0.0_dp) THEN
1417 : ! Remove the fixed memory and divide by the memory per replication group size
1418 : max_repl_group_size = MIN(MAX(FLOOR((mem_real - mem_base - mem_per_blk*block_size)/ &
1419 248 : (mem_per_repl + mem_per_repl_blk*block_size)), 1), ngroup)
1420 : ! Convert to an integration group size
1421 248 : min_integ_group_size = ngroup/max_repl_group_size
1422 :
1423 : ! Ensure that the integration group size is a divisor of the number of sub groups
1424 248 : DO iiB = MAX(MIN(min_integ_group_size, ngroup), 1), ngroup
1425 : ! check that the ngroup is a multiple of integ_group_size
1426 248 : IF (MOD(ngroup, iiB) == 0) THEN
1427 248 : integ_group_size = iiB
1428 248 : EXIT
1429 : END IF
1430 0 : integ_group_size = integ_group_size + 1
1431 : END DO
1432 : END IF
1433 : ELSE ! We take the user provided group size
1434 10 : integ_group_size = ngroup/mp2_env%ri_mp2%number_integration_groups
1435 : END IF
1436 :
1437 346 : IF (unit_nr > 0) THEN
1438 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
1439 173 : "RI_INFO| Group size for integral replication:", integ_group_size*para_env_sub%num_pe
1440 173 : CALL m_flush(unit_nr)
1441 : END IF
1442 :
1443 346 : num_integ_group = ngroup/integ_group_size
1444 :
1445 346 : CALL timestop(handle)
1446 :
1447 346 : END SUBROUTINE mp2_ri_get_integ_group_size
1448 :
1449 : ! **************************************************************************************************
1450 : !> \brief ...
1451 : !> \param mp2_env ...
1452 : !> \param para_env ...
1453 : !> \param para_env_sub ...
1454 : !> \param gd_array ...
1455 : !> \param gd_B_virtual ...
1456 : !> \param homo ...
1457 : !> \param virtual ...
1458 : !> \param dimen_RI ...
1459 : !> \param unit_nr ...
1460 : !> \param block_size ...
1461 : !> \param ngroup ...
1462 : !> \param num_integ_group ...
1463 : !> \param my_open_shell_ss ...
1464 : !> \param calc_forces ...
1465 : !> \param buffer_1D ...
1466 : ! **************************************************************************************************
1467 518 : SUBROUTINE mp2_ri_get_block_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
1468 518 : homo, virtual, dimen_RI, unit_nr, &
1469 : block_size, ngroup, num_integ_group, &
1470 : my_open_shell_ss, calc_forces, buffer_1D)
1471 : TYPE(mp2_type) :: mp2_env
1472 : TYPE(mp_para_env_type), INTENT(IN) :: para_env, para_env_sub
1473 : TYPE(group_dist_d1_type), INTENT(IN) :: gd_array
1474 : TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
1475 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
1476 : INTEGER, INTENT(IN) :: dimen_RI, unit_nr
1477 : INTEGER, INTENT(OUT) :: block_size, ngroup
1478 : INTEGER, INTENT(IN) :: num_integ_group
1479 : LOGICAL, INTENT(IN) :: my_open_shell_ss, calc_forces
1480 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
1481 : INTENT(OUT) :: buffer_1D
1482 :
1483 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_get_block_size'
1484 :
1485 : INTEGER :: best_block_size, handle, num_IJ_blocks
1486 : INTEGER(KIND=int_8) :: buffer_size, mem
1487 : REAL(KIND=dp) :: mem_base, mem_per_blk, mem_per_repl_blk, &
1488 : mem_real
1489 :
1490 518 : CALL timeset(routineN, handle)
1491 :
1492 518 : ngroup = para_env%num_pe/para_env_sub%num_pe
1493 :
1494 518 : CALL m_memory(mem)
1495 518 : mem_real = (mem + 1024*1024 - 1)/(1024*1024)
1496 518 : CALL para_env%min(mem_real)
1497 :
1498 518 : mem_base = 0.0_dp
1499 518 : mem_per_blk = 0.0_dp
1500 518 : mem_per_repl_blk = 0.0_dp
1501 :
1502 : ! external_ab
1503 1726 : mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
1504 : ! BIB_C_rec
1505 1122 : mem_per_repl_blk = mem_per_repl_blk + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
1506 : ! local_i_aL+local_j_aL
1507 1122 : mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*REAL(dimen_RI, KIND=dp)*8.0_dp/(1024**2)
1508 : ! Copy to keep arrays contiguous
1509 1726 : mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
1510 :
1511 518 : IF (calc_forces) THEN
1512 : ! Y_i_aP+Y_j_aP+BIb_C_send
1513 820 : mem_per_blk = mem_per_blk + 3.0_dp*MAXVAL(maxsize(gd_B_virtual))*dimen_RI*8.0_dp/(1024**2)
1514 : ! send_ab
1515 1268 : mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
1516 : END IF
1517 :
1518 518 : best_block_size = 1
1519 :
1520 : ! Here we split the memory half for the communication, half for replication
1521 518 : IF (mp2_env%ri_mp2%block_size > 0) THEN
1522 : best_block_size = mp2_env%ri_mp2%block_size
1523 : ELSE
1524 286 : best_block_size = MAX(FLOOR((mem_real - mem_base)/(mem_per_blk + mem_per_repl_blk*ngroup/num_integ_group)), 1)
1525 :
1526 4319682 : DO
1527 4319968 : IF (SIZE(homo) == 1) THEN
1528 3468400 : IF (.NOT. my_open_shell_ss) THEN
1529 1580206 : num_IJ_blocks = (homo(1)/best_block_size)
1530 1580206 : num_IJ_blocks = (num_IJ_blocks*num_IJ_blocks - num_IJ_blocks)/2
1531 : ELSE
1532 1888194 : num_IJ_blocks = ((homo(1) - 1)/best_block_size)
1533 1888194 : num_IJ_blocks = (num_IJ_blocks*num_IJ_blocks - num_IJ_blocks)/2
1534 : END IF
1535 : ELSE
1536 2554704 : num_ij_blocks = PRODUCT(homo/best_block_size)
1537 : END IF
1538 : ! Enforce at least one large block for each subgroup
1539 4319968 : IF ((num_IJ_blocks >= ngroup .AND. num_IJ_blocks > 0) .OR. best_block_size == 1) THEN
1540 : EXIT
1541 : ELSE
1542 4319682 : best_block_size = best_block_size - 1
1543 : END IF
1544 : END DO
1545 :
1546 286 : IF (SIZE(homo) == 1) THEN
1547 226 : IF (my_open_shell_ss) THEN
1548 : ! check that best_block_size is not bigger than sqrt(homo-1)
1549 : ! Diagonal elements do not have to be considered
1550 120 : best_block_size = MIN(FLOOR(SQRT(REAL(homo(1) - 1, KIND=dp))), best_block_size)
1551 : ELSE
1552 : ! check that best_block_size is not bigger than sqrt(homo)
1553 106 : best_block_size = MIN(FLOOR(SQRT(REAL(homo(1), KIND=dp))), best_block_size)
1554 : END IF
1555 : END IF
1556 : END IF
1557 518 : block_size = MAX(1, best_block_size)
1558 :
1559 518 : IF (unit_nr > 0) THEN
1560 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
1561 259 : "RI_INFO| Block size:", block_size
1562 259 : CALL m_flush(unit_nr)
1563 : END IF
1564 :
1565 : ! Determine recv buffer size (BI_C_recv, external_i_aL, external_ab)
1566 : buffer_size = MAX(INT(maxsize(gd_array), KIND=int_8)*block_size, INT(MAX(dimen_RI, MAXVAL(virtual)), KIND=int_8)) &
1567 1726 : *MAXVAL(maxsize(gd_B_virtual))
1568 : ! The send buffer has the same size as the recv buffer
1569 518 : IF (calc_forces) buffer_size = buffer_size*2
1570 1554 : ALLOCATE (buffer_1D(buffer_size))
1571 :
1572 518 : CALL timestop(handle)
1573 :
1574 518 : END SUBROUTINE mp2_ri_get_block_size
1575 :
1576 : ! **************************************************************************************************
1577 : !> \brief ...
1578 : !> \param mp2_env ...
1579 : !> \param para_env_sub ...
1580 : !> \param gd_B_virtual ...
1581 : !> \param Eigenval ...
1582 : !> \param homo ...
1583 : !> \param dimen_RI ...
1584 : !> \param iiB ...
1585 : !> \param jjB ...
1586 : !> \param my_B_size ...
1587 : !> \param my_B_virtual_end ...
1588 : !> \param my_B_virtual_start ...
1589 : !> \param my_i ...
1590 : !> \param my_j ...
1591 : !> \param virtual ...
1592 : !> \param local_ab ...
1593 : !> \param t_ab ...
1594 : !> \param my_local_i_aL ...
1595 : !> \param my_local_j_aL ...
1596 : !> \param open_ss ...
1597 : !> \param Y_i_aP ...
1598 : !> \param Y_j_aP ...
1599 : !> \param local_ba ...
1600 : !> \param ispin ...
1601 : !> \param jspin ...
1602 : !> \param dgemm_counter ...
1603 : !> \param buffer_1D ...
1604 : ! **************************************************************************************************
1605 6276 : SUBROUTINE mp2_update_P_gamma(mp2_env, para_env_sub, gd_B_virtual, &
1606 3138 : Eigenval, homo, dimen_RI, iiB, jjB, my_B_size, &
1607 3138 : my_B_virtual_end, my_B_virtual_start, my_i, my_j, virtual, local_ab, &
1608 3138 : t_ab, my_local_i_aL, my_local_j_aL, open_ss, Y_i_aP, Y_j_aP, &
1609 1569 : local_ba, ispin, jspin, dgemm_counter, buffer_1D)
1610 : TYPE(mp2_type) :: mp2_env
1611 : TYPE(mp_para_env_type), INTENT(IN) :: para_env_sub
1612 : TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
1613 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
1614 : INTEGER, DIMENSION(:), INTENT(IN) :: homo
1615 : INTEGER, INTENT(IN) :: dimen_RI, iiB, jjB
1616 : INTEGER, DIMENSION(:), INTENT(IN) :: my_B_size, my_B_virtual_end, &
1617 : my_B_virtual_start
1618 : INTEGER, INTENT(IN) :: my_i, my_j
1619 : INTEGER, DIMENSION(:), INTENT(IN) :: virtual
1620 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
1621 : INTENT(INOUT), TARGET :: local_ab
1622 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
1623 : INTENT(IN), TARGET :: t_ab, my_local_i_aL, my_local_j_aL
1624 : LOGICAL, INTENT(IN) :: open_ss
1625 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
1626 : INTENT(INOUT), TARGET :: Y_i_aP, Y_j_aP, local_ba
1627 : INTEGER, INTENT(IN) :: ispin, jspin
1628 : TYPE(dgemm_counter_type), INTENT(INOUT) :: dgemm_counter
1629 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1D
1630 :
1631 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_update_P_gamma'
1632 :
1633 : INTEGER :: a, b, b_global, handle, proc_receive, proc_send, proc_shift, rec_B_size, &
1634 : rec_B_virtual_end, rec_B_virtual_start, send_B_size, send_B_virtual_end, &
1635 : send_B_virtual_start
1636 : INTEGER(KIND=int_8) :: offset
1637 : LOGICAL :: alpha_beta
1638 : REAL(KIND=dp) :: factor, P_ij_diag
1639 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
1640 1569 : POINTER :: external_ab, send_ab
1641 :
1642 1569 : CALL timeset(routineN//"_Pia", handle)
1643 :
1644 1569 : alpha_beta = .NOT. (ispin == jspin)
1645 1569 : IF (open_ss) THEN
1646 : factor = 1.0_dp
1647 : ELSE
1648 1189 : factor = 2.0_dp
1649 : END IF
1650 : ! divide the (ia|jb) integrals by Delta_ij^ab
1651 24660 : DO b = 1, my_B_size(jspin)
1652 23091 : b_global = b + my_B_virtual_start(jspin) - 1
1653 462933 : DO a = 1, virtual(ispin)
1654 : local_ab(a, b) = -local_ab(a, b)/ &
1655 : (Eigenval(homo(ispin) + a, ispin) + Eigenval(homo(jspin) + b_global, jspin) - &
1656 461364 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
1657 : END DO
1658 : END DO
1659 1569 : IF (.NOT. (alpha_beta)) THEN
1660 322531 : P_ij_diag = -SUM(local_ab*t_ab)*factor
1661 : ELSE
1662 : ! update diagonal part of P_ij
1663 140402 : P_ij_diag = -SUM(local_ab*local_ab)*mp2_env%scale_S
1664 : ! More integrals needed only for alpha-beta case: local_ba
1665 6939 : DO b = 1, my_B_size(ispin)
1666 6449 : b_global = b + my_B_virtual_start(ispin) - 1
1667 139874 : DO a = 1, virtual(jspin)
1668 : local_ba(a, b) = -local_ba(a, b)/ &
1669 : (Eigenval(homo(jspin) + a, jspin) + Eigenval(homo(ispin) + b_global, ispin) - &
1670 139384 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
1671 : END DO
1672 : END DO
1673 : END IF
1674 :
1675 : ! P_ab and add diagonal part of P_ij
1676 :
1677 1569 : CALL dgemm_counter_start(dgemm_counter)
1678 1569 : IF (.NOT. (alpha_beta)) THEN
1679 : CALL local_gemm('T', 'N', my_B_size(ispin), my_B_size(ispin), virtual(ispin), 1.0_dp, &
1680 : t_ab, virtual(ispin), local_ab, virtual(ispin), &
1681 : 1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, &
1682 1079 : my_B_virtual_start(ispin):my_B_virtual_end(ispin)), my_B_size(ispin), mp2_env%local_gemm_ctx)
1683 : mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) = &
1684 1079 : mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) + P_ij_diag
1685 : ELSE
1686 : CALL local_gemm('T', 'N', my_B_size(ispin), my_B_size(ispin), virtual(jspin), mp2_env%scale_S, &
1687 : local_ba, virtual(jspin), local_ba, virtual(jspin), 1.0_dp, &
1688 : mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_virtual_start(ispin):my_B_virtual_end(ispin)), my_B_size(ispin), &
1689 490 : mp2_env%local_gemm_ctx)
1690 :
1691 : mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) = &
1692 490 : mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) + P_ij_diag
1693 :
1694 : CALL local_gemm('T', 'N', my_B_size(jspin), my_B_size(jspin), virtual(ispin), mp2_env%scale_S, &
1695 : local_ab, virtual(ispin), local_ab, virtual(ispin), 1.0_dp, &
1696 : mp2_env%ri_grad%P_ab(jspin)%array(:, my_B_virtual_start(jspin):my_B_virtual_end(jspin)), my_B_size(jspin), &
1697 490 : mp2_env%local_gemm_ctx)
1698 :
1699 : mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjB - 1, my_j + jjB - 1) = &
1700 490 : mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjB - 1, my_j + jjB - 1) + P_ij_diag
1701 : END IF
1702 : ! The summation is over unique pairs. In alpha-beta case, all pairs are unique: subroutine is called for
1703 : ! both i^alpha,j^beta and i^beta,j^alpha. Formally, my_i can be equal to my_j, but they are different
1704 : ! due to spin in alpha-beta case.
1705 1569 : IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
1706 :
1707 : CALL local_gemm('N', 'T', my_B_size(ispin), virtual(ispin), my_B_size(ispin), 1.0_dp, &
1708 : t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1709 : local_ab, virtual(ispin), &
1710 : 1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array, my_B_size(ispin), &
1711 796 : mp2_env%local_gemm_ctx)
1712 :
1713 : mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjB - 1, my_j + jjB - 1) = &
1714 796 : mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjB - 1, my_j + jjB - 1) + P_ij_diag
1715 : END IF
1716 1741 : DO proc_shift = 1, para_env_sub%num_pe - 1
1717 172 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
1718 172 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
1719 :
1720 172 : CALL get_group_dist(gd_B_virtual(jspin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
1721 172 : CALL get_group_dist(gd_B_virtual(jspin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
1722 :
1723 172 : external_ab(1:virtual(ispin), 1:rec_B_size) => buffer_1D(1:INT(virtual(ispin), int_8)*rec_B_size)
1724 35072 : external_ab = 0.0_dp
1725 :
1726 : CALL para_env_sub%sendrecv(local_ab, proc_send, &
1727 172 : external_ab, proc_receive)
1728 :
1729 172 : IF (.NOT. (alpha_beta)) THEN
1730 : CALL local_gemm('T', 'N', my_B_size(ispin), rec_B_size, virtual(ispin), 1.0_dp, &
1731 : t_ab, virtual(ispin), external_ab, virtual(ispin), &
1732 : 1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_B_virtual_start:rec_B_virtual_end), my_B_size(ispin), &
1733 102 : mp2_env%local_gemm_ctx)
1734 : ELSE
1735 : CALL local_gemm('T', 'N', my_B_size(jspin), rec_B_size, virtual(ispin), mp2_env%scale_S, &
1736 : local_ab, virtual(ispin), external_ab, virtual(ispin), &
1737 : 1.0_dp, mp2_env%ri_grad%P_ab(jspin)%array(:, rec_B_virtual_start:rec_B_virtual_end), &
1738 70 : my_B_size(jspin), mp2_env%local_gemm_ctx)
1739 :
1740 : ! For alpha-beta part of alpha-density we need a new parallel code
1741 : ! And new external_ab (of a different size)
1742 70 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
1743 70 : CALL get_group_dist(gd_B_virtual(ispin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
1744 70 : external_ab(1:virtual(jspin), 1:rec_B_size) => buffer_1D(1:INT(virtual(jspin), int_8)*rec_B_size)
1745 14700 : external_ab = 0.0_dp
1746 : CALL para_env_sub%sendrecv(local_ba, proc_send, &
1747 70 : external_ab, proc_receive)
1748 : CALL local_gemm('T', 'N', my_B_size(ispin), rec_B_size, virtual(jspin), mp2_env%scale_S, &
1749 : local_ba, virtual(jspin), external_ab, virtual(jspin), &
1750 : 1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_B_virtual_start:rec_B_virtual_end), my_B_size(ispin), &
1751 70 : mp2_env%local_gemm_ctx)
1752 : END IF
1753 :
1754 1913 : IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
1755 : external_ab(1:my_B_size(ispin), 1:virtual(ispin)) => &
1756 86 : buffer_1D(1:INT(virtual(ispin), int_8)*my_B_size(ispin))
1757 18083 : external_ab = 0.0_dp
1758 :
1759 86 : offset = INT(virtual(ispin), int_8)*my_B_size(ispin)
1760 :
1761 86 : send_ab(1:send_B_size, 1:virtual(ispin)) => buffer_1D(offset + 1:offset + INT(send_B_size, int_8)*virtual(ispin))
1762 18083 : send_ab = 0.0_dp
1763 :
1764 : CALL local_gemm('N', 'T', send_B_size, virtual(ispin), my_B_size(ispin), 1.0_dp, &
1765 : t_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
1766 : local_ab, virtual(ispin), &
1767 : 0.0_dp, send_ab, send_B_size, &
1768 86 : mp2_env%local_gemm_ctx)
1769 : CALL para_env_sub%sendrecv(send_ab, proc_send, &
1770 86 : external_ab, proc_receive)
1771 :
1772 18083 : mp2_env%ri_grad%P_ab(ispin)%array(:, :) = mp2_env%ri_grad%P_ab(ispin)%array + external_ab
1773 : END IF
1774 :
1775 : END DO
1776 1569 : IF (.NOT. alpha_beta) THEN
1777 1079 : IF (my_i /= my_j) THEN
1778 796 : CALL dgemm_counter_stop(dgemm_counter, 2*my_B_size(ispin), virtual(ispin), virtual(ispin))
1779 : ELSE
1780 283 : CALL dgemm_counter_stop(dgemm_counter, my_B_size(ispin), virtual(ispin), virtual(ispin))
1781 : END IF
1782 : ELSE
1783 1470 : CALL dgemm_counter_stop(dgemm_counter, SUM(my_B_size), virtual(ispin), virtual(jspin))
1784 : END IF
1785 1569 : CALL timestop(handle)
1786 :
1787 : ! Now, Gamma_P_ia (made of Y_ia_P)
1788 :
1789 1569 : CALL timeset(routineN//"_Gamma", handle)
1790 1569 : CALL dgemm_counter_start(dgemm_counter)
1791 1569 : IF (.NOT. alpha_beta) THEN
1792 : ! Alpha-alpha, beta-beta and closed shell
1793 : CALL local_gemm('N', 'T', my_B_size(ispin), dimen_RI, my_B_size(ispin), 1.0_dp, &
1794 : t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1795 : my_local_j_aL, dimen_RI, 1.0_dp, Y_i_aP, my_B_size(ispin), &
1796 1079 : mp2_env%local_gemm_ctx)
1797 : ELSE ! Alpha-beta
1798 : CALL local_gemm('N', 'T', my_B_size(ispin), dimen_RI, my_B_size(jspin), mp2_env%scale_S, &
1799 : local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1800 490 : my_local_j_aL, dimen_RI, 1.0_dp, Y_i_aP, my_B_size(ispin), mp2_env%local_gemm_ctx)
1801 : CALL local_gemm('T', 'T', my_B_size(jspin), dimen_RI, my_B_size(ispin), mp2_env%scale_S, &
1802 : local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1803 490 : my_local_i_aL, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(jspin), mp2_env%local_gemm_ctx)
1804 : END IF
1805 :
1806 1569 : IF (para_env_sub%num_pe > 1) THEN
1807 172 : external_ab(1:my_B_size(ispin), 1:dimen_RI) => buffer_1D(1:INT(my_B_size(ispin), int_8)*dimen_RI)
1808 189692 : external_ab = 0.0_dp
1809 :
1810 172 : offset = INT(my_B_size(ispin), int_8)*dimen_RI
1811 : END IF
1812 : !
1813 1741 : DO proc_shift = 1, para_env_sub%num_pe - 1
1814 172 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
1815 172 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
1816 :
1817 172 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
1818 172 : CALL get_group_dist(gd_B_virtual(ispin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
1819 :
1820 172 : send_ab(1:send_B_size, 1:dimen_RI) => buffer_1D(offset + 1:offset + INT(dimen_RI, int_8)*send_B_size)
1821 189692 : send_ab = 0.0_dp
1822 1913 : IF (.NOT. alpha_beta) THEN
1823 : CALL local_gemm('N', 'T', send_B_size, dimen_RI, my_B_size(ispin), 1.0_dp, &
1824 : t_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
1825 : my_local_j_aL, dimen_RI, 0.0_dp, send_ab, send_B_size, &
1826 102 : mp2_env%local_gemm_ctx)
1827 102 : CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
1828 :
1829 217544 : Y_i_aP(:, :) = Y_i_aP + external_ab
1830 :
1831 : ELSE ! Alpha-beta case
1832 : ! Alpha-alpha part
1833 : CALL local_gemm('N', 'T', send_B_size, dimen_RI, my_B_size(jspin), mp2_env%scale_S, &
1834 : local_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
1835 : my_local_j_aL, dimen_RI, 0.0_dp, send_ab, send_B_size, &
1836 70 : mp2_env%local_gemm_ctx)
1837 70 : CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
1838 161840 : Y_i_aP(:, :) = Y_i_aP + external_ab
1839 : END IF
1840 : END DO
1841 :
1842 1569 : IF (alpha_beta) THEN
1843 : ! For beta-beta part (in alpha-beta case) we need a new parallel code
1844 490 : IF (para_env_sub%num_pe > 1) THEN
1845 70 : external_ab(1:my_B_size(jspin), 1:dimen_RI) => buffer_1D(1:INT(my_B_size(jspin), int_8)*dimen_RI)
1846 88620 : external_ab = 0.0_dp
1847 :
1848 70 : offset = INT(my_B_size(jspin), int_8)*dimen_RI
1849 : END IF
1850 560 : DO proc_shift = 1, para_env_sub%num_pe - 1
1851 70 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
1852 70 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
1853 :
1854 70 : CALL get_group_dist(gd_B_virtual(jspin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
1855 70 : send_ab(1:send_B_size, 1:dimen_RI) => buffer_1D(offset + 1:offset + INT(dimen_RI, int_8)*send_B_size)
1856 88620 : send_ab = 0.0_dp
1857 : CALL local_gemm('N', 'T', send_B_size, dimen_RI, my_B_size(ispin), mp2_env%scale_S, &
1858 : local_ba(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
1859 : my_local_i_aL, dimen_RI, 0.0_dp, send_ab, send_B_size, &
1860 70 : mp2_env%local_gemm_ctx)
1861 70 : CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
1862 177800 : Y_j_aP(:, :) = Y_j_aP + external_ab
1863 :
1864 : END DO
1865 :
1866 : ! Here, we just use approximate bounds. For large systems virtual(ispin) is approx virtual(jspin), same for B_size
1867 490 : CALL dgemm_counter_stop(dgemm_counter, 3*virtual(ispin), dimen_RI, my_B_size(jspin))
1868 : ELSE
1869 1079 : CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), dimen_RI, my_B_size(ispin))
1870 : END IF
1871 :
1872 1569 : IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
1873 : ! Alpha-alpha, beta-beta and closed shell
1874 796 : CALL dgemm_counter_start(dgemm_counter)
1875 : CALL local_gemm('T', 'T', my_B_size(ispin), dimen_RI, my_B_size(ispin), 1.0_dp, &
1876 : t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1877 : my_local_i_aL, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(ispin), &
1878 796 : mp2_env%local_gemm_ctx)
1879 882 : DO proc_shift = 1, para_env_sub%num_pe - 1
1880 86 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
1881 86 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
1882 :
1883 86 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
1884 :
1885 86 : external_ab(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
1886 86837 : external_ab = 0.0_dp
1887 :
1888 : CALL para_env_sub%sendrecv(my_local_i_aL, proc_send, &
1889 86 : external_ab, proc_receive)
1890 :
1891 : ! Alpha-alpha, beta-beta and closed shell
1892 : CALL local_gemm('T', 'T', my_B_size(ispin), dimen_RI, rec_B_size, 1.0_dp, &
1893 : t_ab(rec_B_virtual_start:rec_B_virtual_end, :), rec_B_size, &
1894 968 : external_ab, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(ispin), mp2_env%local_gemm_ctx)
1895 : END DO
1896 :
1897 796 : CALL dgemm_counter_stop(dgemm_counter, my_B_size(ispin), dimen_RI, virtual(ispin))
1898 : END IF
1899 :
1900 1569 : CALL timestop(handle)
1901 1569 : END SUBROUTINE mp2_update_P_gamma
1902 :
1903 : ! **************************************************************************************************
1904 : !> \brief ...
1905 : !> \param Gamma_P_ia ...
1906 : !> \param ij_index ...
1907 : !> \param my_B_size ...
1908 : !> \param my_block_size ...
1909 : !> \param my_group_L_size ...
1910 : !> \param my_i ...
1911 : !> \param my_ij_pairs ...
1912 : !> \param ngroup ...
1913 : !> \param num_integ_group ...
1914 : !> \param integ_group_pos2color_sub ...
1915 : !> \param num_ij_pairs ...
1916 : !> \param ij_map ...
1917 : !> \param ranges_info_array ...
1918 : !> \param Y_i_aP ...
1919 : !> \param comm_exchange ...
1920 : !> \param sizes_array ...
1921 : !> \param spin ...
1922 : !> \param buffer_1D ...
1923 : ! **************************************************************************************************
1924 3132 : SUBROUTINE mp2_redistribute_gamma(Gamma_P_ia, ij_index, my_B_size, &
1925 : my_block_size, my_group_L_size, my_i, my_ij_pairs, ngroup, &
1926 : num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
1927 3132 : ij_map, ranges_info_array, Y_i_aP, comm_exchange, &
1928 3132 : sizes_array, spin, buffer_1D)
1929 :
1930 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT) :: Gamma_P_ia
1931 : INTEGER, INTENT(IN) :: ij_index, my_B_size, my_block_size, &
1932 : my_group_L_size, my_i, my_ij_pairs, &
1933 : ngroup, num_integ_group
1934 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: integ_group_pos2color_sub, num_ij_pairs
1935 : INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(IN) :: ij_map
1936 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
1937 : INTENT(IN) :: ranges_info_array
1938 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN) :: Y_i_aP
1939 : TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
1940 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: sizes_array
1941 : INTEGER, INTENT(IN) :: spin
1942 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1D
1943 :
1944 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_redistribute_gamma'
1945 :
1946 : INTEGER :: end_point, handle, handle2, iiB, ij_counter_rec, irep, kkk, lll, Lstart_pos, &
1947 : proc_receive, proc_send, proc_shift, rec_i, rec_ij_index, send_L_size, start_point, tag
1948 : INTEGER(KIND=int_8) :: offset
1949 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :, :), &
1950 3132 : POINTER :: BI_C_rec, BI_C_send
1951 :
1952 : ! In alpha-beta case Y_i_aP_beta is sent as Y_j_aP
1953 :
1954 3132 : CALL timeset(routineN//"_comm2", handle)
1955 :
1956 3132 : tag = 43
1957 :
1958 3132 : IF (ij_index <= my_ij_pairs) THEN
1959 : ! somethig to send
1960 : ! start with myself
1961 3114 : CALL timeset(routineN//"_comm2_w", handle2)
1962 8924 : DO irep = 0, num_integ_group - 1
1963 5810 : Lstart_pos = ranges_info_array(1, irep, comm_exchange%mepos)
1964 5810 : start_point = ranges_info_array(3, irep, comm_exchange%mepos)
1965 5810 : end_point = ranges_info_array(4, irep, comm_exchange%mepos)
1966 : !$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(kkk,lll,iiB) &
1967 : !$OMP SHARED(start_point,end_point,Lstart_pos,my_block_size,&
1968 8924 : !$OMP Gamma_P_ia,my_i,my_B_size,Y_i_aP)
1969 : DO kkk = start_point, end_point
1970 : lll = kkk - start_point + Lstart_pos
1971 : DO iiB = 1, my_block_size
1972 : Gamma_P_ia(1:my_B_size, my_i + iiB - 1, kkk) = &
1973 : Gamma_P_ia(1:my_B_size, my_i + iiB - 1, kkk) + &
1974 : Y_i_aP(1:my_B_size, lll, iiB)
1975 : END DO
1976 : END DO
1977 : !$OMP END PARALLEL DO
1978 : END DO
1979 3114 : CALL timestop(handle2)
1980 :
1981 : ! Y_i_aP(my_B_size,dimen_RI,block_size)
1982 :
1983 3212 : DO proc_shift = 1, comm_exchange%num_pe - 1
1984 98 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
1985 98 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
1986 :
1987 98 : send_L_size = sizes_array(proc_send)
1988 : BI_C_send(1:my_B_size, 1:my_block_size, 1:send_L_size) => &
1989 98 : buffer_1D(1:INT(my_B_size, int_8)*my_block_size*send_L_size)
1990 :
1991 98 : offset = INT(my_B_size, int_8)*my_block_size*send_L_size
1992 :
1993 98 : CALL timeset(routineN//"_comm2_w", handle2)
1994 48692 : BI_C_send = 0.0_dp
1995 196 : DO irep = 0, num_integ_group - 1
1996 98 : Lstart_pos = ranges_info_array(1, irep, proc_send)
1997 98 : start_point = ranges_info_array(3, irep, proc_send)
1998 98 : end_point = ranges_info_array(4, irep, proc_send)
1999 : !$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(kkk,lll,iiB) &
2000 : !$OMP SHARED(start_point,end_point,Lstart_pos,my_block_size,&
2001 196 : !$OMP BI_C_send,my_B_size,Y_i_aP)
2002 : DO kkk = start_point, end_point
2003 : lll = kkk - start_point + Lstart_pos
2004 : DO iiB = 1, my_block_size
2005 : BI_C_send(1:my_B_size, iiB, kkk) = Y_i_aP(1:my_B_size, lll, iiB)
2006 : END DO
2007 : END DO
2008 : !$OMP END PARALLEL DO
2009 : END DO
2010 98 : CALL timestop(handle2)
2011 :
2012 98 : rec_ij_index = num_ij_pairs(proc_receive)
2013 :
2014 3310 : IF (ij_index <= rec_ij_index) THEN
2015 : ! we know that proc_receive has something to send for us, let's see what
2016 : ij_counter_rec = &
2017 80 : (ij_index - MIN(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)
2018 :
2019 80 : rec_i = ij_map(spin, ij_counter_rec)
2020 :
2021 : BI_C_rec(1:my_B_size, 1:my_block_size, 1:my_group_L_size) => &
2022 80 : buffer_1D(offset + 1:offset + INT(my_B_size, int_8)*my_block_size*my_group_L_size)
2023 44250 : BI_C_rec = 0.0_dp
2024 :
2025 : CALL comm_exchange%sendrecv(BI_C_send, proc_send, &
2026 80 : BI_C_rec, proc_receive, tag)
2027 :
2028 80 : CALL timeset(routineN//"_comm2_w", handle2)
2029 160 : DO irep = 0, num_integ_group - 1
2030 80 : start_point = ranges_info_array(3, irep, comm_exchange%mepos)
2031 80 : end_point = ranges_info_array(4, irep, comm_exchange%mepos)
2032 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(start_point,end_point,my_block_size,&
2033 160 : !$OMP Gamma_P_ia,rec_i,iiB,my_B_size,BI_C_rec)
2034 : Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
2035 : Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
2036 : BI_C_rec(1:my_B_size, :, start_point:end_point)
2037 : !$OMP END PARALLEL WORKSHARE
2038 : END DO
2039 80 : CALL timestop(handle2)
2040 :
2041 : ELSE
2042 : ! we have something to send but nothing to receive
2043 18 : CALL comm_exchange%send(BI_C_send, proc_send, tag)
2044 :
2045 : END IF
2046 :
2047 : END DO
2048 :
2049 : ELSE
2050 : ! noting to send check if we have to receive
2051 36 : DO proc_shift = 1, comm_exchange%num_pe - 1
2052 18 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
2053 18 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
2054 18 : rec_ij_index = num_ij_pairs(proc_receive)
2055 :
2056 36 : IF (ij_index <= rec_ij_index) THEN
2057 : ! we know that proc_receive has something to send for us, let's see what
2058 : ij_counter_rec = &
2059 18 : (ij_index - MIN(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)
2060 :
2061 18 : rec_i = ij_map(spin, ij_counter_rec)
2062 :
2063 : BI_C_rec(1:my_B_size, 1:my_block_size, 1:my_group_L_size) => &
2064 18 : buffer_1D(1:INT(my_B_size, int_8)*my_block_size*my_group_L_size)
2065 :
2066 4442 : BI_C_rec = 0.0_dp
2067 :
2068 18 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
2069 :
2070 18 : CALL timeset(routineN//"_comm2_w", handle2)
2071 36 : DO irep = 0, num_integ_group - 1
2072 18 : start_point = ranges_info_array(3, irep, comm_exchange%mepos)
2073 18 : end_point = ranges_info_array(4, irep, comm_exchange%mepos)
2074 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(start_point,end_point,my_block_size,&
2075 36 : !$OMP Gamma_P_ia,rec_i,iiB,my_B_size,BI_C_rec)
2076 : Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
2077 : Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
2078 : BI_C_rec(1:my_B_size, :, start_point:end_point)
2079 : !$OMP END PARALLEL WORKSHARE
2080 : END DO
2081 18 : CALL timestop(handle2)
2082 :
2083 : END IF
2084 : END DO
2085 :
2086 : END IF
2087 3132 : CALL timestop(handle)
2088 :
2089 3132 : END SUBROUTINE mp2_redistribute_gamma
2090 :
2091 : ! **************************************************************************************************
2092 : !> \brief ...
2093 : !> \param mp2_env ...
2094 : !> \param Eigenval ...
2095 : !> \param homo ...
2096 : !> \param virtual ...
2097 : !> \param open_shell ...
2098 : !> \param beta_beta ...
2099 : !> \param Bib_C ...
2100 : !> \param unit_nr ...
2101 : !> \param dimen_RI ...
2102 : !> \param my_B_size ...
2103 : !> \param ngroup ...
2104 : !> \param my_group_L_size ...
2105 : !> \param color_sub ...
2106 : !> \param ranges_info_array ...
2107 : !> \param comm_exchange ...
2108 : !> \param para_env_sub ...
2109 : !> \param para_env ...
2110 : !> \param my_B_virtual_start ...
2111 : !> \param my_B_virtual_end ...
2112 : !> \param sizes_array ...
2113 : !> \param gd_B_virtual ...
2114 : !> \param integ_group_pos2color_sub ...
2115 : !> \param dgemm_counter ...
2116 : !> \param buffer_1D ...
2117 : ! **************************************************************************************************
2118 372 : SUBROUTINE quasi_degenerate_P_ij(mp2_env, Eigenval, homo, virtual, open_shell, &
2119 372 : beta_beta, Bib_C, unit_nr, dimen_RI, &
2120 372 : my_B_size, ngroup, my_group_L_size, &
2121 : color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
2122 372 : my_B_virtual_start, my_B_virtual_end, sizes_array, gd_B_virtual, &
2123 372 : integ_group_pos2color_sub, dgemm_counter, buffer_1D)
2124 : TYPE(mp2_type) :: mp2_env
2125 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
2126 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
2127 : LOGICAL, INTENT(IN) :: open_shell, beta_beta
2128 : TYPE(three_dim_real_array), DIMENSION(:), &
2129 : INTENT(IN) :: BIb_C
2130 : INTEGER, INTENT(IN) :: unit_nr, dimen_RI
2131 : INTEGER, DIMENSION(:), INTENT(IN) :: my_B_size
2132 : INTEGER, INTENT(IN) :: ngroup, my_group_L_size, color_sub
2133 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
2134 : INTENT(IN) :: ranges_info_array
2135 : TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
2136 : TYPE(mp_para_env_type), INTENT(IN) :: para_env_sub, para_env
2137 : INTEGER, DIMENSION(:), INTENT(IN) :: my_B_virtual_start, my_B_virtual_end
2138 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: sizes_array
2139 : TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
2140 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: integ_group_pos2color_sub
2141 : TYPE(dgemm_counter_type), INTENT(INOUT) :: dgemm_counter
2142 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1D
2143 :
2144 : CHARACTER(LEN=*), PARAMETER :: routineN = 'quasi_degenerate_P_ij'
2145 :
2146 : INTEGER :: a, a_global, b, b_global, block_size, decil, handle, handle2, ijk_counter, &
2147 : ijk_counter_send, ijk_index, ispin, kkB, kspin, max_block_size, max_ijk, my_i, my_ijk, &
2148 : my_j, my_k, my_last_k(2), my_virtual, nspins, proc_receive, proc_send, proc_shift, &
2149 : rec_B_size, rec_B_virtual_end, rec_B_virtual_start, rec_L_size, send_B_size, &
2150 : send_B_virtual_end, send_B_virtual_start, send_i, send_ijk_index, send_j, send_k, &
2151 : size_B_i, size_B_k, tag, tag2
2152 372 : INTEGER, ALLOCATABLE, DIMENSION(:) :: num_ijk
2153 372 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: ijk_map, send_last_k
2154 : LOGICAL :: alpha_beta, do_recv_i, do_recv_j, &
2155 : do_recv_k, do_send_i, do_send_j, &
2156 : do_send_k
2157 : REAL(KIND=dp) :: amp_fac, P_ij_elem, t_new, t_start
2158 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
2159 372 : TARGET :: local_ab, local_aL_i, local_aL_j, t_ab
2160 372 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: local_aL_k
2161 372 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: BI_C_rec, external_ab, external_aL
2162 372 : REAL(KIND=dp), DIMENSION(:, :, :), POINTER :: BI_C_rec_3D
2163 :
2164 372 : CALL timeset(routineN//"_ij_sing", handle)
2165 :
2166 372 : tag = 44
2167 372 : tag2 = 45
2168 :
2169 372 : nspins = SIZE(BIb_C)
2170 372 : alpha_beta = (nspins == 2)
2171 :
2172 : ! Set amplitude factor
2173 372 : amp_fac = mp2_env%scale_S + mp2_env%scale_T
2174 372 : IF (open_shell) amp_fac = mp2_env%scale_T
2175 :
2176 770 : ALLOCATE (send_last_k(2, comm_exchange%num_pe - 1))
2177 :
2178 : ! Loop(s) over orbital triplets
2179 820 : DO ispin = 1, nspins
2180 448 : size_B_i = my_B_size(ispin)
2181 448 : IF (ispin == 1 .AND. alpha_beta) THEN
2182 : kspin = 2
2183 : ELSE
2184 372 : kspin = 1
2185 : END IF
2186 448 : size_B_k = my_B_size(kspin)
2187 :
2188 : ! Find the number of quasi-degenerate orbitals and orbital triplets
2189 :
2190 : CALL Find_quasi_degenerate_ij(my_ijk, homo(ispin), homo(kspin), Eigenval(:, ispin), mp2_env, ijk_map, unit_nr, ngroup, &
2191 : .NOT. beta_beta .AND. ispin /= 2, comm_exchange, num_ijk, max_ijk, color_sub, &
2192 600 : SIZE(buffer_1D), my_group_L_size, size_B_k, para_env, virtual(ispin), size_B_i)
2193 :
2194 448 : my_virtual = virtual(ispin)
2195 448 : IF (SIZE(ijk_map, 2) > 0) THEN
2196 90 : max_block_size = ijk_map(4, 1)
2197 : ELSE
2198 : max_block_size = 1
2199 : END IF
2200 :
2201 1792 : ALLOCATE (local_aL_i(dimen_RI, size_B_i))
2202 1792 : ALLOCATE (local_aL_j(dimen_RI, size_B_i))
2203 2240 : ALLOCATE (local_aL_k(dimen_RI, size_B_k, max_block_size))
2204 1792 : ALLOCATE (t_ab(my_virtual, size_B_k))
2205 :
2206 1344 : my_last_k = -1
2207 538 : send_last_k = -1
2208 :
2209 448 : t_start = m_walltime()
2210 594 : DO ijk_index = 1, max_ijk
2211 :
2212 : ! Prediction is unreliable if we are in the first step of the loop
2213 146 : IF (unit_nr > 0 .AND. ijk_index > 1) THEN
2214 18 : decil = ijk_index*10/max_ijk
2215 18 : IF (decil /= (ijk_index - 1)*10/max_ijk) THEN
2216 18 : t_new = m_walltime()
2217 18 : t_new = (t_new - t_start)/60.0_dp*(max_ijk - ijk_index + 1)/(ijk_index - 1)
2218 : WRITE (unit_nr, FMT="(T3,A)") "Percentage of finished loop: "// &
2219 18 : cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
2220 18 : CALL m_flush(unit_nr)
2221 : END IF
2222 : END IF
2223 :
2224 594 : IF (ijk_index <= my_ijk) THEN
2225 : ! work to be done
2226 144 : ijk_counter = (ijk_index - MIN(1, color_sub))*ngroup + color_sub
2227 144 : my_i = ijk_map(1, ijk_counter)
2228 144 : my_j = ijk_map(2, ijk_counter)
2229 144 : my_k = ijk_map(3, ijk_counter)
2230 144 : block_size = ijk_map(4, ijk_counter)
2231 :
2232 144 : do_recv_i = (ispin /= kspin) .OR. my_i < my_k .OR. my_i > my_k + block_size - 1
2233 144 : do_recv_j = (ispin /= kspin) .OR. my_j < my_k .OR. my_j > my_k + block_size - 1
2234 144 : do_recv_k = my_k /= my_last_k(1) .OR. my_k + block_size - 1 /= my_last_k(2)
2235 144 : my_last_k(1) = my_k
2236 144 : my_last_k(2) = my_k + block_size - 1
2237 :
2238 127614 : local_aL_i = 0.0_dp
2239 144 : IF (do_recv_i) THEN
2240 : CALL fill_local_i_aL_2D(local_al_i, ranges_info_array(:, :, comm_exchange%mepos), &
2241 117 : BIb_C(ispin)%array(:, :, my_i))
2242 : END IF
2243 :
2244 127614 : local_aL_j = 0.0_dp
2245 144 : IF (do_recv_j) THEN
2246 : CALL fill_local_i_aL_2D(local_al_j, ranges_info_array(:, :, comm_exchange%mepos), &
2247 117 : BIb_C(ispin)%array(:, :, my_j))
2248 : END IF
2249 :
2250 144 : IF (do_recv_k) THEN
2251 219805 : local_aL_k = 0.0_dp
2252 : CALL fill_local_i_aL(local_aL_k(:, :, 1:block_size), ranges_info_array(:, :, comm_exchange%mepos), &
2253 136 : BIb_C(kspin)%array(:, :, my_k:my_k + block_size - 1))
2254 : END IF
2255 :
2256 144 : CALL timeset(routineN//"_comm", handle2)
2257 154 : DO proc_shift = 1, comm_exchange%num_pe - 1
2258 10 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
2259 10 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
2260 :
2261 10 : send_ijk_index = num_ijk(proc_send)
2262 :
2263 10 : rec_L_size = sizes_array(proc_receive)
2264 10 : BI_C_rec(1:rec_L_size, 1:size_B_i) => buffer_1D(1:INT(rec_L_size, KIND=int_8)*size_B_i)
2265 :
2266 10 : do_send_i = .FALSE.
2267 10 : do_send_j = .FALSE.
2268 10 : do_send_k = .FALSE.
2269 10 : IF (ijk_index <= send_ijk_index) THEN
2270 : ! something to send
2271 : ijk_counter_send = (ijk_index - MIN(1, integ_group_pos2color_sub(proc_send)))* &
2272 8 : ngroup + integ_group_pos2color_sub(proc_send)
2273 8 : send_i = ijk_map(1, ijk_counter_send)
2274 8 : send_j = ijk_map(2, ijk_counter_send)
2275 8 : send_k = ijk_map(3, ijk_counter_send)
2276 :
2277 8 : do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
2278 8 : do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
2279 8 : do_send_k = send_k /= send_last_k(1, proc_shift) .OR. send_k + block_size - 1 /= send_last_k(2, proc_shift)
2280 8 : send_last_k(1, proc_shift) = send_k
2281 8 : send_last_k(2, proc_shift) = send_k + block_size - 1
2282 : END IF
2283 :
2284 : ! occupied i
2285 722 : BI_C_rec = 0.0_dp
2286 10 : IF (do_send_i) THEN
2287 6 : IF (do_recv_i) THEN
2288 : CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_i), proc_send, &
2289 288 : BI_C_rec, proc_receive, tag)
2290 : ELSE
2291 2 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i), proc_send, tag)
2292 : END IF
2293 4 : ELSE IF (do_recv_i) THEN
2294 580 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
2295 : END IF
2296 10 : IF (do_recv_i) THEN
2297 8 : CALL fill_local_i_aL_2D(local_al_i, ranges_info_array(:, :, proc_receive), BI_C_rec)
2298 : END IF
2299 :
2300 : ! occupied j
2301 722 : BI_C_rec = 0.0_dp
2302 10 : IF (do_send_j) THEN
2303 8 : IF (do_recv_j) THEN
2304 : CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_j), proc_send, &
2305 576 : BI_C_rec, proc_receive, tag)
2306 : ELSE
2307 0 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_j), proc_send, tag)
2308 : END IF
2309 2 : ELSE IF (do_recv_j) THEN
2310 0 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
2311 : END IF
2312 10 : IF (do_recv_j) THEN
2313 8 : CALL fill_local_i_aL_2D(local_al_j, ranges_info_array(:, :, proc_receive), BI_C_rec)
2314 : END IF
2315 :
2316 : ! occupied k
2317 : BI_C_rec_3D(1:rec_L_size, 1:size_B_k, 1:block_size) => &
2318 10 : buffer_1D(1:INT(rec_L_size, KIND=int_8)*size_B_k*block_size)
2319 10 : IF (do_send_k) THEN
2320 8 : IF (do_recv_k) THEN
2321 : CALL comm_exchange%sendrecv(BIb_C(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, &
2322 726 : BI_C_rec_3D, proc_receive, tag)
2323 : ELSE
2324 0 : CALL comm_exchange%send(BI_C_rec, proc_receive, tag)
2325 : END IF
2326 2 : ELSE IF (do_recv_k) THEN
2327 294 : CALL comm_exchange%recv(BI_C_rec_3D, proc_receive, tag)
2328 : END IF
2329 154 : IF (do_recv_k) THEN
2330 10 : CALL fill_local_i_aL(local_al_k(:, :, 1:block_size), ranges_info_array(:, :, proc_receive), BI_C_rec_3D)
2331 : END IF
2332 : END DO
2333 :
2334 20172 : IF (.NOT. do_recv_i) local_aL_i(:, :) = local_aL_k(:, :, my_i - my_k + 1)
2335 20172 : IF (.NOT. do_recv_j) local_aL_j(:, :) = local_aL_k(:, :, my_j - my_k + 1)
2336 144 : CALL timestop(handle2)
2337 :
2338 : ! expand integrals
2339 352 : DO kkB = 1, block_size
2340 208 : CALL timeset(routineN//"_exp_ik", handle2)
2341 208 : CALL dgemm_counter_start(dgemm_counter)
2342 832 : ALLOCATE (local_ab(my_virtual, size_B_k))
2343 32408 : local_ab = 0.0_dp
2344 : CALL local_gemm('T', 'N', size_B_i, size_B_k, dimen_RI, 1.0_dp, &
2345 : local_aL_i, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
2346 : 0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), 1:size_B_k), size_B_i, &
2347 208 : mp2_env%local_gemm_ctx)
2348 208 : DO proc_shift = 1, para_env_sub%num_pe - 1
2349 0 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2350 0 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2351 :
2352 0 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
2353 :
2354 0 : external_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, KIND=int_8)*rec_B_size)
2355 :
2356 : CALL comm_exchange%sendrecv(local_aL_i, proc_send, &
2357 0 : external_aL, proc_receive, tag)
2358 :
2359 : CALL local_gemm('T', 'N', rec_B_size, size_B_k, dimen_RI, 1.0_dp, &
2360 : external_aL, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
2361 : 0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:size_B_k), rec_B_size, &
2362 208 : mp2_env%local_gemm_ctx)
2363 : END DO
2364 208 : CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_B_k, dimen_RI)
2365 208 : CALL timestop(handle2)
2366 :
2367 : ! Amplitudes
2368 208 : CALL timeset(routineN//"_tab", handle2)
2369 32408 : t_ab = 0.0_dp
2370 : ! Alpha-alpha, beta-beta and closed shell
2371 208 : IF (.NOT. alpha_beta) THEN
2372 1125 : DO b = 1, size_B_k
2373 1014 : b_global = b + my_B_virtual_start(1) - 1
2374 17135 : DO a = 1, my_B_size(1)
2375 16010 : a_global = a + my_B_virtual_start(1) - 1
2376 : t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
2377 : (Eigenval(my_i, 1) + Eigenval(my_k + kkB - 1, 1) &
2378 17024 : - Eigenval(homo(1) + a_global, 1) - Eigenval(homo(1) + b_global, 1))
2379 : END DO
2380 : END DO
2381 : ELSE
2382 999 : DO b = 1, size_B_k
2383 902 : b_global = b + my_B_virtual_start(kspin) - 1
2384 15273 : DO a = 1, my_B_size(ispin)
2385 14274 : a_global = a + my_B_virtual_start(ispin) - 1
2386 : t_ab(a_global, b) = mp2_env%scale_S*local_ab(a_global, b)/ &
2387 : (Eigenval(my_i, ispin) + Eigenval(my_k + kkB - 1, kspin) &
2388 15176 : - Eigenval(homo(ispin) + a_global, ispin) - Eigenval(homo(kspin) + b_global, kspin))
2389 : END DO
2390 : END DO
2391 : END IF
2392 :
2393 208 : IF (.NOT. alpha_beta) THEN
2394 111 : DO proc_shift = 1, para_env_sub%num_pe - 1
2395 0 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2396 0 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2397 0 : CALL get_group_dist(gd_B_virtual(1), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
2398 0 : CALL get_group_dist(gd_B_virtual(1), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
2399 :
2400 0 : external_ab(1:size_B_i, 1:rec_B_size) => buffer_1D(1:INT(size_B_i, KIND=int_8)*rec_B_size)
2401 : CALL para_env_sub%sendrecv(local_ab(send_B_virtual_start:send_B_virtual_end, 1:size_B_k), proc_send, &
2402 0 : external_ab(1:size_B_i, 1:rec_B_size), proc_receive, tag)
2403 :
2404 111 : DO b = 1, my_B_size(1)
2405 0 : b_global = b + my_B_virtual_start(1) - 1
2406 0 : DO a = 1, rec_B_size
2407 0 : a_global = a + rec_B_virtual_start - 1
2408 : t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
2409 : (Eigenval(my_i, 1) + Eigenval(my_k + kkB - 1, 1) &
2410 0 : - Eigenval(homo(1) + a_global, 1) - Eigenval(homo(1) + b_global, 1))
2411 : END DO
2412 : END DO
2413 : END DO
2414 : END IF
2415 208 : CALL timestop(handle2)
2416 :
2417 : ! Expand the second set of integrals
2418 208 : CALL timeset(routineN//"_exp_jk", handle2)
2419 32408 : local_ab = 0.0_dp
2420 208 : CALL dgemm_counter_start(dgemm_counter)
2421 : CALL local_gemm('T', 'N', size_B_i, size_B_k, dimen_RI, 1.0_dp, &
2422 : local_aL_j, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
2423 : 0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), 1:size_B_k), size_B_i, &
2424 208 : mp2_env%local_gemm_ctx)
2425 208 : DO proc_shift = 1, para_env_sub%num_pe - 1
2426 0 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2427 0 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2428 :
2429 0 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
2430 :
2431 0 : external_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, KIND=int_8)*rec_B_size)
2432 :
2433 : CALL comm_exchange%sendrecv(local_aL_j, proc_send, &
2434 0 : external_aL, proc_receive, tag)
2435 : CALL local_gemm('T', 'N', rec_B_size, size_B_k, dimen_RI, 1.0_dp, &
2436 : external_aL, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
2437 : 0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:size_B_k), rec_B_size, &
2438 208 : mp2_env%local_gemm_ctx)
2439 : END DO
2440 208 : CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_B_k, dimen_RI)
2441 208 : CALL timestop(handle2)
2442 :
2443 208 : CALL timeset(routineN//"_Pij", handle2)
2444 2124 : DO b = 1, size_B_k
2445 1916 : b_global = b + my_B_virtual_start(kspin) - 1
2446 32408 : DO a = 1, my_B_size(ispin)
2447 30284 : a_global = a + my_B_virtual_start(ispin) - 1
2448 : local_ab(a_global, b) = &
2449 : local_ab(a_global, b)/(Eigenval(my_j, ispin) + Eigenval(my_k + kkB - 1, kspin) &
2450 32200 : - Eigenval(homo(ispin) + a_global, ispin) - Eigenval(homo(kspin) + b_global, kspin))
2451 : END DO
2452 : END DO
2453 : !
2454 32408 : P_ij_elem = SUM(local_ab*t_ab)
2455 208 : DEALLOCATE (local_ab)
2456 208 : IF ((.NOT. open_shell) .AND. (.NOT. alpha_beta)) THEN
2457 4 : P_ij_elem = P_ij_elem*2.0_dp
2458 : END IF
2459 208 : IF (beta_beta) THEN
2460 31 : mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) - P_ij_elem
2461 31 : mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) - P_ij_elem
2462 : ELSE
2463 177 : mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) - P_ij_elem
2464 177 : mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) - P_ij_elem
2465 : END IF
2466 1184 : CALL timestop(handle2)
2467 : END DO
2468 : ELSE
2469 2 : CALL timeset(routineN//"_comm", handle2)
2470 : ! no work to be done, possible messeges to be exchanged
2471 4 : DO proc_shift = 1, comm_exchange%num_pe - 1
2472 2 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
2473 2 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
2474 :
2475 2 : send_ijk_index = num_ijk(proc_send)
2476 :
2477 4 : IF (ijk_index <= send_ijk_index) THEN
2478 : ! somethig to send
2479 : ijk_counter_send = (ijk_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
2480 2 : integ_group_pos2color_sub(proc_send)
2481 2 : send_i = ijk_map(1, ijk_counter_send)
2482 2 : send_j = ijk_map(2, ijk_counter_send)
2483 2 : send_k = ijk_map(3, ijk_counter_send)
2484 2 : block_size = ijk_map(4, ijk_counter_send)
2485 :
2486 2 : do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
2487 2 : do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
2488 : ! occupied i
2489 2 : IF (do_send_i) THEN
2490 2 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i), proc_send, tag)
2491 : END IF
2492 : ! occupied j
2493 2 : IF (do_send_j) THEN
2494 0 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_j), proc_send, tag)
2495 : END IF
2496 : ! occupied k
2497 2 : CALL comm_exchange%send(BIb_C(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, tag)
2498 : END IF
2499 :
2500 : END DO ! proc loop
2501 2 : CALL timestop(handle2)
2502 : END IF
2503 : END DO ! ijk_index loop
2504 448 : DEALLOCATE (local_aL_i)
2505 448 : DEALLOCATE (local_aL_j)
2506 448 : DEALLOCATE (local_aL_k)
2507 448 : DEALLOCATE (t_ab)
2508 820 : DEALLOCATE (ijk_map)
2509 : END DO ! over number of loops (ispin)
2510 372 : CALL timestop(handle)
2511 :
2512 744 : END SUBROUTINE Quasi_degenerate_P_ij
2513 :
2514 : ! **************************************************************************************************
2515 : !> \brief ...
2516 : !> \param my_ijk ...
2517 : !> \param homo ...
2518 : !> \param homo_beta ...
2519 : !> \param Eigenval ...
2520 : !> \param mp2_env ...
2521 : !> \param ijk_map ...
2522 : !> \param unit_nr ...
2523 : !> \param ngroup ...
2524 : !> \param do_print_alpha ...
2525 : !> \param comm_exchange ...
2526 : !> \param num_ijk ...
2527 : !> \param max_ijk ...
2528 : !> \param color_sub ...
2529 : !> \param buffer_size ...
2530 : !> \param my_group_L_size ...
2531 : !> \param B_size_k ...
2532 : !> \param para_env ...
2533 : !> \param virtual ...
2534 : !> \param B_size_i ...
2535 : ! **************************************************************************************************
2536 448 : SUBROUTINE Find_quasi_degenerate_ij(my_ijk, homo, homo_beta, Eigenval, mp2_env, ijk_map, unit_nr, ngroup, &
2537 : do_print_alpha, comm_exchange, num_ijk, max_ijk, color_sub, &
2538 : buffer_size, my_group_L_size, B_size_k, para_env, virtual, B_size_i)
2539 :
2540 : INTEGER, INTENT(OUT) :: my_ijk
2541 : INTEGER, INTENT(IN) :: homo, homo_beta
2542 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
2543 : TYPE(mp2_type), INTENT(IN) :: mp2_env
2544 : INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ijk_map
2545 : INTEGER, INTENT(IN) :: unit_nr, ngroup
2546 : LOGICAL, INTENT(IN) :: do_print_alpha
2547 : TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
2548 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT) :: num_ijk
2549 : INTEGER, INTENT(OUT) :: max_ijk
2550 : INTEGER, INTENT(IN) :: color_sub, buffer_size, my_group_L_size, &
2551 : B_size_k
2552 : TYPE(mp_para_env_type), INTENT(IN) :: para_env
2553 : INTEGER, INTENT(IN) :: virtual, B_size_i
2554 :
2555 : INTEGER :: block_size, communication_steps, communication_volume, iib, ij_counter, &
2556 : ijk_counter, jjb, kkb, max_block_size, max_num_k_blocks, min_communication_volume, &
2557 : my_steps, num_k_blocks, num_sing_ij, total_ijk
2558 : INTEGER(KIND=int_8) :: mem
2559 448 : LOGICAL, ALLOCATABLE, DIMENSION(:, :) :: ijk_marker
2560 :
2561 1344 : ALLOCATE (num_ijk(0:comm_exchange%num_pe - 1))
2562 :
2563 448 : num_sing_ij = 0
2564 2026 : DO iiB = 1, homo
2565 : ! diagonal elements already updated
2566 4230 : DO jjB = iiB + 1, homo
2567 2204 : IF (ABS(Eigenval(jjB) - Eigenval(iiB)) < mp2_env%ri_grad%eps_canonical) &
2568 1684 : num_sing_ij = num_sing_ij + 1
2569 : END DO
2570 : END DO
2571 :
2572 448 : IF (unit_nr > 0) THEN
2573 224 : IF (do_print_alpha) THEN
2574 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
2575 148 : "MO_INFO| Number of ij pairs below EPS_CANONICAL:", num_sing_ij
2576 : ELSE
2577 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
2578 76 : "MO_INFO| Number of ij pairs (spin beta) below EPS_CANONICAL:", num_sing_ij
2579 : END IF
2580 : END IF
2581 :
2582 : ! Determine the block size, first guess: use available buffer
2583 448 : max_block_size = buffer_size/(my_group_L_size*B_size_k)
2584 :
2585 : ! Second limit: memory
2586 448 : CALL m_memory(mem)
2587 : ! Convert to number of doubles
2588 448 : mem = mem/8
2589 : ! Remove local_ab (2x) and local_aL_i (2x)
2590 448 : mem = mem - 2_int_8*(virtual*B_size_k + B_size_i*my_group_L_size)
2591 448 : max_block_size = MIN(max_block_size, MAX(1, INT(mem/(my_group_L_size*B_size_k), KIND(max_block_size))))
2592 :
2593 : ! Exchange the limit
2594 448 : CALL para_env%min(max_block_size)
2595 :
2596 : ! Find now the block size which minimizes the communication volume and then the number of communication steps
2597 448 : block_size = 1
2598 448 : min_communication_volume = 3*homo_beta*num_sing_ij
2599 448 : communication_steps = 3*homo_beta*num_sing_ij
2600 1136 : DO iiB = max_block_size, 2, -1
2601 688 : max_num_k_blocks = homo_beta/iiB*num_sing_ij
2602 688 : num_k_blocks = max_num_k_blocks - MOD(max_num_k_blocks, ngroup)
2603 688 : communication_volume = num_k_blocks*(2 + iiB) + 3*(homo_beta*num_sing_ij - iiB*num_k_blocks)
2604 688 : my_steps = num_k_blocks + homo_beta*num_sing_ij - iiB*num_k_blocks
2605 1136 : IF (communication_volume < min_communication_volume) THEN
2606 48 : block_size = iiB
2607 48 : min_communication_volume = communication_volume
2608 48 : communication_steps = my_steps
2609 640 : ELSE IF (communication_volume == min_communication_volume .AND. my_steps < communication_steps) THEN
2610 52 : block_size = iiB
2611 52 : communication_steps = my_steps
2612 : END IF
2613 : END DO
2614 :
2615 448 : IF (unit_nr > 0) THEN
2616 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
2617 224 : "MO_INFO| Block size:", block_size
2618 224 : CALL m_flush(unit_nr)
2619 : END IF
2620 :
2621 : ! Calculate number of large blocks
2622 448 : max_num_k_blocks = homo_beta/block_size*num_sing_ij
2623 448 : num_k_blocks = max_num_k_blocks - MOD(max_num_k_blocks, ngroup)
2624 :
2625 448 : total_ijk = num_k_blocks + homo_beta*num_sing_ij - num_k_blocks*block_size
2626 986 : ALLOCATE (ijk_map(4, total_ijk))
2627 1888 : ijk_map = 0
2628 1434 : ALLOCATE (ijk_marker(homo_beta, num_sing_ij))
2629 970 : ijk_marker = .TRUE.
2630 :
2631 448 : my_ijk = 0
2632 448 : ijk_counter = 0
2633 448 : ij_counter = 0
2634 2026 : DO iiB = 1, homo
2635 : ! diagonal elements already updated
2636 4230 : DO jjB = iiB + 1, homo
2637 2204 : IF (ABS(Eigenval(jjB) - Eigenval(iiB)) >= mp2_env%ri_grad%eps_canonical) CYCLE
2638 106 : ij_counter = ij_counter + 1
2639 1812 : DO kkB = 1, homo_beta - MOD(homo_beta, block_size), block_size
2640 162 : IF (ijk_counter + 1 > num_k_blocks) EXIT
2641 128 : ijk_counter = ijk_counter + 1
2642 384 : ijk_marker(kkB:kkB + block_size - 1, ij_counter) = .FALSE.
2643 128 : ijk_map(1, ijk_counter) = iiB
2644 128 : ijk_map(2, ijk_counter) = jjB
2645 128 : ijk_map(3, ijk_counter) = kkB
2646 128 : ijk_map(4, ijk_counter) = block_size
2647 2332 : IF (MOD(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
2648 : END DO
2649 : END DO
2650 : END DO
2651 : ij_counter = 0
2652 2026 : DO iiB = 1, homo
2653 : ! diagonal elements already updated
2654 4230 : DO jjB = iiB + 1, homo
2655 2204 : IF (ABS(Eigenval(jjB) - Eigenval(iiB)) >= mp2_env%ri_grad%eps_canonical) CYCLE
2656 106 : ij_counter = ij_counter + 1
2657 2100 : DO kkB = 1, homo_beta
2658 2620 : IF (ijk_marker(kkB, ij_counter)) THEN
2659 160 : ijk_counter = ijk_counter + 1
2660 160 : ijk_map(1, ijk_counter) = iiB
2661 160 : ijk_map(2, ijk_counter) = jjB
2662 160 : ijk_map(3, ijk_counter) = kkB
2663 160 : ijk_map(4, ijk_counter) = 1
2664 160 : IF (MOD(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
2665 : END IF
2666 : END DO
2667 : END DO
2668 : END DO
2669 :
2670 448 : DEALLOCATE (ijk_marker)
2671 :
2672 448 : CALL comm_exchange%allgather(my_ijk, num_ijk)
2673 926 : max_ijk = MAXVAL(num_ijk)
2674 :
2675 448 : END SUBROUTINE Find_quasi_degenerate_ij
2676 :
2677 : END MODULE mp2_ri_gpw
|