Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2026 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief GW using RI-RS Approximation for molecules
10 : !> \par History
11 : !> 04.2026 created [Ritaj Tyagi]
12 : ! **************************************************************************************************
13 : MODULE gw_non_periodic_ri_rs
14 : USE atomic_kind_types, ONLY: atomic_kind_type,&
15 : get_atomic_kind_set
16 : USE basis_set_types, ONLY: gto_basis_set_type
17 : USE cell_types, ONLY: cell_type,&
18 : pbc
19 : USE constants_operator, ONLY: operator_coulomb
20 : USE cp_blacs_env, ONLY: cp_blacs_env_create,&
21 : cp_blacs_env_release,&
22 : cp_blacs_env_type
23 : USE cp_dbcsr_api, ONLY: &
24 : dbcsr_add, dbcsr_binary_read, dbcsr_binary_write, dbcsr_copy, dbcsr_create, &
25 : dbcsr_deallocate_matrix, dbcsr_distribution_get, dbcsr_distribution_new, &
26 : dbcsr_distribution_release, dbcsr_distribution_type, dbcsr_finalize, dbcsr_get_block_p, &
27 : dbcsr_get_info, dbcsr_iterator_blocks_left, dbcsr_iterator_next_block, &
28 : dbcsr_iterator_start, dbcsr_iterator_stop, dbcsr_iterator_type, dbcsr_multiply, &
29 : dbcsr_p_type, dbcsr_put_block, dbcsr_release, dbcsr_scale, dbcsr_set, dbcsr_type, &
30 : dbcsr_type_no_symmetry
31 : USE cp_dbcsr_contrib, ONLY: dbcsr_reserve_all_blocks
32 : USE cp_dbcsr_operations, ONLY: copy_dbcsr_to_fm,&
33 : copy_fm_to_dbcsr,&
34 : dbcsr_deallocate_matrix_set,&
35 : max_elements_per_block
36 : USE cp_files, ONLY: close_file,&
37 : open_file
38 : USE cp_fm_basic_linalg, ONLY: cp_fm_scale_and_add,&
39 : cp_fm_uplo_to_full
40 : USE cp_fm_cholesky, ONLY: cp_fm_cholesky_decompose,&
41 : cp_fm_cholesky_invert,&
42 : cp_fm_cholesky_solve
43 : USE cp_fm_diag, ONLY: cp_fm_power
44 : USE cp_fm_struct, ONLY: cp_fm_struct_create,&
45 : cp_fm_struct_release,&
46 : cp_fm_struct_type
47 : USE cp_fm_types, ONLY: cp_fm_create,&
48 : cp_fm_get_info,&
49 : cp_fm_get_submatrix,&
50 : cp_fm_release,&
51 : cp_fm_set_all,&
52 : cp_fm_set_submatrix,&
53 : cp_fm_to_fm,&
54 : cp_fm_type
55 : USE cp_log_handling, ONLY: cp_get_default_logger,&
56 : cp_logger_type
57 : USE cp_output_handling, ONLY: cp_p_file,&
58 : cp_print_key_should_output
59 : USE gw_integrals, ONLY: build_3c_integral_block_ctx,&
60 : gw_3c_ctx_create,&
61 : gw_3c_ctx_release,&
62 : gw_3c_ctx_type,&
63 : gw_3c_ws_create,&
64 : gw_3c_ws_release,&
65 : gw_3c_ws_type
66 : USE gw_large_cell_gamma, ONLY: &
67 : Fourier_transform_w_to_t, G_occ_vir, compute_QP_energies, compute_fm_chi_Gamma_freq, &
68 : create_fm_W_MIC_time, delete_unnecessary_files, fill_fm_Sigma_c_Gamma_time, fm_write, &
69 : multiply_fm_W_MIC_time_with_Minv_Gamma
70 : USE gw_utils, ONLY: de_init_bs_env
71 : USE input_constants, ONLY: rtp_method_bse
72 : USE input_section_types, ONLY: section_vals_type
73 : USE kinds, ONLY: default_path_length,&
74 : default_string_length,&
75 : dp
76 : USE kpoint_coulomb_2c, ONLY: build_2c_coulomb_matrix_kp
77 : USE machine, ONLY: m_walltime
78 : USE message_passing, ONLY: mp_para_env_type
79 : USE mp2_ri_2c, ONLY: RI_2c_integral_mat
80 : USE orbital_pointers, ONLY: indco,&
81 : ncoset
82 : USE parallel_gemm_api, ONLY: parallel_gemm
83 : USE particle_types, ONLY: particle_type
84 : USE post_scf_bandstructure_types, ONLY: post_scf_bandstructure_type
85 : USE qs_environment_types, ONLY: get_qs_env,&
86 : qs_environment_type
87 : USE qs_kind_types, ONLY: get_qs_kind,&
88 : qs_kind_type
89 : #include "./base/base_uses.f90"
90 :
91 : IMPLICIT NONE
92 :
93 : PRIVATE
94 :
95 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'gw_non_periodic_ri_rs'
96 :
97 : PUBLIC :: gw_calc_non_periodic_ri_rs, ri_rs_grid_assembler, &
98 : get_basis_offsets, precompute_ri_rs_radii, solve_D_lp_distributed
99 :
100 : CONTAINS
101 :
102 : ! **************************************************************************************************
103 : !> \brief GW calculation using RI-RS formalism for molecules
104 : !> \param qs_env ...
105 : !> \param bs_env ...
106 : ! **************************************************************************************************
107 :
108 10 : SUBROUTINE gw_calc_non_periodic_ri_rs(qs_env, bs_env)
109 :
110 : TYPE(qs_environment_type), POINTER :: qs_env
111 : TYPE(post_scf_bandstructure_type), POINTER :: bs_env
112 :
113 : CHARACTER(LEN=*), PARAMETER :: routineN = 'gw_calc_non_periodic_ri_rs'
114 :
115 : INTEGER :: handle
116 10 : TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:) :: fm_Sigma_x_Gamma, fm_W_time
117 10 : TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :, :) :: fm_Sigma_c_Gamma_time
118 :
119 10 : CALL timeset(routineN, handle)
120 :
121 : !!========================================================================
122 : !! 0. Precompute AO and RI Radii
123 : !! Determine per-atom cutoff radii from the most diffuse Gaussian
124 : !! primitives in the AO and RI auxiliary basis sets.
125 : !! Equations:
126 : !! a. α_min,ao = min_{i,j} { ζ_ao(i,j) | ζ_ao(i,j) > 10⁻³ }
127 : !! b. α_min,ri = min_{i,j} { ζ_ri(i,j) | ζ_ri(i,j) > 10⁻³ }
128 : !! c. r_ao = sqrt( -ln(ε) / α_min,ao ) (radius_ao_per_atom)
129 : !! d. r_ri = sqrt( -ln(ε) / α_min,ri ) (radius_ri_per_atom)
130 : !!========================================================================
131 10 : CALL precompute_ri_rs_radii(qs_env, bs_env)
132 :
133 : !!========================================================================
134 : !! 1. Grid Generation for RI-RS
135 : !! (Modified Lebedev grids from Ivan Duchemin and Xavier Blase)
136 : !! Generate flattened 1D array of grid points for RI-RS.
137 : !! Equation: r_g(k) = R_A + r_g(A)
138 : !!========================================================================
139 10 : CALL ri_rs_grid_assembler(qs_env, bs_env, bs_env%ri_rs%grid_points)
140 :
141 : !!========================================================================
142 : !! 2. Atomic Basis Evaluation
143 : !! Compute values of spherical atomic basis functions at grid points.
144 : !! Expression: Φ_μl = Φ_μ(r_l) (mat_phi_mu_l)
145 : !!========================================================================
146 : CALL atomic_basis_at_grid_point(qs_env, bs_env, bs_env%ri_rs%grid_points, &
147 10 : bs_env%ri_rs%mat_phi_mu_l)
148 :
149 : !!========================================================================
150 : !! 3. Compute RI-RS Coefficients (Z_lp)
151 : !! Solve the regularized system for each atom P, where the grid domain
152 : !! is restricted to r_l within a cutoff distance of atom P:
153 : !! a. D_ll' = [ Σ_μ Φ_μ(r_l) Φ_μ(r_l') ]^2 (Equation 13)
154 : !! b. D_lP = Σ_{μν} Φ_μ(r_l) Φ_ν(r_l) (μν|P) (Equation 15)
155 : !! c. Conditioning:
156 : !! Dvec_l = 1 / sqrt(D_ll) (Diagonal scaling vector)
157 : !! D'_ll' = Dvec_l * D_ll' * Dvec_l' + λδ_ll'
158 : !! D'_lP = Dvec_l * D_lP
159 : !! d. Solve: Σ_l' D'_ll' * Z'_l'P = D'_lP (Equation 14)
160 : !! e. Rescale: Z_lP = Z'_lP * Dvec_l (Z_lP stored in mat_Z_lP)
161 : !!========================================================================
162 : CALL compute_coeff_Z_lP(qs_env, bs_env, bs_env%ri_rs%grid_points, &
163 10 : bs_env%ri_rs%mat_phi_mu_l, bs_env%ri_rs%mat_Z_lP)
164 :
165 : !!========================================================================
166 : !! 4. Compute Independent-Particle Polarizability (χ)
167 : !! G^occ_µλ(i|τ|) = sum_n^occ C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
168 : !! G^vir_µλ(i|τ|) = sum_n^vir C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
169 : !! G^occ_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^occ_µν Φ_ν(r_l')
170 : !! G^vir_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^vir_µν Φ_ν(r_l')
171 : !! χ_ll'(iτ) = G^occ_ll'(i|τ|) * G^vir_ll'(i|τ|)
172 : !! χ_PQ(iτ) = sum_ll' Z_lP χ_ll'(iτ) Z_l'Q
173 : !!========================================================================
174 : CALL get_mat_chi_Gamma_tau(bs_env, bs_env%mat_chi_Gamma_tau, &
175 10 : bs_env%ri_rs%mat_phi_mu_l, bs_env%ri_rs%mat_Z_lP)
176 :
177 : !!========================================================================
178 : !! 5. Compute Screened Interaction (W)
179 : !! χ_PQ(iτ) -> χ_PQ(iω) -> ε_PQ(iω) -> W_PQ(iω) -> W_PQ(iτ)
180 : !!========================================================================
181 10 : CALL compute_W(bs_env, qs_env, bs_env%mat_chi_Gamma_tau, fm_W_time)
182 :
183 : !!========================================================================
184 : !! 6. Compute Exact Exchange Self-Energy (Σ^x)
185 : !! D_µν = sum_n^occ C_µn C_νn
186 : !! D_ll' = sum_µν Φ_µ(r_l) D_µν Φ_ν(r_l')
187 : !! V^trunc_ll' = sum_PQ Z_lP V^trunc_PQ Z_l'Q
188 : !! Σ^x_ll' = D_ll' * V^trunc_ll'
189 : !! Σ^x_λσ(k=0) = -sum_ll' Φ_λ(r_l) Σ^x_ll' Φ_σ(r_l')
190 : !!========================================================================
191 : CALL compute_Sigma_x(bs_env, qs_env, bs_env%ri_rs%mat_phi_mu_l, &
192 10 : bs_env%ri_rs%mat_Z_lP, fm_Sigma_x_Gamma)
193 :
194 : !!========================================================================
195 : !! 7. Compute Correlation Self-Energy (Σ^c)
196 : !! W_ll'(iτ) = sum_PQ Z_lP W^MIC_PQ(iτ) Z_l'Q
197 : !! Σ^c_ll'(iτ) = -G^occ_ll'(i|τ|) * W_ll'(iτ), for τ < 0
198 : !! Σ^c_ll'(iτ) = G^vir_ll'(i|τ|) * W_ll'(iτ), for τ > 0
199 : !! Σ^c_λσ(iτ) = sum_ll' Φ_λ(r_l) Σ^c_ll'(iτ) Φ_σ(r_l')
200 : !!========================================================================
201 : CALL compute_Sigma_c(bs_env, fm_W_time, bs_env%ri_rs%mat_phi_mu_l, &
202 10 : bs_env%ri_rs%mat_Z_lP, fm_Sigma_c_Gamma_time)
203 :
204 : !!========================================================================
205 : !! 8. Compute Quasiparticle Energies
206 : !! Σ^c_λσ(iτ) -> Σ^c_nn(ϵ)
207 : !! ϵ_nk^GW = ϵ_nk^DFT + Σ^c_nn(ϵ) + Σ^x_nn - v^xc_nn
208 : !!========================================================================
209 10 : CALL compute_QP_energies(bs_env, qs_env, fm_Sigma_x_Gamma, fm_Sigma_c_Gamma_time)
210 :
211 10 : CALL de_init_bs_env(bs_env)
212 :
213 10 : CALL timestop(handle)
214 :
215 10 : END SUBROUTINE gw_calc_non_periodic_ri_rs
216 :
217 : ! **************************************************************************************************
218 : !> \brief Compute per-atom AO and RI basis radii from the most diffuse Gaussian
219 : !> primitive in the AO ("ORB") and RI auxiliary ("RI_AUX") basis sets.
220 : !> Stores results in bs_env%ri_rs%radius_ao_per_atom(:) and
221 : !> bs_env%ri_rs%radius_ri_per_atom(:) and prints a per-atom table.
222 : !> Radius: r_kind = sqrt(-log(eps) / alpha_min_kind)
223 : !> with eps = eps_filter.
224 : !> \param qs_env ...
225 : !> \param bs_env ...
226 : ! **************************************************************************************************
227 :
228 10 : SUBROUTINE precompute_ri_rs_radii(qs_env, bs_env)
229 :
230 : TYPE(qs_environment_type), POINTER :: qs_env
231 : TYPE(post_scf_bandstructure_type), POINTER :: bs_env
232 :
233 : CHARACTER(LEN=*), PARAMETER :: routineN = 'precompute_ri_rs_radii'
234 :
235 : INTEGER :: handle, i, iatom, ikind, j, natom, nkind
236 10 : INTEGER, ALLOCATABLE, DIMENSION(:) :: kind_of
237 : REAL(KIND=dp) :: eps
238 10 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: alpha_min_ao_kind, alpha_min_ri_kind
239 10 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: zet_ao, zet_ri
240 10 : TYPE(atomic_kind_type), DIMENSION(:), POINTER :: atomic_kind_set
241 10 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
242 :
243 10 : CALL timeset(routineN, handle)
244 :
245 : CALL get_qs_env(qs_env, nkind=nkind, atomic_kind_set=atomic_kind_set, &
246 10 : particle_set=particle_set)
247 10 : natom = SIZE(particle_set)
248 :
249 10 : eps = bs_env%eps_filter
250 :
251 40 : ALLOCATE (alpha_min_ao_kind(nkind), alpha_min_ri_kind(nkind))
252 28 : alpha_min_ao_kind = HUGE(1.0_dp)
253 28 : alpha_min_ri_kind = HUGE(1.0_dp)
254 :
255 28 : DO ikind = 1, nkind
256 18 : zet_ao => bs_env%basis_set_AO(ikind)%gto_basis_set%zet
257 18 : zet_ri => bs_env%basis_set_RI(ikind)%gto_basis_set%zet
258 :
259 72 : DO i = 1, SIZE(zet_ao, 1)
260 156 : DO j = 1, SIZE(zet_ao, 2)
261 84 : IF (zet_ao(i, j) > 1.0E-3_dp) &
262 138 : alpha_min_ao_kind(ikind) = MIN(alpha_min_ao_kind(ikind), zet_ao(i, j))
263 : END DO
264 : END DO
265 46 : DO i = 1, SIZE(zet_ri, 1)
266 110 : DO j = 1, SIZE(zet_ri, 2)
267 74 : IF (zet_ri(i, j) > 1.0E-3_dp) &
268 92 : alpha_min_ri_kind(ikind) = MIN(alpha_min_ri_kind(ikind), zet_ri(i, j))
269 : END DO
270 : END DO
271 : END DO
272 :
273 10 : CALL get_atomic_kind_set(atomic_kind_set=atomic_kind_set, kind_of=kind_of)
274 :
275 30 : ALLOCATE (bs_env%ri_rs%radius_ao_per_atom(natom))
276 20 : ALLOCATE (bs_env%ri_rs%radius_ri_per_atom(natom))
277 38 : DO iatom = 1, natom
278 28 : ikind = kind_of(iatom)
279 28 : bs_env%ri_rs%radius_ao_per_atom(iatom) = SQRT(-LOG(eps)/alpha_min_ao_kind(ikind))
280 38 : bs_env%ri_rs%radius_ri_per_atom(iatom) = SQRT(-LOG(eps)/alpha_min_ri_kind(ikind))
281 : END DO
282 :
283 10 : IF (bs_env%unit_nr > 0) THEN
284 5 : WRITE (bs_env%unit_nr, '(T2,A)') 'Per-kind RI-RS basis radii (Bohr):'
285 5 : WRITE (bs_env%unit_nr, '(T4,A6,2X,A4,2A14)') 'Kind', 'Elem', 'r_AO', 'r_RI'
286 14 : DO ikind = 1, nkind
287 : WRITE (bs_env%unit_nr, '(T4,I6,2X,A4,2F14.4)') &
288 9 : ikind, &
289 9 : atomic_kind_set(ikind)%element_symbol, &
290 9 : SQRT(-LOG(eps)/alpha_min_ao_kind(ikind)), &
291 23 : SQRT(-LOG(eps)/alpha_min_ri_kind(ikind))
292 : END DO
293 5 : WRITE (bs_env%unit_nr, '(A)') ' '
294 : END IF
295 :
296 10 : DEALLOCATE (alpha_min_ao_kind, alpha_min_ri_kind, kind_of)
297 :
298 10 : CALL timestop(handle)
299 :
300 10 : END SUBROUTINE precompute_ri_rs_radii
301 :
302 : ! **************************************************************************************************
303 : !> \brief Compute grid points for RI-RS
304 : !> Right now based on Ivan and Xavier implementation
305 : !> JCP 150, 174120 (2019), JCTC 17, 2383 (2021)
306 : !> \param qs_env ...
307 : !> \param bs_env ...
308 : !> \param ri_rs_grid_points ...
309 : ! **************************************************************************************************
310 :
311 10 : SUBROUTINE ri_rs_grid_assembler(qs_env, bs_env, ri_rs_grid_points)
312 :
313 : TYPE(qs_environment_type), POINTER :: qs_env
314 : TYPE(post_scf_bandstructure_type), POINTER :: bs_env
315 : REAL(KIND=dp), ALLOCATABLE, INTENT(OUT) :: ri_rs_grid_points(:, :)
316 :
317 : CHARACTER(LEN=*), PARAMETER :: routineN = 'ri_rs_grid_assembler'
318 :
319 : INTEGER :: atom_idx, end_idx, handle, ikind, j, &
320 : natom, nkind, start_idx, &
321 : total_grid_npts
322 10 : INTEGER, ALLOCATABLE :: atom_to_kind(:), ri_rs_grid_offsets(:)
323 : REAL(KIND=dp) :: atomic_center(3)
324 10 : TYPE(atomic_kind_type), DIMENSION(:), POINTER :: atomic_kind_set
325 10 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
326 :
327 10 : CALL timeset(routineN, handle)
328 :
329 : !! Get the information about the atoms in the system
330 10 : CALL get_qs_env(qs_env, atomic_kind_set=atomic_kind_set, particle_set=particle_set)
331 :
332 10 : nkind = SIZE(atomic_kind_set)
333 10 : natom = SIZE(particle_set)
334 :
335 : !! 1. Build the grid cache
336 10 : CALL build_grid_cache(bs_env, atomic_kind_set)
337 :
338 : !! 2. Calculate grid counts and offsets
339 30 : ALLOCATE (ri_rs_grid_offsets(natom + 1))
340 30 : ALLOCATE (atom_to_kind(natom))
341 :
342 10 : total_grid_npts = 0
343 28 : DO ikind = 1, nkind
344 56 : DO j = 1, SIZE(atomic_kind_set(ikind)%atom_list)
345 28 : atom_idx = atomic_kind_set(ikind)%atom_list(j)
346 28 : atom_to_kind(atom_idx) = ikind
347 28 : ri_rs_grid_offsets(atom_idx) = total_grid_npts + 1
348 46 : total_grid_npts = total_grid_npts + bs_env%ri_rs%grid_cache(ikind)%npts
349 : END DO
350 : END DO
351 :
352 10 : ri_rs_grid_offsets(natom + 1) = total_grid_npts + 1
353 :
354 10 : IF (bs_env%unit_nr > 0) THEN
355 : WRITE (bs_env%unit_nr, FMT="(T2,A,T69,I12)") &
356 5 : 'Total grid points used for RI-RS:', total_grid_npts
357 5 : WRITE (bs_env%unit_nr, "(A)") ' '
358 : END IF
359 :
360 : !! 3. Allocate the global ri_rs_grid arrays
361 30 : ALLOCATE (ri_rs_grid_points(3, total_grid_npts))
362 :
363 : !! 4. Parallelize the grid generation loop
364 : !$OMP PARALLEL DO DEFAULT(NONE) &
365 : !$OMP SHARED(ri_rs_grid_points, ri_rs_grid_offsets, atom_to_kind, &
366 : !$OMP particle_set, bs_env, natom) &
367 : !$OMP PRIVATE(atom_idx, ikind, atomic_center, start_idx, end_idx) &
368 10 : !$OMP SCHEDULE(DYNAMIC, 1)
369 : DO atom_idx = 1, natom
370 : ikind = atom_to_kind(atom_idx)
371 : atomic_center(:) = particle_set(atom_idx)%r(:)
372 :
373 : start_idx = ri_rs_grid_offsets(atom_idx)
374 : end_idx = start_idx + bs_env%ri_rs%grid_cache(ikind)%npts - 1
375 :
376 : !! Shift the cached origin grid by the atom's center
377 : ri_rs_grid_points(1, start_idx:end_idx) = bs_env%ri_rs%grid_cache(ikind)%raw_points(1, :) + atomic_center(1)
378 : ri_rs_grid_points(2, start_idx:end_idx) = bs_env%ri_rs%grid_cache(ikind)%raw_points(2, :) + atomic_center(2)
379 : ri_rs_grid_points(3, start_idx:end_idx) = bs_env%ri_rs%grid_cache(ikind)%raw_points(3, :) + atomic_center(3)
380 :
381 : END DO
382 : !$OMP END PARALLEL DO
383 :
384 : !! 5. Cleanup memory
385 10 : IF (ALLOCATED(bs_env%ri_rs%grid_cache)) THEN
386 28 : DO ikind = 1, nkind
387 28 : IF (ALLOCATED(bs_env%ri_rs%grid_cache(ikind)%raw_points)) DEALLOCATE (bs_env%ri_rs%grid_cache(ikind)%raw_points)
388 : END DO
389 28 : DEALLOCATE (bs_env%ri_rs%grid_cache)
390 : END IF
391 :
392 10 : DEALLOCATE (atom_to_kind, ri_rs_grid_offsets)
393 10 : CALL timestop(handle)
394 :
395 20 : END SUBROUTINE ri_rs_grid_assembler
396 :
397 : ! **************************************************************************************************
398 : !> \brief Reads grids from .ion files and stores them in memory based on grid_select
399 : !> \param bs_env ...
400 : !> \param atomic_kind_set ...
401 : ! **************************************************************************************************
402 :
403 10 : SUBROUTINE build_grid_cache(bs_env, atomic_kind_set)
404 :
405 : TYPE(post_scf_bandstructure_type), POINTER :: bs_env
406 : TYPE(atomic_kind_type), DIMENSION(:), POINTER :: atomic_kind_set
407 :
408 : CHARACTER(LEN=default_path_length) :: filename, full_path, line
409 : CHARACTER(LEN=default_string_length) :: atom_sym, suffix
410 : INTEGER :: colon_idx, i, ierr, ikind, iunit, nkind
411 : REAL(KIND=dp) :: pt(3)
412 :
413 : ! Determine the file suffix based on the user's choice
414 10 : IF (bs_env%ri_rs%grid_select == 1) THEN
415 10 : suffix = "_def2-tzvp-rs.ion"
416 0 : ELSE IF (bs_env%ri_rs%grid_select == 2) THEN
417 0 : suffix = "_cc-pvtz-rs.ion"
418 : ELSE
419 0 : CPABORT("Unknown grid_select value. Valid options are 1 (def2-TZVPP) or 2 (cc-pVTZ).")
420 : END IF
421 :
422 10 : nkind = SIZE(atomic_kind_set)
423 48 : IF (.NOT. ALLOCATED(bs_env%ri_rs%grid_cache)) ALLOCATE (bs_env%ri_rs%grid_cache(nkind))
424 :
425 28 : DO ikind = 1, nkind
426 18 : atom_sym = TRIM(atomic_kind_set(ikind)%element_symbol)
427 18 : filename = TRIM(atom_sym)//TRIM(suffix)
428 :
429 18 : full_path = "ri_rs_grid/"//TRIM(filename)
430 :
431 : CALL open_file(file_name=TRIM(full_path), unit_number=iunit, &
432 18 : file_action="READ", file_status="OLD")
433 :
434 : ! Parse the preamble for 'n points'
435 18 : bs_env%ri_rs%grid_cache(ikind)%npts = 0
436 : DO
437 612 : READ (iunit, '(A)', IOSTAT=ierr) line
438 612 : IF (ierr /= 0) EXIT
439 612 : IF (INDEX(line, 'n points') > 0) THEN
440 18 : colon_idx = INDEX(line, ':')
441 18 : READ (line(colon_idx + 1:), *) bs_env%ri_rs%grid_cache(ikind)%npts
442 18 : EXIT
443 : END IF
444 : END DO
445 :
446 : ! Allocate the cache array for this specific element
447 54 : ALLOCATE (bs_env%ri_rs%grid_cache(ikind)%raw_points(3, bs_env%ri_rs%grid_cache(ikind)%npts))
448 :
449 18 : REWIND (iunit)
450 : DO
451 648 : READ (iunit, '(A)', IOSTAT=ierr) line
452 648 : IF (ierr /= 0) EXIT
453 648 : IF (INDEX(line, '<grid_points>') > 0) EXIT
454 : END DO
455 :
456 : ! Read the raw grid coordinates
457 4466 : DO i = 1, bs_env%ri_rs%grid_cache(ikind)%npts
458 4448 : READ (iunit, *, IOSTAT=ierr) pt(1), pt(2), pt(3)
459 4448 : IF (ierr /= 0) THEN
460 0 : CPABORT("Unexpected EOF in grid file ")
461 : END IF
462 17810 : bs_env%ri_rs%grid_cache(ikind)%raw_points(:, i) = pt(:)
463 : END DO
464 :
465 28 : CALL close_file(unit_number=iunit)
466 : END DO
467 :
468 10 : END SUBROUTINE build_grid_cache
469 :
470 : ! **************************************************************************************************
471 : !> \brief Evaluates atomic basis functions on a real-space grid and builds a sparse DBCSR matrix.
472 : !> \param qs_env ...
473 : !> \param bs_env ...
474 : !> \param ri_rs_grid_points ...
475 : !> \param mat_phi_mu_l ...
476 : ! **************************************************************************************************
477 :
478 10 : SUBROUTINE atomic_basis_at_grid_point(qs_env, bs_env, ri_rs_grid_points, mat_phi_mu_l)
479 :
480 : TYPE(qs_environment_type), POINTER :: qs_env
481 : TYPE(post_scf_bandstructure_type), POINTER :: bs_env
482 : REAL(KIND=dp), ALLOCATABLE, INTENT(INOUT) :: ri_rs_grid_points(:, :)
483 : TYPE(dbcsr_type), INTENT(OUT) :: mat_phi_mu_l
484 :
485 : CHARACTER(LEN=*), PARAMETER :: routineN = 'atomic_basis_at_grid_point'
486 :
487 : INTEGER :: c_size, chunk_size, dimen_ORB, handle, i, i_blk, iatom, natom, npcol, nprow, &
488 : num_grid_chunks, r_end, r_start, total_grid_npts
489 : INTEGER, ALLOCATABLE, DIMENSION(:) :: first_sgf
490 10 : INTEGER, DIMENSION(:), POINTER :: c_blk_sizes, col_dist, col_dist_ks, &
491 10 : r_blk_sizes, row_dist, row_dist_ks
492 10 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: atom_col_buffer
493 10 : TYPE(atomic_kind_type), DIMENSION(:), POINTER :: atomic_kind_set
494 : TYPE(cell_type), POINTER :: cell
495 : TYPE(dbcsr_distribution_type) :: dist
496 : TYPE(dbcsr_distribution_type), POINTER :: dbcsr_dist_ks
497 : TYPE(mp_para_env_type), POINTER :: para_env
498 10 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
499 10 : TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
500 :
501 10 : CALL timeset(routineN, handle)
502 :
503 10 : chunk_size = max_elements_per_block
504 :
505 : ! Extract environment variables
506 : CALL get_qs_env(qs_env, cell=cell, atomic_kind_set=atomic_kind_set, &
507 : qs_kind_set=qs_kind_set, particle_set=particle_set, &
508 10 : para_env=para_env)
509 :
510 10 : natom = SIZE(particle_set)
511 10 : total_grid_npts = SIZE(ri_rs_grid_points, 2)
512 :
513 : ! Map the starting indices of spherical gaussian functions (SGF) for each atom
514 30 : ALLOCATE (first_sgf(natom + 1))
515 10 : CALL get_basis_offsets(particle_set, qs_kind_set, first_sgf, dimen_ORB)
516 :
517 : ! =========================================================================
518 : ! 1. SETUP DBCSR MATRIX TOPOLOGY
519 : ! =========================================================================
520 :
521 : ! A. Define Column Block Sizes (1 Block = 1 Atom's full basis set)
522 30 : ALLOCATE (c_blk_sizes(natom))
523 38 : DO iatom = 1, natom
524 38 : c_blk_sizes(iatom) = first_sgf(iatom + 1) - first_sgf(iatom)
525 : END DO
526 :
527 : ! B. Define Row Block Sizes (Grid chunks of max size 256)
528 10 : num_grid_chunks = CEILING(REAL(total_grid_npts, KIND=dp)/REAL(chunk_size, KIND=dp))
529 30 : ALLOCATE (r_blk_sizes(num_grid_chunks))
530 204 : r_blk_sizes = chunk_size
531 10 : IF (MOD(total_grid_npts, chunk_size) /= 0) THEN
532 0 : r_blk_sizes(num_grid_chunks) = MOD(total_grid_npts, chunk_size)
533 : END IF
534 :
535 : ! C. Fetch CP2K's Default Process Grid Configuration
536 10 : CALL get_qs_env(qs_env, dbcsr_dist=dbcsr_dist_ks)
537 10 : CALL dbcsr_distribution_get(dbcsr_dist_ks, row_dist=row_dist_ks, col_dist=col_dist_ks)
538 :
539 : ! D. Build Custom Mappings using Round-Robin across the 2D process grid
540 : ! (MAXVAL + 1 accounts for 0-based process indexing in DBCSR)
541 38 : nprow = MAXVAL(row_dist_ks) + 1
542 38 : npcol = MAXVAL(col_dist_ks) + 1
543 :
544 20 : ALLOCATE (row_dist(num_grid_chunks))
545 204 : DO i = 1, num_grid_chunks
546 204 : row_dist(i) = MOD(i - 1, nprow)
547 : END DO
548 :
549 20 : ALLOCATE (col_dist(natom))
550 38 : DO i = 1, natom
551 38 : col_dist(i) = MOD(i - 1, npcol)
552 : END DO
553 :
554 : ! E. Create the DBCSR Distribution and Initialize the Matrix
555 : CALL dbcsr_distribution_new(dist, template=dbcsr_dist_ks, &
556 10 : row_dist=row_dist, col_dist=col_dist)
557 :
558 : CALL dbcsr_create(mat_phi_mu_l, name="phi_val_sparse", dist=dist, &
559 : matrix_type=dbcsr_type_no_symmetry, &
560 10 : row_blk_size=r_blk_sizes, col_blk_size=c_blk_sizes)
561 :
562 : ! =========================================================================
563 : ! 2. STREAM DATA DIRECTLY INTO SPARSE MATRIX
564 : ! =========================================================================
565 : ! Iterate over the atoms assigned to this specific MPI rank
566 10 : DO iatom = para_env%mepos + 1, natom, para_env%num_pe
567 :
568 14 : c_size = c_blk_sizes(iatom)
569 :
570 : ! Allocate a temporary dense buffer just for this specific atom
571 56 : ALLOCATE (atom_col_buffer(total_grid_npts, c_size))
572 14 : atom_col_buffer = 0.0_dp
573 :
574 : ! Evaluate the basis functions on the grid. Skip grid points outside
575 : ! the spatial extent of the most diffuse AO Gaussian on iatom; beyond
576 : ! that radius the contribution is guaranteed below eps_filter.
577 : CALL fill_phi_for_atom(atom_col_buffer, ri_rs_grid_points, total_grid_npts, &
578 : iatom, particle_set, qs_kind_set, cell, &
579 14 : bs_env%ri_rs%radius_ao_per_atom(iatom)**2)
580 :
581 : ! Slice the dense column into chunks and insert into DBCSR
582 284 : DO i_blk = 1, num_grid_chunks
583 270 : r_start = (i_blk - 1)*chunk_size + 1
584 270 : r_end = MIN(i_blk*chunk_size, total_grid_npts)
585 :
586 : ! Apply dynamic sparsity filtering: Only store blocks with physical significance
587 24770 : IF (MAXVAL(ABS(atom_col_buffer(r_start:r_end, 1:c_size))) > bs_env%eps_filter) THEN
588 : CALL dbcsr_put_block(mat_phi_mu_l, row=i_blk, col=iatom, &
589 270 : block=atom_col_buffer(r_start:r_end, 1:c_size))
590 : END IF
591 : END DO
592 :
593 14 : DEALLOCATE (atom_col_buffer)
594 :
595 : END DO
596 :
597 : ! Finalize triggers internal MPI communication to route blocks to their correct 2D process owners
598 10 : CALL dbcsr_finalize(mat_phi_mu_l)
599 :
600 : ! -------------------------------------------------------------------------
601 : ! CLEANUP
602 : ! -------------------------------------------------------------------------
603 10 : DEALLOCATE (first_sgf, r_blk_sizes, c_blk_sizes, row_dist, col_dist)
604 10 : CALL dbcsr_distribution_release(dist)
605 :
606 10 : CALL timestop(handle)
607 :
608 40 : END SUBROUTINE atomic_basis_at_grid_point
609 :
610 : ! **************************************************************************************************
611 : !> \brief Compute value of all basis function for single atom across all grid points
612 : !> \param phi_val ...
613 : !> \param ri_rs_grid ...
614 : !> \param npts ...
615 : !> \param iatom ...
616 : !> \param particle_set ...
617 : !> \param qs_kind_set ...
618 : !> \param cell ...
619 : !> \param r2_threshold ...
620 : ! **************************************************************************************************
621 :
622 45 : SUBROUTINE fill_phi_for_atom(phi_val, ri_rs_grid, npts, iatom, &
623 : particle_set, qs_kind_set, cell, r2_threshold)
624 :
625 : REAL(KIND=dp), INTENT(INOUT) :: phi_val(:, :)
626 : INTEGER, INTENT(IN) :: npts
627 : REAL(KIND=dp), INTENT(IN) :: ri_rs_grid(3, npts)
628 : INTEGER, INTENT(IN) :: iatom
629 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
630 : TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
631 : TYPE(cell_type), POINTER :: cell
632 : REAL(KIND=dp), INTENT(IN) :: r2_threshold
633 :
634 : INTEGER :: first_sgf, i_pt, ico, iend_co, ikind, &
635 : ipgf, iset, isgf, ishell, istart_co, &
636 : l, last_sgf, lx, ly, lz, n_cart_total, &
637 : row_idx
638 : REAL(KIND=dp) :: alpha, dist_vec(3), exp_val, poly, r2, &
639 : r_atom(3), weight
640 : TYPE(gto_basis_set_type), POINTER :: orb_basis_set
641 :
642 : ! Get Atom Info
643 45 : ikind = particle_set(iatom)%atomic_kind%kind_number
644 45 : CALL get_qs_kind(qs_kind_set(ikind), basis_set=orb_basis_set, basis_type="ORB")
645 45 : IF (.NOT. ASSOCIATED(orb_basis_set)) RETURN
646 :
647 180 : r_atom = particle_set(iatom)%r
648 :
649 : !$OMP PARALLEL DO DEFAULT(NONE) &
650 : !$OMP SHARED(phi_val, ri_rs_grid, npts, orb_basis_set, r_atom, cell, ncoset, indco, &
651 : !$OMP r2_threshold) &
652 : !$OMP PRIVATE(i_pt, dist_vec, r2, iset, n_cart_total, ishell, l, istart_co, iend_co, &
653 : !$OMP first_sgf, last_sgf, ipgf, alpha, exp_val, isgf, ico, row_idx, weight, &
654 : !$OMP lx, ly, lz, poly) &
655 45 : !$OMP SCHEDULE(STATIC)
656 : DO i_pt = 1, npts
657 : dist_vec = pbc(ri_rs_grid(:, i_pt) - r_atom, cell)
658 : r2 = DOT_PRODUCT(dist_vec, dist_vec)
659 : IF (r2 > r2_threshold) CYCLE
660 :
661 : DO iset = 1, orb_basis_set%nset
662 : n_cart_total = ncoset(orb_basis_set%lmax(iset))
663 :
664 : DO ishell = 1, orb_basis_set%nshell(iset)
665 : l = orb_basis_set%l(ishell, iset)
666 : istart_co = ncoset(l - 1) + 1
667 : iend_co = ncoset(l)
668 :
669 : first_sgf = orb_basis_set%first_sgf(ishell, iset)
670 : last_sgf = orb_basis_set%last_sgf(ishell, iset)
671 :
672 : DO ipgf = 1, orb_basis_set%npgf(iset)
673 : alpha = orb_basis_set%zet(ipgf, iset)
674 : exp_val = EXP(-alpha*r2)
675 :
676 : DO isgf = first_sgf, last_sgf
677 : DO ico = istart_co, iend_co
678 : row_idx = (ipgf - 1)*n_cart_total + ico
679 : weight = orb_basis_set%sphi(row_idx, isgf)
680 : lx = indco(1, ico)
681 : ly = indco(2, ico)
682 : lz = indco(3, ico)
683 : poly = (dist_vec(1)**lx)*(dist_vec(2)**ly)*(dist_vec(3)**lz)
684 :
685 : phi_val(i_pt, isgf) = phi_val(i_pt, isgf) + (weight*poly*exp_val)
686 :
687 : END DO
688 : END DO
689 : END DO
690 : END DO
691 : END DO
692 : END DO
693 : !$OMP END PARALLEL DO
694 :
695 : END SUBROUTINE fill_phi_for_atom
696 :
697 : ! **************************************************************************************************
698 : !> \brief Helper for OMP threads to fill phi_val column values
699 : !> \param particle_set ...
700 : !> \param qs_kind_set ...
701 : !> \param first_sgf ...
702 : !> \param total_sgf ...
703 : ! **************************************************************************************************
704 :
705 10 : SUBROUTINE get_basis_offsets(particle_set, qs_kind_set, first_sgf, total_sgf)
706 :
707 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
708 : TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
709 : INTEGER, INTENT(OUT) :: first_sgf(:), total_sgf
710 :
711 : CHARACTER(LEN=*), PARAMETER :: routineN = 'get_basis_offsets'
712 :
713 : INTEGER :: handle, iatom, ikind, nsgf
714 :
715 10 : CALL timeset(routineN, handle)
716 :
717 10 : total_sgf = 0
718 38 : DO iatom = 1, SIZE(particle_set)
719 28 : first_sgf(iatom) = total_sgf + 1
720 28 : ikind = particle_set(iatom)%atomic_kind%kind_number
721 28 : CALL get_qs_kind(qs_kind_set(ikind), nsgf=nsgf, basis_type="ORB")
722 38 : total_sgf = total_sgf + nsgf
723 : END DO
724 10 : first_sgf(SIZE(particle_set) + 1) = total_sgf + 1
725 :
726 10 : CALL timestop(handle)
727 :
728 10 : END SUBROUTINE get_basis_offsets
729 :
730 : ! **************************************************************************************************
731 : !> \brief Compute RI-RS Coefficients (Z_lP)
732 : !> \param qs_env ...
733 : !> \param bs_env ...
734 : !> \param ri_rs_grid_points ...
735 : !> \param mat_phi_mu_l ...
736 : !> \param mat_Z_lP ...
737 : ! **************************************************************************************************
738 :
739 20 : SUBROUTINE compute_coeff_Z_lP(qs_env, bs_env, ri_rs_grid_points, mat_phi_mu_l, mat_Z_lP)
740 :
741 : ! Arguments
742 : TYPE(qs_environment_type), POINTER :: qs_env
743 : TYPE(post_scf_bandstructure_type), POINTER :: bs_env
744 : REAL(KIND=dp), ALLOCATABLE, INTENT(INOUT) :: ri_rs_grid_points(:, :)
745 : TYPE(dbcsr_type), INTENT(INOUT) :: mat_phi_mu_l
746 : TYPE(dbcsr_type), INTENT(OUT) :: mat_Z_lP
747 :
748 : CHARACTER(LEN=*), PARAMETER :: key = 'PROPERTIES%BANDSTRUCTURE%GW%PRINT%RESTART', &
749 : routineN = 'compute_coeff_Z_lP'
750 :
751 : INTEGER :: atom_j_mepos, atom_j_stride, atom_P, atom_P_start, atom_P_stride, col_end, &
752 : col_start, current_chunk_size, g, group_handle, handle, i, i_blk, ikind, info, j, j_ri, &
753 : l, loc_idx, loc_ptr, max_ao_size, max_loc_ri, my_group, n_ao_total, n_grid_total, &
754 : n_groups, n_loc_ri, n_local_grid, n_procs_per_atom, natom, nkind, num_grid_chunks, &
755 : P_loop_atom, r_end, r_start, source_atom
756 10 : INTEGER, ALLOCATABLE, DIMENSION(:) :: local_grid_idx, row_offset
757 20 : INTEGER, DIMENSION(:), POINTER :: col_dist_phi, col_dist_ri, r_blk_sizes, &
758 10 : ri_blk_sizes, row_dist_grid
759 : REAL(KIND=dp) :: cutoff_ri, d_sP, dist, r2_threshold, &
760 : r_c, t1
761 10 : REAL(KIND=dp), ALLOCATABLE :: cutoff_ri_per_kind(:)
762 10 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: cutoff_ri_per_atom, d_vec_local
763 10 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: D_local, d_lp_local, phi_local, &
764 10 : sphere_grid, Z_blk
765 : REAL(KIND=dp), DIMENSION(3) :: pos_P
766 10 : TYPE(atomic_kind_type), DIMENSION(:), POINTER :: atomic_kind_set
767 : TYPE(cell_type), POINTER :: cell
768 : TYPE(cp_blacs_env_type), POINTER :: blacs_env_sub
769 : TYPE(cp_fm_struct_type), POINTER :: fm_struct_b, fm_struct_D
770 : TYPE(cp_fm_type) :: fm_b, fm_D
771 : TYPE(cp_logger_type), POINTER :: logger
772 : TYPE(dbcsr_distribution_type) :: dist_phi, dist_Z
773 130 : TYPE(gw_3c_ctx_type) :: ctx_3c
774 : TYPE(mp_para_env_type), POINTER :: para_env, para_env_sub
775 10 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
776 10 : TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
777 : TYPE(section_vals_type), POINTER :: input
778 :
779 10 : CALL timeset(routineN, handle)
780 :
781 10 : t1 = m_walltime()
782 :
783 : CALL get_qs_env(qs_env, para_env=para_env, particle_set=particle_set, input=input, &
784 10 : qs_kind_set=qs_kind_set, cell=cell, atomic_kind_set=atomic_kind_set)
785 :
786 : ! ---------------------------------------------------------------------
787 : ! Subgroup setup. Default G=1 keeps the single-rank BLAS path; G>1 splits
788 : ! ranks into atom-groups so the Cholesky on D_local distributes across G
789 : ! ranks (memory ~1/G) and the compute_d_lp build also splits across the
790 : ! subgroup. G=1 leaves para_env_sub / blacs_env_sub NULL — no subgroup
791 : ! comms created, atom_P loop uses per-rank round-robin, compute_d_lp runs
792 : ! its full atom_j range on each rank, no allreduce.
793 : ! ---------------------------------------------------------------------
794 10 : n_procs_per_atom = MIN(bs_env%ri_rs%n_procs_per_atom_z_lp, para_env%num_pe)
795 10 : IF (n_procs_per_atom < 1) n_procs_per_atom = 1
796 :
797 10 : NULLIFY (para_env_sub, blacs_env_sub)
798 10 : IF (n_procs_per_atom > 1) THEN
799 0 : n_groups = para_env%num_pe/n_procs_per_atom
800 0 : my_group = MIN(para_env%mepos/n_procs_per_atom, n_groups - 1)
801 0 : ALLOCATE (para_env_sub)
802 0 : CALL para_env_sub%from_split(para_env, my_group)
803 0 : CALL cp_blacs_env_create(blacs_env=blacs_env_sub, para_env=para_env_sub)
804 0 : atom_P_start = my_group + 1
805 0 : atom_P_stride = n_groups
806 0 : atom_j_mepos = para_env_sub%mepos
807 0 : atom_j_stride = para_env_sub%num_pe
808 : ELSE
809 10 : atom_P_start = para_env%mepos + 1
810 10 : atom_P_stride = para_env%num_pe
811 10 : atom_j_mepos = 0
812 10 : atom_j_stride = 1
813 : END IF
814 :
815 10 : natom = SIZE(bs_env%i_RI_start_from_atom)
816 10 : n_ao_total = bs_env%i_ao_end_from_atom(natom)
817 10 : n_grid_total = SIZE(ri_rs_grid_points, 2)
818 :
819 : ! =========================================================================
820 : ! 1. SETUP DBCSR TOPOLOGY & EXACT OFFSETS
821 : ! =========================================================================
822 10 : CALL dbcsr_get_info(mat_phi_mu_l, row_blk_size=r_blk_sizes, distribution=dist_phi)
823 10 : CALL dbcsr_distribution_get(dist_phi, row_dist=row_dist_grid, col_dist=col_dist_phi, group=group_handle)
824 :
825 10 : num_grid_chunks = SIZE(r_blk_sizes)
826 :
827 30 : ALLOCATE (row_offset(num_grid_chunks))
828 10 : row_offset(1) = 0
829 194 : DO i_blk = 2, num_grid_chunks
830 194 : row_offset(i_blk) = row_offset(i_blk - 1) + r_blk_sizes(i_blk - 1)
831 : END DO
832 :
833 40 : ALLOCATE (ri_blk_sizes(natom), col_dist_ri(natom))
834 38 : DO atom_P = 1, natom
835 28 : ri_blk_sizes(atom_P) = bs_env%i_RI_end_from_atom(atom_P) - bs_env%i_RI_start_from_atom(atom_P) + 1
836 118 : col_dist_ri(atom_P) = MOD(atom_P - 1, MAXVAL(col_dist_phi) + 1)
837 : END DO
838 :
839 10 : CALL dbcsr_distribution_new(dist_Z, template=dist_phi, row_dist=row_dist_grid, col_dist=col_dist_ri)
840 :
841 10 : IF (bs_env%ri_rs%Z_lP_exists) THEN
842 : CALL dbcsr_binary_read(filepath=TRIM(bs_env%prefix)//"Z_lP.matrix", &
843 : distribution=dist_Z, &
844 2 : matrix_new=mat_Z_lP)
845 2 : IF (bs_env%unit_nr > 0) THEN
846 : WRITE (bs_env%unit_nr, '(T2,A,T57,A,F7.1,A)') &
847 1 : 'Read Z_lP from file ', ' Execution time', m_walltime() - t1, ' s'
848 1 : WRITE (bs_env%unit_nr, '(A)') ' '
849 : END IF
850 : ELSE
851 :
852 : CALL dbcsr_create(mat_Z_lP, name="mat_Z_lP", dist=dist_Z, &
853 : matrix_type=dbcsr_type_no_symmetry, &
854 8 : row_blk_size=r_blk_sizes, col_blk_size=ri_blk_sizes)
855 :
856 8 : max_ao_size = 0
857 30 : DO j = 1, SIZE(bs_env%i_ao_start_from_atom)
858 30 : max_ao_size = MAX(max_ao_size, bs_env%i_ao_end_from_atom(j) - bs_env%i_ao_start_from_atom(j) + 1)
859 : END DO
860 30 : max_loc_ri = MAXVAL(ri_blk_sizes)
861 :
862 : ! Per-atom RI-RS integration sphere:
863 : ! cutoff_ri(P) = r_c + r_AO(P)
864 : ! where r_c is the truncated-Coulomb cutoff of the RI metric. The
865 : ! CUTOFF_RADIUS_RI_RS keyword (when > 0) overrides the entire cutoff calculation.
866 8 : nkind = SIZE(atomic_kind_set)
867 24 : ALLOCATE (cutoff_ri_per_atom(natom))
868 :
869 8 : IF (bs_env%ri_rs%cutoff_radius_ri_rs > 0.0_dp) THEN
870 0 : cutoff_ri_per_atom(:) = bs_env%ri_rs%cutoff_radius_ri_rs
871 : ELSE
872 8 : r_c = bs_env%ri_metric%cutoff_radius
873 30 : DO P_loop_atom = 1, natom
874 : cutoff_ri_per_atom(P_loop_atom) = &
875 30 : r_c + bs_env%ri_rs%radius_ao_per_atom(P_loop_atom)
876 : END DO
877 : END IF
878 :
879 24 : ALLOCATE (cutoff_ri_per_kind(nkind))
880 8 : cutoff_ri_per_kind(:) = 0.0_dp
881 8 : IF (bs_env%unit_nr > 0) THEN
882 4 : WRITE (bs_env%unit_nr, '(T2,A)') 'Per-kind maximum RI-RS sphere cutoff (Bohr):'
883 4 : WRITE (bs_env%unit_nr, '(T4,A4,A14)') 'Kind', 'max cutoff_ri'
884 : ! Walk atoms once; accumulate per-kind max using atomic_kind index
885 15 : DO P_loop_atom = 1, natom
886 11 : ikind = particle_set(P_loop_atom)%atomic_kind%kind_number
887 : cutoff_ri_per_kind(ikind) = MAX(cutoff_ri_per_kind(ikind), &
888 15 : cutoff_ri_per_atom(P_loop_atom))
889 : END DO
890 11 : DO ikind = 1, nkind
891 : WRITE (bs_env%unit_nr, '(T4,A4,F14.4)') &
892 7 : atomic_kind_set(ikind)%element_symbol, &
893 18 : cutoff_ri_per_kind(ikind)
894 : END DO
895 4 : WRITE (bs_env%unit_nr, '(A)') ' '
896 : END IF
897 8 : DEALLOCATE (cutoff_ri_per_kind)
898 :
899 : ! Build the shared 3c-integral context once, before the atom_P loop and outside
900 : ! any OMP region. This hoists out the libint / t_c_g0 / md_ftable inits and the
901 : ! contracted sphi tables, which would otherwise re-run on every per-block call.
902 : ! gw_3c_ctx_create is MPI-collective (truncated-Coulomb init reads a file + bcasts).
903 : CALL gw_3c_ctx_create(ctx_3c, qs_env, bs_env%ri_metric, &
904 : basis_j=bs_env%basis_set_AO, basis_k=bs_env%basis_set_AO, &
905 8 : basis_i=bs_env%basis_set_RI)
906 :
907 : ! =========================================================================
908 : ! 2. MPI LOOP OVER ATOMS (Fully independent, no MPI barriers inside)
909 : ! This distributes the N_atoms among the available MPI processors.
910 : ! phi_local for each atom_P's cutoff sphere is built on the fly via
911 : ! fill_phi_for_atom — no dense replicated phi_global, no allreduce.
912 : ! =========================================================================
913 8 : DO atom_P = atom_P_start, natom, atom_P_stride
914 :
915 11 : n_loc_ri = ri_blk_sizes(atom_P)
916 44 : pos_P(:) = particle_set(atom_P)%r(:)
917 :
918 11 : cutoff_ri = cutoff_ri_per_atom(atom_P)
919 :
920 : ! ---------------------------------------------------------------------
921 : ! A. Determine Local Grid Domain based on cutoff_ri
922 : ! ---------------------------------------------------------------------
923 11 : n_local_grid = 0
924 6827 : DO l = 1, n_grid_total
925 27264 : dist = SQRT(SUM((ri_rs_grid_points(1:3, l) - pos_P(1:3))**2))
926 6827 : IF (dist <= cutoff_ri) n_local_grid = n_local_grid + 1
927 : END DO
928 :
929 33 : ALLOCATE (local_grid_idx(n_local_grid))
930 :
931 11 : n_local_grid = 0
932 6827 : DO l = 1, n_grid_total
933 27264 : dist = SQRT(SUM((ri_rs_grid_points(1:3, l) - pos_P(1:3))**2))
934 6827 : IF (dist <= cutoff_ri) THEN
935 6816 : n_local_grid = n_local_grid + 1
936 6816 : local_grid_idx(n_local_grid) = l
937 : END IF
938 : END DO
939 :
940 : ! ---------------------------------------------------------------------
941 : ! B. Build phi_local on the fly via fill_phi_for_atom (no replication)
942 : ! ---------------------------------------------------------------------
943 : ! Copy the local sphere coordinates into a contiguous buffer for
944 : ! fill_phi_for_atom. Only source atoms whose AO basis can reach the
945 : ! sphere of atom_P contribute; the others are pruned by the inequality
946 : ! using the per-atom radii precomputed in precompute_ri_rs_radii.
947 33 : ALLOCATE (sphere_grid(3, n_local_grid))
948 6827 : DO loc_idx = 1, n_local_grid
949 27275 : sphere_grid(:, loc_idx) = ri_rs_grid_points(:, local_grid_idx(loc_idx))
950 : END DO
951 :
952 44 : ALLOCATE (phi_local(n_local_grid, n_ao_total))
953 11 : phi_local = 0.0_dp
954 :
955 42 : DO source_atom = 1, natom
956 124 : d_sP = NORM2(particle_set(source_atom)%r(:) - pos_P(:))
957 31 : IF (d_sP > bs_env%ri_rs%radius_ao_per_atom(source_atom) + cutoff_ri) CYCLE
958 :
959 31 : col_start = bs_env%i_ao_start_from_atom(source_atom)
960 31 : col_end = bs_env%i_ao_end_from_atom(source_atom)
961 31 : r2_threshold = bs_env%ri_rs%radius_ao_per_atom(source_atom)**2
962 :
963 : CALL fill_phi_for_atom(phi_local(:, col_start:col_end), sphere_grid, &
964 : n_local_grid, source_atom, particle_set, qs_kind_set, &
965 42 : cell, r2_threshold)
966 : END DO
967 :
968 11 : DEALLOCATE (sphere_grid)
969 :
970 : ! ---------------------------------------------------------------------
971 : ! C. Build Local RHS Matrix (d_lp_local)
972 : ! Done first so the subgroup-distributed compute_d_lp + allreduce
973 : ! is not entangled with the LHS build. compute_d_lp does not
974 : ! depend on D_local or d_vec_local.
975 : ! ---------------------------------------------------------------------
976 44 : ALLOCATE (d_lp_local(n_local_grid, n_loc_ri))
977 11 : d_lp_local = 0.0_dp
978 :
979 : CALL compute_d_lp(bs_env, ctx_3c, phi_local, d_lp_local, n_local_grid, &
980 11 : n_loc_ri, atom_P, max_ao_size, atom_j_mepos, atom_j_stride)
981 :
982 : ! Reduce per-subgroup-rank partials into the replicated d_lp_local.
983 : ! Skipped for G=1 (BLAS path): each rank has the full sum locally.
984 11 : IF (n_procs_per_atom > 1) THEN
985 0 : CALL para_env_sub%sum(d_lp_local)
986 : END IF
987 :
988 : ! ---------------------------------------------------------------------
989 : ! D. Build d_vec_local (Jacobi diagonal) + LHS — BLAS or ScaLAPACK
990 : ! ---------------------------------------------------------------------
991 33 : ALLOCATE (d_vec_local(n_local_grid))
992 :
993 11 : IF (n_procs_per_atom == 1) THEN
994 : ! BLAS path: build D_local densely, compute d_vec as side-effect
995 : ! of the Jacobi step (preserves bit-identical arithmetic with the
996 : ! previous branch).
997 44 : ALLOCATE (D_local(n_local_grid, n_local_grid))
998 11 : D_local = 0.0_dp
999 :
1000 : CALL dsyrk("L", "N", n_local_grid, n_ao_total, 1.0_dp, phi_local, &
1001 11 : n_local_grid, 0.0_dp, D_local, n_local_grid)
1002 :
1003 : !$OMP PARALLEL DO DEFAULT(NONE) &
1004 : !$OMP SHARED(n_local_grid, D_local, d_vec_local, bs_env) &
1005 : !$OMP PRIVATE(i) &
1006 11 : !$OMP SCHEDULE(STATIC)
1007 : DO i = 1, n_local_grid
1008 : D_local(i, i) = D_local(i, i)**2
1009 : d_vec_local(i) = 1.0_dp/SQRT(MAX(D_local(i, i), 1.0E-16_dp))
1010 : D_local(i, i) = (D_local(i, i)*d_vec_local(i)**2) + bs_env%ri_rs%tikhonov
1011 : END DO
1012 : !$OMP END PARALLEL DO
1013 :
1014 : !$OMP PARALLEL DO DEFAULT(NONE) &
1015 : !$OMP SHARED(n_local_grid, D_local, d_vec_local) &
1016 : !$OMP PRIVATE(j, i) &
1017 11 : !$OMP SCHEDULE(DYNAMIC)
1018 : DO j = 1, n_local_grid
1019 : DO i = j + 1, n_local_grid
1020 : D_local(i, j) = D_local(i, j)**2
1021 : D_local(i, j) = D_local(i, j)*d_vec_local(i)*d_vec_local(j)
1022 : D_local(j, i) = D_local(i, j)
1023 : END DO
1024 : END DO
1025 : !$OMP END PARALLEL DO
1026 : ELSE
1027 : ! ScaLAPACK path: d_vec computed directly from phi (= 1/||phi_i||^2);
1028 : ! solve_D_lp_distributed builds D block-cyclic internally with
1029 : ! the squared+scaled values, so no dense D_local on this rank.
1030 : !$OMP PARALLEL DO DEFAULT(NONE) &
1031 : !$OMP SHARED(n_local_grid, n_ao_total, phi_local, d_vec_local) &
1032 : !$OMP PRIVATE(i, j) &
1033 0 : !$OMP SCHEDULE(STATIC)
1034 : DO i = 1, n_local_grid
1035 : d_vec_local(i) = 0.0_dp
1036 : DO j = 1, n_ao_total
1037 : d_vec_local(i) = d_vec_local(i) + phi_local(i, j)*phi_local(i, j)
1038 : END DO
1039 : d_vec_local(i) = 1.0_dp/MAX(d_vec_local(i), 1.0E-16_dp)
1040 : END DO
1041 : !$OMP END PARALLEL DO
1042 : END IF
1043 :
1044 : ! ---------------------------------------------------------------------
1045 : ! E. Pre-scale d_lp by d_vec
1046 : ! ---------------------------------------------------------------------
1047 : !$OMP PARALLEL DO DEFAULT(NONE) &
1048 : !$OMP SHARED(n_loc_ri, n_local_grid, d_lp_local, d_vec_local) &
1049 : !$OMP PRIVATE(j_ri, i) &
1050 11 : !$OMP SCHEDULE(STATIC)
1051 : DO j_ri = 1, n_loc_ri
1052 : DO i = 1, n_local_grid
1053 : d_lp_local(i, j_ri) = d_lp_local(i, j_ri)*d_vec_local(i)
1054 : END DO
1055 : END DO
1056 : !$OMP END PARALLEL DO
1057 :
1058 : ! ---------------------------------------------------------------------
1059 : ! F. Solve — BLAS dpotrf/dpotrs or ScaLAPACK pdpotrf/pdpotrs
1060 : ! ---------------------------------------------------------------------
1061 11 : IF (n_procs_per_atom == 1) THEN
1062 11 : CALL dpotrf('L', n_local_grid, D_local, n_local_grid, info)
1063 : CALL dpotrs('L', n_local_grid, n_loc_ri, D_local, n_local_grid, &
1064 11 : d_lp_local, n_local_grid, info)
1065 11 : DEALLOCATE (D_local)
1066 : ELSE
1067 : CALL solve_D_lp_distributed(phi_local, d_vec_local, d_lp_local, &
1068 : n_local_grid, n_ao_total, n_loc_ri, &
1069 : bs_env%ri_rs%tikhonov, &
1070 : para_env_sub, blacs_env_sub, &
1071 0 : fm_struct_D, fm_struct_b, fm_D, fm_b, info)
1072 : END IF
1073 :
1074 : ! ---------------------------------------------------------------------
1075 : ! G. Post-scale solution by d_vec (common to both paths)
1076 : ! ---------------------------------------------------------------------
1077 : !$OMP PARALLEL DO DEFAULT(NONE) &
1078 : !$OMP SHARED(n_loc_ri, n_local_grid, d_lp_local, d_vec_local) &
1079 : !$OMP PRIVATE(j_ri, i) &
1080 11 : !$OMP SCHEDULE(STATIC)
1081 : DO j_ri = 1, n_loc_ri
1082 : DO i = 1, n_local_grid
1083 : d_lp_local(i, j_ri) = d_lp_local(i, j_ri)*d_vec_local(i)
1084 : END DO
1085 : END DO
1086 : !$OMP END PARALLEL DO
1087 :
1088 : ! ---------------------------------------------------------------------
1089 : ! H. Scatter Local Solution Back to Global DBCSR Matrix
1090 : ! local_grid_idx is ascending (built by the ordered scan above), so a
1091 : ! single walking pointer over it sweeps the chunks in one pass.
1092 : ! Under ScaLAPACK (G>1) the d_lp_local solution is identical on all
1093 : ! G subgroup ranks (gathered via cp_fm_get_submatrix); only the
1094 : ! subgroup root writes to mat_Z_lP so each atom column is emitted
1095 : ! exactly once. DBCSR routes blocks to their global owner on finalize.
1096 : ! ---------------------------------------------------------------------
1097 11 : IF (n_procs_per_atom == 1 .OR. para_env_sub%mepos == 0) THEN
1098 257 : ALLOCATE (Z_blk(MAXVAL(r_blk_sizes), n_loc_ri))
1099 11 : loc_ptr = 1
1100 :
1101 224 : DO i_blk = 1, num_grid_chunks
1102 213 : r_start = row_offset(i_blk) + 1
1103 213 : r_end = row_offset(i_blk) + r_blk_sizes(i_blk)
1104 213 : current_chunk_size = r_blk_sizes(i_blk)
1105 :
1106 213 : Z_blk = 0.0_dp
1107 :
1108 7029 : DO WHILE (loc_ptr <= n_local_grid)
1109 7018 : g = local_grid_idx(loc_ptr)
1110 7018 : IF (g > r_end) EXIT
1111 59904 : Z_blk(g - r_start + 1, 1:n_loc_ri) = d_lp_local(loc_ptr, 1:n_loc_ri)
1112 7018 : loc_ptr = loc_ptr + 1
1113 : END DO
1114 :
1115 54971 : IF (MAXVAL(ABS(Z_blk(1:current_chunk_size, 1:n_loc_ri))) > bs_env%eps_filter) THEN
1116 : CALL dbcsr_put_block(mat_Z_lP, row=i_blk, col=atom_P, &
1117 213 : block=Z_blk(1:current_chunk_size, 1:n_loc_ri))
1118 : END IF
1119 : END DO
1120 :
1121 11 : DEALLOCATE (Z_blk)
1122 : END IF
1123 :
1124 11 : DEALLOCATE (d_vec_local, d_lp_local)
1125 11 : DEALLOCATE (local_grid_idx, phi_local)
1126 :
1127 : END DO
1128 :
1129 8 : DEALLOCATE (cutoff_ri_per_atom)
1130 :
1131 8 : CALL gw_3c_ctx_release(ctx_3c)
1132 :
1133 8 : CALL dbcsr_finalize(mat_Z_lP)
1134 :
1135 8 : IF (bs_env%unit_nr > 0) THEN
1136 : WRITE (bs_env%unit_nr, '(T2,A,T57,A,F7.1,A)') &
1137 4 : 'Computed Z_lP ', ' Execution time', m_walltime() - t1, ' s'
1138 4 : WRITE (bs_env%unit_nr, '(A)') ' '
1139 : END IF
1140 :
1141 8 : logger => cp_get_default_logger()
1142 :
1143 8 : IF (BTEST(cp_print_key_should_output(logger%iter_info, input, key), cp_p_file)) THEN
1144 0 : CALL dbcsr_binary_write(matrix=mat_Z_lP, filepath=TRIM(bs_env%prefix)//"Z_lP.matrix")
1145 : END IF
1146 :
1147 : END IF
1148 :
1149 10 : DEALLOCATE (row_offset, ri_blk_sizes, col_dist_ri)
1150 10 : CALL dbcsr_distribution_release(dist_Z)
1151 :
1152 10 : IF (n_procs_per_atom > 1) THEN
1153 0 : CALL cp_blacs_env_release(blacs_env_sub)
1154 0 : CALL para_env_sub%free()
1155 0 : DEALLOCATE (para_env_sub)
1156 : END IF
1157 :
1158 10 : DEALLOCATE (ri_rs_grid_points)
1159 :
1160 10 : CALL timestop(handle)
1161 :
1162 50 : END SUBROUTINE compute_coeff_Z_lP
1163 :
1164 : ! **************************************************************************************************
1165 : !> \brief Computes the dense localized RHS d_lp(l,P) = Σ_{μν} Φ_μ(r_l)·Φ_ν(r_l)·(μν|P) for one
1166 : !> RI atom P, OMP-threaded over (atom_j, atom_k) AO-pair blocks: per thread, build the 3c
1167 : !> block, then contract grid-chunked pair densities into a private d_lp partial; partials
1168 : !> are reduced into d_lp at the end.
1169 : !> Pair screening is handled inside build_3c_integral_block_ctx via the `screened` output,
1170 : !> \param bs_env ...
1171 : !> \param ctx shared 3c-integral context (gw_3c_ctx_create)
1172 : !> \param phi_val Φ_μ(r_l) on the local-sphere grid (n_grid_total × n_ao)
1173 : !> \param d_lp output (n_grid_total × n_loc_ri), zeroed by the caller, accumulated here
1174 : !> \param n_grid_total number of local-sphere grid rows
1175 : !> \param n_loc_ri number of RI functions of atom_P
1176 : !> \param atom_P ...
1177 : !> \param max_ao_size ...
1178 : !> \param atom_j_mepos ...
1179 : !> \param atom_j_stride ...
1180 : ! **************************************************************************************************
1181 :
1182 11 : SUBROUTINE compute_d_lp(bs_env, ctx, phi_val, d_lp, n_grid_total, n_loc_ri, atom_P, &
1183 : max_ao_size, atom_j_mepos, atom_j_stride)
1184 :
1185 : TYPE(post_scf_bandstructure_type), POINTER :: bs_env
1186 : TYPE(gw_3c_ctx_type), INTENT(IN) :: ctx
1187 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: phi_val
1188 : INTEGER, INTENT(IN) :: n_grid_total, n_loc_ri
1189 : REAL(KIND=dp), INTENT(INOUT) :: d_lp(n_grid_total, n_loc_ri)
1190 : INTEGER, INTENT(IN) :: atom_P, max_ao_size, atom_j_mepos, &
1191 : atom_j_stride
1192 :
1193 : CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_d_lp'
1194 : INTEGER, PARAMETER :: grid_chunk = 1024
1195 :
1196 : INTEGER :: atom_j, atom_k, c, handle, j, jk_idx, &
1197 : jsize, jstart, k, ksize, kstart, l, &
1198 : l0, ri
1199 : LOGICAL :: screened
1200 11 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: int_2d_prv, rho_chunk
1201 11 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: int_3c_prv
1202 11 : TYPE(gw_3c_ws_type) :: ws
1203 :
1204 11 : CALL timeset(routineN, handle)
1205 :
1206 : !$OMP PARALLEL DEFAULT(NONE) &
1207 : !$OMP SHARED(bs_env, ctx, phi_val, d_lp, n_grid_total, n_loc_ri, atom_P, max_ao_size, &
1208 : !$OMP atom_j_mepos, atom_j_stride) &
1209 : !$OMP PRIVATE(atom_j, atom_k, c, j, jk_idx, jsize, jstart, k, ksize, kstart, l, l0, ri, &
1210 11 : !$OMP screened, int_2d_prv, rho_chunk, int_3c_prv, ws)
1211 :
1212 : CALL gw_3c_ws_create(ws, ctx)
1213 : ALLOCATE (int_3c_prv(max_ao_size, max_ao_size, n_loc_ri))
1214 : ALLOCATE (int_2d_prv(max_ao_size*max_ao_size, n_loc_ri))
1215 : ALLOCATE (rho_chunk(grid_chunk, max_ao_size*max_ao_size))
1216 :
1217 : ! MPI-stride atom_j over the subgroup (atom_j_stride = 1 for the BLAS
1218 : ! path, > 1 for the ScaLAPACK path). The OMP DO parallelizes the inner
1219 : ! atom_k while the outer atom_j carries the MPI stride. COLLAPSE(2) is
1220 : ! dropped because the outer stride is non-unit under ScaLAPACK; for
1221 : ! typical natom the inner loop has plenty of work for DYNAMIC.
1222 : !$OMP DO SCHEDULE(DYNAMIC) REDUCTION(+:d_lp)
1223 : DO atom_j = atom_j_mepos + 1, SIZE(bs_env%i_ao_start_from_atom), atom_j_stride
1224 : DO atom_k = 1, SIZE(bs_env%i_ao_start_from_atom)
1225 : jstart = bs_env%i_ao_start_from_atom(atom_j)
1226 : jsize = bs_env%i_ao_end_from_atom(atom_j) - jstart + 1
1227 : kstart = bs_env%i_ao_start_from_atom(atom_k)
1228 : ksize = bs_env%i_ao_end_from_atom(atom_k) - kstart + 1
1229 :
1230 : int_3c_prv(1:jsize, 1:ksize, 1:n_loc_ri) = 0.0_dp
1231 :
1232 : ! Compute B_{μν,P} = (μν|P); ctx-internal triangle-inequality screening on
1233 : ! kind_radius sets `screened=.TRUE.` for negligible triples.
1234 : CALL build_3c_integral_block_ctx(int_3c_prv(1:jsize, 1:ksize, 1:n_loc_ri), &
1235 : ctx, ws, atom_j=atom_j, atom_k=atom_k, atom_i=atom_P, &
1236 : screened=screened)
1237 :
1238 : IF (screened) CYCLE
1239 :
1240 : ! Flatten 3D B_{μν, P} tensor to 2D B_{(μν), P} matrix for BLAS
1241 : DO ri = 1, n_loc_ri
1242 : DO k = 1, ksize
1243 : DO j = 1, jsize
1244 : jk_idx = (k - 1)*jsize + j
1245 : int_2d_prv(jk_idx, ri) = int_3c_prv(j, k, ri)
1246 : END DO
1247 : END DO
1248 : END DO
1249 :
1250 : ! Pair density ρ(l, μν) = Φ_μ(r_l) Φ_ν(r_l) in grid chunks, contracted on the fly:
1251 : ! d_{l,P} += ρ(l, μν) B_{(μν),P} (dgemm runs serially inside the parallel region)
1252 : DO l0 = 1, n_grid_total, grid_chunk
1253 : c = MIN(grid_chunk, n_grid_total - l0 + 1)
1254 : DO k = 1, ksize
1255 : DO j = 1, jsize
1256 : jk_idx = (k - 1)*jsize + j
1257 : DO l = 1, c
1258 : rho_chunk(l, jk_idx) = phi_val(l0 + l - 1, jstart + j - 1)* &
1259 : phi_val(l0 + l - 1, kstart + k - 1)
1260 : END DO
1261 : END DO
1262 : END DO
1263 : CALL dgemm("N", "N", c, n_loc_ri, jsize*ksize, &
1264 : 1.0_dp, rho_chunk, grid_chunk, &
1265 : int_2d_prv, max_ao_size*max_ao_size, &
1266 : 1.0_dp, d_lp(l0, 1), n_grid_total)
1267 : END DO
1268 : END DO
1269 : END DO
1270 : !$OMP END DO
1271 :
1272 : DEALLOCATE (int_3c_prv, int_2d_prv, rho_chunk)
1273 : CALL gw_3c_ws_release(ws)
1274 :
1275 : !$OMP END PARALLEL
1276 :
1277 11 : CALL timestop(handle)
1278 :
1279 11 : END SUBROUTINE compute_d_lp
1280 :
1281 : ! **************************************************************************************************
1282 : !> \brief Distributed pdpotrf/pdpotrs solve of D x = b for one atom of the
1283 : !> RI-RS Z_lP build. Called when N_PROCS_PER_ATOM_Z_LP > 1, with a
1284 : !> subgroup of cooperating ranks and an associated BLACS context.
1285 : !> Each rank in the subgroup holds the (replicated) phi_local and the
1286 : !> (replicated) RHS d_lp; it builds its own block-cyclic slice of the
1287 : !> squared+Jacobi-scaled Gram matrix D via tiled DGEMM, factorizes via
1288 : !> cp_fm_cholesky_decompose (UPLO='U'), and solves with
1289 : !> cp_fm_cholesky_solve. The replicated d_lp is updated in place via
1290 : !> cp_fm_get_submatrix.
1291 : !> \param phi_local replicated (n_loc x n_ao) AO values on the sphere grid
1292 : !> \param d_vec replicated Jacobi diagonal = 1 / ||phi_i||^2
1293 : !> \param d_lp replicated RHS in/out (n_loc x n_rhs); on input pre-scaled
1294 : !> by d_vec, on output also pre-scaled (caller post-scales)
1295 : !> \param n_loc number of sphere-local grid points
1296 : !> \param n_ao number of AO basis functions
1297 : !> \param n_rhs number of RI functions of this atom_P (RHS columns)
1298 : !> \param tikhonov regularisation added to the diagonal
1299 : !> \param para_env_sub sub-communicator of the atom-group
1300 : !> \param blacs_env_sub BLACS context on the sub-communicator
1301 : !> \param fm_struct_D ...
1302 : !> \param fm_struct_b ...
1303 : !> \param fm_D ...
1304 : !> \param fm_b ...
1305 : !> \param info 0 on success, non-zero if pdpotrf or pdpotrs failed
1306 : ! **************************************************************************************************
1307 0 : SUBROUTINE solve_D_lp_distributed(phi_local, d_vec, d_lp, n_loc, n_ao, n_rhs, &
1308 : tikhonov, para_env_sub, blacs_env_sub, &
1309 : fm_struct_D, fm_struct_b, fm_D, fm_b, info)
1310 :
1311 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: phi_local
1312 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: d_vec
1313 : REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: d_lp
1314 : INTEGER, INTENT(IN) :: n_loc, n_ao, n_rhs
1315 : REAL(KIND=dp), INTENT(IN) :: tikhonov
1316 : TYPE(mp_para_env_type), POINTER :: para_env_sub
1317 : TYPE(cp_blacs_env_type), POINTER :: blacs_env_sub
1318 : TYPE(cp_fm_struct_type), POINTER :: fm_struct_D, fm_struct_b
1319 : TYPE(cp_fm_type), INTENT(INOUT) :: fm_D, fm_b
1320 : INTEGER, INTENT(OUT) :: info
1321 :
1322 : CHARACTER(LEN=*), PARAMETER :: routineN = 'solve_D_lp_distributed'
1323 :
1324 : INTEGER :: handle, i_loc, ig, j_loc, jg, &
1325 : ncol_local, nrow_local
1326 0 : INTEGER, DIMENSION(:), POINTER :: col_indices, row_indices
1327 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
1328 0 : POINTER :: local_data
1329 :
1330 0 : CALL timeset(routineN, handle)
1331 0 : info = 0
1332 :
1333 0 : NULLIFY (fm_struct_D, fm_struct_b)
1334 : CALL cp_fm_struct_create(fm_struct_D, para_env=para_env_sub, &
1335 : context=blacs_env_sub, &
1336 0 : nrow_global=n_loc, ncol_global=n_loc)
1337 : CALL cp_fm_struct_create(fm_struct_b, para_env=para_env_sub, &
1338 : context=blacs_env_sub, &
1339 0 : nrow_global=n_loc, ncol_global=n_rhs)
1340 0 : CALL cp_fm_create(fm_D, fm_struct_D)
1341 0 : CALL cp_fm_create(fm_b, fm_struct_b)
1342 :
1343 : ! ---- Build the local block-cyclic slice of fm_D --------------------
1344 : ! Tiled DGEMM build: for each (row_tile x col_tile) sub-block we
1345 : ! gather small phi strips, DGEMM into a small gram tile, then write
1346 : ! the squared & d_vec-scaled result directly into fm_D%local_data.
1347 : ! This avoids materialising the full (nrow_local x n_ao),
1348 : ! (n_ao x ncol_local), and (nrow_local x ncol_local) buffers.
1349 : CALL cp_fm_get_info(fm_D, nrow_local=nrow_local, ncol_local=ncol_local, &
1350 : row_indices=row_indices, col_indices=col_indices, &
1351 0 : local_data=local_data)
1352 :
1353 0 : IF (nrow_local > 0 .AND. ncol_local > 0) THEN
1354 0 : BLOCK
1355 : INTEGER, PARAMETER :: ntile = 1024
1356 : INTEGER :: ib, ie, jb, je, mb, kb, ti, tj
1357 0 : REAL(KIND=dp), ALLOCATABLE :: gram_t(:, :), phi_cols_t(:, :), phi_rows_t(:, :)
1358 0 : ALLOCATE (phi_rows_t(ntile, n_ao), phi_cols_t(n_ao, ntile), gram_t(ntile, ntile))
1359 0 : DO ib = 1, nrow_local, ntile
1360 0 : ie = MIN(ib + ntile - 1, nrow_local)
1361 0 : mb = ie - ib + 1
1362 : !$OMP PARALLEL DO DEFAULT(NONE) &
1363 : !$OMP SHARED(mb, n_ao, phi_rows_t, phi_local, row_indices, ib) &
1364 0 : !$OMP PRIVATE(ti, j_loc) SCHEDULE(STATIC)
1365 : DO j_loc = 1, n_ao
1366 : DO ti = 1, mb
1367 : phi_rows_t(ti, j_loc) = phi_local(row_indices(ib + ti - 1), j_loc)
1368 : END DO
1369 : END DO
1370 : !$OMP END PARALLEL DO
1371 0 : DO jb = 1, ncol_local, ntile
1372 0 : je = MIN(jb + ntile - 1, ncol_local)
1373 0 : kb = je - jb + 1
1374 : !$OMP PARALLEL DO DEFAULT(NONE) &
1375 : !$OMP SHARED(kb, n_ao, phi_cols_t, phi_local, col_indices, jb) &
1376 0 : !$OMP PRIVATE(tj, i_loc) SCHEDULE(STATIC)
1377 : DO tj = 1, kb
1378 : DO i_loc = 1, n_ao
1379 : phi_cols_t(i_loc, tj) = phi_local(col_indices(jb + tj - 1), i_loc)
1380 : END DO
1381 : END DO
1382 : !$OMP END PARALLEL DO
1383 : CALL dgemm('N', 'N', mb, kb, n_ao, &
1384 : 1.0_dp, phi_rows_t, ntile, phi_cols_t, n_ao, &
1385 0 : 0.0_dp, gram_t, ntile)
1386 : !$OMP PARALLEL DO DEFAULT(NONE) &
1387 : !$OMP SHARED(mb, kb, gram_t, d_vec, row_indices, col_indices, ib, jb) &
1388 : !$OMP SHARED(local_data, tikhonov) &
1389 0 : !$OMP PRIVATE(ti, tj, ig, jg) SCHEDULE(STATIC)
1390 : DO tj = 1, kb
1391 : jg = col_indices(jb + tj - 1)
1392 : DO ti = 1, mb
1393 : ig = row_indices(ib + ti - 1)
1394 : local_data(ib + ti - 1, jb + tj - 1) = &
1395 : gram_t(ti, tj)*gram_t(ti, tj)*d_vec(ig)*d_vec(jg)
1396 : IF (ig == jg) THEN
1397 : local_data(ib + ti - 1, jb + tj - 1) = &
1398 : local_data(ib + ti - 1, jb + tj - 1) + tikhonov
1399 : END IF
1400 : END DO
1401 : END DO
1402 : !$OMP END PARALLEL DO
1403 : END DO
1404 : END DO
1405 0 : DEALLOCATE (phi_rows_t, phi_cols_t, gram_t)
1406 : END BLOCK
1407 : END IF
1408 :
1409 : ! ---- Load the replicated d_lp into the block-cyclic fm_b -----------
1410 0 : CALL cp_fm_set_submatrix(fm_b, d_lp)
1411 :
1412 : ! ---- pdpotrf (Cholesky factorisation; cp_fm_cholesky_decompose
1413 : ! factors with UPLO='U', so pdpotrs must match)
1414 0 : CALL cp_fm_cholesky_decompose(fm_D, n=n_loc, info_out=info)
1415 0 : IF (info /= 0) THEN
1416 0 : CPABORT("pdpotrf failed in solve_D_lp_distributed")
1417 : END IF
1418 :
1419 : ! ---- pdpotrs/dpotrs (solve in place on fm_b) -----------------------
1420 0 : CALL cp_fm_cholesky_solve(fm_D, fm_b, n=n_loc, info_out=info)
1421 0 : IF (info /= 0) THEN
1422 0 : CPABORT("pdpotrs failed in solve_D_lp_distributed")
1423 : END IF
1424 :
1425 : ! ---- Gather distributed solution back into the replicated d_lp ----
1426 0 : CALL cp_fm_get_submatrix(fm_b, d_lp)
1427 :
1428 0 : CALL cp_fm_release(fm_D)
1429 0 : CALL cp_fm_release(fm_b)
1430 0 : CALL cp_fm_struct_release(fm_struct_D)
1431 0 : CALL cp_fm_struct_release(fm_struct_b)
1432 :
1433 0 : CALL timestop(handle)
1434 :
1435 0 : END SUBROUTINE solve_D_lp_distributed
1436 :
1437 : ! **************************************************************************************************
1438 : !> \brief Computes the χ(iτ) matrix
1439 : !> \param bs_env ...
1440 : !> \param mat_chi_Gamma_tau ...
1441 : !> \param mat_phi_mu_l ...
1442 : !> \param mat_Z_lP ...
1443 : ! **************************************************************************************************
1444 :
1445 10 : SUBROUTINE get_mat_chi_Gamma_tau(bs_env, mat_chi_Gamma_tau, mat_phi_mu_l, mat_Z_lP)
1446 :
1447 : TYPE(post_scf_bandstructure_type), POINTER :: bs_env
1448 : TYPE(dbcsr_p_type), DIMENSION(:), POINTER :: mat_chi_Gamma_tau
1449 : TYPE(dbcsr_type), INTENT(INOUT) :: mat_phi_mu_l, mat_Z_lP
1450 :
1451 : CHARACTER(LEN=*), PARAMETER :: routineN = 'get_mat_chi_Gamma_tau'
1452 :
1453 : INTEGER :: handle, i, i_t, ispin, npcol
1454 10 : INTEGER, DIMENSION(:), POINTER :: blk_ao, blk_grid, dist_col_ao, &
1455 10 : dist_col_grid, dist_row_grid
1456 : REAL(KIND=dp) :: t1, tau
1457 : TYPE(dbcsr_distribution_type) :: dist_grid_grid, dist_phi
1458 : TYPE(dbcsr_type) :: matrix_chi_grid, matrix_chi_grid_spin, &
1459 : matrix_G_occ_grid, matrix_G_vir_grid
1460 :
1461 10 : CALL timeset(routineN, handle)
1462 :
1463 : ! =========================================================================
1464 : ! 1. SETUP CORE TOPOLOGIES
1465 : ! =========================================================================
1466 10 : CALL dbcsr_get_info(mat_phi_mu_l, distribution=dist_phi, row_blk_size=blk_grid, col_blk_size=blk_ao)
1467 10 : CALL dbcsr_distribution_get(dist_phi, row_dist=dist_row_grid, col_dist=dist_col_ao)
1468 :
1469 : ! Determine number of MPI process columns
1470 38 : npcol = MAXVAL(dist_col_ao) + 1
1471 :
1472 : ! Build a perfectly safe column distribution for the Grid dimension
1473 30 : ALLOCATE (dist_col_grid(SIZE(blk_grid)))
1474 204 : DO i = 1, SIZE(blk_grid)
1475 204 : dist_col_grid(i) = MOD(i - 1, npcol)
1476 : END DO
1477 :
1478 : CALL dbcsr_distribution_new(dist_grid_grid, template=dist_phi, &
1479 10 : row_dist=dist_row_grid, col_dist=dist_col_grid)
1480 :
1481 10 : CALL dbcsr_create(matrix_G_occ_grid, "G_occ_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
1482 10 : CALL dbcsr_create(matrix_G_vir_grid, "G_vir_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
1483 10 : CALL dbcsr_create(matrix_chi_grid, "chi_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
1484 10 : CALL dbcsr_create(matrix_chi_grid_spin, "chi_grid_spin", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
1485 :
1486 : ! =========================================================================
1487 : ! 2. MAIN IMAGINARY TIME LOOP
1488 : ! =========================================================================
1489 120 : DO i_t = 1, bs_env%num_time_freq_points
1490 110 : t1 = m_walltime()
1491 :
1492 110 : tau = bs_env%imag_time_points(i_t)
1493 110 : CALL dbcsr_set(matrix_chi_grid, 0.0_dp)
1494 :
1495 : ! ----------------------------------------------------------------------
1496 : ! A. SPIN LOOP (Allocations safely encapsulated in wrappers)
1497 : ! ----------------------------------------------------------------------
1498 240 : DO ispin = 1, bs_env%n_spin
1499 :
1500 : ! G^occ_µλ(i|τ|) = sum_n^occ C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
1501 : ! G^occ_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^occ_µν Φ_ν(r_l')
1502 : CALL build_G_grid(bs_env, tau, ispin, .TRUE., .FALSE., mat_phi_mu_l, &
1503 130 : matrix_G_occ_grid, bs_env%eps_filter)
1504 :
1505 : ! G^vir_µλ(i|τ|) = sum_n^vir C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
1506 : ! G^vir_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^vir_µν Φ_ν(r_l')
1507 : CALL build_G_grid(bs_env, tau, ispin, .FALSE., .TRUE., mat_phi_mu_l, &
1508 130 : matrix_G_vir_grid, bs_env%eps_filter)
1509 :
1510 : ! -------------------------------------------------------------------
1511 : ! B. ELEMENT-WISE HADAMARD PRODUCT
1512 : ! -------------------------------------------------------------------
1513 : ! χ_ll'(iτ) = G^occ_ll'(i|τ|) * G^vir_ll'(i|τ|)
1514 130 : CALL hadamard_product(matrix_G_occ_grid, matrix_G_vir_grid, matrix_chi_grid_spin, bs_env%spin_degeneracy)
1515 :
1516 : ! Accumulate spin contributions
1517 240 : CALL dbcsr_add(matrix_chi_grid, matrix_chi_grid_spin, 1.0_dp, 1.0_dp)
1518 :
1519 : END DO ! ispin
1520 :
1521 : ! ----------------------------------------------------------------------
1522 : ! C. TRANSFORM TO AUXILIARY BASIS & EXPORT DIRECTLY
1523 : ! χ_aux = Z^T * χ_grid * Z
1524 : ! χ_PQ(iτ) = sum_ll' Z_lP χ_ll'(iτ) Z_l'Q
1525 : ! Result is dumped into the final array mat_chi_Gamma_tau!
1526 : ! ----------------------------------------------------------------------
1527 : CALL contract_A_B_A("T", "N", mat_Z_lP, matrix_chi_grid, &
1528 110 : mat_chi_Gamma_tau(i_t)%matrix, bs_env%eps_filter)
1529 :
1530 120 : IF (bs_env%unit_nr > 0) THEN
1531 : WRITE (bs_env%unit_nr, '(T2,A,I13,A,I3,A,F7.1,A)') &
1532 55 : 'Computed χ(iτ,k=0) for time point', i_t, ' /', bs_env%num_time_freq_points, &
1533 110 : ', Execution time', m_walltime() - t1, ' s'
1534 : END IF
1535 :
1536 : END DO ! i_t
1537 :
1538 : ! =========================================================================
1539 : ! 3. FINAL CLEANUP
1540 : ! =========================================================================
1541 10 : CALL dbcsr_release(matrix_G_occ_grid)
1542 10 : CALL dbcsr_release(matrix_G_vir_grid)
1543 10 : CALL dbcsr_release(matrix_chi_grid)
1544 10 : CALL dbcsr_release(matrix_chi_grid_spin)
1545 10 : CALL dbcsr_distribution_release(dist_grid_grid)
1546 10 : DEALLOCATE (dist_col_grid)
1547 :
1548 10 : IF (bs_env%unit_nr > 0) WRITE (bs_env%unit_nr, '(A)') ' '
1549 :
1550 10 : CALL timestop(handle)
1551 :
1552 20 : END SUBROUTINE get_mat_chi_Gamma_tau
1553 :
1554 : ! **************************************************************************************************
1555 : !> \brief Computes Green's Function in grid basis
1556 : !> \param bs_env ...
1557 : !> \param tau ...
1558 : !> \param ispin ...
1559 : !> \param occ ...
1560 : !> \param vir ...
1561 : !> \param mat_phi_mu_l ...
1562 : !> \param matrix_G_grid ...
1563 : !> \param eps_filter ...
1564 : ! **************************************************************************************************
1565 :
1566 1064 : SUBROUTINE build_G_grid(bs_env, tau, ispin, occ, vir, mat_phi_mu_l, matrix_G_grid, eps_filter)
1567 :
1568 : TYPE(post_scf_bandstructure_type), POINTER :: bs_env
1569 : REAL(KIND=dp), INTENT(IN) :: tau
1570 : INTEGER, INTENT(IN) :: ispin
1571 : LOGICAL, INTENT(IN) :: occ, vir
1572 : TYPE(dbcsr_type), INTENT(INOUT) :: mat_phi_mu_l, matrix_G_grid
1573 : REAL(KIND=dp), INTENT(IN) :: eps_filter
1574 :
1575 : CHARACTER(LEN=*), PARAMETER :: routineN = 'build_G_grid'
1576 :
1577 : INTEGER :: handle
1578 532 : INTEGER, DIMENSION(:), POINTER :: blk_ao, dist_row_ao
1579 : TYPE(cp_fm_type), POINTER :: fm_G
1580 : TYPE(dbcsr_distribution_type) :: dist_ao_ao
1581 : TYPE(dbcsr_type) :: matrix_G_ao
1582 :
1583 532 : CALL timeset(routineN, handle)
1584 :
1585 : ! 1. Select the FM matrix based on occ/vir flags
1586 532 : IF (occ) THEN
1587 272 : fm_G => bs_env%fm_Gocc
1588 : ELSE
1589 260 : fm_G => bs_env%fm_Gvir
1590 : END IF
1591 :
1592 : ! 2. Compute Dense FM Green's Function
1593 : ! G^occ/vir_µλ(i|τ|) = sum_G^occ/vir_µλn^occ/vir C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
1594 532 : CALL G_occ_vir(bs_env, tau, fm_G, ispin, occ=occ, vir=vir)
1595 :
1596 : ! 3. Setup AO DBCSR Topology and Create Matrix dynamically
1597 532 : CALL setup_square_topology(mat_phi_mu_l, 'COL', dist_ao_ao, blk_ao, dist_row_ao)
1598 :
1599 : CALL dbcsr_create(matrix_G_ao, name="G_ao", dist=dist_ao_ao, &
1600 : matrix_type=dbcsr_type_no_symmetry, &
1601 532 : row_blk_size=blk_ao, col_blk_size=blk_ao)
1602 :
1603 : ! 4. Convert FM to Sparse DBCSR
1604 532 : CALL copy_fm_to_dbcsr(fm_G, matrix_G_ao, keep_sparsity=.FALSE.)
1605 :
1606 : ! 5. Transform to Grid Basis: G_grid = phi * G_ao * phi^T
1607 : ! G^occ/vir_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^occ/vir_µν Φ_ν(r_l')
1608 532 : CALL contract_A_B_A("N", "T", mat_phi_mu_l, matrix_G_ao, matrix_G_grid, eps_filter)
1609 :
1610 : ! 6. Release AO matrix and topology
1611 532 : CALL release_dbcsr_topology_and_matrices(dist=dist_ao_ao, mapped_dist=dist_row_ao, m1=matrix_G_ao)
1612 :
1613 532 : CALL timestop(handle)
1614 :
1615 532 : END SUBROUTINE build_G_grid
1616 :
1617 : ! **************************************************************************************************
1618 : !> \brief Generalized routine to compute OUT = A * B * A^T OR OUT = A^T * B * A using DBCSR
1619 : !> \param transA_left ...
1620 : !> \param transA_right ...
1621 : !> \param matrix_A ...
1622 : !> \param matrix_B ...
1623 : !> \param matrix_out ...
1624 : !> \param eps_filter ...
1625 : ! **************************************************************************************************
1626 :
1627 1034 : SUBROUTINE contract_A_B_A(transA_left, transA_right, matrix_A, matrix_B, matrix_out, eps_filter)
1628 :
1629 : CHARACTER(LEN=1), INTENT(IN) :: transA_left, transA_right
1630 : TYPE(dbcsr_type), INTENT(INOUT) :: matrix_A, matrix_B, matrix_out
1631 : REAL(KIND=dp), INTENT(IN) :: eps_filter
1632 :
1633 : CHARACTER(LEN=*), PARAMETER :: routineN = 'contract_A_B_A'
1634 :
1635 : INTEGER :: handle
1636 : TYPE(dbcsr_type) :: matrix_tmp
1637 :
1638 1034 : CALL timeset(routineN, handle)
1639 :
1640 1034 : CALL dbcsr_create(matrix_tmp, template=matrix_A)
1641 :
1642 1034 : IF (transA_left == "N" .AND. transA_right == "T") THEN
1643 : ! Path 1: Out = A * B * A^T
1644 : CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_A, matrix_B, &
1645 652 : 0.0_dp, matrix_tmp, filter_eps=eps_filter)
1646 : CALL dbcsr_multiply("N", "T", 1.0_dp, matrix_tmp, matrix_A, &
1647 652 : 0.0_dp, matrix_out, filter_eps=eps_filter)
1648 :
1649 382 : ELSE IF (transA_left == "T" .AND. transA_right == "N") THEN
1650 : ! Path 2: Out = A^T * B * A
1651 : CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_B, matrix_A, &
1652 382 : 0.0_dp, matrix_tmp, filter_eps=eps_filter)
1653 : CALL dbcsr_multiply("T", "N", 1.0_dp, matrix_A, matrix_tmp, &
1654 382 : 0.0_dp, matrix_out, filter_eps=eps_filter)
1655 : ELSE
1656 0 : CPABORT("Unsupported transposition pair in contract_A_B_A")
1657 : END IF
1658 :
1659 1034 : CALL dbcsr_release(matrix_tmp)
1660 :
1661 1034 : CALL timestop(handle)
1662 :
1663 1034 : END SUBROUTINE contract_A_B_A
1664 :
1665 : ! **************************************************************************************************
1666 : !> \brief Computes C = A ◦ B (Element-wise Hadamard product) for sparse DBCSR matrices.
1667 : !> \param matrix_A ...
1668 : !> \param matrix_B ...
1669 : !> \param matrix_C ...
1670 : !> \param fac (Scaling factor applied to the product)
1671 : ! **************************************************************************************************
1672 :
1673 804 : SUBROUTINE hadamard_product(matrix_A, matrix_B, matrix_C, fac)
1674 :
1675 : TYPE(dbcsr_type), INTENT(INOUT) :: matrix_A, matrix_B, matrix_C
1676 : REAL(KIND=dp), INTENT(IN) :: fac
1677 :
1678 : CHARACTER(LEN=*), PARAMETER :: routineN = 'hadamard_product'
1679 :
1680 : INTEGER :: col, handle, row
1681 : LOGICAL :: found
1682 402 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: blk_B, blk_C
1683 : TYPE(dbcsr_iterator_type) :: iter
1684 :
1685 402 : CALL timeset(routineN, handle)
1686 :
1687 402 : CALL dbcsr_copy(matrix_C, matrix_A)
1688 :
1689 402 : CALL dbcsr_iterator_start(iter, matrix_C)
1690 77923 : DO WHILE (dbcsr_iterator_blocks_left(iter))
1691 77521 : CALL dbcsr_iterator_next_block(iter, row, col, blk_C)
1692 :
1693 77521 : CALL dbcsr_get_block_p(matrix_B, row, col, blk_B, found)
1694 :
1695 77923 : IF (found) THEN
1696 163879394 : blk_C(:, :) = fac*blk_C(:, :)*blk_B(:, :)
1697 : ELSE
1698 : ! If B is sparse here, the product is zero
1699 0 : blk_C(:, :) = 0.0_dp
1700 : END IF
1701 : END DO
1702 402 : CALL dbcsr_iterator_stop(iter)
1703 :
1704 402 : CALL timestop(handle)
1705 :
1706 402 : END SUBROUTINE hadamard_product
1707 :
1708 : ! **************************************************************************************************
1709 : !> \brief Compute screened Coulomb interaction matrix
1710 : !> \param bs_env ...
1711 : !> \param qs_env ...
1712 : !> \param mat_chi_Gamma_tau ...
1713 : !> \param fm_W_time ...
1714 : ! **************************************************************************************************
1715 :
1716 40 : SUBROUTINE compute_W(bs_env, qs_env, mat_chi_Gamma_tau, fm_W_time)
1717 : TYPE(post_scf_bandstructure_type), POINTER :: bs_env
1718 : TYPE(qs_environment_type), POINTER :: qs_env
1719 : TYPE(dbcsr_p_type), DIMENSION(:), POINTER :: mat_chi_Gamma_tau
1720 : TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:) :: fm_W_time
1721 :
1722 : CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_W'
1723 :
1724 : INTEGER :: handle, i_t, j_w
1725 : REAL(KIND=dp) :: t1
1726 : TYPE(cp_fm_type) :: fm_M_inv_V_sqrt, fm_V, fm_V_sqrt
1727 :
1728 10 : CALL timeset(routineN, handle)
1729 :
1730 10 : t1 = m_walltime()
1731 :
1732 10 : CALL create_fm_W_MIC_time(bs_env, fm_W_time)
1733 :
1734 : ! 1. Allocate V and M matrices
1735 10 : CALL cp_fm_create(fm_V, bs_env%fm_RI_RI%matrix_struct)
1736 10 : CALL cp_fm_create(fm_V_sqrt, bs_env%fm_RI_RI%matrix_struct)
1737 10 : CALL cp_fm_create(fm_M_inv_V_sqrt, bs_env%fm_RI_RI%matrix_struct)
1738 :
1739 : ! Compute V and M^-1 * V^0.5
1740 10 : CALL compute_V_MinvVsqrt(bs_env, qs_env, fm_V, fm_V_sqrt, fm_M_inv_V_sqrt)
1741 :
1742 : ! 2. Loop over frequencies
1743 120 : DO j_w = 1, bs_env%num_time_freq_points
1744 : ! Fourier transformation of χ_PQ(iτ) to χ_PQ(iω_j)
1745 110 : CALL compute_fm_chi_Gamma_freq(bs_env, bs_env%fm_chi_Gamma_freq, j_w, mat_chi_Gamma_tau)
1746 :
1747 : ! ε(iω_j) = Id - V^0.5*M^-1*χ(iω_j)*M^-1*V^0.5
1748 : ! W(iω_j) = V^0.5*(ε^-1(iω_j)-Id)*V^0.5
1749 : CALL compute_fm_W_freq(bs_env, bs_env%fm_chi_Gamma_freq, fm_V_sqrt, &
1750 110 : fm_M_inv_V_sqrt, bs_env%fm_W_MIC_freq)
1751 :
1752 : ! Fourier transform from W_PQ^MIC(iω_j) to W_PQ^MIC(iτ)
1753 120 : CALL Fourier_transform_w_to_t(bs_env, fm_W_time, bs_env%fm_W_MIC_freq, j_w)
1754 : END DO
1755 :
1756 : ! M^-1*W^MIC(iτ)*M^-1
1757 10 : CALL multiply_fm_W_MIC_time_with_Minv_Gamma(bs_env, qs_env, fm_W_time)
1758 :
1759 10 : IF (bs_env%unit_nr > 0) THEN
1760 : WRITE (bs_env%unit_nr, '(T2,A,T55,A,F10.1,A)') &
1761 5 : 'Computed W(iτ),', ' Execution time', m_walltime() - t1, ' s'
1762 : END IF
1763 :
1764 10 : CALL dbcsr_deallocate_matrix_set(mat_chi_Gamma_tau)
1765 :
1766 : ! Cleanup
1767 10 : CALL cp_fm_release(fm_V)
1768 10 : CALL cp_fm_release(fm_V_sqrt)
1769 10 : CALL cp_fm_release(fm_M_inv_V_sqrt)
1770 :
1771 : ! Marek : Fourier transform W^MIC(itau) back to get it at a specific im.frequency point - iomega = 0
1772 10 : IF (bs_env%rtp_method == rtp_method_bse) THEN
1773 0 : t1 = m_walltime()
1774 0 : CALL cp_fm_create(bs_env%fm_W_MIC_freq_zero, bs_env%fm_W_MIC_freq%matrix_struct)
1775 : ! Set to zero
1776 0 : CALL cp_fm_set_all(bs_env%fm_W_MIC_freq_zero, 0.0_dp)
1777 : ! Sum over all times
1778 0 : DO i_t = 1, bs_env%num_time_freq_points
1779 : ! Add the relevant structure with correct weight
1780 : CALL cp_fm_scale_and_add(1.0_dp, bs_env%fm_W_MIC_freq_zero, &
1781 0 : bs_env%imag_time_weights_freq_zero(i_t), fm_W_time(i_t))
1782 : END DO
1783 : ! Done, save to file
1784 0 : CALL fm_write(bs_env%fm_W_MIC_freq_zero, 0, "W_freq_rtp", qs_env)
1785 : ! Report calculation
1786 0 : IF (bs_env%unit_nr > 0) THEN
1787 : WRITE (bs_env%unit_nr, '(T2,A,T55,A,F10.1,A)') &
1788 0 : 'Computed W(0),', ' Execution time', m_walltime() - t1, ' s'
1789 : END IF
1790 : END IF
1791 :
1792 10 : IF (bs_env%unit_nr > 0) WRITE (bs_env%unit_nr, '(A)') ' '
1793 :
1794 10 : CALL timestop(handle)
1795 :
1796 10 : END SUBROUTINE compute_W
1797 :
1798 : ! **************************************************************************************************
1799 : !> \brief Computes V, V^0.5, and M^-1 * V^0.5
1800 : !> \param bs_env ...
1801 : !> \param qs_env ...
1802 : !> \param fm_V ...
1803 : !> \param fm_V_sqrt ...
1804 : !> \param fm_Minv_Vsqrt ...
1805 : ! **************************************************************************************************
1806 :
1807 10 : SUBROUTINE compute_V_MinvVsqrt(bs_env, qs_env, fm_V, fm_V_sqrt, fm_Minv_Vsqrt)
1808 : TYPE(post_scf_bandstructure_type), POINTER :: bs_env
1809 : TYPE(qs_environment_type), POINTER :: qs_env
1810 : TYPE(cp_fm_type), INTENT(INOUT) :: fm_V, fm_V_sqrt, fm_Minv_Vsqrt
1811 :
1812 : CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_V_MinvVsqrt'
1813 :
1814 : INTEGER :: handle, info, n_RI, ndep
1815 10 : TYPE(atomic_kind_type), DIMENSION(:), POINTER :: atomic_kind_set
1816 : TYPE(cell_type), POINTER :: cell
1817 : TYPE(cp_fm_type) :: fm_work
1818 10 : TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :) :: fm_M
1819 10 : TYPE(dbcsr_p_type), DIMENSION(:, :), POINTER :: mat_V_kp
1820 10 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
1821 10 : TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
1822 :
1823 10 : CALL timeset(routineN, handle)
1824 :
1825 10 : n_RI = bs_env%n_RI
1826 10 : CALL cp_fm_create(fm_work, fm_V%matrix_struct)
1827 :
1828 : ! -----------------------------------------------------------------------
1829 : ! 1. Build Coulomb Matrix V(k=0) using the kp-routine but only for ikp=1
1830 : ! -----------------------------------------------------------------------
1831 : CALL get_qs_env(qs_env=qs_env, particle_set=particle_set, cell=cell, &
1832 10 : qs_kind_set=qs_kind_set, atomic_kind_set=atomic_kind_set)
1833 :
1834 50 : ALLOCATE (mat_V_kp(1:1, 1:2))
1835 10 : NULLIFY (mat_V_kp(1, 1)%matrix, mat_V_kp(1, 2)%matrix)
1836 10 : ALLOCATE (mat_V_kp(1, 1)%matrix, mat_V_kp(1, 2)%matrix)
1837 :
1838 10 : CALL dbcsr_create(mat_V_kp(1, 1)%matrix, template=bs_env%mat_RI_RI%matrix)
1839 10 : CALL dbcsr_reserve_all_blocks(mat_V_kp(1, 1)%matrix)
1840 10 : CALL dbcsr_set(mat_V_kp(1, 1)%matrix, 0.0_dp)
1841 :
1842 : ! Dummy imaginary part just to satisfy the routine
1843 10 : CALL dbcsr_create(mat_V_kp(1, 2)%matrix, template=bs_env%mat_RI_RI%matrix)
1844 10 : CALL dbcsr_reserve_all_blocks(mat_V_kp(1, 2)%matrix)
1845 10 : CALL dbcsr_set(mat_V_kp(1, 2)%matrix, 0.0_dp)
1846 :
1847 40 : bs_env%kpoints_chi_eps_W%nkp_grid = bs_env%nkp_grid_chi_eps_W_orig
1848 :
1849 : CALL build_2c_coulomb_matrix_kp(mat_V_kp, bs_env%kpoints_chi_eps_W, "RI_AUX", cell, &
1850 : particle_set, qs_kind_set, atomic_kind_set, &
1851 10 : bs_env%size_lattice_sum_V, operator_coulomb, 1, 1)
1852 :
1853 : ! Copy real part to fm_V
1854 10 : CALL copy_dbcsr_to_fm(mat_V_kp(1, 1)%matrix, fm_V)
1855 :
1856 10 : CALL dbcsr_deallocate_matrix(mat_V_kp(1, 1)%matrix)
1857 10 : CALL dbcsr_deallocate_matrix(mat_V_kp(1, 2)%matrix)
1858 10 : DEALLOCATE (mat_V_kp)
1859 :
1860 : ! -----------------------------------------------------------------------
1861 : ! 2. Get RI-Metric Matrix M(k=0)
1862 : ! -----------------------------------------------------------------------
1863 : CALL RI_2c_integral_mat(qs_env, fm_M, fm_V, n_RI, bs_env%ri_metric, &
1864 10 : do_kpoints=.FALSE., regularization_RI=bs_env%regularization_RI)
1865 :
1866 : ! -----------------------------------------------------------------------
1867 : ! 3. M -> M^-1
1868 : ! -----------------------------------------------------------------------
1869 10 : CALL cp_fm_cholesky_decompose(fm_M(1, 1), info_out=info)
1870 10 : IF (info == 0) THEN
1871 10 : CALL cp_fm_cholesky_invert(fm_M(1, 1))
1872 10 : CALL cp_fm_uplo_to_full(fm_M(1, 1), fm_work)
1873 : ELSE
1874 : ! Fallback if Cholesky fails due to conditioning
1875 0 : CALL cp_fm_power(fm_M(1, 1), fm_work, -1.0_dp, bs_env%eps_eigval_mat_RI, ndep)
1876 0 : CALL cp_fm_to_fm(fm_work, fm_M(1, 1))
1877 : END IF
1878 :
1879 : ! -----------------------------------------------------------------------
1880 : ! 4. V -> V^0.5
1881 : ! -----------------------------------------------------------------------
1882 10 : CALL cp_fm_to_fm(fm_V, fm_V_sqrt)
1883 10 : CALL cp_fm_cholesky_decompose(fm_V_sqrt, info_out=info)
1884 10 : IF (info == 0) THEN
1885 10 : CALL clean_lower_part(fm_V_sqrt)
1886 : ELSE
1887 0 : CALL cp_fm_power(fm_V, fm_V_sqrt, 0.5_dp, bs_env%eps_eigval_mat_RI, ndep)
1888 : END IF
1889 :
1890 : ! -----------------------------------------------------------------------
1891 : ! 5. M^-1 * V^0.5
1892 : ! -----------------------------------------------------------------------
1893 : CALL parallel_gemm("N", "T", n_RI, n_RI, n_RI, 1.0_dp, fm_M(1, 1), fm_V_sqrt, &
1894 10 : 0.0_dp, fm_Minv_Vsqrt)
1895 :
1896 10 : CALL cp_fm_release(fm_M)
1897 10 : CALL cp_fm_release(fm_work)
1898 :
1899 10 : CALL timestop(handle)
1900 :
1901 40 : END SUBROUTINE compute_V_MinvVsqrt
1902 :
1903 : ! **************************************************************************************************
1904 : !> \brief Computes W(iω) from χ_PQ(iω_j)
1905 : !> \param bs_env ...
1906 : !> \param fm_chi_freq_j ...
1907 : !> \param fm_V_sqrt ...
1908 : !> \param fm_Minv_Vsqrt ...
1909 : !> \param fm_W_freq_j ...
1910 : ! **************************************************************************************************
1911 :
1912 440 : SUBROUTINE compute_fm_W_freq(bs_env, fm_chi_freq_j, fm_V_sqrt, fm_Minv_Vsqrt, fm_W_freq_j)
1913 : TYPE(post_scf_bandstructure_type), POINTER :: bs_env
1914 : TYPE(cp_fm_type), INTENT(IN) :: fm_chi_freq_j, fm_V_sqrt, fm_Minv_Vsqrt
1915 : TYPE(cp_fm_type), INTENT(INOUT) :: fm_W_freq_j
1916 :
1917 : CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_fm_W_freq'
1918 :
1919 : INTEGER :: handle, info, n_RI, ndep
1920 : TYPE(cp_fm_type) :: fm_eps_freq_j, fm_work
1921 :
1922 110 : CALL timeset(routineN, handle)
1923 :
1924 110 : n_RI = bs_env%n_RI
1925 :
1926 110 : CALL cp_fm_create(fm_eps_freq_j, fm_chi_freq_j%matrix_struct)
1927 110 : CALL cp_fm_create(fm_work, fm_chi_freq_j%matrix_struct)
1928 :
1929 : ! -----------------------------------------------------------------------
1930 : ! 1. ε(iω_j) = Id - (M^-1 * V^0.5)^T * χ(iω_j) * (M^-1 * V^0.5)
1931 : ! -----------------------------------------------------------------------
1932 : ! work = χ(iω_j) * (M^-1 * V^0.5)
1933 : CALL parallel_gemm('N', 'N', n_RI, n_RI, n_RI, 1.0_dp, &
1934 110 : fm_chi_freq_j, fm_Minv_Vsqrt, 0.0_dp, fm_work)
1935 :
1936 : ! eps_work = (M^-1 * V^0.5)^T * work
1937 : CALL parallel_gemm('T', 'N', n_RI, n_RI, n_RI, 1.0_dp, &
1938 110 : fm_Minv_Vsqrt, fm_work, 0.0_dp, fm_eps_freq_j)
1939 :
1940 : ! ε(iω_j) = Id - eps_work --> -eps_work + Id
1941 110 : CALL fm_add_on_diag(fm_eps_freq_j, 1.0_dp)
1942 :
1943 : ! Force perfect symmetry before Cholesky to avoid info != 0 due to GEMM noise
1944 110 : CALL cp_fm_uplo_to_full(fm_eps_freq_j, fm_work)
1945 :
1946 : ! -----------------------------------------------------------------------
1947 : ! 2. W(iω_j) = V^0.5^T * (ε^-1(iω_j) - Id) * V^0.5
1948 : ! -----------------------------------------------------------------------
1949 :
1950 : ! a) Cholesky decomposition of ε(iω_j)
1951 110 : CALL cp_fm_cholesky_decompose(fm_eps_freq_j, info_out=info)
1952 :
1953 : ! b) Inversion
1954 110 : IF (info == 0) THEN
1955 110 : CALL cp_fm_cholesky_invert(fm_eps_freq_j)
1956 110 : CALL cp_fm_uplo_to_full(fm_eps_freq_j, fm_work)
1957 : ELSE
1958 : ! Fallback to expensive diagonalization if Cholesky fails
1959 0 : CALL cp_fm_power(fm_eps_freq_j, fm_work, -1.0_dp, bs_env%eps_eigval_mat_RI, ndep)
1960 0 : CALL cp_fm_to_fm(fm_work, fm_eps_freq_j)
1961 : END IF
1962 :
1963 : ! c) ε^-1(iω_j) - Id
1964 110 : CALL fm_add_on_diag(fm_eps_freq_j, -1.0_dp)
1965 :
1966 : ! d) work = (ε^-1(iω_j) - Id) * V^0.5
1967 : CALL parallel_gemm('N', 'N', n_RI, n_RI, n_RI, 1.0_dp, fm_eps_freq_j, fm_V_sqrt, &
1968 110 : 0.0_dp, fm_work)
1969 :
1970 : ! e) W(iw) = V^0.5^T * work
1971 : CALL parallel_gemm('T', 'N', n_RI, n_RI, n_RI, 1.0_dp, fm_V_sqrt, fm_work, &
1972 110 : 0.0_dp, fm_W_freq_j)
1973 :
1974 : ! Cleanup
1975 110 : CALL cp_fm_release(fm_work)
1976 110 : CALL cp_fm_release(fm_eps_freq_j)
1977 :
1978 110 : CALL timestop(handle)
1979 :
1980 110 : END SUBROUTINE compute_fm_W_freq
1981 :
1982 : ! **************************************************************************************************
1983 : !> \brief Adds a real scalar value to the diagonal of a real full matrix (fm)
1984 : !> \param fm ...
1985 : !> \param alpha ...
1986 : ! **************************************************************************************************
1987 :
1988 440 : SUBROUTINE fm_add_on_diag(fm, alpha)
1989 : TYPE(cp_fm_type), INTENT(INOUT) :: fm
1990 : REAL(KIND=dp), INTENT(IN) :: alpha
1991 :
1992 : CHARACTER(LEN=*), PARAMETER :: routineN = 'fm_add_on_diag'
1993 :
1994 : INTEGER :: handle, i_global, i_row, j_col, &
1995 : j_global, ncol_local, nrow_local
1996 220 : INTEGER, DIMENSION(:), POINTER :: col_indices, row_indices
1997 :
1998 220 : CALL timeset(routineN, handle)
1999 :
2000 : CALL cp_fm_get_info(matrix=fm, &
2001 : nrow_local=nrow_local, &
2002 : ncol_local=ncol_local, &
2003 : row_indices=row_indices, &
2004 220 : col_indices=col_indices)
2005 :
2006 4880 : DO j_col = 1, ncol_local
2007 4660 : j_global = col_indices(j_col)
2008 54250 : DO i_row = 1, nrow_local
2009 49370 : i_global = row_indices(i_row)
2010 54030 : IF (j_global == i_global) THEN
2011 2330 : fm%local_data(i_row, j_col) = fm%local_data(i_row, j_col) + alpha
2012 : END IF
2013 : END DO
2014 : END DO
2015 :
2016 220 : CALL timestop(handle)
2017 :
2018 220 : END SUBROUTINE fm_add_on_diag
2019 :
2020 : ! **************************************************************************************************
2021 : !> \brief Zeroes out the strictly lower triangular part of a real matrix
2022 : !> \param fm_mat ...
2023 : ! **************************************************************************************************
2024 20 : SUBROUTINE clean_lower_part(fm_mat)
2025 : TYPE(cp_fm_type) :: fm_mat
2026 :
2027 : CHARACTER(LEN=*), PARAMETER :: routineN = 'clean_lower_part'
2028 :
2029 : INTEGER :: handle, i_row, j_col, j_global, &
2030 : ncol_local, nrow_local
2031 10 : INTEGER, DIMENSION(:), POINTER :: col_indices, row_indices
2032 :
2033 10 : CALL timeset(routineN, handle)
2034 :
2035 : CALL cp_fm_get_info(matrix=fm_mat, &
2036 : nrow_local=nrow_local, ncol_local=ncol_local, &
2037 10 : row_indices=row_indices, col_indices=col_indices)
2038 :
2039 222 : DO j_col = 1, ncol_local
2040 212 : j_global = col_indices(j_col)
2041 2470 : DO i_row = 1, nrow_local
2042 2460 : IF (j_global < row_indices(i_row)) fm_mat%local_data(i_row, j_col) = 0.0_dp
2043 : END DO
2044 : END DO
2045 :
2046 10 : CALL timestop(handle)
2047 :
2048 10 : END SUBROUTINE clean_lower_part
2049 :
2050 : ! **************************************************************************************************
2051 : !> \brief Computes the exact exchange part of the GW self-energy
2052 : !> \param bs_env ...
2053 : !> \param qs_env ...
2054 : !> \param mat_phi_mu_l ...
2055 : !> \param mat_Z_lP ...
2056 : !> \param fm_Sigma_x_Gamma ...
2057 : ! **************************************************************************************************
2058 :
2059 10 : SUBROUTINE compute_Sigma_x(bs_env, qs_env, mat_phi_mu_l, mat_Z_lP, fm_Sigma_x_Gamma)
2060 :
2061 : TYPE(post_scf_bandstructure_type), POINTER :: bs_env
2062 : TYPE(qs_environment_type), POINTER :: qs_env
2063 : TYPE(dbcsr_type), INTENT(INOUT) :: mat_phi_mu_l, mat_Z_lP
2064 : TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:) :: fm_Sigma_x_Gamma
2065 :
2066 : CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_Sigma_x'
2067 :
2068 : INTEGER :: handle, ispin
2069 10 : INTEGER, DIMENSION(:), POINTER :: blk_aux, blk_grid, dist_col_grid, &
2070 10 : dist_row_aux
2071 : REAL(KIND=dp) :: t1
2072 10 : TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :) :: fm_Vtr_Gamma
2073 : TYPE(dbcsr_distribution_type) :: dist_aux_aux, dist_grid_grid
2074 : TYPE(dbcsr_type) :: mat_Sigma_x_Gamma, matrix_D_grid, &
2075 : matrix_Sigma_x_grid, matrix_V_aux, &
2076 : matrix_V_grid
2077 :
2078 10 : CALL timeset(routineN, handle)
2079 :
2080 10 : t1 = m_walltime()
2081 :
2082 42 : ALLOCATE (fm_Sigma_x_Gamma(bs_env%n_spin))
2083 22 : DO ispin = 1, bs_env%n_spin
2084 22 : CALL cp_fm_create(fm_Sigma_x_Gamma(ispin), bs_env%fm_s_Gamma%matrix_struct)
2085 : END DO
2086 :
2087 10 : CALL dbcsr_create(mat_Sigma_x_Gamma, template=bs_env%mat_ao_ao%matrix)
2088 :
2089 : ! =========================================================================
2090 : ! 1. SETUP CORE TOPOLOGIES
2091 : ! =========================================================================
2092 10 : CALL setup_square_topology(mat_phi_mu_l, 'ROW', dist_grid_grid, blk_grid, dist_col_grid)
2093 10 : CALL setup_square_topology(mat_Z_lP, 'COL', dist_aux_aux, blk_aux, dist_row_aux)
2094 :
2095 : ! =========================================================================
2096 : ! 2. COMPUTE V^tr_ll'
2097 : ! =========================================================================
2098 : CALL RI_2c_integral_mat(qs_env, fm_Vtr_Gamma, bs_env%fm_RI_RI, bs_env%n_RI, &
2099 10 : bs_env%trunc_coulomb, do_kpoints=.FALSE.)
2100 :
2101 : ! M^-1 * V^tr * M^-1 directly modifies fm_Vtr_Gamma(:, 1)
2102 10 : CALL multiply_fm_W_MIC_time_with_Minv_Gamma(bs_env, qs_env, fm_Vtr_Gamma(:, 1))
2103 :
2104 10 : CALL dbcsr_create(matrix_V_aux, "V_aux", dist_aux_aux, dbcsr_type_no_symmetry, blk_aux, blk_aux)
2105 10 : CALL dbcsr_create(matrix_V_grid, "V_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
2106 :
2107 10 : CALL copy_fm_to_dbcsr(fm_Vtr_Gamma(1, 1), matrix_V_aux, keep_sparsity=.FALSE.)
2108 :
2109 : ! V^tr_ll' = sum_PQ Z_lP V^trunc_PQ Z_l'Q
2110 10 : CALL contract_A_B_A("N", "T", mat_Z_lP, matrix_V_aux, matrix_V_grid, bs_env%eps_filter)
2111 10 : CALL dbcsr_release(matrix_V_aux)
2112 :
2113 : ! =========================================================================
2114 : ! 3. SPIN LOOP FOR EXACT EXCHANGE
2115 : ! =========================================================================
2116 22 : DO ispin = 1, bs_env%n_spin
2117 :
2118 : ! Density matrix on grid is essentially G_occ at tau = 0.0
2119 : ! D_µν = sum_n^occ C_µn C_νn
2120 : ! D_ll' = sum_µν Φ_µ(r_l) D_µν Φ_ν(r_l')
2121 12 : CALL dbcsr_create(matrix_D_grid, "D_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
2122 12 : CALL build_G_grid(bs_env, 0.0_dp, ispin, .TRUE., .FALSE., mat_phi_mu_l, matrix_D_grid, bs_env%eps_filter)
2123 :
2124 : ! Element-wise Hadamard product: Σ^x_grid = D_grid ◦ V_grid
2125 : ! Σ^x_ll' = D_ll' * V^tr_ll'
2126 12 : CALL dbcsr_create(matrix_Sigma_x_grid, template=matrix_V_grid)
2127 12 : CALL hadamard_product(matrix_D_grid, matrix_V_grid, matrix_Sigma_x_grid, 1.0_dp)
2128 :
2129 12 : CALL dbcsr_release(matrix_D_grid)
2130 :
2131 : ! Transform back to AO basis: Σ^x_ao = -1.0 * phi^T * Σ^x_grid * phi
2132 : ! Σ^x_λσ = -sum_ll' Φ_λ(r_l) Σ^x_ll' Φ_σ(r_l')
2133 12 : CALL contract_A_B_A("T", "N", mat_phi_mu_l, matrix_Sigma_x_grid, mat_Sigma_x_Gamma, bs_env%eps_filter)
2134 12 : CALL dbcsr_scale(mat_Sigma_x_Gamma, -1.0_dp)
2135 :
2136 12 : CALL dbcsr_release(matrix_Sigma_x_grid)
2137 :
2138 : ! Data I/O and Export to CP2K Full Matrices
2139 22 : CALL copy_dbcsr_to_fm(mat_Sigma_x_Gamma, fm_Sigma_x_Gamma(ispin))
2140 :
2141 : END DO ! ispin
2142 :
2143 10 : IF (bs_env%unit_nr > 0) THEN
2144 : WRITE (bs_env%unit_nr, '(T2,A,T58,A,F7.1,A)') &
2145 5 : 'Computed Σ^x(k=0),', ' Execution time', m_walltime() - t1, ' s'
2146 5 : WRITE (bs_env%unit_nr, '(A)') ' '
2147 : END IF
2148 :
2149 : ! =========================================================================
2150 : ! 4. CLEANUP
2151 : ! =========================================================================
2152 : CALL release_dbcsr_topology_and_matrices(dist=dist_grid_grid, mapped_dist=dist_col_grid, &
2153 10 : m1=mat_Sigma_x_Gamma, m2=matrix_V_grid)
2154 10 : CALL release_dbcsr_topology_and_matrices(dist=dist_aux_aux, mapped_dist=dist_row_aux)
2155 :
2156 10 : CALL cp_fm_release(fm_Vtr_Gamma)
2157 :
2158 10 : CALL timestop(handle)
2159 :
2160 30 : END SUBROUTINE compute_Sigma_x
2161 :
2162 : ! **************************************************************************************************
2163 : !> \brief Computes the correlation part of the GW self-energy
2164 : !> \param bs_env ...
2165 : !> \param fm_W_time ...
2166 : !> \param mat_phi_mu_l ...
2167 : !> \param mat_Z_lP ...
2168 : !> \param fm_Sigma_c_Gamma_time ...
2169 : ! **************************************************************************************************
2170 :
2171 10 : SUBROUTINE compute_Sigma_c(bs_env, fm_W_time, mat_phi_mu_l, mat_Z_lP, fm_Sigma_c_Gamma_time)
2172 :
2173 : TYPE(post_scf_bandstructure_type), POINTER :: bs_env
2174 : TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:) :: fm_W_time
2175 : TYPE(dbcsr_type), INTENT(INOUT) :: mat_phi_mu_l, mat_Z_lP
2176 : TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :, :) :: fm_Sigma_c_Gamma_time
2177 :
2178 : CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_Sigma_c'
2179 :
2180 : INTEGER :: handle, i_t, ispin
2181 10 : INTEGER, DIMENSION(:), POINTER :: blk_aux, blk_grid, dist_col_grid, &
2182 10 : dist_row_aux
2183 : REAL(KIND=dp) :: t1, tau
2184 : TYPE(dbcsr_distribution_type) :: dist_aux_aux, dist_grid_grid
2185 10 : TYPE(dbcsr_p_type), DIMENSION(:, :), POINTER :: mat_Sigma_neg_tau, mat_Sigma_pos_tau
2186 : TYPE(dbcsr_type) :: matrix_G_occ_grid, matrix_G_vir_grid, matrix_Sigma_neg_grid, &
2187 : matrix_Sigma_pos_grid, matrix_W_aux, matrix_W_grid
2188 :
2189 10 : CALL timeset(routineN, handle)
2190 :
2191 : ! =========================================================================
2192 : ! 1. SETUP CORE TOPOLOGIES AND PRE-ALLOCATE OUTPUT ARRAYS
2193 : ! =========================================================================
2194 10 : CALL setup_square_topology(mat_phi_mu_l, 'ROW', dist_grid_grid, blk_grid, dist_col_grid)
2195 10 : CALL setup_square_topology(mat_Z_lP, 'COL', dist_aux_aux, blk_aux, dist_row_aux)
2196 :
2197 : ! Pre-allocate local DBCSR matrices to act as targets for final output
2198 10 : NULLIFY (mat_Sigma_neg_tau, mat_Sigma_pos_tau)
2199 182 : ALLOCATE (mat_Sigma_neg_tau(bs_env%num_time_freq_points, bs_env%n_spin))
2200 182 : ALLOCATE (mat_Sigma_pos_tau(bs_env%num_time_freq_points, bs_env%n_spin))
2201 :
2202 120 : DO i_t = 1, bs_env%num_time_freq_points
2203 250 : DO ispin = 1, bs_env%n_spin
2204 130 : ALLOCATE (mat_Sigma_neg_tau(i_t, ispin)%matrix)
2205 130 : ALLOCATE (mat_Sigma_pos_tau(i_t, ispin)%matrix)
2206 130 : CALL dbcsr_create(mat_Sigma_neg_tau(i_t, ispin)%matrix, template=bs_env%mat_ao_ao%matrix)
2207 240 : CALL dbcsr_create(mat_Sigma_pos_tau(i_t, ispin)%matrix, template=bs_env%mat_ao_ao%matrix)
2208 : END DO
2209 : END DO
2210 :
2211 : ! =========================================================================
2212 : ! 2. MAIN IMAGINARY TIME LOOP
2213 : ! =========================================================================
2214 120 : DO i_t = 1, bs_env%num_time_freq_points
2215 110 : tau = bs_env%imag_time_points(i_t)
2216 :
2217 : ! -------------------------------------------------------------------
2218 : ! Compute W_grid = Z * W_aux * Z^T
2219 : ! -------------------------------------------------------------------
2220 110 : CALL dbcsr_create(matrix_W_aux, "W_aux", dist_aux_aux, dbcsr_type_no_symmetry, blk_aux, blk_aux)
2221 110 : CALL dbcsr_create(matrix_W_grid, "W_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
2222 :
2223 110 : CALL copy_fm_to_dbcsr(fm_W_time(i_t), matrix_W_aux, keep_sparsity=.FALSE.)
2224 :
2225 : ! W^MIC_ll'(iτ,k=0) = sum_PQ Z_lP W^MIC_PQ(iτ) Z_l'Q
2226 110 : CALL contract_A_B_A("N", "T", mat_Z_lP, matrix_W_aux, matrix_W_grid, bs_env%eps_filter)
2227 :
2228 110 : CALL dbcsr_release(matrix_W_aux) ! Clean up aux basis immediately
2229 :
2230 240 : DO ispin = 1, bs_env%n_spin
2231 130 : t1 = m_walltime()
2232 :
2233 : ! -------------------------------------------------------------------
2234 : ! A. Transform Green's Functions to the Grid
2235 : ! -------------------------------------------------------------------
2236 130 : CALL dbcsr_create(matrix_G_occ_grid, "G_occ_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
2237 130 : CALL dbcsr_create(matrix_G_vir_grid, "G_vir_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
2238 :
2239 : ! G^occ_µλ(i|τ|) = sum_G^occ_µλn^occ C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
2240 : ! G^occ_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^occ_µν Φ_ν(r_l')
2241 130 : CALL build_G_grid(bs_env, tau, ispin, .TRUE., .FALSE., mat_phi_mu_l, matrix_G_occ_grid, bs_env%eps_filter)
2242 :
2243 : ! G^vir_µλ(i|τ|) = sum_n^vir C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
2244 : ! G^vir_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^vir_µν Φ_ν(r_l')
2245 130 : CALL build_G_grid(bs_env, tau, ispin, .FALSE., .TRUE., mat_phi_mu_l, matrix_G_vir_grid, bs_env%eps_filter)
2246 :
2247 : ! -------------------------------------------------------------------
2248 : ! B. Element-wise Hadamard Products for Sigma_c on Grid
2249 : ! Σ_neg_grid = G_occ_grid ◦ W_grid
2250 : ! Σ_pos_grid = G_vir_grid ◦ W_grid
2251 : ! -------------------------------------------------------------------
2252 130 : CALL dbcsr_create(matrix_Sigma_neg_grid, template=matrix_W_grid)
2253 130 : CALL dbcsr_create(matrix_Sigma_pos_grid, template=matrix_W_grid)
2254 :
2255 : ! Σ^c_ll'(iτ) = -G^occ_ll'(i|τ|) * W^MIC_ll'(iτ), for τ < 0
2256 130 : CALL hadamard_product(matrix_G_occ_grid, matrix_W_grid, matrix_Sigma_neg_grid, 1.0_dp)
2257 :
2258 : ! Σ^c_ll'(iτ) = G^vir_ll'(i|τ|) * W^MIC_ll'(iτ), for τ > 0
2259 130 : CALL hadamard_product(matrix_G_vir_grid, matrix_W_grid, matrix_Sigma_pos_grid, 1.0_dp)
2260 :
2261 : ! Instantly purge massive G_grid arrays to save memory
2262 130 : CALL dbcsr_release(matrix_G_occ_grid)
2263 130 : CALL dbcsr_release(matrix_G_vir_grid)
2264 :
2265 : ! -------------------------------------------------------------------
2266 : ! C. Transform Sigma back to AO Basis
2267 : ! Σ_AO = phi^T * Σ_grid * phi
2268 : ! -------------------------------------------------------------------
2269 :
2270 : ! Σ^c_λσ(iτ) = sum_ll' Φ_λ(r_l) Σ^c_ll'(iτ) Φ_σ(r_l'), for τ < 0
2271 : CALL contract_A_B_A("T", "N", mat_phi_mu_l, matrix_Sigma_neg_grid, &
2272 130 : mat_Sigma_neg_tau(i_t, ispin)%matrix, bs_env%eps_filter)
2273 130 : CALL dbcsr_scale(mat_Sigma_neg_tau(i_t, ispin)%matrix, -1.0_dp)
2274 :
2275 : ! Σ^c_λσ(iτ) = sum_ll' Φ_λ(r_l) Σ^c_ll'(iτ) Φ_σ(r_l'), for τ > 0
2276 : CALL contract_A_B_A("T", "N", mat_phi_mu_l, matrix_Sigma_pos_grid, &
2277 130 : mat_Sigma_pos_tau(i_t, ispin)%matrix, bs_env%eps_filter)
2278 :
2279 : ! Purge Grid Sigma arrays
2280 130 : CALL dbcsr_release(matrix_Sigma_neg_grid)
2281 130 : CALL dbcsr_release(matrix_Sigma_pos_grid)
2282 :
2283 240 : IF (bs_env%unit_nr > 0) THEN
2284 : WRITE (bs_env%unit_nr, '(T2,A,I10,A,I3,A,F7.1,A)') &
2285 65 : 'Computed Σ^c(iτ) for time point ', i_t, ' /', bs_env%num_time_freq_points, &
2286 130 : ', Execution time', m_walltime() - t1, ' s'
2287 : END IF
2288 :
2289 : END DO ! ispin
2290 :
2291 : ! Release the W_grid for this time point
2292 120 : CALL dbcsr_release(matrix_W_grid)
2293 :
2294 : END DO ! i_t
2295 :
2296 10 : IF (bs_env%unit_nr > 0) WRITE (bs_env%unit_nr, '(A)') ' '
2297 :
2298 : ! -------------------------------------------------------------------------
2299 : ! 3. FINALIZE AND CLEANUP
2300 : ! -------------------------------------------------------------------------
2301 : CALL fill_fm_Sigma_c_Gamma_time(fm_Sigma_c_Gamma_time, bs_env, &
2302 10 : mat_Sigma_pos_tau, mat_Sigma_neg_tau)
2303 :
2304 10 : CALL cp_fm_release(fm_W_time)
2305 :
2306 10 : CALL dbcsr_deallocate_matrix_set(mat_Sigma_neg_tau)
2307 10 : CALL dbcsr_deallocate_matrix_set(mat_Sigma_pos_tau)
2308 :
2309 10 : CALL release_dbcsr_topology_and_matrices(dist=dist_grid_grid, mapped_dist=dist_col_grid)
2310 10 : CALL release_dbcsr_topology_and_matrices(dist=dist_aux_aux, mapped_dist=dist_row_aux)
2311 :
2312 10 : CALL delete_unnecessary_files(bs_env)
2313 10 : CALL timestop(handle)
2314 :
2315 10 : END SUBROUTINE compute_Sigma_c
2316 :
2317 : ! **************************************************************************************************
2318 : !> \brief DBCSR Topology Generation
2319 : !> \param matrix_template ...
2320 : !> \param dim_type ...
2321 : !> \param square_dist ...
2322 : !> \param blk_sizes ...
2323 : !> \param mapped_dist ...
2324 : ! **************************************************************************************************
2325 :
2326 572 : SUBROUTINE setup_square_topology(matrix_template, dim_type, square_dist, blk_sizes, mapped_dist)
2327 :
2328 : TYPE(dbcsr_type), INTENT(IN) :: matrix_template
2329 : CHARACTER(LEN=*), INTENT(IN) :: dim_type
2330 : TYPE(dbcsr_distribution_type), INTENT(OUT) :: square_dist
2331 : INTEGER, DIMENSION(:), INTENT(OUT), POINTER :: blk_sizes, mapped_dist
2332 :
2333 : INTEGER :: i, np
2334 572 : INTEGER, DIMENSION(:), POINTER :: col_blk, col_dist, row_blk, row_dist
2335 : TYPE(dbcsr_distribution_type) :: dist_template
2336 :
2337 : CALL dbcsr_get_info(matrix_template, distribution=dist_template, &
2338 572 : row_blk_size=row_blk, col_blk_size=col_blk)
2339 572 : CALL dbcsr_distribution_get(dist_template, row_dist=row_dist, col_dist=col_dist)
2340 :
2341 572 : IF (TRIM(dim_type) == 'ROW') THEN
2342 : ! Creates ROW x ROW (e.g., Grid x Grid from mat_phi_mu_l)
2343 20 : blk_sizes => row_blk
2344 76 : np = MAXVAL(col_dist) + 1 ! npcol
2345 60 : ALLOCATE (mapped_dist(SIZE(blk_sizes)))
2346 408 : DO i = 1, SIZE(blk_sizes)
2347 408 : mapped_dist(i) = MOD(i - 1, np)
2348 : END DO
2349 : CALL dbcsr_distribution_new(square_dist, template=dist_template, &
2350 20 : row_dist=row_dist, col_dist=mapped_dist)
2351 :
2352 552 : ELSE IF (TRIM(dim_type) == 'COL') THEN
2353 : ! Creates COL x COL (e.g., Aux x Aux from mat_Z_lP)
2354 552 : blk_sizes => col_blk
2355 11376 : np = MAXVAL(row_dist) + 1 ! nprow
2356 1656 : ALLOCATE (mapped_dist(SIZE(blk_sizes)))
2357 2040 : DO i = 1, SIZE(blk_sizes)
2358 2040 : mapped_dist(i) = MOD(i - 1, np)
2359 : END DO
2360 : CALL dbcsr_distribution_new(square_dist, template=dist_template, &
2361 552 : row_dist=mapped_dist, col_dist=col_dist)
2362 : END IF
2363 :
2364 572 : END SUBROUTINE setup_square_topology
2365 :
2366 : ! **************************************************************************************************
2367 : !> \brief DBCSR matrices deallocation
2368 : !> \param dist ...
2369 : !> \param mapped_dist ...
2370 : !> \param m1 ...
2371 : !> \param m2 ...
2372 : !> \param m3 ...
2373 : !> \param m4 ...
2374 : ! **************************************************************************************************
2375 :
2376 572 : SUBROUTINE release_dbcsr_topology_and_matrices(dist, mapped_dist, m1, m2, m3, m4)
2377 :
2378 : TYPE(dbcsr_distribution_type), INTENT(INOUT), &
2379 : OPTIONAL :: dist
2380 : INTEGER, DIMENSION(:), INTENT(INOUT), OPTIONAL, &
2381 : POINTER :: mapped_dist
2382 : TYPE(dbcsr_type), INTENT(INOUT), OPTIONAL :: m1, m2, m3, m4
2383 :
2384 572 : IF (PRESENT(dist)) CALL dbcsr_distribution_release(dist)
2385 572 : IF (PRESENT(mapped_dist)) THEN
2386 572 : IF (ASSOCIATED(mapped_dist)) THEN
2387 572 : DEALLOCATE (mapped_dist)
2388 : NULLIFY (mapped_dist)
2389 : END IF
2390 : END IF
2391 572 : IF (PRESENT(m1)) CALL dbcsr_release(m1)
2392 572 : IF (PRESENT(m2)) CALL dbcsr_release(m2)
2393 572 : IF (PRESENT(m3)) CALL dbcsr_release(m3)
2394 572 : IF (PRESENT(m4)) CALL dbcsr_release(m4)
2395 :
2396 572 : END SUBROUTINE release_dbcsr_topology_and_matrices
2397 :
2398 : END MODULE gw_non_periodic_ri_rs
|