LCOV - code coverage report
Current view: top level - src - gw_non_periodic_ri_rs.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:c24029e) Lines: 89.2 % 714 637
Test Date: 2026-07-04 06:36:57 Functions: 95.7 % 23 22

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2026 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief GW using RI-RS Approximation for molecules
      10              : !> \par History
      11              : !>      04.2026 created [Ritaj Tyagi]
      12              : ! **************************************************************************************************
      13              : MODULE gw_non_periodic_ri_rs
      14              :    USE atomic_kind_types,               ONLY: atomic_kind_type,&
      15              :                                               get_atomic_kind_set
      16              :    USE basis_set_types,                 ONLY: gto_basis_set_type
      17              :    USE cell_types,                      ONLY: cell_type,&
      18              :                                               pbc
      19              :    USE constants_operator,              ONLY: operator_coulomb
      20              :    USE cp_blacs_env,                    ONLY: cp_blacs_env_create,&
      21              :                                               cp_blacs_env_release,&
      22              :                                               cp_blacs_env_type
      23              :    USE cp_dbcsr_api,                    ONLY: &
      24              :         dbcsr_add, dbcsr_binary_read, dbcsr_binary_write, dbcsr_copy, dbcsr_create, &
      25              :         dbcsr_deallocate_matrix, dbcsr_distribution_get, dbcsr_distribution_new, &
      26              :         dbcsr_distribution_release, dbcsr_distribution_type, dbcsr_finalize, dbcsr_get_block_p, &
      27              :         dbcsr_get_info, dbcsr_iterator_blocks_left, dbcsr_iterator_next_block, &
      28              :         dbcsr_iterator_start, dbcsr_iterator_stop, dbcsr_iterator_type, dbcsr_multiply, &
      29              :         dbcsr_p_type, dbcsr_put_block, dbcsr_release, dbcsr_scale, dbcsr_set, dbcsr_type, &
      30              :         dbcsr_type_no_symmetry
      31              :    USE cp_dbcsr_contrib,                ONLY: dbcsr_reserve_all_blocks
      32              :    USE cp_dbcsr_operations,             ONLY: copy_dbcsr_to_fm,&
      33              :                                               copy_fm_to_dbcsr,&
      34              :                                               dbcsr_deallocate_matrix_set,&
      35              :                                               max_elements_per_block
      36              :    USE cp_files,                        ONLY: close_file,&
      37              :                                               open_file
      38              :    USE cp_fm_basic_linalg,              ONLY: cp_fm_scale_and_add,&
      39              :                                               cp_fm_uplo_to_full
      40              :    USE cp_fm_cholesky,                  ONLY: cp_fm_cholesky_decompose,&
      41              :                                               cp_fm_cholesky_invert,&
      42              :                                               cp_fm_cholesky_solve
      43              :    USE cp_fm_diag,                      ONLY: cp_fm_power
      44              :    USE cp_fm_struct,                    ONLY: cp_fm_struct_create,&
      45              :                                               cp_fm_struct_release,&
      46              :                                               cp_fm_struct_type
      47              :    USE cp_fm_types,                     ONLY: cp_fm_create,&
      48              :                                               cp_fm_get_info,&
      49              :                                               cp_fm_get_submatrix,&
      50              :                                               cp_fm_release,&
      51              :                                               cp_fm_set_all,&
      52              :                                               cp_fm_set_submatrix,&
      53              :                                               cp_fm_to_fm,&
      54              :                                               cp_fm_type
      55              :    USE cp_log_handling,                 ONLY: cp_get_default_logger,&
      56              :                                               cp_logger_type
      57              :    USE cp_output_handling,              ONLY: cp_p_file,&
      58              :                                               cp_print_key_should_output
      59              :    USE gw_integrals,                    ONLY: build_3c_integral_block_ctx,&
      60              :                                               gw_3c_ctx_create,&
      61              :                                               gw_3c_ctx_release,&
      62              :                                               gw_3c_ctx_type,&
      63              :                                               gw_3c_ws_create,&
      64              :                                               gw_3c_ws_release,&
      65              :                                               gw_3c_ws_type
      66              :    USE gw_large_cell_gamma,             ONLY: &
      67              :         Fourier_transform_w_to_t, G_occ_vir, compute_QP_energies, compute_fm_chi_Gamma_freq, &
      68              :         create_fm_W_MIC_time, delete_unnecessary_files, fill_fm_Sigma_c_Gamma_time, fm_write, &
      69              :         multiply_fm_W_MIC_time_with_Minv_Gamma
      70              :    USE gw_utils,                        ONLY: de_init_bs_env
      71              :    USE input_constants,                 ONLY: rtp_method_bse
      72              :    USE input_section_types,             ONLY: section_vals_type
      73              :    USE kinds,                           ONLY: default_path_length,&
      74              :                                               default_string_length,&
      75              :                                               dp
      76              :    USE kpoint_coulomb_2c,               ONLY: build_2c_coulomb_matrix_kp
      77              :    USE machine,                         ONLY: m_walltime
      78              :    USE message_passing,                 ONLY: mp_para_env_type
      79              :    USE mp2_ri_2c,                       ONLY: RI_2c_integral_mat
      80              :    USE orbital_pointers,                ONLY: indco,&
      81              :                                               ncoset
      82              :    USE parallel_gemm_api,               ONLY: parallel_gemm
      83              :    USE particle_types,                  ONLY: particle_type
      84              :    USE post_scf_bandstructure_types,    ONLY: post_scf_bandstructure_type
      85              :    USE qs_environment_types,            ONLY: get_qs_env,&
      86              :                                               qs_environment_type
      87              :    USE qs_kind_types,                   ONLY: get_qs_kind,&
      88              :                                               qs_kind_type
      89              : #include "./base/base_uses.f90"
      90              : 
      91              :    IMPLICIT NONE
      92              : 
      93              :    PRIVATE
      94              : 
      95              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'gw_non_periodic_ri_rs'
      96              : 
      97              :    PUBLIC :: gw_calc_non_periodic_ri_rs, ri_rs_grid_assembler, &
      98              :              get_basis_offsets, precompute_ri_rs_radii, solve_D_lp_distributed
      99              : 
     100              : CONTAINS
     101              : 
     102              : ! **************************************************************************************************
     103              : !> \brief GW calculation using RI-RS formalism for molecules
     104              : !> \param qs_env ...
     105              : !> \param bs_env ...
     106              : ! **************************************************************************************************
     107              : 
     108           10 :    SUBROUTINE gw_calc_non_periodic_ri_rs(qs_env, bs_env)
     109              : 
     110              :       TYPE(qs_environment_type), POINTER                 :: qs_env
     111              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
     112              : 
     113              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'gw_calc_non_periodic_ri_rs'
     114              : 
     115              :       INTEGER                                            :: handle
     116           10 :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:)        :: fm_Sigma_x_Gamma, fm_W_time
     117           10 :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :, :)  :: fm_Sigma_c_Gamma_time
     118              : 
     119           10 :       CALL timeset(routineN, handle)
     120              : 
     121              :        !!========================================================================
     122              :        !! 0. Precompute AO and RI Radii
     123              :        !!    Determine per-atom cutoff radii from the most diffuse Gaussian
     124              :        !!    primitives in the AO and RI auxiliary basis sets.
     125              :        !!    Equations:
     126              :        !!    a. α_min,ao = min_{i,j} { ζ_ao(i,j) | ζ_ao(i,j) > 10⁻³ }
     127              :        !!    b. α_min,ri = min_{i,j} { ζ_ri(i,j) | ζ_ri(i,j) > 10⁻³ }
     128              :        !!    c. r_ao     = sqrt( -ln(ε) / α_min,ao )  (radius_ao_per_atom)
     129              :        !!    d. r_ri     = sqrt( -ln(ε) / α_min,ri )  (radius_ri_per_atom)
     130              :        !!========================================================================
     131           10 :       CALL precompute_ri_rs_radii(qs_env, bs_env)
     132              : 
     133              :        !!========================================================================
     134              :        !! 1. Grid Generation for RI-RS
     135              :        !!    (Modified Lebedev grids from Ivan Duchemin and Xavier Blase)
     136              :        !!    Generate flattened 1D array of grid points for RI-RS.
     137              :        !!    Equation: r_g(k) = R_A + r_g(A)
     138              :        !!========================================================================
     139           10 :       CALL ri_rs_grid_assembler(qs_env, bs_env, bs_env%ri_rs%grid_points)
     140              : 
     141              :        !!========================================================================
     142              :        !! 2. Atomic Basis Evaluation
     143              :        !!    Compute values of spherical atomic basis functions at grid points.
     144              :        !!    Expression: Φ_μl = Φ_μ(r_l) (mat_phi_mu_l)
     145              :        !!========================================================================
     146              :       CALL atomic_basis_at_grid_point(qs_env, bs_env, bs_env%ri_rs%grid_points, &
     147           10 :                                       bs_env%ri_rs%mat_phi_mu_l)
     148              : 
     149              :        !!========================================================================
     150              :        !! 3. Compute RI-RS Coefficients (Z_lp)
     151              :        !!    Solve the regularized system for each atom P, where the grid domain
     152              :        !!    is restricted to r_l within a cutoff distance of atom P:
     153              :        !!    a. D_ll' = [ Σ_μ Φ_μ(r_l) Φ_μ(r_l') ]^2 (Equation 13)
     154              :        !!    b. D_lP  = Σ_{μν} Φ_μ(r_l) Φ_ν(r_l) (μν|P) (Equation 15)
     155              :        !!    c. Conditioning:
     156              :        !!       Dvec_l   = 1 / sqrt(D_ll)  (Diagonal scaling vector)
     157              :        !!       D'_ll' = Dvec_l * D_ll' * Dvec_l' + λδ_ll'
     158              :        !!       D'_lP  = Dvec_l * D_lP
     159              :        !!    d. Solve: Σ_l' D'_ll' * Z'_l'P = D'_lP (Equation 14)
     160              :        !!    e. Rescale: Z_lP = Z'_lP * Dvec_l   (Z_lP stored in mat_Z_lP)
     161              :        !!========================================================================
     162              :       CALL compute_coeff_Z_lP(qs_env, bs_env, bs_env%ri_rs%grid_points, &
     163           10 :                               bs_env%ri_rs%mat_phi_mu_l, bs_env%ri_rs%mat_Z_lP)
     164              : 
     165              :        !!========================================================================
     166              :        !! 4. Compute Independent-Particle Polarizability (χ)
     167              :        !!    G^occ_µλ(i|τ|)  = sum_n^occ C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
     168              :        !!    G^vir_µλ(i|τ|)  = sum_n^vir C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
     169              :        !!    G^occ_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^occ_µν Φ_ν(r_l')
     170              :        !!    G^vir_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^vir_µν Φ_ν(r_l')
     171              :        !!    χ_ll'(iτ)       = G^occ_ll'(i|τ|) * G^vir_ll'(i|τ|)
     172              :        !!    χ_PQ(iτ)        = sum_ll' Z_lP χ_ll'(iτ) Z_l'Q
     173              :        !!========================================================================
     174              :       CALL get_mat_chi_Gamma_tau(bs_env, bs_env%mat_chi_Gamma_tau, &
     175           10 :                                  bs_env%ri_rs%mat_phi_mu_l, bs_env%ri_rs%mat_Z_lP)
     176              : 
     177              :        !!========================================================================
     178              :        !! 5. Compute Screened Interaction (W)
     179              :        !!    χ_PQ(iτ) -> χ_PQ(iω) -> ε_PQ(iω) -> W_PQ(iω) -> W_PQ(iτ)
     180              :        !!========================================================================
     181           10 :       CALL compute_W(bs_env, qs_env, bs_env%mat_chi_Gamma_tau, fm_W_time)
     182              : 
     183              :        !!========================================================================
     184              :        !! 6. Compute Exact Exchange Self-Energy (Σ^x)
     185              :        !!    D_µν          = sum_n^occ C_µn C_νn
     186              :        !!    D_ll'         = sum_µν Φ_µ(r_l) D_µν Φ_ν(r_l')
     187              :        !!    V^trunc_ll'   = sum_PQ Z_lP V^trunc_PQ Z_l'Q
     188              :        !!    Σ^x_ll'       = D_ll' * V^trunc_ll'
     189              :        !!    Σ^x_λσ(k=0)   = -sum_ll' Φ_λ(r_l) Σ^x_ll' Φ_σ(r_l')
     190              :        !!========================================================================
     191              :       CALL compute_Sigma_x(bs_env, qs_env, bs_env%ri_rs%mat_phi_mu_l, &
     192           10 :                            bs_env%ri_rs%mat_Z_lP, fm_Sigma_x_Gamma)
     193              : 
     194              :        !!========================================================================
     195              :        !! 7. Compute Correlation Self-Energy (Σ^c)
     196              :        !!    W_ll'(iτ)     =  sum_PQ Z_lP W^MIC_PQ(iτ) Z_l'Q
     197              :        !!    Σ^c_ll'(iτ)   = -G^occ_ll'(i|τ|) * W_ll'(iτ), for τ < 0
     198              :        !!    Σ^c_ll'(iτ)   =  G^vir_ll'(i|τ|) * W_ll'(iτ), for τ > 0
     199              :        !!    Σ^c_λσ(iτ)    =  sum_ll' Φ_λ(r_l) Σ^c_ll'(iτ) Φ_σ(r_l')
     200              :        !!========================================================================
     201              :       CALL compute_Sigma_c(bs_env, fm_W_time, bs_env%ri_rs%mat_phi_mu_l, &
     202           10 :                            bs_env%ri_rs%mat_Z_lP, fm_Sigma_c_Gamma_time)
     203              : 
     204              :        !!========================================================================
     205              :        !! 8. Compute Quasiparticle Energies
     206              :        !!    Σ^c_λσ(iτ) -> Σ^c_nn(ϵ)
     207              :        !!    ϵ_nk^GW = ϵ_nk^DFT + Σ^c_nn(ϵ) + Σ^x_nn - v^xc_nn
     208              :        !!========================================================================
     209           10 :       CALL compute_QP_energies(bs_env, qs_env, fm_Sigma_x_Gamma, fm_Sigma_c_Gamma_time)
     210              : 
     211           10 :       CALL de_init_bs_env(bs_env)
     212              : 
     213           10 :       CALL timestop(handle)
     214              : 
     215           10 :    END SUBROUTINE gw_calc_non_periodic_ri_rs
     216              : 
     217              : ! **************************************************************************************************
     218              : !> \brief Compute per-atom AO and RI basis radii from the most diffuse Gaussian
     219              : !>        primitive in the AO ("ORB") and RI auxiliary ("RI_AUX") basis sets.
     220              : !>        Stores results in bs_env%ri_rs%radius_ao_per_atom(:) and
     221              : !>        bs_env%ri_rs%radius_ri_per_atom(:) and prints a per-atom table.
     222              : !>        Radius:  r_kind = sqrt(-log(eps) / alpha_min_kind)
     223              : !>        with eps = eps_filter.
     224              : !> \param qs_env ...
     225              : !> \param bs_env ...
     226              : ! **************************************************************************************************
     227              : 
     228           10 :    SUBROUTINE precompute_ri_rs_radii(qs_env, bs_env)
     229              : 
     230              :       TYPE(qs_environment_type), POINTER                 :: qs_env
     231              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
     232              : 
     233              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'precompute_ri_rs_radii'
     234              : 
     235              :       INTEGER                                            :: handle, i, iatom, ikind, j, natom, nkind
     236           10 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: kind_of
     237              :       REAL(KIND=dp)                                      :: eps
     238           10 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: alpha_min_ao_kind, alpha_min_ri_kind
     239           10 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: zet_ao, zet_ri
     240           10 :       TYPE(atomic_kind_type), DIMENSION(:), POINTER      :: atomic_kind_set
     241           10 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     242              : 
     243           10 :       CALL timeset(routineN, handle)
     244              : 
     245              :       CALL get_qs_env(qs_env, nkind=nkind, atomic_kind_set=atomic_kind_set, &
     246           10 :                       particle_set=particle_set)
     247           10 :       natom = SIZE(particle_set)
     248              : 
     249           10 :       eps = bs_env%eps_filter
     250              : 
     251           40 :       ALLOCATE (alpha_min_ao_kind(nkind), alpha_min_ri_kind(nkind))
     252           28 :       alpha_min_ao_kind = HUGE(1.0_dp)
     253           28 :       alpha_min_ri_kind = HUGE(1.0_dp)
     254              : 
     255           28 :       DO ikind = 1, nkind
     256           18 :          zet_ao => bs_env%basis_set_AO(ikind)%gto_basis_set%zet
     257           18 :          zet_ri => bs_env%basis_set_RI(ikind)%gto_basis_set%zet
     258              : 
     259           72 :          DO i = 1, SIZE(zet_ao, 1)
     260          156 :             DO j = 1, SIZE(zet_ao, 2)
     261           84 :                IF (zet_ao(i, j) > 1.0E-3_dp) &
     262          138 :                   alpha_min_ao_kind(ikind) = MIN(alpha_min_ao_kind(ikind), zet_ao(i, j))
     263              :             END DO
     264              :          END DO
     265           46 :          DO i = 1, SIZE(zet_ri, 1)
     266          110 :             DO j = 1, SIZE(zet_ri, 2)
     267           74 :                IF (zet_ri(i, j) > 1.0E-3_dp) &
     268           92 :                   alpha_min_ri_kind(ikind) = MIN(alpha_min_ri_kind(ikind), zet_ri(i, j))
     269              :             END DO
     270              :          END DO
     271              :       END DO
     272              : 
     273           10 :       CALL get_atomic_kind_set(atomic_kind_set=atomic_kind_set, kind_of=kind_of)
     274              : 
     275           30 :       ALLOCATE (bs_env%ri_rs%radius_ao_per_atom(natom))
     276           20 :       ALLOCATE (bs_env%ri_rs%radius_ri_per_atom(natom))
     277           38 :       DO iatom = 1, natom
     278           28 :          ikind = kind_of(iatom)
     279           28 :          bs_env%ri_rs%radius_ao_per_atom(iatom) = SQRT(-LOG(eps)/alpha_min_ao_kind(ikind))
     280           38 :          bs_env%ri_rs%radius_ri_per_atom(iatom) = SQRT(-LOG(eps)/alpha_min_ri_kind(ikind))
     281              :       END DO
     282              : 
     283           10 :       IF (bs_env%unit_nr > 0) THEN
     284            5 :          WRITE (bs_env%unit_nr, '(T2,A)') 'Per-kind RI-RS basis radii (Bohr):'
     285            5 :          WRITE (bs_env%unit_nr, '(T4,A6,2X,A4,2A14)') 'Kind', 'Elem', 'r_AO', 'r_RI'
     286           14 :          DO ikind = 1, nkind
     287              :             WRITE (bs_env%unit_nr, '(T4,I6,2X,A4,2F14.4)') &
     288            9 :                ikind, &
     289            9 :                atomic_kind_set(ikind)%element_symbol, &
     290            9 :                SQRT(-LOG(eps)/alpha_min_ao_kind(ikind)), &
     291           23 :                SQRT(-LOG(eps)/alpha_min_ri_kind(ikind))
     292              :          END DO
     293            5 :          WRITE (bs_env%unit_nr, '(A)') ' '
     294              :       END IF
     295              : 
     296           10 :       DEALLOCATE (alpha_min_ao_kind, alpha_min_ri_kind, kind_of)
     297              : 
     298           10 :       CALL timestop(handle)
     299              : 
     300           10 :    END SUBROUTINE precompute_ri_rs_radii
     301              : 
     302              : ! **************************************************************************************************
     303              : !> \brief Compute grid points for RI-RS
     304              : !>        Right now based on Ivan and Xavier implementation
     305              : !>        JCP 150, 174120 (2019), JCTC 17, 2383 (2021)
     306              : !> \param qs_env ...
     307              : !> \param bs_env ...
     308              : !> \param ri_rs_grid_points ...
     309              : ! **************************************************************************************************
     310              : 
     311           10 :    SUBROUTINE ri_rs_grid_assembler(qs_env, bs_env, ri_rs_grid_points)
     312              : 
     313              :       TYPE(qs_environment_type), POINTER                 :: qs_env
     314              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
     315              :       REAL(KIND=dp), ALLOCATABLE, INTENT(OUT)            :: ri_rs_grid_points(:, :)
     316              : 
     317              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'ri_rs_grid_assembler'
     318              : 
     319              :       INTEGER                                            :: atom_idx, end_idx, handle, ikind, j, &
     320              :                                                             natom, nkind, start_idx, &
     321              :                                                             total_grid_npts
     322           10 :       INTEGER, ALLOCATABLE                               :: atom_to_kind(:), ri_rs_grid_offsets(:)
     323              :       REAL(KIND=dp)                                      :: atomic_center(3)
     324           10 :       TYPE(atomic_kind_type), DIMENSION(:), POINTER      :: atomic_kind_set
     325           10 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     326              : 
     327           10 :       CALL timeset(routineN, handle)
     328              : 
     329              :        !! Get the information about the atoms in the system
     330           10 :       CALL get_qs_env(qs_env, atomic_kind_set=atomic_kind_set, particle_set=particle_set)
     331              : 
     332           10 :       nkind = SIZE(atomic_kind_set)
     333           10 :       natom = SIZE(particle_set)
     334              : 
     335              :        !! 1. Build the grid cache
     336           10 :       CALL build_grid_cache(bs_env, atomic_kind_set)
     337              : 
     338              :        !! 2. Calculate grid counts and offsets
     339           30 :       ALLOCATE (ri_rs_grid_offsets(natom + 1))
     340           30 :       ALLOCATE (atom_to_kind(natom))
     341              : 
     342           10 :       total_grid_npts = 0
     343           28 :       DO ikind = 1, nkind
     344           56 :          DO j = 1, SIZE(atomic_kind_set(ikind)%atom_list)
     345           28 :             atom_idx = atomic_kind_set(ikind)%atom_list(j)
     346           28 :             atom_to_kind(atom_idx) = ikind
     347           28 :             ri_rs_grid_offsets(atom_idx) = total_grid_npts + 1
     348           46 :             total_grid_npts = total_grid_npts + bs_env%ri_rs%grid_cache(ikind)%npts
     349              :          END DO
     350              :       END DO
     351              : 
     352           10 :       ri_rs_grid_offsets(natom + 1) = total_grid_npts + 1
     353              : 
     354           10 :       IF (bs_env%unit_nr > 0) THEN
     355              :          WRITE (bs_env%unit_nr, FMT="(T2,A,T69,I12)") &
     356            5 :             'Total grid points used for RI-RS:', total_grid_npts
     357            5 :          WRITE (bs_env%unit_nr, "(A)") ' '
     358              :       END IF
     359              : 
     360              :        !! 3. Allocate the global ri_rs_grid arrays
     361           30 :       ALLOCATE (ri_rs_grid_points(3, total_grid_npts))
     362              : 
     363              :        !! 4. Parallelize the grid generation loop
     364              :       !$OMP PARALLEL DO DEFAULT(NONE) &
     365              :       !$OMP SHARED(ri_rs_grid_points, ri_rs_grid_offsets, atom_to_kind, &
     366              :       !$OMP        particle_set, bs_env, natom) &
     367              :       !$OMP PRIVATE(atom_idx, ikind, atomic_center, start_idx, end_idx) &
     368           10 :       !$OMP SCHEDULE(DYNAMIC, 1)
     369              :       DO atom_idx = 1, natom
     370              :          ikind = atom_to_kind(atom_idx)
     371              :          atomic_center(:) = particle_set(atom_idx)%r(:)
     372              : 
     373              :          start_idx = ri_rs_grid_offsets(atom_idx)
     374              :          end_idx = start_idx + bs_env%ri_rs%grid_cache(ikind)%npts - 1
     375              : 
     376              :           !! Shift the cached origin grid by the atom's center
     377              :          ri_rs_grid_points(1, start_idx:end_idx) = bs_env%ri_rs%grid_cache(ikind)%raw_points(1, :) + atomic_center(1)
     378              :          ri_rs_grid_points(2, start_idx:end_idx) = bs_env%ri_rs%grid_cache(ikind)%raw_points(2, :) + atomic_center(2)
     379              :          ri_rs_grid_points(3, start_idx:end_idx) = bs_env%ri_rs%grid_cache(ikind)%raw_points(3, :) + atomic_center(3)
     380              : 
     381              :       END DO
     382              :       !$OMP END PARALLEL DO
     383              : 
     384              :        !! 5. Cleanup memory
     385           10 :       IF (ALLOCATED(bs_env%ri_rs%grid_cache)) THEN
     386           28 :          DO ikind = 1, nkind
     387           28 :             IF (ALLOCATED(bs_env%ri_rs%grid_cache(ikind)%raw_points)) DEALLOCATE (bs_env%ri_rs%grid_cache(ikind)%raw_points)
     388              :          END DO
     389           28 :          DEALLOCATE (bs_env%ri_rs%grid_cache)
     390              :       END IF
     391              : 
     392           10 :       DEALLOCATE (atom_to_kind, ri_rs_grid_offsets)
     393           10 :       CALL timestop(handle)
     394              : 
     395           20 :    END SUBROUTINE ri_rs_grid_assembler
     396              : 
     397              : ! **************************************************************************************************
     398              : !> \brief Reads grids from .ion files and stores them in memory based on grid_select
     399              : !> \param bs_env ...
     400              : !> \param atomic_kind_set ...
     401              : ! **************************************************************************************************
     402              : 
     403           10 :    SUBROUTINE build_grid_cache(bs_env, atomic_kind_set)
     404              : 
     405              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
     406              :       TYPE(atomic_kind_type), DIMENSION(:), POINTER      :: atomic_kind_set
     407              : 
     408              :       CHARACTER(LEN=default_path_length)                 :: filename, full_path, line
     409              :       CHARACTER(LEN=default_string_length)               :: atom_sym, suffix
     410              :       INTEGER                                            :: colon_idx, i, ierr, ikind, iunit, nkind
     411              :       REAL(KIND=dp)                                      :: pt(3)
     412              : 
     413              :       ! Determine the file suffix based on the user's choice
     414           10 :       IF (bs_env%ri_rs%grid_select == 1) THEN
     415           10 :          suffix = "_def2-tzvp-rs.ion"
     416            0 :       ELSE IF (bs_env%ri_rs%grid_select == 2) THEN
     417            0 :          suffix = "_cc-pvtz-rs.ion"
     418              :       ELSE
     419            0 :          CPABORT("Unknown grid_select value. Valid options are 1 (def2-TZVPP) or 2 (cc-pVTZ).")
     420              :       END IF
     421              : 
     422           10 :       nkind = SIZE(atomic_kind_set)
     423           48 :       IF (.NOT. ALLOCATED(bs_env%ri_rs%grid_cache)) ALLOCATE (bs_env%ri_rs%grid_cache(nkind))
     424              : 
     425           28 :       DO ikind = 1, nkind
     426           18 :          atom_sym = TRIM(atomic_kind_set(ikind)%element_symbol)
     427           18 :          filename = TRIM(atom_sym)//TRIM(suffix)
     428              : 
     429           18 :          full_path = "ri_rs_grid/"//TRIM(filename)
     430              : 
     431              :          CALL open_file(file_name=TRIM(full_path), unit_number=iunit, &
     432           18 :                         file_action="READ", file_status="OLD")
     433              : 
     434              :          ! Parse the preamble for 'n points'
     435           18 :          bs_env%ri_rs%grid_cache(ikind)%npts = 0
     436              :          DO
     437          612 :             READ (iunit, '(A)', IOSTAT=ierr) line
     438          612 :             IF (ierr /= 0) EXIT
     439          612 :             IF (INDEX(line, 'n points') > 0) THEN
     440           18 :                colon_idx = INDEX(line, ':')
     441           18 :                READ (line(colon_idx + 1:), *) bs_env%ri_rs%grid_cache(ikind)%npts
     442           18 :                EXIT
     443              :             END IF
     444              :          END DO
     445              : 
     446              :          ! Allocate the cache array for this specific element
     447           54 :          ALLOCATE (bs_env%ri_rs%grid_cache(ikind)%raw_points(3, bs_env%ri_rs%grid_cache(ikind)%npts))
     448              : 
     449           18 :          REWIND (iunit)
     450              :          DO
     451          648 :             READ (iunit, '(A)', IOSTAT=ierr) line
     452          648 :             IF (ierr /= 0) EXIT
     453          648 :             IF (INDEX(line, '<grid_points>') > 0) EXIT
     454              :          END DO
     455              : 
     456              :          ! Read the raw grid coordinates
     457         4466 :          DO i = 1, bs_env%ri_rs%grid_cache(ikind)%npts
     458         4448 :             READ (iunit, *, IOSTAT=ierr) pt(1), pt(2), pt(3)
     459         4448 :             IF (ierr /= 0) THEN
     460            0 :                CPABORT("Unexpected EOF in grid file ")
     461              :             END IF
     462        17810 :             bs_env%ri_rs%grid_cache(ikind)%raw_points(:, i) = pt(:)
     463              :          END DO
     464              : 
     465           28 :          CALL close_file(unit_number=iunit)
     466              :       END DO
     467              : 
     468           10 :    END SUBROUTINE build_grid_cache
     469              : 
     470              : ! **************************************************************************************************
     471              : !> \brief Evaluates atomic basis functions on a real-space grid and builds a sparse DBCSR matrix.
     472              : !> \param qs_env ...
     473              : !> \param bs_env ...
     474              : !> \param ri_rs_grid_points ...
     475              : !> \param mat_phi_mu_l ...
     476              : ! **************************************************************************************************
     477              : 
     478           10 :    SUBROUTINE atomic_basis_at_grid_point(qs_env, bs_env, ri_rs_grid_points, mat_phi_mu_l)
     479              : 
     480              :       TYPE(qs_environment_type), POINTER                 :: qs_env
     481              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
     482              :       REAL(KIND=dp), ALLOCATABLE, INTENT(INOUT)          :: ri_rs_grid_points(:, :)
     483              :       TYPE(dbcsr_type), INTENT(OUT)                      :: mat_phi_mu_l
     484              : 
     485              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'atomic_basis_at_grid_point'
     486              : 
     487              :       INTEGER :: c_size, chunk_size, dimen_ORB, handle, i, i_blk, iatom, natom, npcol, nprow, &
     488              :          num_grid_chunks, r_end, r_start, total_grid_npts
     489              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: first_sgf
     490           10 :       INTEGER, DIMENSION(:), POINTER                     :: c_blk_sizes, col_dist, col_dist_ks, &
     491           10 :                                                             r_blk_sizes, row_dist, row_dist_ks
     492           10 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: atom_col_buffer
     493           10 :       TYPE(atomic_kind_type), DIMENSION(:), POINTER      :: atomic_kind_set
     494              :       TYPE(cell_type), POINTER                           :: cell
     495              :       TYPE(dbcsr_distribution_type)                      :: dist
     496              :       TYPE(dbcsr_distribution_type), POINTER             :: dbcsr_dist_ks
     497              :       TYPE(mp_para_env_type), POINTER                    :: para_env
     498           10 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     499           10 :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
     500              : 
     501           10 :       CALL timeset(routineN, handle)
     502              : 
     503           10 :       chunk_size = max_elements_per_block
     504              : 
     505              :       ! Extract environment variables
     506              :       CALL get_qs_env(qs_env, cell=cell, atomic_kind_set=atomic_kind_set, &
     507              :                       qs_kind_set=qs_kind_set, particle_set=particle_set, &
     508           10 :                       para_env=para_env)
     509              : 
     510           10 :       natom = SIZE(particle_set)
     511           10 :       total_grid_npts = SIZE(ri_rs_grid_points, 2)
     512              : 
     513              :       ! Map the starting indices of spherical gaussian functions (SGF) for each atom
     514           30 :       ALLOCATE (first_sgf(natom + 1))
     515           10 :       CALL get_basis_offsets(particle_set, qs_kind_set, first_sgf, dimen_ORB)
     516              : 
     517              :       ! =========================================================================
     518              :       ! 1. SETUP DBCSR MATRIX TOPOLOGY
     519              :       ! =========================================================================
     520              : 
     521              :       ! A. Define Column Block Sizes (1 Block = 1 Atom's full basis set)
     522           30 :       ALLOCATE (c_blk_sizes(natom))
     523           38 :       DO iatom = 1, natom
     524           38 :          c_blk_sizes(iatom) = first_sgf(iatom + 1) - first_sgf(iatom)
     525              :       END DO
     526              : 
     527              :       ! B. Define Row Block Sizes (Grid chunks of max size 256)
     528           10 :       num_grid_chunks = CEILING(REAL(total_grid_npts, KIND=dp)/REAL(chunk_size, KIND=dp))
     529           30 :       ALLOCATE (r_blk_sizes(num_grid_chunks))
     530          204 :       r_blk_sizes = chunk_size
     531           10 :       IF (MOD(total_grid_npts, chunk_size) /= 0) THEN
     532            0 :          r_blk_sizes(num_grid_chunks) = MOD(total_grid_npts, chunk_size)
     533              :       END IF
     534              : 
     535              :       ! C. Fetch CP2K's Default Process Grid Configuration
     536           10 :       CALL get_qs_env(qs_env, dbcsr_dist=dbcsr_dist_ks)
     537           10 :       CALL dbcsr_distribution_get(dbcsr_dist_ks, row_dist=row_dist_ks, col_dist=col_dist_ks)
     538              : 
     539              :       ! D. Build Custom Mappings using Round-Robin across the 2D process grid
     540              :       !    (MAXVAL + 1 accounts for 0-based process indexing in DBCSR)
     541           38 :       nprow = MAXVAL(row_dist_ks) + 1
     542           38 :       npcol = MAXVAL(col_dist_ks) + 1
     543              : 
     544           20 :       ALLOCATE (row_dist(num_grid_chunks))
     545          204 :       DO i = 1, num_grid_chunks
     546          204 :          row_dist(i) = MOD(i - 1, nprow)
     547              :       END DO
     548              : 
     549           20 :       ALLOCATE (col_dist(natom))
     550           38 :       DO i = 1, natom
     551           38 :          col_dist(i) = MOD(i - 1, npcol)
     552              :       END DO
     553              : 
     554              :       ! E. Create the DBCSR Distribution and Initialize the Matrix
     555              :       CALL dbcsr_distribution_new(dist, template=dbcsr_dist_ks, &
     556           10 :                                   row_dist=row_dist, col_dist=col_dist)
     557              : 
     558              :       CALL dbcsr_create(mat_phi_mu_l, name="phi_val_sparse", dist=dist, &
     559              :                         matrix_type=dbcsr_type_no_symmetry, &
     560           10 :                         row_blk_size=r_blk_sizes, col_blk_size=c_blk_sizes)
     561              : 
     562              :       ! =========================================================================
     563              :       ! 2. STREAM DATA DIRECTLY INTO SPARSE MATRIX
     564              :       ! =========================================================================
     565              :       ! Iterate over the atoms assigned to this specific MPI rank
     566           10 :       DO iatom = para_env%mepos + 1, natom, para_env%num_pe
     567              : 
     568           14 :          c_size = c_blk_sizes(iatom)
     569              : 
     570              :          ! Allocate a temporary dense buffer just for this specific atom
     571           56 :          ALLOCATE (atom_col_buffer(total_grid_npts, c_size))
     572           14 :          atom_col_buffer = 0.0_dp
     573              : 
     574              :          ! Evaluate the basis functions on the grid. Skip grid points outside
     575              :          ! the spatial extent of the most diffuse AO Gaussian on iatom; beyond
     576              :          ! that radius the contribution is guaranteed below eps_filter.
     577              :          CALL fill_phi_for_atom(atom_col_buffer, ri_rs_grid_points, total_grid_npts, &
     578              :                                 iatom, particle_set, qs_kind_set, cell, &
     579           14 :                                 bs_env%ri_rs%radius_ao_per_atom(iatom)**2)
     580              : 
     581              :          ! Slice the dense column into chunks and insert into DBCSR
     582          284 :          DO i_blk = 1, num_grid_chunks
     583          270 :             r_start = (i_blk - 1)*chunk_size + 1
     584          270 :             r_end = MIN(i_blk*chunk_size, total_grid_npts)
     585              : 
     586              :             ! Apply dynamic sparsity filtering: Only store blocks with physical significance
     587        24770 :             IF (MAXVAL(ABS(atom_col_buffer(r_start:r_end, 1:c_size))) > bs_env%eps_filter) THEN
     588              :                CALL dbcsr_put_block(mat_phi_mu_l, row=i_blk, col=iatom, &
     589          270 :                                     block=atom_col_buffer(r_start:r_end, 1:c_size))
     590              :             END IF
     591              :          END DO
     592              : 
     593           14 :          DEALLOCATE (atom_col_buffer)
     594              : 
     595              :       END DO
     596              : 
     597              :       ! Finalize triggers internal MPI communication to route blocks to their correct 2D process owners
     598           10 :       CALL dbcsr_finalize(mat_phi_mu_l)
     599              : 
     600              :       ! -------------------------------------------------------------------------
     601              :       ! CLEANUP
     602              :       ! -------------------------------------------------------------------------
     603           10 :       DEALLOCATE (first_sgf, r_blk_sizes, c_blk_sizes, row_dist, col_dist)
     604           10 :       CALL dbcsr_distribution_release(dist)
     605              : 
     606           10 :       CALL timestop(handle)
     607              : 
     608           40 :    END SUBROUTINE atomic_basis_at_grid_point
     609              : 
     610              : ! **************************************************************************************************
     611              : !> \brief Compute value of all basis function for single atom across all grid points
     612              : !> \param phi_val ...
     613              : !> \param ri_rs_grid ...
     614              : !> \param npts ...
     615              : !> \param iatom ...
     616              : !> \param particle_set ...
     617              : !> \param qs_kind_set ...
     618              : !> \param cell ...
     619              : !> \param r2_threshold ...
     620              : ! **************************************************************************************************
     621              : 
     622           45 :    SUBROUTINE fill_phi_for_atom(phi_val, ri_rs_grid, npts, iatom, &
     623              :                                 particle_set, qs_kind_set, cell, r2_threshold)
     624              : 
     625              :       REAL(KIND=dp), INTENT(INOUT)                       :: phi_val(:, :)
     626              :       INTEGER, INTENT(IN)                                :: npts
     627              :       REAL(KIND=dp), INTENT(IN)                          :: ri_rs_grid(3, npts)
     628              :       INTEGER, INTENT(IN)                                :: iatom
     629              :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     630              :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
     631              :       TYPE(cell_type), POINTER                           :: cell
     632              :       REAL(KIND=dp), INTENT(IN)                          :: r2_threshold
     633              : 
     634              :       INTEGER                                            :: first_sgf, i_pt, ico, iend_co, ikind, &
     635              :                                                             ipgf, iset, isgf, ishell, istart_co, &
     636              :                                                             l, last_sgf, lx, ly, lz, n_cart_total, &
     637              :                                                             row_idx
     638              :       REAL(KIND=dp)                                      :: alpha, dist_vec(3), exp_val, poly, r2, &
     639              :                                                             r_atom(3), weight
     640              :       TYPE(gto_basis_set_type), POINTER                  :: orb_basis_set
     641              : 
     642              :       ! Get Atom Info
     643           45 :       ikind = particle_set(iatom)%atomic_kind%kind_number
     644           45 :       CALL get_qs_kind(qs_kind_set(ikind), basis_set=orb_basis_set, basis_type="ORB")
     645           45 :       IF (.NOT. ASSOCIATED(orb_basis_set)) RETURN
     646              : 
     647          180 :       r_atom = particle_set(iatom)%r
     648              : 
     649              :       !$OMP PARALLEL DO DEFAULT(NONE) &
     650              :       !$OMP SHARED(phi_val, ri_rs_grid, npts, orb_basis_set, r_atom, cell, ncoset, indco, &
     651              :       !$OMP        r2_threshold) &
     652              :       !$OMP PRIVATE(i_pt, dist_vec, r2, iset, n_cart_total, ishell, l, istart_co, iend_co, &
     653              :       !$OMP         first_sgf, last_sgf, ipgf, alpha, exp_val, isgf, ico, row_idx, weight, &
     654              :       !$OMP         lx, ly, lz, poly) &
     655           45 :       !$OMP SCHEDULE(STATIC)
     656              :       DO i_pt = 1, npts
     657              :          dist_vec = pbc(ri_rs_grid(:, i_pt) - r_atom, cell)
     658              :          r2 = DOT_PRODUCT(dist_vec, dist_vec)
     659              :          IF (r2 > r2_threshold) CYCLE
     660              : 
     661              :          DO iset = 1, orb_basis_set%nset
     662              :             n_cart_total = ncoset(orb_basis_set%lmax(iset))
     663              : 
     664              :             DO ishell = 1, orb_basis_set%nshell(iset)
     665              :                l = orb_basis_set%l(ishell, iset)
     666              :                istart_co = ncoset(l - 1) + 1
     667              :                iend_co = ncoset(l)
     668              : 
     669              :                first_sgf = orb_basis_set%first_sgf(ishell, iset)
     670              :                last_sgf = orb_basis_set%last_sgf(ishell, iset)
     671              : 
     672              :                DO ipgf = 1, orb_basis_set%npgf(iset)
     673              :                   alpha = orb_basis_set%zet(ipgf, iset)
     674              :                   exp_val = EXP(-alpha*r2)
     675              : 
     676              :                   DO isgf = first_sgf, last_sgf
     677              :                      DO ico = istart_co, iend_co
     678              :                         row_idx = (ipgf - 1)*n_cart_total + ico
     679              :                         weight = orb_basis_set%sphi(row_idx, isgf)
     680              :                         lx = indco(1, ico)
     681              :                         ly = indco(2, ico)
     682              :                         lz = indco(3, ico)
     683              :                         poly = (dist_vec(1)**lx)*(dist_vec(2)**ly)*(dist_vec(3)**lz)
     684              : 
     685              :                         phi_val(i_pt, isgf) = phi_val(i_pt, isgf) + (weight*poly*exp_val)
     686              : 
     687              :                      END DO
     688              :                   END DO
     689              :                END DO
     690              :             END DO
     691              :          END DO
     692              :       END DO
     693              :       !$OMP END PARALLEL DO
     694              : 
     695              :    END SUBROUTINE fill_phi_for_atom
     696              : 
     697              : ! **************************************************************************************************
     698              : !> \brief Helper for OMP threads to fill phi_val column values
     699              : !> \param particle_set ...
     700              : !> \param qs_kind_set ...
     701              : !> \param first_sgf ...
     702              : !> \param total_sgf ...
     703              : ! **************************************************************************************************
     704              : 
     705           10 :    SUBROUTINE get_basis_offsets(particle_set, qs_kind_set, first_sgf, total_sgf)
     706              : 
     707              :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     708              :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
     709              :       INTEGER, INTENT(OUT)                               :: first_sgf(:), total_sgf
     710              : 
     711              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'get_basis_offsets'
     712              : 
     713              :       INTEGER                                            :: handle, iatom, ikind, nsgf
     714              : 
     715           10 :       CALL timeset(routineN, handle)
     716              : 
     717           10 :       total_sgf = 0
     718           38 :       DO iatom = 1, SIZE(particle_set)
     719           28 :          first_sgf(iatom) = total_sgf + 1
     720           28 :          ikind = particle_set(iatom)%atomic_kind%kind_number
     721           28 :          CALL get_qs_kind(qs_kind_set(ikind), nsgf=nsgf, basis_type="ORB")
     722           38 :          total_sgf = total_sgf + nsgf
     723              :       END DO
     724           10 :       first_sgf(SIZE(particle_set) + 1) = total_sgf + 1
     725              : 
     726           10 :       CALL timestop(handle)
     727              : 
     728           10 :    END SUBROUTINE get_basis_offsets
     729              : 
     730              : ! **************************************************************************************************
     731              : !> \brief Compute RI-RS Coefficients (Z_lP)
     732              : !> \param qs_env ...
     733              : !> \param bs_env ...
     734              : !> \param ri_rs_grid_points ...
     735              : !> \param mat_phi_mu_l ...
     736              : !> \param mat_Z_lP ...
     737              : ! **************************************************************************************************
     738              : 
     739           20 :    SUBROUTINE compute_coeff_Z_lP(qs_env, bs_env, ri_rs_grid_points, mat_phi_mu_l, mat_Z_lP)
     740              : 
     741              :       ! Arguments
     742              :       TYPE(qs_environment_type), POINTER                 :: qs_env
     743              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
     744              :       REAL(KIND=dp), ALLOCATABLE, INTENT(INOUT)          :: ri_rs_grid_points(:, :)
     745              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: mat_phi_mu_l
     746              :       TYPE(dbcsr_type), INTENT(OUT)                      :: mat_Z_lP
     747              : 
     748              :       CHARACTER(LEN=*), PARAMETER :: key = 'PROPERTIES%BANDSTRUCTURE%GW%PRINT%RESTART', &
     749              :          routineN = 'compute_coeff_Z_lP'
     750              : 
     751              :       INTEGER :: atom_j_mepos, atom_j_stride, atom_P, atom_P_start, atom_P_stride, col_end, &
     752              :          col_start, current_chunk_size, g, group_handle, handle, i, i_blk, ikind, info, j, j_ri, &
     753              :          l, loc_idx, loc_ptr, max_ao_size, max_loc_ri, my_group, n_ao_total, n_grid_total, &
     754              :          n_groups, n_loc_ri, n_local_grid, n_procs_per_atom, natom, nkind, num_grid_chunks, &
     755              :          P_loop_atom, r_end, r_start, source_atom
     756           10 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: local_grid_idx, row_offset
     757           20 :       INTEGER, DIMENSION(:), POINTER                     :: col_dist_phi, col_dist_ri, r_blk_sizes, &
     758           10 :                                                             ri_blk_sizes, row_dist_grid
     759              :       REAL(KIND=dp)                                      :: cutoff_ri, d_sP, dist, r2_threshold, &
     760              :                                                             r_c, t1
     761           10 :       REAL(KIND=dp), ALLOCATABLE                         :: cutoff_ri_per_kind(:)
     762           10 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: cutoff_ri_per_atom, d_vec_local
     763           10 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: D_local, d_lp_local, phi_local, &
     764           10 :                                                             sphere_grid, Z_blk
     765              :       REAL(KIND=dp), DIMENSION(3)                        :: pos_P
     766           10 :       TYPE(atomic_kind_type), DIMENSION(:), POINTER      :: atomic_kind_set
     767              :       TYPE(cell_type), POINTER                           :: cell
     768              :       TYPE(cp_blacs_env_type), POINTER                   :: blacs_env_sub
     769              :       TYPE(cp_fm_struct_type), POINTER                   :: fm_struct_b, fm_struct_D
     770              :       TYPE(cp_fm_type)                                   :: fm_b, fm_D
     771              :       TYPE(cp_logger_type), POINTER                      :: logger
     772              :       TYPE(dbcsr_distribution_type)                      :: dist_phi, dist_Z
     773          130 :       TYPE(gw_3c_ctx_type)                               :: ctx_3c
     774              :       TYPE(mp_para_env_type), POINTER                    :: para_env, para_env_sub
     775           10 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     776           10 :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
     777              :       TYPE(section_vals_type), POINTER                   :: input
     778              : 
     779           10 :       CALL timeset(routineN, handle)
     780              : 
     781           10 :       t1 = m_walltime()
     782              : 
     783              :       CALL get_qs_env(qs_env, para_env=para_env, particle_set=particle_set, input=input, &
     784           10 :                       qs_kind_set=qs_kind_set, cell=cell, atomic_kind_set=atomic_kind_set)
     785              : 
     786              :       ! ---------------------------------------------------------------------
     787              :       ! Subgroup setup. Default G=1 keeps the single-rank BLAS path; G>1 splits
     788              :       ! ranks into atom-groups so the Cholesky on D_local distributes across G
     789              :       ! ranks (memory ~1/G) and the compute_d_lp build also splits across the
     790              :       ! subgroup. G=1 leaves para_env_sub / blacs_env_sub NULL — no subgroup
     791              :       ! comms created, atom_P loop uses per-rank round-robin, compute_d_lp runs
     792              :       ! its full atom_j range on each rank, no allreduce.
     793              :       ! ---------------------------------------------------------------------
     794           10 :       n_procs_per_atom = MIN(bs_env%ri_rs%n_procs_per_atom_z_lp, para_env%num_pe)
     795           10 :       IF (n_procs_per_atom < 1) n_procs_per_atom = 1
     796              : 
     797           10 :       NULLIFY (para_env_sub, blacs_env_sub)
     798           10 :       IF (n_procs_per_atom > 1) THEN
     799            0 :          n_groups = para_env%num_pe/n_procs_per_atom
     800            0 :          my_group = MIN(para_env%mepos/n_procs_per_atom, n_groups - 1)
     801            0 :          ALLOCATE (para_env_sub)
     802            0 :          CALL para_env_sub%from_split(para_env, my_group)
     803            0 :          CALL cp_blacs_env_create(blacs_env=blacs_env_sub, para_env=para_env_sub)
     804            0 :          atom_P_start = my_group + 1
     805            0 :          atom_P_stride = n_groups
     806            0 :          atom_j_mepos = para_env_sub%mepos
     807            0 :          atom_j_stride = para_env_sub%num_pe
     808              :       ELSE
     809           10 :          atom_P_start = para_env%mepos + 1
     810           10 :          atom_P_stride = para_env%num_pe
     811           10 :          atom_j_mepos = 0
     812           10 :          atom_j_stride = 1
     813              :       END IF
     814              : 
     815           10 :       natom = SIZE(bs_env%i_RI_start_from_atom)
     816           10 :       n_ao_total = bs_env%i_ao_end_from_atom(natom)
     817           10 :       n_grid_total = SIZE(ri_rs_grid_points, 2)
     818              : 
     819              :       ! =========================================================================
     820              :       ! 1. SETUP DBCSR TOPOLOGY & EXACT OFFSETS
     821              :       ! =========================================================================
     822           10 :       CALL dbcsr_get_info(mat_phi_mu_l, row_blk_size=r_blk_sizes, distribution=dist_phi)
     823           10 :       CALL dbcsr_distribution_get(dist_phi, row_dist=row_dist_grid, col_dist=col_dist_phi, group=group_handle)
     824              : 
     825           10 :       num_grid_chunks = SIZE(r_blk_sizes)
     826              : 
     827           30 :       ALLOCATE (row_offset(num_grid_chunks))
     828           10 :       row_offset(1) = 0
     829          194 :       DO i_blk = 2, num_grid_chunks
     830          194 :          row_offset(i_blk) = row_offset(i_blk - 1) + r_blk_sizes(i_blk - 1)
     831              :       END DO
     832              : 
     833           40 :       ALLOCATE (ri_blk_sizes(natom), col_dist_ri(natom))
     834           38 :       DO atom_P = 1, natom
     835           28 :          ri_blk_sizes(atom_P) = bs_env%i_RI_end_from_atom(atom_P) - bs_env%i_RI_start_from_atom(atom_P) + 1
     836          118 :          col_dist_ri(atom_P) = MOD(atom_P - 1, MAXVAL(col_dist_phi) + 1)
     837              :       END DO
     838              : 
     839           10 :       CALL dbcsr_distribution_new(dist_Z, template=dist_phi, row_dist=row_dist_grid, col_dist=col_dist_ri)
     840              : 
     841           10 :       IF (bs_env%ri_rs%Z_lP_exists) THEN
     842              :          CALL dbcsr_binary_read(filepath=TRIM(bs_env%prefix)//"Z_lP.matrix", &
     843              :                                 distribution=dist_Z, &
     844            2 :                                 matrix_new=mat_Z_lP)
     845            2 :          IF (bs_env%unit_nr > 0) THEN
     846              :             WRITE (bs_env%unit_nr, '(T2,A,T57,A,F7.1,A)') &
     847            1 :                'Read Z_lP from file ', ' Execution time', m_walltime() - t1, ' s'
     848            1 :             WRITE (bs_env%unit_nr, '(A)') ' '
     849              :          END IF
     850              :       ELSE
     851              : 
     852              :          CALL dbcsr_create(mat_Z_lP, name="mat_Z_lP", dist=dist_Z, &
     853              :                            matrix_type=dbcsr_type_no_symmetry, &
     854            8 :                            row_blk_size=r_blk_sizes, col_blk_size=ri_blk_sizes)
     855              : 
     856            8 :          max_ao_size = 0
     857           30 :          DO j = 1, SIZE(bs_env%i_ao_start_from_atom)
     858           30 :             max_ao_size = MAX(max_ao_size, bs_env%i_ao_end_from_atom(j) - bs_env%i_ao_start_from_atom(j) + 1)
     859              :          END DO
     860           30 :          max_loc_ri = MAXVAL(ri_blk_sizes)
     861              : 
     862              :          ! Per-atom RI-RS integration sphere:
     863              :          !   cutoff_ri(P) = r_c + r_AO(P)
     864              :          ! where r_c is the truncated-Coulomb cutoff of the RI metric. The
     865              :          ! CUTOFF_RADIUS_RI_RS keyword (when > 0) overrides the entire cutoff calculation.
     866            8 :          nkind = SIZE(atomic_kind_set)
     867           24 :          ALLOCATE (cutoff_ri_per_atom(natom))
     868              : 
     869            8 :          IF (bs_env%ri_rs%cutoff_radius_ri_rs > 0.0_dp) THEN
     870            0 :             cutoff_ri_per_atom(:) = bs_env%ri_rs%cutoff_radius_ri_rs
     871              :          ELSE
     872            8 :             r_c = bs_env%ri_metric%cutoff_radius
     873           30 :             DO P_loop_atom = 1, natom
     874              :                cutoff_ri_per_atom(P_loop_atom) = &
     875           30 :                   r_c + bs_env%ri_rs%radius_ao_per_atom(P_loop_atom)
     876              :             END DO
     877              :          END IF
     878              : 
     879           24 :          ALLOCATE (cutoff_ri_per_kind(nkind))
     880            8 :          cutoff_ri_per_kind(:) = 0.0_dp
     881            8 :          IF (bs_env%unit_nr > 0) THEN
     882            4 :             WRITE (bs_env%unit_nr, '(T2,A)') 'Per-kind maximum RI-RS sphere cutoff (Bohr):'
     883            4 :             WRITE (bs_env%unit_nr, '(T4,A4,A14)') 'Kind', 'max cutoff_ri'
     884              :             ! Walk atoms once; accumulate per-kind max using atomic_kind index
     885           15 :             DO P_loop_atom = 1, natom
     886           11 :                ikind = particle_set(P_loop_atom)%atomic_kind%kind_number
     887              :                cutoff_ri_per_kind(ikind) = MAX(cutoff_ri_per_kind(ikind), &
     888           15 :                                                cutoff_ri_per_atom(P_loop_atom))
     889              :             END DO
     890           11 :             DO ikind = 1, nkind
     891              :                WRITE (bs_env%unit_nr, '(T4,A4,F14.4)') &
     892            7 :                   atomic_kind_set(ikind)%element_symbol, &
     893           18 :                   cutoff_ri_per_kind(ikind)
     894              :             END DO
     895            4 :             WRITE (bs_env%unit_nr, '(A)') ' '
     896              :          END IF
     897            8 :          DEALLOCATE (cutoff_ri_per_kind)
     898              : 
     899              :          ! Build the shared 3c-integral context once, before the atom_P loop and outside
     900              :          ! any OMP region. This hoists out the libint / t_c_g0 / md_ftable inits and the
     901              :          ! contracted sphi tables, which would otherwise re-run on every per-block call.
     902              :          ! gw_3c_ctx_create is MPI-collective (truncated-Coulomb init reads a file + bcasts).
     903              :          CALL gw_3c_ctx_create(ctx_3c, qs_env, bs_env%ri_metric, &
     904              :                                basis_j=bs_env%basis_set_AO, basis_k=bs_env%basis_set_AO, &
     905            8 :                                basis_i=bs_env%basis_set_RI)
     906              : 
     907              :          ! =========================================================================
     908              :          ! 2. MPI LOOP OVER ATOMS (Fully independent, no MPI barriers inside)
     909              :          ! This distributes the N_atoms among the available MPI processors.
     910              :          ! phi_local for each atom_P's cutoff sphere is built on the fly via
     911              :          ! fill_phi_for_atom — no dense replicated phi_global, no allreduce.
     912              :          ! =========================================================================
     913            8 :          DO atom_P = atom_P_start, natom, atom_P_stride
     914              : 
     915           11 :             n_loc_ri = ri_blk_sizes(atom_P)
     916           44 :             pos_P(:) = particle_set(atom_P)%r(:)
     917              : 
     918           11 :             cutoff_ri = cutoff_ri_per_atom(atom_P)
     919              : 
     920              :             ! ---------------------------------------------------------------------
     921              :             ! A. Determine Local Grid Domain based on cutoff_ri
     922              :             ! ---------------------------------------------------------------------
     923           11 :             n_local_grid = 0
     924         6827 :             DO l = 1, n_grid_total
     925        27264 :                dist = SQRT(SUM((ri_rs_grid_points(1:3, l) - pos_P(1:3))**2))
     926         6827 :                IF (dist <= cutoff_ri) n_local_grid = n_local_grid + 1
     927              :             END DO
     928              : 
     929           33 :             ALLOCATE (local_grid_idx(n_local_grid))
     930              : 
     931           11 :             n_local_grid = 0
     932         6827 :             DO l = 1, n_grid_total
     933        27264 :                dist = SQRT(SUM((ri_rs_grid_points(1:3, l) - pos_P(1:3))**2))
     934         6827 :                IF (dist <= cutoff_ri) THEN
     935         6816 :                   n_local_grid = n_local_grid + 1
     936         6816 :                   local_grid_idx(n_local_grid) = l
     937              :                END IF
     938              :             END DO
     939              : 
     940              :             ! ---------------------------------------------------------------------
     941              :             ! B. Build phi_local on the fly via fill_phi_for_atom (no replication)
     942              :             ! ---------------------------------------------------------------------
     943              :             ! Copy the local sphere coordinates into a contiguous buffer for
     944              :             ! fill_phi_for_atom. Only source atoms whose AO basis can reach the
     945              :             ! sphere of atom_P contribute; the others are pruned by the inequality
     946              :             ! using the per-atom radii precomputed in precompute_ri_rs_radii.
     947           33 :             ALLOCATE (sphere_grid(3, n_local_grid))
     948         6827 :             DO loc_idx = 1, n_local_grid
     949        27275 :                sphere_grid(:, loc_idx) = ri_rs_grid_points(:, local_grid_idx(loc_idx))
     950              :             END DO
     951              : 
     952           44 :             ALLOCATE (phi_local(n_local_grid, n_ao_total))
     953           11 :             phi_local = 0.0_dp
     954              : 
     955           42 :             DO source_atom = 1, natom
     956          124 :                d_sP = NORM2(particle_set(source_atom)%r(:) - pos_P(:))
     957           31 :                IF (d_sP > bs_env%ri_rs%radius_ao_per_atom(source_atom) + cutoff_ri) CYCLE
     958              : 
     959           31 :                col_start = bs_env%i_ao_start_from_atom(source_atom)
     960           31 :                col_end = bs_env%i_ao_end_from_atom(source_atom)
     961           31 :                r2_threshold = bs_env%ri_rs%radius_ao_per_atom(source_atom)**2
     962              : 
     963              :                CALL fill_phi_for_atom(phi_local(:, col_start:col_end), sphere_grid, &
     964              :                                       n_local_grid, source_atom, particle_set, qs_kind_set, &
     965           42 :                                       cell, r2_threshold)
     966              :             END DO
     967              : 
     968           11 :             DEALLOCATE (sphere_grid)
     969              : 
     970              :             ! ---------------------------------------------------------------------
     971              :             ! C. Build Local RHS Matrix (d_lp_local)
     972              :             !    Done first so the subgroup-distributed compute_d_lp + allreduce
     973              :             !    is not entangled with the LHS build. compute_d_lp does not
     974              :             !    depend on D_local or d_vec_local.
     975              :             ! ---------------------------------------------------------------------
     976           44 :             ALLOCATE (d_lp_local(n_local_grid, n_loc_ri))
     977           11 :             d_lp_local = 0.0_dp
     978              : 
     979              :             CALL compute_d_lp(bs_env, ctx_3c, phi_local, d_lp_local, n_local_grid, &
     980           11 :                               n_loc_ri, atom_P, max_ao_size, atom_j_mepos, atom_j_stride)
     981              : 
     982              :             ! Reduce per-subgroup-rank partials into the replicated d_lp_local.
     983              :             ! Skipped for G=1 (BLAS path): each rank has the full sum locally.
     984           11 :             IF (n_procs_per_atom > 1) THEN
     985            0 :                CALL para_env_sub%sum(d_lp_local)
     986              :             END IF
     987              : 
     988              :             ! ---------------------------------------------------------------------
     989              :             ! D. Build d_vec_local (Jacobi diagonal) + LHS — BLAS or ScaLAPACK
     990              :             ! ---------------------------------------------------------------------
     991           33 :             ALLOCATE (d_vec_local(n_local_grid))
     992              : 
     993           11 :             IF (n_procs_per_atom == 1) THEN
     994              :                ! BLAS path: build D_local densely, compute d_vec as side-effect
     995              :                ! of the Jacobi step (preserves bit-identical arithmetic with the
     996              :                ! previous branch).
     997           44 :                ALLOCATE (D_local(n_local_grid, n_local_grid))
     998           11 :                D_local = 0.0_dp
     999              : 
    1000              :                CALL dsyrk("L", "N", n_local_grid, n_ao_total, 1.0_dp, phi_local, &
    1001           11 :                           n_local_grid, 0.0_dp, D_local, n_local_grid)
    1002              : 
    1003              :                !$OMP PARALLEL DO DEFAULT(NONE) &
    1004              :                !$OMP SHARED(n_local_grid, D_local, d_vec_local, bs_env) &
    1005              :                !$OMP PRIVATE(i) &
    1006           11 :                !$OMP SCHEDULE(STATIC)
    1007              :                DO i = 1, n_local_grid
    1008              :                   D_local(i, i) = D_local(i, i)**2
    1009              :                   d_vec_local(i) = 1.0_dp/SQRT(MAX(D_local(i, i), 1.0E-16_dp))
    1010              :                   D_local(i, i) = (D_local(i, i)*d_vec_local(i)**2) + bs_env%ri_rs%tikhonov
    1011              :                END DO
    1012              :                !$OMP END PARALLEL DO
    1013              : 
    1014              :                !$OMP PARALLEL DO DEFAULT(NONE) &
    1015              :                !$OMP SHARED(n_local_grid, D_local, d_vec_local) &
    1016              :                !$OMP PRIVATE(j, i) &
    1017           11 :                !$OMP SCHEDULE(DYNAMIC)
    1018              :                DO j = 1, n_local_grid
    1019              :                   DO i = j + 1, n_local_grid
    1020              :                      D_local(i, j) = D_local(i, j)**2
    1021              :                      D_local(i, j) = D_local(i, j)*d_vec_local(i)*d_vec_local(j)
    1022              :                      D_local(j, i) = D_local(i, j)
    1023              :                   END DO
    1024              :                END DO
    1025              :                !$OMP END PARALLEL DO
    1026              :             ELSE
    1027              :                ! ScaLAPACK path: d_vec computed directly from phi (= 1/||phi_i||^2);
    1028              :                ! solve_D_lp_distributed builds D block-cyclic internally with
    1029              :                ! the squared+scaled values, so no dense D_local on this rank.
    1030              :                !$OMP PARALLEL DO DEFAULT(NONE) &
    1031              :                !$OMP SHARED(n_local_grid, n_ao_total, phi_local, d_vec_local) &
    1032              :                !$OMP PRIVATE(i, j) &
    1033            0 :                !$OMP SCHEDULE(STATIC)
    1034              :                DO i = 1, n_local_grid
    1035              :                   d_vec_local(i) = 0.0_dp
    1036              :                   DO j = 1, n_ao_total
    1037              :                      d_vec_local(i) = d_vec_local(i) + phi_local(i, j)*phi_local(i, j)
    1038              :                   END DO
    1039              :                   d_vec_local(i) = 1.0_dp/MAX(d_vec_local(i), 1.0E-16_dp)
    1040              :                END DO
    1041              :                !$OMP END PARALLEL DO
    1042              :             END IF
    1043              : 
    1044              :             ! ---------------------------------------------------------------------
    1045              :             ! E. Pre-scale d_lp by d_vec
    1046              :             ! ---------------------------------------------------------------------
    1047              :             !$OMP PARALLEL DO DEFAULT(NONE) &
    1048              :             !$OMP SHARED(n_loc_ri, n_local_grid, d_lp_local, d_vec_local) &
    1049              :             !$OMP PRIVATE(j_ri, i) &
    1050           11 :             !$OMP SCHEDULE(STATIC)
    1051              :             DO j_ri = 1, n_loc_ri
    1052              :                DO i = 1, n_local_grid
    1053              :                   d_lp_local(i, j_ri) = d_lp_local(i, j_ri)*d_vec_local(i)
    1054              :                END DO
    1055              :             END DO
    1056              :             !$OMP END PARALLEL DO
    1057              : 
    1058              :             ! ---------------------------------------------------------------------
    1059              :             ! F. Solve — BLAS dpotrf/dpotrs or ScaLAPACK pdpotrf/pdpotrs
    1060              :             ! ---------------------------------------------------------------------
    1061           11 :             IF (n_procs_per_atom == 1) THEN
    1062           11 :                CALL dpotrf('L', n_local_grid, D_local, n_local_grid, info)
    1063              :                CALL dpotrs('L', n_local_grid, n_loc_ri, D_local, n_local_grid, &
    1064           11 :                            d_lp_local, n_local_grid, info)
    1065           11 :                DEALLOCATE (D_local)
    1066              :             ELSE
    1067              :                CALL solve_D_lp_distributed(phi_local, d_vec_local, d_lp_local, &
    1068              :                                            n_local_grid, n_ao_total, n_loc_ri, &
    1069              :                                            bs_env%ri_rs%tikhonov, &
    1070              :                                            para_env_sub, blacs_env_sub, &
    1071            0 :                                            fm_struct_D, fm_struct_b, fm_D, fm_b, info)
    1072              :             END IF
    1073              : 
    1074              :             ! ---------------------------------------------------------------------
    1075              :             ! G. Post-scale solution by d_vec (common to both paths)
    1076              :             ! ---------------------------------------------------------------------
    1077              :             !$OMP PARALLEL DO DEFAULT(NONE) &
    1078              :             !$OMP SHARED(n_loc_ri, n_local_grid, d_lp_local, d_vec_local) &
    1079              :             !$OMP PRIVATE(j_ri, i) &
    1080           11 :             !$OMP SCHEDULE(STATIC)
    1081              :             DO j_ri = 1, n_loc_ri
    1082              :                DO i = 1, n_local_grid
    1083              :                   d_lp_local(i, j_ri) = d_lp_local(i, j_ri)*d_vec_local(i)
    1084              :                END DO
    1085              :             END DO
    1086              :             !$OMP END PARALLEL DO
    1087              : 
    1088              :             ! ---------------------------------------------------------------------
    1089              :             ! H. Scatter Local Solution Back to Global DBCSR Matrix
    1090              :             ! local_grid_idx is ascending (built by the ordered scan above), so a
    1091              :             ! single walking pointer over it sweeps the chunks in one pass.
    1092              :             ! Under ScaLAPACK (G>1) the d_lp_local solution is identical on all
    1093              :             ! G subgroup ranks (gathered via cp_fm_get_submatrix); only the
    1094              :             ! subgroup root writes to mat_Z_lP so each atom column is emitted
    1095              :             ! exactly once. DBCSR routes blocks to their global owner on finalize.
    1096              :             ! ---------------------------------------------------------------------
    1097           11 :             IF (n_procs_per_atom == 1 .OR. para_env_sub%mepos == 0) THEN
    1098          257 :                ALLOCATE (Z_blk(MAXVAL(r_blk_sizes), n_loc_ri))
    1099           11 :                loc_ptr = 1
    1100              : 
    1101          224 :                DO i_blk = 1, num_grid_chunks
    1102          213 :                   r_start = row_offset(i_blk) + 1
    1103          213 :                   r_end = row_offset(i_blk) + r_blk_sizes(i_blk)
    1104          213 :                   current_chunk_size = r_blk_sizes(i_blk)
    1105              : 
    1106          213 :                   Z_blk = 0.0_dp
    1107              : 
    1108         7029 :                   DO WHILE (loc_ptr <= n_local_grid)
    1109         7018 :                      g = local_grid_idx(loc_ptr)
    1110         7018 :                      IF (g > r_end) EXIT
    1111        59904 :                      Z_blk(g - r_start + 1, 1:n_loc_ri) = d_lp_local(loc_ptr, 1:n_loc_ri)
    1112         7018 :                      loc_ptr = loc_ptr + 1
    1113              :                   END DO
    1114              : 
    1115        54971 :                   IF (MAXVAL(ABS(Z_blk(1:current_chunk_size, 1:n_loc_ri))) > bs_env%eps_filter) THEN
    1116              :                      CALL dbcsr_put_block(mat_Z_lP, row=i_blk, col=atom_P, &
    1117          213 :                                           block=Z_blk(1:current_chunk_size, 1:n_loc_ri))
    1118              :                   END IF
    1119              :                END DO
    1120              : 
    1121           11 :                DEALLOCATE (Z_blk)
    1122              :             END IF
    1123              : 
    1124           11 :             DEALLOCATE (d_vec_local, d_lp_local)
    1125           11 :             DEALLOCATE (local_grid_idx, phi_local)
    1126              : 
    1127              :          END DO
    1128              : 
    1129            8 :          DEALLOCATE (cutoff_ri_per_atom)
    1130              : 
    1131            8 :          CALL gw_3c_ctx_release(ctx_3c)
    1132              : 
    1133            8 :          CALL dbcsr_finalize(mat_Z_lP)
    1134              : 
    1135            8 :          IF (bs_env%unit_nr > 0) THEN
    1136              :             WRITE (bs_env%unit_nr, '(T2,A,T57,A,F7.1,A)') &
    1137            4 :                'Computed Z_lP ', ' Execution time', m_walltime() - t1, ' s'
    1138            4 :             WRITE (bs_env%unit_nr, '(A)') ' '
    1139              :          END IF
    1140              : 
    1141            8 :          logger => cp_get_default_logger()
    1142              : 
    1143            8 :          IF (BTEST(cp_print_key_should_output(logger%iter_info, input, key), cp_p_file)) THEN
    1144            0 :             CALL dbcsr_binary_write(matrix=mat_Z_lP, filepath=TRIM(bs_env%prefix)//"Z_lP.matrix")
    1145              :          END IF
    1146              : 
    1147              :       END IF
    1148              : 
    1149           10 :       DEALLOCATE (row_offset, ri_blk_sizes, col_dist_ri)
    1150           10 :       CALL dbcsr_distribution_release(dist_Z)
    1151              : 
    1152           10 :       IF (n_procs_per_atom > 1) THEN
    1153            0 :          CALL cp_blacs_env_release(blacs_env_sub)
    1154            0 :          CALL para_env_sub%free()
    1155            0 :          DEALLOCATE (para_env_sub)
    1156              :       END IF
    1157              : 
    1158           10 :       DEALLOCATE (ri_rs_grid_points)
    1159              : 
    1160           10 :       CALL timestop(handle)
    1161              : 
    1162           50 :    END SUBROUTINE compute_coeff_Z_lP
    1163              : 
    1164              : ! **************************************************************************************************
    1165              : !> \brief Computes the dense localized RHS d_lp(l,P) = Σ_{μν} Φ_μ(r_l)·Φ_ν(r_l)·(μν|P) for one
    1166              : !>        RI atom P, OMP-threaded over (atom_j, atom_k) AO-pair blocks: per thread, build the 3c
    1167              : !>        block, then contract grid-chunked pair densities into a private d_lp partial; partials
    1168              : !>        are reduced into d_lp at the end.
    1169              : !>        Pair screening is handled inside build_3c_integral_block_ctx via the `screened` output,
    1170              : !> \param bs_env ...
    1171              : !> \param ctx shared 3c-integral context (gw_3c_ctx_create)
    1172              : !> \param phi_val Φ_μ(r_l) on the local-sphere grid (n_grid_total × n_ao)
    1173              : !> \param d_lp output (n_grid_total × n_loc_ri), zeroed by the caller, accumulated here
    1174              : !> \param n_grid_total number of local-sphere grid rows
    1175              : !> \param n_loc_ri number of RI functions of atom_P
    1176              : !> \param atom_P ...
    1177              : !> \param max_ao_size ...
    1178              : !> \param atom_j_mepos ...
    1179              : !> \param atom_j_stride ...
    1180              : ! **************************************************************************************************
    1181              : 
    1182           11 :    SUBROUTINE compute_d_lp(bs_env, ctx, phi_val, d_lp, n_grid_total, n_loc_ri, atom_P, &
    1183              :                            max_ao_size, atom_j_mepos, atom_j_stride)
    1184              : 
    1185              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    1186              :       TYPE(gw_3c_ctx_type), INTENT(IN)                   :: ctx
    1187              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: phi_val
    1188              :       INTEGER, INTENT(IN)                                :: n_grid_total, n_loc_ri
    1189              :       REAL(KIND=dp), INTENT(INOUT)                       :: d_lp(n_grid_total, n_loc_ri)
    1190              :       INTEGER, INTENT(IN)                                :: atom_P, max_ao_size, atom_j_mepos, &
    1191              :                                                             atom_j_stride
    1192              : 
    1193              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'compute_d_lp'
    1194              :       INTEGER, PARAMETER                                 :: grid_chunk = 1024
    1195              : 
    1196              :       INTEGER                                            :: atom_j, atom_k, c, handle, j, jk_idx, &
    1197              :                                                             jsize, jstart, k, ksize, kstart, l, &
    1198              :                                                             l0, ri
    1199              :       LOGICAL                                            :: screened
    1200           11 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: int_2d_prv, rho_chunk
    1201           11 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: int_3c_prv
    1202           11 :       TYPE(gw_3c_ws_type)                                :: ws
    1203              : 
    1204           11 :       CALL timeset(routineN, handle)
    1205              : 
    1206              :       !$OMP PARALLEL DEFAULT(NONE) &
    1207              :       !$OMP SHARED(bs_env, ctx, phi_val, d_lp, n_grid_total, n_loc_ri, atom_P, max_ao_size, &
    1208              :       !$OMP        atom_j_mepos, atom_j_stride) &
    1209              :       !$OMP PRIVATE(atom_j, atom_k, c, j, jk_idx, jsize, jstart, k, ksize, kstart, l, l0, ri, &
    1210           11 :       !$OMP         screened, int_2d_prv, rho_chunk, int_3c_prv, ws)
    1211              : 
    1212              :       CALL gw_3c_ws_create(ws, ctx)
    1213              :       ALLOCATE (int_3c_prv(max_ao_size, max_ao_size, n_loc_ri))
    1214              :       ALLOCATE (int_2d_prv(max_ao_size*max_ao_size, n_loc_ri))
    1215              :       ALLOCATE (rho_chunk(grid_chunk, max_ao_size*max_ao_size))
    1216              : 
    1217              :       ! MPI-stride atom_j over the subgroup (atom_j_stride = 1 for the BLAS
    1218              :       ! path, > 1 for the ScaLAPACK path). The OMP DO parallelizes the inner
    1219              :       ! atom_k while the outer atom_j carries the MPI stride. COLLAPSE(2) is
    1220              :       ! dropped because the outer stride is non-unit under ScaLAPACK; for
    1221              :       ! typical natom the inner loop has plenty of work for DYNAMIC.
    1222              :       !$OMP DO SCHEDULE(DYNAMIC) REDUCTION(+:d_lp)
    1223              :       DO atom_j = atom_j_mepos + 1, SIZE(bs_env%i_ao_start_from_atom), atom_j_stride
    1224              :          DO atom_k = 1, SIZE(bs_env%i_ao_start_from_atom)
    1225              :             jstart = bs_env%i_ao_start_from_atom(atom_j)
    1226              :             jsize = bs_env%i_ao_end_from_atom(atom_j) - jstart + 1
    1227              :             kstart = bs_env%i_ao_start_from_atom(atom_k)
    1228              :             ksize = bs_env%i_ao_end_from_atom(atom_k) - kstart + 1
    1229              : 
    1230              :             int_3c_prv(1:jsize, 1:ksize, 1:n_loc_ri) = 0.0_dp
    1231              : 
    1232              :             ! Compute B_{μν,P} = (μν|P); ctx-internal triangle-inequality screening on
    1233              :             ! kind_radius sets `screened=.TRUE.` for negligible triples.
    1234              :             CALL build_3c_integral_block_ctx(int_3c_prv(1:jsize, 1:ksize, 1:n_loc_ri), &
    1235              :                                              ctx, ws, atom_j=atom_j, atom_k=atom_k, atom_i=atom_P, &
    1236              :                                              screened=screened)
    1237              : 
    1238              :             IF (screened) CYCLE
    1239              : 
    1240              :             ! Flatten 3D B_{μν, P} tensor to 2D B_{(μν), P} matrix for BLAS
    1241              :             DO ri = 1, n_loc_ri
    1242              :                DO k = 1, ksize
    1243              :                   DO j = 1, jsize
    1244              :                      jk_idx = (k - 1)*jsize + j
    1245              :                      int_2d_prv(jk_idx, ri) = int_3c_prv(j, k, ri)
    1246              :                   END DO
    1247              :                END DO
    1248              :             END DO
    1249              : 
    1250              :             ! Pair density ρ(l, μν) = Φ_μ(r_l) Φ_ν(r_l) in grid chunks, contracted on the fly:
    1251              :             ! d_{l,P} += ρ(l, μν) B_{(μν),P}  (dgemm runs serially inside the parallel region)
    1252              :             DO l0 = 1, n_grid_total, grid_chunk
    1253              :                c = MIN(grid_chunk, n_grid_total - l0 + 1)
    1254              :                DO k = 1, ksize
    1255              :                   DO j = 1, jsize
    1256              :                      jk_idx = (k - 1)*jsize + j
    1257              :                      DO l = 1, c
    1258              :                         rho_chunk(l, jk_idx) = phi_val(l0 + l - 1, jstart + j - 1)* &
    1259              :                                                phi_val(l0 + l - 1, kstart + k - 1)
    1260              :                      END DO
    1261              :                   END DO
    1262              :                END DO
    1263              :                CALL dgemm("N", "N", c, n_loc_ri, jsize*ksize, &
    1264              :                           1.0_dp, rho_chunk, grid_chunk, &
    1265              :                           int_2d_prv, max_ao_size*max_ao_size, &
    1266              :                           1.0_dp, d_lp(l0, 1), n_grid_total)
    1267              :             END DO
    1268              :          END DO
    1269              :       END DO
    1270              :       !$OMP END DO
    1271              : 
    1272              :       DEALLOCATE (int_3c_prv, int_2d_prv, rho_chunk)
    1273              :       CALL gw_3c_ws_release(ws)
    1274              : 
    1275              :       !$OMP END PARALLEL
    1276              : 
    1277           11 :       CALL timestop(handle)
    1278              : 
    1279           11 :    END SUBROUTINE compute_d_lp
    1280              : 
    1281              : ! **************************************************************************************************
    1282              : !> \brief Distributed pdpotrf/pdpotrs solve of D x = b for one atom of the
    1283              : !>        RI-RS Z_lP build. Called when N_PROCS_PER_ATOM_Z_LP > 1, with a
    1284              : !>        subgroup of cooperating ranks and an associated BLACS context.
    1285              : !>        Each rank in the subgroup holds the (replicated) phi_local and the
    1286              : !>        (replicated) RHS d_lp; it builds its own block-cyclic slice of the
    1287              : !>        squared+Jacobi-scaled Gram matrix D via tiled DGEMM, factorizes via
    1288              : !>        cp_fm_cholesky_decompose (UPLO='U'), and solves with
    1289              : !>        cp_fm_cholesky_solve. The replicated d_lp is updated in place via
    1290              : !>        cp_fm_get_submatrix.
    1291              : !> \param phi_local   replicated (n_loc x n_ao) AO values on the sphere grid
    1292              : !> \param d_vec       replicated Jacobi diagonal = 1 / ||phi_i||^2
    1293              : !> \param d_lp        replicated RHS in/out (n_loc x n_rhs); on input pre-scaled
    1294              : !>                    by d_vec, on output also pre-scaled (caller post-scales)
    1295              : !> \param n_loc       number of sphere-local grid points
    1296              : !> \param n_ao        number of AO basis functions
    1297              : !> \param n_rhs       number of RI functions of this atom_P (RHS columns)
    1298              : !> \param tikhonov    regularisation added to the diagonal
    1299              : !> \param para_env_sub  sub-communicator of the atom-group
    1300              : !> \param blacs_env_sub BLACS context on the sub-communicator
    1301              : !> \param fm_struct_D ...
    1302              : !> \param fm_struct_b ...
    1303              : !> \param fm_D ...
    1304              : !> \param fm_b ...
    1305              : !> \param info        0 on success, non-zero if pdpotrf or pdpotrs failed
    1306              : ! **************************************************************************************************
    1307            0 :    SUBROUTINE solve_D_lp_distributed(phi_local, d_vec, d_lp, n_loc, n_ao, n_rhs, &
    1308              :                                      tikhonov, para_env_sub, blacs_env_sub, &
    1309              :                                      fm_struct_D, fm_struct_b, fm_D, fm_b, info)
    1310              : 
    1311              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: phi_local
    1312              :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: d_vec
    1313              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: d_lp
    1314              :       INTEGER, INTENT(IN)                                :: n_loc, n_ao, n_rhs
    1315              :       REAL(KIND=dp), INTENT(IN)                          :: tikhonov
    1316              :       TYPE(mp_para_env_type), POINTER                    :: para_env_sub
    1317              :       TYPE(cp_blacs_env_type), POINTER                   :: blacs_env_sub
    1318              :       TYPE(cp_fm_struct_type), POINTER                   :: fm_struct_D, fm_struct_b
    1319              :       TYPE(cp_fm_type), INTENT(INOUT)                    :: fm_D, fm_b
    1320              :       INTEGER, INTENT(OUT)                               :: info
    1321              : 
    1322              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'solve_D_lp_distributed'
    1323              : 
    1324              :       INTEGER                                            :: handle, i_loc, ig, j_loc, jg, &
    1325              :                                                             ncol_local, nrow_local
    1326            0 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, row_indices
    1327              :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
    1328            0 :          POINTER                                         :: local_data
    1329              : 
    1330            0 :       CALL timeset(routineN, handle)
    1331            0 :       info = 0
    1332              : 
    1333            0 :       NULLIFY (fm_struct_D, fm_struct_b)
    1334              :       CALL cp_fm_struct_create(fm_struct_D, para_env=para_env_sub, &
    1335              :                                context=blacs_env_sub, &
    1336            0 :                                nrow_global=n_loc, ncol_global=n_loc)
    1337              :       CALL cp_fm_struct_create(fm_struct_b, para_env=para_env_sub, &
    1338              :                                context=blacs_env_sub, &
    1339            0 :                                nrow_global=n_loc, ncol_global=n_rhs)
    1340            0 :       CALL cp_fm_create(fm_D, fm_struct_D)
    1341            0 :       CALL cp_fm_create(fm_b, fm_struct_b)
    1342              : 
    1343              :       ! ---- Build the local block-cyclic slice of fm_D --------------------
    1344              :       ! Tiled DGEMM build: for each (row_tile x col_tile) sub-block we
    1345              :       ! gather small phi strips, DGEMM into a small gram tile, then write
    1346              :       ! the squared & d_vec-scaled result directly into fm_D%local_data.
    1347              :       ! This avoids materialising the full (nrow_local x n_ao),
    1348              :       ! (n_ao x ncol_local), and (nrow_local x ncol_local) buffers.
    1349              :       CALL cp_fm_get_info(fm_D, nrow_local=nrow_local, ncol_local=ncol_local, &
    1350              :                           row_indices=row_indices, col_indices=col_indices, &
    1351            0 :                           local_data=local_data)
    1352              : 
    1353            0 :       IF (nrow_local > 0 .AND. ncol_local > 0) THEN
    1354            0 :          BLOCK
    1355              :             INTEGER, PARAMETER :: ntile = 1024
    1356              :             INTEGER :: ib, ie, jb, je, mb, kb, ti, tj
    1357            0 :             REAL(KIND=dp), ALLOCATABLE :: gram_t(:, :), phi_cols_t(:, :), phi_rows_t(:, :)
    1358            0 :             ALLOCATE (phi_rows_t(ntile, n_ao), phi_cols_t(n_ao, ntile), gram_t(ntile, ntile))
    1359            0 :             DO ib = 1, nrow_local, ntile
    1360            0 :                ie = MIN(ib + ntile - 1, nrow_local)
    1361            0 :                mb = ie - ib + 1
    1362              :                !$OMP PARALLEL DO DEFAULT(NONE) &
    1363              :                !$OMP SHARED(mb, n_ao, phi_rows_t, phi_local, row_indices, ib) &
    1364            0 :                !$OMP PRIVATE(ti, j_loc) SCHEDULE(STATIC)
    1365              :                DO j_loc = 1, n_ao
    1366              :                   DO ti = 1, mb
    1367              :                      phi_rows_t(ti, j_loc) = phi_local(row_indices(ib + ti - 1), j_loc)
    1368              :                   END DO
    1369              :                END DO
    1370              :                !$OMP END PARALLEL DO
    1371            0 :                DO jb = 1, ncol_local, ntile
    1372            0 :                   je = MIN(jb + ntile - 1, ncol_local)
    1373            0 :                   kb = je - jb + 1
    1374              :                   !$OMP PARALLEL DO DEFAULT(NONE) &
    1375              :                   !$OMP SHARED(kb, n_ao, phi_cols_t, phi_local, col_indices, jb) &
    1376            0 :                   !$OMP PRIVATE(tj, i_loc) SCHEDULE(STATIC)
    1377              :                   DO tj = 1, kb
    1378              :                      DO i_loc = 1, n_ao
    1379              :                         phi_cols_t(i_loc, tj) = phi_local(col_indices(jb + tj - 1), i_loc)
    1380              :                      END DO
    1381              :                   END DO
    1382              :                   !$OMP END PARALLEL DO
    1383              :                   CALL dgemm('N', 'N', mb, kb, n_ao, &
    1384              :                              1.0_dp, phi_rows_t, ntile, phi_cols_t, n_ao, &
    1385            0 :                              0.0_dp, gram_t, ntile)
    1386              :                   !$OMP PARALLEL DO DEFAULT(NONE) &
    1387              :                   !$OMP SHARED(mb, kb, gram_t, d_vec, row_indices, col_indices, ib, jb) &
    1388              :                   !$OMP SHARED(local_data, tikhonov) &
    1389            0 :                   !$OMP PRIVATE(ti, tj, ig, jg) SCHEDULE(STATIC)
    1390              :                   DO tj = 1, kb
    1391              :                      jg = col_indices(jb + tj - 1)
    1392              :                      DO ti = 1, mb
    1393              :                         ig = row_indices(ib + ti - 1)
    1394              :                         local_data(ib + ti - 1, jb + tj - 1) = &
    1395              :                            gram_t(ti, tj)*gram_t(ti, tj)*d_vec(ig)*d_vec(jg)
    1396              :                         IF (ig == jg) THEN
    1397              :                            local_data(ib + ti - 1, jb + tj - 1) = &
    1398              :                               local_data(ib + ti - 1, jb + tj - 1) + tikhonov
    1399              :                         END IF
    1400              :                      END DO
    1401              :                   END DO
    1402              :                   !$OMP END PARALLEL DO
    1403              :                END DO
    1404              :             END DO
    1405            0 :             DEALLOCATE (phi_rows_t, phi_cols_t, gram_t)
    1406              :          END BLOCK
    1407              :       END IF
    1408              : 
    1409              :       ! ---- Load the replicated d_lp into the block-cyclic fm_b -----------
    1410            0 :       CALL cp_fm_set_submatrix(fm_b, d_lp)
    1411              : 
    1412              :       ! ---- pdpotrf (Cholesky factorisation; cp_fm_cholesky_decompose
    1413              :       !      factors with UPLO='U', so pdpotrs must match)
    1414            0 :       CALL cp_fm_cholesky_decompose(fm_D, n=n_loc, info_out=info)
    1415            0 :       IF (info /= 0) THEN
    1416            0 :          CPABORT("pdpotrf failed in solve_D_lp_distributed")
    1417              :       END IF
    1418              : 
    1419              :       ! ---- pdpotrs/dpotrs (solve in place on fm_b) -----------------------
    1420            0 :       CALL cp_fm_cholesky_solve(fm_D, fm_b, n=n_loc, info_out=info)
    1421            0 :       IF (info /= 0) THEN
    1422            0 :          CPABORT("pdpotrs failed in solve_D_lp_distributed")
    1423              :       END IF
    1424              : 
    1425              :       ! ---- Gather distributed solution back into the replicated d_lp ----
    1426            0 :       CALL cp_fm_get_submatrix(fm_b, d_lp)
    1427              : 
    1428            0 :       CALL cp_fm_release(fm_D)
    1429            0 :       CALL cp_fm_release(fm_b)
    1430            0 :       CALL cp_fm_struct_release(fm_struct_D)
    1431            0 :       CALL cp_fm_struct_release(fm_struct_b)
    1432              : 
    1433            0 :       CALL timestop(handle)
    1434              : 
    1435            0 :    END SUBROUTINE solve_D_lp_distributed
    1436              : 
    1437              : ! **************************************************************************************************
    1438              : !> \brief Computes the χ(iτ) matrix
    1439              : !> \param bs_env ...
    1440              : !> \param mat_chi_Gamma_tau ...
    1441              : !> \param mat_phi_mu_l ...
    1442              : !> \param mat_Z_lP ...
    1443              : ! **************************************************************************************************
    1444              : 
    1445           10 :    SUBROUTINE get_mat_chi_Gamma_tau(bs_env, mat_chi_Gamma_tau, mat_phi_mu_l, mat_Z_lP)
    1446              : 
    1447              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    1448              :       TYPE(dbcsr_p_type), DIMENSION(:), POINTER          :: mat_chi_Gamma_tau
    1449              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: mat_phi_mu_l, mat_Z_lP
    1450              : 
    1451              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'get_mat_chi_Gamma_tau'
    1452              : 
    1453              :       INTEGER                                            :: handle, i, i_t, ispin, npcol
    1454           10 :       INTEGER, DIMENSION(:), POINTER                     :: blk_ao, blk_grid, dist_col_ao, &
    1455           10 :                                                             dist_col_grid, dist_row_grid
    1456              :       REAL(KIND=dp)                                      :: t1, tau
    1457              :       TYPE(dbcsr_distribution_type)                      :: dist_grid_grid, dist_phi
    1458              :       TYPE(dbcsr_type)                                   :: matrix_chi_grid, matrix_chi_grid_spin, &
    1459              :                                                             matrix_G_occ_grid, matrix_G_vir_grid
    1460              : 
    1461           10 :       CALL timeset(routineN, handle)
    1462              : 
    1463              :       ! =========================================================================
    1464              :       ! 1. SETUP CORE TOPOLOGIES
    1465              :       ! =========================================================================
    1466           10 :       CALL dbcsr_get_info(mat_phi_mu_l, distribution=dist_phi, row_blk_size=blk_grid, col_blk_size=blk_ao)
    1467           10 :       CALL dbcsr_distribution_get(dist_phi, row_dist=dist_row_grid, col_dist=dist_col_ao)
    1468              : 
    1469              :       ! Determine number of MPI process columns
    1470           38 :       npcol = MAXVAL(dist_col_ao) + 1
    1471              : 
    1472              :       ! Build a perfectly safe column distribution for the Grid dimension
    1473           30 :       ALLOCATE (dist_col_grid(SIZE(blk_grid)))
    1474          204 :       DO i = 1, SIZE(blk_grid)
    1475          204 :          dist_col_grid(i) = MOD(i - 1, npcol)
    1476              :       END DO
    1477              : 
    1478              :       CALL dbcsr_distribution_new(dist_grid_grid, template=dist_phi, &
    1479           10 :                                   row_dist=dist_row_grid, col_dist=dist_col_grid)
    1480              : 
    1481           10 :       CALL dbcsr_create(matrix_G_occ_grid, "G_occ_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    1482           10 :       CALL dbcsr_create(matrix_G_vir_grid, "G_vir_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    1483           10 :       CALL dbcsr_create(matrix_chi_grid, "chi_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    1484           10 :       CALL dbcsr_create(matrix_chi_grid_spin, "chi_grid_spin", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    1485              : 
    1486              :       ! =========================================================================
    1487              :       ! 2. MAIN IMAGINARY TIME LOOP
    1488              :       ! =========================================================================
    1489          120 :       DO i_t = 1, bs_env%num_time_freq_points
    1490          110 :          t1 = m_walltime()
    1491              : 
    1492          110 :          tau = bs_env%imag_time_points(i_t)
    1493          110 :          CALL dbcsr_set(matrix_chi_grid, 0.0_dp)
    1494              : 
    1495              :          ! ----------------------------------------------------------------------
    1496              :          ! A. SPIN LOOP (Allocations safely encapsulated in wrappers)
    1497              :          ! ----------------------------------------------------------------------
    1498          240 :          DO ispin = 1, bs_env%n_spin
    1499              : 
    1500              :             ! G^occ_µλ(i|τ|) = sum_n^occ C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
    1501              :             ! G^occ_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^occ_µν Φ_ν(r_l')
    1502              :             CALL build_G_grid(bs_env, tau, ispin, .TRUE., .FALSE., mat_phi_mu_l, &
    1503          130 :                               matrix_G_occ_grid, bs_env%eps_filter)
    1504              : 
    1505              :             ! G^vir_µλ(i|τ|) = sum_n^vir C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
    1506              :             ! G^vir_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^vir_µν Φ_ν(r_l')
    1507              :             CALL build_G_grid(bs_env, tau, ispin, .FALSE., .TRUE., mat_phi_mu_l, &
    1508          130 :                               matrix_G_vir_grid, bs_env%eps_filter)
    1509              : 
    1510              :             ! -------------------------------------------------------------------
    1511              :             ! B. ELEMENT-WISE HADAMARD PRODUCT
    1512              :             ! -------------------------------------------------------------------
    1513              :             ! χ_ll'(iτ) = G^occ_ll'(i|τ|) * G^vir_ll'(i|τ|)
    1514          130 :             CALL hadamard_product(matrix_G_occ_grid, matrix_G_vir_grid, matrix_chi_grid_spin, bs_env%spin_degeneracy)
    1515              : 
    1516              :             ! Accumulate spin contributions
    1517          240 :             CALL dbcsr_add(matrix_chi_grid, matrix_chi_grid_spin, 1.0_dp, 1.0_dp)
    1518              : 
    1519              :          END DO ! ispin
    1520              : 
    1521              :          ! ----------------------------------------------------------------------
    1522              :          ! C. TRANSFORM TO AUXILIARY BASIS & EXPORT DIRECTLY
    1523              :          ! χ_aux        = Z^T * χ_grid * Z
    1524              :          ! χ_PQ(iτ) = sum_ll' Z_lP χ_ll'(iτ) Z_l'Q
    1525              :          ! Result is dumped into the final array mat_chi_Gamma_tau!
    1526              :          ! ----------------------------------------------------------------------
    1527              :          CALL contract_A_B_A("T", "N", mat_Z_lP, matrix_chi_grid, &
    1528          110 :                              mat_chi_Gamma_tau(i_t)%matrix, bs_env%eps_filter)
    1529              : 
    1530          120 :          IF (bs_env%unit_nr > 0) THEN
    1531              :             WRITE (bs_env%unit_nr, '(T2,A,I13,A,I3,A,F7.1,A)') &
    1532           55 :                'Computed χ(iτ,k=0) for time point', i_t, ' /', bs_env%num_time_freq_points, &
    1533          110 :                ',    Execution time', m_walltime() - t1, ' s'
    1534              :          END IF
    1535              : 
    1536              :       END DO ! i_t
    1537              : 
    1538              :       ! =========================================================================
    1539              :       ! 3. FINAL CLEANUP
    1540              :       ! =========================================================================
    1541           10 :       CALL dbcsr_release(matrix_G_occ_grid)
    1542           10 :       CALL dbcsr_release(matrix_G_vir_grid)
    1543           10 :       CALL dbcsr_release(matrix_chi_grid)
    1544           10 :       CALL dbcsr_release(matrix_chi_grid_spin)
    1545           10 :       CALL dbcsr_distribution_release(dist_grid_grid)
    1546           10 :       DEALLOCATE (dist_col_grid)
    1547              : 
    1548           10 :       IF (bs_env%unit_nr > 0) WRITE (bs_env%unit_nr, '(A)') ' '
    1549              : 
    1550           10 :       CALL timestop(handle)
    1551              : 
    1552           20 :    END SUBROUTINE get_mat_chi_Gamma_tau
    1553              : 
    1554              : ! **************************************************************************************************
    1555              : !> \brief Computes Green's Function in grid basis
    1556              : !> \param bs_env ...
    1557              : !> \param tau ...
    1558              : !> \param ispin ...
    1559              : !> \param occ ...
    1560              : !> \param vir ...
    1561              : !> \param mat_phi_mu_l ...
    1562              : !> \param matrix_G_grid ...
    1563              : !> \param eps_filter ...
    1564              : ! **************************************************************************************************
    1565              : 
    1566         1064 :    SUBROUTINE build_G_grid(bs_env, tau, ispin, occ, vir, mat_phi_mu_l, matrix_G_grid, eps_filter)
    1567              : 
    1568              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    1569              :       REAL(KIND=dp), INTENT(IN)                          :: tau
    1570              :       INTEGER, INTENT(IN)                                :: ispin
    1571              :       LOGICAL, INTENT(IN)                                :: occ, vir
    1572              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: mat_phi_mu_l, matrix_G_grid
    1573              :       REAL(KIND=dp), INTENT(IN)                          :: eps_filter
    1574              : 
    1575              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'build_G_grid'
    1576              : 
    1577              :       INTEGER                                            :: handle
    1578          532 :       INTEGER, DIMENSION(:), POINTER                     :: blk_ao, dist_row_ao
    1579              :       TYPE(cp_fm_type), POINTER                          :: fm_G
    1580              :       TYPE(dbcsr_distribution_type)                      :: dist_ao_ao
    1581              :       TYPE(dbcsr_type)                                   :: matrix_G_ao
    1582              : 
    1583          532 :       CALL timeset(routineN, handle)
    1584              : 
    1585              :       ! 1. Select the FM matrix based on occ/vir flags
    1586          532 :       IF (occ) THEN
    1587          272 :          fm_G => bs_env%fm_Gocc
    1588              :       ELSE
    1589          260 :          fm_G => bs_env%fm_Gvir
    1590              :       END IF
    1591              : 
    1592              :       ! 2. Compute Dense FM Green's Function
    1593              :       ! G^occ/vir_µλ(i|τ|) = sum_G^occ/vir_µλn^occ/vir C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
    1594          532 :       CALL G_occ_vir(bs_env, tau, fm_G, ispin, occ=occ, vir=vir)
    1595              : 
    1596              :       ! 3. Setup AO DBCSR Topology and Create Matrix dynamically
    1597          532 :       CALL setup_square_topology(mat_phi_mu_l, 'COL', dist_ao_ao, blk_ao, dist_row_ao)
    1598              : 
    1599              :       CALL dbcsr_create(matrix_G_ao, name="G_ao", dist=dist_ao_ao, &
    1600              :                         matrix_type=dbcsr_type_no_symmetry, &
    1601          532 :                         row_blk_size=blk_ao, col_blk_size=blk_ao)
    1602              : 
    1603              :       ! 4. Convert FM to Sparse DBCSR
    1604          532 :       CALL copy_fm_to_dbcsr(fm_G, matrix_G_ao, keep_sparsity=.FALSE.)
    1605              : 
    1606              :       ! 5. Transform to Grid Basis: G_grid = phi * G_ao * phi^T
    1607              :       ! G^occ/vir_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^occ/vir_µν Φ_ν(r_l')
    1608          532 :       CALL contract_A_B_A("N", "T", mat_phi_mu_l, matrix_G_ao, matrix_G_grid, eps_filter)
    1609              : 
    1610              :       ! 6. Release AO matrix and topology
    1611          532 :       CALL release_dbcsr_topology_and_matrices(dist=dist_ao_ao, mapped_dist=dist_row_ao, m1=matrix_G_ao)
    1612              : 
    1613          532 :       CALL timestop(handle)
    1614              : 
    1615          532 :    END SUBROUTINE build_G_grid
    1616              : 
    1617              : ! **************************************************************************************************
    1618              : !> \brief Generalized routine to compute OUT = A * B * A^T  OR  OUT = A^T * B * A using DBCSR
    1619              : !> \param transA_left ...
    1620              : !> \param transA_right ...
    1621              : !> \param matrix_A ...
    1622              : !> \param matrix_B ...
    1623              : !> \param matrix_out ...
    1624              : !> \param eps_filter ...
    1625              : ! **************************************************************************************************
    1626              : 
    1627         1034 :    SUBROUTINE contract_A_B_A(transA_left, transA_right, matrix_A, matrix_B, matrix_out, eps_filter)
    1628              : 
    1629              :       CHARACTER(LEN=1), INTENT(IN)                       :: transA_left, transA_right
    1630              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix_A, matrix_B, matrix_out
    1631              :       REAL(KIND=dp), INTENT(IN)                          :: eps_filter
    1632              : 
    1633              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'contract_A_B_A'
    1634              : 
    1635              :       INTEGER                                            :: handle
    1636              :       TYPE(dbcsr_type)                                   :: matrix_tmp
    1637              : 
    1638         1034 :       CALL timeset(routineN, handle)
    1639              : 
    1640         1034 :       CALL dbcsr_create(matrix_tmp, template=matrix_A)
    1641              : 
    1642         1034 :       IF (transA_left == "N" .AND. transA_right == "T") THEN
    1643              :          ! Path 1: Out = A * B * A^T
    1644              :          CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_A, matrix_B, &
    1645          652 :                              0.0_dp, matrix_tmp, filter_eps=eps_filter)
    1646              :          CALL dbcsr_multiply("N", "T", 1.0_dp, matrix_tmp, matrix_A, &
    1647          652 :                              0.0_dp, matrix_out, filter_eps=eps_filter)
    1648              : 
    1649          382 :       ELSE IF (transA_left == "T" .AND. transA_right == "N") THEN
    1650              :          ! Path 2: Out = A^T * B * A
    1651              :          CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_B, matrix_A, &
    1652          382 :                              0.0_dp, matrix_tmp, filter_eps=eps_filter)
    1653              :          CALL dbcsr_multiply("T", "N", 1.0_dp, matrix_A, matrix_tmp, &
    1654          382 :                              0.0_dp, matrix_out, filter_eps=eps_filter)
    1655              :       ELSE
    1656            0 :          CPABORT("Unsupported transposition pair in contract_A_B_A")
    1657              :       END IF
    1658              : 
    1659         1034 :       CALL dbcsr_release(matrix_tmp)
    1660              : 
    1661         1034 :       CALL timestop(handle)
    1662              : 
    1663         1034 :    END SUBROUTINE contract_A_B_A
    1664              : 
    1665              : ! **************************************************************************************************
    1666              : !> \brief Computes C = A ◦ B (Element-wise Hadamard product) for sparse DBCSR matrices.
    1667              : !> \param matrix_A ...
    1668              : !> \param matrix_B ...
    1669              : !> \param matrix_C ...
    1670              : !> \param fac (Scaling factor applied to the product)
    1671              : ! **************************************************************************************************
    1672              : 
    1673          804 :    SUBROUTINE hadamard_product(matrix_A, matrix_B, matrix_C, fac)
    1674              : 
    1675              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix_A, matrix_B, matrix_C
    1676              :       REAL(KIND=dp), INTENT(IN)                          :: fac
    1677              : 
    1678              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'hadamard_product'
    1679              : 
    1680              :       INTEGER                                            :: col, handle, row
    1681              :       LOGICAL                                            :: found
    1682          402 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: blk_B, blk_C
    1683              :       TYPE(dbcsr_iterator_type)                          :: iter
    1684              : 
    1685          402 :       CALL timeset(routineN, handle)
    1686              : 
    1687          402 :       CALL dbcsr_copy(matrix_C, matrix_A)
    1688              : 
    1689          402 :       CALL dbcsr_iterator_start(iter, matrix_C)
    1690        77923 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
    1691        77521 :          CALL dbcsr_iterator_next_block(iter, row, col, blk_C)
    1692              : 
    1693        77521 :          CALL dbcsr_get_block_p(matrix_B, row, col, blk_B, found)
    1694              : 
    1695        77923 :          IF (found) THEN
    1696    163879394 :             blk_C(:, :) = fac*blk_C(:, :)*blk_B(:, :)
    1697              :          ELSE
    1698              :             ! If B is sparse here, the product is zero
    1699            0 :             blk_C(:, :) = 0.0_dp
    1700              :          END IF
    1701              :       END DO
    1702          402 :       CALL dbcsr_iterator_stop(iter)
    1703              : 
    1704          402 :       CALL timestop(handle)
    1705              : 
    1706          402 :    END SUBROUTINE hadamard_product
    1707              : 
    1708              : ! **************************************************************************************************
    1709              : !> \brief Compute screened Coulomb interaction matrix
    1710              : !> \param bs_env ...
    1711              : !> \param qs_env ...
    1712              : !> \param mat_chi_Gamma_tau ...
    1713              : !> \param fm_W_time ...
    1714              : ! **************************************************************************************************
    1715              : 
    1716           40 :    SUBROUTINE compute_W(bs_env, qs_env, mat_chi_Gamma_tau, fm_W_time)
    1717              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    1718              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    1719              :       TYPE(dbcsr_p_type), DIMENSION(:), POINTER          :: mat_chi_Gamma_tau
    1720              :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:)        :: fm_W_time
    1721              : 
    1722              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'compute_W'
    1723              : 
    1724              :       INTEGER                                            :: handle, i_t, j_w
    1725              :       REAL(KIND=dp)                                      :: t1
    1726              :       TYPE(cp_fm_type)                                   :: fm_M_inv_V_sqrt, fm_V, fm_V_sqrt
    1727              : 
    1728           10 :       CALL timeset(routineN, handle)
    1729              : 
    1730           10 :       t1 = m_walltime()
    1731              : 
    1732           10 :       CALL create_fm_W_MIC_time(bs_env, fm_W_time)
    1733              : 
    1734              :       ! 1. Allocate V and M matrices
    1735           10 :       CALL cp_fm_create(fm_V, bs_env%fm_RI_RI%matrix_struct)
    1736           10 :       CALL cp_fm_create(fm_V_sqrt, bs_env%fm_RI_RI%matrix_struct)
    1737           10 :       CALL cp_fm_create(fm_M_inv_V_sqrt, bs_env%fm_RI_RI%matrix_struct)
    1738              : 
    1739              :       ! Compute V and M^-1 * V^0.5
    1740           10 :       CALL compute_V_MinvVsqrt(bs_env, qs_env, fm_V, fm_V_sqrt, fm_M_inv_V_sqrt)
    1741              : 
    1742              :       ! 2. Loop over frequencies
    1743          120 :       DO j_w = 1, bs_env%num_time_freq_points
    1744              :          ! Fourier transformation of χ_PQ(iτ) to χ_PQ(iω_j)
    1745          110 :          CALL compute_fm_chi_Gamma_freq(bs_env, bs_env%fm_chi_Gamma_freq, j_w, mat_chi_Gamma_tau)
    1746              : 
    1747              :          ! ε(iω_j) = Id - V^0.5*M^-1*χ(iω_j)*M^-1*V^0.5
    1748              :          ! W(iω_j) = V^0.5*(ε^-1(iω_j)-Id)*V^0.5
    1749              :          CALL compute_fm_W_freq(bs_env, bs_env%fm_chi_Gamma_freq, fm_V_sqrt, &
    1750          110 :                                 fm_M_inv_V_sqrt, bs_env%fm_W_MIC_freq)
    1751              : 
    1752              :          ! Fourier transform from W_PQ^MIC(iω_j) to W_PQ^MIC(iτ)
    1753          120 :          CALL Fourier_transform_w_to_t(bs_env, fm_W_time, bs_env%fm_W_MIC_freq, j_w)
    1754              :       END DO
    1755              : 
    1756              :       ! M^-1*W^MIC(iτ)*M^-1
    1757           10 :       CALL multiply_fm_W_MIC_time_with_Minv_Gamma(bs_env, qs_env, fm_W_time)
    1758              : 
    1759           10 :       IF (bs_env%unit_nr > 0) THEN
    1760              :          WRITE (bs_env%unit_nr, '(T2,A,T55,A,F10.1,A)') &
    1761            5 :             'Computed W(iτ),', ' Execution time', m_walltime() - t1, ' s'
    1762              :       END IF
    1763              : 
    1764           10 :       CALL dbcsr_deallocate_matrix_set(mat_chi_Gamma_tau)
    1765              : 
    1766              :       ! Cleanup
    1767           10 :       CALL cp_fm_release(fm_V)
    1768           10 :       CALL cp_fm_release(fm_V_sqrt)
    1769           10 :       CALL cp_fm_release(fm_M_inv_V_sqrt)
    1770              : 
    1771              :       ! Marek : Fourier transform W^MIC(itau) back to get it at a specific im.frequency point - iomega = 0
    1772           10 :       IF (bs_env%rtp_method == rtp_method_bse) THEN
    1773            0 :          t1 = m_walltime()
    1774            0 :          CALL cp_fm_create(bs_env%fm_W_MIC_freq_zero, bs_env%fm_W_MIC_freq%matrix_struct)
    1775              :          ! Set to zero
    1776            0 :          CALL cp_fm_set_all(bs_env%fm_W_MIC_freq_zero, 0.0_dp)
    1777              :          ! Sum over all times
    1778            0 :          DO i_t = 1, bs_env%num_time_freq_points
    1779              :             ! Add the relevant structure with correct weight
    1780              :             CALL cp_fm_scale_and_add(1.0_dp, bs_env%fm_W_MIC_freq_zero, &
    1781            0 :                                      bs_env%imag_time_weights_freq_zero(i_t), fm_W_time(i_t))
    1782              :          END DO
    1783              :          ! Done, save to file
    1784            0 :          CALL fm_write(bs_env%fm_W_MIC_freq_zero, 0, "W_freq_rtp", qs_env)
    1785              :          ! Report calculation
    1786            0 :          IF (bs_env%unit_nr > 0) THEN
    1787              :             WRITE (bs_env%unit_nr, '(T2,A,T55,A,F10.1,A)') &
    1788            0 :                'Computed W(0),', ' Execution time', m_walltime() - t1, ' s'
    1789              :          END IF
    1790              :       END IF
    1791              : 
    1792           10 :       IF (bs_env%unit_nr > 0) WRITE (bs_env%unit_nr, '(A)') ' '
    1793              : 
    1794           10 :       CALL timestop(handle)
    1795              : 
    1796           10 :    END SUBROUTINE compute_W
    1797              : 
    1798              : ! **************************************************************************************************
    1799              : !> \brief Computes V, V^0.5, and M^-1 * V^0.5
    1800              : !> \param bs_env ...
    1801              : !> \param qs_env ...
    1802              : !> \param fm_V ...
    1803              : !> \param fm_V_sqrt ...
    1804              : !> \param fm_Minv_Vsqrt ...
    1805              : ! **************************************************************************************************
    1806              : 
    1807           10 :    SUBROUTINE compute_V_MinvVsqrt(bs_env, qs_env, fm_V, fm_V_sqrt, fm_Minv_Vsqrt)
    1808              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    1809              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    1810              :       TYPE(cp_fm_type), INTENT(INOUT)                    :: fm_V, fm_V_sqrt, fm_Minv_Vsqrt
    1811              : 
    1812              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_V_MinvVsqrt'
    1813              : 
    1814              :       INTEGER                                            :: handle, info, n_RI, ndep
    1815           10 :       TYPE(atomic_kind_type), DIMENSION(:), POINTER      :: atomic_kind_set
    1816              :       TYPE(cell_type), POINTER                           :: cell
    1817              :       TYPE(cp_fm_type)                                   :: fm_work
    1818           10 :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :)     :: fm_M
    1819           10 :       TYPE(dbcsr_p_type), DIMENSION(:, :), POINTER       :: mat_V_kp
    1820           10 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
    1821           10 :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
    1822              : 
    1823           10 :       CALL timeset(routineN, handle)
    1824              : 
    1825           10 :       n_RI = bs_env%n_RI
    1826           10 :       CALL cp_fm_create(fm_work, fm_V%matrix_struct)
    1827              : 
    1828              :       ! -----------------------------------------------------------------------
    1829              :       ! 1. Build Coulomb Matrix V(k=0) using the kp-routine but only for ikp=1
    1830              :       ! -----------------------------------------------------------------------
    1831              :       CALL get_qs_env(qs_env=qs_env, particle_set=particle_set, cell=cell, &
    1832           10 :                       qs_kind_set=qs_kind_set, atomic_kind_set=atomic_kind_set)
    1833              : 
    1834           50 :       ALLOCATE (mat_V_kp(1:1, 1:2))
    1835           10 :       NULLIFY (mat_V_kp(1, 1)%matrix, mat_V_kp(1, 2)%matrix)
    1836           10 :       ALLOCATE (mat_V_kp(1, 1)%matrix, mat_V_kp(1, 2)%matrix)
    1837              : 
    1838           10 :       CALL dbcsr_create(mat_V_kp(1, 1)%matrix, template=bs_env%mat_RI_RI%matrix)
    1839           10 :       CALL dbcsr_reserve_all_blocks(mat_V_kp(1, 1)%matrix)
    1840           10 :       CALL dbcsr_set(mat_V_kp(1, 1)%matrix, 0.0_dp)
    1841              : 
    1842              :       ! Dummy imaginary part just to satisfy the routine
    1843           10 :       CALL dbcsr_create(mat_V_kp(1, 2)%matrix, template=bs_env%mat_RI_RI%matrix)
    1844           10 :       CALL dbcsr_reserve_all_blocks(mat_V_kp(1, 2)%matrix)
    1845           10 :       CALL dbcsr_set(mat_V_kp(1, 2)%matrix, 0.0_dp)
    1846              : 
    1847           40 :       bs_env%kpoints_chi_eps_W%nkp_grid = bs_env%nkp_grid_chi_eps_W_orig
    1848              : 
    1849              :       CALL build_2c_coulomb_matrix_kp(mat_V_kp, bs_env%kpoints_chi_eps_W, "RI_AUX", cell, &
    1850              :                                       particle_set, qs_kind_set, atomic_kind_set, &
    1851           10 :                                       bs_env%size_lattice_sum_V, operator_coulomb, 1, 1)
    1852              : 
    1853              :       ! Copy real part to fm_V
    1854           10 :       CALL copy_dbcsr_to_fm(mat_V_kp(1, 1)%matrix, fm_V)
    1855              : 
    1856           10 :       CALL dbcsr_deallocate_matrix(mat_V_kp(1, 1)%matrix)
    1857           10 :       CALL dbcsr_deallocate_matrix(mat_V_kp(1, 2)%matrix)
    1858           10 :       DEALLOCATE (mat_V_kp)
    1859              : 
    1860              :       ! -----------------------------------------------------------------------
    1861              :       ! 2. Get RI-Metric Matrix M(k=0)
    1862              :       ! -----------------------------------------------------------------------
    1863              :       CALL RI_2c_integral_mat(qs_env, fm_M, fm_V, n_RI, bs_env%ri_metric, &
    1864           10 :                               do_kpoints=.FALSE., regularization_RI=bs_env%regularization_RI)
    1865              : 
    1866              :       ! -----------------------------------------------------------------------
    1867              :       ! 3. M -> M^-1
    1868              :       ! -----------------------------------------------------------------------
    1869           10 :       CALL cp_fm_cholesky_decompose(fm_M(1, 1), info_out=info)
    1870           10 :       IF (info == 0) THEN
    1871           10 :          CALL cp_fm_cholesky_invert(fm_M(1, 1))
    1872           10 :          CALL cp_fm_uplo_to_full(fm_M(1, 1), fm_work)
    1873              :       ELSE
    1874              :          ! Fallback if Cholesky fails due to conditioning
    1875            0 :          CALL cp_fm_power(fm_M(1, 1), fm_work, -1.0_dp, bs_env%eps_eigval_mat_RI, ndep)
    1876            0 :          CALL cp_fm_to_fm(fm_work, fm_M(1, 1))
    1877              :       END IF
    1878              : 
    1879              :       ! -----------------------------------------------------------------------
    1880              :       ! 4. V -> V^0.5
    1881              :       ! -----------------------------------------------------------------------
    1882           10 :       CALL cp_fm_to_fm(fm_V, fm_V_sqrt)
    1883           10 :       CALL cp_fm_cholesky_decompose(fm_V_sqrt, info_out=info)
    1884           10 :       IF (info == 0) THEN
    1885           10 :          CALL clean_lower_part(fm_V_sqrt)
    1886              :       ELSE
    1887            0 :          CALL cp_fm_power(fm_V, fm_V_sqrt, 0.5_dp, bs_env%eps_eigval_mat_RI, ndep)
    1888              :       END IF
    1889              : 
    1890              :       ! -----------------------------------------------------------------------
    1891              :       ! 5. M^-1 * V^0.5
    1892              :       ! -----------------------------------------------------------------------
    1893              :       CALL parallel_gemm("N", "T", n_RI, n_RI, n_RI, 1.0_dp, fm_M(1, 1), fm_V_sqrt, &
    1894           10 :                          0.0_dp, fm_Minv_Vsqrt)
    1895              : 
    1896           10 :       CALL cp_fm_release(fm_M)
    1897           10 :       CALL cp_fm_release(fm_work)
    1898              : 
    1899           10 :       CALL timestop(handle)
    1900              : 
    1901           40 :    END SUBROUTINE compute_V_MinvVsqrt
    1902              : 
    1903              : ! **************************************************************************************************
    1904              : !> \brief Computes W(iω) from χ_PQ(iω_j)
    1905              : !> \param bs_env ...
    1906              : !> \param fm_chi_freq_j ...
    1907              : !> \param fm_V_sqrt ...
    1908              : !> \param fm_Minv_Vsqrt ...
    1909              : !> \param fm_W_freq_j ...
    1910              : ! **************************************************************************************************
    1911              : 
    1912          440 :    SUBROUTINE compute_fm_W_freq(bs_env, fm_chi_freq_j, fm_V_sqrt, fm_Minv_Vsqrt, fm_W_freq_j)
    1913              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    1914              :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_chi_freq_j, fm_V_sqrt, fm_Minv_Vsqrt
    1915              :       TYPE(cp_fm_type), INTENT(INOUT)                    :: fm_W_freq_j
    1916              : 
    1917              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'compute_fm_W_freq'
    1918              : 
    1919              :       INTEGER                                            :: handle, info, n_RI, ndep
    1920              :       TYPE(cp_fm_type)                                   :: fm_eps_freq_j, fm_work
    1921              : 
    1922          110 :       CALL timeset(routineN, handle)
    1923              : 
    1924          110 :       n_RI = bs_env%n_RI
    1925              : 
    1926          110 :       CALL cp_fm_create(fm_eps_freq_j, fm_chi_freq_j%matrix_struct)
    1927          110 :       CALL cp_fm_create(fm_work, fm_chi_freq_j%matrix_struct)
    1928              : 
    1929              :       ! -----------------------------------------------------------------------
    1930              :       ! 1. ε(iω_j) = Id - (M^-1 * V^0.5)^T * χ(iω_j) * (M^-1 * V^0.5)
    1931              :       ! -----------------------------------------------------------------------
    1932              :       ! work = χ(iω_j) * (M^-1 * V^0.5)
    1933              :       CALL parallel_gemm('N', 'N', n_RI, n_RI, n_RI, 1.0_dp, &
    1934          110 :                          fm_chi_freq_j, fm_Minv_Vsqrt, 0.0_dp, fm_work)
    1935              : 
    1936              :       ! eps_work = (M^-1 * V^0.5)^T * work
    1937              :       CALL parallel_gemm('T', 'N', n_RI, n_RI, n_RI, 1.0_dp, &
    1938          110 :                          fm_Minv_Vsqrt, fm_work, 0.0_dp, fm_eps_freq_j)
    1939              : 
    1940              :       ! ε(iω_j) = Id - eps_work  -->  -eps_work + Id
    1941          110 :       CALL fm_add_on_diag(fm_eps_freq_j, 1.0_dp)
    1942              : 
    1943              :       ! Force perfect symmetry before Cholesky to avoid info != 0 due to GEMM noise
    1944          110 :       CALL cp_fm_uplo_to_full(fm_eps_freq_j, fm_work)
    1945              : 
    1946              :       ! -----------------------------------------------------------------------
    1947              :       ! 2. W(iω_j) = V^0.5^T * (ε^-1(iω_j) - Id) * V^0.5
    1948              :       ! -----------------------------------------------------------------------
    1949              : 
    1950              :       ! a) Cholesky decomposition of ε(iω_j)
    1951          110 :       CALL cp_fm_cholesky_decompose(fm_eps_freq_j, info_out=info)
    1952              : 
    1953              :       ! b) Inversion
    1954          110 :       IF (info == 0) THEN
    1955          110 :          CALL cp_fm_cholesky_invert(fm_eps_freq_j)
    1956          110 :          CALL cp_fm_uplo_to_full(fm_eps_freq_j, fm_work)
    1957              :       ELSE
    1958              :          ! Fallback to expensive diagonalization if Cholesky fails
    1959            0 :          CALL cp_fm_power(fm_eps_freq_j, fm_work, -1.0_dp, bs_env%eps_eigval_mat_RI, ndep)
    1960            0 :          CALL cp_fm_to_fm(fm_work, fm_eps_freq_j)
    1961              :       END IF
    1962              : 
    1963              :       ! c) ε^-1(iω_j) - Id
    1964          110 :       CALL fm_add_on_diag(fm_eps_freq_j, -1.0_dp)
    1965              : 
    1966              :       ! d) work = (ε^-1(iω_j) - Id) * V^0.5
    1967              :       CALL parallel_gemm('N', 'N', n_RI, n_RI, n_RI, 1.0_dp, fm_eps_freq_j, fm_V_sqrt, &
    1968          110 :                          0.0_dp, fm_work)
    1969              : 
    1970              :       ! e) W(iw) = V^0.5^T * work
    1971              :       CALL parallel_gemm('T', 'N', n_RI, n_RI, n_RI, 1.0_dp, fm_V_sqrt, fm_work, &
    1972          110 :                          0.0_dp, fm_W_freq_j)
    1973              : 
    1974              :       ! Cleanup
    1975          110 :       CALL cp_fm_release(fm_work)
    1976          110 :       CALL cp_fm_release(fm_eps_freq_j)
    1977              : 
    1978          110 :       CALL timestop(handle)
    1979              : 
    1980          110 :    END SUBROUTINE compute_fm_W_freq
    1981              : 
    1982              : ! **************************************************************************************************
    1983              : !> \brief Adds a real scalar value to the diagonal of a real full matrix (fm)
    1984              : !> \param fm ...
    1985              : !> \param alpha ...
    1986              : ! **************************************************************************************************
    1987              : 
    1988          440 :    SUBROUTINE fm_add_on_diag(fm, alpha)
    1989              :       TYPE(cp_fm_type), INTENT(INOUT)                    :: fm
    1990              :       REAL(KIND=dp), INTENT(IN)                          :: alpha
    1991              : 
    1992              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'fm_add_on_diag'
    1993              : 
    1994              :       INTEGER                                            :: handle, i_global, i_row, j_col, &
    1995              :                                                             j_global, ncol_local, nrow_local
    1996          220 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, row_indices
    1997              : 
    1998          220 :       CALL timeset(routineN, handle)
    1999              : 
    2000              :       CALL cp_fm_get_info(matrix=fm, &
    2001              :                           nrow_local=nrow_local, &
    2002              :                           ncol_local=ncol_local, &
    2003              :                           row_indices=row_indices, &
    2004          220 :                           col_indices=col_indices)
    2005              : 
    2006         4880 :       DO j_col = 1, ncol_local
    2007         4660 :          j_global = col_indices(j_col)
    2008        54250 :          DO i_row = 1, nrow_local
    2009        49370 :             i_global = row_indices(i_row)
    2010        54030 :             IF (j_global == i_global) THEN
    2011         2330 :                fm%local_data(i_row, j_col) = fm%local_data(i_row, j_col) + alpha
    2012              :             END IF
    2013              :          END DO
    2014              :       END DO
    2015              : 
    2016          220 :       CALL timestop(handle)
    2017              : 
    2018          220 :    END SUBROUTINE fm_add_on_diag
    2019              : 
    2020              : ! **************************************************************************************************
    2021              : !> \brief Zeroes out the strictly lower triangular part of a real matrix
    2022              : !> \param fm_mat ...
    2023              : ! **************************************************************************************************
    2024           20 :    SUBROUTINE clean_lower_part(fm_mat)
    2025              :       TYPE(cp_fm_type)                                   :: fm_mat
    2026              : 
    2027              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'clean_lower_part'
    2028              : 
    2029              :       INTEGER                                            :: handle, i_row, j_col, j_global, &
    2030              :                                                             ncol_local, nrow_local
    2031           10 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, row_indices
    2032              : 
    2033           10 :       CALL timeset(routineN, handle)
    2034              : 
    2035              :       CALL cp_fm_get_info(matrix=fm_mat, &
    2036              :                           nrow_local=nrow_local, ncol_local=ncol_local, &
    2037           10 :                           row_indices=row_indices, col_indices=col_indices)
    2038              : 
    2039          222 :       DO j_col = 1, ncol_local
    2040          212 :          j_global = col_indices(j_col)
    2041         2470 :          DO i_row = 1, nrow_local
    2042         2460 :             IF (j_global < row_indices(i_row)) fm_mat%local_data(i_row, j_col) = 0.0_dp
    2043              :          END DO
    2044              :       END DO
    2045              : 
    2046           10 :       CALL timestop(handle)
    2047              : 
    2048           10 :    END SUBROUTINE clean_lower_part
    2049              : 
    2050              : ! **************************************************************************************************
    2051              : !> \brief Computes the exact exchange part of the GW self-energy
    2052              : !> \param bs_env ...
    2053              : !> \param qs_env ...
    2054              : !> \param mat_phi_mu_l ...
    2055              : !> \param mat_Z_lP ...
    2056              : !> \param fm_Sigma_x_Gamma ...
    2057              : ! **************************************************************************************************
    2058              : 
    2059           10 :    SUBROUTINE compute_Sigma_x(bs_env, qs_env, mat_phi_mu_l, mat_Z_lP, fm_Sigma_x_Gamma)
    2060              : 
    2061              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    2062              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    2063              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: mat_phi_mu_l, mat_Z_lP
    2064              :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:)        :: fm_Sigma_x_Gamma
    2065              : 
    2066              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'compute_Sigma_x'
    2067              : 
    2068              :       INTEGER                                            :: handle, ispin
    2069           10 :       INTEGER, DIMENSION(:), POINTER                     :: blk_aux, blk_grid, dist_col_grid, &
    2070           10 :                                                             dist_row_aux
    2071              :       REAL(KIND=dp)                                      :: t1
    2072           10 :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :)     :: fm_Vtr_Gamma
    2073              :       TYPE(dbcsr_distribution_type)                      :: dist_aux_aux, dist_grid_grid
    2074              :       TYPE(dbcsr_type)                                   :: mat_Sigma_x_Gamma, matrix_D_grid, &
    2075              :                                                             matrix_Sigma_x_grid, matrix_V_aux, &
    2076              :                                                             matrix_V_grid
    2077              : 
    2078           10 :       CALL timeset(routineN, handle)
    2079              : 
    2080           10 :       t1 = m_walltime()
    2081              : 
    2082           42 :       ALLOCATE (fm_Sigma_x_Gamma(bs_env%n_spin))
    2083           22 :       DO ispin = 1, bs_env%n_spin
    2084           22 :          CALL cp_fm_create(fm_Sigma_x_Gamma(ispin), bs_env%fm_s_Gamma%matrix_struct)
    2085              :       END DO
    2086              : 
    2087           10 :       CALL dbcsr_create(mat_Sigma_x_Gamma, template=bs_env%mat_ao_ao%matrix)
    2088              : 
    2089              :       ! =========================================================================
    2090              :       ! 1. SETUP CORE TOPOLOGIES
    2091              :       ! =========================================================================
    2092           10 :       CALL setup_square_topology(mat_phi_mu_l, 'ROW', dist_grid_grid, blk_grid, dist_col_grid)
    2093           10 :       CALL setup_square_topology(mat_Z_lP, 'COL', dist_aux_aux, blk_aux, dist_row_aux)
    2094              : 
    2095              :       ! =========================================================================
    2096              :       ! 2. COMPUTE V^tr_ll'
    2097              :       ! =========================================================================
    2098              :       CALL RI_2c_integral_mat(qs_env, fm_Vtr_Gamma, bs_env%fm_RI_RI, bs_env%n_RI, &
    2099           10 :                               bs_env%trunc_coulomb, do_kpoints=.FALSE.)
    2100              : 
    2101              :       ! M^-1 * V^tr * M^-1 directly modifies fm_Vtr_Gamma(:, 1)
    2102           10 :       CALL multiply_fm_W_MIC_time_with_Minv_Gamma(bs_env, qs_env, fm_Vtr_Gamma(:, 1))
    2103              : 
    2104           10 :       CALL dbcsr_create(matrix_V_aux, "V_aux", dist_aux_aux, dbcsr_type_no_symmetry, blk_aux, blk_aux)
    2105           10 :       CALL dbcsr_create(matrix_V_grid, "V_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    2106              : 
    2107           10 :       CALL copy_fm_to_dbcsr(fm_Vtr_Gamma(1, 1), matrix_V_aux, keep_sparsity=.FALSE.)
    2108              : 
    2109              :       ! V^tr_ll' = sum_PQ Z_lP V^trunc_PQ Z_l'Q
    2110           10 :       CALL contract_A_B_A("N", "T", mat_Z_lP, matrix_V_aux, matrix_V_grid, bs_env%eps_filter)
    2111           10 :       CALL dbcsr_release(matrix_V_aux)
    2112              : 
    2113              :       ! =========================================================================
    2114              :       ! 3. SPIN LOOP FOR EXACT EXCHANGE
    2115              :       ! =========================================================================
    2116           22 :       DO ispin = 1, bs_env%n_spin
    2117              : 
    2118              :          ! Density matrix on grid is essentially G_occ at tau = 0.0
    2119              :          ! D_µν  = sum_n^occ C_µn C_νn
    2120              :          ! D_ll' = sum_µν Φ_µ(r_l) D_µν Φ_ν(r_l')
    2121           12 :          CALL dbcsr_create(matrix_D_grid, "D_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    2122           12 :          CALL build_G_grid(bs_env, 0.0_dp, ispin, .TRUE., .FALSE., mat_phi_mu_l, matrix_D_grid, bs_env%eps_filter)
    2123              : 
    2124              :          ! Element-wise Hadamard product: Σ^x_grid = D_grid ◦ V_grid
    2125              :          ! Σ^x_ll' = D_ll' * V^tr_ll'
    2126           12 :          CALL dbcsr_create(matrix_Sigma_x_grid, template=matrix_V_grid)
    2127           12 :          CALL hadamard_product(matrix_D_grid, matrix_V_grid, matrix_Sigma_x_grid, 1.0_dp)
    2128              : 
    2129           12 :          CALL dbcsr_release(matrix_D_grid)
    2130              : 
    2131              :          ! Transform back to AO basis: Σ^x_ao = -1.0 * phi^T * Σ^x_grid * phi
    2132              :          ! Σ^x_λσ   = -sum_ll' Φ_λ(r_l) Σ^x_ll' Φ_σ(r_l')
    2133           12 :          CALL contract_A_B_A("T", "N", mat_phi_mu_l, matrix_Sigma_x_grid, mat_Sigma_x_Gamma, bs_env%eps_filter)
    2134           12 :          CALL dbcsr_scale(mat_Sigma_x_Gamma, -1.0_dp)
    2135              : 
    2136           12 :          CALL dbcsr_release(matrix_Sigma_x_grid)
    2137              : 
    2138              :          ! Data I/O and Export to CP2K Full Matrices
    2139           22 :          CALL copy_dbcsr_to_fm(mat_Sigma_x_Gamma, fm_Sigma_x_Gamma(ispin))
    2140              : 
    2141              :       END DO ! ispin
    2142              : 
    2143           10 :       IF (bs_env%unit_nr > 0) THEN
    2144              :          WRITE (bs_env%unit_nr, '(T2,A,T58,A,F7.1,A)') &
    2145            5 :             'Computed Σ^x(k=0),', ' Execution time', m_walltime() - t1, ' s'
    2146            5 :          WRITE (bs_env%unit_nr, '(A)') ' '
    2147              :       END IF
    2148              : 
    2149              :       ! =========================================================================
    2150              :       ! 4. CLEANUP
    2151              :       ! =========================================================================
    2152              :       CALL release_dbcsr_topology_and_matrices(dist=dist_grid_grid, mapped_dist=dist_col_grid, &
    2153           10 :                                                m1=mat_Sigma_x_Gamma, m2=matrix_V_grid)
    2154           10 :       CALL release_dbcsr_topology_and_matrices(dist=dist_aux_aux, mapped_dist=dist_row_aux)
    2155              : 
    2156           10 :       CALL cp_fm_release(fm_Vtr_Gamma)
    2157              : 
    2158           10 :       CALL timestop(handle)
    2159              : 
    2160           30 :    END SUBROUTINE compute_Sigma_x
    2161              : 
    2162              : ! **************************************************************************************************
    2163              : !> \brief Computes the correlation part of the GW self-energy
    2164              : !> \param bs_env ...
    2165              : !> \param fm_W_time ...
    2166              : !> \param mat_phi_mu_l ...
    2167              : !> \param mat_Z_lP ...
    2168              : !> \param fm_Sigma_c_Gamma_time ...
    2169              : ! **************************************************************************************************
    2170              : 
    2171           10 :    SUBROUTINE compute_Sigma_c(bs_env, fm_W_time, mat_phi_mu_l, mat_Z_lP, fm_Sigma_c_Gamma_time)
    2172              : 
    2173              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    2174              :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:)        :: fm_W_time
    2175              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: mat_phi_mu_l, mat_Z_lP
    2176              :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :, :)  :: fm_Sigma_c_Gamma_time
    2177              : 
    2178              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'compute_Sigma_c'
    2179              : 
    2180              :       INTEGER                                            :: handle, i_t, ispin
    2181           10 :       INTEGER, DIMENSION(:), POINTER                     :: blk_aux, blk_grid, dist_col_grid, &
    2182           10 :                                                             dist_row_aux
    2183              :       REAL(KIND=dp)                                      :: t1, tau
    2184              :       TYPE(dbcsr_distribution_type)                      :: dist_aux_aux, dist_grid_grid
    2185           10 :       TYPE(dbcsr_p_type), DIMENSION(:, :), POINTER       :: mat_Sigma_neg_tau, mat_Sigma_pos_tau
    2186              :       TYPE(dbcsr_type) :: matrix_G_occ_grid, matrix_G_vir_grid, matrix_Sigma_neg_grid, &
    2187              :          matrix_Sigma_pos_grid, matrix_W_aux, matrix_W_grid
    2188              : 
    2189           10 :       CALL timeset(routineN, handle)
    2190              : 
    2191              :       ! =========================================================================
    2192              :       ! 1. SETUP CORE TOPOLOGIES AND PRE-ALLOCATE OUTPUT ARRAYS
    2193              :       ! =========================================================================
    2194           10 :       CALL setup_square_topology(mat_phi_mu_l, 'ROW', dist_grid_grid, blk_grid, dist_col_grid)
    2195           10 :       CALL setup_square_topology(mat_Z_lP, 'COL', dist_aux_aux, blk_aux, dist_row_aux)
    2196              : 
    2197              :       ! Pre-allocate local DBCSR matrices to act as targets for final output
    2198           10 :       NULLIFY (mat_Sigma_neg_tau, mat_Sigma_pos_tau)
    2199          182 :       ALLOCATE (mat_Sigma_neg_tau(bs_env%num_time_freq_points, bs_env%n_spin))
    2200          182 :       ALLOCATE (mat_Sigma_pos_tau(bs_env%num_time_freq_points, bs_env%n_spin))
    2201              : 
    2202          120 :       DO i_t = 1, bs_env%num_time_freq_points
    2203          250 :          DO ispin = 1, bs_env%n_spin
    2204          130 :             ALLOCATE (mat_Sigma_neg_tau(i_t, ispin)%matrix)
    2205          130 :             ALLOCATE (mat_Sigma_pos_tau(i_t, ispin)%matrix)
    2206          130 :             CALL dbcsr_create(mat_Sigma_neg_tau(i_t, ispin)%matrix, template=bs_env%mat_ao_ao%matrix)
    2207          240 :             CALL dbcsr_create(mat_Sigma_pos_tau(i_t, ispin)%matrix, template=bs_env%mat_ao_ao%matrix)
    2208              :          END DO
    2209              :       END DO
    2210              : 
    2211              :       ! =========================================================================
    2212              :       ! 2. MAIN IMAGINARY TIME LOOP
    2213              :       ! =========================================================================
    2214          120 :       DO i_t = 1, bs_env%num_time_freq_points
    2215          110 :          tau = bs_env%imag_time_points(i_t)
    2216              : 
    2217              :          ! -------------------------------------------------------------------
    2218              :          ! Compute W_grid = Z * W_aux * Z^T
    2219              :          ! -------------------------------------------------------------------
    2220          110 :          CALL dbcsr_create(matrix_W_aux, "W_aux", dist_aux_aux, dbcsr_type_no_symmetry, blk_aux, blk_aux)
    2221          110 :          CALL dbcsr_create(matrix_W_grid, "W_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    2222              : 
    2223          110 :          CALL copy_fm_to_dbcsr(fm_W_time(i_t), matrix_W_aux, keep_sparsity=.FALSE.)
    2224              : 
    2225              :          ! W^MIC_ll'(iτ,k=0) = sum_PQ Z_lP W^MIC_PQ(iτ) Z_l'Q
    2226          110 :          CALL contract_A_B_A("N", "T", mat_Z_lP, matrix_W_aux, matrix_W_grid, bs_env%eps_filter)
    2227              : 
    2228          110 :          CALL dbcsr_release(matrix_W_aux) ! Clean up aux basis immediately
    2229              : 
    2230          240 :          DO ispin = 1, bs_env%n_spin
    2231          130 :             t1 = m_walltime()
    2232              : 
    2233              :             ! -------------------------------------------------------------------
    2234              :             ! A. Transform Green's Functions to the Grid
    2235              :             ! -------------------------------------------------------------------
    2236          130 :             CALL dbcsr_create(matrix_G_occ_grid, "G_occ_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    2237          130 :             CALL dbcsr_create(matrix_G_vir_grid, "G_vir_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    2238              : 
    2239              :             ! G^occ_µλ(i|τ|) = sum_G^occ_µλn^occ C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
    2240              :             ! G^occ_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^occ_µν Φ_ν(r_l')
    2241          130 :             CALL build_G_grid(bs_env, tau, ispin, .TRUE., .FALSE., mat_phi_mu_l, matrix_G_occ_grid, bs_env%eps_filter)
    2242              : 
    2243              :             ! G^vir_µλ(i|τ|) = sum_n^vir C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
    2244              :             ! G^vir_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^vir_µν Φ_ν(r_l')
    2245          130 :             CALL build_G_grid(bs_env, tau, ispin, .FALSE., .TRUE., mat_phi_mu_l, matrix_G_vir_grid, bs_env%eps_filter)
    2246              : 
    2247              :             ! -------------------------------------------------------------------
    2248              :             ! B. Element-wise Hadamard Products for Sigma_c on Grid
    2249              :             ! Σ_neg_grid = G_occ_grid ◦ W_grid
    2250              :             ! Σ_pos_grid = G_vir_grid ◦ W_grid
    2251              :             ! -------------------------------------------------------------------
    2252          130 :             CALL dbcsr_create(matrix_Sigma_neg_grid, template=matrix_W_grid)
    2253          130 :             CALL dbcsr_create(matrix_Sigma_pos_grid, template=matrix_W_grid)
    2254              : 
    2255              :             ! Σ^c_ll'(iτ)   = -G^occ_ll'(i|τ|) * W^MIC_ll'(iτ), for τ < 0
    2256          130 :             CALL hadamard_product(matrix_G_occ_grid, matrix_W_grid, matrix_Sigma_neg_grid, 1.0_dp)
    2257              : 
    2258              :             ! Σ^c_ll'(iτ)   =  G^vir_ll'(i|τ|) * W^MIC_ll'(iτ), for τ > 0
    2259          130 :             CALL hadamard_product(matrix_G_vir_grid, matrix_W_grid, matrix_Sigma_pos_grid, 1.0_dp)
    2260              : 
    2261              :             ! Instantly purge massive G_grid arrays to save memory
    2262          130 :             CALL dbcsr_release(matrix_G_occ_grid)
    2263          130 :             CALL dbcsr_release(matrix_G_vir_grid)
    2264              : 
    2265              :             ! -------------------------------------------------------------------
    2266              :             ! C. Transform Sigma back to AO Basis
    2267              :             ! Σ_AO = phi^T * Σ_grid * phi
    2268              :             ! -------------------------------------------------------------------
    2269              : 
    2270              :             ! Σ^c_λσ(iτ)    = sum_ll' Φ_λ(r_l) Σ^c_ll'(iτ) Φ_σ(r_l'), for τ < 0
    2271              :             CALL contract_A_B_A("T", "N", mat_phi_mu_l, matrix_Sigma_neg_grid, &
    2272          130 :                                 mat_Sigma_neg_tau(i_t, ispin)%matrix, bs_env%eps_filter)
    2273          130 :             CALL dbcsr_scale(mat_Sigma_neg_tau(i_t, ispin)%matrix, -1.0_dp)
    2274              : 
    2275              :             ! Σ^c_λσ(iτ)    = sum_ll' Φ_λ(r_l) Σ^c_ll'(iτ) Φ_σ(r_l'), for τ > 0
    2276              :             CALL contract_A_B_A("T", "N", mat_phi_mu_l, matrix_Sigma_pos_grid, &
    2277          130 :                                 mat_Sigma_pos_tau(i_t, ispin)%matrix, bs_env%eps_filter)
    2278              : 
    2279              :             ! Purge Grid Sigma arrays
    2280          130 :             CALL dbcsr_release(matrix_Sigma_neg_grid)
    2281          130 :             CALL dbcsr_release(matrix_Sigma_pos_grid)
    2282              : 
    2283          240 :             IF (bs_env%unit_nr > 0) THEN
    2284              :                WRITE (bs_env%unit_nr, '(T2,A,I10,A,I3,A,F7.1,A)') &
    2285           65 :                   'Computed Σ^c(iτ) for time point ', i_t, ' /', bs_env%num_time_freq_points, &
    2286          130 :                   ',    Execution time', m_walltime() - t1, ' s'
    2287              :             END IF
    2288              : 
    2289              :          END DO ! ispin
    2290              : 
    2291              :          ! Release the W_grid for this time point
    2292          120 :          CALL dbcsr_release(matrix_W_grid)
    2293              : 
    2294              :       END DO ! i_t
    2295              : 
    2296           10 :       IF (bs_env%unit_nr > 0) WRITE (bs_env%unit_nr, '(A)') ' '
    2297              : 
    2298              :       ! -------------------------------------------------------------------------
    2299              :       ! 3. FINALIZE AND CLEANUP
    2300              :       ! -------------------------------------------------------------------------
    2301              :       CALL fill_fm_Sigma_c_Gamma_time(fm_Sigma_c_Gamma_time, bs_env, &
    2302           10 :                                       mat_Sigma_pos_tau, mat_Sigma_neg_tau)
    2303              : 
    2304           10 :       CALL cp_fm_release(fm_W_time)
    2305              : 
    2306           10 :       CALL dbcsr_deallocate_matrix_set(mat_Sigma_neg_tau)
    2307           10 :       CALL dbcsr_deallocate_matrix_set(mat_Sigma_pos_tau)
    2308              : 
    2309           10 :       CALL release_dbcsr_topology_and_matrices(dist=dist_grid_grid, mapped_dist=dist_col_grid)
    2310           10 :       CALL release_dbcsr_topology_and_matrices(dist=dist_aux_aux, mapped_dist=dist_row_aux)
    2311              : 
    2312           10 :       CALL delete_unnecessary_files(bs_env)
    2313           10 :       CALL timestop(handle)
    2314              : 
    2315           10 :    END SUBROUTINE compute_Sigma_c
    2316              : 
    2317              : ! **************************************************************************************************
    2318              : !> \brief DBCSR Topology Generation
    2319              : !> \param matrix_template ...
    2320              : !> \param dim_type ...
    2321              : !> \param square_dist ...
    2322              : !> \param blk_sizes ...
    2323              : !> \param mapped_dist ...
    2324              : ! **************************************************************************************************
    2325              : 
    2326          572 :    SUBROUTINE setup_square_topology(matrix_template, dim_type, square_dist, blk_sizes, mapped_dist)
    2327              : 
    2328              :       TYPE(dbcsr_type), INTENT(IN)                       :: matrix_template
    2329              :       CHARACTER(LEN=*), INTENT(IN)                       :: dim_type
    2330              :       TYPE(dbcsr_distribution_type), INTENT(OUT)         :: square_dist
    2331              :       INTEGER, DIMENSION(:), INTENT(OUT), POINTER        :: blk_sizes, mapped_dist
    2332              : 
    2333              :       INTEGER                                            :: i, np
    2334          572 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk, col_dist, row_blk, row_dist
    2335              :       TYPE(dbcsr_distribution_type)                      :: dist_template
    2336              : 
    2337              :       CALL dbcsr_get_info(matrix_template, distribution=dist_template, &
    2338          572 :                           row_blk_size=row_blk, col_blk_size=col_blk)
    2339          572 :       CALL dbcsr_distribution_get(dist_template, row_dist=row_dist, col_dist=col_dist)
    2340              : 
    2341          572 :       IF (TRIM(dim_type) == 'ROW') THEN
    2342              :          ! Creates ROW x ROW (e.g., Grid x Grid from mat_phi_mu_l)
    2343           20 :          blk_sizes => row_blk
    2344           76 :          np = MAXVAL(col_dist) + 1  ! npcol
    2345           60 :          ALLOCATE (mapped_dist(SIZE(blk_sizes)))
    2346          408 :          DO i = 1, SIZE(blk_sizes)
    2347          408 :             mapped_dist(i) = MOD(i - 1, np)
    2348              :          END DO
    2349              :          CALL dbcsr_distribution_new(square_dist, template=dist_template, &
    2350           20 :                                      row_dist=row_dist, col_dist=mapped_dist)
    2351              : 
    2352          552 :       ELSE IF (TRIM(dim_type) == 'COL') THEN
    2353              :          ! Creates COL x COL (e.g., Aux x Aux from mat_Z_lP)
    2354          552 :          blk_sizes => col_blk
    2355        11376 :          np = MAXVAL(row_dist) + 1  ! nprow
    2356         1656 :          ALLOCATE (mapped_dist(SIZE(blk_sizes)))
    2357         2040 :          DO i = 1, SIZE(blk_sizes)
    2358         2040 :             mapped_dist(i) = MOD(i - 1, np)
    2359              :          END DO
    2360              :          CALL dbcsr_distribution_new(square_dist, template=dist_template, &
    2361          552 :                                      row_dist=mapped_dist, col_dist=col_dist)
    2362              :       END IF
    2363              : 
    2364          572 :    END SUBROUTINE setup_square_topology
    2365              : 
    2366              : ! **************************************************************************************************
    2367              : !> \brief DBCSR matrices deallocation
    2368              : !> \param dist ...
    2369              : !> \param mapped_dist    ...
    2370              : !> \param m1 ...
    2371              : !> \param m2 ...
    2372              : !> \param m3 ...
    2373              : !> \param m4 ...
    2374              : ! **************************************************************************************************
    2375              : 
    2376          572 :    SUBROUTINE release_dbcsr_topology_and_matrices(dist, mapped_dist, m1, m2, m3, m4)
    2377              : 
    2378              :       TYPE(dbcsr_distribution_type), INTENT(INOUT), &
    2379              :          OPTIONAL                                        :: dist
    2380              :       INTEGER, DIMENSION(:), INTENT(INOUT), OPTIONAL, &
    2381              :          POINTER                                         :: mapped_dist
    2382              :       TYPE(dbcsr_type), INTENT(INOUT), OPTIONAL          :: m1, m2, m3, m4
    2383              : 
    2384          572 :       IF (PRESENT(dist)) CALL dbcsr_distribution_release(dist)
    2385          572 :       IF (PRESENT(mapped_dist)) THEN
    2386          572 :          IF (ASSOCIATED(mapped_dist)) THEN
    2387          572 :             DEALLOCATE (mapped_dist)
    2388              :             NULLIFY (mapped_dist)
    2389              :          END IF
    2390              :       END IF
    2391          572 :       IF (PRESENT(m1)) CALL dbcsr_release(m1)
    2392          572 :       IF (PRESENT(m2)) CALL dbcsr_release(m2)
    2393          572 :       IF (PRESENT(m3)) CALL dbcsr_release(m3)
    2394          572 :       IF (PRESENT(m4)) CALL dbcsr_release(m4)
    2395              : 
    2396          572 :    END SUBROUTINE release_dbcsr_topology_and_matrices
    2397              : 
    2398              : END MODULE gw_non_periodic_ri_rs
        

Generated by: LCOV version 2.0-1