LCOV - code coverage report
Current view: top level - src - gw_large_cell_gamma_ri_rs.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:c24029e) Lines: 0.0 % 573 0
Test Date: 2026-07-04 06:36:57 Functions: 0.0 % 18 0

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2026 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief GW using RI-RS Approximation for molecules
      10              : !> \par History
      11              : !>      04.2026 created [Ritaj Tyagi]
      12              : ! **************************************************************************************************
      13              : MODULE gw_large_cell_Gamma_ri_rs
      14              :    USE atomic_kind_types,               ONLY: atomic_kind_type
      15              :    USE basis_set_types,                 ONLY: gto_basis_set_type
      16              :    USE cell_types,                      ONLY: cell_type,&
      17              :                                               get_cell,&
      18              :                                               pbc
      19              :    USE constants_operator,              ONLY: operator_coulomb
      20              :    USE cp_blacs_env,                    ONLY: cp_blacs_env_create,&
      21              :                                               cp_blacs_env_release,&
      22              :                                               cp_blacs_env_type
      23              :    USE cp_dbcsr_api,                    ONLY: &
      24              :         dbcsr_add, dbcsr_binary_read, dbcsr_binary_write, dbcsr_copy, dbcsr_create, &
      25              :         dbcsr_deallocate_matrix, dbcsr_distribution_get, dbcsr_distribution_new, &
      26              :         dbcsr_distribution_release, dbcsr_distribution_type, dbcsr_finalize, dbcsr_get_block_p, &
      27              :         dbcsr_get_info, dbcsr_iterator_blocks_left, dbcsr_iterator_next_block, &
      28              :         dbcsr_iterator_start, dbcsr_iterator_stop, dbcsr_iterator_type, dbcsr_multiply, &
      29              :         dbcsr_p_type, dbcsr_put_block, dbcsr_release, dbcsr_scale, dbcsr_set, dbcsr_type, &
      30              :         dbcsr_type_no_symmetry
      31              :    USE cp_dbcsr_contrib,                ONLY: dbcsr_reserve_all_blocks
      32              :    USE cp_dbcsr_operations,             ONLY: copy_dbcsr_to_fm,&
      33              :                                               copy_fm_to_dbcsr,&
      34              :                                               dbcsr_deallocate_matrix_set,&
      35              :                                               max_elements_per_block
      36              :    USE cp_fm_basic_linalg,              ONLY: cp_fm_scale_and_add,&
      37              :                                               cp_fm_uplo_to_full
      38              :    USE cp_fm_cholesky,                  ONLY: cp_fm_cholesky_decompose,&
      39              :                                               cp_fm_cholesky_invert
      40              :    USE cp_fm_diag,                      ONLY: cp_fm_power
      41              :    USE cp_fm_struct,                    ONLY: cp_fm_struct_type
      42              :    USE cp_fm_types,                     ONLY: cp_fm_create,&
      43              :                                               cp_fm_get_info,&
      44              :                                               cp_fm_release,&
      45              :                                               cp_fm_set_all,&
      46              :                                               cp_fm_to_fm,&
      47              :                                               cp_fm_type
      48              :    USE cp_log_handling,                 ONLY: cp_get_default_logger,&
      49              :                                               cp_logger_type
      50              :    USE cp_output_handling,              ONLY: cp_p_file,&
      51              :                                               cp_print_key_should_output
      52              :    USE gw_integrals,                    ONLY: build_3c_integral_block_ctx,&
      53              :                                               gw_3c_ctx_create,&
      54              :                                               gw_3c_ctx_release,&
      55              :                                               gw_3c_ctx_type,&
      56              :                                               gw_3c_ws_create,&
      57              :                                               gw_3c_ws_release,&
      58              :                                               gw_3c_ws_type
      59              :    USE gw_large_cell_gamma,             ONLY: &
      60              :         Fourier_transform_w_to_t, G_occ_vir, compute_QP_energies, compute_fm_chi_Gamma_freq, &
      61              :         create_fm_W_MIC_time, delete_unnecessary_files, fill_fm_Sigma_c_Gamma_time, fm_write, &
      62              :         get_W_MIC, multiply_fm_W_MIC_time_with_Minv_Gamma
      63              :    USE gw_non_periodic_ri_rs,           ONLY: get_basis_offsets,&
      64              :                                               precompute_ri_rs_radii,&
      65              :                                               ri_rs_grid_assembler,&
      66              :                                               solve_D_lp_distributed
      67              :    USE gw_utils,                        ONLY: de_init_bs_env
      68              :    USE input_constants,                 ONLY: rtp_method_bse
      69              :    USE input_section_types,             ONLY: section_vals_type
      70              :    USE kinds,                           ONLY: dp
      71              :    USE kpoint_coulomb_2c,               ONLY: build_2c_coulomb_matrix_kp
      72              :    USE machine,                         ONLY: m_walltime
      73              :    USE message_passing,                 ONLY: mp_para_env_type
      74              :    USE mp2_ri_2c,                       ONLY: RI_2c_integral_mat
      75              :    USE orbital_pointers,                ONLY: indco,&
      76              :                                               ncoset
      77              :    USE parallel_gemm_api,               ONLY: parallel_gemm
      78              :    USE particle_types,                  ONLY: particle_type
      79              :    USE post_scf_bandstructure_types,    ONLY: post_scf_bandstructure_type
      80              :    USE qs_environment_types,            ONLY: get_qs_env,&
      81              :                                               qs_environment_type
      82              :    USE qs_kind_types,                   ONLY: get_qs_kind,&
      83              :                                               qs_kind_type
      84              : #include "./base/base_uses.f90"
      85              : 
      86              :    IMPLICIT NONE
      87              : 
      88              :    PRIVATE
      89              : 
      90              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'gw_large_cell_Gamma_ri_rs'
      91              : 
      92              :    PUBLIC :: gw_calc_large_cell_Gamma_ri_rs
      93              : 
      94              : CONTAINS
      95              : 
      96              : ! **************************************************************************************************
      97              : !> \brief GW calculation using RI-RS formalism for molecules
      98              : !> \param qs_env ...
      99              : !> \param bs_env ...
     100              : ! **************************************************************************************************
     101              : 
     102            0 :    SUBROUTINE gw_calc_large_cell_Gamma_ri_rs(qs_env, bs_env)
     103              : 
     104              :       TYPE(qs_environment_type), POINTER                 :: qs_env
     105              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
     106              : 
     107              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'gw_calc_large_cell_Gamma_ri_rs'
     108              : 
     109              :       INTEGER                                            :: handle
     110            0 :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:)        :: fm_Sigma_x_Gamma, fm_W_time
     111            0 :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :, :)  :: fm_Sigma_c_Gamma_time
     112              : 
     113            0 :       CALL timeset(routineN, handle)
     114              : 
     115              :        !!========================================================================
     116              :        !! 0. Precompute AO and RI Radii
     117              :        !!    Per-atom cutoff radii from the most diffuse Gaussian primitives in
     118              :        !!    the AO and RI auxiliary basis sets. Stored in bs_env%ri_rs%
     119              :        !!    radius_ao_per_atom and radius_ri_per_atom, used for sphere-cutoff
     120              :        !!    and phi_local screening.
     121              :        !!========================================================================
     122            0 :       CALL precompute_ri_rs_radii(qs_env, bs_env)
     123              : 
     124              :        !!========================================================================
     125              :        !! 1. Grid Generation for RI-RS
     126              :        !!    (Modified Lebedev grids from Ivan Duchemin and Xavier Blase)
     127              :        !!    Generate flattened 1D array of grid points for RI-RS.
     128              :        !!    Equation: r_g(k) = R_A + r_g(A)
     129              :        !!========================================================================
     130            0 :       CALL ri_rs_grid_assembler(qs_env, bs_env, bs_env%ri_rs%grid_points)
     131              : 
     132              :        !!========================================================================
     133              :        !! 2. Atomic Basis Evaluation
     134              :        !!    Compute values of spherical atomic basis functions at grid points.
     135              :        !!    Expression: Φ_μl = Φ_μ(r_l) (mat_phi_mu_l)
     136              :        !!========================================================================
     137              :       CALL atomic_basis_at_grid_point(qs_env, bs_env, bs_env%ri_rs%grid_points, &
     138            0 :                                       bs_env%ri_rs%mat_phi_mu_l)
     139              : 
     140              :        !!========================================================================
     141              :        !! 3. Compute RI-RS Coefficients (Z_lp)
     142              :        !!    Solve the regularized system for each atom P, where the grid domain
     143              :        !!    is restricted to r_l within a cutoff distance of atom P:
     144              :        !!    a. D_ll' = [ Σ_μ Φ_μ(r_l) Φ_μ(r_l') ]^2 (Equation 13)
     145              :        !!    b. D_lP  = Σ_{μν} Φ_μ(r_l) Φ_ν(r_l) (μν|P) (Equation 15)
     146              :        !!    c. Conditioning:
     147              :        !!       Dvec_l   = 1 / sqrt(D_ll)  (Diagonal scaling vector)
     148              :        !!       D'_ll' = Dvec_l * D_ll' * Dvec_l' + λδ_ll'
     149              :        !!       D'_lP  = Dvec_l * D_lP
     150              :        !!    d. Solve: Σ_l' D'_ll' * Z'_l'P = D'_lP (Equation 14)
     151              :        !!    e. Rescale: Z_lP = Z'_lP * Dvec_l   (Z_lP stored in mat_Z_lP)
     152              :        !!========================================================================
     153              :       CALL compute_coeff_Z_lP(qs_env, bs_env, bs_env%ri_rs%grid_points, &
     154            0 :                               bs_env%ri_rs%mat_phi_mu_l, bs_env%ri_rs%mat_Z_lP)
     155              : 
     156              :        !!========================================================================
     157              :        !! 4. Compute Independent-Particle Polarizability (χ)
     158              :        !!    G^occ_µλ(i|τ|)  = sum_n^occ C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
     159              :        !!    G^vir_µλ(i|τ|)  = sum_n^vir C_µn e^(-|(ϵ_n-ϵ_F)τ|) C_λn
     160              :        !!    G^occ_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^occ_µν Φ_ν(r_l')
     161              :        !!    G^vir_ll'(i|τ|) = sum_µν Φ_µ(r_l) G^vir_µν Φ_ν(r_l')
     162              :        !!    χ_ll'(iτ)       = G^occ_ll'(i|τ|) * G^vir_ll'(i|τ|)
     163              :        !!    χ_PQ(iτ)        = sum_ll' Z_lP χ_ll'(iτ) Z_l'Q
     164              :        !!========================================================================
     165              :       CALL get_mat_chi_Gamma_tau(bs_env, bs_env%mat_chi_Gamma_tau, &
     166            0 :                                  bs_env%ri_rs%mat_phi_mu_l, bs_env%ri_rs%mat_Z_lP)
     167              : 
     168              :        !!========================================================================
     169              :        !! 5. Compute Screened Interaction (W^MIC)
     170              :        !!    χ_PQ(iτ) -> χ_PQ(iω) -> ε_PQ(iω) -> W_PQ(iω) -> W^MIC_PQ(iτ)
     171              :        !!========================================================================
     172            0 :       CALL get_W_MIC(bs_env, qs_env, bs_env%mat_chi_Gamma_tau, fm_W_time)
     173              : 
     174              :        !!========================================================================
     175              :        !! 6. Compute Exact Exchange Self-Energy (Σ^x)
     176              :        !!    D_µν          = sum_n^occ C_µn C_νn
     177              :        !!    D_ll'         = sum_µν Φ_µ(r_l) D_µν Φ_ν(r_l')
     178              :        !!    V^trunc_ll'   = sum_PQ Z_lP V^trunc_PQ Z_l'Q
     179              :        !!    Σ^x_ll'       = D_ll' * V^trunc_ll'
     180              :        !!    Σ^x_λσ(k=0)   = -sum_ll' Φ_λ(r_l) Σ^x_ll' Φ_σ(r_l')
     181              :        !!========================================================================
     182              :       CALL compute_Sigma_x(bs_env, qs_env, bs_env%ri_rs%mat_phi_mu_l, &
     183            0 :                            bs_env%ri_rs%mat_Z_lP, fm_Sigma_x_Gamma)
     184              : 
     185              :        !!========================================================================
     186              :        !! 7. Compute Correlation Self-Energy (Σ^c)
     187              :        !!    W^MIC_ll'(iτ) =  sum_PQ Z_lP W^MIC_PQ(iτ) Z_l'Q
     188              :        !!    Σ^c_ll'(iτ)   = -G^occ_ll'(i|τ|) * W^MIC_ll'(iτ), for τ < 0
     189              :        !!    Σ^c_ll'(iτ)   =  G^vir_ll'(i|τ|) * W^MIC_ll'(iτ), for τ > 0
     190              :        !!    Σ^c_λσ(iτ)    =  sum_ll' Φ_λ(r_l) Σ^c_ll'(iτ) Φ_σ(r_l')
     191              :        !!========================================================================
     192              :       CALL compute_Sigma_c(bs_env, fm_W_time, bs_env%ri_rs%mat_phi_mu_l, &
     193            0 :                            bs_env%ri_rs%mat_Z_lP, fm_Sigma_c_Gamma_time)
     194              : 
     195              :        !!========================================================================
     196              :        !! 8. Compute Quasiparticle Energies
     197              :        !!    Σ^c_λσ(iτ) -> Σ^c_nn(ϵ)
     198              :        !!    ϵ_nk^GW = ϵ_nk^DFT + Σ^c_nn(ϵ) + Σ^x_nn - v^xc_nn
     199              :        !!========================================================================
     200            0 :       CALL compute_QP_energies(bs_env, qs_env, fm_Sigma_x_Gamma, fm_Sigma_c_Gamma_time)
     201              : 
     202            0 :       CALL de_init_bs_env(bs_env)
     203              : 
     204            0 :       CALL timestop(handle)
     205              : 
     206            0 :    END SUBROUTINE gw_calc_large_cell_Gamma_ri_rs
     207              : 
     208              : ! **************************************************************************************************
     209              : !> \brief Evaluates atomic basis functions on a real-space grid and builds a sparse DBCSR matrix.
     210              : !> \param qs_env ...
     211              : !> \param bs_env ...
     212              : !> \param ri_rs_grid_points ...
     213              : !> \param mat_phi_mu_l ...
     214              : ! **************************************************************************************************
     215              : 
     216            0 :    SUBROUTINE atomic_basis_at_grid_point(qs_env, bs_env, ri_rs_grid_points, mat_phi_mu_l)
     217              : 
     218              :       TYPE(qs_environment_type), POINTER                 :: qs_env
     219              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
     220              :       REAL(KIND=dp), ALLOCATABLE, INTENT(INOUT)          :: ri_rs_grid_points(:, :)
     221              :       TYPE(dbcsr_type), INTENT(OUT)                      :: mat_phi_mu_l
     222              : 
     223              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'atomic_basis_at_grid_point'
     224              : 
     225              :       INTEGER :: c_size, chunk_size, dimen_ORB, handle, i, i_blk, iatom, natom, npcol, nprow, &
     226              :          num_grid_chunks, r_end, r_start, total_grid_npts
     227              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: first_sgf
     228            0 :       INTEGER, DIMENSION(:), POINTER                     :: c_blk_sizes, col_dist, col_dist_ks, &
     229            0 :                                                             r_blk_sizes, row_dist, row_dist_ks
     230            0 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: atom_col_buffer
     231            0 :       TYPE(atomic_kind_type), DIMENSION(:), POINTER      :: atomic_kind_set
     232              :       TYPE(cell_type), POINTER                           :: cell
     233              :       TYPE(dbcsr_distribution_type)                      :: dist
     234              :       TYPE(dbcsr_distribution_type), POINTER             :: dbcsr_dist_ks
     235              :       TYPE(mp_para_env_type), POINTER                    :: para_env
     236            0 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     237            0 :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
     238              : 
     239            0 :       CALL timeset(routineN, handle)
     240              : 
     241              :       ! Setup Grid Blocking
     242            0 :       chunk_size = max_elements_per_block
     243              : 
     244              :       ! Extract environment variables
     245              :       CALL get_qs_env(qs_env, cell=cell, atomic_kind_set=atomic_kind_set, &
     246              :                       qs_kind_set=qs_kind_set, particle_set=particle_set, &
     247            0 :                       para_env=para_env)
     248              : 
     249            0 :       natom = SIZE(particle_set)
     250            0 :       total_grid_npts = SIZE(ri_rs_grid_points, 2)
     251              : 
     252              :       ! Map the starting indices of spherical gaussian functions (SGF) for each atom
     253            0 :       ALLOCATE (first_sgf(natom + 1))
     254            0 :       CALL get_basis_offsets(particle_set, qs_kind_set, first_sgf, dimen_ORB)
     255              : 
     256              :       ! =========================================================================
     257              :       ! 1. SETUP DBCSR MATRIX TOPOLOGY
     258              :       ! =========================================================================
     259              : 
     260              :       ! A. Define Column Block Sizes (1 Block = 1 Atom's full basis set)
     261            0 :       ALLOCATE (c_blk_sizes(natom))
     262            0 :       DO iatom = 1, natom
     263            0 :          c_blk_sizes(iatom) = first_sgf(iatom + 1) - first_sgf(iatom)
     264              :       END DO
     265              : 
     266              :       ! B. Define Row Block Sizes (Grid chunks of max size 256)
     267            0 :       num_grid_chunks = CEILING(REAL(total_grid_npts, KIND=dp)/REAL(chunk_size, KIND=dp))
     268            0 :       ALLOCATE (r_blk_sizes(num_grid_chunks))
     269            0 :       r_blk_sizes = chunk_size
     270            0 :       IF (MOD(total_grid_npts, chunk_size) /= 0) THEN
     271            0 :          r_blk_sizes(num_grid_chunks) = MOD(total_grid_npts, chunk_size)
     272              :       END IF
     273              : 
     274              :       ! C. Fetch CP2K's Default Process Grid Configuration
     275            0 :       CALL get_qs_env(qs_env, dbcsr_dist=dbcsr_dist_ks)
     276            0 :       CALL dbcsr_distribution_get(dbcsr_dist_ks, row_dist=row_dist_ks, col_dist=col_dist_ks)
     277              : 
     278              :       ! D. Build Custom Mappings using Round-Robin across the 2D process grid
     279              :       !    (MAXVAL + 1 accounts for 0-based process indexing in DBCSR)
     280            0 :       nprow = MAXVAL(row_dist_ks) + 1
     281            0 :       npcol = MAXVAL(col_dist_ks) + 1
     282              : 
     283            0 :       ALLOCATE (row_dist(num_grid_chunks))
     284            0 :       DO i = 1, num_grid_chunks
     285            0 :          row_dist(i) = MOD(i - 1, nprow)
     286              :       END DO
     287              : 
     288            0 :       ALLOCATE (col_dist(natom))
     289            0 :       DO i = 1, natom
     290            0 :          col_dist(i) = MOD(i - 1, npcol)
     291              :       END DO
     292              : 
     293              :       ! E. Create the DBCSR Distribution and Initialize the Matrix
     294              :       CALL dbcsr_distribution_new(dist, template=dbcsr_dist_ks, &
     295            0 :                                   row_dist=row_dist, col_dist=col_dist)
     296              : 
     297              :       CALL dbcsr_create(mat_phi_mu_l, name="phi_val_sparse", dist=dist, &
     298              :                         matrix_type=dbcsr_type_no_symmetry, &
     299            0 :                         row_blk_size=r_blk_sizes, col_blk_size=c_blk_sizes)
     300              : 
     301              :       ! =========================================================================
     302              :       ! 2. STREAM DATA DIRECTLY INTO SPARSE MATRIX
     303              :       ! =========================================================================
     304              :       ! Iterate over the atoms assigned to this specific MPI rank
     305            0 :       DO iatom = para_env%mepos + 1, natom, para_env%num_pe
     306              : 
     307            0 :          c_size = c_blk_sizes(iatom)
     308              : 
     309              :          ! Allocate a temporary dense buffer just for this specific atom
     310            0 :          ALLOCATE (atom_col_buffer(total_grid_npts, c_size))
     311            0 :          atom_col_buffer = 0.0_dp
     312              : 
     313              :          ! Evaluate the basis functions on the grid. Skip grid points outside the spatial
     314              :          ! extent of the most diffuse AO Gaussian on iatom; beyond that radius the contribution
     315              :          ! is guaranteed below eps_filter.
     316              :          CALL fill_phi_for_atom(atom_col_buffer, ri_rs_grid_points, total_grid_npts, &
     317              :                                 iatom, particle_set, qs_kind_set, cell, &
     318            0 :                                 r2_threshold=bs_env%ri_rs%radius_ao_per_atom(iatom)**2)
     319              : 
     320              :          ! Slice the dense column into chunks and insert into DBCSR
     321            0 :          DO i_blk = 1, num_grid_chunks
     322            0 :             r_start = (i_blk - 1)*chunk_size + 1
     323            0 :             r_end = MIN(i_blk*chunk_size, total_grid_npts)
     324              : 
     325              :             ! Apply dynamic sparsity filtering: Only store blocks with physical significance
     326            0 :             IF (MAXVAL(ABS(atom_col_buffer(r_start:r_end, 1:c_size))) > bs_env%eps_filter) THEN
     327              :                CALL dbcsr_put_block(mat_phi_mu_l, row=i_blk, col=iatom, &
     328            0 :                                     block=atom_col_buffer(r_start:r_end, 1:c_size))
     329              :             END IF
     330              :          END DO
     331              : 
     332            0 :          DEALLOCATE (atom_col_buffer)
     333              : 
     334              :       END DO
     335              : 
     336              :       ! Finalize triggers internal MPI communication to route blocks to their correct 2D process owners
     337            0 :       CALL dbcsr_finalize(mat_phi_mu_l)
     338              : 
     339            0 :       IF (bs_env%unit_nr > 0) THEN
     340            0 :          WRITE (bs_env%unit_nr, *) "Done with evaluation of phi"
     341              :       END IF
     342              : 
     343              :       ! -------------------------------------------------------------------------
     344              :       ! CLEANUP
     345              :       ! -------------------------------------------------------------------------
     346            0 :       DEALLOCATE (first_sgf, r_blk_sizes, c_blk_sizes, row_dist, col_dist)
     347            0 :       CALL dbcsr_distribution_release(dist)
     348              : 
     349            0 :       CALL timestop(handle)
     350              : 
     351            0 :    END SUBROUTINE atomic_basis_at_grid_point
     352              : 
     353              : ! **************************************************************************************************
     354              : !> \brief Compute value of all basis functions for a single atom across all grid points.
     355              : !>        Sums contributions from periodic images of `iatom` (loop over (ix, iy, iz) cells gated
     356              : !>        by `cell%perd`). Each per-image squared distance is compared against `r2_threshold`
     357              : !>        (per-atom AO Gaussian extent²); images beyond that radius contribute below eps_filter
     358              : !>        and are skipped.
     359              : !> \param phi_val ...
     360              : !> \param ri_rs_grid ...
     361              : !> \param npts ...
     362              : !> \param iatom ...
     363              : !> \param particle_set ...
     364              : !> \param qs_kind_set ...
     365              : !> \param cell ...
     366              : !> \param r2_threshold per-image squared-distance threshold; CYCLE if r² > r2_threshold. Pass
     367              : !>        HUGE(1.0_dp) to disable.
     368              : ! **************************************************************************************************
     369              : 
     370            0 :    SUBROUTINE fill_phi_for_atom(phi_val, ri_rs_grid, npts, iatom, &
     371              :                                 particle_set, qs_kind_set, cell, r2_threshold)
     372              : 
     373              :       REAL(KIND=dp), INTENT(INOUT)                       :: phi_val(:, :)
     374              :       INTEGER, INTENT(IN)                                :: npts
     375              :       REAL(KIND=dp), INTENT(IN)                          :: ri_rs_grid(3, npts)
     376              :       INTEGER, INTENT(IN)                                :: iatom
     377              :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     378              :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
     379              :       TYPE(cell_type), POINTER                           :: cell
     380              :       REAL(KIND=dp), INTENT(IN)                          :: r2_threshold
     381              : 
     382              :       INTEGER :: first_sgf, i_pt, ico, iend_co, ikind, ipgf, iset, isgf, ishell, istart_co, ix, &
     383              :          ix_max, ix_min, iy, iy_max, iy_min, iz, iz_max, iz_min, l, last_sgf, lx, ly, lz, &
     384              :          n_cart_total, row_idx
     385              :       REAL(KIND=dp)                                      :: alpha, cell_vector(3), dist_vec(3), &
     386              :                                                             dist_vec_raw(3), exp_val, poly, r2, &
     387              :                                                             r_atom(3), weight
     388              :       REAL(KIND=dp), DIMENSION(3, 3)                     :: hmat
     389              :       TYPE(gto_basis_set_type), POINTER                  :: orb_basis_set
     390              : 
     391              :       ! Get Atom Info
     392            0 :       ikind = particle_set(iatom)%atomic_kind%kind_number
     393            0 :       CALL get_qs_kind(qs_kind_set(ikind), basis_set=orb_basis_set, basis_type="ORB")
     394            0 :       CALL get_cell(cell=cell, h=hmat)
     395              : 
     396            0 :       IF (.NOT. ASSOCIATED(orb_basis_set)) RETURN
     397              : 
     398            0 :       IF (cell%perd(1) == 1) THEN; ix_min = -1; ix_max = 1; ELSE; ix_min = 0; ix_max = 0; END IF
     399            0 :       IF (cell%perd(2) == 1) THEN; iy_min = -1; iy_max = 1; ELSE; iy_min = 0; iy_max = 0; END IF
     400            0 :       IF (cell%perd(3) == 1) THEN; iz_min = -1; iz_max = 1; ELSE; iz_min = 0; iz_max = 0; END IF
     401              : 
     402            0 :       r_atom = particle_set(iatom)%r
     403              : 
     404              :       !$OMP PARALLEL DO DEFAULT(NONE) &
     405              :       !$OMP SHARED(phi_val, ri_rs_grid, npts, orb_basis_set, r_atom, hmat, &
     406              :       !$OMP        ncoset, indco, cell, ix_min, ix_max, &
     407              :       !$OMP        iy_min, iy_max, iz_min, iz_max, r2_threshold) &
     408              :       !$OMP PRIVATE(i_pt, dist_vec_raw, ix, iy, iz, cell_vector, dist_vec, r2, iset, &
     409              :       !$OMP         n_cart_total, ishell, l, istart_co, iend_co, first_sgf, last_sgf, &
     410              :       !$OMP         ipgf, alpha, exp_val, isgf, ico, row_idx, weight, lx, ly, lz, poly) &
     411            0 :       !$OMP SCHEDULE(DYNAMIC)
     412              : 
     413              :       DO i_pt = 1, npts
     414              : 
     415              :          dist_vec_raw = ri_rs_grid(:, i_pt) - r_atom
     416              : 
     417              :          DO ix = ix_min, ix_max
     418              :          DO iy = iy_min, iy_max
     419              :          DO iz = iz_min, iz_max
     420              : 
     421              :             cell_vector(1:3) = MATMUL(hmat, REAL([ix, iy, iz], dp))
     422              : 
     423              :             dist_vec = dist_vec_raw - cell_vector
     424              : 
     425              :             r2 = DOT_PRODUCT(dist_vec, dist_vec)
     426              : 
     427              :             IF (r2 > r2_threshold) CYCLE
     428              : 
     429              :             DO iset = 1, orb_basis_set%nset
     430              :                n_cart_total = ncoset(orb_basis_set%lmax(iset))
     431              : 
     432              :                DO ishell = 1, orb_basis_set%nshell(iset)
     433              :                   l = orb_basis_set%l(ishell, iset)
     434              :                   istart_co = ncoset(l - 1) + 1
     435              :                   iend_co = ncoset(l)
     436              : 
     437              :                   first_sgf = orb_basis_set%first_sgf(ishell, iset)
     438              :                   last_sgf = orb_basis_set%last_sgf(ishell, iset)
     439              : 
     440              :                   DO ipgf = 1, orb_basis_set%npgf(iset)
     441              :                      alpha = orb_basis_set%zet(ipgf, iset)
     442              :                      exp_val = EXP(-alpha*r2)
     443              : 
     444              :                      DO isgf = first_sgf, last_sgf
     445              :                         DO ico = istart_co, iend_co
     446              :                            row_idx = (ipgf - 1)*n_cart_total + ico
     447              :                            weight = orb_basis_set%sphi(row_idx, isgf)
     448              :                            lx = indco(1, ico)
     449              :                            ly = indco(2, ico)
     450              :                            lz = indco(3, ico)
     451              :                            poly = (dist_vec(1)**lx)*(dist_vec(2)**ly)*(dist_vec(3)**lz)
     452              : 
     453              :                            phi_val(i_pt, isgf) = phi_val(i_pt, isgf) + (weight*poly*exp_val)
     454              : 
     455              :                         END DO
     456              :                      END DO
     457              :                   END DO
     458              :                END DO
     459              :             END DO
     460              :          END DO
     461              :          END DO
     462              :          END DO
     463              :       END DO
     464              :       !$OMP END PARALLEL DO
     465              : 
     466              :    END SUBROUTINE fill_phi_for_atom
     467              : 
     468              : ! **************************************************************************************************
     469              : !> \brief Compute RI-RS Coefficients (Z_lP)
     470              : !> \param qs_env ...
     471              : !> \param bs_env ...
     472              : !> \param ri_rs_grid_points ...
     473              : !> \param mat_phi_mu_l ...
     474              : !> \param mat_Z_lP ...
     475              : ! **************************************************************************************************
     476              : 
     477            0 :    SUBROUTINE compute_coeff_Z_lP(qs_env, bs_env, ri_rs_grid_points, mat_phi_mu_l, mat_Z_lP)
     478              : 
     479              :       ! Arguments
     480              :       TYPE(qs_environment_type), POINTER                 :: qs_env
     481              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
     482              :       REAL(KIND=dp), ALLOCATABLE, INTENT(INOUT)          :: ri_rs_grid_points(:, :)
     483              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: mat_phi_mu_l
     484              :       TYPE(dbcsr_type), INTENT(OUT)                      :: mat_Z_lP
     485              : 
     486              :       CHARACTER(LEN=*), PARAMETER :: key = 'PROPERTIES%BANDSTRUCTURE%GW%PRINT%RESTART', &
     487              :          routineN = 'compute_coeff_Z_lP'
     488              : 
     489              :       INTEGER :: atom_j_mepos, atom_j_stride, atom_P, atom_P_start, atom_P_stride, col_end, &
     490              :          col_start, current_chunk_size, g, group_handle, handle, i, i_blk, ikind, info, j, j_ri, &
     491              :          l, loc_idx, loc_ptr, max_ao_size, max_loc_ri, my_group, n_ao_total, n_grid_total, &
     492              :          n_groups, n_loc_ri, n_local_grid, n_procs_per_atom, natom, nkind, num_grid_chunks, &
     493              :          P_loop_atom, r_end, r_start, source_atom
     494            0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: local_grid_idx, row_offset
     495            0 :       INTEGER, DIMENSION(:), POINTER                     :: col_dist_phi, col_dist_ri, r_blk_sizes, &
     496            0 :                                                             ri_blk_sizes, row_dist_grid
     497              :       REAL(KIND=dp)                                      :: cutoff_ri, cutoff_ri_2, d_sP, dist2_min, &
     498              :                                                             r2_threshold, r_c, t1, t2, t3
     499            0 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: cutoff_ri_per_atom, cutoff_ri_per_kind, &
     500            0 :                                                             d_vec_local
     501            0 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: D_local, d_lp_local, phi_local, &
     502            0 :                                                             sphere_grid, Z_blk
     503              :       REAL(KIND=dp), DIMENSION(3)                        :: dist_vec_raw, pos_P
     504            0 :       TYPE(atomic_kind_type), DIMENSION(:), POINTER      :: atomic_kind_set
     505              :       TYPE(cell_type), POINTER                           :: cell
     506              :       TYPE(cp_blacs_env_type), POINTER                   :: blacs_env_sub
     507              :       TYPE(cp_fm_struct_type), POINTER                   :: fm_struct_b, fm_struct_D
     508              :       TYPE(cp_fm_type)                                   :: fm_b, fm_D
     509              :       TYPE(cp_logger_type), POINTER                      :: logger
     510              :       TYPE(dbcsr_distribution_type)                      :: dist_phi, dist_Z
     511            0 :       TYPE(gw_3c_ctx_type)                               :: ctx_3c
     512              :       TYPE(mp_para_env_type), POINTER                    :: para_env, para_env_sub
     513            0 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     514            0 :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
     515              :       TYPE(section_vals_type), POINTER                   :: input
     516              : 
     517            0 :       CALL timeset(routineN, handle)
     518              : 
     519            0 :       t1 = m_walltime()
     520              : 
     521              :       CALL get_qs_env(qs_env, para_env=para_env, particle_set=particle_set, input=input, &
     522            0 :                       cell=cell, qs_kind_set=qs_kind_set, atomic_kind_set=atomic_kind_set)
     523              : 
     524              :       ! ---------------------------------------------------------------------
     525              :       ! Subgroup setup. Default G=1 keeps the single-rank BLAS path; G>1 splits
     526              :       ! ranks into atom-groups so the Cholesky on D_local distributes across G
     527              :       ! ranks (memory ~1/G) and the compute_d_lp build also splits across the
     528              :       ! subgroup. G=1 leaves para_env_sub / blacs_env_sub NULL — no subgroup
     529              :       ! comms created, atom_P loop uses per-rank round-robin, compute_d_lp runs
     530              :       ! its full atom_j range on each rank, no allreduce.
     531              :       ! ---------------------------------------------------------------------
     532            0 :       n_procs_per_atom = MIN(bs_env%ri_rs%n_procs_per_atom_z_lp, para_env%num_pe)
     533            0 :       IF (n_procs_per_atom < 1) n_procs_per_atom = 1
     534              : 
     535            0 :       NULLIFY (para_env_sub, blacs_env_sub)
     536            0 :       IF (n_procs_per_atom > 1) THEN
     537            0 :          n_groups = para_env%num_pe/n_procs_per_atom
     538            0 :          my_group = MIN(para_env%mepos/n_procs_per_atom, n_groups - 1)
     539            0 :          ALLOCATE (para_env_sub)
     540            0 :          CALL para_env_sub%from_split(para_env, my_group)
     541            0 :          CALL cp_blacs_env_create(blacs_env=blacs_env_sub, para_env=para_env_sub)
     542            0 :          atom_P_start = my_group + 1
     543            0 :          atom_P_stride = n_groups
     544            0 :          atom_j_mepos = para_env_sub%mepos
     545            0 :          atom_j_stride = para_env_sub%num_pe
     546              :       ELSE
     547            0 :          atom_P_start = para_env%mepos + 1
     548            0 :          atom_P_stride = para_env%num_pe
     549            0 :          atom_j_mepos = 0
     550            0 :          atom_j_stride = 1
     551              :       END IF
     552              : 
     553            0 :       natom = SIZE(bs_env%i_RI_start_from_atom)
     554            0 :       n_ao_total = bs_env%i_ao_end_from_atom(natom)
     555            0 :       n_grid_total = SIZE(ri_rs_grid_points, 2)
     556              : 
     557              :       ! =========================================================================
     558              :       ! 1. SETUP DBCSR TOPOLOGY & EXACT OFFSETS
     559              :       ! =========================================================================
     560            0 :       CALL dbcsr_get_info(mat_phi_mu_l, row_blk_size=r_blk_sizes, distribution=dist_phi)
     561            0 :       CALL dbcsr_distribution_get(dist_phi, row_dist=row_dist_grid, col_dist=col_dist_phi, group=group_handle)
     562              : 
     563            0 :       num_grid_chunks = SIZE(r_blk_sizes)
     564              : 
     565            0 :       ALLOCATE (row_offset(num_grid_chunks))
     566            0 :       row_offset(1) = 0
     567            0 :       DO i_blk = 2, num_grid_chunks
     568            0 :          row_offset(i_blk) = row_offset(i_blk - 1) + r_blk_sizes(i_blk - 1)
     569              :       END DO
     570              : 
     571            0 :       ALLOCATE (ri_blk_sizes(natom), col_dist_ri(natom))
     572            0 :       DO atom_P = 1, natom
     573            0 :          ri_blk_sizes(atom_P) = bs_env%i_RI_end_from_atom(atom_P) - bs_env%i_RI_start_from_atom(atom_P) + 1
     574            0 :          col_dist_ri(atom_P) = MOD(atom_P - 1, MAXVAL(col_dist_phi) + 1)
     575              :       END DO
     576              : 
     577            0 :       CALL dbcsr_distribution_new(dist_Z, template=dist_phi, row_dist=row_dist_grid, col_dist=col_dist_ri)
     578              : 
     579            0 :       IF (bs_env%ri_rs%Z_lP_exists) THEN
     580              :          CALL dbcsr_binary_read(filepath=TRIM(bs_env%prefix)//"Z_lP.matrix", &
     581              :                                 distribution=dist_Z, &
     582            0 :                                 matrix_new=mat_Z_lP)
     583            0 :          IF (bs_env%unit_nr > 0) THEN
     584              :             WRITE (bs_env%unit_nr, '(T2,A,T57,A,F7.1,A)') &
     585            0 :                'Read Z_lP from file ', ' Execution time', m_walltime() - t1, ' s'
     586            0 :             WRITE (bs_env%unit_nr, '(A)') ' '
     587              :          END IF
     588              :       ELSE
     589              : 
     590              :          CALL dbcsr_create(mat_Z_lP, name="mat_Z_lP", dist=dist_Z, &
     591              :                            matrix_type=dbcsr_type_no_symmetry, &
     592            0 :                            row_blk_size=r_blk_sizes, col_blk_size=ri_blk_sizes)
     593              : 
     594            0 :          max_ao_size = 0
     595            0 :          DO j = 1, SIZE(bs_env%i_ao_start_from_atom)
     596            0 :             max_ao_size = MAX(max_ao_size, bs_env%i_ao_end_from_atom(j) - bs_env%i_ao_start_from_atom(j) + 1)
     597              :          END DO
     598            0 :          max_loc_ri = MAXVAL(ri_blk_sizes)
     599              : 
     600              :          ! Per-atom RI-RS integration sphere:
     601              :          !   cutoff_ri(P) = r_c + r_AO(P)
     602              :          ! where r_c is the truncated-Coulomb cutoff of the RI metric. The
     603              :          ! CUTOFF_RADIUS_RI_RS keyword (when > 0) overrides the entire cutoff calculation.
     604            0 :          nkind = SIZE(atomic_kind_set)
     605            0 :          ALLOCATE (cutoff_ri_per_atom(natom))
     606              : 
     607            0 :          IF (bs_env%ri_rs%cutoff_radius_ri_rs > 0.0_dp) THEN
     608            0 :             cutoff_ri_per_atom(:) = bs_env%ri_rs%cutoff_radius_ri_rs
     609              :          ELSE
     610            0 :             r_c = bs_env%ri_metric%cutoff_radius
     611            0 :             DO P_loop_atom = 1, natom
     612              :                cutoff_ri_per_atom(P_loop_atom) = &
     613            0 :                   r_c + bs_env%ri_rs%radius_ao_per_atom(P_loop_atom)
     614              :             END DO
     615              :          END IF
     616              : 
     617            0 :          ALLOCATE (cutoff_ri_per_kind(nkind))
     618            0 :          cutoff_ri_per_kind(:) = 0.0_dp
     619            0 :          IF (bs_env%unit_nr > 0) THEN
     620            0 :             DO P_loop_atom = 1, natom
     621            0 :                ikind = particle_set(P_loop_atom)%atomic_kind%kind_number
     622              :                cutoff_ri_per_kind(ikind) = MAX(cutoff_ri_per_kind(ikind), &
     623            0 :                                                cutoff_ri_per_atom(P_loop_atom))
     624              :             END DO
     625            0 :             WRITE (bs_env%unit_nr, '(T2,A)') 'Per-kind maximum RI-RS sphere cutoff (Bohr):'
     626            0 :             WRITE (bs_env%unit_nr, '(T4,A4,A14)') 'Kind', 'max cutoff_ri'
     627            0 :             DO ikind = 1, nkind
     628              :                WRITE (bs_env%unit_nr, '(T4,A4,F14.4)') &
     629            0 :                   atomic_kind_set(ikind)%element_symbol, &
     630            0 :                   cutoff_ri_per_kind(ikind)
     631              :             END DO
     632            0 :             WRITE (bs_env%unit_nr, '(A)') ' '
     633            0 :             DEALLOCATE (cutoff_ri_per_kind)
     634              :          END IF
     635              : 
     636              :          ! Shared 3c-integral context: hoists libint / t_c_g0 / md_ftable / contracted
     637              :          ! sphi tables out of the per-triple call so compute_d_lp threads only allocate
     638              :          ! a lightweight per-thread workspace. MPI-collective; must be outside any
     639              :          ! OMP region.
     640              :          CALL gw_3c_ctx_create(ctx_3c, qs_env, bs_env%ri_metric, &
     641              :                                basis_j=bs_env%basis_set_AO, basis_k=bs_env%basis_set_AO, &
     642            0 :                                basis_i=bs_env%basis_set_RI)
     643              : 
     644              :          ! =========================================================================
     645              :          ! 2. MPI LOOP OVER ATOMS (Fully independent, no MPI barriers inside)
     646              :          ! phi_local for each atom_P's cutoff sphere is built on the fly via
     647              :          ! fill_phi_for_atom — no dense replicated phi_global, no allreduce.
     648              :          ! =========================================================================
     649            0 :          DO atom_P = atom_P_start, natom, atom_P_stride
     650              : 
     651            0 :             n_loc_ri = ri_blk_sizes(atom_P)
     652            0 :             pos_P(:) = particle_set(atom_P)%r(:)
     653              : 
     654            0 :             cutoff_ri = cutoff_ri_per_atom(atom_P)
     655            0 :             cutoff_ri_2 = cutoff_ri**2
     656              : 
     657              :             ! ---------------------------------------------------------------------
     658              :             ! A. Determine Local Grid Domain based on cutoff_ri (PBC distance)
     659              :             ! ---------------------------------------------------------------------
     660            0 :             n_local_grid = 0
     661            0 :             DO l = 1, n_grid_total
     662            0 :                dist_vec_raw = pbc(ri_rs_grid_points(1:3, l), pos_P(1:3), cell)
     663            0 :                dist2_min = DOT_PRODUCT(dist_vec_raw, dist_vec_raw)
     664            0 :                IF (dist2_min <= cutoff_ri_2) n_local_grid = n_local_grid + 1
     665              :             END DO
     666              : 
     667            0 :             ALLOCATE (local_grid_idx(n_local_grid))
     668              : 
     669            0 :             n_local_grid = 0
     670            0 :             DO l = 1, n_grid_total
     671            0 :                dist_vec_raw = pbc(ri_rs_grid_points(1:3, l), pos_P(1:3), cell)
     672            0 :                dist2_min = DOT_PRODUCT(dist_vec_raw, dist_vec_raw)
     673            0 :                IF (dist2_min <= cutoff_ri_2) THEN
     674            0 :                   n_local_grid = n_local_grid + 1
     675            0 :                   local_grid_idx(n_local_grid) = l
     676              :                END IF
     677              :             END DO
     678              : 
     679              :             ! ---------------------------------------------------------------------
     680              :             ! B. Build phi_local on the fly via fill_phi_for_atom.
     681              :             !    Only source atoms whose AO basis can reach the cutoff sphere of
     682              :             !    atom_P (MIC distance) contribute; the rest are pruned. The
     683              :             !    periodic fill_phi_for_atom sums over (ix, iy, iz) images of
     684              :             !    source_atom internally.
     685              :             ! ---------------------------------------------------------------------
     686            0 :             ALLOCATE (sphere_grid(3, n_local_grid))
     687            0 :             DO loc_idx = 1, n_local_grid
     688            0 :                sphere_grid(:, loc_idx) = ri_rs_grid_points(:, local_grid_idx(loc_idx))
     689              :             END DO
     690              : 
     691            0 :             ALLOCATE (phi_local(n_local_grid, n_ao_total))
     692            0 :             phi_local = 0.0_dp
     693              : 
     694            0 :             DO source_atom = 1, natom
     695            0 :                dist_vec_raw = pbc(particle_set(source_atom)%r(:), pos_P(:), cell)
     696            0 :                d_sP = NORM2(dist_vec_raw)
     697            0 :                IF (d_sP > bs_env%ri_rs%radius_ao_per_atom(source_atom) + cutoff_ri) CYCLE
     698              : 
     699            0 :                col_start = bs_env%i_ao_start_from_atom(source_atom)
     700            0 :                col_end = bs_env%i_ao_end_from_atom(source_atom)
     701            0 :                r2_threshold = bs_env%ri_rs%radius_ao_per_atom(source_atom)**2
     702              : 
     703              :                CALL fill_phi_for_atom(phi_local(:, col_start:col_end), sphere_grid, &
     704              :                                       n_local_grid, source_atom, particle_set, qs_kind_set, &
     705            0 :                                       cell, r2_threshold)
     706              :             END DO
     707              : 
     708            0 :             DEALLOCATE (sphere_grid)
     709              : 
     710              :             ! ---------------------------------------------------------------------
     711              :             ! C. Build Local RHS Matrix (d_lp_local) first so the subgroup-
     712              :             !    distributed compute_d_lp + allreduce is not entangled with the LHS
     713              :             !    build. compute_d_lp does not depend on D_local or d_vec_local.
     714              :             ! ---------------------------------------------------------------------
     715            0 :             ALLOCATE (d_lp_local(n_local_grid, n_loc_ri))
     716            0 :             d_lp_local = 0.0_dp
     717              : 
     718            0 :             t2 = m_walltime()
     719              : 
     720              :             CALL compute_d_lp(bs_env, ctx_3c, cell, phi_local, d_lp_local, n_local_grid, &
     721            0 :                               n_loc_ri, atom_P, max_ao_size, atom_j_mepos, atom_j_stride)
     722              : 
     723              :             ! Reduce per-subgroup-rank partials into the replicated d_lp_local.
     724              :             ! Skipped for G=1 (BLAS path): each rank has the full sum locally.
     725            0 :             IF (n_procs_per_atom > 1) THEN
     726            0 :                CALL para_env_sub%sum(d_lp_local)
     727              :             END IF
     728              : 
     729            0 :             t3 = m_walltime()
     730              : 
     731              :             ! ---------------------------------------------------------------------
     732              :             ! D. Build d_vec_local (Jacobi diagonal) + LHS — BLAS or ScaLAPACK
     733              :             ! ---------------------------------------------------------------------
     734            0 :             ALLOCATE (d_vec_local(n_local_grid))
     735              : 
     736            0 :             IF (n_procs_per_atom == 1) THEN
     737              :                ! BLAS path: build D_local densely, compute d_vec as side-effect
     738              :                ! of the Jacobi step (preserves bit-identical arithmetic with the
     739              :                ! previous branch).
     740            0 :                ALLOCATE (D_local(n_local_grid, n_local_grid))
     741            0 :                D_local = 0.0_dp
     742              : 
     743              :                CALL dsyrk("L", "N", n_local_grid, n_ao_total, 1.0_dp, phi_local, &
     744            0 :                           n_local_grid, 0.0_dp, D_local, n_local_grid)
     745              : 
     746              :                !$OMP PARALLEL DO DEFAULT(NONE) &
     747              :                !$OMP SHARED(n_local_grid, D_local, d_vec_local, bs_env) &
     748              :                !$OMP PRIVATE(i) &
     749            0 :                !$OMP SCHEDULE(STATIC)
     750              :                DO i = 1, n_local_grid
     751              :                   D_local(i, i) = D_local(i, i)**2
     752              :                   d_vec_local(i) = 1.0_dp/SQRT(MAX(D_local(i, i), 1.0E-16_dp))
     753              :                   D_local(i, i) = (D_local(i, i)*d_vec_local(i)**2) + bs_env%ri_rs%tikhonov
     754              :                END DO
     755              :                !$OMP END PARALLEL DO
     756              : 
     757              :                !$OMP PARALLEL DO DEFAULT(NONE) &
     758              :                !$OMP SHARED(n_local_grid, D_local, d_vec_local) &
     759              :                !$OMP PRIVATE(j, i) &
     760            0 :                !$OMP SCHEDULE(DYNAMIC)
     761              :                DO j = 1, n_local_grid
     762              :                   DO i = j + 1, n_local_grid
     763              :                      D_local(i, j) = D_local(i, j)**2
     764              :                      D_local(i, j) = D_local(i, j)*d_vec_local(i)*d_vec_local(j)
     765              :                      D_local(j, i) = D_local(i, j)
     766              :                   END DO
     767              :                END DO
     768              :                !$OMP END PARALLEL DO
     769              :             ELSE
     770              :                ! ScaLAPACK path: d_vec computed directly from phi (= 1/||phi_i||^2);
     771              :                ! solve_D_lp_distributed builds D block-cyclic internally with
     772              :                ! the squared+scaled values, so no dense D_local on this rank.
     773              :                !$OMP PARALLEL DO DEFAULT(NONE) &
     774              :                !$OMP SHARED(n_local_grid, n_ao_total, phi_local, d_vec_local) &
     775              :                !$OMP PRIVATE(i, j) &
     776            0 :                !$OMP SCHEDULE(STATIC)
     777              :                DO i = 1, n_local_grid
     778              :                   d_vec_local(i) = 0.0_dp
     779              :                   DO j = 1, n_ao_total
     780              :                      d_vec_local(i) = d_vec_local(i) + phi_local(i, j)*phi_local(i, j)
     781              :                   END DO
     782              :                   d_vec_local(i) = 1.0_dp/MAX(d_vec_local(i), 1.0E-16_dp)
     783              :                END DO
     784              :                !$OMP END PARALLEL DO
     785              :             END IF
     786              : 
     787              :             ! ---------------------------------------------------------------------
     788              :             ! E. Pre-scale d_lp by d_vec
     789              :             ! ---------------------------------------------------------------------
     790              :             !$OMP PARALLEL DO DEFAULT(NONE) &
     791              :             !$OMP SHARED(n_loc_ri, n_local_grid, d_lp_local, d_vec_local) &
     792              :             !$OMP PRIVATE(j_ri, i) &
     793            0 :             !$OMP SCHEDULE(STATIC)
     794              :             DO j_ri = 1, n_loc_ri
     795              :                DO i = 1, n_local_grid
     796              :                   d_lp_local(i, j_ri) = d_lp_local(i, j_ri)*d_vec_local(i)
     797              :                END DO
     798              :             END DO
     799              :             !$OMP END PARALLEL DO
     800              : 
     801              :             ! ---------------------------------------------------------------------
     802              :             ! F. Solve — BLAS dpotrf/dpotrs or ScaLAPACK pdpotrf/pdpotrs
     803              :             ! ---------------------------------------------------------------------
     804            0 :             IF (n_procs_per_atom == 1) THEN
     805            0 :                CALL dpotrf('L', n_local_grid, D_local, n_local_grid, info)
     806              :                CALL dpotrs('L', n_local_grid, n_loc_ri, D_local, n_local_grid, &
     807            0 :                            d_lp_local, n_local_grid, info)
     808            0 :                DEALLOCATE (D_local)
     809              :             ELSE
     810              :                CALL solve_D_lp_distributed(phi_local, d_vec_local, d_lp_local, &
     811              :                                            n_local_grid, n_ao_total, n_loc_ri, &
     812              :                                            bs_env%ri_rs%tikhonov, &
     813              :                                            para_env_sub, blacs_env_sub, &
     814            0 :                                            fm_struct_D, fm_struct_b, fm_D, fm_b, info)
     815              :             END IF
     816              : 
     817              :             ! ---------------------------------------------------------------------
     818              :             ! G. Post-scale solution by d_vec (common to both paths)
     819              :             ! ---------------------------------------------------------------------
     820              :             !$OMP PARALLEL DO DEFAULT(NONE) &
     821              :             !$OMP SHARED(n_loc_ri, n_local_grid, d_lp_local, d_vec_local) &
     822              :             !$OMP PRIVATE(j_ri, i) &
     823            0 :             !$OMP SCHEDULE(STATIC)
     824              :             DO j_ri = 1, n_loc_ri
     825              :                DO i = 1, n_local_grid
     826              :                   d_lp_local(i, j_ri) = d_lp_local(i, j_ri)*d_vec_local(i)
     827              :                END DO
     828              :             END DO
     829              :             !$OMP END PARALLEL DO
     830              : 
     831              :             ! ---------------------------------------------------------------------
     832              :             ! H. Scatter Local Solution Back to Global DBCSR Matrix.
     833              :             !    Under ScaLAPACK (G>1) the d_lp_local solution is identical on all
     834              :             !    G subgroup ranks (gathered via cp_fm_get_submatrix); only the
     835              :             !    subgroup root writes to mat_Z_lP so each atom column is emitted
     836              :             !    exactly once. DBCSR routes blocks to their global owner on finalize.
     837              :             !    local_grid_idx is ascending (built by the ordered scan above), so
     838              :             !    a single walking pointer over chunks works.
     839              :             ! ---------------------------------------------------------------------
     840            0 :             IF (n_procs_per_atom == 1 .OR. para_env_sub%mepos == 0) THEN
     841            0 :                ALLOCATE (Z_blk(MAXVAL(r_blk_sizes), n_loc_ri))
     842            0 :                loc_ptr = 1
     843              : 
     844            0 :                DO i_blk = 1, num_grid_chunks
     845            0 :                   r_start = row_offset(i_blk) + 1
     846            0 :                   r_end = row_offset(i_blk) + r_blk_sizes(i_blk)
     847            0 :                   current_chunk_size = r_blk_sizes(i_blk)
     848              : 
     849            0 :                   Z_blk = 0.0_dp
     850              : 
     851            0 :                   DO WHILE (loc_ptr <= n_local_grid)
     852            0 :                      g = local_grid_idx(loc_ptr)
     853            0 :                      IF (g > r_end) EXIT
     854            0 :                      Z_blk(g - r_start + 1, 1:n_loc_ri) = d_lp_local(loc_ptr, 1:n_loc_ri)
     855            0 :                      loc_ptr = loc_ptr + 1
     856              :                   END DO
     857              : 
     858            0 :                   IF (MAXVAL(ABS(Z_blk(1:current_chunk_size, 1:n_loc_ri))) > bs_env%eps_filter) THEN
     859              :                      CALL dbcsr_put_block(mat_Z_lP, row=i_blk, col=atom_P, &
     860            0 :                                           block=Z_blk(1:current_chunk_size, 1:n_loc_ri))
     861              :                   END IF
     862              :                END DO
     863              : 
     864            0 :                DEALLOCATE (Z_blk)
     865              :             END IF
     866              : 
     867            0 :             DEALLOCATE (d_vec_local, d_lp_local)
     868            0 :             DEALLOCATE (local_grid_idx, phi_local)
     869              : 
     870              :          END DO
     871              : 
     872            0 :          DEALLOCATE (cutoff_ri_per_atom)
     873            0 :          CALL gw_3c_ctx_release(ctx_3c)
     874              : 
     875            0 :          CALL dbcsr_finalize(mat_Z_lP)
     876              : 
     877            0 :          IF (bs_env%unit_nr > 0) THEN
     878              :             WRITE (bs_env%unit_nr, '(T2,A,T57,A,F7.1,A)') &
     879            0 :                'Computed Z_lP ', ' Execution time', m_walltime() - t1, ' s'
     880            0 :             WRITE (bs_env%unit_nr, '(A)') ' '
     881              :          END IF
     882              : 
     883            0 :          logger => cp_get_default_logger()
     884              : 
     885            0 :          IF (BTEST(cp_print_key_should_output(logger%iter_info, input, key), cp_p_file)) THEN
     886            0 :             CALL dbcsr_binary_write(matrix=mat_Z_lP, filepath=TRIM(bs_env%prefix)//"Z_lP.matrix")
     887              :          END IF
     888              : 
     889              :       END IF
     890              : 
     891            0 :       DEALLOCATE (row_offset, ri_blk_sizes, col_dist_ri)
     892            0 :       CALL dbcsr_distribution_release(dist_Z)
     893              : 
     894            0 :       IF (n_procs_per_atom > 1) THEN
     895            0 :          CALL cp_blacs_env_release(blacs_env_sub)
     896            0 :          CALL para_env_sub%free()
     897            0 :          DEALLOCATE (para_env_sub)
     898              :       END IF
     899              : 
     900            0 :       DEALLOCATE (ri_rs_grid_points)
     901              : 
     902            0 :       CALL timestop(handle)
     903              : 
     904            0 :    END SUBROUTINE compute_coeff_Z_lP
     905              : 
     906              : ! **************************************************************************************************
     907              : !> \brief Computes the dense localized RHS d_lp(l,P) = Σ_{μν,R,S} Φ_μ(r_l)·Φ_ν(r_l)·(μν|P) for one
     908              : !>        RI atom P. OMP-threaded over (atom_j, atom_k) AO-pair blocks: per thread, sweep all
     909              : !>        (cell_R, cell_S) periodic images of (atom_j, atom_k) about atom_P at cell (0,0,0); each
     910              : !>        3c block is built by build_3c_integral_block_ctx (cached libint / sphi tables in ctx,
     911              : !>        kind-radius triangle screen → `screened` short-circuits negligible image triples), and
     912              : !>        grid-chunked pair densities are contracted into a private d_lp partial that is reduced
     913              : !>        into d_lp at the end of the parallel region.
     914              : !> \param bs_env ...
     915              : !> \param ctx shared 3c-integral context (gw_3c_ctx_create)
     916              : !> \param cell ...
     917              : !> \param phi_val Φ_μ(r_l) on the local-sphere grid (n_grid_total × n_ao)
     918              : !> \param d_lp output (n_grid_total × n_loc_ri), zeroed by the caller, accumulated here
     919              : !> \param n_grid_total number of local-sphere grid rows
     920              : !> \param n_loc_ri number of RI functions of atom_P
     921              : !> \param atom_P RI atom (pinned to cell (0,0,0))
     922              : !> \param max_ao_size ...
     923              : !> \param atom_j_mepos ...
     924              : !> \param atom_j_stride ...
     925              : ! **************************************************************************************************
     926              : 
     927            0 :    SUBROUTINE compute_d_lp(bs_env, ctx, cell, phi_val, d_lp, n_grid_total, n_loc_ri, atom_P, &
     928              :                            max_ao_size, atom_j_mepos, atom_j_stride)
     929              : 
     930              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
     931              :       TYPE(gw_3c_ctx_type), INTENT(IN)                   :: ctx
     932              :       TYPE(cell_type), POINTER                           :: cell
     933              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: phi_val
     934              :       INTEGER, INTENT(IN)                                :: n_grid_total, n_loc_ri
     935              :       REAL(KIND=dp), INTENT(INOUT)                       :: d_lp(n_grid_total, n_loc_ri)
     936              :       INTEGER, INTENT(IN)                                :: atom_P, max_ao_size, atom_j_mepos, &
     937              :                                                             atom_j_stride
     938              : 
     939              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'compute_d_lp'
     940              :       INTEGER, PARAMETER                                 :: grid_chunk = 1024
     941              : 
     942              :       INTEGER :: atom_j, atom_k, c, handle, ix_max, ix_min, ix_R, ix_S, iy_max, iy_min, iy_R, &
     943              :          iy_S, iz_max, iz_min, iz_R, iz_S, j, jk_idx, jsize, jstart, k, ksize, kstart, l, l0, &
     944              :          natom, ri
     945              :       INTEGER, DIMENSION(3)                              :: cell_R_vec, cell_S_vec
     946              :       LOGICAL                                            :: any_kept, screened
     947            0 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: int_2d_prv, rho_chunk
     948            0 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: int_3c_prv, int_3c_sum
     949            0 :       TYPE(gw_3c_ws_type)                                :: ws
     950              : 
     951            0 :       CALL timeset(routineN, handle)
     952              : 
     953            0 :       natom = SIZE(bs_env%i_ao_start_from_atom)
     954              : 
     955            0 :       IF (cell%perd(1) == 1) THEN; ix_min = -1; ix_max = 1; ELSE; ix_min = 0; ix_max = 0; END IF
     956            0 :       IF (cell%perd(2) == 1) THEN; iy_min = -1; iy_max = 1; ELSE; iy_min = 0; iy_max = 0; END IF
     957            0 :       IF (cell%perd(3) == 1) THEN; iz_min = -1; iz_max = 1; ELSE; iz_min = 0; iz_max = 0; END IF
     958              : 
     959              :       !$OMP PARALLEL DEFAULT(NONE) &
     960              :       !$OMP SHARED(bs_env, ctx, phi_val, d_lp, n_grid_total, n_loc_ri, atom_P, max_ao_size, &
     961              :       !$OMP        natom, ix_min, ix_max, iy_min, iy_max, iz_min, iz_max, &
     962              :       !$OMP        atom_j_mepos, atom_j_stride) &
     963              :       !$OMP PRIVATE(any_kept, atom_j, atom_k, c, j, jk_idx, jsize, jstart, k, ksize, kstart, l, &
     964              :       !$OMP         l0, ri, ix_R, iy_R, iz_R, ix_S, iy_S, iz_S, cell_R_vec, cell_S_vec, &
     965            0 :       !$OMP         screened, int_2d_prv, rho_chunk, int_3c_prv, int_3c_sum, ws)
     966              : 
     967              :       CALL gw_3c_ws_create(ws, ctx)
     968              :       ALLOCATE (int_3c_prv(max_ao_size, max_ao_size, n_loc_ri))
     969              :       ALLOCATE (int_3c_sum(max_ao_size, max_ao_size, n_loc_ri))
     970              :       ALLOCATE (int_2d_prv(max_ao_size*max_ao_size, n_loc_ri))
     971              :       ALLOCATE (rho_chunk(grid_chunk, max_ao_size*max_ao_size))
     972              : 
     973              :       ! atom_P pinned at cell (0,0,0); enumerate (atom_j, cell_R) × (atom_k, cell_S). The ctx
     974              :       ! integral builder's kind_radius triangle screen sets screened=.TRUE. for the bulk of
     975              :       ! image triples (one or both AO atoms beyond the truncated-Coulomb reach of atom_P),
     976              :       ! so the 27 × 27 = 729 candidate cells collapse to "adjacent cells" in practice.
     977              :       ! MPI-stride atom_j over the subgroup (atom_j_stride = 1 for the BLAS path, > 1 for the
     978              :       ! ScaLAPACK path). COLLAPSE(2) dropped because the outer stride is non-unit under
     979              :       ! ScaLAPACK; the inner atom_k loop carries enough work for DYNAMIC.
     980              :       !$OMP DO SCHEDULE(DYNAMIC) REDUCTION(+:d_lp)
     981              :       DO atom_j = atom_j_mepos + 1, natom, atom_j_stride
     982              :          DO atom_k = 1, natom
     983              :             jstart = bs_env%i_ao_start_from_atom(atom_j)
     984              :             jsize = bs_env%i_ao_end_from_atom(atom_j) - jstart + 1
     985              :             kstart = bs_env%i_ao_start_from_atom(atom_k)
     986              :             ksize = bs_env%i_ao_end_from_atom(atom_k) - kstart + 1
     987              : 
     988              :             int_3c_sum(1:jsize, 1:ksize, 1:n_loc_ri) = 0.0_dp
     989              :             any_kept = .FALSE.
     990              : 
     991              :             DO ix_R = ix_min, ix_max
     992              :             DO iy_R = iy_min, iy_max
     993              :             DO iz_R = iz_min, iz_max
     994              :                cell_R_vec = [ix_R, iy_R, iz_R]
     995              :                DO ix_S = ix_min, ix_max
     996              :                DO iy_S = iy_min, iy_max
     997              :                DO iz_S = iz_min, iz_max
     998              :                   cell_S_vec = [ix_S, iy_S, iz_S]
     999              : 
    1000              :                   int_3c_prv(1:jsize, 1:ksize, 1:n_loc_ri) = 0.0_dp
    1001              : 
    1002              :                   CALL build_3c_integral_block_ctx( &
    1003              :                      int_3c_prv(1:jsize, 1:ksize, 1:n_loc_ri), ctx, ws, &
    1004              :                      atom_j=atom_j, atom_k=atom_k, atom_i=atom_P, &
    1005              :                      cell_j=cell_R_vec, cell_k=cell_S_vec, cell_i=[0, 0, 0], &
    1006              :                      screened=screened)
    1007              :                   IF (screened) CYCLE
    1008              : 
    1009              :                   any_kept = .TRUE.
    1010              :                   int_3c_sum(1:jsize, 1:ksize, 1:n_loc_ri) = &
    1011              :                      int_3c_sum(1:jsize, 1:ksize, 1:n_loc_ri) + &
    1012              :                      int_3c_prv(1:jsize, 1:ksize, 1:n_loc_ri)
    1013              :                END DO
    1014              :                END DO
    1015              :                END DO
    1016              :             END DO
    1017              :             END DO
    1018              :             END DO
    1019              : 
    1020              :             IF (.NOT. any_kept) CYCLE
    1021              : 
    1022              :             ! Flatten 3D B_{μν,P} → 2D B_{(μν),P}
    1023              :             DO ri = 1, n_loc_ri
    1024              :                DO k = 1, ksize
    1025              :                   DO j = 1, jsize
    1026              :                      jk_idx = (k - 1)*jsize + j
    1027              :                      int_2d_prv(jk_idx, ri) = int_3c_sum(j, k, ri)
    1028              :                   END DO
    1029              :                END DO
    1030              :             END DO
    1031              : 
    1032              :             ! Pair density ρ(l,μν) = Φ_μ(r_l)Φ_ν(r_l) in grid chunks, contracted on the fly:
    1033              :             ! d_{l,P} += ρ(l,μν) B_{(μν),P}  (dgemm runs serially inside the parallel region)
    1034              :             DO l0 = 1, n_grid_total, grid_chunk
    1035              :                c = MIN(grid_chunk, n_grid_total - l0 + 1)
    1036              :                DO k = 1, ksize
    1037              :                   DO j = 1, jsize
    1038              :                      jk_idx = (k - 1)*jsize + j
    1039              :                      DO l = 1, c
    1040              :                         rho_chunk(l, jk_idx) = phi_val(l0 + l - 1, jstart + j - 1)* &
    1041              :                                                phi_val(l0 + l - 1, kstart + k - 1)
    1042              :                      END DO
    1043              :                   END DO
    1044              :                END DO
    1045              :                CALL dgemm("N", "N", c, n_loc_ri, jsize*ksize, &
    1046              :                           1.0_dp, rho_chunk, grid_chunk, &
    1047              :                           int_2d_prv, max_ao_size*max_ao_size, &
    1048              :                           1.0_dp, d_lp(l0, 1), n_grid_total)
    1049              :             END DO
    1050              :          END DO
    1051              :       END DO
    1052              :       !$OMP END DO
    1053              : 
    1054              :       DEALLOCATE (int_3c_prv, int_3c_sum, int_2d_prv, rho_chunk)
    1055              :       CALL gw_3c_ws_release(ws)
    1056              : 
    1057              :       !$OMP END PARALLEL
    1058              : 
    1059            0 :       CALL timestop(handle)
    1060              : 
    1061            0 :    END SUBROUTINE compute_d_lp
    1062              : 
    1063              : ! **************************************************************************************************
    1064              : !> \brief Computes the χ(iτ, k=0) matrix
    1065              : !> \param bs_env ...
    1066              : !> \param mat_chi_Gamma_tau ...
    1067              : !> \param mat_phi_mu_l ...
    1068              : !> \param mat_Z_lP ...
    1069              : ! **************************************************************************************************
    1070              : 
    1071            0 :    SUBROUTINE get_mat_chi_Gamma_tau(bs_env, mat_chi_Gamma_tau, mat_phi_mu_l, mat_Z_lP)
    1072              : 
    1073              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    1074              :       TYPE(dbcsr_p_type), DIMENSION(:), POINTER          :: mat_chi_Gamma_tau
    1075              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: mat_phi_mu_l, mat_Z_lP
    1076              : 
    1077              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'get_mat_chi_Gamma_tau'
    1078              : 
    1079              :       INTEGER                                            :: handle, i, i_t, ispin, npcol
    1080            0 :       INTEGER, DIMENSION(:), POINTER                     :: blk_ao, blk_grid, dist_col_ao, &
    1081            0 :                                                             dist_col_grid, dist_row_grid
    1082              :       REAL(KIND=dp)                                      :: t1, tau
    1083              :       TYPE(dbcsr_distribution_type)                      :: dist_grid_grid, dist_phi
    1084              :       TYPE(dbcsr_type)                                   :: matrix_chi_grid, matrix_chi_grid_spin, &
    1085              :                                                             matrix_G_occ_grid, matrix_G_vir_grid
    1086              : 
    1087            0 :       CALL timeset(routineN, handle)
    1088              : 
    1089              :       ! =========================================================================
    1090              :       ! 1. SETUP CORE TOPOLOGIES
    1091              :       ! =========================================================================
    1092            0 :       CALL dbcsr_get_info(mat_phi_mu_l, distribution=dist_phi, row_blk_size=blk_grid, col_blk_size=blk_ao)
    1093            0 :       CALL dbcsr_distribution_get(dist_phi, row_dist=dist_row_grid, col_dist=dist_col_ao)
    1094              : 
    1095              :       ! Determine number of MPI process columns
    1096            0 :       npcol = MAXVAL(dist_col_ao) + 1
    1097              : 
    1098              :       ! Build a perfectly safe column distribution for the Grid dimension
    1099            0 :       ALLOCATE (dist_col_grid(SIZE(blk_grid)))
    1100            0 :       DO i = 1, SIZE(blk_grid)
    1101            0 :          dist_col_grid(i) = MOD(i - 1, npcol)
    1102              :       END DO
    1103              : 
    1104              :       CALL dbcsr_distribution_new(dist_grid_grid, template=dist_phi, &
    1105            0 :                                   row_dist=dist_row_grid, col_dist=dist_col_grid)
    1106              : 
    1107            0 :       CALL dbcsr_create(matrix_G_occ_grid, "G_occ_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    1108            0 :       CALL dbcsr_create(matrix_G_vir_grid, "G_vir_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    1109            0 :       CALL dbcsr_create(matrix_chi_grid, "chi_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    1110            0 :       CALL dbcsr_create(matrix_chi_grid_spin, "chi_grid_spin", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    1111              : 
    1112              :       ! =========================================================================
    1113              :       ! 2. MAIN IMAGINARY TIME LOOP
    1114              :       ! =========================================================================
    1115            0 :       DO i_t = 1, bs_env%num_time_freq_points
    1116            0 :          t1 = m_walltime()
    1117              : 
    1118            0 :          tau = bs_env%imag_time_points(i_t)
    1119            0 :          CALL dbcsr_set(matrix_chi_grid, 0.0_dp)
    1120              : 
    1121              :          ! ----------------------------------------------------------------------
    1122              :          ! A. SPIN LOOP (Allocations safely encapsulated in wrappers)
    1123              :          ! ----------------------------------------------------------------------
    1124            0 :          DO ispin = 1, bs_env%n_spin
    1125              : 
    1126              :             ! G^occ_µλ(i|τ|,k=0) = sum_n^occ C_µn(k=0) e^(-|(ϵ_nk=0-ϵ_F)τ|) C_λn(k=0)
    1127              :             ! G^occ_ll'(i|τ|,k=0) = sum_µν Φ_µ(r_l) G^occ_µν Φ_ν(r_l')
    1128              :             CALL build_G_grid(bs_env, tau, ispin, .TRUE., .FALSE., mat_phi_mu_l, &
    1129            0 :                               matrix_G_occ_grid, bs_env%eps_filter)
    1130              : 
    1131              :             ! G^vir_µλ(i|τ|,k=0) = sum_n^vir C_µn(k=0) e^(-|(ϵ_nk=0-ϵ_F)τ|) C_λn(k=0)
    1132              :             ! G^vir_ll'(i|τ|,k=0) = sum_µν Φ_µ(r_l) G^vir_µν Φ_ν(r_l')
    1133              :             CALL build_G_grid(bs_env, tau, ispin, .FALSE., .TRUE., mat_phi_mu_l, &
    1134            0 :                               matrix_G_vir_grid, bs_env%eps_filter)
    1135              : 
    1136              :             ! -------------------------------------------------------------------
    1137              :             ! B. ELEMENT-WISE HADAMARD PRODUCT
    1138              :             ! -------------------------------------------------------------------
    1139              :             ! χ_ll'(iτ,k=0) = G^occ_ll'(i|τ|,k=0) * G^vir_ll'(i|τ|,k=0)
    1140            0 :             CALL hadamard_product(matrix_G_occ_grid, matrix_G_vir_grid, matrix_chi_grid_spin, bs_env%spin_degeneracy)
    1141              : 
    1142              :             ! Accumulate spin contributions
    1143            0 :             CALL dbcsr_add(matrix_chi_grid, matrix_chi_grid_spin, 1.0_dp, 1.0_dp)
    1144              : 
    1145              :          END DO ! ispin
    1146              : 
    1147              :          ! ----------------------------------------------------------------------
    1148              :          ! C. TRANSFORM TO AUXILIARY BASIS & EXPORT DIRECTLY
    1149              :          ! χ_aux        = Z^T * χ_grid * Z
    1150              :          ! χ_PQ(iτ,k=0) = sum_ll' Z_lP χ_ll'(iτ,k=0) Z_l'Q
    1151              :          ! Result is dumped directly into the final array mat_chi_Gamma_tau!
    1152              :          ! ----------------------------------------------------------------------
    1153              :          CALL contract_A_B_A("T", "N", mat_Z_lP, matrix_chi_grid, &
    1154            0 :                              mat_chi_Gamma_tau(i_t)%matrix, bs_env%eps_filter)
    1155              : 
    1156            0 :          IF (bs_env%unit_nr > 0) THEN
    1157              :             WRITE (bs_env%unit_nr, '(T2,A,I13,A,I3,A,F7.1,A)') &
    1158            0 :                'Computed χ(iτ,k=0) for time point', i_t, ' /', bs_env%num_time_freq_points, &
    1159            0 :                ',    Execution time', m_walltime() - t1, ' s'
    1160              :          END IF
    1161              : 
    1162              :       END DO ! i_t
    1163              : 
    1164              :       ! =========================================================================
    1165              :       ! 3. FINAL CLEANUP
    1166              :       ! =========================================================================
    1167            0 :       CALL dbcsr_release(matrix_G_occ_grid)
    1168            0 :       CALL dbcsr_release(matrix_G_vir_grid)
    1169            0 :       CALL dbcsr_release(matrix_chi_grid)
    1170            0 :       CALL dbcsr_release(matrix_chi_grid_spin)
    1171            0 :       CALL dbcsr_distribution_release(dist_grid_grid)
    1172            0 :       DEALLOCATE (dist_col_grid)
    1173              : 
    1174            0 :       IF (bs_env%unit_nr > 0) WRITE (bs_env%unit_nr, '(A)') ' '
    1175              : 
    1176            0 :       CALL timestop(handle)
    1177              : 
    1178            0 :    END SUBROUTINE get_mat_chi_Gamma_tau
    1179              : 
    1180              : ! **************************************************************************************************
    1181              : !> \brief Computes Green's Function in grid basis
    1182              : !> \param bs_env ...
    1183              : !> \param tau ...
    1184              : !> \param ispin ...
    1185              : !> \param occ ...
    1186              : !> \param vir ...
    1187              : !> \param mat_phi_mu_l ...
    1188              : !> \param matrix_G_grid ...
    1189              : !> \param eps_filter ...
    1190              : ! **************************************************************************************************
    1191              : 
    1192            0 :    SUBROUTINE build_G_grid(bs_env, tau, ispin, occ, vir, mat_phi_mu_l, matrix_G_grid, eps_filter)
    1193              : 
    1194              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    1195              :       REAL(KIND=dp), INTENT(IN)                          :: tau
    1196              :       INTEGER, INTENT(IN)                                :: ispin
    1197              :       LOGICAL, INTENT(IN)                                :: occ, vir
    1198              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: mat_phi_mu_l, matrix_G_grid
    1199              :       REAL(KIND=dp), INTENT(IN)                          :: eps_filter
    1200              : 
    1201              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'build_G_grid'
    1202              : 
    1203              :       INTEGER                                            :: handle
    1204            0 :       INTEGER, DIMENSION(:), POINTER                     :: blk_ao, dist_row_ao
    1205              :       TYPE(cp_fm_type), POINTER                          :: fm_G
    1206              :       TYPE(dbcsr_distribution_type)                      :: dist_ao_ao
    1207              :       TYPE(dbcsr_type)                                   :: matrix_G_ao
    1208              : 
    1209            0 :       CALL timeset(routineN, handle)
    1210              : 
    1211              :       ! 1. Select the correct FM matrix based on occ/vir flags
    1212            0 :       IF (occ) THEN
    1213            0 :          fm_G => bs_env%fm_Gocc
    1214              :       ELSE
    1215            0 :          fm_G => bs_env%fm_Gvir
    1216              :       END IF
    1217              : 
    1218              :       ! 2. Compute Dense FM Green's Function
    1219              :       ! G^occ/vir_µλ(i|τ|,k=0) = sum_G^occ/vir_µλn^occ/vir C_µn(k=0) e^(-|(ϵ_nk=0-ϵ_F)τ|) C_λn(k=0)
    1220            0 :       CALL G_occ_vir(bs_env, tau, fm_G, ispin, occ=occ, vir=vir)
    1221              : 
    1222              :       ! 3. Setup AO DBCSR Topology and Create Matrix dynamically
    1223            0 :       CALL setup_square_topology(mat_phi_mu_l, 'COL', dist_ao_ao, blk_ao, dist_row_ao)
    1224              : 
    1225              :       CALL dbcsr_create(matrix_G_ao, name="G_ao", dist=dist_ao_ao, &
    1226              :                         matrix_type=dbcsr_type_no_symmetry, &
    1227            0 :                         row_blk_size=blk_ao, col_blk_size=blk_ao)
    1228              : 
    1229              :       ! 4. Convert FM to Sparse DBCSR
    1230            0 :       CALL copy_fm_to_dbcsr(fm_G, matrix_G_ao, keep_sparsity=.FALSE.)
    1231              : 
    1232              :       ! 5. Transform to Grid Basis: G_grid = phi * G_ao * phi^T
    1233              :       ! G^occ/vir_ll'(i|τ|,k=0) = sum_µν Φ_µ(r_l) G^occ/vir_µν Φ_ν(r_l')
    1234            0 :       CALL contract_A_B_A("N", "T", mat_phi_mu_l, matrix_G_ao, matrix_G_grid, eps_filter)
    1235              : 
    1236              :       ! 6. Release AO matrix and topology
    1237            0 :       CALL release_dbcsr_topology_and_matrices(dist=dist_ao_ao, mapped_dist=dist_row_ao, m1=matrix_G_ao)
    1238              : 
    1239            0 :       CALL timestop(handle)
    1240              : 
    1241            0 :    END SUBROUTINE build_G_grid
    1242              : 
    1243              : ! **************************************************************************************************
    1244              : !> \brief Generalized routine to compute OUT = A * B * A^T  OR  OUT = A^T * B * A using DBCSR
    1245              : !> \param transA_left ...
    1246              : !> \param transA_right ...
    1247              : !> \param matrix_A ...
    1248              : !> \param matrix_B ...
    1249              : !> \param matrix_out ...
    1250              : !> \param eps_filter ...
    1251              : ! **************************************************************************************************
    1252              : 
    1253            0 :    SUBROUTINE contract_A_B_A(transA_left, transA_right, matrix_A, matrix_B, matrix_out, eps_filter)
    1254              : 
    1255              :       CHARACTER(LEN=1), INTENT(IN)                       :: transA_left, transA_right
    1256              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix_A, matrix_B, matrix_out
    1257              :       REAL(KIND=dp), INTENT(IN)                          :: eps_filter
    1258              : 
    1259              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'contract_A_B_A'
    1260              : 
    1261              :       INTEGER                                            :: handle
    1262              :       TYPE(dbcsr_type)                                   :: matrix_tmp
    1263              : 
    1264            0 :       CALL timeset(routineN, handle)
    1265              : 
    1266            0 :       CALL dbcsr_create(matrix_tmp, template=matrix_A)
    1267              : 
    1268            0 :       IF (transA_left == "N" .AND. transA_right == "T") THEN
    1269              :          ! Path 1: Out = A * B * A^T
    1270              :          CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_A, matrix_B, &
    1271            0 :                              0.0_dp, matrix_tmp, filter_eps=eps_filter)
    1272              :          CALL dbcsr_multiply("N", "T", 1.0_dp, matrix_tmp, matrix_A, &
    1273            0 :                              0.0_dp, matrix_out, filter_eps=eps_filter)
    1274              : 
    1275            0 :       ELSE IF (transA_left == "T" .AND. transA_right == "N") THEN
    1276              :          ! Path 2: Out = A^T * B * A
    1277              :          CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_B, matrix_A, &
    1278            0 :                              0.0_dp, matrix_tmp, filter_eps=eps_filter)
    1279              :          CALL dbcsr_multiply("T", "N", 1.0_dp, matrix_A, matrix_tmp, &
    1280            0 :                              0.0_dp, matrix_out, filter_eps=eps_filter)
    1281              :       ELSE
    1282            0 :          CPABORT("Unsupported transposition pair in contract_A_B_A")
    1283              :       END IF
    1284              : 
    1285            0 :       CALL dbcsr_release(matrix_tmp)
    1286              : 
    1287            0 :       CALL timestop(handle)
    1288              : 
    1289            0 :    END SUBROUTINE contract_A_B_A
    1290              : 
    1291              : ! **************************************************************************************************
    1292              : !> \brief Computes C = A ◦ B (Element-wise Hadamard product) for sparse DBCSR matrices.
    1293              : !> \param matrix_A ...
    1294              : !> \param matrix_B ...
    1295              : !> \param matrix_C ...
    1296              : !> \param fac (Scaling factor applied to the product)
    1297              : ! **************************************************************************************************
    1298              : 
    1299            0 :    SUBROUTINE hadamard_product(matrix_A, matrix_B, matrix_C, fac)
    1300              : 
    1301              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix_A, matrix_B, matrix_C
    1302              :       REAL(KIND=dp), INTENT(IN)                          :: fac
    1303              : 
    1304              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'hadamard_product'
    1305              : 
    1306              :       INTEGER                                            :: col, handle, row
    1307              :       LOGICAL                                            :: found
    1308            0 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: blk_B, blk_C
    1309              :       TYPE(dbcsr_iterator_type)                          :: iter
    1310              : 
    1311            0 :       CALL timeset(routineN, handle)
    1312              : 
    1313            0 :       CALL dbcsr_copy(matrix_C, matrix_A)
    1314              : 
    1315            0 :       CALL dbcsr_iterator_start(iter, matrix_C)
    1316            0 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
    1317            0 :          CALL dbcsr_iterator_next_block(iter, row, col, blk_C)
    1318              : 
    1319            0 :          CALL dbcsr_get_block_p(matrix_B, row, col, blk_B, found)
    1320              : 
    1321            0 :          IF (found) THEN
    1322            0 :             blk_C(:, :) = fac*blk_C(:, :)*blk_B(:, :)
    1323              :          ELSE
    1324              :             ! If B is sparse here, the product is zero
    1325            0 :             blk_C(:, :) = 0.0_dp
    1326              :          END IF
    1327              :       END DO
    1328            0 :       CALL dbcsr_iterator_stop(iter)
    1329              : 
    1330            0 :       CALL timestop(handle)
    1331              : 
    1332            0 :    END SUBROUTINE hadamard_product
    1333              : 
    1334              : ! **************************************************************************************************
    1335              : !> \brief Compute screened Coulomb interaction matrix
    1336              : !> \param bs_env ...
    1337              : !> \param qs_env ...
    1338              : !> \param mat_chi_Gamma_tau ...
    1339              : !> \param fm_W_time ...
    1340              : ! **************************************************************************************************
    1341              : 
    1342            0 :    SUBROUTINE compute_W(bs_env, qs_env, mat_chi_Gamma_tau, fm_W_time)
    1343              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    1344              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    1345              :       TYPE(dbcsr_p_type), DIMENSION(:), POINTER          :: mat_chi_Gamma_tau
    1346              :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:)        :: fm_W_time
    1347              : 
    1348              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'compute_W'
    1349              : 
    1350              :       INTEGER                                            :: handle, i_t, j_w
    1351              :       REAL(KIND=dp)                                      :: t1
    1352              :       TYPE(cp_fm_type)                                   :: fm_M_inv_V_sqrt, fm_V, fm_V_sqrt
    1353              : 
    1354            0 :       CALL timeset(routineN, handle)
    1355              : 
    1356            0 :       t1 = m_walltime()
    1357              : 
    1358            0 :       CALL create_fm_W_MIC_time(bs_env, fm_W_time)
    1359              : 
    1360              :       ! 1. Allocate V and M matrices
    1361            0 :       CALL cp_fm_create(fm_V, bs_env%fm_RI_RI%matrix_struct)
    1362            0 :       CALL cp_fm_create(fm_V_sqrt, bs_env%fm_RI_RI%matrix_struct)
    1363            0 :       CALL cp_fm_create(fm_M_inv_V_sqrt, bs_env%fm_RI_RI%matrix_struct)
    1364              : 
    1365              :       ! Compute V and M^-1 * V^0.5
    1366            0 :       CALL compute_V_MinvVsqrt(bs_env, qs_env, fm_V, fm_V_sqrt, fm_M_inv_V_sqrt)
    1367              : 
    1368              :       ! 2. Loop over frequencies
    1369            0 :       DO j_w = 1, bs_env%num_time_freq_points
    1370              :          ! Fourier transformation of χ_PQ(iτ) to χ_PQ(iω_j)
    1371            0 :          CALL compute_fm_chi_Gamma_freq(bs_env, bs_env%fm_chi_Gamma_freq, j_w, mat_chi_Gamma_tau)
    1372              : 
    1373              :          ! ε(iω_j) = Id - V^0.5*M^-1*χ(iω_j)*M^-1*V^0.5
    1374              :          ! W(iω_j) = V^0.5*(ε^-1(iω_j)-Id)*V^0.5
    1375              :          CALL compute_fm_W_freq(bs_env, bs_env%fm_chi_Gamma_freq, fm_V_sqrt, &
    1376            0 :                                 fm_M_inv_V_sqrt, bs_env%fm_W_MIC_freq)
    1377              : 
    1378              :          ! Fourier transform from W_PQ^MIC(iω_j) to W_PQ^MIC(iτ)
    1379            0 :          CALL Fourier_transform_w_to_t(bs_env, fm_W_time, bs_env%fm_W_MIC_freq, j_w)
    1380              :       END DO
    1381              : 
    1382              :       ! M^-1*W^MIC(iτ)*M^-1
    1383            0 :       CALL multiply_fm_W_MIC_time_with_Minv_Gamma(bs_env, qs_env, fm_W_time)
    1384              : 
    1385            0 :       IF (bs_env%unit_nr > 0) THEN
    1386              :          WRITE (bs_env%unit_nr, '(T2,A,T55,A,F10.1,A)') &
    1387            0 :             'Computed W(iτ),', ' Execution time', m_walltime() - t1, ' s'
    1388              :       END IF
    1389              : 
    1390            0 :       CALL dbcsr_deallocate_matrix_set(mat_chi_Gamma_tau)
    1391              : 
    1392              :       ! Cleanup
    1393            0 :       CALL cp_fm_release(fm_V)
    1394            0 :       CALL cp_fm_release(fm_V_sqrt)
    1395            0 :       CALL cp_fm_release(fm_M_inv_V_sqrt)
    1396              : 
    1397              :       ! Marek : Fourier transform W^MIC(itau) back to get it at a specific im.frequency point - iomega = 0
    1398            0 :       IF (bs_env%rtp_method == rtp_method_bse) THEN
    1399            0 :          t1 = m_walltime()
    1400            0 :          CALL cp_fm_create(bs_env%fm_W_MIC_freq_zero, bs_env%fm_W_MIC_freq%matrix_struct)
    1401              :          ! Set to zero
    1402            0 :          CALL cp_fm_set_all(bs_env%fm_W_MIC_freq_zero, 0.0_dp)
    1403              :          ! Sum over all times
    1404            0 :          DO i_t = 1, bs_env%num_time_freq_points
    1405              :             ! Add the relevant structure with correct weight
    1406              :             CALL cp_fm_scale_and_add(1.0_dp, bs_env%fm_W_MIC_freq_zero, &
    1407            0 :                                      bs_env%imag_time_weights_freq_zero(i_t), fm_W_time(i_t))
    1408              :          END DO
    1409              :          ! Done, save to file
    1410            0 :          CALL fm_write(bs_env%fm_W_MIC_freq_zero, 0, "W_freq_rtp", qs_env)
    1411              :          ! Report calculation
    1412            0 :          IF (bs_env%unit_nr > 0) THEN
    1413              :             WRITE (bs_env%unit_nr, '(T2,A,T55,A,F10.1,A)') &
    1414            0 :                'Computed W(0),', ' Execution time', m_walltime() - t1, ' s'
    1415              :          END IF
    1416              :       END IF
    1417              : 
    1418            0 :       IF (bs_env%unit_nr > 0) WRITE (bs_env%unit_nr, '(A)') ' '
    1419              : 
    1420            0 :       CALL timestop(handle)
    1421              : 
    1422            0 :    END SUBROUTINE compute_W
    1423              : 
    1424              : ! **************************************************************************************************
    1425              : !> \brief Computes V, V^0.5, and M^-1 * V^0.5
    1426              : !> \param bs_env ...
    1427              : !> \param qs_env ...
    1428              : !> \param fm_V ...
    1429              : !> \param fm_V_sqrt ...
    1430              : !> \param fm_Minv_Vsqrt ...
    1431              : ! **************************************************************************************************
    1432              : 
    1433            0 :    SUBROUTINE compute_V_MinvVsqrt(bs_env, qs_env, fm_V, fm_V_sqrt, fm_Minv_Vsqrt)
    1434              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    1435              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    1436              :       TYPE(cp_fm_type), INTENT(INOUT)                    :: fm_V, fm_V_sqrt, fm_Minv_Vsqrt
    1437              : 
    1438              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_V_MinvVsqrt'
    1439              : 
    1440              :       INTEGER                                            :: handle, info, n_RI, ndep
    1441            0 :       TYPE(atomic_kind_type), DIMENSION(:), POINTER      :: atomic_kind_set
    1442              :       TYPE(cell_type), POINTER                           :: cell
    1443              :       TYPE(cp_fm_type)                                   :: fm_work
    1444            0 :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :)     :: fm_M
    1445            0 :       TYPE(dbcsr_p_type), DIMENSION(:, :), POINTER       :: mat_V_kp
    1446            0 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
    1447            0 :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
    1448              : 
    1449            0 :       CALL timeset(routineN, handle)
    1450              : 
    1451            0 :       n_RI = bs_env%n_RI
    1452            0 :       CALL cp_fm_create(fm_work, fm_V%matrix_struct)
    1453              : 
    1454              :       ! -----------------------------------------------------------------------
    1455              :       ! 1. Build Coulomb Matrix V(k=0) using the kp-routine but only for ikp=1
    1456              :       ! -----------------------------------------------------------------------
    1457              :       CALL get_qs_env(qs_env=qs_env, particle_set=particle_set, cell=cell, &
    1458            0 :                       qs_kind_set=qs_kind_set, atomic_kind_set=atomic_kind_set)
    1459              : 
    1460            0 :       ALLOCATE (mat_V_kp(1:1, 1:2))
    1461            0 :       NULLIFY (mat_V_kp(1, 1)%matrix, mat_V_kp(1, 2)%matrix)
    1462            0 :       ALLOCATE (mat_V_kp(1, 1)%matrix, mat_V_kp(1, 2)%matrix)
    1463              : 
    1464            0 :       CALL dbcsr_create(mat_V_kp(1, 1)%matrix, template=bs_env%mat_RI_RI%matrix)
    1465            0 :       CALL dbcsr_reserve_all_blocks(mat_V_kp(1, 1)%matrix)
    1466            0 :       CALL dbcsr_set(mat_V_kp(1, 1)%matrix, 0.0_dp)
    1467              : 
    1468              :       ! Dummy imaginary part just to satisfy the routine
    1469            0 :       CALL dbcsr_create(mat_V_kp(1, 2)%matrix, template=bs_env%mat_RI_RI%matrix)
    1470            0 :       CALL dbcsr_reserve_all_blocks(mat_V_kp(1, 2)%matrix)
    1471            0 :       CALL dbcsr_set(mat_V_kp(1, 2)%matrix, 0.0_dp)
    1472              : 
    1473            0 :       bs_env%kpoints_chi_eps_W%nkp_grid = bs_env%nkp_grid_chi_eps_W_orig
    1474              : 
    1475              :       CALL build_2c_coulomb_matrix_kp(mat_V_kp, bs_env%kpoints_chi_eps_W, "RI_AUX", cell, &
    1476              :                                       particle_set, qs_kind_set, atomic_kind_set, &
    1477            0 :                                       bs_env%size_lattice_sum_V, operator_coulomb, 1, 1)
    1478              : 
    1479              :       ! Copy real part to fm_V
    1480            0 :       CALL copy_dbcsr_to_fm(mat_V_kp(1, 1)%matrix, fm_V)
    1481              : 
    1482            0 :       CALL dbcsr_deallocate_matrix(mat_V_kp(1, 1)%matrix)
    1483            0 :       CALL dbcsr_deallocate_matrix(mat_V_kp(1, 2)%matrix)
    1484            0 :       DEALLOCATE (mat_V_kp)
    1485              : 
    1486              :       ! -----------------------------------------------------------------------
    1487              :       ! 2. Get RI-Metric Matrix M(k=0)
    1488              :       ! -----------------------------------------------------------------------
    1489              :       CALL RI_2c_integral_mat(qs_env, fm_M, fm_V, n_RI, bs_env%ri_metric, &
    1490            0 :                               do_kpoints=.FALSE., regularization_RI=bs_env%regularization_RI)
    1491              : 
    1492              :       ! -----------------------------------------------------------------------
    1493              :       ! 3. M -> M^-1
    1494              :       ! -----------------------------------------------------------------------
    1495            0 :       CALL cp_fm_cholesky_decompose(fm_M(1, 1), info_out=info)
    1496            0 :       IF (info == 0) THEN
    1497            0 :          CALL cp_fm_cholesky_invert(fm_M(1, 1))
    1498            0 :          CALL cp_fm_uplo_to_full(fm_M(1, 1), fm_work)
    1499              :       ELSE
    1500              :          ! Fallback if Cholesky fails due to conditioning
    1501            0 :          CALL cp_fm_power(fm_M(1, 1), fm_work, -1.0_dp, bs_env%eps_eigval_mat_RI, ndep)
    1502            0 :          CALL cp_fm_to_fm(fm_work, fm_M(1, 1))
    1503              :       END IF
    1504              : 
    1505              :       ! -----------------------------------------------------------------------
    1506              :       ! 4. V -> V^0.5
    1507              :       ! -----------------------------------------------------------------------
    1508            0 :       CALL cp_fm_to_fm(fm_V, fm_V_sqrt)
    1509            0 :       CALL cp_fm_cholesky_decompose(fm_V_sqrt, info_out=info)
    1510            0 :       IF (info == 0) THEN
    1511            0 :          CALL clean_lower_part(fm_V_sqrt)
    1512              :       ELSE
    1513            0 :          CALL cp_fm_power(fm_V, fm_V_sqrt, 0.5_dp, bs_env%eps_eigval_mat_RI, ndep)
    1514              :       END IF
    1515              : 
    1516              :       ! -----------------------------------------------------------------------
    1517              :       ! 5. M^-1 * V^0.5
    1518              :       ! -----------------------------------------------------------------------
    1519              :       CALL parallel_gemm("N", "T", n_RI, n_RI, n_RI, 1.0_dp, fm_M(1, 1), fm_V_sqrt, &
    1520            0 :                          0.0_dp, fm_Minv_Vsqrt)
    1521              : 
    1522            0 :       CALL cp_fm_release(fm_M)
    1523            0 :       CALL cp_fm_release(fm_work)
    1524              : 
    1525            0 :       CALL timestop(handle)
    1526              : 
    1527            0 :    END SUBROUTINE compute_V_MinvVsqrt
    1528              : 
    1529              : ! **************************************************************************************************
    1530              : !> \brief Computes W(iω) from χ_PQ(iω_j)
    1531              : !> \param bs_env ...
    1532              : !> \param fm_chi_freq_j ...
    1533              : !> \param fm_V_sqrt ...
    1534              : !> \param fm_Minv_Vsqrt ...
    1535              : !> \param fm_W_freq_j ...
    1536              : ! **************************************************************************************************
    1537              : 
    1538            0 :    SUBROUTINE compute_fm_W_freq(bs_env, fm_chi_freq_j, fm_V_sqrt, fm_Minv_Vsqrt, fm_W_freq_j)
    1539              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    1540              :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_chi_freq_j, fm_V_sqrt, fm_Minv_Vsqrt
    1541              :       TYPE(cp_fm_type), INTENT(INOUT)                    :: fm_W_freq_j
    1542              : 
    1543              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'compute_fm_W_freq'
    1544              : 
    1545              :       INTEGER                                            :: handle, info, n_RI, ndep
    1546              :       TYPE(cp_fm_type)                                   :: fm_eps_freq_j, fm_work
    1547              : 
    1548            0 :       CALL timeset(routineN, handle)
    1549              : 
    1550            0 :       n_RI = bs_env%n_RI
    1551              : 
    1552            0 :       CALL cp_fm_create(fm_eps_freq_j, fm_chi_freq_j%matrix_struct)
    1553            0 :       CALL cp_fm_create(fm_work, fm_chi_freq_j%matrix_struct)
    1554              : 
    1555              :       ! -----------------------------------------------------------------------
    1556              :       ! 1. ε(iω_j) = Id - (M^-1 * V^0.5)^T * χ(iω_j) * (M^-1 * V^0.5)
    1557              :       ! -----------------------------------------------------------------------
    1558              :       ! work = χ(iω_j) * (M^-1 * V^0.5)
    1559              :       CALL parallel_gemm('N', 'N', n_RI, n_RI, n_RI, 1.0_dp, &
    1560            0 :                          fm_chi_freq_j, fm_Minv_Vsqrt, 0.0_dp, fm_work)
    1561              : 
    1562              :       ! eps_work = (M^-1 * V^0.5)^T * work
    1563              :       CALL parallel_gemm('T', 'N', n_RI, n_RI, n_RI, 1.0_dp, &
    1564            0 :                          fm_Minv_Vsqrt, fm_work, 0.0_dp, fm_eps_freq_j)
    1565              : 
    1566              :       ! ε(iω_j) = Id - eps_work  -->  -eps_work + Id
    1567            0 :       CALL fm_add_on_diag(fm_eps_freq_j, 1.0_dp)
    1568              : 
    1569              :       ! Force perfect symmetry before Cholesky to avoid info != 0 due to GEMM noise
    1570            0 :       CALL cp_fm_uplo_to_full(fm_eps_freq_j, fm_work)
    1571              : 
    1572              :       ! -----------------------------------------------------------------------
    1573              :       ! 2. W(iω_j) = V^0.5^T * (ε^-1(iω_j) - Id) * V^0.5
    1574              :       ! -----------------------------------------------------------------------
    1575              : 
    1576              :       ! a) Cholesky decomposition of ε(iω_j)
    1577            0 :       CALL cp_fm_cholesky_decompose(fm_eps_freq_j, info_out=info)
    1578              : 
    1579              :       ! b) Inversion
    1580            0 :       IF (info == 0) THEN
    1581            0 :          CALL cp_fm_cholesky_invert(fm_eps_freq_j)
    1582            0 :          CALL cp_fm_uplo_to_full(fm_eps_freq_j, fm_work)
    1583              :       ELSE
    1584              :          ! Fallback to expensive diagonalization if Cholesky fails
    1585            0 :          CALL cp_fm_power(fm_eps_freq_j, fm_work, -1.0_dp, bs_env%eps_eigval_mat_RI, ndep)
    1586            0 :          CALL cp_fm_to_fm(fm_work, fm_eps_freq_j)
    1587              :       END IF
    1588              : 
    1589              :       ! c) ε^-1(iω_j) - Id
    1590            0 :       CALL fm_add_on_diag(fm_eps_freq_j, -1.0_dp)
    1591              : 
    1592              :       ! d) work = (ε^-1(iω_j) - Id) * V^0.5
    1593              :       CALL parallel_gemm('N', 'N', n_RI, n_RI, n_RI, 1.0_dp, fm_eps_freq_j, fm_V_sqrt, &
    1594            0 :                          0.0_dp, fm_work)
    1595              : 
    1596              :       ! e) W(iw) = V^0.5^T * work
    1597              :       CALL parallel_gemm('T', 'N', n_RI, n_RI, n_RI, 1.0_dp, fm_V_sqrt, fm_work, &
    1598            0 :                          0.0_dp, fm_W_freq_j)
    1599              : 
    1600              :       ! Cleanup
    1601            0 :       CALL cp_fm_release(fm_work)
    1602            0 :       CALL cp_fm_release(fm_eps_freq_j)
    1603              : 
    1604            0 :       CALL timestop(handle)
    1605              : 
    1606            0 :    END SUBROUTINE compute_fm_W_freq
    1607              : 
    1608              : ! **************************************************************************************************
    1609              : !> \brief Adds a real scalar value to the diagonal of a real full matrix (fm)
    1610              : !> \param fm ...
    1611              : !> \param alpha ...
    1612              : ! **************************************************************************************************
    1613              : 
    1614            0 :    SUBROUTINE fm_add_on_diag(fm, alpha)
    1615              :       TYPE(cp_fm_type), INTENT(INOUT)                    :: fm
    1616              :       REAL(KIND=dp), INTENT(IN)                          :: alpha
    1617              : 
    1618              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'fm_add_on_diag'
    1619              : 
    1620              :       INTEGER                                            :: handle, i_global, i_row, j_col, &
    1621              :                                                             j_global, ncol_local, nrow_local
    1622            0 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, row_indices
    1623              : 
    1624            0 :       CALL timeset(routineN, handle)
    1625              : 
    1626              :       CALL cp_fm_get_info(matrix=fm, &
    1627              :                           nrow_local=nrow_local, &
    1628              :                           ncol_local=ncol_local, &
    1629              :                           row_indices=row_indices, &
    1630            0 :                           col_indices=col_indices)
    1631              : 
    1632            0 :       DO j_col = 1, ncol_local
    1633            0 :          j_global = col_indices(j_col)
    1634            0 :          DO i_row = 1, nrow_local
    1635            0 :             i_global = row_indices(i_row)
    1636            0 :             IF (j_global == i_global) THEN
    1637            0 :                fm%local_data(i_row, j_col) = fm%local_data(i_row, j_col) + alpha
    1638              :             END IF
    1639              :          END DO
    1640              :       END DO
    1641              : 
    1642            0 :       CALL timestop(handle)
    1643              : 
    1644            0 :    END SUBROUTINE fm_add_on_diag
    1645              : 
    1646              : ! **************************************************************************************************
    1647              : !> \brief Zeroes out the strictly lower triangular part of a real matrix
    1648              : !> \param fm_mat ...
    1649              : ! **************************************************************************************************
    1650            0 :    SUBROUTINE clean_lower_part(fm_mat)
    1651              :       TYPE(cp_fm_type)                                   :: fm_mat
    1652              : 
    1653              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'clean_lower_part'
    1654              : 
    1655              :       INTEGER                                            :: handle, i_row, j_col, j_global, &
    1656              :                                                             ncol_local, nrow_local
    1657            0 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, row_indices
    1658              : 
    1659            0 :       CALL timeset(routineN, handle)
    1660              : 
    1661              :       CALL cp_fm_get_info(matrix=fm_mat, &
    1662              :                           nrow_local=nrow_local, ncol_local=ncol_local, &
    1663            0 :                           row_indices=row_indices, col_indices=col_indices)
    1664              : 
    1665            0 :       DO j_col = 1, ncol_local
    1666            0 :          j_global = col_indices(j_col)
    1667            0 :          DO i_row = 1, nrow_local
    1668            0 :             IF (j_global < row_indices(i_row)) fm_mat%local_data(i_row, j_col) = 0.0_dp
    1669              :          END DO
    1670              :       END DO
    1671              : 
    1672            0 :       CALL timestop(handle)
    1673              : 
    1674            0 :    END SUBROUTINE clean_lower_part
    1675              : 
    1676              : ! **************************************************************************************************
    1677              : !> \brief Computes the exact exchange part of the GW self-energy
    1678              : !> \param bs_env ...
    1679              : !> \param qs_env ...
    1680              : !> \param mat_phi_mu_l ...
    1681              : !> \param mat_Z_lP ...
    1682              : !> \param fm_Sigma_x_Gamma ...
    1683              : ! **************************************************************************************************
    1684              : 
    1685            0 :    SUBROUTINE compute_Sigma_x(bs_env, qs_env, mat_phi_mu_l, mat_Z_lP, fm_Sigma_x_Gamma)
    1686              : 
    1687              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    1688              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    1689              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: mat_phi_mu_l, mat_Z_lP
    1690              :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:)        :: fm_Sigma_x_Gamma
    1691              : 
    1692              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'compute_Sigma_x'
    1693              : 
    1694              :       INTEGER                                            :: handle, ispin
    1695            0 :       INTEGER, DIMENSION(:), POINTER                     :: blk_aux, blk_grid, dist_col_grid, &
    1696            0 :                                                             dist_row_aux
    1697              :       REAL(KIND=dp)                                      :: t1
    1698            0 :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :)     :: fm_Vtr_Gamma
    1699              :       TYPE(dbcsr_distribution_type)                      :: dist_aux_aux, dist_grid_grid
    1700              :       TYPE(dbcsr_type)                                   :: mat_Sigma_x_Gamma, matrix_D_grid, &
    1701              :                                                             matrix_Sigma_x_grid, matrix_V_aux, &
    1702              :                                                             matrix_V_grid
    1703              : 
    1704            0 :       CALL timeset(routineN, handle)
    1705              : 
    1706            0 :       t1 = m_walltime()
    1707              : 
    1708            0 :       ALLOCATE (fm_Sigma_x_Gamma(bs_env%n_spin))
    1709            0 :       DO ispin = 1, bs_env%n_spin
    1710            0 :          CALL cp_fm_create(fm_Sigma_x_Gamma(ispin), bs_env%fm_s_Gamma%matrix_struct)
    1711              :       END DO
    1712              : 
    1713            0 :       CALL dbcsr_create(mat_Sigma_x_Gamma, template=bs_env%mat_ao_ao%matrix)
    1714              : 
    1715              :       ! =========================================================================
    1716              :       ! 1. SETUP CORE TOPOLOGIES
    1717              :       ! =========================================================================
    1718            0 :       CALL setup_square_topology(mat_phi_mu_l, 'ROW', dist_grid_grid, blk_grid, dist_col_grid)
    1719            0 :       CALL setup_square_topology(mat_Z_lP, 'COL', dist_aux_aux, blk_aux, dist_row_aux)
    1720              : 
    1721              :       ! =========================================================================
    1722              :       ! 2. COMPUTE V^tr_ll'
    1723              :       ! =========================================================================
    1724              :       CALL RI_2c_integral_mat(qs_env, fm_Vtr_Gamma, bs_env%fm_RI_RI, bs_env%n_RI, &
    1725            0 :                               bs_env%trunc_coulomb, do_kpoints=.FALSE.)
    1726              : 
    1727              :       ! M^-1 * V^tr * M^-1 directly modifies fm_Vtr_Gamma(:, 1)
    1728            0 :       CALL multiply_fm_W_MIC_time_with_Minv_Gamma(bs_env, qs_env, fm_Vtr_Gamma(:, 1))
    1729              : 
    1730            0 :       CALL dbcsr_create(matrix_V_aux, "V_aux", dist_aux_aux, dbcsr_type_no_symmetry, blk_aux, blk_aux)
    1731            0 :       CALL dbcsr_create(matrix_V_grid, "V_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    1732              : 
    1733            0 :       CALL copy_fm_to_dbcsr(fm_Vtr_Gamma(1, 1), matrix_V_aux, keep_sparsity=.FALSE.)
    1734              : 
    1735              :       ! V^tr_ll' = sum_PQ Z_lP V^trunc_PQ Z_l'Q
    1736            0 :       CALL contract_A_B_A("N", "T", mat_Z_lP, matrix_V_aux, matrix_V_grid, bs_env%eps_filter)
    1737            0 :       CALL dbcsr_release(matrix_V_aux)
    1738              : 
    1739              :       ! =========================================================================
    1740              :       ! 3. SPIN LOOP FOR EXACT EXCHANGE
    1741              :       ! =========================================================================
    1742            0 :       DO ispin = 1, bs_env%n_spin
    1743              : 
    1744              :          ! Density matrix on grid is essentially G_occ at tau = 0.0
    1745              :          ! D_µν  = sum_n^occ C_µn(k=0) C_νn(k=0)
    1746              :          ! D_ll' = sum_µν Φ_µ(r_l) D_µν Φ_ν(r_l')
    1747            0 :          CALL dbcsr_create(matrix_D_grid, "D_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    1748            0 :          CALL build_G_grid(bs_env, 0.0_dp, ispin, .TRUE., .FALSE., mat_phi_mu_l, matrix_D_grid, bs_env%eps_filter)
    1749              : 
    1750              :          ! Element-wise Hadamard product: Σ^x_grid = D_grid ◦ V_grid
    1751              :          ! Σ^x_ll' = D_ll' * V^tr_ll'
    1752            0 :          CALL dbcsr_create(matrix_Sigma_x_grid, template=matrix_V_grid)
    1753            0 :          CALL hadamard_product(matrix_D_grid, matrix_V_grid, matrix_Sigma_x_grid, 1.0_dp)
    1754              : 
    1755            0 :          CALL dbcsr_release(matrix_D_grid)
    1756              : 
    1757              :          ! Transform back to AO basis: Σ^x_ao = -1.0 * phi^T * Σ^x_grid * phi
    1758              :          ! Σ^x_λσ(k=0)   = -sum_ll' Φ_λ(r_l) Σ^x_ll' Φ_σ(r_l')
    1759            0 :          CALL contract_A_B_A("T", "N", mat_phi_mu_l, matrix_Sigma_x_grid, mat_Sigma_x_Gamma, bs_env%eps_filter)
    1760            0 :          CALL dbcsr_scale(mat_Sigma_x_Gamma, -1.0_dp)
    1761              : 
    1762            0 :          CALL dbcsr_release(matrix_Sigma_x_grid)
    1763              : 
    1764              :          ! Data I/O and Export to CP2K Full Matrices
    1765            0 :          CALL copy_dbcsr_to_fm(mat_Sigma_x_Gamma, fm_Sigma_x_Gamma(ispin))
    1766              : 
    1767              :       END DO ! ispin
    1768              : 
    1769            0 :       IF (bs_env%unit_nr > 0) THEN
    1770              :          WRITE (bs_env%unit_nr, '(T2,A,T58,A,F7.1,A)') &
    1771            0 :             'Computed Σ^x(k=0),', ' Execution time', m_walltime() - t1, ' s'
    1772            0 :          WRITE (bs_env%unit_nr, '(A)') ' '
    1773              :       END IF
    1774              : 
    1775              :       ! =========================================================================
    1776              :       ! 4. CLEANUP
    1777              :       ! =========================================================================
    1778              :       CALL release_dbcsr_topology_and_matrices(dist=dist_grid_grid, mapped_dist=dist_col_grid, &
    1779            0 :                                                m1=mat_Sigma_x_Gamma, m2=matrix_V_grid)
    1780            0 :       CALL release_dbcsr_topology_and_matrices(dist=dist_aux_aux, mapped_dist=dist_row_aux)
    1781              : 
    1782            0 :       CALL cp_fm_release(fm_Vtr_Gamma)
    1783              : 
    1784            0 :       CALL timestop(handle)
    1785              : 
    1786            0 :    END SUBROUTINE compute_Sigma_x
    1787              : 
    1788              : ! **************************************************************************************************
    1789              : !> \brief Computes the correlation part of the GW self-energy
    1790              : !> \param bs_env ...
    1791              : !> \param fm_W_time ...
    1792              : !> \param mat_phi_mu_l ...
    1793              : !> \param mat_Z_lP ...
    1794              : !> \param fm_Sigma_c_Gamma_time ...
    1795              : ! **************************************************************************************************
    1796              : 
    1797            0 :    SUBROUTINE compute_Sigma_c(bs_env, fm_W_time, mat_phi_mu_l, mat_Z_lP, fm_Sigma_c_Gamma_time)
    1798              : 
    1799              :       TYPE(post_scf_bandstructure_type), POINTER         :: bs_env
    1800              :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:)        :: fm_W_time
    1801              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: mat_phi_mu_l, mat_Z_lP
    1802              :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :, :)  :: fm_Sigma_c_Gamma_time
    1803              : 
    1804              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'compute_Sigma_c'
    1805              : 
    1806              :       INTEGER                                            :: handle, i_t, ispin
    1807            0 :       INTEGER, DIMENSION(:), POINTER                     :: blk_aux, blk_grid, dist_col_grid, &
    1808            0 :                                                             dist_row_aux
    1809              :       REAL(KIND=dp)                                      :: t1, tau
    1810              :       TYPE(dbcsr_distribution_type)                      :: dist_aux_aux, dist_grid_grid
    1811            0 :       TYPE(dbcsr_p_type), DIMENSION(:, :), POINTER       :: mat_Sigma_neg_tau, mat_Sigma_pos_tau
    1812              :       TYPE(dbcsr_type) :: matrix_G_occ_grid, matrix_G_vir_grid, matrix_Sigma_neg_grid, &
    1813              :          matrix_Sigma_pos_grid, matrix_W_aux, matrix_W_grid
    1814              : 
    1815            0 :       CALL timeset(routineN, handle)
    1816              : 
    1817              :       ! =========================================================================
    1818              :       ! 1. SETUP CORE TOPOLOGIES AND PRE-ALLOCATE OUTPUT ARRAYS
    1819              :       ! =========================================================================
    1820            0 :       CALL setup_square_topology(mat_phi_mu_l, 'ROW', dist_grid_grid, blk_grid, dist_col_grid)
    1821            0 :       CALL setup_square_topology(mat_Z_lP, 'COL', dist_aux_aux, blk_aux, dist_row_aux)
    1822              : 
    1823              :       ! Pre-allocate local DBCSR matrices to act as targets for final output
    1824            0 :       NULLIFY (mat_Sigma_neg_tau, mat_Sigma_pos_tau)
    1825            0 :       ALLOCATE (mat_Sigma_neg_tau(bs_env%num_time_freq_points, bs_env%n_spin))
    1826            0 :       ALLOCATE (mat_Sigma_pos_tau(bs_env%num_time_freq_points, bs_env%n_spin))
    1827              : 
    1828            0 :       DO i_t = 1, bs_env%num_time_freq_points
    1829            0 :          DO ispin = 1, bs_env%n_spin
    1830            0 :             ALLOCATE (mat_Sigma_neg_tau(i_t, ispin)%matrix)
    1831            0 :             ALLOCATE (mat_Sigma_pos_tau(i_t, ispin)%matrix)
    1832            0 :             CALL dbcsr_create(mat_Sigma_neg_tau(i_t, ispin)%matrix, template=bs_env%mat_ao_ao%matrix)
    1833            0 :             CALL dbcsr_create(mat_Sigma_pos_tau(i_t, ispin)%matrix, template=bs_env%mat_ao_ao%matrix)
    1834              :          END DO
    1835              :       END DO
    1836              : 
    1837              :       ! =========================================================================
    1838              :       ! 2. MAIN IMAGINARY TIME LOOP
    1839              :       ! =========================================================================
    1840            0 :       DO i_t = 1, bs_env%num_time_freq_points
    1841            0 :          tau = bs_env%imag_time_points(i_t)
    1842              : 
    1843              :          ! -------------------------------------------------------------------
    1844              :          ! Compute W_grid = Z * W_aux * Z^T
    1845              :          ! -------------------------------------------------------------------
    1846            0 :          CALL dbcsr_create(matrix_W_aux, "W_aux", dist_aux_aux, dbcsr_type_no_symmetry, blk_aux, blk_aux)
    1847            0 :          CALL dbcsr_create(matrix_W_grid, "W_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    1848              : 
    1849            0 :          CALL copy_fm_to_dbcsr(fm_W_time(i_t), matrix_W_aux, keep_sparsity=.FALSE.)
    1850              : 
    1851              :          ! W^MIC_ll'(iτ,k=0) = sum_PQ Z_lP W^MIC_PQ(iτ) Z_l'Q
    1852            0 :          CALL contract_A_B_A("N", "T", mat_Z_lP, matrix_W_aux, matrix_W_grid, bs_env%eps_filter)
    1853              : 
    1854            0 :          CALL dbcsr_release(matrix_W_aux) ! Clean up aux basis immediately
    1855              : 
    1856            0 :          DO ispin = 1, bs_env%n_spin
    1857            0 :             t1 = m_walltime()
    1858              : 
    1859              :             ! -------------------------------------------------------------------
    1860              :             ! A. Transform Green's Functions to the Grid
    1861              :             ! -------------------------------------------------------------------
    1862            0 :             CALL dbcsr_create(matrix_G_occ_grid, "G_occ_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    1863            0 :             CALL dbcsr_create(matrix_G_vir_grid, "G_vir_grid", dist_grid_grid, dbcsr_type_no_symmetry, blk_grid, blk_grid)
    1864              : 
    1865              :             ! G^occ_µλ(i|τ|,k=0) = sum_G^occ_µλn^occ C_µn(k=0) e^(-|(ϵ_nk=0-ϵ_F)τ|) C_λn(k=0)
    1866              :             ! G^occ_ll'(i|τ|,k=0) = sum_µν Φ_µ(r_l) G^occ_µν Φ_ν(r_l')
    1867            0 :             CALL build_G_grid(bs_env, tau, ispin, .TRUE., .FALSE., mat_phi_mu_l, matrix_G_occ_grid, bs_env%eps_filter)
    1868              : 
    1869              :             ! G^vir_µλ(i|τ|,k=0) = sum_n^vir C_µn(k=0) e^(-|(ϵ_nk=0-ϵ_F)τ|) C_λn(k=0)
    1870              :             ! G^vir_ll'(i|τ|,k=0) = sum_µν Φ_µ(r_l) G^vir_µν Φ_ν(r_l')
    1871            0 :             CALL build_G_grid(bs_env, tau, ispin, .FALSE., .TRUE., mat_phi_mu_l, matrix_G_vir_grid, bs_env%eps_filter)
    1872              : 
    1873              :             ! -------------------------------------------------------------------
    1874              :             ! B. Element-wise Hadamard Products for Sigma_c on Grid
    1875              :             ! Σ_neg_grid = G_occ_grid ◦ W_grid
    1876              :             ! Σ_pos_grid = G_vir_grid ◦ W_grid
    1877              :             ! -------------------------------------------------------------------
    1878            0 :             CALL dbcsr_create(matrix_Sigma_neg_grid, template=matrix_W_grid)
    1879            0 :             CALL dbcsr_create(matrix_Sigma_pos_grid, template=matrix_W_grid)
    1880              : 
    1881              :             ! Σ^c_ll'(iτ,k=0)   = -G^occ_ll'(i|τ|,k=0) * W^MIC_ll'(iτ,k=0), for τ < 0
    1882            0 :             CALL hadamard_product(matrix_G_occ_grid, matrix_W_grid, matrix_Sigma_neg_grid, 1.0_dp)
    1883              : 
    1884              :             ! Σ^c_ll'(iτ,k=0)   =  G^vir_ll'(i|τ|,k=0) * W^MIC_ll'(iτ,k=0), for τ > 0
    1885            0 :             CALL hadamard_product(matrix_G_vir_grid, matrix_W_grid, matrix_Sigma_pos_grid, 1.0_dp)
    1886              : 
    1887              :             ! Instantly purge massive G_grid arrays to save memory
    1888            0 :             CALL dbcsr_release(matrix_G_occ_grid)
    1889            0 :             CALL dbcsr_release(matrix_G_vir_grid)
    1890              : 
    1891              :             ! -------------------------------------------------------------------
    1892              :             ! C. Transform Sigma back to AO Basis
    1893              :             ! Σ_AO = phi^T * Σ_grid * phi
    1894              :             ! -------------------------------------------------------------------
    1895              : 
    1896              :             ! Σ^c_λσ(iτ,k=0)    = sum_ll' Φ_λ(r_l) Σ^c_ll'(iτ,k=0) Φ_σ(r_l'), for τ < 0
    1897              :             CALL contract_A_B_A("T", "N", mat_phi_mu_l, matrix_Sigma_neg_grid, &
    1898            0 :                                 mat_Sigma_neg_tau(i_t, ispin)%matrix, bs_env%eps_filter)
    1899            0 :             CALL dbcsr_scale(mat_Sigma_neg_tau(i_t, ispin)%matrix, -1.0_dp)
    1900              : 
    1901              :             ! Σ^c_λσ(iτ,k=0)    = sum_ll' Φ_λ(r_l) Σ^c_ll'(iτ,k=0) Φ_σ(r_l'), for τ > 0
    1902              :             CALL contract_A_B_A("T", "N", mat_phi_mu_l, matrix_Sigma_pos_grid, &
    1903            0 :                                 mat_Sigma_pos_tau(i_t, ispin)%matrix, bs_env%eps_filter)
    1904              : 
    1905              :             ! Purge Grid Sigma arrays
    1906            0 :             CALL dbcsr_release(matrix_Sigma_neg_grid)
    1907            0 :             CALL dbcsr_release(matrix_Sigma_pos_grid)
    1908              : 
    1909            0 :             IF (bs_env%unit_nr > 0) THEN
    1910              :                WRITE (bs_env%unit_nr, '(T2,A,I10,A,I3,A,F7.1,A)') &
    1911            0 :                   'Computed Σ^c(iτ,k=0) for time point ', i_t, ' /', bs_env%num_time_freq_points, &
    1912            0 :                   ',    Execution time', m_walltime() - t1, ' s'
    1913              :             END IF
    1914              : 
    1915              :          END DO ! ispin
    1916              : 
    1917              :          ! Release the W_grid for this time point
    1918            0 :          CALL dbcsr_release(matrix_W_grid)
    1919              : 
    1920              :       END DO ! i_t
    1921              : 
    1922            0 :       IF (bs_env%unit_nr > 0) WRITE (bs_env%unit_nr, '(A)') ' '
    1923              : 
    1924              :       ! -------------------------------------------------------------------------
    1925              :       ! 3. FINALIZE AND CLEANUP
    1926              :       ! -------------------------------------------------------------------------
    1927              :       CALL fill_fm_Sigma_c_Gamma_time(fm_Sigma_c_Gamma_time, bs_env, &
    1928            0 :                                       mat_Sigma_pos_tau, mat_Sigma_neg_tau)
    1929              : 
    1930            0 :       CALL cp_fm_release(fm_W_time)
    1931              : 
    1932            0 :       CALL dbcsr_deallocate_matrix_set(mat_Sigma_neg_tau)
    1933            0 :       CALL dbcsr_deallocate_matrix_set(mat_Sigma_pos_tau)
    1934              : 
    1935            0 :       CALL release_dbcsr_topology_and_matrices(dist=dist_grid_grid, mapped_dist=dist_col_grid)
    1936            0 :       CALL release_dbcsr_topology_and_matrices(dist=dist_aux_aux, mapped_dist=dist_row_aux)
    1937              : 
    1938            0 :       CALL delete_unnecessary_files(bs_env)
    1939            0 :       CALL timestop(handle)
    1940              : 
    1941            0 :    END SUBROUTINE compute_Sigma_c
    1942              : 
    1943              : ! **************************************************************************************************
    1944              : !> \brief DBCSR Topology Generation
    1945              : !> \param matrix_template ...
    1946              : !> \param dim_type ...
    1947              : !> \param square_dist ...
    1948              : !> \param blk_sizes ...
    1949              : !> \param mapped_dist ...
    1950              : ! **************************************************************************************************
    1951              : 
    1952            0 :    SUBROUTINE setup_square_topology(matrix_template, dim_type, square_dist, blk_sizes, mapped_dist)
    1953              : 
    1954              :       TYPE(dbcsr_type), INTENT(IN)                       :: matrix_template
    1955              :       CHARACTER(LEN=*), INTENT(IN)                       :: dim_type
    1956              :       TYPE(dbcsr_distribution_type), INTENT(OUT)         :: square_dist
    1957              :       INTEGER, DIMENSION(:), INTENT(OUT), POINTER        :: blk_sizes, mapped_dist
    1958              : 
    1959              :       INTEGER                                            :: i, np
    1960            0 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk, col_dist, row_blk, row_dist
    1961              :       TYPE(dbcsr_distribution_type)                      :: dist_template
    1962              : 
    1963              :       CALL dbcsr_get_info(matrix_template, distribution=dist_template, &
    1964            0 :                           row_blk_size=row_blk, col_blk_size=col_blk)
    1965            0 :       CALL dbcsr_distribution_get(dist_template, row_dist=row_dist, col_dist=col_dist)
    1966              : 
    1967            0 :       IF (TRIM(dim_type) == 'ROW') THEN
    1968              :          ! Creates ROW x ROW (e.g., Grid x Grid from mat_phi_mu_l)
    1969            0 :          blk_sizes => row_blk
    1970            0 :          np = MAXVAL(col_dist) + 1  ! npcol
    1971            0 :          ALLOCATE (mapped_dist(SIZE(blk_sizes)))
    1972            0 :          DO i = 1, SIZE(blk_sizes)
    1973            0 :             mapped_dist(i) = MOD(i - 1, np)
    1974              :          END DO
    1975              :          CALL dbcsr_distribution_new(square_dist, template=dist_template, &
    1976            0 :                                      row_dist=row_dist, col_dist=mapped_dist)
    1977              : 
    1978            0 :       ELSE IF (TRIM(dim_type) == 'COL') THEN
    1979              :          ! Creates COL x COL (e.g., Aux x Aux from mat_Z_lP)
    1980            0 :          blk_sizes => col_blk
    1981            0 :          np = MAXVAL(row_dist) + 1  ! nprow
    1982            0 :          ALLOCATE (mapped_dist(SIZE(blk_sizes)))
    1983            0 :          DO i = 1, SIZE(blk_sizes)
    1984            0 :             mapped_dist(i) = MOD(i - 1, np)
    1985              :          END DO
    1986              :          CALL dbcsr_distribution_new(square_dist, template=dist_template, &
    1987            0 :                                      row_dist=mapped_dist, col_dist=col_dist)
    1988              :       END IF
    1989              : 
    1990            0 :    END SUBROUTINE setup_square_topology
    1991              : 
    1992              : ! **************************************************************************************************
    1993              : !> \brief DBCSR matrices deallocation
    1994              : !> \param dist ...
    1995              : !> \param mapped_dist    ...
    1996              : !> \param m1 ...
    1997              : !> \param m2 ...
    1998              : !> \param m3 ...
    1999              : !> \param m4 ...
    2000              : ! **************************************************************************************************
    2001              : 
    2002            0 :    SUBROUTINE release_dbcsr_topology_and_matrices(dist, mapped_dist, m1, m2, m3, m4)
    2003              : 
    2004              :       TYPE(dbcsr_distribution_type), INTENT(INOUT), &
    2005              :          OPTIONAL                                        :: dist
    2006              :       INTEGER, DIMENSION(:), INTENT(INOUT), OPTIONAL, &
    2007              :          POINTER                                         :: mapped_dist
    2008              :       TYPE(dbcsr_type), INTENT(INOUT), OPTIONAL          :: m1, m2, m3, m4
    2009              : 
    2010            0 :       IF (PRESENT(dist)) CALL dbcsr_distribution_release(dist)
    2011            0 :       IF (PRESENT(mapped_dist)) THEN
    2012            0 :          IF (ASSOCIATED(mapped_dist)) THEN
    2013            0 :             DEALLOCATE (mapped_dist)
    2014              :             NULLIFY (mapped_dist)
    2015              :          END IF
    2016              :       END IF
    2017            0 :       IF (PRESENT(m1)) CALL dbcsr_release(m1)
    2018            0 :       IF (PRESENT(m2)) CALL dbcsr_release(m2)
    2019            0 :       IF (PRESENT(m3)) CALL dbcsr_release(m3)
    2020            0 :       IF (PRESENT(m4)) CALL dbcsr_release(m4)
    2021              : 
    2022            0 :    END SUBROUTINE release_dbcsr_topology_and_matrices
    2023              : 
    2024              : END MODULE gw_large_cell_Gamma_ri_rs
        

Generated by: LCOV version 2.0-1