LCOV - code coverage report
Current view: top level - src - hfx_ri_kp.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:42dac4a) Lines: 97.1 % 2263 2198
Test Date: 2025-07-25 12:55:17 Functions: 92.9 % 42 39

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief RI-methods for HFX and K-points.
      10              : !> \auhtor Augustin Bussy (01.2023)
      11              : ! **************************************************************************************************
      12              : 
      13              : MODULE hfx_ri_kp
      14              :    USE admm_types,                      ONLY: get_admm_env
      15              :    USE atomic_kind_types,               ONLY: atomic_kind_type,&
      16              :                                               get_atomic_kind_set
      17              :    USE basis_set_types,                 ONLY: get_gto_basis_set,&
      18              :                                               gto_basis_set_p_type
      19              :    USE bibliography,                    ONLY: Bussy2024,&
      20              :                                               cite_reference
      21              :    USE cell_types,                      ONLY: cell_type,&
      22              :                                               pbc,&
      23              :                                               real_to_scaled,&
      24              :                                               scaled_to_real
      25              :    USE cp_array_utils,                  ONLY: cp_1d_logical_p_type,&
      26              :                                               cp_2d_r_p_type,&
      27              :                                               cp_3d_r_p_type
      28              :    USE cp_blacs_env,                    ONLY: cp_blacs_env_create,&
      29              :                                               cp_blacs_env_release,&
      30              :                                               cp_blacs_env_type
      31              :    USE cp_control_types,                ONLY: dft_control_type
      32              :    USE cp_dbcsr_api,                    ONLY: &
      33              :         dbcsr_add, dbcsr_clear, dbcsr_copy, dbcsr_create, dbcsr_distribution_get, &
      34              :         dbcsr_distribution_new, dbcsr_distribution_release, dbcsr_distribution_type, dbcsr_filter, &
      35              :         dbcsr_finalize, dbcsr_get_block_p, dbcsr_get_info, dbcsr_iterator_blocks_left, &
      36              :         dbcsr_iterator_next_block, dbcsr_iterator_start, dbcsr_iterator_stop, dbcsr_iterator_type, &
      37              :         dbcsr_p_type, dbcsr_put_block, dbcsr_release, dbcsr_type, dbcsr_type_no_symmetry, &
      38              :         dbcsr_type_symmetric
      39              :    USE cp_dbcsr_cholesky,               ONLY: cp_dbcsr_cholesky_decompose,&
      40              :                                               cp_dbcsr_cholesky_invert
      41              :    USE cp_dbcsr_contrib,                ONLY: dbcsr_dot
      42              :    USE cp_dbcsr_cp2k_link,              ONLY: cp_dbcsr_alloc_block_from_nbl
      43              :    USE cp_dbcsr_diag,                   ONLY: cp_dbcsr_power
      44              :    USE cp_dbcsr_operations,             ONLY: cp_dbcsr_dist2d_to_dist
      45              :    USE dbt_api,                         ONLY: &
      46              :         dbt_batched_contract_finalize, dbt_batched_contract_init, dbt_clear, dbt_contract, &
      47              :         dbt_copy, dbt_copy_matrix_to_tensor, dbt_copy_tensor_to_matrix, dbt_create, dbt_destroy, &
      48              :         dbt_distribution_destroy, dbt_distribution_new, dbt_distribution_type, dbt_filter, &
      49              :         dbt_finalize, dbt_get_block, dbt_get_info, dbt_get_stored_coordinates, &
      50              :         dbt_iterator_blocks_left, dbt_iterator_next_block, dbt_iterator_start, dbt_iterator_stop, &
      51              :         dbt_iterator_type, dbt_mp_environ_pgrid, dbt_pgrid_create, dbt_pgrid_destroy, &
      52              :         dbt_pgrid_type, dbt_put_block, dbt_scale, dbt_type
      53              :    USE distribution_2d_types,           ONLY: distribution_2d_release,&
      54              :                                               distribution_2d_type
      55              :    USE hfx_ri,                          ONLY: get_idx_to_atom,&
      56              :                                               hfx_ri_pre_scf_calc_tensors
      57              :    USE hfx_types,                       ONLY: hfx_ri_type
      58              :    USE input_constants,                 ONLY: do_potential_short,&
      59              :                                               hfx_ri_do_2c_cholesky,&
      60              :                                               hfx_ri_do_2c_diag,&
      61              :                                               hfx_ri_do_2c_iter
      62              :    USE input_cp2k_hfx,                  ONLY: ri_pmat
      63              :    USE input_section_types,             ONLY: section_vals_get_subs_vals,&
      64              :                                               section_vals_type,&
      65              :                                               section_vals_val_get,&
      66              :                                               section_vals_val_set
      67              :    USE iterate_matrix,                  ONLY: invert_hotelling
      68              :    USE kinds,                           ONLY: default_string_length,&
      69              :                                               dp,&
      70              :                                               int_8
      71              :    USE kpoint_types,                    ONLY: get_kpoint_info,&
      72              :                                               kpoint_type
      73              :    USE libint_2c_3c,                    ONLY: cutoff_screen_factor
      74              :    USE machine,                         ONLY: m_flush,&
      75              :                                               m_memory,&
      76              :                                               m_walltime
      77              :    USE mathlib,                         ONLY: erfc_cutoff
      78              :    USE message_passing,                 ONLY: mp_cart_type,&
      79              :                                               mp_para_env_type,&
      80              :                                               mp_request_type,&
      81              :                                               mp_waitall
      82              :    USE particle_methods,                ONLY: get_particle_set
      83              :    USE particle_types,                  ONLY: particle_type
      84              :    USE physcon,                         ONLY: angstrom
      85              :    USE qs_environment_types,            ONLY: get_qs_env,&
      86              :                                               qs_environment_type
      87              :    USE qs_force_types,                  ONLY: qs_force_type
      88              :    USE qs_integral_utils,               ONLY: basis_set_list_setup
      89              :    USE qs_interactions,                 ONLY: init_interaction_radii_orb_basis
      90              :    USE qs_kind_types,                   ONLY: qs_kind_type
      91              :    USE qs_neighbor_list_types,          ONLY: get_iterator_info,&
      92              :                                               neighbor_list_iterate,&
      93              :                                               neighbor_list_iterator_create,&
      94              :                                               neighbor_list_iterator_p_type,&
      95              :                                               neighbor_list_iterator_release,&
      96              :                                               neighbor_list_set_p_type,&
      97              :                                               release_neighbor_list_sets
      98              :    USE qs_scf_types,                    ONLY: qs_scf_env_type
      99              :    USE qs_tensors,                      ONLY: &
     100              :         build_2c_derivatives, build_2c_neighbor_lists, build_3c_derivatives, &
     101              :         build_3c_neighbor_lists, get_3c_iterator_info, get_tensor_occupancy, &
     102              :         neighbor_list_3c_destroy, neighbor_list_3c_iterate, neighbor_list_3c_iterator_create, &
     103              :         neighbor_list_3c_iterator_destroy
     104              :    USE qs_tensors_types,                ONLY: create_2c_tensor,&
     105              :                                               create_3c_tensor,&
     106              :                                               create_tensor_batches,&
     107              :                                               distribution_2d_create,&
     108              :                                               distribution_3d_create,&
     109              :                                               distribution_3d_type,&
     110              :                                               neighbor_list_3c_iterator_type,&
     111              :                                               neighbor_list_3c_type
     112              :    USE util,                            ONLY: get_limit
     113              :    USE virial_types,                    ONLY: virial_type
     114              : #include "./base/base_uses.f90"
     115              : 
     116              : !$ USE OMP_LIB, ONLY: omp_get_num_threads
     117              : 
     118              :    IMPLICIT NONE
     119              :    PRIVATE
     120              : 
     121              :    PUBLIC :: hfx_ri_update_ks_kp, hfx_ri_update_forces_kp
     122              : 
     123              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'hfx_ri_kp'
     124              : CONTAINS
     125              : 
     126              : ! **************************************************************************************************
     127              : !> \brief I_1nitialize the ri_data for K-point. For now, we take the normal, usual existing ri_data
     128              : !>        and we adapt it to our needs
     129              : !> \param dbcsr_template ...
     130              : !> \param ri_data ...
     131              : !> \param qs_env ...
     132              : ! **************************************************************************************************
     133           80 :    SUBROUTINE adapt_ri_data_to_kp(dbcsr_template, ri_data, qs_env)
     134              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: dbcsr_template
     135              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
     136              :       TYPE(qs_environment_type), POINTER                 :: qs_env
     137              : 
     138              :       INTEGER                                            :: i_img, i_RI, i_spin, iatom, natom, &
     139              :                                                             nblks_RI, nimg, nkind, nspins
     140           80 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: bsizes_RI_ext, dist1, dist2, dist3
     141              :       TYPE(dft_control_type), POINTER                    :: dft_control
     142              :       TYPE(mp_para_env_type), POINTER                    :: para_env
     143              : 
     144           80 :       NULLIFY (dft_control, para_env)
     145              : 
     146              :       !The main thing that we need to do is to allocate more space for the integrals, such that there
     147              :       !is room for each periodic image. Note that we only go in 1D, i.e. we store (mu^0 sigma^a|P^0),
     148              :       !and (P^0|Q^a) => the RI basis is always in the main cell.
     149              : 
     150              :       !Get kpoint info
     151           80 :       CALL get_qs_env(qs_env, dft_control=dft_control, natom=natom, para_env=para_env, nkind=nkind)
     152           80 :       nimg = ri_data%nimg
     153              : 
     154              :       !Along the RI direction we have basis elements spread accross ncell_RI images.
     155           80 :       nblks_RI = SIZE(ri_data%bsizes_RI_split)
     156          240 :       ALLOCATE (bsizes_RI_ext(nblks_RI*ri_data%ncell_RI))
     157          562 :       DO i_RI = 1, ri_data%ncell_RI
     158         2744 :          bsizes_RI_ext((i_RI - 1)*nblks_RI + 1:i_RI*nblks_RI) = ri_data%bsizes_RI_split(:)
     159              :       END DO
     160              : 
     161         4876 :       ALLOCATE (ri_data%t_3c_int_ctr_1(1, nimg))
     162              :       CALL create_3c_tensor(ri_data%t_3c_int_ctr_1(1, 1), dist1, dist2, dist3, &
     163              :                             ri_data%pgrid_1, ri_data%bsizes_AO_split, bsizes_RI_ext, &
     164           80 :                             ri_data%bsizes_AO_split, [1, 2], [3], name="(AO RI | AO)")
     165              : 
     166         1998 :       DO i_img = 2, nimg
     167         1998 :          CALL dbt_create(ri_data%t_3c_int_ctr_1(1, 1), ri_data%t_3c_int_ctr_1(1, i_img))
     168              :       END DO
     169           80 :       DEALLOCATE (dist1, dist2, dist3)
     170              : 
     171          880 :       ALLOCATE (ri_data%t_3c_int_ctr_2(1, 1))
     172              :       CALL create_3c_tensor(ri_data%t_3c_int_ctr_2(1, 1), dist1, dist2, dist3, &
     173              :                             ri_data%pgrid_1, ri_data%bsizes_AO_split, bsizes_RI_ext, &
     174           80 :                             ri_data%bsizes_AO_split, [1], [2, 3], name="(AO RI | AO)")
     175           80 :       DEALLOCATE (dist1, dist2, dist3)
     176              : 
     177              :       !We use full block sizes for the 2c quantities
     178           80 :       DEALLOCATE (bsizes_RI_ext)
     179           80 :       nblks_RI = SIZE(ri_data%bsizes_RI)
     180          240 :       ALLOCATE (bsizes_RI_ext(nblks_RI*ri_data%ncell_RI))
     181          562 :       DO i_RI = 1, ri_data%ncell_RI
     182         1526 :          bsizes_RI_ext((i_RI - 1)*nblks_RI + 1:i_RI*nblks_RI) = ri_data%bsizes_RI(:)
     183              :       END DO
     184              : 
     185         3440 :       ALLOCATE (ri_data%t_2c_inv(1, natom), ri_data%t_2c_int(1, natom), ri_data%t_2c_pot(1, natom))
     186              :       CALL create_2c_tensor(ri_data%t_2c_inv(1, 1), dist1, dist2, ri_data%pgrid_2d, &
     187              :                             bsizes_RI_ext, bsizes_RI_ext, &
     188           80 :                             name="(RI | RI)")
     189           80 :       DEALLOCATE (dist1, dist2)
     190           80 :       CALL dbt_create(ri_data%t_2c_inv(1, 1), ri_data%t_2c_int(1, 1))
     191           80 :       CALL dbt_create(ri_data%t_2c_inv(1, 1), ri_data%t_2c_pot(1, 1))
     192          160 :       DO iatom = 2, natom
     193           80 :          CALL dbt_create(ri_data%t_2c_inv(1, 1), ri_data%t_2c_inv(1, iatom))
     194           80 :          CALL dbt_create(ri_data%t_2c_inv(1, 1), ri_data%t_2c_int(1, iatom))
     195          160 :          CALL dbt_create(ri_data%t_2c_inv(1, 1), ri_data%t_2c_pot(1, iatom))
     196              :       END DO
     197              : 
     198          400 :       ALLOCATE (ri_data%kp_cost(natom, natom, nimg))
     199        14066 :       ri_data%kp_cost = 0.0_dp
     200              : 
     201              :       !We store the density and KS matrix in tensor format
     202           80 :       nspins = dft_control%nspins
     203        10604 :       ALLOCATE (ri_data%rho_ao_t(nspins, nimg), ri_data%ks_t(nspins, nimg))
     204              :       CALL create_2c_tensor(ri_data%rho_ao_t(1, 1), dist1, dist2, ri_data%pgrid_2d, &
     205              :                             ri_data%bsizes_AO_split, ri_data%bsizes_AO_split, &
     206           80 :                             name="(AO | AO)")
     207           80 :       DEALLOCATE (dist1, dist2)
     208              : 
     209           80 :       CALL dbt_create(dbcsr_template, ri_data%ks_t(1, 1))
     210              : 
     211           80 :       IF (nspins == 2) THEN
     212           26 :          CALL dbt_create(ri_data%rho_ao_t(1, 1), ri_data%rho_ao_t(2, 1))
     213           26 :          CALL dbt_create(ri_data%ks_t(1, 1), ri_data%ks_t(2, 1))
     214              :       END IF
     215         1998 :       DO i_img = 2, nimg
     216         4276 :          DO i_spin = 1, nspins
     217         2278 :             CALL dbt_create(ri_data%rho_ao_t(1, 1), ri_data%rho_ao_t(i_spin, i_img))
     218         4196 :             CALL dbt_create(ri_data%ks_t(1, 1), ri_data%ks_t(i_spin, i_img))
     219              :          END DO
     220              :       END DO
     221              : 
     222          240 :    END SUBROUTINE adapt_ri_data_to_kp
     223              : 
     224              : ! **************************************************************************************************
     225              : !> \brief The pre-scf steps for RI-HFX k-points calculation. Namely the calculation of the integrals
     226              : !> \param dbcsr_template ...
     227              : !> \param ri_data ...
     228              : !> \param qs_env ...
     229              : ! **************************************************************************************************
     230           80 :    SUBROUTINE hfx_ri_pre_scf_kp(dbcsr_template, ri_data, qs_env)
     231              :       TYPE(dbcsr_type), INTENT(INOUT)                    :: dbcsr_template
     232              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
     233              :       TYPE(qs_environment_type), POINTER                 :: qs_env
     234              : 
     235              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'hfx_ri_pre_scf_kp'
     236              : 
     237              :       INTEGER                                            :: handle, i_img, iatom, natom, nimg, nkind
     238           80 :       TYPE(dbcsr_type), ALLOCATABLE, DIMENSION(:)        :: t_2c_op_pot, t_2c_op_RI
     239           80 :       TYPE(dbt_type), ALLOCATABLE, DIMENSION(:, :)       :: t_3c_int
     240              :       TYPE(dft_control_type), POINTER                    :: dft_control
     241              : 
     242           80 :       NULLIFY (dft_control)
     243              : 
     244           80 :       CALL timeset(routineN, handle)
     245              : 
     246           80 :       CALL get_qs_env(qs_env, dft_control=dft_control, natom=natom, nkind=nkind)
     247              : 
     248           80 :       CALL cleanup_kp(ri_data)
     249              : 
     250              :       !We do all the checks on what we allow in this initial implementation
     251           80 :       IF (ri_data%flavor .NE. ri_pmat) CPABORT("K-points RI-HFX only with RHO flavor")
     252           80 :       IF (ri_data%same_op) ri_data%same_op = .FALSE. !force the full calculation with RI metric
     253           80 :       IF (ABS(ri_data%eps_pgf_orb - dft_control%qs_control%eps_pgf_orb) > 1.0E-16_dp) &
     254            0 :          CPABORT("RI%EPS_PGF_ORB and QS%EPS_PGF_ORB must be identical for RI-HFX k-points")
     255              : 
     256           80 :       CALL get_kp_and_ri_images(ri_data, qs_env)
     257           80 :       nimg = ri_data%nimg
     258              : 
     259              :       !Calculate the integrals
     260         4316 :       ALLOCATE (t_2c_op_pot(nimg), t_2c_op_RI(nimg))
     261         4876 :       ALLOCATE (t_3c_int(1, nimg))
     262           80 :       CALL hfx_ri_pre_scf_calc_tensors(qs_env, ri_data, t_2c_op_RI, t_2c_op_pot, t_3c_int, do_kpoints=.TRUE.)
     263              : 
     264              :       !Make sure the internals have the k-point format
     265           80 :       CALL adapt_ri_data_to_kp(dbcsr_template, ri_data, qs_env)
     266              : 
     267              :       !For each atom i, we calculate the inverse RI metric (P^0 | Q^0)^-1 without external bumping yet
     268              :       !Also store the off-diagonal integrals of the RI metric in case of forces, bumped from the left
     269          240 :       DO iatom = 1, natom
     270              :          CALL get_ext_2c_int(ri_data%t_2c_inv(1, iatom), t_2c_op_RI, iatom, iatom, 1, ri_data, qs_env, &
     271          160 :                              do_inverse=.TRUE.)
     272              :          !for the forces:
     273              :          !off-diagonl RI metric bumped from the left
     274              :          CALL get_ext_2c_int(ri_data%t_2c_int(1, iatom), t_2c_op_RI, iatom, iatom, 1, ri_data, &
     275          160 :                              qs_env, off_diagonal=.TRUE.)
     276          160 :          CALL apply_bump(ri_data%t_2c_int(1, iatom), iatom, ri_data, qs_env, from_left=.TRUE., from_right=.FALSE.)
     277              : 
     278              :          !RI metric with bumped off-diagonal blocks (but not inverted), depumed from left and right
     279              :          CALL get_ext_2c_int(ri_data%t_2c_pot(1, iatom), t_2c_op_RI, iatom, iatom, 1, ri_data, qs_env, &
     280          160 :                              do_inverse=.TRUE., skip_inverse=.TRUE.)
     281              :          CALL apply_bump(ri_data%t_2c_pot(1, iatom), iatom, ri_data, qs_env, from_left=.TRUE., &
     282          240 :                          from_right=.TRUE., debump=.TRUE.)
     283              : 
     284              :       END DO
     285              : 
     286         2078 :       DO i_img = 1, nimg
     287         2078 :          CALL dbcsr_release(t_2c_op_RI(i_img))
     288              :       END DO
     289              : 
     290         4156 :       ALLOCATE (ri_data%kp_mat_2c_pot(1, nimg))
     291         2078 :       DO i_img = 1, nimg
     292         1998 :          CALL dbcsr_create(ri_data%kp_mat_2c_pot(1, i_img), template=t_2c_op_pot(i_img))
     293         1998 :          CALL dbcsr_copy(ri_data%kp_mat_2c_pot(1, i_img), t_2c_op_pot(i_img))
     294         2078 :          CALL dbcsr_release(t_2c_op_pot(i_img))
     295              :       END DO
     296              : 
     297              :       !reorder the 3c integrals such that empty images are bunched up together
     298           80 :       CALL reorder_3c_ints(t_3c_int(1, :), ri_data)
     299              : 
     300              :       !Pre-contract all 3c integrals with the bumped inverse RI metric (P^0|Q^0)^-1,
     301              :       !and store in ri_data%t_3c_int_ctr_1
     302           80 :       CALL precontract_3c_ints(t_3c_int, ri_data, qs_env)
     303              : 
     304           80 :       CALL timestop(handle)
     305              : 
     306         2158 :    END SUBROUTINE hfx_ri_pre_scf_kp
     307              : 
     308              : ! **************************************************************************************************
     309              : !> \brief clean-up the KP specific data from ri_data
     310              : !> \param ri_data ...
     311              : ! **************************************************************************************************
     312           80 :    SUBROUTINE cleanup_kp(ri_data)
     313              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
     314              : 
     315              :       INTEGER                                            :: i, j
     316              : 
     317           80 :       IF (ALLOCATED(ri_data%kp_cost)) DEALLOCATE (ri_data%kp_cost)
     318           80 :       IF (ALLOCATED(ri_data%idx_to_img)) DEALLOCATE (ri_data%idx_to_img)
     319           80 :       IF (ALLOCATED(ri_data%img_to_idx)) DEALLOCATE (ri_data%img_to_idx)
     320           80 :       IF (ALLOCATED(ri_data%present_images)) DEALLOCATE (ri_data%present_images)
     321           80 :       IF (ALLOCATED(ri_data%img_to_RI_cell)) DEALLOCATE (ri_data%img_to_RI_cell)
     322           80 :       IF (ALLOCATED(ri_data%RI_cell_to_img)) DEALLOCATE (ri_data%RI_cell_to_img)
     323              : 
     324           80 :       IF (ALLOCATED(ri_data%kp_mat_2c_pot)) THEN
     325          716 :          DO j = 1, SIZE(ri_data%kp_mat_2c_pot, 2)
     326         1406 :             DO i = 1, SIZE(ri_data%kp_mat_2c_pot, 1)
     327         1380 :                CALL dbcsr_release(ri_data%kp_mat_2c_pot(i, j))
     328              :             END DO
     329              :          END DO
     330           26 :          DEALLOCATE (ri_data%kp_mat_2c_pot)
     331              :       END IF
     332              : 
     333           80 :       IF (ALLOCATED(ri_data%kp_t_3c_int)) THEN
     334          716 :          DO i = 1, SIZE(ri_data%kp_t_3c_int)
     335          716 :             CALL dbt_destroy(ri_data%kp_t_3c_int(i))
     336              :          END DO
     337          716 :          DEALLOCATE (ri_data%kp_t_3c_int)
     338              :       END IF
     339              : 
     340           80 :       IF (ALLOCATED(ri_data%t_2c_inv)) THEN
     341          186 :          DO j = 1, SIZE(ri_data%t_2c_inv, 2)
     342          292 :             DO i = 1, SIZE(ri_data%t_2c_inv, 1)
     343          212 :                CALL dbt_destroy(ri_data%t_2c_inv(i, j))
     344              :             END DO
     345              :          END DO
     346          186 :          DEALLOCATE (ri_data%t_2c_inv)
     347              :       END IF
     348              : 
     349           80 :       IF (ALLOCATED(ri_data%t_2c_int)) THEN
     350          186 :          DO j = 1, SIZE(ri_data%t_2c_int, 2)
     351          292 :             DO i = 1, SIZE(ri_data%t_2c_int, 1)
     352          212 :                CALL dbt_destroy(ri_data%t_2c_int(i, j))
     353              :             END DO
     354              :          END DO
     355          186 :          DEALLOCATE (ri_data%t_2c_int)
     356              :       END IF
     357              : 
     358           80 :       IF (ALLOCATED(ri_data%t_2c_pot)) THEN
     359          186 :          DO j = 1, SIZE(ri_data%t_2c_pot, 2)
     360          292 :             DO i = 1, SIZE(ri_data%t_2c_pot, 1)
     361          212 :                CALL dbt_destroy(ri_data%t_2c_pot(i, j))
     362              :             END DO
     363              :          END DO
     364          186 :          DEALLOCATE (ri_data%t_2c_pot)
     365              :       END IF
     366              : 
     367           80 :       IF (ALLOCATED(ri_data%t_3c_int_ctr_1)) THEN
     368          824 :          DO j = 1, SIZE(ri_data%t_3c_int_ctr_1, 2)
     369         1568 :             DO i = 1, SIZE(ri_data%t_3c_int_ctr_1, 1)
     370         1488 :                CALL dbt_destroy(ri_data%t_3c_int_ctr_1(i, j))
     371              :             END DO
     372              :          END DO
     373          824 :          DEALLOCATE (ri_data%t_3c_int_ctr_1)
     374              :       END IF
     375              : 
     376           80 :       IF (ALLOCATED(ri_data%t_3c_int_ctr_2)) THEN
     377          160 :          DO j = 1, SIZE(ri_data%t_3c_int_ctr_2, 2)
     378          240 :             DO i = 1, SIZE(ri_data%t_3c_int_ctr_2, 1)
     379          160 :                CALL dbt_destroy(ri_data%t_3c_int_ctr_2(i, j))
     380              :             END DO
     381              :          END DO
     382          160 :          DEALLOCATE (ri_data%t_3c_int_ctr_2)
     383              :       END IF
     384              : 
     385           80 :       IF (ALLOCATED(ri_data%rho_ao_t)) THEN
     386          824 :          DO j = 1, SIZE(ri_data%rho_ao_t, 2)
     387         1804 :             DO i = 1, SIZE(ri_data%rho_ao_t, 1)
     388         1724 :                CALL dbt_destroy(ri_data%rho_ao_t(i, j))
     389              :             END DO
     390              :          END DO
     391         1060 :          DEALLOCATE (ri_data%rho_ao_t)
     392              :       END IF
     393              : 
     394           80 :       IF (ALLOCATED(ri_data%ks_t)) THEN
     395          824 :          DO j = 1, SIZE(ri_data%ks_t, 2)
     396         1804 :             DO i = 1, SIZE(ri_data%ks_t, 1)
     397         1724 :                CALL dbt_destroy(ri_data%ks_t(i, j))
     398              :             END DO
     399              :          END DO
     400         1060 :          DEALLOCATE (ri_data%ks_t)
     401              :       END IF
     402              : 
     403           80 :    END SUBROUTINE cleanup_kp
     404              : 
     405              : ! **************************************************************************************************
     406              : !> \brief Prints a progress bar for the k-point RI-HFX triple loop
     407              : !> \param b_img ...
     408              : !> \param nimg ...
     409              : !> \param iprint ...
     410              : !> \param ri_data ...
     411              : ! **************************************************************************************************
     412            0 :    SUBROUTINE print_progress_bar(b_img, nimg, iprint, ri_data)
     413              :       INTEGER, INTENT(IN)                                :: b_img, nimg
     414              :       INTEGER, INTENT(INOUT)                             :: iprint
     415              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
     416              : 
     417              :       CHARACTER(LEN=default_string_length)               :: bar
     418              :       INTEGER                                            :: rep
     419              : 
     420            0 :       IF (ri_data%unit_nr > 0) THEN
     421            0 :          IF (b_img == 1) THEN
     422            0 :             WRITE (ri_data%unit_nr, '(/T6,A)', advance="no") '[-'
     423            0 :             CALL m_flush(ri_data%unit_nr)
     424              :          END IF
     425            0 :          IF (b_img > iprint*nimg/71) THEN
     426            0 :             rep = MAX(1, 71/nimg)
     427            0 :             bar = REPEAT("-", rep)
     428            0 :             WRITE (ri_data%unit_nr, '(A)', advance="no") TRIM(bar)
     429            0 :             CALL m_flush(ri_data%unit_nr)
     430            0 :             iprint = iprint + 1
     431              :          END IF
     432            0 :          IF (b_img == nimg) THEN
     433            0 :             rep = MAX(0, 1 + 71 - iprint*rep)
     434            0 :             bar = REPEAT("-", rep)
     435            0 :             WRITE (ri_data%unit_nr, '(A,A)') TRIM(bar), '-]'
     436            0 :             CALL m_flush(ri_data%unit_nr)
     437              :          END IF
     438              :       END IF
     439              : 
     440            0 :    END SUBROUTINE print_progress_bar
     441              : 
     442              : ! **************************************************************************************************
     443              : !> \brief Update the KS matrices for each real-space image
     444              : !> \param qs_env ...
     445              : !> \param ri_data ...
     446              : !> \param ks_matrix ...
     447              : !> \param ehfx ...
     448              : !> \param rho_ao ...
     449              : !> \param geometry_did_change ...
     450              : !> \param nspins ...
     451              : !> \param hf_fraction ...
     452              : ! **************************************************************************************************
     453          248 :    SUBROUTINE hfx_ri_update_ks_kp(qs_env, ri_data, ks_matrix, ehfx, rho_ao, &
     454              :                                   geometry_did_change, nspins, hf_fraction)
     455              : 
     456              :       TYPE(qs_environment_type), POINTER                 :: qs_env
     457              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
     458              :       TYPE(dbcsr_p_type), DIMENSION(:, :), POINTER       :: ks_matrix
     459              :       REAL(KIND=dp), INTENT(OUT)                         :: ehfx
     460              :       TYPE(dbcsr_p_type), DIMENSION(:, :), POINTER       :: rho_ao
     461              :       LOGICAL, INTENT(IN)                                :: geometry_did_change
     462              :       INTEGER, INTENT(IN)                                :: nspins
     463              :       REAL(KIND=dp), INTENT(IN)                          :: hf_fraction
     464              : 
     465              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_ri_update_ks_kp'
     466              : 
     467              :       INTEGER :: b_img, batch_size, group_size, handle, handle2, i_batch, i_img, i_spin, iatom, &
     468              :          iblk, igroup, iprint, jatom, mb_img, n_batch_nze, n_nze, natom, ngroups, nimg, nimg_nze
     469              :       INTEGER(int_8)                                     :: mem, nflop, nze
     470          248 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: batch_ranges_at, batch_ranges_nze, &
     471          248 :                                                             idx_to_at_AO
     472          248 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: iapc_pairs
     473          248 :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: sparsity_pattern
     474              :       LOGICAL                                            :: estimate_mem, print_progress, use_delta_p
     475              :       REAL(dp)                                           :: etmp, fac, occ, pfac, pref, t1, t2, t3, &
     476              :                                                             t4
     477              :       TYPE(cp_blacs_env_type), POINTER                   :: blacs_env_sub
     478              :       TYPE(dbcsr_type)                                   :: ks_desymm, rho_desymm, tmp
     479          248 :       TYPE(dbcsr_type), ALLOCATABLE, DIMENSION(:)        :: mat_2c_pot
     480              :       TYPE(dbcsr_type), POINTER                          :: dbcsr_template
     481          248 :       TYPE(dbt_type), ALLOCATABLE, DIMENSION(:)          :: ks_t_split, t_2c_ao_tmp, t_2c_work, &
     482          248 :                                                             t_3c_int, t_3c_work_2, t_3c_work_3
     483          248 :       TYPE(dbt_type), ALLOCATABLE, DIMENSION(:, :)       :: ks_t, ks_t_sub, t_3c_apc, t_3c_apc_sub
     484              :       TYPE(mp_para_env_type), POINTER                    :: para_env, para_env_sub
     485              :       TYPE(section_vals_type), POINTER                   :: hfx_section, print_section
     486              : 
     487          248 :       NULLIFY (para_env, para_env_sub, blacs_env_sub, hfx_section, dbcsr_template, print_section)
     488              : 
     489          248 :       CALL cite_reference(Bussy2024)
     490              : 
     491          248 :       CALL timeset(routineN, handle)
     492              : 
     493          248 :       CALL get_qs_env(qs_env, para_env=para_env, natom=natom)
     494              : 
     495          248 :       IF (nspins == 1) THEN
     496          164 :          fac = 0.5_dp*hf_fraction
     497              :       ELSE
     498           84 :          fac = 1.0_dp*hf_fraction
     499              :       END IF
     500              : 
     501          248 :       hfx_section => section_vals_get_subs_vals(qs_env%input, "DFT%XC%HF%RI")
     502          248 :       CALL section_vals_val_get(hfx_section, "KP_NGROUPS", i_val=ngroups)
     503          248 :       CALL section_vals_val_get(hfx_section, "KP_STACK_SIZE", i_val=batch_size)
     504          248 :       CALL section_vals_val_get(hfx_section, "KP_USE_DELTA_P", l_val=use_delta_p)
     505          248 :       ri_data%kp_stack_size = batch_size
     506          248 :       ri_data%kp_ngroups = ngroups
     507              : 
     508          248 :       IF (geometry_did_change) THEN
     509           80 :          CALL hfx_ri_pre_scf_kp(ks_matrix(1, 1)%matrix, ri_data, qs_env)
     510              :       END IF
     511          248 :       nimg = ri_data%nimg
     512          248 :       nimg_nze = ri_data%nimg_nze
     513              : 
     514              :       !We need to calculate the KS matrix for each periodic cell with index b: F_mu^0,nu^b
     515              :       !F_mu^0,nu^b = -0.5 sum_a,c P_sigma^0,lambda^c (mu^0, sigma^a| P^0) V_P^0,Q^b (Q^b| nu^b lambda^a+c)
     516              :       !with V_P^0,Q^b = (P^0|R^0)^-1 * (R^0|S^b) * (S^b|Q^b)^-1
     517              : 
     518              :       !We use a local RI basis set for each atom in the system, which inlcudes RI basis elements for
     519              :       !each neighboring atom standing within the KIND radius (decay of Gaussian with smallest exponent)
     520              : 
     521              :       !We also limit the number of periodic images we consider accorrding to the HFX potentail in the
     522              :       !RI basis, because if V_P^0,Q^b is zero everywhere, then image b can be ignored (RI basis less diffuse)
     523              : 
     524              :       !We manage to calculate each KS matrix doing a double loop on iamges, and a double loop on atoms
     525              :       !First, we pre-contract and store P_sigma^0,lambda^c (mu^0, sigma^a| P^0) (P^0|R^0)^-1 into T_mu^0,lambda^a+c,P^0
     526              :       !Then, we loop over b_img, iatom, jatom to get (R^0|S^b)
     527              :       !Finally, we do an additional loop over a+c images where we do (R^0|S^b) (S^b|Q^b)^-1 (Q^b| nu^b lambda^a+c)
     528              :       !and the final contraction with T_mu^0,lambda^a+c,P^0
     529              : 
     530              :       !Note that the 3-center integrals are pre-contracted with the RI metric, and that the same tensor can be used
     531              :       !(mu^0, sigma^a| P^0) (P^0|R^0)  <===> (S^b|Q^b)^-1 (Q^b| nu^b lambda^a+c) by relabelling the images
     532              : 
     533              :       !By default, build the density tensor based on the difference of this SCF P and that of the prev. SCF
     534          248 :       pfac = -1.0_dp
     535          248 :       IF (.NOT. use_delta_p) pfac = 0.0_dp
     536          248 :       CALL get_pmat_images(ri_data%rho_ao_t, rho_ao, pfac, ri_data, qs_env)
     537              : 
     538          248 :       n_nze = 0
     539         7034 :       DO i_img = 1, nimg
     540        15386 :          DO i_spin = 1, nspins
     541         8352 :             CALL get_tensor_occupancy(ri_data%rho_ao_t(i_spin, i_img), nze, occ)
     542        15138 :             IF (nze > 0) THEN
     543         6678 :                n_nze = n_nze + 1
     544              :             END IF
     545              :          END DO
     546              :       END DO
     547          248 :       IF (n_nze == nspins) THEN
     548           32 :          CPWARN("It is highly recommended to restart from a converged GGA K-point calculations.")
     549              :       END IF
     550              : 
     551        18114 :       ALLOCATE (ks_t(nspins, nimg))
     552         7034 :       DO i_img = 1, nimg
     553        15386 :          DO i_spin = 1, nspins
     554        15138 :             CALL dbt_create(ri_data%ks_t(1, 1), ks_t(i_spin, i_img))
     555              :          END DO
     556              :       END DO
     557              : 
     558          744 :       ALLOCATE (idx_to_at_AO(SIZE(ri_data%bsizes_AO_split)))
     559          248 :       CALL get_idx_to_atom(idx_to_at_AO, ri_data%bsizes_AO_split, ri_data%bsizes_AO)
     560              : 
     561              :       !First we calculate and store T^1_mu^0,lambda^a+c,P = P_mu^0,lambda^c * (mu_0 sigma^a | P^0) (P^0|R^0)^-1
     562              :       !To avoid doing nimg**2 tiny contractions that do not scale well with a large number of CPUs,
     563              :       !we instead do a single loop over the a+c image index. For each a+c, we get a list of allowed
     564              :       !combination of a,c indices. Then we build TAS tensors P_mu^0,lambda^c with all concerned c's
     565              :       !and (mu^0 sigma^a | P^0)*(P^0|R^0)^-1 with all a's. Then we perform a single contraction with larger tensors,
     566              :       !were the sum over a,c is automatically taken care of
     567        17866 :       ALLOCATE (t_3c_apc(nspins, nimg))
     568         7034 :       DO i_img = 1, nimg
     569        15386 :          DO i_spin = 1, nspins
     570        15138 :             CALL dbt_create(ri_data%t_3c_int_ctr_2(1, 1), t_3c_apc(i_spin, i_img))
     571              :          END DO
     572              :       END DO
     573          248 :       CALL contract_pmat_3c(t_3c_apc, ri_data%rho_ao_t, ri_data, qs_env)
     574              : 
     575          248 :       IF (MOD(para_env%num_pe, ngroups) .NE. 0) THEN
     576            0 :          CPWARN("KP_NGROUPS must be an integer divisor of the total number of MPI ranks. It was set to 1.")
     577            0 :          ngroups = 1
     578            0 :          CALL section_vals_val_set(hfx_section, "KP_NGROUPS", i_val=ngroups)
     579              :       END IF
     580          248 :       IF ((MOD(ngroups, natom) .NE. 0) .AND. (MOD(natom, ngroups) .NE. 0) .AND. geometry_did_change) THEN
     581            0 :          IF (ngroups > 1) THEN
     582            0 :             CPWARN("Better load balancing is reached if NGROUPS is a multiple/divisor of the number of atoms")
     583              :          END IF
     584              :       END IF
     585          248 :       group_size = para_env%num_pe/ngroups
     586          248 :       igroup = para_env%mepos/group_size
     587              : 
     588          248 :       ALLOCATE (para_env_sub)
     589          248 :       CALL para_env_sub%from_split(para_env, igroup)
     590          248 :       CALL cp_blacs_env_create(blacs_env_sub, para_env_sub)
     591              : 
     592              :       ! The sparsity pattern of each iatom, jatom pair, on each b_img, and on which subgroup
     593         1240 :       ALLOCATE (sparsity_pattern(natom, natom, nimg))
     594          248 :       CALL get_sparsity_pattern(sparsity_pattern, ri_data, qs_env)
     595          248 :       CALL get_sub_dist(sparsity_pattern, ngroups, ri_data)
     596              : 
     597              :       !Get all the required tensors in the subgroups
     598        32588 :       ALLOCATE (mat_2c_pot(nimg), ks_t_sub(nspins, nimg), t_2c_ao_tmp(1), ks_t_split(2), t_2c_work(3))
     599              :       CALL get_subgroup_2c_tensors(mat_2c_pot, t_2c_work, t_2c_ao_tmp, ks_t_split, ks_t_sub, &
     600          248 :                                    group_size, ngroups, para_env, para_env_sub, ri_data)
     601              : 
     602        32836 :       ALLOCATE (t_3c_int(nimg), t_3c_apc_sub(nspins, nimg), t_3c_work_2(3), t_3c_work_3(3))
     603              :       CALL get_subgroup_3c_tensors(t_3c_int, t_3c_work_2, t_3c_work_3, t_3c_apc, t_3c_apc_sub, &
     604          248 :                                    group_size, ngroups, para_env, para_env_sub, ri_data)
     605              : 
     606              :       !We go atom by atom, therefore there is an automatic batching along that direction
     607              :       !Also, because we stack the 3c tensors nimg times, we naturally do some batching there too
     608          744 :       ALLOCATE (batch_ranges_at(natom + 1))
     609          248 :       batch_ranges_at(natom + 1) = SIZE(ri_data%bsizes_AO_split) + 1
     610          248 :       iatom = 0
     611         1138 :       DO iblk = 1, SIZE(ri_data%bsizes_AO_split)
     612         1138 :          IF (idx_to_at_AO(iblk) == iatom + 1) THEN
     613          496 :             iatom = iatom + 1
     614          496 :             batch_ranges_at(iatom) = iblk
     615              :          END IF
     616              :       END DO
     617              : 
     618          248 :       n_batch_nze = nimg_nze/batch_size
     619          248 :       IF (MODULO(nimg_nze, batch_size) .NE. 0) n_batch_nze = n_batch_nze + 1
     620          744 :       ALLOCATE (batch_ranges_nze(n_batch_nze + 1))
     621          608 :       DO i_batch = 1, n_batch_nze
     622          608 :          batch_ranges_nze(i_batch) = (i_batch - 1)*batch_size + 1
     623              :       END DO
     624          248 :       batch_ranges_nze(n_batch_nze + 1) = nimg_nze + 1
     625              : 
     626          248 :       print_section => section_vals_get_subs_vals(qs_env%input, "DFT%XC%HF%RI%PRINT")
     627          248 :       CALL section_vals_val_get(print_section, "KP_RI_PROGRESS_BAR", l_val=print_progress)
     628          248 :       CALL section_vals_val_get(print_section, "KP_RI_MEMORY_ESTIMATE", l_val=estimate_mem)
     629              : 
     630          744 :       ALLOCATE (iapc_pairs(nimg, 2))
     631          248 :       IF (estimate_mem .AND. geometry_did_change) THEN
     632              :          !Populate work tensors to simulate maximum usage
     633            0 :          CALL get_iapc_pairs(iapc_pairs, 1, ri_data, qs_env)
     634              :          CALL fill_3c_stack(t_3c_work_3(1), t_3c_int, iapc_pairs(:, 1), 3, ri_data, &
     635              :                             filter_at=1, filter_dim=2, idx_to_at=idx_to_at_AO, &
     636            0 :                             img_bounds=[batch_ranges_nze(1), batch_ranges_nze(2)])
     637              :          CALL fill_3c_stack(t_3c_work_3(2), t_3c_int, iapc_pairs(:, 1), 3, ri_data, &
     638              :                             filter_at=1, filter_dim=2, idx_to_at=idx_to_at_AO, &
     639            0 :                             img_bounds=[batch_ranges_nze(1), batch_ranges_nze(2)])
     640              :          CALL fill_3c_stack(t_3c_work_2(1), t_3c_apc_sub(1, :), iapc_pairs(:, 2), 3, &
     641              :                             ri_data, filter_at=1, filter_dim=1, idx_to_at=idx_to_at_AO, &
     642            0 :                             img_bounds=[batch_ranges_nze(1), batch_ranges_nze(2)])
     643              :          CALL fill_3c_stack(t_3c_work_2(2), t_3c_apc_sub(1, :), iapc_pairs(:, 2), 3, &
     644              :                             ri_data, filter_at=1, filter_dim=1, idx_to_at=idx_to_at_AO, &
     645            0 :                             img_bounds=[batch_ranges_nze(1), batch_ranges_nze(2)])
     646              :          CALL get_ext_2c_int(t_2c_work(1), mat_2c_pot, 1, 1, 1, ri_data, qs_env, &
     647              :                              blacs_env_ext=blacs_env_sub, para_env_ext=para_env_sub, &
     648            0 :                              dbcsr_template=dbcsr_template)
     649            0 :          CALL m_memory(mem)
     650            0 :          CALL para_env%max(mem)
     651            0 :          CALL dbt_clear(t_3c_work_2(1))
     652            0 :          CALL dbt_clear(t_3c_work_2(2))
     653            0 :          CALL dbt_clear(t_3c_work_3(1))
     654            0 :          CALL dbt_clear(t_3c_work_3(2))
     655            0 :          CALL dbt_clear(t_2c_work(1))
     656              : 
     657            0 :          IF (ri_data%unit_nr > 0) THEN
     658              :             WRITE (ri_data%unit_nr, FMT="(T3,A,I14)") &
     659            0 :                "KP-HFX_RI_INFO| Estimated peak memory usage per MPI rank (MiB):", mem/(1024*1024)
     660            0 :             CALL m_flush(ri_data%unit_nr)
     661              :          END IF
     662              :       END IF
     663              : 
     664          248 :       CALL dbt_batched_contract_init(t_3c_work_3(1), batch_range_2=batch_ranges_at)
     665          248 :       CALL dbt_batched_contract_init(t_3c_work_3(2), batch_range_2=batch_ranges_at)
     666          248 :       CALL dbt_batched_contract_init(t_3c_work_2(1), batch_range_1=batch_ranges_at)
     667          248 :       CALL dbt_batched_contract_init(t_3c_work_2(2), batch_range_1=batch_ranges_at)
     668              : 
     669          248 :       iprint = 1
     670          248 :       t1 = m_walltime()
     671        47750 :       ri_data%kp_cost(:, :, :) = 0.0_dp
     672         7034 :       DO b_img = 1, nimg
     673         6786 :          IF (print_progress) CALL print_progress_bar(b_img, nimg, iprint, ri_data)
     674         6786 :          CALL dbt_batched_contract_init(ks_t_split(1))
     675         6786 :          CALL dbt_batched_contract_init(ks_t_split(2))
     676        20358 :          DO jatom = 1, natom
     677        47502 :             DO iatom = 1, natom
     678        27144 :                IF (.NOT. sparsity_pattern(iatom, jatom, b_img) == igroup) CYCLE
     679         4640 :                pref = 1.0_dp
     680         4640 :                IF (iatom == jatom .AND. b_img == 1) pref = 0.5_dp
     681              : 
     682              :                !measure the cost of the given i, j, b configuration
     683         4640 :                t3 = m_walltime()
     684              : 
     685              :                !Get the proper HFX potential 2c integrals (R_i^0|S_j^b)
     686         4640 :                CALL timeset(routineN//"_2c", handle2)
     687              :                CALL get_ext_2c_int(t_2c_work(1), mat_2c_pot, iatom, jatom, b_img, ri_data, qs_env, &
     688              :                                    blacs_env_ext=blacs_env_sub, para_env_ext=para_env_sub, &
     689         4640 :                                    dbcsr_template=dbcsr_template)
     690         4640 :                CALL dbt_copy(t_2c_work(1), t_2c_work(2), move_data=.TRUE.) !move to split blocks
     691         4640 :                CALL dbt_filter(t_2c_work(2), ri_data%filter_eps)
     692         4640 :                CALL timestop(handle2)
     693              : 
     694         4640 :                CALL dbt_batched_contract_init(t_2c_work(2))
     695         4640 :                CALL get_iapc_pairs(iapc_pairs, b_img, ri_data, qs_env)
     696         4640 :                CALL timeset(routineN//"_3c", handle2)
     697              : 
     698              :                !Stack the (S^b|Q^b)^-1 * (Q^b| nu^b lambda^a+c) integrals over a+c and multiply by (R_i^0|S_j^b)
     699        12696 :                DO i_batch = 1, n_batch_nze
     700              :                   CALL fill_3c_stack(t_3c_work_3(3), t_3c_int, iapc_pairs(:, 1), 3, ri_data, &
     701              :                                      filter_at=jatom, filter_dim=2, idx_to_at=idx_to_at_AO, &
     702        24168 :                                      img_bounds=[batch_ranges_nze(i_batch), batch_ranges_nze(i_batch + 1)])
     703         8056 :                   CALL dbt_copy(t_3c_work_3(3), t_3c_work_3(1), move_data=.TRUE.)
     704              : 
     705              :                   CALL dbt_contract(1.0_dp, t_2c_work(2), t_3c_work_3(1), &
     706              :                                     0.0_dp, t_3c_work_3(2), map_1=[1], map_2=[2, 3], &
     707              :                                     contract_1=[2], notcontract_1=[1], &
     708              :                                     contract_2=[1], notcontract_2=[2, 3], &
     709         8056 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
     710         8056 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
     711         8056 :                   CALL dbt_copy(t_3c_work_3(2), t_3c_work_2(2), order=[2, 1, 3], move_data=.TRUE.)
     712         8056 :                   CALL dbt_copy(t_3c_work_3(3), t_3c_work_3(1))
     713              : 
     714              :                   !Stack the P_sigma^a,lambda^a+c * (mu^0 sigma^a | P^0)*(P^0|R^0)^-1 integrals over a+c and contract
     715              :                   !to get the final block of the KS matrix
     716        22760 :                   DO i_spin = 1, nspins
     717              :                      CALL fill_3c_stack(t_3c_work_2(3), t_3c_apc_sub(i_spin, :), iapc_pairs(:, 2), 3, &
     718              :                                         ri_data, filter_at=iatom, filter_dim=1, idx_to_at=idx_to_at_AO, &
     719        30192 :                                         img_bounds=[batch_ranges_nze(i_batch), batch_ranges_nze(i_batch + 1)])
     720        10064 :                      CALL get_tensor_occupancy(t_3c_work_2(3), nze, occ)
     721              : 
     722        10064 :                      IF (nze == 0) CYCLE
     723         9809 :                      CALL dbt_copy(t_3c_work_2(3), t_3c_work_2(1), move_data=.TRUE.)
     724              :                      CALL dbt_contract(-pref*fac, t_3c_work_2(1), t_3c_work_2(2), &
     725              :                                        1.0_dp, ks_t_split(i_spin), map_1=[1], map_2=[2], &
     726              :                                        contract_1=[2, 3], notcontract_1=[1], &
     727              :                                        contract_2=[2, 3], notcontract_2=[1], &
     728              :                                        filter_eps=ri_data%filter_eps, &
     729         9809 :                                        move_data=i_spin == nspins, flop=nflop)
     730        27929 :                      ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
     731              :                   END DO
     732              :                END DO !i_batch
     733         4640 :                CALL timestop(handle2)
     734         4640 :                CALL dbt_batched_contract_finalize(t_2c_work(2))
     735              : 
     736         4640 :                t4 = m_walltime()
     737        49996 :                ri_data%kp_cost(iatom, jatom, b_img) = t4 - t3
     738              :             END DO !iatom
     739              :          END DO !jatom
     740         6786 :          CALL dbt_batched_contract_finalize(ks_t_split(1))
     741         6786 :          CALL dbt_batched_contract_finalize(ks_t_split(2))
     742              : 
     743        15386 :          DO i_spin = 1, nspins
     744         8352 :             CALL dbt_copy(ks_t_split(i_spin), t_2c_ao_tmp(1), move_data=.TRUE.)
     745        15138 :             CALL dbt_copy(t_2c_ao_tmp(1), ks_t_sub(i_spin, b_img), summation=.TRUE.)
     746              :          END DO
     747              :       END DO !b_img
     748          248 :       CALL dbt_batched_contract_finalize(t_3c_work_3(1))
     749          248 :       CALL dbt_batched_contract_finalize(t_3c_work_3(2))
     750          248 :       CALL dbt_batched_contract_finalize(t_3c_work_2(1))
     751          248 :       CALL dbt_batched_contract_finalize(t_3c_work_2(2))
     752          248 :       CALL para_env%sync()
     753          248 :       CALL para_env%sum(ri_data%dbcsr_nflop)
     754          248 :       CALL para_env%sum(ri_data%kp_cost)
     755          248 :       t2 = m_walltime()
     756          248 :       ri_data%dbcsr_time = ri_data%dbcsr_time + t2 - t1
     757              : 
     758              :       !transfer KS tensor from subgroup to main group
     759          248 :       CALL gather_ks_matrix(ks_t, ks_t_sub, group_size, sparsity_pattern, para_env, ri_data)
     760              : 
     761              :       !Keep the 3c integrals on the subgroups to avoid communication at next SCF step
     762         7034 :       DO i_img = 1, nimg
     763         7034 :          CALL dbt_copy(t_3c_int(i_img), ri_data%kp_t_3c_int(i_img), move_data=.TRUE.)
     764              :       END DO
     765              : 
     766              :       !clean-up subgroup tensors
     767          248 :       CALL dbt_destroy(t_2c_ao_tmp(1))
     768          248 :       CALL dbt_destroy(ks_t_split(1))
     769          248 :       CALL dbt_destroy(ks_t_split(2))
     770          248 :       CALL dbt_destroy(t_2c_work(1))
     771          248 :       CALL dbt_destroy(t_2c_work(2))
     772          248 :       CALL dbt_destroy(t_3c_work_2(1))
     773          248 :       CALL dbt_destroy(t_3c_work_2(2))
     774          248 :       CALL dbt_destroy(t_3c_work_2(3))
     775          248 :       CALL dbt_destroy(t_3c_work_3(1))
     776          248 :       CALL dbt_destroy(t_3c_work_3(2))
     777          248 :       CALL dbt_destroy(t_3c_work_3(3))
     778         7034 :       DO i_img = 1, nimg
     779         6786 :          CALL dbt_destroy(t_3c_int(i_img))
     780         6786 :          CALL dbcsr_release(mat_2c_pot(i_img))
     781        15386 :          DO i_spin = 1, nspins
     782         8352 :             CALL dbt_destroy(t_3c_apc_sub(i_spin, i_img))
     783        15138 :             CALL dbt_destroy(ks_t_sub(i_spin, i_img))
     784              :          END DO
     785              :       END DO
     786          248 :       IF (ASSOCIATED(dbcsr_template)) THEN
     787          248 :          CALL dbcsr_release(dbcsr_template)
     788          248 :          DEALLOCATE (dbcsr_template)
     789              :       END IF
     790              : 
     791              :       !End of subgroup parallelization
     792          248 :       CALL cp_blacs_env_release(blacs_env_sub)
     793          248 :       CALL para_env_sub%free()
     794          248 :       DEALLOCATE (para_env_sub)
     795              : 
     796              :       !Currently, rho_ao_t holds the density difference (wrt to pref SCF step).
     797              :       !ks_t also hold that diff, while only having half the blocks => need to add to prev ks_t and symmetrize
     798              :       !We need the full thing for the energy, on the next SCF step
     799          248 :       CALL get_pmat_images(ri_data%rho_ao_t, rho_ao, 0.0_dp, ri_data, qs_env)
     800          580 :       DO i_spin = 1, nspins
     801         8932 :          DO b_img = 1, nimg
     802         8352 :             CALL dbt_copy(ks_t(i_spin, b_img), ri_data%ks_t(i_spin, b_img), summation=.TRUE.)
     803              : 
     804              :             !desymmetrize
     805         8352 :             mb_img = get_opp_index(b_img, qs_env)
     806         8684 :             IF (mb_img > 0 .AND. mb_img .LE. nimg) THEN
     807         7428 :                CALL dbt_copy(ks_t(i_spin, mb_img), ri_data%ks_t(i_spin, b_img), order=[2, 1], summation=.TRUE.)
     808              :             END IF
     809              :          END DO
     810              :       END DO
     811         7034 :       DO b_img = 1, nimg
     812        15386 :          DO i_spin = 1, nspins
     813        15138 :             CALL dbt_destroy(ks_t(i_spin, b_img))
     814              :          END DO
     815              :       END DO
     816              : 
     817              :       !calculate the energy
     818          248 :       CALL dbt_create(ri_data%ks_t(1, 1), t_2c_ao_tmp(1))
     819          248 :       CALL dbcsr_create(tmp, template=ks_matrix(1, 1)%matrix, matrix_type=dbcsr_type_symmetric)
     820          248 :       CALL dbcsr_create(ks_desymm, template=ks_matrix(1, 1)%matrix, matrix_type=dbcsr_type_no_symmetry)
     821          248 :       CALL dbcsr_create(rho_desymm, template=ks_matrix(1, 1)%matrix, matrix_type=dbcsr_type_no_symmetry)
     822          248 :       ehfx = 0.0_dp
     823         7034 :       DO i_img = 1, nimg
     824        15386 :          DO i_spin = 1, nspins
     825         8352 :             CALL dbt_filter(ri_data%ks_t(i_spin, i_img), ri_data%filter_eps)
     826         8352 :             CALL dbt_copy(ri_data%ks_t(i_spin, i_img), t_2c_ao_tmp(1))
     827         8352 :             CALL dbt_copy_tensor_to_matrix(t_2c_ao_tmp(1), ks_desymm)
     828         8352 :             CALL dbt_copy_tensor_to_matrix(t_2c_ao_tmp(1), tmp)
     829         8352 :             CALL dbcsr_add(ks_matrix(i_spin, i_img)%matrix, tmp, 1.0_dp, 1.0_dp)
     830              : 
     831         8352 :             CALL dbt_copy(ri_data%rho_ao_t(i_spin, i_img), t_2c_ao_tmp(1))
     832         8352 :             CALL dbt_copy_tensor_to_matrix(t_2c_ao_tmp(1), rho_desymm)
     833              : 
     834         8352 :             CALL dbcsr_dot(ks_desymm, rho_desymm, etmp)
     835         8352 :             ehfx = ehfx + 0.5_dp*etmp
     836              : 
     837        15138 :             IF (.NOT. use_delta_p) CALL dbt_clear(ri_data%ks_t(i_spin, i_img))
     838              :          END DO
     839              :       END DO
     840          248 :       CALL dbcsr_release(rho_desymm)
     841          248 :       CALL dbcsr_release(ks_desymm)
     842          248 :       CALL dbcsr_release(tmp)
     843          248 :       CALL dbt_destroy(t_2c_ao_tmp(1))
     844              : 
     845          248 :       CALL timestop(handle)
     846              : 
     847        44410 :    END SUBROUTINE hfx_ri_update_ks_kp
     848              : 
     849              : ! **************************************************************************************************
     850              : !> \brief Update the K-points RI-HFX forces
     851              : !> \param qs_env ...
     852              : !> \param ri_data ...
     853              : !> \param nspins ...
     854              : !> \param hf_fraction ...
     855              : !> \param rho_ao ...
     856              : !> \param use_virial ...
     857              : !> \note Because this routine uses stored quantities calculated in the energy calculation, they should
     858              : !>       always be called by pairs, and with the same input densities
     859              : ! **************************************************************************************************
     860           46 :    SUBROUTINE hfx_ri_update_forces_kp(qs_env, ri_data, nspins, hf_fraction, rho_ao, use_virial)
     861              : 
     862              :       TYPE(qs_environment_type), POINTER                 :: qs_env
     863              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
     864              :       INTEGER, INTENT(IN)                                :: nspins
     865              :       REAL(KIND=dp), INTENT(IN)                          :: hf_fraction
     866              :       TYPE(dbcsr_p_type), DIMENSION(:, :), POINTER       :: rho_ao
     867              :       LOGICAL, INTENT(IN), OPTIONAL                      :: use_virial
     868              : 
     869              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_ri_update_forces_kp'
     870              : 
     871              :       INTEGER :: b_img, batch_size, group_size, handle, handle2, i_batch, i_img, i_loop, i_spin, &
     872              :          i_xyz, iatom, iblk, igroup, j_xyz, jatom, k_xyz, n_batch, natom, ngroups, nimg, nimg_nze
     873              :       INTEGER(int_8)                                     :: nflop, nze
     874           46 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: atom_of_kind, batch_ranges_at, &
     875           46 :                                                             batch_ranges_nze, dist1, dist2, &
     876           46 :                                                             i_images, idx_to_at_AO, idx_to_at_RI, &
     877           46 :                                                             kind_of
     878           46 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: iapc_pairs
     879           46 :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: force_pattern, sparsity_pattern
     880              :       INTEGER, DIMENSION(2, 1)                           :: bounds_iat, bounds_jat
     881              :       LOGICAL                                            :: use_virial_prv
     882              :       REAL(dp)                                           :: fac, occ, pref, t1, t2
     883              :       REAL(dp), DIMENSION(3, 3)                          :: work_virial
     884           46 :       TYPE(atomic_kind_type), DIMENSION(:), POINTER      :: atomic_kind_set
     885              :       TYPE(cell_type), POINTER                           :: cell
     886              :       TYPE(cp_blacs_env_type), POINTER                   :: blacs_env_sub
     887           46 :       TYPE(dbcsr_type), ALLOCATABLE, DIMENSION(:)        :: mat_2c_pot
     888           46 :       TYPE(dbcsr_type), ALLOCATABLE, DIMENSION(:, :)     :: mat_der_pot, mat_der_pot_sub
     889              :       TYPE(dbcsr_type), POINTER                          :: dbcsr_template
     890          782 :       TYPE(dbt_type)                                     :: t_2c_R, t_2c_R_split
     891           46 :       TYPE(dbt_type), ALLOCATABLE, DIMENSION(:)          :: t_2c_bint, t_2c_binv, t_2c_der_pot, &
     892           92 :                                                             t_2c_inv, t_2c_metric, t_2c_work, &
     893           46 :                                                             t_3c_der_stack, t_3c_work_2, &
     894           46 :                                                             t_3c_work_3
     895           46 :       TYPE(dbt_type), ALLOCATABLE, DIMENSION(:, :) :: rho_ao_t, rho_ao_t_sub, t_2c_der_metric, &
     896           92 :          t_2c_der_metric_sub, t_3c_apc, t_3c_apc_sub, t_3c_der_AO, t_3c_der_AO_sub, t_3c_der_RI, &
     897           46 :          t_3c_der_RI_sub
     898              :       TYPE(mp_para_env_type), POINTER                    :: para_env, para_env_sub
     899           46 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     900           46 :       TYPE(qs_force_type), DIMENSION(:), POINTER         :: force
     901              :       TYPE(section_vals_type), POINTER                   :: hfx_section
     902              :       TYPE(virial_type), POINTER                         :: virial
     903              : 
     904           46 :       NULLIFY (para_env, para_env_sub, hfx_section, blacs_env_sub, dbcsr_template, force, atomic_kind_set, &
     905           46 :                virial, particle_set, cell)
     906              : 
     907           46 :       CALL timeset(routineN, handle)
     908              : 
     909           46 :       use_virial_prv = .FALSE.
     910           46 :       IF (PRESENT(use_virial)) use_virial_prv = use_virial
     911              : 
     912           46 :       IF (nspins == 1) THEN
     913           30 :          fac = 0.5_dp*hf_fraction
     914              :       ELSE
     915           16 :          fac = 1.0_dp*hf_fraction
     916              :       END IF
     917              : 
     918              :       CALL get_qs_env(qs_env, natom=natom, para_env=para_env, force=force, cell=cell, virial=virial, &
     919           46 :                       atomic_kind_set=atomic_kind_set, particle_set=particle_set)
     920           46 :       CALL get_atomic_kind_set(atomic_kind_set, kind_of=kind_of, atom_of_kind=atom_of_kind)
     921              : 
     922          138 :       ALLOCATE (idx_to_at_AO(SIZE(ri_data%bsizes_AO_split)))
     923           46 :       CALL get_idx_to_atom(idx_to_at_AO, ri_data%bsizes_AO_split, ri_data%bsizes_AO)
     924              : 
     925          138 :       ALLOCATE (idx_to_at_RI(SIZE(ri_data%bsizes_RI_split)))
     926           46 :       CALL get_idx_to_atom(idx_to_at_RI, ri_data%bsizes_RI_split, ri_data%bsizes_RI)
     927              : 
     928           46 :       nimg = ri_data%nimg
     929        12218 :       ALLOCATE (t_3c_der_RI(nimg, 3), t_3c_der_AO(nimg, 3), mat_der_pot(nimg, 3), t_2c_der_metric(natom, 3))
     930              : 
     931              :       !We assume that the integrals are available from the SCF
     932              :       !pre-calculate the derivs. 3c tensors as (P^0| sigma^a mu^0), with t_3c_der_AO holding deriv wrt mu^0
     933           46 :       CALL precalc_derivatives(t_3c_der_RI, t_3c_der_AO, mat_der_pot, t_2c_der_metric, ri_data, qs_env)
     934              : 
     935              :       !Calculate the density matrix at each image
     936         2936 :       ALLOCATE (rho_ao_t(nspins, nimg))
     937              :       CALL create_2c_tensor(rho_ao_t(1, 1), dist1, dist2, ri_data%pgrid_2d, &
     938              :                             ri_data%bsizes_AO_split, ri_data%bsizes_AO_split, &
     939           46 :                             name="(AO | AO)")
     940           46 :       DEALLOCATE (dist1, dist2)
     941           46 :       IF (nspins == 2) CALL dbt_create(rho_ao_t(1, 1), rho_ao_t(2, 1))
     942         1102 :       DO i_img = 2, nimg
     943         2322 :          DO i_spin = 1, nspins
     944         2276 :             CALL dbt_create(rho_ao_t(1, 1), rho_ao_t(i_spin, i_img))
     945              :          END DO
     946              :       END DO
     947           46 :       CALL get_pmat_images(rho_ao_t, rho_ao, 0.0_dp, ri_data, qs_env)
     948              : 
     949              :       !Contract integrals with the density matrix
     950         2936 :       ALLOCATE (t_3c_apc(nspins, nimg))
     951         1148 :       DO i_img = 1, nimg
     952         2430 :          DO i_spin = 1, nspins
     953         2384 :             CALL dbt_create(ri_data%t_3c_int_ctr_2(1, 1), t_3c_apc(i_spin, i_img))
     954              :          END DO
     955              :       END DO
     956           46 :       CALL contract_pmat_3c(t_3c_apc, rho_ao_t, ri_data, qs_env)
     957              : 
     958              :       !Setup the subgroups
     959           46 :       hfx_section => section_vals_get_subs_vals(qs_env%input, "DFT%XC%HF%RI")
     960           46 :       CALL section_vals_val_get(hfx_section, "KP_NGROUPS", i_val=ngroups)
     961           46 :       group_size = para_env%num_pe/ngroups
     962           46 :       igroup = para_env%mepos/group_size
     963              : 
     964           46 :       ALLOCATE (para_env_sub)
     965           46 :       CALL para_env_sub%from_split(para_env, igroup)
     966           46 :       CALL cp_blacs_env_create(blacs_env_sub, para_env_sub)
     967              : 
     968              :       !Get the ususal sparsity pattern
     969          230 :       ALLOCATE (sparsity_pattern(natom, natom, nimg))
     970           46 :       CALL get_sparsity_pattern(sparsity_pattern, ri_data, qs_env)
     971           46 :       CALL get_sub_dist(sparsity_pattern, ngroups, ri_data)
     972              : 
     973              :       !Get the 2-center quantities in the subgroups (note: main group derivs are deleted wihtin)
     974            0 :       ALLOCATE (t_2c_inv(natom), mat_2c_pot(nimg), rho_ao_t_sub(nspins, nimg), t_2c_work(5), &
     975            0 :                 t_2c_der_metric_sub(natom, 3), mat_der_pot_sub(nimg, 3), t_2c_bint(natom), &
     976        11254 :                 t_2c_metric(natom), t_2c_binv(natom))
     977              :       CALL get_subgroup_2c_derivs(t_2c_inv, t_2c_bint, t_2c_metric, mat_2c_pot, t_2c_work, rho_ao_t, &
     978              :                                   rho_ao_t_sub, t_2c_der_metric, t_2c_der_metric_sub, mat_der_pot, &
     979           46 :                                   mat_der_pot_sub, group_size, ngroups, para_env, para_env_sub, ri_data)
     980           46 :       CALL dbt_create(t_2c_work(1), t_2c_R) !nRI x nRI
     981           46 :       CALL dbt_create(t_2c_work(5), t_2c_R_split) !nRI x nRI with split blocks
     982              : 
     983          552 :       ALLOCATE (t_2c_der_pot(3))
     984          184 :       DO i_xyz = 1, 3
     985          184 :          CALL dbt_create(t_2c_R, t_2c_der_pot(i_xyz))
     986              :       END DO
     987              : 
     988              :       !Get the 3-center quantities in the subgroups. The integrals and t_3c_apc already there
     989            0 :       ALLOCATE (t_3c_work_2(3), t_3c_work_3(4), t_3c_der_stack(6), t_3c_der_AO_sub(nimg, 3), &
     990        12354 :                 t_3c_der_RI_sub(nimg, 3), t_3c_apc_sub(nspins, nimg))
     991              :       CALL get_subgroup_3c_derivs(t_3c_work_2, t_3c_work_3, t_3c_der_AO, t_3c_der_AO_sub, &
     992              :                                   t_3c_der_RI, t_3c_der_RI_sub, t_3c_apc, t_3c_apc_sub, t_3c_der_stack, &
     993           46 :                                   group_size, ngroups, para_env, para_env_sub, ri_data)
     994              : 
     995              :       !Set up batched contraction (go atom by atom)
     996          138 :       ALLOCATE (batch_ranges_at(natom + 1))
     997           46 :       batch_ranges_at(natom + 1) = SIZE(ri_data%bsizes_AO_split) + 1
     998           46 :       iatom = 0
     999          232 :       DO iblk = 1, SIZE(ri_data%bsizes_AO_split)
    1000          232 :          IF (idx_to_at_AO(iblk) == iatom + 1) THEN
    1001           92 :             iatom = iatom + 1
    1002           92 :             batch_ranges_at(iatom) = iblk
    1003              :          END IF
    1004              :       END DO
    1005              : 
    1006           46 :       CALL dbt_batched_contract_init(t_3c_work_3(1), batch_range_2=batch_ranges_at)
    1007           46 :       CALL dbt_batched_contract_init(t_3c_work_3(2), batch_range_2=batch_ranges_at)
    1008           46 :       CALL dbt_batched_contract_init(t_3c_work_3(3), batch_range_2=batch_ranges_at)
    1009           46 :       CALL dbt_batched_contract_init(t_3c_work_2(1), batch_range_1=batch_ranges_at)
    1010           46 :       CALL dbt_batched_contract_init(t_3c_work_2(2), batch_range_1=batch_ranges_at)
    1011              : 
    1012              :       !Preparing for the stacking of 3c tensors
    1013           46 :       nimg_nze = ri_data%nimg_nze
    1014           46 :       batch_size = ri_data%kp_stack_size
    1015           46 :       n_batch = nimg_nze/batch_size
    1016           46 :       IF (MODULO(nimg_nze, batch_size) .NE. 0) n_batch = n_batch + 1
    1017          138 :       ALLOCATE (batch_ranges_nze(n_batch + 1))
    1018          112 :       DO i_batch = 1, n_batch
    1019          112 :          batch_ranges_nze(i_batch) = (i_batch - 1)*batch_size + 1
    1020              :       END DO
    1021           46 :       batch_ranges_nze(n_batch + 1) = nimg_nze + 1
    1022              : 
    1023              :       !Applying the external bump to ((P|Q)_D + B*(P|Q)_OD*B)^-1 from left and right
    1024              :       !And keep the bump on LHS only version as well, with B*M^-1 = (M^-1*B)^T
    1025          138 :       DO iatom = 1, natom
    1026           92 :          CALL dbt_create(t_2c_inv(iatom), t_2c_binv(iatom))
    1027           92 :          CALL dbt_copy(t_2c_inv(iatom), t_2c_binv(iatom))
    1028           92 :          CALL apply_bump(t_2c_binv(iatom), iatom, ri_data, qs_env, from_left=.TRUE., from_right=.FALSE.)
    1029          138 :          CALL apply_bump(t_2c_inv(iatom), iatom, ri_data, qs_env, from_left=.TRUE., from_right=.TRUE.)
    1030              :       END DO
    1031              : 
    1032           46 :       t1 = m_walltime()
    1033           46 :       work_virial = 0.0_dp
    1034          230 :       ALLOCATE (iapc_pairs(nimg, 2), i_images(nimg))
    1035          230 :       ALLOCATE (force_pattern(natom, natom, nimg))
    1036         7760 :       force_pattern(:, :, :) = -1
    1037              :       !We proceed with 2 loops: one over the sparsity pattern from the SCF, one over the rest
    1038              :       !We use the SCF cost model for the first loop, while we calculate the cost of the upcoming loop
    1039          138 :       DO i_loop = 1, 2
    1040         2296 :          DO b_img = 1, nimg
    1041         6704 :             DO jatom = 1, natom
    1042        15428 :                DO iatom = 1, natom
    1043              : 
    1044         8816 :                   pref = -0.5_dp*fac
    1045         8816 :                   IF (i_loop == 1 .AND. (.NOT. sparsity_pattern(iatom, jatom, b_img) == igroup)) CYCLE
    1046         5054 :                   IF (i_loop == 2 .AND. (.NOT. force_pattern(iatom, jatom, b_img) == igroup)) CYCLE
    1047              : 
    1048              :                   !Get the proper HFX potential 2c integrals (R_i^0|S_j^b), times (S_j^b|Q_j^b)^-1
    1049         1228 :                   CALL timeset(routineN//"_2c_1", handle2)
    1050              :                   CALL get_ext_2c_int(t_2c_work(1), mat_2c_pot, iatom, jatom, b_img, ri_data, qs_env, &
    1051              :                                       blacs_env_ext=blacs_env_sub, para_env_ext=para_env_sub, &
    1052         1228 :                                       dbcsr_template=dbcsr_template)
    1053              :                   CALL dbt_contract(1.0_dp, t_2c_work(1), t_2c_inv(jatom), &
    1054              :                                     0.0_dp, t_2c_work(2), map_1=[1], map_2=[2], &
    1055              :                                     contract_1=[2], notcontract_1=[1], &
    1056              :                                     contract_2=[1], notcontract_2=[2], &
    1057         1228 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1058         1228 :                   CALL dbt_copy(t_2c_work(2), t_2c_work(5), move_data=.TRUE.) !move to split blocks
    1059         1228 :                   CALL dbt_filter(t_2c_work(5), ri_data%filter_eps)
    1060         1228 :                   CALL timestop(handle2)
    1061              : 
    1062         1228 :                   CALL timeset(routineN//"_3c", handle2)
    1063         6138 :                   bounds_iat(:, 1) = [SUM(ri_data%bsizes_AO(1:iatom - 1)) + 1, SUM(ri_data%bsizes_AO(1:iatom))]
    1064         6102 :                   bounds_jat(:, 1) = [SUM(ri_data%bsizes_AO(1:jatom - 1)) + 1, SUM(ri_data%bsizes_AO(1:jatom))]
    1065         1228 :                   CALL dbt_clear(t_2c_R_split)
    1066              : 
    1067         2717 :                   DO i_spin = 1, nspins
    1068         2717 :                      CALL dbt_batched_contract_init(rho_ao_t_sub(i_spin, b_img))
    1069              :                   END DO
    1070              : 
    1071         1228 :                   CALL get_iapc_pairs(iapc_pairs, b_img, ri_data, qs_env, i_images) !i = a+c-b
    1072         3393 :                   DO i_batch = 1, n_batch
    1073              : 
    1074              :                      !Stack the 3c derivatives to take the trace later on
    1075         8660 :                      DO i_xyz = 1, 3
    1076         6495 :                         CALL dbt_clear(t_3c_der_stack(i_xyz))
    1077              :                         CALL fill_3c_stack(t_3c_der_stack(i_xyz), t_3c_der_RI_sub(:, i_xyz), &
    1078              :                                            iapc_pairs(:, 1), 3, ri_data, filter_at=jatom, &
    1079              :                                            filter_dim=2, idx_to_at=idx_to_at_AO, &
    1080        19485 :                                            img_bounds=[batch_ranges_nze(i_batch), batch_ranges_nze(i_batch + 1)])
    1081              : 
    1082         6495 :                         CALL dbt_clear(t_3c_der_stack(3 + i_xyz))
    1083              :                         CALL fill_3c_stack(t_3c_der_stack(3 + i_xyz), t_3c_der_AO_sub(:, i_xyz), &
    1084              :                                            iapc_pairs(:, 1), 3, ri_data, filter_at=jatom, &
    1085              :                                            filter_dim=2, idx_to_at=idx_to_at_AO, &
    1086        21650 :                                            img_bounds=[batch_ranges_nze(i_batch), batch_ranges_nze(i_batch + 1)])
    1087              :                      END DO
    1088              : 
    1089         5919 :                      DO i_spin = 1, nspins
    1090              :                         !stack the t_3c_apc tensors
    1091         2526 :                         CALL dbt_clear(t_3c_work_2(3))
    1092              :                         CALL fill_3c_stack(t_3c_work_2(3), t_3c_apc_sub(i_spin, :), iapc_pairs(:, 2), 3, &
    1093              :                                            ri_data, filter_at=iatom, filter_dim=1, idx_to_at=idx_to_at_AO, &
    1094         7578 :                                            img_bounds=[batch_ranges_nze(i_batch), batch_ranges_nze(i_batch + 1)])
    1095         2526 :                         CALL get_tensor_occupancy(t_3c_work_2(3), nze, occ)
    1096         2526 :                         IF (nze == 0) CYCLE
    1097         2516 :                         CALL dbt_copy(t_3c_work_2(3), t_3c_work_2(1), move_data=.TRUE.)
    1098              : 
    1099              :                         !Contract with the second density matrix: P_mu^0,nu^b * t_3c_apc,
    1100              :                         !where t_3c_apc = P_sigma^a,lambda^a+c (mu^0 P^0 sigma^a) *(P^0|R^0)^-1 (stacked along a+c)
    1101              :                         CALL dbt_contract(1.0_dp, rho_ao_t_sub(i_spin, b_img), t_3c_work_2(1), &
    1102              :                                           0.0_dp, t_3c_work_2(2), map_1=[1], map_2=[2, 3], &
    1103              :                                           contract_1=[1], notcontract_1=[2], &
    1104              :                                           contract_2=[1], notcontract_2=[2, 3], &
    1105              :                                           bounds_1=bounds_iat, bounds_2=bounds_jat, &
    1106         2516 :                                           filter_eps=ri_data%filter_eps, flop=nflop)
    1107         2516 :                         ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1108              : 
    1109         2516 :                         CALL get_tensor_occupancy(t_3c_work_2(2), nze, occ)
    1110         2516 :                         IF (nze == 0) CYCLE
    1111              : 
    1112              :                         !Contract with V_PQ so that we can take the trace with (Q^b|nu^b lmabda^a+c)^(x)
    1113         2228 :                         CALL dbt_copy(t_3c_work_2(2), t_3c_work_3(1), order=[2, 1, 3], move_data=.TRUE.)
    1114         2228 :                         CALL dbt_batched_contract_init(t_2c_work(5))
    1115              :                         CALL dbt_contract(1.0_dp, t_2c_work(5), t_3c_work_3(1), &
    1116              :                                           0.0_dp, t_3c_work_3(2), map_1=[1], map_2=[2, 3], &
    1117              :                                           contract_1=[1], notcontract_1=[2], &
    1118              :                                           contract_2=[1], notcontract_2=[2, 3], &
    1119         2228 :                                           filter_eps=ri_data%filter_eps, flop=nflop)
    1120         2228 :                         ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1121         2228 :                         CALL dbt_batched_contract_finalize(t_2c_work(5))
    1122              : 
    1123              :                         !Contract with the 3c derivatives to get the force/virial
    1124         2228 :                         CALL dbt_copy(t_3c_work_3(2), t_3c_work_3(4), move_data=.TRUE.)
    1125         2228 :                         IF (use_virial_prv) THEN
    1126              :                            CALL get_force_from_3c_trace(force, t_3c_work_3(4), t_3c_der_stack(1:3), &
    1127              :                                                         t_3c_der_stack(4:6), atom_of_kind, kind_of, &
    1128              :                                                         idx_to_at_RI, idx_to_at_AO, i_images, &
    1129              :                                                         batch_ranges_nze(i_batch), 2.0_dp*pref, &
    1130          460 :                                                         ri_data, qs_env, work_virial, cell, particle_set)
    1131              :                         ELSE
    1132              :                            CALL get_force_from_3c_trace(force, t_3c_work_3(4), t_3c_der_stack(1:3), &
    1133              :                                                         t_3c_der_stack(4:6), atom_of_kind, kind_of, &
    1134              :                                                         idx_to_at_RI, idx_to_at_AO, i_images, &
    1135              :                                                         batch_ranges_nze(i_batch), 2.0_dp*pref, &
    1136         1768 :                                                         ri_data, qs_env)
    1137              :                         END IF
    1138         2228 :                         CALL dbt_clear(t_3c_work_3(4))
    1139              : 
    1140              :                         !Contract with the 3-center integrals in order to have a matrix R_PQ such that
    1141              :                         !we can take the trace sum_PQ R_PQ (P^0|Q^b)^(x)
    1142         2228 :                         IF (i_loop == 2) CYCLE
    1143              : 
    1144              :                         !Stack the 3c integrals
    1145              :                         CALL fill_3c_stack(t_3c_work_3(4), ri_data%kp_t_3c_int, iapc_pairs(:, 1), 3, ri_data, &
    1146              :                                            filter_at=jatom, filter_dim=2, idx_to_at=idx_to_at_AO, &
    1147         3522 :                                            img_bounds=[batch_ranges_nze(i_batch), batch_ranges_nze(i_batch + 1)])
    1148         1174 :                         CALL dbt_copy(t_3c_work_3(4), t_3c_work_3(3), move_data=.TRUE.)
    1149              : 
    1150         1174 :                         CALL dbt_batched_contract_init(t_2c_R_split)
    1151              :                         CALL dbt_contract(1.0_dp, t_3c_work_3(1), t_3c_work_3(3), &
    1152              :                                           1.0_dp, t_2c_R_split, map_1=[1], map_2=[2], &
    1153              :                                           contract_1=[2, 3], notcontract_1=[1], &
    1154              :                                           contract_2=[2, 3], notcontract_2=[1], &
    1155         1174 :                                           filter_eps=ri_data%filter_eps, flop=nflop)
    1156         1174 :                         ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1157         1174 :                         CALL dbt_batched_contract_finalize(t_2c_R_split)
    1158         8381 :                         CALL dbt_copy(t_3c_work_3(4), t_3c_work_3(1))
    1159              :                      END DO
    1160              :                   END DO
    1161         2717 :                   DO i_spin = 1, nspins
    1162         2717 :                      CALL dbt_batched_contract_finalize(rho_ao_t_sub(i_spin, b_img))
    1163              :                   END DO
    1164         1228 :                   CALL timestop(handle2)
    1165              : 
    1166         1228 :                   IF (i_loop == 2) CYCLE
    1167          646 :                   pref = 2.0_dp*pref
    1168          646 :                   IF (iatom == jatom .AND. b_img == 1) pref = 0.5_dp*pref
    1169              : 
    1170          646 :                   CALL timeset(routineN//"_2c_2", handle2)
    1171              :                   !Note that the derivatives are in atomic block format (not split)
    1172          646 :                   CALL dbt_copy(t_2c_R_split, t_2c_R, move_data=.TRUE.)
    1173              : 
    1174              :                   CALL get_ext_2c_int(t_2c_work(1), mat_2c_pot, iatom, jatom, b_img, ri_data, qs_env, &
    1175              :                                       blacs_env_ext=blacs_env_sub, para_env_ext=para_env_sub, &
    1176          646 :                                       dbcsr_template=dbcsr_template)
    1177              : 
    1178              :                   !We have to calculate: S^-1(iat) * R_PQ * S^-1(jat)    to trace with HFX pot der
    1179              :                   !                      + R_PQ * S^-1(jat) * pot^T      to trace with S^(x) (iat)
    1180              :                   !                      + pot^T * S^-1(iat) *R_PQ       to trace with S^(x) (jat)
    1181              : 
    1182              :                   !Because 3c tensors are all precontracted with the inverse RI metric,
    1183              :                   !t_2c_R is currently implicitely multiplied by S^-1(iat) from the left
    1184              :                   !and S^-1(jat) from the right, directly in the proper format for the trace
    1185              :                   !with the HFX potential derivative
    1186              : 
    1187              :                   !Trace with HFX pot deriv, that we need to build first
    1188         2584 :                   DO i_xyz = 1, 3
    1189              :                      CALL get_ext_2c_int(t_2c_der_pot(i_xyz), mat_der_pot_sub(:, i_xyz), iatom, jatom, &
    1190              :                                          b_img, ri_data, qs_env, blacs_env_ext=blacs_env_sub, &
    1191         2584 :                                          para_env_ext=para_env_sub, dbcsr_template=dbcsr_template)
    1192              :                   END DO
    1193              : 
    1194          646 :                   IF (use_virial_prv) THEN
    1195              :                      CALL get_2c_der_force(force, t_2c_R, t_2c_der_pot, atom_of_kind, kind_of, &
    1196          125 :                                            b_img, pref, ri_data, qs_env, work_virial, cell, particle_set)
    1197              :                   ELSE
    1198              :                      CALL get_2c_der_force(force, t_2c_R, t_2c_der_pot, atom_of_kind, kind_of, &
    1199          521 :                                            b_img, pref, ri_data, qs_env)
    1200              :                   END IF
    1201              : 
    1202         2584 :                   DO i_xyz = 1, 3
    1203         2584 :                      CALL dbt_clear(t_2c_der_pot(i_xyz))
    1204              :                   END DO
    1205              : 
    1206              :                   !R_PQ * S^-1(jat) * pot^T  (=A)
    1207              :                   CALL dbt_contract(1.0_dp, t_2c_metric(iatom), t_2c_R, & !get rid of implicit S^-1(iat)
    1208              :                                     0.0_dp, t_2c_work(2), map_1=[1], map_2=[2], &
    1209              :                                     contract_1=[2], notcontract_1=[1], &
    1210              :                                     contract_2=[1], notcontract_2=[2], &
    1211          646 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1212          646 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1213              :                   CALL dbt_contract(1.0_dp, t_2c_work(2), t_2c_work(1), &
    1214              :                                     0.0_dp, t_2c_work(3), map_1=[1], map_2=[2], &
    1215              :                                     contract_1=[2], notcontract_1=[1], &
    1216              :                                     contract_2=[2], notcontract_2=[1], &
    1217          646 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1218          646 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1219              : 
    1220              :                   !With the RI bump function, things get more complex. M = (S|P)_D + B*(S|P)_OD*B
    1221              :                   !Calculate M^-1*B*A + A*B*M^-1 to contract with B^x. A is in t_2c_work(3)
    1222              :                   CALL dbt_contract(1.0_dp, t_2c_work(3), t_2c_binv(iatom), &
    1223              :                                     0.0_dp, t_2c_work(2), map_1=[1], map_2=[2], &
    1224              :                                     contract_1=[2], notcontract_1=[1], &
    1225              :                                     contract_2=[1], notcontract_2=[2], &
    1226          646 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1227          646 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1228              : 
    1229              :                   CALL dbt_contract(1.0_dp, t_2c_binv(iatom), t_2c_work(3), & !use transpose of B*M^-1 = M^-1*B
    1230              :                                     0.0_dp, t_2c_work(4), map_1=[1], map_2=[2], &
    1231              :                                     contract_1=[1], notcontract_1=[2], &
    1232              :                                     contract_2=[1], notcontract_2=[2], &
    1233          646 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1234          646 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1235              : 
    1236          646 :                   CALL dbt_copy(t_2c_work(2), t_2c_work(4), summation=.TRUE.)
    1237              :                   CALL get_2c_bump_forces(force, t_2c_work(4), iatom, atom_of_kind, kind_of, pref, &
    1238          646 :                                           ri_data, qs_env, work_virial)
    1239              : 
    1240              :                   !Calculate -M^-1*B*A*B*M^-1 to contracte with diagonal RI metric deriv. t_2c_work(2) holds A*B*M^-1
    1241              :                   CALL dbt_contract(1.0_dp, t_2c_binv(iatom), t_2c_work(2), &
    1242              :                                     0.0_dp, t_2c_work(4), map_1=[1], map_2=[2], &
    1243              :                                     contract_1=[1], notcontract_1=[2], &
    1244              :                                     contract_2=[1], notcontract_2=[2], &
    1245          646 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1246          646 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1247              : 
    1248          646 :                   IF (use_virial_prv) THEN
    1249              :                      CALL get_2c_der_force(force, t_2c_work(4), t_2c_der_metric_sub(iatom, :), atom_of_kind, &
    1250              :                                            kind_of, 1, -pref, ri_data, qs_env, work_virial, cell, particle_set, &
    1251          125 :                                            diag=.TRUE., offdiag=.FALSE.)
    1252              :                   ELSE
    1253              :                      CALL get_2c_der_force(force, t_2c_work(4), t_2c_der_metric_sub(iatom, :), atom_of_kind, &
    1254          521 :                                            kind_of, 1, -pref, ri_data, qs_env, diag=.TRUE., offdiag=.FALSE.)
    1255              :                   END IF
    1256              : 
    1257              :                   !Calculate -B*M^-1*B*A*B*M^-1*B to contract with off-diagonal RI metric derivs
    1258          646 :                   CALL dbt_copy(t_2c_work(4), t_2c_work(2))
    1259          646 :                   CALL apply_bump(t_2c_work(2), iatom, ri_data, qs_env, from_left=.TRUE., from_right=.TRUE.)
    1260              : 
    1261          646 :                   IF (use_virial_prv) THEN
    1262              :                      CALL get_2c_der_force(force, t_2c_work(2), t_2c_der_metric_sub(iatom, :), atom_of_kind, &
    1263              :                                            kind_of, 1, -pref, ri_data, qs_env, work_virial, cell, particle_set, &
    1264          125 :                                            diag=.FALSE., offdiag=.TRUE.)
    1265              :                   ELSE
    1266              :                      CALL get_2c_der_force(force, t_2c_work(2), t_2c_der_metric_sub(iatom, :), atom_of_kind, &
    1267          521 :                                            kind_of, 1, -pref, ri_data, qs_env, diag=.FALSE., offdiag=.TRUE.)
    1268              :                   END IF
    1269              : 
    1270              :                   !Calculate -O*B*M^-1*B*A*B*M^-1 - M^-1*B*A*B*M^-1*B*O, where O is off-diagonal integrals
    1271              :                   !t_2c_work(4) holds M^-1*B*A*B*M^-1, and exploit transpose of B*O (stored in t_2c_bint)
    1272              :                   CALL dbt_contract(1.0_dp, t_2c_work(4), t_2c_bint(iatom), &
    1273              :                                     0.0_dp, t_2c_work(2), map_1=[1], map_2=[2], &
    1274              :                                     contract_1=[2], notcontract_1=[1], &
    1275              :                                     contract_2=[1], notcontract_2=[2], &
    1276          646 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1277          646 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1278              : 
    1279              :                   CALL dbt_contract(1.0_dp, t_2c_bint(iatom), t_2c_work(4), &
    1280              :                                     1.0_dp, t_2c_work(2), map_1=[1], map_2=[2], &
    1281              :                                     contract_1=[1], notcontract_1=[2], &
    1282              :                                     contract_2=[1], notcontract_2=[2], &
    1283          646 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1284          646 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1285              : 
    1286              :                   CALL get_2c_bump_forces(force, t_2c_work(2), iatom, atom_of_kind, kind_of, -pref, &
    1287          646 :                                           ri_data, qs_env, work_virial)
    1288              : 
    1289              :                   ! pot^T * S^-1(iat) * R_PQ (=A)
    1290              :                   CALL dbt_contract(1.0_dp, t_2c_work(1), t_2c_R, &
    1291              :                                     0.0_dp, t_2c_work(2), map_1=[1], map_2=[2], &
    1292              :                                     contract_1=[1], notcontract_1=[2], &
    1293              :                                     contract_2=[1], notcontract_2=[2], &
    1294          646 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1295          646 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1296              : 
    1297              :                   CALL dbt_contract(1.0_dp, t_2c_work(2), t_2c_metric(jatom), & !get rid of implicit S^-1(jat)
    1298              :                                     0.0_dp, t_2c_work(3), map_1=[1], map_2=[2], &
    1299              :                                     contract_1=[2], notcontract_1=[1], &
    1300              :                                     contract_2=[1], notcontract_2=[2], &
    1301          646 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1302          646 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1303              : 
    1304              :                   !Do the same shenanigans with the S^(x) (jatom)
    1305              :                   !Calculate M^-1*B*A + A*B*M^-1 to contract with B^x. A is in t_2c_work(3)
    1306              :                   CALL dbt_contract(1.0_dp, t_2c_work(3), t_2c_binv(jatom), &
    1307              :                                     0.0_dp, t_2c_work(2), map_1=[1], map_2=[2], &
    1308              :                                     contract_1=[2], notcontract_1=[1], &
    1309              :                                     contract_2=[1], notcontract_2=[2], &
    1310          646 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1311          646 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1312              : 
    1313              :                   CALL dbt_contract(1.0_dp, t_2c_binv(jatom), t_2c_work(3), & !use transpose of B*M^-1 = M^-1*B
    1314              :                                     0.0_dp, t_2c_work(4), map_1=[1], map_2=[2], &
    1315              :                                     contract_1=[1], notcontract_1=[2], &
    1316              :                                     contract_2=[1], notcontract_2=[2], &
    1317          646 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1318          646 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1319              : 
    1320          646 :                   CALL dbt_copy(t_2c_work(2), t_2c_work(4), summation=.TRUE.)
    1321              :                   CALL get_2c_bump_forces(force, t_2c_work(4), jatom, atom_of_kind, kind_of, pref, &
    1322          646 :                                           ri_data, qs_env, work_virial)
    1323              : 
    1324              :                   !Calculate -M^-1*B*A*B*M^-1 to contracte with diagonal RI metric deriv. t_2c_work(2) holds A*B*M^-1
    1325              :                   CALL dbt_contract(1.0_dp, t_2c_binv(jatom), t_2c_work(2), &
    1326              :                                     0.0_dp, t_2c_work(4), map_1=[1], map_2=[2], &
    1327              :                                     contract_1=[1], notcontract_1=[2], &
    1328              :                                     contract_2=[1], notcontract_2=[2], &
    1329          646 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1330          646 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1331              : 
    1332          646 :                   IF (use_virial_prv) THEN
    1333              :                      CALL get_2c_der_force(force, t_2c_work(4), t_2c_der_metric_sub(jatom, :), atom_of_kind, &
    1334              :                                            kind_of, 1, -pref, ri_data, qs_env, work_virial, cell, particle_set, &
    1335          125 :                                            diag=.TRUE., offdiag=.FALSE.)
    1336              :                   ELSE
    1337              :                      CALL get_2c_der_force(force, t_2c_work(4), t_2c_der_metric_sub(jatom, :), atom_of_kind, &
    1338          521 :                                            kind_of, 1, -pref, ri_data, qs_env, diag=.TRUE., offdiag=.FALSE.)
    1339              :                   END IF
    1340              : 
    1341              :                   !Calculate -B*M^-1*B*A*B*M^-1*B to contract with off-diagonal RI metric derivs
    1342          646 :                   CALL dbt_copy(t_2c_work(4), t_2c_work(2))
    1343          646 :                   CALL apply_bump(t_2c_work(2), jatom, ri_data, qs_env, from_left=.TRUE., from_right=.TRUE.)
    1344              : 
    1345          646 :                   IF (use_virial_prv) THEN
    1346              :                      CALL get_2c_der_force(force, t_2c_work(2), t_2c_der_metric_sub(jatom, :), atom_of_kind, &
    1347              :                                            kind_of, 1, -pref, ri_data, qs_env, work_virial, cell, particle_set, &
    1348          125 :                                            diag=.FALSE., offdiag=.TRUE.)
    1349              :                   ELSE
    1350              :                      CALL get_2c_der_force(force, t_2c_work(2), t_2c_der_metric_sub(jatom, :), atom_of_kind, &
    1351          521 :                                            kind_of, 1, -pref, ri_data, qs_env, diag=.FALSE., offdiag=.TRUE.)
    1352              :                   END IF
    1353              : 
    1354              :                   !Calculate -O*B*M^-1*B*A*B*M^-1 - M^-1*B*A*B*M^-1*B*O, where O is off-diagonal integrals
    1355              :                   !t_2c_work(4) holds M^-1*B*A*B*M^-1, and exploit transpose of B*O (stored in t_2c_bint)
    1356              :                   CALL dbt_contract(1.0_dp, t_2c_work(4), t_2c_bint(jatom), &
    1357              :                                     0.0_dp, t_2c_work(2), map_1=[1], map_2=[2], &
    1358              :                                     contract_1=[2], notcontract_1=[1], &
    1359              :                                     contract_2=[1], notcontract_2=[2], &
    1360          646 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1361          646 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1362              : 
    1363              :                   CALL dbt_contract(1.0_dp, t_2c_bint(jatom), t_2c_work(4), &
    1364              :                                     1.0_dp, t_2c_work(2), map_1=[1], map_2=[2], &
    1365              :                                     contract_1=[1], notcontract_1=[2], &
    1366              :                                     contract_2=[1], notcontract_2=[2], &
    1367          646 :                                     filter_eps=ri_data%filter_eps, flop=nflop)
    1368          646 :                   ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    1369              : 
    1370              :                   CALL get_2c_bump_forces(force, t_2c_work(2), jatom, atom_of_kind, kind_of, -pref, &
    1371          646 :                                           ri_data, qs_env, work_virial)
    1372              : 
    1373        15744 :                   CALL timestop(handle2)
    1374              :                END DO !iatom
    1375              :             END DO !jatom
    1376              :          END DO !b_img
    1377              : 
    1378          138 :          IF (i_loop == 1) THEN
    1379           46 :             CALL update_pattern_to_forces(force_pattern, sparsity_pattern, ngroups, ri_data, qs_env)
    1380              :          END IF
    1381              :       END DO !i_loop
    1382              : 
    1383           46 :       CALL dbt_batched_contract_finalize(t_3c_work_3(1))
    1384           46 :       CALL dbt_batched_contract_finalize(t_3c_work_3(2))
    1385           46 :       CALL dbt_batched_contract_finalize(t_3c_work_3(3))
    1386           46 :       CALL dbt_batched_contract_finalize(t_3c_work_2(1))
    1387           46 :       CALL dbt_batched_contract_finalize(t_3c_work_2(2))
    1388              : 
    1389           46 :       IF (use_virial_prv) THEN
    1390           40 :          DO k_xyz = 1, 3
    1391          130 :             DO j_xyz = 1, 3
    1392          390 :                DO i_xyz = 1, 3
    1393              :                   virial%pv_fock_4c(i_xyz, j_xyz) = virial%pv_fock_4c(i_xyz, j_xyz) &
    1394          360 :                                                     + work_virial(i_xyz, k_xyz)*cell%hmat(j_xyz, k_xyz)
    1395              :                END DO
    1396              :             END DO
    1397              :          END DO
    1398              :       END IF
    1399              : 
    1400              :       !End of subgroup parallelization
    1401           46 :       CALL cp_blacs_env_release(blacs_env_sub)
    1402           46 :       CALL para_env_sub%free()
    1403           46 :       DEALLOCATE (para_env_sub)
    1404              : 
    1405           46 :       CALL para_env%sync()
    1406           46 :       t2 = m_walltime()
    1407           46 :       ri_data%dbcsr_time = ri_data%dbcsr_time + t2 - t1
    1408              : 
    1409              :       !clean-up
    1410           46 :       IF (ASSOCIATED(dbcsr_template)) THEN
    1411           46 :          CALL dbcsr_release(dbcsr_template)
    1412           46 :          DEALLOCATE (dbcsr_template)
    1413              :       END IF
    1414           46 :       CALL dbt_destroy(t_2c_R)
    1415           46 :       CALL dbt_destroy(t_2c_R_split)
    1416           46 :       CALL dbt_destroy(t_2c_work(1))
    1417           46 :       CALL dbt_destroy(t_2c_work(2))
    1418           46 :       CALL dbt_destroy(t_2c_work(3))
    1419           46 :       CALL dbt_destroy(t_2c_work(4))
    1420           46 :       CALL dbt_destroy(t_2c_work(5))
    1421           46 :       CALL dbt_destroy(t_3c_work_2(1))
    1422           46 :       CALL dbt_destroy(t_3c_work_2(2))
    1423           46 :       CALL dbt_destroy(t_3c_work_2(3))
    1424           46 :       CALL dbt_destroy(t_3c_work_3(1))
    1425           46 :       CALL dbt_destroy(t_3c_work_3(2))
    1426           46 :       CALL dbt_destroy(t_3c_work_3(3))
    1427           46 :       CALL dbt_destroy(t_3c_work_3(4))
    1428           46 :       CALL dbt_destroy(t_3c_der_stack(1))
    1429           46 :       CALL dbt_destroy(t_3c_der_stack(2))
    1430           46 :       CALL dbt_destroy(t_3c_der_stack(3))
    1431           46 :       CALL dbt_destroy(t_3c_der_stack(4))
    1432           46 :       CALL dbt_destroy(t_3c_der_stack(5))
    1433           46 :       CALL dbt_destroy(t_3c_der_stack(6))
    1434          184 :       DO i_xyz = 1, 3
    1435          184 :          CALL dbt_destroy(t_2c_der_pot(i_xyz))
    1436              :       END DO
    1437          138 :       DO iatom = 1, natom
    1438           92 :          CALL dbt_destroy(t_2c_inv(iatom))
    1439           92 :          CALL dbt_destroy(t_2c_binv(iatom))
    1440           92 :          CALL dbt_destroy(t_2c_bint(iatom))
    1441           92 :          CALL dbt_destroy(t_2c_metric(iatom))
    1442          414 :          DO i_xyz = 1, 3
    1443          368 :             CALL dbt_destroy(t_2c_der_metric_sub(iatom, i_xyz))
    1444              :          END DO
    1445              :       END DO
    1446         1148 :       DO i_img = 1, nimg
    1447         1102 :          CALL dbcsr_release(mat_2c_pot(i_img))
    1448         2430 :          DO i_spin = 1, nspins
    1449         1282 :             CALL dbt_destroy(rho_ao_t_sub(i_spin, i_img))
    1450         2384 :             CALL dbt_destroy(t_3c_apc_sub(i_spin, i_img))
    1451              :          END DO
    1452              :       END DO
    1453          184 :       DO i_xyz = 1, 3
    1454         3490 :          DO i_img = 1, nimg
    1455         3306 :             CALL dbt_destroy(t_3c_der_RI_sub(i_img, i_xyz))
    1456         3306 :             CALL dbt_destroy(t_3c_der_AO_sub(i_img, i_xyz))
    1457         3444 :             CALL dbcsr_release(mat_der_pot_sub(i_img, i_xyz))
    1458              :          END DO
    1459              :       END DO
    1460              : 
    1461           46 :       CALL timestop(handle)
    1462              : 
    1463        20468 :    END SUBROUTINE hfx_ri_update_forces_kp
    1464              : 
    1465              : ! **************************************************************************************************
    1466              : !> \brief A routine the applies the RI bump matrix from the left and/or the right, given an input
    1467              : !>        matrix and the central RI atom. We assume atomic block sizes
    1468              : !> \param t_2c_inout ...
    1469              : !> \param atom_i ...
    1470              : !> \param ri_data ...
    1471              : !> \param qs_env ...
    1472              : !> \param from_left ...
    1473              : !> \param from_right ...
    1474              : !> \param debump ...
    1475              : ! **************************************************************************************************
    1476         1956 :    SUBROUTINE apply_bump(t_2c_inout, atom_i, ri_data, qs_env, from_left, from_right, debump)
    1477              :       TYPE(dbt_type), INTENT(INOUT)                      :: t_2c_inout
    1478              :       INTEGER, INTENT(IN)                                :: atom_i
    1479              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    1480              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    1481              :       LOGICAL, INTENT(IN), OPTIONAL                      :: from_left, from_right, debump
    1482              : 
    1483              :       INTEGER                                            :: i_img, i_RI, iatom, ind(2), j_img, j_RI, &
    1484              :                                                             jatom, natom, nblks(2), nimg, nkind
    1485         1956 :       INTEGER, DIMENSION(:, :), POINTER                  :: index_to_cell
    1486         1956 :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
    1487              :       LOGICAL                                            :: found, my_debump, my_left, my_right
    1488              :       REAL(dp)                                           :: bval, r0, r1, ri(3), rj(3), rref(3), &
    1489              :                                                             scoord(3)
    1490         1956 :       REAL(dp), ALLOCATABLE, DIMENSION(:, :)             :: blk
    1491              :       TYPE(cell_type), POINTER                           :: cell
    1492              :       TYPE(dbt_iterator_type)                            :: iter
    1493              :       TYPE(kpoint_type), POINTER                         :: kpoints
    1494         1956 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
    1495         1956 :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
    1496              : 
    1497         1956 :       NULLIFY (qs_kind_set, particle_set, kpoints, index_to_cell, cell_to_index, cell)
    1498              : 
    1499              :       CALL get_qs_env(qs_env, natom=natom, nkind=nkind, qs_kind_set=qs_kind_set, cell=cell, &
    1500         1956 :                       kpoints=kpoints, particle_set=particle_set)
    1501         1956 :       CALL get_kpoint_info(kpoints, cell_to_index=cell_to_index, index_to_cell=index_to_cell)
    1502              : 
    1503         1956 :       my_debump = .FALSE.
    1504         1956 :       IF (PRESENT(debump)) my_debump = debump
    1505              : 
    1506         1956 :       my_left = .FALSE.
    1507         1956 :       IF (PRESENT(from_left)) my_left = from_left
    1508              : 
    1509         1956 :       my_right = .FALSE.
    1510         1956 :       IF (PRESENT(from_right)) my_right = from_right
    1511         1956 :       CPASSERT(my_left .OR. my_right)
    1512              : 
    1513         1956 :       CALL dbt_get_info(t_2c_inout, nblks_total=nblks)
    1514         1956 :       CPASSERT(nblks(1) == ri_data%ncell_RI*natom)
    1515         1956 :       CPASSERT(nblks(2) == ri_data%ncell_RI*natom)
    1516              : 
    1517         1956 :       nimg = ri_data%nimg
    1518              : 
    1519              :       !Loop over the RI cells and atoms, and apply bump accordingly
    1520         1956 :       r1 = ri_data%kp_RI_range
    1521         1956 :       r0 = ri_data%kp_bump_rad
    1522         1956 :       rref = pbc(particle_set(atom_i)%r, cell)
    1523              : 
    1524              : !$OMP PARALLEL DEFAULT(NONE) SHARED(t_2c_inout,natom,ri_data,cell,particle_set,index_to_cell,my_left, &
    1525              : !$OMP                               my_right,r0,r1,rref,my_debump) &
    1526         1956 : !$OMP PRIVATE(iter,ind,blk,found,i_RI,i_img,iatom,j_RI,j_img,jatom,scoord,ri,rj,bval)
    1527              :       CALL dbt_iterator_start(iter, t_2c_inout)
    1528              :       DO WHILE (dbt_iterator_blocks_left(iter))
    1529              :          CALL dbt_iterator_next_block(iter, ind)
    1530              :          CALL dbt_get_block(t_2c_inout, ind, blk, found)
    1531              :          IF (.NOT. found) CYCLE
    1532              : 
    1533              :          i_RI = (ind(1) - 1)/natom + 1
    1534              :          i_img = ri_data%RI_cell_to_img(i_RI)
    1535              :          iatom = ind(1) - (i_RI - 1)*natom
    1536              : 
    1537              :          CALL real_to_scaled(scoord, pbc(particle_set(iatom)%r, cell), cell)
    1538              :          CALL scaled_to_real(ri, scoord(:) + index_to_cell(:, i_img), cell)
    1539              : 
    1540              :          j_RI = (ind(2) - 1)/natom + 1
    1541              :          j_img = ri_data%RI_cell_to_img(j_RI)
    1542              :          jatom = ind(2) - (j_RI - 1)*natom
    1543              : 
    1544              :          CALL real_to_scaled(scoord, pbc(particle_set(jatom)%r, cell), cell)
    1545              :          CALL scaled_to_real(rj, scoord(:) + index_to_cell(:, j_img), cell)
    1546              : 
    1547              :          IF (.NOT. my_debump) THEN
    1548              :             IF (my_left) blk(:, :) = blk(:, :)*bump(NORM2(ri - rref), r0, r1)
    1549              :             IF (my_right) blk(:, :) = blk(:, :)*bump(NORM2(rj - rref), r0, r1)
    1550              :          ELSE
    1551              :             !Note: by construction, the bump function is never quite zero, as its range is the same
    1552              :             !      as that of the extended RI basis (but we are safe)
    1553              :             bval = bump(NORM2(ri - rref), r0, r1)
    1554              :             IF (my_left .AND. bval > EPSILON(1.0_dp)) blk(:, :) = blk(:, :)/bval
    1555              :             bval = bump(NORM2(rj - rref), r0, r1)
    1556              :             IF (my_right .AND. bval > EPSILON(1.0_dp)) blk(:, :) = blk(:, :)/bval
    1557              :          END IF
    1558              : 
    1559              :          CALL dbt_put_block(t_2c_inout, ind, SHAPE(blk), blk)
    1560              : 
    1561              :          DEALLOCATE (blk)
    1562              :       END DO
    1563              :       CALL dbt_iterator_stop(iter)
    1564              : !$OMP END PARALLEL
    1565         1956 :       CALL dbt_filter(t_2c_inout, ri_data%filter_eps)
    1566              : 
    1567         3912 :    END SUBROUTINE apply_bump
    1568              : 
    1569              : ! **************************************************************************************************
    1570              : !> \brief A routine that calculates the forces due to the derivative of the bump function
    1571              : !> \param force ...
    1572              : !> \param t_2c_in ...
    1573              : !> \param atom_i ...
    1574              : !> \param atom_of_kind ...
    1575              : !> \param kind_of ...
    1576              : !> \param pref ...
    1577              : !> \param ri_data ...
    1578              : !> \param qs_env ...
    1579              : !> \param work_virial ...
    1580              : ! **************************************************************************************************
    1581         2584 :    SUBROUTINE get_2c_bump_forces(force, t_2c_in, atom_i, atom_of_kind, kind_of, pref, ri_data, &
    1582              :                                  qs_env, work_virial)
    1583              :       TYPE(qs_force_type), DIMENSION(:), POINTER         :: force
    1584              :       TYPE(dbt_type), INTENT(INOUT)                      :: t_2c_in
    1585              :       INTEGER, INTENT(IN)                                :: atom_i
    1586              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: atom_of_kind, kind_of
    1587              :       REAL(dp), INTENT(IN)                               :: pref
    1588              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    1589              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    1590              :       REAL(dp), DIMENSION(3, 3), INTENT(INOUT)           :: work_virial
    1591              : 
    1592              :       INTEGER :: i, i_img, i_RI, i_xyz, iat_of_kind, iatom, ikind, ind(2), j_img, j_RI, j_xyz, &
    1593              :          jat_of_kind, jatom, jkind, natom, nblks(2), nimg, nkind
    1594         2584 :       INTEGER, DIMENSION(:, :), POINTER                  :: index_to_cell
    1595         2584 :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
    1596              :       LOGICAL                                            :: found
    1597              :       REAL(dp)                                           :: new_force, r0, r1, ri(3), rj(3), &
    1598              :                                                             rref(3), scoord(3), x
    1599         2584 :       REAL(dp), ALLOCATABLE, DIMENSION(:, :)             :: blk
    1600              :       TYPE(cell_type), POINTER                           :: cell
    1601              :       TYPE(dbt_iterator_type)                            :: iter
    1602              :       TYPE(kpoint_type), POINTER                         :: kpoints
    1603         2584 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
    1604         2584 :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
    1605              : 
    1606         2584 :       NULLIFY (qs_kind_set, particle_set, kpoints, index_to_cell, cell_to_index, cell)
    1607              : 
    1608              :       CALL get_qs_env(qs_env, natom=natom, nkind=nkind, qs_kind_set=qs_kind_set, cell=cell, &
    1609         2584 :                       kpoints=kpoints, particle_set=particle_set)
    1610         2584 :       CALL get_kpoint_info(kpoints, cell_to_index=cell_to_index, index_to_cell=index_to_cell)
    1611              : 
    1612         2584 :       CALL dbt_get_info(t_2c_in, nblks_total=nblks)
    1613         2584 :       CPASSERT(nblks(1) == ri_data%ncell_RI*natom)
    1614         2584 :       CPASSERT(nblks(2) == ri_data%ncell_RI*natom)
    1615              : 
    1616         2584 :       nimg = ri_data%nimg
    1617              : 
    1618              :       !Loop over the RI cells and atoms, and apply bump accordingly
    1619         2584 :       r1 = ri_data%kp_RI_range
    1620         2584 :       r0 = ri_data%kp_bump_rad
    1621         2584 :       rref = pbc(particle_set(atom_i)%r, cell)
    1622              : 
    1623         2584 :       iat_of_kind = atom_of_kind(atom_i)
    1624         2584 :       ikind = kind_of(atom_i)
    1625              : 
    1626              : !$OMP PARALLEL DEFAULT(NONE) SHARED(t_2c_in,natom,ri_data,cell,particle_set,index_to_cell,pref, &
    1627              : !$OMP force,r0,r1,rref,atom_of_kind,kind_of,iat_of_kind,ikind,work_virial) &
    1628              : !$OMP PRIVATE(iter,ind,blk,found,i_RI,i_img,iatom,j_RI,j_img,jatom,scoord,ri,rj,jkind,jat_of_kind, &
    1629         2584 : !$OMP         new_force,i_xyz,i,x,j_xyz)
    1630              :       CALL dbt_iterator_start(iter, t_2c_in)
    1631              :       DO WHILE (dbt_iterator_blocks_left(iter))
    1632              :          CALL dbt_iterator_next_block(iter, ind)
    1633              :          IF (ind(1) .NE. ind(2)) CYCLE !bump matrix is diagonal
    1634              : 
    1635              :          CALL dbt_get_block(t_2c_in, ind, blk, found)
    1636              :          IF (.NOT. found) CYCLE
    1637              : 
    1638              :          !bump is a function of x = SQRT((R - Rref)^2). We refer to R as jatom, and Rref as atom_i
    1639              :          j_RI = (ind(2) - 1)/natom + 1
    1640              :          j_img = ri_data%RI_cell_to_img(j_RI)
    1641              :          jatom = ind(2) - (j_RI - 1)*natom
    1642              :          jat_of_kind = atom_of_kind(jatom)
    1643              :          jkind = kind_of(jatom)
    1644              : 
    1645              :          CALL real_to_scaled(scoord, pbc(particle_set(jatom)%r, cell), cell)
    1646              :          CALL scaled_to_real(rj, scoord(:) + index_to_cell(:, j_img), cell)
    1647              :          x = NORM2(rj - rref)
    1648              :          IF (x < r0 .OR. x > r1) CYCLE
    1649              : 
    1650              :          new_force = 0.0_dp
    1651              :          DO i = 1, SIZE(blk, 1)
    1652              :             new_force = new_force + blk(i, i)
    1653              :          END DO
    1654              :          new_force = pref*new_force*dbump(x, r0, r1)
    1655              : 
    1656              :          !x = SQRT((R - Rref)^2), so we multiply by dx/dR and dx/dRref
    1657              :          DO i_xyz = 1, 3
    1658              :             !Force acting on second atom
    1659              : !$OMP ATOMIC
    1660              :             force(jkind)%fock_4c(i_xyz, jat_of_kind) = force(jkind)%fock_4c(i_xyz, jat_of_kind) + &
    1661              :                                                        new_force*(rj(i_xyz) - rref(i_xyz))/x
    1662              : 
    1663              :             !virial acting on second atom
    1664              :             CALL real_to_scaled(scoord, rj, cell)
    1665              :             DO j_xyz = 1, 3
    1666              : !$OMP ATOMIC
    1667              :                work_virial(i_xyz, j_xyz) = work_virial(i_xyz, j_xyz) &
    1668              :                                            + new_force*scoord(j_xyz)*(rj(i_xyz) - rref(i_xyz))/x
    1669              :             END DO
    1670              : 
    1671              :             !Force acting on reference atom, defining the RI basis
    1672              : !$OMP ATOMIC
    1673              :             force(ikind)%fock_4c(i_xyz, iat_of_kind) = force(ikind)%fock_4c(i_xyz, iat_of_kind) - &
    1674              :                                                        new_force*(rj(i_xyz) - rref(i_xyz))/x
    1675              : 
    1676              :             !virial of ref atom
    1677              :             CALL real_to_scaled(scoord, rref, cell)
    1678              :             DO j_xyz = 1, 3
    1679              : !$OMP ATOMIC
    1680              :                work_virial(i_xyz, j_xyz) = work_virial(i_xyz, j_xyz) &
    1681              :                                            - new_force*scoord(j_xyz)*(rj(i_xyz) - rref(i_xyz))/x
    1682              :             END DO
    1683              :          END DO !i_xyz
    1684              : 
    1685              :          DEALLOCATE (blk)
    1686              :       END DO
    1687              :       CALL dbt_iterator_stop(iter)
    1688              : !$OMP END PARALLEL
    1689              : 
    1690         5168 :    END SUBROUTINE get_2c_bump_forces
    1691              : 
    1692              : ! **************************************************************************************************
    1693              : !> \brief The bumb function as defined by Juerg
    1694              : !> \param x ...
    1695              : !> \param r0 ...
    1696              : !> \param r1 ...
    1697              : !> \return ...
    1698              : ! **************************************************************************************************
    1699        27477 :    FUNCTION bump(x, r0, r1) RESULT(b)
    1700              :       REAL(dp), INTENT(IN)                               :: x, r0, r1
    1701              :       REAL(dp)                                           :: b
    1702              : 
    1703              :       REAL(dp)                                           :: r
    1704              : 
    1705              :       !Head-Gordon
    1706              :       !b = 1.0_dp/(1.0_dp+EXP((r1-r0)/(r1-x)-(r1-r0)/(x-r0)))
    1707              :       !Juerg
    1708        27477 :       r = (x - r0)/(r1 - r0)
    1709        27477 :       b = -6.0_dp*r**5 + 15.0_dp*r**4 - 10.0_dp*r**3 + 1.0_dp
    1710        27477 :       IF (x .GE. r1) b = 0.0_dp
    1711        27477 :       IF (x .LE. r0) b = 1.0_dp
    1712              : 
    1713        27477 :    END FUNCTION bump
    1714              : 
    1715              : ! **************************************************************************************************
    1716              : !> \brief The derivative of the bump function
    1717              : !> \param x ...
    1718              : !> \param r0 ...
    1719              : !> \param r1 ...
    1720              : !> \return ...
    1721              : ! **************************************************************************************************
    1722          597 :    FUNCTION dbump(x, r0, r1) RESULT(b)
    1723              :       REAL(dp), INTENT(IN)                               :: x, r0, r1
    1724              :       REAL(dp)                                           :: b
    1725              : 
    1726              :       REAL(dp)                                           :: r
    1727              : 
    1728          597 :       r = (x - r0)/(r1 - r0)
    1729          597 :       b = (-30.0_dp*r**4 + 60.0_dp*r**3 - 30.0_dp*r**2)/(r1 - r0)
    1730          597 :       IF (x .GE. r1) b = 0.0_dp
    1731          597 :       IF (x .LE. r0) b = 0.0_dp
    1732              : 
    1733          597 :    END FUNCTION dbump
    1734              : 
    1735              : ! **************************************************************************************************
    1736              : !> \brief return the cell index a+c corresponding to given cell index i and b, with i = a+c-b
    1737              : !> \param i_index ...
    1738              : !> \param b_index ...
    1739              : !> \param qs_env ...
    1740              : !> \return ...
    1741              : ! **************************************************************************************************
    1742       518709 :    FUNCTION get_apc_index_from_ib(i_index, b_index, qs_env) RESULT(apc_index)
    1743              :       INTEGER, INTENT(IN)                                :: i_index, b_index
    1744              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    1745              :       INTEGER                                            :: apc_index
    1746              : 
    1747              :       INTEGER, DIMENSION(3)                              :: cell_apc
    1748       518709 :       INTEGER, DIMENSION(:, :), POINTER                  :: index_to_cell
    1749       518709 :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
    1750              :       TYPE(kpoint_type), POINTER                         :: kpoints
    1751              : 
    1752       518709 :       CALL get_qs_env(qs_env, kpoints=kpoints)
    1753       518709 :       CALL get_kpoint_info(kpoints, cell_to_index=cell_to_index, index_to_cell=index_to_cell)
    1754              : 
    1755              :       !i = a+c-b => a+c = i+b
    1756      2074836 :       cell_apc(:) = index_to_cell(:, i_index) + index_to_cell(:, b_index)
    1757              : 
    1758      3541648 :       IF (ANY([cell_apc(1), cell_apc(2), cell_apc(3)] < LBOUND(cell_to_index)) .OR. &
    1759              :           ANY([cell_apc(1), cell_apc(2), cell_apc(3)] > UBOUND(cell_to_index))) THEN
    1760              : 
    1761              :          apc_index = 0
    1762              :       ELSE
    1763       447335 :          apc_index = cell_to_index(cell_apc(1), cell_apc(2), cell_apc(3))
    1764              :       END IF
    1765              : 
    1766       518709 :    END FUNCTION get_apc_index_from_ib
    1767              : 
    1768              : ! **************************************************************************************************
    1769              : !> \brief return the cell index i corresponding to the summ of cell_a and cell_c
    1770              : !> \param a_index ...
    1771              : !> \param c_index ...
    1772              : !> \param qs_env ...
    1773              : !> \return ...
    1774              : ! **************************************************************************************************
    1775            0 :    FUNCTION get_apc_index(a_index, c_index, qs_env) RESULT(i_index)
    1776              :       INTEGER, INTENT(IN)                                :: a_index, c_index
    1777              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    1778              :       INTEGER                                            :: i_index
    1779              : 
    1780              :       INTEGER, DIMENSION(3)                              :: cell_i
    1781            0 :       INTEGER, DIMENSION(:, :), POINTER                  :: index_to_cell
    1782            0 :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
    1783              :       TYPE(kpoint_type), POINTER                         :: kpoints
    1784              : 
    1785            0 :       CALL get_qs_env(qs_env, kpoints=kpoints)
    1786            0 :       CALL get_kpoint_info(kpoints, cell_to_index=cell_to_index, index_to_cell=index_to_cell)
    1787              : 
    1788            0 :       cell_i(:) = index_to_cell(:, a_index) + index_to_cell(:, c_index)
    1789              : 
    1790            0 :       IF (ANY([cell_i(1), cell_i(2), cell_i(3)] < LBOUND(cell_to_index)) .OR. &
    1791              :           ANY([cell_i(1), cell_i(2), cell_i(3)] > UBOUND(cell_to_index))) THEN
    1792              : 
    1793              :          i_index = 0
    1794              :       ELSE
    1795            0 :          i_index = cell_to_index(cell_i(1), cell_i(2), cell_i(3))
    1796              :       END IF
    1797              : 
    1798            0 :    END FUNCTION get_apc_index
    1799              : 
    1800              : ! **************************************************************************************************
    1801              : !> \brief return the cell index i corresponding to the summ of cell_a + cell_c - cell_b
    1802              : !> \param apc_index ...
    1803              : !> \param b_index ...
    1804              : !> \param qs_env ...
    1805              : !> \return ...
    1806              : ! **************************************************************************************************
    1807       714366 :    FUNCTION get_i_index(apc_index, b_index, qs_env) RESULT(i_index)
    1808              :       INTEGER, INTENT(IN)                                :: apc_index, b_index
    1809              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    1810              :       INTEGER                                            :: i_index
    1811              : 
    1812              :       INTEGER, DIMENSION(3)                              :: cell_i
    1813       714366 :       INTEGER, DIMENSION(:, :), POINTER                  :: index_to_cell
    1814       714366 :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
    1815              :       TYPE(kpoint_type), POINTER                         :: kpoints
    1816              : 
    1817       714366 :       CALL get_qs_env(qs_env, kpoints=kpoints)
    1818       714366 :       CALL get_kpoint_info(kpoints, cell_to_index=cell_to_index, index_to_cell=index_to_cell)
    1819              : 
    1820      2857464 :       cell_i(:) = index_to_cell(:, apc_index) - index_to_cell(:, b_index)
    1821              : 
    1822      4880718 :       IF (ANY([cell_i(1), cell_i(2), cell_i(3)] < LBOUND(cell_to_index)) .OR. &
    1823              :           ANY([cell_i(1), cell_i(2), cell_i(3)] > UBOUND(cell_to_index))) THEN
    1824              : 
    1825              :          i_index = 0
    1826              :       ELSE
    1827       611682 :          i_index = cell_to_index(cell_i(1), cell_i(2), cell_i(3))
    1828              :       END IF
    1829              : 
    1830       714366 :    END FUNCTION get_i_index
    1831              : 
    1832              : ! **************************************************************************************************
    1833              : !> \brief A routine that returns all allowed a,c pairs such that a+c images corresponds to the value
    1834              : !>        of the apc_index input. Takes into account that image a corresponds to 3c integrals, which
    1835              : !>        are ordered in their own way
    1836              : !> \param ac_pairs ...
    1837              : !> \param apc_index ...
    1838              : !> \param ri_data ...
    1839              : !> \param qs_env ...
    1840              : ! **************************************************************************************************
    1841        16994 :    SUBROUTINE get_ac_pairs(ac_pairs, apc_index, ri_data, qs_env)
    1842              :       INTEGER, DIMENSION(:, :), INTENT(INOUT)            :: ac_pairs
    1843              :       INTEGER, INTENT(IN)                                :: apc_index
    1844              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    1845              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    1846              : 
    1847              :       INTEGER                                            :: a_index, actual_img, c_index, nimg
    1848              : 
    1849        16994 :       nimg = SIZE(ac_pairs, 1)
    1850              : 
    1851      1479714 :       ac_pairs(:, :) = 0
    1852              : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(ac_pairs,nimg,ri_data,qs_env,apc_index) &
    1853        16994 : !$OMP PRIVATE(a_index,actual_img,c_index)
    1854              :       DO a_index = 1, nimg
    1855              :          actual_img = ri_data%idx_to_img(a_index)
    1856              :          !c = a+c - a
    1857              :          c_index = get_i_index(apc_index, actual_img, qs_env)
    1858              :          ac_pairs(a_index, 1) = a_index
    1859              :          ac_pairs(a_index, 2) = c_index
    1860              :       END DO
    1861              : !$OMP END PARALLEL DO
    1862              : 
    1863        16994 :    END SUBROUTINE get_ac_pairs
    1864              : 
    1865              : ! **************************************************************************************************
    1866              : !> \brief A routine that returns all allowed i,a+c pairs such that, for the given value of b, we have
    1867              : !>        i = a+c-b. Takes into account that image i corrsponds to the 3c ints, which are ordered in
    1868              : !>        their own way
    1869              : !> \param iapc_pairs ...
    1870              : !> \param b_index ...
    1871              : !> \param ri_data ...
    1872              : !> \param qs_env ...
    1873              : !> \param actual_i_img ...
    1874              : ! **************************************************************************************************
    1875        13756 :    SUBROUTINE get_iapc_pairs(iapc_pairs, b_index, ri_data, qs_env, actual_i_img)
    1876              :       INTEGER, DIMENSION(:, :), INTENT(INOUT)            :: iapc_pairs
    1877              :       INTEGER, INTENT(IN)                                :: b_index
    1878              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    1879              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    1880              :       INTEGER, DIMENSION(:), INTENT(INOUT), OPTIONAL     :: actual_i_img
    1881              : 
    1882              :       INTEGER                                            :: actual_img, apc_index, i_index, nimg
    1883              : 
    1884        13756 :       nimg = SIZE(iapc_pairs, 1)
    1885        57318 :       IF (PRESENT(actual_i_img)) actual_i_img(:) = 0
    1886              : 
    1887      1078686 :       iapc_pairs(:, :) = 0
    1888              : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(iapc_pairs,nimg,ri_data,qs_env,b_index,actual_i_img) &
    1889        13756 : !$OMP PRIVATE(i_index,actual_img,apc_index)
    1890              :       DO i_index = 1, nimg
    1891              :          actual_img = ri_data%idx_to_img(i_index)
    1892              :          apc_index = get_apc_index_from_ib(actual_img, b_index, qs_env)
    1893              :          IF (apc_index == 0) CYCLE
    1894              :          iapc_pairs(i_index, 1) = i_index
    1895              :          iapc_pairs(i_index, 2) = apc_index
    1896              :          IF (PRESENT(actual_i_img)) actual_i_img(i_index) = actual_img
    1897              :       END DO
    1898              : 
    1899        13756 :    END SUBROUTINE get_iapc_pairs
    1900              : 
    1901              : ! **************************************************************************************************
    1902              : !> \brief A function that, given a cell index a, returun the index corresponding to -a, and zero if
    1903              : !>        if out of bounds
    1904              : !> \param a_index ...
    1905              : !> \param qs_env ...
    1906              : !> \return ...
    1907              : ! **************************************************************************************************
    1908        83455 :    FUNCTION get_opp_index(a_index, qs_env) RESULT(opp_index)
    1909              :       INTEGER, INTENT(IN)                                :: a_index
    1910              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    1911              :       INTEGER                                            :: opp_index
    1912              : 
    1913              :       INTEGER, DIMENSION(3)                              :: opp_cell
    1914        83455 :       INTEGER, DIMENSION(:, :), POINTER                  :: index_to_cell
    1915        83455 :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
    1916              :       TYPE(kpoint_type), POINTER                         :: kpoints
    1917              : 
    1918        83455 :       NULLIFY (kpoints, cell_to_index, index_to_cell)
    1919              : 
    1920        83455 :       CALL get_qs_env(qs_env, kpoints=kpoints)
    1921        83455 :       CALL get_kpoint_info(kpoints, cell_to_index=cell_to_index, index_to_cell=index_to_cell)
    1922              : 
    1923       333820 :       opp_cell(:) = -index_to_cell(:, a_index)
    1924              : 
    1925       584185 :       IF (ANY([opp_cell(1), opp_cell(2), opp_cell(3)] < LBOUND(cell_to_index)) .OR. &
    1926              :           ANY([opp_cell(1), opp_cell(2), opp_cell(3)] > UBOUND(cell_to_index))) THEN
    1927              : 
    1928              :          opp_index = 0
    1929              :       ELSE
    1930        83455 :          opp_index = cell_to_index(opp_cell(1), opp_cell(2), opp_cell(3))
    1931              :       END IF
    1932              : 
    1933        83455 :    END FUNCTION get_opp_index
    1934              : 
    1935              : ! **************************************************************************************************
    1936              : !> \brief A routine that returns the actual non-symemtric density matrix for each image, by Fourier
    1937              : !>        transforming the kpoint density matrix
    1938              : !> \param rho_ao_t ...
    1939              : !> \param rho_ao ...
    1940              : !> \param scale_prev_p ...
    1941              : !> \param ri_data ...
    1942              : !> \param qs_env ...
    1943              : ! **************************************************************************************************
    1944          542 :    SUBROUTINE get_pmat_images(rho_ao_t, rho_ao, scale_prev_p, ri_data, qs_env)
    1945              :       TYPE(dbt_type), DIMENSION(:, :), INTENT(INOUT)     :: rho_ao_t
    1946              :       TYPE(dbcsr_p_type), DIMENSION(:, :), INTENT(INOUT) :: rho_ao
    1947              :       REAL(dp), INTENT(IN)                               :: scale_prev_p
    1948              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    1949              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    1950              : 
    1951              :       INTEGER                                            :: cell_j(3), i_img, i_spin, iatom, icol, &
    1952              :                                                             irow, j_img, jatom, mi_img, mj_img, &
    1953              :                                                             nimg, nspins
    1954          542 :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
    1955              :       LOGICAL                                            :: found
    1956              :       REAL(dp)                                           :: fac
    1957          542 :       REAL(dp), DIMENSION(:, :), POINTER                 :: pblock, pblock_desymm
    1958          542 :       TYPE(dbcsr_p_type), DIMENSION(:, :), POINTER       :: matrix_ks, rho_desymm
    1959         4878 :       TYPE(dbt_type)                                     :: tmp
    1960              :       TYPE(dft_control_type), POINTER                    :: dft_control
    1961              :       TYPE(kpoint_type), POINTER                         :: kpoints
    1962              :       TYPE(neighbor_list_iterator_p_type), &
    1963          542 :          DIMENSION(:), POINTER                           :: nl_iterator
    1964              :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
    1965          542 :          POINTER                                         :: sab_nl, sab_nl_nosym
    1966              :       TYPE(qs_scf_env_type), POINTER                     :: scf_env
    1967              : 
    1968          542 :       NULLIFY (rho_desymm, kpoints, sab_nl_nosym, scf_env, matrix_ks, dft_control, &
    1969          542 :                sab_nl, nl_iterator, cell_to_index, pblock, pblock_desymm)
    1970              : 
    1971          542 :       CALL get_qs_env(qs_env, kpoints=kpoints, scf_env=scf_env, matrix_ks_kp=matrix_ks, dft_control=dft_control)
    1972          542 :       CALL get_kpoint_info(kpoints, sab_nl_nosym=sab_nl_nosym, cell_to_index=cell_to_index, sab_nl=sab_nl)
    1973              : 
    1974          542 :       IF (dft_control%do_admm) THEN
    1975          302 :          CALL get_admm_env(qs_env%admm_env, matrix_ks_aux_fit_kp=matrix_ks)
    1976              :       END IF
    1977              : 
    1978          542 :       nspins = SIZE(matrix_ks, 1)
    1979          542 :       nimg = ri_data%nimg
    1980              : 
    1981        34828 :       ALLOCATE (rho_desymm(nspins, nimg))
    1982        15216 :       DO i_img = 1, nimg
    1983        33202 :          DO i_spin = 1, nspins
    1984        17986 :             ALLOCATE (rho_desymm(i_spin, i_img)%matrix)
    1985              :             CALL dbcsr_create(rho_desymm(i_spin, i_img)%matrix, template=matrix_ks(i_spin, i_img)%matrix, &
    1986        17986 :                               matrix_type=dbcsr_type_no_symmetry)
    1987        32660 :             CALL cp_dbcsr_alloc_block_from_nbl(rho_desymm(i_spin, i_img)%matrix, sab_nl_nosym)
    1988              :          END DO
    1989              :       END DO
    1990          542 :       CALL dbt_create(rho_desymm(1, 1)%matrix, tmp)
    1991              : 
    1992              :       !We transfor the symmtric typed (but not actually symmetric: P_ab^i = P_ba^-i) real-spaced density
    1993              :       !matrix into proper non-symemtric ones (using the same nl for consistency)
    1994          542 :       CALL neighbor_list_iterator_create(nl_iterator, sab_nl)
    1995        24035 :       DO WHILE (neighbor_list_iterate(nl_iterator) == 0)
    1996        23493 :          CALL get_iterator_info(nl_iterator, iatom=iatom, jatom=jatom, cell=cell_j)
    1997        23493 :          j_img = cell_to_index(cell_j(1), cell_j(2), cell_j(3))
    1998        23493 :          IF (j_img > nimg .OR. j_img < 1) CYCLE
    1999              : 
    2000        18111 :          fac = 1.0_dp
    2001        18111 :          IF (iatom == jatom) fac = 0.5_dp
    2002        18111 :          mj_img = get_opp_index(j_img, qs_env)
    2003              :          !if no opposite image, then no sum of P^j + P^-j => need full diag
    2004        18111 :          IF (mj_img == 0) fac = 1.0_dp
    2005              : 
    2006        18111 :          irow = iatom
    2007        18111 :          icol = jatom
    2008        18111 :          IF (iatom > jatom) THEN
    2009              :             !because symmetric nl. Value for atom pair i,j is actually stored in j,i if i > j
    2010         5935 :             irow = jatom
    2011         5935 :             icol = iatom
    2012              :          END IF
    2013              : 
    2014        41058 :          DO i_spin = 1, nspins
    2015        22405 :             CALL dbcsr_get_block_p(rho_ao(i_spin, j_img)%matrix, irow, icol, pblock, found)
    2016        22405 :             IF (.NOT. found) CYCLE
    2017              : 
    2018              :             !distribution of symm and non-symm matrix match in that way
    2019        22405 :             CALL dbcsr_get_block_p(rho_desymm(i_spin, j_img)%matrix, iatom, jatom, pblock_desymm, found)
    2020        22405 :             IF (.NOT. found) CYCLE
    2021              : 
    2022        90708 :             IF (iatom > jatom) THEN
    2023       723498 :                pblock_desymm(:, :) = fac*TRANSPOSE(pblock(:, :))
    2024              :             ELSE
    2025      1729360 :                pblock_desymm(:, :) = fac*pblock(:, :)
    2026              :             END IF
    2027              :          END DO
    2028              :       END DO
    2029          542 :       CALL neighbor_list_iterator_release(nl_iterator)
    2030              : 
    2031        15216 :       DO i_img = 1, nimg
    2032        33202 :          DO i_spin = 1, nspins
    2033        17986 :             CALL dbt_scale(rho_ao_t(i_spin, i_img), scale_prev_p)
    2034              : 
    2035        17986 :             CALL dbt_copy_matrix_to_tensor(rho_desymm(i_spin, i_img)%matrix, tmp)
    2036        17986 :             CALL dbt_copy(tmp, rho_ao_t(i_spin, i_img), summation=.TRUE., move_data=.TRUE.)
    2037              : 
    2038              :             !symmetrize by addin transpose of opp img
    2039        17986 :             mi_img = get_opp_index(i_img, qs_env)
    2040        17986 :             IF (mi_img > 0 .AND. mi_img .LE. nimg) THEN
    2041        15998 :                CALL dbt_copy_matrix_to_tensor(rho_desymm(i_spin, mi_img)%matrix, tmp)
    2042        15998 :                CALL dbt_copy(tmp, rho_ao_t(i_spin, i_img), order=[2, 1], summation=.TRUE., move_data=.TRUE.)
    2043              :             END IF
    2044        32660 :             CALL dbt_filter(rho_ao_t(i_spin, i_img), ri_data%filter_eps)
    2045              :          END DO
    2046              :       END DO
    2047              : 
    2048        15216 :       DO i_img = 1, nimg
    2049        33202 :          DO i_spin = 1, nspins
    2050        17986 :             CALL dbcsr_release(rho_desymm(i_spin, i_img)%matrix)
    2051        32660 :             DEALLOCATE (rho_desymm(i_spin, i_img)%matrix)
    2052              :          END DO
    2053              :       END DO
    2054              : 
    2055          542 :       CALL dbt_destroy(tmp)
    2056          542 :       DEALLOCATE (rho_desymm)
    2057              : 
    2058         1084 :    END SUBROUTINE get_pmat_images
    2059              : 
    2060              : ! **************************************************************************************************
    2061              : !> \brief A routine that, given a cell index b and atom indices ij, returns a 2c tensor with the HFX
    2062              : !>        potential (P_i^0|Q_j^b), within the extended RI basis
    2063              : !> \param t_2c_pot ...
    2064              : !> \param mat_orig ...
    2065              : !> \param atom_i ...
    2066              : !> \param atom_j ...
    2067              : !> \param img_b ...
    2068              : !> \param ri_data ...
    2069              : !> \param qs_env ...
    2070              : !> \param do_inverse ...
    2071              : !> \param para_env_ext ...
    2072              : !> \param blacs_env_ext ...
    2073              : !> \param dbcsr_template ...
    2074              : !> \param off_diagonal ...
    2075              : !> \param skip_inverse ...
    2076              : ! **************************************************************************************************
    2077         9208 :    SUBROUTINE get_ext_2c_int(t_2c_pot, mat_orig, atom_i, atom_j, img_b, ri_data, qs_env, do_inverse, &
    2078              :                              para_env_ext, blacs_env_ext, dbcsr_template, off_diagonal, skip_inverse)
    2079              :       TYPE(dbt_type), INTENT(INOUT)                      :: t_2c_pot
    2080              :       TYPE(dbcsr_type), DIMENSION(:), INTENT(INOUT)      :: mat_orig
    2081              :       INTEGER, INTENT(IN)                                :: atom_i, atom_j, img_b
    2082              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    2083              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    2084              :       LOGICAL, INTENT(IN), OPTIONAL                      :: do_inverse
    2085              :       TYPE(mp_para_env_type), OPTIONAL, POINTER          :: para_env_ext
    2086              :       TYPE(cp_blacs_env_type), OPTIONAL, POINTER         :: blacs_env_ext
    2087              :       TYPE(dbcsr_type), OPTIONAL, POINTER                :: dbcsr_template
    2088              :       LOGICAL, INTENT(IN), OPTIONAL                      :: off_diagonal, skip_inverse
    2089              : 
    2090              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'get_ext_2c_int'
    2091              : 
    2092              :       INTEGER :: group, handle, handle2, i_img, i_RI, iatom, iblk, ikind, img_tot, j_img, j_RI, &
    2093              :          jatom, jblk, jkind, n_dependent, natom, nblks_RI, nimg, nkind
    2094         9208 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: dist1, dist2
    2095         9208 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: present_atoms_i, present_atoms_j
    2096              :       INTEGER, DIMENSION(3)                              :: cell_b, cell_i, cell_j, cell_tot
    2097         9208 :       INTEGER, DIMENSION(:), POINTER                     :: col_dist, col_dist_ext, ri_blk_size_ext, &
    2098         9208 :                                                             row_dist, row_dist_ext
    2099         9208 :       INTEGER, DIMENSION(:, :), POINTER                  :: index_to_cell, pgrid
    2100         9208 :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
    2101              :       LOGICAL                                            :: do_inverse_prv, found, my_offd, &
    2102              :                                                             skip_inverse_prv, use_template
    2103              :       REAL(dp)                                           :: bfac, dij, r0, r1, threshold
    2104              :       REAL(dp), DIMENSION(3)                             :: ri, rij, rj, rref, scoord
    2105         9208 :       REAL(dp), DIMENSION(:, :), POINTER                 :: pblock
    2106              :       TYPE(cell_type), POINTER                           :: cell
    2107              :       TYPE(cp_blacs_env_type), POINTER                   :: blacs_env
    2108              :       TYPE(dbcsr_distribution_type)                      :: dbcsr_dist, dbcsr_dist_ext
    2109              :       TYPE(dbcsr_iterator_type)                          :: dbcsr_iter
    2110              :       TYPE(dbcsr_type)                                   :: work, work_tight, work_tight_inv
    2111        64456 :       TYPE(dbt_type)                                     :: t_2c_tmp
    2112              :       TYPE(distribution_2d_type), POINTER                :: dist_2d
    2113              :       TYPE(gto_basis_set_p_type), ALLOCATABLE, &
    2114         9208 :          DIMENSION(:), TARGET                            :: basis_set_RI
    2115              :       TYPE(kpoint_type), POINTER                         :: kpoints
    2116              :       TYPE(mp_para_env_type), POINTER                    :: para_env
    2117              :       TYPE(neighbor_list_iterator_p_type), &
    2118         9208 :          DIMENSION(:), POINTER                           :: nl_iterator
    2119              :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
    2120         9208 :          POINTER                                         :: nl_2c
    2121         9208 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
    2122         9208 :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
    2123              : 
    2124         9208 :       NULLIFY (qs_kind_set, nl_2c, nl_iterator, cell, kpoints, cell_to_index, index_to_cell, dist_2d, &
    2125         9208 :                para_env, pblock, blacs_env, particle_set, col_dist, row_dist, pgrid, &
    2126         9208 :                col_dist_ext, row_dist_ext)
    2127              : 
    2128         9208 :       CALL timeset(routineN, handle)
    2129              : 
    2130              :       !Idea: run over the neighbor list once for i and once for j, and record in which cell the MIC
    2131              :       !      atoms are. Then loop over the atoms and only take the pairs the we need
    2132              : 
    2133              :       CALL get_qs_env(qs_env, natom=natom, nkind=nkind, qs_kind_set=qs_kind_set, cell=cell, &
    2134         9208 :                       kpoints=kpoints, para_env=para_env, blacs_env=blacs_env, particle_set=particle_set)
    2135         9208 :       CALL get_kpoint_info(kpoints, cell_to_index=cell_to_index, index_to_cell=index_to_cell)
    2136              : 
    2137         9208 :       do_inverse_prv = .FALSE.
    2138         9208 :       IF (PRESENT(do_inverse)) do_inverse_prv = do_inverse
    2139          320 :       IF (do_inverse_prv) THEN
    2140          320 :          CPASSERT(atom_i == atom_j)
    2141              :       END IF
    2142              : 
    2143         9208 :       skip_inverse_prv = .FALSE.
    2144         9208 :       IF (PRESENT(skip_inverse)) skip_inverse_prv = skip_inverse
    2145              : 
    2146         9208 :       my_offd = .FALSE.
    2147         9208 :       IF (PRESENT(off_diagonal)) my_offd = off_diagonal
    2148              : 
    2149         9208 :       IF (PRESENT(para_env_ext)) para_env => para_env_ext
    2150         9208 :       IF (PRESENT(blacs_env_ext)) blacs_env => blacs_env_ext
    2151              : 
    2152         9208 :       nimg = SIZE(mat_orig)
    2153              : 
    2154         9208 :       CALL timeset(routineN//"_nl_iter", handle2)
    2155              : 
    2156              :       !create our own dist_2d in the subgroup
    2157        36832 :       ALLOCATE (dist1(natom), dist2(natom))
    2158        27624 :       DO iatom = 1, natom
    2159        18416 :          dist1(iatom) = MOD(iatom, blacs_env%num_pe(1))
    2160        27624 :          dist2(iatom) = MOD(iatom, blacs_env%num_pe(2))
    2161              :       END DO
    2162         9208 :       CALL distribution_2d_create(dist_2d, dist1, dist2, nkind, particle_set, blacs_env_ext=blacs_env)
    2163              : 
    2164        40679 :       ALLOCATE (basis_set_RI(nkind))
    2165         9208 :       CALL basis_set_list_setup(basis_set_RI, ri_data%ri_basis_type, qs_kind_set)
    2166              : 
    2167              :       CALL build_2c_neighbor_lists(nl_2c, basis_set_RI, basis_set_RI, ri_data%ri_metric, &
    2168         9208 :                                    "HFX_2c_nl_RI", qs_env, sym_ij=.FALSE., dist_2d=dist_2d)
    2169              : 
    2170        55248 :       ALLOCATE (present_atoms_i(natom, nimg), present_atoms_j(natom, nimg))
    2171       953167 :       present_atoms_i = 0
    2172       953167 :       present_atoms_j = 0
    2173              : 
    2174         9208 :       CALL neighbor_list_iterator_create(nl_iterator, nl_2c)
    2175       409876 :       DO WHILE (neighbor_list_iterate(nl_iterator) == 0)
    2176              :          CALL get_iterator_info(nl_iterator, iatom=iatom, jatom=jatom, r=rij, cell=cell_j, &
    2177       400668 :                                 ikind=ikind, jkind=jkind)
    2178              : 
    2179      1602672 :          dij = NORM2(rij)
    2180              : 
    2181       400668 :          j_img = cell_to_index(cell_j(1), cell_j(2), cell_j(3))
    2182       400668 :          IF (j_img > nimg .OR. j_img < 1) CYCLE
    2183              : 
    2184       398338 :          IF (iatom == atom_i .AND. dij .LE. ri_data%kp_RI_range) present_atoms_i(jatom, j_img) = 1
    2185       407546 :          IF (iatom == atom_j .AND. dij .LE. ri_data%kp_RI_range) present_atoms_j(jatom, j_img) = 1
    2186              :       END DO
    2187         9208 :       CALL neighbor_list_iterator_release(nl_iterator)
    2188         9208 :       CALL release_neighbor_list_sets(nl_2c)
    2189         9208 :       CALL distribution_2d_release(dist_2d)
    2190         9208 :       CALL timestop(handle2)
    2191              : 
    2192         9208 :       CALL para_env%sum(present_atoms_i)
    2193         9208 :       CALL para_env%sum(present_atoms_j)
    2194              : 
    2195              :       !Need to build a work matrix with matching distribution to mat_orig
    2196              :       !If template is provided, use it. If not, we create it.
    2197         9208 :       use_template = .FALSE.
    2198         9208 :       IF (PRESENT(dbcsr_template)) THEN
    2199         8452 :          IF (ASSOCIATED(dbcsr_template)) use_template = .TRUE.
    2200              :       END IF
    2201              : 
    2202              :       IF (use_template) THEN
    2203         8158 :          CALL dbcsr_create(work, template=dbcsr_template)
    2204              :       ELSE
    2205         1050 :          CALL dbcsr_get_info(mat_orig(1), distribution=dbcsr_dist)
    2206         1050 :          CALL dbcsr_distribution_get(dbcsr_dist, row_dist=row_dist, col_dist=col_dist, group=group, pgrid=pgrid)
    2207         4200 :          ALLOCATE (row_dist_ext(ri_data%ncell_RI*natom), col_dist_ext(ri_data%ncell_RI*natom))
    2208         2100 :          ALLOCATE (ri_blk_size_ext(ri_data%ncell_RI*natom))
    2209         7204 :          DO i_RI = 1, ri_data%ncell_RI
    2210        30770 :             row_dist_ext((i_RI - 1)*natom + 1:i_RI*natom) = row_dist(:)
    2211        30770 :             col_dist_ext((i_RI - 1)*natom + 1:i_RI*natom) = col_dist(:)
    2212        19512 :             RI_blk_size_ext((i_RI - 1)*natom + 1:i_RI*natom) = ri_data%bsizes_RI(:)
    2213              :          END DO
    2214              : 
    2215              :          CALL dbcsr_distribution_new(dbcsr_dist_ext, group=group, pgrid=pgrid, &
    2216         1050 :                                      row_dist=row_dist_ext, col_dist=col_dist_ext)
    2217              :          CALL dbcsr_create(work, dist=dbcsr_dist_ext, name="RI_ext", matrix_type=dbcsr_type_no_symmetry, &
    2218         1050 :                            row_blk_size=RI_blk_size_ext, col_blk_size=RI_blk_size_ext)
    2219         1050 :          CALL dbcsr_distribution_release(dbcsr_dist_ext)
    2220         1050 :          DEALLOCATE (col_dist_ext, row_dist_ext, RI_blk_size_ext)
    2221              : 
    2222         3150 :          IF (PRESENT(dbcsr_template)) THEN
    2223          294 :             ALLOCATE (dbcsr_template)
    2224          294 :             CALL dbcsr_create(dbcsr_template, template=work)
    2225              :          END IF
    2226              :       END IF !use_template
    2227              : 
    2228        36832 :       cell_b(:) = index_to_cell(:, img_b)
    2229       323861 :       DO i_img = 1, nimg
    2230       314653 :          i_RI = ri_data%img_to_RI_cell(i_img)
    2231       314653 :          IF (i_RI == 0) CYCLE
    2232       221148 :          cell_i(:) = index_to_cell(:, i_img)
    2233      2283724 :          DO j_img = 1, nimg
    2234      2219229 :             j_RI = ri_data%img_to_RI_cell(j_img)
    2235      2219229 :             IF (j_RI == 0) CYCLE
    2236      1792812 :             cell_j(:) = index_to_cell(:, j_img)
    2237      1792812 :             cell_tot = cell_j - cell_i + cell_b
    2238              : 
    2239      3092996 :             IF (ANY([cell_tot(1), cell_tot(2), cell_tot(3)] < LBOUND(cell_to_index)) .OR. &
    2240              :                 ANY([cell_tot(1), cell_tot(2), cell_tot(3)] > UBOUND(cell_to_index))) CYCLE
    2241       409584 :             img_tot = cell_to_index(cell_tot(1), cell_tot(2), cell_tot(3))
    2242       409584 :             IF (img_tot > nimg .OR. img_tot < 1) CYCLE
    2243              : 
    2244       280422 :             CALL dbcsr_iterator_start(dbcsr_iter, mat_orig(img_tot))
    2245       817406 :             DO WHILE (dbcsr_iterator_blocks_left(dbcsr_iter))
    2246       536984 :                CALL dbcsr_iterator_next_block(dbcsr_iter, row=iatom, column=jatom)
    2247       536984 :                IF (present_atoms_i(iatom, i_img) == 0) CYCLE
    2248       209075 :                IF (present_atoms_j(jatom, j_img) == 0) CYCLE
    2249        93781 :                IF (my_offd .AND. (i_RI - 1)*natom + iatom == (j_RI - 1)*natom + jatom) CYCLE
    2250              : 
    2251        93456 :                CALL dbcsr_get_block_p(mat_orig(img_tot), iatom, jatom, pblock, found)
    2252        93456 :                IF (.NOT. found) CYCLE
    2253              : 
    2254       817406 :                CALL dbcsr_put_block(work, (i_RI - 1)*natom + iatom, (j_RI - 1)*natom + jatom, pblock)
    2255              : 
    2256              :             END DO
    2257      2775685 :             CALL dbcsr_iterator_stop(dbcsr_iter)
    2258              : 
    2259              :          END DO !j_img
    2260              :       END DO !i_img
    2261         9208 :       CALL dbcsr_finalize(work)
    2262              : 
    2263         9208 :       IF (do_inverse_prv) THEN
    2264              : 
    2265          320 :          r1 = ri_data%kp_RI_range
    2266          320 :          r0 = ri_data%kp_bump_rad
    2267              : 
    2268              :          !Because there are a lot of empty rows/cols in work, we need to get rid of them for inversion
    2269        24296 :          nblks_RI = SUM(present_atoms_i)
    2270         1600 :          ALLOCATE (col_dist_ext(nblks_RI), row_dist_ext(nblks_RI), RI_blk_size_ext(nblks_RI))
    2271          320 :          iblk = 0
    2272         8312 :          DO i_img = 1, nimg
    2273         7992 :             i_RI = ri_data%img_to_RI_cell(i_img)
    2274         7992 :             IF (i_RI == 0) CYCLE
    2275         6104 :             DO iatom = 1, natom
    2276         3856 :                IF (present_atoms_i(iatom, i_img) == 0) CYCLE
    2277         1300 :                iblk = iblk + 1
    2278         1300 :                col_dist_ext(iblk) = col_dist(iatom)
    2279         1300 :                row_dist_ext(iblk) = row_dist(iatom)
    2280        11848 :                RI_blk_size_ext(iblk) = ri_data%bsizes_RI(iatom)
    2281              :             END DO
    2282              :          END DO
    2283              : 
    2284              :          CALL dbcsr_distribution_new(dbcsr_dist_ext, group=group, pgrid=pgrid, &
    2285          320 :                                      row_dist=row_dist_ext, col_dist=col_dist_ext)
    2286              :          CALL dbcsr_create(work_tight, dist=dbcsr_dist_ext, name="RI_ext", matrix_type=dbcsr_type_no_symmetry, &
    2287          320 :                            row_blk_size=RI_blk_size_ext, col_blk_size=RI_blk_size_ext)
    2288              :          CALL dbcsr_create(work_tight_inv, dist=dbcsr_dist_ext, name="RI_ext", matrix_type=dbcsr_type_no_symmetry, &
    2289          320 :                            row_blk_size=RI_blk_size_ext, col_blk_size=RI_blk_size_ext)
    2290          320 :          CALL dbcsr_distribution_release(dbcsr_dist_ext)
    2291          320 :          DEALLOCATE (col_dist_ext, row_dist_ext, RI_blk_size_ext)
    2292              : 
    2293              :          !We apply a bump function to the RI metric inverse for smooth RI basis extension:
    2294              :          ! S^-1 = B * ((P|Q)_D + B*(P|Q)_OD*B)^-1 * B, with D block-diagonal blocks and OD off-diagonal
    2295          320 :          rref = pbc(particle_set(atom_i)%r, cell)
    2296              : 
    2297          320 :          iblk = 0
    2298         8312 :          DO i_img = 1, nimg
    2299         7992 :             i_RI = ri_data%img_to_RI_cell(i_img)
    2300         7992 :             IF (i_RI == 0) CYCLE
    2301         6104 :             DO iatom = 1, natom
    2302         3856 :                IF (present_atoms_i(iatom, i_img) == 0) CYCLE
    2303         1300 :                iblk = iblk + 1
    2304              : 
    2305         1300 :                CALL real_to_scaled(scoord, pbc(particle_set(iatom)%r, cell), cell)
    2306         5200 :                CALL scaled_to_real(ri, scoord(:) + index_to_cell(:, i_img), cell)
    2307              : 
    2308         1300 :                jblk = 0
    2309        47668 :                DO j_img = 1, nimg
    2310        38376 :                   j_RI = ri_data%img_to_RI_cell(j_img)
    2311        38376 :                   IF (j_RI == 0) CYCLE
    2312        32032 :                   DO jatom = 1, natom
    2313        18784 :                      IF (present_atoms_j(jatom, j_img) == 0) CYCLE
    2314         6124 :                      jblk = jblk + 1
    2315              : 
    2316         6124 :                      CALL real_to_scaled(scoord, pbc(particle_set(jatom)%r, cell), cell)
    2317        24496 :                      CALL scaled_to_real(rj, scoord(:) + index_to_cell(:, j_img), cell)
    2318              : 
    2319         6124 :                      CALL dbcsr_get_block_p(work, (i_RI - 1)*natom + iatom, (j_RI - 1)*natom + jatom, pblock, found)
    2320         6124 :                      IF (.NOT. found) CYCLE
    2321              : 
    2322         2732 :                      bfac = 1.0_dp
    2323        15224 :                      IF (iblk .NE. jblk) bfac = bump(NORM2(ri - rref), r0, r1)*bump(NORM2(rj - rref), r0, r1)
    2324      5601896 :                      CALL dbcsr_put_block(work_tight, iblk, jblk, bfac*pblock(:, :))
    2325              :                   END DO
    2326              :                END DO
    2327              :             END DO
    2328              :          END DO
    2329          320 :          CALL dbcsr_finalize(work_tight)
    2330          320 :          CALL dbcsr_clear(work)
    2331              : 
    2332          320 :          IF (.NOT. skip_inverse_prv) THEN
    2333          160 :             SELECT CASE (ri_data%t2c_method)
    2334              :             CASE (hfx_ri_do_2c_iter)
    2335            0 :                threshold = MAX(ri_data%filter_eps, 1.0e-12_dp)
    2336            0 :                CALL invert_hotelling(work_tight_inv, work_tight, threshold=threshold, silent=.FALSE.)
    2337              :             CASE (hfx_ri_do_2c_cholesky)
    2338          160 :                CALL dbcsr_copy(work_tight_inv, work_tight)
    2339          160 :                CALL cp_dbcsr_cholesky_decompose(work_tight_inv, para_env=para_env, blacs_env=blacs_env)
    2340              :                CALL cp_dbcsr_cholesky_invert(work_tight_inv, para_env=para_env, blacs_env=blacs_env, &
    2341          160 :                                              uplo_to_full=.TRUE.)
    2342              :             CASE (hfx_ri_do_2c_diag)
    2343            0 :                CALL dbcsr_copy(work_tight_inv, work_tight)
    2344              :                CALL cp_dbcsr_power(work_tight_inv, -1.0_dp, ri_data%eps_eigval, n_dependent, &
    2345          160 :                                    para_env, blacs_env, verbose=ri_data%unit_nr_dbcsr > 0)
    2346              :             END SELECT
    2347              :          ELSE
    2348          160 :             CALL dbcsr_copy(work_tight_inv, work_tight)
    2349              :          END IF
    2350              : 
    2351              :          !move back data to standard extended RI pattern
    2352              :          !Note: we apply the external bump to ((P|Q)_D + B*(P|Q)_OD*B)^-1 later, because this matrix
    2353              :          !      is required for forces
    2354          320 :          iblk = 0
    2355         8312 :          DO i_img = 1, nimg
    2356         7992 :             i_RI = ri_data%img_to_RI_cell(i_img)
    2357         7992 :             IF (i_RI == 0) CYCLE
    2358         6104 :             DO iatom = 1, natom
    2359         3856 :                IF (present_atoms_i(iatom, i_img) == 0) CYCLE
    2360         1300 :                iblk = iblk + 1
    2361              : 
    2362         1300 :                jblk = 0
    2363        47668 :                DO j_img = 1, nimg
    2364        38376 :                   j_RI = ri_data%img_to_RI_cell(j_img)
    2365        38376 :                   IF (j_RI == 0) CYCLE
    2366        32032 :                   DO jatom = 1, natom
    2367        18784 :                      IF (present_atoms_j(jatom, j_img) == 0) CYCLE
    2368         6124 :                      jblk = jblk + 1
    2369              : 
    2370         6124 :                      CALL dbcsr_get_block_p(work_tight_inv, iblk, jblk, pblock, found)
    2371         6124 :                      IF (.NOT. found) CYCLE
    2372              : 
    2373        60057 :                      CALL dbcsr_put_block(work, (i_RI - 1)*natom + iatom, (j_RI - 1)*natom + jatom, pblock)
    2374              :                   END DO
    2375              :                END DO
    2376              :             END DO
    2377              :          END DO
    2378          320 :          CALL dbcsr_finalize(work)
    2379              : 
    2380          320 :          CALL dbcsr_release(work_tight)
    2381          640 :          CALL dbcsr_release(work_tight_inv)
    2382              :       END IF
    2383              : 
    2384         9208 :       CALL dbt_create(work, t_2c_tmp)
    2385         9208 :       CALL dbt_copy_matrix_to_tensor(work, t_2c_tmp)
    2386         9208 :       CALL dbt_copy(t_2c_tmp, t_2c_pot, move_data=.TRUE.)
    2387         9208 :       CALL dbt_filter(t_2c_pot, ri_data%filter_eps)
    2388              : 
    2389         9208 :       CALL dbt_destroy(t_2c_tmp)
    2390         9208 :       CALL dbcsr_release(work)
    2391              : 
    2392         9208 :       CALL timestop(handle)
    2393              : 
    2394        36832 :    END SUBROUTINE get_ext_2c_int
    2395              : 
    2396              : ! **************************************************************************************************
    2397              : !> \brief Pre-contract the density matrices with the 3-center integrals:
    2398              : !>        P_sigma^a,lambda^a+c (mu^0 sigma^a| P^0)
    2399              : !> \param t_3c_apc ...
    2400              : !> \param rho_ao_t ...
    2401              : !> \param ri_data ...
    2402              : !> \param qs_env ...
    2403              : ! **************************************************************************************************
    2404          294 :    SUBROUTINE contract_pmat_3c(t_3c_apc, rho_ao_t, ri_data, qs_env)
    2405              :       TYPE(dbt_type), DIMENSION(:, :), INTENT(INOUT)     :: t_3c_apc, rho_ao_t
    2406              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    2407              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    2408              : 
    2409              :       CHARACTER(len=*), PARAMETER                        :: routineN = 'contract_pmat_3c'
    2410              : 
    2411              :       INTEGER                                            :: apc_img, b_img, batch_size, handle, &
    2412              :                                                             i_batch, i_img, i_spin, idx, j_batch, &
    2413              :                                                             n_batch_img, n_batch_nze, nimg, &
    2414              :                                                             nimg_nze, nspins
    2415              :       INTEGER(int_8)                                     :: nflop, nze
    2416          294 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: apc_filter, batch_ranges_img, &
    2417          294 :                                                             batch_ranges_nze, int_indices
    2418          294 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: ac_pairs, iapc_pairs
    2419              :       REAL(dp)                                           :: occ, t1, t2
    2420         2646 :       TYPE(dbt_type)                                     :: t_3c_tmp
    2421          294 :       TYPE(dbt_type), ALLOCATABLE, DIMENSION(:)          :: ints_stack, res_stack, rho_stack
    2422              :       TYPE(dft_control_type), POINTER                    :: dft_control
    2423              : 
    2424          294 :       CALL timeset(routineN, handle)
    2425              : 
    2426          294 :       CALL get_qs_env(qs_env, dft_control=dft_control)
    2427              : 
    2428          294 :       nimg = ri_data%nimg
    2429          294 :       nimg_nze = ri_data%nimg_nze
    2430          294 :       nspins = dft_control%nspins
    2431              : 
    2432          294 :       CALL dbt_create(t_3c_apc(1, 1), t_3c_tmp)
    2433              : 
    2434          294 :       batch_size = ri_data%kp_stack_size
    2435              : 
    2436         1470 :       ALLOCATE (apc_filter(nimg), iapc_pairs(nimg, 2))
    2437         8182 :       apc_filter = 0
    2438         8182 :       DO b_img = 1, nimg
    2439         7888 :          CALL get_iapc_pairs(iapc_pairs, b_img, ri_data, qs_env)
    2440       217710 :          DO i_img = 1, nimg_nze
    2441       209528 :             idx = iapc_pairs(i_img, 2)
    2442       209528 :             IF (idx < 1 .OR. idx > nimg) CYCLE
    2443       217416 :             apc_filter(idx) = 1
    2444              :          END DO
    2445              :       END DO
    2446              : 
    2447              :       !batching over all images
    2448          294 :       n_batch_img = nimg/batch_size
    2449          294 :       IF (MODULO(nimg, batch_size) .NE. 0) n_batch_img = n_batch_img + 1
    2450          882 :       ALLOCATE (batch_ranges_img(n_batch_img + 1))
    2451          876 :       DO i_batch = 1, n_batch_img
    2452          876 :          batch_ranges_img(i_batch) = (i_batch - 1)*batch_size + 1
    2453              :       END DO
    2454          294 :       batch_ranges_img(n_batch_img + 1) = nimg + 1
    2455              : 
    2456              :       !batching over images with non-zero 3c integrals
    2457          294 :       n_batch_nze = nimg_nze/batch_size
    2458          294 :       IF (MODULO(nimg_nze, batch_size) .NE. 0) n_batch_nze = n_batch_nze + 1
    2459          882 :       ALLOCATE (batch_ranges_nze(n_batch_nze + 1))
    2460          720 :       DO i_batch = 1, n_batch_nze
    2461          720 :          batch_ranges_nze(i_batch) = (i_batch - 1)*batch_size + 1
    2462              :       END DO
    2463          294 :       batch_ranges_nze(n_batch_nze + 1) = nimg_nze + 1
    2464              : 
    2465              :       !Create the stack tensors in the approriate distribution
    2466         9114 :       ALLOCATE (rho_stack(2), ints_stack(2), res_stack(2))
    2467              :       CALL get_stack_tensors(res_stack, rho_stack, ints_stack, rho_ao_t(1, 1), &
    2468          294 :                              ri_data%t_3c_int_ctr_1(1, 1), batch_size, ri_data, qs_env)
    2469              : 
    2470         1176 :       ALLOCATE (ac_pairs(nimg, 2), int_indices(nimg_nze))
    2471         6056 :       DO i_img = 1, nimg_nze
    2472         6056 :          int_indices(i_img) = i_img
    2473              :       END DO
    2474              : 
    2475          294 :       t1 = m_walltime()
    2476          720 :       DO j_batch = 1, n_batch_nze
    2477              :          !First batch is over the integrals. They are always in the same order, consistent with get_ac_pairs
    2478              :          CALL fill_3c_stack(ints_stack(1), ri_data%t_3c_int_ctr_1(1, :), int_indices, 3, ri_data, &
    2479         1278 :                             img_bounds=[batch_ranges_nze(j_batch), batch_ranges_nze(j_batch + 1)])
    2480          426 :          CALL dbt_copy(ints_stack(1), ints_stack(2), move_data=.TRUE.)
    2481              : 
    2482         1280 :          DO i_spin = 1, nspins
    2483         2296 :             DO i_batch = 1, n_batch_img
    2484              :                !Second batch is over the P matrix. Here we fill the stacked rho tensors col by col
    2485        18304 :                DO apc_img = batch_ranges_img(i_batch), batch_ranges_img(i_batch + 1) - 1
    2486        16994 :                   IF (apc_filter(apc_img) == 0) CYCLE
    2487        16994 :                   CALL get_ac_pairs(ac_pairs, apc_img, ri_data, qs_env)
    2488              :                   CALL fill_2c_stack(rho_stack(1), rho_ao_t(i_spin, :), ac_pairs(:, 2), 1, ri_data, &
    2489              :                                      img_bounds=[batch_ranges_nze(j_batch), batch_ranges_nze(j_batch + 1)], &
    2490        52292 :                                      shift=apc_img - batch_ranges_img(i_batch) + 1)
    2491              : 
    2492              :                END DO !apc_img
    2493         1310 :                CALL get_tensor_occupancy(rho_stack(1), nze, occ)
    2494         1310 :                IF (nze == 0) CYCLE
    2495         1290 :                CALL dbt_copy(rho_stack(1), rho_stack(2), move_data=.TRUE.)
    2496              : 
    2497              :                !The actual contraction
    2498         1290 :                CALL dbt_batched_contract_init(rho_stack(2))
    2499              :                CALL dbt_contract(1.0_dp, ints_stack(2), rho_stack(2), &
    2500              :                                  0.0_dp, res_stack(2), map_1=[1, 2], map_2=[3], &
    2501              :                                  contract_1=[3], notcontract_1=[1, 2], &
    2502              :                                  contract_2=[1], notcontract_2=[2], &
    2503         1290 :                                  filter_eps=ri_data%filter_eps, flop=nflop)
    2504         1290 :                ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    2505         1290 :                CALL dbt_batched_contract_finalize(rho_stack(2))
    2506         1290 :                CALL dbt_copy(res_stack(2), res_stack(1), move_data=.TRUE.)
    2507              : 
    2508        19996 :                DO apc_img = batch_ranges_img(i_batch), batch_ranges_img(i_batch + 1) - 1
    2509              :                   !Destack the resulting tensor and put it in t_3c_apc with correct apc_img
    2510        16836 :                   IF (apc_filter(apc_img) == 0) CYCLE
    2511        16836 :                   CALL unstack_t_3c_apc(t_3c_tmp, res_stack(1), apc_img - batch_ranges_img(i_batch) + 1)
    2512        18146 :                   CALL dbt_copy(t_3c_tmp, t_3c_apc(i_spin, apc_img), summation=.TRUE., move_data=.TRUE.)
    2513              :                END DO
    2514              : 
    2515              :             END DO !i_batch
    2516              :          END DO !i_spin
    2517              :       END DO !j_batch
    2518          294 :       DEALLOCATE (batch_ranges_img)
    2519          294 :       DEALLOCATE (batch_ranges_nze)
    2520          294 :       t2 = m_walltime()
    2521          294 :       ri_data%dbcsr_time = ri_data%dbcsr_time + t2 - t1
    2522              : 
    2523          294 :       CALL dbt_destroy(rho_stack(1))
    2524          294 :       CALL dbt_destroy(rho_stack(2))
    2525          294 :       CALL dbt_destroy(ints_stack(1))
    2526          294 :       CALL dbt_destroy(ints_stack(2))
    2527          294 :       CALL dbt_destroy(res_stack(1))
    2528          294 :       CALL dbt_destroy(res_stack(2))
    2529          294 :       CALL dbt_destroy(t_3c_tmp)
    2530              : 
    2531          294 :       CALL timestop(handle)
    2532              : 
    2533         2940 :    END SUBROUTINE contract_pmat_3c
    2534              : 
    2535              : ! **************************************************************************************************
    2536              : !> \brief Pre-contract 3-center integrals with the bumped invrse RI metric, for each atom
    2537              : !> \param t_3c_int ...
    2538              : !> \param ri_data ...
    2539              : !> \param qs_env ...
    2540              : ! **************************************************************************************************
    2541           80 :    SUBROUTINE precontract_3c_ints(t_3c_int, ri_data, qs_env)
    2542              :       TYPE(dbt_type), DIMENSION(:, :), INTENT(INOUT)     :: t_3c_int
    2543              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    2544              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    2545              : 
    2546              :       CHARACTER(len=*), PARAMETER :: routineN = 'precontract_3c_ints'
    2547              : 
    2548              :       INTEGER                                            :: batch_size, handle, i_batch, i_img, &
    2549              :                                                             i_RI, iatom, is, n_batch, natom, &
    2550              :                                                             nblks, nblks_3c(3), nimg
    2551              :       INTEGER(int_8)                                     :: nflop
    2552           80 :       INTEGER, ALLOCATABLE, DIMENSION(:) :: batch_ranges, bsizes_RI_ext, bsizes_RI_ext_split, &
    2553           80 :          bsizes_stack, dist1, dist2, dist3, dist_stack3, idx_to_at_AO, int_indices
    2554          720 :       TYPE(dbt_distribution_type)                        :: t_dist
    2555        16720 :       TYPE(dbt_type)                                     :: t_2c_RI_tmp(2), t_3c_tmp(3)
    2556              : 
    2557           80 :       CALL timeset(routineN, handle)
    2558              : 
    2559           80 :       CALL get_qs_env(qs_env, natom=natom)
    2560              : 
    2561           80 :       nimg = ri_data%nimg
    2562          240 :       ALLOCATE (int_indices(nimg))
    2563         2078 :       DO i_img = 1, nimg
    2564         2078 :          int_indices(i_img) = i_img
    2565              :       END DO
    2566              : 
    2567          240 :       ALLOCATE (idx_to_at_AO(SIZE(ri_data%bsizes_AO_split)))
    2568           80 :       CALL get_idx_to_atom(idx_to_at_AO, ri_data%bsizes_AO_split, ri_data%bsizes_AO)
    2569              : 
    2570           80 :       nblks = SIZE(ri_data%bsizes_RI_split)
    2571          240 :       ALLOCATE (bsizes_RI_ext(ri_data%ncell_RI*natom))
    2572          240 :       ALLOCATE (bsizes_RI_ext_split(ri_data%ncell_RI*nblks))
    2573          562 :       DO i_RI = 1, ri_data%ncell_RI
    2574         1446 :          bsizes_RI_ext((i_RI - 1)*natom + 1:i_RI*natom) = ri_data%bsizes_RI(:)
    2575         2744 :          bsizes_RI_ext_split((i_RI - 1)*nblks + 1:i_RI*nblks) = ri_data%bsizes_RI_split(:)
    2576              :       END DO
    2577              :       CALL create_2c_tensor(t_2c_RI_tmp(1), dist1, dist2, ri_data%pgrid_2d, &
    2578              :                             bsizes_RI_ext, bsizes_RI_ext, &
    2579              :                             name="(RI | RI)")
    2580           80 :       DEALLOCATE (dist1, dist2)
    2581              :       CALL create_2c_tensor(t_2c_RI_tmp(2), dist1, dist2, ri_data%pgrid_2d, &
    2582              :                             bsizes_RI_ext_split, bsizes_RI_ext_split, &
    2583              :                             name="(RI | RI)")
    2584           80 :       DEALLOCATE (dist1, dist2)
    2585              : 
    2586              :       !For more efficiency, we stack multiple images of the 3-center integrals into a single tensor
    2587           80 :       batch_size = ri_data%kp_stack_size
    2588           80 :       n_batch = nimg/batch_size
    2589           80 :       IF (MODULO(nimg, batch_size) .NE. 0) n_batch = n_batch + 1
    2590          240 :       ALLOCATE (batch_ranges(n_batch + 1))
    2591          234 :       DO i_batch = 1, n_batch
    2592          234 :          batch_ranges(i_batch) = (i_batch - 1)*batch_size + 1
    2593              :       END DO
    2594           80 :       batch_ranges(n_batch + 1) = nimg + 1
    2595              : 
    2596           80 :       nblks = SIZE(ri_data%bsizes_AO_split)
    2597          240 :       ALLOCATE (bsizes_stack(batch_size*nblks))
    2598         1456 :       DO is = 1, batch_size
    2599         6736 :          bsizes_stack((is - 1)*nblks + 1:is*nblks) = ri_data%bsizes_AO_split(:)
    2600              :       END DO
    2601              : 
    2602           80 :       CALL dbt_get_info(t_3c_int(1, 1), nblks_total=nblks_3c)
    2603          720 :       ALLOCATE (dist1(nblks_3c(1)), dist2(nblks_3c(2)), dist3(nblks_3c(3)), dist_stack3(batch_size*nblks_3c(3)))
    2604           80 :       CALL dbt_get_info(t_3c_int(1, 1), proc_dist_1=dist1, proc_dist_2=dist2, proc_dist_3=dist3)
    2605         1456 :       DO is = 1, batch_size
    2606         6736 :          dist_stack3((is - 1)*nblks_3c(3) + 1:is*nblks_3c(3)) = dist3(:)
    2607              :       END DO
    2608              : 
    2609           80 :       CALL dbt_distribution_new(t_dist, ri_data%pgrid, dist1, dist2, dist_stack3)
    2610              :       CALL dbt_create(t_3c_tmp(1), "ints_stack", t_dist, [1], [2, 3], bsizes_RI_ext_split, &
    2611           80 :                       ri_data%bsizes_AO_split, bsizes_stack)
    2612           80 :       CALL dbt_distribution_destroy(t_dist)
    2613           80 :       DEALLOCATE (dist1, dist2, dist3, dist_stack3)
    2614              : 
    2615           80 :       CALL dbt_create(t_3c_tmp(1), t_3c_tmp(2))
    2616           80 :       CALL dbt_create(t_3c_int(1, 1), t_3c_tmp(3))
    2617              : 
    2618          240 :       DO iatom = 1, natom
    2619          160 :          CALL dbt_copy(ri_data%t_2c_inv(1, iatom), t_2c_RI_tmp(1))
    2620          160 :          CALL apply_bump(t_2c_RI_tmp(1), iatom, ri_data, qs_env, from_left=.TRUE., from_right=.TRUE.)
    2621          160 :          CALL dbt_copy(t_2c_RI_tmp(1), t_2c_RI_tmp(2), move_data=.TRUE.)
    2622              : 
    2623          160 :          CALL dbt_batched_contract_init(t_2c_RI_tmp(2))
    2624          468 :          DO i_batch = 1, n_batch
    2625              : 
    2626              :             CALL fill_3c_stack(t_3c_tmp(1), t_3c_int(1, :), int_indices, 3, ri_data, &
    2627              :                                img_bounds=[batch_ranges(i_batch), batch_ranges(i_batch + 1)], &
    2628          924 :                                filter_at=iatom, filter_dim=2, idx_to_at=idx_to_at_AO)
    2629              : 
    2630              :             CALL dbt_contract(1.0_dp, t_2c_RI_tmp(2), t_3c_tmp(1), &
    2631              :                               0.0_dp, t_3c_tmp(2), map_1=[1], map_2=[2, 3], &
    2632              :                               contract_1=[2], notcontract_1=[1], &
    2633              :                               contract_2=[1], notcontract_2=[2, 3], &
    2634          308 :                               filter_eps=ri_data%filter_eps, flop=nflop)
    2635          308 :             ri_data%dbcsr_nflop = ri_data%dbcsr_nflop + nflop
    2636              : 
    2637         4304 :             DO i_img = batch_ranges(i_batch), batch_ranges(i_batch + 1) - 1
    2638         3996 :                CALL unstack_t_3c_apc(t_3c_tmp(3), t_3c_tmp(2), i_img - batch_ranges(i_batch) + 1)
    2639              :                CALL dbt_copy(t_3c_tmp(3), ri_data%t_3c_int_ctr_1(1, i_img), summation=.TRUE., &
    2640         4304 :                              order=[2, 1, 3], move_data=.TRUE.)
    2641              :             END DO
    2642          468 :             CALL dbt_clear(t_3c_tmp(1))
    2643              :          END DO
    2644          240 :          CALL dbt_batched_contract_finalize(t_2c_RI_tmp(2))
    2645              : 
    2646              :       END DO
    2647           80 :       CALL dbt_destroy(t_2c_RI_tmp(1))
    2648           80 :       CALL dbt_destroy(t_2c_RI_tmp(2))
    2649           80 :       CALL dbt_destroy(t_3c_tmp(1))
    2650           80 :       CALL dbt_destroy(t_3c_tmp(2))
    2651           80 :       CALL dbt_destroy(t_3c_tmp(3))
    2652              : 
    2653         2078 :       DO i_img = 1, nimg
    2654         2078 :          CALL dbt_destroy(t_3c_int(1, i_img))
    2655              :       END DO
    2656              : 
    2657           80 :       CALL timestop(handle)
    2658              : 
    2659          400 :    END SUBROUTINE precontract_3c_ints
    2660              : 
    2661              : ! **************************************************************************************************
    2662              : !> \brief Copy the data of a 2D tensor living in the main MPI group to a sub-group, given the proc
    2663              : !>        mapping from one to the other (e.g. for a proc idx in the subgroup, we get the idx in the main)
    2664              : !> \param t2c_sub ...
    2665              : !> \param t2c_main ...
    2666              : !> \param group_size ...
    2667              : !> \param ngroups ...
    2668              : !> \param para_env ...
    2669              : ! **************************************************************************************************
    2670         9832 :    SUBROUTINE copy_2c_to_subgroup(t2c_sub, t2c_main, group_size, ngroups, para_env)
    2671              :       TYPE(dbt_type), INTENT(INOUT)                      :: t2c_sub, t2c_main
    2672              :       INTEGER, INTENT(IN)                                :: group_size, ngroups
    2673              :       TYPE(mp_para_env_type), POINTER                    :: para_env
    2674              : 
    2675              :       INTEGER                                            :: batch_size, i, i_batch, i_msg, iblk, &
    2676              :                                                             igroup, iproc, ir, is, jblk, n_batch, &
    2677              :                                                             nocc, tag
    2678         9832 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: bsizes1, bsizes2
    2679         9832 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: block_dest, block_source
    2680         9832 :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: current_dest
    2681              :       INTEGER, DIMENSION(2)                              :: ind, nblks
    2682              :       LOGICAL                                            :: found
    2683         9832 :       REAL(dp), ALLOCATABLE, DIMENSION(:, :)             :: blk
    2684         9832 :       TYPE(cp_2d_r_p_type), ALLOCATABLE, DIMENSION(:)    :: recv_buff, send_buff
    2685              :       TYPE(dbt_iterator_type)                            :: iter
    2686         9832 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: recv_req, send_req
    2687              : 
    2688              :       !Stategy: we loop over the main tensor, and send all the data. Then we loop over the sub tensor
    2689              :       !         and receive it. We do all of it with async MPI communication. The sub tensor needs
    2690              :       !         to have blocks pre-reserved though
    2691              : 
    2692         9832 :       CALL dbt_get_info(t2c_main, nblks_total=nblks)
    2693              : 
    2694              :       !Loop over the main tensor, count how many blocks are there, which ones, and on which proc
    2695        39328 :       ALLOCATE (block_source(nblks(1), nblks(2)))
    2696       186060 :       block_source = -1
    2697         9832 :       nocc = 0
    2698         9832 : !$OMP PARALLEL DEFAULT(NONE) SHARED(t2c_main,para_env,nocc,block_source) PRIVATE(iter,ind,blk,found)
    2699              :       CALL dbt_iterator_start(iter, t2c_main)
    2700              :       DO WHILE (dbt_iterator_blocks_left(iter))
    2701              :          CALL dbt_iterator_next_block(iter, ind)
    2702              :          CALL dbt_get_block(t2c_main, ind, blk, found)
    2703              :          IF (.NOT. found) CYCLE
    2704              : 
    2705              :          block_source(ind(1), ind(2)) = para_env%mepos
    2706              : !$OMP ATOMIC
    2707              :          nocc = nocc + 1
    2708              :          DEALLOCATE (blk)
    2709              :       END DO
    2710              :       CALL dbt_iterator_stop(iter)
    2711              : !$OMP END PARALLEL
    2712              : 
    2713         9832 :       CALL para_env%sum(nocc)
    2714         9832 :       CALL para_env%sum(block_source)
    2715       186060 :       block_source = block_source + para_env%num_pe - 1
    2716         9832 :       IF (nocc == 0) RETURN
    2717              : 
    2718              :       !Loop over the sub tensor, get the block destination
    2719         9652 :       igroup = para_env%mepos/group_size
    2720        28956 :       ALLOCATE (block_dest(nblks(1), nblks(2)))
    2721       184800 :       block_dest = -1
    2722        35832 :       DO jblk = 1, nblks(2)
    2723       184800 :          DO iblk = 1, nblks(1)
    2724       148968 :             IF (block_source(iblk, jblk) == -1) CYCLE
    2725              : 
    2726       121128 :             CALL dbt_get_stored_coordinates(t2c_sub, [iblk, jblk], iproc)
    2727       175148 :             block_dest(iblk, jblk) = igroup*group_size + iproc !mapping of iproc in subgroup to main group idx
    2728              :          END DO
    2729              :       END DO
    2730              : 
    2731        48260 :       ALLOCATE (bsizes1(nblks(1)), bsizes2(nblks(2)))
    2732         9652 :       CALL dbt_get_info(t2c_main, blk_size_1=bsizes1, blk_size_2=bsizes2)
    2733              : 
    2734        48260 :       ALLOCATE (current_dest(nblks(1), nblks(2), 0:ngroups - 1))
    2735        28956 :       DO igroup = 0, ngroups - 1
    2736              :          !for a given subgroup, need to make the destination available to everyone in the main group
    2737       369600 :          current_dest(:, :, igroup) = block_dest(:, :)
    2738        28956 :          CALL para_env%bcast(current_dest(:, :, igroup), source=igroup*group_size) !bcast from first proc in sub-group
    2739              :       END DO
    2740              : 
    2741              :       !We go by batches, which cannot be larger than the maximum MPI tag value
    2742         9652 :       batch_size = MIN(para_env%get_tag_ub(), 128000, nocc*ngroups)
    2743         9652 :       n_batch = (nocc*ngroups)/batch_size
    2744         9652 :       IF (MODULO(nocc*ngroups, batch_size) .NE. 0) n_batch = n_batch + 1
    2745              : 
    2746        19304 :       DO i_batch = 1, n_batch
    2747              :          !Loop over groups, blocks and send/receive
    2748       200112 :          ALLOCATE (send_buff(batch_size), recv_buff(batch_size))
    2749       200112 :          ALLOCATE (send_req(batch_size), recv_req(batch_size))
    2750              :          ir = 0
    2751              :          is = 0
    2752              :          i_msg = 0
    2753        35832 :          DO jblk = 1, nblks(2)
    2754       184800 :             DO iblk = 1, nblks(1)
    2755       473084 :                DO igroup = 0, ngroups - 1
    2756       297936 :                   IF (block_source(iblk, jblk) == -1) CYCLE
    2757              : 
    2758        80752 :                   i_msg = i_msg + 1
    2759        80752 :                   IF (i_msg < (i_batch - 1)*batch_size + 1 .OR. i_msg > i_batch*batch_size) CYCLE
    2760              : 
    2761              :                   !a unique tag per block, within this batch
    2762        80752 :                   tag = i_msg - (i_batch - 1)*batch_size
    2763              : 
    2764        80752 :                   found = .FALSE.
    2765        80752 :                   IF (para_env%mepos == block_source(iblk, jblk)) THEN
    2766       121128 :                      CALL dbt_get_block(t2c_main, [iblk, jblk], blk, found)
    2767              :                   END IF
    2768              : 
    2769              :                   !If blocks live on same proc, simply copy. Else MPI send/recv
    2770        80752 :                   IF (block_source(iblk, jblk) == current_dest(iblk, jblk, igroup)) THEN
    2771       121128 :                      IF (found) CALL dbt_put_block(t2c_sub, [iblk, jblk], SHAPE(blk), blk)
    2772              :                   ELSE
    2773        40376 :                      IF (para_env%mepos == block_source(iblk, jblk) .AND. found) THEN
    2774        80752 :                         ALLOCATE (send_buff(tag)%array(bsizes1(iblk), bsizes2(jblk)))
    2775     22294244 :                         send_buff(tag)%array(:, :) = blk(:, :)
    2776        20188 :                         is = is + 1
    2777              :                         CALL para_env%isend(msgin=send_buff(tag)%array, dest=current_dest(iblk, jblk, igroup), &
    2778        20188 :                                             request=send_req(is), tag=tag)
    2779              :                      END IF
    2780              : 
    2781        40376 :                      IF (para_env%mepos == current_dest(iblk, jblk, igroup)) THEN
    2782        80752 :                         ALLOCATE (recv_buff(tag)%array(bsizes1(iblk), bsizes2(jblk)))
    2783        20188 :                         ir = ir + 1
    2784              :                         CALL para_env%irecv(msgout=recv_buff(tag)%array, source=block_source(iblk, jblk), &
    2785        20188 :                                             request=recv_req(ir), tag=tag)
    2786              :                      END IF
    2787              :                   END IF
    2788              : 
    2789       229720 :                   IF (found) DEALLOCATE (blk)
    2790              :                END DO
    2791              :             END DO
    2792              :          END DO
    2793              : 
    2794         9652 :          CALL mp_waitall(send_req(1:is))
    2795         9652 :          CALL mp_waitall(recv_req(1:ir))
    2796              :          !clean-up
    2797        90404 :          DO i = 1, batch_size
    2798        90404 :             IF (ASSOCIATED(send_buff(i)%array)) DEALLOCATE (send_buff(i)%array)
    2799              :          END DO
    2800              : 
    2801              :          !Finally copy the data from the buffer to the sub-tensor
    2802              :          i_msg = 0
    2803        35832 :          DO jblk = 1, nblks(2)
    2804       184800 :             DO iblk = 1, nblks(1)
    2805       473084 :                DO igroup = 0, ngroups - 1
    2806       297936 :                   IF (block_source(iblk, jblk) == -1) CYCLE
    2807              : 
    2808        80752 :                   i_msg = i_msg + 1
    2809        80752 :                   IF (i_msg < (i_batch - 1)*batch_size + 1 .OR. i_msg > i_batch*batch_size) CYCLE
    2810              : 
    2811              :                   !a unique tag per block, within this batch
    2812        80752 :                   tag = i_msg - (i_batch - 1)*batch_size
    2813              : 
    2814        80752 :                   IF (para_env%mepos == current_dest(iblk, jblk, igroup) .AND. &
    2815       148968 :                       block_source(iblk, jblk) .NE. current_dest(iblk, jblk, igroup)) THEN
    2816              : 
    2817        80752 :                      ALLOCATE (blk(bsizes1(iblk), bsizes2(jblk)))
    2818     22294244 :                      blk(:, :) = recv_buff(tag)%array(:, :)
    2819       100940 :                      CALL dbt_put_block(t2c_sub, [iblk, jblk], SHAPE(blk), blk)
    2820        20188 :                      DEALLOCATE (blk)
    2821              :                   END IF
    2822              :                END DO
    2823              :             END DO
    2824              :          END DO
    2825              : 
    2826              :          !clean-up
    2827        90404 :          DO i = 1, batch_size
    2828        90404 :             IF (ASSOCIATED(recv_buff(i)%array)) DEALLOCATE (recv_buff(i)%array)
    2829              :          END DO
    2830        19304 :          DEALLOCATE (send_buff, recv_buff, send_req, recv_req)
    2831              :       END DO !i_batch
    2832         9652 :       CALL dbt_finalize(t2c_sub)
    2833              : 
    2834        19664 :    END SUBROUTINE copy_2c_to_subgroup
    2835              : 
    2836              : ! **************************************************************************************************
    2837              : !> \brief Pre-compute the destination of the block of a 3D tensor in various subgroups
    2838              : !> \param subgroup_dest ...
    2839              : !> \param t3c_sub ...
    2840              : !> \param t3c_main ...
    2841              : !> \param group_size ...
    2842              : !> \param ngroups ...
    2843              : !> \param para_env ...
    2844              : ! **************************************************************************************************
    2845          588 :    SUBROUTINE get_3c_subgroup_dest(subgroup_dest, t3c_sub, t3c_main, group_size, ngroups, para_env)
    2846              :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :), &
    2847              :          INTENT(INOUT)                                   :: subgroup_dest
    2848              :       TYPE(dbt_type), INTENT(INOUT)                      :: t3c_sub, t3c_main
    2849              :       INTEGER, INTENT(IN)                                :: group_size, ngroups
    2850              :       TYPE(mp_para_env_type), POINTER                    :: para_env
    2851              : 
    2852              :       INTEGER                                            :: iblk, igroup, iproc, jblk, kblk
    2853          588 :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: block_dest
    2854              :       INTEGER, DIMENSION(3)                              :: nblks
    2855              : 
    2856          588 :       CALL dbt_get_info(t3c_main, nblks_total=nblks)
    2857              : 
    2858              :       !Loop over the sub tensor, get the block destination
    2859          588 :       igroup = para_env%mepos/group_size
    2860         2940 :       ALLOCATE (block_dest(nblks(1), nblks(2), nblks(3)))
    2861         1764 :       DO kblk = 1, nblks(3)
    2862         9460 :          DO jblk = 1, nblks(2)
    2863        34952 :             DO iblk = 1, nblks(1)
    2864       104320 :                CALL dbt_get_stored_coordinates(t3c_sub, [iblk, jblk, kblk], iproc)
    2865        33776 :                block_dest(iblk, jblk, kblk) = igroup*group_size + iproc !mapping of iproc in subgroup to main group idx
    2866              :             END DO
    2867              :          END DO
    2868              :       END DO
    2869              : 
    2870         3528 :       ALLOCATE (subgroup_dest(nblks(1), nblks(2), nblks(3), ngroups))
    2871         1764 :       DO igroup = 0, ngroups - 1
    2872              :          !for a given subgroup, need to make the destination available to everyone in the main group
    2873        71080 :          subgroup_dest(:, :, :, igroup + 1) = block_dest(:, :, :)
    2874         1764 :          CALL para_env%bcast(subgroup_dest(:, :, :, igroup + 1), source=igroup*group_size) !bcast from first proc in subgroup
    2875              :       END DO
    2876              : 
    2877          588 :    END SUBROUTINE get_3c_subgroup_dest
    2878              : 
    2879              : ! **************************************************************************************************
    2880              : !> \brief Copy the data of a 3D tensor living in the main MPI group to a sub-group, given the proc
    2881              : !>        mapping from one to the other (e.g. for a proc idx in the subgroup, we get the idx in the main)
    2882              : !> \param t3c_sub ...
    2883              : !> \param t3c_main ...
    2884              : !> \param ngroups ...
    2885              : !> \param para_env ...
    2886              : !> \param subgroup_dest ...
    2887              : !> \param iatom_to_subgroup ...
    2888              : !> \param dim_at ...
    2889              : !> \param idx_to_at ...
    2890              : ! **************************************************************************************************
    2891        14652 :    SUBROUTINE copy_3c_to_subgroup(t3c_sub, t3c_main, ngroups, para_env, subgroup_dest, &
    2892        14652 :                                   iatom_to_subgroup, dim_at, idx_to_at)
    2893              :       TYPE(dbt_type), INTENT(INOUT)                      :: t3c_sub, t3c_main
    2894              :       INTEGER, INTENT(IN)                                :: ngroups
    2895              :       TYPE(mp_para_env_type), POINTER                    :: para_env
    2896              :       INTEGER, DIMENSION(:, :, :, :), INTENT(IN)         :: subgroup_dest
    2897              :       TYPE(cp_1d_logical_p_type), DIMENSION(:), &
    2898              :          INTENT(INOUT), OPTIONAL                         :: iatom_to_subgroup
    2899              :       INTEGER, INTENT(IN), OPTIONAL                      :: dim_at
    2900              :       INTEGER, DIMENSION(:), OPTIONAL                    :: idx_to_at
    2901              : 
    2902              :       INTEGER                                            :: batch_size, i, i_batch, i_msg, iatom, &
    2903              :                                                             iblk, igroup, ir, is, isbuff, jblk, &
    2904              :                                                             kblk, n_batch, nocc, tag
    2905        14652 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: bsizes1, bsizes2, bsizes3
    2906        14652 :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: block_source
    2907              :       INTEGER, DIMENSION(3)                              :: ind, nblks
    2908              :       LOGICAL                                            :: filter_at, found
    2909        14652 :       REAL(dp), ALLOCATABLE, DIMENSION(:, :, :)          :: blk
    2910        14652 :       TYPE(cp_3d_r_p_type), ALLOCATABLE, DIMENSION(:)    :: recv_buff, send_buff
    2911              :       TYPE(dbt_iterator_type)                            :: iter
    2912        14652 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: recv_req, send_req
    2913              : 
    2914              :       !Stategy: we loop over the main tensor, and send all the data. Then we loop over the sub tensor
    2915              :       !         and receive it. We do all of it with async MPI communication. The sub tensor needs
    2916              :       !         to have blocks pre-reserved though
    2917              : 
    2918        14652 :       CALL dbt_get_info(t3c_main, nblks_total=nblks)
    2919              : 
    2920              :       !in some cases, only copy a fraction of the 3c tensor to a given subgroup (corresponding to some atoms)
    2921        14652 :       filter_at = .FALSE.
    2922        14652 :       IF (PRESENT(iatom_to_subgroup) .AND. PRESENT(dim_at) .AND. PRESENT(idx_to_at)) THEN
    2923         8620 :          filter_at = .TRUE.
    2924         8620 :          CPASSERT(nblks(dim_at) == SIZE(idx_to_at))
    2925              :       END IF
    2926              : 
    2927              :       !Loop over the main tensor, count how many blocks are there, which ones, and on which proc
    2928        73260 :       ALLOCATE (block_source(nblks(1), nblks(2), nblks(3)))
    2929      1036628 :       block_source = -1
    2930        14652 :       nocc = 0
    2931        14652 : !$OMP PARALLEL DEFAULT(NONE) SHARED(t3c_main,para_env,nocc,block_source) PRIVATE(iter,ind,blk,found)
    2932              :       CALL dbt_iterator_start(iter, t3c_main)
    2933              :       DO WHILE (dbt_iterator_blocks_left(iter))
    2934              :          CALL dbt_iterator_next_block(iter, ind)
    2935              :          CALL dbt_get_block(t3c_main, ind, blk, found)
    2936              :          IF (.NOT. found) CYCLE
    2937              : 
    2938              :          block_source(ind(1), ind(2), ind(3)) = para_env%mepos
    2939              : !$OMP ATOMIC
    2940              :          nocc = nocc + 1
    2941              :          DEALLOCATE (blk)
    2942              :       END DO
    2943              :       CALL dbt_iterator_stop(iter)
    2944              : !$OMP END PARALLEL
    2945              : 
    2946        14652 :       CALL para_env%sum(nocc)
    2947        14652 :       CALL para_env%sum(block_source)
    2948      1036628 :       block_source = block_source + para_env%num_pe - 1
    2949        14652 :       IF (nocc == 0) RETURN
    2950              : 
    2951       102564 :       ALLOCATE (bsizes1(nblks(1)), bsizes2(nblks(2)), bsizes3(nblks(3)))
    2952        14652 :       CALL dbt_get_info(t3c_main, blk_size_1=bsizes1, blk_size_2=bsizes2, blk_size_3=bsizes3)
    2953              : 
    2954              :       !We go by batches, which cannot be larger than the maximum MPI tag value
    2955        14652 :       batch_size = MIN(para_env%get_tag_ub(), 128000, nocc*ngroups)
    2956        14652 :       n_batch = (nocc*ngroups)/batch_size
    2957        14652 :       IF (MODULO(nocc*ngroups, batch_size) .NE. 0) n_batch = n_batch + 1
    2958              : 
    2959        29304 :       DO i_batch = 1, n_batch
    2960              :          !Loop over groups, blocks and send/receive
    2961       716888 :          ALLOCATE (send_buff(batch_size), recv_buff(batch_size))
    2962       716888 :          ALLOCATE (send_req(batch_size), recv_req(batch_size))
    2963              :          ir = 0
    2964              :          is = 0
    2965              :          i_msg = 0
    2966              :          isbuff = 0
    2967        43956 :          DO kblk = 1, nblks(3)
    2968       277108 :             DO jblk = 1, nblks(2)
    2969      1021976 :                DO iblk = 1, nblks(1)
    2970       759520 :                   IF (block_source(iblk, jblk, kblk) == -1) CYCLE
    2971              : 
    2972       164570 :                   found = .FALSE.
    2973       164570 :                   IF (para_env%mepos == block_source(iblk, jblk, kblk)) THEN
    2974       329140 :                      CALL dbt_get_block(t3c_main, [iblk, jblk, kblk], blk, found)
    2975        82285 :                      IF (found) THEN
    2976        82285 :                         isbuff = isbuff + 1
    2977       411425 :                         ALLOCATE (send_buff(isbuff)%array(bsizes1(iblk), bsizes2(jblk), bsizes3(kblk)))
    2978              :                      END IF
    2979              :                   END IF
    2980              : 
    2981       493710 :                   DO igroup = 0, ngroups - 1
    2982              : 
    2983       329140 :                      i_msg = i_msg + 1
    2984       329140 :                      IF (i_msg < (i_batch - 1)*batch_size + 1 .OR. i_msg > i_batch*batch_size) CYCLE
    2985              : 
    2986              :                      !a unique tag per block, within this batch
    2987       329140 :                      tag = i_msg - (i_batch - 1)*batch_size
    2988              : 
    2989       329140 :                      IF (filter_at) THEN
    2990       961520 :                         ind(:) = [iblk, jblk, kblk]
    2991       240380 :                         iatom = idx_to_at(ind(dim_at))
    2992       240380 :                         IF (.NOT. iatom_to_subgroup(iatom)%array(igroup + 1)) CYCLE
    2993              :                      END IF
    2994              : 
    2995              :                      !If blocks live on same proc, simply copy. Else MPI send/recv
    2996       373520 :                      IF (block_source(iblk, jblk, kblk) == subgroup_dest(iblk, jblk, kblk, igroup + 1)) THEN
    2997       479680 :                         IF (found) CALL dbt_put_block(t3c_sub, [iblk, jblk, kblk], SHAPE(blk), blk)
    2998              :                      ELSE
    2999        89030 :                         IF (para_env%mepos == block_source(iblk, jblk, kblk) .AND. found) THEN
    3000    128638380 :                            send_buff(isbuff)%array(:, :, :) = blk(:, :, :)
    3001        44515 :                            is = is + 1
    3002              :                            CALL para_env%isend(msgin=send_buff(isbuff)%array, &
    3003              :                                                dest=subgroup_dest(iblk, jblk, kblk, igroup + 1), &
    3004        44515 :                                                request=send_req(is), tag=tag)
    3005              :                         END IF
    3006              : 
    3007        89030 :                         IF (para_env%mepos == subgroup_dest(iblk, jblk, kblk, igroup + 1)) THEN
    3008       222575 :                            ALLOCATE (recv_buff(tag)%array(bsizes1(iblk), bsizes2(jblk), bsizes3(kblk)))
    3009        44515 :                            ir = ir + 1
    3010              :                            CALL para_env%irecv(msgout=recv_buff(tag)%array, source=block_source(iblk, jblk, kblk), &
    3011        44515 :                                                request=recv_req(ir), tag=tag)
    3012              :                         END IF
    3013              :                      END IF
    3014              :                   END DO !igroup
    3015              : 
    3016       397722 :                   IF (found) DEALLOCATE (blk)
    3017              :                END DO
    3018              :             END DO
    3019              :          END DO
    3020              : 
    3021              :          !Finally copy the data from the buffer to the sub-tensor
    3022              :          i_msg = 0
    3023              :          ir = 0
    3024        43956 :          DO kblk = 1, nblks(3)
    3025       277108 :             DO jblk = 1, nblks(2)
    3026      1021976 :                DO iblk = 1, nblks(1)
    3027      2511712 :                   DO igroup = 0, ngroups - 1
    3028      1519040 :                      IF (block_source(iblk, jblk, kblk) == -1) CYCLE
    3029              : 
    3030       329140 :                      i_msg = i_msg + 1
    3031       329140 :                      IF (i_msg < (i_batch - 1)*batch_size + 1 .OR. i_msg > i_batch*batch_size) CYCLE
    3032              : 
    3033              :                      !a unique tag per block, within this batch
    3034       329140 :                      tag = i_msg - (i_batch - 1)*batch_size
    3035              : 
    3036       329140 :                      IF (filter_at) THEN
    3037       961520 :                         ind(:) = [iblk, jblk, kblk]
    3038       240380 :                         iatom = idx_to_at(ind(dim_at))
    3039       240380 :                         IF (.NOT. iatom_to_subgroup(iatom)%array(igroup + 1)) CYCLE
    3040              :                      END IF
    3041              : 
    3042       208950 :                      IF (para_env%mepos == subgroup_dest(iblk, jblk, kblk, igroup + 1) .AND. &
    3043       759520 :                          block_source(iblk, jblk, kblk) .NE. subgroup_dest(iblk, jblk, kblk, igroup + 1)) THEN
    3044              : 
    3045        44515 :                         ir = ir + 1
    3046        44515 :                         CALL mp_waitall(recv_req(ir:ir))
    3047       311605 :                         CALL dbt_put_block(t3c_sub, [iblk, jblk, kblk], SHAPE(recv_buff(tag)%array), recv_buff(tag)%array)
    3048              :                      END IF
    3049              :                   END DO
    3050              :                END DO
    3051              :             END DO
    3052              :          END DO
    3053              : 
    3054              :          !clean-up
    3055        14652 :          CALL mp_waitall(send_req(1:is))
    3056       343792 :          DO i = 1, batch_size
    3057       329140 :             IF (ASSOCIATED(recv_buff(i)%array)) DEALLOCATE (recv_buff(i)%array)
    3058       343792 :             IF (ASSOCIATED(send_buff(i)%array)) DEALLOCATE (send_buff(i)%array)
    3059              :          END DO
    3060        29304 :          DEALLOCATE (send_buff, recv_buff, send_req, recv_req)
    3061              :       END DO !i_batch
    3062        14652 :       CALL dbt_finalize(t3c_sub)
    3063              : 
    3064        29304 :    END SUBROUTINE copy_3c_to_subgroup
    3065              : 
    3066              : ! **************************************************************************************************
    3067              : !> \brief A routine that gather the pieces of the KS matrix accross the subgroup and puts it in the
    3068              : !>        main group. Each b_img, iatom, jatom tuple is one a single CPU
    3069              : !> \param ks_t ...
    3070              : !> \param ks_t_sub ...
    3071              : !> \param group_size ...
    3072              : !> \param sparsity_pattern ...
    3073              : !> \param para_env ...
    3074              : !> \param ri_data ...
    3075              : ! **************************************************************************************************
    3076          248 :    SUBROUTINE gather_ks_matrix(ks_t, ks_t_sub, group_size, sparsity_pattern, para_env, ri_data)
    3077              :       TYPE(dbt_type), DIMENSION(:, :), INTENT(INOUT)     :: ks_t, ks_t_sub
    3078              :       INTEGER, INTENT(IN)                                :: group_size
    3079              :       INTEGER, DIMENSION(:, :, :), INTENT(IN)            :: sparsity_pattern
    3080              :       TYPE(mp_para_env_type), POINTER                    :: para_env
    3081              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    3082              : 
    3083              :       CHARACTER(len=*), PARAMETER                        :: routineN = 'gather_ks_matrix'
    3084              : 
    3085              :       INTEGER                                            :: b_img, dest, handle, i, i_spin, iatom, &
    3086              :                                                             igroup, ir, is, jatom, n_mess, natom, &
    3087              :                                                             nimg, nspins, source, tag
    3088              :       LOGICAL                                            :: found
    3089          248 :       REAL(dp), ALLOCATABLE, DIMENSION(:, :)             :: blk
    3090          248 :       TYPE(cp_2d_r_p_type), ALLOCATABLE, DIMENSION(:)    :: recv_buff, send_buff
    3091          248 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: recv_req, send_req
    3092              : 
    3093          248 :       CALL timeset(routineN, handle)
    3094              : 
    3095          248 :       nimg = SIZE(sparsity_pattern, 3)
    3096          248 :       natom = SIZE(sparsity_pattern, 2)
    3097          248 :       nspins = SIZE(ks_t, 1)
    3098              : 
    3099         7034 :       DO b_img = 1, nimg
    3100              :          n_mess = 0
    3101        15138 :          DO i_spin = 1, nspins
    3102        31842 :             DO jatom = 1, natom
    3103        58464 :                DO iatom = 1, natom
    3104        50112 :                   IF (sparsity_pattern(iatom, jatom, b_img) > -1) n_mess = n_mess + 1
    3105              :                END DO
    3106              :             END DO
    3107              :          END DO
    3108              : 
    3109        51326 :          ALLOCATE (send_buff(n_mess), recv_buff(n_mess))
    3110        58112 :          ALLOCATE (send_req(n_mess), recv_req(n_mess))
    3111         6786 :          ir = 0
    3112         6786 :          is = 0
    3113         6786 :          n_mess = 0
    3114         6786 :          tag = 0
    3115              : 
    3116        15138 :          DO i_spin = 1, nspins
    3117        31842 :             DO jatom = 1, natom
    3118        58464 :                DO iatom = 1, natom
    3119        33408 :                   IF (sparsity_pattern(iatom, jatom, b_img) < 0) CYCLE
    3120        11664 :                   n_mess = n_mess + 1
    3121        11664 :                   tag = tag + 1
    3122              : 
    3123              :                   !sending the message
    3124        34992 :                   CALL dbt_get_stored_coordinates(ks_t(i_spin, b_img), [iatom, jatom], dest)
    3125        34992 :                   CALL dbt_get_stored_coordinates(ks_t_sub(i_spin, b_img), [iatom, jatom], source) !source within sub
    3126        11664 :                   igroup = sparsity_pattern(iatom, jatom, b_img)
    3127        11664 :                   source = source + igroup*group_size
    3128        11664 :                   IF (para_env%mepos == source) THEN
    3129        17496 :                      CALL dbt_get_block(ks_t_sub(i_spin, b_img), [iatom, jatom], blk, found)
    3130         5832 :                      IF (source == dest) THEN
    3131         3531 :                         IF (found) CALL dbt_put_block(ks_t(i_spin, b_img), [iatom, jatom], SHAPE(blk), blk)
    3132              :                      ELSE
    3133        19540 :                         ALLOCATE (send_buff(n_mess)%array(ri_data%bsizes_AO(iatom), ri_data%bsizes_AO(jatom)))
    3134       316161 :                         send_buff(n_mess)%array(:, :) = 0.0_dp
    3135         4885 :                         IF (found) THEN
    3136       219507 :                            send_buff(n_mess)%array(:, :) = blk(:, :)
    3137              :                         END IF
    3138         4885 :                         is = is + 1
    3139              :                         CALL para_env%isend(msgin=send_buff(n_mess)%array, dest=dest, &
    3140         4885 :                                             request=send_req(is), tag=tag)
    3141              :                      END IF
    3142         5832 :                      DEALLOCATE (blk)
    3143              :                   END IF
    3144              : 
    3145              :                   !receiving the message
    3146        28368 :                   IF (para_env%mepos == dest .AND. source .NE. dest) THEN
    3147        19540 :                      ALLOCATE (recv_buff(n_mess)%array(ri_data%bsizes_AO(iatom), ri_data%bsizes_AO(jatom)))
    3148         4885 :                      ir = ir + 1
    3149              :                      CALL para_env%irecv(msgout=recv_buff(n_mess)%array, source=source, &
    3150         4885 :                                          request=recv_req(ir), tag=tag)
    3151              :                   END IF
    3152              :                END DO !iatom
    3153              :             END DO !jatom
    3154              :          END DO !ispin
    3155              : 
    3156         6786 :          CALL mp_waitall(send_req(1:is))
    3157         6786 :          CALL mp_waitall(recv_req(1:ir))
    3158              : 
    3159              :          !Copy the messages received into the KS matrix
    3160         6786 :          n_mess = 0
    3161        15138 :          DO i_spin = 1, nspins
    3162        31842 :             DO jatom = 1, natom
    3163        58464 :                DO iatom = 1, natom
    3164        33408 :                   IF (sparsity_pattern(iatom, jatom, b_img) < 0) CYCLE
    3165        11664 :                   n_mess = n_mess + 1
    3166              : 
    3167        34992 :                   CALL dbt_get_stored_coordinates(ks_t(i_spin, b_img), [iatom, jatom], dest)
    3168        28368 :                   IF (para_env%mepos == dest) THEN
    3169         5832 :                      IF (.NOT. ASSOCIATED(recv_buff(n_mess)%array)) CYCLE
    3170        19540 :                      ALLOCATE (blk(ri_data%bsizes_AO(iatom), ri_data%bsizes_AO(jatom)))
    3171       316161 :                      blk(:, :) = recv_buff(n_mess)%array(:, :)
    3172        24425 :                      CALL dbt_put_block(ks_t(i_spin, b_img), [iatom, jatom], SHAPE(blk), blk)
    3173         4885 :                      DEALLOCATE (blk)
    3174              :                   END IF
    3175              :                END DO
    3176              :             END DO
    3177              :          END DO
    3178              : 
    3179              :          !clean-up
    3180        18450 :          DO i = 1, n_mess
    3181        11664 :             IF (ASSOCIATED(send_buff(i)%array)) DEALLOCATE (send_buff(i)%array)
    3182        18450 :             IF (ASSOCIATED(recv_buff(i)%array)) DEALLOCATE (recv_buff(i)%array)
    3183              :          END DO
    3184         7034 :          DEALLOCATE (send_buff, recv_buff, send_req, recv_req)
    3185              :       END DO !b_img
    3186              : 
    3187          248 :       CALL timestop(handle)
    3188              : 
    3189          248 :    END SUBROUTINE gather_ks_matrix
    3190              : 
    3191              : ! **************************************************************************************************
    3192              : !> \brief copy all required 2c tensors from the main MPI group to the subgroups
    3193              : !> \param mat_2c_pot ...
    3194              : !> \param t_2c_work ...
    3195              : !> \param t_2c_ao_tmp ...
    3196              : !> \param ks_t_split ...
    3197              : !> \param ks_t_sub ...
    3198              : !> \param group_size ...
    3199              : !> \param ngroups ...
    3200              : !> \param para_env ...
    3201              : !> \param para_env_sub ...
    3202              : !> \param ri_data ...
    3203              : ! **************************************************************************************************
    3204          248 :    SUBROUTINE get_subgroup_2c_tensors(mat_2c_pot, t_2c_work, t_2c_ao_tmp, ks_t_split, ks_t_sub, &
    3205              :                                       group_size, ngroups, para_env, para_env_sub, ri_data)
    3206              :       TYPE(dbcsr_type), DIMENSION(:), INTENT(INOUT)      :: mat_2c_pot
    3207              :       TYPE(dbt_type), DIMENSION(:), INTENT(INOUT)        :: t_2c_work, t_2c_ao_tmp, ks_t_split
    3208              :       TYPE(dbt_type), DIMENSION(:, :), INTENT(INOUT)     :: ks_t_sub
    3209              :       INTEGER, INTENT(IN)                                :: group_size, ngroups
    3210              :       TYPE(mp_para_env_type), POINTER                    :: para_env, para_env_sub
    3211              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    3212              : 
    3213              :       CHARACTER(len=*), PARAMETER :: routineN = 'get_subgroup_2c_tensors'
    3214              : 
    3215              :       INTEGER                                            :: handle, i, i_img, i_RI, i_spin, iproc, &
    3216              :                                                             j, natom, nblks, nimg, nspins
    3217              :       INTEGER(int_8)                                     :: nze
    3218              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: bsizes_RI_ext, bsizes_RI_ext_split, &
    3219          248 :                                                             dist1, dist2
    3220              :       INTEGER, DIMENSION(2)                              :: pdims_2d
    3221          496 :       INTEGER, DIMENSION(:), POINTER                     :: col_dist, RI_blk_size, row_dist
    3222          248 :       INTEGER, DIMENSION(:, :), POINTER                  :: dbcsr_pgrid
    3223              :       REAL(dp)                                           :: occ
    3224              :       TYPE(dbcsr_distribution_type)                      :: dbcsr_dist_sub
    3225          744 :       TYPE(dbt_pgrid_type)                               :: pgrid_2d
    3226         3224 :       TYPE(dbt_type)                                     :: work, work_sub
    3227              : 
    3228          248 :       CALL timeset(routineN, handle)
    3229              : 
    3230              :       !Create the 2d pgrid
    3231          248 :       pdims_2d = 0
    3232          248 :       CALL dbt_pgrid_create(para_env_sub, pdims_2d, pgrid_2d)
    3233              : 
    3234          248 :       natom = SIZE(ri_data%bsizes_RI)
    3235          248 :       nblks = SIZE(ri_data%bsizes_RI_split)
    3236          744 :       ALLOCATE (bsizes_RI_ext(ri_data%ncell_RI*natom))
    3237          744 :       ALLOCATE (bsizes_RI_ext_split(ri_data%ncell_RI*nblks))
    3238         1606 :       DO i_RI = 1, ri_data%ncell_RI
    3239         4074 :          bsizes_RI_ext((i_RI - 1)*natom + 1:i_RI*natom) = ri_data%bsizes_RI(:)
    3240         7492 :          bsizes_RI_ext_split((i_RI - 1)*nblks + 1:i_RI*nblks) = ri_data%bsizes_RI_split(:)
    3241              :       END DO
    3242              : 
    3243              :       !nRI x nRI 2c tensors
    3244              :       CALL create_2c_tensor(t_2c_work(1), dist1, dist2, pgrid_2d, &
    3245              :                             bsizes_RI_ext, bsizes_RI_ext, &
    3246              :                             name="(RI | RI)")
    3247          248 :       DEALLOCATE (dist1, dist2)
    3248              : 
    3249              :       CALL create_2c_tensor(t_2c_work(2), dist1, dist2, pgrid_2d, &
    3250              :                             bsizes_RI_ext_split, bsizes_RI_ext_split, &
    3251          248 :                             name="(RI | RI)")
    3252          248 :       DEALLOCATE (dist1, dist2)
    3253              : 
    3254              :       !the AO based tensors
    3255              :       CALL create_2c_tensor(ks_t_split(1), dist1, dist2, pgrid_2d, &
    3256              :                             ri_data%bsizes_AO_split, ri_data%bsizes_AO_split, &
    3257              :                             name="(AO | AO)")
    3258          248 :       DEALLOCATE (dist1, dist2)
    3259          248 :       CALL dbt_create(ks_t_split(1), ks_t_split(2))
    3260              : 
    3261              :       CALL create_2c_tensor(t_2c_ao_tmp(1), dist1, dist2, pgrid_2d, &
    3262              :                             ri_data%bsizes_AO, ri_data%bsizes_AO, &
    3263              :                             name="(AO | AO)")
    3264          248 :       DEALLOCATE (dist1, dist2)
    3265              : 
    3266          248 :       nspins = SIZE(ks_t_sub, 1)
    3267          248 :       nimg = SIZE(ks_t_sub, 2)
    3268         7034 :       DO i_img = 1, nimg
    3269        15386 :          DO i_spin = 1, nspins
    3270        15138 :             CALL dbt_create(t_2c_ao_tmp(1), ks_t_sub(i_spin, i_img))
    3271              :          END DO
    3272              :       END DO
    3273              : 
    3274              :       !Finally the HFX potential matrices
    3275              :       !For now, we do a convoluted things where we go to tensors first, then back to matrices.
    3276              :       CALL create_2c_tensor(work_sub, dist1, dist2, pgrid_2d, &
    3277              :                             ri_data%bsizes_RI, ri_data%bsizes_RI, &
    3278              :                             name="(RI | RI)")
    3279          248 :       CALL dbt_create(ri_data%kp_mat_2c_pot(1, 1), work)
    3280              : 
    3281          992 :       ALLOCATE (dbcsr_pgrid(0:pdims_2d(1) - 1, 0:pdims_2d(2) - 1))
    3282          248 :       iproc = 0
    3283          496 :       DO i = 0, pdims_2d(1) - 1
    3284          744 :          DO j = 0, pdims_2d(2) - 1
    3285          248 :             dbcsr_pgrid(i, j) = iproc
    3286          496 :             iproc = iproc + 1
    3287              :          END DO
    3288              :       END DO
    3289              : 
    3290              :       !We need to have the same exact 2d block dist as the tensors
    3291          992 :       ALLOCATE (col_dist(natom), row_dist(natom))
    3292          744 :       row_dist(:) = dist1(:)
    3293          744 :       col_dist(:) = dist2(:)
    3294              : 
    3295          496 :       ALLOCATE (RI_blk_size(natom))
    3296          744 :       RI_blk_size(:) = ri_data%bsizes_RI(:)
    3297              : 
    3298              :       CALL dbcsr_distribution_new(dbcsr_dist_sub, group=para_env_sub%get_handle(), pgrid=dbcsr_pgrid, &
    3299          248 :                                   row_dist=row_dist, col_dist=col_dist)
    3300              :       CALL dbcsr_create(mat_2c_pot(1), dist=dbcsr_dist_sub, name="sub", matrix_type=dbcsr_type_no_symmetry, &
    3301          248 :                         row_blk_size=RI_blk_size, col_blk_size=RI_blk_size)
    3302              : 
    3303         7034 :       DO i_img = 1, nimg
    3304         6786 :          IF (i_img > 1) CALL dbcsr_create(mat_2c_pot(i_img), template=mat_2c_pot(1))
    3305         6786 :          CALL dbt_copy_matrix_to_tensor(ri_data%kp_mat_2c_pot(1, i_img), work)
    3306         6786 :          CALL get_tensor_occupancy(work, nze, occ)
    3307         6786 :          IF (nze == 0) CYCLE
    3308              : 
    3309         5114 :          CALL copy_2c_to_subgroup(work_sub, work, group_size, ngroups, para_env)
    3310         5114 :          CALL dbt_copy_tensor_to_matrix(work_sub, mat_2c_pot(i_img))
    3311         5114 :          CALL dbcsr_filter(mat_2c_pot(i_img), ri_data%filter_eps)
    3312        12148 :          CALL dbt_clear(work_sub)
    3313              :       END DO
    3314              : 
    3315          248 :       CALL dbt_destroy(work)
    3316          248 :       CALL dbt_destroy(work_sub)
    3317          248 :       CALL dbt_pgrid_destroy(pgrid_2d)
    3318          248 :       CALL dbcsr_distribution_release(dbcsr_dist_sub)
    3319          248 :       DEALLOCATE (col_dist, row_dist, RI_blk_size, dbcsr_pgrid)
    3320          248 :       CALL timestop(handle)
    3321              : 
    3322         2232 :    END SUBROUTINE get_subgroup_2c_tensors
    3323              : 
    3324              : ! **************************************************************************************************
    3325              : !> \brief copy all required 3c tensors from the main MPI group to the subgroups
    3326              : !> \param t_3c_int ...
    3327              : !> \param t_3c_work_2 ...
    3328              : !> \param t_3c_work_3 ...
    3329              : !> \param t_3c_apc ...
    3330              : !> \param t_3c_apc_sub ...
    3331              : !> \param group_size ...
    3332              : !> \param ngroups ...
    3333              : !> \param para_env ...
    3334              : !> \param para_env_sub ...
    3335              : !> \param ri_data ...
    3336              : ! **************************************************************************************************
    3337          248 :    SUBROUTINE get_subgroup_3c_tensors(t_3c_int, t_3c_work_2, t_3c_work_3, t_3c_apc, t_3c_apc_sub, &
    3338              :                                       group_size, ngroups, para_env, para_env_sub, ri_data)
    3339              :       TYPE(dbt_type), DIMENSION(:), INTENT(INOUT)        :: t_3c_int, t_3c_work_2, t_3c_work_3
    3340              :       TYPE(dbt_type), DIMENSION(:, :), INTENT(INOUT)     :: t_3c_apc, t_3c_apc_sub
    3341              :       INTEGER, INTENT(IN)                                :: group_size, ngroups
    3342              :       TYPE(mp_para_env_type), POINTER                    :: para_env, para_env_sub
    3343              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    3344              : 
    3345              :       CHARACTER(len=*), PARAMETER :: routineN = 'get_subgroup_3c_tensors'
    3346              : 
    3347              :       INTEGER                                            :: batch_size, bo(2), handle, handle2, &
    3348              :                                                             i_blk, i_img, i_RI, i_spin, ib, natom, &
    3349              :                                                             nblks_AO, nblks_RI, nimg, nspins
    3350              :       INTEGER(int_8)                                     :: nze
    3351          248 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: bsizes_RI_ext, bsizes_RI_ext_split, &
    3352          248 :                                                             bsizes_stack, bsizes_tmp, dist1, &
    3353          248 :                                                             dist2, dist3, dist_stack, idx_to_at
    3354          248 :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :)        :: subgroup_dest
    3355              :       INTEGER, DIMENSION(3)                              :: pdims
    3356              :       REAL(dp)                                           :: occ
    3357         2232 :       TYPE(dbt_distribution_type)                        :: t_dist
    3358          744 :       TYPE(dbt_pgrid_type)                               :: pgrid
    3359         6200 :       TYPE(dbt_type)                                     :: tmp, work_atom_block, work_atom_block_sub
    3360              : 
    3361          248 :       CALL timeset(routineN, handle)
    3362              : 
    3363          248 :       nblks_RI = SIZE(ri_data%bsizes_RI_split)
    3364          744 :       ALLOCATE (bsizes_RI_ext_split(ri_data%ncell_RI*nblks_RI))
    3365         1606 :       DO i_RI = 1, ri_data%ncell_RI
    3366         7492 :          bsizes_RI_ext_split((i_RI - 1)*nblks_RI + 1:i_RI*nblks_RI) = ri_data%bsizes_RI_split(:)
    3367              :       END DO
    3368              : 
    3369              :       !Preparing larger block sizes for efficient communication (less, bigger messages)
    3370          248 :       natom = SIZE(ri_data%bsizes_RI)
    3371          248 :       nblks_RI = natom
    3372          744 :       ALLOCATE (bsizes_tmp(nblks_RI))
    3373          744 :       DO i_blk = 1, nblks_RI
    3374          496 :          bo = get_limit(natom, nblks_RI, i_blk - 1)
    3375         1240 :          bsizes_tmp(i_blk) = SUM(ri_data%bsizes_RI(bo(1):bo(2)))
    3376              :       END DO
    3377          744 :       ALLOCATE (bsizes_RI_ext(ri_data%ncell_RI*nblks_RI))
    3378         1606 :       DO i_RI = 1, ri_data%ncell_RI
    3379         4322 :          bsizes_RI_ext((i_RI - 1)*nblks_RI + 1:i_RI*nblks_RI) = bsizes_tmp(:)
    3380              :       END DO
    3381              : 
    3382          248 :       batch_size = ri_data%kp_stack_size
    3383          248 :       nblks_AO = SIZE(ri_data%bsizes_AO_split)
    3384          744 :       ALLOCATE (bsizes_stack(batch_size*nblks_AO))
    3385         4984 :       DO ib = 1, batch_size
    3386        20760 :          bsizes_stack((ib - 1)*nblks_AO + 1:ib*nblks_AO) = ri_data%bsizes_AO_split(:)
    3387              :       END DO
    3388              : 
    3389              :       !Create the pgrid for the configuration correspoinding to ri_data%t_3c_int_ctr_3
    3390          248 :       natom = SIZE(ri_data%bsizes_RI)
    3391          248 :       pdims = 0
    3392              :       CALL dbt_pgrid_create(para_env_sub, pdims, pgrid, &
    3393          992 :                             tensor_dims=[SIZE(bsizes_RI_ext_split), 1, batch_size*SIZE(ri_data%bsizes_AO_split)])
    3394              : 
    3395              :       !Create all required 3c tensors in that configuration
    3396              :       CALL create_3c_tensor(t_3c_int(1), dist1, dist2, dist3, &
    3397              :                             pgrid, bsizes_RI_ext_split, ri_data%bsizes_AO_split, &
    3398          248 :                             ri_data%bsizes_AO_split, [1], [2, 3], name="(RI | AO AO)")
    3399          248 :       nimg = SIZE(t_3c_int)
    3400         6786 :       DO i_img = 2, nimg
    3401         6786 :          CALL dbt_create(t_3c_int(1), t_3c_int(i_img))
    3402              :       END DO
    3403              : 
    3404              :       !The stacked work tensors, in a distribution that matches that of t_3c_int
    3405          496 :       ALLOCATE (dist_stack(batch_size*nblks_AO))
    3406         4984 :       DO ib = 1, batch_size
    3407        20760 :          dist_stack((ib - 1)*nblks_AO + 1:ib*nblks_AO) = dist3(:)
    3408              :       END DO
    3409              : 
    3410          248 :       CALL dbt_distribution_new(t_dist, pgrid, dist1, dist2, dist_stack)
    3411              :       CALL dbt_create(t_3c_work_3(1), "work_3_stack", t_dist, [1], [2, 3], &
    3412          248 :                       bsizes_RI_ext_split, ri_data%bsizes_AO_split, bsizes_stack)
    3413          248 :       CALL dbt_create(t_3c_work_3(1), t_3c_work_3(2))
    3414          248 :       CALL dbt_create(t_3c_work_3(1), t_3c_work_3(3))
    3415          248 :       CALL dbt_distribution_destroy(t_dist)
    3416          248 :       DEALLOCATE (dist1, dist2, dist3, dist_stack)
    3417              : 
    3418              :       !For more efficient communication, we use intermediate tensors with larger block size
    3419              :       CALL create_3c_tensor(work_atom_block_sub, dist1, dist2, dist3, &
    3420              :                             pgrid, bsizes_RI_ext, ri_data%bsizes_AO, &
    3421          248 :                             ri_data%bsizes_AO, [1], [2, 3], name="(RI | AO AO)")
    3422          248 :       DEALLOCATE (dist1, dist2, dist3)
    3423              : 
    3424              :       CALL create_3c_tensor(work_atom_block, dist1, dist2, dist3, &
    3425              :                             ri_data%pgrid, bsizes_RI_ext, ri_data%bsizes_AO, &
    3426          248 :                             ri_data%bsizes_AO, [1], [2, 3], name="(RI | AO AO)")
    3427          248 :       DEALLOCATE (dist1, dist2, dist3)
    3428              : 
    3429              :       CALL get_3c_subgroup_dest(subgroup_dest, work_atom_block_sub, work_atom_block, &
    3430          248 :                                 group_size, ngroups, para_env)
    3431              : 
    3432              :       !Finally copy the integrals into the subgroups (if not there already)
    3433          248 :       CALL timeset(routineN//"_ints", handle2)
    3434          248 :       IF (ALLOCATED(ri_data%kp_t_3c_int)) THEN
    3435         4956 :          DO i_img = 1, nimg
    3436         4956 :             CALL dbt_copy(ri_data%kp_t_3c_int(i_img), t_3c_int(i_img), move_data=.TRUE.)
    3437              :          END DO
    3438              :       ELSE
    3439         2878 :          ALLOCATE (ri_data%kp_t_3c_int(nimg))
    3440         2078 :          DO i_img = 1, nimg
    3441         1998 :             CALL dbt_create(t_3c_int(i_img), ri_data%kp_t_3c_int(i_img))
    3442         1998 :             CALL get_tensor_occupancy(ri_data%t_3c_int_ctr_1(1, i_img), nze, occ)
    3443         1998 :             IF (nze == 0) CYCLE
    3444         1776 :             CALL dbt_copy(ri_data%t_3c_int_ctr_1(1, i_img), work_atom_block, order=[2, 1, 3])
    3445              :             CALL copy_3c_to_subgroup(work_atom_block_sub, work_atom_block, &
    3446         1776 :                                      ngroups, para_env, subgroup_dest)
    3447         1776 :             CALL dbt_copy(work_atom_block_sub, t_3c_int(i_img), move_data=.TRUE.)
    3448         3854 :             CALL dbt_filter(t_3c_int(i_img), ri_data%filter_eps)
    3449              :          END DO
    3450              :       END IF
    3451          248 :       CALL timestop(handle2)
    3452          248 :       CALL dbt_pgrid_destroy(pgrid)
    3453          248 :       CALL dbt_destroy(work_atom_block)
    3454          248 :       CALL dbt_destroy(work_atom_block_sub)
    3455          248 :       DEALLOCATE (subgroup_dest)
    3456              : 
    3457              :       !Do the same for the t_3c_ctr_2 configuration
    3458          248 :       pdims = 0
    3459              :       CALL dbt_pgrid_create(para_env_sub, pdims, pgrid, &
    3460          992 :                             tensor_dims=[1, SIZE(bsizes_RI_ext_split), batch_size*SIZE(ri_data%bsizes_AO_split)])
    3461              : 
    3462              :       !For more efficient communication, we use intermediate tensors with larger block size
    3463              :       CALL create_3c_tensor(work_atom_block_sub, dist1, dist2, dist3, &
    3464              :                             pgrid, ri_data%bsizes_AO, bsizes_RI_ext, &
    3465          248 :                             ri_data%bsizes_AO, [1], [2, 3], name="(AO RI | AO)")
    3466          248 :       DEALLOCATE (dist1, dist2, dist3)
    3467              : 
    3468              :       CALL create_3c_tensor(work_atom_block, dist1, dist2, dist3, &
    3469              :                             ri_data%pgrid_1, ri_data%bsizes_AO, bsizes_RI_ext, &
    3470          248 :                             ri_data%bsizes_AO, [1], [2, 3], name="(AO RI | AO)")
    3471          248 :       DEALLOCATE (dist1, dist2, dist3)
    3472              : 
    3473              :       CALL get_3c_subgroup_dest(subgroup_dest, work_atom_block_sub, work_atom_block, &
    3474          248 :                                 group_size, ngroups, para_env)
    3475              : 
    3476              :       !template for t_3c_apc_sub
    3477              :       CALL create_3c_tensor(tmp, dist1, dist2, dist3, &
    3478              :                             pgrid, ri_data%bsizes_AO_split, bsizes_RI_ext_split, &
    3479          248 :                             ri_data%bsizes_AO_split, [1], [2, 3], name="(AO RI | AO)")
    3480              : 
    3481              :       !create t_3c_work_2 tensors in a distribution that matches the above
    3482          496 :       ALLOCATE (dist_stack(batch_size*nblks_AO))
    3483         4984 :       DO ib = 1, batch_size
    3484        20760 :          dist_stack((ib - 1)*nblks_AO + 1:ib*nblks_AO) = dist3(:)
    3485              :       END DO
    3486              : 
    3487          248 :       CALL dbt_distribution_new(t_dist, pgrid, dist1, dist2, dist_stack)
    3488              :       CALL dbt_create(t_3c_work_2(1), "work_2_stack", t_dist, [1], [2, 3], &
    3489          248 :                       ri_data%bsizes_AO_split, bsizes_RI_ext_split, bsizes_stack)
    3490          248 :       CALL dbt_create(t_3c_work_2(1), t_3c_work_2(2))
    3491          248 :       CALL dbt_create(t_3c_work_2(1), t_3c_work_2(3))
    3492          248 :       CALL dbt_distribution_destroy(t_dist)
    3493          248 :       DEALLOCATE (dist1, dist2, dist3, dist_stack)
    3494              : 
    3495              :       !Finally copy data from t_3c_apc to the subgroups
    3496          744 :       ALLOCATE (idx_to_at(SIZE(ri_data%bsizes_AO)))
    3497          248 :       CALL get_idx_to_atom(idx_to_at, ri_data%bsizes_AO, ri_data%bsizes_AO)
    3498          248 :       nspins = SIZE(t_3c_apc, 1)
    3499          248 :       CALL timeset(routineN//"_apc", handle2)
    3500         7034 :       DO i_img = 1, nimg
    3501        15138 :          DO i_spin = 1, nspins
    3502         8352 :             CALL dbt_create(tmp, t_3c_apc_sub(i_spin, i_img))
    3503         8352 :             CALL get_tensor_occupancy(t_3c_apc(i_spin, i_img), nze, occ)
    3504         8352 :             IF (nze == 0) CYCLE
    3505         7362 :             CALL dbt_copy(t_3c_apc(i_spin, i_img), work_atom_block, move_data=.TRUE.)
    3506              :             CALL copy_3c_to_subgroup(work_atom_block_sub, work_atom_block, ngroups, para_env, &
    3507         7362 :                                      subgroup_dest, ri_data%iatom_to_subgroup, 1, idx_to_at)
    3508         7362 :             CALL dbt_copy(work_atom_block_sub, t_3c_apc_sub(i_spin, i_img), move_data=.TRUE.)
    3509        22500 :             CALL dbt_filter(t_3c_apc_sub(i_spin, i_img), ri_data%filter_eps)
    3510              :          END DO
    3511        15386 :          DO i_spin = 1, nspins
    3512        15138 :             CALL dbt_destroy(t_3c_apc(i_spin, i_img))
    3513              :          END DO
    3514              :       END DO
    3515          248 :       CALL timestop(handle2)
    3516          248 :       CALL dbt_pgrid_destroy(pgrid)
    3517          248 :       CALL dbt_destroy(tmp)
    3518          248 :       CALL dbt_destroy(work_atom_block)
    3519          248 :       CALL dbt_destroy(work_atom_block_sub)
    3520              : 
    3521          248 :       CALL timestop(handle)
    3522              : 
    3523          992 :    END SUBROUTINE get_subgroup_3c_tensors
    3524              : 
    3525              : ! **************************************************************************************************
    3526              : !> \brief copy all required 2c force tensors from the main MPI group to the subgroups
    3527              : !> \param t_2c_inv ...
    3528              : !> \param t_2c_bint ...
    3529              : !> \param t_2c_metric ...
    3530              : !> \param mat_2c_pot ...
    3531              : !> \param t_2c_work ...
    3532              : !> \param rho_ao_t ...
    3533              : !> \param rho_ao_t_sub ...
    3534              : !> \param t_2c_der_metric ...
    3535              : !> \param t_2c_der_metric_sub ...
    3536              : !> \param mat_der_pot ...
    3537              : !> \param mat_der_pot_sub ...
    3538              : !> \param group_size ...
    3539              : !> \param ngroups ...
    3540              : !> \param para_env ...
    3541              : !> \param para_env_sub ...
    3542              : !> \param ri_data ...
    3543              : !> \note Main MPI group tensors are deleted within this routine, for memory optimization
    3544              : ! **************************************************************************************************
    3545           92 :    SUBROUTINE get_subgroup_2c_derivs(t_2c_inv, t_2c_bint, t_2c_metric, mat_2c_pot, t_2c_work, rho_ao_t, &
    3546           46 :                                      rho_ao_t_sub, t_2c_der_metric, t_2c_der_metric_sub, mat_der_pot, &
    3547           46 :                                      mat_der_pot_sub, group_size, ngroups, para_env, para_env_sub, ri_data)
    3548              :       TYPE(dbt_type), DIMENSION(:), INTENT(INOUT)        :: t_2c_inv, t_2c_bint, t_2c_metric
    3549              :       TYPE(dbcsr_type), DIMENSION(:), INTENT(INOUT)      :: mat_2c_pot
    3550              :       TYPE(dbt_type), DIMENSION(:), INTENT(INOUT)        :: t_2c_work
    3551              :       TYPE(dbt_type), DIMENSION(:, :), INTENT(INOUT)     :: rho_ao_t, rho_ao_t_sub, t_2c_der_metric, &
    3552              :                                                             t_2c_der_metric_sub
    3553              :       TYPE(dbcsr_type), DIMENSION(:, :), INTENT(INOUT)   :: mat_der_pot, mat_der_pot_sub
    3554              :       INTEGER, INTENT(IN)                                :: group_size, ngroups
    3555              :       TYPE(mp_para_env_type), POINTER                    :: para_env, para_env_sub
    3556              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    3557              : 
    3558              :       CHARACTER(len=*), PARAMETER :: routineN = 'get_subgroup_2c_derivs'
    3559              : 
    3560              :       INTEGER                                            :: handle, i, i_img, i_RI, i_spin, i_xyz, &
    3561              :                                                             iatom, iproc, j, natom, nblks, nimg, &
    3562              :                                                             nspins
    3563              :       INTEGER(int_8)                                     :: nze
    3564              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: bsizes_RI_ext, bsizes_RI_ext_split, &
    3565           46 :                                                             dist1, dist2
    3566              :       INTEGER, DIMENSION(2)                              :: pdims_2d
    3567           92 :       INTEGER, DIMENSION(:), POINTER                     :: col_dist, RI_blk_size, row_dist
    3568           46 :       INTEGER, DIMENSION(:, :), POINTER                  :: dbcsr_pgrid
    3569              :       REAL(dp)                                           :: occ
    3570              :       TYPE(dbcsr_distribution_type)                      :: dbcsr_dist_sub
    3571          138 :       TYPE(dbt_pgrid_type)                               :: pgrid_2d
    3572          598 :       TYPE(dbt_type)                                     :: work, work_sub
    3573              : 
    3574           46 :       CALL timeset(routineN, handle)
    3575              : 
    3576              :       !Note: a fair portion of this routine is copied from the energy version of it
    3577              :       !Create the 2d pgrid
    3578           46 :       pdims_2d = 0
    3579           46 :       CALL dbt_pgrid_create(para_env_sub, pdims_2d, pgrid_2d)
    3580              : 
    3581           46 :       natom = SIZE(ri_data%bsizes_RI)
    3582           46 :       nblks = SIZE(ri_data%bsizes_RI_split)
    3583          138 :       ALLOCATE (bsizes_RI_ext(ri_data%ncell_RI*natom))
    3584          138 :       ALLOCATE (bsizes_RI_ext_split(ri_data%ncell_RI*nblks))
    3585          318 :       DO i_RI = 1, ri_data%ncell_RI
    3586          816 :          bsizes_RI_ext((i_RI - 1)*natom + 1:i_RI*natom) = ri_data%bsizes_RI(:)
    3587         1622 :          bsizes_RI_ext_split((i_RI - 1)*nblks + 1:i_RI*nblks) = ri_data%bsizes_RI_split(:)
    3588              :       END DO
    3589              : 
    3590              :       !nRI x nRI 2c tensors
    3591              :       CALL create_2c_tensor(t_2c_inv(1), dist1, dist2, pgrid_2d, &
    3592              :                             bsizes_RI_ext, bsizes_RI_ext, &
    3593              :                             name="(RI | RI)")
    3594           46 :       DEALLOCATE (dist1, dist2)
    3595              : 
    3596           46 :       CALL dbt_create(t_2c_inv(1), t_2c_bint(1))
    3597           46 :       CALL dbt_create(t_2c_inv(1), t_2c_metric(1))
    3598           92 :       DO iatom = 2, natom
    3599           46 :          CALL dbt_create(t_2c_inv(1), t_2c_inv(iatom))
    3600           46 :          CALL dbt_create(t_2c_inv(1), t_2c_bint(iatom))
    3601           92 :          CALL dbt_create(t_2c_inv(1), t_2c_metric(iatom))
    3602              :       END DO
    3603           46 :       CALL dbt_create(t_2c_inv(1), t_2c_work(1))
    3604           46 :       CALL dbt_create(t_2c_inv(1), t_2c_work(2))
    3605           46 :       CALL dbt_create(t_2c_inv(1), t_2c_work(3))
    3606           46 :       CALL dbt_create(t_2c_inv(1), t_2c_work(4))
    3607              : 
    3608              :       CALL create_2c_tensor(t_2c_work(5), dist1, dist2, pgrid_2d, &
    3609              :                             bsizes_RI_ext_split, bsizes_RI_ext_split, &
    3610           46 :                             name="(RI | RI)")
    3611           46 :       DEALLOCATE (dist1, dist2)
    3612              : 
    3613              :       !copy the data from the main group.
    3614          138 :       DO iatom = 1, natom
    3615           92 :          CALL copy_2c_to_subgroup(t_2c_inv(iatom), ri_data%t_2c_inv(1, iatom), group_size, ngroups, para_env)
    3616           92 :          CALL copy_2c_to_subgroup(t_2c_bint(iatom), ri_data%t_2c_int(1, iatom), group_size, ngroups, para_env)
    3617          138 :          CALL copy_2c_to_subgroup(t_2c_metric(iatom), ri_data%t_2c_pot(1, iatom), group_size, ngroups, para_env)
    3618              :       END DO
    3619              : 
    3620              :       !This includes the derivatives of the RI metric, for which there is one per atom
    3621          184 :       DO i_xyz = 1, 3
    3622          460 :          DO iatom = 1, natom
    3623          276 :             CALL dbt_create(t_2c_inv(1), t_2c_der_metric_sub(iatom, i_xyz))
    3624              :             CALL copy_2c_to_subgroup(t_2c_der_metric_sub(iatom, i_xyz), t_2c_der_metric(iatom, i_xyz), &
    3625          276 :                                      group_size, ngroups, para_env)
    3626          414 :             CALL dbt_destroy(t_2c_der_metric(iatom, i_xyz))
    3627              :          END DO
    3628              :       END DO
    3629              : 
    3630              :       !AO x AO 2c tensors
    3631              :       CALL create_2c_tensor(rho_ao_t_sub(1, 1), dist1, dist2, pgrid_2d, &
    3632              :                             ri_data%bsizes_AO_split, ri_data%bsizes_AO_split, &
    3633              :                             name="(AO | AO)")
    3634           46 :       DEALLOCATE (dist1, dist2)
    3635           46 :       nspins = SIZE(rho_ao_t, 1)
    3636           46 :       nimg = SIZE(rho_ao_t, 2)
    3637              : 
    3638         1148 :       DO i_img = 1, nimg
    3639         2430 :          DO i_spin = 1, nspins
    3640         1282 :             IF (.NOT. (i_img == 1 .AND. i_spin == 1)) &
    3641         1236 :                CALL dbt_create(rho_ao_t_sub(1, 1), rho_ao_t_sub(i_spin, i_img))
    3642              :             CALL copy_2c_to_subgroup(rho_ao_t_sub(i_spin, i_img), rho_ao_t(i_spin, i_img), &
    3643         1282 :                                      group_size, ngroups, para_env)
    3644         2384 :             CALL dbt_destroy(rho_ao_t(i_spin, i_img))
    3645              :          END DO
    3646              :       END DO
    3647              : 
    3648              :       !The RIxRI matrices, going through tensors
    3649              :       CALL create_2c_tensor(work_sub, dist1, dist2, pgrid_2d, &
    3650              :                             ri_data%bsizes_RI, ri_data%bsizes_RI, &
    3651              :                             name="(RI | RI)")
    3652           46 :       CALL dbt_create(ri_data%kp_mat_2c_pot(1, 1), work)
    3653              : 
    3654          184 :       ALLOCATE (dbcsr_pgrid(0:pdims_2d(1) - 1, 0:pdims_2d(2) - 1))
    3655           46 :       iproc = 0
    3656           92 :       DO i = 0, pdims_2d(1) - 1
    3657          138 :          DO j = 0, pdims_2d(2) - 1
    3658           46 :             dbcsr_pgrid(i, j) = iproc
    3659           92 :             iproc = iproc + 1
    3660              :          END DO
    3661              :       END DO
    3662              : 
    3663              :       !We need to have the same exact 2d block dist as the tensors
    3664          184 :       ALLOCATE (col_dist(natom), row_dist(natom))
    3665          138 :       row_dist(:) = dist1(:)
    3666          138 :       col_dist(:) = dist2(:)
    3667              : 
    3668           92 :       ALLOCATE (RI_blk_size(natom))
    3669          138 :       RI_blk_size(:) = ri_data%bsizes_RI(:)
    3670              : 
    3671              :       CALL dbcsr_distribution_new(dbcsr_dist_sub, group=para_env_sub%get_handle(), pgrid=dbcsr_pgrid, &
    3672           46 :                                   row_dist=row_dist, col_dist=col_dist)
    3673              :       CALL dbcsr_create(mat_2c_pot(1), dist=dbcsr_dist_sub, name="sub", matrix_type=dbcsr_type_no_symmetry, &
    3674           46 :                         row_blk_size=RI_blk_size, col_blk_size=RI_blk_size)
    3675              : 
    3676              :       !The HFX potential
    3677         1148 :       DO i_img = 1, nimg
    3678         1102 :          IF (i_img > 1) CALL dbcsr_create(mat_2c_pot(i_img), template=mat_2c_pot(1))
    3679         1102 :          CALL dbt_copy_matrix_to_tensor(ri_data%kp_mat_2c_pot(1, i_img), work)
    3680         1102 :          CALL get_tensor_occupancy(work, nze, occ)
    3681         1102 :          IF (nze == 0) CYCLE
    3682              : 
    3683          722 :          CALL copy_2c_to_subgroup(work_sub, work, group_size, ngroups, para_env)
    3684          722 :          CALL dbt_copy_tensor_to_matrix(work_sub, mat_2c_pot(i_img))
    3685          722 :          CALL dbcsr_filter(mat_2c_pot(i_img), ri_data%filter_eps)
    3686         1870 :          CALL dbt_clear(work_sub)
    3687              :       END DO
    3688              : 
    3689              :       !The derivatives of the HFX potential
    3690          184 :       DO i_xyz = 1, 3
    3691         3490 :          DO i_img = 1, nimg
    3692         3306 :             CALL dbcsr_create(mat_der_pot_sub(i_img, i_xyz), template=mat_2c_pot(1))
    3693         3306 :             CALL dbt_copy_matrix_to_tensor(mat_der_pot(i_img, i_xyz), work)
    3694         3306 :             CALL dbcsr_release(mat_der_pot(i_img, i_xyz))
    3695         3306 :             CALL get_tensor_occupancy(work, nze, occ)
    3696         3306 :             IF (nze == 0) CYCLE
    3697              : 
    3698         2162 :             CALL copy_2c_to_subgroup(work_sub, work, group_size, ngroups, para_env)
    3699         2162 :             CALL dbt_copy_tensor_to_matrix(work_sub, mat_der_pot_sub(i_img, i_xyz))
    3700         2162 :             CALL dbcsr_filter(mat_der_pot_sub(i_img, i_xyz), ri_data%filter_eps)
    3701         5606 :             CALL dbt_clear(work_sub)
    3702              :          END DO
    3703              :       END DO
    3704              : 
    3705           46 :       CALL dbt_destroy(work)
    3706           46 :       CALL dbt_destroy(work_sub)
    3707           46 :       CALL dbt_pgrid_destroy(pgrid_2d)
    3708           46 :       CALL dbcsr_distribution_release(dbcsr_dist_sub)
    3709           46 :       DEALLOCATE (col_dist, row_dist, RI_blk_size, dbcsr_pgrid)
    3710              : 
    3711           46 :       CALL timestop(handle)
    3712              : 
    3713          368 :    END SUBROUTINE get_subgroup_2c_derivs
    3714              : 
    3715              : ! **************************************************************************************************
    3716              : !> \brief copy all required 3c derivative tensors from the main MPI group to the subgroups
    3717              : !> \param t_3c_work_2 ...
    3718              : !> \param t_3c_work_3 ...
    3719              : !> \param t_3c_der_AO ...
    3720              : !> \param t_3c_der_AO_sub ...
    3721              : !> \param t_3c_der_RI ...
    3722              : !> \param t_3c_der_RI_sub ...
    3723              : !> \param t_3c_apc ...
    3724              : !> \param t_3c_apc_sub ...
    3725              : !> \param t_3c_der_stack ...
    3726              : !> \param group_size ...
    3727              : !> \param ngroups ...
    3728              : !> \param para_env ...
    3729              : !> \param para_env_sub ...
    3730              : !> \param ri_data ...
    3731              : !> \note the tensor containing the derivatives in the main MPI group are deleted for memory
    3732              : ! **************************************************************************************************
    3733           46 :    SUBROUTINE get_subgroup_3c_derivs(t_3c_work_2, t_3c_work_3, t_3c_der_AO, t_3c_der_AO_sub, &
    3734           46 :                                      t_3c_der_RI, t_3c_der_RI_sub, t_3c_apc, t_3c_apc_sub, &
    3735           46 :                                      t_3c_der_stack, group_size, ngroups, para_env, para_env_sub, &
    3736              :                                      ri_data)
    3737              :       TYPE(dbt_type), DIMENSION(:), INTENT(INOUT)        :: t_3c_work_2, t_3c_work_3
    3738              :       TYPE(dbt_type), DIMENSION(:, :), INTENT(INOUT)     :: t_3c_der_AO, t_3c_der_AO_sub, &
    3739              :                                                             t_3c_der_RI, t_3c_der_RI_sub, &
    3740              :                                                             t_3c_apc, t_3c_apc_sub
    3741              :       TYPE(dbt_type), DIMENSION(:), INTENT(INOUT)        :: t_3c_der_stack
    3742              :       INTEGER, INTENT(IN)                                :: group_size, ngroups
    3743              :       TYPE(mp_para_env_type), POINTER                    :: para_env, para_env_sub
    3744              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    3745              : 
    3746              :       CHARACTER(len=*), PARAMETER :: routineN = 'get_subgroup_3c_derivs'
    3747              : 
    3748              :       INTEGER                                            :: batch_size, handle, i_img, i_RI, i_spin, &
    3749              :                                                             i_xyz, ib, nblks_AO, nblks_RI, nimg, &
    3750              :                                                             nspins, pdims(3)
    3751              :       INTEGER(int_8)                                     :: nze
    3752           46 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: bsizes_RI_ext, bsizes_RI_ext_split, &
    3753           46 :                                                             bsizes_stack, dist1, dist2, dist3, &
    3754           46 :                                                             dist_stack, idx_to_at
    3755           46 :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :)        :: subgroup_dest
    3756              :       REAL(dp)                                           :: occ
    3757          414 :       TYPE(dbt_distribution_type)                        :: t_dist
    3758          138 :       TYPE(dbt_pgrid_type)                               :: pgrid
    3759         1150 :       TYPE(dbt_type)                                     :: tmp, work_atom_block, work_atom_block_sub
    3760              : 
    3761           46 :       CALL timeset(routineN, handle)
    3762              : 
    3763              :       !We use intermediate tensors with larger block size for more optimized communication
    3764           46 :       nblks_RI = SIZE(ri_data%bsizes_RI)
    3765          138 :       ALLOCATE (bsizes_RI_ext(ri_data%ncell_RI*nblks_RI))
    3766          318 :       DO i_RI = 1, ri_data%ncell_RI
    3767          862 :          bsizes_RI_ext((i_RI - 1)*nblks_RI + 1:i_RI*nblks_RI) = ri_data%bsizes_RI(:)
    3768              :       END DO
    3769              : 
    3770           46 :       CALL dbt_get_info(ri_data%kp_t_3c_int(1), pdims=pdims)
    3771           46 :       CALL dbt_pgrid_create(para_env_sub, pdims, pgrid)
    3772              : 
    3773              :       CALL create_3c_tensor(work_atom_block_sub, dist1, dist2, dist3, &
    3774              :                             pgrid, bsizes_RI_ext, ri_data%bsizes_AO, &
    3775           46 :                             ri_data%bsizes_AO, [1], [2, 3], name="(RI | AO AO)")
    3776           46 :       DEALLOCATE (dist1, dist2, dist3)
    3777              : 
    3778              :       CALL create_3c_tensor(work_atom_block, dist1, dist2, dist3, &
    3779              :                             ri_data%pgrid_2, bsizes_RI_ext, ri_data%bsizes_AO, &
    3780           46 :                             ri_data%bsizes_AO, [1], [2, 3], name="(RI | AO AO)")
    3781           46 :       DEALLOCATE (dist1, dist2, dist3)
    3782           46 :       CALL dbt_pgrid_destroy(pgrid)
    3783              : 
    3784              :       CALL get_3c_subgroup_dest(subgroup_dest, work_atom_block_sub, work_atom_block, &
    3785           46 :                                 group_size, ngroups, para_env)
    3786              : 
    3787              :       !We use the 3c integrals on the subgroup as template for the derivatives
    3788           46 :       nimg = ri_data%nimg
    3789          184 :       DO i_xyz = 1, 3
    3790         3444 :          DO i_img = 1, nimg
    3791         3306 :             CALL dbt_create(ri_data%kp_t_3c_int(1), t_3c_der_AO_sub(i_img, i_xyz))
    3792         3306 :             CALL get_tensor_occupancy(t_3c_der_AO(i_img, i_xyz), nze, occ)
    3793         3306 :             IF (nze == 0) CYCLE
    3794              : 
    3795         2138 :             CALL dbt_copy(t_3c_der_AO(i_img, i_xyz), work_atom_block, move_data=.TRUE.)
    3796              :             CALL copy_3c_to_subgroup(work_atom_block_sub, work_atom_block, &
    3797         2138 :                                      ngroups, para_env, subgroup_dest)
    3798         2138 :             CALL dbt_copy(work_atom_block_sub, t_3c_der_AO_sub(i_img, i_xyz), move_data=.TRUE.)
    3799         5582 :             CALL dbt_filter(t_3c_der_AO_sub(i_img, i_xyz), ri_data%filter_eps)
    3800              :          END DO
    3801              : 
    3802         3444 :          DO i_img = 1, nimg
    3803         3306 :             CALL dbt_create(ri_data%kp_t_3c_int(1), t_3c_der_RI_sub(i_img, i_xyz))
    3804         3306 :             CALL get_tensor_occupancy(t_3c_der_RI(i_img, i_xyz), nze, occ)
    3805         3306 :             IF (nze == 0) CYCLE
    3806              : 
    3807         2118 :             CALL dbt_copy(t_3c_der_RI(i_img, i_xyz), work_atom_block, move_data=.TRUE.)
    3808              :             CALL copy_3c_to_subgroup(work_atom_block_sub, work_atom_block, &
    3809         2118 :                                      ngroups, para_env, subgroup_dest)
    3810         2118 :             CALL dbt_copy(work_atom_block_sub, t_3c_der_RI_sub(i_img, i_xyz), move_data=.TRUE.)
    3811         5562 :             CALL dbt_filter(t_3c_der_RI_sub(i_img, i_xyz), ri_data%filter_eps)
    3812              :          END DO
    3813              : 
    3814         3490 :          DO i_img = 1, nimg
    3815         3306 :             CALL dbt_destroy(t_3c_der_RI(i_img, i_xyz))
    3816         3444 :             CALL dbt_destroy(t_3c_der_AO(i_img, i_xyz))
    3817              :          END DO
    3818              :       END DO
    3819           46 :       CALL dbt_destroy(work_atom_block_sub)
    3820           46 :       CALL dbt_destroy(work_atom_block)
    3821           46 :       DEALLOCATE (subgroup_dest)
    3822              : 
    3823              :       !Deal with t_3c_apc
    3824           46 :       nblks_RI = SIZE(ri_data%bsizes_RI_split)
    3825          138 :       ALLOCATE (bsizes_RI_ext_split(ri_data%ncell_RI*nblks_RI))
    3826          318 :       DO i_RI = 1, ri_data%ncell_RI
    3827         1622 :          bsizes_RI_ext_split((i_RI - 1)*nblks_RI + 1:i_RI*nblks_RI) = ri_data%bsizes_RI_split(:)
    3828              :       END DO
    3829              : 
    3830           46 :       pdims = 0
    3831              :       CALL dbt_pgrid_create(para_env_sub, pdims, pgrid, &
    3832          184 :                             tensor_dims=[1, SIZE(bsizes_RI_ext_split), batch_size*SIZE(ri_data%bsizes_AO_split)])
    3833              : 
    3834              :       CALL create_3c_tensor(work_atom_block_sub, dist1, dist2, dist3, &
    3835              :                             pgrid, ri_data%bsizes_AO, bsizes_RI_ext, &
    3836           46 :                             ri_data%bsizes_AO, [1], [2, 3], name="(AO RI | AO)")
    3837           46 :       DEALLOCATE (dist1, dist2, dist3)
    3838              : 
    3839              :       CALL create_3c_tensor(work_atom_block, dist1, dist2, dist3, &
    3840              :                             ri_data%pgrid_1, ri_data%bsizes_AO, bsizes_RI_ext, &
    3841           46 :                             ri_data%bsizes_AO, [1], [2, 3], name="(AO RI | AO)")
    3842           46 :       DEALLOCATE (dist1, dist2, dist3)
    3843              : 
    3844              :       CALL create_3c_tensor(tmp, dist1, dist2, dist3, &
    3845              :                             pgrid, ri_data%bsizes_AO_split, bsizes_RI_ext_split, &
    3846           46 :                             ri_data%bsizes_AO_split, [1], [2, 3], name="(AO RI | AO)")
    3847           46 :       DEALLOCATE (dist1, dist2, dist3)
    3848              : 
    3849              :       CALL get_3c_subgroup_dest(subgroup_dest, work_atom_block_sub, work_atom_block, &
    3850           46 :                                 group_size, ngroups, para_env)
    3851              : 
    3852          138 :       ALLOCATE (idx_to_at(SIZE(ri_data%bsizes_AO)))
    3853           46 :       CALL get_idx_to_atom(idx_to_at, ri_data%bsizes_AO, ri_data%bsizes_AO)
    3854           46 :       nspins = SIZE(t_3c_apc, 1)
    3855         1148 :       DO i_img = 1, nimg
    3856         2384 :          DO i_spin = 1, nspins
    3857         1282 :             CALL dbt_create(tmp, t_3c_apc_sub(i_spin, i_img))
    3858         1282 :             CALL get_tensor_occupancy(t_3c_apc(i_spin, i_img), nze, occ)
    3859         1282 :             IF (nze == 0) CYCLE
    3860         1258 :             CALL dbt_copy(t_3c_apc(i_spin, i_img), work_atom_block, move_data=.TRUE.)
    3861              :             CALL copy_3c_to_subgroup(work_atom_block_sub, work_atom_block, ngroups, para_env, &
    3862         1258 :                                      subgroup_dest, ri_data%iatom_to_subgroup, 1, idx_to_at)
    3863         1258 :             CALL dbt_copy(work_atom_block_sub, t_3c_apc_sub(i_spin, i_img), move_data=.TRUE.)
    3864         3642 :             CALL dbt_filter(t_3c_apc_sub(i_spin, i_img), ri_data%filter_eps)
    3865              :          END DO
    3866         2430 :          DO i_spin = 1, nspins
    3867         2384 :             CALL dbt_destroy(t_3c_apc(i_spin, i_img))
    3868              :          END DO
    3869              :       END DO
    3870           46 :       CALL dbt_destroy(tmp)
    3871           46 :       CALL dbt_destroy(work_atom_block)
    3872           46 :       CALL dbt_destroy(work_atom_block_sub)
    3873           46 :       CALL dbt_pgrid_destroy(pgrid)
    3874              : 
    3875              :       !t_3c_work_3 based on structure of 3c integrals/derivs
    3876           46 :       batch_size = ri_data%kp_stack_size
    3877           46 :       nblks_AO = SIZE(ri_data%bsizes_AO_split)
    3878          138 :       ALLOCATE (bsizes_stack(batch_size*nblks_AO))
    3879          814 :       DO ib = 1, batch_size
    3880         3854 :          bsizes_stack((ib - 1)*nblks_AO + 1:ib*nblks_AO) = ri_data%bsizes_AO_split(:)
    3881              :       END DO
    3882              : 
    3883          322 :       ALLOCATE (dist1(ri_data%ncell_RI*nblks_RI), dist2(nblks_AO), dist3(nblks_AO))
    3884              :       CALL dbt_get_info(ri_data%kp_t_3c_int(1), proc_dist_1=dist1, proc_dist_2=dist2, &
    3885           46 :                         proc_dist_3=dist3, pdims=pdims)
    3886              : 
    3887          138 :       ALLOCATE (dist_stack(batch_size*nblks_AO))
    3888          814 :       DO ib = 1, batch_size
    3889         3854 :          dist_stack((ib - 1)*nblks_AO + 1:ib*nblks_AO) = dist3(:)
    3890              :       END DO
    3891              : 
    3892           46 :       CALL dbt_pgrid_create(para_env_sub, pdims, pgrid)
    3893           46 :       CALL dbt_distribution_new(t_dist, pgrid, dist1, dist2, dist_stack)
    3894              :       CALL dbt_create(t_3c_work_3(1), "work_3_stack", t_dist, [1], [2, 3], &
    3895           46 :                       bsizes_RI_ext_split, ri_data%bsizes_AO_split, bsizes_stack)
    3896           46 :       CALL dbt_create(t_3c_work_3(1), t_3c_work_3(2))
    3897           46 :       CALL dbt_create(t_3c_work_3(1), t_3c_work_3(3))
    3898           46 :       CALL dbt_create(t_3c_work_3(1), t_3c_work_3(4))
    3899           46 :       CALL dbt_distribution_destroy(t_dist)
    3900           46 :       CALL dbt_pgrid_destroy(pgrid)
    3901           46 :       DEALLOCATE (dist1, dist2, dist3, dist_stack)
    3902              : 
    3903              :       !the derivatives are stacked in the same way
    3904           46 :       CALL dbt_create(t_3c_work_3(1), t_3c_der_stack(1))
    3905           46 :       CALL dbt_create(t_3c_work_3(1), t_3c_der_stack(2))
    3906           46 :       CALL dbt_create(t_3c_work_3(1), t_3c_der_stack(3))
    3907           46 :       CALL dbt_create(t_3c_work_3(1), t_3c_der_stack(4))
    3908           46 :       CALL dbt_create(t_3c_work_3(1), t_3c_der_stack(5))
    3909           46 :       CALL dbt_create(t_3c_work_3(1), t_3c_der_stack(6))
    3910              : 
    3911              :       !t_3c_work_2 based on structure of t_3c_apc
    3912          322 :       ALLOCATE (dist1(nblks_AO), dist2(ri_data%ncell_RI*nblks_RI), dist3(nblks_AO))
    3913              :       CALL dbt_get_info(t_3c_apc_sub(1, 1), proc_dist_1=dist1, proc_dist_2=dist2, &
    3914           46 :                         proc_dist_3=dist3, pdims=pdims)
    3915              : 
    3916          138 :       ALLOCATE (dist_stack(batch_size*nblks_AO))
    3917          814 :       DO ib = 1, batch_size
    3918         3854 :          dist_stack((ib - 1)*nblks_AO + 1:ib*nblks_AO) = dist3(:)
    3919              :       END DO
    3920              : 
    3921           46 :       CALL dbt_pgrid_create(para_env_sub, pdims, pgrid)
    3922           46 :       CALL dbt_distribution_new(t_dist, pgrid, dist1, dist2, dist_stack)
    3923              :       CALL dbt_create(t_3c_work_2(1), "work_3_stack", t_dist, [1], [2, 3], &
    3924           46 :                       ri_data%bsizes_AO_split, bsizes_RI_ext_split, bsizes_stack)
    3925           46 :       CALL dbt_create(t_3c_work_2(1), t_3c_work_2(2))
    3926           46 :       CALL dbt_create(t_3c_work_2(1), t_3c_work_2(3))
    3927           46 :       CALL dbt_distribution_destroy(t_dist)
    3928           46 :       CALL dbt_pgrid_destroy(pgrid)
    3929           46 :       DEALLOCATE (dist1, dist2, dist3, dist_stack)
    3930              : 
    3931           46 :       CALL timestop(handle)
    3932              : 
    3933           92 :    END SUBROUTINE get_subgroup_3c_derivs
    3934              : 
    3935              : ! **************************************************************************************************
    3936              : !> \brief A routine that reorders the t_3c_int tensors such that all items which are fully empty
    3937              : !>        are bunched together. This way, we can get much more efficient screening based on NZE
    3938              : !> \param t_3c_ints ...
    3939              : !> \param ri_data ...
    3940              : ! **************************************************************************************************
    3941           80 :    SUBROUTINE reorder_3c_ints(t_3c_ints, ri_data)
    3942              :       TYPE(dbt_type), DIMENSION(:), INTENT(INOUT)        :: t_3c_ints
    3943              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    3944              : 
    3945              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'reorder_3c_ints'
    3946              : 
    3947              :       INTEGER                                            :: handle, i_img, idx, idx_empty, idx_full, &
    3948              :                                                             nimg
    3949              :       INTEGER(int_8)                                     :: nze
    3950              :       REAL(dp)                                           :: occ
    3951           80 :       TYPE(dbt_type), ALLOCATABLE, DIMENSION(:)          :: t_3c_tmp
    3952              : 
    3953           80 :       CALL timeset(routineN, handle)
    3954              : 
    3955           80 :       nimg = ri_data%nimg
    3956         2878 :       ALLOCATE (t_3c_tmp(nimg))
    3957         2078 :       DO i_img = 1, nimg
    3958         1998 :          CALL dbt_create(t_3c_ints(i_img), t_3c_tmp(i_img))
    3959         2078 :          CALL dbt_copy(t_3c_ints(i_img), t_3c_tmp(i_img), move_data=.TRUE.)
    3960              :       END DO
    3961              : 
    3962              :       !Loop over the images, check if ints have NZE == 0, and put them at the start or end of the
    3963              :       !initial tensor array. Keep the mapping in an array
    3964          240 :       ALLOCATE (ri_data%idx_to_img(nimg))
    3965           80 :       idx_full = 0
    3966           80 :       idx_empty = nimg + 1
    3967              : 
    3968         2078 :       DO i_img = 1, nimg
    3969         1998 :          CALL get_tensor_occupancy(t_3c_tmp(i_img), nze, occ)
    3970         1998 :          IF (nze == 0) THEN
    3971          542 :             idx_empty = idx_empty - 1
    3972          542 :             CALL dbt_copy(t_3c_tmp(i_img), t_3c_ints(idx_empty), move_data=.TRUE.)
    3973          542 :             ri_data%idx_to_img(idx_empty) = i_img
    3974              :          ELSE
    3975         1456 :             idx_full = idx_full + 1
    3976         1456 :             CALL dbt_copy(t_3c_tmp(i_img), t_3c_ints(idx_full), move_data=.TRUE.)
    3977         1456 :             ri_data%idx_to_img(idx_full) = i_img
    3978              :          END IF
    3979         4076 :          CALL dbt_destroy(t_3c_tmp(i_img))
    3980              :       END DO
    3981              : 
    3982              :       !store the highest image index with non-zero integrals
    3983           80 :       ri_data%nimg_nze = idx_full
    3984              : 
    3985          160 :       ALLOCATE (ri_data%img_to_idx(nimg))
    3986         2078 :       DO idx = 1, nimg
    3987         2078 :          ri_data%img_to_idx(ri_data%idx_to_img(idx)) = idx
    3988              :       END DO
    3989              : 
    3990           80 :       CALL timestop(handle)
    3991              : 
    3992         2158 :    END SUBROUTINE reorder_3c_ints
    3993              : 
    3994              : ! **************************************************************************************************
    3995              : !> \brief A routine that reorders the 3c derivatives, the same way that the integrals are, also to
    3996              : !>        increase efficiency of screening
    3997              : !> \param t_3c_derivs ...
    3998              : !> \param ri_data ...
    3999              : ! **************************************************************************************************
    4000           92 :    SUBROUTINE reorder_3c_derivs(t_3c_derivs, ri_data)
    4001              :       TYPE(dbt_type), DIMENSION(:, :), INTENT(INOUT)     :: t_3c_derivs
    4002              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    4003              : 
    4004              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'reorder_3c_derivs'
    4005              : 
    4006              :       INTEGER                                            :: handle, i_img, i_xyz, idx, nimg
    4007              :       INTEGER(int_8)                                     :: nze
    4008              :       REAL(dp)                                           :: occ
    4009           92 :       TYPE(dbt_type), ALLOCATABLE, DIMENSION(:)          :: t_3c_tmp
    4010              : 
    4011           92 :       CALL timeset(routineN, handle)
    4012              : 
    4013           92 :       nimg = ri_data%nimg
    4014         3216 :       ALLOCATE (t_3c_tmp(nimg))
    4015         2296 :       DO i_img = 1, nimg
    4016         2296 :          CALL dbt_create(t_3c_derivs(1, 1), t_3c_tmp(i_img))
    4017              :       END DO
    4018              : 
    4019          368 :       DO i_xyz = 1, 3
    4020         6888 :          DO i_img = 1, nimg
    4021         6888 :             CALL dbt_copy(t_3c_derivs(i_img, i_xyz), t_3c_tmp(i_img), move_data=.TRUE.)
    4022              :          END DO
    4023         6980 :          DO i_img = 1, nimg
    4024         6612 :             idx = ri_data%img_to_idx(i_img)
    4025         6612 :             CALL dbt_copy(t_3c_tmp(i_img), t_3c_derivs(idx, i_xyz), move_data=.TRUE.)
    4026         6612 :             CALL get_tensor_occupancy(t_3c_derivs(idx, i_xyz), nze, occ)
    4027         6888 :             IF (nze > 0) ri_data%nimg_nze = MAX(idx, ri_data%nimg_nze)
    4028              :          END DO
    4029              :       END DO
    4030              : 
    4031         2296 :       DO i_img = 1, nimg
    4032         2296 :          CALL dbt_destroy(t_3c_tmp(i_img))
    4033              :       END DO
    4034              : 
    4035           92 :       CALL timestop(handle)
    4036              : 
    4037         2388 :    END SUBROUTINE reorder_3c_derivs
    4038              : 
    4039              : ! **************************************************************************************************
    4040              : !> \brief Get the sparsity pattern related to the non-symmetric AO basis overlap neighbor list
    4041              : !> \param pattern ...
    4042              : !> \param ri_data ...
    4043              : !> \param qs_env ...
    4044              : ! **************************************************************************************************
    4045          294 :    SUBROUTINE get_sparsity_pattern(pattern, ri_data, qs_env)
    4046              :       INTEGER, DIMENSION(:, :, :), INTENT(INOUT)         :: pattern
    4047              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    4048              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    4049              : 
    4050              :       INTEGER                                            :: iatom, j_img, jatom, mj_img, natom, nimg
    4051          294 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: bins
    4052          294 :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: tmp_pattern
    4053              :       INTEGER, DIMENSION(3)                              :: cell_j
    4054          294 :       INTEGER, DIMENSION(:, :), POINTER                  :: index_to_cell
    4055          294 :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
    4056              :       TYPE(dft_control_type), POINTER                    :: dft_control
    4057              :       TYPE(kpoint_type), POINTER                         :: kpoints
    4058              :       TYPE(mp_para_env_type), POINTER                    :: para_env
    4059              :       TYPE(neighbor_list_iterator_p_type), &
    4060          294 :          DIMENSION(:), POINTER                           :: nl_iterator
    4061              :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
    4062          294 :          POINTER                                         :: nl_2c
    4063              : 
    4064          294 :       NULLIFY (nl_2c, nl_iterator, kpoints, cell_to_index, dft_control, index_to_cell, para_env)
    4065              : 
    4066          294 :       CALL get_qs_env(qs_env, kpoints=kpoints, dft_control=dft_control, para_env=para_env, natom=natom)
    4067          294 :       CALL get_kpoint_info(kpoints, cell_to_index=cell_to_index, index_to_cell=index_to_cell, sab_nl=nl_2c)
    4068              : 
    4069          294 :       nimg = ri_data%nimg
    4070        55510 :       pattern(:, :, :) = 0
    4071              : 
    4072              :       !We use the symmetric nl for all images that have an opposite cell
    4073          294 :       CALL neighbor_list_iterator_create(nl_iterator, nl_2c)
    4074        12973 :       DO WHILE (neighbor_list_iterate(nl_iterator) == 0)
    4075        12679 :          CALL get_iterator_info(nl_iterator, iatom=iatom, jatom=jatom, cell=cell_j)
    4076              : 
    4077        12679 :          j_img = cell_to_index(cell_j(1), cell_j(2), cell_j(3))
    4078        12679 :          IF (j_img > nimg .OR. j_img < 1) CYCLE
    4079              : 
    4080         9696 :          mj_img = get_opp_index(j_img, qs_env)
    4081         9696 :          IF (mj_img > nimg .OR. mj_img < 1) CYCLE
    4082              : 
    4083         9249 :          IF (ri_data%present_images(j_img) == 0) CYCLE
    4084              : 
    4085        12679 :          pattern(iatom, jatom, j_img) = 1
    4086              :       END DO
    4087          294 :       CALL neighbor_list_iterator_release(nl_iterator)
    4088              : 
    4089              :       !If there is no opposite cell present, then we take into account the non-symmetric nl
    4090          294 :       CALL get_kpoint_info(kpoints, sab_nl_nosym=nl_2c)
    4091              : 
    4092          294 :       CALL neighbor_list_iterator_create(nl_iterator, nl_2c)
    4093        17146 :       DO WHILE (neighbor_list_iterate(nl_iterator) == 0)
    4094        16852 :          CALL get_iterator_info(nl_iterator, iatom=iatom, jatom=jatom, cell=cell_j)
    4095              : 
    4096        16852 :          j_img = cell_to_index(cell_j(1), cell_j(2), cell_j(3))
    4097        16852 :          IF (j_img > nimg .OR. j_img < 1) CYCLE
    4098              : 
    4099        12452 :          mj_img = get_opp_index(j_img, qs_env)
    4100        12452 :          IF (mj_img .LE. nimg .AND. mj_img > 0) CYCLE
    4101              : 
    4102          456 :          IF (ri_data%present_images(j_img) == 0) CYCLE
    4103              : 
    4104        16852 :          pattern(iatom, jatom, j_img) = 1
    4105              :       END DO
    4106          294 :       CALL neighbor_list_iterator_release(nl_iterator)
    4107              : 
    4108       110726 :       CALL para_env%sum(pattern)
    4109              : 
    4110              :       !If the opposite image is considered, then there is no need to compute diagonal twice
    4111         7888 :       DO j_img = 2, nimg
    4112        23076 :          DO iatom = 1, natom
    4113        22782 :             IF (pattern(iatom, iatom, j_img) .NE. 0) THEN
    4114         5184 :                mj_img = get_opp_index(j_img, qs_env)
    4115         5184 :                IF (mj_img > nimg .OR. mj_img < 1) CYCLE
    4116         5184 :                pattern(iatom, iatom, mj_img) = 0
    4117              :             END IF
    4118              :          END DO
    4119              :       END DO
    4120              : 
    4121              :       ! We want to equilibrate the sparsity pattern such that there are same amount of blocks
    4122              :       ! for each atom i of i,j pairs
    4123          882 :       ALLOCATE (bins(natom))
    4124          882 :       bins(:) = 0
    4125              : 
    4126         1470 :       ALLOCATE (tmp_pattern(natom, natom, nimg))
    4127        55510 :       tmp_pattern(:, :, :) = 0
    4128         8182 :       DO j_img = 1, nimg
    4129        23958 :          DO jatom = 1, natom
    4130        55216 :             DO iatom = 1, natom
    4131        31552 :                IF (pattern(iatom, jatom, j_img) == 0) CYCLE
    4132        10572 :                mj_img = get_opp_index(j_img, qs_env)
    4133              : 
    4134              :                !Should we take the i,j,b or th j,i,-b atomic block?
    4135        26348 :                IF (mj_img > nimg .OR. mj_img < 1) THEN
    4136              :                   !No opposite image, no choice
    4137          214 :                   bins(iatom) = bins(iatom) + 1
    4138          214 :                   tmp_pattern(iatom, jatom, j_img) = 1
    4139              :                ELSE
    4140              : 
    4141        10358 :                   IF (bins(iatom) > bins(jatom)) THEN
    4142         2184 :                      bins(jatom) = bins(jatom) + 1
    4143         2184 :                      tmp_pattern(jatom, iatom, mj_img) = 1
    4144              :                   ELSE
    4145         8174 :                      bins(iatom) = bins(iatom) + 1
    4146         8174 :                      tmp_pattern(iatom, jatom, j_img) = 1
    4147              :                   END IF
    4148              :                END IF
    4149              :             END DO
    4150              :          END DO
    4151              :       END DO
    4152              : 
    4153              :       ! -1 => unoccupied, 0 => occupied
    4154        55510 :       pattern(:, :, :) = tmp_pattern(:, :, :) - 1
    4155              : 
    4156          588 :    END SUBROUTINE get_sparsity_pattern
    4157              : 
    4158              : ! **************************************************************************************************
    4159              : !> \brief Distribute the iatom, jatom, b_img triplet over the subgroupd to spread the load
    4160              : !>        the group id for each triplet is passed as the value of sparsity_pattern(i, j, b),
    4161              : !>        with -1 being an unoccupied block
    4162              : !> \param sparsity_pattern ...
    4163              : !> \param ngroups ...
    4164              : !> \param ri_data ...
    4165              : ! **************************************************************************************************
    4166          294 :    SUBROUTINE get_sub_dist(sparsity_pattern, ngroups, ri_data)
    4167              :       INTEGER, DIMENSION(:, :, :), INTENT(INOUT)         :: sparsity_pattern
    4168              :       INTEGER, INTENT(IN)                                :: ngroups
    4169              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    4170              : 
    4171              :       INTEGER                                            :: b_img, ctr, iat, iatom, igroup, jatom, &
    4172              :                                                             natom, nimg, ub
    4173          294 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: max_at_per_group
    4174              :       REAL(dp)                                           :: cost
    4175          294 :       REAL(dp), ALLOCATABLE, DIMENSION(:)                :: bins
    4176              : 
    4177          294 :       natom = SIZE(sparsity_pattern, 2)
    4178          294 :       nimg = SIZE(sparsity_pattern, 3)
    4179              : 
    4180              :       !To avoid unnecessary data replication accross the subgroups, we want to have a limited number
    4181              :       !of subgroup with the data of a given iatom. At the minimum, all groups have 1 atom
    4182              :       !We assume that the cost associated to each iatom is roughly the same
    4183          294 :       IF (.NOT. ALLOCATED(ri_data%iatom_to_subgroup)) THEN
    4184          378 :          ALLOCATE (ri_data%iatom_to_subgroup(natom), max_at_per_group(ngroups))
    4185          162 :          DO iatom = 1, natom
    4186          108 :             NULLIFY (ri_data%iatom_to_subgroup(iatom)%array)
    4187          216 :             ALLOCATE (ri_data%iatom_to_subgroup(iatom)%array(ngroups))
    4188          378 :             ri_data%iatom_to_subgroup(iatom)%array(:) = .FALSE.
    4189              :          END DO
    4190              : 
    4191           54 :          ub = natom/ngroups
    4192           54 :          IF (ub*ngroups < natom) ub = ub + 1
    4193          162 :          max_at_per_group(:) = MAX(1, ub)
    4194              : 
    4195              :          !We want each atom to be present the same amount of times. Some groups might have more atoms
    4196              :          !than other to achieve this.
    4197              :          ctr = 0
    4198          162 :          DO WHILE (MODULO(SUM(max_at_per_group), natom) .NE. 0)
    4199            0 :             igroup = MODULO(ctr, ngroups) + 1
    4200            0 :             max_at_per_group(igroup) = max_at_per_group(igroup) + 1
    4201           54 :             ctr = ctr + 1
    4202              :          END DO
    4203              : 
    4204              :          ctr = 0
    4205          162 :          DO igroup = 1, ngroups
    4206          270 :             DO iat = 1, max_at_per_group(igroup)
    4207          108 :                iatom = MODULO(ctr, natom) + 1
    4208          108 :                ri_data%iatom_to_subgroup(iatom)%array(igroup) = .TRUE.
    4209          216 :                ctr = ctr + 1
    4210              :             END DO
    4211              :          END DO
    4212              :       END IF
    4213              : 
    4214          882 :       ALLOCATE (bins(ngroups))
    4215          882 :       bins = 0.0_dp
    4216         8182 :       DO b_img = 1, nimg
    4217        23958 :          DO jatom = 1, natom
    4218        55216 :             DO iatom = 1, natom
    4219        31552 :                IF (sparsity_pattern(iatom, jatom, b_img) == -1) CYCLE
    4220        52860 :                igroup = MINLOC(bins, 1, MASK=ri_data%iatom_to_subgroup(iatom)%array) - 1
    4221              : 
    4222              :                !Use cost information from previous SCF if available
    4223       654670 :                IF (ANY(ri_data%kp_cost > EPSILON(0.0_dp))) THEN
    4224         7948 :                   cost = ri_data%kp_cost(iatom, jatom, b_img)
    4225              :                ELSE
    4226         2624 :                   cost = REAL(ri_data%bsizes_AO(iatom)*ri_data%bsizes_AO(jatom), dp)
    4227              :                END IF
    4228        10572 :                bins(igroup + 1) = bins(igroup + 1) + cost
    4229        47328 :                sparsity_pattern(iatom, jatom, b_img) = igroup
    4230              :             END DO
    4231              :          END DO
    4232              :       END DO
    4233              : 
    4234          294 :    END SUBROUTINE get_sub_dist
    4235              : 
    4236              : ! **************************************************************************************************
    4237              : !> \brief A rouine that updates the sparsity pattern for force calculation, where all i,j,b combinations
    4238              : !>        are visited.
    4239              : !> \param force_pattern ...
    4240              : !> \param scf_pattern ...
    4241              : !> \param ngroups ...
    4242              : !> \param ri_data ...
    4243              : !> \param qs_env ...
    4244              : ! **************************************************************************************************
    4245           46 :    SUBROUTINE update_pattern_to_forces(force_pattern, scf_pattern, ngroups, ri_data, qs_env)
    4246              :       INTEGER, DIMENSION(:, :, :), INTENT(INOUT)         :: force_pattern, scf_pattern
    4247              :       INTEGER, INTENT(IN)                                :: ngroups
    4248              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    4249              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    4250              : 
    4251              :       INTEGER                                            :: b_img, iatom, igroup, jatom, mb_img, &
    4252              :                                                             natom, nimg
    4253           46 :       REAL(dp), ALLOCATABLE, DIMENSION(:)                :: bins
    4254              : 
    4255           46 :       natom = SIZE(scf_pattern, 2)
    4256           46 :       nimg = SIZE(scf_pattern, 3)
    4257              : 
    4258          138 :       ALLOCATE (bins(ngroups))
    4259          138 :       bins = 0.0_dp
    4260              : 
    4261         1148 :       DO b_img = 1, nimg
    4262         1102 :          mb_img = get_opp_index(b_img, qs_env)
    4263         3352 :          DO jatom = 1, natom
    4264         7714 :             DO iatom = 1, natom
    4265              :                !Important: same distribution as KS matrix, because reuse t_3c_apc
    4266        22040 :                igroup = MINLOC(bins, 1, MASK=ri_data%iatom_to_subgroup(iatom)%array) - 1
    4267              : 
    4268              :                !check that block not already treated
    4269         4408 :                IF (scf_pattern(iatom, jatom, b_img) > -1) CYCLE
    4270              : 
    4271              :                !If not, take the cost of block j, i, -b (same energy contribution)
    4272         5320 :                IF (mb_img > 0 .AND. mb_img .LE. nimg) THEN
    4273         2672 :                   IF (scf_pattern(jatom, iatom, mb_img) == -1) CYCLE
    4274         1164 :                   bins(igroup + 1) = bins(igroup + 1) + ri_data%kp_cost(jatom, iatom, mb_img)
    4275         1164 :                   force_pattern(iatom, jatom, b_img) = igroup
    4276              :                END IF
    4277              :             END DO
    4278              :          END DO
    4279              :       END DO
    4280              : 
    4281           46 :    END SUBROUTINE update_pattern_to_forces
    4282              : 
    4283              : ! **************************************************************************************************
    4284              : !> \brief A routine that determines the extend of the KP RI-HFX periodic images, including for the
    4285              : !>        extension of the RI basis
    4286              : !> \param ri_data ...
    4287              : !> \param qs_env ...
    4288              : ! **************************************************************************************************
    4289           80 :    SUBROUTINE get_kp_and_ri_images(ri_data, qs_env)
    4290              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    4291              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    4292              : 
    4293              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'get_kp_and_ri_images'
    4294              : 
    4295              :       CHARACTER(LEN=512)                                 :: warning_msg
    4296              :       INTEGER :: cell_j(3), cell_k(3), handle, i_img, iatom, ikind, j_img, jatom, jcell, katom, &
    4297              :          kcell, kp_index_lbounds(3), kp_index_ubounds(3), natom, ngroups, nimg, nkind, pcoord(3), &
    4298              :          pdims(3)
    4299           80 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: dist_AO_1, dist_AO_2, dist_RI, &
    4300           80 :                                                             nRI_per_atom, present_img, RI_cells
    4301           80 :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
    4302              :       REAL(dp)                                           :: bump_fact, dij, dik, image_range, &
    4303              :                                                             RI_range, rij(3), rik(3)
    4304          560 :       TYPE(dbt_type)                                     :: t_dummy
    4305              :       TYPE(dft_control_type), POINTER                    :: dft_control
    4306              :       TYPE(distribution_2d_type), POINTER                :: dist_2d
    4307              :       TYPE(distribution_3d_type)                         :: dist_3d
    4308              :       TYPE(gto_basis_set_p_type), ALLOCATABLE, &
    4309           80 :          DIMENSION(:), TARGET                            :: basis_set_AO, basis_set_RI
    4310              :       TYPE(kpoint_type), POINTER                         :: kpoints
    4311           80 :       TYPE(mp_cart_type)                                 :: mp_comm_t3c
    4312              :       TYPE(mp_para_env_type), POINTER                    :: para_env
    4313              :       TYPE(neighbor_list_3c_iterator_type)               :: nl_3c_iter
    4314              :       TYPE(neighbor_list_3c_type)                        :: nl_3c
    4315              :       TYPE(neighbor_list_iterator_p_type), &
    4316           80 :          DIMENSION(:), POINTER                           :: nl_iterator
    4317              :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
    4318           80 :          POINTER                                         :: nl_2c
    4319           80 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
    4320           80 :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
    4321              :       TYPE(section_vals_type), POINTER                   :: hfx_section
    4322              : 
    4323           80 :       NULLIFY (qs_kind_set, dist_2d, nl_2c, nl_iterator, dft_control, &
    4324           80 :                particle_set, kpoints, para_env, cell_to_index, hfx_section)
    4325              : 
    4326           80 :       CALL timeset(routineN, handle)
    4327              : 
    4328              :       CALL get_qs_env(qs_env, nkind=nkind, qs_kind_set=qs_kind_set, distribution_2d=dist_2d, &
    4329              :                       dft_control=dft_control, particle_set=particle_set, kpoints=kpoints, &
    4330           80 :                       para_env=para_env, natom=natom)
    4331           80 :       nimg = dft_control%nimages
    4332           80 :       CALL get_kpoint_info(kpoints, cell_to_index=cell_to_index)
    4333          320 :       kp_index_lbounds = LBOUND(cell_to_index)
    4334          320 :       kp_index_ubounds = UBOUND(cell_to_index)
    4335              : 
    4336           80 :       hfx_section => section_vals_get_subs_vals(qs_env%input, "DFT%XC%HF%RI")
    4337           80 :       CALL section_vals_val_get(hfx_section, "KP_NGROUPS", i_val=ngroups)
    4338              : 
    4339          560 :       ALLOCATE (basis_set_RI(nkind), basis_set_AO(nkind))
    4340           80 :       CALL basis_set_list_setup(basis_set_RI, ri_data%ri_basis_type, qs_kind_set)
    4341           80 :       CALL basis_set_list_setup(basis_set_AO, ri_data%orb_basis_type, qs_kind_set)
    4342              : 
    4343              :       !In case of shortrange HFX potential, it is imprtant to be consistent with the rest of the KP
    4344              :       !code, and use EPS_SCHWARZ to determine the range (rather than eps_filter_2c in normal RI-HFX)
    4345           80 :       IF (ri_data%hfx_pot%potential_type == do_potential_short) THEN
    4346            0 :          CALL erfc_cutoff(ri_data%eps_schwarz, ri_data%hfx_pot%omega, ri_data%hfx_pot%cutoff_radius)
    4347              :          WRITE (warning_msg, '(A)') &
    4348              :             "The SHORTANGE HFX potential typically extends over many periodic images, "// &
    4349              :             "possibly slowing down the calculation. Consider using the TRUNCATED "// &
    4350            0 :             "potential for better computational performance."
    4351            0 :          CPWARN(warning_msg)
    4352              :       END IF
    4353              : 
    4354              :       !Determine the range for contributing periodic images, and for the RI basis extension
    4355           80 :       ri_data%kp_RI_range = 0.0_dp
    4356           80 :       ri_data%kp_image_range = 0.0_dp
    4357          200 :       DO ikind = 1, nkind
    4358              : 
    4359          120 :          CALL init_interaction_radii_orb_basis(basis_set_AO(ikind)%gto_basis_set, ri_data%eps_pgf_orb)
    4360          120 :          CALL get_gto_basis_set(basis_set_AO(ikind)%gto_basis_set, kind_radius=RI_range)
    4361          120 :          ri_data%kp_RI_range = MAX(RI_range, ri_data%kp_RI_range)
    4362              : 
    4363          120 :          CALL init_interaction_radii_orb_basis(basis_set_AO(ikind)%gto_basis_set, ri_data%eps_pgf_orb)
    4364          120 :          CALL init_interaction_radii_orb_basis(basis_set_RI(ikind)%gto_basis_set, ri_data%eps_pgf_orb)
    4365          120 :          CALL get_gto_basis_set(basis_set_RI(ikind)%gto_basis_set, kind_radius=image_range)
    4366              : 
    4367          120 :          image_range = 2.0_dp*image_range + cutoff_screen_factor*ri_data%hfx_pot%cutoff_radius
    4368          320 :          ri_data%kp_image_range = MAX(image_range, ri_data%kp_image_range)
    4369              :       END DO
    4370              : 
    4371           80 :       CALL section_vals_val_get(hfx_section, "KP_RI_BUMP_FACTOR", r_val=bump_fact)
    4372           80 :       ri_data%kp_bump_rad = bump_fact*ri_data%kp_RI_range
    4373              : 
    4374              :       !For the extent of the KP RI-HFX images, we are limited by the RI-HFX potential in
    4375              :       !(mu^0 sigma^a|P^0) (P^0|Q^b) (Q^b|nu^b lambda^a+c), if there is no contact between
    4376              :       !any P^0 and Q^b, then image b does not contribute
    4377              :       CALL build_2c_neighbor_lists(nl_2c, basis_set_RI, basis_set_RI, ri_data%hfx_pot, &
    4378           80 :                                    "HFX_2c_nl_RI", qs_env, sym_ij=.FALSE., dist_2d=dist_2d)
    4379              : 
    4380          240 :       ALLOCATE (present_img(nimg))
    4381         3448 :       present_img = 0
    4382           80 :       ri_data%nimg = 0
    4383           80 :       CALL neighbor_list_iterator_create(nl_iterator, nl_2c)
    4384         1926 :       DO WHILE (neighbor_list_iterate(nl_iterator) == 0)
    4385         1846 :          CALL get_iterator_info(nl_iterator, r=rij, cell=cell_j)
    4386              : 
    4387         7384 :          dij = NORM2(rij)
    4388              : 
    4389         1846 :          j_img = cell_to_index(cell_j(1), cell_j(2), cell_j(3))
    4390         1846 :          IF (j_img > nimg .OR. j_img < 1) CYCLE
    4391              : 
    4392         1810 :          IF (dij > ri_data%kp_image_range) CYCLE
    4393              : 
    4394         1810 :          ri_data%nimg = MAX(j_img, ri_data%nimg)
    4395         1846 :          present_img(j_img) = 1
    4396              : 
    4397              :       END DO
    4398           80 :       CALL neighbor_list_iterator_release(nl_iterator)
    4399           80 :       CALL release_neighbor_list_sets(nl_2c)
    4400           80 :       CALL para_env%max(ri_data%nimg)
    4401           80 :       IF (ri_data%nimg > nimg) &
    4402            0 :          CPABORT("Make sure the smallest exponent of the RI-HFX basis is larger than that of the ORB basis.")
    4403              : 
    4404              :       !Keep track of which images will not contribute, so that can be ignored before calculation
    4405           80 :       CALL para_env%sum(present_img)
    4406          240 :       ALLOCATE (ri_data%present_images(ri_data%nimg))
    4407         2078 :       ri_data%present_images = 0
    4408         2078 :       DO i_img = 1, ri_data%nimg
    4409         2078 :          IF (present_img(i_img) > 0) ri_data%present_images(i_img) = 1
    4410              :       END DO
    4411              : 
    4412              :       CALL create_3c_tensor(t_dummy, dist_AO_1, dist_AO_2, dist_RI, &
    4413              :                             ri_data%pgrid, ri_data%bsizes_AO, ri_data%bsizes_AO, ri_data%bsizes_RI, &
    4414           80 :                             map1=[1, 2], map2=[3], name="(AO AO | RI)")
    4415              : 
    4416           80 :       CALL dbt_mp_environ_pgrid(ri_data%pgrid, pdims, pcoord)
    4417           80 :       CALL mp_comm_t3c%create(ri_data%pgrid%mp_comm_2d, 3, pdims)
    4418              :       CALL distribution_3d_create(dist_3d, dist_AO_1, dist_AO_2, dist_RI, &
    4419           80 :                                   nkind, particle_set, mp_comm_t3c, own_comm=.TRUE.)
    4420           80 :       DEALLOCATE (dist_RI, dist_AO_1, dist_AO_2)
    4421           80 :       CALL dbt_destroy(t_dummy)
    4422              : 
    4423              :       !For the extension of the RI basis P in (mu^0 sigma^a |P^i), we consider an atom if the distance,
    4424              :       !between mu^0 and P^i if smaller or equal to the kind radius of mu^0
    4425              :       CALL build_3c_neighbor_lists(nl_3c, basis_set_AO, basis_set_AO, basis_set_RI, dist_3d, &
    4426              :                                    ri_data%ri_metric, "HFX_3c_nl", qs_env, op_pos=2, sym_ij=.FALSE., &
    4427           80 :                                    own_dist=.TRUE.)
    4428              : 
    4429          160 :       ALLOCATE (RI_cells(nimg))
    4430         3448 :       RI_cells = 0
    4431              : 
    4432          240 :       ALLOCATE (nRI_per_atom(natom))
    4433          240 :       nRI_per_atom = 0
    4434              : 
    4435           80 :       CALL neighbor_list_3c_iterator_create(nl_3c_iter, nl_3c)
    4436        76342 :       DO WHILE (neighbor_list_3c_iterate(nl_3c_iter) == 0)
    4437              :          CALL get_3c_iterator_info(nl_3c_iter, cell_k=cell_k, rik=rik, cell_j=cell_j, &
    4438        76262 :                                    iatom=iatom, jatom=jatom, katom=katom)
    4439       305048 :          dik = NORM2(rik)
    4440              : 
    4441       533834 :          IF (ANY([cell_j(1), cell_j(2), cell_j(3)] < kp_index_lbounds) .OR. &
    4442              :              ANY([cell_j(1), cell_j(2), cell_j(3)] > kp_index_ubounds)) CYCLE
    4443              : 
    4444        76262 :          jcell = cell_to_index(cell_j(1), cell_j(2), cell_j(3))
    4445        76262 :          IF (jcell > nimg .OR. jcell < 1) CYCLE
    4446              : 
    4447       508705 :          IF (ANY([cell_k(1), cell_k(2), cell_k(3)] < kp_index_lbounds) .OR. &
    4448              :              ANY([cell_k(1), cell_k(2), cell_k(3)] > kp_index_ubounds)) CYCLE
    4449              : 
    4450        68523 :          kcell = cell_to_index(cell_k(1), cell_k(2), cell_k(3))
    4451        68523 :          IF (kcell > nimg .OR. kcell < 1) CYCLE
    4452              : 
    4453        54552 :          IF (dik > ri_data%kp_RI_range) CYCLE
    4454         6827 :          RI_cells(kcell) = 1
    4455              : 
    4456         6907 :          IF (jcell == 1 .AND. iatom == jatom) nRI_per_atom(iatom) = nRI_per_atom(iatom) + ri_data%bsizes_RI(katom)
    4457              :       END DO
    4458           80 :       CALL neighbor_list_3c_iterator_destroy(nl_3c_iter)
    4459           80 :       CALL neighbor_list_3c_destroy(nl_3c)
    4460           80 :       CALL para_env%sum(RI_cells)
    4461           80 :       CALL para_env%sum(nRI_per_atom)
    4462              : 
    4463          160 :       ALLOCATE (ri_data%img_to_RI_cell(nimg))
    4464           80 :       ri_data%ncell_RI = 0
    4465         3448 :       ri_data%img_to_RI_cell = 0
    4466         3448 :       DO i_img = 1, nimg
    4467         3448 :          IF (RI_cells(i_img) > 0) THEN
    4468          482 :             ri_data%ncell_RI = ri_data%ncell_RI + 1
    4469          482 :             ri_data%img_to_RI_cell(i_img) = ri_data%ncell_RI
    4470              :          END IF
    4471              :       END DO
    4472              : 
    4473          240 :       ALLOCATE (ri_data%RI_cell_to_img(ri_data%ncell_RI))
    4474         3448 :       DO i_img = 1, nimg
    4475         3448 :          IF (ri_data%img_to_RI_cell(i_img) > 0) ri_data%RI_cell_to_img(ri_data%img_to_RI_cell(i_img)) = i_img
    4476              :       END DO
    4477              : 
    4478              :       !Print some info
    4479           80 :       IF (ri_data%unit_nr > 0) THEN
    4480              :          WRITE (ri_data%unit_nr, FMT="(/T3,A,I29)") &
    4481           40 :             "KP-HFX_RI_INFO| Number of RI-KP parallel groups:", ngroups
    4482              :          WRITE (ri_data%unit_nr, FMT="(T3,A,I29)") &
    4483           40 :             "KP-HFX_RI_INFO| Tensor stack size:              ", ri_data%kp_stack_size
    4484              :          WRITE (ri_data%unit_nr, FMT="(T3,A,F31.3,A)") &
    4485           40 :             "KP-HFX_RI_INFO| RI basis extension radius:", ri_data%kp_RI_range*angstrom, " Ang"
    4486              :          WRITE (ri_data%unit_nr, FMT="(T3,A,F12.3,A, F6.3, A)") &
    4487           40 :             "KP-HFX_RI_INFO| RI basis bump factor and bump radius:", bump_fact, " /", &
    4488           80 :             ri_data%kp_bump_rad*angstrom, " Ang"
    4489              :          WRITE (ri_data%unit_nr, FMT="(T3,A,I16,A)") &
    4490           40 :             "KP-HFX_RI_INFO| The extended RI bases cover up to ", ri_data%ncell_RI, " unit cells"
    4491              :          WRITE (ri_data%unit_nr, FMT="(T3,A,I18)") &
    4492          120 :             "KP-HFX_RI_INFO| Average number of sgf in extended RI bases:", SUM(nRI_per_atom)/natom
    4493              :          WRITE (ri_data%unit_nr, FMT="(T3,A,F13.3,A)") &
    4494           40 :             "KP-HFX_RI_INFO| Consider all image cells within a radius of ", ri_data%kp_image_range*angstrom, " Ang"
    4495              :          WRITE (ri_data%unit_nr, FMT="(T3,A,I27/)") &
    4496           40 :             "KP-HFX_RI_INFO| Number of image cells considered: ", ri_data%nimg
    4497           40 :          CALL m_flush(ri_data%unit_nr)
    4498              :       END IF
    4499              : 
    4500           80 :       CALL timestop(handle)
    4501              : 
    4502          960 :    END SUBROUTINE get_kp_and_ri_images
    4503              : 
    4504              : ! **************************************************************************************************
    4505              : !> \brief A routine that creates tensors structure for rho_ao and 3c_ints in a stacked format for
    4506              : !>        the efficient contractions of rho_sigma^0,lambda^c * (mu^0 sigam^a | P) => TAS tensors
    4507              : !> \param res_stack ...
    4508              : !> \param rho_stack ...
    4509              : !> \param ints_stack ...
    4510              : !> \param rho_template ...
    4511              : !> \param ints_template ...
    4512              : !> \param stack_size ...
    4513              : !> \param ri_data ...
    4514              : !> \param qs_env ...
    4515              : !> \note The result tensor has the exact same shape and distribution as the integral tensor
    4516              : ! **************************************************************************************************
    4517          294 :    SUBROUTINE get_stack_tensors(res_stack, rho_stack, ints_stack, rho_template, ints_template, &
    4518              :                                 stack_size, ri_data, qs_env)
    4519              :       TYPE(dbt_type), DIMENSION(:), INTENT(INOUT)        :: res_stack, rho_stack, ints_stack
    4520              :       TYPE(dbt_type), INTENT(INOUT)                      :: rho_template, ints_template
    4521              :       INTEGER, INTENT(IN)                                :: stack_size
    4522              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    4523              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    4524              : 
    4525              :       INTEGER                                            :: is, nblks, nblks_3c(3), pdims_3d(3)
    4526          294 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: bsizes_RI_ext, bsizes_stack, dist1, &
    4527          294 :                                                             dist2, dist3, dist_stack1, &
    4528          294 :                                                             dist_stack2, dist_stack3
    4529         2646 :       TYPE(dbt_distribution_type)                        :: t_dist
    4530          882 :       TYPE(dbt_pgrid_type)                               :: pgrid
    4531              :       TYPE(mp_para_env_type), POINTER                    :: para_env
    4532              : 
    4533          294 :       NULLIFY (para_env)
    4534              : 
    4535          294 :       CALL get_qs_env(qs_env, para_env=para_env)
    4536              : 
    4537          294 :       nblks = SIZE(ri_data%bsizes_AO_split)
    4538          882 :       ALLOCATE (bsizes_stack(stack_size*nblks))
    4539         5798 :       DO is = 1, stack_size
    4540        24614 :          bsizes_stack((is - 1)*nblks + 1:is*nblks) = ri_data%bsizes_AO_split(:)
    4541              :       END DO
    4542              : 
    4543         2646 :       ALLOCATE (dist1(nblks), dist2(nblks), dist_stack1(stack_size*nblks), dist_stack2(stack_size*nblks))
    4544          294 :       CALL dbt_get_info(rho_template, proc_dist_1=dist1, proc_dist_2=dist2)
    4545         5798 :       DO is = 1, stack_size
    4546        24320 :          dist_stack1((is - 1)*nblks + 1:is*nblks) = dist1(:)
    4547        24614 :          dist_stack2((is - 1)*nblks + 1:is*nblks) = dist2(:)
    4548              :       END DO
    4549              : 
    4550              :       !First 2c tensor matches the distribution of template
    4551              :       !It is stacked in both directions
    4552          294 :       CALL dbt_distribution_new(t_dist, ri_data%pgrid_2d, dist_stack1, dist_stack2)
    4553          294 :       CALL dbt_create(rho_stack(1), "RHO_stack", t_dist, [1], [2], bsizes_stack, bsizes_stack)
    4554          294 :       CALL dbt_distribution_destroy(t_dist)
    4555          294 :       DEALLOCATE (dist1, dist2, dist_stack1, dist_stack2)
    4556              : 
    4557              :       !Second 2c tensor has optimal distribution on the 2d pgrid
    4558          294 :       CALL create_2c_tensor(rho_stack(2), dist1, dist2, ri_data%pgrid_2d, bsizes_stack, bsizes_stack, name="RHO_stack")
    4559          294 :       DEALLOCATE (dist1, dist2)
    4560              : 
    4561          294 :       CALL dbt_get_info(ints_template, nblks_total=nblks_3c)
    4562         2058 :       ALLOCATE (dist1(nblks_3c(1)), dist2(nblks_3c(2)), dist3(nblks_3c(3)))
    4563         1470 :       ALLOCATE (dist_stack3(stack_size*nblks_3c(3)), bsizes_RI_ext(nblks_3c(2)))
    4564              :       CALL dbt_get_info(ints_template, proc_dist_1=dist1, proc_dist_2=dist2, &
    4565          294 :                         proc_dist_3=dist3, blk_size_2=bsizes_RI_ext)
    4566         5798 :       DO is = 1, stack_size
    4567        24614 :          dist_stack3((is - 1)*nblks_3c(3) + 1:is*nblks_3c(3)) = dist3(:)
    4568              :       END DO
    4569              : 
    4570              :       !First 3c tensor matches the distribution of template
    4571          294 :       CALL dbt_distribution_new(t_dist, ri_data%pgrid_1, dist1, dist2, dist_stack3)
    4572              :       CALL dbt_create(ints_stack(1), "ints_stack", t_dist, [1, 2], [3], ri_data%bsizes_AO_split, &
    4573          294 :                       bsizes_RI_ext, bsizes_stack)
    4574          294 :       CALL dbt_distribution_destroy(t_dist)
    4575          294 :       DEALLOCATE (dist1, dist2, dist3, dist_stack3)
    4576              : 
    4577              :       !Second 3c tensor has optimal pgrid
    4578          294 :       pdims_3d = 0
    4579         1176 :       CALL dbt_pgrid_create(para_env, pdims_3d, pgrid, tensor_dims=[nblks_3c(1), nblks_3c(2), stack_size*nblks_3c(3)])
    4580              :       CALL create_3c_tensor(ints_stack(2), dist1, dist2, dist3, pgrid, ri_data%bsizes_AO_split, &
    4581          294 :                             bsizes_RI_ext, bsizes_stack, [1, 2], [3], name="ints_stack")
    4582          294 :       DEALLOCATE (dist1, dist2, dist3)
    4583          294 :       CALL dbt_pgrid_destroy(pgrid)
    4584              : 
    4585              :       !The result tensor has the same shape and dist as the integral tensor
    4586          294 :       CALL dbt_create(ints_stack(1), res_stack(1))
    4587          294 :       CALL dbt_create(ints_stack(2), res_stack(2))
    4588              : 
    4589          588 :    END SUBROUTINE get_stack_tensors
    4590              : 
    4591              : ! **************************************************************************************************
    4592              : !> \brief Fill the stack of 3c tensors accrding to the order in the images input
    4593              : !> \param t_3c_stack ...
    4594              : !> \param t_3c_in ...
    4595              : !> \param images ...
    4596              : !> \param stack_dim ...
    4597              : !> \param ri_data ...
    4598              : !> \param filter_at ...
    4599              : !> \param filter_dim ...
    4600              : !> \param idx_to_at ...
    4601              : !> \param img_bounds ...
    4602              : ! **************************************************************************************************
    4603        35544 :    SUBROUTINE fill_3c_stack(t_3c_stack, t_3c_in, images, stack_dim, ri_data, filter_at, filter_dim, &
    4604        35544 :                             idx_to_at, img_bounds)
    4605              :       TYPE(dbt_type), INTENT(INOUT)                      :: t_3c_stack
    4606              :       TYPE(dbt_type), DIMENSION(:), INTENT(INOUT)        :: t_3c_in
    4607              :       INTEGER, DIMENSION(:), INTENT(INOUT)               :: images
    4608              :       INTEGER, INTENT(IN)                                :: stack_dim
    4609              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    4610              :       INTEGER, INTENT(IN), OPTIONAL                      :: filter_at, filter_dim
    4611              :       INTEGER, DIMENSION(:), INTENT(INOUT), OPTIONAL     :: idx_to_at
    4612              :       INTEGER, INTENT(IN), OPTIONAL                      :: img_bounds(2)
    4613              : 
    4614              :       INTEGER                                            :: dest(3), i_img, idx, ind(3), lb, nblks, &
    4615              :                                                             nimg, offset, ub
    4616              :       LOGICAL                                            :: do_filter, found
    4617        35544 :       REAL(dp), ALLOCATABLE, DIMENSION(:, :, :)          :: blk
    4618              :       TYPE(dbt_iterator_type)                            :: iter
    4619              : 
    4620              :       !We loop over the a images from the ac_pairs, then copy the 3c ints to the correct spot in
    4621              :       !in the stack tensor (corresponding to pair index). Distributions match by construction
    4622        35544 :       nimg = ri_data%nimg
    4623        35544 :       nblks = SIZE(ri_data%bsizes_AO_split)
    4624              : 
    4625        35544 :       do_filter = .FALSE.
    4626        35118 :       IF (PRESENT(filter_at) .AND. PRESENT(filter_dim) .AND. PRESENT(idx_to_at)) do_filter = .TRUE.
    4627              : 
    4628        35544 :       lb = 1
    4629        35544 :       ub = nimg
    4630        35544 :       offset = 0
    4631        35544 :       IF (PRESENT(img_bounds)) THEN
    4632        35544 :          lb = img_bounds(1)
    4633        35544 :          ub = img_bounds(2) - 1
    4634        35544 :          offset = lb - 1
    4635              :       END IF
    4636              : 
    4637       525497 :       DO idx = lb, ub
    4638       489953 :          i_img = images(idx)
    4639       489953 :          IF (i_img == 0 .OR. i_img > nimg) CYCLE
    4640              : 
    4641              : !$OMP PARALLEL DEFAULT(NONE) &
    4642              : !$OMP SHARED(idx,i_img,t_3c_in,t_3c_stack,nblks,stack_dim,filter_at,filter_dim,idx_to_at,do_filter,offset) &
    4643       525497 : !$OMP PRIVATE(iter,ind,blk,found,dest)
    4644              :          CALL dbt_iterator_start(iter, t_3c_in(i_img))
    4645              :          DO WHILE (dbt_iterator_blocks_left(iter))
    4646              :             CALL dbt_iterator_next_block(iter, ind)
    4647              :             CALL dbt_get_block(t_3c_in(i_img), ind, blk, found)
    4648              :             IF (.NOT. found) CYCLE
    4649              : 
    4650              :             IF (do_filter) THEN
    4651              :                IF (.NOT. idx_to_at(ind(filter_dim)) == filter_at) CYCLE
    4652              :             END IF
    4653              : 
    4654              :             IF (stack_dim == 1) THEN
    4655              :                dest = [(idx - offset - 1)*nblks + ind(1), ind(2), ind(3)]
    4656              :             ELSE IF (stack_dim == 2) THEN
    4657              :                dest = [ind(1), (idx - offset - 1)*nblks + ind(2), ind(3)]
    4658              :             ELSE
    4659              :                dest = [ind(1), ind(2), (idx - offset - 1)*nblks + ind(3)]
    4660              :             END IF
    4661              : 
    4662              :             CALL dbt_put_block(t_3c_stack, dest, SHAPE(blk), blk)
    4663              :             DEALLOCATE (blk)
    4664              :          END DO
    4665              :          CALL dbt_iterator_stop(iter)
    4666              : !$OMP END PARALLEL
    4667              :       END DO !i_img
    4668        35544 :       CALL dbt_finalize(t_3c_stack)
    4669              : 
    4670        71088 :    END SUBROUTINE fill_3c_stack
    4671              : 
    4672              : ! **************************************************************************************************
    4673              : !> \brief Fill the stack of 2c tensors based on the content of images input
    4674              : !> \param t_2c_stack ...
    4675              : !> \param t_2c_in ...
    4676              : !> \param images ...
    4677              : !> \param stack_dim ...
    4678              : !> \param ri_data ...
    4679              : !> \param img_bounds ...
    4680              : !> \param shift ...
    4681              : ! **************************************************************************************************
    4682        16994 :    SUBROUTINE fill_2c_stack(t_2c_stack, t_2c_in, images, stack_dim, ri_data, img_bounds, shift)
    4683              :       TYPE(dbt_type), INTENT(INOUT)                      :: t_2c_stack
    4684              :       TYPE(dbt_type), DIMENSION(:), INTENT(INOUT)        :: t_2c_in
    4685              :       INTEGER, DIMENSION(:), INTENT(INOUT)               :: images
    4686              :       INTEGER, INTENT(IN)                                :: stack_dim
    4687              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    4688              :       INTEGER, INTENT(IN), OPTIONAL                      :: img_bounds(2), shift
    4689              : 
    4690              :       INTEGER                                            :: dest(2), i_img, idx, ind(2), lb, &
    4691              :                                                             my_shift, nblks, nimg, offset, ub
    4692              :       LOGICAL                                            :: found
    4693        16994 :       REAL(dp), ALLOCATABLE, DIMENSION(:, :)             :: blk
    4694              :       TYPE(dbt_iterator_type)                            :: iter
    4695              : 
    4696              :       !We loop over the a images from the ac_pairs, then copy the 3c ints to the correct spot in
    4697              :       !in the stack tensor (corresponding to pair index). Distributions match by construction
    4698        16994 :       nimg = ri_data%nimg
    4699        16994 :       nblks = SIZE(ri_data%bsizes_AO_split)
    4700              : 
    4701        16994 :       lb = 1
    4702        16994 :       ub = nimg
    4703        16994 :       offset = 0
    4704        16994 :       IF (PRESENT(img_bounds)) THEN
    4705        16994 :          lb = img_bounds(1)
    4706        16994 :          ub = img_bounds(2) - 1
    4707        16994 :          offset = lb - 1
    4708              :       END IF
    4709              : 
    4710        16994 :       my_shift = 1
    4711        16994 :       IF (PRESENT(shift)) my_shift = shift
    4712              : 
    4713       253764 :       DO idx = lb, ub
    4714       236770 :          i_img = images(idx)
    4715       236770 :          IF (i_img == 0 .OR. i_img > nimg) CYCLE
    4716              : 
    4717              : !$OMP PARALLEL DEFAULT(NONE) SHARED(idx,i_img,t_2c_in,t_2c_stack,nblks,stack_dim,offset,my_shift) &
    4718       253764 : !$OMP PRIVATE(iter,ind,blk,found,dest)
    4719              :          CALL dbt_iterator_start(iter, t_2c_in(i_img))
    4720              :          DO WHILE (dbt_iterator_blocks_left(iter))
    4721              :             CALL dbt_iterator_next_block(iter, ind)
    4722              :             CALL dbt_get_block(t_2c_in(i_img), ind, blk, found)
    4723              :             IF (.NOT. found) CYCLE
    4724              : 
    4725              :             IF (stack_dim == 1) THEN
    4726              :                dest = [(idx - offset - 1)*nblks + ind(1), (my_shift - 1)*nblks + ind(2)]
    4727              :             ELSE
    4728              :                dest = [(my_shift - 1)*nblks + ind(1), (idx - offset - 1)*nblks + ind(2)]
    4729              :             END IF
    4730              : 
    4731              :             CALL dbt_put_block(t_2c_stack, dest, SHAPE(blk), blk)
    4732              :             DEALLOCATE (blk)
    4733              :          END DO
    4734              :          CALL dbt_iterator_stop(iter)
    4735              : !$OMP END PARALLEL
    4736              :       END DO !idx
    4737        16994 :       CALL dbt_finalize(t_2c_stack)
    4738              : 
    4739        33988 :    END SUBROUTINE fill_2c_stack
    4740              : 
    4741              : ! **************************************************************************************************
    4742              : !> \brief Unstacks a stacked 3c tensor containing t_3c_apc
    4743              : !> \param t_3c_apc ...
    4744              : !> \param t_stacked ...
    4745              : !> \param idx ...
    4746              : ! **************************************************************************************************
    4747        20832 :    SUBROUTINE unstack_t_3c_apc(t_3c_apc, t_stacked, idx)
    4748              :       TYPE(dbt_type), INTENT(INOUT)                      :: t_3c_apc, t_stacked
    4749              :       INTEGER, INTENT(IN)                                :: idx
    4750              : 
    4751              :       INTEGER                                            :: current_idx
    4752              :       INTEGER, DIMENSION(3)                              :: ind, nblks_3c
    4753              :       LOGICAL                                            :: found
    4754        20832 :       REAL(dp), ALLOCATABLE, DIMENSION(:, :, :)          :: blk
    4755              :       TYPE(dbt_iterator_type)                            :: iter
    4756              : 
    4757              :       !Note: t_3c_apc and t_stacked must have the same ditribution
    4758        20832 :       CALL dbt_get_info(t_3c_apc, nblks_total=nblks_3c)
    4759              : 
    4760        20832 : !$OMP PARALLEL DEFAULT(NONE) SHARED(t_3c_apc,t_stacked,idx,nblks_3c) PRIVATE(iter,ind,blk,found,current_idx)
    4761              :       CALL dbt_iterator_start(iter, t_stacked)
    4762              :       DO WHILE (dbt_iterator_blocks_left(iter))
    4763              :          CALL dbt_iterator_next_block(iter, ind)
    4764              : 
    4765              :          !tensor is stacked along the 3rd dimension
    4766              :          current_idx = (ind(3) - 1)/nblks_3c(3) + 1
    4767              :          IF (.NOT. idx == current_idx) CYCLE
    4768              : 
    4769              :          CALL dbt_get_block(t_stacked, ind, blk, found)
    4770              :          IF (.NOT. found) CYCLE
    4771              : 
    4772              :          CALL dbt_put_block(t_3c_apc, [ind(1), ind(2), ind(3) - (idx - 1)*nblks_3c(3)], SHAPE(blk), blk)
    4773              :          DEALLOCATE (blk)
    4774              :       END DO
    4775              :       CALL dbt_iterator_stop(iter)
    4776              : !$OMP END PARALLEL
    4777              : 
    4778        20832 :    END SUBROUTINE unstack_t_3c_apc
    4779              : 
    4780              : ! **************************************************************************************************
    4781              : !> \brief copies the 3c integrals correspoinding to a single atom mu from the general (P^0| mu^0 sigam^a)
    4782              : !> \param t_3c_at ...
    4783              : !> \param t_3c_ints ...
    4784              : !> \param iatom ...
    4785              : !> \param dim_at ...
    4786              : !> \param idx_to_at ...
    4787              : ! **************************************************************************************************
    4788            0 :    SUBROUTINE get_atom_3c_ints(t_3c_at, t_3c_ints, iatom, dim_at, idx_to_at)
    4789              :       TYPE(dbt_type), INTENT(INOUT)                      :: t_3c_at, t_3c_ints
    4790              :       INTEGER, INTENT(IN)                                :: iatom, dim_at
    4791              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: idx_to_at
    4792              : 
    4793              :       INTEGER, DIMENSION(3)                              :: ind
    4794              :       LOGICAL                                            :: found
    4795            0 :       REAL(dp), ALLOCATABLE, DIMENSION(:, :, :)          :: blk
    4796              :       TYPE(dbt_iterator_type)                            :: iter
    4797              : 
    4798            0 : !$OMP PARALLEL DEFAULT(NONE) SHARED(t_3c_ints,t_3c_at,iatom,idx_to_at,dim_at) PRIVATE(iter,ind,blk,found)
    4799              :       CALL dbt_iterator_start(iter, t_3c_ints)
    4800              :       DO WHILE (dbt_iterator_blocks_left(iter))
    4801              :          CALL dbt_iterator_next_block(iter, ind)
    4802              :          IF (.NOT. idx_to_at(ind(dim_at)) == iatom) CYCLE
    4803              : 
    4804              :          CALL dbt_get_block(t_3c_ints, ind, blk, found)
    4805              :          IF (.NOT. found) CYCLE
    4806              : 
    4807              :          CALL dbt_put_block(t_3c_at, ind, SHAPE(blk), blk)
    4808              :          DEALLOCATE (blk)
    4809              :       END DO
    4810              :       CALL dbt_iterator_stop(iter)
    4811              : !$OMP END PARALLEL
    4812            0 :       CALL dbt_finalize(t_3c_at)
    4813              : 
    4814            0 :    END SUBROUTINE get_atom_3c_ints
    4815              : 
    4816              : ! **************************************************************************************************
    4817              : !> \brief Precalculate the 3c and 2c derivatives tensors
    4818              : !> \param t_3c_der_RI ...
    4819              : !> \param t_3c_der_AO ...
    4820              : !> \param mat_der_pot ...
    4821              : !> \param t_2c_der_metric ...
    4822              : !> \param ri_data ...
    4823              : !> \param qs_env ...
    4824              : ! **************************************************************************************************
    4825           46 :    SUBROUTINE precalc_derivatives(t_3c_der_RI, t_3c_der_AO, mat_der_pot, t_2c_der_metric, ri_data, qs_env)
    4826              :       TYPE(dbt_type), DIMENSION(:, :), INTENT(INOUT)     :: t_3c_der_RI, t_3c_der_AO
    4827              :       TYPE(dbcsr_type), DIMENSION(:, :), INTENT(INOUT)   :: mat_der_pot
    4828              :       TYPE(dbt_type), DIMENSION(:, :), INTENT(INOUT)     :: t_2c_der_metric
    4829              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    4830              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    4831              : 
    4832              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'precalc_derivatives'
    4833              : 
    4834              :       INTEGER                                            :: handle, handle2, i_img, i_mem, i_RI, &
    4835              :                                                             i_xyz, iatom, n_mem, natom, nblks_RI, &
    4836              :                                                             ncell_RI, nimg, nkind, nthreads
    4837              :       INTEGER(int_8)                                     :: nze
    4838           46 :       INTEGER, ALLOCATABLE, DIMENSION(:) :: bsizes_RI_ext, bsizes_RI_ext_split, dist_AO_1, &
    4839           92 :          dist_AO_2, dist_RI, dist_RI_ext, dummy_end, dummy_start, end_blocks, start_blocks
    4840              :       INTEGER, DIMENSION(3)                              :: pcoord, pdims
    4841           92 :       INTEGER, DIMENSION(:), POINTER                     :: col_bsize, row_bsize
    4842              :       REAL(dp)                                           :: occ
    4843              :       TYPE(dbcsr_distribution_type)                      :: dbcsr_dist
    4844              :       TYPE(dbcsr_type)                                   :: dbcsr_template
    4845           46 :       TYPE(dbcsr_type), ALLOCATABLE, DIMENSION(:, :)     :: mat_der_metric
    4846          414 :       TYPE(dbt_distribution_type)                        :: t_dist
    4847          138 :       TYPE(dbt_pgrid_type)                               :: pgrid
    4848          414 :       TYPE(dbt_type)                                     :: t_3c_template
    4849           46 :       TYPE(dbt_type), ALLOCATABLE, DIMENSION(:, :, :)    :: t_3c_der_AO_prv, t_3c_der_RI_prv
    4850              :       TYPE(dft_control_type), POINTER                    :: dft_control
    4851              :       TYPE(distribution_2d_type), POINTER                :: dist_2d
    4852              :       TYPE(distribution_3d_type)                         :: dist_3d
    4853              :       TYPE(gto_basis_set_p_type), ALLOCATABLE, &
    4854           46 :          DIMENSION(:), TARGET                            :: basis_set_AO, basis_set_RI
    4855           46 :       TYPE(mp_cart_type)                                 :: mp_comm_t3c
    4856              :       TYPE(mp_para_env_type), POINTER                    :: para_env
    4857              :       TYPE(neighbor_list_3c_type)                        :: nl_3c
    4858              :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
    4859           46 :          POINTER                                         :: nl_2c
    4860           46 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
    4861           46 :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
    4862              : 
    4863           46 :       NULLIFY (qs_kind_set, dist_2d, nl_2c, particle_set, dft_control, para_env, row_bsize, col_bsize)
    4864              : 
    4865           46 :       CALL timeset(routineN, handle)
    4866              : 
    4867              :       CALL get_qs_env(qs_env, nkind=nkind, qs_kind_set=qs_kind_set, distribution_2d=dist_2d, natom=natom, &
    4868           46 :                       particle_set=particle_set, dft_control=dft_control, para_env=para_env)
    4869              : 
    4870           46 :       nimg = ri_data%nimg
    4871           46 :       ncell_RI = ri_data%ncell_RI
    4872              : 
    4873          324 :       ALLOCATE (basis_set_RI(nkind), basis_set_AO(nkind))
    4874           46 :       CALL basis_set_list_setup(basis_set_RI, ri_data%ri_basis_type, qs_kind_set)
    4875           46 :       CALL get_particle_set(particle_set, qs_kind_set, basis=basis_set_RI)
    4876           46 :       CALL basis_set_list_setup(basis_set_AO, ri_data%orb_basis_type, qs_kind_set)
    4877           46 :       CALL get_particle_set(particle_set, qs_kind_set, basis=basis_set_AO)
    4878              : 
    4879              :       !Dealing with the 3c derivatives
    4880           46 :       nthreads = 1
    4881           46 : !$    nthreads = omp_get_num_threads()
    4882           46 :       pdims = 0
    4883          184 :       CALL dbt_pgrid_create(para_env, pdims, pgrid, tensor_dims=[MAX(1, natom/(ri_data%n_mem*nthreads)), natom, natom])
    4884              : 
    4885              :       CALL create_3c_tensor(t_3c_template, dist_AO_1, dist_AO_2, dist_RI, pgrid, &
    4886              :                             ri_data%bsizes_AO, ri_data%bsizes_AO, ri_data%bsizes_RI, &
    4887           46 :                             map1=[1, 2], map2=[3], name="tmp")
    4888           46 :       CALL dbt_destroy(t_3c_template)
    4889              : 
    4890              :       !We stack the RI basis images. Keep consistent distribution
    4891           46 :       nblks_RI = SIZE(ri_data%bsizes_RI_split)
    4892          138 :       ALLOCATE (dist_RI_ext(natom*ncell_RI))
    4893           92 :       ALLOCATE (bsizes_RI_ext(natom*ncell_RI))
    4894          138 :       ALLOCATE (bsizes_RI_ext_split(nblks_RI*ncell_RI))
    4895          318 :       DO i_RI = 1, ncell_RI
    4896          816 :          bsizes_RI_ext((i_RI - 1)*natom + 1:i_RI*natom) = ri_data%bsizes_RI(:)
    4897          816 :          dist_RI_ext((i_RI - 1)*natom + 1:i_RI*natom) = dist_RI(:)
    4898         1622 :          bsizes_RI_ext_split((i_RI - 1)*nblks_RI + 1:i_RI*nblks_RI) = ri_data%bsizes_RI_split(:)
    4899              :       END DO
    4900              : 
    4901           46 :       CALL dbt_distribution_new(t_dist, pgrid, dist_AO_1, dist_AO_2, dist_RI_ext)
    4902              :       CALL dbt_create(t_3c_template, "KP_3c_der", t_dist, [1, 2], [3], &
    4903           46 :                       ri_data%bsizes_AO, ri_data%bsizes_AO, bsizes_RI_ext)
    4904           46 :       CALL dbt_distribution_destroy(t_dist)
    4905              : 
    4906         8084 :       ALLOCATE (t_3c_der_RI_prv(nimg, 1, 3), t_3c_der_AO_prv(nimg, 1, 3))
    4907          184 :       DO i_xyz = 1, 3
    4908         3490 :          DO i_img = 1, nimg
    4909         3306 :             CALL dbt_create(t_3c_template, t_3c_der_RI_prv(i_img, 1, i_xyz))
    4910         3444 :             CALL dbt_create(t_3c_template, t_3c_der_AO_prv(i_img, 1, i_xyz))
    4911              :          END DO
    4912              :       END DO
    4913           46 :       CALL dbt_destroy(t_3c_template)
    4914              : 
    4915           46 :       CALL dbt_mp_environ_pgrid(pgrid, pdims, pcoord)
    4916           46 :       CALL mp_comm_t3c%create(pgrid%mp_comm_2d, 3, pdims)
    4917              :       CALL distribution_3d_create(dist_3d, dist_AO_1, dist_AO_2, dist_RI, &
    4918           46 :                                   nkind, particle_set, mp_comm_t3c, own_comm=.TRUE.)
    4919           46 :       DEALLOCATE (dist_RI, dist_AO_1, dist_AO_2)
    4920           46 :       CALL dbt_pgrid_destroy(pgrid)
    4921              : 
    4922              :       CALL build_3c_neighbor_lists(nl_3c, basis_set_AO, basis_set_AO, basis_set_RI, dist_3d, ri_data%ri_metric, &
    4923           46 :                                    "HFX_3c_nl", qs_env, op_pos=2, sym_jk=.FALSE., own_dist=.TRUE.)
    4924              : 
    4925           46 :       n_mem = ri_data%n_mem
    4926              :       CALL create_tensor_batches(ri_data%bsizes_RI, n_mem, dummy_start, dummy_end, &
    4927              :                                  start_blocks, end_blocks)
    4928           46 :       DEALLOCATE (dummy_start, dummy_end)
    4929              : 
    4930              :       CALL create_3c_tensor(t_3c_template, dist_RI, dist_AO_1, dist_AO_2, ri_data%pgrid_2, &
    4931              :                             bsizes_RI_ext_split, ri_data%bsizes_AO_split, ri_data%bsizes_AO_split, &
    4932           46 :                             map1=[1], map2=[2, 3], name="der (RI | AO AO)")
    4933          184 :       DO i_xyz = 1, 3
    4934         3490 :          DO i_img = 1, nimg
    4935         3306 :             CALL dbt_create(t_3c_template, t_3c_der_RI(i_img, i_xyz))
    4936         3444 :             CALL dbt_create(t_3c_template, t_3c_der_AO(i_img, i_xyz))
    4937              :          END DO
    4938              :       END DO
    4939              : 
    4940          128 :       DO i_mem = 1, n_mem
    4941              :          CALL build_3c_derivatives(t_3c_der_AO_prv, t_3c_der_RI_prv, ri_data%filter_eps, qs_env, &
    4942              :                                    nl_3c, basis_set_AO, basis_set_AO, basis_set_RI, &
    4943              :                                    ri_data%ri_metric, der_eps=ri_data%eps_schwarz_forces, op_pos=2, &
    4944              :                                    do_kpoints=.TRUE., do_hfx_kpoints=.TRUE., &
    4945              :                                    bounds_k=[start_blocks(i_mem), end_blocks(i_mem)], &
    4946          246 :                                    RI_range=ri_data%kp_RI_range, img_to_RI_cell=ri_data%img_to_RI_cell)
    4947              : 
    4948           82 :          CALL timeset(routineN//"_cpy", handle2)
    4949              :          !We go from (mu^0 sigma^i | P^j) to (P^i| sigma^j mu^0) and finally to (P^i| mu^0 sigma^j)
    4950         2186 :          DO i_img = 1, nimg
    4951         8498 :             DO i_xyz = 1, 3
    4952              :                !derivative wrt to mu^0
    4953         6312 :                CALL get_tensor_occupancy(t_3c_der_AO_prv(i_img, 1, i_xyz), nze, occ)
    4954         6312 :                IF (nze > 0) THEN
    4955              :                   CALL dbt_copy(t_3c_der_AO_prv(i_img, 1, i_xyz), t_3c_template, &
    4956         3830 :                                 order=[3, 2, 1], move_data=.TRUE.)
    4957         3830 :                   CALL dbt_filter(t_3c_template, ri_data%filter_eps)
    4958              :                   CALL dbt_copy(t_3c_template, t_3c_der_AO(i_img, i_xyz), &
    4959         3830 :                                 order=[1, 3, 2], move_data=.TRUE., summation=.TRUE.)
    4960              :                END IF
    4961              : 
    4962              :                !derivative wrt to P^i
    4963         6312 :                CALL get_tensor_occupancy(t_3c_der_RI_prv(i_img, 1, i_xyz), nze, occ)
    4964        14728 :                IF (nze > 0) THEN
    4965              :                   CALL dbt_copy(t_3c_der_RI_prv(i_img, 1, i_xyz), t_3c_template, &
    4966         3820 :                                 order=[3, 2, 1], move_data=.TRUE.)
    4967         3820 :                   CALL dbt_filter(t_3c_template, ri_data%filter_eps)
    4968              :                   CALL dbt_copy(t_3c_template, t_3c_der_RI(i_img, i_xyz), &
    4969         3820 :                                 order=[1, 3, 2], move_data=.TRUE., summation=.TRUE.)
    4970              :                END IF
    4971              :             END DO
    4972              :          END DO
    4973          210 :          CALL timestop(handle2)
    4974              :       END DO
    4975           46 :       CALL dbt_destroy(t_3c_template)
    4976              : 
    4977           46 :       CALL neighbor_list_3c_destroy(nl_3c)
    4978          184 :       DO i_xyz = 1, 3
    4979         3490 :          DO i_img = 1, nimg
    4980         3306 :             CALL dbt_destroy(t_3c_der_RI_prv(i_img, 1, i_xyz))
    4981         3444 :             CALL dbt_destroy(t_3c_der_AO_prv(i_img, 1, i_xyz))
    4982              :          END DO
    4983              :       END DO
    4984         6658 :       DEALLOCATE (t_3c_der_RI_prv, t_3c_der_AO_prv)
    4985              : 
    4986              :       !Reorder 3c derivatives to be consistant with ints
    4987           46 :       CALL reorder_3c_derivs(t_3c_der_RI, ri_data)
    4988           46 :       CALL reorder_3c_derivs(t_3c_der_AO, ri_data)
    4989              : 
    4990           46 :       CALL timeset(routineN//"_2c", handle2)
    4991              :       !The 2-center derivatives
    4992           46 :       CALL cp_dbcsr_dist2d_to_dist(dist_2d, dbcsr_dist)
    4993          138 :       ALLOCATE (row_bsize(SIZE(ri_data%bsizes_RI)))
    4994           92 :       ALLOCATE (col_bsize(SIZE(ri_data%bsizes_RI)))
    4995          138 :       row_bsize(:) = ri_data%bsizes_RI
    4996          138 :       col_bsize(:) = ri_data%bsizes_RI
    4997              : 
    4998              :       CALL dbcsr_create(dbcsr_template, "2c_der", dbcsr_dist, dbcsr_type_no_symmetry, &
    4999           46 :                         row_bsize, col_bsize)
    5000           46 :       CALL dbcsr_distribution_release(dbcsr_dist)
    5001           46 :       DEALLOCATE (col_bsize, row_bsize)
    5002              : 
    5003         3582 :       ALLOCATE (mat_der_metric(nimg, 3))
    5004          184 :       DO i_xyz = 1, 3
    5005         3490 :          DO i_img = 1, nimg
    5006         3306 :             CALL dbcsr_create(mat_der_pot(i_img, i_xyz), template=dbcsr_template)
    5007         3444 :             CALL dbcsr_create(mat_der_metric(i_img, i_xyz), template=dbcsr_template)
    5008              :          END DO
    5009              :       END DO
    5010           46 :       CALL dbcsr_release(dbcsr_template)
    5011              : 
    5012              :       !HFX potential derivatives
    5013              :       CALL build_2c_neighbor_lists(nl_2c, basis_set_RI, basis_set_RI, ri_data%hfx_pot, &
    5014           46 :                                    "HFX_2c_nl_pot", qs_env, sym_ij=.FALSE., dist_2d=dist_2d)
    5015              :       CALL build_2c_derivatives(mat_der_pot, ri_data%filter_eps_2c, qs_env, nl_2c, &
    5016           46 :                                 basis_set_RI, basis_set_RI, ri_data%hfx_pot, do_kpoints=.TRUE.)
    5017           46 :       CALL release_neighbor_list_sets(nl_2c)
    5018              : 
    5019              :       !RI metric derivatives
    5020              :       CALL build_2c_neighbor_lists(nl_2c, basis_set_RI, basis_set_RI, ri_data%ri_metric, &
    5021           46 :                                    "HFX_2c_nl_pot", qs_env, sym_ij=.FALSE., dist_2d=dist_2d)
    5022              :       CALL build_2c_derivatives(mat_der_metric, ri_data%filter_eps_2c, qs_env, nl_2c, &
    5023           46 :                                 basis_set_RI, basis_set_RI, ri_data%ri_metric, do_kpoints=.TRUE.)
    5024           46 :       CALL release_neighbor_list_sets(nl_2c)
    5025              : 
    5026              :       !Get into extended RI basis and tensor format
    5027          184 :       DO i_xyz = 1, 3
    5028          414 :          DO iatom = 1, natom
    5029          276 :             CALL dbt_create(ri_data%t_2c_inv(1, 1), t_2c_der_metric(iatom, i_xyz))
    5030              :             CALL get_ext_2c_int(t_2c_der_metric(iatom, i_xyz), mat_der_metric(:, i_xyz), &
    5031          414 :                                 iatom, iatom, 1, ri_data, qs_env)
    5032              :          END DO
    5033         3490 :          DO i_img = 1, nimg
    5034         3444 :             CALL dbcsr_release(mat_der_metric(i_img, i_xyz))
    5035              :          END DO
    5036              :       END DO
    5037           46 :       CALL timestop(handle2)
    5038              : 
    5039           46 :       CALL timestop(handle)
    5040              : 
    5041          276 :    END SUBROUTINE precalc_derivatives
    5042              : 
    5043              : ! **************************************************************************************************
    5044              : !> \brief Update the forces due to the derivative of the a 2-center product d/dR (Q|R)
    5045              : !> \param force ...
    5046              : !> \param t_2c_contr A precontracted tensor containing sum_abcdPS (ab|P)(P|Q)^-1 (R|S)^-1 (S|cd) P_ac P_bd
    5047              : !> \param t_2c_der the d/dR (Q|R) tensor, in all 3 cartesian directions
    5048              : !> \param atom_of_kind ...
    5049              : !> \param kind_of ...
    5050              : !> \param img in which periodic image the second center of the tensor is
    5051              : !> \param pref ...
    5052              : !> \param ri_data ...
    5053              : !> \param qs_env ...
    5054              : !> \param work_virial ...
    5055              : !> \param cell ...
    5056              : !> \param particle_set ...
    5057              : !> \param diag ...
    5058              : !> \param offdiag ...
    5059              : !> \note IMPORTANT: t_tc_contr and t_2c_der need to have the same distribution. Atomic block sizes are
    5060              : !>                  assumed
    5061              : ! **************************************************************************************************
    5062         3230 :    SUBROUTINE get_2c_der_force(force, t_2c_contr, t_2c_der, atom_of_kind, kind_of, img, pref, &
    5063              :                                ri_data, qs_env, work_virial, cell, particle_set, diag, offdiag)
    5064              : 
    5065              :       TYPE(qs_force_type), DIMENSION(:), POINTER         :: force
    5066              :       TYPE(dbt_type), INTENT(INOUT)                      :: t_2c_contr
    5067              :       TYPE(dbt_type), DIMENSION(:), INTENT(INOUT)        :: t_2c_der
    5068              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: atom_of_kind, kind_of
    5069              :       INTEGER, INTENT(IN)                                :: img
    5070              :       REAL(dp), INTENT(IN)                               :: pref
    5071              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    5072              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    5073              :       REAL(dp), DIMENSION(3, 3), INTENT(INOUT), OPTIONAL :: work_virial
    5074              :       TYPE(cell_type), OPTIONAL, POINTER                 :: cell
    5075              :       TYPE(particle_type), DIMENSION(:), OPTIONAL, &
    5076              :          POINTER                                         :: particle_set
    5077              :       LOGICAL, INTENT(IN), OPTIONAL                      :: diag, offdiag
    5078              : 
    5079              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'get_2c_der_force'
    5080              : 
    5081              :       INTEGER                                            :: handle, i_img, i_RI, i_xyz, iat, &
    5082              :                                                             iat_of_kind, ikind, j_img, j_RI, &
    5083              :                                                             j_xyz, jat, jat_of_kind, jkind, natom
    5084              :       INTEGER, DIMENSION(2)                              :: ind
    5085         3230 :       INTEGER, DIMENSION(:, :), POINTER                  :: index_to_cell
    5086              :       LOGICAL                                            :: found, my_diag, my_offdiag, use_virial
    5087              :       REAL(dp)                                           :: new_force
    5088         3230 :       REAL(dp), ALLOCATABLE, DIMENSION(:, :), TARGET     :: contr_blk, der_blk
    5089              :       REAL(dp), DIMENSION(3)                             :: scoord
    5090              :       TYPE(dbt_iterator_type)                            :: iter
    5091              :       TYPE(kpoint_type), POINTER                         :: kpoints
    5092              : 
    5093         3230 :       NULLIFY (kpoints, index_to_cell)
    5094              : 
    5095              :       !Loop over the blocks of d/dR (Q|R), contract with the corresponding block of t_2c_contr and
    5096              :       !update the relevant force
    5097              : 
    5098         3230 :       CALL timeset(routineN, handle)
    5099              : 
    5100         3230 :       use_virial = .FALSE.
    5101         3230 :       IF (PRESENT(work_virial) .AND. PRESENT(cell) .AND. PRESENT(particle_set)) use_virial = .TRUE.
    5102              : 
    5103         3230 :       my_diag = .FALSE.
    5104         3230 :       IF (PRESENT(diag)) my_diag = diag
    5105              : 
    5106         2584 :       my_offdiag = .FALSE.
    5107         2584 :       IF (PRESENT(diag)) my_offdiag = offdiag
    5108              : 
    5109         3230 :       CALL get_qs_env(qs_env, kpoints=kpoints, natom=natom)
    5110         3230 :       CALL get_kpoint_info(kpoints, index_to_cell=index_to_cell)
    5111              : 
    5112              : !$OMP PARALLEL DEFAULT(NONE) &
    5113              : !$OMP SHARED(t_2c_der,t_2c_contr,work_virial,force,use_virial,natom,index_to_cell,ri_data,img) &
    5114              : !$OMP SHARED(pref,atom_of_kind,kind_of,particle_set,cell,my_diag,my_offdiag) &
    5115              : !$OMP PRIVATE(i_xyz,j_xyz,iter,ind,der_blk,contr_blk,found,new_force,i_RI,i_img,j_RI,j_img) &
    5116         3230 : !$OMP PRIVATE(iat,jat,iat_of_kind,jat_of_kind,ikind,jkind,scoord)
    5117              :       DO i_xyz = 1, 3
    5118              :          CALL dbt_iterator_start(iter, t_2c_der(i_xyz))
    5119              :          DO WHILE (dbt_iterator_blocks_left(iter))
    5120              :             CALL dbt_iterator_next_block(iter, ind)
    5121              : 
    5122              :             !Only take forecs due to block diagonal or block off-diagonal, depending on arguments
    5123              :             IF ((my_diag .AND. .NOT. my_offdiag) .OR. (.NOT. my_diag .AND. my_offdiag)) THEN
    5124              :                IF (my_diag .AND. (ind(1) .NE. ind(2))) CYCLE
    5125              :                IF (my_offdiag .AND. (ind(1) == ind(2))) CYCLE
    5126              :             END IF
    5127              : 
    5128              :             CALL dbt_get_block(t_2c_der(i_xyz), ind, der_blk, found)
    5129              :             CPASSERT(found)
    5130              :             CALL dbt_get_block(t_2c_contr, ind, contr_blk, found)
    5131              : 
    5132              :             IF (found) THEN
    5133              : 
    5134              :                !an element of d/dR (Q|R) corresponds to 2 things because of translational invariance
    5135              :                !(Q'| R) = - (Q| R'), once wrt the center on Q, and once on R
    5136              :                new_force = pref*SUM(der_blk(:, :)*contr_blk(:, :))
    5137              : 
    5138              :                i_RI = (ind(1) - 1)/natom + 1
    5139              :                i_img = ri_data%RI_cell_to_img(i_RI)
    5140              :                iat = ind(1) - (i_RI - 1)*natom
    5141              :                iat_of_kind = atom_of_kind(iat)
    5142              :                ikind = kind_of(iat)
    5143              : 
    5144              :                j_RI = (ind(2) - 1)/natom + 1
    5145              :                j_img = ri_data%RI_cell_to_img(j_RI)
    5146              :                jat = ind(2) - (j_RI - 1)*natom
    5147              :                jat_of_kind = atom_of_kind(jat)
    5148              :                jkind = kind_of(jat)
    5149              : 
    5150              :                !Force on iatom (first center)
    5151              : !$OMP ATOMIC
    5152              :                force(ikind)%fock_4c(i_xyz, iat_of_kind) = force(ikind)%fock_4c(i_xyz, iat_of_kind) &
    5153              :                                                           + new_force
    5154              : 
    5155              :                IF (use_virial) THEN
    5156              : 
    5157              :                   CALL real_to_scaled(scoord, pbc(particle_set(iat)%r, cell), cell)
    5158              :                   scoord(:) = scoord(:) + REAL(index_to_cell(:, i_img), dp)
    5159              : 
    5160              :                   DO j_xyz = 1, 3
    5161              : !$OMP ATOMIC
    5162              :                      work_virial(i_xyz, j_xyz) = work_virial(i_xyz, j_xyz) + new_force*scoord(j_xyz)
    5163              :                   END DO
    5164              :                END IF
    5165              : 
    5166              :                !Force on jatom (second center)
    5167              : !$OMP ATOMIC
    5168              :                force(jkind)%fock_4c(i_xyz, jat_of_kind) = force(jkind)%fock_4c(i_xyz, jat_of_kind) &
    5169              :                                                           - new_force
    5170              : 
    5171              :                IF (use_virial) THEN
    5172              : 
    5173              :                   CALL real_to_scaled(scoord, pbc(particle_set(jat)%r, cell), cell)
    5174              :                   scoord(:) = scoord(:) + REAL(index_to_cell(:, j_img) + index_to_cell(:, img), dp)
    5175              : 
    5176              :                   DO j_xyz = 1, 3
    5177              : !$OMP ATOMIC
    5178              :                      work_virial(i_xyz, j_xyz) = work_virial(i_xyz, j_xyz) - new_force*scoord(j_xyz)
    5179              :                   END DO
    5180              :                END IF
    5181              : 
    5182              :                DEALLOCATE (contr_blk)
    5183              :             END IF
    5184              : 
    5185              :             DEALLOCATE (der_blk)
    5186              :          END DO !iter
    5187              :          CALL dbt_iterator_stop(iter)
    5188              : 
    5189              :       END DO !i_xyz
    5190              : !$OMP END PARALLEL
    5191         3230 :       CALL timestop(handle)
    5192              : 
    5193         6460 :    END SUBROUTINE get_2c_der_force
    5194              : 
    5195              : ! **************************************************************************************************
    5196              : !> \brief This routines calculates the force contribution from a trace over 3D tensors, i.e.
    5197              : !>        force = sum_ijk A_ijk B_ijk., the B tensor is (P^0| sigma^0 lambda^img), with P in the
    5198              : !>        extended RI basis. Note that all tensors are stacked along the 3rd dimension
    5199              : !> \param force ...
    5200              : !> \param t_3c_contr ...
    5201              : !> \param t_3c_der_1 ...
    5202              : !> \param t_3c_der_2 ...
    5203              : !> \param atom_of_kind ...
    5204              : !> \param kind_of ...
    5205              : !> \param idx_to_at_RI ...
    5206              : !> \param idx_to_at_AO ...
    5207              : !> \param i_images ...
    5208              : !> \param lb_img ...
    5209              : !> \param pref ...
    5210              : !> \param ri_data ...
    5211              : !> \param qs_env ...
    5212              : !> \param work_virial ...
    5213              : !> \param cell ...
    5214              : !> \param particle_set ...
    5215              : ! **************************************************************************************************
    5216         2228 :    SUBROUTINE get_force_from_3c_trace(force, t_3c_contr, t_3c_der_1, t_3c_der_2, atom_of_kind, kind_of, &
    5217         4456 :                                       idx_to_at_RI, idx_to_at_AO, i_images, lb_img, pref, &
    5218              :                                       ri_data, qs_env, work_virial, cell, particle_set)
    5219              : 
    5220              :       TYPE(qs_force_type), DIMENSION(:), POINTER         :: force
    5221              :       TYPE(dbt_type), INTENT(INOUT)                      :: t_3c_contr
    5222              :       TYPE(dbt_type), DIMENSION(3), INTENT(INOUT)        :: t_3c_der_1, t_3c_der_2
    5223              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: atom_of_kind, kind_of, idx_to_at_RI, &
    5224              :                                                             idx_to_at_AO, i_images
    5225              :       INTEGER, INTENT(IN)                                :: lb_img
    5226              :       REAL(dp), INTENT(IN)                               :: pref
    5227              :       TYPE(hfx_ri_type), INTENT(INOUT)                   :: ri_data
    5228              :       TYPE(qs_environment_type), POINTER                 :: qs_env
    5229              :       REAL(dp), DIMENSION(3, 3), INTENT(INOUT), OPTIONAL :: work_virial
    5230              :       TYPE(cell_type), OPTIONAL, POINTER                 :: cell
    5231              :       TYPE(particle_type), DIMENSION(:), OPTIONAL, &
    5232              :          POINTER                                         :: particle_set
    5233              : 
    5234              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'get_force_from_3c_trace'
    5235              : 
    5236              :       INTEGER :: handle, i_RI, i_xyz, iat, iat_of_kind, idx, ikind, j_xyz, jat, jat_of_kind, &
    5237              :          jkind, kat, kat_of_kind, kkind, nblks_AO, nblks_RI, RI_img
    5238              :       INTEGER, DIMENSION(3)                              :: ind
    5239         2228 :       INTEGER, DIMENSION(:, :), POINTER                  :: index_to_cell
    5240              :       LOGICAL                                            :: found, found_1, found_2, use_virial
    5241              :       REAL(dp)                                           :: new_force
    5242         2228 :       REAL(dp), ALLOCATABLE, DIMENSION(:, :, :), TARGET  :: contr_blk, der_blk_1, der_blk_2, &
    5243         2228 :                                                             der_blk_3
    5244              :       REAL(dp), DIMENSION(3)                             :: scoord
    5245              :       TYPE(dbt_iterator_type)                            :: iter
    5246              :       TYPE(kpoint_type), POINTER                         :: kpoints
    5247              : 
    5248         2228 :       NULLIFY (kpoints, index_to_cell)
    5249              : 
    5250         2228 :       CALL timeset(routineN, handle)
    5251              : 
    5252         2228 :       CALL get_qs_env(qs_env, kpoints=kpoints)
    5253         2228 :       CALL get_kpoint_info(kpoints, index_to_cell=index_to_cell)
    5254              : 
    5255         2228 :       nblks_RI = SIZE(ri_data%bsizes_RI_split)
    5256         2228 :       nblks_AO = SIZE(ri_data%bsizes_AO_split)
    5257              : 
    5258         2228 :       use_virial = .FALSE.
    5259         2228 :       IF (PRESENT(work_virial) .AND. PRESENT(cell) .AND. PRESENT(particle_set)) use_virial = .TRUE.
    5260              : 
    5261              : !$OMP PARALLEL DEFAULT(NONE) &
    5262              : !$OMP SHARED(t_3c_der_1, t_3c_der_2,t_3c_contr,work_virial,force,use_virial,index_to_cell,i_images,lb_img) &
    5263              : !$OMP SHARED(pref,idx_to_at_AO,atom_of_kind,kind_of,particle_set,cell,idx_to_at_RI,ri_data,nblks_RI,nblks_AO) &
    5264              : !$OMP PRIVATE(i_xyz,j_xyz,iter,ind,der_blk_1,contr_blk,found,new_force,iat,iat_of_kind,ikind,scoord) &
    5265         2228 : !$OMP PRIVATE(jat,kat,jat_of_kind,kat_of_kind,jkind,kkind,i_RI,RI_img,der_blk_2,der_blk_3,found_1,found_2,idx)
    5266              :       CALL dbt_iterator_start(iter, t_3c_contr)
    5267              :       DO WHILE (dbt_iterator_blocks_left(iter))
    5268              :          CALL dbt_iterator_next_block(iter, ind)
    5269              : 
    5270              :          CALL dbt_get_block(t_3c_contr, ind, contr_blk, found)
    5271              :          IF (found) THEN
    5272              : 
    5273              :             DO i_xyz = 1, 3
    5274              :                CALL dbt_get_block(t_3c_der_1(i_xyz), ind, der_blk_1, found_1)
    5275              :                IF (.NOT. found_1) THEN
    5276              :                   DEALLOCATE (der_blk_1)
    5277              :                   ALLOCATE (der_blk_1(SIZE(contr_blk, 1), SIZE(contr_blk, 2), SIZE(contr_blk, 3)))
    5278              :                   der_blk_1(:, :, :) = 0.0_dp
    5279              :                END IF
    5280              :                CALL dbt_get_block(t_3c_der_2(i_xyz), ind, der_blk_2, found_2)
    5281              :                IF (.NOT. found_2) THEN
    5282              :                   DEALLOCATE (der_blk_2)
    5283              :                   ALLOCATE (der_blk_2(SIZE(contr_blk, 1), SIZE(contr_blk, 2), SIZE(contr_blk, 3)))
    5284              :                   der_blk_2(:, :, :) = 0.0_dp
    5285              :                END IF
    5286              : 
    5287              :                ALLOCATE (der_blk_3(SIZE(contr_blk, 1), SIZE(contr_blk, 2), SIZE(contr_blk, 3)))
    5288              :                der_blk_3(:, :, :) = -(der_blk_1(:, :, :) + der_blk_2(:, :, :))
    5289              : 
    5290              :                !We assume the tensors are in the format (P^0| sigma^0 mu^a+c-b), with P a member of the
    5291              :                !extended RI basis set
    5292              : 
    5293              :                !Force for the first center (RI extended basis, zero cell)
    5294              :                new_force = pref*SUM(der_blk_1(:, :, :)*contr_blk(:, :, :))
    5295              : 
    5296              :                i_RI = (ind(1) - 1)/nblks_RI + 1
    5297              :                RI_img = ri_data%RI_cell_to_img(i_RI)
    5298              :                iat = idx_to_at_RI(ind(1) - (i_RI - 1)*nblks_RI)
    5299              :                iat_of_kind = atom_of_kind(iat)
    5300              :                ikind = kind_of(iat)
    5301              : 
    5302              : !$OMP ATOMIC
    5303              :                force(ikind)%fock_4c(i_xyz, iat_of_kind) = force(ikind)%fock_4c(i_xyz, iat_of_kind) &
    5304              :                                                           + new_force
    5305              : 
    5306              :                IF (use_virial) THEN
    5307              : 
    5308              :                   CALL real_to_scaled(scoord, pbc(particle_set(iat)%r, cell), cell)
    5309              :                   scoord(:) = scoord(:) + REAL(index_to_cell(:, RI_img), dp)
    5310              : 
    5311              :                   DO j_xyz = 1, 3
    5312              : !$OMP ATOMIC
    5313              :                      work_virial(i_xyz, j_xyz) = work_virial(i_xyz, j_xyz) + new_force*scoord(j_xyz)
    5314              :                   END DO
    5315              :                END IF
    5316              : 
    5317              :                !Force with respect to the second center (AO basis, zero cell)
    5318              :                new_force = pref*SUM(der_blk_2(:, :, :)*contr_blk(:, :, :))
    5319              :                jat = idx_to_at_AO(ind(2))
    5320              :                jat_of_kind = atom_of_kind(jat)
    5321              :                jkind = kind_of(jat)
    5322              : 
    5323              : !$OMP ATOMIC
    5324              :                force(jkind)%fock_4c(i_xyz, jat_of_kind) = force(jkind)%fock_4c(i_xyz, jat_of_kind) &
    5325              :                                                           + new_force
    5326              : 
    5327              :                IF (use_virial) THEN
    5328              : 
    5329              :                   CALL real_to_scaled(scoord, pbc(particle_set(jat)%r, cell), cell)
    5330              : 
    5331              :                   DO j_xyz = 1, 3
    5332              : !$OMP ATOMIC
    5333              :                      work_virial(i_xyz, j_xyz) = work_virial(i_xyz, j_xyz) + new_force*scoord(j_xyz)
    5334              :                   END DO
    5335              :                END IF
    5336              : 
    5337              :                !Force with respect to the third center (AO basis, apc_img - b_img)
    5338              :                !Note: tensors are stacked along the 3rd direction
    5339              :                new_force = pref*SUM(der_blk_3(:, :, :)*contr_blk(:, :, :))
    5340              :                idx = (ind(3) - 1)/nblks_AO + 1
    5341              :                kat = idx_to_at_AO(ind(3) - (idx - 1)*nblks_AO)
    5342              :                kat_of_kind = atom_of_kind(kat)
    5343              :                kkind = kind_of(kat)
    5344              : 
    5345              : !$OMP ATOMIC
    5346              :                force(kkind)%fock_4c(i_xyz, kat_of_kind) = force(kkind)%fock_4c(i_xyz, kat_of_kind) &
    5347              :                                                           + new_force
    5348              : 
    5349              :                IF (use_virial) THEN
    5350              :                   CALL real_to_scaled(scoord, pbc(particle_set(kat)%r, cell), cell)
    5351              :                   scoord(:) = scoord(:) + REAL(index_to_cell(:, i_images(lb_img - 1 + idx)), dp)
    5352              : 
    5353              :                   DO j_xyz = 1, 3
    5354              : !$OMP ATOMIC
    5355              :                      work_virial(i_xyz, j_xyz) = work_virial(i_xyz, j_xyz) + new_force*scoord(j_xyz)
    5356              :                   END DO
    5357              :                END IF
    5358              : 
    5359              :                DEALLOCATE (der_blk_1, der_blk_2, der_blk_3)
    5360              :             END DO !i_xyz
    5361              :             DEALLOCATE (contr_blk)
    5362              :          END IF !found
    5363              :       END DO !iter
    5364              :       CALL dbt_iterator_stop(iter)
    5365              : !$OMP END PARALLEL
    5366         2228 :       CALL timestop(handle)
    5367              : 
    5368         4456 :    END SUBROUTINE get_force_from_3c_trace
    5369              : 
    5370              : END MODULE
        

Generated by: LCOV version 2.0-1