LCOV - code coverage report
Current view: top level - src - rpa_grad.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:1425fcd) Lines: 918 1128 81.4 %
Date: 2024-05-08 07:14:22 Functions: 25 31 80.6 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \brief Routines to calculate RI-RPA and SOS-MP2 gradients
      10             : !> \par History
      11             : !>      10.2021 created [Frederick Stein]
      12             : ! **************************************************************************************************
      13             : MODULE rpa_grad
      14             :    USE ISO_C_BINDING,                   ONLY: C_NULL_PTR,&
      15             :                                               C_PTR,&
      16             :                                               c_associated
      17             :    USE cp_array_utils,                  ONLY: cp_1d_r_cp_type,&
      18             :                                               cp_3d_r_cp_type
      19             :    USE cp_blacs_env,                    ONLY: cp_blacs_env_type
      20             :    USE cp_fm_basic_linalg,              ONLY: cp_fm_geadd,&
      21             :                                               cp_fm_scale_and_add,&
      22             :                                               cp_fm_upper_to_full
      23             :    USE cp_fm_cholesky,                  ONLY: cp_fm_cholesky_invert
      24             :    USE cp_fm_struct,                    ONLY: cp_fm_struct_create,&
      25             :                                               cp_fm_struct_get,&
      26             :                                               cp_fm_struct_release,&
      27             :                                               cp_fm_struct_type
      28             :    USE cp_fm_types,                     ONLY: &
      29             :         cp_fm_create, cp_fm_get_info, cp_fm_indxg2l, cp_fm_indxg2p, cp_fm_release, cp_fm_set_all, &
      30             :         cp_fm_to_fm, cp_fm_to_fm_submat_general, cp_fm_type
      31             :    USE dgemm_counter_types,             ONLY: dgemm_counter_start,&
      32             :                                               dgemm_counter_stop,&
      33             :                                               dgemm_counter_type
      34             :    USE group_dist_types,                ONLY: create_group_dist,&
      35             :                                               get_group_dist,&
      36             :                                               group_dist_d1_type,&
      37             :                                               group_dist_proc,&
      38             :                                               maxsize,&
      39             :                                               release_group_dist
      40             :    USE kahan_sum,                       ONLY: accurate_dot_product,&
      41             :                                               accurate_dot_product_2
      42             :    USE kinds,                           ONLY: dp,&
      43             :                                               int_8
      44             :    USE libint_2c_3c,                    ONLY: compare_potential_types
      45             :    USE local_gemm_api,                  ONLY: LOCAL_GEMM_PU_GPU,&
      46             :                                               local_gemm,&
      47             :                                               local_gemm_create,&
      48             :                                               local_gemm_destroy,&
      49             :                                               local_gemm_set_op_threshold_gpu
      50             :    USE machine,                         ONLY: m_flush,&
      51             :                                               m_memory
      52             :    USE mathconstants,                   ONLY: pi
      53             :    USE message_passing,                 ONLY: mp_comm_type,&
      54             :                                               mp_para_env_type,&
      55             :                                               mp_request_null,&
      56             :                                               mp_request_type,&
      57             :                                               mp_waitall,&
      58             :                                               mp_waitany
      59             :    USE mp2_laplace,                     ONLY: calc_fm_mat_s_laplace
      60             :    USE mp2_ri_grad_util,                ONLY: array2fm,&
      61             :                                               create_dbcsr_gamma,&
      62             :                                               fm2array,&
      63             :                                               prepare_redistribution
      64             :    USE mp2_types,                       ONLY: mp2_type,&
      65             :                                               one_dim_int_array,&
      66             :                                               two_dim_int_array,&
      67             :                                               two_dim_real_array
      68             :    USE parallel_gemm_api,               ONLY: parallel_gemm
      69             :    USE qs_environment_types,            ONLY: get_qs_env,&
      70             :                                               qs_environment_type
      71             :    USE rpa_util,                        ONLY: calc_fm_mat_S_rpa,&
      72             :                                               remove_scaling_factor_rpa
      73             :    USE util,                            ONLY: get_limit
      74             : #include "./base/base_uses.f90"
      75             : 
      76             :    IMPLICIT NONE
      77             : 
      78             :    PRIVATE
      79             : 
      80             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'rpa_grad'
      81             : 
      82             :    PUBLIC :: rpa_grad_needed_mem, rpa_grad_type, rpa_grad_create, rpa_grad_finalize, rpa_grad_matrix_operations, rpa_grad_copy_Q
      83             : 
      84             :    TYPE sos_mp2_grad_work_type
      85             :       PRIVATE
      86             :       INTEGER, DIMENSION(:, :), ALLOCATABLE :: pair_list
      87             :       TYPE(one_dim_int_array), DIMENSION(:), ALLOCATABLE :: index2send, index2recv
      88             :       REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: P
      89             :    END TYPE
      90             : 
      91             :    TYPE rpa_grad_work_type
      92             :       TYPE(cp_fm_type) :: fm_mat_Q_copy = cp_fm_type()
      93             :       TYPE(one_dim_int_array), DIMENSION(:, :), ALLOCATABLE :: index2send
      94             :       TYPE(two_dim_int_array), DIMENSION(:, :), ALLOCATABLE :: index2recv
      95             :       TYPE(group_dist_d1_type), DIMENSION(:), ALLOCATABLE :: gd_homo, gd_virtual
      96             :       INTEGER, DIMENSION(2) :: grid = -1, mepos = -1
      97             :       TYPE(two_dim_real_array), DIMENSION(:), ALLOCATABLE :: P_ij, P_ab
      98             :    END TYPE
      99             : 
     100             :    TYPE rpa_grad_type
     101             :       PRIVATE
     102             :       TYPE(cp_fm_type) :: fm_Gamma_PQ = cp_fm_type()
     103             :       TYPE(cp_fm_type), DIMENSION(:), ALLOCATABLE :: fm_Y
     104             :       TYPE(sos_mp2_grad_work_type), ALLOCATABLE, DIMENSION(:) :: sos_mp2_work_occ, sos_mp2_work_virt
     105             :       TYPE(rpa_grad_work_type) :: rpa_work
     106             :    END TYPE
     107             : 
     108             :    INTEGER, PARAMETER :: spla_threshold = 128*128*128*2
     109             :    INTEGER, PARAMETER :: blksize_threshold = 4
     110             : 
     111             : CONTAINS
     112             : 
     113             : ! **************************************************************************************************
     114             : !> \brief Calculates the necessary minimum memory for the Gradient code ion MiB
     115             : !> \param homo ...
     116             : !> \param virtual ...
     117             : !> \param dimen_RI ...
     118             : !> \param mem_per_rank ...
     119             : !> \param mem_per_repl ...
     120             : !> \param do_ri_sos_laplace_mp2 ...
     121             : !> \return ...
     122             : ! **************************************************************************************************
     123          40 :    PURE SUBROUTINE rpa_grad_needed_mem(homo, virtual, dimen_RI, mem_per_rank, mem_per_repl, do_ri_sos_laplace_mp2)
     124             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
     125             :       INTEGER, INTENT(IN)                                :: dimen_RI
     126             :       REAL(KIND=dp), INTENT(INOUT)                       :: mem_per_rank, mem_per_repl
     127             :       LOGICAL, INTENT(IN)                                :: do_ri_sos_laplace_mp2
     128             : 
     129             :       REAL(KIND=dp)                                      :: mem_iaK, mem_KL, mem_pab, mem_pij
     130             : 
     131          88 :       mem_iaK = SUM(REAL(virtual, KIND=dp)*homo)*dimen_RI
     132          88 :       mem_pij = SUM(REAL(homo, KIND=dp)**2)
     133          88 :       mem_pab = SUM(REAL(virtual, KIND=dp)**2)
     134          40 :       mem_KL = REAL(dimen_RI, KIND=dp)*dimen_RI
     135             : 
     136             :       ! Required matrices iaK
     137             :       ! Ytot_iaP = sum_tau Y_iaP(tau)
     138             :       ! Y_iaP(tau) = S_iaP(tau)*Q_PQ(tau) (work array)
     139             :       ! Required matrices density matrices
     140             :       ! Pij (local)
     141             :       ! Pab (local)
     142             :       ! Additionally with SOS-MP2
     143             :       ! Send and receive buffers for degenerate orbital pairs (rough estimate: everything)
     144             :       ! Additionally with RPA
     145             :       ! copy of work matrix
     146             :       ! receive buffer for calculation of density matrix
     147             :       ! copy of matrix Q
     148          40 :       mem_per_rank = mem_per_rank + (mem_pij + mem_pab)*8.0_dp/(1024**2)
     149          40 :       mem_per_repl = mem_per_repl + (mem_iaK + 2.0_dp*mem_iaK/SIZE(homo) + mem_KL)*8.0_dp/(1024**2)
     150          40 :       IF (.NOT. do_ri_sos_laplace_mp2) THEN
     151          20 :          mem_per_repl = mem_per_rank + (mem_iaK/SIZE(homo) + mem_KL)*8.0_dp/(1024**2)
     152             :       END IF
     153             : 
     154          40 :    END SUBROUTINE rpa_grad_needed_mem
     155             : 
     156             : ! **************************************************************************************************
     157             : !> \brief Creates the arrays of a rpa_grad_type
     158             : !> \param rpa_grad ...
     159             : !> \param fm_mat_Q ...
     160             : !> \param fm_mat_S ...
     161             : !> \param homo ...
     162             : !> \param virtual ...
     163             : !> \param mp2_env ...
     164             : !> \param Eigenval ...
     165             : !> \param unit_nr ...
     166             : !> \param do_ri_sos_laplace_mp2 ...
     167             : ! **************************************************************************************************
     168         280 :    SUBROUTINE rpa_grad_create(rpa_grad, fm_mat_Q, fm_mat_S, &
     169          40 :                               homo, virtual, mp2_env, Eigenval, unit_nr, do_ri_sos_laplace_mp2)
     170             :       TYPE(rpa_grad_type), INTENT(OUT)                   :: rpa_grad
     171             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_mat_Q
     172             :       TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: fm_mat_S
     173             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
     174             :       TYPE(mp2_type), INTENT(INOUT)                      :: mp2_env
     175             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
     176             :       INTEGER, INTENT(IN)                                :: unit_nr
     177             :       LOGICAL, INTENT(IN)                                :: do_ri_sos_laplace_mp2
     178             : 
     179             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'rpa_grad_create'
     180             : 
     181             :       INTEGER                                            :: handle, ispin, nrow_local, nspins
     182             : 
     183          40 :       CALL timeset(routineN, handle)
     184             : 
     185          40 :       CALL cp_fm_create(rpa_grad%fm_Gamma_PQ, matrix_struct=fm_mat_Q%matrix_struct)
     186          40 :       CALL cp_fm_set_all(rpa_grad%fm_Gamma_PQ, 0.0_dp)
     187             : 
     188          40 :       nspins = SIZE(fm_mat_S)
     189             : 
     190         168 :       ALLOCATE (rpa_grad%fm_Y(nspins))
     191          88 :       DO ispin = 1, nspins
     192          88 :          CALL cp_fm_create(rpa_grad%fm_Y(ispin), fm_mat_S(ispin)%matrix_struct)
     193             :       END DO
     194             : 
     195          40 :       IF (do_ri_sos_laplace_mp2) THEN
     196             :          CALL sos_mp2_work_type_create(rpa_grad%sos_mp2_work_occ, rpa_grad%sos_mp2_work_virt, &
     197          20 :                                        unit_nr, Eigenval, homo, virtual, mp2_env%ri_grad%eps_canonical, fm_mat_S)
     198             :       ELSE
     199          20 :          CALL rpa_work_type_create(rpa_grad%rpa_work, fm_mat_Q, fm_mat_S, homo, virtual)
     200             :       END IF
     201             : 
     202             :       ! Set blocksize
     203          40 :       CALL cp_fm_struct_get(fm_mat_S(1)%matrix_struct, nrow_local=nrow_local)
     204          40 :       IF (mp2_env%ri_grad%dot_blksize < 1) mp2_env%ri_grad%dot_blksize = nrow_local
     205          40 :       mp2_env%ri_grad%dot_blksize = MIN(mp2_env%ri_grad%dot_blksize, nrow_local)
     206          40 :       IF (unit_nr > 0) THEN
     207          20 :          WRITE (unit_nr, '(T3,A,T75,I6)') 'GRAD_INFO| Block size for the contraction:', mp2_env%ri_grad%dot_blksize
     208          20 :          CALL m_flush(unit_nr)
     209             :       END IF
     210          40 :       CALL fm_mat_S(1)%matrix_struct%para_env%sync()
     211             : 
     212          40 :       CALL timestop(handle)
     213             : 
     214          80 :    END SUBROUTINE rpa_grad_create
     215             : 
     216             : ! **************************************************************************************************
     217             : !> \brief ...
     218             : !> \param sos_mp2_work_occ ...
     219             : !> \param sos_mp2_work_virt ...
     220             : !> \param unit_nr ...
     221             : !> \param Eigenval ...
     222             : !> \param homo ...
     223             : !> \param virtual ...
     224             : !> \param eps_degenerate ...
     225             : !> \param fm_mat_S ...
     226             : ! **************************************************************************************************
     227          20 :    SUBROUTINE sos_mp2_work_type_create(sos_mp2_work_occ, sos_mp2_work_virt, unit_nr, &
     228          20 :                                        Eigenval, homo, virtual, eps_degenerate, fm_mat_S)
     229             :       TYPE(sos_mp2_grad_work_type), ALLOCATABLE, &
     230             :          DIMENSION(:), INTENT(OUT)                       :: sos_mp2_work_occ, sos_mp2_work_virt
     231             :       INTEGER, INTENT(IN)                                :: unit_nr
     232             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
     233             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
     234             :       REAL(KIND=dp), INTENT(IN)                          :: eps_degenerate
     235             :       TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: fm_mat_S
     236             : 
     237             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'sos_mp2_work_type_create'
     238             : 
     239             :       INTEGER                                            :: handle, ispin, nspins
     240             : 
     241          20 :       CALL timeset(routineN, handle)
     242             : 
     243          20 :       nspins = SIZE(fm_mat_S)
     244         148 :       ALLOCATE (sos_mp2_work_occ(nspins), sos_mp2_work_virt(nspins))
     245          44 :       DO ispin = 1, nspins
     246             : 
     247             :          CALL create_list_nearly_degen_pairs(Eigenval(1:homo(ispin), ispin), &
     248          24 :                                              eps_degenerate, sos_mp2_work_occ(ispin)%pair_list)
     249          24 :          IF (unit_nr > 0) WRITE (unit_nr, "(T3,A,T75,i6)") &
     250          12 :             "MO_INFO| Number of ij pairs below EPS_CANONICAL:", SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)
     251          72 :          ALLOCATE (sos_mp2_work_occ(ispin)%P(homo(ispin) + SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)))
     252         164 :          sos_mp2_work_occ(ispin)%P = 0.0_dp
     253          24 :          CALL prepare_comm_Pij(sos_mp2_work_occ(ispin), virtual(ispin), fm_mat_S(ispin))
     254             : 
     255             :          CALL create_list_nearly_degen_pairs(Eigenval(homo(ispin) + 1:, ispin), &
     256          24 :                                              eps_degenerate, sos_mp2_work_virt(ispin)%pair_list)
     257          24 :          IF (unit_nr > 0) WRITE (unit_nr, "(T3,A,T75,i6)") &
     258          12 :             "MO_INFO| Number of ab pairs below EPS_CANONICAL:", SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)
     259          72 :          ALLOCATE (sos_mp2_work_virt(ispin)%P(virtual(ispin) + SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)))
     260        1136 :          sos_mp2_work_virt(ispin)%P = 0.0_dp
     261          44 :          CALL prepare_comm_Pab(sos_mp2_work_virt(ispin), virtual(ispin), fm_mat_S(ispin))
     262             :       END DO
     263             : 
     264          20 :       CALL timestop(handle)
     265             : 
     266          20 :    END SUBROUTINE
     267             : 
     268             : ! **************************************************************************************************
     269             : !> \brief ...
     270             : !> \param rpa_work ...
     271             : !> \param fm_mat_Q ...
     272             : !> \param fm_mat_S ...
     273             : !> \param homo ...
     274             : !> \param virtual ...
     275             : ! **************************************************************************************************
     276         120 :    SUBROUTINE rpa_work_type_create(rpa_work, fm_mat_Q, fm_mat_S, homo, virtual)
     277             :       TYPE(rpa_grad_work_type), INTENT(OUT)              :: rpa_work
     278             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_mat_Q
     279             :       TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: fm_mat_S
     280             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
     281             : 
     282             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_work_type_create'
     283             : 
     284             :       INTEGER :: avirt, col_global, col_local, first_p_pos(2), first_p_pos_col, handle, iocc, &
     285             :          ispin, my_a, my_a_end, my_a_size, my_a_start, my_i, my_i_end, my_i_size, my_i_start, &
     286             :          my_pcol, ncol_block, ncol_local, nspins, num_pe_col, proc_homo, proc_homo_send, &
     287             :          proc_recv, proc_send, proc_virtual, proc_virtual_send
     288          20 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: data2recv, data2send
     289          20 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices
     290             : 
     291          20 :       CALL timeset(routineN, handle)
     292             : 
     293          20 :       CALL cp_fm_create(rpa_work%fm_mat_Q_copy, matrix_struct=fm_mat_Q%matrix_struct)
     294             : 
     295          20 :       CALL fm_mat_S(1)%matrix_struct%context%get(number_of_process_columns=num_pe_col, my_process_column=my_pcol)
     296             : 
     297          20 :       nspins = SIZE(fm_mat_S)
     298             : 
     299           0 :       ALLOCATE (rpa_work%index2send(0:num_pe_col - 1, nspins), &
     300           0 :                 rpa_work%index2recv(0:num_pe_col - 1, nspins), &
     301           0 :                 rpa_work%gd_homo(nspins), rpa_work%gd_virtual(nspins), &
     302             :                 data2send(0:num_pe_col - 1), data2recv(0:num_pe_col - 1), &
     303         512 :                 rpa_work%P_ij(nspins), rpa_work%P_ab(nspins))
     304             : 
     305             :       ! Determine new process grid
     306          20 :       proc_homo = MAX(1, CEILING(SQRT(REAL(num_pe_col, KIND=dp))))
     307          20 :       DO WHILE (MOD(num_pe_col, proc_homo) /= 0)
     308           0 :          proc_homo = proc_homo - 1
     309             :       END DO
     310          20 :       proc_virtual = num_pe_col/proc_homo
     311             : 
     312          20 :       rpa_work%grid(1) = proc_virtual
     313          20 :       rpa_work%grid(2) = proc_homo
     314             : 
     315          20 :       rpa_work%mepos(1) = MOD(my_pcol, proc_virtual)
     316          20 :       rpa_work%mepos(2) = my_pcol/proc_virtual
     317             : 
     318          44 :       DO ispin = 1, nspins
     319             : 
     320             :          ! Determine distributions of the orbitals
     321          24 :          CALL create_group_dist(rpa_work%gd_homo(ispin), proc_homo, homo(ispin))
     322          24 :          CALL create_group_dist(rpa_work%gd_virtual(ispin), proc_virtual, virtual(ispin))
     323             : 
     324             :          CALL cp_fm_struct_get(fm_mat_S(ispin)%matrix_struct, ncol_local=ncol_local, col_indices=col_indices, &
     325          24 :                                first_p_pos=first_p_pos, ncol_block=ncol_block)
     326          24 :          first_p_pos_col = first_p_pos(2)
     327             : 
     328          48 :          data2send = 0
     329             :          ! Count the amount of data2send to each process
     330        1924 :          DO col_local = 1, ncol_local
     331        1900 :             col_global = col_indices(col_local)
     332             : 
     333        1900 :             iocc = (col_global - 1)/virtual(ispin) + 1
     334        1900 :             avirt = col_global - (iocc - 1)*virtual(ispin)
     335             : 
     336        1900 :             proc_homo_send = group_dist_proc(rpa_work%gd_homo(ispin), iocc)
     337        1900 :             proc_virtual_send = group_dist_proc(rpa_work%gd_virtual(ispin), avirt)
     338             : 
     339        1900 :             proc_send = proc_homo_send*proc_virtual + proc_virtual_send
     340             : 
     341        1924 :             data2send(proc_send) = data2send(proc_send) + 1
     342             :          END DO
     343             : 
     344          48 :          DO proc_send = 0, num_pe_col - 1
     345          96 :             ALLOCATE (rpa_work%index2send(proc_send, ispin)%array(data2send(proc_send)))
     346             :          END DO
     347             : 
     348             :          ! Prepare the indices
     349          48 :          data2send = 0
     350        1924 :          DO col_local = 1, ncol_local
     351        1900 :             col_global = col_indices(col_local)
     352             : 
     353        1900 :             iocc = (col_global - 1)/virtual(ispin) + 1
     354        1900 :             avirt = col_global - (iocc - 1)*virtual(ispin)
     355             : 
     356        1900 :             proc_homo_send = group_dist_proc(rpa_work%gd_homo(ispin), iocc)
     357        1900 :             proc_virtual_send = group_dist_proc(rpa_work%gd_virtual(ispin), avirt)
     358             : 
     359        1900 :             proc_send = proc_homo_send*proc_virtual + proc_virtual_send
     360             : 
     361        1900 :             data2send(proc_send) = data2send(proc_send) + 1
     362             : 
     363        1924 :             rpa_work%index2send(proc_send, ispin)%array(data2send(proc_send)) = col_local
     364             :          END DO
     365             : 
     366             :          ! Count the amount of data2recv from each process
     367          24 :          CALL get_group_dist(rpa_work%gd_homo(ispin), my_pcol/proc_virtual, my_i_start, my_i_end, my_i_size)
     368          24 :          CALL get_group_dist(rpa_work%gd_virtual(ispin), MOD(my_pcol, proc_virtual), my_a_start, my_a_end, my_a_size)
     369             : 
     370          48 :          data2recv = 0
     371         116 :          DO my_i = my_i_start, my_i_end
     372        2016 :          DO my_a = my_a_start, my_a_end
     373        1900 :             proc_recv = cp_fm_indxg2p((my_i - 1)*virtual(ispin) + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
     374        1992 :             data2recv(proc_recv) = data2recv(proc_recv) + 1
     375             :          END DO
     376             :          END DO
     377             : 
     378          48 :          DO proc_recv = 0, num_pe_col - 1
     379          96 :             ALLOCATE (rpa_work%index2recv(proc_recv, ispin)%array(2, data2recv(proc_recv)))
     380             :          END DO
     381             : 
     382          48 :          data2recv = 0
     383         116 :          DO my_i = my_i_start, my_i_end
     384        2016 :          DO my_a = my_a_start, my_a_end
     385        1900 :             proc_recv = cp_fm_indxg2p((my_i - 1)*virtual(ispin) + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
     386        1900 :             data2recv(proc_recv) = data2recv(proc_recv) + 1
     387             : 
     388        1900 :             rpa_work%index2recv(proc_recv, ispin)%array(2, data2recv(proc_recv)) = my_i - my_i_start + 1
     389        1992 :             rpa_work%index2recv(proc_recv, ispin)%array(1, data2recv(proc_recv)) = my_a - my_a_start + 1
     390             :          END DO
     391             :          END DO
     392             : 
     393           0 :          ALLOCATE (rpa_work%P_ij(ispin)%array(my_i_size, homo(ispin)), &
     394         168 :                    rpa_work%P_ab(ispin)%array(my_a_size, virtual(ispin)))
     395         472 :          rpa_work%P_ij(ispin)%array(:, :) = 0.0_dp
     396       11148 :          rpa_work%P_ab(ispin)%array(:, :) = 0.0_dp
     397             : 
     398             :       END DO
     399             : 
     400          20 :       DEALLOCATE (data2send, data2recv)
     401             : 
     402          20 :       CALL timestop(handle)
     403             : 
     404          40 :    END SUBROUTINE
     405             : 
     406             : ! **************************************************************************************************
     407             : !> \brief ...
     408             : !> \param Eigenval ...
     409             : !> \param eps_degen ...
     410             : !> \param pair_list ...
     411             : ! **************************************************************************************************
     412          48 :    SUBROUTINE create_list_nearly_degen_pairs(Eigenval, eps_degen, pair_list)
     413             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
     414             :       REAL(KIND=dp), INTENT(IN)                          :: eps_degen
     415             :       INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: pair_list
     416             : 
     417             :       INTEGER                                            :: my_i, my_j, num_orbitals, num_pairs, &
     418             :                                                             pair_counter
     419             : 
     420          48 :       num_orbitals = SIZE(Eigenval)
     421             : 
     422             : ! Determine number of nearly degenerate orbital pairs
     423             : ! Trivial cases: diagonal elements
     424          48 :       num_pairs = 0
     425         640 :       DO my_i = 1, num_orbitals
     426       11576 :       DO my_j = 1, num_orbitals
     427       10936 :          IF (my_i == my_j) CYCLE
     428       10936 :          IF (ABS(Eigenval(my_i) - Eigenval(my_j)) < eps_degen) num_pairs = num_pairs + 1
     429             :       END DO
     430             :       END DO
     431         104 :       ALLOCATE (pair_list(2, num_pairs))
     432             : 
     433             : ! Print the required pairs
     434          48 :       pair_counter = 1
     435         640 :       DO my_i = 1, num_orbitals
     436       11576 :       DO my_j = 1, num_orbitals
     437       10936 :          IF (my_i == my_j) CYCLE
     438       10936 :          IF (ABS(Eigenval(my_i) - Eigenval(my_j)) < eps_degen) THEN
     439         660 :             pair_list(1, pair_counter) = my_i
     440         660 :             pair_list(2, pair_counter) = my_j
     441         660 :             pair_counter = pair_counter + 1
     442             :          END IF
     443             :       END DO
     444             :       END DO
     445             : 
     446          48 :    END SUBROUTINE create_list_nearly_degen_pairs
     447             : 
     448             : ! **************************************************************************************************
     449             : !> \brief ...
     450             : !> \param sos_mp2_work ...
     451             : !> \param virtual ...
     452             : !> \param fm_mat_S ...
     453             : ! **************************************************************************************************
     454          24 :    SUBROUTINE prepare_comm_Pij(sos_mp2_work, virtual, fm_mat_S)
     455             :       TYPE(sos_mp2_grad_work_type), INTENT(INOUT)        :: sos_mp2_work
     456             :       INTEGER, INTENT(IN)                                :: virtual
     457             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_mat_S
     458             : 
     459             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'prepare_comm_Pij'
     460             : 
     461             :       INTEGER :: avirt, col_global, col_local, counter, first_p_pos(2), first_p_pos_col, handle, &
     462             :          ij_counter, iocc, my_i, my_j, my_pcol, my_prow, ncol_block, ncol_local, nrow_local, &
     463             :          num_ij_pairs, num_pe_col, pcol, pcol_recv, pcol_send, proc_shift, tag
     464          24 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: data2recv, data2send
     465          24 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, ncol_locals
     466          24 :       INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi
     467             :       TYPE(cp_blacs_env_type), POINTER                   :: context
     468             :       TYPE(mp_comm_type)                                 :: comm_exchange
     469             :       TYPE(mp_para_env_type), POINTER                    :: para_env
     470             : 
     471          24 :       CALL timeset(routineN, handle)
     472             : 
     473          24 :       tag = 44
     474             : 
     475          24 :       CALL fm_mat_S%matrix_struct%context%get(number_of_process_columns=num_pe_col)
     476           0 :       ALLOCATE (sos_mp2_work%index2send(0:num_pe_col - 1), &
     477         168 :                 sos_mp2_work%index2recv(0:num_pe_col - 1))
     478             : 
     479          72 :       ALLOCATE (data2send(0:num_pe_col - 1))
     480          72 :       ALLOCATE (data2recv(0:num_pe_col - 1))
     481             : 
     482             :       CALL cp_fm_struct_get(fm_mat_S%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
     483             :                             ncol_local=ncol_local, col_indices=col_indices, &
     484          24 :                             context=context, ncol_block=ncol_block, first_p_pos=first_p_pos, nrow_local=nrow_local)
     485             :       CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
     486          24 :                        blacs2mpi=blacs2mpi)
     487          24 :       first_p_pos_col = first_p_pos(2)
     488             : 
     489          24 :       num_ij_pairs = SIZE(sos_mp2_work%pair_list, 2)
     490             : 
     491          24 :       IF (num_ij_pairs > 0) THEN
     492             : 
     493           4 :          CALL comm_exchange%from_split(para_env, my_prow)
     494             : 
     495           8 :          data2send = 0
     496           8 :          data2recv = 0
     497             : 
     498           8 :          DO proc_shift = 0, num_pe_col - 1
     499           4 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
     500             : 
     501           4 :             counter = 0
     502         308 :             DO col_local = 1, ncol_local
     503         304 :                col_global = col_indices(col_local)
     504             : 
     505         304 :                iocc = MAX(1, col_global - 1)/virtual + 1
     506         304 :                avirt = col_global - (iocc - 1)*virtual
     507             : 
     508         764 :                DO ij_counter = 1, num_ij_pairs
     509             : 
     510         760 :                   my_i = sos_mp2_work%pair_list(1, ij_counter)
     511         760 :                   my_j = sos_mp2_work%pair_list(2, ij_counter)
     512             : 
     513         760 :                   IF (iocc /= my_j) CYCLE
     514         304 :                   pcol = cp_fm_indxg2p((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
     515         304 :                   IF (pcol /= pcol_send) CYCLE
     516             : 
     517         304 :                   counter = counter + 1
     518             : 
     519         760 :                   EXIT
     520             : 
     521             :                END DO
     522             :             END DO
     523           8 :             data2send(pcol_send) = counter
     524             :          END DO
     525             : 
     526           4 :          CALL comm_exchange%alltoall(data2send, data2recv, 1)
     527             : 
     528           8 :          DO proc_shift = 0, num_pe_col - 1
     529           4 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
     530           4 :             pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
     531             : 
     532             :             ! Collect indices and exchange
     533          12 :             ALLOCATE (sos_mp2_work%index2send(pcol_send)%array(data2send(pcol_send)))
     534             : 
     535           4 :             counter = 0
     536         308 :             DO col_local = 1, ncol_local
     537         304 :                col_global = col_indices(col_local)
     538             : 
     539         304 :                iocc = MAX(1, col_global - 1)/virtual + 1
     540         304 :                avirt = col_global - (iocc - 1)*virtual
     541             : 
     542         764 :                DO ij_counter = 1, num_ij_pairs
     543             : 
     544         760 :                   my_i = sos_mp2_work%pair_list(1, ij_counter)
     545         760 :                   my_j = sos_mp2_work%pair_list(2, ij_counter)
     546             : 
     547         760 :                   IF (iocc /= my_j) CYCLE
     548         304 :                   pcol = cp_fm_indxg2p((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
     549         304 :                   IF (pcol /= pcol_send) CYCLE
     550             : 
     551         304 :                   counter = counter + 1
     552             : 
     553         304 :                   sos_mp2_work%index2send(pcol_send)%array(counter) = col_global
     554             : 
     555         760 :                   EXIT
     556             : 
     557             :                END DO
     558             :             END DO
     559             : 
     560          12 :             ALLOCATE (sos_mp2_work%index2recv(pcol_recv)%array(data2recv(pcol_recv)))
     561             :             !
     562             :             CALL para_env%sendrecv(sos_mp2_work%index2send(pcol_send)%array, blacs2mpi(my_prow, pcol_send), &
     563           4 :                                    sos_mp2_work%index2recv(pcol_recv)%array, blacs2mpi(my_prow, pcol_recv), tag)
     564             : 
     565             :             ! Convert to global coordinates to local coordinates as we always work with them
     566         312 :             DO counter = 1, data2send(pcol_send)
     567             :                sos_mp2_work%index2send(pcol_send)%array(counter) = &
     568             :                   cp_fm_indxg2l(sos_mp2_work%index2send(pcol_send)%array(counter), &
     569         308 :                                 ncol_block, 0, first_p_pos_col, num_pe_col)
     570             :             END DO
     571             :          END DO
     572             : 
     573           4 :          CALL comm_exchange%free()
     574             :       END IF
     575             : 
     576          24 :       DEALLOCATE (data2send, data2recv)
     577             : 
     578          24 :       CALL timestop(handle)
     579             : 
     580          48 :    END SUBROUTINE
     581             : 
     582             : ! **************************************************************************************************
     583             : !> \brief ...
     584             : !> \param sos_mp2_work ...
     585             : !> \param virtual ...
     586             : !> \param fm_mat_S ...
     587             : ! **************************************************************************************************
     588          24 :    SUBROUTINE prepare_comm_Pab(sos_mp2_work, virtual, fm_mat_S)
     589             :       TYPE(sos_mp2_grad_work_type), INTENT(INOUT)        :: sos_mp2_work
     590             :       INTEGER, INTENT(IN)                                :: virtual
     591             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_mat_S
     592             : 
     593             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'prepare_comm_Pab'
     594             : 
     595             :       INTEGER :: ab_counter, avirt, col_global, col_local, counter, first_p_pos(2), &
     596             :          first_p_pos_col, handle, iocc, my_a, my_b, my_pcol, my_prow, ncol_block, ncol_local, &
     597             :          nrow_local, num_ab_pairs, num_pe_col, pcol, pcol_recv, pcol_send, proc_shift, tag
     598          24 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: data2recv, data2send
     599          24 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, ncol_locals
     600          24 :       INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi
     601             :       TYPE(cp_blacs_env_type), POINTER                   :: context
     602             :       TYPE(mp_comm_type)                                 :: comm_exchange
     603             :       TYPE(mp_para_env_type), POINTER                    :: para_env
     604             : 
     605          24 :       CALL timeset(routineN, handle)
     606             : 
     607          24 :       tag = 44
     608             : 
     609          24 :       CALL fm_mat_S%matrix_struct%context%get(number_of_process_columns=num_pe_col)
     610           0 :       ALLOCATE (sos_mp2_work%index2send(0:num_pe_col - 1), &
     611         168 :                 sos_mp2_work%index2recv(0:num_pe_col - 1))
     612             : 
     613          24 :       num_ab_pairs = SIZE(sos_mp2_work%pair_list, 2)
     614          24 :       IF (num_ab_pairs > 0) THEN
     615             : 
     616             :          CALL cp_fm_struct_get(fm_mat_S%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
     617             :                                ncol_local=ncol_local, col_indices=col_indices, &
     618           4 :                                context=context, ncol_block=ncol_block, first_p_pos=first_p_pos, nrow_local=nrow_local)
     619             :          CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
     620           4 :                           blacs2mpi=blacs2mpi)
     621           4 :          first_p_pos_col = first_p_pos(2)
     622             : 
     623           4 :          CALL comm_exchange%from_split(para_env, my_prow)
     624             : 
     625          12 :          ALLOCATE (data2send(0:num_pe_col - 1))
     626          12 :          ALLOCATE (data2recv(0:num_pe_col - 1))
     627             : 
     628           8 :          data2send = 0
     629           8 :          data2recv = 0
     630           8 :          DO proc_shift = 0, num_pe_col - 1
     631           4 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
     632           4 :             pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
     633             : 
     634           4 :             counter = 0
     635         308 :             DO col_local = 1, ncol_local
     636         304 :                col_global = col_indices(col_local)
     637             : 
     638         304 :                iocc = MAX(1, col_global - 1)/virtual + 1
     639         304 :                avirt = col_global - (iocc - 1)*virtual
     640             : 
     641       15476 :                DO ab_counter = 1, num_ab_pairs
     642             : 
     643       15472 :                   my_a = sos_mp2_work%pair_list(1, ab_counter)
     644       15472 :                   my_b = sos_mp2_work%pair_list(2, ab_counter)
     645             : 
     646       15472 :                   IF (avirt /= my_b) CYCLE
     647         304 :                   pcol = cp_fm_indxg2p((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
     648         304 :                   IF (pcol /= pcol_send) CYCLE
     649             : 
     650         304 :                   counter = counter + 1
     651             : 
     652       15472 :                   EXIT
     653             : 
     654             :                END DO
     655             :             END DO
     656           8 :             data2send(pcol_send) = counter
     657             :          END DO
     658             : 
     659           4 :          CALL comm_exchange%alltoall(data2send, data2recv, 1)
     660             : 
     661           8 :          DO proc_shift = 0, num_pe_col - 1
     662           4 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
     663           4 :             pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
     664             : 
     665             :             ! Collect indices and exchange
     666          12 :             ALLOCATE (sos_mp2_work%index2send(pcol_send)%array(data2send(pcol_send)))
     667             : 
     668           4 :             counter = 0
     669         308 :             DO col_local = 1, ncol_local
     670         304 :                col_global = col_indices(col_local)
     671             : 
     672         304 :                iocc = MAX(1, col_global - 1)/virtual + 1
     673         304 :                avirt = col_global - (iocc - 1)*virtual
     674             : 
     675       15476 :                DO ab_counter = 1, num_ab_pairs
     676             : 
     677       15472 :                   my_a = sos_mp2_work%pair_list(1, ab_counter)
     678       15472 :                   my_b = sos_mp2_work%pair_list(2, ab_counter)
     679             : 
     680       15472 :                   IF (avirt /= my_b) CYCLE
     681         304 :                   pcol = cp_fm_indxg2p((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
     682         304 :                   IF (pcol /= pcol_send) CYCLE
     683             : 
     684         304 :                   counter = counter + 1
     685             : 
     686         304 :                   sos_mp2_work%index2send(pcol_send)%array(counter) = col_global
     687             : 
     688       15472 :                   EXIT
     689             : 
     690             :                END DO
     691             :             END DO
     692             : 
     693          12 :             ALLOCATE (sos_mp2_work%index2recv(pcol_recv)%array(data2recv(pcol_recv)))
     694             :             !
     695             :             CALL para_env%sendrecv(sos_mp2_work%index2send(pcol_send)%array, blacs2mpi(my_prow, pcol_send), &
     696           4 :                                    sos_mp2_work%index2recv(pcol_recv)%array, blacs2mpi(my_prow, pcol_recv), tag)
     697             : 
     698             :             ! Convert to global coordinates to local coordinates as we always work with them
     699         312 :             DO counter = 1, data2send(pcol_send)
     700             :                sos_mp2_work%index2send(pcol_send)%array(counter) = &
     701             :                   cp_fm_indxg2l(sos_mp2_work%index2send(pcol_send)%array(counter), &
     702         308 :                                 ncol_block, 0, first_p_pos_col, num_pe_col)
     703             :             END DO
     704             :          END DO
     705             : 
     706           4 :          CALL comm_exchange%free()
     707           8 :          DEALLOCATE (data2send, data2recv)
     708             : 
     709             :       END IF
     710             : 
     711          24 :       CALL timestop(handle)
     712             : 
     713          48 :    END SUBROUTINE
     714             : 
     715             : ! **************************************************************************************************
     716             : !> \brief ...
     717             : !> \param fm_mat_Q ...
     718             : !> \param rpa_grad ...
     719             : ! **************************************************************************************************
     720          48 :    SUBROUTINE rpa_grad_copy_Q(fm_mat_Q, rpa_grad)
     721             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_mat_Q
     722             :       TYPE(rpa_grad_type), INTENT(INOUT)                 :: rpa_grad
     723             : 
     724          48 :       CALL cp_fm_to_fm(fm_mat_Q, rpa_grad%rpa_work%fm_mat_Q_copy)
     725             : 
     726          48 :    END SUBROUTINE
     727             : 
     728             : ! **************************************************************************************************
     729             : !> \brief ...
     730             : !> \param mp2_env ...
     731             : !> \param rpa_grad ...
     732             : !> \param do_ri_sos_laplace_mp2 ...
     733             : !> \param fm_mat_Q ...
     734             : !> \param fm_mat_Q_gemm ...
     735             : !> \param dgemm_counter ...
     736             : !> \param fm_mat_S ...
     737             : !> \param omega ...
     738             : !> \param homo ...
     739             : !> \param virtual ...
     740             : !> \param Eigenval ...
     741             : !> \param weight ...
     742             : !> \param unit_nr ...
     743             : ! **************************************************************************************************
     744         108 :    SUBROUTINE rpa_grad_matrix_operations(mp2_env, rpa_grad, do_ri_sos_laplace_mp2, fm_mat_Q, fm_mat_Q_gemm, &
     745         108 :                                          dgemm_counter, fm_mat_S, omega, homo, virtual, Eigenval, weight, unit_nr)
     746             :       TYPE(mp2_type), INTENT(INOUT)                      :: mp2_env
     747             :       TYPE(rpa_grad_type), INTENT(INOUT)                 :: rpa_grad
     748             :       LOGICAL, INTENT(IN)                                :: do_ri_sos_laplace_mp2
     749             :       TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: fm_mat_Q, fm_mat_Q_gemm
     750             :       TYPE(dgemm_counter_type), INTENT(INOUT)            :: dgemm_counter
     751             :       TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: fm_mat_S
     752             :       REAL(KIND=dp), INTENT(IN)                          :: omega
     753             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
     754             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
     755             :       REAL(KIND=dp), INTENT(IN)                          :: weight
     756             :       INTEGER, INTENT(IN)                                :: unit_nr
     757             : 
     758             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_grad_matrix_operations'
     759             : 
     760             :       INTEGER                                            :: col_global, col_local, dimen_ia, &
     761             :                                                             dimen_RI, handle, handle2, ispin, &
     762             :                                                             jspin, ncol_local, nrow_local, nspins, &
     763             :                                                             row_local
     764         108 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, row_indices
     765             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
     766         108 :          TARGET                                          :: mat_S_3D, mat_work_iaP_3D
     767             :       TYPE(cp_fm_type)                                   :: fm_work_iaP, fm_work_PQ
     768             : 
     769         108 :       CALL timeset(routineN, handle)
     770             : 
     771         108 :       nspins = SIZE(fm_mat_Q)
     772             : 
     773             :       CALL cp_fm_get_info(fm_mat_Q(1), nrow_global=dimen_RI, nrow_local=nrow_local, ncol_local=ncol_local, &
     774         108 :                           col_indices=col_indices, row_indices=row_indices)
     775             : 
     776         108 :       IF (.NOT. do_ri_sos_laplace_mp2) THEN
     777          48 :          CALL cp_fm_create(fm_work_PQ, fm_mat_Q(1)%matrix_struct)
     778             : 
     779             :          ! calculate [1+Q(iw')]^-1
     780          48 :          CALL cp_fm_cholesky_invert(fm_mat_Q(1))
     781             :          ! symmetrize the result, fm_work_PQ is only a work matrix
     782          48 :          CALL cp_fm_upper_to_full(fm_mat_Q(1), fm_work_PQ)
     783             : 
     784          48 :          CALL cp_fm_release(fm_work_PQ)
     785             : 
     786        4144 :          DO col_local = 1, ncol_local
     787        4096 :             col_global = col_indices(col_local)
     788      164064 :             DO row_local = 1, nrow_local
     789      164016 :             IF (col_global == row_indices(row_local)) THEN
     790        3432 :                fm_mat_Q(1)%local_data(row_local, col_local) = fm_mat_Q(1)%local_data(row_local, col_local) - 1.0_dp
     791        3432 :                EXIT
     792             :             END IF
     793             :             END DO
     794             :          END DO
     795             : 
     796          48 :          CALL timeset(routineN//"_PQ", handle2)
     797          48 :          CALL dgemm_counter_start(dgemm_counter)
     798             :          CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=weight, &
     799             :                             matrix_a=rpa_grad%rpa_work%fm_mat_Q_copy, matrix_b=fm_mat_Q(1), beta=1.0_dp, &
     800          48 :                             matrix_c=rpa_grad%fm_Gamma_PQ)
     801          48 :          CALL dgemm_counter_stop(dgemm_counter, dimen_RI, dimen_RI, dimen_RI)
     802          48 :          CALL timestop(handle2)
     803             : 
     804             :          CALL cp_fm_to_fm_submat_general(fm_mat_Q(1), fm_mat_Q_gemm(1), dimen_RI, dimen_RI, 1, 1, 1, 1, &
     805          48 :                                          fm_mat_Q_gemm(1)%matrix_struct%context)
     806             :       END IF
     807             : 
     808         232 :       DO ispin = 1, nspins
     809         124 :          IF (do_ri_sos_laplace_mp2) THEN
     810             :             ! The spin of the other Q matrix is always the other spin
     811          68 :             jspin = nspins - ispin + 1
     812             :          ELSE
     813             :             ! or the first matrix in the case of RPA
     814             :             jspin = 1
     815             :          END IF
     816             : 
     817         124 :          IF (do_ri_sos_laplace_mp2) THEN
     818          68 :             CALL timeset(routineN//"_PQ", handle2)
     819          68 :             CALL dgemm_counter_start(dgemm_counter)
     820             :             CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=weight, &
     821             :                                matrix_a=fm_mat_Q(ispin), matrix_b=fm_mat_Q(jspin), beta=1.0_dp, &
     822          68 :                                matrix_c=rpa_grad%fm_Gamma_PQ)
     823          68 :             CALL dgemm_counter_stop(dgemm_counter, dimen_RI, dimen_RI, dimen_RI)
     824          68 :             CALL timestop(handle2)
     825             : 
     826             :             CALL cp_fm_to_fm_submat_general(fm_mat_Q(jspin), fm_mat_Q_gemm(jspin), dimen_RI, dimen_RI, 1, 1, 1, 1, &
     827          68 :                                             fm_mat_Q_gemm(jspin)%matrix_struct%context)
     828             :          ELSE
     829             :             CALL calc_fm_mat_S_rpa(fm_mat_S(ispin), .TRUE., virtual(ispin), Eigenval(:, ispin), &
     830          56 :                                    homo(ispin), omega, 0.0_dp)
     831             :          END IF
     832             : 
     833         124 :          CALL timeset(routineN//"_contr_S", handle2)
     834         124 :          CALL cp_fm_create(fm_work_iaP, rpa_grad%fm_Y(ispin)%matrix_struct)
     835             : 
     836         124 :          CALL cp_fm_get_info(fm_mat_S(ispin), ncol_global=dimen_ia)
     837             : 
     838         124 :          CALL dgemm_counter_start(dgemm_counter)
     839             :          CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_ia, k=dimen_RI, alpha=1.0_dp, &
     840             :                             matrix_a=fm_mat_Q_gemm(jspin), matrix_b=fm_mat_S(ispin), beta=0.0_dp, &
     841         124 :                             matrix_c=fm_work_iaP)
     842         124 :          CALL dgemm_counter_stop(dgemm_counter, dimen_ia, dimen_RI, dimen_RI)
     843         124 :          CALL timestop(handle2)
     844             : 
     845         356 :          IF (do_ri_sos_laplace_mp2) THEN
     846             :             CALL calc_P_sos_mp2(homo(ispin), fm_mat_S(ispin), fm_work_iaP, &
     847             :                                 rpa_grad%sos_mp2_work_occ(ispin), rpa_grad%sos_mp2_work_virt(ispin), &
     848          68 :                                 omega, weight, virtual(ispin), Eigenval(:, ispin), mp2_env%ri_grad%dot_blksize)
     849             : 
     850          68 :             CALL calc_fm_mat_S_laplace(fm_work_iaP, homo(ispin), virtual(ispin), Eigenval(:, ispin), omega)
     851             : 
     852          68 :             CALL cp_fm_scale_and_add(1.0_dp, rpa_grad%fm_Y(ispin), -weight, fm_work_iaP)
     853             : 
     854          68 :             CALL cp_fm_release(fm_work_iaP)
     855             :          ELSE
     856             :             ! To save memory, we add it now
     857          56 :             CALL cp_fm_scale_and_add(1.0_dp, rpa_grad%fm_Y(ispin), -weight, fm_work_iaP)
     858             : 
     859             :             ! Redistribute both matrices and deallocate fm_work_iaP
     860             :             CALL redistribute_fm_mat_S(rpa_grad%rpa_work%index2send(:, ispin), rpa_grad%rpa_work%index2recv(:, ispin), &
     861             :                                        fm_work_iaP, mat_work_iaP_3D, &
     862             :                                        rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
     863          56 :                                        rpa_grad%rpa_work%mepos)
     864          56 :             CALL cp_fm_release(fm_work_iaP)
     865             : 
     866             :             CALL redistribute_fm_mat_S(rpa_grad%rpa_work%index2send(:, ispin), rpa_grad%rpa_work%index2recv(:, ispin), &
     867             :                                        fm_mat_S(ispin), mat_S_3D, &
     868             :                                        rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
     869          56 :                                        rpa_grad%rpa_work%mepos)
     870             : 
     871             :             ! Now collect the density matrix
     872             :             CALL calc_P_rpa(mat_S_3D, mat_work_iaP_3D, rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
     873             :                             rpa_grad%rpa_work%grid, rpa_grad%rpa_work%mepos, &
     874             :                             fm_mat_S(ispin)%matrix_struct, &
     875             :                             rpa_grad%rpa_work%P_ij(ispin)%array, rpa_grad%rpa_work%P_ab(ispin)%array, &
     876          56 :                             weight, omega, Eigenval(:, ispin), homo(ispin), unit_nr, mp2_env)
     877             : 
     878          56 :             DEALLOCATE (mat_work_iaP_3D, mat_S_3D)
     879             : 
     880          56 :             CALL remove_scaling_factor_rpa(fm_mat_S(ispin), virtual(ispin), Eigenval(:, ispin), homo(ispin), omega)
     881             : 
     882             :          END IF
     883             : 
     884             :       END DO
     885             : 
     886         108 :       CALL timestop(handle)
     887             : 
     888         216 :    END SUBROUTINE
     889             : 
     890             : ! **************************************************************************************************
     891             : !> \brief ...
     892             : !> \param homo ...
     893             : !> \param fm_mat_S ...
     894             : !> \param fm_work_iaP ...
     895             : !> \param sos_mp2_work_occ ...
     896             : !> \param sos_mp2_work_virt ...
     897             : !> \param omega ...
     898             : !> \param weight ...
     899             : !> \param virtual ...
     900             : !> \param Eigenval ...
     901             : !> \param dot_blksize ...
     902             : ! **************************************************************************************************
     903         340 :    SUBROUTINE calc_P_sos_mp2(homo, fm_mat_S, fm_work_iaP, sos_mp2_work_occ, sos_mp2_work_virt, &
     904          68 :                              omega, weight, virtual, Eigenval, dot_blksize)
     905             :       INTEGER, INTENT(IN)                                :: homo
     906             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_mat_S, fm_work_iaP
     907             :       TYPE(sos_mp2_grad_work_type), INTENT(INOUT)        :: sos_mp2_work_occ, sos_mp2_work_virt
     908             :       REAL(KIND=dp), INTENT(IN)                          :: omega, weight
     909             :       INTEGER, INTENT(IN)                                :: virtual
     910             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
     911             :       INTEGER, INTENT(IN)                                :: dot_blksize
     912             : 
     913             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'calc_P_sos_mp2'
     914             : 
     915             :       INTEGER                                            :: avirt, col_global, col_local, handle, &
     916             :                                                             handle2, iocc, my_a, my_i, ncol_local, &
     917             :                                                             nrow_local, num_ab_pairs, num_ij_pairs
     918          68 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices
     919             :       REAL(KIND=dp)                                      :: ddot, trace
     920             : 
     921          68 :       CALL timeset(routineN, handle)
     922             : 
     923          68 :       CALL cp_fm_get_info(fm_mat_S, col_indices=col_indices, ncol_local=ncol_local, nrow_local=nrow_local)
     924             : 
     925          68 :       CALL timeset(routineN//"_Pij_diag", handle2)
     926         332 :       DO my_i = 1, homo
     927             :          ! Collect the contributions of the matrix elements
     928             : 
     929         264 :          trace = 0.0_dp
     930             : 
     931       20944 :          DO col_local = 1, ncol_local
     932       20680 :             col_global = col_indices(col_local)
     933             : 
     934       20680 :             iocc = MAX(1, col_global - 1)/virtual + 1
     935       20680 :             avirt = col_global - (iocc - 1)*virtual
     936             : 
     937       20680 :             IF (iocc == my_i) trace = trace + &
     938        5584 :                                      ddot(nrow_local, fm_mat_S%local_data(:, col_local), 1, fm_work_iaP%local_data(:, col_local), 1)
     939             :          END DO
     940             : 
     941         332 :          sos_mp2_work_occ%P(my_i) = sos_mp2_work_occ%P(my_i) - trace*omega*weight
     942             : 
     943             :       END DO
     944          68 :       CALL timestop(handle2)
     945             : 
     946          68 :       CALL timeset(routineN//"_Pab_diag", handle2)
     947        1448 :       DO my_a = 1, virtual
     948             :          ! Collect the contributions of the matrix elements
     949             : 
     950        1380 :          trace = 0.0_dp
     951             : 
     952      109900 :          DO col_local = 1, ncol_local
     953      108520 :             col_global = col_indices(col_local)
     954             : 
     955      108520 :             iocc = MAX(1, col_global - 1)/virtual + 1
     956      108520 :             avirt = col_global - (iocc - 1)*virtual
     957             : 
     958      108520 :             IF (avirt == my_a) trace = trace + &
     959        6700 :                                      ddot(nrow_local, fm_mat_S%local_data(:, col_local), 1, fm_work_iaP%local_data(:, col_local), 1)
     960             :          END DO
     961             : 
     962        1448 :          sos_mp2_work_virt%P(my_a) = sos_mp2_work_virt%P(my_a) + trace*omega*weight
     963             : 
     964             :       END DO
     965          68 :       CALL timestop(handle2)
     966             : 
     967             :       ! Loop over list and carry out operations
     968          68 :       num_ij_pairs = SIZE(sos_mp2_work_occ%pair_list, 2)
     969          68 :       num_ab_pairs = SIZE(sos_mp2_work_virt%pair_list, 2)
     970          68 :       IF (num_ij_pairs > 0) THEN
     971             :          CALL calc_Pij_degen(fm_work_iaP, fm_mat_S, sos_mp2_work_occ%pair_list, &
     972             :                              virtual, sos_mp2_work_occ%P(homo + 1:), Eigenval(:homo), omega, weight, &
     973           8 :                              sos_mp2_work_occ%index2send, sos_mp2_work_occ%index2recv, dot_blksize)
     974             :       END IF
     975          68 :       IF (num_ab_pairs > 0) THEN
     976             :          CALL calc_Pab_degen(fm_work_iaP, fm_mat_S, sos_mp2_work_virt%pair_list, &
     977             :                              virtual, sos_mp2_work_virt%P(virtual + 1:), Eigenval(homo + 1:), omega, weight, &
     978           8 :                              sos_mp2_work_virt%index2send, sos_mp2_work_virt%index2recv, dot_blksize)
     979             :       END IF
     980             : 
     981          68 :       CALL timestop(handle)
     982             : 
     983          68 :    END SUBROUTINE
     984             : 
     985             : ! **************************************************************************************************
     986             : !> \brief ...
     987             : !> \param mat_S_1D ...
     988             : !> \param mat_work_iaP_3D ...
     989             : !> \param gd_homo ...
     990             : !> \param gd_virtual ...
     991             : !> \param grid ...
     992             : !> \param mepos ...
     993             : !> \param fm_struct_S ...
     994             : !> \param P_ij ...
     995             : !> \param P_ab ...
     996             : !> \param weight ...
     997             : !> \param omega ...
     998             : !> \param Eigenval ...
     999             : !> \param homo ...
    1000             : !> \param unit_nr ...
    1001             : !> \param mp2_env ...
    1002             : ! **************************************************************************************************
    1003          56 :    SUBROUTINE calc_P_rpa(mat_S_1D, mat_work_iaP_3D, gd_homo, gd_virtual, grid, mepos, &
    1004          56 :                          fm_struct_S, P_ij, P_ab, weight, omega, Eigenval, homo, unit_nr, mp2_env)
    1005             :       REAL(KIND=dp), DIMENSION(*), INTENT(INOUT), TARGET :: mat_S_1D
    1006             :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT)   :: mat_work_iaP_3D
    1007             :       TYPE(group_dist_d1_type), INTENT(IN)               :: gd_homo, gd_virtual
    1008             :       INTEGER, DIMENSION(2), INTENT(IN)                  :: grid, mepos
    1009             :       TYPE(cp_fm_struct_type), INTENT(IN), POINTER       :: fm_struct_S
    1010             :       REAL(KIND=dp), DIMENSION(:, :)                     :: P_ij, P_ab
    1011             :       REAL(KIND=dp), INTENT(IN)                          :: weight, omega
    1012             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
    1013             :       INTEGER, INTENT(IN)                                :: homo, unit_nr
    1014             :       TYPE(mp2_type), INTENT(INOUT)                      :: mp2_env
    1015             : 
    1016             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'calc_P_rpa'
    1017             : 
    1018             :       INTEGER :: completed, handle, handle2, my_a_end, my_a_size, my_a_start, my_i_end, my_i_size, &
    1019             :          my_i_start, my_P_size, my_prow, number_of_parallel_channels, proc_a_recv, proc_a_send, &
    1020             :          proc_i_recv, proc_i_send, proc_recv, proc_send, proc_shift, recv_a_end, recv_a_size, &
    1021             :          recv_a_start, recv_i_end, recv_i_size, recv_i_start, tag
    1022             :       INTEGER(KIND=int_8)                                :: mem, number_of_elements_per_blk
    1023          56 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: procs_recv
    1024          56 :       INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi
    1025             :       REAL(KIND=dp)                                      :: mem_per_block, mem_real
    1026          56 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET   :: buffer_compens_1D
    1027          56 :       REAL(KIND=dp), DIMENSION(:, :, :), POINTER         :: mat_S_3D
    1028          56 :       TYPE(cp_1d_r_cp_type), ALLOCATABLE, DIMENSION(:)   :: buffer_1D
    1029          56 :       TYPE(cp_3d_r_cp_type), ALLOCATABLE, DIMENSION(:)   :: buffer_3D
    1030             :       TYPE(mp_para_env_type), POINTER                    :: para_env
    1031          56 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: recv_requests, send_requests
    1032             : 
    1033          56 :       CALL timeset(routineN, handle)
    1034             : 
    1035             :       ! We allocate it at every step to reduce potential memory conflicts with COSMA
    1036          56 :       IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
    1037          48 :          IF (.NOT. c_associated(mp2_env%local_gemm_ctx)) THEN
    1038          48 :             CALL local_gemm_create(mp2_env%local_gemm_ctx, LOCAL_GEMM_PU_GPU)
    1039          48 :             CALL local_gemm_set_op_threshold_gpu(mp2_env%local_gemm_ctx, spla_threshold)
    1040             :          END IF
    1041             :       END IF
    1042             : 
    1043          56 :       tag = 47
    1044             : 
    1045          56 :       my_P_size = SIZE(mat_work_iaP_3D, 1)
    1046             : 
    1047          56 :       CALL cp_fm_struct_get(fm_struct_S, para_env=para_env)
    1048          56 :       CALL fm_struct_S%context%get(my_process_row=my_prow, blacs2mpi=blacs2mpi, para_env=para_env)
    1049             : 
    1050          56 :       CALL get_group_dist(gd_virtual, mepos(1), my_a_start, my_a_end, my_a_size)
    1051          56 :       CALL get_group_dist(gd_homo, mepos(2), my_i_start, my_i_end, my_i_size)
    1052             : 
    1053             :       ! We have to remap the indices because mp_sendrecv requires a 3D array (because of mat_work_iaP_3D)
    1054             :       ! and dgemm requires 2D arrays
    1055             :       ! Fortran 2008 does allow pointer remapping independently of the ranks but GCC 7 does not properly support it
    1056          56 :       mat_S_3D(1:my_P_size, 1:my_a_size, 1:my_i_size) => mat_S_1D(1:INT(my_P_size, int_8)*my_a_size*my_i_size)
    1057             : 
    1058             :       number_of_elements_per_blk = MAX(INT(maxsize(gd_homo), KIND=int_8)*my_a_size, &
    1059          56 :                                        INT(maxsize(gd_virtual), KIND=int_8)*my_i_size)*my_P_size
    1060             : 
    1061             :       ! Determine the available memory and estimate the number of possible parallel communication channels
    1062          56 :       CALL m_memory(mem)
    1063          56 :       mem_real = REAL(mem, KIND=dp)
    1064          56 :       mem_per_block = REAL(number_of_elements_per_blk, KIND=dp)*8.0_dp
    1065         168 :       number_of_parallel_channels = MAX(1, MIN(MAXVAL(grid) - 1, FLOOR(mem_real/mem_per_block)))
    1066          56 :       CALL para_env%min(number_of_parallel_channels)
    1067          56 :       IF (mp2_env%ri_grad%max_parallel_comm > 0) &
    1068          56 :          number_of_parallel_channels = MIN(number_of_parallel_channels, mp2_env%ri_grad%max_parallel_comm)
    1069             : 
    1070          56 :       IF (unit_nr > 0) THEN
    1071          28 :          WRITE (unit_nr, '(T3,A,T75,I6)') 'GRAD_INFO| Number of parallel communication channels:', number_of_parallel_channels
    1072          28 :          CALL m_flush(unit_nr)
    1073             :       END IF
    1074          56 :       CALL para_env%sync()
    1075             : 
    1076         224 :       ALLOCATE (buffer_1D(number_of_parallel_channels))
    1077         112 :       DO proc_shift = 1, number_of_parallel_channels
    1078         224 :          ALLOCATE (buffer_1D(proc_shift)%array(number_of_elements_per_blk))
    1079             :       END DO
    1080             : 
    1081         224 :       ALLOCATE (buffer_3D(number_of_parallel_channels))
    1082             : 
    1083             :       ! Allocate buffers for vector version of kahan summation
    1084          56 :       IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
    1085         144 :          ALLOCATE (buffer_compens_1D(2*MAX(my_a_size*maxsize(gd_virtual), my_i_size*maxsize(gd_homo))))
    1086             :       END IF
    1087             : 
    1088          56 :       IF (number_of_parallel_channels > 1) THEN
    1089           0 :          ALLOCATE (procs_recv(number_of_parallel_channels))
    1090           0 :          ALLOCATE (recv_requests(number_of_parallel_channels))
    1091           0 :          ALLOCATE (send_requests(MAXVAL(grid) - 1))
    1092             :       END IF
    1093             : 
    1094          56 :       IF (number_of_parallel_channels > 1 .AND. grid(1) > 1) THEN
    1095           0 :          CALL timeset(routineN//"_comm_a", handle2)
    1096           0 :          recv_requests(:) = mp_request_null
    1097           0 :          procs_recv(:) = -1
    1098           0 :          DO proc_shift = 1, MIN(grid(1) - 1, number_of_parallel_channels)
    1099           0 :             proc_a_recv = MODULO(mepos(1) - proc_shift, grid(1))
    1100           0 :             proc_recv = mepos(2)*grid(1) + proc_a_recv
    1101             : 
    1102           0 :             CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)
    1103             : 
    1104             :             buffer_3D(proc_shift)%array(1:my_P_size, 1:recv_a_size, 1:my_i_size) => &
    1105           0 :                buffer_1D(proc_shift)%array(1:INT(my_P_size, KIND=int_8)*recv_a_size*my_i_size)
    1106             : 
    1107             :             CALL para_env%irecv(buffer_3D(proc_shift)%array, blacs2mpi(my_prow, proc_recv), &
    1108           0 :                                 recv_requests(proc_shift), tag)
    1109             : 
    1110           0 :             procs_recv(proc_shift) = proc_a_recv
    1111             :          END DO
    1112             : 
    1113           0 :          send_requests(:) = mp_request_null
    1114           0 :          DO proc_shift = 1, grid(1) - 1
    1115           0 :             proc_a_send = MODULO(mepos(1) + proc_shift, grid(1))
    1116           0 :             proc_send = mepos(2)*grid(1) + proc_a_send
    1117             : 
    1118             :             CALL para_env%isend(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
    1119           0 :                                 send_requests(proc_shift), tag)
    1120             :          END DO
    1121           0 :          CALL timestop(handle2)
    1122             :       END IF
    1123             : 
    1124             :       CALL calc_P_rpa_a(P_ab(:, my_a_start:my_a_end), &
    1125             :                         mat_S_3D, mat_work_iaP_3D, &
    1126             :                         mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
    1127             :                         Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
    1128          56 :                         Eigenval(homo + my_a_start:homo + my_a_end), omega, weight)
    1129             : 
    1130          56 :       DO proc_shift = 1, grid(1) - 1
    1131           0 :          CALL timeset(routineN//"_comm_a", handle2)
    1132           0 :          IF (number_of_parallel_channels > 1) THEN
    1133           0 :             CALL mp_waitany(recv_requests, completed)
    1134             : 
    1135           0 :             CALL get_group_dist(gd_virtual, procs_recv(completed), recv_a_start, recv_a_end, recv_a_size)
    1136             :          ELSE
    1137           0 :             proc_a_send = MODULO(mepos(1) + proc_shift, grid(1))
    1138           0 :             proc_a_recv = MODULO(mepos(1) - proc_shift, grid(1))
    1139             : 
    1140           0 :             proc_send = mepos(2)*grid(1) + proc_a_send
    1141           0 :             proc_recv = mepos(2)*grid(1) + proc_a_recv
    1142             : 
    1143           0 :             CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)
    1144             : 
    1145             :             buffer_3D(1)%array(1:my_P_size, 1:recv_a_size, 1:my_i_size) => &
    1146           0 :                buffer_1D(1)%array(1:INT(my_P_size, KIND=int_8)*recv_a_size*my_i_size)
    1147             : 
    1148             :             CALL para_env%sendrecv(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
    1149           0 :                                    buffer_3D(1)%array, blacs2mpi(my_prow, proc_recv), tag)
    1150           0 :             completed = 1
    1151             :          END IF
    1152           0 :          CALL timestop(handle2)
    1153             : 
    1154             :          CALL calc_P_rpa_a(P_ab(:, recv_a_start:recv_a_end), &
    1155             :                            mat_S_3D, buffer_3D(completed)%array, &
    1156             :                            mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
    1157             :                            Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
    1158           0 :                            Eigenval(homo + recv_a_start:homo + recv_a_end), omega, weight)
    1159             : 
    1160          56 :          IF (number_of_parallel_channels > 1 .AND. number_of_parallel_channels + proc_shift < grid(1)) THEN
    1161           0 :             proc_a_recv = MODULO(mepos(1) - proc_shift - number_of_parallel_channels, grid(1))
    1162           0 :             proc_recv = mepos(2)*grid(1) + proc_a_recv
    1163             : 
    1164           0 :             CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)
    1165             : 
    1166             :             buffer_3D(completed)%array(1:my_P_size, 1:recv_a_size, 1:my_i_size) => &
    1167           0 :                buffer_1D(completed)%array(1:INT(my_P_size, KIND=int_8)*recv_a_size*my_i_size)
    1168             : 
    1169             :             CALL para_env%irecv(buffer_3D(completed)%array, blacs2mpi(my_prow, proc_recv), &
    1170           0 :                                 recv_requests(completed), tag)
    1171             : 
    1172           0 :             procs_recv(completed) = proc_a_recv
    1173             :          END IF
    1174             :       END DO
    1175             : 
    1176          56 :       IF (number_of_parallel_channels > 1 .AND. grid(1) > 1) THEN
    1177           0 :          CALL mp_waitall(send_requests)
    1178             :       END IF
    1179             : 
    1180          56 :       IF (number_of_parallel_channels > 1 .AND. grid(2) > 1) THEN
    1181           0 :          recv_requests(:) = mp_request_null
    1182           0 :          procs_recv(:) = -1
    1183           0 :          DO proc_shift = 1, MIN(grid(2) - 1, number_of_parallel_channels)
    1184           0 :             proc_i_recv = MODULO(mepos(2) - proc_shift, grid(2))
    1185           0 :             proc_recv = proc_i_recv*grid(1) + mepos(1)
    1186             : 
    1187           0 :             CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_i_end, recv_i_size)
    1188             : 
    1189             :             buffer_3D(proc_shift)%array(1:my_P_size, 1:my_a_size, 1:recv_i_size) => &
    1190           0 :                buffer_1D(proc_shift)%array(1:INT(my_P_size, KIND=int_8)*my_a_size*recv_i_size)
    1191             : 
    1192             :             CALL para_env%irecv(buffer_3D(proc_shift)%array, blacs2mpi(my_prow, proc_recv), &
    1193           0 :                                 recv_requests(proc_shift), tag)
    1194             : 
    1195           0 :             procs_recv(proc_shift) = proc_i_recv
    1196             :          END DO
    1197             : 
    1198           0 :          send_requests(:) = mp_request_null
    1199           0 :          DO proc_shift = 1, grid(2) - 1
    1200           0 :             proc_i_send = MODULO(mepos(2) + proc_shift, grid(2))
    1201           0 :             proc_send = proc_i_send*grid(1) + mepos(1)
    1202             : 
    1203             :             CALL para_env%isend(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
    1204           0 :                                 send_requests(proc_shift), tag)
    1205             :          END DO
    1206             :       END IF
    1207             : 
    1208             :       CALL calc_P_rpa_i(P_ij(:, my_i_start:my_i_end), &
    1209             :                         mat_S_3D, mat_work_iaP_3D, &
    1210             :                         mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
    1211             :                         Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
    1212          56 :                         Eigenval(my_i_start:my_i_end), omega, weight)
    1213             : 
    1214          56 :       DO proc_shift = 1, grid(2) - 1
    1215           0 :          CALL timeset(routineN//"_comm_i", handle2)
    1216           0 :          IF (number_of_parallel_channels > 1) THEN
    1217           0 :             CALL mp_waitany(recv_requests, completed)
    1218             : 
    1219           0 :             CALL get_group_dist(gd_homo, procs_recv(completed), recv_i_start, recv_i_end, recv_i_size)
    1220             :          ELSE
    1221           0 :             proc_i_send = MODULO(mepos(2) + proc_shift, grid(2))
    1222           0 :             proc_i_recv = MODULO(mepos(2) - proc_shift, grid(2))
    1223             : 
    1224           0 :             proc_send = proc_i_send*grid(1) + mepos(1)
    1225           0 :             proc_recv = proc_i_recv*grid(1) + mepos(1)
    1226             : 
    1227           0 :             CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_i_end, recv_i_size)
    1228             : 
    1229             :             buffer_3D(1)%array(1:my_P_size, 1:my_a_size, 1:recv_i_size) => &
    1230           0 :                buffer_1D(1)%array(1:INT(my_P_size, KIND=int_8)*my_a_size*recv_i_size)
    1231             : 
    1232             :             CALL para_env%sendrecv(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
    1233           0 :                                    buffer_3D(1)%array, blacs2mpi(my_prow, proc_recv), tag)
    1234           0 :             completed = 1
    1235             :          END IF
    1236           0 :          CALL timestop(handle2)
    1237             : 
    1238             :          CALL calc_P_rpa_i(P_ij(:, recv_i_start:recv_i_end), &
    1239             :                            mat_S_3D, buffer_3D(completed)%array, &
    1240             :                            mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
    1241             :                            Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
    1242           0 :                            Eigenval(recv_i_start:recv_i_end), omega, weight)
    1243             : 
    1244          56 :          IF (number_of_parallel_channels > 1 .AND. number_of_parallel_channels + proc_shift < grid(2)) THEN
    1245           0 :             proc_i_recv = MODULO(mepos(2) - proc_shift - number_of_parallel_channels, grid(2))
    1246           0 :             proc_recv = proc_i_recv*grid(1) + mepos(1)
    1247             : 
    1248           0 :             CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_a_end, recv_i_size)
    1249             : 
    1250             :             buffer_3D(completed)%array(1:my_P_size, 1:my_a_size, 1:recv_i_size) => &
    1251           0 :                buffer_1D(completed)%array(1:INT(my_P_size, KIND=int_8)*my_a_size*recv_i_size)
    1252             : 
    1253             :             CALL para_env%irecv(buffer_3D(completed)%array, blacs2mpi(my_prow, proc_recv), &
    1254           0 :                                 recv_requests(completed), tag)
    1255             : 
    1256           0 :             procs_recv(completed) = proc_i_recv
    1257             :          END IF
    1258             :       END DO
    1259             : 
    1260          56 :       IF (number_of_parallel_channels > 1 .AND. grid(2) > 1) THEN
    1261           0 :          CALL mp_waitall(send_requests)
    1262             :       END IF
    1263             : 
    1264          56 :       IF (number_of_parallel_channels > 1) THEN
    1265           0 :          DEALLOCATE (procs_recv)
    1266           0 :          DEALLOCATE (recv_requests)
    1267           0 :          DEALLOCATE (send_requests)
    1268             :       END IF
    1269             : 
    1270          56 :       IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
    1271             :          ! release memory allocated by local_gemm when run on GPU. local_gemm_ctx is null on cpu only runs
    1272          48 :          CALL local_gemm_destroy(mp2_env%local_gemm_ctx)
    1273          48 :          mp2_env%local_gemm_ctx = C_NULL_PTR
    1274          48 :          DEALLOCATE (buffer_compens_1D)
    1275             :       END IF
    1276             : 
    1277         112 :       DO proc_shift = 1, number_of_parallel_channels
    1278          56 :          NULLIFY (buffer_3D(proc_shift)%array)
    1279         112 :          DEALLOCATE (buffer_1D(proc_shift)%array)
    1280             :       END DO
    1281          56 :       DEALLOCATE (buffer_3D, buffer_1D)
    1282             : 
    1283          56 :       CALL timestop(handle)
    1284             : 
    1285         168 :    END SUBROUTINE
    1286             : 
    1287             : ! **************************************************************************************************
    1288             : !> \brief ...
    1289             : !> \param P_ab ...
    1290             : !> \param mat_S ...
    1291             : !> \param mat_work ...
    1292             : !> \param dot_blksize ...
    1293             : !> \param buffer_1D ...
    1294             : !> \param local_gemm_ctx ...
    1295             : !> \param my_eval_virt ...
    1296             : !> \param my_eval_occ ...
    1297             : !> \param recv_eval_virt ...
    1298             : !> \param omega ...
    1299             : !> \param weight ...
    1300             : ! **************************************************************************************************
    1301          56 :    SUBROUTINE calc_P_rpa_a(P_ab, mat_S, mat_work, dot_blksize, buffer_1D, local_gemm_ctx, &
    1302          56 :                            my_eval_virt, my_eval_occ, recv_eval_virt, omega, weight)
    1303             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: P_ab
    1304             :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN)      :: mat_S, mat_work
    1305             :       INTEGER, INTENT(IN)                                :: dot_blksize
    1306             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
    1307             :          INTENT(INOUT), TARGET                           :: buffer_1D
    1308             :       TYPE(C_PTR), INTENT(INOUT)                         :: local_gemm_ctx
    1309             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: my_eval_virt, my_eval_occ, recv_eval_virt
    1310             :       REAL(KIND=dp), INTENT(IN)                          :: omega, weight
    1311             : 
    1312             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'calc_P_rpa_a'
    1313             : 
    1314             :       INTEGER                                            :: handle, my_a, my_a_size, my_i, &
    1315             :                                                             my_i_size, my_P_size, P_end, P_start, &
    1316             :                                                             recv_a_size, stripesize
    1317          56 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: buffer_compens, buffer_unscaled
    1318             : 
    1319          56 :       CALL timeset(routineN, handle)
    1320             : 
    1321          56 :       my_i_size = SIZE(mat_S, 3)
    1322          56 :       recv_a_size = SIZE(mat_work, 2)
    1323          56 :       my_a_size = SIZE(mat_S, 2)
    1324          56 :       my_P_size = SIZE(mat_S, 1)
    1325             : 
    1326          56 :       IF (dot_blksize >= blksize_threshold) THEN
    1327          48 :          buffer_compens(1:my_a_size, 1:recv_a_size) => buffer_1D(1:my_a_size*recv_a_size)
    1328       22208 :          buffer_compens = 0.0_dp
    1329          48 :          buffer_unscaled(1:my_a_size, 1:recv_a_size) => buffer_1D(my_a_size*recv_a_size + 1:2*my_a_size*recv_a_size)
    1330             : 
    1331             :          ! This loop imitates the actual tensor contraction
    1332         232 :          DO my_i = 1, my_i_size
    1333         416 :             DO P_start = 1, my_P_size, dot_blksize
    1334         184 :                stripesize = MIN(dot_blksize, my_P_size - P_start + 1)
    1335         184 :                P_end = P_start + stripesize - 1
    1336             : 
    1337             :                CALL local_gemm("T", "N", my_a_size, recv_a_size, stripesize, &
    1338             :                                -weight, mat_S(P_start:P_end, :, my_i), stripesize, &
    1339             :                                mat_work(P_start:P_end, :, my_i), stripesize, &
    1340         184 :                                0.0_dp, buffer_unscaled, my_a_size, local_gemm_ctx)
    1341             : 
    1342             :                CALL scale_buffer_and_add_compens_virt(buffer_unscaled, buffer_compens, omega, &
    1343         184 :                                                       my_eval_virt, recv_eval_virt, my_eval_occ(my_i))
    1344             : 
    1345         368 :                CALL kahan_step(buffer_compens, P_ab)
    1346             :             END DO
    1347             :          END DO
    1348             :       ELSE
    1349             :          BLOCK
    1350             :             INTEGER :: recv_a
    1351             :             REAL(KIND=dp) :: tmp, e_i, e_a, e_b, omega2, my_compens, my_p, s
    1352           8 :             omega2 = -omega**2
    1353             : !$OMP PARALLEL DO COLLAPSE(2) DEFAULT(NONE)&
    1354             : !$OMP SHARED(my_a_size,recv_a_size,my_i_size,mat_S,my_eval_virt,recv_eval_virt,my_eval_occ,omega2,&
    1355             : !$OMP        P_ab,weight,mat_work)&
    1356           8 : !$OMP PRIVATE(tmp,my_a,recv_a,my_i,e_a,e_b,e_i,my_compens,my_p,s)
    1357             :             DO my_a = 1, my_a_size
    1358             :             DO recv_a = 1, recv_a_size
    1359             :                e_a = my_eval_virt(my_a)
    1360             :                e_b = recv_eval_virt(recv_a)
    1361             :                my_p = P_ab(my_a, recv_a)
    1362             :                my_compens = 0.0_dp
    1363             :                DO my_i = 1, my_i_size
    1364             :                   e_i = -my_eval_occ(my_i)
    1365             :                   tmp = -weight*accurate_dot_product(mat_S(:, my_a, my_i), mat_work(:, recv_a, my_i)) &
    1366             :                         *(1.0_dp + omega2/((e_a + e_i)*(e_b + e_i))) - my_compens
    1367             :                   s = my_p + tmp
    1368             :                   my_compens = (s - my_p) - tmp
    1369             :                   my_p = s
    1370             :                END DO
    1371             :                P_ab(my_a, recv_a) = my_p
    1372             :             END DO
    1373             :             END DO
    1374             :          END BLOCK
    1375             :       END IF
    1376             : 
    1377          56 :       CALL timestop(handle)
    1378             : 
    1379          56 :    END SUBROUTINE calc_P_rpa_a
    1380             : 
    1381             : ! **************************************************************************************************
    1382             : !> \brief ...
    1383             : !> \param P_ij ...
    1384             : !> \param mat_S ...
    1385             : !> \param mat_work ...
    1386             : !> \param dot_blksize ...
    1387             : !> \param buffer_1D ...
    1388             : !> \param local_gemm_ctx ...
    1389             : !> \param my_eval_virt ...
    1390             : !> \param my_eval_occ ...
    1391             : !> \param recv_eval_occ ...
    1392             : !> \param omega ...
    1393             : !> \param weight ...
    1394             : ! **************************************************************************************************
    1395          56 :    SUBROUTINE calc_P_rpa_i(P_ij, mat_S, mat_work, dot_blksize, buffer_1D, local_gemm_ctx, &
    1396          56 :                            my_eval_virt, my_eval_occ, recv_eval_occ, omega, weight)
    1397             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: P_ij
    1398             :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT)   :: mat_S, mat_work
    1399             :       INTEGER, INTENT(IN)                                :: dot_blksize
    1400             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
    1401             :          INTENT(INOUT), TARGET                           :: buffer_1D
    1402             :       TYPE(C_PTR), INTENT(INOUT)                         :: local_gemm_ctx
    1403             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: my_eval_virt, my_eval_occ, recv_eval_occ
    1404             :       REAL(KIND=dp), INTENT(IN)                          :: omega, weight
    1405             : 
    1406             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'calc_P_rpa_i'
    1407             : 
    1408             :       INTEGER                                            :: handle, my_a, my_a_size, my_i, &
    1409             :                                                             my_i_size, my_P_size, P_end, P_start, &
    1410             :                                                             recv_i_size, stripesize
    1411          56 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: buffer_compens, buffer_unscaled
    1412             : 
    1413          56 :       CALL timeset(routineN, handle)
    1414             : 
    1415          56 :       my_i_size = SIZE(mat_S, 3)
    1416          56 :       recv_i_size = SIZE(mat_work, 3)
    1417          56 :       my_a_size = SIZE(mat_S, 2)
    1418          56 :       my_P_size = SIZE(mat_S, 1)
    1419             : 
    1420          56 :       IF (dot_blksize >= blksize_threshold) THEN
    1421          48 :          buffer_compens(1:my_i_size, 1:recv_i_size) => buffer_1D(1:my_i_size*recv_i_size)
    1422         944 :          buffer_compens = 0.0_dp
    1423          48 :          buffer_unscaled(1:my_i_size, 1:recv_i_size) => buffer_1D(my_i_size*recv_i_size + 1:2*my_i_size*recv_i_size)
    1424             : 
    1425             :          ! This loop imitates the actual tensor contraction
    1426        1048 :          DO my_a = 1, my_a_size
    1427        2048 :             DO P_start = 1, my_P_size, dot_blksize
    1428        1000 :                stripesize = MIN(dot_blksize, my_P_size - P_start + 1)
    1429        1000 :                P_end = P_start + stripesize - 1
    1430             : 
    1431             :                CALL local_gemm("T", "N", my_i_size, recv_i_size, stripesize, &
    1432             :                                weight, mat_S(P_start:P_end, my_a, :), stripesize, &
    1433             :                                mat_work(P_start:P_end, my_a, :), stripesize, &
    1434        1000 :                                0.0_dp, buffer_unscaled, my_i_size, local_gemm_ctx)
    1435             : 
    1436             :                CALL scale_buffer_and_add_compens_occ(buffer_unscaled, buffer_compens, omega, &
    1437        1000 :                                                      my_eval_occ, recv_eval_occ, my_eval_virt(my_a))
    1438             : 
    1439        2000 :                CALL kahan_step(buffer_compens, P_ij)
    1440             :             END DO
    1441             :          END DO
    1442             :       ELSE
    1443             :          BLOCK
    1444             :             REAL(KIND=dp) :: tmp, e_i, e_a, e_j, omega2, my_compens, my_p, s
    1445             :             INTEGER :: recv_i
    1446           8 :             omega2 = -omega**2
    1447             : !$OMP PARALLEL DO COLLAPSE(2) DEFAULT(NONE)&
    1448             : !$OMP SHARED(my_a_size,recv_i_size,my_i_size,mat_S,my_eval_occ,my_eval_virt,omega2,&
    1449             : !$OMP        recv_eval_occ,P_ij,weight,mat_work)&
    1450           8 : !$OMP PRIVATE(tmp,my_a,recv_i,my_i,e_i,e_j,e_a,my_compens,my_p,s)
    1451             :             DO my_i = 1, my_i_size
    1452             :             DO recv_i = 1, recv_i_size
    1453             :                e_i = my_eval_occ(my_i)
    1454             :                e_j = recv_eval_occ(recv_i)
    1455             :                my_p = P_ij(my_i, recv_i)
    1456             :                my_compens = 0.0_dp
    1457             :                DO my_a = 1, my_a_size
    1458             :                   e_a = my_eval_virt(my_a)
    1459             :                   tmp = weight*accurate_dot_product(mat_S(:, my_a, my_i), mat_work(:, my_a, recv_i)) &
    1460             :                         *(1.0_dp + omega2/((e_a - e_i)*(e_a - e_j))) - my_compens
    1461             :                   s = my_p + tmp
    1462             :                   my_compens = (s - my_p) - tmp
    1463             :                   my_p = s
    1464             :                END DO
    1465             :                P_ij(my_i, recv_i) = my_p
    1466             :             END DO
    1467             :             END DO
    1468             :          END BLOCK
    1469             :       END IF
    1470             : 
    1471          56 :       CALL timestop(handle)
    1472             : 
    1473          56 :    END SUBROUTINE calc_P_rpa_i
    1474             : 
    1475             : ! **************************************************************************************************
    1476             : !> \brief ...
    1477             : !> \param compens ...
    1478             : !> \param P ...
    1479             : ! **************************************************************************************************
    1480        1184 :    SUBROUTINE kahan_step(compens, P)
    1481             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: compens, P
    1482             : 
    1483             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'kahan_step'
    1484             : 
    1485             :       INTEGER                                            :: handle, i, j
    1486             :       REAL(KIND=dp)                                      :: my_compens, my_p, s
    1487             : 
    1488        1184 :       CALL timeset(routineN, handle)
    1489             : 
    1490        1184 : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(P,compens) PRIVATE(i,my_p,my_compens,s, j) COLLAPSE(2)
    1491             :       DO j = 1, SIZE(compens, 2)
    1492             :          DO i = 1, SIZE(compens, 1)
    1493             :             my_p = P(i, j)
    1494             :             my_compens = compens(i, j)
    1495             :             s = my_p + my_compens
    1496             :             compens(i, j) = (s - my_p) - my_compens
    1497             :             P(i, j) = s
    1498             :          END DO
    1499             :       END DO
    1500             : !$OMP END PARALLEL DO
    1501             : 
    1502        1184 :       CALL timestop(handle)
    1503             : 
    1504        1184 :    END SUBROUTINE
    1505             : 
    1506             : ! **************************************************************************************************
    1507             : !> \brief ...
    1508             : !> \param buffer_unscaled ...
    1509             : !> \param buffer_compens ...
    1510             : !> \param omega ...
    1511             : !> \param my_eval_virt ...
    1512             : !> \param recv_eval_virt ...
    1513             : !> \param my_eval_occ ...
    1514             : ! **************************************************************************************************
    1515         184 :    SUBROUTINE scale_buffer_and_add_compens_virt(buffer_unscaled, buffer_compens, omega, &
    1516         184 :                                                 my_eval_virt, recv_eval_virt, my_eval_occ)
    1517             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: buffer_unscaled
    1518             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: buffer_compens
    1519             :       REAL(KIND=dp), INTENT(IN)                          :: omega
    1520             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: my_eval_virt, recv_eval_virt
    1521             :       REAL(KIND=dp), INTENT(IN)                          :: my_eval_occ
    1522             : 
    1523             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'scale_buffer_and_add_compens_virt'
    1524             : 
    1525             :       INTEGER                                            :: handle, my_a, my_b
    1526             : 
    1527         184 :       CALL timeset(routineN, handle)
    1528             : 
    1529             : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(buffer_unscaled,buffer_compens,omega,&
    1530         184 : !$OMP                                  my_eval_virt,recv_eval_virt,my_eval_occ) PRIVATE(my_a,my_b)
    1531             :       DO my_b = 1, SIZE(buffer_compens, 2)
    1532             :          DO my_a = 1, SIZE(buffer_compens, 1)
    1533             :             buffer_compens(my_a, my_b) = buffer_unscaled(my_a, my_b) &
    1534             :                                     *(1.0_dp - omega**2/((my_eval_virt(my_a) - my_eval_occ)*(recv_eval_virt(my_b) - my_eval_occ))) &
    1535             :                                          - buffer_compens(my_a, my_b)
    1536             :          END DO
    1537             :       END DO
    1538             : !$OMP END PARALLEL DO
    1539             : 
    1540         184 :       CALL timestop(handle)
    1541             : 
    1542         184 :    END SUBROUTINE scale_buffer_and_add_compens_virt
    1543             : 
    1544             : ! **************************************************************************************************
    1545             : !> \brief ...
    1546             : !> \param buffer_unscaled ...
    1547             : !> \param buffer_compens ...
    1548             : !> \param omega ...
    1549             : !> \param my_eval_occ ...
    1550             : !> \param recv_eval_occ ...
    1551             : !> \param my_eval_virt ...
    1552             : ! **************************************************************************************************
    1553        1000 :    SUBROUTINE scale_buffer_and_add_compens_occ(buffer_unscaled, buffer_compens, omega, &
    1554        1000 :                                                my_eval_occ, recv_eval_occ, my_eval_virt)
    1555             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: buffer_unscaled
    1556             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: buffer_compens
    1557             :       REAL(KIND=dp), INTENT(IN)                          :: omega
    1558             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: my_eval_occ, recv_eval_occ
    1559             :       REAL(KIND=dp), INTENT(IN)                          :: my_eval_virt
    1560             : 
    1561             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'scale_buffer_and_add_compens_occ'
    1562             : 
    1563             :       INTEGER                                            :: handle, my_i, my_j
    1564             : 
    1565        1000 :       CALL timeset(routineN, handle)
    1566             : 
    1567             : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(buffer_compens,buffer_unscaled,omega,&
    1568        1000 : !$OMP        my_eval_virt,my_eval_occ,recv_eval_occ) PRIVATE(my_i,my_j)
    1569             :       DO my_j = 1, SIZE(buffer_compens, 2)
    1570             :          DO my_i = 1, SIZE(buffer_compens, 1)
    1571             :             buffer_compens(my_i, my_j) = buffer_unscaled(my_i, my_j) &
    1572             :                                     *(1.0_dp - omega**2/((my_eval_virt - my_eval_occ(my_i))*(my_eval_virt - recv_eval_occ(my_j)))) &
    1573             :                                          - buffer_compens(my_i, my_j)
    1574             :          END DO
    1575             :       END DO
    1576             : !$OMP END PARALLEL DO
    1577             : 
    1578        1000 :       CALL timestop(handle)
    1579             : 
    1580        1000 :    END SUBROUTINE scale_buffer_and_add_compens_occ
    1581             : 
    1582             : ! **************************************************************************************************
    1583             : !> \brief ...
    1584             : !> \param x ...
    1585             : !> \return ...
    1586             : ! **************************************************************************************************
    1587        1320 :    ELEMENTAL FUNCTION sinh_over_x(x) RESULT(res)
    1588             :       REAL(KIND=dp), INTENT(IN)                          :: x
    1589             :       REAL(KIND=dp)                                      :: res
    1590             : 
    1591             :       ! Calculate sinh(x)/x
    1592             :       ! Split the intervall to prevent numerical instabilities
    1593        1320 :       IF (ABS(x) > 3.0e-4_dp) THEN
    1594        1318 :          res = SINH(x)/x
    1595             :       ELSE
    1596           2 :          res = 1.0_dp + x**2/6.0_dp
    1597             :       END IF
    1598             : 
    1599        1320 :    END FUNCTION sinh_over_x
    1600             : 
    1601             : ! **************************************************************************************************
    1602             : !> \brief ...
    1603             : !> \param fm_work_iaP ...
    1604             : !> \param fm_mat_S ...
    1605             : !> \param pair_list ...
    1606             : !> \param virtual ...
    1607             : !> \param P_ij ...
    1608             : !> \param Eigenval ...
    1609             : !> \param omega ...
    1610             : !> \param weight ...
    1611             : !> \param index2send ...
    1612             : !> \param index2recv ...
    1613             : !> \param dot_blksize ...
    1614             : ! **************************************************************************************************
    1615           8 :    SUBROUTINE calc_Pij_degen(fm_work_iaP, fm_mat_S, pair_list, virtual, P_ij, Eigenval, &
    1616           8 :                              omega, weight, index2send, index2recv, dot_blksize)
    1617             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_work_iaP, fm_mat_S
    1618             :       INTEGER, DIMENSION(:, :), INTENT(IN)               :: pair_list
    1619             :       INTEGER, INTENT(IN)                                :: virtual
    1620             :       REAL(KIND=dp), DIMENSION(:), INTENT(INOUT)         :: P_ij
    1621             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
    1622             :       REAL(KIND=dp), INTENT(IN)                          :: omega, weight
    1623             :       TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send, index2recv
    1624             :       INTEGER, INTENT(IN)                                :: dot_blksize
    1625             : 
    1626             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'calc_Pij_degen'
    1627             : 
    1628             :       INTEGER :: avirt, col_global, col_local, counter, first_p_pos(2), first_p_pos_col, handle, &
    1629             :          handle2, ij_counter, iocc, my_col_local, my_i, my_j, my_pcol, my_prow, ncol_block, &
    1630             :          ncol_local, nrow_local, num_ij_pairs, num_pe_col, pcol, pcol_recv, pcol_send, proc_shift, &
    1631             :          recv_size, send_size, size_recv_buffer, size_send_buffer, tag
    1632           8 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, ncol_locals
    1633           8 :       INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi
    1634             :       REAL(KIND=dp)                                      :: trace
    1635           8 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: buffer_recv, buffer_send
    1636             :       TYPE(cp_blacs_env_type), POINTER                   :: context
    1637             :       TYPE(mp_para_env_type), POINTER                    :: para_env
    1638             : 
    1639           8 :       CALL timeset(routineN, handle)
    1640             : 
    1641             :       CALL cp_fm_struct_get(fm_work_iaP%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
    1642             :                             ncol_local=ncol_local, col_indices=col_indices, &
    1643           8 :                             context=context, ncol_block=ncol_block, first_p_pos=first_p_pos, nrow_local=nrow_local)
    1644             :       CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
    1645           8 :                        number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
    1646           8 :       first_p_pos_col = first_p_pos(2)
    1647             : 
    1648           8 :       num_ij_pairs = SIZE(pair_list, 2)
    1649             : 
    1650           8 :       tag = 42
    1651             : 
    1652         104 :       DO ij_counter = 1, num_ij_pairs
    1653             : 
    1654          96 :          my_i = pair_list(1, ij_counter)
    1655          96 :          my_j = pair_list(2, ij_counter)
    1656             : 
    1657          96 :          trace = 0.0_dp
    1658             : 
    1659        7392 :          DO col_local = 1, ncol_local
    1660        7296 :             col_global = col_indices(col_local)
    1661             : 
    1662        7296 :             iocc = MAX(1, col_global - 1)/virtual + 1
    1663        7296 :             avirt = col_global - (iocc - 1)*virtual
    1664             : 
    1665        7296 :             IF (iocc /= my_j) CYCLE
    1666        1824 :             pcol = cp_fm_indxg2p((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
    1667        1824 :             IF (pcol /= my_pcol) CYCLE
    1668             : 
    1669        1824 :             my_col_local = cp_fm_indxg2l((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
    1670             : 
    1671             :             trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), fm_work_iaP%local_data(:, col_local), &
    1672        7392 :                                                    dot_blksize)
    1673             :          END DO
    1674             : 
    1675         104 :          P_ij(ij_counter) = P_ij(ij_counter) - trace*sinh_over_x(0.5_dp*(Eigenval(my_i) - Eigenval(my_j))*omega)*omega*weight
    1676             : 
    1677             :       END DO
    1678             : 
    1679           8 :       IF (num_pe_col > 1) THEN
    1680             :          size_send_buffer = 0
    1681             :          size_recv_buffer = 0
    1682           0 :          DO proc_shift = 1, num_pe_col - 1
    1683           0 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
    1684           0 :             pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
    1685             : 
    1686           0 :             IF (ALLOCATED(index2send(pcol_send)%array)) &
    1687           0 :                size_send_buffer = MAX(size_send_buffer, SIZE(index2send(pcol_send)%array))
    1688             : 
    1689           0 :             IF (ALLOCATED(index2recv(pcol_recv)%array)) &
    1690           0 :                size_recv_buffer = MAX(size_recv_buffer, SIZE(index2recv(pcol_recv)%array))
    1691             :          END DO
    1692             : 
    1693           0 :          ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
    1694             : 
    1695           0 :          DO proc_shift = 1, num_pe_col - 1
    1696           0 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
    1697           0 :             pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
    1698             : 
    1699             :             ! Collect data and exchange
    1700           0 :             send_size = 0
    1701           0 :             IF (ALLOCATED(index2send(pcol_send)%array)) send_size = SIZE(index2send(pcol_send)%array)
    1702             : 
    1703           0 :             DO counter = 1, send_size
    1704           0 :                buffer_send(:, counter) = fm_work_iaP%local_data(:, index2send(pcol_send)%array(counter))
    1705             :             END DO
    1706             : 
    1707           0 :             recv_size = 0
    1708           0 :             IF (ALLOCATED(index2recv(pcol_recv)%array)) recv_size = SIZE(index2recv(pcol_recv)%array)
    1709           0 :             IF (recv_size > 0) THEN
    1710           0 :                CALL timeset(routineN//"_send", handle2)
    1711           0 :                IF (send_size > 0) THEN
    1712             :                   CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), &
    1713           0 :                                          buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
    1714             :                ELSE
    1715           0 :                   CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
    1716             :                END IF
    1717           0 :                CALL timestop(handle2)
    1718             : 
    1719           0 :                DO ij_counter = 1, num_ij_pairs
    1720             :                   ! Collect the contributions of the matrix elements
    1721             : 
    1722           0 :                   my_i = pair_list(1, ij_counter)
    1723           0 :                   my_j = pair_list(2, ij_counter)
    1724             : 
    1725           0 :                   trace = 0.0_dp
    1726             : 
    1727           0 :                   DO col_local = 1, recv_size
    1728           0 :                      col_global = index2recv(pcol_recv)%array(col_local)
    1729             : 
    1730           0 :                      iocc = MAX(1, col_global - 1)/virtual + 1
    1731           0 :                      IF (iocc /= my_j) CYCLE
    1732           0 :                      avirt = col_global - (iocc - 1)*virtual
    1733           0 :                      pcol = cp_fm_indxg2p((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
    1734           0 :                      IF (pcol /= my_pcol) CYCLE
    1735             : 
    1736           0 :                      my_col_local = cp_fm_indxg2l((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
    1737             : 
    1738             :                      trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), buffer_recv(:, col_local), &
    1739           0 :                                                             dot_blksize)
    1740             :                   END DO
    1741             : 
    1742             :                   P_ij(ij_counter) = P_ij(ij_counter) &
    1743           0 :                                      - trace*sinh_over_x(0.5_dp*(Eigenval(my_i) - Eigenval(my_j))*omega)*omega*weight
    1744             :                END DO
    1745           0 :             ELSE IF (send_size > 0) THEN
    1746           0 :                CALL timeset(routineN//"_send", handle2)
    1747           0 :                CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), tag)
    1748           0 :                CALL timestop(handle2)
    1749             :             END IF
    1750             :          END DO
    1751           0 :          IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
    1752           0 :          IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
    1753             :       END IF
    1754             : 
    1755           8 :       CALL timestop(handle)
    1756             : 
    1757          16 :    END SUBROUTINE
    1758             : 
    1759             : ! **************************************************************************************************
    1760             : !> \brief ...
    1761             : !> \param fm_work_iaP ...
    1762             : !> \param fm_mat_S ...
    1763             : !> \param pair_list ...
    1764             : !> \param virtual ...
    1765             : !> \param P_ab ...
    1766             : !> \param Eigenval ...
    1767             : !> \param omega ...
    1768             : !> \param weight ...
    1769             : !> \param index2send ...
    1770             : !> \param index2recv ...
    1771             : !> \param dot_blksize ...
    1772             : ! **************************************************************************************************
    1773           8 :    SUBROUTINE calc_Pab_degen(fm_work_iaP, fm_mat_S, pair_list, virtual, P_ab, Eigenval, &
    1774           8 :                              omega, weight, index2send, index2recv, dot_blksize)
    1775             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_work_iaP, fm_mat_S
    1776             :       INTEGER, DIMENSION(:, :), INTENT(IN)               :: pair_list
    1777             :       INTEGER, INTENT(IN)                                :: virtual
    1778             :       REAL(KIND=dp), DIMENSION(:), INTENT(INOUT)         :: P_ab
    1779             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
    1780             :       REAL(KIND=dp), INTENT(IN)                          :: omega, weight
    1781             :       TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send, index2recv
    1782             :       INTEGER, INTENT(IN)                                :: dot_blksize
    1783             : 
    1784             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'calc_Pab_degen'
    1785             : 
    1786             :       INTEGER :: ab_counter, avirt, col_global, col_local, counter, first_p_pos(2), &
    1787             :          first_p_pos_col, handle, handle2, iocc, my_a, my_b, my_col_local, my_pcol, my_prow, &
    1788             :          ncol_block, ncol_local, nrow_local, num_ab_pairs, num_pe_col, pcol, pcol_recv, pcol_send, &
    1789             :          proc_shift, recv_size, send_size, size_recv_buffer, size_send_buffer, tag
    1790           8 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, ncol_locals
    1791           8 :       INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi
    1792             :       REAL(KIND=dp)                                      :: trace
    1793           8 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: buffer_recv, buffer_send
    1794             :       TYPE(cp_blacs_env_type), POINTER                   :: context
    1795             :       TYPE(mp_para_env_type), POINTER                    :: para_env
    1796             : 
    1797           8 :       CALL timeset(routineN, handle)
    1798             : 
    1799             :       CALL cp_fm_struct_get(fm_work_iaP%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
    1800             :                             ncol_local=ncol_local, col_indices=col_indices, &
    1801           8 :                             context=context, ncol_block=ncol_block, first_p_pos=first_p_pos, nrow_local=nrow_local)
    1802             :       CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
    1803           8 :                        number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
    1804           8 :       first_p_pos_col = first_p_pos(2)
    1805             : 
    1806           8 :       num_ab_pairs = SIZE(pair_list, 2)
    1807             : 
    1808           8 :       tag = 43
    1809             : 
    1810        1232 :       DO ab_counter = 1, num_ab_pairs
    1811             : 
    1812        1224 :          my_a = pair_list(1, ab_counter)
    1813        1224 :          my_b = pair_list(2, ab_counter)
    1814             : 
    1815        1224 :          trace = 0.0_dp
    1816             : 
    1817       94248 :          DO col_local = 1, ncol_local
    1818       93024 :             col_global = col_indices(col_local)
    1819             : 
    1820       93024 :             iocc = MAX(1, col_global - 1)/virtual + 1
    1821       93024 :             avirt = col_global - (iocc - 1)*virtual
    1822             : 
    1823       93024 :             IF (avirt /= my_b) CYCLE
    1824        4896 :             pcol = cp_fm_indxg2p((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
    1825        4896 :             IF (pcol /= my_pcol) CYCLE
    1826        4896 :             my_col_local = cp_fm_indxg2l((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
    1827             : 
    1828             :             trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), fm_work_iaP%local_data(:, col_local), &
    1829       94248 :                                                    dot_blksize)
    1830             : 
    1831             :          END DO
    1832             : 
    1833             :          P_ab(ab_counter) = P_ab(ab_counter) &
    1834        1232 :                             + trace*sinh_over_x(0.5_dp*(Eigenval(my_a) - Eigenval(my_b))*omega)*omega*weight
    1835             : 
    1836             :       END DO
    1837             : 
    1838           8 :       IF (num_pe_col > 1) THEN
    1839             :          size_send_buffer = 0
    1840             :          size_recv_buffer = 0
    1841           0 :          DO proc_shift = 1, num_pe_col - 1
    1842           0 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
    1843           0 :             pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
    1844             : 
    1845           0 :             IF (ALLOCATED(index2send(pcol_send)%array)) &
    1846           0 :                size_send_buffer = MAX(size_send_buffer, SIZE(index2send(pcol_send)%array))
    1847             : 
    1848           0 :             IF (ALLOCATED(index2recv(pcol_recv)%array)) &
    1849           0 :                size_recv_buffer = MAX(size_recv_buffer, SIZE(index2recv(pcol_recv)%array))
    1850             :          END DO
    1851             : 
    1852           0 :          ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
    1853             : 
    1854           0 :          DO proc_shift = 1, num_pe_col - 1
    1855           0 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
    1856           0 :             pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
    1857             : 
    1858             :             ! Collect data and exchange
    1859           0 :             send_size = 0
    1860           0 :             IF (ALLOCATED(index2send(pcol_send)%array)) send_size = SIZE(index2send(pcol_send)%array)
    1861             : 
    1862           0 :             DO counter = 1, send_size
    1863           0 :                buffer_send(:, counter) = fm_work_iaP%local_data(:, index2send(pcol_send)%array(counter))
    1864             :             END DO
    1865             : 
    1866           0 :             recv_size = 0
    1867           0 :             IF (ALLOCATED(index2recv(pcol_recv)%array)) recv_size = SIZE(index2recv(pcol_recv)%array)
    1868           0 :             IF (recv_size > 0) THEN
    1869           0 :                CALL timeset(routineN//"_send", handle2)
    1870           0 :                IF (send_size > 0) THEN
    1871             :                   CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), &
    1872           0 :                                          buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
    1873             :                ELSE
    1874           0 :                   CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
    1875             :                END IF
    1876           0 :                CALL timestop(handle2)
    1877             : 
    1878           0 :                DO ab_counter = 1, num_ab_pairs
    1879             :                   ! Collect the contributions of the matrix elements
    1880             : 
    1881           0 :                   my_a = pair_list(1, ab_counter)
    1882           0 :                   my_b = pair_list(2, ab_counter)
    1883             : 
    1884           0 :                   trace = 0.0_dp
    1885             : 
    1886           0 :                   DO col_local = 1, SIZE(index2recv(pcol_recv)%array)
    1887           0 :                      col_global = index2recv(pcol_recv)%array(col_local)
    1888             : 
    1889           0 :                      iocc = MAX(1, col_global - 1)/virtual + 1
    1890           0 :                      avirt = col_global - (iocc - 1)*virtual
    1891           0 :                      IF (avirt /= my_b) CYCLE
    1892           0 :                      pcol = cp_fm_indxg2p((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
    1893           0 :                      IF (pcol /= my_pcol) CYCLE
    1894             : 
    1895           0 :                      my_col_local = cp_fm_indxg2l((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
    1896             : 
    1897             :                      trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), buffer_recv(:, col_local), &
    1898           0 :                                                             dot_blksize)
    1899             :                   END DO
    1900             : 
    1901             :                   P_ab(ab_counter) = P_ab(ab_counter) &
    1902           0 :                                      + trace*sinh_over_x(0.5_dp*(Eigenval(my_a) - Eigenval(my_b))*omega)*omega*weight
    1903             : 
    1904             :                END DO
    1905           0 :             ELSE IF (send_size > 0) THEN
    1906           0 :                CALL timeset(routineN//"_send", handle2)
    1907           0 :                CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), tag)
    1908           0 :                CALL timestop(handle2)
    1909             :             END IF
    1910             :          END DO
    1911           0 :          IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
    1912           0 :          IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
    1913             :       END IF
    1914             : 
    1915           8 :       CALL timestop(handle)
    1916             : 
    1917          16 :    END SUBROUTINE
    1918             : 
    1919             : ! **************************************************************************************************
    1920             : !> \brief ...
    1921             : !> \param index2send ...
    1922             : !> \param index2recv ...
    1923             : !> \param fm_mat_S ...
    1924             : !> \param mat_S_3D ...
    1925             : !> \param gd_homo ...
    1926             : !> \param gd_virtual ...
    1927             : !> \param mepos ...
    1928             : ! **************************************************************************************************
    1929         112 :    SUBROUTINE redistribute_fm_mat_S(index2send, index2recv, fm_mat_S, mat_S_3D, gd_homo, gd_virtual, mepos)
    1930             :       TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send
    1931             :       TYPE(two_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2recv
    1932             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_mat_S
    1933             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
    1934             :          INTENT(OUT)                                     :: mat_S_3D
    1935             :       TYPE(group_dist_d1_type), INTENT(IN)               :: gd_homo, gd_virtual
    1936             :       INTEGER, DIMENSION(2), INTENT(IN)                  :: mepos
    1937             : 
    1938             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'redistribute_fm_mat_S'
    1939             : 
    1940             :       INTEGER :: col_local, handle, my_a, my_homo, my_i, my_pcol, my_prow, my_virtual, nrow_local, &
    1941             :          num_pe_col, proc_recv, proc_send, proc_shift, recv_size, send_size, size_recv_buffer, &
    1942             :          size_send_buffer, tag
    1943         112 :       INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi
    1944         112 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: buffer_recv, buffer_send
    1945             :       TYPE(mp_para_env_type), POINTER                    :: para_env
    1946             : 
    1947         112 :       CALL timeset(routineN, handle)
    1948             : 
    1949         112 :       tag = 46
    1950             : 
    1951             :       CALL fm_mat_S%matrix_struct%context%get(my_process_row=my_prow, my_process_column=my_pcol, &
    1952         112 :                                               number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
    1953             : 
    1954         112 :       CALL cp_fm_struct_get(fm_mat_S%matrix_struct, nrow_local=nrow_local, para_env=para_env)
    1955             : 
    1956         112 :       CALL get_group_dist(gd_homo, mepos(2), sizes=my_homo)
    1957         112 :       CALL get_group_dist(gd_virtual, mepos(1), sizes=my_virtual)
    1958             : 
    1959         560 :       ALLOCATE (mat_S_3D(nrow_local, my_virtual, my_homo))
    1960             : 
    1961         112 :       IF (ALLOCATED(index2send(my_pcol)%array)) THEN
    1962        8928 :          DO col_local = 1, SIZE(index2send(my_pcol)%array)
    1963        8816 :             my_a = index2recv(my_pcol)%array(1, col_local)
    1964        8816 :             my_i = index2recv(my_pcol)%array(2, col_local)
    1965      678032 :             mat_S_3D(:, my_a, my_i) = fm_mat_S%local_data(:, index2send(my_pcol)%array(col_local))
    1966             :          END DO
    1967             :       END IF
    1968             : 
    1969         112 :       IF (num_pe_col > 1) THEN
    1970             :          size_send_buffer = 0
    1971             :          size_recv_buffer = 0
    1972           0 :          DO proc_shift = 1, num_pe_col - 1
    1973           0 :             proc_send = MODULO(my_pcol + proc_shift, num_pe_col)
    1974           0 :             proc_recv = MODULO(my_pcol - proc_shift, num_pe_col)
    1975             : 
    1976           0 :             send_size = 0
    1977           0 :             IF (ALLOCATED(index2send(proc_send)%array)) send_size = SIZE(index2send(proc_send)%array)
    1978           0 :             size_send_buffer = MAX(size_send_buffer, send_size)
    1979             : 
    1980           0 :             recv_size = 0
    1981           0 :             IF (ALLOCATED(index2recv(proc_recv)%array)) recv_size = SIZE(index2recv(proc_recv)%array)
    1982           0 :             size_recv_buffer = MAX(size_recv_buffer, recv_size)
    1983             : 
    1984             :          END DO
    1985             : 
    1986           0 :          ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
    1987             : 
    1988           0 :          DO proc_shift = 1, num_pe_col - 1
    1989           0 :             proc_send = MODULO(my_pcol + proc_shift, num_pe_col)
    1990           0 :             proc_recv = MODULO(my_pcol - proc_shift, num_pe_col)
    1991             : 
    1992           0 :             send_size = 0
    1993           0 :             IF (ALLOCATED(index2send(proc_send)%array)) send_size = SIZE(index2send(proc_send)%array)
    1994           0 :             DO col_local = 1, send_size
    1995           0 :                buffer_send(:, col_local) = fm_mat_S%local_data(:, index2send(proc_send)%array(col_local))
    1996             :             END DO
    1997             : 
    1998           0 :             recv_size = 0
    1999           0 :             IF (ALLOCATED(index2recv(proc_recv)%array)) recv_size = SIZE(index2recv(proc_recv)%array, 2)
    2000           0 :             IF (recv_size > 0) THEN
    2001           0 :                IF (send_size > 0) THEN
    2002             :                   CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, proc_send), &
    2003           0 :                                          buffer_recv(:, :recv_size), blacs2mpi(my_prow, proc_recv), tag)
    2004             :                ELSE
    2005           0 :                   CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, proc_recv), tag)
    2006             :                END IF
    2007             : 
    2008           0 :                DO col_local = 1, recv_size
    2009           0 :                   my_a = index2recv(proc_recv)%array(1, col_local)
    2010           0 :                   my_i = index2recv(proc_recv)%array(2, col_local)
    2011           0 :                   mat_S_3D(:, my_a, my_i) = buffer_recv(:, col_local)
    2012             :                END DO
    2013           0 :             ELSE IF (send_size > 0) THEN
    2014           0 :                CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, proc_send), tag)
    2015             :             END IF
    2016             : 
    2017             :          END DO
    2018             : 
    2019           0 :          IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
    2020           0 :          IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
    2021             :       END IF
    2022             : 
    2023         112 :       CALL timestop(handle)
    2024             : 
    2025         336 :    END SUBROUTINE
    2026             : 
    2027             : ! **************************************************************************************************
    2028             : !> \brief ...
    2029             : !> \param rpa_grad ...
    2030             : !> \param mp2_env ...
    2031             : !> \param para_env_sub ...
    2032             : !> \param para_env ...
    2033             : !> \param qs_env ...
    2034             : !> \param gd_array ...
    2035             : !> \param color_sub ...
    2036             : !> \param do_ri_sos_laplace_mp2 ...
    2037             : !> \param homo ...
    2038             : !> \param virtual ...
    2039             : ! **************************************************************************************************
    2040          40 :    SUBROUTINE rpa_grad_finalize(rpa_grad, mp2_env, para_env_sub, para_env, qs_env, gd_array, &
    2041          40 :                                 color_sub, do_ri_sos_laplace_mp2, homo, virtual)
    2042             :       TYPE(rpa_grad_type), INTENT(INOUT)                 :: rpa_grad
    2043             :       TYPE(mp2_type), INTENT(INOUT)                      :: mp2_env
    2044             :       TYPE(mp_para_env_type), INTENT(IN), POINTER        :: para_env_sub, para_env
    2045             :       TYPE(qs_environment_type), INTENT(IN), POINTER     :: qs_env
    2046             :       TYPE(group_dist_d1_type)                           :: gd_array
    2047             :       INTEGER, INTENT(IN)                                :: color_sub
    2048             :       LOGICAL, INTENT(IN)                                :: do_ri_sos_laplace_mp2
    2049             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
    2050             : 
    2051             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'rpa_grad_finalize'
    2052             : 
    2053             :       INTEGER :: dimen_ia, dimen_RI, handle, iiB, ispin, my_group_L_end, my_group_L_size, &
    2054             :          my_group_L_start, my_ia_end, my_ia_size, my_ia_start, my_P_end, my_P_size, my_P_start, &
    2055             :          ngroup, nspins, pos_group, pos_sub, proc
    2056          40 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: pos_info
    2057          40 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: group_grid_2_mepos, mepos_2_grid_group
    2058             :       REAL(KIND=dp)                                      :: my_scale
    2059          40 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: Gamma_2D
    2060             :       TYPE(cp_blacs_env_type), POINTER                   :: blacs_env
    2061             :       TYPE(cp_fm_struct_type), POINTER                   :: fm_struct
    2062             :       TYPE(cp_fm_type)                                   :: fm_G_P_ia, fm_PQ, fm_PQ_2, fm_PQ_half, &
    2063             :                                                             fm_work_PQ, fm_work_PQ_2, fm_Y, &
    2064             :                                                             operator_half
    2065          40 :       TYPE(group_dist_d1_type)                           :: gd_array_new, gd_ia, gd_P, gd_P_new
    2066             : 
    2067          40 :       CALL timeset(routineN, handle)
    2068             : 
    2069             :       ! Release unnecessary matrices to save memory for next steps
    2070             : 
    2071          40 :       nspins = SIZE(rpa_grad%fm_Y)
    2072             : 
    2073             :       ! Scaling factor is required to scale the density matrices and the Gamma matrices later
    2074          40 :       IF (do_ri_sos_laplace_mp2) THEN
    2075          20 :          my_scale = mp2_env%scale_s
    2076             :       ELSE
    2077          20 :          my_scale = -mp2_env%ri_rpa%scale_rpa/(2.0_dp*pi)
    2078          20 :          IF (mp2_env%ri_rpa%minimax_quad) my_scale = my_scale/2.0_dp
    2079             :       END IF
    2080             : 
    2081          40 :       IF (do_ri_sos_laplace_mp2) THEN
    2082             :          CALL sos_mp2_grad_finalize(rpa_grad%sos_mp2_work_occ, rpa_grad%sos_mp2_work_virt, &
    2083          20 :                                     para_env, para_env_sub, homo, virtual, mp2_env)
    2084             :       ELSE
    2085             :          CALL rpa_grad_work_finalize(rpa_grad%rpa_work, mp2_env, homo, &
    2086          20 :                                      virtual, para_env, para_env_sub)
    2087             :       END IF
    2088             : 
    2089          40 :       CALL get_qs_env(qs_env, blacs_env=blacs_env)
    2090             : 
    2091          40 :       CALL cp_fm_get_info(rpa_grad%fm_Gamma_PQ, ncol_global=dimen_RI)
    2092             : 
    2093          40 :       NULLIFY (fm_struct)
    2094             :       CALL cp_fm_struct_create(fm_struct, context=blacs_env, nrow_global=dimen_RI, &
    2095          40 :                                ncol_global=dimen_RI, para_env=para_env)
    2096          40 :       CALL cp_fm_create(fm_PQ, fm_struct)
    2097          40 :       CALL cp_fm_create(fm_work_PQ, fm_struct)
    2098          40 :       IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
    2099           4 :          CALL cp_fm_create(fm_PQ_2, fm_struct)
    2100             :       END IF
    2101          40 :       CALL cp_fm_struct_release(fm_struct)
    2102          40 :       CALL cp_fm_set_all(fm_PQ, 0.0_dp)
    2103             : 
    2104             :       ! We still have to left- and right multiply it with PQhalf
    2105          40 :       CALL dereplicate_and_sum_fm(rpa_grad%fm_Gamma_PQ, fm_PQ)
    2106             : 
    2107          40 :       ngroup = para_env%num_pe/para_env_sub%num_pe
    2108             : 
    2109             :       CALL prepare_redistribution(para_env, para_env_sub, ngroup, &
    2110             :                                   group_grid_2_mepos, mepos_2_grid_group, pos_info=pos_info)
    2111             : 
    2112             :       ! Create fm_PQ_half
    2113          40 :       CALL create_group_dist(gd_P, para_env_sub%num_pe, dimen_RI)
    2114          40 :       CALL get_group_dist(gd_P, para_env_sub%mepos, my_P_start, my_P_end, my_P_size)
    2115             : 
    2116          40 :       CALL get_group_dist(gd_array, color_sub, my_group_L_start, my_group_L_end, my_group_L_size)
    2117             : 
    2118          40 :       CALL create_group_dist(gd_P_new, para_env%num_pe)
    2119          40 :       CALL create_group_dist(gd_array_new, para_env%num_pe)
    2120             : 
    2121         120 :       DO proc = 0, para_env%num_pe - 1
    2122             :          ! calculate position of the group
    2123          80 :          pos_group = proc/para_env_sub%num_pe
    2124             :          ! calculate position in the subgroup
    2125          80 :          pos_sub = pos_info(proc)
    2126             :          ! 1 -> rows, 2 -> cols
    2127          80 :          CALL get_group_dist(gd_array, pos_group, gd_array_new, proc)
    2128         120 :          CALL get_group_dist(gd_P, pos_sub, gd_P_new, proc)
    2129             :       END DO
    2130             : 
    2131          40 :       DEALLOCATE (pos_info)
    2132          40 :       CALL release_group_dist(gd_P)
    2133             : 
    2134             :       CALL array2fm(mp2_env%ri_grad%PQ_half, fm_PQ%matrix_struct, &
    2135             :                     my_P_start, my_P_end, &
    2136             :                     my_group_L_start, my_group_L_end, &
    2137             :                     gd_P_new, gd_array_new, &
    2138             :                     group_grid_2_mepos, para_env_sub%num_pe, ngroup, &
    2139          40 :                     fm_PQ_half)
    2140             : 
    2141          40 :       IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
    2142             :          CALL array2fm(mp2_env%ri_grad%operator_half, fm_PQ%matrix_struct, my_P_start, my_P_end, &
    2143             :                        my_group_L_start, my_group_L_end, &
    2144             :                        gd_P_new, gd_array_new, &
    2145             :                        group_grid_2_mepos, para_env_sub%num_pe, ngroup, &
    2146           4 :                        operator_half)
    2147             :       END IF
    2148             : 
    2149             :       ! deallocate the info array
    2150          40 :       CALL release_group_dist(gd_P_new)
    2151          40 :       CALL release_group_dist(gd_array_new)
    2152             : 
    2153          40 :       IF (compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
    2154             : ! Finish Gamma_PQ
    2155             :          CALL parallel_gemm(transa="N", transb="T", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=1.0_dp, &
    2156             :                             matrix_a=fm_PQ, matrix_b=fm_PQ_half, beta=0.0_dp, &
    2157          36 :                             matrix_c=fm_work_PQ)
    2158             : 
    2159             :          CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=-my_scale, &
    2160             :                             matrix_a=fm_PQ_half, matrix_b=fm_work_PQ, beta=0.0_dp, &
    2161          36 :                             matrix_c=fm_PQ)
    2162             : 
    2163          36 :          CALL cp_fm_release(fm_work_PQ)
    2164             :       ELSE
    2165             :          CALL parallel_gemm(transa="N", transb="T", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=1.0_dp, &
    2166             :                             matrix_a=fm_PQ, matrix_b=operator_half, beta=0.0_dp, &
    2167           4 :                             matrix_c=fm_work_PQ)
    2168             : 
    2169             :          CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=my_scale, &
    2170             :                             matrix_a=operator_half, matrix_b=fm_work_PQ, beta=0.0_dp, &
    2171           4 :                             matrix_c=fm_PQ)
    2172           4 :          CALL cp_fm_release(operator_half)
    2173             : 
    2174           4 :          CALL cp_fm_create(fm_work_PQ_2, fm_PQ%matrix_struct, name="fm_Gamma_PQ_2")
    2175             :          CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=-my_scale, &
    2176             :                             matrix_a=fm_PQ_half, matrix_b=fm_work_PQ, beta=0.0_dp, &
    2177           4 :                             matrix_c=fm_work_PQ_2)
    2178           4 :          CALL cp_fm_to_fm(fm_work_PQ_2, fm_PQ_2)
    2179           4 :          CALL cp_fm_geadd(1.0_dp, "T", fm_work_PQ_2, 1.0_dp, fm_PQ_2)
    2180           4 :          CALL cp_fm_release(fm_work_PQ_2)
    2181           4 :          CALL cp_fm_release(fm_work_PQ)
    2182             :       END IF
    2183             : 
    2184         160 :       ALLOCATE (mp2_env%ri_grad%Gamma_PQ(my_P_size, my_group_L_size))
    2185             :       CALL fm2array(mp2_env%ri_grad%Gamma_PQ, &
    2186             :                     my_P_size, my_P_start, my_P_end, &
    2187             :                     my_group_L_size, my_group_L_start, my_group_L_end, &
    2188             :                     group_grid_2_mepos, mepos_2_grid_group, &
    2189             :                     para_env_sub%num_pe, ngroup, &
    2190          40 :                     fm_PQ)
    2191             : 
    2192          40 :       IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
    2193          12 :          ALLOCATE (mp2_env%ri_grad%Gamma_PQ_2(my_P_size, my_group_L_size))
    2194             :          CALL fm2array(mp2_env%ri_grad%Gamma_PQ_2, my_P_size, my_P_start, my_P_end, &
    2195             :                        my_group_L_size, my_group_L_start, my_group_L_end, &
    2196             :                        group_grid_2_mepos, mepos_2_grid_group, &
    2197             :                        para_env_sub%num_pe, ngroup, &
    2198           4 :                        fm_PQ_2)
    2199             :       END IF
    2200             : 
    2201             : ! Now, Gamma_Pia
    2202        2644 :       ALLOCATE (mp2_env%ri_grad%G_P_ia(my_group_L_size, nspins))
    2203          88 :       DO ispin = 1, nspins
    2204        2524 :       DO iiB = 1, my_group_L_size
    2205        2484 :          NULLIFY (mp2_env%ri_grad%G_P_ia(iiB, ispin)%matrix)
    2206             :       END DO
    2207             :       END DO
    2208             : 
    2209             :       ! Redistribute the Y matrix
    2210          88 :       DO ispin = 1, nspins
    2211             :          ! Collect all data of columns for the own sub group locally
    2212          48 :          CALL cp_fm_get_info(rpa_grad%fm_Y(ispin), ncol_global=dimen_ia)
    2213             : 
    2214          48 :          CALL get_qs_env(qs_env, blacs_env=blacs_env)
    2215             : 
    2216          48 :          NULLIFY (fm_struct)
    2217          48 :          CALL cp_fm_struct_create(fm_struct, template_fmstruct=fm_PQ_half%matrix_struct, nrow_global=dimen_ia)
    2218          48 :          CALL cp_fm_create(fm_Y, fm_struct)
    2219          48 :          CALL cp_fm_struct_release(fm_struct)
    2220          48 :          CALL cp_fm_set_all(fm_Y, 0.0_dp)
    2221             : 
    2222          48 :          CALL dereplicate_and_sum_fm(rpa_grad%fm_Y(ispin), fm_Y)
    2223             : 
    2224          48 :          CALL cp_fm_create(fm_G_P_ia, fm_Y%matrix_struct)
    2225          48 :          CALL cp_fm_set_all(fm_G_P_ia, 0.0_dp)
    2226             : 
    2227             :          CALL parallel_gemm(transa="N", transb="T", m=dimen_ia, n=dimen_RI, k=dimen_RI, alpha=my_scale, &
    2228             :                             matrix_a=fm_Y, matrix_b=fm_PQ_half, beta=0.0_dp, &
    2229          48 :                             matrix_c=fm_G_P_ia)
    2230             : 
    2231          48 :          CALL cp_fm_release(fm_Y)
    2232             : 
    2233          48 :          CALL create_group_dist(gd_ia, para_env_sub%num_pe, dimen_ia)
    2234          48 :          CALL get_group_dist(gd_ia, para_env_sub%mepos, my_ia_start, my_ia_end, my_ia_size)
    2235             : 
    2236             :          CALL fm2array(Gamma_2D, my_ia_size, my_ia_start, my_ia_end, &
    2237             :                        my_group_L_size, my_group_L_start, my_group_L_end, &
    2238             :                        group_grid_2_mepos, mepos_2_grid_group, &
    2239             :                        para_env_sub%num_pe, ngroup, &
    2240          48 :                        fm_G_P_ia)
    2241             : 
    2242             :          ! create the Gamma_ia_P in DBCSR style
    2243             :          CALL create_dbcsr_gamma(Gamma_2D, homo(ispin), virtual(ispin), dimen_ia, para_env_sub, &
    2244             :                                  my_ia_start, my_ia_end, my_group_L_size, gd_ia, &
    2245          48 :                                  mp2_env%ri_grad%G_P_ia(:, ispin), mp2_env%ri_grad%mo_coeff_o(ispin)%matrix)
    2246             : 
    2247         328 :          CALL release_group_dist(gd_ia)
    2248             : 
    2249             :       END DO
    2250          40 :       DEALLOCATE (rpa_grad%fm_Y)
    2251          40 :       CALL cp_fm_release(fm_PQ_half)
    2252             : 
    2253          40 :       CALL timestop(handle)
    2254             : 
    2255         320 :    END SUBROUTINE rpa_grad_finalize
    2256             : 
    2257             : ! **************************************************************************************************
    2258             : !> \brief ...
    2259             : !> \param sos_mp2_work_occ ...
    2260             : !> \param sos_mp2_work_virt ...
    2261             : !> \param para_env ...
    2262             : !> \param para_env_sub ...
    2263             : !> \param homo ...
    2264             : !> \param virtual ...
    2265             : !> \param mp2_env ...
    2266             : ! **************************************************************************************************
    2267          20 :    SUBROUTINE sos_mp2_grad_finalize(sos_mp2_work_occ, sos_mp2_work_virt, para_env, para_env_sub, homo, virtual, mp2_env)
    2268             :       TYPE(sos_mp2_grad_work_type), ALLOCATABLE, &
    2269             :          DIMENSION(:), INTENT(INOUT)                     :: sos_mp2_work_occ, sos_mp2_work_virt
    2270             :       TYPE(mp_para_env_type), INTENT(IN), POINTER        :: para_env, para_env_sub
    2271             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
    2272             :       TYPE(mp2_type), INTENT(INOUT)                      :: mp2_env
    2273             : 
    2274             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'sos_mp2_grad_finalize'
    2275             : 
    2276             :       INTEGER                                            :: ab_counter, handle, ij_counter, ispin, &
    2277             :                                                             itmp(2), my_a, my_b, my_B_end, &
    2278             :                                                             my_B_size, my_B_start, my_i, my_j, &
    2279             :                                                             nspins, pcol
    2280             :       REAL(KIND=dp)                                      :: my_scale
    2281             : 
    2282          20 :       CALL timeset(routineN, handle)
    2283             : 
    2284          20 :       nspins = SIZE(sos_mp2_work_occ)
    2285          20 :       my_scale = mp2_env%scale_s
    2286             : 
    2287          44 :       DO ispin = 1, nspins
    2288          48 :          DO pcol = 0, SIZE(sos_mp2_work_occ(ispin)%index2send, 1) - 1
    2289          24 :             IF (ALLOCATED(sos_mp2_work_occ(ispin)%index2send(pcol)%array)) &
    2290           4 :                DEALLOCATE (sos_mp2_work_occ(ispin)%index2send(pcol)%array)
    2291          24 :             IF (ALLOCATED(sos_mp2_work_occ(ispin)%index2send(pcol)%array)) &
    2292           0 :                DEALLOCATE (sos_mp2_work_occ(ispin)%index2send(pcol)%array)
    2293          24 :             IF (ALLOCATED(sos_mp2_work_virt(ispin)%index2recv(pcol)%array)) &
    2294           4 :                DEALLOCATE (sos_mp2_work_virt(ispin)%index2recv(pcol)%array)
    2295          24 :             IF (ALLOCATED(sos_mp2_work_virt(ispin)%index2recv(pcol)%array)) &
    2296          24 :                DEALLOCATE (sos_mp2_work_virt(ispin)%index2recv(pcol)%array)
    2297             :          END DO
    2298           0 :          DEALLOCATE (sos_mp2_work_occ(ispin)%index2send, &
    2299           0 :                      sos_mp2_work_occ(ispin)%index2recv, &
    2300           0 :                      sos_mp2_work_virt(ispin)%index2send, &
    2301         140 :                      sos_mp2_work_virt(ispin)%index2recv)
    2302             :       END DO
    2303             : 
    2304             :       ! Sum P_ij and P_ab and redistribute them
    2305          44 :       DO ispin = 1, nspins
    2306          24 :          CALL para_env%sum(sos_mp2_work_occ(ispin)%P)
    2307             : 
    2308          96 :          ALLOCATE (mp2_env%ri_grad%P_ij(ispin)%array(homo(ispin), homo(ispin)))
    2309         472 :          mp2_env%ri_grad%P_ij(ispin)%array = 0.0_dp
    2310         116 :          DO my_i = 1, homo(ispin)
    2311         116 :             mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_i) = my_scale*sos_mp2_work_occ(ispin)%P(my_i)
    2312             :          END DO
    2313          72 :          DO ij_counter = 1, SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)
    2314          48 :             my_i = sos_mp2_work_occ(ispin)%pair_list(1, ij_counter)
    2315          48 :             my_j = sos_mp2_work_occ(ispin)%pair_list(2, ij_counter)
    2316             : 
    2317          72 :             mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) = my_scale*sos_mp2_work_occ(ispin)%P(homo(ispin) + ij_counter)
    2318             :          END DO
    2319          24 :          DEALLOCATE (sos_mp2_work_occ(ispin)%P, sos_mp2_work_occ(ispin)%pair_list)
    2320             : 
    2321             :          ! Symmetrize P_ij
    2322             :          mp2_env%ri_grad%P_ij(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ij(ispin)%array + &
    2323         920 :                                                            TRANSPOSE(mp2_env%ri_grad%P_ij(ispin)%array))
    2324             : 
    2325             :          ! The first index of P_ab has to be distributed within the subgroups, so sum it up first and add the required elements later
    2326          24 :          CALL para_env%sum(sos_mp2_work_virt(ispin)%P)
    2327             : 
    2328          24 :          itmp = get_limit(virtual(ispin), para_env_sub%num_pe, para_env_sub%mepos)
    2329          24 :          my_B_size = itmp(2) - itmp(1) + 1
    2330          24 :          my_B_start = itmp(1)
    2331          24 :          my_B_end = itmp(2)
    2332             : 
    2333          96 :          ALLOCATE (mp2_env%ri_grad%P_ab(ispin)%array(my_B_size, virtual(ispin)))
    2334       10382 :          mp2_env%ri_grad%P_ab(ispin)%array = 0.0_dp
    2335         486 :          DO my_a = itmp(1), itmp(2)
    2336         486 :             mp2_env%ri_grad%P_ab(ispin)%array(my_a - itmp(1) + 1, my_a) = my_scale*sos_mp2_work_virt(ispin)%P(my_a)
    2337             :          END DO
    2338         636 :          DO ab_counter = 1, SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)
    2339         612 :             my_a = sos_mp2_work_virt(ispin)%pair_list(1, ab_counter)
    2340         612 :             my_b = sos_mp2_work_virt(ispin)%pair_list(2, ab_counter)
    2341             : 
    2342         612 :             IF (my_a >= itmp(1) .AND. my_a <= itmp(2)) mp2_env%ri_grad%P_ab(ispin)%array(my_a - itmp(1) + 1, my_b) = &
    2343         636 :                my_scale*sos_mp2_work_virt(ispin)%P(virtual(ispin) + ab_counter)
    2344             :          END DO
    2345             : 
    2346          24 :          DEALLOCATE (sos_mp2_work_virt(ispin)%P, sos_mp2_work_virt(ispin)%pair_list)
    2347             : 
    2348             :          ! Symmetrize P_ab
    2349          44 :          IF (para_env_sub%num_pe > 1) THEN
    2350          12 :             BLOCK
    2351             :                INTEGER :: send_a_start, send_a_end, send_a_size, &
    2352             :                           recv_a_start, recv_a_end, recv_a_size, proc_shift, proc_send, proc_recv
    2353           4 :                REAL(KIND=dp), DIMENSION(:), ALLOCATABLE, TARGET :: buffer_send_1D
    2354           4 :                REAL(KIND=dp), DIMENSION(:, :), POINTER :: buffer_send
    2355           4 :                REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE :: buffer_recv
    2356           4 :                TYPE(group_dist_d1_type)                           :: gd_virtual_sub
    2357             : 
    2358           4 :                CALL create_group_dist(gd_virtual_sub, para_env_sub%num_pe, virtual(ispin))
    2359             : 
    2360             :                mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) = &
    2361             :                   0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) &
    2362         804 :                           + TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end)))
    2363             : 
    2364          12 :                ALLOCATE (buffer_send_1D(my_B_size*maxsize(gd_virtual_sub)))
    2365          16 :                ALLOCATE (buffer_recv(my_B_size, maxsize(gd_virtual_sub)))
    2366             : 
    2367           8 :                DO proc_shift = 1, para_env_sub%num_pe - 1
    2368             : 
    2369           4 :                   proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2370           4 :                   proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2371             : 
    2372           4 :                   CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end, send_a_size)
    2373           4 :                   CALL get_group_dist(gd_virtual_sub, proc_recv, recv_a_start, recv_a_end, recv_a_size)
    2374             : 
    2375           4 :                   buffer_send(1:send_a_size, 1:my_B_size) => buffer_send_1D(1:my_B_size*send_a_size)
    2376             : 
    2377         402 :                   buffer_send(:send_a_size, :) = TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, send_a_start:send_a_end))
    2378             :                   CALL para_env_sub%sendrecv(buffer_send(:send_a_size, :), proc_send, &
    2379         402 :                                              buffer_recv(:, :recv_a_size), proc_recv)
    2380             : 
    2381             :                   mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) = &
    2382         410 :                      0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) + buffer_recv(:, 1:recv_a_size))
    2383             : 
    2384             :                END DO
    2385             : 
    2386           4 :                DEALLOCATE (buffer_send_1D, buffer_recv)
    2387             : 
    2388          16 :                CALL release_group_dist(gd_virtual_sub)
    2389             :             END BLOCK
    2390             :          ELSE
    2391             :             mp2_env%ri_grad%P_ab(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array + &
    2392       19140 :                                                               TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array))
    2393             :          END IF
    2394             : 
    2395             :       END DO
    2396          68 :       DEALLOCATE (sos_mp2_work_occ, sos_mp2_work_virt)
    2397          20 :       IF (nspins == 1) THEN
    2398         336 :          mp2_env%ri_grad%P_ij(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ij(1)%array
    2399        5374 :          mp2_env%ri_grad%P_ab(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ab(1)%array
    2400             :       END IF
    2401             : 
    2402          20 :       CALL timestop(handle)
    2403             : 
    2404          20 :    END SUBROUTINE
    2405             : 
    2406             : ! **************************************************************************************************
    2407             : !> \brief ...
    2408             : !> \param rpa_work ...
    2409             : !> \param mp2_env ...
    2410             : !> \param homo ...
    2411             : !> \param virtual ...
    2412             : !> \param para_env ...
    2413             : !> \param para_env_sub ...
    2414             : ! **************************************************************************************************
    2415          20 :    SUBROUTINE rpa_grad_work_finalize(rpa_work, mp2_env, homo, virtual, para_env, para_env_sub)
    2416             :       TYPE(rpa_grad_work_type), INTENT(INOUT)            :: rpa_work
    2417             :       TYPE(mp2_type), INTENT(INOUT)                      :: mp2_env
    2418             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
    2419             :       TYPE(mp_para_env_type), INTENT(IN), POINTER        :: para_env, para_env_sub
    2420             : 
    2421             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_grad_work_finalize'
    2422             : 
    2423             :       INTEGER :: handle, ispin, itmp(2), my_a_end, my_a_size, my_a_start, my_B_end, my_B_size, &
    2424             :          my_B_start, my_i_end, my_i_size, my_i_start, nspins, proc, proc_recv, proc_send, &
    2425             :          proc_shift, recv_a_end, recv_a_size, recv_a_start, recv_end, recv_start, send_a_end, &
    2426             :          send_a_size, send_a_start, send_end, send_start, size_recv_buffer, size_send_buffer
    2427             :       REAL(KIND=dp)                                      :: my_scale
    2428          20 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: buffer_recv, buffer_send
    2429          20 :       TYPE(group_dist_d1_type)                           :: gd_a_sub, gd_virtual_sub
    2430             : 
    2431          20 :       CALL timeset(routineN, handle)
    2432             : 
    2433          20 :       nspins = SIZE(homo)
    2434          20 :       my_scale = mp2_env%ri_rpa%scale_rpa/(2.0_dp*pi)
    2435          20 :       IF (mp2_env%ri_rpa%minimax_quad) my_scale = my_scale/2.0_dp
    2436             : 
    2437          20 :       CALL cp_fm_release(rpa_work%fm_mat_Q_copy)
    2438             : 
    2439          44 :       DO ispin = 1, nspins
    2440          68 :       DO proc = 0, SIZE(rpa_work%index2send, 1) - 1
    2441          24 :          IF (ALLOCATED(rpa_work%index2send(proc, ispin)%array)) DEALLOCATE (rpa_work%index2send(proc, ispin)%array)
    2442          48 :          IF (ALLOCATED(rpa_work%index2recv(proc, ispin)%array)) DEALLOCATE (rpa_work%index2recv(proc, ispin)%array)
    2443             :       END DO
    2444             :       END DO
    2445          68 :       DEALLOCATE (rpa_work%index2send, rpa_work%index2recv)
    2446             : 
    2447          44 :       DO ispin = 1, nspins
    2448          24 :          CALL get_group_dist(rpa_work%gd_homo(ispin), rpa_work%mepos(2), my_i_start, my_i_end, my_i_size)
    2449          24 :          CALL release_group_dist(rpa_work%gd_homo(ispin))
    2450             : 
    2451          96 :          ALLOCATE (mp2_env%ri_grad%P_ij(ispin)%array(homo(ispin), homo(ispin)))
    2452         472 :          mp2_env%ri_grad%P_ij(ispin)%array = 0.0_dp
    2453         472 :          mp2_env%ri_grad%P_ij(ispin)%array(my_i_start:my_i_end, :) = my_scale*rpa_work%P_ij(ispin)%array
    2454          24 :          DEALLOCATE (rpa_work%P_ij(ispin)%array)
    2455          24 :          CALL para_env%sum(mp2_env%ri_grad%P_ij(ispin)%array)
    2456             : 
    2457             :          ! Symmetrize P_ij
    2458             :          mp2_env%ri_grad%P_ij(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ij(ispin)%array + &
    2459         920 :                                                            TRANSPOSE(mp2_env%ri_grad%P_ij(ispin)%array))
    2460             : 
    2461          24 :          itmp = get_limit(virtual(ispin), para_env_sub%num_pe, para_env_sub%mepos)
    2462          24 :          my_B_start = itmp(1)
    2463          24 :          my_B_end = itmp(2)
    2464          24 :          my_B_size = my_B_end - my_B_start + 1
    2465             : 
    2466          96 :          ALLOCATE (mp2_env%ri_grad%P_ab(ispin)%array(my_B_size, virtual(ispin)))
    2467       10382 :          mp2_env%ri_grad%P_ab(ispin)%array = 0.0_dp
    2468             : 
    2469          24 :          CALL get_group_dist(rpa_work%gd_virtual(ispin), rpa_work%mepos(1), my_a_start, my_a_end, my_a_size)
    2470          24 :          CALL release_group_dist(rpa_work%gd_virtual(ispin))
    2471             :          ! This group dist contains the info which parts of Pab a process currently owns
    2472          24 :          CALL create_group_dist(gd_a_sub, my_a_start, my_a_end, my_a_size, para_env_sub)
    2473             :          ! This group dist contains the info which parts of Pab a process is supposed to own later
    2474          24 :          CALL create_group_dist(gd_virtual_sub, para_env_sub%num_pe, virtual(ispin))
    2475             : 
    2476             :          ! Calculate local indices of the common range of own matrix and send process
    2477          24 :          send_start = MAX(1, my_B_start - my_a_start + 1)
    2478          24 :          send_end = MIN(my_a_size, my_B_end - my_a_start + 1)
    2479             : 
    2480             :          ! Same for recv process but with reverse positions
    2481          24 :          recv_start = MAX(1, my_a_start - my_B_start + 1)
    2482          24 :          recv_end = MIN(my_B_size, my_a_end - my_B_start + 1)
    2483             : 
    2484             :          mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) = &
    2485       10382 :             my_scale*rpa_work%P_ab(ispin)%array(send_start:send_end, :)
    2486             : 
    2487          24 :          IF (para_env_sub%num_pe > 1) THEN
    2488             :             size_send_buffer = 0
    2489             :             size_recv_buffer = 0
    2490           8 :             DO proc_shift = 1, para_env_sub%num_pe - 1
    2491           4 :                proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2492           4 :                proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2493             : 
    2494           4 :                CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end)
    2495           4 :                CALL get_group_dist(gd_a_sub, proc_recv, recv_a_start, recv_a_end)
    2496             : 
    2497             :                ! Calculate local indices of the common range of own matrix and send process
    2498           4 :                send_start = MAX(1, send_a_start - my_a_start + 1)
    2499           4 :                send_end = MIN(my_a_size, send_a_end - my_a_start + 1)
    2500             : 
    2501           4 :                size_send_buffer = MAX(size_send_buffer, MAX(send_end - send_start + 1, 0))
    2502             : 
    2503             :                ! Same for recv process but with reverse positions
    2504           4 :                recv_start = MAX(1, recv_a_start - my_B_start + 1)
    2505           4 :                recv_end = MIN(my_B_size, recv_a_end - my_B_start + 1)
    2506             : 
    2507           8 :                size_recv_buffer = MAX(size_recv_buffer, MAX(recv_end - recv_start + 1, 0))
    2508             :             END DO
    2509          28 :             ALLOCATE (buffer_send(size_send_buffer, virtual(ispin)), buffer_recv(size_recv_buffer, virtual(ispin)))
    2510             : 
    2511           8 :             DO proc_shift = 1, para_env_sub%num_pe - 1
    2512           4 :                proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2513           4 :                proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2514             : 
    2515           4 :                CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end)
    2516           4 :                CALL get_group_dist(gd_a_sub, proc_recv, recv_a_start, recv_a_end)
    2517             : 
    2518             :                ! Calculate local indices of the common range of own matrix and send process
    2519           4 :                send_start = MAX(1, send_a_start - my_a_start + 1)
    2520           4 :                send_end = MIN(my_a_size, send_a_end - my_a_start + 1)
    2521         802 :                buffer_send(1:MAX(send_end - send_start + 1, 0), :) = rpa_work%P_ab(ispin)%array(send_start:send_end, :)
    2522             : 
    2523             :                ! Same for recv process but with reverse positions
    2524           4 :                recv_start = MAX(1, recv_a_start - my_B_start + 1)
    2525           4 :                recv_end = MIN(my_B_size, recv_a_end - my_B_start + 1)
    2526             : 
    2527             :                CALL para_env_sub%sendrecv(buffer_send(1:MAX(send_end - send_start + 1, 0), :), proc_send, &
    2528        1600 :                                           buffer_recv(1:MAX(recv_end - recv_start + 1, 0), :), proc_recv)
    2529             : 
    2530             :                mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) = &
    2531             :                   mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) + &
    2532         810 :                   my_scale*buffer_recv(1:MAX(recv_end - recv_start + 1, 0), :)
    2533             : 
    2534             :             END DO
    2535             : 
    2536           4 :             IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
    2537           4 :             IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
    2538             :          END IF
    2539          24 :          DEALLOCATE (rpa_work%P_ab(ispin)%array)
    2540             : 
    2541          24 :          CALL release_group_dist(gd_a_sub)
    2542             : 
    2543             :          BLOCK
    2544             :             TYPE(mp_comm_type) :: comm_exchange
    2545          24 :             CALL comm_exchange%from_split(para_env, para_env_sub%mepos)
    2546          24 :             CALL comm_exchange%sum(mp2_env%ri_grad%P_ab(ispin)%array)
    2547          48 :             CALL comm_exchange%free()
    2548             :          END BLOCK
    2549             : 
    2550             :          ! Symmetrize P_ab
    2551          24 :          IF (para_env_sub%num_pe > 1) THEN
    2552             :             BLOCK
    2553           4 :                REAL(KIND=dp), DIMENSION(:), ALLOCATABLE, TARGET :: buffer_send_1D
    2554           4 :                REAL(KIND=dp), DIMENSION(:, :), POINTER :: buffer_send
    2555           4 :                REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE :: buffer_recv
    2556             : 
    2557             :                mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) = &
    2558             :                   0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) &
    2559         804 :                           + TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end)))
    2560             : 
    2561          12 :                ALLOCATE (buffer_send_1D(my_B_size*maxsize(gd_virtual_sub)))
    2562          16 :                ALLOCATE (buffer_recv(my_B_size, maxsize(gd_virtual_sub)))
    2563             : 
    2564           8 :                DO proc_shift = 1, para_env_sub%num_pe - 1
    2565             : 
    2566           4 :                   proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2567           4 :                   proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2568             : 
    2569           4 :                   CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end, send_a_size)
    2570           4 :                   CALL get_group_dist(gd_virtual_sub, proc_recv, recv_a_start, recv_a_end, recv_a_size)
    2571             : 
    2572           4 :                   buffer_send(1:send_a_size, 1:my_B_size) => buffer_send_1D(1:my_B_size*send_a_size)
    2573             : 
    2574         402 :                   buffer_send(:send_a_size, :) = TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, send_a_start:send_a_end))
    2575             :                   CALL para_env_sub%sendrecv(buffer_send(:send_a_size, :), proc_send, &
    2576         402 :                                              buffer_recv(:, :recv_a_size), proc_recv)
    2577             : 
    2578             :                   mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) = &
    2579         410 :                      0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) + buffer_recv(:, 1:recv_a_size))
    2580             : 
    2581             :                END DO
    2582             : 
    2583           8 :                DEALLOCATE (buffer_send_1D, buffer_recv)
    2584             :             END BLOCK
    2585             :          ELSE
    2586             :             mp2_env%ri_grad%P_ab(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array + &
    2587       19140 :                                                               TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array))
    2588             :          END IF
    2589             : 
    2590          92 :          CALL release_group_dist(gd_virtual_sub)
    2591             : 
    2592             :       END DO
    2593         116 :       DEALLOCATE (rpa_work%gd_homo, rpa_work%gd_virtual, rpa_work%P_ij, rpa_work%P_ab)
    2594          20 :       IF (nspins == 1) THEN
    2595         336 :          mp2_env%ri_grad%P_ij(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ij(1)%array
    2596        5374 :          mp2_env%ri_grad%P_ab(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ab(1)%array
    2597             :       END IF
    2598             : 
    2599          20 :       CALL timestop(handle)
    2600          20 :    END SUBROUTINE
    2601             : 
    2602             : ! **************************************************************************************************
    2603             : !> \brief Dereplicate data from fm_sub and collect in fm_global, overlapping data will be added
    2604             : !> \param fm_sub replicated matrix, all subgroups have the same size, will be release on output
    2605             : !> \param fm_global global matrix, on output it will contain the sum of the replicated matrices redistributed
    2606             : ! **************************************************************************************************
    2607          88 :    SUBROUTINE dereplicate_and_sum_fm(fm_sub, fm_global)
    2608             :       TYPE(cp_fm_type), INTENT(INOUT)                    :: fm_sub, fm_global
    2609             : 
    2610             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dereplicate_and_sum_fm'
    2611             : 
    2612             :       INTEGER :: col_local, elements2recv_col, elements2recv_row, elements2send_col, &
    2613             :          elements2send_row, first_p_pos_global(2), first_p_pos_sub(2), handle, handle2, &
    2614             :          mypcol_global, myprow_global, ncol_block_global, ncol_block_sub, ncol_local_global, &
    2615             :          ncol_local_sub, npcol_global, npcol_sub, nprow_global, nprow_sub, nrow_block_global, &
    2616             :          nrow_block_sub, nrow_local_global, nrow_local_sub, pcol_recv, pcol_send, proc_recv, &
    2617             :          proc_send, proc_send_global, proc_shift, prow_recv, prow_send, row_local, tag
    2618             :       INTEGER(int_8)                                     :: size_recv_buffer, size_send_buffer
    2619          88 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: data2recv_col, data2recv_row, &
    2620          88 :                                                             data2send_col, data2send_row, &
    2621          88 :                                                             subgroup2mepos
    2622          88 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices_global, col_indices_sub, &
    2623          88 :                                                             row_indices_global, row_indices_sub
    2624          88 :       INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi_global, blacs2mpi_sub, &
    2625          88 :                                                             mpi2blacs_global, mpi2blacs_sub
    2626          88 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET   :: recv_buffer_1D, send_buffer_1D
    2627          88 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: recv_buffer, send_buffer
    2628             :       TYPE(mp_para_env_type), POINTER                    :: para_env, para_env_sub
    2629          88 :       TYPE(one_dim_int_array), ALLOCATABLE, DIMENSION(:) :: index2recv_col, index2recv_row, &
    2630          88 :                                                             index2send_col, index2send_row
    2631             : 
    2632          88 :       CALL timeset(routineN, handle)
    2633             : 
    2634          88 :       tag = 1
    2635             : 
    2636          88 :       nprow_sub = fm_sub%matrix_struct%context%num_pe(1)
    2637          88 :       npcol_sub = fm_sub%matrix_struct%context%num_pe(2)
    2638             : 
    2639          88 :       myprow_global = fm_global%matrix_struct%context%mepos(1)
    2640          88 :       mypcol_global = fm_global%matrix_struct%context%mepos(2)
    2641          88 :       nprow_global = fm_global%matrix_struct%context%num_pe(1)
    2642          88 :       npcol_global = fm_global%matrix_struct%context%num_pe(2)
    2643             : 
    2644             :       CALL cp_fm_get_info(fm_sub, col_indices=col_indices_sub, row_indices=row_indices_sub, &
    2645          88 :                           nrow_local=nrow_local_sub, ncol_local=ncol_local_sub)
    2646             :       CALL cp_fm_struct_get(fm_sub%matrix_struct, para_env=para_env_sub, first_p_pos=first_p_pos_sub, &
    2647          88 :                             nrow_block=nrow_block_sub, ncol_block=ncol_block_sub)
    2648             :       CALL cp_fm_struct_get(fm_global%matrix_struct, first_p_pos=first_p_pos_global, nrow_block=nrow_block_global, &
    2649             :                             ncol_block=ncol_block_global, para_env=para_env, &
    2650             :                             col_indices=col_indices_global, row_indices=row_indices_global, &
    2651          88 :                             nrow_local=nrow_local_global, ncol_local=ncol_local_global)
    2652          88 :       CALL fm_sub%matrix_struct%context%get(blacs2mpi=blacs2mpi_sub, mpi2blacs=mpi2blacs_sub)
    2653          88 :       CALL fm_global%matrix_struct%context%get(blacs2mpi=blacs2mpi_global, mpi2blacs=mpi2blacs_global)
    2654             : 
    2655          88 :       IF (para_env%num_pe /= para_env_sub%num_pe) THEN
    2656             :          BLOCK
    2657             :             TYPE(mp_comm_type) :: comm_exchange
    2658          64 :             comm_exchange = fm_sub%matrix_struct%context%interconnect(para_env)
    2659          64 :             CALL comm_exchange%sum(fm_sub%local_data)
    2660         128 :             CALL comm_exchange%free()
    2661             :          END BLOCK
    2662             :       END IF
    2663             : 
    2664         264 :       ALLOCATE (subgroup2mepos(0:para_env_sub%num_pe - 1))
    2665          88 :       CALL para_env_sub%allgather(para_env%mepos, subgroup2mepos)
    2666             : 
    2667          88 :       CALL timeset(routineN//"_data2", handle2)
    2668             :       ! Create a map how much data has to be sent to what process coordinate, interchange rows and columns to transpose the matrices
    2669          88 :       CALL get_elements2send(data2send_col, npcol_global, row_indices_sub, ncol_block_global, first_p_pos_global(2), index2send_col)
    2670          88 :       CALL get_elements2send(data2send_row, nprow_global, col_indices_sub, nrow_block_global, first_p_pos_global(1), index2send_row)
    2671             : 
    2672             :       ! Create a map how much data has to be sent to what process coordinate, interchange rows and columns to transpose the matrices
    2673             :       ! Do the reverse for the recieve processes
    2674          88 :       CALL get_elements2send(data2recv_col, npcol_sub, row_indices_global, ncol_block_sub, first_p_pos_sub(2), index2recv_col)
    2675          88 :       CALL get_elements2send(data2recv_row, nprow_sub, col_indices_global, nrow_block_sub, first_p_pos_sub(1), index2recv_row)
    2676          88 :       CALL timestop(handle2)
    2677             : 
    2678          88 :       CALL timeset(routineN//"_local", handle2)
    2679             :       ! Loop over local data and transpose
    2680          88 :       prow_send = mpi2blacs_global(1, para_env%mepos)
    2681          88 :       pcol_send = mpi2blacs_global(2, para_env%mepos)
    2682          88 :       prow_recv = mpi2blacs_sub(1, para_env_sub%mepos)
    2683          88 :       pcol_recv = mpi2blacs_sub(2, para_env_sub%mepos)
    2684          88 :       elements2recv_col = data2recv_col(pcol_recv)
    2685          88 :       elements2recv_row = data2recv_row(prow_recv)
    2686             : 
    2687             : !$OMP    PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
    2688             : !$OMP                SHARED(elements2recv_col,elements2recv_row,recv_buffer,fm_global,&
    2689             : !$OMP                       index2recv_col,index2recv_row,pcol_recv,prow_recv, &
    2690          88 : !$OMP                       fm_sub,index2send_col,index2send_row,pcol_send,prow_send)
    2691             :       DO col_local = 1, elements2recv_col
    2692             :          DO row_local = 1, elements2recv_row
    2693             :             fm_global%local_data(index2recv_col(pcol_recv)%array(col_local), &
    2694             :                                  index2recv_row(prow_recv)%array(row_local)) &
    2695             :                = fm_sub%local_data(index2send_col(pcol_send)%array(row_local), &
    2696             :                                    index2send_row(prow_send)%array(col_local))
    2697             :          END DO
    2698             :       END DO
    2699             : !$OMP    END PARALLEL DO
    2700          88 :       CALL timestop(handle2)
    2701             : 
    2702          88 :       IF (para_env_sub%num_pe > 1) THEN
    2703             :          size_send_buffer = 0_int_8
    2704             :          size_recv_buffer = 0_int_8
    2705             :          ! Loop over all processes in para_env_sub
    2706          48 :          DO proc_shift = 1, para_env_sub%num_pe - 1
    2707          24 :             proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2708          24 :             proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2709             : 
    2710          24 :             proc_send_global = subgroup2mepos(proc_send)
    2711          24 :             prow_send = mpi2blacs_global(1, proc_send_global)
    2712          24 :             pcol_send = mpi2blacs_global(2, proc_send_global)
    2713          24 :             elements2send_col = data2send_col(pcol_send)
    2714          24 :             elements2send_row = data2send_row(prow_send)
    2715             : 
    2716          24 :             size_send_buffer = MAX(size_send_buffer, INT(elements2send_col, int_8)*elements2send_row)
    2717             : 
    2718          24 :             prow_recv = mpi2blacs_sub(1, proc_recv)
    2719          24 :             pcol_recv = mpi2blacs_sub(2, proc_recv)
    2720          24 :             elements2recv_col = data2recv_col(pcol_recv)
    2721          24 :             elements2recv_row = data2recv_row(prow_recv)
    2722             : 
    2723          48 :             size_recv_buffer = MAX(size_recv_buffer, INT(elements2recv_col, int_8)*elements2recv_row)
    2724             :          END DO
    2725         120 :          ALLOCATE (send_buffer_1D(size_send_buffer), recv_buffer_1D(size_recv_buffer))
    2726             : 
    2727             :          ! Loop over all processes in para_env_sub
    2728          48 :          DO proc_shift = 1, para_env_sub%num_pe - 1
    2729          24 :             proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2730          24 :             proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2731             : 
    2732          24 :             proc_send_global = subgroup2mepos(proc_send)
    2733          24 :             prow_send = mpi2blacs_global(1, proc_send_global)
    2734          24 :             pcol_send = mpi2blacs_global(2, proc_send_global)
    2735          24 :             elements2send_col = data2send_col(pcol_send)
    2736          24 :             elements2send_row = data2send_row(prow_send)
    2737             : 
    2738          24 :             CALL timeset(routineN//"_pack", handle2)
    2739             :             ! Loop over local data and pack the buffer
    2740             :             ! Transpose the matrix already
    2741          24 :           send_buffer(1:elements2send_row, 1:elements2send_col) => send_buffer_1D(1:INT(elements2send_row, int_8)*elements2send_col)
    2742             : !$OMP    PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
    2743             : !$OMP                SHARED(elements2send_col,elements2send_row,send_buffer,fm_sub,&
    2744          24 : !$OMP                       index2send_col,index2send_row,pcol_send,prow_send)
    2745             :             DO row_local = 1, elements2send_col
    2746             :                DO col_local = 1, elements2send_row
    2747             :                   send_buffer(col_local, row_local) = &
    2748             :                      fm_sub%local_data(index2send_col(pcol_send)%array(row_local), &
    2749             :                                        index2send_row(prow_send)%array(col_local))
    2750             :                END DO
    2751             :             END DO
    2752             : !$OMP    END PARALLEL DO
    2753          24 :             CALL timestop(handle2)
    2754             : 
    2755          24 :             prow_recv = mpi2blacs_sub(1, proc_recv)
    2756          24 :             pcol_recv = mpi2blacs_sub(2, proc_recv)
    2757          24 :             elements2recv_col = data2recv_col(pcol_recv)
    2758          24 :             elements2recv_row = data2recv_row(prow_recv)
    2759             : 
    2760             :             ! Send data
    2761          24 :           recv_buffer(1:elements2recv_col, 1:elements2recv_row) => recv_buffer_1D(1:INT(elements2recv_row, int_8)*elements2recv_col)
    2762         120 :             IF (SIZE(recv_buffer) > 0_int_8) THEN
    2763          72 :             IF (SIZE(send_buffer) > 0_int_8) THEN
    2764       81192 :                CALL para_env_sub%sendrecv(send_buffer, proc_send, recv_buffer, proc_recv, tag)
    2765             :             ELSE
    2766           0 :                CALL para_env_sub%recv(recv_buffer, proc_recv, tag)
    2767             :             END IF
    2768             : 
    2769          24 :             CALL timeset(routineN//"_unpack", handle2)
    2770             : !$OMP    PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
    2771             : !$OMP                SHARED(elements2recv_col,elements2recv_row,recv_buffer,fm_global,&
    2772          24 : !$OMP                       index2recv_col,index2recv_row,pcol_recv,prow_recv)
    2773             :             DO col_local = 1, elements2recv_col
    2774             :                DO row_local = 1, elements2recv_row
    2775             :                   fm_global%local_data(index2recv_col(pcol_recv)%array(col_local), &
    2776             :                                        index2recv_row(prow_recv)%array(row_local)) &
    2777             :                      = recv_buffer(col_local, row_local)
    2778             :                END DO
    2779             :             END DO
    2780             : !$OMP    END PARALLEL DO
    2781          24 :             CALL timestop(handle2)
    2782           0 :             ELSE IF (SIZE(send_buffer) > 0_int_8) THEN
    2783           0 :             CALL para_env_sub%send(send_buffer, proc_send, tag)
    2784             :             END IF
    2785             :          END DO
    2786             :       END IF
    2787             : 
    2788          88 :       DEALLOCATE (data2send_col, data2send_row, data2recv_col, data2recv_row)
    2789         176 :       DO proc_shift = 0, npcol_global - 1
    2790         176 :          DEALLOCATE (index2send_col(proc_shift)%array)
    2791             :       END DO
    2792         176 :       DO proc_shift = 0, npcol_sub - 1
    2793         176 :          DEALLOCATE (index2recv_col(proc_shift)%array)
    2794             :       END DO
    2795         264 :       DO proc_shift = 0, nprow_global - 1
    2796         264 :          DEALLOCATE (index2send_row(proc_shift)%array)
    2797             :       END DO
    2798         200 :       DO proc_shift = 0, nprow_sub - 1
    2799         200 :          DEALLOCATE (index2recv_row(proc_shift)%array)
    2800             :       END DO
    2801         552 :       DEALLOCATE (index2send_col, index2recv_col, index2send_row, index2recv_row)
    2802             : 
    2803          88 :       CALL cp_fm_release(fm_sub)
    2804             : 
    2805          88 :       CALL timestop(handle)
    2806             : 
    2807         352 :    END SUBROUTINE dereplicate_and_sum_fm
    2808             : 
    2809             : ! **************************************************************************************************
    2810             : !> \brief ...
    2811             : !> \param data2send ...
    2812             : !> \param np_global ...
    2813             : !> \param indices_sub ...
    2814             : !> \param n_block_global ...
    2815             : !> \param first_p_pos_global ...
    2816             : !> \param index2send ...
    2817             : ! **************************************************************************************************
    2818         352 :    SUBROUTINE get_elements2send(data2send, np_global, indices_sub, n_block_global, first_p_pos_global, index2send)
    2819             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT)    :: data2send
    2820             :       INTEGER, INTENT(IN)                                :: np_global
    2821             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: indices_sub
    2822             :       INTEGER, INTENT(IN)                                :: n_block_global, first_p_pos_global
    2823             :       TYPE(one_dim_int_array), ALLOCATABLE, &
    2824             :          DIMENSION(:), INTENT(OUT)                       :: index2send
    2825             : 
    2826             :       INTEGER                                            :: i_global, i_local, proc
    2827             : 
    2828        1056 :       ALLOCATE (data2send(0:np_global - 1))
    2829         816 :       data2send = 0
    2830       25484 :       DO i_local = 1, SIZE(indices_sub)
    2831       25132 :          i_global = indices_sub(i_local)
    2832       25132 :          proc = cp_fm_indxg2p(i_global, n_block_global, 0, first_p_pos_global, np_global)
    2833       25484 :          data2send(proc) = data2send(proc) + 1
    2834             :       END DO
    2835             : 
    2836        1520 :       ALLOCATE (index2send(0:np_global - 1))
    2837         816 :       DO proc = 0, np_global - 1
    2838        1392 :          ALLOCATE (index2send(proc)%array(data2send(proc)))
    2839             :          ! We want to crash if there is an error
    2840       25948 :          index2send(proc)%array = -1
    2841             :       END DO
    2842             : 
    2843         816 :       data2send = 0
    2844       25484 :       DO i_local = 1, SIZE(indices_sub)
    2845       25132 :          i_global = indices_sub(i_local)
    2846       25132 :          proc = cp_fm_indxg2p(i_global, n_block_global, 0, first_p_pos_global, np_global)
    2847       25132 :          data2send(proc) = data2send(proc) + 1
    2848       25484 :          index2send(proc)%array(data2send(proc)) = i_local
    2849             :       END DO
    2850             : 
    2851         352 :    END SUBROUTINE
    2852             : 
    2853           0 : END MODULE rpa_grad

Generated by: LCOV version 1.15