LCOV - code coverage report
Current view: top level - src - mp2_ri_gpw.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:ccc2433) Lines: 1055 1080 97.7 %
Date: 2024-04-25 07:09:54 Functions: 14 14 100.0 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \brief Routines to calculate RI-GPW-MP2 energy using pw
      10             : !> \par History
      11             : !>      06.2012 created [Mauro Del Ben]
      12             : !>      03.2019 Refactored from mp2_ri_gpw [Frederick Stein]
      13             : ! **************************************************************************************************
      14             : MODULE mp2_ri_gpw
      15             :    USE ISO_C_BINDING,                   ONLY: C_NULL_PTR,&
      16             :                                               c_associated
      17             :    USE cp_log_handling,                 ONLY: cp_to_string
      18             :    USE dgemm_counter_types,             ONLY: dgemm_counter_init,&
      19             :                                               dgemm_counter_start,&
      20             :                                               dgemm_counter_stop,&
      21             :                                               dgemm_counter_type,&
      22             :                                               dgemm_counter_write
      23             :    USE group_dist_types,                ONLY: get_group_dist,&
      24             :                                               group_dist_d1_type,&
      25             :                                               maxsize,&
      26             :                                               release_group_dist
      27             :    USE kinds,                           ONLY: dp,&
      28             :                                               int_8
      29             :    USE libint_2c_3c,                    ONLY: compare_potential_types
      30             :    USE local_gemm_api,                  ONLY: LOCAL_GEMM_PU_GPU,&
      31             :                                               local_gemm,&
      32             :                                               local_gemm_create,&
      33             :                                               local_gemm_destroy,&
      34             :                                               local_gemm_set_op_threshold_gpu
      35             :    USE machine,                         ONLY: m_flush,&
      36             :                                               m_memory,&
      37             :                                               m_walltime
      38             :    USE message_passing,                 ONLY: mp_comm_type,&
      39             :                                               mp_para_env_type
      40             :    USE mp2_ri_grad_util,                ONLY: complete_gamma
      41             :    USE mp2_types,                       ONLY: mp2_type,&
      42             :                                               three_dim_real_array
      43             : 
      44             : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num
      45             : #include "./base/base_uses.f90"
      46             : 
      47             :    IMPLICIT NONE
      48             : 
      49             :    PRIVATE
      50             : 
      51             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'mp2_ri_gpw'
      52             : 
      53             :    PUBLIC :: mp2_ri_gpw_compute_en
      54             : 
      55             : CONTAINS
      56             : 
      57             : ! **************************************************************************************************
      58             : !> \brief ...
      59             : !> \param Emp2_Cou ...
      60             : !> \param Emp2_EX ...
      61             : !> \param Emp2_S ...
      62             : !> \param Emp2_T ...
      63             : !> \param BIb_C ...
      64             : !> \param mp2_env ...
      65             : !> \param para_env ...
      66             : !> \param para_env_sub ...
      67             : !> \param color_sub ...
      68             : !> \param gd_array ...
      69             : !> \param gd_B_virtual ...
      70             : !> \param Eigenval ...
      71             : !> \param nmo ...
      72             : !> \param homo ...
      73             : !> \param dimen_RI ...
      74             : !> \param unit_nr ...
      75             : !> \param calc_forces ...
      76             : !> \param calc_ex ...
      77             : ! **************************************************************************************************
      78        1038 :    SUBROUTINE mp2_ri_gpw_compute_en(Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T, BIb_C, mp2_env, para_env, para_env_sub, color_sub, &
      79         346 :                                     gd_array, gd_B_virtual, &
      80         346 :                                     Eigenval, nmo, homo, dimen_RI, unit_nr, calc_forces, calc_ex)
      81             :       REAL(KIND=dp), INTENT(INOUT)                       :: Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T
      82             :       TYPE(three_dim_real_array), DIMENSION(:), &
      83             :          INTENT(INOUT)                                   :: BIb_C
      84             :       TYPE(mp2_type)                                     :: mp2_env
      85             :       TYPE(mp_para_env_type), INTENT(IN), POINTER        :: para_env, para_env_sub
      86             :       INTEGER, INTENT(IN)                                :: color_sub
      87             :       TYPE(group_dist_d1_type), INTENT(INOUT)            :: gd_array
      88             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo
      89             :       INTEGER, INTENT(IN)                                :: nmo
      90             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
      91             :       TYPE(group_dist_d1_type), DIMENSION(SIZE(homo)), &
      92             :          INTENT(INOUT)                                   :: gd_B_virtual
      93             :       INTEGER, INTENT(IN)                                :: dimen_RI, unit_nr
      94             :       LOGICAL, INTENT(IN)                                :: calc_forces, calc_ex
      95             : 
      96             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_gpw_compute_en'
      97             : 
      98             :       INTEGER :: a, a_global, b, b_global, block_size, decil, end_point, handle, handle2, handle3, &
      99             :          iiB, ij_counter, ij_counter_send, ij_index, integ_group_size, ispin, jjB, jspin, &
     100             :          max_ij_pairs, my_block_size, my_group_L_end, my_group_L_size, my_group_L_size_orig, &
     101             :          my_group_L_start, my_i, my_ij_pairs, my_j, my_new_group_L_size, ngroup, nspins, &
     102             :          num_integ_group, proc_receive, proc_send, proc_shift, rec_B_size, rec_B_virtual_end, &
     103             :          rec_B_virtual_start, rec_L_size, send_B_size, send_B_virtual_end, send_B_virtual_start, &
     104             :          send_i, send_ij_index, send_j, start_point, tag, total_ij_pairs
     105         346 :       INTEGER, ALLOCATABLE, DIMENSION(:) :: integ_group_pos2color_sub, my_B_size, &
     106         346 :          my_B_virtual_end, my_B_virtual_start, num_ij_pairs, sizes_array_orig, virtual
     107         346 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: ij_map
     108         346 :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: ranges_info_array
     109             :       LOGICAL                                            :: my_alpha_beta_case, my_beta_beta_case, &
     110             :                                                             my_open_shell_SS
     111             :       REAL(KIND=dp)                                      :: amp_fac, my_Emp2_Cou, my_Emp2_EX, &
     112             :                                                             sym_fac, t_new, t_start
     113         346 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET   :: buffer_1D
     114             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
     115         346 :          TARGET                                          :: local_ab, local_ba, t_ab
     116             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
     117         346 :          TARGET                                          :: local_i_aL, local_j_aL, Y_i_aP, Y_j_aP
     118             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
     119         346 :          POINTER                                         :: external_ab, external_i_aL
     120             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :, :), &
     121         346 :          POINTER                                         :: BI_C_rec
     122             :       TYPE(dgemm_counter_type)                           :: dgemm_counter
     123             :       TYPE(mp_comm_type)                                 :: comm_exchange, comm_rep
     124             :       TYPE(three_dim_real_array), ALLOCATABLE, &
     125         346 :          DIMENSION(:)                                    :: B_ia_Q
     126             : 
     127         346 :       CALL timeset(routineN, handle)
     128             : 
     129         346 :       nspins = SIZE(homo)
     130             : 
     131        1038 :       ALLOCATE (virtual(nspins))
     132         778 :       virtual(:) = nmo - homo(:)
     133             : 
     134        2422 :       ALLOCATE (my_B_size(nspins), my_B_virtual_start(nspins), my_B_virtual_end(nspins))
     135         778 :       DO ispin = 1, nspins
     136             :          CALL get_group_dist(gd_B_virtual(ispin), para_env_sub%mepos, &
     137         778 :                              my_B_virtual_start(ispin), my_B_virtual_end(ispin), my_B_size(ispin))
     138             :       END DO
     139             : 
     140         346 :       CALL get_group_dist(gd_array, color_sub, my_group_L_start, my_group_L_end, my_group_L_size)
     141             : 
     142         346 :       CALL dgemm_counter_init(dgemm_counter, unit_nr, mp2_env%ri_mp2%print_dgemm_info)
     143             : 
     144             :       ! local_gemm_ctx has a very footprint the first time this routine is
     145             :       ! called.
     146         346 :       IF (.NOT. c_associated(mp2_env%local_gemm_ctx)) THEN
     147         346 :          CALL local_gemm_create(mp2_env%local_gemm_ctx, LOCAL_GEMM_PU_GPU)
     148         346 :          CALL local_gemm_set_op_threshold_gpu(mp2_env%local_gemm_ctx, 128*128*128*2)
     149             :       END IF
     150             : 
     151             :       CALL mp2_ri_get_integ_group_size( &
     152             :          mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
     153             :          homo, dimen_RI, unit_nr, &
     154             :          integ_group_size, ngroup, &
     155         346 :          num_integ_group, virtual, calc_forces)
     156             : 
     157             :       ! now create a group that contains all the proc that have the same virtual starting point
     158             :       ! in the integ group
     159             :       CALL mp2_ri_create_group( &
     160             :          para_env, para_env_sub, color_sub, &
     161             :          gd_array%sizes, calc_forces, &
     162             :          integ_group_size, my_group_L_end, &
     163             :          my_group_L_size, my_group_L_size_orig, my_group_L_start, my_new_group_L_size, &
     164             :          integ_group_pos2color_sub, sizes_array_orig, &
     165         346 :          ranges_info_array, comm_exchange, comm_rep, num_integ_group)
     166             : 
     167             :       ! We cannot fix the tag because of the recv routine
     168         346 :       tag = 42
     169             : 
     170         778 :       DO jspin = 1, nspins
     171             : 
     172             :          CALL replicate_iaK_2intgroup(BIb_C(jspin)%array, comm_exchange, comm_rep, &
     173             :                                       homo(jspin), gd_array%sizes, my_B_size(jspin), &
     174         432 :                                       my_group_L_size, ranges_info_array)
     175             : 
     176        1296 :          DO ispin = 1, jspin
     177             : 
     178         518 :             IF (unit_nr > 0) THEN
     179         259 :                IF (nspins == 1) THEN
     180         130 :                   WRITE (unit_nr, *) "Start loop run"
     181         129 :                ELSE IF (ispin == 1 .AND. jspin == 1) THEN
     182          43 :                   WRITE (unit_nr, *) "Start loop run alpha-alpha"
     183          86 :                ELSE IF (ispin == 1 .AND. jspin == 2) THEN
     184          43 :                   WRITE (unit_nr, *) "Start loop run alpha-beta"
     185          43 :                ELSE IF (ispin == 2 .AND. jspin == 2) THEN
     186          43 :                   WRITE (unit_nr, *) "Start loop run beta-beta"
     187             :                END IF
     188         259 :                CALL m_flush(unit_nr)
     189             :             END IF
     190             : 
     191         518 :             my_open_shell_SS = (nspins == 2) .AND. (ispin == jspin)
     192             : 
     193             :             ! t_ab = amp_fac*(:,a|:,b)-(:,b|:,a)
     194             :             ! If we calculate the gradient we need to distinguish
     195             :             ! between alpha-alpha and beta-beta cases for UMP2
     196             : 
     197         518 :             my_beta_beta_case = .FALSE.
     198         518 :             my_alpha_beta_case = .FALSE.
     199         518 :             IF (ispin /= jspin) THEN
     200          86 :                my_alpha_beta_case = .TRUE.
     201         432 :             ELSE IF (my_open_shell_SS) THEN
     202         172 :                IF (ispin == 2) my_beta_beta_case = .TRUE.
     203             :             END IF
     204             : 
     205         518 :             amp_fac = mp2_env%scale_S + mp2_env%scale_T
     206         518 :             IF (my_alpha_beta_case .OR. my_open_shell_SS) amp_fac = mp2_env%scale_T
     207             : 
     208             :             CALL mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_B_size, &
     209         518 :                                         my_group_L_size, calc_forces, ispin, jspin, local_ba)
     210             : 
     211             :             CALL mp2_ri_get_block_size( &
     212             :                mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual(ispin:jspin), &
     213             :                homo(ispin:jspin), virtual(ispin:jspin), dimen_RI, unit_nr, block_size, &
     214         518 :                ngroup, num_integ_group, my_open_shell_ss, calc_forces, buffer_1D)
     215             : 
     216             :             ! *****************************************************************
     217             :             ! **********  REPLICATION-BLOCKED COMMUNICATION SCHEME  ***********
     218             :             ! *****************************************************************
     219             :             ! introduce block size, the number of occupied orbitals has to be a
     220             :             ! multiple of the block size
     221             : 
     222             :             ! Calculate the maximum number of ij pairs that have to be computed
     223             :             ! among groups
     224             :             CALL mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo(ispin), homo(jspin), &
     225         518 :                                       block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_SS, unit_nr)
     226             : 
     227        1554 :             ALLOCATE (num_ij_pairs(0:comm_exchange%num_pe - 1))
     228         518 :             CALL comm_exchange%allgather(my_ij_pairs, num_ij_pairs)
     229             : 
     230        1146 :             max_ij_pairs = MAXVAL(num_ij_pairs)
     231             : 
     232             :             ! start real stuff
     233             :             CALL mp2_ri_allocate_blk(dimen_RI, my_B_size, block_size, local_i_aL, &
     234         518 :                                      local_j_aL, calc_forces, Y_i_aP, Y_j_aP, ispin, jspin)
     235             : 
     236         518 :             CALL timeset(routineN//"_RI_loop", handle2)
     237         518 :             my_Emp2_Cou = 0.0_dp
     238         518 :             my_Emp2_EX = 0.0_dp
     239         518 :             t_start = m_walltime()
     240        2509 :             DO ij_index = 1, max_ij_pairs
     241             : 
     242             :                ! Prediction is unreliable if we are in the first step of the loop
     243        1991 :                IF (unit_nr > 0 .AND. ij_index > 1) THEN
     244         723 :                   decil = ij_index*10/max_ij_pairs
     245         723 :                   IF (decil /= (ij_index - 1)*10/max_ij_pairs) THEN
     246         682 :                      t_new = m_walltime()
     247         682 :                      t_new = (t_new - t_start)/60.0_dp*(max_ij_pairs - ij_index + 1)/(ij_index - 1)
     248             :                      WRITE (unit_nr, FMT="(T3,A)") "Percentage of finished loop: "// &
     249         682 :                         cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
     250         682 :                      CALL m_flush(unit_nr)
     251             :                   END IF
     252             :                END IF
     253             : 
     254        1991 :                IF (calc_forces) THEN
     255     2791144 :                   Y_i_aP = 0.0_dp
     256     2891166 :                   Y_j_aP = 0.0_dp
     257             :                END IF
     258             : 
     259        1991 :                IF (ij_index <= my_ij_pairs) THEN
     260             :                   ! We have work to do
     261        1942 :                   ij_counter = (ij_index - MIN(1, color_sub))*ngroup + color_sub
     262        1942 :                   my_i = ij_map(1, ij_counter)
     263        1942 :                   my_j = ij_map(2, ij_counter)
     264        1942 :                   my_block_size = ij_map(3, ij_counter)
     265             : 
     266     3277275 :                   local_i_aL = 0.0_dp
     267             :                   CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
     268        1942 :                                        BIb_C(ispin)%array(:, :, my_i:my_i + my_block_size - 1))
     269             : 
     270     3390693 :                   local_j_aL = 0.0_dp
     271             :                   CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
     272        1942 :                                        BIb_C(jspin)%array(:, :, my_j:my_j + my_block_size - 1))
     273             : 
     274             :                   ! collect data from other proc
     275        1942 :                   CALL timeset(routineN//"_comm", handle3)
     276        2051 :                   DO proc_shift = 1, comm_exchange%num_pe - 1
     277         109 :                      proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
     278         109 :                      proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
     279             : 
     280         109 :                      send_ij_index = num_ij_pairs(proc_send)
     281             : 
     282         109 :                      CALL get_group_dist(gd_array, proc_receive, sizes=rec_L_size)
     283             : 
     284        2051 :                      IF (ij_index <= send_ij_index) THEN
     285             :                         ij_counter_send = (ij_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
     286          60 :                                           integ_group_pos2color_sub(proc_send)
     287          60 :                         send_i = ij_map(1, ij_counter_send)
     288          60 :                         send_j = ij_map(2, ij_counter_send)
     289             : 
     290             :                         ! occupied i
     291             :                         BI_C_rec(1:rec_L_size, 1:my_B_size(ispin), 1:my_block_size) => &
     292          60 :                            buffer_1D(1:rec_L_size*my_B_size(ispin)*my_block_size)
     293       43353 :                         BI_C_rec = 0.0_dp
     294             :                         CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
     295          60 :                                                     proc_send, BI_C_rec, proc_receive, tag)
     296             : 
     297             :                         CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
     298          60 :                                              BI_C_rec(:, 1:my_B_size(ispin), :))
     299             : 
     300             :                         ! occupied j
     301             :                         BI_C_rec(1:rec_L_size, 1:my_B_size(jspin), 1:my_block_size) => &
     302          60 :                            buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(jspin)*my_block_size)
     303       44373 :                         BI_C_rec = 0.0_dp
     304             :                         CALL comm_exchange%sendrecv(BIb_C(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
     305          60 :                                                     proc_send, BI_C_rec, proc_receive, tag)
     306             : 
     307             :                         CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
     308          60 :                                              BI_C_rec(:, 1:my_B_size(jspin), :))
     309             : 
     310             :                      ELSE
     311             :                         ! we send nothing while we know that we have to receive something
     312             : 
     313             :                         ! occupied i
     314             :                         BI_C_rec(1:rec_L_size, 1:my_B_size(ispin), 1:my_block_size) => &
     315          49 :                            buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(ispin)*my_block_size)
     316        9124 :                         BI_C_rec = 0.0_dp
     317          49 :                         CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
     318             : 
     319             :                         CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
     320          49 :                                              BI_C_rec(:, 1:my_B_size(ispin), 1:my_block_size))
     321             : 
     322             :                         ! occupied j
     323             :                         BI_C_rec(1:rec_L_size, 1:my_B_size(jspin), 1:my_block_size) => &
     324          49 :                            buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(jspin)*my_block_size)
     325        9124 :                         BI_C_rec = 0.0_dp
     326          49 :                         CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
     327             : 
     328             :                         CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
     329          49 :                                              BI_C_rec(:, 1:my_B_size(jspin), 1:my_block_size))
     330             : 
     331             :                      END IF
     332             : 
     333             :                   END DO
     334             : 
     335        1942 :                   CALL timestop(handle3)
     336             : 
     337             :                   ! loop over the block elements
     338        3888 :                   DO iiB = 1, my_block_size
     339        5842 :                      DO jjB = 1, my_block_size
     340        1954 :                         CALL timeset(routineN//"_expansion", handle3)
     341        3900 :                         ASSOCIATE (my_local_i_aL => local_i_aL(:, :, iiB), my_local_j_aL => local_j_aL(:, :, jjB))
     342             : 
     343             :                            ! calculate the integrals (ia|jb) strating from my local data ...
     344      578764 :                            local_ab = 0.0_dp
     345        1954 :                            IF ((my_alpha_beta_case) .AND. (calc_forces)) THEN
     346      139874 :                               local_ba = 0.0_dp
     347             :                            END IF
     348        1954 :                            CALL dgemm_counter_start(dgemm_counter)
     349             :                            CALL local_gemm('T', 'N', my_B_size(ispin), my_B_size(jspin), dimen_RI, 1.0_dp, &
     350             :                                            my_local_i_aL, dimen_RI, my_local_j_aL, dimen_RI, &
     351             :                                            0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), &
     352        1954 :                                            my_B_size(ispin), mp2_env%local_gemm_ctx)
     353             :                            ! Additional integrals only for alpha_beta case and forces
     354        1954 :                            IF (my_alpha_beta_case .AND. calc_forces) THEN
     355             :                               local_ba(my_B_virtual_start(jspin):my_B_virtual_end(jspin), :) = &
     356      132909 :                                  TRANSPOSE(local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :))
     357             :                            END IF
     358             :                            ! ... and from the other of my subgroup
     359        2228 :                            DO proc_shift = 1, para_env_sub%num_pe - 1
     360         274 :                               proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
     361         274 :                               proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
     362             : 
     363             :                               CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, &
     364         274 :                                                   rec_B_virtual_end, rec_B_size)
     365             : 
     366         274 :                               external_i_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
     367      267944 :                               external_i_aL = 0.0_dp
     368             : 
     369             :                               CALL para_env_sub%sendrecv(my_local_i_aL, proc_send, &
     370         274 :                                                          external_i_aL, proc_receive, tag)
     371             : 
     372             :                               CALL local_gemm('T', 'N', rec_B_size, my_B_size(jspin), dimen_RI, 1.0_dp, &
     373             :                                               external_i_aL, dimen_RI, my_local_j_aL, dimen_RI, &
     374             :                                           0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:my_B_size(jspin)), rec_B_size, &
     375         274 :                                               mp2_env%local_gemm_ctx)
     376             : 
     377             :                               ! Additional integrals only for alpha_beta case and forces
     378        2502 :                               IF (my_alpha_beta_case .AND. calc_forces) THEN
     379             : 
     380             :                                  CALL get_group_dist(gd_B_virtual(jspin), proc_receive, rec_B_virtual_start, &
     381          70 :                                                      rec_B_virtual_end, rec_B_size)
     382             : 
     383          70 :                                  external_i_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
     384       81655 :                                  external_i_aL = 0.0_dp
     385             : 
     386             :                                  CALL para_env_sub%sendrecv(my_local_j_aL, proc_send, &
     387          70 :                                                             external_i_aL, proc_receive, tag)
     388             : 
     389             :                                  CALL local_gemm('T', 'N', rec_B_size, my_B_size(ispin), dimen_RI, 1.0_dp, &
     390             :                                                  external_i_aL, dimen_RI, my_local_i_aL, dimen_RI, &
     391             :                                           0.0_dp, local_ba(rec_B_virtual_start:rec_B_virtual_end, 1:my_B_size(ispin)), rec_B_size, &
     392          70 :                                                  mp2_env%local_gemm_ctx)
     393             :                               END IF
     394             :                            END DO
     395        1954 :                            IF (my_alpha_beta_case .AND. calc_forces) THEN
     396             :                               ! Is just an approximation, but the call does not allow it, it ought to be (virtual_i*B_size_j+virtual_j*B_size_i)*dimen_RI
     397         490 :                               CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_B_size(ispin) + my_B_size(jspin), dimen_RI)
     398             :                            ELSE
     399        1464 :                               CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_B_size(jspin), dimen_RI)
     400             :                            END IF
     401        1954 :                            CALL timestop(handle3)
     402             : 
     403             :                            !sample peak memory
     404        1954 :                            CALL m_memory()
     405             : 
     406        1954 :                            CALL timeset(routineN//"_ener", handle3)
     407             :                            ! calculate coulomb only MP2
     408        1954 :                            sym_fac = 2.0_dp
     409        1954 :                            IF (my_i == my_j) sym_fac = 1.0_dp
     410        1954 :                            IF (my_alpha_beta_case) sym_fac = 0.5_dp
     411       30429 :                            DO b = 1, my_B_size(jspin)
     412       28475 :                               b_global = b + my_B_virtual_start(jspin) - 1
     413      578764 :                               DO a = 1, virtual(ispin)
     414             :                                  my_Emp2_Cou = my_Emp2_Cou - sym_fac*2.0_dp*local_ab(a, b)**2/ &
     415             :                                                (Eigenval(homo(ispin) + a, ispin) + Eigenval(homo(jspin) + b_global, jspin) - &
     416      576810 :                                                 Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
     417             :                               END DO
     418             :                            END DO
     419             : 
     420        1954 :                            IF (calc_ex) THEN
     421             :                               ! contract integrals with orbital energies for exchange MP2 energy
     422             :                               ! starting with local ...
     423      323406 :                               IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) t_ab = 0.0_dp
     424       29841 :                               DO b = 1, my_B_size(ispin)
     425       27887 :                                  b_global = b + my_B_virtual_start(ispin) - 1
     426      541500 :                                  DO a = 1, my_B_size(ispin)
     427      511659 :                                     a_global = a + my_B_virtual_start(ispin) - 1
     428             :                                     my_Emp2_Ex = my_Emp2_Ex + sym_fac*local_ab(a_global, b)*local_ab(b_global, a)/ &
     429             :                                               (Eigenval(homo(ispin) + a_global, ispin) + Eigenval(homo(ispin) + b_global, ispin) - &
     430      511659 :                                                   Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
     431      539546 :                                     IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) THEN
     432             :                                      t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
     433             :                                                            (Eigenval(homo(ispin) + a_global, ispin) + &
     434             :                                                             Eigenval(homo(ispin) + b_global, ispin) - &
     435      295758 :                                                             Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
     436             :                                     END IF
     437             :                                  END DO
     438             :                               END DO
     439             :                               ! ... and then with external data
     440        2228 :                               DO proc_shift = 1, para_env_sub%num_pe - 1
     441         274 :                                  proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
     442         274 :                                  proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
     443             : 
     444             :                                  CALL get_group_dist(gd_B_virtual(ispin), proc_receive, &
     445         274 :                                                      rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
     446             :                                  CALL get_group_dist(gd_B_virtual(ispin), proc_send, &
     447         274 :                                                      send_B_virtual_start, send_B_virtual_end, send_B_size)
     448             : 
     449             :                                  external_ab(1:my_B_size(ispin), 1:rec_B_size) => &
     450         274 :                                     buffer_1D(1:INT(rec_B_size, int_8)*my_B_size(ispin))
     451       30405 :                                  external_ab = 0.0_dp
     452             : 
     453             :                       CALL para_env_sub%sendrecv(local_ab(send_B_virtual_start:send_B_virtual_end, 1:my_B_size(ispin)), proc_send, &
     454       60536 :                                                             external_ab(1:my_B_size(ispin), 1:rec_B_size), proc_receive, tag)
     455             : 
     456        5233 :                                  DO b = 1, my_B_size(ispin)
     457        2731 :                                     b_global = b + my_B_virtual_start(ispin) - 1
     458       30405 :                                     DO a = 1, rec_B_size
     459       27400 :                                        a_global = a + rec_B_virtual_start - 1
     460             :                                        my_Emp2_Ex = my_Emp2_Ex + sym_fac*local_ab(a_global, b)*external_ab(b, a)/ &
     461             :                                               (Eigenval(homo(ispin) + a_global, ispin) + Eigenval(homo(ispin) + b_global, ispin) - &
     462       27400 :                                                      Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
     463       27400 :                                        IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) &
     464             :                                          t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
     465             :                                                               (Eigenval(homo(ispin) + a_global, ispin) + &
     466             :                                                                Eigenval(homo(ispin) + b_global, ispin) - &
     467       12311 :                                                                Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
     468             :                                     END DO
     469             :                                  END DO
     470             :                               END DO
     471             :                            END IF
     472        1954 :                            CALL timestop(handle3)
     473             : 
     474        3908 :                            IF (calc_forces) THEN
     475             :                               ! update P_ab, Gamma_P_ia
     476             :                               CALL mp2_update_P_gamma(mp2_env, para_env_sub, gd_B_virtual, &
     477             :                                                       Eigenval, homo, dimen_RI, iiB, jjB, my_B_size, &
     478             :                                                       my_B_virtual_end, my_B_virtual_start, my_i, my_j, virtual, &
     479             :                                                       local_ab, t_ab, my_local_i_aL, my_local_j_aL, &
     480             :                                                       my_open_shell_ss, Y_i_aP(:, :, iiB), Y_j_aP(:, :, jjB), local_ba, &
     481        1569 :                                                       ispin, jspin, dgemm_counter, buffer_1D)
     482             : 
     483             :                            END IF
     484             : 
     485             :                         END ASSOCIATE
     486             : 
     487             :                      END DO ! jjB
     488             :                   END DO ! iiB
     489             : 
     490             :                ELSE
     491             :                   ! We need it later in case of gradients
     492          49 :                   my_block_size = 1
     493             : 
     494          49 :                   CALL timeset(routineN//"_comm", handle3)
     495             :                   ! No work to do and we know that we have to receive nothing, but send something
     496             :                   ! send data to other proc
     497          98 :                   DO proc_shift = 1, comm_exchange%num_pe - 1
     498          49 :                      proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
     499          49 :                      proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
     500             : 
     501          49 :                      send_ij_index = num_ij_pairs(proc_send)
     502             : 
     503          98 :                      IF (ij_index <= send_ij_index) THEN
     504             :                         ! something to send
     505             :                         ij_counter_send = (ij_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
     506          49 :                                           integ_group_pos2color_sub(proc_send)
     507          49 :                         send_i = ij_map(1, ij_counter_send)
     508          49 :                         send_j = ij_map(2, ij_counter_send)
     509             : 
     510             :                         ! occupied i
     511             :                         CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
     512          49 :                                                 proc_send, tag)
     513             :                         ! occupied j
     514             :                         CALL comm_exchange%send(BIb_C(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
     515          49 :                                                 proc_send, tag)
     516             :                      END IF
     517             :                   END DO
     518          49 :                   CALL timestop(handle3)
     519             :                END IF
     520             : 
     521             :                ! redistribute gamma
     522        2509 :                IF (calc_forces) THEN
     523             :                   CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(ispin)%array, ij_index, my_B_size(ispin), &
     524             :                                               my_block_size, my_group_L_size, my_i, my_ij_pairs, ngroup, &
     525             :                                               num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
     526             :                                               ij_map, ranges_info_array, Y_i_aP(:, :, 1:my_block_size), comm_exchange, &
     527        1566 :                                               gd_array%sizes, 1, buffer_1D)
     528             :                   CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(jspin)%array, ij_index, my_B_size(jspin), &
     529             :                                               my_block_size, my_group_L_size, my_j, my_ij_pairs, ngroup, &
     530             :                                               num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
     531             :                                               ij_map, ranges_info_array, Y_j_aP(:, :, 1:my_block_size), comm_exchange, &
     532        1566 :                                               gd_array%sizes, 2, buffer_1D)
     533             :                END IF
     534             : 
     535             :             END DO
     536         518 :             CALL timestop(handle2)
     537             : 
     538         518 :             DEALLOCATE (local_i_aL)
     539         518 :             DEALLOCATE (local_j_aL)
     540         518 :             DEALLOCATE (ij_map)
     541         518 :             DEALLOCATE (num_ij_pairs)
     542         518 :             DEALLOCATE (local_ab)
     543             : 
     544         518 :             IF (calc_forces) THEN
     545         372 :                DEALLOCATE (Y_i_aP)
     546         372 :                DEALLOCATE (Y_j_aP)
     547         372 :                IF (ALLOCATED(t_ab)) THEN
     548         296 :                   DEALLOCATE (t_ab)
     549             :                END IF
     550         372 :                DEALLOCATE (local_ba)
     551             : 
     552             :                ! here we check if there are almost degenerate ij
     553             :                ! pairs and we update P_ij with these contribution.
     554             :                ! If all pairs are degenerate with each other this step will scale O(N^6),
     555             :                ! if the number of degenerate pairs scales linearly with the system size
     556             :                ! this step will scale O(N^5).
     557             :                ! Start counting the number of almost degenerate ij pairs according
     558             :                ! to eps_canonical
     559             :                CALL quasi_degenerate_P_ij( &
     560             :                   mp2_env, Eigenval(:, ispin:jspin), homo(ispin:jspin), virtual(ispin:jspin), my_open_shell_ss, &
     561             :                   my_beta_beta_case, Bib_C(ispin:jspin), unit_nr, dimen_RI, &
     562             :                   my_B_size(ispin:jspin), ngroup, my_group_L_size, &
     563             :                   color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
     564             :                   my_B_virtual_start(ispin:jspin), my_B_virtual_end(ispin:jspin), gd_array%sizes, gd_B_virtual(ispin:jspin), &
     565         372 :                   integ_group_pos2color_sub, dgemm_counter, buffer_1D)
     566             : 
     567             :             END IF
     568             : 
     569         518 :             DEALLOCATE (buffer_1D)
     570             : 
     571             :             ! Dereplicate BIb_C and Gamma_P_ia to save memory
     572             :             ! These matrices will not be needed in that fashion anymore
     573             :             ! B_ia_Q will needed later
     574         518 :             IF (calc_forces .AND. jspin == nspins) THEN
     575        1032 :                IF (.NOT. ALLOCATED(B_ia_Q)) ALLOCATE (B_ia_Q(nspins))
     576        1480 :                ALLOCATE (B_ia_Q(ispin)%array(homo(ispin), my_B_size(ispin), my_group_L_size_orig))
     577      888463 :                B_ia_Q(ispin)%array = 0.0_dp
     578        1360 :                DO jjB = 1, homo(ispin)
     579       16940 :                   DO iiB = 1, my_B_size(ispin)
     580             :                      B_ia_Q(ispin)%array(jjB, iiB, 1:my_group_L_size_orig) = &
     581      709332 :                         BIb_C(ispin)%array(1:my_group_L_size_orig, iiB, jjB)
     582             :                   END DO
     583             :                END DO
     584         296 :                DEALLOCATE (BIb_C(ispin)%array)
     585             : 
     586             :                ! sum Gamma and dereplicate
     587        1480 :                ALLOCATE (BIb_C(ispin)%array(my_B_size(ispin), homo(ispin), my_group_L_size_orig))
     588         562 :                DO proc_shift = 1, comm_rep%num_pe - 1
     589             :                   ! invert order
     590         266 :                   proc_send = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
     591         266 :                   proc_receive = MODULO(comm_rep%mepos + proc_shift, comm_rep%num_pe)
     592             : 
     593         266 :                   start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
     594         266 :                   end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)
     595             : 
     596             :                   CALL comm_rep%sendrecv(mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, start_point:end_point), &
     597         266 :                                          proc_send, BIb_C(ispin)%array, proc_receive, tag)
     598             : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
     599         562 : !$OMP          SHARED(mp2_env,BIb_C,ispin,homo,my_B_size,my_group_L_size_orig)
     600             :                   mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig) = &
     601             :                      mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig) &
     602             :                      + BIb_C(ispin)%array(:, :, :)
     603             : !$OMP END PARALLEL WORKSHARE
     604             :                END DO
     605             : 
     606      753612 :                BIb_C(ispin)%array(:, :, :) = mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig)
     607         296 :                DEALLOCATE (mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
     608         296 :                CALL MOVE_ALLOC(BIb_C(ispin)%array, mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
     609         222 :             ELSE IF (jspin == nspins) THEN
     610         136 :                DEALLOCATE (BIb_C(ispin)%array)
     611             :             END IF
     612             : 
     613         518 :             CALL para_env%sum(my_Emp2_Cou)
     614         518 :             CALL para_env%sum(my_Emp2_Ex)
     615             : 
     616        1986 :             IF (my_open_shell_SS .OR. my_alpha_beta_case) THEN
     617         258 :                IF (my_alpha_beta_case) THEN
     618          86 :                   Emp2_S = Emp2_S + my_Emp2_Cou
     619          86 :                   Emp2_Cou = Emp2_Cou + my_Emp2_Cou
     620             :                ELSE
     621         172 :                   my_Emp2_Cou = my_Emp2_Cou*0.25_dp
     622         172 :                   my_Emp2_EX = my_Emp2_EX*0.5_dp
     623         172 :                   Emp2_T = Emp2_T + my_Emp2_Cou + my_Emp2_EX
     624         172 :                   Emp2_Cou = Emp2_Cou + my_Emp2_Cou
     625         172 :                   Emp2_EX = Emp2_EX + my_Emp2_EX
     626             :                END IF
     627             :             ELSE
     628         260 :                Emp2_Cou = Emp2_Cou + my_Emp2_Cou
     629         260 :                Emp2_EX = Emp2_EX + my_Emp2_EX
     630             :             END IF
     631             :          END DO
     632             : 
     633             :       END DO
     634             : 
     635         346 :       DEALLOCATE (integ_group_pos2color_sub)
     636         346 :       DEALLOCATE (ranges_info_array)
     637             : 
     638         346 :       CALL comm_exchange%free()
     639         346 :       CALL comm_rep%free()
     640             : 
     641         346 :       IF (calc_forces) THEN
     642             :          ! recover original information (before replication)
     643         220 :          DEALLOCATE (gd_array%sizes)
     644         220 :          iiB = SIZE(sizes_array_orig)
     645         660 :          ALLOCATE (gd_array%sizes(0:iiB - 1))
     646         654 :          gd_array%sizes(:) = sizes_array_orig
     647         220 :          DEALLOCATE (sizes_array_orig)
     648             : 
     649             :          ! Remove replication from BIb_C and reorder the matrix
     650         220 :          my_group_L_size = my_group_L_size_orig
     651             : 
     652             :          ! B_ia_Q(ispin)%array will be deallocated inside of complete_gamma
     653         516 :          DO ispin = 1, nspins
     654             :             CALL complete_gamma(mp2_env, B_ia_Q(ispin)%array, dimen_RI, homo(ispin), &
     655             :                                 virtual(ispin), para_env, para_env_sub, ngroup, &
     656             :                                 my_group_L_size, my_group_L_start, my_group_L_end, &
     657             :                                 my_B_size(ispin), my_B_virtual_start(ispin), &
     658             :                                 gd_array, gd_B_virtual(ispin), &
     659         516 :                                 ispin)
     660             :          END DO
     661         516 :          DEALLOCATE (B_ia_Q)
     662             : 
     663       43758 :          IF (nspins == 1) mp2_env%ri_grad%P_ab(1)%array(:, :) = mp2_env%ri_grad%P_ab(1)%array(:, :)*2.0_dp
     664             :          BLOCK
     665             :             TYPE(mp_comm_type) :: comm
     666         220 :             CALL comm%from_split(para_env, para_env_sub%mepos)
     667         516 :             DO ispin = 1, nspins
     668             :                ! P_ab is only replicated over all subgroups
     669         296 :                CALL comm%sum(mp2_env%ri_grad%P_ab(ispin)%array)
     670             :                ! P_ij is replicated over all processes
     671         516 :                CALL para_env%sum(mp2_env%ri_grad%P_ij(ispin)%array)
     672             :             END DO
     673         440 :             CALL comm%free()
     674             :          END BLOCK
     675             :       END IF
     676             : 
     677         346 :       CALL release_group_dist(gd_array)
     678         778 :       DO ispin = 1, nspins
     679         432 :          IF (ALLOCATED(BIb_C(ispin)%array)) DEALLOCATE (BIb_C(ispin)%array)
     680         778 :          CALL release_group_dist(gd_B_virtual(ispin))
     681             :       END DO
     682             : 
     683             :       ! We do not need this matrix later, so deallocate it here to safe memory
     684         346 :       IF (calc_forces) DEALLOCATE (mp2_env%ri_grad%PQ_half)
     685         346 :       IF (calc_forces .AND. .NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) &
     686           8 :          DEALLOCATE (mp2_env%ri_grad%operator_half)
     687             : 
     688         346 :       CALL dgemm_counter_write(dgemm_counter, para_env)
     689             : 
     690             :       ! release memory allocated by local_gemm when run on GPU. local_gemm_ctx is null on cpu only runs
     691         346 :       CALL local_gemm_destroy(mp2_env%local_gemm_ctx)
     692         346 :       mp2_env%local_gemm_ctx = C_NULL_PTR
     693         346 :       CALL timestop(handle)
     694             : 
     695        1384 :    END SUBROUTINE mp2_ri_gpw_compute_en
     696             : 
     697             : ! **************************************************************************************************
     698             : !> \brief ...
     699             : !> \param local_i_aL ...
     700             : !> \param ranges_info_array ...
     701             : !> \param BIb_C_rec ...
     702             : ! **************************************************************************************************
     703        4248 :    SUBROUTINE fill_local_i_aL(local_i_aL, ranges_info_array, BIb_C_rec)
     704             :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT)   :: local_i_aL
     705             :       INTEGER, DIMENSION(:, :), INTENT(IN)               :: ranges_info_array
     706             :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN)      :: BIb_C_rec
     707             : 
     708             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'fill_local_i_aL'
     709             : 
     710             :       INTEGER                                            :: end_point, handle, irep, Lend_pos, &
     711             :                                                             Lstart_pos, start_point
     712             : 
     713        4248 :       CALL timeset(routineN, handle)
     714             : 
     715       11764 :       DO irep = 1, SIZE(ranges_info_array, 2)
     716        7516 :          Lstart_pos = ranges_info_array(1, irep)
     717        7516 :          Lend_pos = ranges_info_array(2, irep)
     718        7516 :          start_point = ranges_info_array(3, irep)
     719        7516 :          end_point = ranges_info_array(4, irep)
     720             : 
     721             : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
     722       11764 : !$OMP          SHARED(BIb_C_rec,local_i_aL,Lstart_pos,Lend_pos,start_point,end_point)
     723             :          local_i_aL(Lstart_pos:Lend_pos, :, :) = BIb_C_rec(start_point:end_point, :, :)
     724             : !$OMP END PARALLEL WORKSHARE
     725             :       END DO
     726             : 
     727        4248 :       CALL timestop(handle)
     728             : 
     729        4248 :    END SUBROUTINE fill_local_i_aL
     730             : 
     731             : ! **************************************************************************************************
     732             : !> \brief ...
     733             : !> \param local_i_aL ...
     734             : !> \param ranges_info_array ...
     735             : !> \param BIb_C_rec ...
     736             : ! **************************************************************************************************
     737         250 :    SUBROUTINE fill_local_i_aL_2D(local_i_aL, ranges_info_array, BIb_C_rec)
     738             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: local_i_aL
     739             :       INTEGER, DIMENSION(:, :), INTENT(IN)               :: ranges_info_array
     740             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: BIb_C_rec
     741             : 
     742             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'fill_local_i_aL_2D'
     743             : 
     744             :       INTEGER                                            :: end_point, handle, irep, Lend_pos, &
     745             :                                                             Lstart_pos, start_point
     746             : 
     747         250 :       CALL timeset(routineN, handle)
     748             : 
     749         718 :       DO irep = 1, SIZE(ranges_info_array, 2)
     750         468 :          Lstart_pos = ranges_info_array(1, irep)
     751         468 :          Lend_pos = ranges_info_array(2, irep)
     752         468 :          start_point = ranges_info_array(3, irep)
     753         468 :          end_point = ranges_info_array(4, irep)
     754             : 
     755             : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
     756         718 : !$OMP          SHARED(BIb_C_rec,local_i_aL,Lstart_pos,Lend_pos,start_point,end_point)
     757             :          local_i_aL(Lstart_pos:Lend_pos, :) = BIb_C_rec(start_point:end_point, :)
     758             : !$OMP END PARALLEL WORKSHARE
     759             :       END DO
     760             : 
     761         250 :       CALL timestop(handle)
     762             : 
     763         250 :    END SUBROUTINE fill_local_i_aL_2D
     764             : 
     765             : ! **************************************************************************************************
     766             : !> \brief ...
     767             : !> \param BIb_C ...
     768             : !> \param comm_exchange ...
     769             : !> \param comm_rep ...
     770             : !> \param homo ...
     771             : !> \param sizes_array ...
     772             : !> \param my_B_size ...
     773             : !> \param my_group_L_size ...
     774             : !> \param ranges_info_array ...
     775             : ! **************************************************************************************************
     776         432 :    SUBROUTINE replicate_iaK_2intgroup(BIb_C, comm_exchange, comm_rep, homo, sizes_array, my_B_size, &
     777         432 :                                       my_group_L_size, ranges_info_array)
     778             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
     779             :          INTENT(INOUT)                                   :: BIb_C
     780             :       TYPE(mp_comm_type), INTENT(IN)                     :: comm_exchange, comm_rep
     781             :       INTEGER, INTENT(IN)                                :: homo
     782             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: sizes_array
     783             :       INTEGER, INTENT(IN)                                :: my_B_size, my_group_L_size
     784             :       INTEGER, DIMENSION(:, 0:, 0:), INTENT(IN)          :: ranges_info_array
     785             : 
     786             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'replicate_iaK_2intgroup'
     787             : 
     788             :       INTEGER                                            :: end_point, handle, max_L_size, &
     789             :                                                             proc_receive, proc_shift, start_point
     790             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: BIb_C_copy
     791         432 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :)  :: BIb_C_gather
     792             : 
     793         432 :       CALL timeset(routineN, handle)
     794             : 
     795             :       ! replication scheme using mpi_allgather
     796             :       ! get the max L size of the
     797         968 :       max_L_size = MAXVAL(sizes_array)
     798             : 
     799        2160 :       ALLOCATE (BIb_C_copy(max_L_size, my_B_size, homo))
     800     1625764 :       BIb_C_copy = 0.0_dp
     801      883318 :       BIb_C_copy(1:SIZE(BIb_C, 1), 1:my_B_size, 1:homo) = BIb_C
     802             : 
     803         432 :       DEALLOCATE (BIb_C)
     804             : 
     805        2592 :       ALLOCATE (BIb_C_gather(max_L_size, my_B_size, homo, 0:comm_rep%num_pe - 1))
     806     3128994 :       BIb_C_gather = 0.0_dp
     807             : 
     808         432 :       CALL comm_rep%allgather(BIb_C_copy, BIb_C_gather)
     809             : 
     810         432 :       DEALLOCATE (BIb_C_copy)
     811             : 
     812        2160 :       ALLOCATE (BIb_C(my_group_L_size, my_B_size, homo))
     813     1625417 :       BIb_C = 0.0_dp
     814             : 
     815             :       ! reorder data
     816        1174 :       DO proc_shift = 0, comm_rep%num_pe - 1
     817         742 :          proc_receive = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
     818             : 
     819         742 :          start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
     820         742 :          end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)
     821             : 
     822             :          BIb_C(start_point:end_point, 1:my_B_size, 1:homo) = &
     823     1644449 :             BIb_C_gather(1:end_point - start_point + 1, 1:my_B_size, 1:homo, proc_receive)
     824             : 
     825             :       END DO
     826             : 
     827         432 :       DEALLOCATE (BIb_C_gather)
     828             : 
     829         432 :       CALL timestop(handle)
     830             : 
     831         432 :    END SUBROUTINE replicate_iaK_2intgroup
     832             : 
     833             : ! **************************************************************************************************
     834             : !> \brief ...
     835             : !> \param local_ab ...
     836             : !> \param t_ab ...
     837             : !> \param mp2_env ...
     838             : !> \param homo ...
     839             : !> \param virtual ...
     840             : !> \param my_B_size ...
     841             : !> \param my_group_L_size ...
     842             : !> \param calc_forces ...
     843             : !> \param ispin ...
     844             : !> \param jspin ...
     845             : !> \param local_ba ...
     846             : ! **************************************************************************************************
     847         518 :    SUBROUTINE mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_B_size, &
     848             :                                      my_group_L_size, calc_forces, ispin, jspin, local_ba)
     849             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
     850             :          INTENT(OUT)                                     :: local_ab, t_ab
     851             :       TYPE(mp2_type)                                     :: mp2_env
     852             :       INTEGER, INTENT(IN)                                :: homo(2), virtual(2), my_B_size(2), &
     853             :                                                             my_group_L_size
     854             :       LOGICAL, INTENT(IN)                                :: calc_forces
     855             :       INTEGER, INTENT(IN)                                :: ispin, jspin
     856             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
     857             :          INTENT(OUT)                                     :: local_ba
     858             : 
     859             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_allocate_no_blk'
     860             : 
     861             :       INTEGER                                            :: handle
     862             : 
     863         518 :       CALL timeset(routineN, handle)
     864             : 
     865        2072 :       ALLOCATE (local_ab(virtual(ispin), my_B_size(jspin)))
     866      137025 :       local_ab = 0.0_dp
     867             : 
     868         518 :       IF (calc_forces) THEN
     869         372 :          IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ij(jspin)%array)) THEN
     870        1184 :             ALLOCATE (mp2_env%ri_grad%P_ij(jspin)%array(homo(ispin), homo(ispin)))
     871        6068 :             mp2_env%ri_grad%P_ij(jspin)%array = 0.0_dp
     872             :          END IF
     873         372 :          IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ab(jspin)%array)) THEN
     874        1184 :             ALLOCATE (mp2_env%ri_grad%P_ab(jspin)%array(my_B_size(jspin), virtual(jspin)))
     875       83688 :             mp2_env%ri_grad%P_ab(jspin)%array = 0.0_dp
     876             :          END IF
     877         372 :          IF (.NOT. ALLOCATED(mp2_env%ri_grad%Gamma_P_ia(jspin)%array)) THEN
     878        1480 :             ALLOCATE (mp2_env%ri_grad%Gamma_P_ia(jspin)%array(my_B_size(jspin), homo(jspin), my_group_L_size))
     879     1442032 :             mp2_env%ri_grad%Gamma_P_ia(jspin)%array = 0.0_dp
     880             :          END IF
     881             : 
     882         372 :          IF (ispin == jspin) THEN
     883             :             ! For non-alpha-beta case we need amplitudes
     884        1184 :             ALLOCATE (t_ab(virtual(ispin), my_B_size(jspin)))
     885             : 
     886             :             ! That is just a dummy. In that way, we can pass it as array to other routines w/o requirement for allocatable array
     887         296 :             ALLOCATE (local_ba(1, 1))
     888             :          ELSE
     889             :             ! We need more integrals
     890         304 :             ALLOCATE (local_ba(virtual(jspin), my_B_size(ispin)))
     891             :          END IF
     892             :       END IF
     893             :       !
     894             : 
     895         518 :       CALL timestop(handle)
     896             : 
     897         518 :    END SUBROUTINE mp2_ri_allocate_no_blk
     898             : 
     899             : ! **************************************************************************************************
     900             : !> \brief ...
     901             : !> \param dimen_RI ...
     902             : !> \param my_B_size ...
     903             : !> \param block_size ...
     904             : !> \param local_i_aL ...
     905             : !> \param local_j_aL ...
     906             : !> \param calc_forces ...
     907             : !> \param Y_i_aP ...
     908             : !> \param Y_j_aP ...
     909             : !> \param ispin ...
     910             : !> \param jspin ...
     911             : ! **************************************************************************************************
     912         518 :    SUBROUTINE mp2_ri_allocate_blk(dimen_RI, my_B_size, block_size, &
     913             :                                   local_i_aL, local_j_aL, calc_forces, &
     914             :                                   Y_i_aP, Y_j_aP, ispin, jspin)
     915             :       INTEGER, INTENT(IN)                                :: dimen_RI, my_B_size(2), block_size
     916             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
     917             :          INTENT(OUT)                                     :: local_i_aL, local_j_aL
     918             :       LOGICAL, INTENT(IN)                                :: calc_forces
     919             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
     920             :          INTENT(OUT)                                     :: Y_i_aP, Y_j_aP
     921             :       INTEGER, INTENT(IN)                                :: ispin, jspin
     922             : 
     923             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_allocate_blk'
     924             : 
     925             :       INTEGER                                            :: handle
     926             : 
     927         518 :       CALL timeset(routineN, handle)
     928             : 
     929        2590 :       ALLOCATE (local_i_aL(dimen_RI, my_B_size(ispin), block_size))
     930      648381 :       local_i_aL = 0.0_dp
     931        2590 :       ALLOCATE (local_j_aL(dimen_RI, my_B_size(jspin), block_size))
     932      660833 :       local_j_aL = 0.0_dp
     933             : 
     934         518 :       IF (calc_forces) THEN
     935        1860 :          ALLOCATE (Y_i_aP(my_B_size(ispin), dimen_RI, block_size))
     936      531690 :          Y_i_aP = 0.0_dp
     937             :          ! For  closed-shell, alpha-alpha and beta-beta my_B_size_beta=my_b_size
     938             :          ! Not for alpha-beta case: Y_j_aP_beta is sent and received as Y_j_aP
     939        1860 :          ALLOCATE (Y_j_aP(my_B_size(jspin), dimen_RI, block_size))
     940      542254 :          Y_j_aP = 0.0_dp
     941             :       END IF
     942             :       !
     943             : 
     944         518 :       CALL timestop(handle)
     945             : 
     946         518 :    END SUBROUTINE mp2_ri_allocate_blk
     947             : 
     948             : ! **************************************************************************************************
     949             : !> \brief ...
     950             : !> \param my_alpha_beta_case ...
     951             : !> \param total_ij_pairs ...
     952             : !> \param homo ...
     953             : !> \param homo_beta ...
     954             : !> \param block_size ...
     955             : !> \param ngroup ...
     956             : !> \param ij_map ...
     957             : !> \param color_sub ...
     958             : !> \param my_ij_pairs ...
     959             : !> \param my_open_shell_SS ...
     960             : !> \param unit_nr ...
     961             : ! **************************************************************************************************
     962         518 :    SUBROUTINE mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo, homo_beta, &
     963             :                                    block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_SS, unit_nr)
     964             :       LOGICAL, INTENT(IN)                                :: my_alpha_beta_case
     965             :       INTEGER, INTENT(OUT)                               :: total_ij_pairs
     966             :       INTEGER, INTENT(IN)                                :: homo, homo_beta, block_size, ngroup
     967             :       INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ij_map
     968             :       INTEGER, INTENT(IN)                                :: color_sub
     969             :       INTEGER, INTENT(OUT)                               :: my_ij_pairs
     970             :       LOGICAL, INTENT(IN)                                :: my_open_shell_SS
     971             :       INTEGER, INTENT(IN)                                :: unit_nr
     972             : 
     973             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_communication'
     974             : 
     975             :       INTEGER :: assigned_blocks, first_I_block, first_J_block, handle, iiB, ij_block_counter, &
     976             :          ij_counter, jjB, last_i_block, last_J_block, num_block_per_group, num_IJ_blocks, &
     977             :          num_IJ_blocks_beta, total_ij_block, total_ij_pairs_blocks
     978         518 :       LOGICAL, ALLOCATABLE, DIMENSION(:, :)              :: ij_marker
     979             : 
     980             : ! Calculate the maximum number of ij pairs that have to be computed
     981             : ! among groups
     982             : 
     983         518 :       CALL timeset(routineN, handle)
     984             : 
     985         518 :       IF (.NOT. my_open_shell_ss .AND. .NOT. my_alpha_beta_case) THEN
     986         260 :          total_ij_pairs = homo*(1 + homo)/2
     987         260 :          num_IJ_blocks = homo/block_size - 1
     988             : 
     989         260 :          first_I_block = 1
     990         260 :          last_i_block = block_size*(num_IJ_blocks - 1)
     991             : 
     992         260 :          first_J_block = block_size + 1
     993         260 :          last_J_block = block_size*(num_IJ_blocks + 1)
     994             : 
     995         260 :          ij_block_counter = 0
     996         584 :          DO iiB = first_I_block, last_i_block, block_size
     997         584 :             DO jjB = iiB + block_size, last_J_block, block_size
     998         810 :                ij_block_counter = ij_block_counter + 1
     999             :             END DO
    1000             :          END DO
    1001             : 
    1002         260 :          total_ij_block = ij_block_counter
    1003         260 :          num_block_per_group = total_ij_block/ngroup
    1004         260 :          assigned_blocks = num_block_per_group*ngroup
    1005             : 
    1006         260 :          total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
    1007             : 
    1008        1040 :          ALLOCATE (ij_marker(homo, homo))
    1009        3892 :          ij_marker = .TRUE.
    1010         780 :          ALLOCATE (ij_map(3, total_ij_pairs_blocks))
    1011        7524 :          ij_map = 0
    1012         260 :          ij_counter = 0
    1013         260 :          my_ij_pairs = 0
    1014         584 :          DO iiB = first_I_block, last_i_block, block_size
    1015        1236 :             DO jjB = iiB + block_size, last_J_block, block_size
    1016         810 :                IF (ij_counter + 1 > assigned_blocks) EXIT
    1017         652 :                ij_counter = ij_counter + 1
    1018        1956 :                ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
    1019         652 :                ij_map(1, ij_counter) = iiB
    1020         652 :                ij_map(2, ij_counter) = jjB
    1021         652 :                ij_map(3, ij_counter) = block_size
    1022         976 :                IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1023             :             END DO
    1024             :          END DO
    1025        1040 :          DO iiB = 1, homo
    1026        2856 :             DO jjB = iiB, homo
    1027        2596 :                IF (ij_marker(iiB, jjB)) THEN
    1028        1164 :                   ij_counter = ij_counter + 1
    1029        1164 :                   ij_map(1, ij_counter) = iiB
    1030        1164 :                   ij_map(2, ij_counter) = jjB
    1031        1164 :                   ij_map(3, ij_counter) = 1
    1032        1164 :                   IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1033             :                END IF
    1034             :             END DO
    1035             :          END DO
    1036         260 :          DEALLOCATE (ij_marker)
    1037             : 
    1038         258 :       ELSE IF (.NOT. my_alpha_beta_case) THEN
    1039             :          ! THese are the cases alpha/alpha and beta/beta
    1040             :          ! We do not have to consider the diagonal elements
    1041         172 :          total_ij_pairs = homo*(homo - 1)/2
    1042         172 :          num_IJ_blocks = (homo - 1)/block_size - 1
    1043             : 
    1044         172 :          first_I_block = 1
    1045         172 :          last_i_block = block_size*(num_IJ_blocks - 1)
    1046             : 
    1047             :          ! We shift the blocks to prevent the calculation of the diagonal elements which always give zero
    1048         172 :          first_J_block = block_size + 2
    1049         172 :          last_J_block = block_size*(num_IJ_blocks + 1) + 1
    1050             : 
    1051         172 :          ij_block_counter = 0
    1052         256 :          DO iiB = first_I_block, last_i_block, block_size
    1053         256 :             DO jjB = iiB + block_size + 1, last_J_block, block_size
    1054         196 :                ij_block_counter = ij_block_counter + 1
    1055             :             END DO
    1056             :          END DO
    1057             : 
    1058         172 :          total_ij_block = ij_block_counter
    1059         172 :          num_block_per_group = total_ij_block/ngroup
    1060         172 :          assigned_blocks = num_block_per_group*ngroup
    1061             : 
    1062         172 :          total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
    1063             : 
    1064         688 :          ALLOCATE (ij_marker(homo, homo))
    1065        2916 :          ij_marker = .TRUE.
    1066         516 :          ALLOCATE (ij_map(3, total_ij_pairs_blocks))
    1067        3276 :          ij_map = 0
    1068         172 :          ij_counter = 0
    1069         172 :          my_ij_pairs = 0
    1070         256 :          DO iiB = first_I_block, last_i_block, block_size
    1071         448 :             DO jjB = iiB + block_size + 1, last_J_block, block_size
    1072         196 :                IF (ij_counter + 1 > assigned_blocks) EXIT
    1073         192 :                ij_counter = ij_counter + 1
    1074         592 :                ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
    1075         192 :                ij_map(1, ij_counter) = iiB
    1076         192 :                ij_map(2, ij_counter) = jjB
    1077         192 :                ij_map(3, ij_counter) = block_size
    1078         276 :                IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1079             :             END DO
    1080             :          END DO
    1081         756 :          DO iiB = 1, homo
    1082        1544 :             DO jjB = iiB + 1, homo
    1083        1372 :                IF (ij_marker(iiB, jjB)) THEN
    1084         584 :                   ij_counter = ij_counter + 1
    1085         584 :                   ij_map(1, ij_counter) = iiB
    1086         584 :                   ij_map(2, ij_counter) = jjB
    1087         584 :                   ij_map(3, ij_counter) = 1
    1088         584 :                   IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1089             :                END IF
    1090             :             END DO
    1091             :          END DO
    1092         172 :          DEALLOCATE (ij_marker)
    1093             : 
    1094             :       ELSE
    1095          86 :          total_ij_pairs = homo*homo_beta
    1096          86 :          num_IJ_blocks = homo/block_size
    1097          86 :          num_IJ_blocks_beta = homo_beta/block_size
    1098             : 
    1099          86 :          first_I_block = 1
    1100          86 :          last_i_block = block_size*(num_IJ_blocks - 1)
    1101             : 
    1102          86 :          first_J_block = 1
    1103          86 :          last_J_block = block_size*(num_IJ_blocks_beta - 1)
    1104             : 
    1105          86 :          ij_block_counter = 0
    1106         242 :          DO iiB = first_I_block, last_i_block, block_size
    1107         242 :             DO jjB = first_J_block, last_J_block, block_size
    1108         282 :                ij_block_counter = ij_block_counter + 1
    1109             :             END DO
    1110             :          END DO
    1111             : 
    1112          86 :          total_ij_block = ij_block_counter
    1113          86 :          num_block_per_group = total_ij_block/ngroup
    1114          86 :          assigned_blocks = num_block_per_group*ngroup
    1115             : 
    1116          86 :          total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
    1117             : 
    1118         344 :          ALLOCATE (ij_marker(homo, homo_beta))
    1119        1364 :          ij_marker = .TRUE.
    1120         258 :          ALLOCATE (ij_map(3, total_ij_pairs_blocks))
    1121        4206 :          ij_map = 0
    1122          86 :          ij_counter = 0
    1123          86 :          my_ij_pairs = 0
    1124         242 :          DO iiB = first_I_block, last_i_block, block_size
    1125         482 :             DO jjB = first_J_block, last_J_block, block_size
    1126         244 :                IF (ij_counter + 1 > assigned_blocks) EXIT
    1127         240 :                ij_counter = ij_counter + 1
    1128         720 :                ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
    1129         240 :                ij_map(1, ij_counter) = iiB
    1130         240 :                ij_map(2, ij_counter) = jjB
    1131         240 :                ij_map(3, ij_counter) = block_size
    1132         396 :                IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1133             :             END DO
    1134             :          END DO
    1135         422 :          DO iiB = 1, homo
    1136        1452 :             DO jjB = 1, homo_beta
    1137        1366 :                IF (ij_marker(iiB, jjB)) THEN
    1138         790 :                   ij_counter = ij_counter + 1
    1139         790 :                   ij_map(1, ij_counter) = iiB
    1140         790 :                   ij_map(2, ij_counter) = jjB
    1141         790 :                   ij_map(3, ij_counter) = 1
    1142         790 :                   IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1143             :                END IF
    1144             :             END DO
    1145             :          END DO
    1146          86 :          DEALLOCATE (ij_marker)
    1147             :       END IF
    1148             : 
    1149         518 :       IF (unit_nr > 0) THEN
    1150         259 :          IF (block_size == 1) THEN
    1151             :             WRITE (UNIT=unit_nr, FMT="(T3,A,T66,F15.1)") &
    1152         231 :                "RI_INFO| Percentage of ij pairs communicated with block size 1:", 100.0_dp
    1153             :          ELSE
    1154             :             WRITE (UNIT=unit_nr, FMT="(T3,A,T66,F15.1)") &
    1155          28 :                "RI_INFO| Percentage of ij pairs communicated with block size 1:", &
    1156          56 :                100.0_dp*REAL((total_ij_pairs - assigned_blocks*(block_size**2)), KIND=dp)/REAL(total_ij_pairs, KIND=dp)
    1157             :          END IF
    1158         259 :          CALL m_flush(unit_nr)
    1159             :       END IF
    1160             : 
    1161         518 :       CALL timestop(handle)
    1162             : 
    1163         518 :    END SUBROUTINE mp2_ri_communication
    1164             : 
    1165             : ! **************************************************************************************************
    1166             : !> \brief ...
    1167             : !> \param para_env ...
    1168             : !> \param para_env_sub ...
    1169             : !> \param color_sub ...
    1170             : !> \param sizes_array ...
    1171             : !> \param calc_forces ...
    1172             : !> \param integ_group_size ...
    1173             : !> \param my_group_L_end ...
    1174             : !> \param my_group_L_size ...
    1175             : !> \param my_group_L_size_orig ...
    1176             : !> \param my_group_L_start ...
    1177             : !> \param my_new_group_L_size ...
    1178             : !> \param integ_group_pos2color_sub ...
    1179             : !> \param sizes_array_orig ...
    1180             : !> \param ranges_info_array ...
    1181             : !> \param comm_exchange ...
    1182             : !> \param comm_rep ...
    1183             : !> \param num_integ_group ...
    1184             : ! **************************************************************************************************
    1185         346 :    SUBROUTINE mp2_ri_create_group(para_env, para_env_sub, color_sub, &
    1186             :                                   sizes_array, calc_forces, &
    1187             :                                   integ_group_size, my_group_L_end, &
    1188             :                                   my_group_L_size, my_group_L_size_orig, my_group_L_start, my_new_group_L_size, &
    1189             :                                   integ_group_pos2color_sub, &
    1190             :                                   sizes_array_orig, ranges_info_array, comm_exchange, comm_rep, num_integ_group)
    1191             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env, para_env_sub
    1192             :       INTEGER, INTENT(IN)                                :: color_sub
    1193             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(INOUT)  :: sizes_array
    1194             :       LOGICAL, INTENT(IN)                                :: calc_forces
    1195             :       INTEGER, INTENT(IN)                                :: integ_group_size, my_group_L_end
    1196             :       INTEGER, INTENT(INOUT)                             :: my_group_L_size
    1197             :       INTEGER, INTENT(OUT)                               :: my_group_L_size_orig
    1198             :       INTEGER, INTENT(IN)                                :: my_group_L_start
    1199             :       INTEGER, INTENT(INOUT)                             :: my_new_group_L_size
    1200             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT)    :: integ_group_pos2color_sub, &
    1201             :                                                             sizes_array_orig
    1202             :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
    1203             :          INTENT(OUT)                                     :: ranges_info_array
    1204             :       TYPE(mp_comm_type), INTENT(OUT)                    :: comm_exchange, comm_rep
    1205             :       INTEGER, INTENT(IN)                                :: num_integ_group
    1206             : 
    1207             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_create_group'
    1208             : 
    1209             :       INTEGER                                            :: handle, iiB, proc_receive, proc_shift, &
    1210             :                                                             sub_sub_color
    1211         346 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: new_sizes_array, rep_ends_array, &
    1212             :                                                             rep_sizes_array, rep_starts_array
    1213             :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: my_info
    1214             : 
    1215         346 :       CALL timeset(routineN, handle)
    1216             :       !
    1217         346 :       sub_sub_color = para_env_sub%mepos*num_integ_group + color_sub/integ_group_size
    1218         346 :       CALL comm_exchange%from_split(para_env, sub_sub_color)
    1219             : 
    1220             :       ! create the replication group
    1221         346 :       sub_sub_color = para_env_sub%mepos*comm_exchange%num_pe + comm_exchange%mepos
    1222         346 :       CALL comm_rep%from_split(para_env, sub_sub_color)
    1223             : 
    1224             :       ! create the new limits for K according to the size
    1225             :       ! of the integral group
    1226             : 
    1227             :       ! info array for replication
    1228        1038 :       ALLOCATE (rep_ends_array(0:comm_rep%num_pe - 1))
    1229        1038 :       ALLOCATE (rep_starts_array(0:comm_rep%num_pe - 1))
    1230        1038 :       ALLOCATE (rep_sizes_array(0:comm_rep%num_pe - 1))
    1231             : 
    1232         346 :       CALL comm_rep%allgather(my_group_L_size, rep_sizes_array)
    1233         346 :       CALL comm_rep%allgather(my_group_L_start, rep_starts_array)
    1234         346 :       CALL comm_rep%allgather(my_group_L_end, rep_ends_array)
    1235             : 
    1236             :       ! calculate my_new_group_L_size according to sizes_array
    1237         346 :       my_new_group_L_size = my_group_L_size
    1238             : 
    1239             :       ! Info of this process
    1240        1038 :       ALLOCATE (my_info(4, 0:comm_rep%num_pe - 1))
    1241         346 :       my_info(1, 0) = my_group_L_start
    1242         346 :       my_info(2, 0) = my_group_L_end
    1243         346 :       my_info(3, 0) = 1
    1244         346 :       my_info(4, 0) = my_group_L_size
    1245             : 
    1246         580 :       DO proc_shift = 1, comm_rep%num_pe - 1
    1247         234 :          proc_receive = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
    1248             : 
    1249         234 :          my_new_group_L_size = my_new_group_L_size + rep_sizes_array(proc_receive)
    1250             : 
    1251         234 :          my_info(1, proc_shift) = rep_starts_array(proc_receive)
    1252         234 :          my_info(2, proc_shift) = rep_ends_array(proc_receive)
    1253         234 :          my_info(3, proc_shift) = my_info(4, proc_shift - 1) + 1
    1254         580 :          my_info(4, proc_shift) = my_new_group_L_size
    1255             : 
    1256             :       END DO
    1257             : 
    1258        1038 :       ALLOCATE (new_sizes_array(0:comm_exchange%num_pe - 1))
    1259        1384 :       ALLOCATE (ranges_info_array(4, 0:comm_rep%num_pe - 1, 0:comm_exchange%num_pe - 1))
    1260         346 :       CALL comm_exchange%allgather(my_new_group_L_size, new_sizes_array)
    1261         346 :       CALL comm_exchange%allgather(my_info, ranges_info_array)
    1262             : 
    1263         346 :       DEALLOCATE (rep_sizes_array)
    1264         346 :       DEALLOCATE (rep_starts_array)
    1265         346 :       DEALLOCATE (rep_ends_array)
    1266             : 
    1267        1038 :       ALLOCATE (integ_group_pos2color_sub(0:comm_exchange%num_pe - 1))
    1268         346 :       CALL comm_exchange%allgather(color_sub, integ_group_pos2color_sub)
    1269             : 
    1270         346 :       IF (calc_forces) THEN
    1271         220 :          iiB = SIZE(sizes_array)
    1272         660 :          ALLOCATE (sizes_array_orig(0:iiB - 1))
    1273         654 :          sizes_array_orig(:) = sizes_array
    1274             :       END IF
    1275             : 
    1276         346 :       my_group_L_size_orig = my_group_L_size
    1277         346 :       my_group_L_size = my_new_group_L_size
    1278         346 :       DEALLOCATE (sizes_array)
    1279             : 
    1280        1038 :       ALLOCATE (sizes_array(0:integ_group_size - 1))
    1281         790 :       sizes_array(:) = new_sizes_array
    1282             : 
    1283         346 :       DEALLOCATE (new_sizes_array)
    1284             :       !
    1285         346 :       CALL timestop(handle)
    1286             : 
    1287         692 :    END SUBROUTINE mp2_ri_create_group
    1288             : 
    1289             : ! **************************************************************************************************
    1290             : !> \brief ...
    1291             : !> \param mp2_env ...
    1292             : !> \param para_env ...
    1293             : !> \param para_env_sub ...
    1294             : !> \param gd_array ...
    1295             : !> \param gd_B_virtual ...
    1296             : !> \param homo ...
    1297             : !> \param dimen_RI ...
    1298             : !> \param unit_nr ...
    1299             : !> \param integ_group_size ...
    1300             : !> \param ngroup ...
    1301             : !> \param num_integ_group ...
    1302             : !> \param virtual ...
    1303             : !> \param calc_forces ...
    1304             : ! **************************************************************************************************
    1305        1384 :    SUBROUTINE mp2_ri_get_integ_group_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
    1306         346 :                                           homo, dimen_RI, unit_nr, &
    1307             :                                           integ_group_size, &
    1308             :                                           ngroup, num_integ_group, &
    1309         346 :                                           virtual, calc_forces)
    1310             :       TYPE(mp2_type)                                     :: mp2_env
    1311             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env, para_env_sub
    1312             :       TYPE(group_dist_d1_type), INTENT(IN)               :: gd_array
    1313             :       TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
    1314             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo
    1315             :       INTEGER, INTENT(IN)                                :: dimen_RI, unit_nr
    1316             :       INTEGER, INTENT(OUT)                               :: integ_group_size, ngroup, num_integ_group
    1317             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: virtual
    1318             :       LOGICAL, INTENT(IN)                                :: calc_forces
    1319             : 
    1320             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_get_integ_group_size'
    1321             : 
    1322             :       INTEGER                                            :: block_size, handle, iiB, &
    1323             :                                                             max_repl_group_size, &
    1324             :                                                             min_integ_group_size
    1325             :       INTEGER(KIND=int_8)                                :: mem
    1326             :       LOGICAL                                            :: calc_group_size
    1327             :       REAL(KIND=dp)                                      :: factor, mem_base, mem_min, mem_per_blk, &
    1328             :                                                             mem_per_repl, mem_per_repl_blk, &
    1329             :                                                             mem_real
    1330             : 
    1331         346 :       CALL timeset(routineN, handle)
    1332             : 
    1333         346 :       ngroup = para_env%num_pe/para_env_sub%num_pe
    1334             : 
    1335         346 :       calc_group_size = mp2_env%ri_mp2%number_integration_groups <= 0
    1336         346 :       IF (.NOT. calc_group_size) THEN
    1337          10 :          IF (MOD(ngroup, mp2_env%ri_mp2%number_integration_groups) /= 0) calc_group_size = .TRUE.
    1338             :       END IF
    1339             : 
    1340         346 :       IF (calc_group_size) THEN
    1341         336 :          CALL m_memory(mem)
    1342         336 :          mem_real = (mem + 1024*1024 - 1)/(1024*1024)
    1343         336 :          CALL para_env%min(mem_real)
    1344             : 
    1345         336 :          mem_base = 0.0_dp
    1346         336 :          mem_per_blk = 0.0_dp
    1347         336 :          mem_per_repl = 0.0_dp
    1348         336 :          mem_per_repl_blk = 0.0_dp
    1349             : 
    1350             :          ! BIB_C_copy
    1351             :          mem_per_repl = mem_per_repl + MAXVAL(MAX(REAL(homo, KIND=dp)*maxsize(gd_array), REAL(dimen_RI, KIND=dp))* &
    1352        1088 :                                               maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1353             :          ! BIB_C
    1354         752 :          mem_per_repl = mem_per_repl + SUM(REAL(homo, KIND=dp)*maxsize(gd_B_virtual))*maxsize(gd_array)*8.0_dp/(1024**2)
    1355             :          ! BIB_C_rec
    1356         752 :          mem_per_repl_blk = mem_per_repl_blk + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
    1357             :          ! local_i_aL+local_j_aL
    1358         752 :          mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*REAL(dimen_RI, KIND=dp)*8.0_dp/(1024**2)
    1359             :          ! local_ab
    1360        1088 :          mem_base = mem_base + MAXVAL(REAL(virtual, KIND=dp)*maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1361             :          ! external_ab/external_i_aL
    1362        1168 :          mem_base = mem_base + REAL(MAX(dimen_RI, MAXVAL(virtual)), KIND=dp)*MAXVAL(maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1363             : 
    1364         336 :          IF (calc_forces) THEN
    1365             :             ! Gamma_P_ia
    1366             :             mem_per_repl = mem_per_repl + SUM(REAL(homo, KIND=dp)*maxsize(gd_array)* &
    1367         496 :                                               maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1368             :             ! Y_i_aP+Y_j_aP
    1369         496 :             mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*dimen_RI*8.0_dp/(1024**2)
    1370             :             ! local_ba/t_ab
    1371         780 :             mem_base = mem_base + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
    1372             :             ! P_ij
    1373         496 :             mem_base = mem_base + SUM(REAL(homo, KIND=dp)*homo)*8.0_dp/(1024**2)
    1374             :             ! P_ab
    1375         496 :             mem_base = mem_base + SUM(REAL(virtual, KIND=dp)*maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1376             :             ! send_ab/send_i_aL
    1377         780 :             mem_base = mem_base + REAL(MAX(dimen_RI, MAXVAL(virtual)), KIND=dp)*MAXVAL(maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1378             :          END IF
    1379             : 
    1380             :          ! This a first guess based on the assumption of optimal block sizes
    1381         752 :          block_size = MAX(1, MIN(FLOOR(SQRT(REAL(MINVAL(homo), KIND=dp))), FLOOR(MINVAL(homo)/SQRT(2.0_dp*ngroup))))
    1382         336 :          IF (mp2_env%ri_mp2%block_size > 0) block_size = mp2_env%ri_mp2%block_size
    1383             : 
    1384         336 :          mem_min = mem_base + mem_per_repl + (mem_per_blk + mem_per_repl_blk)*block_size
    1385             : 
    1386         504 :          IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum available memory per MPI process:', &
    1387         336 :             mem_real, ' MiB'
    1388         504 :          IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum required memory per MPI process:', &
    1389         336 :             mem_min, ' MiB'
    1390             : 
    1391             :          ! We use the following communication model
    1392             :          ! Comm(replication)+Comm(collection of data for ij pair)+Comm(contraction)
    1393             :          ! One can show that the costs of the contraction step are independent of the block size and the replication group size
    1394             :          ! With gradients, the other two steps are carried out twice (Y_i_aP -> Gamma_i_aP, and dereplication)
    1395             :          ! NL ... number of RI basis functions
    1396             :          ! NR ... replication group size
    1397             :          ! NG ... number of sub groups
    1398             :          ! NB ... Block size
    1399             :          ! o  ... number of occupied orbitals
    1400             :          ! Then, we have the communication costs (in multiples of the original BIb_C matrix)
    1401             :          ! (NR/NG)+(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
    1402             :          ! and with gradients
    1403             :          ! 2*(NR/NG)+2*(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
    1404             :          ! We are looking for the minimum of the communication volume,
    1405             :          ! thus, if the prefactor of (NR/NG) is smaller than zero, use the largest possible replication group size.
    1406             :          ! If the factor is larger than zero, set the replication group size to 1. (For small systems and a large number of subgroups)
    1407             :          ! Replication group size = 1 implies that the integration group size equals the number of subgroups
    1408             : 
    1409         336 :          integ_group_size = ngroup
    1410             : 
    1411             :          ! Multiply everything by homo*virtual to consider differences between spin channels in case of open-shell calculations
    1412             :          factor = REAL(SUM(homo*virtual), KIND=dp) &
    1413        1584 :                   - SUM((REAL(MAXVAL(homo), KIND=dp)/block_size + block_size - 2.0_dp)*homo*virtual)/ngroup
    1414         656 :          IF (SIZE(homo) == 2) factor = factor - 2.0_dp*PRODUCT(homo)/block_size/ngroup*SUM(homo*virtual)
    1415             : 
    1416         672 :          IF (factor <= 0.0_dp) THEN
    1417             :             ! Remove the fixed memory and divide by the memory per replication group size
    1418             :             max_repl_group_size = MIN(MAX(FLOOR((mem_real - mem_base - mem_per_blk*block_size)/ &
    1419         248 :                                                 (mem_per_repl + mem_per_repl_blk*block_size)), 1), ngroup)
    1420             :             ! Convert to an integration group size
    1421         248 :             min_integ_group_size = ngroup/max_repl_group_size
    1422             : 
    1423             :             ! Ensure that the integration group size is a divisor of the number of sub groups
    1424         248 :             DO iiB = MAX(MIN(min_integ_group_size, ngroup), 1), ngroup
    1425             :                ! check that the ngroup is a multiple of  integ_group_size
    1426         248 :                IF (MOD(ngroup, iiB) == 0) THEN
    1427         248 :                   integ_group_size = iiB
    1428         248 :                   EXIT
    1429             :                END IF
    1430           0 :                integ_group_size = integ_group_size + 1
    1431             :             END DO
    1432             :          END IF
    1433             :       ELSE ! We take the user provided group size
    1434          10 :          integ_group_size = ngroup/mp2_env%ri_mp2%number_integration_groups
    1435             :       END IF
    1436             : 
    1437         346 :       IF (unit_nr > 0) THEN
    1438             :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    1439         173 :             "RI_INFO| Group size for integral replication:", integ_group_size*para_env_sub%num_pe
    1440         173 :          CALL m_flush(unit_nr)
    1441             :       END IF
    1442             : 
    1443         346 :       num_integ_group = ngroup/integ_group_size
    1444             : 
    1445         346 :       CALL timestop(handle)
    1446             : 
    1447         346 :    END SUBROUTINE mp2_ri_get_integ_group_size
    1448             : 
    1449             : ! **************************************************************************************************
    1450             : !> \brief ...
    1451             : !> \param mp2_env ...
    1452             : !> \param para_env ...
    1453             : !> \param para_env_sub ...
    1454             : !> \param gd_array ...
    1455             : !> \param gd_B_virtual ...
    1456             : !> \param homo ...
    1457             : !> \param virtual ...
    1458             : !> \param dimen_RI ...
    1459             : !> \param unit_nr ...
    1460             : !> \param block_size ...
    1461             : !> \param ngroup ...
    1462             : !> \param num_integ_group ...
    1463             : !> \param my_open_shell_ss ...
    1464             : !> \param calc_forces ...
    1465             : !> \param buffer_1D ...
    1466             : ! **************************************************************************************************
    1467         518 :    SUBROUTINE mp2_ri_get_block_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
    1468         518 :                                     homo, virtual, dimen_RI, unit_nr, &
    1469             :                                     block_size, ngroup, num_integ_group, &
    1470             :                                     my_open_shell_ss, calc_forces, buffer_1D)
    1471             :       TYPE(mp2_type)                                     :: mp2_env
    1472             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env, para_env_sub
    1473             :       TYPE(group_dist_d1_type), INTENT(IN)               :: gd_array
    1474             :       TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
    1475             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
    1476             :       INTEGER, INTENT(IN)                                :: dimen_RI, unit_nr
    1477             :       INTEGER, INTENT(OUT)                               :: block_size, ngroup
    1478             :       INTEGER, INTENT(IN)                                :: num_integ_group
    1479             :       LOGICAL, INTENT(IN)                                :: my_open_shell_ss, calc_forces
    1480             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
    1481             :          INTENT(OUT)                                     :: buffer_1D
    1482             : 
    1483             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_get_block_size'
    1484             : 
    1485             :       INTEGER                                            :: best_block_size, handle, num_IJ_blocks
    1486             :       INTEGER(KIND=int_8)                                :: buffer_size, mem
    1487             :       REAL(KIND=dp)                                      :: mem_base, mem_per_blk, mem_per_repl_blk, &
    1488             :                                                             mem_real
    1489             : 
    1490         518 :       CALL timeset(routineN, handle)
    1491             : 
    1492         518 :       ngroup = para_env%num_pe/para_env_sub%num_pe
    1493             : 
    1494         518 :       CALL m_memory(mem)
    1495         518 :       mem_real = (mem + 1024*1024 - 1)/(1024*1024)
    1496         518 :       CALL para_env%min(mem_real)
    1497             : 
    1498         518 :       mem_base = 0.0_dp
    1499         518 :       mem_per_blk = 0.0_dp
    1500         518 :       mem_per_repl_blk = 0.0_dp
    1501             : 
    1502             :       ! external_ab
    1503        1726 :       mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
    1504             :       ! BIB_C_rec
    1505        1122 :       mem_per_repl_blk = mem_per_repl_blk + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
    1506             :       ! local_i_aL+local_j_aL
    1507        1122 :       mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*REAL(dimen_RI, KIND=dp)*8.0_dp/(1024**2)
    1508             :       ! Copy to keep arrays contiguous
    1509        1726 :       mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
    1510             : 
    1511         518 :       IF (calc_forces) THEN
    1512             :          ! Y_i_aP+Y_j_aP+BIb_C_send
    1513         820 :          mem_per_blk = mem_per_blk + 3.0_dp*MAXVAL(maxsize(gd_B_virtual))*dimen_RI*8.0_dp/(1024**2)
    1514             :          ! send_ab
    1515        1268 :          mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
    1516             :       END IF
    1517             : 
    1518         518 :       best_block_size = 1
    1519             : 
    1520             :       ! Here we split the memory half for the communication, half for replication
    1521         518 :       IF (mp2_env%ri_mp2%block_size > 0) THEN
    1522             :          best_block_size = mp2_env%ri_mp2%block_size
    1523             :       ELSE
    1524         286 :          best_block_size = MAX(FLOOR((mem_real - mem_base)/(mem_per_blk + mem_per_repl_blk*ngroup/num_integ_group)), 1)
    1525             : 
    1526     4319682 :          DO
    1527     4319968 :             IF (SIZE(homo) == 1) THEN
    1528     3468400 :             IF (.NOT. my_open_shell_ss) THEN
    1529     1580206 :                num_IJ_blocks = (homo(1)/best_block_size)
    1530     1580206 :                num_IJ_blocks = (num_IJ_blocks*num_IJ_blocks - num_IJ_blocks)/2
    1531             :             ELSE
    1532     1888194 :                num_IJ_blocks = ((homo(1) - 1)/best_block_size)
    1533     1888194 :                num_IJ_blocks = (num_IJ_blocks*num_IJ_blocks - num_IJ_blocks)/2
    1534             :             END IF
    1535             :             ELSE
    1536     2554704 :             num_ij_blocks = PRODUCT(homo/best_block_size)
    1537             :             END IF
    1538             :             ! Enforce at least one large block for each subgroup
    1539     4319968 :             IF ((num_IJ_blocks >= ngroup .AND. num_IJ_blocks > 0) .OR. best_block_size == 1) THEN
    1540             :                EXIT
    1541             :             ELSE
    1542     4319682 :                best_block_size = best_block_size - 1
    1543             :             END IF
    1544             :          END DO
    1545             : 
    1546         286 :          IF (SIZE(homo) == 1) THEN
    1547         226 :          IF (my_open_shell_ss) THEN
    1548             :             ! check that best_block_size is not bigger than sqrt(homo-1)
    1549             :             ! Diagonal elements do not have to be considered
    1550         120 :             best_block_size = MIN(FLOOR(SQRT(REAL(homo(1) - 1, KIND=dp))), best_block_size)
    1551             :          ELSE
    1552             :             ! check that best_block_size is not bigger than sqrt(homo)
    1553         106 :             best_block_size = MIN(FLOOR(SQRT(REAL(homo(1), KIND=dp))), best_block_size)
    1554             :          END IF
    1555             :          END IF
    1556             :       END IF
    1557         518 :       block_size = MAX(1, best_block_size)
    1558             : 
    1559         518 :       IF (unit_nr > 0) THEN
    1560             :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    1561         259 :             "RI_INFO| Block size:", block_size
    1562         259 :          CALL m_flush(unit_nr)
    1563             :       END IF
    1564             : 
    1565             :       ! Determine recv buffer size (BI_C_recv, external_i_aL, external_ab)
    1566             :       buffer_size = MAX(INT(maxsize(gd_array), KIND=int_8)*block_size, INT(MAX(dimen_RI, MAXVAL(virtual)), KIND=int_8)) &
    1567        1726 :                     *MAXVAL(maxsize(gd_B_virtual))
    1568             :       ! The send buffer has the same size as the recv buffer
    1569         518 :       IF (calc_forces) buffer_size = buffer_size*2
    1570        1554 :       ALLOCATE (buffer_1D(buffer_size))
    1571             : 
    1572         518 :       CALL timestop(handle)
    1573             : 
    1574         518 :    END SUBROUTINE mp2_ri_get_block_size
    1575             : 
    1576             : ! **************************************************************************************************
    1577             : !> \brief ...
    1578             : !> \param mp2_env ...
    1579             : !> \param para_env_sub ...
    1580             : !> \param gd_B_virtual ...
    1581             : !> \param Eigenval ...
    1582             : !> \param homo ...
    1583             : !> \param dimen_RI ...
    1584             : !> \param iiB ...
    1585             : !> \param jjB ...
    1586             : !> \param my_B_size ...
    1587             : !> \param my_B_virtual_end ...
    1588             : !> \param my_B_virtual_start ...
    1589             : !> \param my_i ...
    1590             : !> \param my_j ...
    1591             : !> \param virtual ...
    1592             : !> \param local_ab ...
    1593             : !> \param t_ab ...
    1594             : !> \param my_local_i_aL ...
    1595             : !> \param my_local_j_aL ...
    1596             : !> \param open_ss ...
    1597             : !> \param Y_i_aP ...
    1598             : !> \param Y_j_aP ...
    1599             : !> \param local_ba ...
    1600             : !> \param ispin ...
    1601             : !> \param jspin ...
    1602             : !> \param dgemm_counter ...
    1603             : !> \param buffer_1D ...
    1604             : ! **************************************************************************************************
    1605        6276 :    SUBROUTINE mp2_update_P_gamma(mp2_env, para_env_sub, gd_B_virtual, &
    1606        3138 :                                  Eigenval, homo, dimen_RI, iiB, jjB, my_B_size, &
    1607        3138 :                                  my_B_virtual_end, my_B_virtual_start, my_i, my_j, virtual, local_ab, &
    1608        3138 :                                  t_ab, my_local_i_aL, my_local_j_aL, open_ss, Y_i_aP, Y_j_aP, &
    1609        1569 :                                  local_ba, ispin, jspin, dgemm_counter, buffer_1D)
    1610             :       TYPE(mp2_type)                                     :: mp2_env
    1611             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env_sub
    1612             :       TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
    1613             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
    1614             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo
    1615             :       INTEGER, INTENT(IN)                                :: dimen_RI, iiB, jjB
    1616             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: my_B_size, my_B_virtual_end, &
    1617             :                                                             my_B_virtual_start
    1618             :       INTEGER, INTENT(IN)                                :: my_i, my_j
    1619             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: virtual
    1620             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
    1621             :          INTENT(INOUT), TARGET                           :: local_ab
    1622             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
    1623             :          INTENT(IN), TARGET                              :: t_ab, my_local_i_aL, my_local_j_aL
    1624             :       LOGICAL, INTENT(IN)                                :: open_ss
    1625             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
    1626             :          INTENT(INOUT), TARGET                           :: Y_i_aP, Y_j_aP, local_ba
    1627             :       INTEGER, INTENT(IN)                                :: ispin, jspin
    1628             :       TYPE(dgemm_counter_type), INTENT(INOUT)            :: dgemm_counter
    1629             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET    :: buffer_1D
    1630             : 
    1631             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_update_P_gamma'
    1632             : 
    1633             :       INTEGER :: a, b, b_global, handle, proc_receive, proc_send, proc_shift, rec_B_size, &
    1634             :          rec_B_virtual_end, rec_B_virtual_start, send_B_size, send_B_virtual_end, &
    1635             :          send_B_virtual_start
    1636             :       INTEGER(KIND=int_8)                                :: offset
    1637             :       LOGICAL                                            :: alpha_beta
    1638             :       REAL(KIND=dp)                                      :: factor, P_ij_diag
    1639             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
    1640        1569 :          POINTER                                         :: external_ab, send_ab
    1641             : 
    1642        1569 :       CALL timeset(routineN//"_Pia", handle)
    1643             : 
    1644        1569 :       alpha_beta = .NOT. (ispin == jspin)
    1645        1569 :       IF (open_ss) THEN
    1646             :          factor = 1.0_dp
    1647             :       ELSE
    1648        1189 :          factor = 2.0_dp
    1649             :       END IF
    1650             :       ! divide the (ia|jb) integrals by Delta_ij^ab
    1651       24660 :       DO b = 1, my_B_size(jspin)
    1652       23091 :          b_global = b + my_B_virtual_start(jspin) - 1
    1653      462933 :          DO a = 1, virtual(ispin)
    1654             :             local_ab(a, b) = -local_ab(a, b)/ &
    1655             :                              (Eigenval(homo(ispin) + a, ispin) + Eigenval(homo(jspin) + b_global, jspin) - &
    1656      461364 :                               Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
    1657             :          END DO
    1658             :       END DO
    1659        1569 :       IF (.NOT. (alpha_beta)) THEN
    1660      322531 :          P_ij_diag = -SUM(local_ab*t_ab)*factor
    1661             :       ELSE
    1662             :          ! update diagonal part of P_ij
    1663      140402 :          P_ij_diag = -SUM(local_ab*local_ab)*mp2_env%scale_S
    1664             :          ! More integrals needed only for alpha-beta case: local_ba
    1665        6939 :          DO b = 1, my_B_size(ispin)
    1666        6449 :             b_global = b + my_B_virtual_start(ispin) - 1
    1667      139874 :             DO a = 1, virtual(jspin)
    1668             :                local_ba(a, b) = -local_ba(a, b)/ &
    1669             :                                 (Eigenval(homo(jspin) + a, jspin) + Eigenval(homo(ispin) + b_global, ispin) - &
    1670      139384 :                                  Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
    1671             :             END DO
    1672             :          END DO
    1673             :       END IF
    1674             : 
    1675             :       ! P_ab and add diagonal part of P_ij
    1676             : 
    1677        1569 :       CALL dgemm_counter_start(dgemm_counter)
    1678        1569 :       IF (.NOT. (alpha_beta)) THEN
    1679             :          CALL local_gemm('T', 'N', my_B_size(ispin), my_B_size(ispin), virtual(ispin), 1.0_dp, &
    1680             :                          t_ab, virtual(ispin), local_ab, virtual(ispin), &
    1681             :                          1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, &
    1682        1079 :                                        my_B_virtual_start(ispin):my_B_virtual_end(ispin)), my_B_size(ispin), mp2_env%local_gemm_ctx)
    1683             :          mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) = &
    1684        1079 :             mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) + P_ij_diag
    1685             :       ELSE
    1686             :          CALL local_gemm('T', 'N', my_B_size(ispin), my_B_size(ispin), virtual(jspin), mp2_env%scale_S, &
    1687             :                          local_ba, virtual(jspin), local_ba, virtual(jspin), 1.0_dp, &
    1688             :                         mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_virtual_start(ispin):my_B_virtual_end(ispin)), my_B_size(ispin), &
    1689         490 :                          mp2_env%local_gemm_ctx)
    1690             : 
    1691             :          mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) = &
    1692         490 :             mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) + P_ij_diag
    1693             : 
    1694             :          CALL local_gemm('T', 'N', my_B_size(jspin), my_B_size(jspin), virtual(ispin), mp2_env%scale_S, &
    1695             :                          local_ab, virtual(ispin), local_ab, virtual(ispin), 1.0_dp, &
    1696             :                         mp2_env%ri_grad%P_ab(jspin)%array(:, my_B_virtual_start(jspin):my_B_virtual_end(jspin)), my_B_size(jspin), &
    1697         490 :                          mp2_env%local_gemm_ctx)
    1698             : 
    1699             :          mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjB - 1, my_j + jjB - 1) = &
    1700         490 :             mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjB - 1, my_j + jjB - 1) + P_ij_diag
    1701             :       END IF
    1702             :       ! The summation is over unique pairs. In alpha-beta case, all pairs are unique: subroutine is called for
    1703             :       ! both i^alpha,j^beta and i^beta,j^alpha. Formally, my_i can be equal to my_j, but they are different
    1704             :       ! due to spin in alpha-beta case.
    1705        1569 :       IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
    1706             : 
    1707             :          CALL local_gemm('N', 'T', my_B_size(ispin), virtual(ispin), my_B_size(ispin), 1.0_dp, &
    1708             :                          t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1709             :                          local_ab, virtual(ispin), &
    1710             :                          1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array, my_B_size(ispin), &
    1711         796 :                          mp2_env%local_gemm_ctx)
    1712             : 
    1713             :          mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjB - 1, my_j + jjB - 1) = &
    1714         796 :             mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjB - 1, my_j + jjB - 1) + P_ij_diag
    1715             :       END IF
    1716        1741 :       DO proc_shift = 1, para_env_sub%num_pe - 1
    1717         172 :          proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    1718         172 :          proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    1719             : 
    1720         172 :          CALL get_group_dist(gd_B_virtual(jspin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    1721         172 :          CALL get_group_dist(gd_B_virtual(jspin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    1722             : 
    1723         172 :          external_ab(1:virtual(ispin), 1:rec_B_size) => buffer_1D(1:INT(virtual(ispin), int_8)*rec_B_size)
    1724       35072 :          external_ab = 0.0_dp
    1725             : 
    1726             :          CALL para_env_sub%sendrecv(local_ab, proc_send, &
    1727         172 :                                     external_ab, proc_receive)
    1728             : 
    1729         172 :          IF (.NOT. (alpha_beta)) THEN
    1730             :             CALL local_gemm('T', 'N', my_B_size(ispin), rec_B_size, virtual(ispin), 1.0_dp, &
    1731             :                             t_ab, virtual(ispin), external_ab, virtual(ispin), &
    1732             :                             1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_B_virtual_start:rec_B_virtual_end), my_B_size(ispin), &
    1733         102 :                             mp2_env%local_gemm_ctx)
    1734             :          ELSE
    1735             :             CALL local_gemm('T', 'N', my_B_size(jspin), rec_B_size, virtual(ispin), mp2_env%scale_S, &
    1736             :                             local_ab, virtual(ispin), external_ab, virtual(ispin), &
    1737             :                             1.0_dp, mp2_env%ri_grad%P_ab(jspin)%array(:, rec_B_virtual_start:rec_B_virtual_end), &
    1738          70 :                             my_B_size(jspin), mp2_env%local_gemm_ctx)
    1739             : 
    1740             :             ! For alpha-beta part of alpha-density we need a new parallel code
    1741             :             ! And new external_ab (of a different size)
    1742          70 :             CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    1743          70 :             CALL get_group_dist(gd_B_virtual(ispin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    1744          70 :             external_ab(1:virtual(jspin), 1:rec_B_size) => buffer_1D(1:INT(virtual(jspin), int_8)*rec_B_size)
    1745       14700 :             external_ab = 0.0_dp
    1746             :             CALL para_env_sub%sendrecv(local_ba, proc_send, &
    1747          70 :                                        external_ab, proc_receive)
    1748             :             CALL local_gemm('T', 'N', my_B_size(ispin), rec_B_size, virtual(jspin), mp2_env%scale_S, &
    1749             :                             local_ba, virtual(jspin), external_ab, virtual(jspin), &
    1750             :                             1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_B_virtual_start:rec_B_virtual_end), my_B_size(ispin), &
    1751          70 :                             mp2_env%local_gemm_ctx)
    1752             :          END IF
    1753             : 
    1754        1913 :          IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
    1755             :             external_ab(1:my_B_size(ispin), 1:virtual(ispin)) => &
    1756          86 :                buffer_1D(1:INT(virtual(ispin), int_8)*my_B_size(ispin))
    1757       18083 :             external_ab = 0.0_dp
    1758             : 
    1759          86 :             offset = INT(virtual(ispin), int_8)*my_B_size(ispin)
    1760             : 
    1761          86 :             send_ab(1:send_B_size, 1:virtual(ispin)) => buffer_1D(offset + 1:offset + INT(send_B_size, int_8)*virtual(ispin))
    1762       18083 :             send_ab = 0.0_dp
    1763             : 
    1764             :             CALL local_gemm('N', 'T', send_B_size, virtual(ispin), my_B_size(ispin), 1.0_dp, &
    1765             :                             t_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
    1766             :                             local_ab, virtual(ispin), &
    1767             :                             0.0_dp, send_ab, send_B_size, &
    1768          86 :                             mp2_env%local_gemm_ctx)
    1769             :             CALL para_env_sub%sendrecv(send_ab, proc_send, &
    1770          86 :                                        external_ab, proc_receive)
    1771             : 
    1772       18083 :             mp2_env%ri_grad%P_ab(ispin)%array(:, :) = mp2_env%ri_grad%P_ab(ispin)%array + external_ab
    1773             :          END IF
    1774             : 
    1775             :       END DO
    1776        1569 :       IF (.NOT. alpha_beta) THEN
    1777        1079 :          IF (my_i /= my_j) THEN
    1778         796 :             CALL dgemm_counter_stop(dgemm_counter, 2*my_B_size(ispin), virtual(ispin), virtual(ispin))
    1779             :          ELSE
    1780         283 :             CALL dgemm_counter_stop(dgemm_counter, my_B_size(ispin), virtual(ispin), virtual(ispin))
    1781             :          END IF
    1782             :       ELSE
    1783        1470 :          CALL dgemm_counter_stop(dgemm_counter, SUM(my_B_size), virtual(ispin), virtual(jspin))
    1784             :       END IF
    1785        1569 :       CALL timestop(handle)
    1786             : 
    1787             :       ! Now, Gamma_P_ia (made of Y_ia_P)
    1788             : 
    1789        1569 :       CALL timeset(routineN//"_Gamma", handle)
    1790        1569 :       CALL dgemm_counter_start(dgemm_counter)
    1791        1569 :       IF (.NOT. alpha_beta) THEN
    1792             :          ! Alpha-alpha, beta-beta and closed shell
    1793             :          CALL local_gemm('N', 'T', my_B_size(ispin), dimen_RI, my_B_size(ispin), 1.0_dp, &
    1794             :                          t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1795             :                          my_local_j_aL, dimen_RI, 1.0_dp, Y_i_aP, my_B_size(ispin), &
    1796        1079 :                          mp2_env%local_gemm_ctx)
    1797             :       ELSE ! Alpha-beta
    1798             :          CALL local_gemm('N', 'T', my_B_size(ispin), dimen_RI, my_B_size(jspin), mp2_env%scale_S, &
    1799             :                          local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1800         490 :                          my_local_j_aL, dimen_RI, 1.0_dp, Y_i_aP, my_B_size(ispin), mp2_env%local_gemm_ctx)
    1801             :          CALL local_gemm('T', 'T', my_B_size(jspin), dimen_RI, my_B_size(ispin), mp2_env%scale_S, &
    1802             :                          local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1803         490 :                          my_local_i_aL, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(jspin), mp2_env%local_gemm_ctx)
    1804             :       END IF
    1805             : 
    1806        1569 :       IF (para_env_sub%num_pe > 1) THEN
    1807         172 :          external_ab(1:my_B_size(ispin), 1:dimen_RI) => buffer_1D(1:INT(my_B_size(ispin), int_8)*dimen_RI)
    1808      189692 :          external_ab = 0.0_dp
    1809             : 
    1810         172 :          offset = INT(my_B_size(ispin), int_8)*dimen_RI
    1811             :       END IF
    1812             :       !
    1813        1741 :       DO proc_shift = 1, para_env_sub%num_pe - 1
    1814         172 :          proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    1815         172 :          proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    1816             : 
    1817         172 :          CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    1818         172 :          CALL get_group_dist(gd_B_virtual(ispin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    1819             : 
    1820         172 :          send_ab(1:send_B_size, 1:dimen_RI) => buffer_1D(offset + 1:offset + INT(dimen_RI, int_8)*send_B_size)
    1821      189692 :          send_ab = 0.0_dp
    1822        1913 :          IF (.NOT. alpha_beta) THEN
    1823             :             CALL local_gemm('N', 'T', send_B_size, dimen_RI, my_B_size(ispin), 1.0_dp, &
    1824             :                             t_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
    1825             :                             my_local_j_aL, dimen_RI, 0.0_dp, send_ab, send_B_size, &
    1826         102 :                             mp2_env%local_gemm_ctx)
    1827         102 :             CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
    1828             : 
    1829      217544 :             Y_i_aP(:, :) = Y_i_aP + external_ab
    1830             : 
    1831             :          ELSE ! Alpha-beta case
    1832             :             ! Alpha-alpha part
    1833             :             CALL local_gemm('N', 'T', send_B_size, dimen_RI, my_B_size(jspin), mp2_env%scale_S, &
    1834             :                             local_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
    1835             :                             my_local_j_aL, dimen_RI, 0.0_dp, send_ab, send_B_size, &
    1836          70 :                             mp2_env%local_gemm_ctx)
    1837          70 :             CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
    1838      161840 :             Y_i_aP(:, :) = Y_i_aP + external_ab
    1839             :          END IF
    1840             :       END DO
    1841             : 
    1842        1569 :       IF (alpha_beta) THEN
    1843             :          ! For beta-beta part (in alpha-beta case) we need a new parallel code
    1844         490 :          IF (para_env_sub%num_pe > 1) THEN
    1845          70 :             external_ab(1:my_B_size(jspin), 1:dimen_RI) => buffer_1D(1:INT(my_B_size(jspin), int_8)*dimen_RI)
    1846       88620 :             external_ab = 0.0_dp
    1847             : 
    1848          70 :             offset = INT(my_B_size(jspin), int_8)*dimen_RI
    1849             :          END IF
    1850         560 :          DO proc_shift = 1, para_env_sub%num_pe - 1
    1851          70 :             proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    1852          70 :             proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    1853             : 
    1854          70 :             CALL get_group_dist(gd_B_virtual(jspin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    1855          70 :             send_ab(1:send_B_size, 1:dimen_RI) => buffer_1D(offset + 1:offset + INT(dimen_RI, int_8)*send_B_size)
    1856       88620 :             send_ab = 0.0_dp
    1857             :             CALL local_gemm('N', 'T', send_B_size, dimen_RI, my_B_size(ispin), mp2_env%scale_S, &
    1858             :                             local_ba(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
    1859             :                             my_local_i_aL, dimen_RI, 0.0_dp, send_ab, send_B_size, &
    1860          70 :                             mp2_env%local_gemm_ctx)
    1861          70 :             CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
    1862      177800 :             Y_j_aP(:, :) = Y_j_aP + external_ab
    1863             : 
    1864             :          END DO
    1865             : 
    1866             :          ! Here, we just use approximate bounds. For large systems virtual(ispin) is approx virtual(jspin), same for B_size
    1867         490 :          CALL dgemm_counter_stop(dgemm_counter, 3*virtual(ispin), dimen_RI, my_B_size(jspin))
    1868             :       ELSE
    1869        1079 :          CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), dimen_RI, my_B_size(ispin))
    1870             :       END IF
    1871             : 
    1872        1569 :       IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
    1873             :          ! Alpha-alpha, beta-beta and closed shell
    1874         796 :          CALL dgemm_counter_start(dgemm_counter)
    1875             :          CALL local_gemm('T', 'T', my_B_size(ispin), dimen_RI, my_B_size(ispin), 1.0_dp, &
    1876             :                          t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1877             :                          my_local_i_aL, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(ispin), &
    1878         796 :                          mp2_env%local_gemm_ctx)
    1879         882 :          DO proc_shift = 1, para_env_sub%num_pe - 1
    1880          86 :             proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    1881          86 :             proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    1882             : 
    1883          86 :             CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    1884             : 
    1885          86 :             external_ab(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
    1886       86837 :             external_ab = 0.0_dp
    1887             : 
    1888             :             CALL para_env_sub%sendrecv(my_local_i_aL, proc_send, &
    1889          86 :                                        external_ab, proc_receive)
    1890             : 
    1891             :             ! Alpha-alpha, beta-beta and closed shell
    1892             :             CALL local_gemm('T', 'T', my_B_size(ispin), dimen_RI, rec_B_size, 1.0_dp, &
    1893             :                             t_ab(rec_B_virtual_start:rec_B_virtual_end, :), rec_B_size, &
    1894         968 :                             external_ab, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(ispin), mp2_env%local_gemm_ctx)
    1895             :          END DO
    1896             : 
    1897         796 :          CALL dgemm_counter_stop(dgemm_counter, my_B_size(ispin), dimen_RI, virtual(ispin))
    1898             :       END IF
    1899             : 
    1900        1569 :       CALL timestop(handle)
    1901        1569 :    END SUBROUTINE mp2_update_P_gamma
    1902             : 
    1903             : ! **************************************************************************************************
    1904             : !> \brief ...
    1905             : !> \param Gamma_P_ia ...
    1906             : !> \param ij_index ...
    1907             : !> \param my_B_size ...
    1908             : !> \param my_block_size ...
    1909             : !> \param my_group_L_size ...
    1910             : !> \param my_i ...
    1911             : !> \param my_ij_pairs ...
    1912             : !> \param ngroup ...
    1913             : !> \param num_integ_group ...
    1914             : !> \param integ_group_pos2color_sub ...
    1915             : !> \param num_ij_pairs ...
    1916             : !> \param ij_map ...
    1917             : !> \param ranges_info_array ...
    1918             : !> \param Y_i_aP ...
    1919             : !> \param comm_exchange ...
    1920             : !> \param sizes_array ...
    1921             : !> \param spin ...
    1922             : !> \param buffer_1D ...
    1923             : ! **************************************************************************************************
    1924        3132 :    SUBROUTINE mp2_redistribute_gamma(Gamma_P_ia, ij_index, my_B_size, &
    1925             :                                      my_block_size, my_group_L_size, my_i, my_ij_pairs, ngroup, &
    1926             :                                      num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
    1927        3132 :                                      ij_map, ranges_info_array, Y_i_aP, comm_exchange, &
    1928        3132 :                                      sizes_array, spin, buffer_1D)
    1929             : 
    1930             :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT)   :: Gamma_P_ia
    1931             :       INTEGER, INTENT(IN)                                :: ij_index, my_B_size, my_block_size, &
    1932             :                                                             my_group_L_size, my_i, my_ij_pairs, &
    1933             :                                                             ngroup, num_integ_group
    1934             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN)     :: integ_group_pos2color_sub, num_ij_pairs
    1935             :       INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(IN)  :: ij_map
    1936             :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
    1937             :          INTENT(IN)                                      :: ranges_info_array
    1938             :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN)      :: Y_i_aP
    1939             :       TYPE(mp_comm_type), INTENT(IN)                     :: comm_exchange
    1940             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN)     :: sizes_array
    1941             :       INTEGER, INTENT(IN)                                :: spin
    1942             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET    :: buffer_1D
    1943             : 
    1944             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_redistribute_gamma'
    1945             : 
    1946             :       INTEGER :: end_point, handle, handle2, iiB, ij_counter_rec, irep, kkk, lll, Lstart_pos, &
    1947             :          proc_receive, proc_send, proc_shift, rec_i, rec_ij_index, send_L_size, start_point, tag
    1948             :       INTEGER(KIND=int_8)                                :: offset
    1949             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :, :), &
    1950        3132 :          POINTER                                         :: BI_C_rec, BI_C_send
    1951             : 
    1952             : ! In alpha-beta case Y_i_aP_beta is sent as Y_j_aP
    1953             : 
    1954        3132 :       CALL timeset(routineN//"_comm2", handle)
    1955             : 
    1956        3132 :       tag = 43
    1957             : 
    1958        3132 :       IF (ij_index <= my_ij_pairs) THEN
    1959             :          ! somethig to send
    1960             :          ! start with myself
    1961        3114 :          CALL timeset(routineN//"_comm2_w", handle2)
    1962        8924 :          DO irep = 0, num_integ_group - 1
    1963        5810 :             Lstart_pos = ranges_info_array(1, irep, comm_exchange%mepos)
    1964        5810 :             start_point = ranges_info_array(3, irep, comm_exchange%mepos)
    1965        5810 :             end_point = ranges_info_array(4, irep, comm_exchange%mepos)
    1966             : !$OMP       PARALLEL DO DEFAULT(NONE) PRIVATE(kkk,lll,iiB) &
    1967             : !$OMP                                 SHARED(start_point,end_point,Lstart_pos,my_block_size,&
    1968        8924 : !$OMP                                        Gamma_P_ia,my_i,my_B_size,Y_i_aP)
    1969             :             DO kkk = start_point, end_point
    1970             :                lll = kkk - start_point + Lstart_pos
    1971             :                DO iiB = 1, my_block_size
    1972             :                   Gamma_P_ia(1:my_B_size, my_i + iiB - 1, kkk) = &
    1973             :                      Gamma_P_ia(1:my_B_size, my_i + iiB - 1, kkk) + &
    1974             :                      Y_i_aP(1:my_B_size, lll, iiB)
    1975             :                END DO
    1976             :             END DO
    1977             : !$OMP              END PARALLEL DO
    1978             :          END DO
    1979        3114 :          CALL timestop(handle2)
    1980             : 
    1981             :          ! Y_i_aP(my_B_size,dimen_RI,block_size)
    1982             : 
    1983        3212 :          DO proc_shift = 1, comm_exchange%num_pe - 1
    1984          98 :             proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
    1985          98 :             proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
    1986             : 
    1987          98 :             send_L_size = sizes_array(proc_send)
    1988             :             BI_C_send(1:my_B_size, 1:my_block_size, 1:send_L_size) => &
    1989          98 :                buffer_1D(1:INT(my_B_size, int_8)*my_block_size*send_L_size)
    1990             : 
    1991          98 :             offset = INT(my_B_size, int_8)*my_block_size*send_L_size
    1992             : 
    1993          98 :             CALL timeset(routineN//"_comm2_w", handle2)
    1994       48692 :             BI_C_send = 0.0_dp
    1995         196 :             DO irep = 0, num_integ_group - 1
    1996          98 :                Lstart_pos = ranges_info_array(1, irep, proc_send)
    1997          98 :                start_point = ranges_info_array(3, irep, proc_send)
    1998          98 :                end_point = ranges_info_array(4, irep, proc_send)
    1999             : !$OMP             PARALLEL DO DEFAULT(NONE) PRIVATE(kkk,lll,iiB) &
    2000             : !$OMP                                       SHARED(start_point,end_point,Lstart_pos,my_block_size,&
    2001         196 : !$OMP                                              BI_C_send,my_B_size,Y_i_aP)
    2002             :                DO kkk = start_point, end_point
    2003             :                   lll = kkk - start_point + Lstart_pos
    2004             :                   DO iiB = 1, my_block_size
    2005             :                      BI_C_send(1:my_B_size, iiB, kkk) = Y_i_aP(1:my_B_size, lll, iiB)
    2006             :                   END DO
    2007             :                END DO
    2008             : !$OMP                END PARALLEL DO
    2009             :             END DO
    2010          98 :             CALL timestop(handle2)
    2011             : 
    2012          98 :             rec_ij_index = num_ij_pairs(proc_receive)
    2013             : 
    2014        3310 :             IF (ij_index <= rec_ij_index) THEN
    2015             :                ! we know that proc_receive has something to send for us, let's see what
    2016             :                ij_counter_rec = &
    2017          80 :                   (ij_index - MIN(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)
    2018             : 
    2019          80 :                rec_i = ij_map(spin, ij_counter_rec)
    2020             : 
    2021             :                BI_C_rec(1:my_B_size, 1:my_block_size, 1:my_group_L_size) => &
    2022          80 :                   buffer_1D(offset + 1:offset + INT(my_B_size, int_8)*my_block_size*my_group_L_size)
    2023       44250 :                BI_C_rec = 0.0_dp
    2024             : 
    2025             :                CALL comm_exchange%sendrecv(BI_C_send, proc_send, &
    2026          80 :                                            BI_C_rec, proc_receive, tag)
    2027             : 
    2028          80 :                CALL timeset(routineN//"_comm2_w", handle2)
    2029         160 :                DO irep = 0, num_integ_group - 1
    2030          80 :                   start_point = ranges_info_array(3, irep, comm_exchange%mepos)
    2031          80 :                   end_point = ranges_info_array(4, irep, comm_exchange%mepos)
    2032             : !$OMP             PARALLEL WORKSHARE DEFAULT(NONE) SHARED(start_point,end_point,my_block_size,&
    2033         160 : !$OMP                                           Gamma_P_ia,rec_i,iiB,my_B_size,BI_C_rec)
    2034             :                   Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
    2035             :                      Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
    2036             :                      BI_C_rec(1:my_B_size, :, start_point:end_point)
    2037             : !$OMP             END PARALLEL WORKSHARE
    2038             :                END DO
    2039          80 :                CALL timestop(handle2)
    2040             : 
    2041             :             ELSE
    2042             :                ! we have something to send but nothing to receive
    2043          18 :                CALL comm_exchange%send(BI_C_send, proc_send, tag)
    2044             : 
    2045             :             END IF
    2046             : 
    2047             :          END DO
    2048             : 
    2049             :       ELSE
    2050             :          ! noting to send check if we have to receive
    2051          36 :          DO proc_shift = 1, comm_exchange%num_pe - 1
    2052          18 :             proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
    2053          18 :             proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
    2054          18 :             rec_ij_index = num_ij_pairs(proc_receive)
    2055             : 
    2056          36 :             IF (ij_index <= rec_ij_index) THEN
    2057             :                ! we know that proc_receive has something to send for us, let's see what
    2058             :                ij_counter_rec = &
    2059          18 :                   (ij_index - MIN(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)
    2060             : 
    2061          18 :                rec_i = ij_map(spin, ij_counter_rec)
    2062             : 
    2063             :                BI_C_rec(1:my_B_size, 1:my_block_size, 1:my_group_L_size) => &
    2064          18 :                   buffer_1D(1:INT(my_B_size, int_8)*my_block_size*my_group_L_size)
    2065             : 
    2066        4442 :                BI_C_rec = 0.0_dp
    2067             : 
    2068          18 :                CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
    2069             : 
    2070          18 :                CALL timeset(routineN//"_comm2_w", handle2)
    2071          36 :                DO irep = 0, num_integ_group - 1
    2072          18 :                   start_point = ranges_info_array(3, irep, comm_exchange%mepos)
    2073          18 :                   end_point = ranges_info_array(4, irep, comm_exchange%mepos)
    2074             : !$OMP             PARALLEL WORKSHARE DEFAULT(NONE) SHARED(start_point,end_point,my_block_size,&
    2075          36 : !$OMP                                           Gamma_P_ia,rec_i,iiB,my_B_size,BI_C_rec)
    2076             :                   Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
    2077             :                      Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
    2078             :                      BI_C_rec(1:my_B_size, :, start_point:end_point)
    2079             : !$OMP             END PARALLEL WORKSHARE
    2080             :                END DO
    2081          18 :                CALL timestop(handle2)
    2082             : 
    2083             :             END IF
    2084             :          END DO
    2085             : 
    2086             :       END IF
    2087        3132 :       CALL timestop(handle)
    2088             : 
    2089        3132 :    END SUBROUTINE mp2_redistribute_gamma
    2090             : 
    2091             : ! **************************************************************************************************
    2092             : !> \brief ...
    2093             : !> \param mp2_env ...
    2094             : !> \param Eigenval ...
    2095             : !> \param homo ...
    2096             : !> \param virtual ...
    2097             : !> \param open_shell ...
    2098             : !> \param beta_beta ...
    2099             : !> \param Bib_C ...
    2100             : !> \param unit_nr ...
    2101             : !> \param dimen_RI ...
    2102             : !> \param my_B_size ...
    2103             : !> \param ngroup ...
    2104             : !> \param my_group_L_size ...
    2105             : !> \param color_sub ...
    2106             : !> \param ranges_info_array ...
    2107             : !> \param comm_exchange ...
    2108             : !> \param para_env_sub ...
    2109             : !> \param para_env ...
    2110             : !> \param my_B_virtual_start ...
    2111             : !> \param my_B_virtual_end ...
    2112             : !> \param sizes_array ...
    2113             : !> \param gd_B_virtual ...
    2114             : !> \param integ_group_pos2color_sub ...
    2115             : !> \param dgemm_counter ...
    2116             : !> \param buffer_1D ...
    2117             : ! **************************************************************************************************
    2118         372 :    SUBROUTINE quasi_degenerate_P_ij(mp2_env, Eigenval, homo, virtual, open_shell, &
    2119         372 :                                     beta_beta, Bib_C, unit_nr, dimen_RI, &
    2120         372 :                                     my_B_size, ngroup, my_group_L_size, &
    2121             :                                     color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
    2122         372 :                                     my_B_virtual_start, my_B_virtual_end, sizes_array, gd_B_virtual, &
    2123         372 :                                     integ_group_pos2color_sub, dgemm_counter, buffer_1D)
    2124             :       TYPE(mp2_type)                                     :: mp2_env
    2125             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
    2126             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
    2127             :       LOGICAL, INTENT(IN)                                :: open_shell, beta_beta
    2128             :       TYPE(three_dim_real_array), DIMENSION(:), &
    2129             :          INTENT(IN)                                      :: BIb_C
    2130             :       INTEGER, INTENT(IN)                                :: unit_nr, dimen_RI
    2131             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: my_B_size
    2132             :       INTEGER, INTENT(IN)                                :: ngroup, my_group_L_size, color_sub
    2133             :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
    2134             :          INTENT(IN)                                      :: ranges_info_array
    2135             :       TYPE(mp_comm_type), INTENT(IN)                     :: comm_exchange
    2136             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env_sub, para_env
    2137             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: my_B_virtual_start, my_B_virtual_end
    2138             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN)     :: sizes_array
    2139             :       TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
    2140             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN)     :: integ_group_pos2color_sub
    2141             :       TYPE(dgemm_counter_type), INTENT(INOUT)            :: dgemm_counter
    2142             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET    :: buffer_1D
    2143             : 
    2144             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'quasi_degenerate_P_ij'
    2145             : 
    2146             :       INTEGER :: a, a_global, b, b_global, block_size, decil, handle, handle2, ijk_counter, &
    2147             :          ijk_counter_send, ijk_index, ispin, kkB, kspin, max_block_size, max_ijk, my_i, my_ijk, &
    2148             :          my_j, my_k, my_last_k(2), my_virtual, nspins, proc_receive, proc_send, proc_shift, &
    2149             :          rec_B_size, rec_B_virtual_end, rec_B_virtual_start, rec_L_size, send_B_size, &
    2150             :          send_B_virtual_end, send_B_virtual_start, send_i, send_ijk_index, send_j, send_k, &
    2151             :          size_B_i, size_B_k, tag, tag2
    2152         372 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: num_ijk
    2153         372 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: ijk_map, send_last_k
    2154             :       LOGICAL                                            :: alpha_beta, do_recv_i, do_recv_j, &
    2155             :                                                             do_recv_k, do_send_i, do_send_j, &
    2156             :                                                             do_send_k
    2157             :       REAL(KIND=dp)                                      :: amp_fac, P_ij_elem, t_new, t_start
    2158             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
    2159         372 :          TARGET                                          :: local_ab, local_aL_i, local_aL_j, t_ab
    2160         372 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: local_aL_k
    2161         372 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: BI_C_rec, external_ab, external_aL
    2162         372 :       REAL(KIND=dp), DIMENSION(:, :, :), POINTER         :: BI_C_rec_3D
    2163             : 
    2164         372 :       CALL timeset(routineN//"_ij_sing", handle)
    2165             : 
    2166         372 :       tag = 44
    2167         372 :       tag2 = 45
    2168             : 
    2169         372 :       nspins = SIZE(BIb_C)
    2170         372 :       alpha_beta = (nspins == 2)
    2171             : 
    2172             :       ! Set amplitude factor
    2173         372 :       amp_fac = mp2_env%scale_S + mp2_env%scale_T
    2174         372 :       IF (open_shell) amp_fac = mp2_env%scale_T
    2175             : 
    2176         770 :       ALLOCATE (send_last_k(2, comm_exchange%num_pe - 1))
    2177             : 
    2178             :       ! Loop(s) over orbital triplets
    2179         820 :       DO ispin = 1, nspins
    2180         448 :          size_B_i = my_B_size(ispin)
    2181         448 :          IF (ispin == 1 .AND. alpha_beta) THEN
    2182             :             kspin = 2
    2183             :          ELSE
    2184         372 :             kspin = 1
    2185             :          END IF
    2186         448 :          size_B_k = my_B_size(kspin)
    2187             : 
    2188             :          ! Find the number of quasi-degenerate orbitals and orbital triplets
    2189             : 
    2190             :          CALL Find_quasi_degenerate_ij(my_ijk, homo(ispin), homo(kspin), Eigenval(:, ispin), mp2_env, ijk_map, unit_nr, ngroup, &
    2191             :                                        .NOT. beta_beta .AND. ispin /= 2, comm_exchange, num_ijk, max_ijk, color_sub, &
    2192         600 :                                        SIZE(buffer_1D), my_group_L_size, size_B_k, para_env, virtual(ispin), size_B_i)
    2193             : 
    2194         448 :          my_virtual = virtual(ispin)
    2195         448 :          IF (SIZE(ijk_map, 2) > 0) THEN
    2196          90 :             max_block_size = ijk_map(4, 1)
    2197             :          ELSE
    2198             :             max_block_size = 1
    2199             :          END IF
    2200             : 
    2201        1792 :          ALLOCATE (local_aL_i(dimen_RI, size_B_i))
    2202        1792 :          ALLOCATE (local_aL_j(dimen_RI, size_B_i))
    2203        2240 :          ALLOCATE (local_aL_k(dimen_RI, size_B_k, max_block_size))
    2204        1792 :          ALLOCATE (t_ab(my_virtual, size_B_k))
    2205             : 
    2206        1344 :          my_last_k = -1
    2207         538 :          send_last_k = -1
    2208             : 
    2209         448 :          t_start = m_walltime()
    2210         594 :          DO ijk_index = 1, max_ijk
    2211             : 
    2212             :             ! Prediction is unreliable if we are in the first step of the loop
    2213         146 :             IF (unit_nr > 0 .AND. ijk_index > 1) THEN
    2214          18 :                decil = ijk_index*10/max_ijk
    2215          18 :                IF (decil /= (ijk_index - 1)*10/max_ijk) THEN
    2216          18 :                   t_new = m_walltime()
    2217          18 :                   t_new = (t_new - t_start)/60.0_dp*(max_ijk - ijk_index + 1)/(ijk_index - 1)
    2218             :                   WRITE (unit_nr, FMT="(T3,A)") "Percentage of finished loop: "// &
    2219          18 :                      cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
    2220          18 :                   CALL m_flush(unit_nr)
    2221             :                END IF
    2222             :             END IF
    2223             : 
    2224         594 :             IF (ijk_index <= my_ijk) THEN
    2225             :                ! work to be done
    2226         144 :                ijk_counter = (ijk_index - MIN(1, color_sub))*ngroup + color_sub
    2227         144 :                my_i = ijk_map(1, ijk_counter)
    2228         144 :                my_j = ijk_map(2, ijk_counter)
    2229         144 :                my_k = ijk_map(3, ijk_counter)
    2230         144 :                block_size = ijk_map(4, ijk_counter)
    2231             : 
    2232         144 :                do_recv_i = (ispin /= kspin) .OR. my_i < my_k .OR. my_i > my_k + block_size - 1
    2233         144 :                do_recv_j = (ispin /= kspin) .OR. my_j < my_k .OR. my_j > my_k + block_size - 1
    2234         144 :                do_recv_k = my_k /= my_last_k(1) .OR. my_k + block_size - 1 /= my_last_k(2)
    2235         144 :                my_last_k(1) = my_k
    2236         144 :                my_last_k(2) = my_k + block_size - 1
    2237             : 
    2238      127614 :                local_aL_i = 0.0_dp
    2239         144 :                IF (do_recv_i) THEN
    2240             :                   CALL fill_local_i_aL_2D(local_al_i, ranges_info_array(:, :, comm_exchange%mepos), &
    2241         117 :                                           BIb_C(ispin)%array(:, :, my_i))
    2242             :                END IF
    2243             : 
    2244      127614 :                local_aL_j = 0.0_dp
    2245         144 :                IF (do_recv_j) THEN
    2246             :                   CALL fill_local_i_aL_2D(local_al_j, ranges_info_array(:, :, comm_exchange%mepos), &
    2247         117 :                                           BIb_C(ispin)%array(:, :, my_j))
    2248             :                END IF
    2249             : 
    2250         144 :                IF (do_recv_k) THEN
    2251      219805 :                   local_aL_k = 0.0_dp
    2252             :                   CALL fill_local_i_aL(local_aL_k(:, :, 1:block_size), ranges_info_array(:, :, comm_exchange%mepos), &
    2253         136 :                                        BIb_C(kspin)%array(:, :, my_k:my_k + block_size - 1))
    2254             :                END IF
    2255             : 
    2256         144 :                CALL timeset(routineN//"_comm", handle2)
    2257         154 :                DO proc_shift = 1, comm_exchange%num_pe - 1
    2258          10 :                   proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
    2259          10 :                   proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
    2260             : 
    2261          10 :                   send_ijk_index = num_ijk(proc_send)
    2262             : 
    2263          10 :                   rec_L_size = sizes_array(proc_receive)
    2264          10 :                   BI_C_rec(1:rec_L_size, 1:size_B_i) => buffer_1D(1:INT(rec_L_size, KIND=int_8)*size_B_i)
    2265             : 
    2266          10 :                   do_send_i = .FALSE.
    2267          10 :                   do_send_j = .FALSE.
    2268          10 :                   do_send_k = .FALSE.
    2269          10 :                   IF (ijk_index <= send_ijk_index) THEN
    2270             :                      ! something to send
    2271             :                      ijk_counter_send = (ijk_index - MIN(1, integ_group_pos2color_sub(proc_send)))* &
    2272           8 :                                         ngroup + integ_group_pos2color_sub(proc_send)
    2273           8 :                      send_i = ijk_map(1, ijk_counter_send)
    2274           8 :                      send_j = ijk_map(2, ijk_counter_send)
    2275           8 :                      send_k = ijk_map(3, ijk_counter_send)
    2276             : 
    2277           8 :                      do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
    2278           8 :                      do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
    2279           8 :                      do_send_k = send_k /= send_last_k(1, proc_shift) .OR. send_k + block_size - 1 /= send_last_k(2, proc_shift)
    2280           8 :                      send_last_k(1, proc_shift) = send_k
    2281           8 :                      send_last_k(2, proc_shift) = send_k + block_size - 1
    2282             :                   END IF
    2283             : 
    2284             :                   ! occupied i
    2285         722 :                   BI_C_rec = 0.0_dp
    2286          10 :                   IF (do_send_i) THEN
    2287           6 :                   IF (do_recv_i) THEN
    2288             :                      CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_i), proc_send, &
    2289         288 :                                                  BI_C_rec, proc_receive, tag)
    2290             :                   ELSE
    2291           2 :                      CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i), proc_send, tag)
    2292             :                   END IF
    2293           4 :                   ELSE IF (do_recv_i) THEN
    2294         580 :                   CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
    2295             :                   END IF
    2296          10 :                   IF (do_recv_i) THEN
    2297           8 :                      CALL fill_local_i_aL_2D(local_al_i, ranges_info_array(:, :, proc_receive), BI_C_rec)
    2298             :                   END IF
    2299             : 
    2300             :                   ! occupied j
    2301         722 :                   BI_C_rec = 0.0_dp
    2302          10 :                   IF (do_send_j) THEN
    2303           8 :                   IF (do_recv_j) THEN
    2304             :                      CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_j), proc_send, &
    2305         576 :                                                  BI_C_rec, proc_receive, tag)
    2306             :                   ELSE
    2307           0 :                      CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_j), proc_send, tag)
    2308             :                   END IF
    2309           2 :                   ELSE IF (do_recv_j) THEN
    2310           0 :                   CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
    2311             :                   END IF
    2312          10 :                   IF (do_recv_j) THEN
    2313           8 :                      CALL fill_local_i_aL_2D(local_al_j, ranges_info_array(:, :, proc_receive), BI_C_rec)
    2314             :                   END IF
    2315             : 
    2316             :                   ! occupied k
    2317             :                   BI_C_rec_3D(1:rec_L_size, 1:size_B_k, 1:block_size) => &
    2318          10 :                      buffer_1D(1:INT(rec_L_size, KIND=int_8)*size_B_k*block_size)
    2319          10 :                   IF (do_send_k) THEN
    2320           8 :                   IF (do_recv_k) THEN
    2321             :                      CALL comm_exchange%sendrecv(BIb_C(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, &
    2322         726 :                                                  BI_C_rec_3D, proc_receive, tag)
    2323             :                   ELSE
    2324           0 :                      CALL comm_exchange%send(BI_C_rec, proc_receive, tag)
    2325             :                   END IF
    2326           2 :                   ELSE IF (do_recv_k) THEN
    2327         294 :                   CALL comm_exchange%recv(BI_C_rec_3D, proc_receive, tag)
    2328             :                   END IF
    2329         154 :                   IF (do_recv_k) THEN
    2330          10 :                      CALL fill_local_i_aL(local_al_k(:, :, 1:block_size), ranges_info_array(:, :, proc_receive), BI_C_rec_3D)
    2331             :                   END IF
    2332             :                END DO
    2333             : 
    2334       20172 :                IF (.NOT. do_recv_i) local_aL_i(:, :) = local_aL_k(:, :, my_i - my_k + 1)
    2335       20172 :                IF (.NOT. do_recv_j) local_aL_j(:, :) = local_aL_k(:, :, my_j - my_k + 1)
    2336         144 :                CALL timestop(handle2)
    2337             : 
    2338             :                ! expand integrals
    2339         352 :                DO kkB = 1, block_size
    2340         208 :                   CALL timeset(routineN//"_exp_ik", handle2)
    2341         208 :                   CALL dgemm_counter_start(dgemm_counter)
    2342         832 :                   ALLOCATE (local_ab(my_virtual, size_B_k))
    2343       32408 :                   local_ab = 0.0_dp
    2344             :                   CALL local_gemm('T', 'N', size_B_i, size_B_k, dimen_RI, 1.0_dp, &
    2345             :                                   local_aL_i, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
    2346             :                                   0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), 1:size_B_k), size_B_i, &
    2347         208 :                                   mp2_env%local_gemm_ctx)
    2348         208 :                   DO proc_shift = 1, para_env_sub%num_pe - 1
    2349           0 :                      proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2350           0 :                      proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2351             : 
    2352           0 :                      CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    2353             : 
    2354           0 :                      external_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, KIND=int_8)*rec_B_size)
    2355             : 
    2356             :                      CALL comm_exchange%sendrecv(local_aL_i, proc_send, &
    2357           0 :                                                  external_aL, proc_receive, tag)
    2358             : 
    2359             :                      CALL local_gemm('T', 'N', rec_B_size, size_B_k, dimen_RI, 1.0_dp, &
    2360             :                                      external_aL, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
    2361             :                                      0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:size_B_k), rec_B_size, &
    2362         208 :                                      mp2_env%local_gemm_ctx)
    2363             :                   END DO
    2364         208 :                   CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_B_k, dimen_RI)
    2365         208 :                   CALL timestop(handle2)
    2366             : 
    2367             :                   ! Amplitudes
    2368         208 :                   CALL timeset(routineN//"_tab", handle2)
    2369       32408 :                   t_ab = 0.0_dp
    2370             :                   ! Alpha-alpha, beta-beta and closed shell
    2371         208 :                   IF (.NOT. alpha_beta) THEN
    2372        1125 :                      DO b = 1, size_B_k
    2373        1014 :                         b_global = b + my_B_virtual_start(1) - 1
    2374       17135 :                         DO a = 1, my_B_size(1)
    2375       16010 :                            a_global = a + my_B_virtual_start(1) - 1
    2376             :                            t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
    2377             :                                                (Eigenval(my_i, 1) + Eigenval(my_k + kkB - 1, 1) &
    2378       17024 :                                                 - Eigenval(homo(1) + a_global, 1) - Eigenval(homo(1) + b_global, 1))
    2379             :                         END DO
    2380             :                      END DO
    2381             :                   ELSE
    2382         999 :                      DO b = 1, size_B_k
    2383         902 :                         b_global = b + my_B_virtual_start(kspin) - 1
    2384       15273 :                         DO a = 1, my_B_size(ispin)
    2385       14274 :                            a_global = a + my_B_virtual_start(ispin) - 1
    2386             :                            t_ab(a_global, b) = mp2_env%scale_S*local_ab(a_global, b)/ &
    2387             :                                                (Eigenval(my_i, ispin) + Eigenval(my_k + kkB - 1, kspin) &
    2388       15176 :                                                 - Eigenval(homo(ispin) + a_global, ispin) - Eigenval(homo(kspin) + b_global, kspin))
    2389             :                         END DO
    2390             :                      END DO
    2391             :                   END IF
    2392             : 
    2393         208 :                   IF (.NOT. alpha_beta) THEN
    2394         111 :                      DO proc_shift = 1, para_env_sub%num_pe - 1
    2395           0 :                         proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2396           0 :                         proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2397           0 :                         CALL get_group_dist(gd_B_virtual(1), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    2398           0 :                         CALL get_group_dist(gd_B_virtual(1), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    2399             : 
    2400           0 :                         external_ab(1:size_B_i, 1:rec_B_size) => buffer_1D(1:INT(size_B_i, KIND=int_8)*rec_B_size)
    2401             :                         CALL para_env_sub%sendrecv(local_ab(send_B_virtual_start:send_B_virtual_end, 1:size_B_k), proc_send, &
    2402           0 :                                                    external_ab(1:size_B_i, 1:rec_B_size), proc_receive, tag)
    2403             : 
    2404         111 :                         DO b = 1, my_B_size(1)
    2405           0 :                            b_global = b + my_B_virtual_start(1) - 1
    2406           0 :                            DO a = 1, rec_B_size
    2407           0 :                               a_global = a + rec_B_virtual_start - 1
    2408             :                               t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
    2409             :                                                   (Eigenval(my_i, 1) + Eigenval(my_k + kkB - 1, 1) &
    2410           0 :                                                    - Eigenval(homo(1) + a_global, 1) - Eigenval(homo(1) + b_global, 1))
    2411             :                            END DO
    2412             :                         END DO
    2413             :                      END DO
    2414             :                   END IF
    2415         208 :                   CALL timestop(handle2)
    2416             : 
    2417             :                   ! Expand the second set of integrals
    2418         208 :                   CALL timeset(routineN//"_exp_jk", handle2)
    2419       32408 :                   local_ab = 0.0_dp
    2420         208 :                   CALL dgemm_counter_start(dgemm_counter)
    2421             :                   CALL local_gemm('T', 'N', size_B_i, size_B_k, dimen_RI, 1.0_dp, &
    2422             :                                   local_aL_j, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
    2423             :                                   0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), 1:size_B_k), size_B_i, &
    2424         208 :                                   mp2_env%local_gemm_ctx)
    2425         208 :                   DO proc_shift = 1, para_env_sub%num_pe - 1
    2426           0 :                      proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2427           0 :                      proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2428             : 
    2429           0 :                      CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    2430             : 
    2431           0 :                      external_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, KIND=int_8)*rec_B_size)
    2432             : 
    2433             :                      CALL comm_exchange%sendrecv(local_aL_j, proc_send, &
    2434           0 :                                                  external_aL, proc_receive, tag)
    2435             :                      CALL local_gemm('T', 'N', rec_B_size, size_B_k, dimen_RI, 1.0_dp, &
    2436             :                                      external_aL, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
    2437             :                                      0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:size_B_k), rec_B_size, &
    2438         208 :                                      mp2_env%local_gemm_ctx)
    2439             :                   END DO
    2440         208 :                   CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_B_k, dimen_RI)
    2441         208 :                   CALL timestop(handle2)
    2442             : 
    2443         208 :                   CALL timeset(routineN//"_Pij", handle2)
    2444        2124 :                   DO b = 1, size_B_k
    2445        1916 :                      b_global = b + my_B_virtual_start(kspin) - 1
    2446       32408 :                      DO a = 1, my_B_size(ispin)
    2447       30284 :                         a_global = a + my_B_virtual_start(ispin) - 1
    2448             :                         local_ab(a_global, b) = &
    2449             :                            local_ab(a_global, b)/(Eigenval(my_j, ispin) + Eigenval(my_k + kkB - 1, kspin) &
    2450       32200 :                                                 - Eigenval(homo(ispin) + a_global, ispin) - Eigenval(homo(kspin) + b_global, kspin))
    2451             :                      END DO
    2452             :                   END DO
    2453             :                   !
    2454       32408 :                   P_ij_elem = SUM(local_ab*t_ab)
    2455         208 :                   DEALLOCATE (local_ab)
    2456         208 :                   IF ((.NOT. open_shell) .AND. (.NOT. alpha_beta)) THEN
    2457           4 :                      P_ij_elem = P_ij_elem*2.0_dp
    2458             :                   END IF
    2459         208 :                   IF (beta_beta) THEN
    2460          31 :                      mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) - P_ij_elem
    2461          31 :                      mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) - P_ij_elem
    2462             :                   ELSE
    2463         177 :                      mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) - P_ij_elem
    2464         177 :                      mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) - P_ij_elem
    2465             :                   END IF
    2466        1184 :                   CALL timestop(handle2)
    2467             :                END DO
    2468             :             ELSE
    2469           2 :                CALL timeset(routineN//"_comm", handle2)
    2470             :                ! no work to be done, possible messeges to be exchanged
    2471           4 :                DO proc_shift = 1, comm_exchange%num_pe - 1
    2472           2 :                   proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
    2473           2 :                   proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
    2474             : 
    2475           2 :                   send_ijk_index = num_ijk(proc_send)
    2476             : 
    2477           4 :                   IF (ijk_index <= send_ijk_index) THEN
    2478             :                      ! somethig to send
    2479             :                      ijk_counter_send = (ijk_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
    2480           2 :                                         integ_group_pos2color_sub(proc_send)
    2481           2 :                      send_i = ijk_map(1, ijk_counter_send)
    2482           2 :                      send_j = ijk_map(2, ijk_counter_send)
    2483           2 :                      send_k = ijk_map(3, ijk_counter_send)
    2484           2 :                      block_size = ijk_map(4, ijk_counter_send)
    2485             : 
    2486           2 :                      do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
    2487           2 :                      do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
    2488             :                      ! occupied i
    2489           2 :                      IF (do_send_i) THEN
    2490           2 :                         CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i), proc_send, tag)
    2491             :                      END IF
    2492             :                      ! occupied j
    2493           2 :                      IF (do_send_j) THEN
    2494           0 :                         CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_j), proc_send, tag)
    2495             :                      END IF
    2496             :                      ! occupied k
    2497           2 :                      CALL comm_exchange%send(BIb_C(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, tag)
    2498             :                   END IF
    2499             : 
    2500             :                END DO ! proc loop
    2501           2 :                CALL timestop(handle2)
    2502             :             END IF
    2503             :          END DO ! ijk_index loop
    2504         448 :          DEALLOCATE (local_aL_i)
    2505         448 :          DEALLOCATE (local_aL_j)
    2506         448 :          DEALLOCATE (local_aL_k)
    2507         448 :          DEALLOCATE (t_ab)
    2508         820 :          DEALLOCATE (ijk_map)
    2509             :       END DO ! over number of loops (ispin)
    2510         372 :       CALL timestop(handle)
    2511             : 
    2512         744 :    END SUBROUTINE Quasi_degenerate_P_ij
    2513             : 
    2514             : ! **************************************************************************************************
    2515             : !> \brief ...
    2516             : !> \param my_ijk ...
    2517             : !> \param homo ...
    2518             : !> \param homo_beta ...
    2519             : !> \param Eigenval ...
    2520             : !> \param mp2_env ...
    2521             : !> \param ijk_map ...
    2522             : !> \param unit_nr ...
    2523             : !> \param ngroup ...
    2524             : !> \param do_print_alpha ...
    2525             : !> \param comm_exchange ...
    2526             : !> \param num_ijk ...
    2527             : !> \param max_ijk ...
    2528             : !> \param color_sub ...
    2529             : !> \param buffer_size ...
    2530             : !> \param my_group_L_size ...
    2531             : !> \param B_size_k ...
    2532             : !> \param para_env ...
    2533             : !> \param virtual ...
    2534             : !> \param B_size_i ...
    2535             : ! **************************************************************************************************
    2536         448 :    SUBROUTINE Find_quasi_degenerate_ij(my_ijk, homo, homo_beta, Eigenval, mp2_env, ijk_map, unit_nr, ngroup, &
    2537             :                                        do_print_alpha, comm_exchange, num_ijk, max_ijk, color_sub, &
    2538             :                                        buffer_size, my_group_L_size, B_size_k, para_env, virtual, B_size_i)
    2539             : 
    2540             :       INTEGER, INTENT(OUT)                               :: my_ijk
    2541             :       INTEGER, INTENT(IN)                                :: homo, homo_beta
    2542             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
    2543             :       TYPE(mp2_type), INTENT(IN)                         :: mp2_env
    2544             :       INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ijk_map
    2545             :       INTEGER, INTENT(IN)                                :: unit_nr, ngroup
    2546             :       LOGICAL, INTENT(IN)                                :: do_print_alpha
    2547             :       TYPE(mp_comm_type), INTENT(IN)                     :: comm_exchange
    2548             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT)    :: num_ijk
    2549             :       INTEGER, INTENT(OUT)                               :: max_ijk
    2550             :       INTEGER, INTENT(IN)                                :: color_sub, buffer_size, my_group_L_size, &
    2551             :                                                             B_size_k
    2552             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
    2553             :       INTEGER, INTENT(IN)                                :: virtual, B_size_i
    2554             : 
    2555             :       INTEGER :: block_size, communication_steps, communication_volume, iib, ij_counter, &
    2556             :          ijk_counter, jjb, kkb, max_block_size, max_num_k_blocks, min_communication_volume, &
    2557             :          my_steps, num_k_blocks, num_sing_ij, total_ijk
    2558             :       INTEGER(KIND=int_8)                                :: mem
    2559         448 :       LOGICAL, ALLOCATABLE, DIMENSION(:, :)              :: ijk_marker
    2560             : 
    2561        1344 :       ALLOCATE (num_ijk(0:comm_exchange%num_pe - 1))
    2562             : 
    2563         448 :       num_sing_ij = 0
    2564        2026 :       DO iiB = 1, homo
    2565             :          ! diagonal elements already updated
    2566        4230 :          DO jjB = iiB + 1, homo
    2567        2204 :             IF (ABS(Eigenval(jjB) - Eigenval(iiB)) < mp2_env%ri_grad%eps_canonical) &
    2568        1684 :                num_sing_ij = num_sing_ij + 1
    2569             :          END DO
    2570             :       END DO
    2571             : 
    2572         448 :       IF (unit_nr > 0) THEN
    2573         224 :       IF (do_print_alpha) THEN
    2574             :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    2575         148 :             "MO_INFO| Number of ij pairs below EPS_CANONICAL:", num_sing_ij
    2576             :       ELSE
    2577             :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    2578          76 :             "MO_INFO| Number of ij pairs (spin beta) below EPS_CANONICAL:", num_sing_ij
    2579             :       END IF
    2580             :       END IF
    2581             : 
    2582             :       ! Determine the block size, first guess: use available buffer
    2583         448 :       max_block_size = buffer_size/(my_group_L_size*B_size_k)
    2584             : 
    2585             :       ! Second limit: memory
    2586         448 :       CALL m_memory(mem)
    2587             :       ! Convert to number of doubles
    2588         448 :       mem = mem/8
    2589             :       ! Remove local_ab (2x) and local_aL_i (2x)
    2590         448 :       mem = mem - 2_int_8*(virtual*B_size_k + B_size_i*my_group_L_size)
    2591         448 :       max_block_size = MIN(max_block_size, MAX(1, INT(mem/(my_group_L_size*B_size_k), KIND(max_block_size))))
    2592             : 
    2593             :       ! Exchange the limit
    2594         448 :       CALL para_env%min(max_block_size)
    2595             : 
    2596             :       ! Find now the block size which minimizes the communication volume and then the number of communication steps
    2597         448 :       block_size = 1
    2598         448 :       min_communication_volume = 3*homo_beta*num_sing_ij
    2599         448 :       communication_steps = 3*homo_beta*num_sing_ij
    2600        1136 :       DO iiB = max_block_size, 2, -1
    2601         688 :          max_num_k_blocks = homo_beta/iiB*num_sing_ij
    2602         688 :          num_k_blocks = max_num_k_blocks - MOD(max_num_k_blocks, ngroup)
    2603         688 :          communication_volume = num_k_blocks*(2 + iiB) + 3*(homo_beta*num_sing_ij - iiB*num_k_blocks)
    2604         688 :          my_steps = num_k_blocks + homo_beta*num_sing_ij - iiB*num_k_blocks
    2605        1136 :          IF (communication_volume < min_communication_volume) THEN
    2606          48 :             block_size = iiB
    2607          48 :             min_communication_volume = communication_volume
    2608          48 :             communication_steps = my_steps
    2609         640 :          ELSE IF (communication_volume == min_communication_volume .AND. my_steps < communication_steps) THEN
    2610          52 :             block_size = iiB
    2611          52 :             communication_steps = my_steps
    2612             :          END IF
    2613             :       END DO
    2614             : 
    2615         448 :       IF (unit_nr > 0) THEN
    2616             :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    2617         224 :             "MO_INFO| Block size:", block_size
    2618         224 :          CALL m_flush(unit_nr)
    2619             :       END IF
    2620             : 
    2621             :       ! Calculate number of large blocks
    2622         448 :       max_num_k_blocks = homo_beta/block_size*num_sing_ij
    2623         448 :       num_k_blocks = max_num_k_blocks - MOD(max_num_k_blocks, ngroup)
    2624             : 
    2625         448 :       total_ijk = num_k_blocks + homo_beta*num_sing_ij - num_k_blocks*block_size
    2626         986 :       ALLOCATE (ijk_map(4, total_ijk))
    2627        1888 :       ijk_map = 0
    2628        1434 :       ALLOCATE (ijk_marker(homo_beta, num_sing_ij))
    2629         970 :       ijk_marker = .TRUE.
    2630             : 
    2631         448 :       my_ijk = 0
    2632         448 :       ijk_counter = 0
    2633         448 :       ij_counter = 0
    2634        2026 :       DO iiB = 1, homo
    2635             :          ! diagonal elements already updated
    2636        4230 :          DO jjB = iiB + 1, homo
    2637        2204 :             IF (ABS(Eigenval(jjB) - Eigenval(iiB)) >= mp2_env%ri_grad%eps_canonical) CYCLE
    2638         106 :             ij_counter = ij_counter + 1
    2639        1812 :             DO kkB = 1, homo_beta - MOD(homo_beta, block_size), block_size
    2640         162 :                IF (ijk_counter + 1 > num_k_blocks) EXIT
    2641         128 :                ijk_counter = ijk_counter + 1
    2642         384 :                ijk_marker(kkB:kkB + block_size - 1, ij_counter) = .FALSE.
    2643         128 :                ijk_map(1, ijk_counter) = iiB
    2644         128 :                ijk_map(2, ijk_counter) = jjB
    2645         128 :                ijk_map(3, ijk_counter) = kkB
    2646         128 :                ijk_map(4, ijk_counter) = block_size
    2647        2332 :                IF (MOD(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
    2648             :             END DO
    2649             :          END DO
    2650             :       END DO
    2651             :       ij_counter = 0
    2652        2026 :       DO iiB = 1, homo
    2653             :          ! diagonal elements already updated
    2654        4230 :          DO jjB = iiB + 1, homo
    2655        2204 :             IF (ABS(Eigenval(jjB) - Eigenval(iiB)) >= mp2_env%ri_grad%eps_canonical) CYCLE
    2656         106 :             ij_counter = ij_counter + 1
    2657        2100 :             DO kkB = 1, homo_beta
    2658        2620 :                IF (ijk_marker(kkB, ij_counter)) THEN
    2659         160 :                   ijk_counter = ijk_counter + 1
    2660         160 :                   ijk_map(1, ijk_counter) = iiB
    2661         160 :                   ijk_map(2, ijk_counter) = jjB
    2662         160 :                   ijk_map(3, ijk_counter) = kkB
    2663         160 :                   ijk_map(4, ijk_counter) = 1
    2664         160 :                   IF (MOD(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
    2665             :                END IF
    2666             :             END DO
    2667             :          END DO
    2668             :       END DO
    2669             : 
    2670         448 :       DEALLOCATE (ijk_marker)
    2671             : 
    2672         448 :       CALL comm_exchange%allgather(my_ijk, num_ijk)
    2673         926 :       max_ijk = MAXVAL(num_ijk)
    2674             : 
    2675         448 :    END SUBROUTINE Find_quasi_degenerate_ij
    2676             : 
    2677             : END MODULE mp2_ri_gpw

Generated by: LCOV version 1.15