LCOV - code coverage report
Current view: top level - src - mp2_ri_gpw.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:42dac4a) Lines: 97.7 % 1078 1053
Test Date: 2025-07-25 12:55:17 Functions: 100.0 % 14 14

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief Routines to calculate RI-GPW-MP2 energy using pw
      10              : !> \par History
      11              : !>      06.2012 created [Mauro Del Ben]
      12              : !>      03.2019 Refactored from mp2_ri_gpw [Frederick Stein]
      13              : ! **************************************************************************************************
      14              : MODULE mp2_ri_gpw
      15              :    USE cp_log_handling,                 ONLY: cp_to_string
      16              :    USE dgemm_counter_types,             ONLY: dgemm_counter_init,&
      17              :                                               dgemm_counter_start,&
      18              :                                               dgemm_counter_stop,&
      19              :                                               dgemm_counter_type,&
      20              :                                               dgemm_counter_write
      21              :    USE group_dist_types,                ONLY: get_group_dist,&
      22              :                                               group_dist_d1_type,&
      23              :                                               maxsize,&
      24              :                                               release_group_dist
      25              :    USE kinds,                           ONLY: dp,&
      26              :                                               int_8
      27              :    USE libint_2c_3c,                    ONLY: compare_potential_types
      28              :    USE local_gemm_api,                  ONLY: LOCAL_GEMM_PU_GPU
      29              :    USE machine,                         ONLY: m_flush,&
      30              :                                               m_memory,&
      31              :                                               m_walltime
      32              :    USE message_passing,                 ONLY: mp_comm_type,&
      33              :                                               mp_para_env_type
      34              :    USE mp2_ri_grad_util,                ONLY: complete_gamma
      35              :    USE mp2_types,                       ONLY: mp2_type,&
      36              :                                               three_dim_real_array
      37              : 
      38              : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num
      39              : #include "./base/base_uses.f90"
      40              : 
      41              :    IMPLICIT NONE
      42              : 
      43              :    PRIVATE
      44              : 
      45              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'mp2_ri_gpw'
      46              : 
      47              :    PUBLIC :: mp2_ri_gpw_compute_en
      48              : 
      49              : CONTAINS
      50              : 
      51              : ! **************************************************************************************************
      52              : !> \brief ...
      53              : !> \param Emp2_Cou ...
      54              : !> \param Emp2_EX ...
      55              : !> \param Emp2_S ...
      56              : !> \param Emp2_T ...
      57              : !> \param BIb_C ...
      58              : !> \param mp2_env ...
      59              : !> \param para_env ...
      60              : !> \param para_env_sub ...
      61              : !> \param color_sub ...
      62              : !> \param gd_array ...
      63              : !> \param gd_B_virtual ...
      64              : !> \param Eigenval ...
      65              : !> \param nmo ...
      66              : !> \param homo ...
      67              : !> \param dimen_RI ...
      68              : !> \param unit_nr ...
      69              : !> \param calc_forces ...
      70              : !> \param calc_ex ...
      71              : ! **************************************************************************************************
      72         1062 :    SUBROUTINE mp2_ri_gpw_compute_en(Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T, BIb_C, mp2_env, para_env, para_env_sub, color_sub, &
      73          354 :                                     gd_array, gd_B_virtual, &
      74          354 :                                     Eigenval, nmo, homo, dimen_RI, unit_nr, calc_forces, calc_ex)
      75              :       REAL(KIND=dp), INTENT(INOUT)                       :: Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T
      76              :       TYPE(three_dim_real_array), DIMENSION(:), &
      77              :          INTENT(INOUT)                                   :: BIb_C
      78              :       TYPE(mp2_type)                                     :: mp2_env
      79              :       TYPE(mp_para_env_type), INTENT(IN), POINTER        :: para_env, para_env_sub
      80              :       INTEGER, INTENT(IN)                                :: color_sub
      81              :       TYPE(group_dist_d1_type), INTENT(INOUT)            :: gd_array
      82              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo
      83              :       INTEGER, INTENT(IN)                                :: nmo
      84              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
      85              :       TYPE(group_dist_d1_type), DIMENSION(SIZE(homo)), &
      86              :          INTENT(INOUT)                                   :: gd_B_virtual
      87              :       INTEGER, INTENT(IN)                                :: dimen_RI, unit_nr
      88              :       LOGICAL, INTENT(IN)                                :: calc_forces, calc_ex
      89              : 
      90              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_gpw_compute_en'
      91              : 
      92              :       INTEGER :: a, a_global, b, b_global, block_size, decil, end_point, handle, handle2, handle3, &
      93              :          iiB, ij_counter, ij_counter_send, ij_index, integ_group_size, ispin, jjB, jspin, &
      94              :          max_ij_pairs, my_block_size, my_group_L_end, my_group_L_size, my_group_L_size_orig, &
      95              :          my_group_L_start, my_i, my_ij_pairs, my_j, my_new_group_L_size, ngroup, nspins, &
      96              :          num_integ_group, proc_receive, proc_send, proc_shift, rec_B_size, rec_B_virtual_end, &
      97              :          rec_B_virtual_start, rec_L_size, send_B_size, send_B_virtual_end, send_B_virtual_start, &
      98              :          send_i, send_ij_index, send_j, start_point, tag, total_ij_pairs
      99          354 :       INTEGER, ALLOCATABLE, DIMENSION(:) :: integ_group_pos2color_sub, my_B_size, &
     100          354 :          my_B_virtual_end, my_B_virtual_start, num_ij_pairs, sizes_array_orig, virtual
     101          354 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: ij_map
     102          354 :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: ranges_info_array
     103              :       LOGICAL                                            :: my_alpha_beta_case, my_beta_beta_case, &
     104              :                                                             my_open_shell_SS
     105              :       REAL(KIND=dp)                                      :: amp_fac, my_Emp2_Cou, my_Emp2_EX, &
     106              :                                                             sym_fac, t_new, t_start
     107          354 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET   :: buffer_1D
     108              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
     109          354 :          TARGET                                          :: local_ab, local_ba, t_ab
     110              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
     111          354 :          TARGET                                          :: local_i_aL, local_j_aL, Y_i_aP, Y_j_aP
     112              :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
     113          354 :          POINTER                                         :: external_ab, external_i_aL
     114              :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :, :), &
     115          354 :          POINTER                                         :: BI_C_rec
     116              :       TYPE(dgemm_counter_type)                           :: dgemm_counter
     117              :       TYPE(mp_comm_type)                                 :: comm_exchange, comm_rep
     118              :       TYPE(three_dim_real_array), ALLOCATABLE, &
     119          354 :          DIMENSION(:)                                    :: B_ia_Q
     120              : 
     121          354 :       CALL timeset(routineN, handle)
     122              : 
     123          354 :       nspins = SIZE(homo)
     124              : 
     125         1062 :       ALLOCATE (virtual(nspins))
     126          796 :       virtual(:) = nmo - homo(:)
     127              : 
     128         1416 :       ALLOCATE (my_B_size(nspins), my_B_virtual_start(nspins), my_B_virtual_end(nspins))
     129          796 :       DO ispin = 1, nspins
     130              :          CALL get_group_dist(gd_B_virtual(ispin), para_env_sub%mepos, &
     131          796 :                              my_B_virtual_start(ispin), my_B_virtual_end(ispin), my_B_size(ispin))
     132              :       END DO
     133              : 
     134          354 :       CALL get_group_dist(gd_array, color_sub, my_group_L_start, my_group_L_end, my_group_L_size)
     135              : 
     136          354 :       CALL dgemm_counter_init(dgemm_counter, unit_nr, mp2_env%ri_mp2%print_dgemm_info)
     137              : 
     138              :       ! local_gemm_ctx has a very footprint the first time this routine is
     139              :       ! called.
     140          354 :       CALL mp2_env%local_gemm_ctx%create(LOCAL_GEMM_PU_GPU)
     141          354 :       CALL mp2_env%local_gemm_ctx%set_op_threshold_gpu(128*128*128*2)
     142              : 
     143              :       CALL mp2_ri_get_integ_group_size( &
     144              :          mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
     145              :          homo, dimen_RI, unit_nr, &
     146              :          integ_group_size, ngroup, &
     147          354 :          num_integ_group, virtual, calc_forces)
     148              : 
     149              :       ! now create a group that contains all the proc that have the same virtual starting point
     150              :       ! in the integ group
     151              :       CALL mp2_ri_create_group( &
     152              :          para_env, para_env_sub, color_sub, &
     153              :          gd_array%sizes, calc_forces, &
     154              :          integ_group_size, my_group_L_end, &
     155              :          my_group_L_size, my_group_L_size_orig, my_group_L_start, my_new_group_L_size, &
     156              :          integ_group_pos2color_sub, sizes_array_orig, &
     157          354 :          ranges_info_array, comm_exchange, comm_rep, num_integ_group)
     158              : 
     159              :       ! We cannot fix the tag because of the recv routine
     160          354 :       tag = 42
     161              : 
     162          796 :       DO jspin = 1, nspins
     163              : 
     164              :          CALL replicate_iaK_2intgroup(BIb_C(jspin)%array, comm_exchange, comm_rep, &
     165              :                                       homo(jspin), gd_array%sizes, my_B_size(jspin), &
     166          442 :                                       my_group_L_size, ranges_info_array)
     167              : 
     168         1326 :          DO ispin = 1, jspin
     169              : 
     170          530 :             IF (unit_nr > 0) THEN
     171          265 :                IF (nspins == 1) THEN
     172          133 :                   WRITE (unit_nr, *) "Start loop run"
     173          132 :                ELSE IF (ispin == 1 .AND. jspin == 1) THEN
     174           44 :                   WRITE (unit_nr, *) "Start loop run alpha-alpha"
     175           88 :                ELSE IF (ispin == 1 .AND. jspin == 2) THEN
     176           44 :                   WRITE (unit_nr, *) "Start loop run alpha-beta"
     177           44 :                ELSE IF (ispin == 2 .AND. jspin == 2) THEN
     178           44 :                   WRITE (unit_nr, *) "Start loop run beta-beta"
     179              :                END IF
     180          265 :                CALL m_flush(unit_nr)
     181              :             END IF
     182              : 
     183          530 :             my_open_shell_SS = (nspins == 2) .AND. (ispin == jspin)
     184              : 
     185              :             ! t_ab = amp_fac*(:,a|:,b)-(:,b|:,a)
     186              :             ! If we calculate the gradient we need to distinguish
     187              :             ! between alpha-alpha and beta-beta cases for UMP2
     188              : 
     189          530 :             my_beta_beta_case = .FALSE.
     190          530 :             my_alpha_beta_case = .FALSE.
     191          530 :             IF (ispin /= jspin) THEN
     192           88 :                my_alpha_beta_case = .TRUE.
     193          442 :             ELSE IF (my_open_shell_SS) THEN
     194          176 :                IF (ispin == 2) my_beta_beta_case = .TRUE.
     195              :             END IF
     196              : 
     197          530 :             amp_fac = mp2_env%scale_S + mp2_env%scale_T
     198          530 :             IF (my_alpha_beta_case .OR. my_open_shell_SS) amp_fac = mp2_env%scale_T
     199              : 
     200              :             CALL mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_B_size, &
     201          530 :                                         my_group_L_size, calc_forces, ispin, jspin, local_ba)
     202              : 
     203              :             CALL mp2_ri_get_block_size( &
     204              :                mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual(ispin:jspin), &
     205              :                homo(ispin:jspin), virtual(ispin:jspin), dimen_RI, unit_nr, block_size, &
     206          530 :                ngroup, num_integ_group, my_open_shell_ss, calc_forces, buffer_1D)
     207              : 
     208              :             ! *****************************************************************
     209              :             ! **********  REPLICATION-BLOCKED COMMUNICATION SCHEME  ***********
     210              :             ! *****************************************************************
     211              :             ! introduce block size, the number of occupied orbitals has to be a
     212              :             ! multiple of the block size
     213              : 
     214              :             ! Calculate the maximum number of ij pairs that have to be computed
     215              :             ! among groups
     216              :             CALL mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo(ispin), homo(jspin), &
     217          530 :                                       block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_SS, unit_nr)
     218              : 
     219         1590 :             ALLOCATE (num_ij_pairs(0:comm_exchange%num_pe - 1))
     220          530 :             CALL comm_exchange%allgather(my_ij_pairs, num_ij_pairs)
     221              : 
     222         1170 :             max_ij_pairs = MAXVAL(num_ij_pairs)
     223              : 
     224              :             ! start real stuff
     225              :             CALL mp2_ri_allocate_blk(dimen_RI, my_B_size, block_size, local_i_aL, &
     226          530 :                                      local_j_aL, calc_forces, Y_i_aP, Y_j_aP, ispin, jspin)
     227              : 
     228          530 :             CALL timeset(routineN//"_RI_loop", handle2)
     229          530 :             my_Emp2_Cou = 0.0_dp
     230          530 :             my_Emp2_EX = 0.0_dp
     231          530 :             t_start = m_walltime()
     232         2572 :             DO ij_index = 1, max_ij_pairs
     233              : 
     234              :                ! Prediction is unreliable if we are in the first step of the loop
     235         2042 :                IF (unit_nr > 0 .AND. ij_index > 1) THEN
     236          742 :                   decil = ij_index*10/max_ij_pairs
     237          742 :                   IF (decil /= (ij_index - 1)*10/max_ij_pairs) THEN
     238          701 :                      t_new = m_walltime()
     239          701 :                      t_new = (t_new - t_start)/60.0_dp*(max_ij_pairs - ij_index + 1)/(ij_index - 1)
     240              :                      WRITE (unit_nr, FMT="(T3,A)") "Percentage of finished loop: "// &
     241          701 :                         cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
     242          701 :                      CALL m_flush(unit_nr)
     243              :                   END IF
     244              :                END IF
     245              : 
     246         2042 :                IF (calc_forces) THEN
     247      2837553 :                   Y_i_aP = 0.0_dp
     248      2939903 :                   Y_j_aP = 0.0_dp
     249              :                END IF
     250              : 
     251         2042 :                IF (ij_index <= my_ij_pairs) THEN
     252              :                   ! We have work to do
     253         1993 :                   ij_counter = (ij_index - MIN(1, color_sub))*ngroup + color_sub
     254         1993 :                   my_i = ij_map(1, ij_counter)
     255         1993 :                   my_j = ij_map(2, ij_counter)
     256         1993 :                   my_block_size = ij_map(3, ij_counter)
     257              : 
     258      3318845 :                   local_i_aL = 0.0_dp
     259              :                   CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
     260         1993 :                                        BIb_C(ispin)%array(:, :, my_i:my_i + my_block_size - 1))
     261              : 
     262      3434615 :                   local_j_aL = 0.0_dp
     263              :                   CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
     264         1993 :                                        BIb_C(jspin)%array(:, :, my_j:my_j + my_block_size - 1))
     265              : 
     266              :                   ! collect data from other proc
     267         1993 :                   CALL timeset(routineN//"_comm", handle3)
     268         2102 :                   DO proc_shift = 1, comm_exchange%num_pe - 1
     269          109 :                      proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
     270          109 :                      proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
     271              : 
     272          109 :                      send_ij_index = num_ij_pairs(proc_send)
     273              : 
     274          109 :                      CALL get_group_dist(gd_array, proc_receive, sizes=rec_L_size)
     275              : 
     276         2102 :                      IF (ij_index <= send_ij_index) THEN
     277              :                         ij_counter_send = (ij_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
     278           60 :                                           integ_group_pos2color_sub(proc_send)
     279           60 :                         send_i = ij_map(1, ij_counter_send)
     280           60 :                         send_j = ij_map(2, ij_counter_send)
     281              : 
     282              :                         ! occupied i
     283              :                         BI_C_rec(1:rec_L_size, 1:my_B_size(ispin), 1:my_block_size) => &
     284           60 :                            buffer_1D(1:rec_L_size*my_B_size(ispin)*my_block_size)
     285        43353 :                         BI_C_rec = 0.0_dp
     286              :                         CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
     287           60 :                                                     proc_send, BI_C_rec, proc_receive, tag)
     288              : 
     289              :                         CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
     290           60 :                                              BI_C_rec(:, 1:my_B_size(ispin), :))
     291              : 
     292              :                         ! occupied j
     293              :                         BI_C_rec(1:rec_L_size, 1:my_B_size(jspin), 1:my_block_size) => &
     294           60 :                            buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(jspin)*my_block_size)
     295        44373 :                         BI_C_rec = 0.0_dp
     296              :                         CALL comm_exchange%sendrecv(BIb_C(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
     297           60 :                                                     proc_send, BI_C_rec, proc_receive, tag)
     298              : 
     299              :                         CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
     300           60 :                                              BI_C_rec(:, 1:my_B_size(jspin), :))
     301              : 
     302              :                      ELSE
     303              :                         ! we send nothing while we know that we have to receive something
     304              : 
     305              :                         ! occupied i
     306              :                         BI_C_rec(1:rec_L_size, 1:my_B_size(ispin), 1:my_block_size) => &
     307           49 :                            buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(ispin)*my_block_size)
     308         9124 :                         BI_C_rec = 0.0_dp
     309           49 :                         CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
     310              : 
     311              :                         CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
     312           49 :                                              BI_C_rec(:, 1:my_B_size(ispin), 1:my_block_size))
     313              : 
     314              :                         ! occupied j
     315              :                         BI_C_rec(1:rec_L_size, 1:my_B_size(jspin), 1:my_block_size) => &
     316           49 :                            buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(jspin)*my_block_size)
     317         9124 :                         BI_C_rec = 0.0_dp
     318           49 :                         CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
     319              : 
     320              :                         CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
     321           49 :                                              BI_C_rec(:, 1:my_B_size(jspin), 1:my_block_size))
     322              : 
     323              :                      END IF
     324              : 
     325              :                   END DO
     326              : 
     327         1993 :                   CALL timestop(handle3)
     328              : 
     329              :                   ! loop over the block elements
     330         3990 :                   DO iiB = 1, my_block_size
     331         5995 :                      DO jjB = 1, my_block_size
     332         2005 :                         CALL timeset(routineN//"_expansion", handle3)
     333         4002 :                         ASSOCIATE (my_local_i_aL => local_i_aL(:, :, iiB), my_local_j_aL => local_j_aL(:, :, jjB))
     334              : 
     335              :                            ! calculate the integrals (ia|jb) strating from my local data ...
     336       585679 :                            local_ab = 0.0_dp
     337         2005 :                            IF ((my_alpha_beta_case) .AND. (calc_forces)) THEN
     338       140066 :                               local_ba = 0.0_dp
     339              :                            END IF
     340         2005 :                            CALL dgemm_counter_start(dgemm_counter)
     341              :                            CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), my_B_size(jspin), dimen_RI, 1.0_dp, &
     342              :                                                             my_local_i_aL, dimen_RI, my_local_j_aL, dimen_RI, &
     343              :                                                            0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), &
     344         2005 :                                                             my_B_size(ispin))
     345              :                            ! Additional integrals only for alpha_beta case and forces
     346         2005 :                            IF (my_alpha_beta_case .AND. calc_forces) THEN
     347              :                               local_ba(my_B_virtual_start(jspin):my_B_virtual_end(jspin), :) = &
     348       133101 :                                  TRANSPOSE(local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :))
     349              :                            END IF
     350              :                            ! ... and from the other of my subgroup
     351         2279 :                            DO proc_shift = 1, para_env_sub%num_pe - 1
     352          274 :                               proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
     353          274 :                               proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
     354              : 
     355              :                               CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, &
     356          274 :                                                   rec_B_virtual_end, rec_B_size)
     357              : 
     358          274 :                               external_i_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
     359       267944 :                               external_i_aL = 0.0_dp
     360              : 
     361              :                               CALL para_env_sub%sendrecv(my_local_i_aL, proc_send, &
     362          274 :                                                          external_i_aL, proc_receive, tag)
     363              : 
     364              :                               CALL mp2_env%local_gemm_ctx%gemm( &
     365              :                                  'T', 'N', rec_B_size, my_B_size(jspin), dimen_RI, 1.0_dp, &
     366              :                                  external_i_aL, dimen_RI, my_local_j_aL, dimen_RI, &
     367          274 :                                  0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:my_B_size(jspin)), rec_B_size)
     368              : 
     369              :                               ! Additional integrals only for alpha_beta case and forces
     370         2553 :                               IF (my_alpha_beta_case .AND. calc_forces) THEN
     371              : 
     372              :                                  CALL get_group_dist(gd_B_virtual(jspin), proc_receive, rec_B_virtual_start, &
     373           70 :                                                      rec_B_virtual_end, rec_B_size)
     374              : 
     375           70 :                                  external_i_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
     376        81655 :                                  external_i_aL = 0.0_dp
     377              : 
     378              :                                  CALL para_env_sub%sendrecv(my_local_j_aL, proc_send, &
     379           70 :                                                             external_i_aL, proc_receive, tag)
     380              : 
     381              :                                  CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_B_size, my_B_size(ispin), dimen_RI, 1.0_dp, &
     382              :                                                                   external_i_aL, dimen_RI, my_local_i_aL, dimen_RI, &
     383           70 :                                             0.0_dp, local_ba(rec_B_virtual_start:rec_B_virtual_end, 1:my_B_size(ispin)), rec_B_size)
     384              :                               END IF
     385              :                            END DO
     386         2005 :                            IF (my_alpha_beta_case .AND. calc_forces) THEN
     387              :                               ! Is just an approximation, but the call does not allow it, it ought to be (virtual_i*B_size_j+virtual_j*B_size_i)*dimen_RI
     388          502 :                               CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_B_size(ispin) + my_B_size(jspin), dimen_RI)
     389              :                            ELSE
     390         1503 :                               CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_B_size(jspin), dimen_RI)
     391              :                            END IF
     392         2005 :                            CALL timestop(handle3)
     393              : 
     394              :                            !sample peak memory
     395         2005 :                            CALL m_memory()
     396              : 
     397         2005 :                            CALL timeset(routineN//"_ener", handle3)
     398              :                            ! calculate coulomb only MP2
     399         2005 :                            sym_fac = 2.0_dp
     400         2005 :                            IF (my_i == my_j) sym_fac = 1.0_dp
     401         2005 :                            IF (my_alpha_beta_case) sym_fac = 0.5_dp
     402        30928 :                            DO b = 1, my_B_size(jspin)
     403        28923 :                               b_global = b + my_B_virtual_start(jspin) - 1
     404       585679 :                               DO a = 1, virtual(ispin)
     405              :                                  my_Emp2_Cou = my_Emp2_Cou - sym_fac*2.0_dp*local_ab(a, b)**2/ &
     406              :                                                (Eigenval(homo(ispin) + a, ispin) + Eigenval(homo(jspin) + b_global, jspin) - &
     407       583674 :                                                 Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
     408              :                               END DO
     409              :                            END DO
     410              : 
     411         2005 :                            IF (calc_ex) THEN
     412              :                               ! contract integrals with orbital energies for exchange MP2 energy
     413              :                               ! starting with local ...
     414       330129 :                               IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) t_ab = 0.0_dp
     415        30328 :                               DO b = 1, my_B_size(ispin)
     416        28323 :                                  b_global = b + my_B_virtual_start(ispin) - 1
     417       548367 :                                  DO a = 1, my_B_size(ispin)
     418       518039 :                                     a_global = a + my_B_virtual_start(ispin) - 1
     419              :                                     my_Emp2_Ex = my_Emp2_Ex + sym_fac*local_ab(a_global, b)*local_ab(b_global, a)/ &
     420              :                                               (Eigenval(homo(ispin) + a_global, ispin) + Eigenval(homo(ispin) + b_global, ispin) - &
     421       518039 :                                                   Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
     422       546362 :                                     IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) THEN
     423              :                                      t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
     424              :                                                            (Eigenval(homo(ispin) + a_global, ispin) + &
     425              :                                                             Eigenval(homo(ispin) + b_global, ispin) - &
     426       302030 :                                                             Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
     427              :                                     END IF
     428              :                                  END DO
     429              :                               END DO
     430              :                               ! ... and then with external data
     431         2279 :                               DO proc_shift = 1, para_env_sub%num_pe - 1
     432          274 :                                  proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
     433          274 :                                  proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
     434              : 
     435              :                                  CALL get_group_dist(gd_B_virtual(ispin), proc_receive, &
     436          274 :                                                      rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
     437              :                                  CALL get_group_dist(gd_B_virtual(ispin), proc_send, &
     438          274 :                                                      send_B_virtual_start, send_B_virtual_end, send_B_size)
     439              : 
     440              :                                  external_ab(1:my_B_size(ispin), 1:rec_B_size) => &
     441          274 :                                     buffer_1D(1:INT(rec_B_size, int_8)*my_B_size(ispin))
     442        30405 :                                  external_ab = 0.0_dp
     443              : 
     444              :                       CALL para_env_sub%sendrecv(local_ab(send_B_virtual_start:send_B_virtual_end, 1:my_B_size(ispin)), proc_send, &
     445        60536 :                                                             external_ab(1:my_B_size(ispin), 1:rec_B_size), proc_receive, tag)
     446              : 
     447         5284 :                                  DO b = 1, my_B_size(ispin)
     448         2731 :                                     b_global = b + my_B_virtual_start(ispin) - 1
     449        30405 :                                     DO a = 1, rec_B_size
     450        27400 :                                        a_global = a + rec_B_virtual_start - 1
     451              :                                        my_Emp2_Ex = my_Emp2_Ex + sym_fac*local_ab(a_global, b)*external_ab(b, a)/ &
     452              :                                               (Eigenval(homo(ispin) + a_global, ispin) + Eigenval(homo(ispin) + b_global, ispin) - &
     453        27400 :                                                      Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
     454        27400 :                                        IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) &
     455              :                                          t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
     456              :                                                               (Eigenval(homo(ispin) + a_global, ispin) + &
     457              :                                                                Eigenval(homo(ispin) + b_global, ispin) - &
     458        12311 :                                                                Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
     459              :                                     END DO
     460              :                                  END DO
     461              :                               END DO
     462              :                            END IF
     463         2005 :                            CALL timestop(handle3)
     464              : 
     465         4010 :                            IF (calc_forces) THEN
     466              :                               ! update P_ab, Gamma_P_ia
     467              :                               CALL mp2_update_P_gamma(mp2_env, para_env_sub, gd_B_virtual, &
     468              :                                                       Eigenval, homo, dimen_RI, iiB, jjB, my_B_size, &
     469              :                                                       my_B_virtual_end, my_B_virtual_start, my_i, my_j, virtual, &
     470              :                                                       local_ab, t_ab, my_local_i_aL, my_local_j_aL, &
     471              :                                                       my_open_shell_ss, Y_i_aP(:, :, iiB), Y_j_aP(:, :, jjB), local_ba, &
     472         1620 :                                                       ispin, jspin, dgemm_counter, buffer_1D)
     473              : 
     474              :                            END IF
     475              : 
     476              :                         END ASSOCIATE
     477              : 
     478              :                      END DO ! jjB
     479              :                   END DO ! iiB
     480              : 
     481              :                ELSE
     482              :                   ! We need it later in case of gradients
     483           49 :                   my_block_size = 1
     484              : 
     485           49 :                   CALL timeset(routineN//"_comm", handle3)
     486              :                   ! No work to do and we know that we have to receive nothing, but send something
     487              :                   ! send data to other proc
     488           98 :                   DO proc_shift = 1, comm_exchange%num_pe - 1
     489           49 :                      proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
     490           49 :                      proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
     491              : 
     492           49 :                      send_ij_index = num_ij_pairs(proc_send)
     493              : 
     494           98 :                      IF (ij_index <= send_ij_index) THEN
     495              :                         ! something to send
     496              :                         ij_counter_send = (ij_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
     497           49 :                                           integ_group_pos2color_sub(proc_send)
     498           49 :                         send_i = ij_map(1, ij_counter_send)
     499           49 :                         send_j = ij_map(2, ij_counter_send)
     500              : 
     501              :                         ! occupied i
     502              :                         CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
     503           49 :                                                 proc_send, tag)
     504              :                         ! occupied j
     505              :                         CALL comm_exchange%send(BIb_C(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
     506           49 :                                                 proc_send, tag)
     507              :                      END IF
     508              :                   END DO
     509           49 :                   CALL timestop(handle3)
     510              :                END IF
     511              : 
     512              :                ! redistribute gamma
     513         2572 :                IF (calc_forces) THEN
     514              :                   CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(ispin)%array, ij_index, my_B_size(ispin), &
     515              :                                               my_block_size, my_group_L_size, my_i, my_ij_pairs, ngroup, &
     516              :                                               num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
     517              :                                               ij_map, ranges_info_array, Y_i_aP(:, :, 1:my_block_size), comm_exchange, &
     518         1617 :                                               gd_array%sizes, 1, buffer_1D)
     519              :                   CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(jspin)%array, ij_index, my_B_size(jspin), &
     520              :                                               my_block_size, my_group_L_size, my_j, my_ij_pairs, ngroup, &
     521              :                                               num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
     522              :                                               ij_map, ranges_info_array, Y_j_aP(:, :, 1:my_block_size), comm_exchange, &
     523         1617 :                                               gd_array%sizes, 2, buffer_1D)
     524              :                END IF
     525              : 
     526              :             END DO
     527          530 :             CALL timestop(handle2)
     528              : 
     529          530 :             DEALLOCATE (local_i_aL)
     530          530 :             DEALLOCATE (local_j_aL)
     531          530 :             DEALLOCATE (ij_map)
     532          530 :             DEALLOCATE (num_ij_pairs)
     533          530 :             DEALLOCATE (local_ab)
     534              : 
     535          530 :             IF (calc_forces) THEN
     536          384 :                DEALLOCATE (Y_i_aP)
     537          384 :                DEALLOCATE (Y_j_aP)
     538          384 :                IF (ALLOCATED(t_ab)) THEN
     539          306 :                   DEALLOCATE (t_ab)
     540              :                END IF
     541          384 :                DEALLOCATE (local_ba)
     542              : 
     543              :                ! here we check if there are almost degenerate ij
     544              :                ! pairs and we update P_ij with these contribution.
     545              :                ! If all pairs are degenerate with each other this step will scale O(N^6),
     546              :                ! if the number of degenerate pairs scales linearly with the system size
     547              :                ! this step will scale O(N^5).
     548              :                ! Start counting the number of almost degenerate ij pairs according
     549              :                ! to eps_canonical
     550              :                CALL quasi_degenerate_P_ij( &
     551              :                   mp2_env, Eigenval(:, ispin:jspin), homo(ispin:jspin), virtual(ispin:jspin), my_open_shell_ss, &
     552              :                   my_beta_beta_case, Bib_C(ispin:jspin), unit_nr, dimen_RI, &
     553              :                   my_B_size(ispin:jspin), ngroup, my_group_L_size, &
     554              :                   color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
     555              :                   my_B_virtual_start(ispin:jspin), my_B_virtual_end(ispin:jspin), gd_array%sizes, gd_B_virtual(ispin:jspin), &
     556          384 :                   integ_group_pos2color_sub, dgemm_counter, buffer_1D)
     557              : 
     558              :             END IF
     559              : 
     560          530 :             DEALLOCATE (buffer_1D)
     561              : 
     562              :             ! Dereplicate BIb_C and Gamma_P_ia to save memory
     563              :             ! These matrices will not be needed in that fashion anymore
     564              :             ! B_ia_Q will needed later
     565          530 :             IF (calc_forces .AND. jspin == nspins) THEN
     566         1068 :                IF (.NOT. ALLOCATED(B_ia_Q)) ALLOCATE (B_ia_Q(nspins))
     567         1530 :                ALLOCATE (B_ia_Q(ispin)%array(homo(ispin), my_B_size(ispin), my_group_L_size_orig))
     568       907278 :                B_ia_Q(ispin)%array = 0.0_dp
     569         1408 :                DO jjB = 1, homo(ispin)
     570        17332 :                   DO iiB = 1, my_B_size(ispin)
     571              :                      B_ia_Q(ispin)%array(jjB, iiB, 1:my_group_L_size_orig) = &
     572       724326 :                         BIb_C(ispin)%array(1:my_group_L_size_orig, iiB, jjB)
     573              :                   END DO
     574              :                END DO
     575          306 :                DEALLOCATE (BIb_C(ispin)%array)
     576              : 
     577              :                ! sum Gamma and dereplicate
     578         1530 :                ALLOCATE (BIb_C(ispin)%array(my_B_size(ispin), homo(ispin), my_group_L_size_orig))
     579          582 :                DO proc_shift = 1, comm_rep%num_pe - 1
     580              :                   ! invert order
     581          276 :                   proc_send = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
     582          276 :                   proc_receive = MODULO(comm_rep%mepos + proc_shift, comm_rep%num_pe)
     583              : 
     584          276 :                   start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
     585          276 :                   end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)
     586              : 
     587              :                   CALL comm_rep%sendrecv(mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, start_point:end_point), &
     588          276 :                                          proc_send, BIb_C(ispin)%array, proc_receive, tag)
     589              : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
     590          582 : !$OMP          SHARED(mp2_env,BIb_C,ispin,homo,my_B_size,my_group_L_size_orig)
     591              :                   mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig) = &
     592              :                      mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig) &
     593              :                      + BIb_C(ispin)%array(:, :, :)
     594              : !$OMP END PARALLEL WORKSHARE
     595              :                END DO
     596              : 
     597       770352 :                BIb_C(ispin)%array(:, :, :) = mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig)
     598          306 :                DEALLOCATE (mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
     599          306 :                CALL MOVE_ALLOC(BIb_C(ispin)%array, mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
     600          224 :             ELSE IF (jspin == nspins) THEN
     601          136 :                DEALLOCATE (BIb_C(ispin)%array)
     602              :             END IF
     603              : 
     604          530 :             CALL para_env%sum(my_Emp2_Cou)
     605          530 :             CALL para_env%sum(my_Emp2_Ex)
     606              : 
     607         2032 :             IF (my_open_shell_SS .OR. my_alpha_beta_case) THEN
     608          264 :                IF (my_alpha_beta_case) THEN
     609           88 :                   Emp2_S = Emp2_S + my_Emp2_Cou
     610           88 :                   Emp2_Cou = Emp2_Cou + my_Emp2_Cou
     611              :                ELSE
     612          176 :                   my_Emp2_Cou = my_Emp2_Cou*0.25_dp
     613          176 :                   my_Emp2_EX = my_Emp2_EX*0.5_dp
     614          176 :                   Emp2_T = Emp2_T + my_Emp2_Cou + my_Emp2_EX
     615          176 :                   Emp2_Cou = Emp2_Cou + my_Emp2_Cou
     616          176 :                   Emp2_EX = Emp2_EX + my_Emp2_EX
     617              :                END IF
     618              :             ELSE
     619          266 :                Emp2_Cou = Emp2_Cou + my_Emp2_Cou
     620          266 :                Emp2_EX = Emp2_EX + my_Emp2_EX
     621              :             END IF
     622              :          END DO
     623              : 
     624              :       END DO
     625              : 
     626          354 :       DEALLOCATE (integ_group_pos2color_sub)
     627          354 :       DEALLOCATE (ranges_info_array)
     628              : 
     629          354 :       CALL comm_exchange%free()
     630          354 :       CALL comm_rep%free()
     631              : 
     632          354 :       IF (calc_forces) THEN
     633              :          ! recover original information (before replication)
     634          228 :          DEALLOCATE (gd_array%sizes)
     635          228 :          iiB = SIZE(sizes_array_orig)
     636          684 :          ALLOCATE (gd_array%sizes(0:iiB - 1))
     637          678 :          gd_array%sizes(:) = sizes_array_orig
     638          228 :          DEALLOCATE (sizes_array_orig)
     639              : 
     640              :          ! Remove replication from BIb_C and reorder the matrix
     641          228 :          my_group_L_size = my_group_L_size_orig
     642              : 
     643              :          ! B_ia_Q(ispin)%array will be deallocated inside of complete_gamma
     644          534 :          DO ispin = 1, nspins
     645              :             CALL complete_gamma(mp2_env, B_ia_Q(ispin)%array, dimen_RI, homo(ispin), &
     646              :                                 virtual(ispin), para_env, para_env_sub, ngroup, &
     647              :                                 my_group_L_size, my_group_L_start, my_group_L_end, &
     648              :                                 my_B_size(ispin), my_B_virtual_start(ispin), &
     649              :                                 gd_array, gd_B_virtual(ispin), &
     650          534 :                                 ispin)
     651              :          END DO
     652          534 :          DEALLOCATE (B_ia_Q)
     653              : 
     654        45074 :          IF (nspins == 1) mp2_env%ri_grad%P_ab(1)%array(:, :) = mp2_env%ri_grad%P_ab(1)%array(:, :)*2.0_dp
     655              :          BLOCK
     656              :             TYPE(mp_comm_type) :: comm
     657          228 :             CALL comm%from_split(para_env, para_env_sub%mepos)
     658          534 :             DO ispin = 1, nspins
     659              :                ! P_ab is only replicated over all subgroups
     660          306 :                CALL comm%sum(mp2_env%ri_grad%P_ab(ispin)%array)
     661              :                ! P_ij is replicated over all processes
     662          534 :                CALL para_env%sum(mp2_env%ri_grad%P_ij(ispin)%array)
     663              :             END DO
     664          456 :             CALL comm%free()
     665              :          END BLOCK
     666              :       END IF
     667              : 
     668          354 :       CALL release_group_dist(gd_array)
     669          796 :       DO ispin = 1, nspins
     670          442 :          IF (ALLOCATED(BIb_C(ispin)%array)) DEALLOCATE (BIb_C(ispin)%array)
     671          796 :          CALL release_group_dist(gd_B_virtual(ispin))
     672              :       END DO
     673              : 
     674              :       ! We do not need this matrix later, so deallocate it here to safe memory
     675          354 :       IF (calc_forces) DEALLOCATE (mp2_env%ri_grad%PQ_half)
     676          354 :       IF (calc_forces .AND. .NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) &
     677            8 :          DEALLOCATE (mp2_env%ri_grad%operator_half)
     678              : 
     679          354 :       CALL dgemm_counter_write(dgemm_counter, para_env)
     680              : 
     681              :       ! release memory allocated by local_gemm when run on GPU. local_gemm_ctx is null on cpu only runs
     682          354 :       CALL mp2_env%local_gemm_ctx%destroy()
     683          354 :       CALL timestop(handle)
     684              : 
     685         1416 :    END SUBROUTINE mp2_ri_gpw_compute_en
     686              : 
     687              : ! **************************************************************************************************
     688              : !> \brief ...
     689              : !> \param local_i_aL ...
     690              : !> \param ranges_info_array ...
     691              : !> \param BIb_C_rec ...
     692              : ! **************************************************************************************************
     693         4360 :    SUBROUTINE fill_local_i_aL(local_i_aL, ranges_info_array, BIb_C_rec)
     694              :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT)   :: local_i_aL
     695              :       INTEGER, DIMENSION(:, :), INTENT(IN)               :: ranges_info_array
     696              :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN)      :: BIb_C_rec
     697              : 
     698              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'fill_local_i_aL'
     699              : 
     700              :       INTEGER                                            :: end_point, handle, irep, Lend_pos, &
     701              :                                                             Lstart_pos, start_point
     702              : 
     703         4360 :       CALL timeset(routineN, handle)
     704              : 
     705        12100 :       DO irep = 1, SIZE(ranges_info_array, 2)
     706         7740 :          Lstart_pos = ranges_info_array(1, irep)
     707         7740 :          Lend_pos = ranges_info_array(2, irep)
     708         7740 :          start_point = ranges_info_array(3, irep)
     709         7740 :          end_point = ranges_info_array(4, irep)
     710              : 
     711              : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
     712        12100 : !$OMP          SHARED(BIb_C_rec,local_i_aL,Lstart_pos,Lend_pos,start_point,end_point)
     713              :          local_i_aL(Lstart_pos:Lend_pos, :, :) = BIb_C_rec(start_point:end_point, :, :)
     714              : !$OMP END PARALLEL WORKSHARE
     715              :       END DO
     716              : 
     717         4360 :       CALL timestop(handle)
     718              : 
     719         4360 :    END SUBROUTINE fill_local_i_aL
     720              : 
     721              : ! **************************************************************************************************
     722              : !> \brief ...
     723              : !> \param local_i_aL ...
     724              : !> \param ranges_info_array ...
     725              : !> \param BIb_C_rec ...
     726              : ! **************************************************************************************************
     727          266 :    SUBROUTINE fill_local_i_aL_2D(local_i_aL, ranges_info_array, BIb_C_rec)
     728              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: local_i_aL
     729              :       INTEGER, DIMENSION(:, :), INTENT(IN)               :: ranges_info_array
     730              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: BIb_C_rec
     731              : 
     732              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'fill_local_i_aL_2D'
     733              : 
     734              :       INTEGER                                            :: end_point, handle, irep, Lend_pos, &
     735              :                                                             Lstart_pos, start_point
     736              : 
     737          266 :       CALL timeset(routineN, handle)
     738              : 
     739          766 :       DO irep = 1, SIZE(ranges_info_array, 2)
     740          500 :          Lstart_pos = ranges_info_array(1, irep)
     741          500 :          Lend_pos = ranges_info_array(2, irep)
     742          500 :          start_point = ranges_info_array(3, irep)
     743          500 :          end_point = ranges_info_array(4, irep)
     744              : 
     745              : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
     746          766 : !$OMP          SHARED(BIb_C_rec,local_i_aL,Lstart_pos,Lend_pos,start_point,end_point)
     747              :          local_i_aL(Lstart_pos:Lend_pos, :) = BIb_C_rec(start_point:end_point, :)
     748              : !$OMP END PARALLEL WORKSHARE
     749              :       END DO
     750              : 
     751          266 :       CALL timestop(handle)
     752              : 
     753          266 :    END SUBROUTINE fill_local_i_aL_2D
     754              : 
     755              : ! **************************************************************************************************
     756              : !> \brief ...
     757              : !> \param BIb_C ...
     758              : !> \param comm_exchange ...
     759              : !> \param comm_rep ...
     760              : !> \param homo ...
     761              : !> \param sizes_array ...
     762              : !> \param my_B_size ...
     763              : !> \param my_group_L_size ...
     764              : !> \param ranges_info_array ...
     765              : ! **************************************************************************************************
     766          442 :    SUBROUTINE replicate_iaK_2intgroup(BIb_C, comm_exchange, comm_rep, homo, sizes_array, my_B_size, &
     767          442 :                                       my_group_L_size, ranges_info_array)
     768              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
     769              :          INTENT(INOUT)                                   :: BIb_C
     770              :       TYPE(mp_comm_type), INTENT(IN)                     :: comm_exchange, comm_rep
     771              :       INTEGER, INTENT(IN)                                :: homo
     772              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: sizes_array
     773              :       INTEGER, INTENT(IN)                                :: my_B_size, my_group_L_size
     774              :       INTEGER, DIMENSION(:, 0:, 0:), INTENT(IN)          :: ranges_info_array
     775              : 
     776              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'replicate_iaK_2intgroup'
     777              : 
     778              :       INTEGER                                            :: end_point, handle, max_L_size, &
     779              :                                                             proc_receive, proc_shift, start_point
     780              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: BIb_C_copy
     781          442 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :)  :: BIb_C_gather
     782              : 
     783          442 :       CALL timeset(routineN, handle)
     784              : 
     785              :       ! replication scheme using mpi_allgather
     786              :       ! get the max L size of the
     787          988 :       max_L_size = MAXVAL(sizes_array)
     788              : 
     789         2210 :       ALLOCATE (BIb_C_copy(max_L_size, my_B_size, homo))
     790      1655684 :       BIb_C_copy = 0.0_dp
     791       898474 :       BIb_C_copy(1:SIZE(BIb_C, 1), 1:my_B_size, 1:homo) = BIb_C
     792              : 
     793          442 :       DEALLOCATE (BIb_C)
     794              : 
     795         2652 :       ALLOCATE (BIb_C_gather(max_L_size, my_B_size, homo, 0:comm_rep%num_pe - 1))
     796      3188844 :       BIb_C_gather = 0.0_dp
     797              : 
     798          442 :       CALL comm_rep%allgather(BIb_C_copy, BIb_C_gather)
     799              : 
     800          442 :       DEALLOCATE (BIb_C_copy)
     801              : 
     802         2210 :       ALLOCATE (BIb_C(my_group_L_size, my_B_size, homo))
     803      1655337 :       BIb_C = 0.0_dp
     804              : 
     805              :       ! reorder data
     806         1204 :       DO proc_shift = 0, comm_rep%num_pe - 1
     807          762 :          proc_receive = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
     808              : 
     809          762 :          start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
     810          762 :          end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)
     811              : 
     812              :          BIb_C(start_point:end_point, 1:my_B_size, 1:homo) = &
     813      1674771 :             BIb_C_gather(1:end_point - start_point + 1, 1:my_B_size, 1:homo, proc_receive)
     814              : 
     815              :       END DO
     816              : 
     817          442 :       DEALLOCATE (BIb_C_gather)
     818              : 
     819          442 :       CALL timestop(handle)
     820              : 
     821          442 :    END SUBROUTINE replicate_iaK_2intgroup
     822              : 
     823              : ! **************************************************************************************************
     824              : !> \brief ...
     825              : !> \param local_ab ...
     826              : !> \param t_ab ...
     827              : !> \param mp2_env ...
     828              : !> \param homo ...
     829              : !> \param virtual ...
     830              : !> \param my_B_size ...
     831              : !> \param my_group_L_size ...
     832              : !> \param calc_forces ...
     833              : !> \param ispin ...
     834              : !> \param jspin ...
     835              : !> \param local_ba ...
     836              : ! **************************************************************************************************
     837          530 :    SUBROUTINE mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_B_size, &
     838              :                                      my_group_L_size, calc_forces, ispin, jspin, local_ba)
     839              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
     840              :          INTENT(OUT)                                     :: local_ab, t_ab
     841              :       TYPE(mp2_type)                                     :: mp2_env
     842              :       INTEGER, INTENT(IN)                                :: homo(2), virtual(2), my_B_size(2), &
     843              :                                                             my_group_L_size
     844              :       LOGICAL, INTENT(IN)                                :: calc_forces
     845              :       INTEGER, INTENT(IN)                                :: ispin, jspin
     846              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
     847              :          INTENT(OUT)                                     :: local_ba
     848              : 
     849              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_allocate_no_blk'
     850              : 
     851              :       INTEGER                                            :: handle
     852              : 
     853          530 :       CALL timeset(routineN, handle)
     854              : 
     855         2120 :       ALLOCATE (local_ab(virtual(ispin), my_B_size(jspin)))
     856       138441 :       local_ab = 0.0_dp
     857              : 
     858          530 :       IF (calc_forces) THEN
     859          384 :          IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ij(jspin)%array)) THEN
     860         1224 :             ALLOCATE (mp2_env%ri_grad%P_ij(jspin)%array(homo(ispin), homo(ispin)))
     861         6278 :             mp2_env%ri_grad%P_ij(jspin)%array = 0.0_dp
     862              :          END IF
     863          384 :          IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ab(jspin)%array)) THEN
     864         1224 :             ALLOCATE (mp2_env%ri_grad%P_ab(jspin)%array(my_B_size(jspin), virtual(jspin)))
     865        85070 :             mp2_env%ri_grad%P_ab(jspin)%array = 0.0_dp
     866              :          END IF
     867          384 :          IF (.NOT. ALLOCATED(mp2_env%ri_grad%Gamma_P_ia(jspin)%array)) THEN
     868         1530 :             ALLOCATE (mp2_env%ri_grad%Gamma_P_ia(jspin)%array(my_B_size(jspin), homo(jspin), my_group_L_size))
     869      1475502 :             mp2_env%ri_grad%Gamma_P_ia(jspin)%array = 0.0_dp
     870              :          END IF
     871              : 
     872          384 :          IF (ispin == jspin) THEN
     873              :             ! For non-alpha-beta case we need amplitudes
     874         1224 :             ALLOCATE (t_ab(virtual(ispin), my_B_size(jspin)))
     875              : 
     876              :             ! That is just a dummy. In that way, we can pass it as array to other routines w/o requirement for allocatable array
     877          306 :             ALLOCATE (local_ba(1, 1))
     878              :          ELSE
     879              :             ! We need more integrals
     880          312 :             ALLOCATE (local_ba(virtual(jspin), my_B_size(ispin)))
     881              :          END IF
     882              :       END IF
     883              :       !
     884              : 
     885          530 :       CALL timestop(handle)
     886              : 
     887          530 :    END SUBROUTINE mp2_ri_allocate_no_blk
     888              : 
     889              : ! **************************************************************************************************
     890              : !> \brief ...
     891              : !> \param dimen_RI ...
     892              : !> \param my_B_size ...
     893              : !> \param block_size ...
     894              : !> \param local_i_aL ...
     895              : !> \param local_j_aL ...
     896              : !> \param calc_forces ...
     897              : !> \param Y_i_aP ...
     898              : !> \param Y_j_aP ...
     899              : !> \param ispin ...
     900              : !> \param jspin ...
     901              : ! **************************************************************************************************
     902          530 :    SUBROUTINE mp2_ri_allocate_blk(dimen_RI, my_B_size, block_size, &
     903              :                                   local_i_aL, local_j_aL, calc_forces, &
     904              :                                   Y_i_aP, Y_j_aP, ispin, jspin)
     905              :       INTEGER, INTENT(IN)                                :: dimen_RI, my_B_size(2), block_size
     906              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
     907              :          INTENT(OUT)                                     :: local_i_aL, local_j_aL
     908              :       LOGICAL, INTENT(IN)                                :: calc_forces
     909              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
     910              :          INTENT(OUT)                                     :: Y_i_aP, Y_j_aP
     911              :       INTEGER, INTENT(IN)                                :: ispin, jspin
     912              : 
     913              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_allocate_blk'
     914              : 
     915              :       INTEGER                                            :: handle
     916              : 
     917          530 :       CALL timeset(routineN, handle)
     918              : 
     919         2650 :       ALLOCATE (local_i_aL(dimen_RI, my_B_size(ispin), block_size))
     920       657247 :       local_i_aL = 0.0_dp
     921         2650 :       ALLOCATE (local_j_aL(dimen_RI, my_B_size(jspin), block_size))
     922       670091 :       local_j_aL = 0.0_dp
     923              : 
     924          530 :       IF (calc_forces) THEN
     925         1920 :          ALLOCATE (Y_i_aP(my_B_size(ispin), dimen_RI, block_size))
     926       541654 :          Y_i_aP = 0.0_dp
     927              :          ! For  closed-shell, alpha-alpha and beta-beta my_B_size_beta=my_b_size
     928              :          ! Not for alpha-beta case: Y_j_aP_beta is sent and received as Y_j_aP
     929         1920 :          ALLOCATE (Y_j_aP(my_B_size(jspin), dimen_RI, block_size))
     930       552606 :          Y_j_aP = 0.0_dp
     931              :       END IF
     932              :       !
     933              : 
     934          530 :       CALL timestop(handle)
     935              : 
     936          530 :    END SUBROUTINE mp2_ri_allocate_blk
     937              : 
     938              : ! **************************************************************************************************
     939              : !> \brief ...
     940              : !> \param my_alpha_beta_case ...
     941              : !> \param total_ij_pairs ...
     942              : !> \param homo ...
     943              : !> \param homo_beta ...
     944              : !> \param block_size ...
     945              : !> \param ngroup ...
     946              : !> \param ij_map ...
     947              : !> \param color_sub ...
     948              : !> \param my_ij_pairs ...
     949              : !> \param my_open_shell_SS ...
     950              : !> \param unit_nr ...
     951              : ! **************************************************************************************************
     952          530 :    SUBROUTINE mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo, homo_beta, &
     953              :                                    block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_SS, unit_nr)
     954              :       LOGICAL, INTENT(IN)                                :: my_alpha_beta_case
     955              :       INTEGER, INTENT(OUT)                               :: total_ij_pairs
     956              :       INTEGER, INTENT(IN)                                :: homo, homo_beta, block_size, ngroup
     957              :       INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ij_map
     958              :       INTEGER, INTENT(IN)                                :: color_sub
     959              :       INTEGER, INTENT(OUT)                               :: my_ij_pairs
     960              :       LOGICAL, INTENT(IN)                                :: my_open_shell_SS
     961              :       INTEGER, INTENT(IN)                                :: unit_nr
     962              : 
     963              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_communication'
     964              : 
     965              :       INTEGER :: assigned_blocks, first_I_block, first_J_block, handle, iiB, ij_block_counter, &
     966              :          ij_counter, jjB, last_i_block, last_J_block, num_block_per_group, num_IJ_blocks, &
     967              :          num_IJ_blocks_beta, total_ij_block, total_ij_pairs_blocks
     968          530 :       LOGICAL, ALLOCATABLE, DIMENSION(:, :)              :: ij_marker
     969              : 
     970              : ! Calculate the maximum number of ij pairs that have to be computed
     971              : ! among groups
     972              : 
     973          530 :       CALL timeset(routineN, handle)
     974              : 
     975          530 :       IF (.NOT. my_open_shell_ss .AND. .NOT. my_alpha_beta_case) THEN
     976          266 :          total_ij_pairs = homo*(1 + homo)/2
     977          266 :          num_IJ_blocks = homo/block_size - 1
     978              : 
     979          266 :          first_I_block = 1
     980          266 :          last_i_block = block_size*(num_IJ_blocks - 1)
     981              : 
     982          266 :          first_J_block = block_size + 1
     983          266 :          last_J_block = block_size*(num_IJ_blocks + 1)
     984              : 
     985          266 :          ij_block_counter = 0
     986          602 :          DO iiB = first_I_block, last_i_block, block_size
     987          602 :             DO jjB = iiB + block_size, last_J_block, block_size
     988          840 :                ij_block_counter = ij_block_counter + 1
     989              :             END DO
     990              :          END DO
     991              : 
     992          266 :          total_ij_block = ij_block_counter
     993          266 :          num_block_per_group = total_ij_block/ngroup
     994          266 :          assigned_blocks = num_block_per_group*ngroup
     995              : 
     996          266 :          total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
     997              : 
     998         1064 :          ALLOCATE (ij_marker(homo, homo))
     999         4018 :          ij_marker = .TRUE.
    1000          798 :          ALLOCATE (ij_map(3, total_ij_pairs_blocks))
    1001         7770 :          ij_map = 0
    1002          266 :          ij_counter = 0
    1003          266 :          my_ij_pairs = 0
    1004          602 :          DO iiB = first_I_block, last_i_block, block_size
    1005         1278 :             DO jjB = iiB + block_size, last_J_block, block_size
    1006          840 :                IF (ij_counter + 1 > assigned_blocks) EXIT
    1007          676 :                ij_counter = ij_counter + 1
    1008         2028 :                ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
    1009          676 :                ij_map(1, ij_counter) = iiB
    1010          676 :                ij_map(2, ij_counter) = jjB
    1011          676 :                ij_map(3, ij_counter) = block_size
    1012         1012 :                IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1013              :             END DO
    1014              :          END DO
    1015         1070 :          DO iiB = 1, homo
    1016         2946 :             DO jjB = iiB, homo
    1017         2680 :                IF (ij_marker(iiB, jjB)) THEN
    1018         1200 :                   ij_counter = ij_counter + 1
    1019         1200 :                   ij_map(1, ij_counter) = iiB
    1020         1200 :                   ij_map(2, ij_counter) = jjB
    1021         1200 :                   ij_map(3, ij_counter) = 1
    1022         1200 :                   IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1023              :                END IF
    1024              :             END DO
    1025              :          END DO
    1026          266 :          DEALLOCATE (ij_marker)
    1027              : 
    1028          264 :       ELSE IF (.NOT. my_alpha_beta_case) THEN
    1029              :          ! THese are the cases alpha/alpha and beta/beta
    1030              :          ! We do not have to consider the diagonal elements
    1031          176 :          total_ij_pairs = homo*(homo - 1)/2
    1032          176 :          num_IJ_blocks = (homo - 1)/block_size - 1
    1033              : 
    1034          176 :          first_I_block = 1
    1035          176 :          last_i_block = block_size*(num_IJ_blocks - 1)
    1036              : 
    1037              :          ! We shift the blocks to prevent the calculation of the diagonal elements which always give zero
    1038          176 :          first_J_block = block_size + 2
    1039          176 :          last_J_block = block_size*(num_IJ_blocks + 1) + 1
    1040              : 
    1041          176 :          ij_block_counter = 0
    1042          262 :          DO iiB = first_I_block, last_i_block, block_size
    1043          262 :             DO jjB = iiB + block_size + 1, last_J_block, block_size
    1044          200 :                ij_block_counter = ij_block_counter + 1
    1045              :             END DO
    1046              :          END DO
    1047              : 
    1048          176 :          total_ij_block = ij_block_counter
    1049          176 :          num_block_per_group = total_ij_block/ngroup
    1050          176 :          assigned_blocks = num_block_per_group*ngroup
    1051              : 
    1052          176 :          total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
    1053              : 
    1054          704 :          ALLOCATE (ij_marker(homo, homo))
    1055         2984 :          ij_marker = .TRUE.
    1056          528 :          ALLOCATE (ij_map(3, total_ij_pairs_blocks))
    1057         3352 :          ij_map = 0
    1058          176 :          ij_counter = 0
    1059          176 :          my_ij_pairs = 0
    1060          262 :          DO iiB = first_I_block, last_i_block, block_size
    1061          458 :             DO jjB = iiB + block_size + 1, last_J_block, block_size
    1062          200 :                IF (ij_counter + 1 > assigned_blocks) EXIT
    1063          196 :                ij_counter = ij_counter + 1
    1064          604 :                ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
    1065          196 :                ij_map(1, ij_counter) = iiB
    1066          196 :                ij_map(2, ij_counter) = jjB
    1067          196 :                ij_map(3, ij_counter) = block_size
    1068          282 :                IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1069              :             END DO
    1070              :          END DO
    1071          774 :          DO iiB = 1, homo
    1072         1580 :             DO jjB = iiB + 1, homo
    1073         1404 :                IF (ij_marker(iiB, jjB)) THEN
    1074          598 :                   ij_counter = ij_counter + 1
    1075          598 :                   ij_map(1, ij_counter) = iiB
    1076          598 :                   ij_map(2, ij_counter) = jjB
    1077          598 :                   ij_map(3, ij_counter) = 1
    1078          598 :                   IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1079              :                END IF
    1080              :             END DO
    1081              :          END DO
    1082          176 :          DEALLOCATE (ij_marker)
    1083              : 
    1084              :       ELSE
    1085           88 :          total_ij_pairs = homo*homo_beta
    1086           88 :          num_IJ_blocks = homo/block_size
    1087           88 :          num_IJ_blocks_beta = homo_beta/block_size
    1088              : 
    1089           88 :          first_I_block = 1
    1090           88 :          last_i_block = block_size*(num_IJ_blocks - 1)
    1091              : 
    1092           88 :          first_J_block = 1
    1093           88 :          last_J_block = block_size*(num_IJ_blocks_beta - 1)
    1094              : 
    1095           88 :          ij_block_counter = 0
    1096          246 :          DO iiB = first_I_block, last_i_block, block_size
    1097          246 :             DO jjB = first_J_block, last_J_block, block_size
    1098          284 :                ij_block_counter = ij_block_counter + 1
    1099              :             END DO
    1100              :          END DO
    1101              : 
    1102           88 :          total_ij_block = ij_block_counter
    1103           88 :          num_block_per_group = total_ij_block/ngroup
    1104           88 :          assigned_blocks = num_block_per_group*ngroup
    1105              : 
    1106           88 :          total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
    1107              : 
    1108          352 :          ALLOCATE (ij_marker(homo, homo_beta))
    1109         1396 :          ij_marker = .TRUE.
    1110          264 :          ALLOCATE (ij_map(3, total_ij_pairs_blocks))
    1111         4304 :          ij_map = 0
    1112           88 :          ij_counter = 0
    1113           88 :          my_ij_pairs = 0
    1114          246 :          DO iiB = first_I_block, last_i_block, block_size
    1115          486 :             DO jjB = first_J_block, last_J_block, block_size
    1116          244 :                IF (ij_counter + 1 > assigned_blocks) EXIT
    1117          240 :                ij_counter = ij_counter + 1
    1118          720 :                ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
    1119          240 :                ij_map(1, ij_counter) = iiB
    1120          240 :                ij_map(2, ij_counter) = jjB
    1121          240 :                ij_map(3, ij_counter) = block_size
    1122          398 :                IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1123              :             END DO
    1124              :          END DO
    1125          432 :          DO iiB = 1, homo
    1126         1486 :             DO jjB = 1, homo_beta
    1127         1398 :                IF (ij_marker(iiB, jjB)) THEN
    1128          814 :                   ij_counter = ij_counter + 1
    1129          814 :                   ij_map(1, ij_counter) = iiB
    1130          814 :                   ij_map(2, ij_counter) = jjB
    1131          814 :                   ij_map(3, ij_counter) = 1
    1132          814 :                   IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1133              :                END IF
    1134              :             END DO
    1135              :          END DO
    1136           88 :          DEALLOCATE (ij_marker)
    1137              :       END IF
    1138              : 
    1139          530 :       IF (unit_nr > 0) THEN
    1140          265 :          IF (block_size == 1) THEN
    1141              :             WRITE (UNIT=unit_nr, FMT="(T3,A,T66,F15.1)") &
    1142          236 :                "RI_INFO| Percentage of ij pairs communicated with block size 1:", 100.0_dp
    1143              :          ELSE
    1144              :             WRITE (UNIT=unit_nr, FMT="(T3,A,T66,F15.1)") &
    1145           29 :                "RI_INFO| Percentage of ij pairs communicated with block size 1:", &
    1146           58 :                100.0_dp*REAL((total_ij_pairs - assigned_blocks*(block_size**2)), KIND=dp)/REAL(total_ij_pairs, KIND=dp)
    1147              :          END IF
    1148          265 :          CALL m_flush(unit_nr)
    1149              :       END IF
    1150              : 
    1151          530 :       CALL timestop(handle)
    1152              : 
    1153          530 :    END SUBROUTINE mp2_ri_communication
    1154              : 
    1155              : ! **************************************************************************************************
    1156              : !> \brief ...
    1157              : !> \param para_env ...
    1158              : !> \param para_env_sub ...
    1159              : !> \param color_sub ...
    1160              : !> \param sizes_array ...
    1161              : !> \param calc_forces ...
    1162              : !> \param integ_group_size ...
    1163              : !> \param my_group_L_end ...
    1164              : !> \param my_group_L_size ...
    1165              : !> \param my_group_L_size_orig ...
    1166              : !> \param my_group_L_start ...
    1167              : !> \param my_new_group_L_size ...
    1168              : !> \param integ_group_pos2color_sub ...
    1169              : !> \param sizes_array_orig ...
    1170              : !> \param ranges_info_array ...
    1171              : !> \param comm_exchange ...
    1172              : !> \param comm_rep ...
    1173              : !> \param num_integ_group ...
    1174              : ! **************************************************************************************************
    1175          354 :    SUBROUTINE mp2_ri_create_group(para_env, para_env_sub, color_sub, &
    1176              :                                   sizes_array, calc_forces, &
    1177              :                                   integ_group_size, my_group_L_end, &
    1178              :                                   my_group_L_size, my_group_L_size_orig, my_group_L_start, my_new_group_L_size, &
    1179              :                                   integ_group_pos2color_sub, &
    1180              :                                   sizes_array_orig, ranges_info_array, comm_exchange, comm_rep, num_integ_group)
    1181              :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env, para_env_sub
    1182              :       INTEGER, INTENT(IN)                                :: color_sub
    1183              :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(INOUT)  :: sizes_array
    1184              :       LOGICAL, INTENT(IN)                                :: calc_forces
    1185              :       INTEGER, INTENT(IN)                                :: integ_group_size, my_group_L_end
    1186              :       INTEGER, INTENT(INOUT)                             :: my_group_L_size
    1187              :       INTEGER, INTENT(OUT)                               :: my_group_L_size_orig
    1188              :       INTEGER, INTENT(IN)                                :: my_group_L_start
    1189              :       INTEGER, INTENT(INOUT)                             :: my_new_group_L_size
    1190              :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT)    :: integ_group_pos2color_sub, &
    1191              :                                                             sizes_array_orig
    1192              :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
    1193              :          INTENT(OUT)                                     :: ranges_info_array
    1194              :       TYPE(mp_comm_type), INTENT(OUT)                    :: comm_exchange, comm_rep
    1195              :       INTEGER, INTENT(IN)                                :: num_integ_group
    1196              : 
    1197              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_create_group'
    1198              : 
    1199              :       INTEGER                                            :: handle, iiB, proc_receive, proc_shift, &
    1200              :                                                             sub_sub_color
    1201          354 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: new_sizes_array, rep_ends_array, &
    1202              :                                                             rep_sizes_array, rep_starts_array
    1203              :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: my_info
    1204              : 
    1205          354 :       CALL timeset(routineN, handle)
    1206              :       !
    1207          354 :       sub_sub_color = para_env_sub%mepos*num_integ_group + color_sub/integ_group_size
    1208          354 :       CALL comm_exchange%from_split(para_env, sub_sub_color)
    1209              : 
    1210              :       ! create the replication group
    1211          354 :       sub_sub_color = para_env_sub%mepos*comm_exchange%num_pe + comm_exchange%mepos
    1212          354 :       CALL comm_rep%from_split(para_env, sub_sub_color)
    1213              : 
    1214              :       ! create the new limits for K according to the size
    1215              :       ! of the integral group
    1216              : 
    1217              :       ! info array for replication
    1218         1062 :       ALLOCATE (rep_ends_array(0:comm_rep%num_pe - 1))
    1219         1062 :       ALLOCATE (rep_starts_array(0:comm_rep%num_pe - 1))
    1220         1062 :       ALLOCATE (rep_sizes_array(0:comm_rep%num_pe - 1))
    1221              : 
    1222          354 :       CALL comm_rep%allgather(my_group_L_size, rep_sizes_array)
    1223          354 :       CALL comm_rep%allgather(my_group_L_start, rep_starts_array)
    1224          354 :       CALL comm_rep%allgather(my_group_L_end, rep_ends_array)
    1225              : 
    1226              :       ! calculate my_new_group_L_size according to sizes_array
    1227          354 :       my_new_group_L_size = my_group_L_size
    1228              : 
    1229              :       ! Info of this process
    1230         1062 :       ALLOCATE (my_info(4, 0:comm_rep%num_pe - 1))
    1231          354 :       my_info(1, 0) = my_group_L_start
    1232          354 :       my_info(2, 0) = my_group_L_end
    1233          354 :       my_info(3, 0) = 1
    1234          354 :       my_info(4, 0) = my_group_L_size
    1235              : 
    1236          596 :       DO proc_shift = 1, comm_rep%num_pe - 1
    1237          242 :          proc_receive = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
    1238              : 
    1239          242 :          my_new_group_L_size = my_new_group_L_size + rep_sizes_array(proc_receive)
    1240              : 
    1241          242 :          my_info(1, proc_shift) = rep_starts_array(proc_receive)
    1242          242 :          my_info(2, proc_shift) = rep_ends_array(proc_receive)
    1243          242 :          my_info(3, proc_shift) = my_info(4, proc_shift - 1) + 1
    1244          596 :          my_info(4, proc_shift) = my_new_group_L_size
    1245              : 
    1246              :       END DO
    1247              : 
    1248         1062 :       ALLOCATE (new_sizes_array(0:comm_exchange%num_pe - 1))
    1249         1416 :       ALLOCATE (ranges_info_array(4, 0:comm_rep%num_pe - 1, 0:comm_exchange%num_pe - 1))
    1250          354 :       CALL comm_exchange%allgather(my_new_group_L_size, new_sizes_array)
    1251          354 :       CALL comm_exchange%allgather(my_info, ranges_info_array)
    1252              : 
    1253          354 :       DEALLOCATE (rep_sizes_array)
    1254          354 :       DEALLOCATE (rep_starts_array)
    1255          354 :       DEALLOCATE (rep_ends_array)
    1256              : 
    1257         1062 :       ALLOCATE (integ_group_pos2color_sub(0:comm_exchange%num_pe - 1))
    1258          354 :       CALL comm_exchange%allgather(color_sub, integ_group_pos2color_sub)
    1259              : 
    1260          354 :       IF (calc_forces) THEN
    1261          228 :          iiB = SIZE(sizes_array)
    1262          684 :          ALLOCATE (sizes_array_orig(0:iiB - 1))
    1263          678 :          sizes_array_orig(:) = sizes_array
    1264              :       END IF
    1265              : 
    1266          354 :       my_group_L_size_orig = my_group_L_size
    1267          354 :       my_group_L_size = my_new_group_L_size
    1268          354 :       DEALLOCATE (sizes_array)
    1269              : 
    1270         1062 :       ALLOCATE (sizes_array(0:integ_group_size - 1))
    1271          806 :       sizes_array(:) = new_sizes_array
    1272              : 
    1273          354 :       DEALLOCATE (new_sizes_array)
    1274              :       !
    1275          354 :       CALL timestop(handle)
    1276              : 
    1277          708 :    END SUBROUTINE mp2_ri_create_group
    1278              : 
    1279              : ! **************************************************************************************************
    1280              : !> \brief ...
    1281              : !> \param mp2_env ...
    1282              : !> \param para_env ...
    1283              : !> \param para_env_sub ...
    1284              : !> \param gd_array ...
    1285              : !> \param gd_B_virtual ...
    1286              : !> \param homo ...
    1287              : !> \param dimen_RI ...
    1288              : !> \param unit_nr ...
    1289              : !> \param integ_group_size ...
    1290              : !> \param ngroup ...
    1291              : !> \param num_integ_group ...
    1292              : !> \param virtual ...
    1293              : !> \param calc_forces ...
    1294              : ! **************************************************************************************************
    1295         1416 :    SUBROUTINE mp2_ri_get_integ_group_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
    1296          354 :                                           homo, dimen_RI, unit_nr, &
    1297              :                                           integ_group_size, &
    1298              :                                           ngroup, num_integ_group, &
    1299          354 :                                           virtual, calc_forces)
    1300              :       TYPE(mp2_type)                                     :: mp2_env
    1301              :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env, para_env_sub
    1302              :       TYPE(group_dist_d1_type), INTENT(IN)               :: gd_array
    1303              :       TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
    1304              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo
    1305              :       INTEGER, INTENT(IN)                                :: dimen_RI, unit_nr
    1306              :       INTEGER, INTENT(OUT)                               :: integ_group_size, ngroup, num_integ_group
    1307              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: virtual
    1308              :       LOGICAL, INTENT(IN)                                :: calc_forces
    1309              : 
    1310              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_get_integ_group_size'
    1311              : 
    1312              :       INTEGER                                            :: block_size, handle, iiB, &
    1313              :                                                             max_repl_group_size, &
    1314              :                                                             min_integ_group_size
    1315              :       INTEGER(KIND=int_8)                                :: mem
    1316              :       LOGICAL                                            :: calc_group_size
    1317              :       REAL(KIND=dp)                                      :: factor, mem_base, mem_min, mem_per_blk, &
    1318              :                                                             mem_per_repl, mem_per_repl_blk, &
    1319              :                                                             mem_real
    1320              : 
    1321          354 :       CALL timeset(routineN, handle)
    1322              : 
    1323          354 :       ngroup = para_env%num_pe/para_env_sub%num_pe
    1324              : 
    1325          354 :       calc_group_size = mp2_env%ri_mp2%number_integration_groups <= 0
    1326          354 :       IF (.NOT. calc_group_size) THEN
    1327           10 :          IF (MOD(ngroup, mp2_env%ri_mp2%number_integration_groups) /= 0) calc_group_size = .TRUE.
    1328              :       END IF
    1329              : 
    1330          354 :       IF (calc_group_size) THEN
    1331          344 :          CALL m_memory(mem)
    1332          344 :          mem_real = (mem + 1024*1024 - 1)/(1024*1024)
    1333          344 :          CALL para_env%min(mem_real)
    1334              : 
    1335          344 :          mem_base = 0.0_dp
    1336          344 :          mem_per_blk = 0.0_dp
    1337          344 :          mem_per_repl = 0.0_dp
    1338          344 :          mem_per_repl_blk = 0.0_dp
    1339              : 
    1340              :          ! BIB_C_copy
    1341              :          mem_per_repl = mem_per_repl + MAXVAL(MAX(REAL(homo, KIND=dp)*maxsize(gd_array), REAL(dimen_RI, KIND=dp))* &
    1342         1114 :                                               maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1343              :          ! BIB_C
    1344          770 :          mem_per_repl = mem_per_repl + SUM(REAL(homo, KIND=dp)*maxsize(gd_B_virtual))*maxsize(gd_array)*8.0_dp/(1024**2)
    1345              :          ! BIB_C_rec
    1346          770 :          mem_per_repl_blk = mem_per_repl_blk + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
    1347              :          ! local_i_aL+local_j_aL
    1348          770 :          mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*REAL(dimen_RI, KIND=dp)*8.0_dp/(1024**2)
    1349              :          ! local_ab
    1350         1114 :          mem_base = mem_base + MAXVAL(REAL(virtual, KIND=dp)*maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1351              :          ! external_ab/external_i_aL
    1352         1196 :          mem_base = mem_base + REAL(MAX(dimen_RI, MAXVAL(virtual)), KIND=dp)*MAXVAL(maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1353              : 
    1354          344 :          IF (calc_forces) THEN
    1355              :             ! Gamma_P_ia
    1356              :             mem_per_repl = mem_per_repl + SUM(REAL(homo, KIND=dp)*maxsize(gd_array)* &
    1357          514 :                                               maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1358              :             ! Y_i_aP+Y_j_aP
    1359          514 :             mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*dimen_RI*8.0_dp/(1024**2)
    1360              :             ! local_ba/t_ab
    1361          808 :             mem_base = mem_base + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
    1362              :             ! P_ij
    1363          514 :             mem_base = mem_base + SUM(REAL(homo, KIND=dp)*homo)*8.0_dp/(1024**2)
    1364              :             ! P_ab
    1365          514 :             mem_base = mem_base + SUM(REAL(virtual, KIND=dp)*maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1366              :             ! send_ab/send_i_aL
    1367          808 :             mem_base = mem_base + REAL(MAX(dimen_RI, MAXVAL(virtual)), KIND=dp)*MAXVAL(maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1368              :          END IF
    1369              : 
    1370              :          ! This a first guess based on the assumption of optimal block sizes
    1371          770 :          block_size = MAX(1, MIN(FLOOR(SQRT(REAL(MINVAL(homo), KIND=dp))), FLOOR(MINVAL(homo)/SQRT(2.0_dp*ngroup))))
    1372          344 :          IF (mp2_env%ri_mp2%block_size > 0) block_size = mp2_env%ri_mp2%block_size
    1373              : 
    1374          344 :          mem_min = mem_base + mem_per_repl + (mem_per_blk + mem_per_repl_blk)*block_size
    1375              : 
    1376          516 :          IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum available memory per MPI process:', &
    1377          344 :             mem_real, ' MiB'
    1378          516 :          IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum required memory per MPI process:', &
    1379          344 :             mem_min, ' MiB'
    1380              : 
    1381              :          ! We use the following communication model
    1382              :          ! Comm(replication)+Comm(collection of data for ij pair)+Comm(contraction)
    1383              :          ! One can show that the costs of the contraction step are independent of the block size and the replication group size
    1384              :          ! With gradients, the other two steps are carried out twice (Y_i_aP -> Gamma_i_aP, and dereplication)
    1385              :          ! NL ... number of RI basis functions
    1386              :          ! NR ... replication group size
    1387              :          ! NG ... number of sub groups
    1388              :          ! NB ... Block size
    1389              :          ! o  ... number of occupied orbitals
    1390              :          ! Then, we have the communication costs (in multiples of the original BIb_C matrix)
    1391              :          ! (NR/NG)+(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
    1392              :          ! and with gradients
    1393              :          ! 2*(NR/NG)+2*(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
    1394              :          ! We are looking for the minimum of the communication volume,
    1395              :          ! thus, if the prefactor of (NR/NG) is smaller than zero, use the largest possible replication group size.
    1396              :          ! If the factor is larger than zero, set the replication group size to 1. (For small systems and a large number of subgroups)
    1397              :          ! Replication group size = 1 implies that the integration group size equals the number of subgroups
    1398              : 
    1399          344 :          integ_group_size = ngroup
    1400              : 
    1401              :          ! Multiply everything by homo*virtual to consider differences between spin channels in case of open-shell calculations
    1402              :          factor = REAL(SUM(homo*virtual), KIND=dp) &
    1403         1622 :                   - SUM((REAL(MAXVAL(homo), KIND=dp)/block_size + block_size - 2.0_dp)*homo*virtual)/ngroup
    1404          672 :          IF (SIZE(homo) == 2) factor = factor - 2.0_dp*PRODUCT(homo)/block_size/ngroup*SUM(homo*virtual)
    1405              : 
    1406          688 :          IF (factor <= 0.0_dp) THEN
    1407              :             ! Remove the fixed memory and divide by the memory per replication group size
    1408              :             max_repl_group_size = MIN(MAX(FLOOR((mem_real - mem_base - mem_per_blk*block_size)/ &
    1409          256 :                                                 (mem_per_repl + mem_per_repl_blk*block_size)), 1), ngroup)
    1410              :             ! Convert to an integration group size
    1411          256 :             min_integ_group_size = ngroup/max_repl_group_size
    1412              : 
    1413              :             ! Ensure that the integration group size is a divisor of the number of sub groups
    1414          256 :             DO iiB = MAX(MIN(min_integ_group_size, ngroup), 1), ngroup
    1415              :                ! check that the ngroup is a multiple of  integ_group_size
    1416          256 :                IF (MOD(ngroup, iiB) == 0) THEN
    1417          256 :                   integ_group_size = iiB
    1418          256 :                   EXIT
    1419              :                END IF
    1420            0 :                integ_group_size = integ_group_size + 1
    1421              :             END DO
    1422              :          END IF
    1423              :       ELSE ! We take the user provided group size
    1424           10 :          integ_group_size = ngroup/mp2_env%ri_mp2%number_integration_groups
    1425              :       END IF
    1426              : 
    1427          354 :       IF (unit_nr > 0) THEN
    1428              :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    1429          177 :             "RI_INFO| Group size for integral replication:", integ_group_size*para_env_sub%num_pe
    1430          177 :          CALL m_flush(unit_nr)
    1431              :       END IF
    1432              : 
    1433          354 :       num_integ_group = ngroup/integ_group_size
    1434              : 
    1435          354 :       CALL timestop(handle)
    1436              : 
    1437          354 :    END SUBROUTINE mp2_ri_get_integ_group_size
    1438              : 
    1439              : ! **************************************************************************************************
    1440              : !> \brief ...
    1441              : !> \param mp2_env ...
    1442              : !> \param para_env ...
    1443              : !> \param para_env_sub ...
    1444              : !> \param gd_array ...
    1445              : !> \param gd_B_virtual ...
    1446              : !> \param homo ...
    1447              : !> \param virtual ...
    1448              : !> \param dimen_RI ...
    1449              : !> \param unit_nr ...
    1450              : !> \param block_size ...
    1451              : !> \param ngroup ...
    1452              : !> \param num_integ_group ...
    1453              : !> \param my_open_shell_ss ...
    1454              : !> \param calc_forces ...
    1455              : !> \param buffer_1D ...
    1456              : ! **************************************************************************************************
    1457          530 :    SUBROUTINE mp2_ri_get_block_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
    1458          530 :                                     homo, virtual, dimen_RI, unit_nr, &
    1459              :                                     block_size, ngroup, num_integ_group, &
    1460              :                                     my_open_shell_ss, calc_forces, buffer_1D)
    1461              :       TYPE(mp2_type)                                     :: mp2_env
    1462              :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env, para_env_sub
    1463              :       TYPE(group_dist_d1_type), INTENT(IN)               :: gd_array
    1464              :       TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
    1465              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
    1466              :       INTEGER, INTENT(IN)                                :: dimen_RI, unit_nr
    1467              :       INTEGER, INTENT(OUT)                               :: block_size, ngroup
    1468              :       INTEGER, INTENT(IN)                                :: num_integ_group
    1469              :       LOGICAL, INTENT(IN)                                :: my_open_shell_ss, calc_forces
    1470              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
    1471              :          INTENT(OUT)                                     :: buffer_1D
    1472              : 
    1473              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_get_block_size'
    1474              : 
    1475              :       INTEGER                                            :: best_block_size, handle, num_IJ_blocks
    1476              :       INTEGER(KIND=int_8)                                :: buffer_size, mem
    1477              :       REAL(KIND=dp)                                      :: mem_base, mem_per_blk, mem_per_repl_blk, &
    1478              :                                                             mem_real
    1479              : 
    1480          530 :       CALL timeset(routineN, handle)
    1481              : 
    1482          530 :       ngroup = para_env%num_pe/para_env_sub%num_pe
    1483              : 
    1484          530 :       CALL m_memory(mem)
    1485          530 :       mem_real = (mem + 1024*1024 - 1)/(1024*1024)
    1486          530 :       CALL para_env%min(mem_real)
    1487              : 
    1488          530 :       mem_base = 0.0_dp
    1489          530 :       mem_per_blk = 0.0_dp
    1490          530 :       mem_per_repl_blk = 0.0_dp
    1491              : 
    1492              :       ! external_ab
    1493         1766 :       mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
    1494              :       ! BIB_C_rec
    1495         1148 :       mem_per_repl_blk = mem_per_repl_blk + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
    1496              :       ! local_i_aL+local_j_aL
    1497         1148 :       mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*REAL(dimen_RI, KIND=dp)*8.0_dp/(1024**2)
    1498              :       ! Copy to keep arrays contiguous
    1499         1766 :       mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
    1500              : 
    1501          530 :       IF (calc_forces) THEN
    1502              :          ! Y_i_aP+Y_j_aP+BIb_C_send
    1503          846 :          mem_per_blk = mem_per_blk + 3.0_dp*MAXVAL(maxsize(gd_B_virtual))*dimen_RI*8.0_dp/(1024**2)
    1504              :          ! send_ab
    1505         1308 :          mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
    1506              :       END IF
    1507              : 
    1508          530 :       best_block_size = 1
    1509              : 
    1510              :       ! Here we split the memory half for the communication, half for replication
    1511          530 :       IF (mp2_env%ri_mp2%block_size > 0) THEN
    1512              :          best_block_size = mp2_env%ri_mp2%block_size
    1513              :       ELSE
    1514          294 :          best_block_size = MAX(FLOOR((mem_real - mem_base)/(mem_per_blk + mem_per_repl_blk*ngroup/num_integ_group)), 1)
    1515              : 
    1516      2863720 :          DO
    1517      2864014 :             IF (SIZE(homo) == 1) THEN
    1518      2304834 :             IF (.NOT. my_open_shell_ss) THEN
    1519      1061856 :                num_IJ_blocks = (homo(1)/best_block_size)
    1520      1061856 :                num_IJ_blocks = (num_IJ_blocks*num_IJ_blocks - num_IJ_blocks)/2
    1521              :             ELSE
    1522      1242978 :                num_IJ_blocks = ((homo(1) - 1)/best_block_size)
    1523      1242978 :                num_IJ_blocks = (num_IJ_blocks*num_IJ_blocks - num_IJ_blocks)/2
    1524              :             END IF
    1525              :             ELSE
    1526      1677540 :             num_ij_blocks = PRODUCT(homo/best_block_size)
    1527              :             END IF
    1528              :             ! Enforce at least one large block for each subgroup
    1529      2864014 :             IF ((num_IJ_blocks >= ngroup .AND. num_IJ_blocks > 0) .OR. best_block_size == 1) THEN
    1530              :                EXIT
    1531              :             ELSE
    1532      2863720 :                best_block_size = best_block_size - 1
    1533              :             END IF
    1534              :          END DO
    1535              : 
    1536          294 :          IF (SIZE(homo) == 1) THEN
    1537          232 :          IF (my_open_shell_ss) THEN
    1538              :             ! check that best_block_size is not bigger than sqrt(homo-1)
    1539              :             ! Diagonal elements do not have to be considered
    1540          124 :             best_block_size = MIN(FLOOR(SQRT(REAL(homo(1) - 1, KIND=dp))), best_block_size)
    1541              :          ELSE
    1542              :             ! check that best_block_size is not bigger than sqrt(homo)
    1543          108 :             best_block_size = MIN(FLOOR(SQRT(REAL(homo(1), KIND=dp))), best_block_size)
    1544              :          END IF
    1545              :          END IF
    1546              :       END IF
    1547          530 :       block_size = MAX(1, best_block_size)
    1548              : 
    1549          530 :       IF (unit_nr > 0) THEN
    1550              :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    1551          265 :             "RI_INFO| Block size:", block_size
    1552          265 :          CALL m_flush(unit_nr)
    1553              :       END IF
    1554              : 
    1555              :       ! Determine recv buffer size (BI_C_recv, external_i_aL, external_ab)
    1556              :       buffer_size = MAX(INT(maxsize(gd_array), KIND=int_8)*block_size, INT(MAX(dimen_RI, MAXVAL(virtual)), KIND=int_8)) &
    1557         1766 :                     *MAXVAL(maxsize(gd_B_virtual))
    1558              :       ! The send buffer has the same size as the recv buffer
    1559          530 :       IF (calc_forces) buffer_size = buffer_size*2
    1560         1590 :       ALLOCATE (buffer_1D(buffer_size))
    1561              : 
    1562          530 :       CALL timestop(handle)
    1563              : 
    1564          530 :    END SUBROUTINE mp2_ri_get_block_size
    1565              : 
    1566              : ! **************************************************************************************************
    1567              : !> \brief ...
    1568              : !> \param mp2_env ...
    1569              : !> \param para_env_sub ...
    1570              : !> \param gd_B_virtual ...
    1571              : !> \param Eigenval ...
    1572              : !> \param homo ...
    1573              : !> \param dimen_RI ...
    1574              : !> \param iiB ...
    1575              : !> \param jjB ...
    1576              : !> \param my_B_size ...
    1577              : !> \param my_B_virtual_end ...
    1578              : !> \param my_B_virtual_start ...
    1579              : !> \param my_i ...
    1580              : !> \param my_j ...
    1581              : !> \param virtual ...
    1582              : !> \param local_ab ...
    1583              : !> \param t_ab ...
    1584              : !> \param my_local_i_aL ...
    1585              : !> \param my_local_j_aL ...
    1586              : !> \param open_ss ...
    1587              : !> \param Y_i_aP ...
    1588              : !> \param Y_j_aP ...
    1589              : !> \param local_ba ...
    1590              : !> \param ispin ...
    1591              : !> \param jspin ...
    1592              : !> \param dgemm_counter ...
    1593              : !> \param buffer_1D ...
    1594              : ! **************************************************************************************************
    1595         6480 :    SUBROUTINE mp2_update_P_gamma(mp2_env, para_env_sub, gd_B_virtual, &
    1596         3240 :                                  Eigenval, homo, dimen_RI, iiB, jjB, my_B_size, &
    1597         3240 :                                  my_B_virtual_end, my_B_virtual_start, my_i, my_j, virtual, local_ab, &
    1598         3240 :                                  t_ab, my_local_i_aL, my_local_j_aL, open_ss, Y_i_aP, Y_j_aP, &
    1599         1620 :                                  local_ba, ispin, jspin, dgemm_counter, buffer_1D)
    1600              :       TYPE(mp2_type)                                     :: mp2_env
    1601              :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env_sub
    1602              :       TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
    1603              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
    1604              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo
    1605              :       INTEGER, INTENT(IN)                                :: dimen_RI, iiB, jjB
    1606              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: my_B_size, my_B_virtual_end, &
    1607              :                                                             my_B_virtual_start
    1608              :       INTEGER, INTENT(IN)                                :: my_i, my_j
    1609              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: virtual
    1610              :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
    1611              :          INTENT(INOUT), TARGET                           :: local_ab
    1612              :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
    1613              :          INTENT(IN), TARGET                              :: t_ab, my_local_i_aL, my_local_j_aL
    1614              :       LOGICAL, INTENT(IN)                                :: open_ss
    1615              :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
    1616              :          INTENT(INOUT), TARGET                           :: Y_i_aP, Y_j_aP, local_ba
    1617              :       INTEGER, INTENT(IN)                                :: ispin, jspin
    1618              :       TYPE(dgemm_counter_type), INTENT(INOUT)            :: dgemm_counter
    1619              :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET    :: buffer_1D
    1620              : 
    1621              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_update_P_gamma'
    1622              : 
    1623              :       INTEGER :: a, b, b_global, handle, proc_receive, proc_send, proc_shift, rec_B_size, &
    1624              :          rec_B_virtual_end, rec_B_virtual_start, send_B_size, send_B_virtual_end, &
    1625              :          send_B_virtual_start
    1626              :       INTEGER(KIND=int_8)                                :: offset
    1627              :       LOGICAL                                            :: alpha_beta
    1628              :       REAL(KIND=dp)                                      :: factor, P_ij_diag
    1629              :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
    1630         1620 :          POINTER                                         :: external_ab, send_ab
    1631              : 
    1632         1620 :       CALL timeset(routineN//"_Pia", handle)
    1633              : 
    1634         1620 :       alpha_beta = .NOT. (ispin == jspin)
    1635         1620 :       IF (open_ss) THEN
    1636              :          factor = 1.0_dp
    1637              :       ELSE
    1638         1231 :          factor = 2.0_dp
    1639              :       END IF
    1640              :       ! divide the (ia|jb) integrals by Delta_ij^ab
    1641        25159 :       DO b = 1, my_B_size(jspin)
    1642        23539 :          b_global = b + my_B_virtual_start(jspin) - 1
    1643       469848 :          DO a = 1, virtual(ispin)
    1644              :             local_ab(a, b) = -local_ab(a, b)/ &
    1645              :                              (Eigenval(homo(ispin) + a, ispin) + Eigenval(homo(jspin) + b_global, jspin) - &
    1646       468228 :                               Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
    1647              :          END DO
    1648              :       END DO
    1649         1620 :       IF (.NOT. (alpha_beta)) THEN
    1650       329242 :          P_ij_diag = -SUM(local_ab*t_ab)*factor
    1651              :       ELSE
    1652              :          ! update diagonal part of P_ij
    1653       140606 :          P_ij_diag = -SUM(local_ab*local_ab)*mp2_env%scale_S
    1654              :          ! More integrals needed only for alpha-beta case: local_ba
    1655         6987 :          DO b = 1, my_B_size(ispin)
    1656         6485 :             b_global = b + my_B_virtual_start(ispin) - 1
    1657       140066 :             DO a = 1, virtual(jspin)
    1658              :                local_ba(a, b) = -local_ba(a, b)/ &
    1659              :                                 (Eigenval(homo(jspin) + a, jspin) + Eigenval(homo(ispin) + b_global, ispin) - &
    1660       139564 :                                  Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
    1661              :             END DO
    1662              :          END DO
    1663              :       END IF
    1664              : 
    1665              :       ! P_ab and add diagonal part of P_ij
    1666              : 
    1667         1620 :       CALL dgemm_counter_start(dgemm_counter)
    1668         1620 :       IF (.NOT. (alpha_beta)) THEN
    1669              :          CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), my_B_size(ispin), virtual(ispin), 1.0_dp, &
    1670              :                                           t_ab, virtual(ispin), local_ab, virtual(ispin), &
    1671              :                                           1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, &
    1672         1118 :                                                                my_B_virtual_start(ispin):my_B_virtual_end(ispin)), my_B_size(ispin))
    1673              :          mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) = &
    1674         1118 :             mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) + P_ij_diag
    1675              :       ELSE
    1676              :          CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), my_B_size(ispin), virtual(jspin), mp2_env%scale_S, &
    1677              :                                           local_ba, virtual(jspin), local_ba, virtual(jspin), 1.0_dp, &
    1678          502 :                           mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_virtual_start(ispin):my_B_virtual_end(ispin)), my_B_size(ispin))
    1679              : 
    1680              :          mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) = &
    1681          502 :             mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) + P_ij_diag
    1682              : 
    1683              :          CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(jspin), my_B_size(jspin), virtual(ispin), mp2_env%scale_S, &
    1684              :                                           local_ab, virtual(ispin), local_ab, virtual(ispin), 1.0_dp, &
    1685          502 :                           mp2_env%ri_grad%P_ab(jspin)%array(:, my_B_virtual_start(jspin):my_B_virtual_end(jspin)), my_B_size(jspin))
    1686              : 
    1687              :          mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjB - 1, my_j + jjB - 1) = &
    1688          502 :             mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjB - 1, my_j + jjB - 1) + P_ij_diag
    1689              :       END IF
    1690              :       ! The summation is over unique pairs. In alpha-beta case, all pairs are unique: subroutine is called for
    1691              :       ! both i^alpha,j^beta and i^beta,j^alpha. Formally, my_i can be equal to my_j, but they are different
    1692              :       ! due to spin in alpha-beta case.
    1693         1620 :       IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
    1694              : 
    1695              :          CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_B_size(ispin), virtual(ispin), my_B_size(ispin), 1.0_dp, &
    1696              :                                           t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1697              :                                           local_ab, virtual(ispin), &
    1698          823 :                                           1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array, my_B_size(ispin))
    1699              : 
    1700              :          mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjB - 1, my_j + jjB - 1) = &
    1701          823 :             mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjB - 1, my_j + jjB - 1) + P_ij_diag
    1702              :       END IF
    1703         1792 :       DO proc_shift = 1, para_env_sub%num_pe - 1
    1704          172 :          proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    1705          172 :          proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    1706              : 
    1707          172 :          CALL get_group_dist(gd_B_virtual(jspin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    1708          172 :          CALL get_group_dist(gd_B_virtual(jspin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    1709              : 
    1710          172 :          external_ab(1:virtual(ispin), 1:rec_B_size) => buffer_1D(1:INT(virtual(ispin), int_8)*rec_B_size)
    1711        35072 :          external_ab = 0.0_dp
    1712              : 
    1713              :          CALL para_env_sub%sendrecv(local_ab, proc_send, &
    1714          172 :                                     external_ab, proc_receive)
    1715              : 
    1716          172 :          IF (.NOT. (alpha_beta)) THEN
    1717              :             CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), rec_B_size, virtual(ispin), 1.0_dp, &
    1718              :                                              t_ab, virtual(ispin), external_ab, virtual(ispin), &
    1719          102 :                               1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_B_virtual_start:rec_B_virtual_end), my_B_size(ispin))
    1720              :          ELSE
    1721              :             CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(jspin), rec_B_size, virtual(ispin), mp2_env%scale_S, &
    1722              :                                              local_ab, virtual(ispin), external_ab, virtual(ispin), &
    1723              :                                              1.0_dp, mp2_env%ri_grad%P_ab(jspin)%array(:, rec_B_virtual_start:rec_B_virtual_end), &
    1724           70 :                                              my_B_size(jspin))
    1725              : 
    1726              :             ! For alpha-beta part of alpha-density we need a new parallel code
    1727              :             ! And new external_ab (of a different size)
    1728           70 :             CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    1729           70 :             CALL get_group_dist(gd_B_virtual(ispin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    1730           70 :             external_ab(1:virtual(jspin), 1:rec_B_size) => buffer_1D(1:INT(virtual(jspin), int_8)*rec_B_size)
    1731        14700 :             external_ab = 0.0_dp
    1732              :             CALL para_env_sub%sendrecv(local_ba, proc_send, &
    1733           70 :                                        external_ab, proc_receive)
    1734              :             CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), rec_B_size, virtual(jspin), mp2_env%scale_S, &
    1735              :                                              local_ba, virtual(jspin), external_ab, virtual(jspin), &
    1736           70 :                               1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_B_virtual_start:rec_B_virtual_end), my_B_size(ispin))
    1737              :          END IF
    1738              : 
    1739         1964 :          IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
    1740              :             external_ab(1:my_B_size(ispin), 1:virtual(ispin)) => &
    1741           86 :                buffer_1D(1:INT(virtual(ispin), int_8)*my_B_size(ispin))
    1742        18083 :             external_ab = 0.0_dp
    1743              : 
    1744           86 :             offset = INT(virtual(ispin), int_8)*my_B_size(ispin)
    1745              : 
    1746           86 :             send_ab(1:send_B_size, 1:virtual(ispin)) => buffer_1D(offset + 1:offset + INT(send_B_size, int_8)*virtual(ispin))
    1747        18083 :             send_ab = 0.0_dp
    1748              : 
    1749              :             CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, virtual(ispin), my_B_size(ispin), 1.0_dp, &
    1750              :                                              t_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
    1751           86 :                                              local_ab, virtual(ispin), 0.0_dp, send_ab, send_B_size)
    1752              :             CALL para_env_sub%sendrecv(send_ab, proc_send, &
    1753           86 :                                        external_ab, proc_receive)
    1754              : 
    1755        18083 :             mp2_env%ri_grad%P_ab(ispin)%array(:, :) = mp2_env%ri_grad%P_ab(ispin)%array + external_ab
    1756              :          END IF
    1757              : 
    1758              :       END DO
    1759         1620 :       IF (.NOT. alpha_beta) THEN
    1760         1118 :          IF (my_i /= my_j) THEN
    1761          823 :             CALL dgemm_counter_stop(dgemm_counter, 2*my_B_size(ispin), virtual(ispin), virtual(ispin))
    1762              :          ELSE
    1763          295 :             CALL dgemm_counter_stop(dgemm_counter, my_B_size(ispin), virtual(ispin), virtual(ispin))
    1764              :          END IF
    1765              :       ELSE
    1766         1506 :          CALL dgemm_counter_stop(dgemm_counter, SUM(my_B_size), virtual(ispin), virtual(jspin))
    1767              :       END IF
    1768         1620 :       CALL timestop(handle)
    1769              : 
    1770              :       ! Now, Gamma_P_ia (made of Y_ia_P)
    1771              : 
    1772         1620 :       CALL timeset(routineN//"_Gamma", handle)
    1773         1620 :       CALL dgemm_counter_start(dgemm_counter)
    1774         1620 :       IF (.NOT. alpha_beta) THEN
    1775              :          ! Alpha-alpha, beta-beta and closed shell
    1776              :          CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_B_size(ispin), dimen_RI, my_B_size(ispin), 1.0_dp, &
    1777              :                                           t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1778         1118 :                                           my_local_j_aL, dimen_RI, 1.0_dp, Y_i_aP, my_B_size(ispin))
    1779              :       ELSE ! Alpha-beta
    1780              :          CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_B_size(ispin), dimen_RI, my_B_size(jspin), mp2_env%scale_S, &
    1781              :                                           local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1782          502 :                                           my_local_j_aL, dimen_RI, 1.0_dp, Y_i_aP, my_B_size(ispin))
    1783              :          CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_B_size(jspin), dimen_RI, my_B_size(ispin), mp2_env%scale_S, &
    1784              :                                           local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1785          502 :                                           my_local_i_aL, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(jspin))
    1786              :       END IF
    1787              : 
    1788         1620 :       IF (para_env_sub%num_pe > 1) THEN
    1789          172 :          external_ab(1:my_B_size(ispin), 1:dimen_RI) => buffer_1D(1:INT(my_B_size(ispin), int_8)*dimen_RI)
    1790       189692 :          external_ab = 0.0_dp
    1791              : 
    1792          172 :          offset = INT(my_B_size(ispin), int_8)*dimen_RI
    1793              :       END IF
    1794              :       !
    1795         1792 :       DO proc_shift = 1, para_env_sub%num_pe - 1
    1796          172 :          proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    1797          172 :          proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    1798              : 
    1799          172 :          CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    1800          172 :          CALL get_group_dist(gd_B_virtual(ispin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    1801              : 
    1802          172 :          send_ab(1:send_B_size, 1:dimen_RI) => buffer_1D(offset + 1:offset + INT(dimen_RI, int_8)*send_B_size)
    1803       189692 :          send_ab = 0.0_dp
    1804         1964 :          IF (.NOT. alpha_beta) THEN
    1805              :             CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, dimen_RI, my_B_size(ispin), 1.0_dp, &
    1806              :                                              t_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
    1807          102 :                                              my_local_j_aL, dimen_RI, 0.0_dp, send_ab, send_B_size)
    1808          102 :             CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
    1809              : 
    1810       217544 :             Y_i_aP(:, :) = Y_i_aP + external_ab
    1811              : 
    1812              :          ELSE ! Alpha-beta case
    1813              :             ! Alpha-alpha part
    1814              :             CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, dimen_RI, my_B_size(jspin), mp2_env%scale_S, &
    1815              :                                              local_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
    1816           70 :                                              my_local_j_aL, dimen_RI, 0.0_dp, send_ab, send_B_size)
    1817           70 :             CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
    1818       161840 :             Y_i_aP(:, :) = Y_i_aP + external_ab
    1819              :          END IF
    1820              :       END DO
    1821              : 
    1822         1620 :       IF (alpha_beta) THEN
    1823              :          ! For beta-beta part (in alpha-beta case) we need a new parallel code
    1824          502 :          IF (para_env_sub%num_pe > 1) THEN
    1825           70 :             external_ab(1:my_B_size(jspin), 1:dimen_RI) => buffer_1D(1:INT(my_B_size(jspin), int_8)*dimen_RI)
    1826        88620 :             external_ab = 0.0_dp
    1827              : 
    1828           70 :             offset = INT(my_B_size(jspin), int_8)*dimen_RI
    1829              :          END IF
    1830          572 :          DO proc_shift = 1, para_env_sub%num_pe - 1
    1831           70 :             proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    1832           70 :             proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    1833              : 
    1834           70 :             CALL get_group_dist(gd_B_virtual(jspin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    1835           70 :             send_ab(1:send_B_size, 1:dimen_RI) => buffer_1D(offset + 1:offset + INT(dimen_RI, int_8)*send_B_size)
    1836        88620 :             send_ab = 0.0_dp
    1837              :             CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, dimen_RI, my_B_size(ispin), mp2_env%scale_S, &
    1838              :                                              local_ba(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
    1839           70 :                                              my_local_i_aL, dimen_RI, 0.0_dp, send_ab, send_B_size)
    1840           70 :             CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
    1841       177812 :             Y_j_aP(:, :) = Y_j_aP + external_ab
    1842              : 
    1843              :          END DO
    1844              : 
    1845              :          ! Here, we just use approximate bounds. For large systems virtual(ispin) is approx virtual(jspin), same for B_size
    1846          502 :          CALL dgemm_counter_stop(dgemm_counter, 3*virtual(ispin), dimen_RI, my_B_size(jspin))
    1847              :       ELSE
    1848         1118 :          CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), dimen_RI, my_B_size(ispin))
    1849              :       END IF
    1850              : 
    1851         1620 :       IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
    1852              :          ! Alpha-alpha, beta-beta and closed shell
    1853          823 :          CALL dgemm_counter_start(dgemm_counter)
    1854              :          CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_B_size(ispin), dimen_RI, my_B_size(ispin), 1.0_dp, &
    1855              :                                           t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1856          823 :                                           my_local_i_aL, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(ispin))
    1857          909 :          DO proc_shift = 1, para_env_sub%num_pe - 1
    1858           86 :             proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    1859           86 :             proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    1860              : 
    1861           86 :             CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    1862              : 
    1863           86 :             external_ab(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
    1864        86837 :             external_ab = 0.0_dp
    1865              : 
    1866              :             CALL para_env_sub%sendrecv(my_local_i_aL, proc_send, &
    1867           86 :                                        external_ab, proc_receive)
    1868              : 
    1869              :             ! Alpha-alpha, beta-beta and closed shell
    1870              :             CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_B_size(ispin), dimen_RI, rec_B_size, 1.0_dp, &
    1871              :                                              t_ab(rec_B_virtual_start:rec_B_virtual_end, :), rec_B_size, &
    1872          995 :                                              external_ab, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(ispin))
    1873              :          END DO
    1874              : 
    1875          823 :          CALL dgemm_counter_stop(dgemm_counter, my_B_size(ispin), dimen_RI, virtual(ispin))
    1876              :       END IF
    1877              : 
    1878         1620 :       CALL timestop(handle)
    1879         1620 :    END SUBROUTINE mp2_update_P_gamma
    1880              : 
    1881              : ! **************************************************************************************************
    1882              : !> \brief ...
    1883              : !> \param Gamma_P_ia ...
    1884              : !> \param ij_index ...
    1885              : !> \param my_B_size ...
    1886              : !> \param my_block_size ...
    1887              : !> \param my_group_L_size ...
    1888              : !> \param my_i ...
    1889              : !> \param my_ij_pairs ...
    1890              : !> \param ngroup ...
    1891              : !> \param num_integ_group ...
    1892              : !> \param integ_group_pos2color_sub ...
    1893              : !> \param num_ij_pairs ...
    1894              : !> \param ij_map ...
    1895              : !> \param ranges_info_array ...
    1896              : !> \param Y_i_aP ...
    1897              : !> \param comm_exchange ...
    1898              : !> \param sizes_array ...
    1899              : !> \param spin ...
    1900              : !> \param buffer_1D ...
    1901              : ! **************************************************************************************************
    1902         3234 :    SUBROUTINE mp2_redistribute_gamma(Gamma_P_ia, ij_index, my_B_size, &
    1903              :                                      my_block_size, my_group_L_size, my_i, my_ij_pairs, ngroup, &
    1904              :                                      num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
    1905         3234 :                                      ij_map, ranges_info_array, Y_i_aP, comm_exchange, &
    1906         3234 :                                      sizes_array, spin, buffer_1D)
    1907              : 
    1908              :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT)   :: Gamma_P_ia
    1909              :       INTEGER, INTENT(IN)                                :: ij_index, my_B_size, my_block_size, &
    1910              :                                                             my_group_L_size, my_i, my_ij_pairs, &
    1911              :                                                             ngroup, num_integ_group
    1912              :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN)     :: integ_group_pos2color_sub, num_ij_pairs
    1913              :       INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(IN)  :: ij_map
    1914              :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
    1915              :          INTENT(IN)                                      :: ranges_info_array
    1916              :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN)      :: Y_i_aP
    1917              :       TYPE(mp_comm_type), INTENT(IN)                     :: comm_exchange
    1918              :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN)     :: sizes_array
    1919              :       INTEGER, INTENT(IN)                                :: spin
    1920              :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET    :: buffer_1D
    1921              : 
    1922              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_redistribute_gamma'
    1923              : 
    1924              :       INTEGER :: end_point, handle, handle2, iiB, ij_counter_rec, irep, kkk, lll, Lstart_pos, &
    1925              :          proc_receive, proc_send, proc_shift, rec_i, rec_ij_index, send_L_size, start_point, tag
    1926              :       INTEGER(KIND=int_8)                                :: offset
    1927              :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :, :), &
    1928         3234 :          POINTER                                         :: BI_C_rec, BI_C_send
    1929              : 
    1930              : ! In alpha-beta case Y_i_aP_beta is sent as Y_j_aP
    1931              : 
    1932         3234 :       CALL timeset(routineN//"_comm2", handle)
    1933              : 
    1934         3234 :       tag = 43
    1935              : 
    1936         3234 :       IF (ij_index <= my_ij_pairs) THEN
    1937              :          ! somethig to send
    1938              :          ! start with myself
    1939         3216 :          CALL timeset(routineN//"_comm2_w", handle2)
    1940         9230 :          DO irep = 0, num_integ_group - 1
    1941         6014 :             Lstart_pos = ranges_info_array(1, irep, comm_exchange%mepos)
    1942         6014 :             start_point = ranges_info_array(3, irep, comm_exchange%mepos)
    1943         6014 :             end_point = ranges_info_array(4, irep, comm_exchange%mepos)
    1944              : !$OMP PARALLEL DO DEFAULT(NONE) &
    1945              : !$OMP             PRIVATE(kkk,lll,iiB) &
    1946              : !$OMP             SHARED(start_point,end_point,Lstart_pos,my_block_size,&
    1947         9230 : !$OMP                    Gamma_P_ia,my_i,my_B_size,Y_i_aP)
    1948              :             DO kkk = start_point, end_point
    1949              :                lll = kkk - start_point + Lstart_pos
    1950              :                DO iiB = 1, my_block_size
    1951              :                   Gamma_P_ia(1:my_B_size, my_i + iiB - 1, kkk) = &
    1952              :                      Gamma_P_ia(1:my_B_size, my_i + iiB - 1, kkk) + &
    1953              :                      Y_i_aP(1:my_B_size, lll, iiB)
    1954              :                END DO
    1955              :             END DO
    1956              : !$OMP END PARALLEL DO
    1957              :          END DO
    1958         3216 :          CALL timestop(handle2)
    1959              : 
    1960              :          ! Y_i_aP(my_B_size,dimen_RI,block_size)
    1961              : 
    1962         3314 :          DO proc_shift = 1, comm_exchange%num_pe - 1
    1963           98 :             proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
    1964           98 :             proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
    1965              : 
    1966           98 :             send_L_size = sizes_array(proc_send)
    1967              :             BI_C_send(1:my_B_size, 1:my_block_size, 1:send_L_size) => &
    1968           98 :                buffer_1D(1:INT(my_B_size, int_8)*my_block_size*send_L_size)
    1969              : 
    1970           98 :             offset = INT(my_B_size, int_8)*my_block_size*send_L_size
    1971              : 
    1972           98 :             CALL timeset(routineN//"_comm2_w", handle2)
    1973        48692 :             BI_C_send = 0.0_dp
    1974          196 :             DO irep = 0, num_integ_group - 1
    1975           98 :                Lstart_pos = ranges_info_array(1, irep, proc_send)
    1976           98 :                start_point = ranges_info_array(3, irep, proc_send)
    1977           98 :                end_point = ranges_info_array(4, irep, proc_send)
    1978              : !$OMP PARALLEL DO DEFAULT(NONE) &
    1979              : !$OMP             PRIVATE(kkk,lll,iiB) &
    1980              : !$OMP             SHARED(start_point,end_point,Lstart_pos,my_block_size,&
    1981          196 : !$OMP                    BI_C_send,my_B_size,Y_i_aP)
    1982              :                DO kkk = start_point, end_point
    1983              :                   lll = kkk - start_point + Lstart_pos
    1984              :                   DO iiB = 1, my_block_size
    1985              :                      BI_C_send(1:my_B_size, iiB, kkk) = Y_i_aP(1:my_B_size, lll, iiB)
    1986              :                   END DO
    1987              :                END DO
    1988              : !$OMP END PARALLEL DO
    1989              :             END DO
    1990           98 :             CALL timestop(handle2)
    1991              : 
    1992           98 :             rec_ij_index = num_ij_pairs(proc_receive)
    1993              : 
    1994         3412 :             IF (ij_index <= rec_ij_index) THEN
    1995              :                ! we know that proc_receive has something to send for us, let's see what
    1996              :                ij_counter_rec = &
    1997           80 :                   (ij_index - MIN(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)
    1998              : 
    1999           80 :                rec_i = ij_map(spin, ij_counter_rec)
    2000              : 
    2001              :                BI_C_rec(1:my_B_size, 1:my_block_size, 1:my_group_L_size) => &
    2002           80 :                   buffer_1D(offset + 1:offset + INT(my_B_size, int_8)*my_block_size*my_group_L_size)
    2003        44250 :                BI_C_rec = 0.0_dp
    2004              : 
    2005              :                CALL comm_exchange%sendrecv(BI_C_send, proc_send, &
    2006           80 :                                            BI_C_rec, proc_receive, tag)
    2007              : 
    2008           80 :                CALL timeset(routineN//"_comm2_w", handle2)
    2009          160 :                DO irep = 0, num_integ_group - 1
    2010           80 :                   start_point = ranges_info_array(3, irep, comm_exchange%mepos)
    2011           80 :                   end_point = ranges_info_array(4, irep, comm_exchange%mepos)
    2012              : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
    2013              : !$OMP                    SHARED(start_point,end_point,my_block_size,&
    2014          160 : !$OMP                           Gamma_P_ia,rec_i,iiB,my_B_size,BI_C_rec)
    2015              :                   Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
    2016              :                      Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
    2017              :                      BI_C_rec(1:my_B_size, :, start_point:end_point)
    2018              : !$OMP END PARALLEL WORKSHARE
    2019              :                END DO
    2020           80 :                CALL timestop(handle2)
    2021              : 
    2022              :             ELSE
    2023              :                ! we have something to send but nothing to receive
    2024           18 :                CALL comm_exchange%send(BI_C_send, proc_send, tag)
    2025              : 
    2026              :             END IF
    2027              : 
    2028              :          END DO
    2029              : 
    2030              :       ELSE
    2031              :          ! noting to send check if we have to receive
    2032           36 :          DO proc_shift = 1, comm_exchange%num_pe - 1
    2033           18 :             proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
    2034           18 :             proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
    2035           18 :             rec_ij_index = num_ij_pairs(proc_receive)
    2036              : 
    2037           36 :             IF (ij_index <= rec_ij_index) THEN
    2038              :                ! we know that proc_receive has something to send for us, let's see what
    2039              :                ij_counter_rec = &
    2040           18 :                   (ij_index - MIN(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)
    2041              : 
    2042           18 :                rec_i = ij_map(spin, ij_counter_rec)
    2043              : 
    2044              :                BI_C_rec(1:my_B_size, 1:my_block_size, 1:my_group_L_size) => &
    2045           18 :                   buffer_1D(1:INT(my_B_size, int_8)*my_block_size*my_group_L_size)
    2046              : 
    2047         4442 :                BI_C_rec = 0.0_dp
    2048              : 
    2049           18 :                CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
    2050              : 
    2051           18 :                CALL timeset(routineN//"_comm2_w", handle2)
    2052           36 :                DO irep = 0, num_integ_group - 1
    2053           18 :                   start_point = ranges_info_array(3, irep, comm_exchange%mepos)
    2054           18 :                   end_point = ranges_info_array(4, irep, comm_exchange%mepos)
    2055              : #if !defined(__INTEL_LLVM_COMPILER) || (20250000 <= __INTEL_LLVM_COMPILER)
    2056              : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
    2057              : !$OMP                    SHARED(start_point,end_point,my_block_size,&
    2058           36 : !$OMP                           Gamma_P_ia,rec_i,my_B_size,BI_C_rec)
    2059              : #endif
    2060              :                   Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
    2061              :                      Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
    2062              :                      BI_C_rec(1:my_B_size, :, start_point:end_point)
    2063              : #if !defined(__INTEL_LLVM_COMPILER) || (20250000 <= __INTEL_LLVM_COMPILER)
    2064              : !$OMP END PARALLEL WORKSHARE
    2065              : #endif
    2066              :                END DO
    2067           18 :                CALL timestop(handle2)
    2068              : 
    2069              :             END IF
    2070              :          END DO
    2071              : 
    2072              :       END IF
    2073         3234 :       CALL timestop(handle)
    2074              : 
    2075         3234 :    END SUBROUTINE mp2_redistribute_gamma
    2076              : 
    2077              : ! **************************************************************************************************
    2078              : !> \brief ...
    2079              : !> \param mp2_env ...
    2080              : !> \param Eigenval ...
    2081              : !> \param homo ...
    2082              : !> \param virtual ...
    2083              : !> \param open_shell ...
    2084              : !> \param beta_beta ...
    2085              : !> \param Bib_C ...
    2086              : !> \param unit_nr ...
    2087              : !> \param dimen_RI ...
    2088              : !> \param my_B_size ...
    2089              : !> \param ngroup ...
    2090              : !> \param my_group_L_size ...
    2091              : !> \param color_sub ...
    2092              : !> \param ranges_info_array ...
    2093              : !> \param comm_exchange ...
    2094              : !> \param para_env_sub ...
    2095              : !> \param para_env ...
    2096              : !> \param my_B_virtual_start ...
    2097              : !> \param my_B_virtual_end ...
    2098              : !> \param sizes_array ...
    2099              : !> \param gd_B_virtual ...
    2100              : !> \param integ_group_pos2color_sub ...
    2101              : !> \param dgemm_counter ...
    2102              : !> \param buffer_1D ...
    2103              : ! **************************************************************************************************
    2104          384 :    SUBROUTINE quasi_degenerate_P_ij(mp2_env, Eigenval, homo, virtual, open_shell, &
    2105          384 :                                     beta_beta, Bib_C, unit_nr, dimen_RI, &
    2106          384 :                                     my_B_size, ngroup, my_group_L_size, &
    2107              :                                     color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
    2108          384 :                                     my_B_virtual_start, my_B_virtual_end, sizes_array, gd_B_virtual, &
    2109          384 :                                     integ_group_pos2color_sub, dgemm_counter, buffer_1D)
    2110              :       TYPE(mp2_type)                                     :: mp2_env
    2111              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
    2112              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
    2113              :       LOGICAL, INTENT(IN)                                :: open_shell, beta_beta
    2114              :       TYPE(three_dim_real_array), DIMENSION(:), &
    2115              :          INTENT(IN)                                      :: BIb_C
    2116              :       INTEGER, INTENT(IN)                                :: unit_nr, dimen_RI
    2117              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: my_B_size
    2118              :       INTEGER, INTENT(IN)                                :: ngroup, my_group_L_size, color_sub
    2119              :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
    2120              :          INTENT(IN)                                      :: ranges_info_array
    2121              :       TYPE(mp_comm_type), INTENT(IN)                     :: comm_exchange
    2122              :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env_sub, para_env
    2123              :       INTEGER, DIMENSION(:), INTENT(IN)                  :: my_B_virtual_start, my_B_virtual_end
    2124              :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN)     :: sizes_array
    2125              :       TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
    2126              :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN)     :: integ_group_pos2color_sub
    2127              :       TYPE(dgemm_counter_type), INTENT(INOUT)            :: dgemm_counter
    2128              :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET    :: buffer_1D
    2129              : 
    2130              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'quasi_degenerate_P_ij'
    2131              : 
    2132              :       INTEGER :: a, a_global, b, b_global, block_size, decil, handle, handle2, ijk_counter, &
    2133              :          ijk_counter_send, ijk_index, ispin, kkB, kspin, max_block_size, max_ijk, my_i, my_ijk, &
    2134              :          my_j, my_k, my_last_k(2), my_virtual, nspins, proc_receive, proc_send, proc_shift, &
    2135              :          rec_B_size, rec_B_virtual_end, rec_B_virtual_start, rec_L_size, send_B_size, &
    2136              :          send_B_virtual_end, send_B_virtual_start, send_i, send_ijk_index, send_j, send_k, &
    2137              :          size_B_i, size_B_k, tag, tag2
    2138          384 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: num_ijk
    2139          384 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: ijk_map, send_last_k
    2140              :       LOGICAL                                            :: alpha_beta, do_recv_i, do_recv_j, &
    2141              :                                                             do_recv_k, do_send_i, do_send_j, &
    2142              :                                                             do_send_k
    2143              :       REAL(KIND=dp)                                      :: amp_fac, P_ij_elem, t_new, t_start
    2144              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
    2145          384 :          TARGET                                          :: local_ab, local_aL_i, local_aL_j, t_ab
    2146          384 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: local_aL_k
    2147          384 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: BI_C_rec, external_ab, external_aL
    2148          384 :       REAL(KIND=dp), DIMENSION(:, :, :), POINTER         :: BI_C_rec_3D
    2149              : 
    2150          384 :       CALL timeset(routineN//"_ij_sing", handle)
    2151              : 
    2152          384 :       tag = 44
    2153          384 :       tag2 = 45
    2154              : 
    2155          384 :       nspins = SIZE(BIb_C)
    2156          384 :       alpha_beta = (nspins == 2)
    2157              : 
    2158              :       ! Set amplitude factor
    2159          384 :       amp_fac = mp2_env%scale_S + mp2_env%scale_T
    2160          384 :       IF (open_shell) amp_fac = mp2_env%scale_T
    2161              : 
    2162          794 :       ALLOCATE (send_last_k(2, comm_exchange%num_pe - 1))
    2163              : 
    2164              :       ! Loop(s) over orbital triplets
    2165          846 :       DO ispin = 1, nspins
    2166          462 :          size_B_i = my_B_size(ispin)
    2167          462 :          IF (ispin == 1 .AND. alpha_beta) THEN
    2168              :             kspin = 2
    2169              :          ELSE
    2170          384 :             kspin = 1
    2171              :          END IF
    2172          462 :          size_B_k = my_B_size(kspin)
    2173              : 
    2174              :          ! Find the number of quasi-degenerate orbitals and orbital triplets
    2175              : 
    2176              :          CALL Find_quasi_degenerate_ij(my_ijk, homo(ispin), homo(kspin), Eigenval(:, ispin), mp2_env, ijk_map, unit_nr, ngroup, &
    2177              :                                        .NOT. beta_beta .AND. ispin /= 2, comm_exchange, num_ijk, max_ijk, color_sub, &
    2178          618 :                                        SIZE(buffer_1D), my_group_L_size, size_B_k, para_env, virtual(ispin), size_B_i)
    2179              : 
    2180          462 :          my_virtual = virtual(ispin)
    2181          462 :          IF (SIZE(ijk_map, 2) > 0) THEN
    2182           98 :             max_block_size = ijk_map(4, 1)
    2183              :          ELSE
    2184              :             max_block_size = 1
    2185              :          END IF
    2186              : 
    2187         1848 :          ALLOCATE (local_aL_i(dimen_RI, size_B_i))
    2188         1386 :          ALLOCATE (local_aL_j(dimen_RI, size_B_i))
    2189         2310 :          ALLOCATE (local_aL_k(dimen_RI, size_B_k, max_block_size))
    2190         1848 :          ALLOCATE (t_ab(my_virtual, size_B_k))
    2191              : 
    2192         1386 :          my_last_k = -1
    2193          552 :          send_last_k = -1
    2194              : 
    2195          462 :          t_start = m_walltime()
    2196          618 :          DO ijk_index = 1, max_ijk
    2197              : 
    2198              :             ! Prediction is unreliable if we are in the first step of the loop
    2199          156 :             IF (unit_nr > 0 .AND. ijk_index > 1) THEN
    2200           18 :                decil = ijk_index*10/max_ijk
    2201           18 :                IF (decil /= (ijk_index - 1)*10/max_ijk) THEN
    2202           18 :                   t_new = m_walltime()
    2203           18 :                   t_new = (t_new - t_start)/60.0_dp*(max_ijk - ijk_index + 1)/(ijk_index - 1)
    2204              :                   WRITE (unit_nr, FMT="(T3,A)") "Percentage of finished loop: "// &
    2205           18 :                      cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
    2206           18 :                   CALL m_flush(unit_nr)
    2207              :                END IF
    2208              :             END IF
    2209              : 
    2210          618 :             IF (ijk_index <= my_ijk) THEN
    2211              :                ! work to be done
    2212          154 :                ijk_counter = (ijk_index - MIN(1, color_sub))*ngroup + color_sub
    2213          154 :                my_i = ijk_map(1, ijk_counter)
    2214          154 :                my_j = ijk_map(2, ijk_counter)
    2215          154 :                my_k = ijk_map(3, ijk_counter)
    2216          154 :                block_size = ijk_map(4, ijk_counter)
    2217              : 
    2218          154 :                do_recv_i = (ispin /= kspin) .OR. my_i < my_k .OR. my_i > my_k + block_size - 1
    2219          154 :                do_recv_j = (ispin /= kspin) .OR. my_j < my_k .OR. my_j > my_k + block_size - 1
    2220          154 :                do_recv_k = my_k /= my_last_k(1) .OR. my_k + block_size - 1 /= my_last_k(2)
    2221          154 :                my_last_k(1) = my_k
    2222          154 :                my_last_k(2) = my_k + block_size - 1
    2223              : 
    2224       131054 :                local_aL_i = 0.0_dp
    2225          154 :                IF (do_recv_i) THEN
    2226              :                   CALL fill_local_i_aL_2D(local_al_i, ranges_info_array(:, :, comm_exchange%mepos), &
    2227          125 :                                           BIb_C(ispin)%array(:, :, my_i))
    2228              :                END IF
    2229              : 
    2230       131054 :                local_aL_j = 0.0_dp
    2231          154 :                IF (do_recv_j) THEN
    2232              :                   CALL fill_local_i_aL_2D(local_al_j, ranges_info_array(:, :, comm_exchange%mepos), &
    2233          125 :                                           BIb_C(ispin)%array(:, :, my_j))
    2234              :                END IF
    2235              : 
    2236          154 :                IF (do_recv_k) THEN
    2237       224533 :                   local_aL_k = 0.0_dp
    2238              :                   CALL fill_local_i_aL(local_aL_k(:, :, 1:block_size), ranges_info_array(:, :, comm_exchange%mepos), &
    2239          146 :                                        BIb_C(kspin)%array(:, :, my_k:my_k + block_size - 1))
    2240              :                END IF
    2241              : 
    2242          154 :                CALL timeset(routineN//"_comm", handle2)
    2243          164 :                DO proc_shift = 1, comm_exchange%num_pe - 1
    2244           10 :                   proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
    2245           10 :                   proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
    2246              : 
    2247           10 :                   send_ijk_index = num_ijk(proc_send)
    2248              : 
    2249           10 :                   rec_L_size = sizes_array(proc_receive)
    2250           10 :                   BI_C_rec(1:rec_L_size, 1:size_B_i) => buffer_1D(1:INT(rec_L_size, KIND=int_8)*size_B_i)
    2251              : 
    2252           10 :                   do_send_i = .FALSE.
    2253           10 :                   do_send_j = .FALSE.
    2254           10 :                   do_send_k = .FALSE.
    2255           10 :                   IF (ijk_index <= send_ijk_index) THEN
    2256              :                      ! something to send
    2257              :                      ijk_counter_send = (ijk_index - MIN(1, integ_group_pos2color_sub(proc_send)))* &
    2258            8 :                                         ngroup + integ_group_pos2color_sub(proc_send)
    2259            8 :                      send_i = ijk_map(1, ijk_counter_send)
    2260            8 :                      send_j = ijk_map(2, ijk_counter_send)
    2261            8 :                      send_k = ijk_map(3, ijk_counter_send)
    2262              : 
    2263            8 :                      do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
    2264            8 :                      do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
    2265            8 :                      do_send_k = send_k /= send_last_k(1, proc_shift) .OR. send_k + block_size - 1 /= send_last_k(2, proc_shift)
    2266            8 :                      send_last_k(1, proc_shift) = send_k
    2267            8 :                      send_last_k(2, proc_shift) = send_k + block_size - 1
    2268              :                   END IF
    2269              : 
    2270              :                   ! occupied i
    2271          722 :                   BI_C_rec = 0.0_dp
    2272           10 :                   IF (do_send_i) THEN
    2273            6 :                   IF (do_recv_i) THEN
    2274              :                      CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_i), proc_send, &
    2275          288 :                                                  BI_C_rec, proc_receive, tag)
    2276              :                   ELSE
    2277            2 :                      CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i), proc_send, tag)
    2278              :                   END IF
    2279            4 :                   ELSE IF (do_recv_i) THEN
    2280          580 :                   CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
    2281              :                   END IF
    2282           10 :                   IF (do_recv_i) THEN
    2283            8 :                      CALL fill_local_i_aL_2D(local_al_i, ranges_info_array(:, :, proc_receive), BI_C_rec)
    2284              :                   END IF
    2285              : 
    2286              :                   ! occupied j
    2287          722 :                   BI_C_rec = 0.0_dp
    2288           10 :                   IF (do_send_j) THEN
    2289            8 :                   IF (do_recv_j) THEN
    2290              :                      CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_j), proc_send, &
    2291          576 :                                                  BI_C_rec, proc_receive, tag)
    2292              :                   ELSE
    2293            0 :                      CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_j), proc_send, tag)
    2294              :                   END IF
    2295            2 :                   ELSE IF (do_recv_j) THEN
    2296            0 :                   CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
    2297              :                   END IF
    2298            8 :                   IF (do_recv_j) THEN
    2299            8 :                      CALL fill_local_i_aL_2D(local_al_j, ranges_info_array(:, :, proc_receive), BI_C_rec)
    2300              :                   END IF
    2301              : 
    2302              :                   ! occupied k
    2303              :                   BI_C_rec_3D(1:rec_L_size, 1:size_B_k, 1:block_size) => &
    2304           10 :                      buffer_1D(1:INT(rec_L_size, KIND=int_8)*size_B_k*block_size)
    2305           10 :                   IF (do_send_k) THEN
    2306            8 :                   IF (do_recv_k) THEN
    2307              :                      CALL comm_exchange%sendrecv(BIb_C(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, &
    2308          726 :                                                  BI_C_rec_3D, proc_receive, tag)
    2309              :                   ELSE
    2310            0 :                      CALL comm_exchange%send(BI_C_rec, proc_receive, tag)
    2311              :                   END IF
    2312            2 :                   ELSE IF (do_recv_k) THEN
    2313          294 :                   CALL comm_exchange%recv(BI_C_rec_3D, proc_receive, tag)
    2314              :                   END IF
    2315          164 :                   IF (do_recv_k) THEN
    2316           10 :                      CALL fill_local_i_aL(local_al_k(:, :, 1:block_size), ranges_info_array(:, :, proc_receive), BI_C_rec_3D)
    2317              :                   END IF
    2318              :                END DO
    2319              : 
    2320        20868 :                IF (.NOT. do_recv_i) local_aL_i(:, :) = local_aL_k(:, :, my_i - my_k + 1)
    2321        20868 :                IF (.NOT. do_recv_j) local_aL_j(:, :) = local_aL_k(:, :, my_j - my_k + 1)
    2322          154 :                CALL timestop(handle2)
    2323              : 
    2324              :                ! expand integrals
    2325          376 :                DO kkB = 1, block_size
    2326          222 :                   CALL timeset(routineN//"_exp_ik", handle2)
    2327          222 :                   CALL dgemm_counter_start(dgemm_counter)
    2328          666 :                   ALLOCATE (local_ab(my_virtual, size_B_k))
    2329        32638 :                   local_ab = 0.0_dp
    2330              :                   CALL mp2_env%local_gemm_ctx%gemm('T', 'N', size_B_i, size_B_k, dimen_RI, 1.0_dp, &
    2331              :                                                    local_aL_i, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
    2332          222 :                                           0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), 1:size_B_k), size_B_i)
    2333          222 :                   DO proc_shift = 1, para_env_sub%num_pe - 1
    2334            0 :                      proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2335            0 :                      proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2336              : 
    2337            0 :                      CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    2338              : 
    2339            0 :                      external_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, KIND=int_8)*rec_B_size)
    2340              : 
    2341              :                      CALL comm_exchange%sendrecv(local_aL_i, proc_send, &
    2342            0 :                                                  external_aL, proc_receive, tag)
    2343              : 
    2344              :                      CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_B_size, size_B_k, dimen_RI, 1.0_dp, &
    2345              :                                                       external_aL, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
    2346          222 :                                                     0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:size_B_k), rec_B_size)
    2347              :                   END DO
    2348          222 :                   CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_B_k, dimen_RI)
    2349          222 :                   CALL timestop(handle2)
    2350              : 
    2351              :                   ! Amplitudes
    2352          222 :                   CALL timeset(routineN//"_tab", handle2)
    2353        32638 :                   t_ab = 0.0_dp
    2354              :                   ! Alpha-alpha, beta-beta and closed shell
    2355          222 :                   IF (.NOT. alpha_beta) THEN
    2356         1156 :                      DO b = 1, size_B_k
    2357         1038 :                         b_global = b + my_B_virtual_start(1) - 1
    2358        17250 :                         DO a = 1, my_B_size(1)
    2359        16094 :                            a_global = a + my_B_virtual_start(1) - 1
    2360              :                            t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
    2361              :                                                (Eigenval(my_i, 1) + Eigenval(my_k + kkB - 1, 1) &
    2362        17132 :                                                 - Eigenval(homo(1) + a_global, 1) - Eigenval(homo(1) + b_global, 1))
    2363              :                         END DO
    2364              :                      END DO
    2365              :                   ELSE
    2366         1030 :                      DO b = 1, size_B_k
    2367          926 :                         b_global = b + my_B_virtual_start(kspin) - 1
    2368        15388 :                         DO a = 1, my_B_size(ispin)
    2369        14358 :                            a_global = a + my_B_virtual_start(ispin) - 1
    2370              :                            t_ab(a_global, b) = mp2_env%scale_S*local_ab(a_global, b)/ &
    2371              :                                                (Eigenval(my_i, ispin) + Eigenval(my_k + kkB - 1, kspin) &
    2372        15284 :                                                 - Eigenval(homo(ispin) + a_global, ispin) - Eigenval(homo(kspin) + b_global, kspin))
    2373              :                         END DO
    2374              :                      END DO
    2375              :                   END IF
    2376              : 
    2377          222 :                   IF (.NOT. alpha_beta) THEN
    2378          118 :                      DO proc_shift = 1, para_env_sub%num_pe - 1
    2379            0 :                         proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2380            0 :                         proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2381            0 :                         CALL get_group_dist(gd_B_virtual(1), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    2382            0 :                         CALL get_group_dist(gd_B_virtual(1), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    2383              : 
    2384            0 :                         external_ab(1:size_B_i, 1:rec_B_size) => buffer_1D(1:INT(size_B_i, KIND=int_8)*rec_B_size)
    2385              :                         CALL para_env_sub%sendrecv(local_ab(send_B_virtual_start:send_B_virtual_end, 1:size_B_k), proc_send, &
    2386            0 :                                                    external_ab(1:size_B_i, 1:rec_B_size), proc_receive, tag)
    2387              : 
    2388          118 :                         DO b = 1, my_B_size(1)
    2389            0 :                            b_global = b + my_B_virtual_start(1) - 1
    2390            0 :                            DO a = 1, rec_B_size
    2391            0 :                               a_global = a + rec_B_virtual_start - 1
    2392              :                               t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
    2393              :                                                   (Eigenval(my_i, 1) + Eigenval(my_k + kkB - 1, 1) &
    2394            0 :                                                    - Eigenval(homo(1) + a_global, 1) - Eigenval(homo(1) + b_global, 1))
    2395              :                            END DO
    2396              :                         END DO
    2397              :                      END DO
    2398              :                   END IF
    2399          222 :                   CALL timestop(handle2)
    2400              : 
    2401              :                   ! Expand the second set of integrals
    2402          222 :                   CALL timeset(routineN//"_exp_jk", handle2)
    2403        32638 :                   local_ab = 0.0_dp
    2404          222 :                   CALL dgemm_counter_start(dgemm_counter)
    2405              :                   CALL mp2_env%local_gemm_ctx%gemm('T', 'N', size_B_i, size_B_k, dimen_RI, 1.0_dp, &
    2406              :                                                    local_aL_j, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
    2407          222 :                                           0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), 1:size_B_k), size_B_i)
    2408          222 :                   DO proc_shift = 1, para_env_sub%num_pe - 1
    2409            0 :                      proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2410            0 :                      proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2411              : 
    2412            0 :                      CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    2413              : 
    2414            0 :                      external_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, KIND=int_8)*rec_B_size)
    2415              : 
    2416              :                      CALL comm_exchange%sendrecv(local_aL_j, proc_send, &
    2417            0 :                                                  external_aL, proc_receive, tag)
    2418              :                      CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_B_size, size_B_k, dimen_RI, 1.0_dp, &
    2419              :                                                       external_aL, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
    2420          222 :                                                     0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:size_B_k), rec_B_size)
    2421              :                   END DO
    2422          222 :                   CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_B_k, dimen_RI)
    2423          222 :                   CALL timestop(handle2)
    2424              : 
    2425          222 :                   CALL timeset(routineN//"_Pij", handle2)
    2426         2186 :                   DO b = 1, size_B_k
    2427         1964 :                      b_global = b + my_B_virtual_start(kspin) - 1
    2428        32638 :                      DO a = 1, my_B_size(ispin)
    2429        30452 :                         a_global = a + my_B_virtual_start(ispin) - 1
    2430              :                         local_ab(a_global, b) = &
    2431              :                            local_ab(a_global, b)/(Eigenval(my_j, ispin) + Eigenval(my_k + kkB - 1, kspin) &
    2432        32416 :                                                 - Eigenval(homo(ispin) + a_global, ispin) - Eigenval(homo(kspin) + b_global, kspin))
    2433              :                      END DO
    2434              :                   END DO
    2435              :                   !
    2436        32638 :                   P_ij_elem = SUM(local_ab*t_ab)
    2437          222 :                   DEALLOCATE (local_ab)
    2438          222 :                   IF ((.NOT. open_shell) .AND. (.NOT. alpha_beta)) THEN
    2439            4 :                      P_ij_elem = P_ij_elem*2.0_dp
    2440              :                   END IF
    2441          222 :                   IF (beta_beta) THEN
    2442           34 :                      mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) - P_ij_elem
    2443           34 :                      mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) - P_ij_elem
    2444              :                   ELSE
    2445          188 :                      mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) - P_ij_elem
    2446          188 :                      mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) - P_ij_elem
    2447              :                   END IF
    2448         1264 :                   CALL timestop(handle2)
    2449              :                END DO
    2450              :             ELSE
    2451            2 :                CALL timeset(routineN//"_comm", handle2)
    2452              :                ! no work to be done, possible messeges to be exchanged
    2453            4 :                DO proc_shift = 1, comm_exchange%num_pe - 1
    2454            2 :                   proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
    2455            2 :                   proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
    2456              : 
    2457            2 :                   send_ijk_index = num_ijk(proc_send)
    2458              : 
    2459            4 :                   IF (ijk_index <= send_ijk_index) THEN
    2460              :                      ! somethig to send
    2461              :                      ijk_counter_send = (ijk_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
    2462            2 :                                         integ_group_pos2color_sub(proc_send)
    2463            2 :                      send_i = ijk_map(1, ijk_counter_send)
    2464            2 :                      send_j = ijk_map(2, ijk_counter_send)
    2465            2 :                      send_k = ijk_map(3, ijk_counter_send)
    2466            2 :                      block_size = ijk_map(4, ijk_counter_send)
    2467              : 
    2468            2 :                      do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
    2469            2 :                      do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
    2470              :                      ! occupied i
    2471            2 :                      IF (do_send_i) THEN
    2472            2 :                         CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i), proc_send, tag)
    2473              :                      END IF
    2474              :                      ! occupied j
    2475            2 :                      IF (do_send_j) THEN
    2476            0 :                         CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_j), proc_send, tag)
    2477              :                      END IF
    2478              :                      ! occupied k
    2479            2 :                      CALL comm_exchange%send(BIb_C(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, tag)
    2480              :                   END IF
    2481              : 
    2482              :                END DO ! proc loop
    2483            2 :                CALL timestop(handle2)
    2484              :             END IF
    2485              :          END DO ! ijk_index loop
    2486          462 :          DEALLOCATE (local_aL_i)
    2487          462 :          DEALLOCATE (local_aL_j)
    2488          462 :          DEALLOCATE (local_aL_k)
    2489          462 :          DEALLOCATE (t_ab)
    2490          846 :          DEALLOCATE (ijk_map)
    2491              :       END DO ! over number of loops (ispin)
    2492          384 :       CALL timestop(handle)
    2493              : 
    2494          768 :    END SUBROUTINE Quasi_degenerate_P_ij
    2495              : 
    2496              : ! **************************************************************************************************
    2497              : !> \brief ...
    2498              : !> \param my_ijk ...
    2499              : !> \param homo ...
    2500              : !> \param homo_beta ...
    2501              : !> \param Eigenval ...
    2502              : !> \param mp2_env ...
    2503              : !> \param ijk_map ...
    2504              : !> \param unit_nr ...
    2505              : !> \param ngroup ...
    2506              : !> \param do_print_alpha ...
    2507              : !> \param comm_exchange ...
    2508              : !> \param num_ijk ...
    2509              : !> \param max_ijk ...
    2510              : !> \param color_sub ...
    2511              : !> \param buffer_size ...
    2512              : !> \param my_group_L_size ...
    2513              : !> \param B_size_k ...
    2514              : !> \param para_env ...
    2515              : !> \param virtual ...
    2516              : !> \param B_size_i ...
    2517              : ! **************************************************************************************************
    2518          462 :    SUBROUTINE Find_quasi_degenerate_ij(my_ijk, homo, homo_beta, Eigenval, mp2_env, ijk_map, unit_nr, ngroup, &
    2519              :                                        do_print_alpha, comm_exchange, num_ijk, max_ijk, color_sub, &
    2520              :                                        buffer_size, my_group_L_size, B_size_k, para_env, virtual, B_size_i)
    2521              : 
    2522              :       INTEGER, INTENT(OUT)                               :: my_ijk
    2523              :       INTEGER, INTENT(IN)                                :: homo, homo_beta
    2524              :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
    2525              :       TYPE(mp2_type), INTENT(IN)                         :: mp2_env
    2526              :       INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ijk_map
    2527              :       INTEGER, INTENT(IN)                                :: unit_nr, ngroup
    2528              :       LOGICAL, INTENT(IN)                                :: do_print_alpha
    2529              :       TYPE(mp_comm_type), INTENT(IN)                     :: comm_exchange
    2530              :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT)    :: num_ijk
    2531              :       INTEGER, INTENT(OUT)                               :: max_ijk
    2532              :       INTEGER, INTENT(IN)                                :: color_sub, buffer_size, my_group_L_size, &
    2533              :                                                             B_size_k
    2534              :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
    2535              :       INTEGER, INTENT(IN)                                :: virtual, B_size_i
    2536              : 
    2537              :       INTEGER :: block_size, communication_steps, communication_volume, iib, ij_counter, &
    2538              :          ijk_counter, jjb, kkb, max_block_size, max_num_k_blocks, min_communication_volume, &
    2539              :          my_steps, num_k_blocks, num_sing_ij, total_ijk
    2540              :       INTEGER(KIND=int_8)                                :: mem
    2541          462 :       LOGICAL, ALLOCATABLE, DIMENSION(:, :)              :: ijk_marker
    2542              : 
    2543         1386 :       ALLOCATE (num_ijk(0:comm_exchange%num_pe - 1))
    2544              : 
    2545          462 :       num_sing_ij = 0
    2546         2092 :       DO iiB = 1, homo
    2547              :          ! diagonal elements already updated
    2548         4368 :          DO jjB = iiB + 1, homo
    2549         2276 :             IF (ABS(Eigenval(jjB) - Eigenval(iiB)) < mp2_env%ri_grad%eps_canonical) &
    2550         1744 :                num_sing_ij = num_sing_ij + 1
    2551              :          END DO
    2552              :       END DO
    2553              : 
    2554          462 :       IF (unit_nr > 0) THEN
    2555          231 :       IF (do_print_alpha) THEN
    2556              :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    2557          153 :             "MO_INFO| Number of ij pairs below EPS_CANONICAL:", num_sing_ij
    2558              :       ELSE
    2559              :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    2560           78 :             "MO_INFO| Number of ij pairs (spin beta) below EPS_CANONICAL:", num_sing_ij
    2561              :       END IF
    2562              :       END IF
    2563              : 
    2564              :       ! Determine the block size, first guess: use available buffer
    2565          462 :       max_block_size = buffer_size/(my_group_L_size*B_size_k)
    2566              : 
    2567              :       ! Second limit: memory
    2568          462 :       CALL m_memory(mem)
    2569              :       ! Convert to number of doubles
    2570          462 :       mem = mem/8
    2571              :       ! Remove local_ab (2x) and local_aL_i (2x)
    2572          462 :       mem = mem - 2_int_8*(virtual*B_size_k + B_size_i*my_group_L_size)
    2573          462 :       max_block_size = MIN(max_block_size, MAX(1, INT(mem/(my_group_L_size*B_size_k), KIND(max_block_size))))
    2574              : 
    2575              :       ! Exchange the limit
    2576          462 :       CALL para_env%min(max_block_size)
    2577              : 
    2578              :       ! Find now the block size which minimizes the communication volume and then the number of communication steps
    2579          462 :       block_size = 1
    2580          462 :       min_communication_volume = 3*homo_beta*num_sing_ij
    2581          462 :       communication_steps = 3*homo_beta*num_sing_ij
    2582         1174 :       DO iiB = max_block_size, 2, -1
    2583          712 :          max_num_k_blocks = homo_beta/iiB*num_sing_ij
    2584          712 :          num_k_blocks = max_num_k_blocks - MOD(max_num_k_blocks, ngroup)
    2585          712 :          communication_volume = num_k_blocks*(2 + iiB) + 3*(homo_beta*num_sing_ij - iiB*num_k_blocks)
    2586          712 :          my_steps = num_k_blocks + homo_beta*num_sing_ij - iiB*num_k_blocks
    2587         1174 :          IF (communication_volume < min_communication_volume) THEN
    2588           52 :             block_size = iiB
    2589           52 :             min_communication_volume = communication_volume
    2590           52 :             communication_steps = my_steps
    2591          660 :          ELSE IF (communication_volume == min_communication_volume .AND. my_steps < communication_steps) THEN
    2592           58 :             block_size = iiB
    2593           58 :             communication_steps = my_steps
    2594              :          END IF
    2595              :       END DO
    2596              : 
    2597          462 :       IF (unit_nr > 0) THEN
    2598              :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    2599          231 :             "MO_INFO| Block size:", block_size
    2600          231 :          CALL m_flush(unit_nr)
    2601              :       END IF
    2602              : 
    2603              :       ! Calculate number of large blocks
    2604          462 :       max_num_k_blocks = homo_beta/block_size*num_sing_ij
    2605          462 :       num_k_blocks = max_num_k_blocks - MOD(max_num_k_blocks, ngroup)
    2606              : 
    2607          462 :       total_ijk = num_k_blocks + homo_beta*num_sing_ij - num_k_blocks*block_size
    2608         1022 :       ALLOCATE (ijk_map(4, total_ijk))
    2609         2002 :       ijk_map = 0
    2610         1484 :       ALLOCATE (ijk_marker(homo_beta, num_sing_ij))
    2611         1020 :       ijk_marker = .TRUE.
    2612              : 
    2613          462 :       my_ijk = 0
    2614          462 :       ijk_counter = 0
    2615          462 :       ij_counter = 0
    2616         2092 :       DO iiB = 1, homo
    2617              :          ! diagonal elements already updated
    2618         4368 :          DO jjB = iiB + 1, homo
    2619         2276 :             IF (ABS(Eigenval(jjB) - Eigenval(iiB)) >= mp2_env%ri_grad%eps_canonical) CYCLE
    2620          114 :             ij_counter = ij_counter + 1
    2621         1880 :             DO kkB = 1, homo_beta - MOD(homo_beta, block_size), block_size
    2622          172 :                IF (ijk_counter + 1 > num_k_blocks) EXIT
    2623          136 :                ijk_counter = ijk_counter + 1
    2624          408 :                ijk_marker(kkB:kkB + block_size - 1, ij_counter) = .FALSE.
    2625          136 :                ijk_map(1, ijk_counter) = iiB
    2626          136 :                ijk_map(2, ijk_counter) = jjB
    2627          136 :                ijk_map(3, ijk_counter) = kkB
    2628          136 :                ijk_map(4, ijk_counter) = block_size
    2629         2412 :                IF (MOD(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
    2630              :             END DO
    2631              :          END DO
    2632              :       END DO
    2633              :       ij_counter = 0
    2634         2092 :       DO iiB = 1, homo
    2635              :          ! diagonal elements already updated
    2636         4368 :          DO jjB = iiB + 1, homo
    2637         2276 :             IF (ABS(Eigenval(jjB) - Eigenval(iiB)) >= mp2_env%ri_grad%eps_canonical) CYCLE
    2638          114 :             ij_counter = ij_counter + 1
    2639         2188 :             DO kkB = 1, homo_beta
    2640         2720 :                IF (ijk_marker(kkB, ij_counter)) THEN
    2641          172 :                   ijk_counter = ijk_counter + 1
    2642          172 :                   ijk_map(1, ijk_counter) = iiB
    2643          172 :                   ijk_map(2, ijk_counter) = jjB
    2644          172 :                   ijk_map(3, ijk_counter) = kkB
    2645          172 :                   ijk_map(4, ijk_counter) = 1
    2646          172 :                   IF (MOD(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
    2647              :                END IF
    2648              :             END DO
    2649              :          END DO
    2650              :       END DO
    2651              : 
    2652          462 :       DEALLOCATE (ijk_marker)
    2653              : 
    2654          462 :       CALL comm_exchange%allgather(my_ijk, num_ijk)
    2655          954 :       max_ijk = MAXVAL(num_ijk)
    2656              : 
    2657          462 :    END SUBROUTINE Find_quasi_degenerate_ij
    2658              : 
    2659              : END MODULE mp2_ri_gpw
        

Generated by: LCOV version 2.0-1