LCOV - code coverage report
Current view: top level - src - submatrix_dissection.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:42dac4a) Lines: 87.2 % 406 354
Test Date: 2025-07-25 12:55:17 Functions: 80.0 % 10 8

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \note
      10              : !> This module implements a modified version of the submatrix method, proposed in
      11              : !>   M. Lass, S. Mohr, H. Wiebeler, T. Kuehne, C. Plessl
      12              : !>   A Massively Parallel Algorithm for the Approximate Calculation of Inverse p-th Roots of Large Sparse Matrices
      13              : !>   Proc. Platform for Advanced Scientific Computing (PASC) Conference, ACM, 2018
      14              : !>
      15              : !> The method is extended to minimize the required data transfers and floating-point operations under the assumption that non-zero
      16              : !> blocks of the matrix are close to its diagonal.
      17              : !>
      18              : !> Submatrices can be constructed not for single columns of the matrix but for a set of w consecutive submatrices. The underlying
      19              : !> assumption is that columns next to each other have relatively similar occupation patterns, i.e., constructing a submatrix from
      20              : !> columns x and x+1 should not lead to a much bigger submatrix than contructing it only from column x.
      21              : !>
      22              : !> The construction of the submatrices requires communication between all ranks. It is crucial to implement this communication as
      23              : !> efficient as possible, i.e., data should only ever be transferred once between two ranks and message sizes need to be
      24              : !> sufficiently large to utilize the communication bandwidth. To achieve this, we communicate the required blocks for all
      25              : !> submatrices at once and copy them to large buffers before transmitting them via MPI.
      26              : !>
      27              : !> Note on multi-threading:
      28              : !> Submatrices can be constructed and processed in parallel by multiple threads. However, generate_submatrix, get_sm_ids_for_rank
      29              : !> and copy_resultcol are the only thread-safe routines in this module. All other routines involve MPI communication or operate on
      30              : !> common, non-protected data and are hence not thread-safe.
      31              : !>
      32              : !> TODO:
      33              : !> * generic types (for now only dp supported)
      34              : !> * optimization of threaded initialization
      35              : !> * sanity checks at the beginning of all methods
      36              : !>
      37              : !> \author Michael Lass
      38              : ! **************************************************************************************************
      39              : 
      40              : MODULE submatrix_dissection
      41              : 
      42              :    USE bibliography,                    ONLY: Lass2018,&
      43              :                                               cite_reference
      44              :    USE cp_dbcsr_api,                    ONLY: &
      45              :         dbcsr_distribution_get, dbcsr_distribution_type, dbcsr_finalize, dbcsr_get_block_p, &
      46              :         dbcsr_get_info, dbcsr_get_stored_coordinates, dbcsr_iterator_blocks_left, &
      47              :         dbcsr_iterator_next_block, dbcsr_iterator_readonly_start, dbcsr_iterator_stop, &
      48              :         dbcsr_iterator_type, dbcsr_put_block, dbcsr_type
      49              :    USE kinds,                           ONLY: dp
      50              :    USE message_passing,                 ONLY: mp_comm_type
      51              :    USE submatrix_types,                 ONLY: buffer_type,&
      52              :                                               bufptr_type,&
      53              :                                               intBuffer_type,&
      54              :                                               set_type,&
      55              :                                               setarray_type
      56              : 
      57              : !$ USE omp_lib, ONLY: omp_get_max_threads, omp_get_thread_num
      58              : 
      59              : #include "./base/base_uses.f90"
      60              : 
      61              :    IMPLICIT NONE
      62              :    PRIVATE
      63              : 
      64              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'submatrix_dissection'
      65              : 
      66              :    TYPE, PUBLIC :: submatrix_dissection_type
      67              :       TYPE(dbcsr_type)                                :: dbcsr_mat
      68              :       TYPE(dbcsr_distribution_type)                   :: dist
      69              :       LOGICAL                                         :: initialized = .FALSE.
      70              :       TYPE(mp_comm_type)                              :: group = mp_comm_type()
      71              :       INTEGER                                         :: numnodes = -1, myrank = -1, nblkcols = -1, &
      72              :                                                          nblkrows = -1, nblks = -1, local_blocks = -1, &
      73              :                                                          cols_per_sm = -1, number_of_submatrices = -1
      74              :       INTEGER, DIMENSION(:), POINTER                  :: row_blk_size => NULL(), col_blk_size => NULL()
      75              :       INTEGER, DIMENSION(:), ALLOCATABLE              :: coo_cols, coo_rows, coo_col_offsets, coo_cols_local, coo_rows_local, &
      76              :                                                          coo_col_offsets_local, submatrix_owners, submatrix_sizes
      77              :       TYPE(buffer_type), DIMENSION(:), ALLOCATABLE    :: recvbufs, result_sendbufs ! Indexing starts with 0 to match rank ids!
      78              :       TYPE(set_type), DIMENSION(:), ALLOCATABLE       :: result_blocks_for_rank, result_blocks_from_rank
      79              :       TYPE(bufptr_type), DIMENSION(:), ALLOCATABLE    :: coo_dptr
      80              :       TYPE(intBuffer_type), DIMENSION(:), ALLOCATABLE :: result_blocks_for_rank_buf_offsets
      81              :    CONTAINS
      82              :       PROCEDURE :: init => submatrix_dissection_init
      83              :       PROCEDURE :: final => submatrix_dissection_final
      84              :       PROCEDURE :: get_sm_ids_for_rank => submatrix_get_sm_ids_for_rank
      85              :       PROCEDURE :: generate_submatrix => submatrix_generate_sm
      86              :       PROCEDURE :: copy_resultcol => submatrix_cpy_resultcol
      87              :       PROCEDURE :: communicate_results => submatrix_communicate_results
      88              :       PROCEDURE :: get_relevant_sm_columns => submatrix_get_relevant_sm_columns
      89              :    END TYPE submatrix_dissection_type
      90              : 
      91              : CONTAINS
      92              : 
      93              : ! **************************************************************************************************
      94              : !> \brief determine which columns of the submatrix are relevant for the result matrix
      95              : !> \param this - object of class submatrix_dissection_type
      96              : !> \param sm_id - id of the submatrix
      97              : !> \param first - first column of submatrix that is relevant
      98              : !> \param last - last column of submatrix that is relevant
      99              : ! **************************************************************************************************
     100           36 :    SUBROUTINE submatrix_get_relevant_sm_columns(this, sm_id, first, last)
     101              :       CLASS(submatrix_dissection_type), INTENT(IN)     :: this
     102              :       INTEGER, INTENT(IN)                              :: sm_id
     103              :       INTEGER, INTENT(OUT)                             :: first, last
     104              :       INTEGER                                          :: i, j, blkid
     105         9288 :       TYPE(set_type)                                   :: nonzero_rows
     106              : 
     107              :       ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
     108           72 :       DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm     ! all colums that determine submatrix sm_id
     109          108 :          DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1  ! all blocks that are within this column
     110           72 :             CALL nonzero_rows%insert(this%coo_rows(j))
     111              :          END DO
     112              :       END DO
     113              : 
     114           36 :       first = 1
     115           36 :       DO i = 1, nonzero_rows%elements
     116           36 :          blkid = nonzero_rows%get(i)
     117           36 :          IF (blkid .EQ. (sm_id - 1)*this%cols_per_sm + 1) THEN
     118              :             ! We just found the nonzero row that corresponds to the first inducing column of our submatrix
     119              :             ! Now add up block sizes to determine the last one as well
     120           36 :             last = first - 1
     121           36 :             DO j = i, nonzero_rows%elements
     122           36 :                blkid = nonzero_rows%get(j)
     123           36 :                last = last + this%col_blk_size(blkid)
     124           36 :                IF (blkid .EQ. (sm_id)*this%cols_per_sm) EXIT
     125              :             END DO
     126              :             EXIT
     127              :          END IF
     128            0 :          first = first + this%col_blk_size(blkid)
     129              :       END DO
     130              : 
     131           36 :       CALL nonzero_rows%reset
     132              : 
     133         9288 :    END SUBROUTINE submatrix_get_relevant_sm_columns
     134              : 
     135              : ! **************************************************************************************************
     136              : !> \brief initialize submatrix dissection and communicate, needs to be called before constructing any submatrices.
     137              : !> \param this - object of class submatrix_dissection_type
     138              : !> \param matrix_p - dbcsr input matrix
     139              : !> \par History
     140              : !>       2020.02 created [Michael Lass]
     141              : !>       2020.05 add time measurements [Michael Lass]
     142              : ! **************************************************************************************************
     143           10 :    SUBROUTINE submatrix_dissection_init(this, matrix_p) ! Should be PURE but the iterator routines are not
     144              :       CLASS(submatrix_dissection_type), INTENT(INOUT)  :: this
     145              :       TYPE(dbcsr_type), INTENT(IN)                     :: matrix_p
     146              : 
     147              :       INTEGER                                          :: cur_row, cur_col, i, j, k, l, m, l_limit_left, l_limit_right, &
     148              :                                                           bufsize, bufsize_next
     149           20 :       INTEGER, DIMENSION(:), ALLOCATABLE               :: blocks_per_rank, coo_dsplcmnts, num_blockids_send, num_blockids_recv
     150              :       TYPE(dbcsr_iterator_type)                        :: iter
     151         2580 :       TYPE(set_type)                                   :: nonzero_rows
     152              :       REAL(KIND=dp)                                    :: flops_total, flops_per_rank, flops_per_sm, flops_threshold, &
     153              :                                                           flops_current, flops_remaining
     154              : 
     155              :       ! Indexing for the following arrays starts with 0 to match rank ids
     156           10 :       TYPE(set_type), DIMENSION(:), ALLOCATABLE        :: blocks_from_rank
     157           10 :       TYPE(buffer_type), DIMENSION(:), ALLOCATABLE     :: sendbufs
     158           10 :       TYPE(intBuffer_type), DIMENSION(:), ALLOCATABLE  :: blocks_for_rank
     159              : 
     160              :       ! Additional structures for threaded parts
     161              :       INTEGER                                          :: numthreads, mytid
     162              :       ! Indexing starts at 0 to match thread ids
     163           10 :       TYPE(setarray_type), DIMENSION(:), ALLOCATABLE   :: nonzero_rows_t
     164           10 :       TYPE(setarray_type), DIMENSION(:), ALLOCATABLE   :: result_blocks_from_rank_t, result_blocks_for_rank_t, &
     165           10 :                                                           blocks_from_rank_t
     166              : 
     167              :       LOGICAL                                          :: valid
     168           10 :       REAL(KIND=dp), DIMENSION(:, :), POINTER           :: blockp
     169              : 
     170              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'submatrix_dissection_init'
     171              :       INTEGER :: handle
     172              : 
     173           10 :       CALL timeset(routineN, handle)
     174           10 :       CALL cite_reference(Lass2018)
     175              : 
     176           10 :       this%dbcsr_mat = matrix_p
     177              :       CALL dbcsr_get_info(matrix=this%dbcsr_mat, nblkcols_total=this%nblkcols, nblkrows_total=this%nblkrows, &
     178           10 :                           row_blk_size=this%row_blk_size, col_blk_size=this%col_blk_size, group=this%group, distribution=this%dist)
     179           10 :       CALL dbcsr_distribution_get(dist=this%dist, mynode=this%myrank, numnodes=this%numnodes)
     180              : 
     181           10 :       IF (this%nblkcols .NE. this%nblkrows) THEN
     182            0 :          CPABORT("Number of block rows and cols need to be identical")
     183              :       END IF
     184              : 
     185           20 :       DO i = 1, this%nblkcols
     186           20 :          IF (this%col_blk_size(i) .NE. this%row_blk_size(i)) THEN
     187            0 :             CPABORT("Block row sizes and col sizes need to be identical")
     188              :          END IF
     189              :       END DO
     190              : 
     191              :       ! TODO: We could do even more sanity checks here, e.g., the matrix must not be stored symmetrically
     192              : 
     193              :       ! For the submatrix method, we need global knwoledge about which blocks are actually used. Therefore, we create a COO
     194              :       ! representation of the blocks (without their contents) on all ranks.
     195              : 
     196              :       ! TODO: Right now, the COO contains all blocks. Also we transfer all blocks. We could skip half of them due to the matrix
     197              :       ! being symmetric (however, we need to transpose the blocks). This can increase performance only by a factor of 2 and is
     198              :       ! therefore deferred.
     199              : 
     200              :       ! Determine number of locally stored blocks
     201           10 :       this%local_blocks = 0
     202           10 :       CALL dbcsr_iterator_readonly_start(iter, this%dbcsr_mat)
     203           15 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     204            5 :          CALL dbcsr_iterator_next_block(iter, cur_row, cur_col)
     205            5 :          this%local_blocks = this%local_blocks + 1
     206              :       END DO
     207           10 :       CALL dbcsr_iterator_stop(iter)
     208              : 
     209            0 :       ALLOCATE (this%coo_cols_local(this%local_blocks), this%coo_rows_local(this%local_blocks), blocks_per_rank(this%numnodes), &
     210            0 :                 coo_dsplcmnts(this%numnodes), this%coo_col_offsets_local(this%nblkcols + 1), &
     211              :                 blocks_for_rank(0:this%numnodes - 1), blocks_from_rank(0:this%numnodes - 1), &
     212            0 :                 sendbufs(0:this%numnodes - 1), this%recvbufs(0:this%numnodes - 1), this%result_sendbufs(0:this%numnodes - 1), &
     213            0 :                 this%result_blocks_for_rank(0:this%numnodes - 1), this%result_blocks_from_rank(0:this%numnodes - 1), &
     214        23560 :                 this%result_blocks_for_rank_buf_offsets(0:this%numnodes - 1))
     215              : 
     216           10 :       i = 1
     217           10 :       CALL dbcsr_iterator_readonly_start(iter, this%dbcsr_mat)
     218           15 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     219            5 :          CALL dbcsr_iterator_next_block(iter, cur_row, cur_col)
     220            5 :          this%coo_cols_local(i) = cur_col
     221            5 :          this%coo_rows_local(i) = cur_row
     222            5 :          i = i + 1
     223              :       END DO
     224           10 :       CALL dbcsr_iterator_stop(iter)
     225              : 
     226              :       ! We only know how many blocks we own. What's with the other ranks?
     227           10 :       CALL this%group%allgather(msgout=this%local_blocks, msgin=blocks_per_rank)
     228           10 :       coo_dsplcmnts(1) = 0
     229           20 :       DO i = 1, this%numnodes - 1
     230           20 :          coo_dsplcmnts(i + 1) = coo_dsplcmnts(i) + blocks_per_rank(i)
     231              :       END DO
     232              : 
     233              :       ! Get a global view on the matrix
     234           30 :       this%nblks = SUM(blocks_per_rank)
     235            0 :       ALLOCATE (this%coo_cols(this%nblks), this%coo_rows(this%nblks), this%coo_col_offsets(this%nblkcols + 1), &
     236          100 :                 this%coo_dptr(this%nblks))
     237              :       CALL this%group%allgatherv(msgout=this%coo_rows_local, msgin=this%coo_rows, rcount=blocks_per_rank, &
     238           10 :                                  rdispl=coo_dsplcmnts)
     239              :       CALL this%group%allgatherv(msgout=this%coo_cols_local, msgin=this%coo_cols, rcount=blocks_per_rank, &
     240           10 :                                  rdispl=coo_dsplcmnts)
     241              : 
     242           10 :       DEALLOCATE (blocks_per_rank, coo_dsplcmnts)
     243              : 
     244              :       ! Sort COO arrays according to their columns
     245           10 :       CALL qsort_two(this%coo_cols_local, this%coo_rows_local, 1, this%local_blocks)
     246           10 :       CALL qsort_two(this%coo_cols, this%coo_rows, 1, this%nblks)
     247              : 
     248              :       ! Get COO array offsets for columns to accelerate lookups
     249           10 :       this%coo_col_offsets(this%nblkcols + 1) = this%nblks + 1
     250           10 :       j = 1
     251           20 :       DO i = 1, this%nblkcols
     252           10 :          DO WHILE ((j .LE. this%nblks))
     253           10 :             IF (this%coo_cols(j) .GE. i) EXIT
     254           10 :             j = j + 1
     255              :          END DO
     256           20 :          this%coo_col_offsets(i) = j
     257              :       END DO
     258              : 
     259              :       ! Same for local COO
     260           10 :       this%coo_col_offsets_local(this%nblkcols + 1) = this%local_blocks + 1
     261           10 :       j = 1
     262           20 :       DO i = 1, this%nblkcols
     263           10 :          DO WHILE ((j .LE. this%local_blocks))
     264            5 :             IF (this%coo_cols_local(j) .GE. i) EXIT
     265            5 :             j = j + 1
     266              :          END DO
     267           20 :          this%coo_col_offsets_local(i) = j
     268              :       END DO
     269              : 
     270              :       ! We could combine multiple columns to generate a single submatrix. For now, we have not found a practical use-case for this
     271              :       ! so we only look at single columns for now.
     272           10 :       this%cols_per_sm = 1
     273              : 
     274              :       ! Determine sizes of all submatrices. This is required in order to assess the computational effort that is required to process
     275              :       ! the submatrices.
     276           10 :       this%number_of_submatrices = this%nblkcols/this%cols_per_sm
     277           30 :       ALLOCATE (this%submatrix_sizes(this%number_of_submatrices))
     278           20 :       this%submatrix_sizes = 0
     279           10 :       flops_total = 0.0D0
     280           20 :       DO i = 1, this%number_of_submatrices
     281           10 :          CALL nonzero_rows%reset
     282           20 :          DO j = (i - 1)*this%cols_per_sm + 1, i*this%cols_per_sm            ! all colums that determine submatrix i
     283           30 :             DO k = this%coo_col_offsets(j), this%coo_col_offsets(j + 1) - 1 ! all blocks that are within this column
     284           20 :                CALL nonzero_rows%insert(this%coo_rows(k))
     285              :             END DO
     286              :          END DO
     287           20 :          DO j = 1, nonzero_rows%elements
     288           20 :             this%submatrix_sizes(i) = this%submatrix_sizes(i) + this%col_blk_size(nonzero_rows%get(j))
     289              :          END DO
     290           20 :          flops_total = flops_total + 2.0D0*this%submatrix_sizes(i)*this%submatrix_sizes(i)*this%submatrix_sizes(i)
     291              :       END DO
     292              : 
     293              :       ! Create mapping from submatrices to ranks. Since submatrices can be of different sizes, we need to perform some load
     294              :       ! balancing here. For that we assume that arithmetic operations performed on the submatrices scale cubically.
     295           30 :       ALLOCATE (this%submatrix_owners(this%number_of_submatrices))
     296           10 :       flops_per_sm = flops_total/this%number_of_submatrices
     297           10 :       flops_per_rank = flops_total/this%numnodes
     298           10 :       flops_current = 0.0D0
     299           10 :       j = 0
     300           20 :       DO i = 1, this%number_of_submatrices
     301           10 :          this%submatrix_owners(i) = j
     302           10 :          flops_current = flops_current + 2.0D0*this%submatrix_sizes(i)*this%submatrix_sizes(i)*this%submatrix_sizes(i)
     303           10 :          flops_remaining = flops_total - flops_current
     304           10 :          flops_threshold = (this%numnodes - j - 1)*flops_per_rank
     305              :          IF ((j .LT. (this%numnodes - 1)) &
     306           10 :              .AND. ((flops_remaining .LE. flops_threshold &
     307           10 :                      .OR. (this%number_of_submatrices - i) .LT. (this%numnodes - j)))) THEN
     308           10 :             j = j + 1
     309           10 :             flops_total = flops_total - flops_current
     310           10 :             flops_current = 0.0D0
     311              :          END IF
     312              :       END DO
     313              : 
     314              :       ! Prepare data structures for multithreaded loop
     315           10 :       numthreads = 1
     316           10 : !$    numthreads = omp_get_max_threads()
     317              : 
     318            0 :       ALLOCATE (result_blocks_from_rank_t(0:numthreads - 1), &
     319            0 :                 result_blocks_for_rank_t(0:numthreads - 1), &
     320            0 :                 blocks_from_rank_t(0:numthreads - 1), &
     321          100 :                 nonzero_rows_t(0:numthreads - 1))
     322              : 
     323              :       ! Figure out which blocks we need to receive. Blocks are identified here as indices into our COO representation.
     324              :       ! TODO: This currently shows limited parallel efficiency. Investigate further.
     325              : 
     326              :       !$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
     327              :       !$OMP          NUM_THREADS(numthreads) &
     328              :       !$OMP          PRIVATE(i,j,k,l,m,l_limit_left,l_limit_right,cur_col,cur_row,mytid) &
     329              :       !$OMP          SHARED(result_blocks_from_rank_t,result_blocks_for_rank_t,blocks_from_rank_t,this,numthreads,nonzero_rows_t)
     330              :       mytid = 0
     331              : !$    mytid = omp_get_thread_num()
     332              : 
     333              :       ALLOCATE (nonzero_rows_t(mytid)%sets(1), &
     334              :                 result_blocks_from_rank_t(mytid)%sets(0:this%numnodes - 1), &
     335              :                 result_blocks_for_rank_t(mytid)%sets(0:this%numnodes - 1), &
     336              :                 blocks_from_rank_t(mytid)%sets(0:this%numnodes - 1))
     337              : 
     338           10 :       !$OMP DO schedule(guided)
     339              :       DO i = 1, this%number_of_submatrices
     340              :          CALL nonzero_rows_t(mytid)%sets(1)%reset
     341              :          DO j = (i - 1)*this%cols_per_sm + 1, i*this%cols_per_sm            ! all colums that determine submatrix i
     342              :             DO k = this%coo_col_offsets(j), this%coo_col_offsets(j + 1) - 1 ! all blocks that are within this column
     343              :                ! This block will be required to assemble the final block matrix as it is within an inducing column for submatrix i.
     344              :                ! Figure out who stores it and insert it into the result_blocks_* sets.
     345              :                CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(k), column=j, processor=m)
     346              :                IF (m .EQ. this%myrank) THEN
     347              :                   CALL result_blocks_from_rank_t(mytid)%sets(this%submatrix_owners(i))%insert(k)
     348              :                END IF
     349              :                IF (this%submatrix_owners(i) .EQ. this%myrank) THEN
     350              :                   CALL nonzero_rows_t(mytid)%sets(1)%insert(this%coo_rows(k))
     351              :                   CALL result_blocks_for_rank_t(mytid)%sets(m)%insert(k)
     352              :                END IF
     353              :             END DO
     354              :          END DO
     355              : 
     356              :          IF (this%submatrix_owners(i) .NE. this%myrank) CYCLE
     357              : 
     358              :          ! In the following, we determine all blocks required to build the submatrix. We interpret nonzero_rows_t(mytid)(j) as
     359              :          ! column and nonzero_rows_t(mytid)(k) as row.
     360              :          DO j = 1, nonzero_rows_t(mytid)%sets(1)%elements
     361              :             cur_col = nonzero_rows_t(mytid)%sets(1)%get(j)
     362              :             l_limit_left = this%coo_col_offsets(cur_col)
     363              :             l_limit_right = this%coo_col_offsets(cur_col + 1) - 1
     364              :             DO k = 1, nonzero_rows_t(mytid)%sets(1)%elements
     365              :                cur_row = nonzero_rows_t(mytid)%sets(1)%get(k)
     366              :                l = l_limit_left
     367              :                DO WHILE (l .LE. l_limit_right)
     368              :                   IF (this%coo_rows(l) .GE. cur_row) EXIT
     369              :                   l = l + 1
     370              :                END DO
     371              :                l_limit_left = l
     372              :                IF (l .LE. l_limit_right) THEN
     373              :                   IF (this%coo_rows(l) .EQ. cur_row) THEN
     374              :                      ! We found a valid block. Figure out what to do with it.
     375              :                      CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(l), &
     376              :                                                        column=this%coo_cols(l), processor=m)
     377              :                      CALL blocks_from_rank_t(mytid)%sets(m)%insert(l)
     378              :                   END IF
     379              :                END IF
     380              :             END DO
     381              :          END DO
     382              :       END DO
     383              :       !$OMP END DO
     384              :       !$OMP END PARALLEL
     385              : 
     386              :       ! Merge partial results from threads
     387           30 :       DO i = 0, this%numnodes - 1
     388           50 :          DO j = 0, numthreads - 1
     389           25 :             DO k = 1, result_blocks_from_rank_t(j)%sets(i)%elements
     390           25 :                CALL this%result_blocks_from_rank(i)%insert(result_blocks_from_rank_t(j)%sets(i)%get(k))
     391              :             END DO
     392           20 :             CALL result_blocks_from_rank_t(j)%sets(i)%reset
     393           25 :             DO k = 1, result_blocks_for_rank_t(j)%sets(i)%elements
     394           25 :                CALL this%result_blocks_for_rank(i)%insert(result_blocks_for_rank_t(j)%sets(i)%get(k))
     395              :             END DO
     396           20 :             CALL result_blocks_for_rank_t(j)%sets(i)%reset
     397           25 :             DO k = 1, blocks_from_rank_t(j)%sets(i)%elements
     398           25 :                CALL blocks_from_rank(i)%insert(blocks_from_rank_t(j)%sets(i)%get(k))
     399              :             END DO
     400           40 :             CALL blocks_from_rank_t(j)%sets(i)%reset
     401              :          END DO
     402              :       END DO
     403           20 :       DO i = 0, numthreads - 1
     404           10 :          CALL nonzero_rows_t(i)%sets(1)%reset
     405            0 :          DEALLOCATE (result_blocks_from_rank_t(i)%sets, result_blocks_for_rank_t(i)%sets, blocks_from_rank_t(i)%sets, &
     406        18080 :                      nonzero_rows_t(i)%sets)
     407              :       END DO
     408           50 :       DEALLOCATE (result_blocks_from_rank_t, result_blocks_for_rank_t, blocks_from_rank_t, nonzero_rows_t)
     409              : 
     410              :       ! Make other ranks aware of our needs
     411           40 :       ALLOCATE (num_blockids_send(0:this%numnodes - 1), num_blockids_recv(0:this%numnodes - 1))
     412           30 :       DO i = 0, this%numnodes - 1
     413           30 :          num_blockids_send(i) = blocks_from_rank(i)%elements
     414              :       END DO
     415           10 :       CALL this%group%alltoall(num_blockids_send, num_blockids_recv, 1)
     416           30 :       DO i = 0, this%numnodes - 1
     417           30 :          CALL blocks_for_rank(i)%alloc(num_blockids_recv(i))
     418              :       END DO
     419           10 :       DEALLOCATE (num_blockids_send, num_blockids_recv)
     420              : 
     421           10 :       IF (this%numnodes .GT. 1) THEN
     422           30 :          DO i = 1, this%numnodes
     423           20 :             k = MODULO(this%myrank - i, this%numnodes) ! rank to receive from
     424           30 :             CALL this%group%irecv(msgout=blocks_for_rank(k)%data, source=k, request=blocks_for_rank(k)%mpi_request)
     425              :          END DO
     426           30 :          DO i = 1, this%numnodes
     427           20 :             k = MODULO(this%myrank + i, this%numnodes) ! rank to send to
     428           30 :             CALL this%group%send(blocks_from_rank(k)%getall(), k, 0)
     429              :          END DO
     430           30 :          DO i = 0, this%numnodes - 1
     431           30 :             CALL blocks_for_rank(i)%mpi_request%wait()
     432              :          END DO
     433              :       ELSE
     434            0 :          blocks_for_rank(0)%data = blocks_from_rank(0)%getall()
     435              :       END IF
     436              : 
     437              :       ! Free memory allocated in nonzero_rows
     438           10 :       CALL nonzero_rows%reset
     439              : 
     440              :       ! Make get calls on this%result_blocks_for_rank(...) threadsafe in other routines by updating the internal sorted list
     441           30 :       DO m = 0, this%numnodes - 1
     442           30 :          IF ((.NOT. this%result_blocks_for_rank(m)%sorted_up_to_date) .AND. (this%result_blocks_for_rank(m)%elements .GT. 0)) THEN
     443            5 :             CALL this%result_blocks_for_rank(m)%update_sorted
     444              :          END IF
     445              :       END DO
     446              : 
     447              :       ! Create and fill send buffers
     448           30 :       DO i = 0, this%numnodes - 1
     449           20 :          bufsize = 0
     450           25 :          DO j = 1, blocks_for_rank(i)%size
     451            5 :             k = blocks_for_rank(i)%data(j)
     452           25 :             bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     453              :          END DO
     454           20 :          CALL sendbufs(i)%alloc(bufsize)
     455              : 
     456           20 :          bufsize = 0
     457           20 :          CALL this%result_blocks_for_rank_buf_offsets(i)%alloc(this%result_blocks_for_rank(i)%elements)
     458           25 :          DO j = 1, this%result_blocks_for_rank(i)%elements
     459            5 :             k = this%result_blocks_for_rank(i)%get(j)
     460            5 :             this%result_blocks_for_rank_buf_offsets(i)%data(j) = bufsize
     461           25 :             bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     462              :          END DO
     463           20 :          CALL this%result_sendbufs(i)%alloc(bufsize)
     464              : 
     465           20 :          bufsize = 0
     466           35 :          DO j = 1, blocks_for_rank(i)%size
     467            5 :             k = blocks_for_rank(i)%data(j)
     468            5 :             CALL dbcsr_get_block_p(this%dbcsr_mat, row=this%coo_rows(k), col=this%coo_cols(k), block=blockp, found=valid)
     469            5 :             IF (.NOT. valid) THEN
     470            0 :                CPABORT("Block included in our COO and placed on our rank could not be fetched!")
     471              :             END IF
     472           15 :             bufsize_next = bufsize + SIZE(blockp)
     473          200 :             sendbufs(i)%data(bufsize + 1:bufsize_next) = RESHAPE(blockp, [SIZE(blockp)])
     474           30 :             bufsize = bufsize_next
     475              :          END DO
     476              :       END DO
     477              : 
     478              :       ! Create receive buffers and mapping from blocks to memory locations
     479           30 :       DO i = 0, this%numnodes - 1
     480           20 :          bufsize = 0
     481           25 :          DO j = 1, blocks_from_rank(i)%elements
     482            5 :             k = blocks_from_rank(i)%get(j)
     483           25 :             bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     484              :          END DO
     485           20 :          CALL this%recvbufs(i)%alloc(bufsize)
     486           20 :          bufsize = 0
     487           35 :          DO j = 1, blocks_from_rank(i)%elements
     488            5 :             k = blocks_from_rank(i)%get(j)
     489            5 :             bufsize_next = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     490            5 :             this%coo_dptr(k)%target => this%recvbufs(i)%data(bufsize + 1:bufsize_next)
     491           25 :             bufsize = bufsize_next
     492              :          END DO
     493              :       END DO
     494              : 
     495           30 :       DO i = 0, this%numnodes - 1
     496           20 :          CALL blocks_for_rank(i)%dealloc
     497           30 :          CALL blocks_from_rank(i)%reset
     498              :       END DO
     499         5180 :       DEALLOCATE (blocks_for_rank, blocks_from_rank)
     500              : 
     501           10 :       IF (this%numnodes .GT. 1) THEN
     502              :          ! Communicate. We attempt to balance communication load in the network here by starting our sends with our right neighbor
     503              :          ! and first trying to receive from our left neighbor.
     504           30 :          DO i = 1, this%numnodes
     505           20 :             k = MODULO(this%myrank - i, this%numnodes) ! rank to receive from
     506           20 :             CALL this%group%irecv(msgout=this%recvbufs(k)%data, source=k, request=this%recvbufs(k)%mpi_request)
     507           20 :             k = MODULO(this%myrank + i, this%numnodes) ! rank to send to
     508           30 :             CALL this%group%isend(msgin=sendbufs(k)%data, dest=k, request=sendbufs(k)%mpi_request)
     509              :          END DO
     510           30 :          DO i = 0, this%numnodes - 1
     511           20 :             CALL sendbufs(i)%mpi_request%wait()
     512           30 :             CALL this%recvbufs(i)%mpi_request%wait()
     513              :          END DO
     514              :       ELSE
     515            0 :          this%recvbufs(0)%data = sendbufs(0)%data
     516              :       END IF
     517              : 
     518           30 :       DO i = 0, this%numnodes - 1
     519           30 :          CALL sendbufs(i)%dealloc
     520              :       END DO
     521           10 :       DEALLOCATE (sendbufs)
     522              : 
     523           10 :       this%initialized = .TRUE.
     524              : 
     525           10 :       CALL timestop(handle)
     526         2630 :    END SUBROUTINE submatrix_dissection_init
     527              : 
     528              : ! **************************************************************************************************
     529              : !> \brief free all associated memory, afterwards submatrix_dissection_init needs to be called again
     530              : !> \param this - object of class submatrix_dissection_type
     531              : ! **************************************************************************************************
     532           10 :    PURE SUBROUTINE submatrix_dissection_final(this)
     533              :       CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
     534              :       INTEGER                                         :: i
     535              : 
     536           10 :       this%initialized = .FALSE.
     537              : 
     538           10 :       IF (ALLOCATED(this%submatrix_sizes)) DEALLOCATE (this%submatrix_sizes)
     539           10 :       IF (ALLOCATED(this%coo_cols_local)) DEALLOCATE (this%coo_cols_local)
     540           10 :       IF (ALLOCATED(this%coo_rows_local)) DEALLOCATE (this%coo_rows_local)
     541           10 :       IF (ALLOCATED(this%coo_col_offsets_local)) DEALLOCATE (this%coo_col_offsets_local)
     542           10 :       IF (ALLOCATED(this%result_blocks_for_rank_buf_offsets)) THEN
     543           30 :          DO i = 0, this%numnodes - 1
     544           30 :             CALL this%result_blocks_for_rank_buf_offsets(i)%dealloc
     545              :          END DO
     546           10 :          DEALLOCATE (this%result_blocks_for_rank_buf_offsets)
     547              :       END IF
     548           10 :       IF (ALLOCATED(this%recvbufs)) THEN
     549           30 :          DO i = 0, this%numnodes - 1
     550           30 :             CALL this%recvbufs(i)%dealloc
     551              :          END DO
     552           10 :          DEALLOCATE (this%recvbufs)
     553              :       END IF
     554           10 :       IF (ALLOCATED(this%result_sendbufs)) THEN
     555           30 :          DO i = 0, this%numnodes - 1
     556           30 :             CALL this%result_sendbufs(i)%dealloc
     557              :          END DO
     558           10 :          DEALLOCATE (this%result_sendbufs)
     559              :       END IF
     560           10 :       IF (ALLOCATED(this%result_blocks_for_rank)) THEN
     561           30 :          DO i = 0, this%numnodes - 1
     562           30 :             CALL this%result_blocks_for_rank(i)%reset
     563              :          END DO
     564         5170 :          DEALLOCATE (this%result_blocks_for_rank)
     565              :       END IF
     566           10 :       IF (ALLOCATED(this%result_blocks_from_rank)) THEN
     567           30 :          DO i = 0, this%numnodes - 1
     568           30 :             CALL this%result_blocks_from_rank(i)%reset
     569              :          END DO
     570         5170 :          DEALLOCATE (this%result_blocks_from_rank)
     571              :       END IF
     572           10 :       IF (ALLOCATED(this%coo_cols)) DEALLOCATE (this%coo_cols)
     573           10 :       IF (ALLOCATED(this%coo_rows)) DEALLOCATE (this%coo_rows)
     574           10 :       IF (ALLOCATED(this%coo_col_offsets)) DEALLOCATE (this%coo_col_offsets)
     575           10 :       IF (ALLOCATED(this%coo_dptr)) DEALLOCATE (this%coo_dptr)
     576           10 :       IF (ALLOCATED(this%submatrix_owners)) DEALLOCATE (this%submatrix_owners)
     577              : 
     578           10 :    END SUBROUTINE submatrix_dissection_final
     579              : 
     580              : ! **************************************************************************************************
     581              : !> \brief generate a specific submatrix
     582              : !> \param this - object of class submatrix_dissection_type
     583              : !> \param sm_id - id of the submatrix to generate
     584              : !> \param sm - generated submatrix
     585              : ! **************************************************************************************************
     586            6 :    SUBROUTINE submatrix_generate_sm(this, sm_id, sm)
     587              :       CLASS(submatrix_dissection_type), INTENT(IN)             :: this
     588              :       INTEGER, INTENT(IN)                                      :: sm_id
     589              :       REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE, INTENT(OUT) :: sm
     590              : 
     591              :       INTEGER                                                  :: sm_dim, i, j, k, offset_x1, offset_x2, offset_y1, &
     592              :                                                                   offset_y2, k_limit_left, k_limit_right
     593         1548 :       TYPE(set_type)                                           :: nonzero_rows
     594              : 
     595            6 :       IF (.NOT. this%initialized) THEN
     596            0 :          CPABORT("Submatrix dissection not initialized")
     597              :       END IF
     598              : 
     599            6 :       IF (this%myrank .NE. this%submatrix_owners(sm_id)) THEN
     600            0 :          CPABORT("This rank is not supposed to construct this submatrix")
     601              :       END IF
     602              : 
     603              :       ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
     604            6 :       CALL nonzero_rows%reset
     605           12 :       DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm     ! all colums that determine submatrix sm_id
     606           18 :          DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1  ! all blocks that are within this column
     607           12 :             CALL nonzero_rows%insert(this%coo_rows(j))
     608              :          END DO
     609              :       END DO
     610            6 :       sm_dim = 0
     611           12 :       DO i = 1, nonzero_rows%elements
     612           12 :          sm_dim = sm_dim + this%col_blk_size(nonzero_rows%get(i))
     613              :       END DO
     614              : 
     615           24 :       ALLOCATE (sm(sm_dim, sm_dim))
     616          258 :       sm = 0
     617              : 
     618            6 :       offset_x1 = 0
     619           12 :       DO j = 1, nonzero_rows%elements
     620            6 :          offset_x2 = offset_x1 + this%col_blk_size(nonzero_rows%get(j))
     621            6 :          offset_y1 = 0
     622            6 :          k_limit_left = this%coo_col_offsets(nonzero_rows%get(j))
     623            6 :          k_limit_right = this%coo_col_offsets(nonzero_rows%get(j) + 1) - 1
     624           12 :          DO i = 1, nonzero_rows%elements
     625            6 :             offset_y2 = offset_y1 + this%col_blk_size(nonzero_rows%get(i))
     626              :             ! Copy block nonzero_rows(i),nonzero_rows(j) to sm(i,j) (if the block actually exists)
     627            6 :             k = k_limit_left
     628            6 :             DO WHILE (k .LE. k_limit_right)
     629            6 :                IF (this%coo_rows(k) .GE. nonzero_rows%get(i)) EXIT
     630            0 :                k = k + 1
     631              :             END DO
     632            6 :             k_limit_left = k
     633            6 :             IF (this%coo_rows(k) .EQ. nonzero_rows%get(i)) THEN ! it does exist and k is our block id
     634              :                sm(offset_y1 + 1:offset_y2, offset_x1 + 1:offset_x2) = RESHAPE(this%coo_dptr(k)%target, &
     635          270 :                                                                               (/offset_y2 - offset_y1, offset_x2 - offset_x1/))
     636              :             END IF
     637           12 :             offset_y1 = offset_y2
     638              :          END DO
     639           12 :          offset_x1 = offset_x2
     640              :       END DO
     641              : 
     642              :       ! Free memory allocated in nonzero_rows
     643            6 :       CALL nonzero_rows%reset
     644              : 
     645         1548 :    END SUBROUTINE submatrix_generate_sm
     646              : 
     647              : ! **************************************************************************************************
     648              : !> \brief determine submatrix ids that are handled by a specific rank
     649              : !> \param this - object of class submatrix_dissection_type
     650              : !> \param rank - rank id of interest
     651              : !> \param sm_ids - list of submatrix ids handled by that rank
     652              : ! **************************************************************************************************
     653           10 :    SUBROUTINE submatrix_get_sm_ids_for_rank(this, rank, sm_ids)
     654              :       CLASS(submatrix_dissection_type), INTENT(IN)    :: this
     655              :       INTEGER, INTENT(IN)                             :: rank
     656              :       INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(OUT) :: sm_ids
     657              :       INTEGER                                         :: count, i
     658              : 
     659           10 :       IF (.NOT. this%initialized) THEN
     660            0 :          CPABORT("Submatrix dissection not initialized")
     661              :       END IF
     662              : 
     663           10 :       count = 0
     664           20 :       DO i = 1, this%number_of_submatrices
     665           20 :          IF (this%submatrix_owners(i) .EQ. rank) count = count + 1
     666              :       END DO
     667              : 
     668           25 :       ALLOCATE (sm_ids(count))
     669              : 
     670           10 :       count = 0
     671           20 :       DO i = 1, this%number_of_submatrices
     672           20 :          IF (this%submatrix_owners(i) .EQ. rank) THEN
     673            5 :             count = count + 1
     674            5 :             sm_ids(count) = i
     675              :          END IF
     676              :       END DO
     677              : 
     678           10 :    END SUBROUTINE submatrix_get_sm_ids_for_rank
     679              : 
     680              : ! **************************************************************************************************
     681              : !> \brief copy result columns from a submatrix into result buffer
     682              : !> \param this - object of class submatrix_dissection_type
     683              : !> \param sm_id - id of the submatrix
     684              : !> \param sm - result-submatrix
     685              : ! **************************************************************************************************
     686            6 :    SUBROUTINE submatrix_cpy_resultcol(this, sm_id, sm)
     687              :       CLASS(submatrix_dissection_type), INTENT(INOUT)         :: this
     688              :       INTEGER, INTENT(in)                                     :: sm_id
     689              :       REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE, INTENT(IN) :: sm
     690              : 
     691         1548 :       TYPE(set_type)                                          :: nonzero_rows
     692              :       INTEGER                                                 :: i, j, k, m, sm_col_offset, offset_x1, offset_x2, offset_y1, &
     693              :                                                                  offset_y2, bufsize, bufsize_next, k_limit_left, k_limit_right
     694            6 :       INTEGER, DIMENSION(:), ALLOCATABLE                      :: buf_offset_idxs
     695              : 
     696            6 :       IF (.NOT. this%initialized) THEN
     697            0 :          CPABORT("Submatrix dissection is not initizalized")
     698              :       END IF
     699              : 
     700            6 :       IF (this%myrank .NE. this%submatrix_owners(sm_id)) THEN
     701            0 :          CPABORT("This rank is not supposed to operate on this submatrix")
     702              :       END IF
     703              : 
     704           18 :       ALLOCATE (buf_offset_idxs(0:this%numnodes - 1))
     705           18 :       buf_offset_idxs = 1
     706              : 
     707              :       ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
     708            6 :       sm_col_offset = 0
     709           12 :       DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm     ! all colums that determine submatrix sm_id
     710           18 :          DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1  ! all blocks that are within this column
     711           12 :             CALL nonzero_rows%insert(this%coo_rows(j))
     712              :          END DO
     713              :       END DO
     714              : 
     715            6 :       sm_col_offset = 0
     716            6 :       DO i = 1, nonzero_rows%elements
     717            6 :          IF (nonzero_rows%get(i) .EQ. (sm_id - 1)*this%cols_per_sm + 1) THEN
     718              :             ! We just found the nonzero row that corresponds to the first inducing column of our submatrix
     719            6 :             sm_col_offset = i
     720            6 :             EXIT
     721              :          END IF
     722              :       END DO
     723            6 :       IF (sm_col_offset .EQ. 0) THEN
     724            0 :          CPABORT("Could not determine relevant result columns of submatrix")
     725              :       END IF
     726              : 
     727            6 :       offset_x1 = 0
     728           12 :       DO j = 1, nonzero_rows%elements
     729            6 :          offset_x2 = offset_x1 + this%col_blk_size(nonzero_rows%get(j))
     730              :          ! We only want to copy the blocks from the result columns
     731            6 :          IF ((j .GE. sm_col_offset) .AND. (j .LT. sm_col_offset + this%cols_per_sm)) THEN
     732            6 :             offset_y1 = 0
     733            6 :             k_limit_left = this%coo_col_offsets(nonzero_rows%get(j))
     734            6 :             k_limit_right = this%coo_col_offsets(nonzero_rows%get(j) + 1) - 1
     735           12 :             DO i = 1, nonzero_rows%elements
     736            6 :                offset_y2 = offset_y1 + this%col_blk_size(nonzero_rows%get(i))
     737              :                ! Check if sm(i,j), i.e., (nonzero_rows(i),nonzero_rows(j)) exists in the original matrix and if so, copy it into the
     738              :                ! correct send buffer.
     739            6 :                k = k_limit_left
     740            6 :                DO WHILE (k .LE. k_limit_right)
     741            6 :                   IF (this%coo_rows(k) .GE. nonzero_rows%get(i)) EXIT
     742            0 :                   k = k + 1
     743              :                END DO
     744            6 :                k_limit_left = k
     745            6 :                IF (this%coo_rows(k) .EQ. nonzero_rows%get(i)) THEN ! it does exist and k is our block id
     746              :                   CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(k), column=this%coo_cols(k), &
     747            6 :                                                     processor=m)
     748            6 :                   DO WHILE (buf_offset_idxs(m) .LE. this%result_blocks_for_rank(m)%elements)
     749            6 :                      IF (this%result_blocks_for_rank(m)%get(buf_offset_idxs(m)) .GE. k) EXIT
     750            0 :                      buf_offset_idxs(m) = buf_offset_idxs(m) + 1
     751              :                   END DO
     752            6 :                   IF (this%result_blocks_for_rank(m)%get(buf_offset_idxs(m)) .NE. k) THEN
     753            0 :                      CPABORT("Could not determine buffer offset for block")
     754              :                   END IF
     755            6 :                   bufsize = this%result_blocks_for_rank_buf_offsets(m)%data(buf_offset_idxs(m))
     756            6 :                   bufsize_next = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     757              :                   this%result_sendbufs(m)%data(bufsize + 1:bufsize_next) = RESHAPE( &
     758              :                                                                            sm(offset_y1 + 1:offset_y2, offset_x1 + 1:offset_x2), &
     759          228 :                                                                            (/bufsize_next - bufsize/))
     760              :                END IF
     761           12 :                offset_y1 = offset_y2
     762              :             END DO
     763              :          END IF
     764           12 :          offset_x1 = offset_x2
     765              :       END DO
     766              : 
     767            6 :       DEALLOCATE (buf_offset_idxs)
     768              : 
     769              :       ! Free memory in set
     770            6 :       CALL nonzero_rows%reset
     771              : 
     772         1548 :    END SUBROUTINE submatrix_cpy_resultcol
     773              : 
     774              : ! **************************************************************************************************
     775              : !> \brief finalize results back into a dbcsr matrix
     776              : !> \param this - object of class submatrix_dissection_type
     777              : !> \param resultmat - result dbcsr matrix
     778              : !> \par History
     779              : !>       2020.02 created [Michael Lass]
     780              : !>       2020.05 add time measurements [Michael Lass]
     781              : ! **************************************************************************************************
     782           10 :    SUBROUTINE submatrix_communicate_results(this, resultmat)
     783              :       CLASS(submatrix_dissection_type), INTENT(INOUT)                 :: this
     784              :       TYPE(dbcsr_type), INTENT(INOUT)                                 :: resultmat
     785              : 
     786              :       INTEGER                                                         :: i, j, k, cur_row, cur_col, cur_sm, cur_buf, last_buf, &
     787              :                                                                          bufsize, bufsize_next, row_size, col_size
     788           10 :       REAL(kind=dp), DIMENSION(:), POINTER                            :: vector
     789           10 :       TYPE(buffer_type), DIMENSION(:), ALLOCATABLE                    :: result_recvbufs
     790              : 
     791              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'submatrix_communicate_results'
     792              :       INTEGER :: handle
     793              : 
     794           10 :       CALL timeset(routineN, handle)
     795              : 
     796           60 :       ALLOCATE (result_recvbufs(0:this%numnodes - 1))
     797           30 :       DO i = 0, this%numnodes - 1
     798           20 :          bufsize = 0
     799           25 :          DO j = 1, this%result_blocks_from_rank(i)%elements
     800            5 :             k = this%result_blocks_from_rank(i)%get(j)
     801           25 :             bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     802              :          END DO
     803           30 :          CALL result_recvbufs(i)%alloc(bufsize)
     804              :       END DO
     805              : 
     806              :       ! Communicate. We attempt to balance communication load in the network here by starting our sends with our right neighbor
     807              :       ! and first trying to receive from our left neighbor.
     808           10 :       IF (this%numnodes .GT. 1) THEN
     809           30 :          DO i = 1, this%numnodes
     810           20 :             k = MODULO(this%myrank - i, this%numnodes) ! rank to receive from
     811           20 :             CALL this%group%irecv(msgout=result_recvbufs(k)%data, source=k, request=result_recvbufs(k)%mpi_request)
     812           20 :             k = MODULO(this%myrank + i, this%numnodes) ! rank to send to
     813           30 :             CALL this%group%isend(msgin=this%result_sendbufs(k)%data, dest=k, request=this%result_sendbufs(k)%mpi_request)
     814              :          END DO
     815           30 :          DO i = 0, this%numnodes - 1
     816           20 :             CALL this%result_sendbufs(i)%mpi_request%wait()
     817           30 :             CALL result_recvbufs(i)%mpi_request%wait()
     818              :          END DO
     819              :       ELSE
     820            0 :          result_recvbufs(0)%data = this%result_sendbufs(0)%data
     821              :       END IF
     822              : 
     823           10 :       last_buf = -1
     824           10 :       bufsize = 0
     825           15 :       DO i = 1, this%local_blocks
     826            5 :          cur_col = this%coo_cols_local(i)
     827            5 :          cur_row = this%coo_rows_local(i)
     828            5 :          cur_sm = (cur_col - 1)/this%cols_per_sm + 1
     829            5 :          cur_buf = this%submatrix_owners(cur_sm)
     830            5 :          IF (cur_buf .GT. last_buf) bufsize = 0
     831            5 :          row_size = this%row_blk_size(cur_row)
     832            5 :          col_size = this%col_blk_size(cur_col)
     833            5 :          bufsize_next = bufsize + row_size*col_size
     834            5 :          vector => result_recvbufs(cur_buf)%data(bufsize + 1:bufsize_next)
     835              :          CALL dbcsr_put_block(matrix=resultmat, row=cur_row, col=cur_col, &
     836           15 :                               block=RESHAPE(vector, [row_size, col_size]))
     837            5 :          bufsize = bufsize_next
     838           15 :          last_buf = cur_buf
     839              :       END DO
     840              : 
     841           30 :       DO i = 0, this%numnodes - 1
     842           30 :          CALL result_recvbufs(i)%dealloc
     843              :       END DO
     844           10 :       DEALLOCATE (result_recvbufs)
     845              : 
     846           10 :       CALL dbcsr_finalize(resultmat)
     847              : 
     848           10 :       CALL timestop(handle)
     849           10 :    END SUBROUTINE submatrix_communicate_results
     850              : 
     851              : ! **************************************************************************************************
     852              : !> \brief sort two integer arrays using quicksort, using the second list as second-level sorting criterion
     853              : !> \param arr_a - first input array
     854              : !> \param arr_b - second input array
     855              : !> \param left - left boundary of region to be sorted
     856              : !> \param right - right boundary of region to be sorted
     857              : ! **************************************************************************************************
     858           20 :    RECURSIVE PURE SUBROUTINE qsort_two(arr_a, arr_b, left, right)
     859              : 
     860              :       INTEGER, DIMENSION(:), INTENT(inout)               :: arr_a, arr_b
     861              :       INTEGER, INTENT(in)                                :: left, right
     862              : 
     863              :       INTEGER                                            :: i, j, pivot_a, pivot_b, tmp
     864              : 
     865           20 :       IF (right - left .LT. 1) RETURN
     866              : 
     867            0 :       i = left
     868            0 :       j = right - 1
     869            0 :       pivot_a = arr_a(right)
     870            0 :       pivot_b = arr_b(right)
     871              : 
     872            0 :       DO
     873            0 :          DO WHILE ((arr_a(i) .LT. pivot_a) .OR. ((arr_a(i) .EQ. pivot_a) .AND. (arr_b(i) .LT. pivot_b)))
     874            0 :             i = i + 1
     875              :          END DO
     876            0 :          DO WHILE ((j .GT. i) .AND. ((arr_a(j) .GT. pivot_a) .OR. ((arr_a(j) .EQ. pivot_a) .AND. (arr_b(j) .GE. pivot_b))))
     877            0 :             j = j - 1
     878              :          END DO
     879            0 :          IF (i .LT. j) THEN
     880            0 :             tmp = arr_a(i)
     881            0 :             arr_a(i) = arr_a(j)
     882            0 :             arr_a(j) = tmp
     883            0 :             tmp = arr_b(i)
     884            0 :             arr_b(i) = arr_b(j)
     885            0 :             arr_b(j) = tmp
     886              :          ELSE
     887              :             EXIT
     888              :          END IF
     889              :       END DO
     890              : 
     891            0 :       IF ((arr_a(i) .GT. pivot_a) .OR. (arr_a(i) .EQ. pivot_a .AND. arr_b(i) .GT. pivot_b)) THEN
     892            0 :          tmp = arr_a(i)
     893            0 :          arr_a(i) = arr_a(right)
     894            0 :          arr_a(right) = tmp
     895            0 :          tmp = arr_b(i)
     896            0 :          arr_b(i) = arr_b(right)
     897            0 :          arr_b(right) = tmp
     898              :       END IF
     899              : 
     900            0 :       IF (i - 1 .GT. left) CALL qsort_two(arr_a, arr_b, left, i - 1)
     901            0 :       IF (i + 1 .LT. right) CALL qsort_two(arr_a, arr_b, i + 1, right)
     902              : 
     903              :    END SUBROUTINE qsort_two
     904              : 
     905            0 : END MODULE submatrix_dissection
        

Generated by: LCOV version 2.0-1