LCOV - code coverage report
Current view: top level - src/dbt/tas - dbt_tas_reshape_ops.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:936074a) Lines: 98.8 % 341 337
Test Date: 2025-12-04 06:27:48 Functions: 83.3 % 12 10

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief communication routines to reshape / replicate / merge tall-and-skinny matrices.
      10              : !> \author Patrick Seewald
      11              : ! **************************************************************************************************
      12              : MODULE dbt_tas_reshape_ops
      13              :    USE OMP_LIB,                         ONLY: omp_destroy_lock,&
      14              :                                               omp_get_max_threads,&
      15              :                                               omp_get_num_threads,&
      16              :                                               omp_get_thread_num,&
      17              :                                               omp_init_lock,&
      18              :                                               omp_lock_kind,&
      19              :                                               omp_set_lock,&
      20              :                                               omp_unset_lock
      21              :    USE dbm_api,                         ONLY: &
      22              :         dbm_clear, dbm_distribution_col_dist, dbm_distribution_obj, dbm_distribution_row_dist, &
      23              :         dbm_finalize, dbm_get_col_block_sizes, dbm_get_distribution, dbm_get_name, &
      24              :         dbm_get_row_block_sizes, dbm_get_stored_coordinates, dbm_iterator, &
      25              :         dbm_iterator_blocks_left, dbm_iterator_next_block, dbm_iterator_start, dbm_iterator_stop, &
      26              :         dbm_put_block, dbm_reserve_blocks, dbm_type
      27              :    USE dbt_tas_base,                    ONLY: &
      28              :         dbt_repl_get_stored_coordinates, dbt_tas_blk_sizes, dbt_tas_clear, dbt_tas_create, &
      29              :         dbt_tas_distribution_new, dbt_tas_finalize, dbt_tas_get_stored_coordinates, dbt_tas_info, &
      30              :         dbt_tas_iterator_blocks_left, dbt_tas_iterator_next_block, dbt_tas_iterator_start, &
      31              :         dbt_tas_iterator_stop, dbt_tas_put_block, dbt_tas_reserve_blocks
      32              :    USE dbt_tas_global,                  ONLY: dbt_tas_blk_size_arb,&
      33              :                                               dbt_tas_blk_size_repl,&
      34              :                                               dbt_tas_dist_arb,&
      35              :                                               dbt_tas_dist_repl,&
      36              :                                               dbt_tas_distribution,&
      37              :                                               dbt_tas_rowcol_data
      38              :    USE dbt_tas_split,                   ONLY: colsplit,&
      39              :                                               dbt_tas_get_split_info,&
      40              :                                               rowsplit
      41              :    USE dbt_tas_types,                   ONLY: dbt_tas_distribution_type,&
      42              :                                               dbt_tas_iterator,&
      43              :                                               dbt_tas_split_info,&
      44              :                                               dbt_tas_type
      45              :    USE dbt_tas_util,                    ONLY: swap
      46              :    USE kinds,                           ONLY: dp,&
      47              :                                               int_8
      48              :    USE message_passing,                 ONLY: mp_cart_type,&
      49              :                                               mp_comm_type,&
      50              :                                               mp_request_type,&
      51              :                                               mp_waitall
      52              : #include "../../base/base_uses.f90"
      53              : 
      54              :    IMPLICIT NONE
      55              :    PRIVATE
      56              : 
      57              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbt_tas_reshape_ops'
      58              : 
      59              :    PUBLIC :: &
      60              :       dbt_tas_merge, &
      61              :       dbt_tas_replicate, &
      62              :       dbt_tas_reshape
      63              : 
      64              :    TYPE dbt_buffer_type
      65              :       INTEGER :: nblock = -1
      66              :       INTEGER(KIND=int_8), DIMENSION(:, :), ALLOCATABLE :: indx
      67              :       REAL(dp), DIMENSION(:), ALLOCATABLE :: msg
      68              :       INTEGER :: endpos = -1
      69              :    END TYPE dbt_buffer_type
      70              : 
      71              : CONTAINS
      72              : 
      73              : ! **************************************************************************************************
      74              : !> \brief copy data (involves reshape)
      75              : !> \param matrix_in ...
      76              : !> \param matrix_out ...
      77              : !> \param summation whether matrix_out = matrix_out + matrix_in
      78              : !> \param transposed ...
      79              : !> \param move_data memory optimization: move data to matrix_out such that matrix_in is empty on return
      80              : !> \author Patrick Seewald
      81              : ! **************************************************************************************************
      82       197498 :    RECURSIVE SUBROUTINE dbt_tas_reshape(matrix_in, matrix_out, summation, transposed, move_data)
      83              :       TYPE(dbt_tas_type), INTENT(INOUT)                  :: matrix_in, matrix_out
      84              :       LOGICAL, INTENT(IN), OPTIONAL                      :: summation, transposed, move_data
      85              : 
      86              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'dbt_tas_reshape'
      87              : 
      88              :       INTEGER                                            :: a, b, bcount, handle, handle2, iproc, &
      89              :                                                             nblk, nblk_per_thread, ndata, numnodes
      90       197498 :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:, :)  :: blks_to_allocate, index_recv
      91              :       INTEGER(KIND=int_8), DIMENSION(2)                  :: blk_index
      92              :       INTEGER(kind=omp_lock_kind), ALLOCATABLE, &
      93       197498 :          DIMENSION(:)                                    :: locks
      94       197498 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: num_blocks_recv, num_blocks_send, &
      95       197498 :                                                             num_entries_recv, num_entries_send, &
      96       197498 :                                                             num_rec, num_send
      97              :       INTEGER, DIMENSION(2)                              :: blk_size
      98              :       LOGICAL                                            :: move_prv, tr_in
      99       197498 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: block
     100       197498 :       TYPE(dbt_buffer_type), ALLOCATABLE, DIMENSION(:)   :: buffer_recv, buffer_send
     101              :       TYPE(dbt_tas_iterator)                             :: iter
     102      1184988 :       TYPE(dbt_tas_split_info)                           :: info
     103              :       TYPE(mp_comm_type)                                 :: mp_comm
     104              :       TYPE(mp_request_type), ALLOCATABLE, &
     105       197498 :          DIMENSION(:, :)                                 :: req_array
     106              : 
     107       197498 :       CALL timeset(routineN, handle)
     108              : 
     109       197498 :       IF (PRESENT(summation)) THEN
     110        69710 :          IF (.NOT. summation) CALL dbm_clear(matrix_out%matrix)
     111              :       ELSE
     112       127788 :          CALL dbm_clear(matrix_out%matrix)
     113              :       END IF
     114              : 
     115       197498 :       IF (PRESENT(move_data)) THEN
     116       197498 :          move_prv = move_data
     117              :       ELSE
     118              :          move_prv = .FALSE.
     119              :       END IF
     120              : 
     121       197498 :       IF (PRESENT(transposed)) THEN
     122       197498 :          tr_in = transposed
     123              :       ELSE
     124            0 :          tr_in = .FALSE.
     125              :       END IF
     126              : 
     127       197498 :       IF (.NOT. matrix_out%valid) THEN
     128            0 :          CPABORT("can not reshape into invalid matrix")
     129              :       END IF
     130              : 
     131       197498 :       info = dbt_tas_info(matrix_in)
     132       197498 :       mp_comm = info%mp_comm
     133       197498 :       numnodes = mp_comm%num_pe
     134       913996 :       ALLOCATE (buffer_send(0:numnodes - 1))
     135       716498 :       ALLOCATE (buffer_recv(0:numnodes - 1))
     136       592494 :       ALLOCATE (num_blocks_recv(0:numnodes - 1))
     137       394996 :       ALLOCATE (num_blocks_send(0:numnodes - 1))
     138       394996 :       ALLOCATE (num_entries_recv(0:numnodes - 1))
     139       394996 :       ALLOCATE (num_entries_send(0:numnodes - 1))
     140       592494 :       ALLOCATE (num_rec(0:2*numnodes - 1))
     141      1038000 :       ALLOCATE (num_send(0:2*numnodes - 1), SOURCE=0)
     142      2668494 :       ALLOCATE (req_array(1:numnodes, 4))
     143       394996 :       ALLOCATE (locks(0:numnodes - 1))
     144       519000 :       DO iproc = 0, numnodes - 1
     145       519000 :          CALL omp_init_lock(locks(iproc))
     146              :       END DO
     147              : 
     148       197498 :       CALL timeset(routineN//"_get_coord", handle2)
     149              : !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,tr_in,num_send,locks) &
     150       197498 : !$OMP PRIVATE(iter,blk_index,blk_size,iproc)
     151              :       CALL dbt_tas_iterator_start(iter, matrix_in)
     152              :       DO WHILE (dbt_tas_iterator_blocks_left(iter))
     153              :          CALL dbt_tas_iterator_next_block(iter, blk_index(1), blk_index(2), &
     154              :                                           row_size=blk_size(1), col_size=blk_size(2))
     155              :          IF (tr_in) THEN
     156              :             CALL dbt_tas_get_stored_coordinates(matrix_out, blk_index(2), blk_index(1), iproc)
     157              :          ELSE
     158              :             CALL dbt_tas_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), iproc)
     159              :          END IF
     160              :          CALL omp_set_lock(locks(iproc))
     161              :          num_send(2*iproc) = num_send(2*iproc) + PRODUCT(blk_size)
     162              :          num_send(2*iproc + 1) = num_send(2*iproc + 1) + 1
     163              :          CALL omp_unset_lock(locks(iproc))
     164              :       END DO
     165              :       CALL dbt_tas_iterator_stop(iter)
     166              : !$OMP END PARALLEL
     167       197498 :       CALL timestop(handle2)
     168              : 
     169       197498 :       CALL timeset(routineN//"_alltoall", handle2)
     170       197498 :       CALL mp_comm%alltoall(num_send, num_rec, 2)
     171       197498 :       CALL timestop(handle2)
     172              : 
     173       197498 :       CALL timeset(routineN//"_buffer_fill", handle2)
     174       519000 :       DO iproc = 0, numnodes - 1
     175       321502 :          num_entries_recv(iproc) = num_rec(2*iproc)
     176       321502 :          num_blocks_recv(iproc) = num_rec(2*iproc + 1)
     177       321502 :          num_entries_send(iproc) = num_send(2*iproc)
     178       321502 :          num_blocks_send(iproc) = num_send(2*iproc + 1)
     179              : 
     180       321502 :          CALL dbt_buffer_create(buffer_send(iproc), num_blocks_send(iproc), num_entries_send(iproc))
     181       519000 :          CALL dbt_buffer_create(buffer_recv(iproc), num_blocks_recv(iproc), num_entries_recv(iproc))
     182              :       END DO
     183              : 
     184              : !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,tr_in,buffer_send,locks) &
     185       197498 : !$OMP PRIVATE(iter,blk_index,blk_size,block,iproc)
     186              :       CALL dbt_tas_iterator_start(iter, matrix_in)
     187              :       DO WHILE (dbt_tas_iterator_blocks_left(iter))
     188              :          CALL dbt_tas_iterator_next_block(iter, blk_index(1), blk_index(2), block, &
     189              :                                           row_size=blk_size(1), col_size=blk_size(2))
     190              :          IF (tr_in) THEN
     191              :             CALL dbt_tas_get_stored_coordinates(matrix_out, blk_index(2), blk_index(1), iproc)
     192              :          ELSE
     193              :             CALL dbt_tas_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), iproc)
     194              :          END IF
     195              :          CALL omp_set_lock(locks(iproc))
     196              :          CALL dbt_buffer_add_block(buffer_send(iproc), blk_index, block, transposed=tr_in)
     197              :          CALL omp_unset_lock(locks(iproc))
     198              :       END DO
     199              :       CALL dbt_tas_iterator_stop(iter)
     200              : !$OMP END PARALLEL
     201              : 
     202       197498 :       IF (move_prv) CALL dbt_tas_clear(matrix_in)
     203       197498 :       CALL timestop(handle2)
     204              : 
     205       197498 :       CALL timeset(routineN//"_communicate_buffer", handle2)
     206       197498 :       CALL dbt_tas_communicate_buffer(mp_comm, buffer_recv, buffer_send, req_array)
     207              : 
     208       519000 :       DO iproc = 0, numnodes - 1
     209       321502 :          CALL dbt_buffer_destroy(buffer_send(iproc))
     210       519000 :          CALL omp_destroy_lock(locks(iproc))
     211              :       END DO
     212       197498 :       DEALLOCATE (locks)
     213              : 
     214       197498 :       CALL timestop(handle2)
     215              : 
     216       197498 :       CALL timeset(routineN//"_buffer_obtain", handle2)
     217              : 
     218              :       ! Parallel unpack of received blocks.
     219       519000 :       nblk = SUM(num_blocks_recv)
     220       571769 :       ALLOCATE (blks_to_allocate(nblk, 2))
     221              : 
     222       197498 :       bcount = 0
     223       519000 :       DO iproc = 0, numnodes - 1
     224       321502 :          CALL dbt_buffer_get_index(buffer_recv(iproc), index_recv)
     225      5973378 :          blks_to_allocate(bcount + 1:bcount + SIZE(index_recv, 1), :) = index_recv(:, :)
     226       321502 :          bcount = bcount + SIZE(index_recv, 1)
     227       840502 :          DEALLOCATE (index_recv)
     228              :       END DO
     229              : 
     230              : !TODO: Parallelize creation of block list.
     231       197498 : !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_out,nblk,blks_to_allocate) PRIVATE(nblk_per_thread,A,b)
     232              :       nblk_per_thread = nblk/omp_get_num_threads() + 1
     233              :       a = omp_get_thread_num()*nblk_per_thread + 1
     234              :       b = MIN(a + nblk_per_thread, nblk)
     235              :       CALL dbt_tas_reserve_blocks(matrix_out, blks_to_allocate(a:b, 1), blks_to_allocate(a:b, 2))
     236              : !$OMP END PARALLEL
     237       197498 :       DEALLOCATE (blks_to_allocate)
     238              : 
     239              : !$OMP PARALLEL DEFAULT(NONE) SHARED(buffer_recv,matrix_out,numnodes,summation) &
     240       197498 : !$OMP PRIVATE(iproc,ndata,blk_index,blk_size,block)
     241              : !$OMP DO SCHEDULE(DYNAMIC)
     242              :       DO iproc = 0, numnodes - 1
     243              :          ! First, we need to get the index to create block
     244              :          DO WHILE (dbt_buffer_blocks_left(buffer_recv(iproc)))
     245              :             CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index)
     246              :             CALL dbt_tas_blk_sizes(matrix_out, blk_index(1), blk_index(2), blk_size(1), blk_size(2))
     247              :             ALLOCATE (block(blk_size(1), blk_size(2)))
     248              :             CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index, block)
     249              :             CALL dbt_tas_put_block(matrix_out, blk_index(1), blk_index(2), block, summation=summation)
     250              :             DEALLOCATE (block)
     251              :          END DO
     252              :          CALL dbt_buffer_destroy(buffer_recv(iproc))
     253              :       END DO
     254              : !$OMP END DO
     255              : !$OMP END PARALLEL
     256              : 
     257       197498 :       CALL timestop(handle2)
     258              : 
     259       197498 :       CALL dbt_tas_finalize(matrix_out)
     260              : 
     261       197498 :       CALL timestop(handle)
     262      2222988 :    END SUBROUTINE dbt_tas_reshape
     263              : 
     264              : ! **************************************************************************************************
     265              : !> \brief Replicate matrix_in such that each submatrix of matrix_out is an exact copy of matrix_in
     266              : !> \param matrix_in ...
     267              : !> \param info ...
     268              : !> \param matrix_out ...
     269              : !> \param nodata Don't copy data but create matrix_out
     270              : !> \param move_data memory optimization: move data to matrix_out such that matrix_in is empty on return
     271              : !> \author Patrick Seewald
     272              : ! **************************************************************************************************
     273      1096140 :    SUBROUTINE dbt_tas_replicate(matrix_in, info, matrix_out, nodata, move_data)
     274              :       TYPE(dbm_type), INTENT(INOUT)                      :: matrix_in
     275              :       TYPE(dbt_tas_split_info), INTENT(IN)               :: info
     276              :       TYPE(dbt_tas_type), INTENT(OUT)                    :: matrix_out
     277              :       LOGICAL, INTENT(IN), OPTIONAL                      :: nodata, move_data
     278              : 
     279              :       INTEGER                                            :: a, b, nblk_per_thread, nblkcols, nblkrows
     280              :       INTEGER, DIMENSION(2)                              :: pdims
     281       365380 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
     282       182690 :                                                             row_dist
     283              :       TYPE(dbm_distribution_obj)                         :: dbm_dist
     284       182690 :       TYPE(dbt_tas_dist_arb), TARGET                     :: dir_dist
     285       182690 :       TYPE(dbt_tas_dist_repl), TARGET                    :: repl_dist
     286              : 
     287       365380 :       CLASS(dbt_tas_distribution), ALLOCATABLE :: col_dist_obj, row_dist_obj
     288       365380 :       CLASS(dbt_tas_rowcol_data), ALLOCATABLE :: row_bsize_obj, col_bsize_obj
     289       182690 :       TYPE(dbt_tas_blk_size_repl), TARGET :: repl_blksize
     290       182690 :       TYPE(dbt_tas_blk_size_arb), TARGET :: dir_blksize
     291       913450 :       TYPE(dbt_tas_distribution_type) :: dist
     292              :       INTEGER :: numnodes, ngroup, max_threads, cache_idx
     293       182690 :       INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
     294       182690 :       TYPE(dbt_buffer_type), ALLOCATABLE, DIMENSION(:) :: buffer_recv, buffer_send
     295       182690 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: num_blocks_recv, num_blocks_send, &
     296       182690 :                                                             num_entries_recv, num_entries_send, &
     297       182690 :                                                             num_rec, num_send
     298       182690 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:, :) :: req_array
     299       182690 :       INTEGER, ALLOCATABLE, DIMENSION(:, :) :: blks_to_allocate
     300              :       INTEGER, DIMENSION(2) :: blk_size
     301              :       INTEGER, DIMENSION(2) :: blk_index
     302              :       INTEGER(KIND=int_8), DIMENSION(2) :: blk_index_i8
     303              :       TYPE(dbm_iterator) :: iter
     304              :       INTEGER :: i, iproc, bcount, nblk
     305       182690 :       INTEGER, ALLOCATABLE, DIMENSION(:, :) :: iprocs
     306              :       LOGICAL :: nodata_prv, move_prv
     307       182690 :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:, :) :: index_recv
     308              :       INTEGER :: ndata
     309       182690 :       TYPE(mp_cart_type) :: mp_comm
     310              : 
     311       182690 :       REAL(KIND=dp), DIMENSION(:, :), POINTER :: block
     312              : 
     313              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbt_tas_replicate'
     314              : 
     315              :       INTEGER :: handle, handle2
     316              : 
     317       182690 :       NULLIFY (col_blk_size, row_blk_size)
     318              : 
     319       182690 :       CALL timeset(routineN, handle)
     320              : 
     321       182690 :       IF (PRESENT(nodata)) THEN
     322        56720 :          nodata_prv = nodata
     323              :       ELSE
     324              :          nodata_prv = .FALSE.
     325              :       END IF
     326              : 
     327       182690 :       IF (PRESENT(move_data)) THEN
     328       125970 :          move_prv = move_data
     329              :       ELSE
     330              :          move_prv = .FALSE.
     331              :       END IF
     332              : 
     333       182690 :       row_blk_size => dbm_get_row_block_sizes(matrix_in)
     334       182690 :       col_blk_size => dbm_get_col_block_sizes(matrix_in)
     335       182690 :       nblkrows = SIZE(row_blk_size)
     336       182690 :       nblkcols = SIZE(col_blk_size)
     337       182690 :       dbm_dist = dbm_get_distribution(matrix_in)
     338       182690 :       row_dist => dbm_distribution_row_dist(dbm_dist)
     339       182690 :       col_dist => dbm_distribution_col_dist(dbm_dist)
     340              : 
     341       182690 :       mp_comm = info%mp_comm
     342       182690 :       ngroup = info%ngroup
     343              : 
     344       182690 :       numnodes = mp_comm%num_pe
     345       548070 :       pdims = mp_comm%num_pe_cart
     346              : 
     347       309548 :       SELECT CASE (info%split_rowcol)
     348              :       CASE (rowsplit)
     349       126858 :          repl_dist = dbt_tas_dist_repl(row_dist, pdims(1), nblkrows, info%ngroup, info%pgrid_split_size)
     350       126858 :          dir_dist = dbt_tas_dist_arb(col_dist, pdims(2), INT(nblkcols, KIND=int_8))
     351       126858 :          repl_blksize = dbt_tas_blk_size_repl(row_blk_size, info%ngroup)
     352       126858 :          dir_blksize = dbt_tas_blk_size_arb(col_blk_size)
     353       126858 :          ALLOCATE (row_dist_obj, source=repl_dist)
     354       126858 :          ALLOCATE (col_dist_obj, source=dir_dist)
     355       126858 :          ALLOCATE (row_bsize_obj, source=repl_blksize)
     356       253716 :          ALLOCATE (col_bsize_obj, source=dir_blksize)
     357              :       CASE (colsplit)
     358        55832 :          dir_dist = dbt_tas_dist_arb(row_dist, pdims(1), INT(nblkrows, KIND=int_8))
     359        55832 :          repl_dist = dbt_tas_dist_repl(col_dist, pdims(2), nblkcols, info%ngroup, info%pgrid_split_size)
     360        55832 :          dir_blksize = dbt_tas_blk_size_arb(row_blk_size)
     361        55832 :          repl_blksize = dbt_tas_blk_size_repl(col_blk_size, info%ngroup)
     362        55832 :          ALLOCATE (row_dist_obj, source=dir_dist)
     363        55832 :          ALLOCATE (col_dist_obj, source=repl_dist)
     364        55832 :          ALLOCATE (row_bsize_obj, source=dir_blksize)
     365       857618 :          ALLOCATE (col_bsize_obj, source=repl_blksize)
     366              :       END SELECT
     367              : 
     368       182690 :       CALL dbt_tas_distribution_new(dist, mp_comm, row_dist_obj, col_dist_obj, split_info=info)
     369              :       CALL dbt_tas_create(matrix_out, TRIM(dbm_get_name(matrix_in))//" replicated", &
     370       182690 :                           dist, row_bsize_obj, col_bsize_obj, own_dist=.TRUE.)
     371              : 
     372       182690 :       IF (nodata_prv) THEN
     373        56720 :          CALL dbt_tas_finalize(matrix_out)
     374        56720 :          CALL timestop(handle)
     375        56720 :          RETURN
     376              :       END IF
     377              : 
     378       582188 :       ALLOCATE (buffer_send(0:numnodes - 1))
     379       456218 :       ALLOCATE (buffer_recv(0:numnodes - 1))
     380       377910 :       ALLOCATE (num_blocks_recv(0:numnodes - 1))
     381       251940 :       ALLOCATE (num_blocks_send(0:numnodes - 1))
     382       251940 :       ALLOCATE (num_entries_recv(0:numnodes - 1))
     383       251940 :       ALLOCATE (num_entries_send(0:numnodes - 1))
     384       377910 :       ALLOCATE (num_rec(0:2*numnodes - 1))
     385       660496 :       ALLOCATE (num_send(0:2*numnodes - 1), SOURCE=0)
     386      1698902 :       ALLOCATE (req_array(1:numnodes, 4))
     387       251940 :       ALLOCATE (locks(0:numnodes - 1))
     388       125970 :       max_threads = 1
     389       125970 : !$    max_threads = omp_get_max_threads()
     390       503880 :       ALLOCATE (iprocs(ngroup, max_threads))
     391       330248 :       DO iproc = 0, numnodes - 1
     392       330248 :          CALL omp_init_lock(locks(iproc))
     393              :       END DO
     394              : 
     395              : !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,num_send,ngroup,iprocs,locks) &
     396       125970 : !$OMP PRIVATE(iter,blk_index,blk_size,cache_idx,i,iproc)
     397              :       cache_idx = omp_get_thread_num() + 1
     398              :       CALL dbm_iterator_start(iter, matrix_in)
     399              :       DO WHILE (dbm_iterator_blocks_left(iter))
     400              :          CALL dbm_iterator_next_block(iter, blk_index(1), blk_index(2), &
     401              :                                       row_size=blk_size(1), col_size=blk_size(2))
     402              :          CALL dbt_repl_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), &
     403              :                                               iprocs(:, cache_idx))
     404              :          DO i = 1, ngroup
     405              :             iproc = iprocs(i, cache_idx)
     406              :             CALL omp_set_lock(locks(iproc))
     407              :             num_send(2*iproc) = num_send(2*iproc) + PRODUCT(blk_size)
     408              :             num_send(2*iproc + 1) = num_send(2*iproc + 1) + 1
     409              :             CALL omp_unset_lock(locks(iproc))
     410              :          END DO
     411              :       END DO
     412              :       CALL dbm_iterator_stop(iter)
     413              : !$OMP END PARALLEL
     414              : 
     415       125970 :       CALL timeset(routineN//"_alltoall", handle2)
     416       125970 :       CALL mp_comm%alltoall(num_send, num_rec, 2)
     417       125970 :       CALL timestop(handle2)
     418              : 
     419       330248 :       DO iproc = 0, numnodes - 1
     420       204278 :          num_entries_recv(iproc) = num_rec(2*iproc)
     421       204278 :          num_blocks_recv(iproc) = num_rec(2*iproc + 1)
     422       204278 :          num_entries_send(iproc) = num_send(2*iproc)
     423       204278 :          num_blocks_send(iproc) = num_send(2*iproc + 1)
     424              : 
     425       204278 :          CALL dbt_buffer_create(buffer_send(iproc), num_blocks_send(iproc), num_entries_send(iproc))
     426       330248 :          CALL dbt_buffer_create(buffer_recv(iproc), num_blocks_recv(iproc), num_entries_recv(iproc))
     427              :       END DO
     428              : 
     429              : !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,buffer_send,locks,ngroup,iprocs) &
     430       125970 : !$OMP PRIVATE(iter,blk_index,blk_size,block,cache_idx,i,iproc)
     431              :       cache_idx = omp_get_thread_num() + 1
     432              :       CALL dbm_iterator_start(iter, matrix_in)
     433              :       DO WHILE (dbm_iterator_blocks_left(iter))
     434              :          CALL dbm_iterator_next_block(iter, blk_index(1), blk_index(2), block, &
     435              :                                       row_size=blk_size(1), col_size=blk_size(2))
     436              :          CALL dbt_repl_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), &
     437              :                                               iprocs(:, cache_idx))
     438              :          DO i = 1, ngroup
     439              :             iproc = iprocs(i, cache_idx)
     440              :             CALL omp_set_lock(locks(iproc))
     441              :             CALL dbt_buffer_add_block(buffer_send(iproc), INT(blk_index, KIND=int_8), block)
     442              :             CALL omp_unset_lock(locks(iproc))
     443              :          END DO
     444              :       END DO
     445              :       CALL dbm_iterator_stop(iter)
     446              : !$OMP END PARALLEL
     447              : 
     448       125970 :       DEALLOCATE (iprocs)
     449              : 
     450       125970 :       IF (move_prv) CALL dbm_clear(matrix_in)
     451              : 
     452       125970 :       CALL timeset(routineN//"_communicate_buffer", handle2)
     453       125970 :       CALL dbt_tas_communicate_buffer(mp_comm, buffer_recv, buffer_send, req_array)
     454              : 
     455       330248 :       DO iproc = 0, numnodes - 1
     456       204278 :          CALL dbt_buffer_destroy(buffer_send(iproc))
     457       330248 :          CALL omp_destroy_lock(locks(iproc))
     458              :       END DO
     459       125970 :       DEALLOCATE (locks)
     460              : 
     461       125970 :       CALL timestop(handle2)
     462              : 
     463              :       ! Parallel unpack of received blocks.
     464       330248 :       nblk = SUM(num_blocks_recv)
     465       377528 :       ALLOCATE (blks_to_allocate(nblk, 2))
     466              : 
     467       125970 :       bcount = 0
     468       330248 :       DO iproc = 0, numnodes - 1
     469       204278 :          CALL dbt_buffer_get_index(buffer_recv(iproc), index_recv)
     470      4796132 :          blks_to_allocate(bcount + 1:bcount + SIZE(index_recv, 1), :) = INT(index_recv(:, :))
     471       204278 :          bcount = bcount + SIZE(index_recv, 1)
     472       534526 :          DEALLOCATE (index_recv)
     473              :       END DO
     474              : 
     475              : !TODO: Parallelize creation of block list.
     476       125970 : !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_out,nblk,blks_to_allocate) PRIVATE(nblk_per_thread,A,b)
     477              :       nblk_per_thread = nblk/omp_get_num_threads() + 1
     478              :       a = omp_get_thread_num()*nblk_per_thread + 1
     479              :       b = MIN(a + nblk_per_thread, nblk)
     480              :       CALL dbm_reserve_blocks(matrix_out%matrix, blks_to_allocate(a:b, 1), blks_to_allocate(a:b, 2))
     481              : !$OMP END PARALLEL
     482       125970 :       DEALLOCATE (blks_to_allocate)
     483              : 
     484              : !$OMP PARALLEL DEFAULT(NONE) SHARED(buffer_recv,matrix_out,numnodes) &
     485       125970 : !$OMP PRIVATE(iproc,ndata,blk_index_i8,blk_size,block)
     486              : !$OMP DO SCHEDULE(DYNAMIC)
     487              :       DO iproc = 0, numnodes - 1
     488              :          ! First, we need to get the index to create block
     489              :          DO WHILE (dbt_buffer_blocks_left(buffer_recv(iproc)))
     490              :             CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index_i8)
     491              :             CALL dbt_tas_blk_sizes(matrix_out, blk_index_i8(1), blk_index_i8(2), blk_size(1), blk_size(2))
     492              :             ALLOCATE (block(blk_size(1), blk_size(2)))
     493              :             CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index_i8, block)
     494              :             CALL dbm_put_block(matrix_out%matrix, INT(blk_index_i8(1)), INT(blk_index_i8(2)), block)
     495              :             DEALLOCATE (block)
     496              :          END DO
     497              : 
     498              :          CALL dbt_buffer_destroy(buffer_recv(iproc))
     499              :       END DO
     500              : !$OMP END DO
     501              : !$OMP END PARALLEL
     502              : 
     503       125970 :       CALL dbt_tas_finalize(matrix_out)
     504              : 
     505       125970 :       CALL timestop(handle)
     506              : 
     507      2122016 :    END SUBROUTINE dbt_tas_replicate
     508              : 
     509              : ! **************************************************************************************************
     510              : !> \brief Merge submatrices of matrix_in to matrix_out by sum
     511              : !> \param matrix_out ...
     512              : !> \param matrix_in ...
     513              : !> \param summation ...
     514              : !> \param move_data memory optimization: move data to matrix_out such that matrix_in is empty on return
     515              : !> \author Patrick Seewald
     516              : ! **************************************************************************************************
     517        56720 :    SUBROUTINE dbt_tas_merge(matrix_out, matrix_in, summation, move_data)
     518              :       TYPE(dbm_type), INTENT(INOUT)                      :: matrix_out
     519              :       TYPE(dbt_tas_type), INTENT(INOUT)                  :: matrix_in
     520              :       LOGICAL, INTENT(IN), OPTIONAL                      :: summation, move_data
     521              : 
     522              :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'dbt_tas_merge'
     523              : 
     524              :       INTEGER                                            :: a, b, bcount, handle, handle2, iproc, &
     525              :                                                             nblk, nblk_per_thread, ndata, numnodes
     526        56720 :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:, :)  :: index_recv
     527              :       INTEGER(KIND=int_8), DIMENSION(2)                  :: blk_index_i8
     528              :       INTEGER(kind=omp_lock_kind), ALLOCATABLE, &
     529        56720 :          DIMENSION(:)                                    :: locks
     530        56720 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: num_blocks_recv, num_blocks_send, &
     531        56720 :                                                             num_entries_recv, num_entries_send, &
     532        56720 :                                                             num_rec, num_send
     533        56720 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: blks_to_allocate
     534              :       INTEGER, DIMENSION(2)                              :: blk_index, blk_size
     535       113440 :       INTEGER, DIMENSION(:), POINTER                     :: col_block_sizes, row_block_sizes
     536              :       LOGICAL                                            :: move_prv
     537        56720 :       REAL(dp), DIMENSION(:, :), POINTER                 :: block
     538              :       TYPE(dbm_iterator)                                 :: iter
     539        56720 :       TYPE(dbt_buffer_type), ALLOCATABLE, DIMENSION(:)   :: buffer_recv, buffer_send
     540       340320 :       TYPE(dbt_tas_split_info)                           :: info
     541        56720 :       TYPE(mp_cart_type)                                 :: mp_comm
     542              :       TYPE(mp_request_type), ALLOCATABLE, &
     543        56720 :          DIMENSION(:, :)                                 :: req_array
     544              : 
     545        56720 :       CALL timeset(routineN, handle)
     546              : 
     547        56720 :       IF (PRESENT(summation)) THEN
     548            0 :          IF (.NOT. summation) CALL dbm_clear(matrix_out)
     549              :       ELSE
     550        56720 :          CALL dbm_clear(matrix_out)
     551              :       END IF
     552              : 
     553        56720 :       IF (PRESENT(move_data)) THEN
     554        56720 :          move_prv = move_data
     555              :       ELSE
     556              :          move_prv = .FALSE.
     557              :       END IF
     558              : 
     559        56720 :       info = dbt_tas_info(matrix_in)
     560        56720 :       CALL dbt_tas_get_split_info(info, mp_comm=mp_comm)
     561        56720 :       numnodes = mp_comm%num_pe
     562              : 
     563       269888 :       ALLOCATE (buffer_send(0:numnodes - 1))
     564       213168 :       ALLOCATE (buffer_recv(0:numnodes - 1))
     565       170160 :       ALLOCATE (num_blocks_recv(0:numnodes - 1))
     566       113440 :       ALLOCATE (num_blocks_send(0:numnodes - 1))
     567       113440 :       ALLOCATE (num_entries_recv(0:numnodes - 1))
     568       113440 :       ALLOCATE (num_entries_send(0:numnodes - 1))
     569       170160 :       ALLOCATE (num_rec(0:2*numnodes - 1))
     570       312896 :       ALLOCATE (num_send(0:2*numnodes - 1), SOURCE=0)
     571       795952 :       ALLOCATE (req_array(1:numnodes, 4))
     572       113440 :       ALLOCATE (locks(0:numnodes - 1))
     573       156448 :       DO iproc = 0, numnodes - 1
     574       156448 :          CALL omp_init_lock(locks(iproc))
     575              :       END DO
     576              : 
     577              : !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,num_send,locks) &
     578        56720 : !$OMP PRIVATE(iter,blk_index,blk_size,iproc)
     579              :       CALL dbm_iterator_start(iter, matrix_in%matrix)
     580              :       DO WHILE (dbm_iterator_blocks_left(iter))
     581              :          CALL dbm_iterator_next_block(iter, blk_index(1), blk_index(2), &
     582              :                                       row_size=blk_size(1), col_size=blk_size(2))
     583              :          CALL dbm_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), iproc)
     584              :          CALL omp_set_lock(locks(iproc))
     585              :          num_send(2*iproc) = num_send(2*iproc) + PRODUCT(blk_size)
     586              :          num_send(2*iproc + 1) = num_send(2*iproc + 1) + 1
     587              :          CALL omp_unset_lock(locks(iproc))
     588              :       END DO
     589              :       CALL dbm_iterator_stop(iter)
     590              : !$OMP END PARALLEL
     591              : 
     592        56720 :       CALL timeset(routineN//"_alltoall", handle2)
     593        56720 :       CALL mp_comm%alltoall(num_send, num_rec, 2)
     594        56720 :       CALL timestop(handle2)
     595              : 
     596       156448 :       DO iproc = 0, numnodes - 1
     597        99728 :          num_entries_recv(iproc) = num_rec(2*iproc)
     598        99728 :          num_blocks_recv(iproc) = num_rec(2*iproc + 1)
     599        99728 :          num_entries_send(iproc) = num_send(2*iproc)
     600        99728 :          num_blocks_send(iproc) = num_send(2*iproc + 1)
     601              : 
     602        99728 :          CALL dbt_buffer_create(buffer_send(iproc), num_blocks_send(iproc), num_entries_send(iproc))
     603       156448 :          CALL dbt_buffer_create(buffer_recv(iproc), num_blocks_recv(iproc), num_entries_recv(iproc))
     604              :       END DO
     605              : 
     606              : !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,buffer_send,locks) &
     607        56720 : !$OMP PRIVATE(iter,blk_index,blk_size,block,iproc)
     608              :       CALL dbm_iterator_start(iter, matrix_in%matrix)
     609              :       DO WHILE (dbm_iterator_blocks_left(iter))
     610              :          CALL dbm_iterator_next_block(iter, blk_index(1), blk_index(2), block, &
     611              :                                       row_size=blk_size(1), col_size=blk_size(2))
     612              :          CALL dbm_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), iproc)
     613              :          CALL omp_set_lock(locks(iproc))
     614              :          CALL dbt_buffer_add_block(buffer_send(iproc), INT(blk_index, KIND=int_8), block)
     615              :          CALL omp_unset_lock(locks(iproc))
     616              :       END DO
     617              :       CALL dbm_iterator_stop(iter)
     618              : !$OMP END PARALLEL
     619              : 
     620        56720 :       IF (move_prv) CALL dbt_tas_clear(matrix_in)
     621              : 
     622        56720 :       CALL timeset(routineN//"_communicate_buffer", handle2)
     623        56720 :       CALL dbt_tas_communicate_buffer(mp_comm, buffer_recv, buffer_send, req_array)
     624              : 
     625       156448 :       DO iproc = 0, numnodes - 1
     626        99728 :          CALL dbt_buffer_destroy(buffer_send(iproc))
     627       156448 :          CALL omp_destroy_lock(locks(iproc))
     628              :       END DO
     629        56720 :       DEALLOCATE (locks)
     630              : 
     631        56720 :       CALL timestop(handle2)
     632              : 
     633              :       ! Parallel unpack of received blocks.
     634       156448 :       nblk = SUM(num_blocks_recv)
     635       161177 :       ALLOCATE (blks_to_allocate(nblk, 2))
     636              : 
     637        56720 :       bcount = 0
     638       156448 :       DO iproc = 0, numnodes - 1
     639        99728 :          CALL dbt_buffer_get_index(buffer_recv(iproc), index_recv)
     640      2252752 :          blks_to_allocate(bcount + 1:bcount + SIZE(index_recv, 1), :) = INT(index_recv(:, :))
     641        99728 :          bcount = bcount + SIZE(index_recv, 1)
     642       256176 :          DEALLOCATE (index_recv)
     643              :       END DO
     644              : 
     645              : !TODO: Parallelize creation of block list.
     646        56720 : !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_out,nblk,blks_to_allocate) PRIVATE(nblk_per_thread,A,b)
     647              :       nblk_per_thread = nblk/omp_get_num_threads() + 1
     648              :       a = omp_get_thread_num()*nblk_per_thread + 1
     649              :       b = MIN(a + nblk_per_thread, nblk)
     650              :       CALL dbm_reserve_blocks(matrix_out, blks_to_allocate(a:b, 1), blks_to_allocate(a:b, 2))
     651              : !$OMP END PARALLEL
     652        56720 :       DEALLOCATE (blks_to_allocate)
     653              : 
     654        56720 :       row_block_sizes => dbm_get_row_block_sizes(matrix_out)
     655        56720 :       col_block_sizes => dbm_get_col_block_sizes(matrix_out)
     656              : 
     657              : !$OMP PARALLEL DEFAULT(NONE) SHARED(buffer_recv,matrix_out,numnodes,row_block_sizes,col_block_sizes) &
     658        56720 : !$OMP PRIVATE(iproc,ndata,blk_index_i8,blk_size,block)
     659              : !$OMP DO SCHEDULE(DYNAMIC)
     660              :       DO iproc = 0, numnodes - 1
     661              :          ! First, we need to get the index to create block
     662              :          DO WHILE (dbt_buffer_blocks_left(buffer_recv(iproc)))
     663              :             CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index_i8)
     664              :             blk_size(1) = row_block_sizes(INT(blk_index_i8(1)))
     665              :             blk_size(2) = col_block_sizes(INT(blk_index_i8(2)))
     666              :             ALLOCATE (block(blk_size(1), blk_size(2)))
     667              :             CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index_i8, block)
     668              :             CALL dbm_put_block(matrix_out, INT(blk_index_i8(1)), INT(blk_index_i8(2)), block, summation=.TRUE.)
     669              :             DEALLOCATE (block)
     670              :          END DO
     671              :          CALL dbt_buffer_destroy(buffer_recv(iproc))
     672              :       END DO
     673              : !$OMP END DO
     674              : !$OMP END PARALLEL
     675              : 
     676        56720 :       CALL dbm_finalize(matrix_out)
     677              : 
     678        56720 :       CALL timestop(handle)
     679       483056 :    END SUBROUTINE dbt_tas_merge
     680              : 
     681              : ! **************************************************************************************************
     682              : !> \brief get all indices from buffer
     683              : !> \param buffer ...
     684              : !> \param index ...
     685              : !> \author Patrick Seewald
     686              : ! **************************************************************************************************
     687       625508 :    SUBROUTINE dbt_buffer_get_index(buffer, index)
     688              :       TYPE(dbt_buffer_type), INTENT(IN)                  :: buffer
     689              :       INTEGER(KIND=int_8), ALLOCATABLE, &
     690              :          DIMENSION(:, :), INTENT(OUT)                    :: index
     691              : 
     692              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbt_buffer_get_index'
     693              : 
     694              :       INTEGER                                            :: handle
     695              :       INTEGER, DIMENSION(2)                              :: indx_shape
     696              : 
     697       625508 :       CALL timeset(routineN, handle)
     698              : 
     699      3753048 :       indx_shape = SHAPE(buffer%indx) - [0, 1]
     700      2328827 :       ALLOCATE (INDEX(indx_shape(1), indx_shape(2)))
     701     13022262 :       INDEX(:, :) = buffer%indx(1:indx_shape(1), 1:indx_shape(2))
     702       625508 :       CALL timestop(handle)
     703       625508 :    END SUBROUTINE dbt_buffer_get_index
     704              : 
     705              : ! **************************************************************************************************
     706              : !> \brief how many blocks left in iterator
     707              : !> \param buffer ...
     708              : !> \return ...
     709              : !> \author Patrick Seewald
     710              : ! **************************************************************************************************
     711      6198377 :    PURE FUNCTION dbt_buffer_blocks_left(buffer)
     712              :       TYPE(dbt_buffer_type), INTENT(IN)                  :: buffer
     713              :       LOGICAL                                            :: dbt_buffer_blocks_left
     714              : 
     715      6198377 :       dbt_buffer_blocks_left = buffer%endpos < buffer%nblock
     716      6198377 :    END FUNCTION dbt_buffer_blocks_left
     717              : 
     718              : ! **************************************************************************************************
     719              : !> \brief Create block buffer for MPI communication.
     720              : !> \param buffer block buffer
     721              : !> \param nblock number of blocks
     722              : !> \param ndata total number of block entries
     723              : !> \author Patrick Seewald
     724              : ! **************************************************************************************************
     725      1251016 :    SUBROUTINE dbt_buffer_create(buffer, nblock, ndata)
     726              :       TYPE(dbt_buffer_type), INTENT(OUT)                 :: buffer
     727              :       INTEGER, INTENT(IN)                                :: nblock, ndata
     728              : 
     729      1251016 :       buffer%nblock = nblock
     730      1251016 :       buffer%endpos = 0
     731      3406638 :       ALLOCATE (buffer%msg(ndata))
     732      3406638 :       ALLOCATE (buffer%indx(nblock, 3))
     733      1251016 :    END SUBROUTINE dbt_buffer_create
     734              : 
     735              : ! **************************************************************************************************
     736              : !> \brief ...
     737              : !> \param buffer ...
     738              : !> \author Patrick Seewald
     739              : ! **************************************************************************************************
     740      1251016 :    SUBROUTINE dbt_buffer_destroy(buffer)
     741              :       TYPE(dbt_buffer_type), INTENT(INOUT)               :: buffer
     742              : 
     743      1251016 :       DEALLOCATE (buffer%msg)
     744      1251016 :       DEALLOCATE (buffer%indx)
     745      1251016 :       buffer%nblock = -1
     746      1251016 :       buffer%endpos = -1
     747      1251016 :    END SUBROUTINE dbt_buffer_destroy
     748              : 
     749              : ! **************************************************************************************************
     750              : !> \brief insert a block into block buffer (at current iterator position)
     751              : !> \param buffer ...
     752              : !> \param index index of block
     753              : !> \param block ...
     754              : !> \param transposed ...
     755              : !> \author Patrick Seewald
     756              : ! **************************************************************************************************
     757      5572869 :    SUBROUTINE dbt_buffer_add_block(buffer, index, block, transposed)
     758              :       TYPE(dbt_buffer_type), INTENT(INOUT)               :: buffer
     759              :       INTEGER(KIND=int_8), DIMENSION(2), INTENT(IN)      :: index
     760              :       REAL(dp), DIMENSION(:, :), INTENT(IN)              :: block
     761              :       LOGICAL, INTENT(IN), OPTIONAL                      :: transposed
     762              : 
     763              :       INTEGER                                            :: ndata, p, p_data
     764              :       INTEGER(KIND=int_8), DIMENSION(2)                  :: index_prv
     765              :       LOGICAL                                            :: tr
     766              : 
     767      5572869 :       IF (PRESENT(transposed)) THEN
     768      2504436 :          tr = transposed
     769              :       ELSE
     770              :          tr = .FALSE.
     771              :       END IF
     772              : 
     773      5572869 :       index_prv(:) = INDEX(:)
     774      5572869 :       IF (tr) THEN
     775       831679 :          CALL swap(index_prv)
     776              :       END IF
     777     16718607 :       ndata = PRODUCT(SHAPE(block))
     778              : 
     779      5572869 :       p = buffer%endpos
     780      5572869 :       IF (p == 0) THEN
     781              :          p_data = 0
     782              :       ELSE
     783      5120566 :          p_data = INT(buffer%indx(p, 3))
     784              :       END IF
     785              : 
     786      5572869 :       IF (tr) THEN
     787     90131029 :          buffer%msg(p_data + 1:p_data + ndata) = RESHAPE(TRANSPOSE(block), [ndata])
     788              :       ELSE
     789    731386557 :          buffer%msg(p_data + 1:p_data + ndata) = RESHAPE(block, [ndata])
     790              :       END IF
     791              : 
     792     16718607 :       buffer%indx(p + 1, 1:2) = index_prv(:)
     793      5572869 :       IF (p > 0) THEN
     794      5120566 :          buffer%indx(p + 1, 3) = buffer%indx(p, 3) + INT(ndata, KIND=int_8)
     795              :       ELSE
     796       452303 :          buffer%indx(p + 1, 3) = INT(ndata, KIND=int_8)
     797              :       END IF
     798      5572869 :       buffer%endpos = buffer%endpos + 1
     799      5572869 :    END SUBROUTINE dbt_buffer_add_block
     800              : 
     801              : ! **************************************************************************************************
     802              : !> \brief get next block from buffer. Iterator is advanced only if block is retrieved or advance_iter.
     803              : !> \param buffer ...
     804              : !> \param ndata ...
     805              : !> \param index ...
     806              : !> \param block ...
     807              : !> \param advance_iter ...
     808              : !> \author Patrick Seewald
     809              : ! **************************************************************************************************
     810     11145738 :    SUBROUTINE dbt_buffer_get_next_block(buffer, ndata, index, block, advance_iter)
     811              :       TYPE(dbt_buffer_type), INTENT(INOUT)               :: buffer
     812              :       INTEGER, INTENT(OUT)                               :: ndata
     813              :       INTEGER(KIND=int_8), DIMENSION(2), INTENT(OUT)     :: index
     814              :       REAL(dp), DIMENSION(:, :), INTENT(OUT), OPTIONAL   :: block
     815              :       LOGICAL, INTENT(IN), OPTIONAL                      :: advance_iter
     816              : 
     817              :       INTEGER                                            :: p, p_data
     818              :       LOGICAL                                            :: do_advance
     819              : 
     820     11145738 :       do_advance = .FALSE.
     821     11145738 :       IF (PRESENT(advance_iter)) THEN
     822            0 :          do_advance = advance_iter
     823     11145738 :       ELSE IF (PRESENT(block)) THEN
     824      5572869 :          do_advance = .TRUE.
     825              :       END IF
     826              : 
     827     11145738 :       p = buffer%endpos
     828     11145738 :       IF (p == 0) THEN
     829              :          p_data = 0
     830              :       ELSE
     831     10241132 :          p_data = INT(buffer%indx(p, 3))
     832              :       END IF
     833              : 
     834     10241132 :       IF (p > 0) THEN
     835     10241132 :          ndata = INT(buffer%indx(p + 1, 3) - buffer%indx(p, 3))
     836              :       ELSE
     837       904606 :          ndata = INT(buffer%indx(p + 1, 3))
     838              :       END IF
     839     33437214 :       INDEX(:) = buffer%indx(p + 1, 1:2)
     840              : 
     841     11145738 :       IF (PRESENT(block)) THEN
     842     16718607 :          block(:, :) = RESHAPE(buffer%msg(p_data + 1:p_data + ndata), SHAPE(block))
     843              :       END IF
     844              : 
     845     11145738 :       IF (do_advance) buffer%endpos = buffer%endpos + 1
     846     11145738 :    END SUBROUTINE dbt_buffer_get_next_block
     847              : 
     848              : ! **************************************************************************************************
     849              : !> \brief communicate buffer
     850              : !> \param mp_comm ...
     851              : !> \param buffer_recv ...
     852              : !> \param buffer_send ...
     853              : !> \param req_array ...
     854              : !> \author Patrick Seewald
     855              : ! **************************************************************************************************
     856      4402972 :    SUBROUTINE dbt_tas_communicate_buffer(mp_comm, buffer_recv, buffer_send, req_array)
     857              :       CLASS(mp_comm_type), INTENT(IN)                     :: mp_comm
     858              :       TYPE(dbt_buffer_type), DIMENSION(0:), &
     859              :          INTENT(INOUT)                                   :: buffer_recv, buffer_send
     860              :       TYPE(mp_request_type), DIMENSION(:, :), &
     861              :          INTENT(OUT)                                     :: req_array
     862              : 
     863              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbt_tas_communicate_buffer'
     864              : 
     865              :       INTEGER                                            :: handle, iproc, numnodes, &
     866              :                                                             rec_counter, send_counter
     867              : 
     868       380188 :       CALL timeset(routineN, handle)
     869       380188 :       numnodes = mp_comm%num_pe
     870              : 
     871       380188 :       IF (numnodes > 1) THEN
     872              : 
     873       245320 :          send_counter = 0
     874       245320 :          rec_counter = 0
     875              : 
     876       735960 :          DO iproc = 0, numnodes - 1
     877       735960 :             IF (buffer_recv(iproc)%nblock > 0) THEN
     878       335443 :                rec_counter = rec_counter + 1
     879       335443 :                CALL mp_comm%irecv(buffer_recv(iproc)%indx, iproc, req_array(rec_counter, 3), tag=4)
     880       335443 :                CALL mp_comm%irecv(buffer_recv(iproc)%msg, iproc, req_array(rec_counter, 4), tag=7)
     881              :             END IF
     882              :          END DO
     883              : 
     884       735960 :          DO iproc = 0, numnodes - 1
     885       735960 :             IF (buffer_send(iproc)%nblock > 0) THEN
     886       335443 :                send_counter = send_counter + 1
     887       335443 :                CALL mp_comm%isend(buffer_send(iproc)%indx, iproc, req_array(send_counter, 1), tag=4)
     888       335443 :                CALL mp_comm%isend(buffer_send(iproc)%msg, iproc, req_array(send_counter, 2), tag=7)
     889              :             END IF
     890              :          END DO
     891              : 
     892       245320 :          IF (send_counter > 0) THEN
     893       213048 :             CALL mp_waitall(req_array(1:send_counter, 1:2))
     894              :          END IF
     895       245320 :          IF (rec_counter > 0) THEN
     896       233238 :             CALL mp_waitall(req_array(1:rec_counter, 3:4))
     897              :          END IF
     898              : 
     899              :       ELSE
     900       134868 :          IF (buffer_recv(0)%nblock > 0) THEN
     901      5166172 :             buffer_recv(0)%indx(:, :) = buffer_send(0)%indx(:, :)
     902    463201493 :             buffer_recv(0)%msg(:) = buffer_send(0)%msg(:)
     903              :          END IF
     904              :       END IF
     905       380188 :       CALL timestop(handle)
     906       380188 :    END SUBROUTINE dbt_tas_communicate_buffer
     907              : 
     908       254218 : END MODULE dbt_tas_reshape_ops
        

Generated by: LCOV version 2.0-1