LCOV - code coverage report
Current view: top level - src - smeagol_matrix_utils.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:936074a) Lines: 74.7 % 708 529
Test Date: 2025-12-04 06:27:48 Functions: 82.4 % 17 14

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief Routines to convert sparse matrices between DBCSR (distributed-blocks compressed sparse rows)
      10              : !>        and SIESTA (distributed compressed sparse columns) formats.
      11              : !> \author Sergey Chulkov
      12              : !> \author Christian Ahart
      13              : !> \author Clotilde Cucinotta
      14              : ! **************************************************************************************************
      15              : MODULE smeagol_matrix_utils
      16              :    USE cell_types, ONLY: cell_type, &
      17              :                          real_to_scaled, &
      18              :                          scaled_to_real
      19              :    USE cp_dbcsr_api, ONLY: dbcsr_get_block_p, &
      20              :                            dbcsr_get_info, &
      21              :                            dbcsr_p_type, &
      22              :                            dbcsr_set
      23              :    USE kinds, ONLY: dp, &
      24              :                     dp_size, &
      25              :                     int_8
      26              :    USE message_passing, ONLY: mp_para_env_type, &
      27              :                               mp_request_type, &
      28              :                               mp_waitall
      29              :    USE negf_matrix_utils, ONLY: get_index_by_cell
      30              : #if defined(__SMEAGOL)
      31              :    USE parallel, ONLY: GetNodeOrbs, &
      32              :                        GlobalToLocalOrb, &
      33              :                        LocalToGlobalOrb, &
      34              :                        WhichNodeOrb
      35              : #endif
      36              :    USE particle_types, ONLY: particle_type
      37              :    USE qs_neighbor_list_types, ONLY: get_iterator_info, &
      38              :                                      get_neighbor_list_set_p, &
      39              :                                      neighbor_list_iterate, &
      40              :                                      neighbor_list_iterator_create, &
      41              :                                      neighbor_list_iterator_p_type, &
      42              :                                      neighbor_list_iterator_release, &
      43              :                                      neighbor_list_set_p_type
      44              :    USE qs_subsys_types, ONLY: qs_subsys_get, &
      45              :                               qs_subsys_type
      46              :    USE util, ONLY: sort
      47              : #include "./base/base_uses.f90"
      48              : 
      49              :    IMPLICIT NONE
      50              :    PRIVATE
      51              : 
      52              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'smeagol_matrix_utils'
      53              :    LOGICAL, PARAMETER, PRIVATE          :: debug_this_module = .FALSE.
      54              : 
      55              :    INTEGER, PARAMETER, PRIVATE          :: neighbor_list_iatom_index = 1
      56              :    INTEGER, PARAMETER, PRIVATE          :: neighbor_list_jatom_index = 2
      57              :    INTEGER, PARAMETER, PRIVATE          :: neighbor_list_dbcsr_image_index = 3
      58              :    INTEGER, PARAMETER, PRIVATE          :: neighbor_list_siesta_image_index = 4
      59              :    INTEGER, PARAMETER, PRIVATE          :: neighbor_list_siesta_transp_image_index = 5
      60              :    INTEGER, PARAMETER, PRIVATE          :: neighbor_list_dim1 = neighbor_list_siesta_transp_image_index
      61              : 
      62              :    PUBLIC :: siesta_distrib_csc_struct_type
      63              :    PUBLIC :: siesta_struct_create, siesta_struct_release
      64              :    PUBLIC :: convert_dbcsr_to_distributed_siesta, convert_distributed_siesta_to_dbcsr
      65              : 
      66              :    PRIVATE :: get_negf_cell_ijk, index_in_canonical_enumeration, number_from_canonical_enumeration, pbc_0_1
      67              :    PRIVATE :: get_number_of_mpi_sendrecv_requests, assign_nonzero_elements_to_requests
      68              : 
      69              :    !> number of DBCSR matrix elements to receive from a given rank
      70              :    INTEGER, PARAMETER, PRIVATE          :: nelements_dbcsr_recv = 1
      71              :    !> number of DBCSR matrix elements to send to a given rank
      72              :    INTEGER, PARAMETER, PRIVATE          :: nelements_dbcsr_send = 2
      73              :    INTEGER, PARAMETER, PRIVATE          :: nelements_dbcsr_dim2 = nelements_dbcsr_send
      74              : 
      75              :    INTEGER(kind=int_8), PARAMETER, PRIVATE :: max_mpi_packet_size_bytes = 134217728 ! 128 MiB (to limit memory usage for matrix redistribution)
      76              :    INTEGER(kind=int_8), PARAMETER, PRIVATE :: max_mpi_packet_size_dp = max_mpi_packet_size_bytes/INT(dp_size, kind=int_8)
      77              : 
      78              :    ! a portable way to determine the upper bound for tag value is to call
      79              :    ! MPI_COMM_GET_ATTR(comm, MPI_TAG_UB, max_mpi_rank, flag, ierror).
      80              :    ! The MPI specification guarantees a value of 32767
      81              :    INTEGER, PARAMETER, PRIVATE          :: max_mpi_rank = 32767
      82              : 
      83              : ! **************************************************************************************************
      84              : !> \brief Sparsity pattern of replicated SIESTA compressed sparse column (CSC) matrices
      85              : ! **************************************************************************************************
      86              :    TYPE siesta_distrib_csc_struct_type
      87              :       !> gather all non-zero matrix elements on the given MPI process.
      88              :       !> Distribute the elements across MPI processes if gather_root < 0.
      89              :       !> The matrix elements should be located on I/O process in case of bulk transport,
      90              :       !> and should be distributed in case of SMEAGOL calculation.
      91              :       INTEGER                                            :: gather_root = 0
      92              :       !> Based of full (.FALSE.) or upper-triangular (.TRUE.) DBCSR matrix.
      93              :       !> It is used in CPASSERTs to allow access to lower-triangular matrix elements of non-symmetric DBCSR matrices
      94              :       !> In case there is no bugs in this module, these CPASSERTs should newer trigger.
      95              :       !> Therefore the 'symmetric' variable alongside with relevant CPASSERT calls are excessive and can be removed.
      96              :       LOGICAL                                            :: symmetric = .TRUE.
      97              :       !> number of neighbour list nodes for each MPI process (0:num_pe-1).
      98              :       !> If do_merge == .TRUE., nodes for different cell images along k cell vector are merged into one node
      99              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: nnodes_per_proc
     100              : 
     101              :       !> replicated neighbour list (1:neighbor_list_dim1, 1:SUM(nnodes_per_proc)).
     102              :       !> Neighbour list nodes are ordered according to their MPI ranks.
     103              :       !> Thus, the first nnodes_per_proc(0) nodes are stored on MPI rank 0,
     104              :       !> the next nnodes_per_proc(1) nodes reside on MPI rank 1, etc
     105              :       !> Nodes for cell images along transport direction are merged into one node.
     106              :       !> The number of non-zero DBCSR matrix blocks and their DBCSR cell image indices
     107              :       !> are stored into 'n_dbcsr_cell_images_to_merge' and 'dbcsr_cell_image_to_merge' arrays
     108              :       INTEGER, ALLOCATABLE, DIMENSION(:, :)               :: nl_repl
     109              : 
     110              :       !> number of DBCSR images for each local merged neighbour list node (1:nnodes_per_proc(para_env%mepos))
     111              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: n_dbcsr_cell_images_to_merge
     112              :       !> list of DBCSR image indices to merge; (1:SUM(n_dbcsr_cell_images_to_merge))
     113              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: dbcsr_cell_image_to_merge
     114              : 
     115              :       !> number of DBCSR non-zero matrix elements that should be received/sent from each MPI rank
     116              :       !> (0:num_pe-1, 1:nelements_dbcsr_dim2)
     117              :       INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:, :)   :: nelements_per_proc
     118              : 
     119              :       !> number of non-zero matrix elements local to this MPI rank.
     120              :       INTEGER(kind=int_8)                                :: n_nonzero_elements = 0_int_8
     121              :       INTEGER                                            :: nrows = 0, ncols = 0
     122              :       !> Number of non-zero matrix elements (columns) on each row
     123              :       !> n_nonzero_cols(1:nrows); same as 'numh' in SMEAGOL code.
     124              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: n_nonzero_cols
     125              :       !> offset of the first non-zero matrix elements on each row.
     126              :       !> column_offset(1:nrows); same as 'listhptr' in SMEAGOL code.
     127              :       !> It should be declared as INTEGER(kind=int_8), but SMEAGOL expects it to be INTEGER
     128              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: row_offset
     129              :       !> column index of each non-zero matrix element.
     130              :       !> col_index(1:n_nonzero_elements); same as 'listh' in SMEAGOL code
     131              :       !> col index of the first non-zero matrix elements of irow row is col_index(row_offset(irow)+1)
     132              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: col_index
     133              :       !> index of the non-zero matrix element in a communication buffer for each rank
     134              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: packed_index
     135              :       !> R_atom_row - R_atom_col
     136              :       REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :)         :: xij
     137              :       !> equivalent atomic orbitals
     138              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: indxuo
     139              :       !> atomic index on which the orbital is centred
     140              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: iaorb
     141              :       !> coordinates of all atoms in the supercell
     142              :       REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :)         :: xa
     143              :    END TYPE siesta_distrib_csc_struct_type
     144              : 
     145              : CONTAINS
     146              : 
     147              : ! **************************************************************************************************
     148              : !> \brief Map non-zero matrix blocks between sparse matrices in DBCSR and SIESTA formats.
     149              : !> \param siesta_struct      structure that stores metadata (sparsity pattern) of sparse SIESTA matrices
     150              : !> \param matrix_dbcsr_kp    DBCSR matrices for each cell image
     151              : !> \param subsys             QuickStep molecular system
     152              : !> \param cell_to_index      array to convert 3-D cell indices to 1-D DBCSR image indices
     153              : !> \param sab_nl             pair-wise neighbour list
     154              : !> \param para_env           MPI parallel environment
     155              : !> \param max_ij_cell_image  largest index of cell images along i and j cell vectors (e.g. (2,0) in
     156              : !>                           case of 5 cell images (0,0), (1,0), (-1,0), (2,0), and (-2,0))
     157              : !> \param do_merge           merge DBCSR images along transport direction (k cell vector)
     158              : !> \param gather_root        distribute non-zero matrix elements of SIESTA matrices across all
     159              : !>                           parallel processes (-1), or gather them on the given MPI rank (>= 0).
     160              : ! **************************************************************************************************
     161            4 :    SUBROUTINE siesta_struct_create(siesta_struct, matrix_dbcsr_kp, subsys, cell_to_index, &
     162              :                                    sab_nl, para_env, max_ij_cell_image, do_merge, gather_root)
     163              :       TYPE(siesta_distrib_csc_struct_type), &
     164              :          INTENT(inout)                                   :: siesta_struct
     165              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in)       :: matrix_dbcsr_kp
     166              :       TYPE(qs_subsys_type), POINTER                      :: subsys
     167              :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
     168              :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
     169              :          POINTER                                         :: sab_nl
     170              :       TYPE(mp_para_env_type), POINTER                    :: para_env
     171              :       INTEGER, DIMENSION(2), INTENT(inout)               :: max_ij_cell_image
     172              :       LOGICAL, INTENT(in)                                :: do_merge
     173              :       INTEGER, INTENT(in)                                :: gather_root
     174              : 
     175              :       CHARACTER(len=*), PARAMETER :: routineN = 'siesta_struct_create'
     176              : 
     177              :       CHARACTER(len=20)                                  :: str_nelem, str_nelem_max
     178              :       INTEGER :: handle, iatom, icol, icol_blk, icol_local, image, image_j, image_k, irow, &
     179              :          irow_local, natoms, ncells_siesta_total, ncols_blk, ncols_total, nrows_local, &
     180              :          nrows_total, offset
     181              :       INTEGER(kind=int_8)                                :: n_nonzero_elements_local
     182              :       INTEGER, DIMENSION(3)                              :: max_ijk_cell_image, ncells_siesta
     183            4 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_offset, col_blk_size
     184              :       LOGICAL                                            :: do_distribute, is_root_rank
     185              :       REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :)        :: particle_coords
     186              :       REAL(kind=dp), DIMENSION(3)                        :: real_cell_shift, scaled_cell_shift
     187              :       TYPE(cell_type), POINTER                           :: cell
     188            4 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     189              : 
     190            4 :       CALL timeset(routineN, handle)
     191            4 :       do_distribute = gather_root < 0
     192            4 :       is_root_rank = gather_root == para_env%mepos
     193              : 
     194              :       ! here row_blk_offset / col_blk_offset are global indices of the first row / column of a given non-zero block.
     195              :       ! They are not offsets (index-1) but the actual indices.
     196              :       CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
     197              :                           nfullrows_total=nrows_total, nfullcols_total=ncols_total, &
     198            4 :                           nblkcols_total=ncols_blk, col_blk_size=col_blk_size, col_blk_offset=col_blk_offset)
     199              :       IF (debug_this_module) THEN
     200              :          CPASSERT(nrows_total == ncols_total)
     201              :          CPASSERT(gather_root < para_env%num_pe)
     202              :       END IF
     203              : 
     204            4 :       siesta_struct%gather_root = gather_root
     205            4 :       CALL get_neighbor_list_set_p(neighbor_list_sets=sab_nl, symmetric=siesta_struct%symmetric)
     206              : 
     207              :       ! apply periodic boundary conditions to atomic coordinates
     208            4 :       CALL qs_subsys_get(subsys, cell=cell, particle_set=particle_set, nparticle=natoms)
     209           12 :       ALLOCATE (particle_coords(3, natoms))
     210          140 :       DO iatom = 1, natoms
     211          140 :          CALL pbc_0_1(particle_coords(1:3, iatom), particle_set(iatom)%r(1:3), cell)
     212              :       END DO
     213              : 
     214              :       ! Note: in case we would like to limit the number of cell images along transport direction (k cell vector)
     215              :       !       by enabling 'BulkTransvCellSizeZ' keyword, we need to pass max_ijk_cell_image(1:3) vector
     216              :       !       to the subroutine instead of the reduced vector max_ij_cell_image(1:2)
     217           12 :       max_ijk_cell_image(1:2) = max_ij_cell_image(1:2)
     218              : 
     219              :       ! determine the actual number of cell images along k cell vector. Unless the third element is also passed
     220              :       ! via subroutine arguments, an extra MPI_Allreduce operation is needed each time we call
     221              :       ! replicate_neighbour_list() / get_nnodes_local().
     222            4 :       max_ijk_cell_image(3) = -1
     223            4 :       IF (.NOT. do_merge) max_ijk_cell_image(3) = 1 ! bulk-transport calculation expects exactly 3 cell images along transport direction
     224              : 
     225              :       ! replicate pair-wise neighbour list. Identical non-zero matrix blocks from cell image along transport direction
     226              :       ! are grouped together if do_merge == .TRUE.
     227           12 :       ALLOCATE (siesta_struct%nnodes_per_proc(0:para_env%num_pe - 1))
     228              :       CALL replicate_neighbour_list(siesta_struct%nl_repl, &
     229              :                                     siesta_struct%n_dbcsr_cell_images_to_merge, &
     230              :                                     siesta_struct%dbcsr_cell_image_to_merge, &
     231              :                                     siesta_struct%nnodes_per_proc, &
     232            4 :                                     max_ijk_cell_image, sab_nl, para_env, particle_coords, cell, cell_to_index, do_merge)
     233           12 :       max_ij_cell_image(1:2) = max_ijk_cell_image(1:2)
     234              : 
     235              :       ! count number of non-zero matrix elements that need to be send to and received from other parallel processes
     236           16 :       ALLOCATE (siesta_struct%nelements_per_proc(0:para_env%num_pe - 1, nelements_dbcsr_dim2))
     237              :       CALL count_remote_dbcsr_elements(siesta_struct%nelements_per_proc, siesta_struct%nnodes_per_proc, &
     238            4 :                                        siesta_struct%nl_repl, matrix_dbcsr_kp, siesta_struct%symmetric, para_env, gather_root)
     239              : 
     240              :       ! number of SIESTA non-zero matrix elements that are going to be stored on this parallel process
     241           12 :       n_nonzero_elements_local = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv))
     242            4 :       siesta_struct%n_nonzero_elements = n_nonzero_elements_local
     243              : 
     244              :       ! as SMEAGOL uses 32-bits integers, the number of non-zero matrix elements is limited by 2^31 per MPI rank.
     245              :       ! Abort CP2K if we are about to exceed this limit.
     246            4 :       IF (n_nonzero_elements_local > INT(HUGE(0), kind=int_8)) THEN
     247            0 :          WRITE (str_nelem, '(I0)') n_nonzero_elements_local
     248            0 :          WRITE (str_nelem_max, '(I0)') HUGE(0)
     249              :          CALL cp_abort(__LOCATION__, &
     250              :                        "The number of non-zero matrix elements per MPI process "//TRIM(str_nelem)// &
     251              :                        " cannot exceed "//TRIM(str_nelem_max)// &
     252            0 :                        ". Please increase the number of MPI processes to satisfy this SMEAGOL limitation.")
     253              :       END IF
     254              : 
     255              :       ! in case there is no nonzero matrix element stored on this process, allocate arrays with one element to avoid SEGFAULT
     256            4 :       IF (n_nonzero_elements_local == 0) n_nonzero_elements_local = 1
     257              : 
     258              :       ! number of SIESTA-matrix rows local to the given parallel process
     259            4 :       IF (do_distribute) THEN
     260              : #if defined(__SMEAGOL)
     261            0 :          CALL GetNodeOrbs(nrows_total, para_env%mepos, para_env%num_pe, nrows_local)
     262              : #else
     263              :          CALL cp_abort(__LOCATION__, &
     264              :                        "CP2K was compiled with no SMEAGOL support.")
     265              : #endif
     266              :       ELSE
     267            4 :          IF (is_root_rank) THEN
     268            2 :             nrows_local = nrows_total
     269              :          ELSE
     270            2 :             nrows_local = 0
     271              :          END IF
     272              :       END IF
     273              : 
     274              :       ! number of cell images along each cell vector. It is 2*m+1, as SIESTA images are ordered as 0, 1, -1, ..., m, -m
     275           16 :       ncells_siesta(1:3) = 2*max_ijk_cell_image(1:3) + 1
     276              :       ! in case of merged cell images along the transport direction, there will be just 1 'merged' cell image along it
     277            4 :       IF (do_merge) ncells_siesta(3) = 1
     278              : 
     279            4 :       ncells_siesta_total = ncells_siesta(1)*ncells_siesta(2)*ncells_siesta(3)
     280              : 
     281              :       ! number of rows local to the given parallel process. Rows are distributed in a block-cyclic manner
     282            4 :       siesta_struct%nrows = nrows_local
     283              : 
     284              :       ! number of columns of the matrix in its dense form. SIESTA uses 1-D (rows) block-cyclic distribution.
     285              :       ! All non-zero matrix elements on a given row are stored on the same parallel process
     286            4 :       siesta_struct%ncols = nrows_total*ncells_siesta_total
     287              : 
     288              :       ! allocate at least one array element to avoid SIGFAULT when passing unallocated arrays to subroutines
     289            4 :       IF (nrows_local == 0) nrows_local = 1
     290              : 
     291           12 :       ALLOCATE (siesta_struct%n_nonzero_cols(nrows_local))
     292            8 :       ALLOCATE (siesta_struct%row_offset(nrows_local))
     293           12 :       ALLOCATE (siesta_struct%col_index(n_nonzero_elements_local))
     294            8 :       ALLOCATE (siesta_struct%packed_index(n_nonzero_elements_local))
     295              : 
     296              :       ! restore the actual number of local rows
     297            4 :       nrows_local = siesta_struct%nrows
     298              : 
     299              :       ! get number of non-zero matrix element on each local row (n_nonzero_cols),
     300              :       ! offset of the first non-zero matrix element for each local row (row_offset),
     301              :       ! global column indices of all local non-zero matrix elements (col_index), and
     302              :       ! the indices of all local non-zero matrix elements in the communication buffer (packed_index)
     303              :       CALL get_nonzero_element_indices(siesta_struct%n_nonzero_cols, siesta_struct%row_offset, &
     304              :                                        siesta_struct%col_index, siesta_struct%packed_index, &
     305              :                                        siesta_struct%nl_repl, matrix_dbcsr_kp, &
     306            4 :                                        siesta_struct%symmetric, para_env, gather_root)
     307              : 
     308              :       ! indices of equivalent atomic orbitals
     309           12 :       ALLOCATE (siesta_struct%indxuo(siesta_struct%ncols))
     310         1228 :       DO icol = 1, ncols_total
     311         1228 :          siesta_struct%indxuo(icol) = icol
     312              :       END DO
     313          300 :       DO image = 2, ncells_siesta_total
     314       181452 :          siesta_struct%indxuo((image - 1)*ncols_total + 1:image*ncols_total) = siesta_struct%indxuo(1:ncols_total)
     315              :       END DO
     316              : 
     317              :       ! particle index on which the orbital is centred
     318            8 :       ALLOCATE (siesta_struct%iaorb(siesta_struct%ncols))
     319          140 :       DO icol_blk = 1, ncols_blk
     320              :          ! col_blk_offset() is not an offset but the index of the first atomic orbital in the column block
     321         1364 :          siesta_struct%iaorb(col_blk_offset(icol_blk):col_blk_offset(icol_blk) + col_blk_size(icol_blk) - 1) = icol_blk
     322              :       END DO
     323          300 :       DO image = 2, ncells_siesta_total
     324       181452 :         siesta_struct%iaorb((image - 1)*ncols_total + 1:image*ncols_total) = siesta_struct%iaorb(1:ncols_total) + (image - 1)*natoms
     325              :       END DO
     326              : 
     327              :       ! coordinates of all particles in each cell images
     328           12 :       ALLOCATE (siesta_struct%xa(3, natoms*ncells_siesta_total))
     329           16 :       DO image_k = 1, ncells_siesta(3)
     330              :          !icell_siesta(3) = image_k
     331           12 :          scaled_cell_shift(3) = REAL(number_from_canonical_enumeration(image_k), kind=dp) ! SIESTA -> actual cell index
     332           76 :          DO image_j = 1, ncells_siesta(2)
     333           60 :             scaled_cell_shift(2) = REAL(number_from_canonical_enumeration(image_j), kind=dp)
     334          372 :             DO image = 1, ncells_siesta(1)
     335          300 :                scaled_cell_shift(1) = REAL(number_from_canonical_enumeration(image), kind=dp)
     336          300 :                CALL scaled_to_real(real_cell_shift, scaled_cell_shift, cell)
     337          300 :                offset = (((image_k - 1)*ncells_siesta(2) + image_j - 1)*ncells_siesta(1) + image - 1)*natoms
     338        10560 :                DO iatom = 1, natoms
     339        41100 :                   siesta_struct%xa(1:3, offset + iatom) = particle_set(iatom)%r(1:3) + real_cell_shift(1:3)
     340              :                END DO
     341              :             END DO
     342              :          END DO
     343              :       END DO
     344              : 
     345              :       ! inter-atomic distance
     346           12 :       ALLOCATE (siesta_struct%xij(3, n_nonzero_elements_local))
     347          616 :       DO irow_local = 1, nrows_local
     348          612 :          IF (do_distribute) THEN
     349              : #if defined(__SMEAGOL)
     350            0 :             CALL LocalToGlobalOrb(irow_local, para_env%mepos, para_env%num_pe, irow)
     351              : #else
     352              :             CALL cp_abort(__LOCATION__, &
     353              :                           "CP2K was compiled with no SMEAGOL support.")
     354              : #endif
     355              :          ELSE
     356          612 :             irow = irow_local
     357              :             IF (debug_this_module) THEN
     358              :                CPASSERT(is_root_rank)
     359              :             END IF
     360              :          END IF
     361          612 :          offset = siesta_struct%row_offset(irow_local)
     362      2319484 :          DO icol_local = offset + 1, offset + siesta_struct%n_nonzero_cols(irow_local)
     363      2318868 :             icol = siesta_struct%col_index(icol_local)
     364              :             siesta_struct%xij(1:3, icol_local) = siesta_struct%xa(1:3, siesta_struct%iaorb(icol)) - &
     365      9276084 :                                                  siesta_struct%xa(1:3, siesta_struct%iaorb(irow))
     366              :          END DO
     367              :       END DO
     368              : 
     369            4 :       DEALLOCATE (particle_coords)
     370              : 
     371            4 :       CALL timestop(handle)
     372            8 :    END SUBROUTINE siesta_struct_create
     373              : 
     374              : ! **************************************************************************************************
     375              : !> \brief Release a SIESTA matrix structure
     376              : !> \param siesta_struct      structure to release
     377              : ! **************************************************************************************************
     378            4 :    SUBROUTINE siesta_struct_release(siesta_struct)
     379              :       TYPE(siesta_distrib_csc_struct_type), &
     380              :          INTENT(inout)                                   :: siesta_struct
     381              : 
     382              :       CHARACTER(len=*), PARAMETER :: routineN = 'siesta_struct_release'
     383              : 
     384              :       INTEGER                                            :: handle
     385              : 
     386            4 :       CALL timeset(routineN, handle)
     387              : 
     388            4 :       siesta_struct%gather_root = -1
     389              : 
     390            4 :       IF (ALLOCATED(siesta_struct%nnodes_per_proc)) DEALLOCATE (siesta_struct%nnodes_per_proc)
     391            4 :       IF (ALLOCATED(siesta_struct%nl_repl)) DEALLOCATE (siesta_struct%nl_repl)
     392            4 :       IF (ALLOCATED(siesta_struct%n_dbcsr_cell_images_to_merge)) DEALLOCATE (siesta_struct%n_dbcsr_cell_images_to_merge)
     393            4 :       IF (ALLOCATED(siesta_struct%dbcsr_cell_image_to_merge)) DEALLOCATE (siesta_struct%dbcsr_cell_image_to_merge)
     394            4 :       IF (ALLOCATED(siesta_struct%nelements_per_proc)) DEALLOCATE (siesta_struct%nelements_per_proc)
     395              : 
     396            4 :       siesta_struct%n_nonzero_elements = 0
     397            4 :       siesta_struct%nrows = 0
     398            4 :       siesta_struct%ncols = 0
     399              : 
     400            4 :       IF (ALLOCATED(siesta_struct%n_nonzero_cols)) DEALLOCATE (siesta_struct%n_nonzero_cols)
     401            4 :       IF (ALLOCATED(siesta_struct%row_offset)) DEALLOCATE (siesta_struct%row_offset)
     402            4 :       IF (ALLOCATED(siesta_struct%col_index)) DEALLOCATE (siesta_struct%col_index)
     403            4 :       IF (ALLOCATED(siesta_struct%packed_index)) DEALLOCATE (siesta_struct%packed_index)
     404              : 
     405            4 :       IF (ALLOCATED(siesta_struct%xij)) DEALLOCATE (siesta_struct%xij)
     406            4 :       IF (ALLOCATED(siesta_struct%indxuo)) DEALLOCATE (siesta_struct%indxuo)
     407            4 :       IF (ALLOCATED(siesta_struct%iaorb)) DEALLOCATE (siesta_struct%iaorb)
     408            4 :       IF (ALLOCATED(siesta_struct%xa)) DEALLOCATE (siesta_struct%xa)
     409              : 
     410            4 :       CALL timestop(handle)
     411            4 :    END SUBROUTINE siesta_struct_release
     412              : 
     413              : ! **************************************************************************************************
     414              : !> \brief Convert matrix from DBCSR to sparse SIESTA format.
     415              : !> \param matrix_siesta      matrix in SIESTA format [out]
     416              : !> \param matrix_dbcsr_kp    DBCSR matrix [in]
     417              : !> \param siesta_struct      structure to map matrix blocks between formats
     418              : !> \param para_env           MPI parallel environment
     419              : ! **************************************************************************************************
     420           16 :    SUBROUTINE convert_dbcsr_to_distributed_siesta(matrix_siesta, matrix_dbcsr_kp, siesta_struct, para_env)
     421              :       REAL(kind=dp), DIMENSION(:), INTENT(out)           :: matrix_siesta
     422              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in)       :: matrix_dbcsr_kp
     423              :       TYPE(siesta_distrib_csc_struct_type), INTENT(in)   :: siesta_struct
     424              :       TYPE(mp_para_env_type), INTENT(in), POINTER        :: para_env
     425              : 
     426              :       CHARACTER(len=*), PARAMETER :: routineN = 'convert_dbcsr_to_distributed_siesta'
     427              : 
     428              :       INTEGER :: first_col_minus_one, first_row_minus_one, handle, icol_blk, icol_local, &
     429              :          image_dbcsr, image_ind, image_ind_offset, image_siesta, image_siesta_transp, inode, &
     430              :          inode_proc, iproc, irequest, irow_blk, irow_local, irow_proc, mepos, n_image_ind, &
     431              :          ncols_blk, ncols_local, nnodes_proc, node_offset, nprocs, nrequests_recv, &
     432              :          nrequests_total, nrows_blk, nrows_local
     433              :       INTEGER(kind=int_8)                                :: n_nonzero_elements_dbcsr, &
     434              :                                                             n_nonzero_elements_siesta, &
     435              :                                                             offset_recv_mepos, offset_send_mepos
     436           16 :       INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:)     :: n_packed_elements_per_proc, &
     437           16 :                                                             nelements_per_request, &
     438           16 :                                                             offset_per_proc, offset_per_request
     439           16 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: next_nonzero_element_offset, peer_rank, &
     440           16 :                                                             request_tag
     441           16 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_offset, col_blk_size, &
     442           16 :                                                             row_blk_offset, row_blk_size
     443              :       LOGICAL                                            :: do_distribute, found, is_root_rank, &
     444              :                                                             symmetric
     445           16 :       REAL(kind=dp), ALLOCATABLE, DIMENSION(:)           :: recv_buffer, reorder_recv_buffer, &
     446           16 :                                                             send_buffer
     447           16 :       REAL(kind=dp), DIMENSION(:, :), POINTER            :: sm_block, sm_block_merged
     448           16 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: requests
     449              : 
     450           16 :       CALL timeset(routineN, handle)
     451      9275496 :       matrix_siesta(:) = 0.0_dp
     452              : 
     453           16 :       mepos = para_env%mepos
     454           16 :       nprocs = para_env%num_pe
     455           16 :       do_distribute = siesta_struct%gather_root < 0
     456           16 :       is_root_rank = siesta_struct%gather_root == mepos
     457              : 
     458              :       CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
     459              :                           nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
     460              :                           row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
     461           16 :                           row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
     462           16 :       symmetric = siesta_struct%symmetric
     463              : 
     464              :       ! number of locally stored SIESTA non-zero matrix elements
     465           48 :       n_nonzero_elements_siesta = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv))
     466              :       ! number of locally stored DBCSR non-zero matrix elements
     467           48 :       n_nonzero_elements_dbcsr = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_send))
     468              : 
     469              :       ! number of concurrent MPI isend / irecv operations
     470              :       nrequests_recv = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
     471           16 :                                                            max_mpi_packet_size_dp)
     472              :       nrequests_total = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
     473           16 :                                                             max_mpi_packet_size_dp) + nrequests_recv
     474              : 
     475           16 :       IF (nrequests_total > 0) THEN
     476              :          ! allocate MPI-related arrays. request_tag is not actually needed, as MPI standard guarantees the order of
     477              :          ! peer-to-peer messages with the same tag between same processes
     478           64 :          ALLOCATE (requests(nrequests_total))
     479           48 :          ALLOCATE (peer_rank(nrequests_total), request_tag(nrequests_total))
     480           64 :          ALLOCATE (offset_per_request(nrequests_total), nelements_per_request(nrequests_total))
     481              :          !requests(:) = mp_request_null
     482              : 
     483              :          ! split large messages into a number of smaller messages. It is not really needed
     484              :          ! unless we are going to send > 2^31 matrix elements per MPI request
     485           16 :          IF (nrequests_recv > 0) THEN
     486              :             CALL assign_nonzero_elements_to_requests(offset_per_request(1:nrequests_recv), &
     487              :                                                      nelements_per_request(1:nrequests_recv), &
     488              :                                                      peer_rank(1:nrequests_recv), &
     489              :                                                      request_tag(1:nrequests_recv), &
     490              :                                                      mepos, &
     491              :                                                      siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
     492            8 :                                                      max_mpi_packet_size_dp)
     493              :          END IF
     494           16 :          IF (nrequests_total > nrequests_recv) THEN
     495              :             CALL assign_nonzero_elements_to_requests(offset_per_request(nrequests_recv + 1:nrequests_total), &
     496              :                                                      nelements_per_request(nrequests_recv + 1:nrequests_total), &
     497              :                                                      peer_rank(nrequests_recv + 1:nrequests_total), &
     498              :                                                      request_tag(nrequests_recv + 1:nrequests_total), &
     499              :                                                      mepos, &
     500              :                                                      siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
     501            8 :                                                      max_mpi_packet_size_dp)
     502              :          END IF
     503              :       END IF
     504              : 
     505              :       ! point-to-point recv/send can be replaced with alltoallv, if data to distribute per rank is < 2^31 elements (should be OK due to SMEAGOL limitation)
     506              :       ! in principle, it is possible to overcome this limit by using derived datatypes (which will require additional wrapper functions, indeed).
     507              :       !
     508              :       ! pre-post non-blocking receive operations
     509           16 :       IF (n_nonzero_elements_siesta > 0) THEN
     510           24 :          ALLOCATE (recv_buffer(n_nonzero_elements_siesta))
     511              :       END IF
     512           24 :       DO irequest = 1, nrequests_recv
     513              :          CALL para_env%irecv(recv_buffer(offset_per_request(irequest) + 1: &
     514              :                                          offset_per_request(irequest) + nelements_per_request(irequest)), &
     515           24 :                              peer_rank(irequest), requests(irequest), request_tag(irequest))
     516              :       END DO
     517              : 
     518              :       ! pack local DBCSR non-zero matrix elements ordering by their target parallel process
     519           64 :       ALLOCATE (offset_per_proc(0:nprocs - 1), n_packed_elements_per_proc(0:nprocs - 1))
     520           16 :       offset_per_proc(0) = 0
     521           32 :       DO iproc = 1, nprocs - 1
     522           32 :          offset_per_proc(iproc) = offset_per_proc(iproc - 1) + siesta_struct%nelements_per_proc(iproc - 1, nelements_dbcsr_send)
     523              :       END DO
     524           48 :       n_packed_elements_per_proc(:) = 0
     525              : 
     526              :       ! number of local neighbour-list nodes and offset of the first local neighbour-list node
     527           16 :       nnodes_proc = siesta_struct%nnodes_per_proc(mepos)
     528              :       !node_offset = SUM(siesta_struct%nnodes_per_proc(0:mepos)) - siesta_struct%nnodes_per_proc(mepos)
     529           40 :       node_offset = SUM(siesta_struct%nnodes_per_proc(0:mepos)) - nnodes_proc
     530              : 
     531              :       ! if do_distribute == .FALSE., send all matrix elements to MPI process with rank gather_root
     532              :       ! in case of do_distribute == .TRUE., iproc is determined by calling WhichNodeOrb()
     533           16 :       iproc = siesta_struct%gather_root
     534              : 
     535           16 :       IF (n_nonzero_elements_dbcsr > 0) THEN
     536           48 :          ALLOCATE (send_buffer(n_nonzero_elements_dbcsr))
     537      9275488 :          send_buffer(:) = 0.0_dp
     538              : 
     539              :          ! iterate over locally-stored DBCSR matrix blocks.
     540              :          ! inode_proc is the target parallel process (where data are going to be sent)
     541              :          image_ind_offset = 0
     542        59568 :          DO inode_proc = 1, nnodes_proc
     543        59552 :             n_image_ind = siesta_struct%n_dbcsr_cell_images_to_merge(inode_proc)
     544        59552 :             IF (n_image_ind > 0) THEN
     545        59552 :                inode = node_offset + inode_proc
     546              : 
     547        59552 :                irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
     548        59552 :                icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
     549        59552 :                CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
     550        59552 :                image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
     551        59552 :                image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
     552              : 
     553        59552 :                nrows_local = row_blk_size(irow_blk)
     554        59552 :                ncols_local = col_blk_size(icol_blk)
     555        59552 :                first_row_minus_one = row_blk_offset(irow_blk) - 1
     556        59552 :                first_col_minus_one = col_blk_offset(icol_blk) - 1
     557              : 
     558              :                ! merging cell images along transport direction
     559        59552 :                IF (n_image_ind == 1) THEN
     560              :                   ! the most common case. Nothing to merge, so there is no need to allocate memory for a merged block
     561        59552 :                   image_dbcsr = siesta_struct%dbcsr_cell_image_to_merge(image_ind_offset + 1)
     562              :                   CALL dbcsr_get_block_p(matrix=matrix_dbcsr_kp(image_dbcsr)%matrix, &
     563        59552 :                                          row=irow_blk, col=icol_blk, block=sm_block_merged, found=found)
     564        59552 :                   CPASSERT(found)
     565              :                ELSE ! n_image_ind > 1
     566            0 :                   ALLOCATE (sm_block_merged(nrows_local, ncols_local))
     567              : 
     568            0 :                   DO image_ind = 1, n_image_ind
     569            0 :                      image_dbcsr = siesta_struct%dbcsr_cell_image_to_merge(image_ind + image_ind_offset)
     570              : 
     571              :                      CALL dbcsr_get_block_p(matrix=matrix_dbcsr_kp(image_dbcsr)%matrix, &
     572            0 :                                             row=irow_blk, col=icol_blk, block=sm_block, found=found)
     573            0 :                      CPASSERT(found)
     574              :                      sm_block_merged(1:nrows_local, 1:ncols_local) = sm_block_merged(1:nrows_local, 1:ncols_local) + &
     575            0 :                                                                      sm_block(1:nrows_local, 1:ncols_local)
     576              :                   END DO
     577              :                END IF
     578              : 
     579              :                ! pack matrix elements for the 'normal' SIESTA matrix block
     580        59552 :                IF (image_siesta > 0) THEN
     581       595520 :                   DO irow_local = 1, nrows_local
     582       535968 :                      IF (do_distribute) THEN
     583              : #if defined(__SMEAGOL)
     584            0 :                         CALL WhichNodeOrb(irow_local + first_row_minus_one, nprocs, iproc)
     585              : #else
     586              :                         CALL cp_abort(__LOCATION__, &
     587              :                                       "CP2K was compiled with no SMEAGOL support.")
     588              : #endif
     589              :                      END IF
     590              : 
     591              :                      ! CPASSERT
     592              :                      IF (debug_this_module) THEN
     593              :                         CPASSERT(iproc >= 0 .AND. iproc < nprocs)
     594              :                         IF (n_packed_elements_per_proc(iproc) + ncols_local > &
     595              :                             siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
     596              :                            CALL cp__a(__SHORT_FILE__, __LINE__)
     597              :                         END IF
     598              :                      END IF
     599              : 
     600              :                      offset_send_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
     601       535968 :                      send_buffer(offset_send_mepos + 1:offset_send_mepos + ncols_local) = sm_block_merged(irow_local, 1:ncols_local)
     602      5359680 : 
     603              :                      n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + ncols_local
     604       595520 :                   END DO
     605              :                END IF
     606              : 
     607              :                ! pack matrix elements of the transposed SIESTA matrix block
     608              :                IF (image_siesta_transp > 0) THEN
     609        59552 :                   DO icol_local = 1, ncols_local
     610       549600 :                      IF (do_distribute) THEN
     611       494640 : #if defined(__SMEAGOL)
     612              :                         CALL WhichNodeOrb(icol_local + first_col_minus_one, nprocs, iproc) ! iproc_orb
     613            0 : #else
     614              :                         CALL cp_abort(__LOCATION__, &
     615              :                                       "CP2K was compiled with no SMEAGOL support.")
     616              : #endif
     617              :                      END IF
     618              : 
     619              :                      ! CPASSERT
     620              :                      IF (debug_this_module) THEN
     621              :                         CPASSERT(iproc >= 0 .AND. iproc < nprocs)
     622              :                         IF (n_packed_elements_per_proc(iproc) + nrows_local > &
     623              :                             siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
     624              :                            CALL cp__a(__SHORT_FILE__, __LINE__)
     625              :                         END IF
     626              :                      END IF
     627              : 
     628              :                      offset_send_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
     629              :                      send_buffer(offset_send_mepos + 1:offset_send_mepos + nrows_local) = sm_block_merged(1:nrows_local, icol_local)
     630       494640 : 
     631      4946400 :                      n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + nrows_local
     632              :                   END DO
     633       549600 :                END IF
     634              : 
     635              :                IF (n_image_ind > 1) THEN
     636              :                   DEALLOCATE (sm_block_merged)
     637        59552 :                END IF
     638            0 :             END IF
     639              : 
     640              :             image_ind_offset = image_ind_offset + siesta_struct%n_dbcsr_cell_images_to_merge(inode_proc)
     641              :          END DO
     642        59568 : 
     643              :          IF (debug_this_module) THEN
     644              :             DO iproc = 0, nprocs - 1
     645              :                IF (n_packed_elements_per_proc(iproc) /= siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
     646              :                   CALL cp__a(__SHORT_FILE__, __LINE__)
     647              :                END IF
     648              :             END DO
     649              :          END IF
     650              : 
     651              :          ! send packed data to other parallel processes
     652              :          DO irequest = nrequests_recv + 1, nrequests_total
     653              :             CALL para_env%isend(send_buffer(offset_per_request(irequest) + 1: &
     654           24 :                                             offset_per_request(irequest) + nelements_per_request(irequest)), &
     655              :                                 peer_rank(irequest), requests(irequest), request_tag(irequest))
     656              :          END DO
     657           24 : 
     658              :          ! copy data locally that stay on the same process.
     659              :          IF (mepos > 0) THEN
     660              :             offset_recv_mepos = SUM(siesta_struct%nelements_per_proc(0:mepos - 1, nelements_dbcsr_recv))
     661           16 :          ELSE
     662           16 :             offset_recv_mepos = 0
     663              :          END IF
     664              :          offset_send_mepos = offset_per_proc(mepos)
     665              : 
     666           16 :          IF (debug_this_module) THEN
     667              :             IF (n_packed_elements_per_proc(mepos) /= siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_recv)) THEN
     668              :                CALL cp__a(__SHORT_FILE__, __LINE__)
     669              :             END IF
     670              :          END IF
     671              : 
     672              :          IF (n_packed_elements_per_proc(mepos) > 0) THEN
     673              :             recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + n_packed_elements_per_proc(mepos)) = &
     674           16 :                send_buffer(offset_send_mepos + 1:offset_send_mepos + n_packed_elements_per_proc(mepos))
     675              :          END IF
     676      4591088 :       END IF
     677              : 
     678              :       IF (nrequests_total > 0) THEN
     679              :          ! wait for pending isend/irecv requests
     680           16 :          CALL mp_waitall(requests)
     681              :          DEALLOCATE (nelements_per_request, offset_per_request, peer_rank, requests, request_tag)
     682           16 :       END IF
     683           16 : 
     684              :       ! release send buffers
     685              :       IF (ALLOCATED(send_buffer)) DEALLOCATE (send_buffer)
     686              :       DEALLOCATE (offset_per_proc, n_packed_elements_per_proc)
     687           16 : 
     688           16 :       ! non-zero matrix elements in 'recv_buffer' array are grouped by their source MPI rank, local row index, and column index (in this order).
     689              :       ! Reorder the matrix elements ('reorder_recv_buffer') so they are grouped by their local row index, source MPI rank, and column index.
     690              :       ! (column indices are in the ascending order within each (row index, source MPI rank) block).
     691              :       ! The array 'packed_index' allows mapping matrix element between these intermediate order and SIESTA order : local row index, column index
     692              :       IF (n_nonzero_elements_siesta > 0) THEN
     693              :          ALLOCATE (reorder_recv_buffer(n_nonzero_elements_siesta))
     694           16 :          ALLOCATE (next_nonzero_element_offset(siesta_struct%nrows))
     695           24 :          next_nonzero_element_offset(:) = 0
     696           24 :          offset_recv_mepos = 0
     697         2456 : 
     698            8 :          DO inode = 1, SIZE(siesta_struct%nl_repl, 2)
     699              :             irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
     700        59560 :             icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
     701        59552 :             CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
     702        59552 :             image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
     703        59552 :             image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
     704        59552 : 
     705        59552 :             nrows_local = row_blk_size(irow_blk)
     706              :             ncols_local = col_blk_size(icol_blk)
     707        59552 :             first_row_minus_one = row_blk_offset(irow_blk) - 1
     708        59552 :             first_col_minus_one = col_blk_offset(icol_blk) - 1
     709        59552 : 
     710        59552 :             ! normal block
     711              :             IF (image_siesta > 0) THEN
     712              :                DO irow_local = 1, nrows_local
     713        59552 :                   IF (do_distribute) THEN
     714       595520 : #if defined(__SMEAGOL)
     715       535968 :                      CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
     716              : #else
     717            0 :                      CALL cp_abort(__LOCATION__, &
     718              :                                    "CP2K was compiled with no SMEAGOL support.")
     719              : #endif
     720              :                   ELSE
     721              :                      IF (is_root_rank) THEN
     722              :                         irow_proc = irow_local + first_row_minus_one
     723       535968 :                      ELSE
     724       535968 :                         irow_proc = 0
     725              :                      END IF
     726            0 :                   END IF
     727              :                   IF (irow_proc > 0) THEN
     728              :                      offset_send_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
     729       595520 :                      reorder_recv_buffer(offset_send_mepos + 1:offset_send_mepos + ncols_local) = &
     730       535968 :                         recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + ncols_local)
     731              :                      offset_recv_mepos = offset_recv_mepos + ncols_local
     732      5359680 :                      next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + ncols_local
     733       535968 :                   END IF
     734       535968 :                END DO
     735              :             END IF
     736              : 
     737              :             ! transposed block
     738              :             IF (image_siesta_transp > 0) THEN
     739              :                DO icol_local = 1, ncols_local
     740        59560 :                   IF (do_distribute) THEN
     741       549600 : #if defined(__SMEAGOL)
     742       494640 :                      CALL GlobalToLocalOrb(icol_local + first_col_minus_one, mepos, nprocs, irow_proc)
     743              : #else
     744            0 :                      CALL cp_abort(__LOCATION__, &
     745              :                                    "CP2K was compiled with no SMEAGOL support.")
     746              : #endif
     747              :                   ELSE
     748              :                      IF (is_root_rank) THEN
     749              :                         irow_proc = icol_local + first_col_minus_one
     750       494640 :                      ELSE
     751       494640 :                         irow_proc = 0
     752              :                      END IF
     753            0 :                   END IF
     754              :                   IF (irow_proc > 0) THEN
     755              :                      offset_send_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
     756       549600 :                      reorder_recv_buffer(offset_send_mepos + 1:offset_send_mepos + nrows_local) = &
     757       494640 :                         recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + nrows_local)
     758              :                      offset_recv_mepos = offset_recv_mepos + nrows_local
     759      4946400 :                      next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + nrows_local
     760       494640 :                   END IF
     761       494640 :                END DO
     762              :             END IF
     763              :          END DO
     764              : 
     765              :          IF (debug_this_module) THEN
     766              :             DO irow_local = 1, siesta_struct%nrows
     767              :                IF (siesta_struct%n_nonzero_cols(irow_local) /= next_nonzero_element_offset(irow_local)) THEN
     768              :                   CALL cp__a(__SHORT_FILE__, __LINE__)
     769              :                END IF
     770              :             END DO
     771              :          END IF
     772              : 
     773              :          DEALLOCATE (next_nonzero_element_offset)
     774              :          DEALLOCATE (recv_buffer)
     775            8 : 
     776            8 :          ! Map non-zero matrix element between the intermediate order and SIESTA order
     777              :          DO irow_local = 1, siesta_struct%nrows
     778              :             offset_recv_mepos = siesta_struct%row_offset(irow_local)
     779         2456 :             DO icol_local = 1, siesta_struct%n_nonzero_cols(irow_local)
     780         2448 :                matrix_siesta(offset_recv_mepos + icol_local) = &
     781      9277928 :                   reorder_recv_buffer(offset_recv_mepos + siesta_struct%packed_index(offset_recv_mepos + icol_local))
     782              :             END DO
     783      9277920 :          END DO
     784              :          DEALLOCATE (reorder_recv_buffer)
     785              :       END IF
     786            8 : 
     787              :       CALL timestop(handle)
     788              :    END SUBROUTINE convert_dbcsr_to_distributed_siesta
     789           16 : 
     790           48 : ! **************************************************************************************************
     791              : !> \brief Convert matrix from DBCSR to sparse SIESTA format.
     792              : !> \param matrix_dbcsr_kp    DBCSR matrix [out]. The matrix is declared as INTENT(in) as pointers to
     793              : !>                 dbcsr matrices remain intact. However we have intention to update matrix elements
     794              : !> \param matrix_siesta      matrix in SIESTA format [in]
     795              : !> \param siesta_struct      structure to map matrix blocks between formats
     796              : !> \param para_env           MPI parallel environment
     797              : ! **************************************************************************************************
     798              :    SUBROUTINE convert_distributed_siesta_to_dbcsr(matrix_dbcsr_kp, matrix_siesta, siesta_struct, para_env)
     799              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in)       :: matrix_dbcsr_kp
     800            0 :       REAL(kind=dp), DIMENSION(:), INTENT(in)            :: matrix_siesta
     801              :       TYPE(siesta_distrib_csc_struct_type), INTENT(in)   :: siesta_struct
     802              :       TYPE(mp_para_env_type), INTENT(in), POINTER        :: para_env
     803              : 
     804              :       CHARACTER(len=*), PARAMETER :: routineN = 'convert_distributed_siesta_to_dbcsr'
     805              : 
     806              :       INTEGER :: first_col_minus_one, first_row_minus_one, handle, icol_blk, icol_local, &
     807              :          image_dbcsr, image_siesta, image_siesta_transp, inode, inode_proc, iproc, irequest, &
     808              :          irow_blk, irow_local, irow_proc, mepos, n_image_ind, ncols_blk, ncols_local, nnodes_proc, &
     809              :          node_offset, nprocs, nrequests_recv, nrequests_total, nrows_blk, nrows_local
     810              :       INTEGER(kind=int_8)                                :: n_nonzero_elements_dbcsr, &
     811              :                                                             n_nonzero_elements_siesta, &
     812              :                                                             offset_recv_mepos, offset_send_mepos
     813              :       INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:)     :: n_packed_elements_per_proc, &
     814              :                                                             nelements_per_request, &
     815            0 :                                                             offset_per_proc, offset_per_request
     816            0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: next_nonzero_element_offset, peer_rank, &
     817            0 :                                                             request_tag
     818            0 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_offset, col_blk_size, &
     819            0 :                                                             row_blk_offset, row_blk_size
     820            0 :       LOGICAL                                            :: do_distribute, found, is_root_rank, &
     821            0 :                                                             symmetric
     822              :       REAL(kind=dp), ALLOCATABLE, DIMENSION(:)           :: recv_buffer, reorder_send_buffer, &
     823              :                                                             send_buffer
     824            0 :       REAL(kind=dp), DIMENSION(:, :), POINTER            :: sm_block
     825            0 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: requests
     826            0 : 
     827            0 :       CALL timeset(routineN, handle)
     828              :       DO image_dbcsr = 1, SIZE(matrix_dbcsr_kp)
     829            0 :          CALL dbcsr_set(matrix_dbcsr_kp(image_dbcsr)%matrix, 0.0_dp)
     830            0 :       END DO
     831            0 : 
     832              :       mepos = para_env%mepos
     833              :       nprocs = para_env%num_pe
     834            0 :       do_distribute = siesta_struct%gather_root < 0
     835            0 :       is_root_rank = siesta_struct%gather_root == mepos
     836            0 : 
     837            0 :       CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
     838              :                           nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
     839              :                           row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
     840              :                           row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
     841              :       symmetric = siesta_struct%symmetric
     842            0 : 
     843            0 :       n_nonzero_elements_siesta = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv))
     844              :       n_nonzero_elements_dbcsr = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_send))
     845            0 : 
     846            0 :       nrequests_recv = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
     847              :                                                            max_mpi_packet_size_dp)
     848              :       nrequests_total = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
     849            0 :                                                             max_mpi_packet_size_dp) + nrequests_recv
     850              :       IF (nrequests_total > 0) THEN
     851            0 :          ALLOCATE (requests(nrequests_total))
     852            0 :          ALLOCATE (peer_rank(nrequests_total), request_tag(nrequests_total))
     853            0 :          ALLOCATE (offset_per_request(nrequests_total), nelements_per_request(nrequests_total))
     854            0 :          !requests(:) = mp_request_null
     855            0 :          IF (nrequests_recv > 0) THEN
     856              :             CALL assign_nonzero_elements_to_requests(offset_per_request(1:nrequests_recv), &
     857            0 :                                                      nelements_per_request(1:nrequests_recv), &
     858              :                                                      peer_rank(1:nrequests_recv), &
     859              :                                                      request_tag(1:nrequests_recv), &
     860              :                                                      mepos, &
     861              :                                                      siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
     862              :                                                      max_mpi_packet_size_dp)
     863              :          END IF
     864            0 :          IF (nrequests_total > nrequests_recv) THEN
     865              :             CALL assign_nonzero_elements_to_requests(offset_per_request(nrequests_recv + 1:nrequests_total), &
     866            0 :                                                      nelements_per_request(nrequests_recv + 1:nrequests_total), &
     867              :                                                      peer_rank(nrequests_recv + 1:nrequests_total), &
     868              :                                                      request_tag(nrequests_recv + 1:nrequests_total), &
     869              :                                                      mepos, &
     870              :                                                      siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
     871              :                                                      max_mpi_packet_size_dp)
     872              :          END IF
     873            0 :       END IF
     874              : 
     875              :       IF (n_nonzero_elements_dbcsr > 0) THEN
     876              :          ALLOCATE (recv_buffer(n_nonzero_elements_dbcsr))
     877            0 :       END IF
     878            0 :       DO irequest = 1, nrequests_recv
     879              :          CALL para_env%irecv(recv_buffer(offset_per_request(irequest) + 1: &
     880            0 :                                          offset_per_request(irequest) + nelements_per_request(irequest)), &
     881              :                              peer_rank(irequest), requests(irequest), request_tag(irequest))
     882              :       END DO
     883            0 : 
     884              :       ALLOCATE (offset_per_proc(0:nprocs - 1), n_packed_elements_per_proc(0:nprocs - 1))
     885              :       offset_per_proc(0) = 0
     886            0 :       DO iproc = 1, nprocs - 1
     887            0 :          offset_per_proc(iproc) = offset_per_proc(iproc - 1) + siesta_struct%nelements_per_proc(iproc - 1, nelements_dbcsr_send)
     888            0 :       END DO
     889            0 :       n_packed_elements_per_proc(:) = 0
     890              : 
     891            0 :       IF (mepos > 0) THEN
     892              :          node_offset = SUM(siesta_struct%nnodes_per_proc(0:mepos - 1))
     893            0 :       ELSE
     894            0 :          node_offset = 0
     895              :       END IF
     896              :       nnodes_proc = siesta_struct%nnodes_per_proc(mepos)
     897              : 
     898            0 :       IF (n_nonzero_elements_siesta > 0) THEN
     899              :          ALLOCATE (send_buffer(n_nonzero_elements_siesta))
     900            0 : 
     901            0 :          ALLOCATE (reorder_send_buffer(n_nonzero_elements_siesta))
     902              :          DO irow_local = 1, siesta_struct%nrows
     903            0 :             offset_send_mepos = siesta_struct%row_offset(irow_local)
     904            0 :             DO icol_local = 1, siesta_struct%n_nonzero_cols(irow_local)
     905            0 :                reorder_send_buffer(offset_send_mepos + siesta_struct%packed_index(offset_send_mepos + icol_local)) = &
     906            0 :                   matrix_siesta(offset_send_mepos + icol_local)
     907              :             END DO
     908            0 :          END DO
     909              : 
     910              :          ALLOCATE (next_nonzero_element_offset(siesta_struct%nrows))
     911              :          next_nonzero_element_offset(:) = 0
     912            0 :          offset_send_mepos = 0
     913            0 : 
     914            0 :          DO inode = 1, SIZE(siesta_struct%nl_repl, 2)
     915              :             irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
     916            0 :             icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
     917            0 :             CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
     918            0 :             image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
     919            0 :             image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
     920            0 : 
     921            0 :             nrows_local = row_blk_size(irow_blk)
     922              :             ncols_local = col_blk_size(icol_blk)
     923            0 :             first_row_minus_one = row_blk_offset(irow_blk) - 1
     924            0 :             first_col_minus_one = col_blk_offset(icol_blk) - 1
     925            0 : 
     926            0 :             IF (image_siesta > 0) THEN
     927              :                DO irow_local = 1, nrows_local
     928            0 :                   IF (do_distribute) THEN
     929            0 : #if defined(__SMEAGOL)
     930            0 :                      CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
     931              : #else
     932            0 :                      CALL cp_abort(__LOCATION__, &
     933              :                                    "CP2K was compiled with no SMEAGOL support.")
     934              : #endif
     935              :                   ELSE
     936              :                      IF (is_root_rank) THEN
     937              :                         irow_proc = irow_local + first_row_minus_one
     938            0 :                      ELSE
     939            0 :                         irow_proc = 0
     940              :                      END IF
     941            0 :                   END IF
     942              :                   IF (irow_proc > 0) THEN
     943              :                      offset_recv_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
     944            0 :                      send_buffer(offset_send_mepos + 1:offset_send_mepos + ncols_local) = &
     945            0 :                         reorder_send_buffer(offset_recv_mepos + 1:offset_recv_mepos + ncols_local)
     946              :                      offset_send_mepos = offset_send_mepos + ncols_local
     947            0 :                      next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + ncols_local
     948            0 :                   END IF
     949            0 :                END DO
     950              :             END IF
     951              : 
     952              :             ! transposed block
     953              :             IF (image_siesta_transp > 0) THEN
     954              :                DO icol_local = 1, ncols_local
     955            0 :                   IF (do_distribute) THEN
     956            0 : #if defined(__SMEAGOL)
     957            0 :                      CALL GlobalToLocalOrb(icol_local + first_col_minus_one, mepos, nprocs, irow_proc)
     958              : #else
     959            0 :                      CALL cp_abort(__LOCATION__, &
     960              :                                    "CP2K was compiled with no SMEAGOL support.")
     961              : #endif
     962              :                   ELSE
     963              :                      IF (is_root_rank) THEN
     964              :                         irow_proc = icol_local + first_col_minus_one
     965            0 :                      ELSE
     966            0 :                         irow_proc = 0
     967              :                      END IF
     968            0 :                   END IF
     969              :                   IF (irow_proc > 0) THEN
     970              :                      offset_recv_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
     971            0 :                      send_buffer(offset_send_mepos + 1:offset_send_mepos + nrows_local) = &
     972            0 :                         reorder_send_buffer(offset_recv_mepos + 1:offset_recv_mepos + nrows_local)
     973              :                      offset_send_mepos = offset_send_mepos + nrows_local
     974            0 :                      next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + nrows_local
     975            0 :                   END IF
     976            0 :                END DO
     977              :             END IF
     978              :          END DO
     979              : 
     980              :          IF (debug_this_module) THEN
     981              :             DO irow_local = 1, siesta_struct%nrows
     982              :                IF (siesta_struct%n_nonzero_cols(irow_local) /= next_nonzero_element_offset(irow_local)) THEN
     983              :                   CALL cp__a(__SHORT_FILE__, __LINE__)
     984              :                END IF
     985              :             END DO
     986              :          END IF
     987              : 
     988              :          DEALLOCATE (next_nonzero_element_offset)
     989              :          DEALLOCATE (reorder_send_buffer)
     990            0 : 
     991            0 :          DO irequest = nrequests_recv + 1, nrequests_total
     992              :             CALL para_env%isend(send_buffer(offset_per_request(irequest) + 1: &
     993            0 :                                             offset_per_request(irequest) + nelements_per_request(irequest)), &
     994              :                                 peer_rank(irequest), requests(irequest), request_tag(irequest))
     995              :          END DO
     996            0 : 
     997              :          ! copy data locally that stay on the same process.
     998              :          IF (mepos > 0) THEN
     999              :             offset_send_mepos = SUM(siesta_struct%nelements_per_proc(0:mepos - 1, nelements_dbcsr_recv))
    1000            0 :          ELSE
    1001            0 :             offset_send_mepos = 0
    1002              :          END IF
    1003              :          offset_recv_mepos = offset_per_proc(mepos)
    1004              : 
    1005            0 :          IF (debug_this_module) THEN
    1006              :             IF (siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_recv) /= &
    1007              :                 siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send)) THEN
    1008              :                CALL cp__a(__SHORT_FILE__, __LINE__)
    1009              :             END IF
    1010              :          END IF
    1011              : 
    1012              :          IF (siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send) > 0) THEN
    1013              :             recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send)) = &
    1014            0 :                send_buffer(offset_send_mepos + 1:offset_send_mepos + siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send))
    1015              :          END IF
    1016            0 :       END IF
    1017              : 
    1018              :       IF (nrequests_total > 0) THEN
    1019              :          ! wait for pending isend/irecv requests
    1020            0 :          CALL mp_waitall(requests)
    1021              :          DEALLOCATE (nelements_per_request, offset_per_request, peer_rank, requests, request_tag)
    1022            0 :       END IF
    1023            0 : 
    1024              :       IF (ALLOCATED(send_buffer)) DEALLOCATE (send_buffer)
    1025              : 
    1026            0 :       iproc = siesta_struct%gather_root ! if do_distribute == .FALSE., collect matrix elements from MPI process with rank gather_root
    1027              :       IF (n_nonzero_elements_dbcsr > 0) THEN
    1028            0 :          DO inode_proc = 1, nnodes_proc
    1029            0 :             n_image_ind = siesta_struct%n_dbcsr_cell_images_to_merge(inode_proc)
    1030            0 :             IF (n_image_ind > 0) THEN
    1031            0 :                inode = node_offset + inode_proc
    1032            0 : 
    1033            0 :                irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
    1034              :                icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
    1035            0 :                image_dbcsr = siesta_struct%nl_repl(neighbor_list_dbcsr_image_index, inode)
    1036            0 :                CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
    1037            0 :                image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
    1038            0 :                image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
    1039            0 : 
    1040            0 :                nrows_local = row_blk_size(irow_blk)
    1041              :                ncols_local = col_blk_size(icol_blk)
    1042            0 :                first_row_minus_one = row_blk_offset(irow_blk) - 1
    1043            0 :                first_col_minus_one = col_blk_offset(icol_blk) - 1
    1044            0 : 
    1045            0 :                CALL dbcsr_get_block_p(matrix=matrix_dbcsr_kp(image_dbcsr)%matrix, &
    1046              :                                       row=irow_blk, col=icol_blk, block=sm_block, found=found)
    1047              :                CPASSERT(found)
    1048            0 : 
    1049            0 :                IF (image_siesta > 0) THEN
    1050              :                   DO irow_local = 1, nrows_local
    1051            0 :                      IF (do_distribute) THEN
    1052            0 : #if defined(__SMEAGOL)
    1053            0 :                         CALL WhichNodeOrb(irow_local + first_row_minus_one, nprocs, iproc) ! iproc_orb
    1054              : #else
    1055            0 :                         CALL cp_abort(__LOCATION__, &
    1056              :                                       "CP2K was compiled with no SMEAGOL support.")
    1057              : #endif
    1058              :                      END IF
    1059              :                      ! CPASSERT
    1060              :                      IF (debug_this_module) THEN
    1061              :                         CPASSERT(iproc >= 0 .AND. iproc < nprocs)
    1062              :                         IF (n_packed_elements_per_proc(iproc) + ncols_local > &
    1063              :                             siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
    1064              :                            CALL cp__a(__SHORT_FILE__, __LINE__)
    1065              :                         END IF
    1066              :                      END IF
    1067              : 
    1068              :                      offset_recv_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
    1069              :                      sm_block(irow_local, 1:ncols_local) = recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + ncols_local)
    1070              : 
    1071            0 :                      n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + ncols_local
    1072            0 :                   END DO
    1073              :                END IF
    1074            0 : 
    1075              :                ! transposed block
    1076              :                IF (image_siesta_transp > 0) THEN
    1077              :                   DO icol_local = 1, ncols_local
    1078              :                      IF (do_distribute) THEN
    1079            0 : #if defined(__SMEAGOL)
    1080            0 :                         CALL WhichNodeOrb(icol_local + first_col_minus_one, nprocs, iproc) ! iproc_orb
    1081            0 : #else
    1082              :                         CALL cp_abort(__LOCATION__, &
    1083            0 :                                       "CP2K was compiled with no SMEAGOL support.")
    1084              : #endif
    1085              :                      END IF
    1086              :                      ! CPASSERT
    1087              :                      IF (debug_this_module) THEN
    1088              :                         CPASSERT(iproc >= 0 .AND. iproc < nprocs)
    1089              :                         IF (n_packed_elements_per_proc(iproc) + nrows_local > &
    1090              :                             siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
    1091              :                            CALL cp__a(__SHORT_FILE__, __LINE__)
    1092              :                         END IF
    1093              :                      END IF
    1094              : 
    1095              :                      offset_recv_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
    1096              :                      sm_block(1:nrows_local, icol_local) = recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + nrows_local)
    1097              : 
    1098              :                      n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + nrows_local
    1099            0 :                   END DO
    1100            0 :                END IF
    1101              :             END IF
    1102            0 :          END DO
    1103              : 
    1104              :          IF (debug_this_module) THEN
    1105              :             DO iproc = 0, nprocs - 1
    1106              :                IF (n_packed_elements_per_proc(iproc) /= siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
    1107              :                   CALL cp__a(__SHORT_FILE__, __LINE__)
    1108              :                END IF
    1109              :             END DO
    1110              :          END IF
    1111              : 
    1112              :          DEALLOCATE (recv_buffer)
    1113              :       END IF
    1114              : 
    1115              :       DEALLOCATE (offset_per_proc, n_packed_elements_per_proc)
    1116            0 : 
    1117              :       CALL timestop(handle)
    1118              :    END SUBROUTINE convert_distributed_siesta_to_dbcsr
    1119            0 : 
    1120              :    ! *** PRIVATE SUBROUTINES ***
    1121            0 : 
    1122            0 : ! **************************************************************************************************
    1123              : !> \brief Computes number of neighbour-list nodes on the current parallel process.
    1124              : !> \param nnodes_local              number of nodes [out]
    1125              : !> \param max_ijk_cell_image_local  largest index of cell images along i, j and k cell vectors
    1126              : !>                                  on this parallel process [out]
    1127              : !> \param max_ijk_cell_image        largest index of cell images along i, j and k cell vectors [inout]
    1128              : !> \param sab_nl                    pair-wise neighbour list [in]
    1129              : !> \param para_env                  MPI parallel environment [in]
    1130              : !> \param particle_coords           list of atomic coordinates subject to periodic boundary conditions [in]
    1131              : !> \param cell                      simulation unit cell [in]
    1132              : ! **************************************************************************************************
    1133              :    SUBROUTINE get_nnodes_local(nnodes_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, para_env, particle_coords, cell)
    1134              :       INTEGER, INTENT(out)                               :: nnodes_local
    1135              :       INTEGER, DIMENSION(3), INTENT(out)                 :: max_ijk_cell_image_local
    1136              :       INTEGER, DIMENSION(3), INTENT(inout)               :: max_ijk_cell_image
    1137            4 :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
    1138              :          INTENT(in), POINTER                             :: sab_nl
    1139              :       TYPE(mp_para_env_type), INTENT(in), POINTER        :: para_env
    1140              :       REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
    1141              :          INTENT(in)                                      :: particle_coords
    1142              :       TYPE(cell_type), INTENT(in), POINTER               :: cell
    1143              : 
    1144              :       CHARACTER(len=*), PARAMETER                        :: routineN = 'get_nnodes_local'
    1145              : 
    1146              :       INTEGER                                            :: handle, iatom, icoord, jatom
    1147              :       INTEGER, DIMENSION(3)                              :: cell_ijk, max_ijk_cell_image_tmp
    1148              :       LOGICAL                                            :: update_ncells
    1149              :       REAL(kind=dp), DIMENSION(3)                        :: r_ij
    1150              :       TYPE(neighbor_list_iterator_p_type), &
    1151              :          DIMENSION(:), POINTER                           :: nl_iterator
    1152              : 
    1153              :       CALL timeset(routineN, handle)
    1154              : 
    1155            4 :       update_ncells = .FALSE.
    1156              :       DO icoord = 1, 3 ! x, y, z
    1157            4 :          IF (max_ijk_cell_image(icoord) >= 0) THEN
    1158              :             max_ijk_cell_image_tmp(icoord) = max_ijk_cell_image(icoord)
    1159            4 :          ELSE
    1160           16 :             max_ijk_cell_image_tmp(icoord) = HUGE(max_ijk_cell_image_tmp(icoord))
    1161           16 :             update_ncells = .TRUE.
    1162            4 :          END IF
    1163              :       END DO
    1164            8 : 
    1165            8 :       nnodes_local = 0
    1166              :       max_ijk_cell_image_local(:) = 0
    1167              : 
    1168              :       CALL neighbor_list_iterator_create(nl_iterator, sab_nl)
    1169            4 :       DO WHILE (neighbor_list_iterate(nl_iterator) == 0)
    1170            4 :          CALL get_iterator_info(nl_iterator, iatom=iatom, jatom=jatom, r=r_ij)
    1171              :          CALL get_negf_cell_ijk(cell_ijk, r_ij, r_i=particle_coords(1:3, iatom), r_j=particle_coords(1:3, jatom), cell=cell)
    1172            4 :          cell_ijk(1:3) = ABS(cell_ijk(1:3))
    1173        15164 : 
    1174        15160 :          IF (cell_ijk(1) <= max_ijk_cell_image_tmp(1) .AND. cell_ijk(2) <= max_ijk_cell_image_tmp(2) .AND. &
    1175        15160 :              cell_ijk(3) <= max_ijk_cell_image_tmp(3)) THEN
    1176        60640 :             nnodes_local = nnodes_local + 1
    1177              :             max_ijk_cell_image_local(1:3) = MAX(max_ijk_cell_image_local(1:3), cell_ijk(1:3))
    1178        15160 :          END IF
    1179            4 :       END DO
    1180        14888 :       CALL neighbor_list_iterator_release(nl_iterator)
    1181        59552 : 
    1182              :       IF (update_ncells) THEN
    1183              :          max_ijk_cell_image_tmp(1:3) = max_ijk_cell_image_local(1:3)
    1184            4 :          CALL para_env%max(max_ijk_cell_image_tmp)
    1185              :          DO icoord = 1, 3
    1186            4 :             IF (max_ijk_cell_image(icoord) < 0) THEN
    1187            4 :                max_ijk_cell_image(icoord) = max_ijk_cell_image_tmp(icoord)
    1188            4 :             END IF
    1189           16 :          END DO
    1190           16 :       END IF
    1191            8 : 
    1192              :       CALL timestop(handle)
    1193              :    END SUBROUTINE get_nnodes_local
    1194              : 
    1195              : ! **************************************************************************************************
    1196            4 : !> \brief Construct list of neighbour-list's nodes on the current parallel process.
    1197            4 : !> \param nl_local                  non-merged local neighbour-list's nodes [out]
    1198              : !> \param max_ijk_cell_image_local  largest index of cell images along i, j and k cell vectors
    1199              : !>                                  on this parallel process [in]
    1200              : !> \param max_ijk_cell_image        largest index of cell images along i, j and k cell vectors [in]
    1201              : !> \param sab_nl                    pair-wise neighbour list [in]
    1202              : !> \param particle_coords           list of atomic coordinates subject to periodic boundary conditions [in]
    1203              : !> \param cell                      simulation unit cell [in]
    1204              : !> \param cell_to_index             array to convert 3-D cell indices to 1-D DBCSR image indices
    1205              : !> \param do_merge                  merge cell images along transport direction [in]
    1206              : !> \param node_merged_indices       nodes-related indices. Nodes with identical indices will be merged [out]
    1207              : !> \param k_cells                   list of cell image indices along transport direction. Nodes to
    1208              : !>                be merged have the same 'node_merged_indices' but different 'k_cells' indices [out]
    1209              : ! **************************************************************************************************
    1210              :    SUBROUTINE get_nl_nodes_local(nl_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, particle_coords, &
    1211              :                                  cell, cell_to_index, do_merge, node_merged_indices, k_cells)
    1212              :       INTEGER, DIMENSION(:, :), INTENT(out)              :: nl_local
    1213              :       INTEGER, DIMENSION(3), INTENT(in)                  :: max_ijk_cell_image_local, &
    1214            4 :                                                             max_ijk_cell_image
    1215            4 :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
    1216              :          INTENT(in), POINTER                             :: sab_nl
    1217              :       REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
    1218              :          INTENT(in)                                      :: particle_coords
    1219              :       TYPE(cell_type), INTENT(in), POINTER               :: cell
    1220              :       INTEGER, DIMENSION(:, :, :), INTENT(in), POINTER   :: cell_to_index
    1221              :       LOGICAL, INTENT(in)                                :: do_merge
    1222              :       INTEGER(kind=int_8), DIMENSION(:), INTENT(out)     :: node_merged_indices
    1223              :       INTEGER, DIMENSION(:), INTENT(out)                 :: k_cells
    1224              : 
    1225              :       CHARACTER(len=*), PARAMETER :: routineN = 'get_nl_nodes_local'
    1226              : 
    1227              :       INTEGER                                            :: handle, iatom, icol_blk, image, inode, &
    1228              :                                                             irow_blk, jatom, natoms
    1229              :       INTEGER(kind=8), DIMENSION(2)                      :: ncells_siesta_local
    1230              :       INTEGER, DIMENSION(2)                              :: ncells_siesta
    1231              :       INTEGER, DIMENSION(3)                              :: cell_ijk_abs, cell_ijk_dbcsr, &
    1232              :                                                             cell_ijk_siesta
    1233              :       LOGICAL                                            :: do_symmetric
    1234              :       REAL(kind=dp), DIMENSION(3)                        :: r_ij
    1235              :       TYPE(neighbor_list_iterator_p_type), &
    1236              :          DIMENSION(:), POINTER                           :: nl_iterator
    1237              : 
    1238              :       CALL timeset(routineN, handle)
    1239              :       ! natoms only used to compute a merged 1D index of each DBCSR block
    1240            4 :       natoms = SIZE(particle_coords, 2)
    1241              :       CALL get_neighbor_list_set_p(neighbor_list_sets=sab_nl, symmetric=do_symmetric)
    1242            4 :       ncells_siesta(1:2) = 2*max_ijk_cell_image(1:2) + 1
    1243              : 
    1244            4 :       ncells_siesta_local(1:2) = INT(2*max_ijk_cell_image_local(1:2) + 1, kind=int_8)
    1245            4 : 
    1246           12 :       inode = 0
    1247              :       CALL neighbor_list_iterator_create(nl_iterator, sab_nl)
    1248           12 :       DO WHILE (neighbor_list_iterate(nl_iterator) == 0)
    1249              :          CALL get_iterator_info(nl_iterator, iatom=iatom, jatom=jatom, cell=cell_ijk_dbcsr, r=r_ij)
    1250            4 :          CALL get_negf_cell_ijk(cell_ijk_abs, r_ij, r_i=particle_coords(1:3, iatom), r_j=particle_coords(1:3, jatom), cell=cell)
    1251            4 : 
    1252        15164 :          IF (ABS(cell_ijk_abs(1)) <= max_ijk_cell_image(1) .AND. ABS(cell_ijk_abs(2)) <= max_ijk_cell_image(2) .AND. &
    1253        15160 :              ABS(cell_ijk_abs(3)) <= max_ijk_cell_image(3)) THEN
    1254        15160 : 
    1255              :             inode = inode + 1
    1256        15160 : 
    1257            4 :             image = get_index_by_cell(cell_ijk_dbcsr, cell_to_index)
    1258              :             CPASSERT(image > 0)
    1259        14888 :             nl_local(neighbor_list_dbcsr_image_index, inode) = image
    1260              : 
    1261        14888 :             IF (do_symmetric .AND. iatom > jatom) THEN
    1262        14888 :                irow_blk = jatom
    1263        14888 :                icol_blk = iatom
    1264              :                cell_ijk_abs(1:3) = -cell_ijk_abs(1:3)
    1265        14888 :             ELSE
    1266        28224 :                irow_blk = iatom
    1267        28224 :                icol_blk = jatom
    1268        28224 :             END IF
    1269              : 
    1270              :             nl_local(neighbor_list_iatom_index, inode) = irow_blk
    1271              :             nl_local(neighbor_list_jatom_index, inode) = icol_blk
    1272              : 
    1273              :             cell_ijk_siesta(1:3) = index_in_canonical_enumeration(cell_ijk_abs(1:3)) ! absolute -> SIESTA
    1274        14888 : 
    1275        14888 :             IF (do_merge) THEN
    1276              :                node_merged_indices(inode) = (((cell_ijk_siesta(2) - 1)*ncells_siesta_local(1) + cell_ijk_siesta(1) - 1)* &
    1277        59552 :                                      INT(natoms, kind=int_8) + icol_blk - 1)*INT(natoms, kind=int_8) + INT(irow_blk - 1, kind=int_8)
    1278              :                image = cell_ijk_siesta(1) + ncells_siesta(1)*(cell_ijk_siesta(2) - 1)
    1279        14888 :             ELSE
    1280              :                node_merged_indices(inode) = ((((cell_ijk_siesta(3) - 1)*ncells_siesta_local(2) + &
    1281            0 :                                                cell_ijk_siesta(2) - 1)*ncells_siesta_local(1) + cell_ijk_siesta(1) - 1)* &
    1282            0 :                                      INT(natoms, kind=int_8) + icol_blk - 1)*INT(natoms, kind=int_8) + INT(irow_blk - 1, kind=int_8)
    1283              :                image = cell_ijk_siesta(1) + ncells_siesta(1)*(cell_ijk_siesta(2) - 1 + ncells_siesta(2)*(cell_ijk_siesta(3) - 1))
    1284              :             END IF
    1285              :             k_cells(inode) = cell_ijk_siesta(3)
    1286        14888 :             nl_local(neighbor_list_siesta_image_index, inode) = image
    1287        14888 : 
    1288              :             IF (do_symmetric .AND. irow_blk /= icol_blk) THEN
    1289        14888 :                cell_ijk_abs(1:3) = -cell_ijk_abs(1:3)
    1290        14888 :                cell_ijk_siesta(1:3) = index_in_canonical_enumeration(cell_ijk_abs(1:3)) ! absolute -> SIESTA
    1291              :                IF (do_merge) cell_ijk_siesta(3) = 1
    1292        14888 :                nl_local(neighbor_list_siesta_transp_image_index, inode) = &
    1293        54960 :                   cell_ijk_siesta(1) + ncells_siesta(1)*(cell_ijk_siesta(2) - 1 + ncells_siesta(2)*(cell_ijk_siesta(3) - 1))
    1294        54960 :             ELSE
    1295        13740 :                nl_local(neighbor_list_siesta_transp_image_index, inode) = 0
    1296              :             END IF
    1297        13740 :          END IF
    1298              :       END DO
    1299         1148 :       CALL neighbor_list_iterator_release(nl_iterator)
    1300              : 
    1301              :       IF (debug_this_module) THEN
    1302              :          CPASSERT(SIZE(nl_local, 2) == inode)
    1303            4 :       END IF
    1304              : 
    1305              :       CALL timestop(handle)
    1306              :    END SUBROUTINE get_nl_nodes_local
    1307              : 
    1308              : ! **************************************************************************************************
    1309            4 : !> \brief Replicate (and optionally merge) pair-wise neighbour list.
    1310            4 : !> \param repl_nl                   replicated neighbour list. It needs to be deallocated elsewhere [allocated]
    1311              : !> \param n_dbcsr_cell_images_to_merge  number of merged blocks per neighbour-list node [allocated]
    1312              : !> \param dbcsr_cell_image_to_merge list of DBCSR image indices to merge [allocated]
    1313              : !> \param nnodes_per_proc           number of merged nodes on each parallel processes [out]
    1314              : !> \param max_ijk_cell_image        largest index of cell images along i, j and k cell vectors [inout]
    1315              : !> \param sab_nl                    pair-wise neighbour list [in]
    1316              : !> \param para_env                  MPI parallel environment [in]
    1317              : !> \param particle_coords           list of atomic coordinates subject to periodic boundary conditions [in]
    1318              : !> \param cell                      simulation unit cell [in]
    1319              : !> \param cell_to_index             array to convert 3-D cell indices to 1-D DBCSR image indices [in]
    1320              : !> \param do_merge                  merge cell images along transport direction [in]
    1321              : ! **************************************************************************************************
    1322              :    SUBROUTINE replicate_neighbour_list(repl_nl, n_dbcsr_cell_images_to_merge, dbcsr_cell_image_to_merge, &
    1323              :                                        nnodes_per_proc, max_ijk_cell_image, sab_nl, para_env, particle_coords, &
    1324              :                                        cell, cell_to_index, do_merge)
    1325              :       INTEGER, ALLOCATABLE, DIMENSION(:, :), &
    1326            4 :          INTENT(inout)                                   :: repl_nl
    1327            4 :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(inout)  :: n_dbcsr_cell_images_to_merge, &
    1328              :                                                             dbcsr_cell_image_to_merge
    1329              :       INTEGER, DIMENSION(0:), INTENT(out)                :: nnodes_per_proc
    1330              :       INTEGER, DIMENSION(3), INTENT(inout)               :: max_ijk_cell_image
    1331              :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
    1332              :          INTENT(in), POINTER                             :: sab_nl
    1333              :       TYPE(mp_para_env_type), INTENT(in), POINTER        :: para_env
    1334              :       REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
    1335              :          INTENT(in)                                      :: particle_coords
    1336              :       TYPE(cell_type), INTENT(in), POINTER               :: cell
    1337              :       INTEGER, DIMENSION(:, :, :), INTENT(in), POINTER   :: cell_to_index
    1338              :       LOGICAL, INTENT(in)                                :: do_merge
    1339              : 
    1340              :       CHARACTER(len=*), PARAMETER :: routineN = 'replicate_neighbour_list'
    1341              : 
    1342              :       INTEGER                                            :: handle, inode, iproc, kcell_closest, &
    1343              :                                                             nnodes_local, nnodes_merged, &
    1344              :                                                             nnodes_repl, offset_inode
    1345              :       INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:)     :: node_merged_indices
    1346              :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: inodes_orig, k_cells
    1347              :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: nl_local
    1348              :       INTEGER, DIMENSION(3)                              :: max_ijk_cell_image_local
    1349            4 : 
    1350            4 :       CALL timeset(routineN, handle)
    1351            4 :       CPASSERT(.NOT. ALLOCATED(repl_nl))
    1352              :       CPASSERT(.NOT. ALLOCATED(n_dbcsr_cell_images_to_merge))
    1353              :       CPASSERT(.NOT. ALLOCATED(dbcsr_cell_image_to_merge))
    1354            4 : 
    1355            4 :       CALL get_nnodes_local(nnodes_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, para_env, particle_coords, cell)
    1356            4 : 
    1357            4 :       nnodes_per_proc(:) = 0
    1358              : 
    1359            4 :       IF (nnodes_local > 0) THEN
    1360              :          ALLOCATE (nl_local(neighbor_list_dim1, nnodes_local))
    1361           12 :          ALLOCATE (node_merged_indices(nnodes_local))
    1362              :          ALLOCATE (k_cells(nnodes_local))
    1363            4 :          CALL get_nl_nodes_local(nl_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, particle_coords, cell, &
    1364           12 :                                  cell_to_index, do_merge, node_merged_indices, k_cells)
    1365           12 : 
    1366           12 :          ALLOCATE (inodes_orig(nnodes_local))
    1367              :          CALL sort(node_merged_indices, nnodes_local, inodes_orig)
    1368            4 : 
    1369              :          nnodes_merged = 1
    1370            8 :          DO inode = 2, nnodes_local
    1371            4 :             IF (node_merged_indices(inode) > node_merged_indices(inode - 1)) nnodes_merged = nnodes_merged + 1
    1372              :          END DO
    1373            4 :       ELSE
    1374        14888 :          nnodes_merged = 0
    1375        14888 :       END IF
    1376              : 
    1377              :       nnodes_per_proc(para_env%mepos) = nnodes_merged
    1378              :       CALL para_env%sum(nnodes_per_proc)
    1379              : 
    1380              :       nnodes_repl = SUM(nnodes_per_proc(:))
    1381            4 :       ALLOCATE (repl_nl(neighbor_list_dim1, nnodes_repl))
    1382           20 : 
    1383              :       IF (nnodes_local > 0) THEN
    1384           12 :          IF (para_env%mepos > 0) THEN
    1385           12 :             offset_inode = SUM(nnodes_per_proc(0:para_env%mepos - 1))
    1386              :          ELSE
    1387            4 :             offset_inode = 0
    1388            4 :          END IF
    1389            4 : 
    1390              :          ALLOCATE (n_dbcsr_cell_images_to_merge(nnodes_merged))
    1391              :          ALLOCATE (dbcsr_cell_image_to_merge(nnodes_local))
    1392              :          n_dbcsr_cell_images_to_merge(:) = 0
    1393              : 
    1394           12 :          nnodes_merged = 1 !offset_inode + 1
    1395           12 :          repl_nl(:, offset_inode + 1) = nl_local(:, inodes_orig(1))
    1396        14892 :          n_dbcsr_cell_images_to_merge(1) = 1
    1397              :          dbcsr_cell_image_to_merge(1) = nl_local(neighbor_list_dbcsr_image_index, inodes_orig(1))
    1398            4 :          kcell_closest = k_cells(inodes_orig(1))
    1399           24 :          DO inode = 2, nnodes_local
    1400            4 :             IF (node_merged_indices(inode) > node_merged_indices(inode - 1)) THEN
    1401            4 :                nnodes_merged = nnodes_merged + 1
    1402            4 :                repl_nl(:, offset_inode + nnodes_merged) = nl_local(:, inodes_orig(inode))
    1403        14888 :                !n_dbcsr_cell_images_to_merge(nnodes_merged) = 1
    1404        14884 :                kcell_closest = k_cells(inodes_orig(inode))
    1405        14884 :             ELSE
    1406        89304 :                IF (ABS(k_cells(inodes_orig(inode))) < ABS(kcell_closest) .OR. &
    1407              :                    (ABS(k_cells(inodes_orig(inode))) == ABS(kcell_closest) .AND. kcell_closest < 0)) THEN
    1408        14884 :                   repl_nl(:, offset_inode + nnodes_merged) = nl_local(:, inodes_orig(inode))
    1409              :                   kcell_closest = k_cells(inodes_orig(inode))
    1410            0 :                END IF
    1411            0 :             END IF
    1412            0 :             dbcsr_cell_image_to_merge(inode) = nl_local(neighbor_list_dbcsr_image_index, inodes_orig(inode))
    1413            0 :             n_dbcsr_cell_images_to_merge(nnodes_merged) = n_dbcsr_cell_images_to_merge(nnodes_merged) + 1
    1414              :          END DO
    1415              : 
    1416        14884 :          IF (debug_this_module) THEN
    1417        14888 :             CPASSERT(SUM(n_dbcsr_cell_images_to_merge) == nnodes_local)
    1418              :          END IF
    1419              : 
    1420              :          DEALLOCATE (inodes_orig)
    1421              :          DEALLOCATE (node_merged_indices, k_cells)
    1422              :          DEALLOCATE (nl_local)
    1423              :       END IF
    1424            4 : 
    1425            4 :       IF (para_env%num_pe > 1) THEN
    1426            4 :          offset_inode = 0
    1427              :          DO iproc = 0, para_env%num_pe - 1
    1428              :             IF (nnodes_per_proc(iproc) > 0) THEN
    1429            4 :                CALL para_env%bcast(repl_nl(:, offset_inode + 1:offset_inode + nnodes_per_proc(iproc)), iproc)
    1430            4 :                offset_inode = offset_inode + nnodes_per_proc(iproc)
    1431           12 :             END IF
    1432           12 :          END DO
    1433            8 :       END IF
    1434            8 : 
    1435              :       CALL timestop(handle)
    1436              :    END SUBROUTINE replicate_neighbour_list
    1437              : 
    1438              : ! **************************************************************************************************
    1439            4 : !> \brief Count number of DBCSR matrix elements that should be received from (*,nelements_dbcsr_recv)
    1440            8 : !>        and send to (*,nelements_dbcsr_send) each parallel process.
    1441              : !> \param nelements_per_proc number of non-zero matrix elements for each MPI process
    1442              : !> \param nnodes_per_proc    number of non-zero DBCSR matrix blocks (neighbour-list nodes)
    1443              : !> \param nl_repl            replicated neighbour list
    1444              : !> \param matrix_dbcsr_kp    DBCSR matrix
    1445              : !> \param symmetric          whether the DBCSR matrix is a symmetric one
    1446              : !> \param para_env           parallel environment
    1447              : !> \param gather_root        if >=0, gather all non-zero matrix element on the MPI process with
    1448              : !>                           gather_root rank (useful for bulk transport calculation).
    1449              : !>                           If <0, distribute non-zero matrix element across all MPI processes
    1450              : ! **************************************************************************************************
    1451              :    SUBROUTINE count_remote_dbcsr_elements(nelements_per_proc, nnodes_per_proc, nl_repl, matrix_dbcsr_kp, &
    1452              :                                           symmetric, para_env, gather_root)
    1453              :       INTEGER(kind=int_8), DIMENSION(0:, :), INTENT(out) :: nelements_per_proc
    1454              :       INTEGER, DIMENSION(0:), INTENT(in)                 :: nnodes_per_proc
    1455            4 :       INTEGER, DIMENSION(:, :), INTENT(in)               :: nl_repl
    1456              :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in)       :: matrix_dbcsr_kp
    1457              :       LOGICAL, INTENT(in)                                :: symmetric
    1458              :       TYPE(mp_para_env_type), INTENT(in), POINTER        :: para_env
    1459              :       INTEGER, INTENT(in)                                :: gather_root
    1460              : 
    1461              :       CHARACTER(len=*), PARAMETER :: routineN = 'count_remote_dbcsr_elements'
    1462              : 
    1463              :       INTEGER :: first_row_minus_one, handle, icol_blk, image, image_transp, inode, inode_proc, &
    1464              :          iproc, iproc_orb, irow_blk, irow_local, mepos, ncols_blk, ncols_local, nnodes_proc, &
    1465              :          nprocs, nrows_blk, nrows_local, offset_inode
    1466              :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_offset, col_blk_size, &
    1467              :                                                             row_blk_offset, row_blk_size
    1468              :       LOGICAL                                            :: do_distribute
    1469              : 
    1470            4 :       CALL timeset(routineN, handle)
    1471            4 :       nelements_per_proc(:, :) = 0
    1472              :       mepos = para_env%mepos
    1473              :       nprocs = para_env%num_pe
    1474            4 :       do_distribute = gather_root < 0
    1475           28 :       IF (debug_this_module) THEN
    1476            4 :          CPASSERT(SIZE(nnodes_per_proc) == nprocs)
    1477            4 :       END IF
    1478            4 : 
    1479              :       CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
    1480              :                           nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
    1481              :                           row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
    1482              :                           row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
    1483              : 
    1484              :       offset_inode = 0
    1485              :       iproc_orb = gather_root ! if do_distribute == .FALSE., send all matrix elements to MPI process with rank gather_root
    1486            4 :       DO iproc = LBOUND(nnodes_per_proc, 1), UBOUND(nnodes_per_proc, 1)
    1487              :          nnodes_proc = nnodes_per_proc(iproc)
    1488            4 :          DO inode_proc = 1, nnodes_proc
    1489            4 :             inode = inode_proc + offset_inode
    1490           16 : 
    1491            8 :             irow_blk = nl_repl(neighbor_list_iatom_index, inode)
    1492        29784 :             icol_blk = nl_repl(neighbor_list_jatom_index, inode)
    1493        29776 :             CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
    1494              :             image = nl_repl(neighbor_list_siesta_image_index, inode)
    1495        29776 :             image_transp = nl_repl(neighbor_list_siesta_transp_image_index, inode)
    1496        29776 : 
    1497        29776 :             IF (image > 0) THEN
    1498        29776 :                nrows_local = row_blk_size(irow_blk)
    1499        29776 :                first_row_minus_one = row_blk_offset(irow_blk) - 1
    1500              :                ncols_local = col_blk_size(icol_blk)
    1501        29776 :                DO irow_local = 1, nrows_local
    1502        29776 :                   IF (do_distribute) THEN
    1503        29776 : #if defined(__SMEAGOL)
    1504        29776 :                      CALL WhichNodeOrb(irow_local + first_row_minus_one, nprocs, iproc_orb)
    1505       297760 : #else
    1506       267984 :                      CALL cp_abort(__LOCATION__, &
    1507              :                                    "CP2K was compiled with no SMEAGOL support.")
    1508            0 : #endif
    1509              :                   END IF
    1510              :                   IF (iproc_orb == mepos) THEN
    1511              :                      nelements_per_proc(iproc, nelements_dbcsr_recv) = nelements_per_proc(iproc, nelements_dbcsr_recv) + &
    1512              :                                                                        ncols_local
    1513              :                   END IF
    1514       267984 : 
    1515              :                   IF (iproc == mepos) THEN
    1516       133992 :                      nelements_per_proc(iproc_orb, nelements_dbcsr_send) = nelements_per_proc(iproc_orb, nelements_dbcsr_send) + &
    1517              :                                                                            ncols_local
    1518              :                   END IF
    1519       297760 :                END DO
    1520              :             END IF
    1521       133992 : 
    1522              :             ! transposed block
    1523              :             IF (image_transp > 0) THEN
    1524              :                nrows_local = col_blk_size(icol_blk)
    1525              :                first_row_minus_one = col_blk_offset(icol_blk) - 1
    1526              :                ncols_local = row_blk_size(irow_blk)
    1527        29784 :                DO irow_local = 1, nrows_local
    1528        27480 :                   IF (do_distribute) THEN
    1529        27480 : #if defined(__SMEAGOL)
    1530        27480 :                      CALL WhichNodeOrb(irow_local + first_row_minus_one, nprocs, iproc_orb)
    1531       274800 : #else
    1532       247320 :                      CALL cp_abort(__LOCATION__, &
    1533              :                                    "CP2K was compiled with no SMEAGOL support.")
    1534            0 : #endif
    1535              :                   END IF
    1536              :                   IF (iproc_orb == mepos) THEN
    1537              :                      nelements_per_proc(iproc, nelements_dbcsr_recv) = nelements_per_proc(iproc, nelements_dbcsr_recv) + &
    1538              :                                                                        ncols_local
    1539              :                   END IF
    1540       247320 : 
    1541              :                   IF (iproc == mepos) THEN
    1542       123660 :                      nelements_per_proc(iproc_orb, nelements_dbcsr_send) = nelements_per_proc(iproc_orb, nelements_dbcsr_send) + &
    1543              :                                                                            ncols_local
    1544              :                   END IF
    1545       274800 :                END DO
    1546              :             END IF
    1547       123660 :          END DO
    1548              :          offset_inode = offset_inode + nnodes_proc
    1549              :       END DO
    1550              :       CALL timestop(handle)
    1551              :    END SUBROUTINE count_remote_dbcsr_elements
    1552           12 : 
    1553              : ! **************************************************************************************************
    1554            4 : !> \brief Construct list of non-zero matrix elements' indices in SIESTA format.
    1555            4 : !> \param n_nonzero_cols  number of non-zero matrix elements on each matrix row local to the current
    1556              : !>                        MPI process
    1557              : !> \param row_offset      offset of the first non-zero matrix elements for each locally-stores row
    1558              : !> \param col_index       sorted list of column indices of non-zero matrix element
    1559              : !> \param packed_index    original order of non-sorted column indices
    1560              : !> \param nl_repl         replicated neighbour list
    1561              : !> \param matrix_dbcsr_kp DBCSR matrix
    1562              : !> \param symmetric       whether the DBCSR matrix is a symmetric one
    1563              : !> \param para_env        parallel environment
    1564              : !> \param gather_root     if >=0, gather all non-zero matrix element on the MPI process with
    1565              : !>                        gather_root rank (useful for bulk transport calculation).
    1566              : !>                        If <0, distribute non-zero matrix element across all MPI processes
    1567              : ! **************************************************************************************************
    1568              :    SUBROUTINE get_nonzero_element_indices(n_nonzero_cols, row_offset, col_index, packed_index, &
    1569              :                                           nl_repl, matrix_dbcsr_kp, symmetric, para_env, gather_root)
    1570              :       INTEGER, DIMENSION(:), INTENT(out)                 :: n_nonzero_cols, row_offset, col_index, &
    1571              :                                                             packed_index
    1572            8 :       INTEGER, DIMENSION(:, :), INTENT(in)               :: nl_repl
    1573            4 :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in)       :: matrix_dbcsr_kp
    1574              :       LOGICAL, INTENT(in)                                :: symmetric
    1575              :       TYPE(mp_para_env_type), INTENT(in), POINTER        :: para_env
    1576              :       INTEGER, INTENT(in)                                :: gather_root
    1577              : 
    1578              :       CHARACTER(len=*), PARAMETER :: routineN = 'get_nonzero_element_indices'
    1579              : 
    1580              :       INTEGER :: first_col_minus_one, first_row_minus_one, handle, icol_blk, icol_local, &
    1581              :          icol_offset, image, image_transp, inode, irow_blk, irow_local, irow_proc, mepos, &
    1582              :          ncols_blk, ncols_local, ncols_total, nnodes, nprocs, nrows_blk, nrows_local, nrows_total
    1583              :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_offset, col_blk_size, &
    1584              :                                                             row_blk_offset, row_blk_size
    1585              :       LOGICAL                                            :: do_distribute, is_root_rank
    1586              : 
    1587            4 :       CALL timeset(routineN, handle)
    1588            4 :       n_nonzero_cols(:) = 0
    1589              :       mepos = para_env%mepos
    1590              :       nprocs = para_env%num_pe
    1591            4 :       do_distribute = gather_root < 0
    1592          618 :       is_root_rank = gather_root == mepos
    1593            4 : 
    1594            4 :       CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
    1595            4 :                           nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
    1596            4 :                           nfullrows_total=nrows_total, nfullcols_total=ncols_total, &
    1597              :                           row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
    1598              :                           row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
    1599              : 
    1600              :       nnodes = SIZE(nl_repl, 2)
    1601              :       DO inode = 1, nnodes
    1602            4 :          irow_blk = nl_repl(neighbor_list_iatom_index, inode)
    1603              :          icol_blk = nl_repl(neighbor_list_jatom_index, inode)
    1604            4 :          CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
    1605        29780 :          image = nl_repl(neighbor_list_siesta_image_index, inode)
    1606        29776 :          image_transp = nl_repl(neighbor_list_siesta_transp_image_index, inode)
    1607        29776 : 
    1608        29776 :          IF (image > 0) THEN
    1609        29776 :             nrows_local = row_blk_size(irow_blk)
    1610        29776 :             first_row_minus_one = row_blk_offset(irow_blk) - 1
    1611              :             ncols_local = col_blk_size(icol_blk)
    1612        29776 :             DO irow_local = 1, nrows_local
    1613        29776 :                IF (do_distribute) THEN
    1614        29776 : #if defined(__SMEAGOL)
    1615        29776 :                   CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
    1616       297760 : #else
    1617       267984 :                   CALL cp_abort(__LOCATION__, &
    1618              :                                 "CP2K was compiled with no SMEAGOL support.")
    1619            0 : #endif
    1620              :                ELSE
    1621              :                   IF (is_root_rank) THEN
    1622              :                      irow_proc = irow_local + first_row_minus_one
    1623              :                   ELSE
    1624              :                      irow_proc = 0
    1625       267984 :                   END IF
    1626       133992 :                END IF
    1627              :                IF (irow_proc > 0) THEN
    1628       133992 :                   n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
    1629              :                END IF
    1630              :             END DO
    1631       297760 :          END IF
    1632       133992 : 
    1633              :          ! transposed block
    1634              :          IF (image_transp > 0) THEN
    1635              :             nrows_local = col_blk_size(icol_blk)
    1636              :             first_row_minus_one = col_blk_offset(icol_blk) - 1
    1637              :             ncols_local = row_blk_size(irow_blk)
    1638        29780 :             DO irow_local = 1, nrows_local
    1639        27480 :                IF (do_distribute) THEN
    1640        27480 : #if defined(__SMEAGOL)
    1641        27480 :                   CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
    1642       274800 : #else
    1643       247320 :                   CALL cp_abort(__LOCATION__, &
    1644              :                                 "CP2K was compiled with no SMEAGOL support.")
    1645            0 : #endif
    1646              :                ELSE
    1647              :                   IF (is_root_rank) THEN
    1648              :                      irow_proc = irow_local + first_row_minus_one
    1649              :                   ELSE
    1650              :                      irow_proc = 0
    1651       247320 :                   END IF
    1652       123660 :                END IF
    1653              :                IF (irow_proc > 0) THEN
    1654       123660 :                   n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
    1655              :                END IF
    1656              :             END DO
    1657       274800 :          END IF
    1658       123660 :       END DO
    1659              : 
    1660              :       row_offset(1) = 0
    1661              :       DO irow_local = 1, SIZE(n_nonzero_cols) - 1
    1662              :          row_offset(irow_local + 1) = row_offset(irow_local) + n_nonzero_cols(irow_local)
    1663              :       END DO
    1664            4 : 
    1665          614 :       n_nonzero_cols(:) = 0
    1666          614 :       col_index(:) = 0
    1667              :       DO inode = 1, nnodes
    1668              :          irow_blk = nl_repl(neighbor_list_iatom_index, inode)
    1669          618 :          icol_blk = nl_repl(neighbor_list_jatom_index, inode)
    1670      2318874 :          CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
    1671        29780 :          image = nl_repl(neighbor_list_siesta_image_index, inode)
    1672        29776 :          image_transp = nl_repl(neighbor_list_siesta_transp_image_index, inode)
    1673        29776 : 
    1674        29776 :          IF (image > 0) THEN
    1675        29776 :             nrows_local = row_blk_size(irow_blk)
    1676        29776 :             first_row_minus_one = row_blk_offset(irow_blk) - 1
    1677              :             ncols_local = col_blk_size(icol_blk)
    1678        29776 :             first_col_minus_one = col_blk_offset(icol_blk) + (image - 1)*ncols_total - 1
    1679        29776 :             DO irow_local = 1, nrows_local
    1680        29776 :                IF (do_distribute) THEN
    1681        29776 : #if defined(__SMEAGOL)
    1682        29776 :                   CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
    1683       297760 : #else
    1684       267984 :                   CALL cp_abort(__LOCATION__, &
    1685              :                                 "CP2K was compiled with no SMEAGOL support.")
    1686            0 : #endif
    1687              :                ELSE
    1688              :                   IF (is_root_rank) THEN
    1689              :                      irow_proc = irow_local + first_row_minus_one
    1690              :                   ELSE
    1691              :                      irow_proc = 0
    1692       267984 :                   END IF
    1693       133992 :                END IF
    1694              :                IF (irow_proc > 0) THEN
    1695       133992 :                   icol_offset = row_offset(irow_proc) + n_nonzero_cols(irow_proc)
    1696              :                   DO icol_local = 1, ncols_local
    1697              :                      col_index(icol_offset + icol_local) = first_col_minus_one + icol_local
    1698       297760 :                   END DO
    1699       133992 :                   n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
    1700      1339920 :                END IF
    1701      1339920 :             END DO
    1702              :          END IF
    1703       133992 : 
    1704              :          ! transposed block
    1705              :          IF (image_transp > 0) THEN
    1706              :             nrows_local = col_blk_size(icol_blk)
    1707              :             first_row_minus_one = col_blk_offset(icol_blk) - 1
    1708              :             ncols_local = row_blk_size(irow_blk)
    1709        29780 :             first_col_minus_one = row_blk_offset(irow_blk) + (image_transp - 1)*nrows_total - 1
    1710        27480 :             DO irow_local = 1, nrows_local
    1711        27480 :                IF (do_distribute) THEN
    1712        27480 : #if defined(__SMEAGOL)
    1713        27480 :                   CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
    1714       274800 : #else
    1715       247320 :                   CALL cp_abort(__LOCATION__, &
    1716              :                                 "CP2K was compiled with no SMEAGOL support.")
    1717            0 : #endif
    1718              :                ELSE
    1719              :                   IF (is_root_rank) THEN
    1720              :                      irow_proc = irow_local + first_row_minus_one
    1721              :                   ELSE
    1722              :                      irow_proc = 0
    1723       247320 :                   END IF
    1724       123660 :                END IF
    1725              :                IF (irow_proc > 0) THEN
    1726       123660 :                   icol_offset = row_offset(irow_proc) + n_nonzero_cols(irow_proc)
    1727              :                   DO icol_local = 1, ncols_local
    1728              :                      col_index(icol_offset + icol_local) = first_col_minus_one + icol_local
    1729       274800 :                   END DO
    1730       123660 :                   n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
    1731      1236600 :                END IF
    1732      1236600 :             END DO
    1733              :          END IF
    1734       123660 :       END DO
    1735              : 
    1736              :       IF (SIZE(n_nonzero_cols) > 0) THEN
    1737              :          DO irow_local = 1, SIZE(n_nonzero_cols)
    1738              :             CALL sort(col_index(row_offset(irow_local) + 1:row_offset(irow_local) + n_nonzero_cols(irow_local)), &
    1739              :                       n_nonzero_cols(irow_local), &
    1740            4 :                       packed_index(row_offset(irow_local) + 1:row_offset(irow_local) + n_nonzero_cols(irow_local)))
    1741          618 :          END DO
    1742              :       END IF
    1743              : 
    1744          618 :       CALL timestop(handle)
    1745              :    END SUBROUTINE get_nonzero_element_indices
    1746              : 
    1747              : ! **************************************************************************************************
    1748            4 : !> \brief Get absolute i, j, and k indices of cell image for the given DBCSR matrix block.
    1749            4 : !>        Effective cell image recorded in the neighbour-list (where the matrix block is actually
    1750              : !>        stored) depends on atomic coordinates can be significantly different.
    1751              : !> \param cell_ijk      array with 3 indices along the cell's vectors
    1752              : !> \param r_ij          actual interatomic distance (vector R_j - r_i), where R_j is the coordinates
    1753              : !>                      of the j-th atom in the supercell
    1754              : !> \param r_i           coordinates of the i-th atom in the primary unit cell
    1755              : !> \param r_j           coordinates of the j-th atom in the primary unit cell
    1756              : !> \param cell          unit cell
    1757              : ! **************************************************************************************************
    1758              :    SUBROUTINE get_negf_cell_ijk(cell_ijk, r_ij, r_i, r_j, cell)
    1759              :       INTEGER, DIMENSION(3), INTENT(out)                 :: cell_ijk
    1760              :       REAL(kind=dp), DIMENSION(3), INTENT(in)            :: r_ij, r_i, r_j
    1761              :       TYPE(cell_type), INTENT(in), POINTER               :: cell
    1762        30320 : 
    1763              :       REAL(kind=dp), DIMENSION(3)                        :: coords_scaled, r
    1764              : 
    1765              :       r(:) = r_ij(:) + r_i(:) - r_j(:)
    1766              :       CALL real_to_scaled(coords_scaled, r, cell)
    1767              :       cell_ijk(:) = NINT(coords_scaled(:))
    1768              :    END SUBROUTINE get_negf_cell_ijk
    1769       121280 : 
    1770        30320 : ! **************************************************************************************************
    1771       121280 : !> \brief Return the index of an integer number in the sequence 0, 1, -1, ..., n, -n, ...
    1772        30320 : !>        (canonical enumeration of integers, oeis.org/A001057).
    1773              : !> \param  inum     integer number [in]
    1774              : !> \return index of 'inum' in A001057
    1775              : !> \note Cell images in SMEAGOL / SIESTA are ordered according to A001057. Therefore this
    1776              : !>       function converts the absolute index of a cell replica along some (x/y/z) dimension
    1777              : !>       into its corresponding SIESTA's index.
    1778              : ! **************************************************************************************************
    1779              :    ELEMENTAL FUNCTION index_in_canonical_enumeration(inum) RESULT(ind)
    1780              :       INTEGER, INTENT(in)                                :: inum
    1781              :       INTEGER                                            :: ind
    1782              : 
    1783        85884 :       INTEGER                                            :: inum_abs, is_non_positive
    1784              : 
    1785              :       inum_abs = ABS(inum)
    1786              :       !IF (inum <= 0) THEN; is_non_positive = 1; ELSE; is_non_positive = 0; END IF
    1787              :       is_non_positive = MERGE(1, 0, inum <= 0)
    1788              : 
    1789        85884 :       ! inum =  0 -> inum_abs = 0, is_non_positive = 1 -> ind = 1
    1790              :       ! inum =  1 -> inum_abs = 1, is_non_positive = 0 -> ind = 2
    1791        85884 :       ! inum = -1 -> inum_abs = 1, is_non_positive = 1 -> ind = 3
    1792              :       ind = 2*inum_abs + is_non_positive
    1793              :    END FUNCTION index_in_canonical_enumeration
    1794              : 
    1795              : ! **************************************************************************************************
    1796        85884 : !> \brief Return an integer number according to its index in the sequence 0, 1, -1, ..., n, -n, ...
    1797        85884 : !>        (canonical enumeration of integers, oeis.org/A001057)
    1798              : !> \param  ind       index in A001057 starting from 1
    1799              : !> \return integer number according to its position 'ind' in A001057
    1800              : !> \note Cell images in SMEAGOL / SIESTA are ordered according to A001057. Therefore this
    1801              : !>       function converts SIESTA's index of a cell replica along some (x/y/z) dimension
    1802              : !>       into the corresponding absolute index.
    1803              : ! **************************************************************************************************
    1804              :    ELEMENTAL FUNCTION number_from_canonical_enumeration(ind) RESULT(inum)
    1805              :       INTEGER, INTENT(in)                                :: ind
    1806              :       INTEGER                                            :: inum
    1807              : 
    1808          372 :       ! ind < 1 is invalid
    1809              :       ! ind = 1 -> SIGN(0, -1) =  0
    1810              :       ! ind = 2 -> SIGN(1,  0) =  1
    1811              :       ! ind = 3 -> SIGN(1, -1) = -1
    1812              :       inum = SIGN(ind/2, -MOD(ind, 2))
    1813              :    END FUNCTION number_from_canonical_enumeration
    1814              : 
    1815              : ! **************************************************************************************************
    1816          372 : !> \brief   Apply periodic boundary conditions defined by a simulation cell to a position
    1817          372 : !>          vector r. Similar to pbc1 from cell_types.F but returns unscaled coordinates from
    1818              : !>          the scaled range [0, 1) instead of [-0.5, 0.5)
    1819              : !> \param r_pbc   position vector subject to the periodic boundary conditions [out]
    1820              : !> \param r       initial position vector [in]
    1821              : !> \param cell    simulation unit cell [in]
    1822              : ! **************************************************************************************************
    1823              :    PURE SUBROUTINE pbc_0_1(r_pbc, r, cell)
    1824              :       REAL(KIND=dp), DIMENSION(3), INTENT(out)           :: r_pbc
    1825              :       REAL(KIND=dp), DIMENSION(3), INTENT(in)            :: r
    1826              :       TYPE(cell_type), INTENT(in), POINTER               :: cell
    1827          136 : 
    1828              :       REAL(KIND=dp), DIMENSION(3)                        :: s
    1829              : 
    1830              :       IF (cell%orthorhombic) THEN
    1831              :          r_pbc(1) = r(1) - cell%hmat(1, 1)*cell%perd(1)*REAL(FLOOR(cell%h_inv(1, 1)*r(1)), dp)
    1832              :          r_pbc(2) = r(2) - cell%hmat(2, 2)*cell%perd(2)*REAL(FLOOR(cell%h_inv(2, 2)*r(2)), dp)
    1833              :          r_pbc(3) = r(3) - cell%hmat(3, 3)*cell%perd(3)*REAL(FLOOR(cell%h_inv(3, 3)*r(3)), dp)
    1834          136 :       ELSE
    1835          136 :          s(1) = cell%h_inv(1, 1)*r(1) + cell%h_inv(1, 2)*r(2) + cell%h_inv(1, 3)*r(3)
    1836          136 :          s(2) = cell%h_inv(2, 1)*r(1) + cell%h_inv(2, 2)*r(2) + cell%h_inv(2, 3)*r(3)
    1837          136 :          s(3) = cell%h_inv(3, 1)*r(1) + cell%h_inv(3, 2)*r(2) + cell%h_inv(3, 3)*r(3)
    1838              :          s(1) = s(1) - cell%perd(1)*REAL(FLOOR(s(1)), dp)
    1839            0 :          s(2) = s(2) - cell%perd(2)*REAL(FLOOR(s(2)), dp)
    1840            0 :          s(3) = s(3) - cell%perd(3)*REAL(FLOOR(s(3)), dp)
    1841            0 :          r_pbc(1) = cell%hmat(1, 1)*s(1) + cell%hmat(1, 2)*s(2) + cell%hmat(1, 3)*s(3)
    1842            0 :          r_pbc(2) = cell%hmat(2, 1)*s(1) + cell%hmat(2, 2)*s(2) + cell%hmat(2, 3)*s(3)
    1843            0 :          r_pbc(3) = cell%hmat(3, 1)*s(1) + cell%hmat(3, 2)*s(2) + cell%hmat(3, 3)*s(3)
    1844            0 :       END IF
    1845            0 :    END SUBROUTINE pbc_0_1
    1846            0 : 
    1847            0 : ! **************************************************************************************************
    1848              : !> \brief Computes the number of send requests from this MPI process to all the other processes.
    1849          136 : !>        Alternatively computes the number of recv requests per MPI process that the given process
    1850              : !>        expects.
    1851              : !> \param mepos                    MPI rank of the given process
    1852              : !> \param nelements_per_proc       number of element to send / receive
    1853              : !> \param max_nelements_per_packet maximum number of elements per single MPI request
    1854              : !> \return number of MPI requests
    1855              : ! **************************************************************************************************
    1856              :    PURE FUNCTION get_number_of_mpi_sendrecv_requests(mepos, nelements_per_proc, max_nelements_per_packet) RESULT(nrequests)
    1857              :       INTEGER, INTENT(in)                                :: mepos
    1858              :       INTEGER(kind=int_8), DIMENSION(0:), INTENT(in)     :: nelements_per_proc
    1859              :       INTEGER(kind=int_8), INTENT(in)                    :: max_nelements_per_packet
    1860           32 :       INTEGER                                            :: nrequests
    1861              : 
    1862              :       INTEGER                                            :: iproc
    1863              : 
    1864              :       nrequests = 0
    1865              :       DO iproc = LBOUND(nelements_per_proc, 1), UBOUND(nelements_per_proc, 1)
    1866              :          ! there is no need to send data to the same MPI process
    1867              :          IF (iproc /= mepos) THEN
    1868           32 :             nrequests = nrequests + INT(nelements_per_proc(iproc)/max_nelements_per_packet)
    1869          128 :             IF (MOD(nelements_per_proc(iproc), max_nelements_per_packet) > 0) &
    1870              :                nrequests = nrequests + 1
    1871           96 :          END IF
    1872           32 :       END DO
    1873           32 :    END FUNCTION get_number_of_mpi_sendrecv_requests
    1874           16 : 
    1875              : ! **************************************************************************************************
    1876              : !> \brief Map non-zero matrix elements on to MPI requests.
    1877           32 : !> \param element_offset           offset (index-1) of the first element [out]
    1878              : !> \param nelements_per_request    number of element for each request [out]
    1879              : !> \param peer_rank                rank of a peering MPI process
    1880              : !> \param tag                      MPI tag
    1881              : !> \param mepos                    MPI rank of a given MPI process
    1882              : !> \param nelements_per_proc       number of element to send / receive by the current MPI process
    1883              : !> \param max_nelements_per_packet maximum number of elements per single MPI request
    1884              : ! **************************************************************************************************
    1885              :    SUBROUTINE assign_nonzero_elements_to_requests(element_offset, nelements_per_request, peer_rank, tag, &
    1886              :                                                   mepos, nelements_per_proc, max_nelements_per_packet)
    1887              :       INTEGER(kind=int_8), DIMENSION(:), INTENT(out)     :: element_offset, nelements_per_request
    1888              :       INTEGER, DIMENSION(:), INTENT(out)                 :: peer_rank, tag
    1889           16 :       INTEGER, INTENT(in)                                :: mepos
    1890           16 :       INTEGER(kind=int_8), DIMENSION(0:), INTENT(in)     :: nelements_per_proc
    1891              :       INTEGER(kind=int_8), INTENT(in)                    :: max_nelements_per_packet
    1892              : 
    1893              :       INTEGER                                            :: iproc, irequest, nrequests, &
    1894              :                                                             request_offset
    1895              :       INTEGER(kind=int_8)                                :: element_offset_tmp, nelements
    1896              : 
    1897              :       request_offset = 0
    1898              :       element_offset_tmp = 0
    1899              :       DO iproc = LBOUND(nelements_per_proc, 1), UBOUND(nelements_per_proc, 1)
    1900              :          IF (iproc /= mepos) THEN
    1901           16 :             nrequests = INT(nelements_per_proc(iproc)/max_nelements_per_packet)
    1902           16 :             IF (MOD(nelements_per_proc(iproc), max_nelements_per_packet) > 0) nrequests = nrequests + 1
    1903           64 :             CPASSERT(nrequests <= max_mpi_rank + 1)
    1904           32 :             IF (nrequests > 0) THEN
    1905           16 :                nelements = nelements_per_proc(iproc)/nrequests
    1906           16 :                IF (nelements_per_proc(iproc) - nelements*nrequests > 0) nelements = nelements + 1
    1907           16 :                CPASSERT(nelements <= max_nelements_per_packet)
    1908           16 : 
    1909           16 :                DO irequest = 1, nrequests
    1910           16 :                   element_offset(request_offset + irequest) = (irequest - 1)*nelements + element_offset_tmp
    1911           16 :                   IF (irequest < nrequests) THEN
    1912              :                      nelements_per_request(request_offset + irequest) = nelements
    1913           32 :                   ELSE
    1914           16 :                      nelements_per_request(request_offset + irequest) = nelements_per_proc(iproc) - nelements*(nrequests - 1)
    1915           16 :                   END IF
    1916            0 :                   peer_rank(request_offset + irequest) = iproc
    1917              :                   tag(request_offset + irequest) = irequest - 1
    1918           16 :                END DO
    1919              :             END IF
    1920           16 :             request_offset = request_offset + nrequests
    1921           32 :          END IF
    1922              :          element_offset_tmp = element_offset_tmp + nelements_per_proc(iproc)
    1923              :       END DO
    1924           16 : 
    1925              :       IF (debug_this_module) THEN
    1926           48 :          CPASSERT(SIZE(element_offset) == request_offset)
    1927              :          CPASSERT(SIZE(nelements_per_request) == request_offset)
    1928              :          CPASSERT(SIZE(peer_rank) == request_offset)
    1929              :          CPASSERT(SIZE(tag) == request_offset)
    1930              :       END IF
    1931              :    END SUBROUTINE assign_nonzero_elements_to_requests
    1932              : 
    1933              : END MODULE smeagol_matrix_utils
        

Generated by: LCOV version 2.0-1