LCOV - code coverage report
Current view: top level - src/arnoldi - arnoldi_vector.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:936074a) Lines: 78.2 % 348 272
Test Date: 2025-12-04 06:27:48 Functions: 69.6 % 23 16

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief operations for skinny matrices/vectors expressed in dbcsr form
      10              : !> \par History
      11              : !>       2014.10 created [Florian Schiffmann]
      12              : !>       2023.12 Removed support for single-precision [Ole Schuett]
      13              : !>       2024.12 Removed support for complex input matrices [Ole Schuett]
      14              : !> \author Florian Schiffmann
      15              : ! **************************************************************************************************
      16              : MODULE arnoldi_vector
      17              :    USE cp_dbcsr_api,                    ONLY: &
      18              :         dbcsr_copy, dbcsr_create, dbcsr_distribution_get, dbcsr_distribution_new, &
      19              :         dbcsr_distribution_release, dbcsr_distribution_type, dbcsr_get_data_p, dbcsr_get_info, &
      20              :         dbcsr_iterator_blocks_left, dbcsr_iterator_next_block, dbcsr_iterator_start, &
      21              :         dbcsr_iterator_stop, dbcsr_iterator_type, dbcsr_release, dbcsr_set, dbcsr_type, &
      22              :         dbcsr_type_antisymmetric, dbcsr_type_no_symmetry, dbcsr_type_symmetric
      23              :    USE cp_dbcsr_contrib,                ONLY: dbcsr_reserve_all_blocks
      24              :    USE kinds,                           ONLY: dp,&
      25              :                                               real_8
      26              :    USE message_passing,                 ONLY: mp_comm_type
      27              : #include "../base/base_uses.f90"
      28              : 
      29              : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
      30              : 
      31              :    IMPLICIT NONE
      32              : 
      33              :    PRIVATE
      34              : 
      35              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'arnoldi_vector'
      36              : 
      37              : ! **************************************************************************************************
      38              : !> \brief Types needed for the hashtable.
      39              : ! **************************************************************************************************
      40              :    TYPE ele_type
      41              :       INTEGER :: c = 0
      42              :       INTEGER :: p = 0
      43              :    END TYPE ele_type
      44              : 
      45              :    TYPE hash_table_type
      46              :       TYPE(ele_type), DIMENSION(:), POINTER :: table => NULL()
      47              :       INTEGER :: nele = 0
      48              :       INTEGER :: nmax = 0
      49              :       INTEGER :: prime = 0
      50              :    END TYPE hash_table_type
      51              : 
      52              : ! **************************************************************************************************
      53              : !> \brief Types needed for fast access to distributed dbcsr vectors.
      54              : ! **************************************************************************************************
      55              :    TYPE block_ptr
      56              :       REAL(real_8), DIMENSION(:, :), POINTER          :: ptr => NULL()
      57              :       INTEGER                                         :: assigned_thread = -1
      58              :    END TYPE block_ptr
      59              : 
      60              :    TYPE fast_vec_access_type
      61              :       TYPE(hash_table_type) :: hash_table = hash_table_type()
      62              :       TYPE(block_ptr), DIMENSION(:), ALLOCATABLE :: blk_map
      63              :    END TYPE fast_vec_access_type
      64              : 
      65              :    PUBLIC :: dbcsr_matrix_colvec_multiply, &
      66              :              create_col_vec_from_matrix, &
      67              :              create_row_vec_from_matrix, &
      68              :              create_replicated_col_vec_from_matrix, &
      69              :              create_replicated_row_vec_from_matrix
      70              : 
      71              : CONTAINS
      72              : 
      73              : ! **************************************************************************************************
      74              : !> \brief creates a dbcsr col vector like object which lives on proc_col 0
      75              : !>        and has the same row dist as the template matrix
      76              : !>        the returned matrix is fully allocated and all blocks are set to 0
      77              : !>        this is not a sparse object (and must never be)
      78              : !> \param dbcsr_vec  the vector object to create must be allocated but not initialized
      79              : !> \param matrix a dbcsr matrix used as template
      80              : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
      81              : ! **************************************************************************************************
      82       279169 :    SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
      83              :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
      84              :       INTEGER                                            :: ncol
      85              : 
      86              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_col_vec_from_matrix'
      87              : 
      88              :       INTEGER                                            :: handle, npcols
      89       279169 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
      90       279169 :                                                             row_dist
      91              :       TYPE(dbcsr_distribution_type)                      :: dist, dist_col_vec
      92              : 
      93       279169 :       CALL timeset(routineN, handle)
      94              : 
      95       279169 :       CALL dbcsr_get_info(matrix, row_blk_size=row_blk_size, distribution=dist)
      96       279169 :       CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
      97              : 
      98       279169 :       ALLOCATE (col_dist(1), col_blk_size(1))
      99       279169 :       col_dist(1) = 0
     100       279169 :       col_blk_size(1) = ncol
     101       279169 :       CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     102              : 
     103              :       CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, matrix_type=dbcsr_type_no_symmetry, &
     104       279169 :                         row_blk_size=row_blk_size, col_blk_size=col_blk_size)
     105       279169 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     106              : 
     107       279169 :       CALL dbcsr_distribution_release(dist_col_vec)
     108       279169 :       DEALLOCATE (col_dist, col_blk_size)
     109       279169 :       CALL timestop(handle)
     110              : 
     111       558338 :    END SUBROUTINE create_col_vec_from_matrix
     112              : 
     113              : ! **************************************************************************************************
     114              : !> \brief creates a dbcsr row vector like object which lives on proc_row 0
     115              : !>        and has the same row dist as the template matrix
     116              : !>        the returned matrix is fully allocated and all blocks are set to 0
     117              : !>        this is not a sparse object (and must never be)
     118              : !> \param dbcsr_vec ...
     119              : !> \param matrix a dbcsr matrix used as template
     120              : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     121              : ! **************************************************************************************************
     122            0 :    SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
     123              :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     124              :       INTEGER                                            :: nrow
     125              : 
     126              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_row_vec_from_matrix'
     127              : 
     128              :       INTEGER                                            :: handle, nprows
     129            0 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
     130            0 :                                                             row_dist
     131              :       TYPE(dbcsr_distribution_type)                      :: dist, dist_row_vec
     132              : 
     133            0 :       CALL timeset(routineN, handle)
     134              : 
     135            0 :       CALL dbcsr_get_info(matrix, col_blk_size=col_blk_size, distribution=dist)
     136            0 :       CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
     137              : 
     138            0 :       ALLOCATE (row_dist(1), row_blk_size(1))
     139            0 :       row_dist(1) = 0
     140            0 :       row_blk_size(1) = nrow
     141            0 :       CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     142              : 
     143              :       CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, matrix_type=dbcsr_type_no_symmetry, &
     144            0 :                         row_blk_size=row_blk_size, col_blk_size=col_blk_size)
     145            0 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     146              : 
     147            0 :       CALL dbcsr_distribution_release(dist_row_vec)
     148            0 :       DEALLOCATE (row_dist, row_blk_size)
     149            0 :       CALL timestop(handle)
     150              : 
     151            0 :    END SUBROUTINE create_row_vec_from_matrix
     152              : 
     153              : ! **************************************************************************************************
     154              : !> \brief creates a col vector like object whose blocks can be replicated
     155              : !>        along the processor row and has the same row dist as the template matrix
     156              : !>        the returned matrix is fully allocated and all blocks are set to 0
     157              : !>        this is not a sparse object (and must never be)
     158              : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
     159              : !> \param matrix a dbcsr matrix used as template
     160              : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     161              : ! **************************************************************************************************
     162       137255 :    SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
     163              :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     164              :       INTEGER                                            :: ncol
     165              : 
     166              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_col_vec_from_matrix'
     167              : 
     168              :       INTEGER                                            :: handle, i, npcols
     169       137255 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
     170       137255 :                                                             row_dist
     171              :       TYPE(dbcsr_distribution_type)                      :: dist, dist_col_vec
     172              : 
     173       137255 :       CALL timeset(routineN, handle)
     174              : 
     175       137255 :       CALL dbcsr_get_info(matrix, row_blk_size=row_blk_size, distribution=dist)
     176       137255 :       CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
     177              : 
     178       686275 :       ALLOCATE (col_dist(npcols), col_blk_size(npcols))
     179       274510 :       col_blk_size(:) = ncol
     180       274510 :       DO i = 0, npcols - 1
     181       274510 :          col_dist(i + 1) = i
     182              :       END DO
     183       137255 :       CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     184              : 
     185              :       CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, matrix_type=dbcsr_type_no_symmetry, &
     186       137255 :                         row_blk_size=row_blk_size, col_blk_size=col_blk_size)
     187       137255 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     188              : 
     189       137255 :       CALL dbcsr_distribution_release(dist_col_vec)
     190       137255 :       DEALLOCATE (col_dist, col_blk_size)
     191       137255 :       CALL timestop(handle)
     192              : 
     193       274510 :    END SUBROUTINE create_replicated_col_vec_from_matrix
     194              : 
     195              : ! **************************************************************************************************
     196              : !> \brief creates a row vector like object whose blocks can be replicated
     197              : !>        along the processor col and has the same col dist as the template matrix
     198              : !>        the returned matrix is fully allocated and all blocks are set to 0
     199              : !>        this is not a sparse object (and must never be)
     200              : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
     201              : !> \param matrix a dbcsr matrix used as template
     202              : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     203              : ! **************************************************************************************************
     204       137255 :    SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
     205              :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     206              :       INTEGER                                            :: nrow
     207              : 
     208              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_row_vec_from_matrix'
     209              : 
     210              :       INTEGER                                            :: handle, i, nprows
     211       137255 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
     212       137255 :                                                             row_dist
     213              :       TYPE(dbcsr_distribution_type)                      :: dist, dist_row_vec
     214              : 
     215       137255 :       CALL timeset(routineN, handle)
     216              : 
     217       137255 :       CALL dbcsr_get_info(matrix, distribution=dist, col_blk_size=col_blk_size)
     218       137255 :       CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
     219              : 
     220       686275 :       ALLOCATE (row_dist(nprows), row_blk_size(nprows))
     221       409374 :       row_blk_size(:) = nrow
     222       409374 :       DO i = 0, nprows - 1
     223       409374 :          row_dist(i + 1) = i
     224              :       END DO
     225       137255 :       CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     226              : 
     227              :       CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, dbcsr_type_no_symmetry, &
     228       137255 :                         row_blk_size=row_blk_size, col_blk_size=col_blk_size)
     229       137255 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     230              : 
     231       137255 :       CALL dbcsr_distribution_release(dist_row_vec)
     232       137255 :       DEALLOCATE (row_dist, row_blk_Size)
     233       137255 :       CALL timestop(handle)
     234              : 
     235       274510 :    END SUBROUTINE create_replicated_row_vec_from_matrix
     236              : 
     237              : ! **************************************************************************************************
     238              : !> \brief given a column vector, prepare the fast_vec_access container
     239              : !> \param vec ...
     240              : !> \param fast_vec_access ...
     241              : ! **************************************************************************************************
     242      1095375 :    SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access)
     243              :       TYPE(dbcsr_type)                                   :: vec
     244              :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     245              : 
     246              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access'
     247              : 
     248              :       INTEGER                                            :: col, handle, iblock, nblk_local, &
     249              :                                                             nthreads, row
     250      1095375 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_bl
     251              :       TYPE(dbcsr_iterator_type)                          :: iter
     252              : 
     253      1095375 :       CALL timeset(routineN, handle)
     254              : 
     255              :       ! figure out the number of threads
     256      1095375 :       nthreads = 1
     257      1095375 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
     258              : !$OMP MASTER
     259              : !$    nthreads = OMP_GET_NUM_THREADS()
     260              : !$OMP END MASTER
     261              : !$OMP END PARALLEL
     262              : 
     263      1095375 :       CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
     264              :       ! 4 times makes sure the table is big enough to limit collisions.
     265      1095375 :       CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
     266              :       ! include zero for effective dealing with values not in the hash table (will return 0)
     267      7099106 :       ALLOCATE (fast_vec_access%blk_map(0:nblk_local))
     268              : 
     269      1095375 :       CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
     270      1095375 :       IF (col > 1) CPABORT("BUG")
     271              : 
     272              :       ! go through the blocks of the vector
     273      1095375 :       iblock = 0
     274      1095375 :       CALL dbcsr_iterator_start(iter, vec)
     275      3812981 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     276      2717606 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
     277      2717606 :          iblock = iblock + 1
     278      2717606 :          CALL hash_table_add(fast_vec_access%hash_table, row, iblock)
     279      2717606 :          fast_vec_access%blk_map(iblock)%ptr => vec_bl
     280      2717606 :          fast_vec_access%blk_map(iblock)%assigned_thread = MOD(iblock, nthreads)
     281              :       END DO
     282      1095375 :       CALL dbcsr_iterator_stop(iter)
     283      1095375 :       CALL timestop(handle)
     284              : 
     285      3286125 :    END SUBROUTINE create_fast_col_vec_access
     286              : 
     287              : ! **************************************************************************************************
     288              : !> \brief given a row vector, prepare the fast_vec_access_container
     289              : !> \param vec ...
     290              : !> \param fast_vec_access ...
     291              : ! **************************************************************************************************
     292      1095375 :    SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access)
     293              :       TYPE(dbcsr_type)                                   :: vec
     294              :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     295              : 
     296              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access'
     297              : 
     298              :       INTEGER                                            :: col, handle, iblock, nblk_local, &
     299              :                                                             nthreads, row
     300      1095375 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_bl
     301              :       TYPE(dbcsr_iterator_type)                          :: iter
     302              : 
     303      1095375 :       CALL timeset(routineN, handle)
     304              : 
     305              :       ! figure out the number of threads
     306      1095375 :       nthreads = 1
     307      1095375 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
     308              : !$OMP MASTER
     309              : !$    nthreads = OMP_GET_NUM_THREADS()
     310              : !$OMP END MASTER
     311              : !$OMP END PARALLEL
     312              : 
     313      1095375 :       CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
     314              :       ! 4 times makes sure the table is big enough to limit collisions.
     315      1095375 :       CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
     316              :       ! include zero for effective dealing with values not in the hash table (will return 0)
     317      9791439 :       ALLOCATE (fast_vec_access%blk_map(0:nblk_local))
     318              : 
     319              :       ! sanity check
     320      1095375 :       CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
     321      1095375 :       IF (row > 1) CPABORT("BUG")
     322              : 
     323              :       ! go through the blocks of the vector
     324      1095375 :       iblock = 0
     325      1095375 :       CALL dbcsr_iterator_start(iter, vec)
     326      6505314 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     327      5409939 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
     328      5409939 :          iblock = iblock + 1
     329      5409939 :          CALL hash_table_add(fast_vec_access%hash_table, col, iblock)
     330      5409939 :          fast_vec_access%blk_map(iblock)%ptr => vec_bl
     331      5409939 :          fast_vec_access%blk_map(iblock)%assigned_thread = MOD(iblock, nthreads)
     332              :       END DO
     333      1095375 :       CALL dbcsr_iterator_stop(iter)
     334              : 
     335      1095375 :       CALL timestop(handle)
     336              : 
     337      3286125 :    END SUBROUTINE create_fast_row_vec_access
     338              : 
     339              : ! **************************************************************************************************
     340              : !> \brief release all memory associated with the fast_vec_access type
     341              : !> \param fast_vec_access ...
     342              : ! **************************************************************************************************
     343      2190750 :    SUBROUTINE release_fast_vec_access(fast_vec_access)
     344              :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     345              : 
     346              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'release_fast_vec_access'
     347              : 
     348              :       INTEGER                                            :: handle
     349              : 
     350      2190750 :       CALL timeset(routineN, handle)
     351              : 
     352      2190750 :       CALL hash_table_release(fast_vec_access%hash_table)
     353      2190750 :       DEALLOCATE (fast_vec_access%blk_map)
     354              : 
     355      2190750 :       CALL timestop(handle)
     356              : 
     357      2190750 :    END SUBROUTINE release_fast_vec_access
     358              : 
     359              : ! --------------------------------------------------------------------------------------------------
     360              : ! Beginning of hashtable.
     361              : ! this file can be 'INCLUDE'ed verbatim in various place, where it needs to be
     362              : ! part of the module to guarantee inlining
     363              : ! hashes (c,p) pairs, where p is assumed to be >0
     364              : ! on return (0 is used as a flag for not present)
     365              : !
     366              : !
     367              : ! **************************************************************************************************
     368              : !> \brief finds a prime equal or larger than i, needed at creation
     369              : !> \param i ...
     370              : !> \return ...
     371              : ! **************************************************************************************************
     372      2190750 :    FUNCTION matching_prime(i) RESULT(res)
     373              :       INTEGER, INTENT(IN)                                :: i
     374              :       INTEGER                                            :: res
     375              : 
     376              :       INTEGER                                            :: j
     377              : 
     378      2190750 :       res = i
     379      2190750 :       j = 0
     380      6687606 :       DO WHILE (j < res)
     381     55565517 :          DO j = 2, res - 1
     382     53374767 :             IF (MOD(res, j) == 0) THEN
     383      2306106 :                res = res + 1
     384      2306106 :                EXIT
     385              :             END IF
     386              :          END DO
     387              :       END DO
     388      2190750 :    END FUNCTION matching_prime
     389              : 
     390              : ! **************************************************************************************************
     391              : !> \brief create a hash_table of given initial size.
     392              : !>        the hash table will expand as needed (but this requires rehashing)
     393              : !> \param hash_table ...
     394              : !> \param table_size ...
     395              : ! **************************************************************************************************
     396      2190750 :    SUBROUTINE hash_table_create(hash_table, table_size)
     397              :       TYPE(hash_table_type)                              :: hash_table
     398              :       INTEGER, INTENT(IN)                                :: table_size
     399              : 
     400              :       INTEGER                                            :: j
     401              : 
     402              :       ! guarantee a minimal hash table size (8), so that expansion works
     403              : 
     404      2190750 :       j = 3
     405      4206016 :       DO WHILE (2**j - 1 < table_size)
     406      2015266 :          j = j + 1
     407              :       END DO
     408      2190750 :       hash_table%nmax = 2**j - 1
     409      2190750 :       hash_table%prime = matching_prime(hash_table%nmax)
     410      2190750 :       hash_table%nele = 0
     411     58195890 :       ALLOCATE (hash_table%table(0:hash_table%nmax))
     412      2190750 :    END SUBROUTINE hash_table_create
     413              : 
     414              : ! **************************************************************************************************
     415              : !> \brief ...
     416              : !> \param hash_table ...
     417              : ! **************************************************************************************************
     418      2190750 :    SUBROUTINE hash_table_release(hash_table)
     419              :       TYPE(hash_table_type)                              :: hash_table
     420              : 
     421      2190750 :       hash_table%nmax = 0
     422      2190750 :       hash_table%nele = 0
     423      2190750 :       DEALLOCATE (hash_table%table)
     424              : 
     425      2190750 :    END SUBROUTINE hash_table_release
     426              : 
     427              : ! **************************************************************************************************
     428              : !> \brief add a pair (c,p) to the hash table
     429              : !> \param hash_table ...
     430              : !> \param c this value is being hashed
     431              : !> \param p this is being stored
     432              : ! **************************************************************************************************
     433      8127545 :    RECURSIVE SUBROUTINE hash_table_add(hash_table, c, p)
     434              :       TYPE(hash_table_type), INTENT(INOUT)               :: hash_table
     435              :       INTEGER, INTENT(IN)                                :: c, p
     436              : 
     437              :       REAL(KIND=real_8), PARAMETER :: hash_table_expand = 1.5_real_8, &
     438              :          inv_hash_table_fill = 2.5_real_8
     439              : 
     440              :       INTEGER                                            :: i, j
     441      8127545 :       TYPE(ele_type), ALLOCATABLE, DIMENSION(:)          :: tmp_hash
     442              : 
     443              :       ! if too small, make a copy and rehash in a larger table
     444              : 
     445      8127545 :       IF (hash_table%nele*inv_hash_table_fill > hash_table%nmax) THEN
     446            0 :          ALLOCATE (tmp_hash(LBOUND(hash_table%table, 1):UBOUND(hash_table%table, 1)))
     447            0 :          tmp_hash(:) = hash_table%table
     448            0 :          CALL hash_table_release(hash_table)
     449            0 :          CALL hash_table_create(hash_table, INT((UBOUND(tmp_hash, 1) + 8)*hash_table_expand))
     450            0 :          DO i = LBOUND(tmp_hash, 1), UBOUND(tmp_hash, 1)
     451            0 :             IF (tmp_hash(i)%c /= 0) THEN
     452            0 :                CALL hash_table_add(hash_table, tmp_hash(i)%c, tmp_hash(i)%p)
     453              :             END IF
     454              :          END DO
     455            0 :          DEALLOCATE (tmp_hash)
     456              :       END IF
     457              : 
     458      8127545 :       hash_table%nele = hash_table%nele + 1
     459      8127545 :       i = IAND(c*hash_table%prime, hash_table%nmax)
     460              : 
     461      8127545 :       DO j = i, hash_table%nmax
     462      8127545 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     463      8127545 :             hash_table%table(j)%c = c
     464      8127545 :             hash_table%table(j)%p = p
     465      8127545 :             RETURN
     466              :          END IF
     467              :       END DO
     468            0 :       DO j = 0, i - 1
     469            0 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     470            0 :             hash_table%table(j)%c = c
     471            0 :             hash_table%table(j)%p = p
     472            0 :             RETURN
     473              :          END IF
     474              :       END DO
     475              : 
     476              :    END SUBROUTINE hash_table_add
     477              : 
     478              : ! **************************************************************************************************
     479              : !> \brief ...
     480              : !> \param hash_table ...
     481              : !> \param c ...
     482              : !> \return ...
     483              : ! **************************************************************************************************
     484    121067472 :    PURE FUNCTION hash_table_get(hash_table, c) RESULT(p)
     485              :       TYPE(hash_table_type), INTENT(IN)                  :: hash_table
     486              :       INTEGER, INTENT(IN)                                :: c
     487              :       INTEGER                                            :: p
     488              : 
     489              :       INTEGER                                            :: i, j
     490              : 
     491    121067472 :       i = IAND(c*hash_table%prime, hash_table%nmax)
     492              : 
     493              :       ! catch the likely case first
     494    121067472 :       IF (hash_table%table(i)%c == c) THEN
     495    116828891 :          p = hash_table%table(i)%p
     496    116828891 :          RETURN
     497              :       END IF
     498              : 
     499      4238581 :       DO j = i, hash_table%nmax
     500      4238581 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     501      4238581 :             p = hash_table%table(j)%p
     502      4238581 :             RETURN
     503              :          END IF
     504              :       END DO
     505            0 :       DO j = 0, i - 1
     506            0 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     507            0 :             p = hash_table%table(j)%p
     508            0 :             RETURN
     509              :          END IF
     510              :       END DO
     511              : 
     512              :       ! we should never reach this point.
     513    121067472 :       p = HUGE(p)
     514              : 
     515              :    END FUNCTION hash_table_get
     516              : 
     517              : ! End of hashtable
     518              : ! --------------------------------------------------------------------------------------------------
     519              : 
     520              : ! **************************************************************************************************
     521              : !> \brief the real driver routine for the multiply, not all symmetries implemented yet
     522              : !> \param matrix ...
     523              : !> \param vec_in ...
     524              : !> \param vec_out ...
     525              : !> \param alpha ...
     526              : !> \param beta ...
     527              : !> \param work_row ...
     528              : !> \param work_col ...
     529              : ! **************************************************************************************************
     530       817722 :    SUBROUTINE dbcsr_matrix_colvec_multiply(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     531              :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     532              :       REAL(kind=real_8)                                  :: alpha, beta
     533              :       TYPE(dbcsr_type)                                   :: work_row, work_col
     534              : 
     535              :       CHARACTER                                          :: matrix_type
     536              : 
     537       817722 :       CALL dbcsr_get_info(matrix, matrix_type=matrix_type)
     538              : 
     539       540069 :       SELECT CASE (matrix_type)
     540              :       CASE (dbcsr_type_no_symmetry)
     541       540069 :          CALL dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     542              :       CASE (dbcsr_type_symmetric)
     543       277653 :          CALL dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     544              :       CASE (dbcsr_type_antisymmetric)
     545              :          ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
     546            0 :          CPABORT("NYI, antisymmetric matrix not permitted")
     547              :       CASE DEFAULT
     548       817722 :          CPABORT("Unknown matrix type, ...")
     549              :       END SELECT
     550              : 
     551       817722 :    END SUBROUTINE dbcsr_matrix_colvec_multiply
     552              : 
     553              : ! **************************************************************************************************
     554              : !> \brief low level routines for matrix vector multiplies
     555              : !> \param matrix ...
     556              : !> \param vec_in ...
     557              : !> \param vec_out ...
     558              : !> \param alpha ...
     559              : !> \param beta ...
     560              : !> \param work_row ...
     561              : !> \param work_col ...
     562              : ! **************************************************************************************************
     563       540069 :    SUBROUTINE dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     564              :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     565              :       REAL(kind=real_8)                                  :: alpha, beta
     566              :       TYPE(dbcsr_type)                                   :: work_row, work_col
     567              : 
     568              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_vector_mult'
     569              : 
     570              :       INTEGER                                            :: col, handle, handle1, ithread, k, m, &
     571              :                                                             mypcol, myprow, n, ncols, nrows, pcol, &
     572              :                                                             prow, prow_handle, row
     573       540069 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec
     574       540069 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: data_d, vec_res
     575              :       TYPE(dbcsr_distribution_type)                      :: dist
     576              :       TYPE(dbcsr_iterator_type)                          :: iter
     577       540069 :       TYPE(fast_vec_access_type)                         :: fast_vec_col, fast_vec_row
     578              :       TYPE(mp_comm_type)                                 :: prow_group
     579              : 
     580       540069 :       CALL timeset(routineN, handle)
     581       540069 :       ithread = 0
     582              : 
     583              :       ! Collect some data about the parallel environment. We will use them later to move the vector around
     584       540069 :       CALL dbcsr_get_info(matrix, distribution=dist)
     585       540069 :       CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)
     586              : 
     587       540069 :       CALL prow_group%set_handle(prow_handle)
     588              : 
     589       540069 :       CALL create_fast_row_vec_access(work_row, fast_vec_row)
     590       540069 :       CALL create_fast_col_vec_access(work_col, fast_vec_col)
     591              : 
     592              :       ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
     593       540069 :       CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)
     594              : 
     595              :       ! Set the work vector for the results to 0
     596       540069 :       CALL dbcsr_set(work_col, 0.0_real_8)
     597              : 
     598              :       ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
     599              :       ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
     600       540069 :       CALL timeset(routineN//"_local_mm", handle1)
     601              : 
     602              : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,m,n,k) &
     603       540069 : !$OMP          SHARED(matrix,fast_vec_col,fast_vec_row)
     604              : !$    ithread = omp_get_thread_num()
     605              :       CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
     606              :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     607              :          CALL dbcsr_iterator_next_block(iter, row, col, data_d)
     608              :          prow = hash_table_get(fast_vec_col%hash_table, row)
     609              :          IF (fast_vec_col%blk_map(prow)%assigned_thread /= ithread) CYCLE
     610              :          m = SIZE(fast_vec_col%blk_map(prow)%ptr, 1)
     611              :          n = SIZE(fast_vec_col%blk_map(prow)%ptr, 2)
     612              :          k = SIZE(data_d, 2)
     613              :          IF ((m == 0) .OR. (n == 0) .OR. (k == 0)) CYCLE
     614              :          CPASSERT(n == 1)
     615              :          pcol = hash_table_get(fast_vec_row%hash_table, col)
     616              :          CALL dgemm('N', 'T', m, n, k, &
     617              :                     1.0_dp, data_d, m, &
     618              :                     fast_vec_row%blk_map(pcol)%ptr, n, &
     619              :                     1.0_dp, &
     620              :                     fast_vec_col%blk_map(prow)%ptr, m)
     621              :       END DO
     622              :       CALL dbcsr_iterator_stop(iter)
     623              : !$OMP END PARALLEL
     624              : 
     625       540069 :       CALL timestop(handle1)
     626              : 
     627              :       ! sum all the data onto the first processor col where the original vector is stored
     628       540069 :       data_vec => dbcsr_get_data_p(work_col)
     629       540069 :       CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
     630     11787241 :       CALL prow_group%sum(data_vec(1:nrows*ncols))
     631              : 
     632              :       ! Local copy on the first mpi col (as this is the localtion of the vec_res blocks) of the result vector
     633              :       ! from the replicated to the original vector. Let's play it safe and use the iterator
     634       540069 :       CALL dbcsr_iterator_start(iter, vec_out)
     635      1140563 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     636       600494 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
     637       600494 :          prow = hash_table_get(fast_vec_col%hash_table, row)
     638      1140563 :          IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
     639      6824574 :             vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
     640              :          ELSE
     641            0 :             vec_res(:, :) = beta*vec_res(:, :)
     642              :          END IF
     643              :       END DO
     644       540069 :       CALL dbcsr_iterator_stop(iter)
     645              : 
     646       540069 :       CALL release_fast_vec_access(fast_vec_row)
     647       540069 :       CALL release_fast_vec_access(fast_vec_col)
     648              : 
     649       540069 :       CALL timestop(handle)
     650              : 
     651      1080138 :    END SUBROUTINE dbcsr_matrix_vector_mult
     652              : 
     653              : ! **************************************************************************************************
     654              : !> \brief ...
     655              : !> \param matrix ...
     656              : !> \param vec_in ...
     657              : !> \param vec_out ...
     658              : !> \param alpha ...
     659              : !> \param beta ...
     660              : !> \param work_row ...
     661              : !> \param work_col ...
     662              : !> \param skip_diag ...
     663              : ! **************************************************************************************************
     664            0 :    SUBROUTINE dbcsr_matrixT_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
     665              :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     666              :       REAL(kind=real_8)                                  :: alpha, beta
     667              :       TYPE(dbcsr_type)                                   :: work_row, work_col
     668              :       LOGICAL                                            :: skip_diag
     669              : 
     670              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrixT_vector_mult'
     671              : 
     672              :       INTEGER                                            :: col, col_size, handle, handle1, ithread, &
     673              :                                                             mypcol, myprow, ncols, nrows, pcol, &
     674              :                                                             pcol_handle, prow, prow_handle, row, &
     675              :                                                             row_size
     676            0 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec
     677            0 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: data_d, vec_bl, vec_res
     678              :       TYPE(dbcsr_distribution_type)                      :: dist
     679              :       TYPE(dbcsr_iterator_type)                          :: iter
     680            0 :       TYPE(fast_vec_access_type)                         :: fast_vec_col, fast_vec_row
     681              :       TYPE(mp_comm_type)                                 :: pcol_group, prow_group
     682              : 
     683            0 :       CALL timeset(routineN, handle)
     684            0 :       ithread = 0
     685              : 
     686              :       ! Collect some data about the parallel environment. We will use them later to move the vector around
     687            0 :       CALL dbcsr_get_info(matrix, distribution=dist)
     688            0 :       CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
     689              : 
     690            0 :       CALL prow_group%set_handle(prow_handle)
     691            0 :       CALL pcol_group%set_handle(pcol_handle)
     692              : 
     693            0 :       CALL create_fast_row_vec_access(work_row, fast_vec_row)
     694            0 :       CALL create_fast_col_vec_access(work_col, fast_vec_col)
     695              : 
     696              :       ! Set the work vector for the results to 0
     697            0 :       CALL dbcsr_set(work_row, 0.0_real_8)
     698              : 
     699              :       ! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
     700            0 :       CALL dbcsr_iterator_start(iter, vec_in)
     701            0 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     702            0 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
     703            0 :          prow = hash_table_get(fast_vec_col%hash_table, row)
     704            0 :          fast_vec_col%blk_map(prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
     705              :       END DO
     706            0 :       CALL dbcsr_iterator_stop(iter)
     707              :       ! Replicate the data on all processore in the row
     708            0 :       data_vec => dbcsr_get_data_p(work_col)
     709            0 :       CALL prow_group%bcast(data_vec, 0)
     710              : 
     711              :       ! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
     712            0 :       CALL timeset(routineN//"local_mm", handle1)
     713            0 :       CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
     714              : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
     715            0 : !$OMP          SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
     716              : !$    ithread = omp_get_thread_num()
     717              :       CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
     718              :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     719              :          CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
     720              :          IF (skip_diag .AND. col == row) CYCLE
     721              :          prow = hash_table_get(fast_vec_col%hash_table, row)
     722              :          pcol = hash_table_get(fast_vec_row%hash_table, col)
     723              :          IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
     724              :              ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
     725              :             IF (fast_vec_row%blk_map(pcol)%assigned_thread /= ithread) CYCLE
     726              :             fast_vec_row%blk_map(pcol)%ptr = fast_vec_row%blk_map(pcol)%ptr + &
     727              :                                              MATMUL(TRANSPOSE(fast_vec_col%blk_map(prow)%ptr), data_d)
     728              :          ELSE
     729              :             prow = hash_table_get(fast_vec_row%hash_table, row)
     730              :             pcol = hash_table_get(fast_vec_col%hash_table, col)
     731              :             IF (fast_vec_row%blk_map(prow)%assigned_thread /= ithread) CYCLE
     732              :             fast_vec_row%blk_map(prow)%ptr = fast_vec_row%blk_map(prow)%ptr + &
     733              :                                              MATMUL(TRANSPOSE(fast_vec_col%blk_map(pcol)%ptr), TRANSPOSE(data_d))
     734              :          END IF
     735              :       END DO
     736              :       CALL dbcsr_iterator_stop(iter)
     737              : !$OMP END PARALLEL
     738              : 
     739            0 :       CALL timestop(handle1)
     740              : 
     741              :       ! sum all the data within a processor column to obtain the replicated result
     742            0 :       data_vec => dbcsr_get_data_p(work_row)
     743              :       ! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
     744            0 :       CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
     745            0 :       CALL pcol_group%sum(data_vec(1:nrows*ncols))
     746              : 
     747              :       ! Convert the result to a column wise distribution
     748            0 :       CALL dbcsr_rep_row_to_rep_col_vec(work_col, work_row, fast_vec_row)
     749              : 
     750              :       ! Create_the final vector by summing it to the result vector which lives on proc_col 0
     751            0 :       CALL dbcsr_iterator_start(iter, vec_out)
     752            0 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     753            0 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
     754            0 :          prow = hash_table_get(fast_vec_col%hash_table, row)
     755            0 :          IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
     756            0 :             vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
     757              :          ELSE
     758            0 :             vec_res(:, :) = beta*vec_res(:, :)
     759              :          END IF
     760              :       END DO
     761            0 :       CALL dbcsr_iterator_stop(iter)
     762              : 
     763            0 :       CALL timestop(handle)
     764              : 
     765            0 :    END SUBROUTINE dbcsr_matrixT_vector_mult
     766              : 
     767              : ! **************************************************************************************************
     768              : !> \brief ...
     769              : !> \param vec_in ...
     770              : !> \param rep_col_vec ...
     771              : !> \param rep_row_vec ...
     772              : !> \param fast_vec_col ...
     773              : ! **************************************************************************************************
     774      6541776 :    SUBROUTINE dbcsr_col_vec_to_rep_row(vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
     775              :       TYPE(dbcsr_type)                                   :: vec_in, rep_col_vec, rep_row_vec
     776              :       TYPE(fast_vec_access_type), INTENT(IN)             :: fast_vec_col
     777              : 
     778              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_col_vec_to_rep_row'
     779              : 
     780              :       INTEGER                                            :: col, handle, mypcol, myprow, ncols, &
     781              :                                                             nrows, pcol_handle, prow_handle, row
     782       817722 :       INTEGER, DIMENSION(:), POINTER                     :: local_cols, row_dist
     783       817722 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec, data_vec_rep
     784       817722 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_row
     785              :       TYPE(dbcsr_distribution_type)                      :: dist_in, dist_rep_col
     786              :       TYPE(dbcsr_iterator_type)                          :: iter
     787              :       TYPE(mp_comm_type)                                 :: pcol_group, prow_group
     788              : 
     789       817722 :       CALL timeset(routineN, handle)
     790              : 
     791              :       ! get information about the parallel environment
     792       817722 :       CALL dbcsr_get_info(vec_in, distribution=dist_in)
     793              :       CALL dbcsr_distribution_get(dist_in, &
     794              :                                   prow_group=prow_handle, &
     795              :                                   pcol_group=pcol_handle, &
     796              :                                   myprow=myprow, &
     797       817722 :                                   mypcol=mypcol)
     798              : 
     799       817722 :       CALL prow_group%set_handle(prow_handle)
     800       817722 :       CALL pcol_group%set_handle(pcol_handle)
     801              : 
     802              :       ! Get the vector which tells us which blocks are local to which processor row in the col vec
     803       817722 :       CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
     804       817722 :       CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)
     805              : 
     806              :       ! Copy the local vector to the replicated on the first processor column (this is where vec_in lives)
     807       817722 :       CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     808       817722 :       data_vec_rep => dbcsr_get_data_p(rep_col_vec)
     809       817722 :       data_vec => dbcsr_get_data_p(vec_in)
     810     24979998 :       IF (mypcol == 0) data_vec_rep(1:nrows*ncols) = data_vec(1:nrows*ncols)
     811              :       ! Replicate the data along the row
     812     24979998 :       CALL prow_group%bcast(data_vec_rep(1:nrows*ncols), 0)
     813              : 
     814              :       ! Here it gets a bit tricky as we are dealing with two different parallel layouts:
     815              :       ! The rep_col_vec contains all blocks local to the row distribution of the vector.
     816              :       ! The rep_row_vec only needs the fraction which is local to the col distribution.
     817              :       ! However in most cases this won't the complete set of block which can be obtained from col_vector p_row i
     818              :       ! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available
     819              :       ! on every replica in the processor column, by summing along the column we end up with the complete vector everywhere
     820              :       ! Hope this clarifies the idea
     821       817722 :       CALL dbcsr_set(rep_row_vec, 0.0_real_8)
     822       817722 :       CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
     823       817722 :       CALL dbcsr_iterator_start(iter, rep_row_vec)
     824      4121394 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     825      3303672 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
     826      4121394 :          IF (row_dist(col) == myprow) THEN
     827     25821326 :             vec_row = TRANSPOSE(fast_vec_col%blk_map(hash_table_get(fast_vec_col%hash_table, col))%ptr)
     828              :          END IF
     829              :       END DO
     830       817722 :       CALL dbcsr_iterator_stop(iter)
     831       817722 :       CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     832       817722 :       data_vec_rep => dbcsr_get_data_p(rep_row_vec)
     833     49003236 :       CALL pcol_group%sum(data_vec_rep(1:ncols*nrows))
     834              : 
     835       817722 :       CALL timestop(handle)
     836              : 
     837       817722 :    END SUBROUTINE dbcsr_col_vec_to_rep_row
     838              : 
     839              : ! **************************************************************************************************
     840              : !> \brief ...
     841              : !> \param rep_col_vec ...
     842              : !> \param rep_row_vec ...
     843              : !> \param fast_vec_row ...
     844              : !> \param fast_vec_col_add ...
     845              : ! **************************************************************************************************
     846      1665918 :    SUBROUTINE dbcsr_rep_row_to_rep_col_vec(rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
     847              :       TYPE(dbcsr_type)                                   :: rep_col_vec, rep_row_vec
     848              :       TYPE(fast_vec_access_type)                         :: fast_vec_row
     849              :       TYPE(fast_vec_access_type), OPTIONAL               :: fast_vec_col_add
     850              : 
     851              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_rep_row_to_rep_col_vec'
     852              : 
     853              :       INTEGER                                            :: col, handle, mypcol, myprow, ncols, &
     854              :                                                             nrows, prow_handle, row
     855       277653 :       INTEGER, DIMENSION(:), POINTER                     :: col_dist
     856       277653 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec_rep
     857       277653 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_col
     858              :       TYPE(dbcsr_distribution_type)                      :: dist_rep_col, dist_rep_row
     859              :       TYPE(dbcsr_iterator_type)                          :: iter
     860              :       TYPE(mp_comm_type)                                 :: prow_group
     861              : 
     862       277653 :       CALL timeset(routineN, handle)
     863              : 
     864              :       ! get information about the parallel environment
     865       277653 :       CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
     866              :       CALL dbcsr_distribution_get(dist_rep_col, &
     867              :                                   prow_group=prow_handle, &
     868              :                                   myprow=myprow, &
     869       277653 :                                   mypcol=mypcol)
     870              : 
     871       277653 :       CALL prow_group%set_handle(prow_handle)
     872              : 
     873              :       ! Get the vector which tells us which blocks are local to which processor col in the row vec
     874       277653 :       CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
     875       277653 :       CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)
     876              : 
     877              :       ! The same trick as described above with opposite direction
     878       277653 :       CALL dbcsr_set(rep_col_vec, 0.0_real_8)
     879       277653 :       CALL dbcsr_iterator_start(iter, rep_col_vec)
     880      1336209 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     881      1058556 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
     882      1058556 :          IF (col_dist(row) == mypcol) THEN
     883      8574664 :             vec_col = TRANSPOSE(fast_vec_row%blk_map(hash_table_get(fast_vec_row%hash_table, row))%ptr)
     884              :          END IF
     885              :          ! this one is special and allows to add the elements of a not yet summed replicated
     886              :          ! column vector as it appears in M*V(row_rep) as result. Save an parallel summation in the symmetric case
     887      1336209 :          IF (PRESENT(fast_vec_col_add)) THEN
     888      8574664 :             vec_col = vec_col + fast_vec_col_add%blk_map(hash_table_get(fast_vec_col_add%hash_table, row))%ptr(:, :)
     889              :          END IF
     890              :       END DO
     891       277653 :       CALL dbcsr_iterator_stop(iter)
     892       277653 :       CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     893       277653 :       data_vec_rep => dbcsr_get_data_p(rep_col_vec)
     894     13192757 :       CALL prow_group%sum(data_vec_rep(1:nrows*ncols))
     895              : 
     896       277653 :       CALL timestop(handle)
     897              : 
     898       277653 :    END SUBROUTINE dbcsr_rep_row_to_rep_col_vec
     899              : 
     900              : ! **************************************************************************************************
     901              : !> \brief ...
     902              : !> \param matrix ...
     903              : !> \param vec_in ...
     904              : !> \param vec_out ...
     905              : !> \param alpha ...
     906              : !> \param beta ...
     907              : !> \param work_row ...
     908              : !> \param work_col ...
     909              : ! **************************************************************************************************
     910       277653 :    SUBROUTINE dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     911              :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     912              :       REAL(kind=real_8)                                  :: alpha, beta
     913              :       TYPE(dbcsr_type)                                   :: work_row, work_col
     914              : 
     915              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_sym_matrix_vector_mult'
     916              : 
     917              :       INTEGER                                            :: col, handle, handle1, ithread, mypcol, &
     918              :                                                             myprow, ncols, nrows, pcol, &
     919              :                                                             pcol_handle, prow, row, rpcol, rprow, &
     920              :                                                             vec_dim
     921       277653 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec
     922       277653 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: data_d, vec_res
     923              :       TYPE(dbcsr_distribution_type)                      :: dist
     924              :       TYPE(dbcsr_iterator_type)                          :: iter
     925              :       TYPE(dbcsr_type)                                   :: result_col, result_row
     926       277653 :       TYPE(fast_vec_access_type)                         :: fast_vec_col, fast_vec_row, &
     927       277653 :                                                             res_fast_vec_col, res_fast_vec_row
     928              :       TYPE(mp_comm_type)                                 :: pcol_group
     929              : 
     930       277653 :       CALL timeset(routineN, handle)
     931       277653 :       ithread = 0
     932              :       ! We need some work matrices as we try to exploit operations on the replicated vectors which are duplicated otherwise
     933       277653 :       CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)
     934              :       ! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive
     935       277653 :       CALL dbcsr_set(work_col, 0.0_real_8)
     936       277653 :       CALL dbcsr_copy(result_col, work_col)
     937       277653 :       CALL dbcsr_set(work_row, 0.0_real_8)
     938       277653 :       CALL dbcsr_copy(result_row, work_row)
     939              : 
     940              :       ! Collect some data about the parallel environment. We will use them later to move the vector around
     941       277653 :       CALL dbcsr_get_info(matrix=matrix, distribution=dist)
     942       277653 :       CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
     943              : 
     944       277653 :       CALL pcol_group%set_handle(pcol_handle)
     945              : 
     946       277653 :       CALL create_fast_row_vec_access(work_row, fast_vec_row)
     947       277653 :       CALL create_fast_col_vec_access(work_col, fast_vec_col)
     948       277653 :       CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
     949       277653 :       CALL create_fast_col_vec_access(result_col, res_fast_vec_col)
     950              : 
     951              :       ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
     952       277653 :       CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)
     953              : 
     954              :       ! Probably I should rename the routine above as it delivers both the replicated row and column vector
     955              : 
     956              :       ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
     957              :       ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
     958       277653 :       CALL timeset(routineN//"_local_mm", handle1)
     959              : 
     960              :       !------ perform the multiplication, we have to take car to take the correct blocks ----------
     961              : 
     962              : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
     963       277653 : !$OMP          SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
     964              : !$    ithread = omp_get_thread_num()
     965              :       CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
     966              :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     967              :          CALL dbcsr_iterator_next_block(iter, row, col, data_d)
     968              :          pcol = hash_table_get(fast_vec_row%hash_table, col)
     969              :          rprow = hash_table_get(res_fast_vec_col%hash_table, row)
     970              :          IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
     971              :              ASSOCIATED(res_fast_vec_col%blk_map(rprow)%ptr)) THEN
     972              :             IF (res_fast_vec_col%blk_map(rprow)%assigned_thread == ithread) THEN
     973              :                res_fast_vec_col%blk_map(rprow)%ptr = res_fast_vec_col%blk_map(rprow)%ptr + &
     974              :                                                      MATMUL(data_d, TRANSPOSE(fast_vec_row%blk_map(pcol)%ptr))
     975              :             END IF
     976              :             prow = hash_table_get(fast_vec_col%hash_table, row)
     977              :             rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
     978              :             IF (res_fast_vec_row%blk_map(rpcol)%assigned_thread == ithread .AND. row /= col) THEN
     979              :                res_fast_vec_row%blk_map(rpcol)%ptr = res_fast_vec_row%blk_map(rpcol)%ptr + &
     980              :                                                      MATMUL(TRANSPOSE(fast_vec_col%blk_map(prow)%ptr), data_d)
     981              :             END IF
     982              :          ELSE
     983              :             rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
     984              :             prow = hash_table_get(fast_vec_row%hash_table, row)
     985              :             IF (res_fast_vec_col%blk_map(rpcol)%assigned_thread == ithread) THEN
     986              :                res_fast_vec_col%blk_map(rpcol)%ptr = res_fast_vec_col%blk_map(rpcol)%ptr + &
     987              :                                                      TRANSPOSE(MATMUL(fast_vec_row%blk_map(prow)%ptr, data_d))
     988              :             END IF
     989              :             rprow = hash_table_get(res_fast_vec_row%hash_table, row)
     990              :             pcol = hash_table_get(fast_vec_col%hash_table, col)
     991              :             IF (res_fast_vec_row%blk_map(rprow)%assigned_thread == ithread .AND. row /= col) THEN
     992              :                res_fast_vec_row%blk_map(rprow)%ptr = res_fast_vec_row%blk_map(rprow)%ptr + &
     993              :                                                      TRANSPOSE(MATMUL(data_d, fast_vec_col%blk_map(pcol)%ptr))
     994              :             END IF
     995              :          END IF
     996              :       END DO
     997              :       CALL dbcsr_iterator_stop(iter)
     998              : !$OMP END PARALLEL
     999              : 
    1000       277653 :       CALL timestop(handle1)
    1001              : 
    1002              :       ! sum all the data within a processor column to obtain the replicated result from lower
    1003       277653 :       data_vec => dbcsr_get_data_p(result_row)
    1004       277653 :       CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)
    1005              : 
    1006     25998067 :       CALL pcol_group%sum(data_vec(1:nrows*ncols))
    1007              : 
    1008              :       ! Convert the results to a column wise distribution, this is a bit involved as the result_row is fully replicated
    1009              :       ! While the result_col still has the partial results in parallel. The routine below takes care of that and saves a
    1010              :       ! parallel summation. Of the res_row vectors are created only taking the appropriate element (0 otherwise) while the res_col
    1011              :       ! parallel bits are locally added. The sum magically creates the correct vector
    1012       277653 :       CALL dbcsr_rep_row_to_rep_col_vec(work_col, result_row, res_fast_vec_row, res_fast_vec_col)
    1013              : 
    1014              :       ! Create_the final vector by summing it to the result vector which lives on proc_col 0 lower
    1015       277653 :       CALL dbcsr_iterator_start(iter, vec_out)
    1016      1336209 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
    1017      1058556 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
    1018      1058556 :          prow = hash_table_get(fast_vec_col%hash_table, row)
    1019      1336209 :          IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
    1020      8574664 :             vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map(prow)%ptr(:, :))
    1021              :          ELSE
    1022            0 :             vec_res(:, :) = beta*vec_res(:, :)
    1023              :          END IF
    1024              :       END DO
    1025       277653 :       CALL dbcsr_iterator_stop(iter)
    1026              : 
    1027       277653 :       CALL release_fast_vec_access(fast_vec_row)
    1028       277653 :       CALL release_fast_vec_access(fast_vec_col)
    1029       277653 :       CALL release_fast_vec_access(res_fast_vec_row)
    1030       277653 :       CALL release_fast_vec_access(res_fast_vec_col)
    1031              : 
    1032       277653 :       CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)
    1033              : 
    1034       277653 :       CALL timestop(handle)
    1035              : 
    1036       555306 :    END SUBROUTINE dbcsr_sym_matrix_vector_mult
    1037              : 
    1038            0 : END MODULE arnoldi_vector
        

Generated by: LCOV version 2.0-1