LCOV - code coverage report
Current view: top level - src/arnoldi - arnoldi_vector.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:42dac4a) Lines: 78.2 % 348 272
Test Date: 2025-07-25 12:55:17 Functions: 69.6 % 23 16

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief operations for skinny matrices/vectors expressed in dbcsr form
      10              : !> \par History
      11              : !>       2014.10 created [Florian Schiffmann]
      12              : !>       2023.12 Removed support for single-precision [Ole Schuett]
      13              : !>       2024.12 Removed support for complex input matrices [Ole Schuett]
      14              : !> \author Florian Schiffmann
      15              : ! **************************************************************************************************
      16              : MODULE arnoldi_vector
      17              :    USE cp_dbcsr_api,                    ONLY: &
      18              :         dbcsr_copy, dbcsr_create, dbcsr_distribution_get, dbcsr_distribution_new, &
      19              :         dbcsr_distribution_release, dbcsr_distribution_type, dbcsr_get_data_p, dbcsr_get_info, &
      20              :         dbcsr_iterator_blocks_left, dbcsr_iterator_next_block, dbcsr_iterator_start, &
      21              :         dbcsr_iterator_stop, dbcsr_iterator_type, dbcsr_release, dbcsr_set, dbcsr_type, &
      22              :         dbcsr_type_antisymmetric, dbcsr_type_no_symmetry, dbcsr_type_symmetric
      23              :    USE cp_dbcsr_contrib,                ONLY: dbcsr_reserve_all_blocks
      24              :    USE kinds,                           ONLY: dp,&
      25              :                                               real_8
      26              :    USE message_passing,                 ONLY: mp_comm_type
      27              : #include "../base/base_uses.f90"
      28              : 
      29              : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
      30              : 
      31              :    IMPLICIT NONE
      32              : 
      33              :    PRIVATE
      34              : 
      35              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'arnoldi_vector'
      36              : 
      37              : ! **************************************************************************************************
      38              : !> \brief Types needed for the hashtable.
      39              : ! **************************************************************************************************
      40              :    TYPE ele_type
      41              :       INTEGER :: c = 0
      42              :       INTEGER :: p = 0
      43              :    END TYPE ele_type
      44              : 
      45              :    TYPE hash_table_type
      46              :       TYPE(ele_type), DIMENSION(:), POINTER :: table => NULL()
      47              :       INTEGER :: nele = 0
      48              :       INTEGER :: nmax = 0
      49              :       INTEGER :: prime = 0
      50              :    END TYPE hash_table_type
      51              : 
      52              : ! **************************************************************************************************
      53              : !> \brief Types needed for fast access to distributed dbcsr vectors.
      54              : ! **************************************************************************************************
      55              :    TYPE block_ptr
      56              :       REAL(real_8), DIMENSION(:, :), POINTER          :: ptr => NULL()
      57              :       INTEGER                                         :: assigned_thread = -1
      58              :    END TYPE block_ptr
      59              : 
      60              :    TYPE fast_vec_access_type
      61              :       TYPE(hash_table_type) :: hash_table = hash_table_type()
      62              :       TYPE(block_ptr), DIMENSION(:), ALLOCATABLE :: blk_map
      63              :    END TYPE
      64              : 
      65              :    PUBLIC :: dbcsr_matrix_colvec_multiply, &
      66              :              create_col_vec_from_matrix, &
      67              :              create_row_vec_from_matrix, &
      68              :              create_replicated_col_vec_from_matrix, &
      69              :              create_replicated_row_vec_from_matrix
      70              : 
      71              : CONTAINS
      72              : 
      73              : ! **************************************************************************************************
      74              : !> \brief creates a dbcsr col vector like object which lives on proc_col 0
      75              : !>        and has the same row dist as the template matrix
      76              : !>        the returned matrix is fully allocated and all blocks are set to 0
      77              : !>        this is not a sparse object (and must never be)
      78              : !> \param dbcsr_vec  the vector object to create must be allocated but not initialized
      79              : !> \param matrix a dbcsr matrix used as template
      80              : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
      81              : ! **************************************************************************************************
      82       275061 :    SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
      83              :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
      84              :       INTEGER                                            :: ncol
      85              : 
      86              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_col_vec_from_matrix'
      87              : 
      88              :       INTEGER                                            :: handle, npcols
      89       275061 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
      90       275061 :                                                             row_dist
      91              :       TYPE(dbcsr_distribution_type)                      :: dist, dist_col_vec
      92              : 
      93       275061 :       CALL timeset(routineN, handle)
      94              : 
      95       275061 :       CALL dbcsr_get_info(matrix, row_blk_size=row_blk_size, distribution=dist)
      96       275061 :       CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
      97              : 
      98       275061 :       ALLOCATE (col_dist(1), col_blk_size(1))
      99       275061 :       col_dist(1) = 0
     100       275061 :       col_blk_size(1) = ncol
     101       275061 :       CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     102              : 
     103              :       CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, matrix_type=dbcsr_type_no_symmetry, &
     104       275061 :                         row_blk_size=row_blk_size, col_blk_size=col_blk_size)
     105       275061 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     106              : 
     107       275061 :       CALL dbcsr_distribution_release(dist_col_vec)
     108       275061 :       DEALLOCATE (col_dist, col_blk_size)
     109       275061 :       CALL timestop(handle)
     110              : 
     111       550122 :    END SUBROUTINE create_col_vec_from_matrix
     112              : 
     113              : ! **************************************************************************************************
     114              : !> \brief creates a dbcsr row vector like object which lives on proc_row 0
     115              : !>        and has the same row dist as the template matrix
     116              : !>        the returned matrix is fully allocated and all blocks are set to 0
     117              : !>        this is not a sparse object (and must never be)
     118              : !> \param dbcsr_vec ...
     119              : !> \param matrix a dbcsr matrix used as template
     120              : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     121              : ! **************************************************************************************************
     122            0 :    SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
     123              :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     124              :       INTEGER                                            :: nrow
     125              : 
     126              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_row_vec_from_matrix'
     127              : 
     128              :       INTEGER                                            :: handle, nprows
     129            0 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
     130            0 :                                                             row_dist
     131              :       TYPE(dbcsr_distribution_type)                      :: dist, dist_row_vec
     132              : 
     133            0 :       CALL timeset(routineN, handle)
     134              : 
     135            0 :       CALL dbcsr_get_info(matrix, col_blk_size=col_blk_size, distribution=dist)
     136            0 :       CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
     137              : 
     138            0 :       ALLOCATE (row_dist(1), row_blk_size(1))
     139            0 :       row_dist(1) = 0
     140            0 :       row_blk_size(1) = nrow
     141            0 :       CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     142              : 
     143              :       CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, matrix_type=dbcsr_type_no_symmetry, &
     144            0 :                         row_blk_size=row_blk_size, col_blk_size=col_blk_size)
     145            0 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     146              : 
     147            0 :       CALL dbcsr_distribution_release(dist_row_vec)
     148            0 :       DEALLOCATE (row_dist, row_blk_size)
     149            0 :       CALL timestop(handle)
     150              : 
     151            0 :    END SUBROUTINE create_row_vec_from_matrix
     152              : 
     153              : ! **************************************************************************************************
     154              : !> \brief creates a col vector like object whose blocks can be replicated
     155              : !>        along the processor row and has the same row dist as the template matrix
     156              : !>        the returned matrix is fully allocated and all blocks are set to 0
     157              : !>        this is not a sparse object (and must never be)
     158              : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
     159              : !> \param matrix a dbcsr matrix used as template
     160              : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     161              : ! **************************************************************************************************
     162       135245 :    SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
     163              :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     164              :       INTEGER                                            :: ncol
     165              : 
     166              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_col_vec_from_matrix'
     167              : 
     168              :       INTEGER                                            :: handle, i, npcols
     169       135245 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
     170       135245 :                                                             row_dist
     171              :       TYPE(dbcsr_distribution_type)                      :: dist, dist_col_vec
     172              : 
     173       135245 :       CALL timeset(routineN, handle)
     174              : 
     175       135245 :       CALL dbcsr_get_info(matrix, row_blk_size=row_blk_size, distribution=dist)
     176       135245 :       CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
     177              : 
     178       676225 :       ALLOCATE (col_dist(npcols), col_blk_size(npcols))
     179       270490 :       col_blk_size(:) = ncol
     180       270490 :       DO i = 0, npcols - 1
     181       270490 :          col_dist(i + 1) = i
     182              :       END DO
     183       135245 :       CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     184              : 
     185              :       CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, matrix_type=dbcsr_type_no_symmetry, &
     186       135245 :                         row_blk_size=row_blk_size, col_blk_size=col_blk_size)
     187       135245 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     188              : 
     189       135245 :       CALL dbcsr_distribution_release(dist_col_vec)
     190       135245 :       DEALLOCATE (col_dist, col_blk_size)
     191       135245 :       CALL timestop(handle)
     192              : 
     193       270490 :    END SUBROUTINE create_replicated_col_vec_from_matrix
     194              : 
     195              : ! **************************************************************************************************
     196              : !> \brief creates a row vector like object whose blocks can be replicated
     197              : !>        along the processor col and has the same col dist as the template matrix
     198              : !>        the returned matrix is fully allocated and all blocks are set to 0
     199              : !>        this is not a sparse object (and must never be)
     200              : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
     201              : !> \param matrix a dbcsr matrix used as template
     202              : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     203              : ! **************************************************************************************************
     204       135245 :    SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
     205              :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     206              :       INTEGER                                            :: nrow
     207              : 
     208              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_row_vec_from_matrix'
     209              : 
     210              :       INTEGER                                            :: handle, i, nprows
     211       135245 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
     212       135245 :                                                             row_dist
     213              :       TYPE(dbcsr_distribution_type)                      :: dist, dist_row_vec
     214              : 
     215       135245 :       CALL timeset(routineN, handle)
     216              : 
     217       135245 :       CALL dbcsr_get_info(matrix, distribution=dist, col_blk_size=col_blk_size)
     218       135245 :       CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
     219              : 
     220       676225 :       ALLOCATE (row_dist(nprows), row_blk_size(nprows))
     221       403344 :       row_blk_size(:) = nrow
     222       403344 :       DO i = 0, nprows - 1
     223       403344 :          row_dist(i + 1) = i
     224              :       END DO
     225       135245 :       CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     226              : 
     227              :       CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, dbcsr_type_no_symmetry, &
     228       135245 :                         row_blk_size=row_blk_size, col_blk_size=col_blk_size)
     229       135245 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     230              : 
     231       135245 :       CALL dbcsr_distribution_release(dist_row_vec)
     232       135245 :       DEALLOCATE (row_dist, row_blk_Size)
     233       135245 :       CALL timestop(handle)
     234              : 
     235       270490 :    END SUBROUTINE create_replicated_row_vec_from_matrix
     236              : 
     237              : ! **************************************************************************************************
     238              : !> \brief given a column vector, prepare the fast_vec_access container
     239              : !> \param vec ...
     240              : !> \param fast_vec_access ...
     241              : ! **************************************************************************************************
     242      1080311 :    SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access)
     243              :       TYPE(dbcsr_type)                                   :: vec
     244              :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     245              : 
     246              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access'
     247              : 
     248              :       INTEGER                                            :: col, handle, iblock, nblk_local, &
     249              :                                                             nthreads, row
     250      1080311 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_bl
     251              :       TYPE(dbcsr_iterator_type)                          :: iter
     252              : 
     253      1080311 :       CALL timeset(routineN, handle)
     254              : 
     255              :       ! figure out the number of threads
     256      1080311 :       nthreads = 1
     257      1080311 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
     258              : !$OMP MASTER
     259              : !$    nthreads = OMP_GET_NUM_THREADS()
     260              : !$OMP END MASTER
     261              : !$OMP END PARALLEL
     262              : 
     263      1080311 :       CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
     264              :       ! 4 times makes sure the table is big enough to limit collisions.
     265      1080311 :       CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
     266              :       ! include zero for effective dealing with values not in the hash table (will return 0)
     267      7025286 :       ALLOCATE (fast_vec_access%blk_map(0:nblk_local))
     268              : 
     269      1080311 :       CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
     270      1080311 :       IF (col .GT. 1) CPABORT("BUG")
     271              : 
     272              :       ! go through the blocks of the vector
     273      1080311 :       iblock = 0
     274      1080311 :       CALL dbcsr_iterator_start(iter, vec)
     275      3784353 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     276      2704042 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
     277      2704042 :          iblock = iblock + 1
     278      2704042 :          CALL hash_table_add(fast_vec_access%hash_table, row, iblock)
     279      2704042 :          fast_vec_access%blk_map(iblock)%ptr => vec_bl
     280      2704042 :          fast_vec_access%blk_map(iblock)%assigned_thread = MOD(iblock, nthreads)
     281              :       END DO
     282      1080311 :       CALL dbcsr_iterator_stop(iter)
     283      1080311 :       CALL timestop(handle)
     284              : 
     285      3240933 :    END SUBROUTINE create_fast_col_vec_access
     286              : 
     287              : ! **************************************************************************************************
     288              : !> \brief given a row vector, prepare the fast_vec_access_container
     289              : !> \param vec ...
     290              : !> \param fast_vec_access ...
     291              : ! **************************************************************************************************
     292      1080311 :    SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access)
     293              :       TYPE(dbcsr_type)                                   :: vec
     294              :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     295              : 
     296              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access'
     297              : 
     298              :       INTEGER                                            :: col, handle, iblock, nblk_local, &
     299              :                                                             nthreads, row
     300      1080311 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_bl
     301              :       TYPE(dbcsr_iterator_type)                          :: iter
     302              : 
     303      1080311 :       CALL timeset(routineN, handle)
     304              : 
     305              :       ! figure out the number of threads
     306      1080311 :       nthreads = 1
     307      1080311 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
     308              : !$OMP MASTER
     309              : !$    nthreads = OMP_GET_NUM_THREADS()
     310              : !$OMP END MASTER
     311              : !$OMP END PARALLEL
     312              : 
     313      1080311 :       CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
     314              :       ! 4 times makes sure the table is big enough to limit collisions.
     315      1080311 :       CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
     316              :       ! include zero for effective dealing with values not in the hash table (will return 0)
     317      9704055 :       ALLOCATE (fast_vec_access%blk_map(0:nblk_local))
     318              : 
     319              :       ! sanity check
     320      1080311 :       CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
     321      1080311 :       IF (row .GT. 1) CPABORT("BUG")
     322              : 
     323              :       ! go through the blocks of the vector
     324      1080311 :       iblock = 0
     325      1080311 :       CALL dbcsr_iterator_start(iter, vec)
     326      6463122 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     327      5382811 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
     328      5382811 :          iblock = iblock + 1
     329      5382811 :          CALL hash_table_add(fast_vec_access%hash_table, col, iblock)
     330      5382811 :          fast_vec_access%blk_map(iblock)%ptr => vec_bl
     331      5382811 :          fast_vec_access%blk_map(iblock)%assigned_thread = MOD(iblock, nthreads)
     332              :       END DO
     333      1080311 :       CALL dbcsr_iterator_stop(iter)
     334              : 
     335      1080311 :       CALL timestop(handle)
     336              : 
     337      3240933 :    END SUBROUTINE create_fast_row_vec_access
     338              : 
     339              : ! **************************************************************************************************
     340              : !> \brief release all memory associated with the fast_vec_access type
     341              : !> \param fast_vec_access ...
     342              : ! **************************************************************************************************
     343      2160622 :    SUBROUTINE release_fast_vec_access(fast_vec_access)
     344              :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     345              : 
     346              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'release_fast_vec_access'
     347              : 
     348              :       INTEGER                                            :: handle
     349              : 
     350      2160622 :       CALL timeset(routineN, handle)
     351              : 
     352      2160622 :       CALL hash_table_release(fast_vec_access%hash_table)
     353      2160622 :       DEALLOCATE (fast_vec_access%blk_map)
     354              : 
     355      2160622 :       CALL timestop(handle)
     356              : 
     357      2160622 :    END SUBROUTINE release_fast_vec_access
     358              : 
     359              : ! --------------------------------------------------------------------------------------------------
     360              : ! Beginning of hashtable.
     361              : ! this file can be 'INCLUDE'ed verbatim in various place, where it needs to be
     362              : ! part of the module to guarantee inlining
     363              : ! hashes (c,p) pairs, where p is assumed to be >0
     364              : ! on return (0 is used as a flag for not present)
     365              : !
     366              : !
     367              : ! **************************************************************************************************
     368              : !> \brief finds a prime equal or larger than i, needed at creation
     369              : !> \param i ...
     370              : !> \return ...
     371              : ! **************************************************************************************************
     372      2160622 :    FUNCTION matching_prime(i) RESULT(res)
     373              :       INTEGER, INTENT(IN)                                :: i
     374              :       INTEGER                                            :: res
     375              : 
     376              :       INTEGER                                            :: j
     377              : 
     378      2160622 :       res = i
     379      2160622 :       j = 0
     380      6604830 :       DO WHILE (j < res)
     381     55230513 :          DO j = 2, res - 1
     382     53069891 :             IF (MOD(res, j) == 0) THEN
     383      2283586 :                res = res + 1
     384      2283586 :                EXIT
     385              :             END IF
     386              :          END DO
     387              :       END DO
     388      2160622 :    END FUNCTION
     389              : 
     390              : ! **************************************************************************************************
     391              : !> \brief create a hash_table of given initial size.
     392              : !>        the hash table will expand as needed (but this requires rehashing)
     393              : !> \param hash_table ...
     394              : !> \param table_size ...
     395              : ! **************************************************************************************************
     396      2160622 :    SUBROUTINE hash_table_create(hash_table, table_size)
     397              :       TYPE(hash_table_type)                              :: hash_table
     398              :       INTEGER, INTENT(IN)                                :: table_size
     399              : 
     400              :       INTEGER                                            :: j
     401              : 
     402              :       ! guarantee a minimal hash table size (8), so that expansion works
     403              : 
     404      2160622 :       j = 3
     405      4165864 :       DO WHILE (2**j - 1 < table_size)
     406      2005242 :          j = j + 1
     407              :       END DO
     408      2160622 :       hash_table%nmax = 2**j - 1
     409      2160622 :       hash_table%prime = matching_prime(hash_table%nmax)
     410      2160622 :       hash_table%nele = 0
     411     57796178 :       ALLOCATE (hash_table%table(0:hash_table%nmax))
     412      2160622 :    END SUBROUTINE hash_table_create
     413              : 
     414              : ! **************************************************************************************************
     415              : !> \brief ...
     416              : !> \param hash_table ...
     417              : ! **************************************************************************************************
     418      2160622 :    SUBROUTINE hash_table_release(hash_table)
     419              :       TYPE(hash_table_type)                              :: hash_table
     420              : 
     421      2160622 :       hash_table%nmax = 0
     422      2160622 :       hash_table%nele = 0
     423      2160622 :       DEALLOCATE (hash_table%table)
     424              : 
     425      2160622 :    END SUBROUTINE hash_table_release
     426              : 
     427              : ! **************************************************************************************************
     428              : !> \brief add a pair (c,p) to the hash table
     429              : !> \param hash_table ...
     430              : !> \param c this value is being hashed
     431              : !> \param p this is being stored
     432              : ! **************************************************************************************************
     433      8086853 :    RECURSIVE SUBROUTINE hash_table_add(hash_table, c, p)
     434              :       TYPE(hash_table_type), INTENT(INOUT)               :: hash_table
     435              :       INTEGER, INTENT(IN)                                :: c, p
     436              : 
     437              :       REAL(KIND=real_8), PARAMETER :: hash_table_expand = 1.5_real_8, &
     438              :          inv_hash_table_fill = 2.5_real_8
     439              : 
     440              :       INTEGER                                            :: i, j
     441      8086853 :       TYPE(ele_type), ALLOCATABLE, DIMENSION(:)          :: tmp_hash
     442              : 
     443              :       ! if too small, make a copy and rehash in a larger table
     444              : 
     445      8086853 :       IF (hash_table%nele*inv_hash_table_fill > hash_table%nmax) THEN
     446            0 :          ALLOCATE (tmp_hash(LBOUND(hash_table%table, 1):UBOUND(hash_table%table, 1)))
     447            0 :          tmp_hash(:) = hash_table%table
     448            0 :          CALL hash_table_release(hash_table)
     449            0 :          CALL hash_table_create(hash_table, INT((UBOUND(tmp_hash, 1) + 8)*hash_table_expand))
     450            0 :          DO i = LBOUND(tmp_hash, 1), UBOUND(tmp_hash, 1)
     451            0 :             IF (tmp_hash(i)%c .NE. 0) THEN
     452            0 :                CALL hash_table_add(hash_table, tmp_hash(i)%c, tmp_hash(i)%p)
     453              :             END IF
     454              :          END DO
     455            0 :          DEALLOCATE (tmp_hash)
     456              :       END IF
     457              : 
     458      8086853 :       hash_table%nele = hash_table%nele + 1
     459      8086853 :       i = IAND(c*hash_table%prime, hash_table%nmax)
     460              : 
     461      8086853 :       DO j = i, hash_table%nmax
     462      8086853 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     463      8086853 :             hash_table%table(j)%c = c
     464      8086853 :             hash_table%table(j)%p = p
     465      8086853 :             RETURN
     466              :          END IF
     467              :       END DO
     468            0 :       DO j = 0, i - 1
     469            0 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     470            0 :             hash_table%table(j)%c = c
     471            0 :             hash_table%table(j)%p = p
     472            0 :             RETURN
     473              :          END IF
     474              :       END DO
     475              : 
     476              :    END SUBROUTINE hash_table_add
     477              : 
     478              : ! **************************************************************************************************
     479              : !> \brief ...
     480              : !> \param hash_table ...
     481              : !> \param c ...
     482              : !> \return ...
     483              : ! **************************************************************************************************
     484    120779278 :    PURE FUNCTION hash_table_get(hash_table, c) RESULT(p)
     485              :       TYPE(hash_table_type), INTENT(IN)                  :: hash_table
     486              :       INTEGER, INTENT(IN)                                :: c
     487              :       INTEGER                                            :: p
     488              : 
     489              :       INTEGER                                            :: i, j
     490              : 
     491    120779278 :       i = IAND(c*hash_table%prime, hash_table%nmax)
     492              : 
     493              :       ! catch the likely case first
     494    120779278 :       IF (hash_table%table(i)%c == c) THEN
     495    116542292 :          p = hash_table%table(i)%p
     496    116542292 :          RETURN
     497              :       END IF
     498              : 
     499      4236986 :       DO j = i, hash_table%nmax
     500      4236986 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     501      4236986 :             p = hash_table%table(j)%p
     502      4236986 :             RETURN
     503              :          END IF
     504              :       END DO
     505            0 :       DO j = 0, i - 1
     506            0 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     507            0 :             p = hash_table%table(j)%p
     508            0 :             RETURN
     509              :          END IF
     510              :       END DO
     511              : 
     512              :       ! we should never reach this point.
     513    120779278 :       p = HUGE(p)
     514              : 
     515              :    END FUNCTION hash_table_get
     516              : 
     517              : ! End of hashtable
     518              : ! --------------------------------------------------------------------------------------------------
     519              : 
     520              : ! **************************************************************************************************
     521              : !> \brief the real driver routine for the multiply, not all symmetries implemented yet
     522              : !> \param matrix ...
     523              : !> \param vec_in ...
     524              : !> \param vec_out ...
     525              : !> \param alpha ...
     526              : !> \param beta ...
     527              : !> \param work_row ...
     528              : !> \param work_col ...
     529              : ! **************************************************************************************************
     530       807402 :    SUBROUTINE dbcsr_matrix_colvec_multiply(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     531              :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     532              :       REAL(kind=real_8)                                  :: alpha, beta
     533              :       TYPE(dbcsr_type)                                   :: work_row, work_col
     534              : 
     535              :       CHARACTER                                          :: matrix_type
     536              : 
     537       807402 :       CALL dbcsr_get_info(matrix, matrix_type=matrix_type)
     538              : 
     539       534493 :       SELECT CASE (matrix_type)
     540              :       CASE (dbcsr_type_no_symmetry)
     541       534493 :          CALL dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     542              :       CASE (dbcsr_type_symmetric)
     543       272909 :          CALL dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     544              :       CASE (dbcsr_type_antisymmetric)
     545              :          ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
     546            0 :          CPABORT("NYI, antisymmetric matrix not permitted")
     547              :       CASE DEFAULT
     548       807402 :          CPABORT("Unknown matrix type, ...")
     549              :       END SELECT
     550              : 
     551       807402 :    END SUBROUTINE dbcsr_matrix_colvec_multiply
     552              : 
     553              : ! **************************************************************************************************
     554              : !> \brief low level routines for matrix vector multiplies
     555              : !> \param matrix ...
     556              : !> \param vec_in ...
     557              : !> \param vec_out ...
     558              : !> \param alpha ...
     559              : !> \param beta ...
     560              : !> \param work_row ...
     561              : !> \param work_col ...
     562              : ! **************************************************************************************************
     563       534493 :    SUBROUTINE dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     564              :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     565              :       REAL(kind=real_8)                                  :: alpha, beta
     566              :       TYPE(dbcsr_type)                                   :: work_row, work_col
     567              : 
     568              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_vector_mult'
     569              : 
     570              :       INTEGER                                            :: col, handle, handle1, ithread, mypcol, &
     571              :                                                             myprow, ncols, nrows, pcol, prow, &
     572              :                                                             prow_handle, row
     573       534493 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec
     574       534493 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: data_d, vec_res
     575              :       TYPE(dbcsr_distribution_type)                      :: dist
     576              :       TYPE(dbcsr_iterator_type)                          :: iter
     577       534493 :       TYPE(fast_vec_access_type)                         :: fast_vec_col, fast_vec_row
     578              :       TYPE(mp_comm_type)                                 :: prow_group
     579              : 
     580       534493 :       CALL timeset(routineN, handle)
     581       534493 :       ithread = 0
     582              : 
     583              :       ! Collect some data about the parallel environment. We will use them later to move the vector around
     584       534493 :       CALL dbcsr_get_info(matrix, distribution=dist)
     585       534493 :       CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)
     586              : 
     587       534493 :       CALL prow_group%set_handle(prow_handle)
     588              : 
     589       534493 :       CALL create_fast_row_vec_access(work_row, fast_vec_row)
     590       534493 :       CALL create_fast_col_vec_access(work_col, fast_vec_col)
     591              : 
     592              :       ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
     593       534493 :       CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)
     594              : 
     595              :       ! Set the work vector for the results to 0
     596       534493 :       CALL dbcsr_set(work_col, 0.0_real_8)
     597              : 
     598              :       ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
     599              :       ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
     600       534493 :       CALL timeset(routineN//"_local_mm", handle1)
     601              : 
     602              : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) &
     603       534493 : !$OMP          SHARED(matrix,fast_vec_col,fast_vec_row)
     604              : !$    ithread = omp_get_thread_num()
     605              :       CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
     606              :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     607              :          CALL dbcsr_iterator_next_block(iter, row, col, data_d)
     608              :          prow = hash_table_get(fast_vec_col%hash_table, row)
     609              :          IF (fast_vec_col%blk_map(prow)%assigned_thread .NE. ithread) CYCLE
     610              :          pcol = hash_table_get(fast_vec_row%hash_table, col)
     611              :          IF (SIZE(fast_vec_col%blk_map(prow)%ptr, 1) .EQ. 0 .OR. &
     612              :              SIZE(fast_vec_col%blk_map(prow)%ptr, 2) .EQ. 0 .OR. &
     613              :              SIZE(data_d, 2) .EQ. 0) CYCLE
     614              :          CALL dgemm('N', 'T', SIZE(fast_vec_col%blk_map(prow)%ptr, 1), &
     615              :                     SIZE(fast_vec_col%blk_map(prow)%ptr, 2), &
     616              :                     SIZE(data_d, 2), &
     617              :                     1.0_dp, &
     618              :                     data_d, &
     619              :                     SIZE(fast_vec_col%blk_map(prow)%ptr, 1), &
     620              :                     fast_vec_row%blk_map(pcol)%ptr, &
     621              :                     SIZE(fast_vec_col%blk_map(prow)%ptr, 2), &
     622              :                     1.0_dp, &
     623              :                     fast_vec_col%blk_map(prow)%ptr, &
     624              :                     SIZE(fast_vec_col%blk_map(prow)%ptr, 1))
     625              :       END DO
     626              :       CALL dbcsr_iterator_stop(iter)
     627              : !$OMP END PARALLEL
     628              : 
     629       534493 :       CALL timestop(handle1)
     630              : 
     631              :       ! sum all the data onto the first processor col where the original vector is stored
     632       534493 :       data_vec => dbcsr_get_data_p(work_col)
     633       534493 :       CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
     634     11753915 :       CALL prow_group%sum(data_vec(1:nrows*ncols))
     635              : 
     636              :       ! Local copy on the first mpi col (as this is the localtion of the vec_res blocks) of the result vector
     637              :       ! from the replicated to the original vector. Let's play it safe and use the iterator
     638       534493 :       CALL dbcsr_iterator_start(iter, vec_out)
     639      1131963 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     640       597470 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
     641       597470 :          prow = hash_table_get(fast_vec_col%hash_table, row)
     642      1131963 :          IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
     643      6804651 :             vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
     644              :          ELSE
     645            0 :             vec_res(:, :) = beta*vec_res(:, :)
     646              :          END IF
     647              :       END DO
     648       534493 :       CALL dbcsr_iterator_stop(iter)
     649              : 
     650       534493 :       CALL release_fast_vec_access(fast_vec_row)
     651       534493 :       CALL release_fast_vec_access(fast_vec_col)
     652              : 
     653       534493 :       CALL timestop(handle)
     654              : 
     655      1068986 :    END SUBROUTINE dbcsr_matrix_vector_mult
     656              : 
     657              : ! **************************************************************************************************
     658              : !> \brief ...
     659              : !> \param matrix ...
     660              : !> \param vec_in ...
     661              : !> \param vec_out ...
     662              : !> \param alpha ...
     663              : !> \param beta ...
     664              : !> \param work_row ...
     665              : !> \param work_col ...
     666              : !> \param skip_diag ...
     667              : ! **************************************************************************************************
     668            0 :    SUBROUTINE dbcsr_matrixT_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
     669              :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     670              :       REAL(kind=real_8)                                  :: alpha, beta
     671              :       TYPE(dbcsr_type)                                   :: work_row, work_col
     672              :       LOGICAL                                            :: skip_diag
     673              : 
     674              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrixT_vector_mult'
     675              : 
     676              :       INTEGER                                            :: col, col_size, handle, handle1, ithread, &
     677              :                                                             mypcol, myprow, ncols, nrows, pcol, &
     678              :                                                             pcol_handle, prow, prow_handle, row, &
     679              :                                                             row_size
     680            0 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec
     681            0 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: data_d, vec_bl, vec_res
     682              :       TYPE(dbcsr_distribution_type)                      :: dist
     683              :       TYPE(dbcsr_iterator_type)                          :: iter
     684            0 :       TYPE(fast_vec_access_type)                         :: fast_vec_col, fast_vec_row
     685              :       TYPE(mp_comm_type)                                 :: pcol_group, prow_group
     686              : 
     687            0 :       CALL timeset(routineN, handle)
     688            0 :       ithread = 0
     689              : 
     690              :       ! Collect some data about the parallel environment. We will use them later to move the vector around
     691            0 :       CALL dbcsr_get_info(matrix, distribution=dist)
     692            0 :       CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
     693              : 
     694            0 :       CALL prow_group%set_handle(prow_handle)
     695            0 :       CALL pcol_group%set_handle(pcol_handle)
     696              : 
     697            0 :       CALL create_fast_row_vec_access(work_row, fast_vec_row)
     698            0 :       CALL create_fast_col_vec_access(work_col, fast_vec_col)
     699              : 
     700              :       ! Set the work vector for the results to 0
     701            0 :       CALL dbcsr_set(work_row, 0.0_real_8)
     702              : 
     703              :       ! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
     704            0 :       CALL dbcsr_iterator_start(iter, vec_in)
     705            0 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     706            0 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
     707            0 :          prow = hash_table_get(fast_vec_col%hash_table, row)
     708            0 :          fast_vec_col%blk_map(prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
     709              :       END DO
     710            0 :       CALL dbcsr_iterator_stop(iter)
     711              :       ! Replicate the data on all processore in the row
     712            0 :       data_vec => dbcsr_get_data_p(work_col)
     713            0 :       CALL prow_group%bcast(data_vec, 0)
     714              : 
     715              :       ! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
     716            0 :       CALL timeset(routineN//"local_mm", handle1)
     717            0 :       CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
     718              : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
     719            0 : !$OMP          SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
     720              : !$    ithread = omp_get_thread_num()
     721              :       CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
     722              :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     723              :          CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
     724              :          IF (skip_diag .AND. col == row) CYCLE
     725              :          prow = hash_table_get(fast_vec_col%hash_table, row)
     726              :          pcol = hash_table_get(fast_vec_row%hash_table, col)
     727              :          IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
     728              :              ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
     729              :             IF (fast_vec_row%blk_map(pcol)%assigned_thread .NE. ithread) CYCLE
     730              :             fast_vec_row%blk_map(pcol)%ptr = fast_vec_row%blk_map(pcol)%ptr + &
     731              :                                              MATMUL(TRANSPOSE(fast_vec_col%blk_map(prow)%ptr), data_d)
     732              :          ELSE
     733              :             prow = hash_table_get(fast_vec_row%hash_table, row)
     734              :             pcol = hash_table_get(fast_vec_col%hash_table, col)
     735              :             IF (fast_vec_row%blk_map(prow)%assigned_thread .NE. ithread) CYCLE
     736              :             fast_vec_row%blk_map(prow)%ptr = fast_vec_row%blk_map(prow)%ptr + &
     737              :                                              MATMUL(TRANSPOSE(fast_vec_col%blk_map(pcol)%ptr), TRANSPOSE(data_d))
     738              :          END IF
     739              :       END DO
     740              :       CALL dbcsr_iterator_stop(iter)
     741              : !$OMP END PARALLEL
     742              : 
     743            0 :       CALL timestop(handle1)
     744              : 
     745              :       ! sum all the data within a processor column to obtain the replicated result
     746            0 :       data_vec => dbcsr_get_data_p(work_row)
     747              :       ! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
     748            0 :       CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
     749            0 :       CALL pcol_group%sum(data_vec(1:nrows*ncols))
     750              : 
     751              :       ! Convert the result to a column wise distribution
     752            0 :       CALL dbcsr_rep_row_to_rep_col_vec(work_col, work_row, fast_vec_row)
     753              : 
     754              :       ! Create_the final vector by summing it to the result vector which lives on proc_col 0
     755            0 :       CALL dbcsr_iterator_start(iter, vec_out)
     756            0 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     757            0 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
     758            0 :          prow = hash_table_get(fast_vec_col%hash_table, row)
     759            0 :          IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
     760            0 :             vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
     761              :          ELSE
     762            0 :             vec_res(:, :) = beta*vec_res(:, :)
     763              :          END IF
     764              :       END DO
     765            0 :       CALL dbcsr_iterator_stop(iter)
     766              : 
     767            0 :       CALL timestop(handle)
     768              : 
     769            0 :    END SUBROUTINE dbcsr_matrixT_vector_mult
     770              : 
     771              : ! **************************************************************************************************
     772              : !> \brief ...
     773              : !> \param vec_in ...
     774              : !> \param rep_col_vec ...
     775              : !> \param rep_row_vec ...
     776              : !> \param fast_vec_col ...
     777              : ! **************************************************************************************************
     778      6459216 :    SUBROUTINE dbcsr_col_vec_to_rep_row(vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
     779              :       TYPE(dbcsr_type)                                   :: vec_in, rep_col_vec, rep_row_vec
     780              :       TYPE(fast_vec_access_type), INTENT(IN)             :: fast_vec_col
     781              : 
     782              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_col_vec_to_rep_row'
     783              : 
     784              :       INTEGER                                            :: col, handle, mypcol, myprow, ncols, &
     785              :                                                             nrows, pcol_handle, prow_handle, row
     786       807402 :       INTEGER, DIMENSION(:), POINTER                     :: local_cols, row_dist
     787       807402 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec, data_vec_rep
     788       807402 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_row
     789              :       TYPE(dbcsr_distribution_type)                      :: dist_in, dist_rep_col
     790              :       TYPE(dbcsr_iterator_type)                          :: iter
     791              :       TYPE(mp_comm_type)                                 :: pcol_group, prow_group
     792              : 
     793       807402 :       CALL timeset(routineN, handle)
     794              : 
     795              :       ! get information about the parallel environment
     796       807402 :       CALL dbcsr_get_info(vec_in, distribution=dist_in)
     797              :       CALL dbcsr_distribution_get(dist_in, &
     798              :                                   prow_group=prow_handle, &
     799              :                                   pcol_group=pcol_handle, &
     800              :                                   myprow=myprow, &
     801       807402 :                                   mypcol=mypcol)
     802              : 
     803       807402 :       CALL prow_group%set_handle(prow_handle)
     804       807402 :       CALL pcol_group%set_handle(pcol_handle)
     805              : 
     806              :       ! Get the vector which tells us which blocks are local to which processor row in the col vec
     807       807402 :       CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
     808       807402 :       CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)
     809              : 
     810              :       ! Copy the local vector to the replicated on the first processor column (this is where vec_in lives)
     811       807402 :       CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     812       807402 :       data_vec_rep => dbcsr_get_data_p(rep_col_vec)
     813       807402 :       data_vec => dbcsr_get_data_p(vec_in)
     814     24851738 :       IF (mypcol == 0) data_vec_rep(1:nrows*ncols) = data_vec(1:nrows*ncols)
     815              :       ! Replicate the data along the row
     816     24851738 :       CALL prow_group%bcast(data_vec_rep(1:nrows*ncols), 0)
     817              : 
     818              :       ! Here it gets a bit tricky as we are dealing with two different parallel layouts:
     819              :       ! The rep_col_vec contains all blocks local to the row distribution of the vector.
     820              :       ! The rep_row_vec only needs the fraction which is local to the col distribution.
     821              :       ! However in most cases this won't the complete set of block which can be obtained from col_vector p_row i
     822              :       ! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available
     823              :       ! on every replica in the processor column, by summing along the column we end up with the complete vector everywhere
     824              :       ! Hope this clarifies the idea
     825       807402 :       CALL dbcsr_set(rep_row_vec, 0.0_real_8)
     826       807402 :       CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
     827       807402 :       CALL dbcsr_iterator_start(iter, rep_row_vec)
     828      4094486 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     829      3287084 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
     830      4094486 :          IF (row_dist(col) == myprow) THEN
     831     25695092 :             vec_row = TRANSPOSE(fast_vec_col%blk_map(hash_table_get(fast_vec_col%hash_table, col))%ptr)
     832              :          END IF
     833              :       END DO
     834       807402 :       CALL dbcsr_iterator_stop(iter)
     835       807402 :       CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     836       807402 :       data_vec_rep => dbcsr_get_data_p(rep_row_vec)
     837     48757036 :       CALL pcol_group%sum(data_vec_rep(1:ncols*nrows))
     838              : 
     839       807402 :       CALL timestop(handle)
     840              : 
     841       807402 :    END SUBROUTINE dbcsr_col_vec_to_rep_row
     842              : 
     843              : ! **************************************************************************************************
     844              : !> \brief ...
     845              : !> \param rep_col_vec ...
     846              : !> \param rep_row_vec ...
     847              : !> \param fast_vec_row ...
     848              : !> \param fast_vec_col_add ...
     849              : ! **************************************************************************************************
     850      1637454 :    SUBROUTINE dbcsr_rep_row_to_rep_col_vec(rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
     851              :       TYPE(dbcsr_type)                                   :: rep_col_vec, rep_row_vec
     852              :       TYPE(fast_vec_access_type)                         :: fast_vec_row
     853              :       TYPE(fast_vec_access_type), OPTIONAL               :: fast_vec_col_add
     854              : 
     855              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_rep_row_to_rep_col_vec'
     856              : 
     857              :       INTEGER                                            :: col, handle, mypcol, myprow, ncols, &
     858              :                                                             nrows, prow_handle, row
     859       272909 :       INTEGER, DIMENSION(:), POINTER                     :: col_dist
     860       272909 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec_rep
     861       272909 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_col
     862              :       TYPE(dbcsr_distribution_type)                      :: dist_rep_col, dist_rep_row
     863              :       TYPE(dbcsr_iterator_type)                          :: iter
     864              :       TYPE(mp_comm_type)                                 :: prow_group
     865              : 
     866       272909 :       CALL timeset(routineN, handle)
     867              : 
     868              :       ! get information about the parallel environment
     869       272909 :       CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
     870              :       CALL dbcsr_distribution_get(dist_rep_col, &
     871              :                                   prow_group=prow_handle, &
     872              :                                   myprow=myprow, &
     873       272909 :                                   mypcol=mypcol)
     874              : 
     875       272909 :       CALL prow_group%set_handle(prow_handle)
     876              : 
     877              :       ! Get the vector which tells us which blocks are local to which processor col in the row vec
     878       272909 :       CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
     879       272909 :       CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)
     880              : 
     881              :       ! The same trick as described above with opposite direction
     882       272909 :       CALL dbcsr_set(rep_col_vec, 0.0_real_8)
     883       272909 :       CALL dbcsr_iterator_start(iter, rep_col_vec)
     884      1326195 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     885      1053286 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
     886      1053286 :          IF (col_dist(row) == mypcol) THEN
     887      8519029 :             vec_col = TRANSPOSE(fast_vec_row%blk_map(hash_table_get(fast_vec_row%hash_table, row))%ptr)
     888              :          END IF
     889              :          ! this one is special and allows to add the elements of a not yet summed replicated
     890              :          ! column vector as it appears in M*V(row_rep) as result. Save an parallel summation in the symmetric case
     891      1326195 :          IF (PRESENT(fast_vec_col_add)) THEN
     892      8519029 :             vec_col = vec_col + fast_vec_col_add%blk_map(hash_table_get(fast_vec_col_add%hash_table, row))%ptr(:, :)
     893              :          END IF
     894              :       END DO
     895       272909 :       CALL dbcsr_iterator_stop(iter)
     896       272909 :       CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     897       272909 :       data_vec_rep => dbcsr_get_data_p(rep_col_vec)
     898     13097823 :       CALL prow_group%sum(data_vec_rep(1:nrows*ncols))
     899              : 
     900       272909 :       CALL timestop(handle)
     901              : 
     902       272909 :    END SUBROUTINE dbcsr_rep_row_to_rep_col_vec
     903              : 
     904              : ! **************************************************************************************************
     905              : !> \brief ...
     906              : !> \param matrix ...
     907              : !> \param vec_in ...
     908              : !> \param vec_out ...
     909              : !> \param alpha ...
     910              : !> \param beta ...
     911              : !> \param work_row ...
     912              : !> \param work_col ...
     913              : ! **************************************************************************************************
     914       272909 :    SUBROUTINE dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     915              :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     916              :       REAL(kind=real_8)                                  :: alpha, beta
     917              :       TYPE(dbcsr_type)                                   :: work_row, work_col
     918              : 
     919              :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_sym_matrix_vector_mult'
     920              : 
     921              :       INTEGER                                            :: col, handle, handle1, ithread, mypcol, &
     922              :                                                             myprow, ncols, nrows, pcol, &
     923              :                                                             pcol_handle, prow, row, rpcol, rprow, &
     924              :                                                             vec_dim
     925       272909 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec
     926       272909 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: data_d, vec_res
     927              :       TYPE(dbcsr_distribution_type)                      :: dist
     928              :       TYPE(dbcsr_iterator_type)                          :: iter
     929              :       TYPE(dbcsr_type)                                   :: result_col, result_row
     930       272909 :       TYPE(fast_vec_access_type)                         :: fast_vec_col, fast_vec_row, &
     931       272909 :                                                             res_fast_vec_col, res_fast_vec_row
     932              :       TYPE(mp_comm_type)                                 :: pcol_group
     933              : 
     934       272909 :       CALL timeset(routineN, handle)
     935       272909 :       ithread = 0
     936              :       ! We need some work matrices as we try to exploit operations on the replicated vectors which are duplicated otherwise
     937       272909 :       CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)
     938              :       ! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive
     939       272909 :       CALL dbcsr_set(work_col, 0.0_real_8)
     940       272909 :       CALL dbcsr_copy(result_col, work_col)
     941       272909 :       CALL dbcsr_set(work_row, 0.0_real_8)
     942       272909 :       CALL dbcsr_copy(result_row, work_row)
     943              : 
     944              :       ! Collect some data about the parallel environment. We will use them later to move the vector around
     945       272909 :       CALL dbcsr_get_info(matrix=matrix, distribution=dist)
     946       272909 :       CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
     947              : 
     948       272909 :       CALL pcol_group%set_handle(pcol_handle)
     949              : 
     950       272909 :       CALL create_fast_row_vec_access(work_row, fast_vec_row)
     951       272909 :       CALL create_fast_col_vec_access(work_col, fast_vec_col)
     952       272909 :       CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
     953       272909 :       CALL create_fast_col_vec_access(result_col, res_fast_vec_col)
     954              : 
     955              :       ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
     956       272909 :       CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)
     957              : 
     958              :       ! Probably I should rename the routine above as it delivers both the replicated row and column vector
     959              : 
     960              :       ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
     961              :       ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
     962       272909 :       CALL timeset(routineN//"_local_mm", handle1)
     963              : 
     964              :       !------ perform the multiplication, we have to take car to take the correct blocks ----------
     965              : 
     966              : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
     967       272909 : !$OMP          SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
     968              : !$    ithread = omp_get_thread_num()
     969              :       CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
     970              :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     971              :          CALL dbcsr_iterator_next_block(iter, row, col, data_d)
     972              :          pcol = hash_table_get(fast_vec_row%hash_table, col)
     973              :          rprow = hash_table_get(res_fast_vec_col%hash_table, row)
     974              :          IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
     975              :              ASSOCIATED(res_fast_vec_col%blk_map(rprow)%ptr)) THEN
     976              :             IF (res_fast_vec_col%blk_map(rprow)%assigned_thread .EQ. ithread) THEN
     977              :                res_fast_vec_col%blk_map(rprow)%ptr = res_fast_vec_col%blk_map(rprow)%ptr + &
     978              :                                                      MATMUL(data_d, TRANSPOSE(fast_vec_row%blk_map(pcol)%ptr))
     979              :             END IF
     980              :             prow = hash_table_get(fast_vec_col%hash_table, row)
     981              :             rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
     982              :             IF (res_fast_vec_row%blk_map(rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
     983              :                res_fast_vec_row%blk_map(rpcol)%ptr = res_fast_vec_row%blk_map(rpcol)%ptr + &
     984              :                                                      MATMUL(TRANSPOSE(fast_vec_col%blk_map(prow)%ptr), data_d)
     985              :             END IF
     986              :          ELSE
     987              :             rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
     988              :             prow = hash_table_get(fast_vec_row%hash_table, row)
     989              :             IF (res_fast_vec_col%blk_map(rpcol)%assigned_thread .EQ. ithread) THEN
     990              :                res_fast_vec_col%blk_map(rpcol)%ptr = res_fast_vec_col%blk_map(rpcol)%ptr + &
     991              :                                                      TRANSPOSE(MATMUL(fast_vec_row%blk_map(prow)%ptr, data_d))
     992              :             END IF
     993              :             rprow = hash_table_get(res_fast_vec_row%hash_table, row)
     994              :             pcol = hash_table_get(fast_vec_col%hash_table, col)
     995              :             IF (res_fast_vec_row%blk_map(rprow)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
     996              :                res_fast_vec_row%blk_map(rprow)%ptr = res_fast_vec_row%blk_map(rprow)%ptr + &
     997              :                                                      TRANSPOSE(MATMUL(data_d, fast_vec_col%blk_map(pcol)%ptr))
     998              :             END IF
     999              :          END IF
    1000              :       END DO
    1001              :       CALL dbcsr_iterator_stop(iter)
    1002              : !$OMP END PARALLEL
    1003              : 
    1004       272909 :       CALL timestop(handle1)
    1005              : 
    1006              :       ! sum all the data within a processor column to obtain the replicated result from lower
    1007       272909 :       data_vec => dbcsr_get_data_p(result_row)
    1008       272909 :       CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)
    1009              : 
    1010     25812943 :       CALL pcol_group%sum(data_vec(1:nrows*ncols))
    1011              : 
    1012              :       ! Convert the results to a column wise distribution, this is a bit involved as the result_row is fully replicated
    1013              :       ! While the result_col still has the partial results in parallel. The routine below takes care of that and saves a
    1014              :       ! parallel summation. Of the res_row vectors are created only taking the appropriate element (0 otherwise) while the res_col
    1015              :       ! parallel bits are locally added. The sum magically creates the correct vector
    1016       272909 :       CALL dbcsr_rep_row_to_rep_col_vec(work_col, result_row, res_fast_vec_row, res_fast_vec_col)
    1017              : 
    1018              :       ! Create_the final vector by summing it to the result vector which lives on proc_col 0 lower
    1019       272909 :       CALL dbcsr_iterator_start(iter, vec_out)
    1020      1326195 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
    1021      1053286 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
    1022      1053286 :          prow = hash_table_get(fast_vec_col%hash_table, row)
    1023      1326195 :          IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
    1024      8519029 :             vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map(prow)%ptr(:, :))
    1025              :          ELSE
    1026            0 :             vec_res(:, :) = beta*vec_res(:, :)
    1027              :          END IF
    1028              :       END DO
    1029       272909 :       CALL dbcsr_iterator_stop(iter)
    1030              : 
    1031       272909 :       CALL release_fast_vec_access(fast_vec_row)
    1032       272909 :       CALL release_fast_vec_access(fast_vec_col)
    1033       272909 :       CALL release_fast_vec_access(res_fast_vec_row)
    1034       272909 :       CALL release_fast_vec_access(res_fast_vec_col)
    1035              : 
    1036       272909 :       CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)
    1037              : 
    1038       272909 :       CALL timestop(handle)
    1039              : 
    1040       545818 :    END SUBROUTINE dbcsr_sym_matrix_vector_mult
    1041              : 
    1042            0 : END MODULE arnoldi_vector
        

Generated by: LCOV version 2.0-1