LCOV - code coverage report
Current view: top level - src/dbcsrx - dbcsr_vector.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:b279b6b) Lines: 285 360 79.2 %
Date: 2024-04-24 07:13:09 Functions: 18 34 52.9 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \brief operations for skinny matrices/vectors expressed in dbcsr form
      10             : !> \par History
      11             : !>       2014.10 created [Florian Schiffmann]
      12             : !> \author Florian Schiffmann
      13             : ! **************************************************************************************************
      14             : 
      15             : MODULE dbcsr_vector
      16             :    USE dbcsr_api, ONLY: dbcsr_copy, &
      17             :                         dbcsr_create, &
      18             :                         dbcsr_distribution_get, &
      19             :                         dbcsr_distribution_new, &
      20             :                         dbcsr_distribution_release, &
      21             :                         dbcsr_distribution_type, &
      22             :                         dbcsr_get_info, &
      23             :                         dbcsr_iterator_blocks_left, &
      24             :                         dbcsr_iterator_next_block, &
      25             :                         dbcsr_iterator_start, &
      26             :                         dbcsr_iterator_stop, &
      27             :                         dbcsr_iterator_type, &
      28             :                         dbcsr_release, &
      29             :                         dbcsr_reserve_all_blocks, &
      30             :                         dbcsr_set, dbcsr_get_data_p, &
      31             :                         dbcsr_type, &
      32             :                         dbcsr_type_antisymmetric, &
      33             :                         dbcsr_type_complex_8, &
      34             :                         dbcsr_type_complex_8, &
      35             :                         dbcsr_type_no_symmetry, &
      36             :                         dbcsr_type_real_8, &
      37             :                         dbcsr_type_real_8, &
      38             :                         dbcsr_type_symmetric
      39             :    USE kinds, ONLY: dp, &
      40             :                     real_8
      41             :    USE message_passing, ONLY: mp_comm_type
      42             : 
      43             : #include "../base/base_uses.f90"
      44             : 
      45             : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
      46             : 
      47             :    IMPLICIT NONE
      48             : 
      49             :    PRIVATE
      50             : 
      51             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_vector_operations'
      52             : 
      53             : ! **************************************************************************************************
      54             : !> \brief Types needed for the hashtable.
      55             : ! **************************************************************************************************
      56             :    TYPE ele_type
      57             :       INTEGER :: c = 0
      58             :       INTEGER :: p = 0
      59             :    END TYPE ele_type
      60             : 
      61             :    TYPE hash_table_type
      62             :       TYPE(ele_type), DIMENSION(:), POINTER :: table => NULL()
      63             :       INTEGER :: nele = 0
      64             :       INTEGER :: nmax = 0
      65             :       INTEGER :: prime = 0
      66             :    END TYPE hash_table_type
      67             : 
      68             : ! **************************************************************************************************
      69             : !> \brief Types needed for fast access to distributed dbcsr vectors.
      70             : ! **************************************************************************************************
      71             :    TYPE block_ptr_d
      72             :       REAL(real_8), DIMENSION(:, :), POINTER          :: ptr => NULL()
      73             :       INTEGER                                         :: assigned_thread = -1
      74             :    END TYPE
      75             :    TYPE block_ptr_z
      76             :       COMPLEX(real_8), DIMENSION(:, :), POINTER       :: ptr => NULL()
      77             :       INTEGER                                         :: assigned_thread = -1
      78             :    END TYPE
      79             : 
      80             :    TYPE fast_vec_access_type
      81             :       TYPE(hash_table_type) :: hash_table = hash_table_type()
      82             :       TYPE(block_ptr_d), DIMENSION(:), ALLOCATABLE :: blk_map_d
      83             :       TYPE(block_ptr_z), DIMENSION(:), ALLOCATABLE :: blk_map_z
      84             :    END TYPE
      85             : 
      86             :    PUBLIC :: dbcsr_matrix_colvec_multiply, &
      87             :              create_col_vec_from_matrix, &
      88             :              create_row_vec_from_matrix, &
      89             :              create_replicated_col_vec_from_matrix, &
      90             :              create_replicated_row_vec_from_matrix
      91             : 
      92             :    INTERFACE dbcsr_matrix_colvec_multiply
      93             :       MODULE PROCEDURE dbcsr_matrix_colvec_multiply_d
      94             :       MODULE PROCEDURE dbcsr_matrix_colvec_multiply_z
      95             :    END INTERFACE
      96             : 
      97             : CONTAINS
      98             : 
      99             : ! **************************************************************************************************
     100             : !> \brief creates a dbcsr col vector like object which lives on proc_col 0
     101             : !>        and has the same row dist as the template matrix
     102             : !>        the returned matrix is fully allocated and all blocks are set to 0
     103             : !>        this is not a sparse object (and must never be)
     104             : !> \param dbcsr_vec  the vector object to create must be allocated but not initialized
     105             : !> \param matrix a dbcsr matrix used as template
     106             : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     107             : ! **************************************************************************************************
     108      265448 :    SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
     109             :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     110             :       INTEGER                                            :: ncol
     111             : 
     112             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_col_vec_from_matrix'
     113             : 
     114             :       INTEGER                                            :: handle, npcols, data_type
     115      265448 :       INTEGER, DIMENSION(:), POINTER                     :: row_blk_size, col_blk_size, row_dist, col_dist
     116             :       TYPE(dbcsr_distribution_type)                      :: dist_col_vec, dist
     117             : 
     118      265448 :       CALL timeset(routineN, handle)
     119             : 
     120      265448 :       CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
     121      265448 :       CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
     122             : 
     123      265448 :       ALLOCATE (col_dist(1), col_blk_size(1))
     124      265448 :       col_dist(1) = 0
     125      265448 :       col_blk_size(1) = ncol
     126      265448 :       CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     127             : 
     128             :       CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
     129             :                         matrix_type=dbcsr_type_no_symmetry, &
     130             :                         row_blk_size=row_blk_size, &
     131             :                         col_blk_size=col_blk_size, &
     132      265448 :                         nze=0, data_type=data_type)
     133      265448 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     134             : 
     135      265448 :       CALL dbcsr_distribution_release(dist_col_vec)
     136      265448 :       DEALLOCATE (col_dist, col_blk_size)
     137      265448 :       CALL timestop(handle)
     138             : 
     139      265448 :    END SUBROUTINE create_col_vec_from_matrix
     140             : 
     141             : ! **************************************************************************************************
     142             : !> \brief creates a dbcsr row vector like object which lives on proc_row 0
     143             : !>        and has the same row dist as the template matrix
     144             : !>        the returned matrix is fully allocated and all blocks are set to 0
     145             : !>        this is not a sparse object (and must never be)
     146             : !> \param dbcsr_vec ...
     147             : !> \param matrix a dbcsr matrix used as template
     148             : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     149             : ! **************************************************************************************************
     150           0 :    SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
     151             :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     152             :       INTEGER                                            :: nrow
     153             : 
     154             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_row_vec_from_matrix'
     155             : 
     156             :       INTEGER                                            :: handle, nprows, data_type
     157           0 :       INTEGER, DIMENSION(:), POINTER                     :: row_blk_size, col_blk_size, row_dist, col_dist
     158             :       TYPE(dbcsr_distribution_type)                      :: dist_row_vec, dist
     159             : 
     160           0 :       CALL timeset(routineN, handle)
     161             : 
     162           0 :       CALL dbcsr_get_info(matrix, data_type=data_type, col_blk_size=col_blk_size, distribution=dist)
     163           0 :       CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
     164             : 
     165           0 :       ALLOCATE (row_dist(1), row_blk_size(1))
     166           0 :       row_dist(1) = 0
     167           0 :       row_blk_size(1) = nrow
     168           0 :       CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     169             : 
     170             :       CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, &
     171             :                         matrix_type=dbcsr_type_no_symmetry, &
     172             :                         row_blk_size=row_blk_size, &
     173             :                         col_blk_size=col_blk_size, &
     174           0 :                         nze=0, data_type=data_type)
     175           0 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     176             : 
     177           0 :       CALL dbcsr_distribution_release(dist_row_vec)
     178           0 :       DEALLOCATE (row_dist, row_blk_size)
     179           0 :       CALL timestop(handle)
     180             : 
     181           0 :    END SUBROUTINE create_row_vec_from_matrix
     182             : 
     183             : ! **************************************************************************************************
     184             : !> \brief creates a col vector like object whose blocks can be replicated
     185             : !>        along the processor row and has the same row dist as the template matrix
     186             : !>        the returned matrix is fully allocated and all blocks are set to 0
     187             : !>        this is not a sparse object (and must never be)
     188             : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
     189             : !> \param matrix a dbcsr matrix used as template
     190             : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     191             : ! **************************************************************************************************
     192      130475 :    SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
     193             :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     194             :       INTEGER                                            :: ncol
     195             : 
     196             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_col_vec_from_matrix'
     197             : 
     198             :       INTEGER                                            :: handle, npcols, data_type, i
     199      130475 :       INTEGER, DIMENSION(:), POINTER                     :: row_blk_size, col_blk_size, row_dist, col_dist
     200             :       TYPE(dbcsr_distribution_type)                      :: dist_col_vec, dist
     201      130475 :       CALL timeset(routineN, handle)
     202             : 
     203      130475 :       CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
     204      130475 :       CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
     205             : 
     206      652375 :       ALLOCATE (col_dist(npcols), col_blk_size(npcols))
     207      260950 :       col_blk_size(:) = ncol
     208      260950 :       DO i = 0, npcols - 1
     209      260950 :          col_dist(i + 1) = i
     210             :       END DO
     211      130475 :       CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     212             : 
     213             :       CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
     214             :                         matrix_type=dbcsr_type_no_symmetry, &
     215             :                         row_blk_size=row_blk_size, &
     216             :                         col_blk_size=col_blk_size, &
     217      130475 :                         nze=0, data_type=data_type)
     218      130475 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     219             : 
     220      130475 :       CALL dbcsr_distribution_release(dist_col_vec)
     221      130475 :       DEALLOCATE (col_dist, col_blk_size)
     222      130475 :       CALL timestop(handle)
     223             : 
     224      130475 :    END SUBROUTINE create_replicated_col_vec_from_matrix
     225             : 
     226             : ! **************************************************************************************************
     227             : !> \brief creates a row vector like object whose blocks can be replicated
     228             : !>        along the processor col and has the same col dist as the template matrix
     229             : !>        the returned matrix is fully allocated and all blocks are set to 0
     230             : !>        this is not a sparse object (and must never be)
     231             : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
     232             : !> \param matrix a dbcsr matrix used as template
     233             : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     234             : ! **************************************************************************************************
     235      130475 :    SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
     236             :       TYPE(dbcsr_type)                                   :: dbcsr_vec
     237             :       TYPE(dbcsr_type)                                   :: matrix
     238             :       INTEGER                                            :: nrow
     239             : 
     240             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_row_vec_from_matrix'
     241             : 
     242             :       INTEGER                                            :: handle, i, nprows, data_type
     243      130475 :       INTEGER, DIMENSION(:), POINTER                     :: row_dist, col_dist, row_blk_size, col_blk_size
     244             :       TYPE(dbcsr_distribution_type)                      :: dist_row_vec, dist
     245             : 
     246      130475 :       CALL timeset(routineN, handle)
     247             : 
     248      130475 :       CALL dbcsr_get_info(matrix, distribution=dist, col_blk_size=col_blk_size, data_type=data_type)
     249      130475 :       CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
     250             : 
     251      652375 :       ALLOCATE (row_dist(nprows), row_blk_size(nprows))
     252      389070 :       row_blk_size(:) = nrow
     253      389070 :       DO i = 0, nprows - 1
     254      389070 :          row_dist(i + 1) = i
     255             :       END DO
     256      130475 :       CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     257             : 
     258             :       CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, dbcsr_type_no_symmetry, &
     259             :                         row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
     260      130475 :                         nze=0, data_type=data_type)
     261      130475 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     262             : 
     263      130475 :       CALL dbcsr_distribution_release(dist_row_vec)
     264      130475 :       DEALLOCATE (row_dist, row_blk_Size)
     265      130475 :       CALL timestop(handle)
     266             : 
     267      130475 :    END SUBROUTINE create_replicated_row_vec_from_matrix
     268             : 
     269             : ! **************************************************************************************************
     270             : !> \brief given a column vector, prepare the fast_vec_access container
     271             : !> \param vec ...
     272             : !> \param fast_vec_access ...
     273             : ! **************************************************************************************************
     274     2089030 :    SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access)
     275             :       TYPE(dbcsr_type)                                   :: vec
     276             :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     277             : 
     278             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access'
     279             : 
     280             :       INTEGER                                            :: handle, data_type
     281             : 
     282     1044515 :       CALL timeset(routineN, handle)
     283             : 
     284     1044515 :       CALL dbcsr_get_info(vec, data_type=data_type)
     285             : 
     286     1044515 :       SELECT CASE (data_type)
     287             :       CASE (dbcsr_type_real_8)
     288     1044515 :          CALL create_fast_col_vec_access_d(vec, fast_vec_access)
     289             :       CASE (dbcsr_type_complex_8)
     290     1044515 :          CALL create_fast_col_vec_access_z(vec, fast_vec_access)
     291             :       END SELECT
     292             : 
     293     1044515 :       CALL timestop(handle)
     294             : 
     295     1044515 :    END SUBROUTINE create_fast_col_vec_access
     296             : 
     297             : ! **************************************************************************************************
     298             : !> \brief given a row vector, prepare the fast_vec_access_container
     299             : !> \param vec ...
     300             : !> \param fast_vec_access ...
     301             : ! **************************************************************************************************
     302     2089030 :    SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access)
     303             :       TYPE(dbcsr_type)                                   :: vec
     304             :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     305             : 
     306             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access'
     307             : 
     308             :       INTEGER                                            :: handle, data_type
     309             : 
     310     1044515 :       CALL timeset(routineN, handle)
     311             : 
     312     1044515 :       CALL dbcsr_get_info(vec, data_type=data_type)
     313             : 
     314     1044515 :       SELECT CASE (data_type)
     315             :       CASE (dbcsr_type_real_8)
     316     1044515 :          CALL create_fast_row_vec_access_d(vec, fast_vec_access)
     317             :       CASE (dbcsr_type_complex_8)
     318     1044515 :          CALL create_fast_row_vec_access_z(vec, fast_vec_access)
     319             :       END SELECT
     320             : 
     321     1044515 :       CALL timestop(handle)
     322             : 
     323     1044515 :    END SUBROUTINE create_fast_row_vec_access
     324             : 
     325             : ! **************************************************************************************************
     326             : !> \brief release all memory associated with the fast_vec_access type
     327             : !> \param fast_vec_access ...
     328             : ! **************************************************************************************************
     329     2089030 :    SUBROUTINE release_fast_vec_access(fast_vec_access)
     330             :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     331             : 
     332             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'release_fast_vec_access'
     333             : 
     334             :       INTEGER                                            :: handle
     335             : 
     336     2089030 :       CALL timeset(routineN, handle)
     337             : 
     338     2089030 :       CALL hash_table_release(fast_vec_access%hash_table)
     339     2089030 :       IF (ALLOCATED(fast_vec_access%blk_map_d)) DEALLOCATE (fast_vec_access%blk_map_d)
     340     2089030 :       IF (ALLOCATED(fast_vec_access%blk_map_z)) DEALLOCATE (fast_vec_access%blk_map_z)
     341             : 
     342     2089030 :       CALL timestop(handle)
     343             : 
     344     2089030 :    END SUBROUTINE release_fast_vec_access
     345             : 
     346             : ! --------------------------------------------------------------------------------------------------
     347             : ! Beginning of hashtable.
     348             : ! this file can be 'INCLUDE'ed verbatim in various place, where it needs to be
     349             : ! part of the module to guarantee inlining
     350             : ! hashes (c,p) pairs, where p is assumed to be >0
     351             : ! on return (0 is used as a flag for not present)
     352             : !
     353             : !
     354             : ! **************************************************************************************************
     355             : !> \brief finds a prime equal or larger than i, needed at creation
     356             : !> \param i ...
     357             : !> \return ...
     358             : ! **************************************************************************************************
     359     2089030 :    FUNCTION matching_prime(i) RESULT(res)
     360             :       INTEGER, INTENT(IN)                      :: i
     361             :       INTEGER                                  :: res
     362             : 
     363             :       INTEGER                                  :: j
     364             : 
     365     2089030 :       res = i
     366     2089030 :       j = 0
     367     6451672 :       DO WHILE (j < res)
     368    55025594 :          DO j = 2, res - 1
     369    52936564 :             IF (MOD(res, j) == 0) THEN
     370     2273612 :                res = res + 1
     371     2273612 :                EXIT
     372             :             END IF
     373             :          END DO
     374             :       END DO
     375     2089030 :    END FUNCTION
     376             : 
     377             : ! **************************************************************************************************
     378             : !> \brief create a hash_table of given initial size.
     379             : !>        the hash table will expand as needed (but this requires rehashing)
     380             : !> \param hash_table ...
     381             : !> \param table_size ...
     382             : ! **************************************************************************************************
     383     2089030 :    SUBROUTINE hash_table_create(hash_table, table_size)
     384             :       TYPE(hash_table_type)                    :: hash_table
     385             :       INTEGER, INTENT(IN)                      :: table_size
     386             : 
     387             :       INTEGER                                  :: j
     388             : 
     389             :       ! guarantee a minimal hash table size (8), so that expansion works
     390             : 
     391     2089030 :       j = 3
     392     4086033 :       DO WHILE (2**j - 1 < table_size)
     393     1997003 :          j = j + 1
     394             :       END DO
     395     2089030 :       hash_table%nmax = 2**j - 1
     396     2089030 :       hash_table%prime = matching_prime(hash_table%nmax)
     397     2089030 :       hash_table%nele = 0
     398    57313714 :       ALLOCATE (hash_table%table(0:hash_table%nmax))
     399     2089030 :    END SUBROUTINE hash_table_create
     400             : 
     401             : ! **************************************************************************************************
     402             : !> \brief ...
     403             : !> \param hash_table ...
     404             : ! **************************************************************************************************
     405     2089030 :    SUBROUTINE hash_table_release(hash_table)
     406             :       TYPE(hash_table_type)                    :: hash_table
     407             : 
     408     2089030 :       hash_table%nmax = 0
     409     2089030 :       hash_table%nele = 0
     410     2089030 :       DEALLOCATE (hash_table%table)
     411             : 
     412     2089030 :    END SUBROUTINE hash_table_release
     413             : 
     414             : ! **************************************************************************************************
     415             : !> \brief add a pair (c,p) to the hash table
     416             : !> \param hash_table ...
     417             : !> \param c this value is being hashed
     418             : !> \param p this is being stored
     419             : ! **************************************************************************************************
     420     8060545 :    RECURSIVE SUBROUTINE hash_table_add(hash_table, c, p)
     421             :       TYPE(hash_table_type), INTENT(INOUT)     :: hash_table
     422             :       INTEGER, INTENT(IN)                      :: c, p
     423             : 
     424             :       REAL(KIND=real_8), PARAMETER :: hash_table_expand = 1.5_real_8, &
     425             :                                       inv_hash_table_fill = 2.5_real_8
     426             : 
     427             :       INTEGER                                  :: i, j
     428             :       TYPE(ele_type), ALLOCATABLE, &
     429     8060545 :          DIMENSION(:)                           :: tmp_hash
     430             : 
     431             : ! if too small, make a copy and rehash in a larger table
     432             : 
     433     8060545 :       IF (hash_table%nele*inv_hash_table_fill > hash_table%nmax) THEN
     434           0 :          ALLOCATE (tmp_hash(LBOUND(hash_table%table, 1):UBOUND(hash_table%table, 1)))
     435           0 :          tmp_hash(:) = hash_table%table
     436           0 :          CALL hash_table_release(hash_table)
     437           0 :          CALL hash_table_create(hash_table, INT((UBOUND(tmp_hash, 1) + 8)*hash_table_expand))
     438           0 :          DO i = LBOUND(tmp_hash, 1), UBOUND(tmp_hash, 1)
     439           0 :             IF (tmp_hash(i)%c .NE. 0) THEN
     440           0 :                CALL hash_table_add(hash_table, tmp_hash(i)%c, tmp_hash(i)%p)
     441             :             END IF
     442             :          END DO
     443           0 :          DEALLOCATE (tmp_hash)
     444             :       END IF
     445             : 
     446     8060545 :       hash_table%nele = hash_table%nele + 1
     447     8060545 :       i = IAND(c*hash_table%prime, hash_table%nmax)
     448             : 
     449     8060545 :       DO j = i, hash_table%nmax
     450     8060545 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     451     8060545 :             hash_table%table(j)%c = c
     452     8060545 :             hash_table%table(j)%p = p
     453     8060545 :             RETURN
     454             :          END IF
     455             :       END DO
     456           0 :       DO j = 0, i - 1
     457           0 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     458           0 :             hash_table%table(j)%c = c
     459           0 :             hash_table%table(j)%p = p
     460           0 :             RETURN
     461             :          END IF
     462             :       END DO
     463             : 
     464             :    END SUBROUTINE hash_table_add
     465             : 
     466             : ! **************************************************************************************************
     467             : !> \brief ...
     468             : !> \param hash_table ...
     469             : !> \param c ...
     470             : !> \return ...
     471             : ! **************************************************************************************************
     472   120637070 :    PURE FUNCTION hash_table_get(hash_table, c) RESULT(p)
     473             :       TYPE(hash_table_type), INTENT(IN)        :: hash_table
     474             :       INTEGER, INTENT(IN)                      :: c
     475             :       INTEGER                                  :: p
     476             : 
     477             :       INTEGER                                  :: i, j
     478             : 
     479   120637070 :       i = IAND(c*hash_table%prime, hash_table%nmax)
     480             : 
     481             :       ! catch the likely case first
     482   120637070 :       IF (hash_table%table(i)%c == c) THEN
     483   116317232 :          p = hash_table%table(i)%p
     484   116317232 :          RETURN
     485             :       END IF
     486             : 
     487     4319838 :       DO j = i, hash_table%nmax
     488     4319838 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     489     4319838 :             p = hash_table%table(j)%p
     490     4319838 :             RETURN
     491             :          END IF
     492             :       END DO
     493           0 :       DO j = 0, i - 1
     494           0 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     495           0 :             p = hash_table%table(j)%p
     496           0 :             RETURN
     497             :          END IF
     498             :       END DO
     499             : 
     500             :       ! we should never reach this point.
     501   120637070 :       p = HUGE(p)
     502             : 
     503             :    END FUNCTION hash_table_get
     504             : 
     505             : ! End of hashtable
     506             : ! --------------------------------------------------------------------------------------------------
     507             : 
     508             :    #:set instances = [ &
     509             :       ('d', 'REAL(kind=real_8)',    '0.0_real_8'), &
     510             :       ('z', 'COMPLEX(kind=real_8)', 'CMPLX(0.0, 0.0, real_8)') ]
     511             : 
     512             :    #:for nametype, type, zero in instances
     513             : 
     514             : ! **************************************************************************************************
     515             : !> \brief the real driver routine for the multiply, not all symmetries implemented yet
     516             : !> \param matrix ...
     517             : !> \param vec_in ...
     518             : !> \param vec_out ...
     519             : !> \param alpha ...
     520             : !> \param beta ...
     521             : !> \param work_row ...
     522             : !> \param work_col ...
     523             : ! **************************************************************************************************
     524      771106 :       SUBROUTINE dbcsr_matrix_colvec_multiply_${nametype}$ (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     525             :          TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
     526             :          ${type}$                                  :: alpha, beta
     527             :          TYPE(dbcsr_type)                          :: work_row, work_col
     528             : 
     529             :          CHARACTER                                :: matrix_type
     530             : 
     531      771106 :          CALL dbcsr_get_info(matrix, matrix_type=matrix_type)
     532             : 
     533      497697 :          SELECT CASE (matrix_type)
     534             :          CASE (dbcsr_type_no_symmetry)
     535      497697 :             CALL dbcsr_matrix_vector_mult_${nametype}$ (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     536             :          CASE (dbcsr_type_symmetric)
     537      273409 :             CALL dbcsr_sym_matrix_vector_mult_${nametype}$ (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     538             :          CASE (dbcsr_type_antisymmetric)
     539             :             ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
     540           0 :             CPABORT("NYI, antisymmetric matrix not permitted")
     541             :          CASE DEFAULT
     542      771106 :             CPABORT("Unknown matrix type, ...")
     543             :          END SELECT
     544             : 
     545      771106 :       END SUBROUTINE dbcsr_matrix_colvec_multiply_${nametype}$
     546             : 
     547             : ! **************************************************************************************************
     548             : !> \brief low level routines for matrix vector multiplies
     549             : !> \param matrix ...
     550             : !> \param vec_in ...
     551             : !> \param vec_out ...
     552             : !> \param alpha ...
     553             : !> \param beta ...
     554             : !> \param work_row ...
     555             : !> \param work_col ...
     556             : ! **************************************************************************************************
     557      497697 :       SUBROUTINE dbcsr_matrix_vector_mult_${nametype}$ (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     558             :          TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
     559             :          ${type}$                                  :: alpha, beta
     560             :          TYPE(dbcsr_type)                          :: work_row, work_col
     561             : 
     562             :          CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_vector_mult'
     563             : 
     564             :          INTEGER                                  :: col, mypcol, &
     565             :                                                      myprow, prow_handle, &
     566             :                                                      ncols, nrows, &
     567             :                                                      row, &
     568             :                                                      handle, handle1, ithread
     569             :          TYPE(mp_comm_type) :: prow_group
     570      497697 :          ${type}$, DIMENSION(:), POINTER          :: data_vec
     571      497697 :          ${type}$, DIMENSION(:, :), POINTER       :: data_d, vec_res
     572             :          TYPE(dbcsr_distribution_type)            :: dist
     573             :          TYPE(dbcsr_iterator_type)                :: iter
     574      497697 :          TYPE(fast_vec_access_type)               :: fast_vec_row, fast_vec_col
     575             :          INTEGER                                  :: prow, pcol
     576             : 
     577      497697 :          CALL timeset(routineN, handle)
     578      497697 :          ithread = 0
     579             : 
     580             : ! Collect some data about the parallel environment. We will use them later to move the vector around
     581      497697 :          CALL dbcsr_get_info(matrix, distribution=dist)
     582      497697 :          CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)
     583             : 
     584      497697 :          CALL prow_group%set_handle(prow_handle)
     585             : 
     586      497697 :          CALL create_fast_row_vec_access(work_row, fast_vec_row)
     587      497697 :          CALL create_fast_col_vec_access(work_col, fast_vec_col)
     588             : 
     589             : ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
     590      497697 :          CALL dbcsr_col_vec_to_rep_row_${nametype}$ (vec_in, work_col, work_row, fast_vec_col)
     591             : 
     592             : ! Set the work vector for the results to 0
     593      497697 :          CALL dbcsr_set(work_col, ${zero}$)
     594             : 
     595             : ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
     596             : ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
     597      497697 :          CALL timeset(routineN//"_local_mm", handle1)
     598             : 
     599             : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) &
     600      497697 : !$OMP          SHARED(matrix,fast_vec_col,fast_vec_row)
     601             : !$       ithread = omp_get_thread_num()
     602             :          CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
     603             :          DO WHILE (dbcsr_iterator_blocks_left(iter))
     604             :             CALL dbcsr_iterator_next_block(iter, row, col, data_d)
     605             :             prow = hash_table_get(fast_vec_col%hash_table, row)
     606             :             IF (fast_vec_col%blk_map_${nametype}$ (prow)%assigned_thread .NE. ithread) CYCLE
     607             :             pcol = hash_table_get(fast_vec_row%hash_table, col)
     608             :             #:if nametype=='d'
     609             :                IF (SIZE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr, 1) .EQ. 0 .OR. &
     610             :                    SIZE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr, 2) .EQ. 0 .OR. &
     611             :                    SIZE(data_d, 2) .EQ. 0) CYCLE
     612             :                CALL dgemm('N', 'T', SIZE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr, 1), &
     613             :                           SIZE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr, 2), &
     614             :                           SIZE(data_d, 2), &
     615             :                           1.0_dp, &
     616             :                           data_d, &
     617             :                           SIZE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr, 1), &
     618             :                           fast_vec_row%blk_map_${nametype}$ (pcol)%ptr, &
     619             :                           SIZE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr, 2), &
     620             :                           1.0_dp, &
     621             :                           fast_vec_col%blk_map_${nametype}$ (prow)%ptr, &
     622             :                           SIZE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr, 1))
     623             :             #:else
     624             :                fast_vec_col%blk_map_${nametype}$ (prow)%ptr = fast_vec_col%blk_map_${nametype}$ (prow)%ptr + &
     625             :                                                              MATMUL(data_d, TRANSPOSE(fast_vec_row%blk_map_${nametype}$ (pcol)%ptr))
     626             :             #:endif
     627             :          END DO
     628             :          CALL dbcsr_iterator_stop(iter)
     629             : !$OMP END PARALLEL
     630             : 
     631      497697 :          CALL timestop(handle1)
     632             : 
     633             : ! sum all the data onto the first processor col where the original vector is stored
     634      497697 :          data_vec => dbcsr_get_data_p(work_col, select_data_type=${zero}$)
     635      497697 :          CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
     636    10497223 :          CALL prow_group%sum(data_vec(1:nrows*ncols))
     637             : 
     638             : ! Local copy on the first mpi col (as this is the localtion of the vec_res blocks) of the result vector
     639             : ! from the replicated to the original vector. Let's play it safe and use the iterator
     640      497697 :          CALL dbcsr_iterator_start(iter, vec_out)
     641     1057391 :          DO WHILE (dbcsr_iterator_blocks_left(iter))
     642      559694 :             CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
     643      559694 :             prow = hash_table_get(fast_vec_col%hash_table, row)
     644     1057391 :             IF (ASSOCIATED(fast_vec_col%blk_map_${nametype}$ (prow)%ptr)) THEN
     645     6119151 :                vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_${nametype}$ (prow)%ptr(:, :)
     646             :             ELSE
     647           0 :                vec_res(:, :) = beta*vec_res(:, :)
     648             :             END IF
     649             :          END DO
     650      497697 :          CALL dbcsr_iterator_stop(iter)
     651             : 
     652      497697 :          CALL release_fast_vec_access(fast_vec_row)
     653      497697 :          CALL release_fast_vec_access(fast_vec_col)
     654             : 
     655      497697 :          CALL timestop(handle)
     656             : 
     657      995394 :       END SUBROUTINE dbcsr_matrix_vector_mult_${nametype}$
     658             : 
     659             : ! **************************************************************************************************
     660             : !> \brief ...
     661             : !> \param matrix ...
     662             : !> \param vec_in ...
     663             : !> \param vec_out ...
     664             : !> \param alpha ...
     665             : !> \param beta ...
     666             : !> \param work_row ...
     667             : !> \param work_col ...
     668             : !> \param skip_diag ...
     669             : ! **************************************************************************************************
     670           0 :       SUBROUTINE dbcsr_matrixT_vector_mult_${nametype}$ (matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
     671             :          TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
     672             :          ${type}$                                  :: alpha, beta
     673             :          TYPE(dbcsr_type)                          :: work_row, work_col
     674             :          LOGICAL                                   :: skip_diag
     675             : 
     676             :          CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrixT_vector_mult'
     677             : 
     678             :          INTEGER                                  :: col, col_size, mypcol, &
     679             :                                                      myprow, pcol_handle, prow_handle, &
     680             :                                                      ncols, nrows, &
     681             :                                                      row, row_size, &
     682             :                                                      handle, handle1, ithread
     683             :          TYPE(mp_comm_type) :: pcol_group, prow_group
     684           0 :          ${type}$, DIMENSION(:), POINTER          :: data_vec
     685           0 :          ${type}$, DIMENSION(:, :), POINTER       :: data_d, vec_bl, vec_res
     686             :          TYPE(dbcsr_distribution_type)            :: dist
     687             :          TYPE(dbcsr_iterator_type)                :: iter
     688             : 
     689           0 :          TYPE(fast_vec_access_type)               :: fast_vec_row, fast_vec_col
     690             :          INTEGER                                  :: prow, pcol
     691             : 
     692           0 :          CALL timeset(routineN, handle)
     693           0 :          ithread = 0
     694             : 
     695             : ! Collect some data about the parallel environment. We will use them later to move the vector around
     696           0 :          CALL dbcsr_get_info(matrix, distribution=dist)
     697           0 :          CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
     698             : 
     699           0 :          CALL prow_group%set_handle(prow_handle)
     700           0 :          CALL pcol_group%set_handle(pcol_handle)
     701             : 
     702           0 :          CALL create_fast_row_vec_access(work_row, fast_vec_row)
     703           0 :          CALL create_fast_col_vec_access(work_col, fast_vec_col)
     704             : 
     705             : ! Set the work vector for the results to 0
     706           0 :          CALL dbcsr_set(work_row, ${zero}$)
     707             : 
     708             : ! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
     709           0 :          CALL dbcsr_iterator_start(iter, vec_in)
     710           0 :          DO WHILE (dbcsr_iterator_blocks_left(iter))
     711           0 :             CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
     712           0 :             prow = hash_table_get(fast_vec_col%hash_table, row)
     713           0 :             fast_vec_col%blk_map_${nametype}$ (prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
     714             :          END DO
     715           0 :          CALL dbcsr_iterator_stop(iter)
     716             : ! Replicate the data on all processore in the row
     717           0 :          data_vec => dbcsr_get_data_p(work_col, select_data_type=${zero}$)
     718           0 :          CALL prow_group%bcast(data_vec, 0)
     719             : 
     720             : ! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
     721           0 :          CALL timeset(routineN//"local_mm", handle1)
     722           0 :          CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
     723             : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
     724           0 : !$OMP          SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
     725             : !$       ithread = omp_get_thread_num()
     726             :          CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
     727             :          DO WHILE (dbcsr_iterator_blocks_left(iter))
     728             :             CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
     729             :             IF (skip_diag .AND. col == row) CYCLE
     730             :             prow = hash_table_get(fast_vec_col%hash_table, row)
     731             :             pcol = hash_table_get(fast_vec_row%hash_table, col)
     732             :             IF (ASSOCIATED(fast_vec_row%blk_map_${nametype}$ (pcol)%ptr) .AND. &
     733             :                 ASSOCIATED(fast_vec_col%blk_map_${nametype}$ (prow)%ptr)) THEN
     734             :                IF (fast_vec_row%blk_map_${nametype}$ (pcol)%assigned_thread .NE. ithread) CYCLE
     735             :                fast_vec_row%blk_map_${nametype}$ (pcol)%ptr = fast_vec_row%blk_map_${nametype}$ (pcol)%ptr + &
     736             :                                                              MATMUL(TRANSPOSE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr), data_d)
     737             :             ELSE
     738             :                prow = hash_table_get(fast_vec_row%hash_table, row)
     739             :                pcol = hash_table_get(fast_vec_col%hash_table, col)
     740             :                IF (fast_vec_row%blk_map_${nametype}$ (prow)%assigned_thread .NE. ithread) CYCLE
     741             :                fast_vec_row%blk_map_${nametype}$ (prow)%ptr = fast_vec_row%blk_map_${nametype}$ (prow)%ptr + &
     742             :                                                   MATMUL(TRANSPOSE(fast_vec_col%blk_map_${nametype}$ (pcol)%ptr), TRANSPOSE(data_d))
     743             :             END IF
     744             :          END DO
     745             :          CALL dbcsr_iterator_stop(iter)
     746             : !$OMP END PARALLEL
     747             : 
     748           0 :          CALL timestop(handle1)
     749             : 
     750             : ! sum all the data within a processor column to obtain the replicated result
     751           0 :          data_vec => dbcsr_get_data_p(work_row, select_data_type=${zero}$)
     752             : ! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
     753           0 :          CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
     754           0 :          CALL pcol_group%sum(data_vec(1:nrows*ncols))
     755             : 
     756             : ! Convert the result to a column wise distribution
     757           0 :          CALL dbcsr_rep_row_to_rep_col_vec_${nametype}$ (work_col, work_row, fast_vec_row)
     758             : 
     759             : ! Create_the final vector by summing it to the result vector which lives on proc_col 0
     760           0 :          CALL dbcsr_iterator_start(iter, vec_out)
     761           0 :          DO WHILE (dbcsr_iterator_blocks_left(iter))
     762           0 :             CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
     763           0 :             prow = hash_table_get(fast_vec_col%hash_table, row)
     764           0 :             IF (ASSOCIATED(fast_vec_col%blk_map_${nametype}$ (prow)%ptr)) THEN
     765           0 :                vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_${nametype}$ (prow)%ptr(:, :)
     766             :             ELSE
     767           0 :                vec_res(:, :) = beta*vec_res(:, :)
     768             :             END IF
     769             :          END DO
     770           0 :          CALL dbcsr_iterator_stop(iter)
     771             : 
     772           0 :          CALL timestop(handle)
     773             : 
     774           0 :       END SUBROUTINE dbcsr_matrixT_vector_mult_${nametype}$
     775             : 
     776             : ! **************************************************************************************************
     777             : !> \brief ...
     778             : !> \param vec_in ...
     779             : !> \param rep_col_vec ...
     780             : !> \param rep_row_vec ...
     781             : !> \param fast_vec_col ...
     782             : ! **************************************************************************************************
     783     3855530 :       SUBROUTINE dbcsr_col_vec_to_rep_row_${nametype}$ (vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
     784             :          TYPE(dbcsr_type)                          :: vec_in, rep_col_vec, &
     785             :                                                       rep_row_vec
     786             :          TYPE(fast_vec_access_type), INTENT(IN)   :: fast_vec_col
     787             : 
     788             :          CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_col_vec_to_rep_row'
     789             : 
     790             :          INTEGER                                  :: col, mypcol, myprow, ncols, &
     791             :                                                      nrows, pcol_handle, prow_handle, &
     792             :                                                      row, handle
     793             :          TYPE(mp_comm_type) :: pcol_group, prow_group
     794      771106 :          INTEGER, DIMENSION(:), POINTER           :: local_cols, row_dist
     795      771106 :          ${type}$, DIMENSION(:), POINTER          :: data_vec, data_vec_rep
     796      771106 :          ${type}$, DIMENSION(:, :), POINTER       :: vec_row
     797             :          TYPE(dbcsr_distribution_type)            :: dist_in, dist_rep_col
     798             :          TYPE(dbcsr_iterator_type)                :: iter
     799             : 
     800      771106 :          CALL timeset(routineN, handle)
     801             : 
     802             : ! get information about the parallel environment
     803      771106 :          CALL dbcsr_get_info(vec_in, distribution=dist_in)
     804             :          CALL dbcsr_distribution_get(dist_in, &
     805             :                                      prow_group=prow_handle, &
     806             :                                      pcol_group=pcol_handle, &
     807             :                                      myprow=myprow, &
     808      771106 :                                      mypcol=mypcol)
     809             : 
     810      771106 :          CALL prow_group%set_handle(prow_handle)
     811      771106 :          CALL pcol_group%set_handle(pcol_handle)
     812             : 
     813             : ! Get the vector which tells us which blocks are local to which processor row in the col vec
     814      771106 :          CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
     815      771106 :          CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)
     816             : 
     817             : ! Copy the local vector to the replicated on the first processor column (this is where vec_in lives)
     818      771106 :          CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     819      771106 :          data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=${zero}$)
     820      771106 :          data_vec => dbcsr_get_data_p(vec_in, select_data_type=${zero}$)
     821    23913868 :          IF (mypcol == 0) data_vec_rep(1:nrows*ncols) = data_vec(1:nrows*ncols)
     822             : ! Replicate the data along the row
     823    23913868 :          CALL prow_group%bcast(data_vec_rep(1:nrows*ncols), 0)
     824             : 
     825             : ! Here it gets a bit tricky as we are dealing with two different parallel layouts:
     826             : ! The rep_col_vec contains all blocks local to the row distribution of the vector.
     827             : ! The rep_row_vec only needs the fraction which is local to the col distribution.
     828             : ! However in most cases this won't the complete set of block which can be obtained from col_vector p_row i
     829             : ! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available
     830             : ! on every replica in the processor column, by summing along the column we end up with the complete vector everywhere
     831             : ! Hope this clarifies the idea
     832      771106 :          CALL dbcsr_set(rep_row_vec, ${zero}$)
     833      771106 :          CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
     834      771106 :          CALL dbcsr_iterator_start(iter, rep_row_vec)
     835     4011800 :          DO WHILE (dbcsr_iterator_blocks_left(iter))
     836     3240694 :             CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
     837     4011800 :             IF (row_dist(col) == myprow) THEN
     838    24770209 :                vec_row = TRANSPOSE(fast_vec_col%blk_map_${nametype}$ (hash_table_get(fast_vec_col%hash_table, col))%ptr)
     839             :             END IF
     840             :          END DO
     841      771106 :          CALL dbcsr_iterator_stop(iter)
     842      771106 :          CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     843      771106 :          data_vec_rep => dbcsr_get_data_p(rep_row_vec, select_data_type=${zero}$)
     844    46919546 :          CALL pcol_group%sum(data_vec_rep(1:ncols*nrows))
     845             : 
     846      771106 :          CALL timestop(handle)
     847             : 
     848      771106 :       END SUBROUTINE dbcsr_col_vec_to_rep_row_${nametype}$
     849             : 
     850             : ! **************************************************************************************************
     851             : !> \brief ...
     852             : !> \param rep_col_vec ...
     853             : !> \param rep_row_vec ...
     854             : !> \param fast_vec_row ...
     855             : !> \param fast_vec_col_add ...
     856             : ! **************************************************************************************************
     857      820227 :       SUBROUTINE dbcsr_rep_row_to_rep_col_vec_${nametype}$ (rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
     858             :          TYPE(dbcsr_type)                          :: rep_col_vec, rep_row_vec
     859             :          TYPE(fast_vec_access_type), OPTIONAL     :: fast_vec_col_add
     860             :          TYPE(fast_vec_access_type)               :: fast_vec_row
     861             : 
     862             :          CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_rep_row_to_rep_col_vec'
     863             : 
     864             :          INTEGER                                  :: col, mypcol, myprow, ncols, &
     865             :                                                      nrows, prow_handle, &
     866             :                                                      row, handle
     867      273409 :          INTEGER, DIMENSION(:), POINTER           :: col_dist
     868             :          TYPE(mp_comm_type) :: prow_group
     869      273409 :          ${type}$, DIMENSION(:), POINTER          :: data_vec_rep
     870      273409 :          ${type}$, DIMENSION(:, :), POINTER       :: vec_col
     871             :          TYPE(dbcsr_distribution_type)            :: dist_rep_row, dist_rep_col
     872             :          TYPE(dbcsr_iterator_type)                :: iter
     873             : 
     874      273409 :          CALL timeset(routineN, handle)
     875             : 
     876             : ! get information about the parallel environment
     877      273409 :          CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
     878             :          CALL dbcsr_distribution_get(dist_rep_col, &
     879             :                                      prow_group=prow_handle, &
     880             :                                      myprow=myprow, &
     881      273409 :                                      mypcol=mypcol)
     882             : 
     883      273409 :          CALL prow_group%set_handle(prow_handle)
     884             : 
     885             : ! Get the vector which tells us which blocks are local to which processor col in the row vec
     886      273409 :          CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
     887      273409 :          CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)
     888             : 
     889             : ! The same trick as described above with opposite direction
     890      273409 :          CALL dbcsr_set(rep_col_vec, ${zero}$)
     891      273409 :          CALL dbcsr_iterator_start(iter, rep_col_vec)
     892     1341162 :          DO WHILE (dbcsr_iterator_blocks_left(iter))
     893     1067753 :             CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
     894     1067753 :             IF (col_dist(row) == mypcol) THEN
     895     8707124 :                vec_col = TRANSPOSE(fast_vec_row%blk_map_${nametype}$ (hash_table_get(fast_vec_row%hash_table, row))%ptr)
     896             :             END IF
     897             :             ! this one is special and allows to add the elements of a not yet summed replicated
     898             :             ! column vector as it appears in M*V(row_rep) as result. Save an parallel summation in the symmetric case
     899     1067753 :             IF (PRESENT(fast_vec_col_add)) vec_col = vec_col + &
     900     8980533 :                                   fast_vec_col_add%blk_map_${nametype}$ (hash_table_get(fast_vec_col_add%hash_table, row))%ptr(:, :)
     901             :          END DO
     902      273409 :          CALL dbcsr_iterator_stop(iter)
     903      273409 :          CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     904      273409 :          data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=${zero}$)
     905    13416645 :          CALL prow_group%sum(data_vec_rep(1:nrows*ncols))
     906             : 
     907      273409 :          CALL timestop(handle)
     908             : 
     909      273409 :       END SUBROUTINE dbcsr_rep_row_to_rep_col_vec_${nametype}$
     910             : 
     911             : ! **************************************************************************************************
     912             : !> \brief given a column vector, prepare the fast_vec_access container
     913             : !> \param vec ...
     914             : !> \param fast_vec_access ...
     915             : ! **************************************************************************************************
     916     1044515 :       SUBROUTINE create_fast_col_vec_access_${nametype}$ (vec, fast_vec_access)
     917             :          TYPE(dbcsr_type)                          :: vec
     918             :          TYPE(fast_vec_access_type)               :: fast_vec_access
     919             : 
     920             :          CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access_${nametype}$'
     921             : 
     922             :          INTEGER                                  :: handle, nblk_local
     923             :          INTEGER                                  :: col, row, iblock, nthreads
     924     1044515 :          ${type}$, DIMENSION(:, :), POINTER       :: vec_bl
     925             :          TYPE(dbcsr_iterator_type)                :: iter
     926             : 
     927     1044515 :          CALL timeset(routineN, handle)
     928             : 
     929             :          ! figure out the number of threads
     930     1044515 :          nthreads = 1
     931     1044515 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
     932             : !$OMP MASTER
     933             : !$       nthreads = OMP_GET_NUM_THREADS()
     934             : !$OMP END MASTER
     935             : !$OMP END PARALLEL
     936             : 
     937     1044515 :          CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
     938             :          ! 4 times makes sure the table is big enough to limit collisions.
     939     1044515 :          CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
     940             :          ! include zero for effective dealing with values not in the hash table (will return 0)
     941     6873260 :          ALLOCATE (fast_vec_access%blk_map_${nametype}$ (0:nblk_local))
     942             : 
     943     1044515 :          CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
     944     1044515 :          IF (col .GT. 1) CPABORT("BUG")
     945             : 
     946             :          ! go through the blocks of the vector
     947     1044515 :          iblock = 0
     948     1044515 :          CALL dbcsr_iterator_start(iter, vec)
     949     3739715 :          DO WHILE (dbcsr_iterator_blocks_left(iter))
     950     2695200 :             CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
     951     2695200 :             iblock = iblock + 1
     952     2695200 :             CALL hash_table_add(fast_vec_access%hash_table, row, iblock)
     953     2695200 :             fast_vec_access%blk_map_${nametype}$ (iblock)%ptr => vec_bl
     954     2695200 :             fast_vec_access%blk_map_${nametype}$ (iblock)%assigned_thread = MOD(iblock, nthreads)
     955             :          END DO
     956     1044515 :          CALL dbcsr_iterator_stop(iter)
     957             : 
     958     1044515 :          CALL timestop(handle)
     959             : 
     960     2089030 :       END SUBROUTINE create_fast_col_vec_access_${nametype}$
     961             : 
     962             : ! **************************************************************************************************
     963             : !> \brief given a row vector, prepare the fast_vec_access_container
     964             : !> \param vec ...
     965             : !> \param fast_vec_access ...
     966             : ! **************************************************************************************************
     967     1044515 :       SUBROUTINE create_fast_row_vec_access_${nametype}$ (vec, fast_vec_access)
     968             :          TYPE(dbcsr_type)                          :: vec
     969             :          TYPE(fast_vec_access_type)                :: fast_vec_access
     970             : 
     971             :          CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access_${nametype}$'
     972             : 
     973             :          INTEGER                                  :: handle, nblk_local
     974             :          INTEGER                                  :: col, row, iblock, nthreads
     975     1044515 :          ${type}$, DIMENSION(:, :), POINTER       :: vec_bl
     976             :          TYPE(dbcsr_iterator_type)                :: iter
     977             : 
     978     1044515 :          CALL timeset(routineN, handle)
     979             : 
     980             :          ! figure out the number of threads
     981     1044515 :          nthreads = 1
     982     1044515 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
     983             : !$OMP MASTER
     984             : !$       nthreads = OMP_GET_NUM_THREADS()
     985             : !$OMP END MASTER
     986             : !$OMP END PARALLEL
     987             : 
     988     1044515 :          CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
     989             :          ! 4 times makes sure the table is big enough to limit collisions.
     990     1044515 :          CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
     991             :          ! include zero for effective dealing with values not in the hash table (will return 0)
     992     9543405 :          ALLOCATE (fast_vec_access%blk_map_${nametype}$ (0:nblk_local))
     993             : 
     994             :          ! sanity check
     995     1044515 :          CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
     996     1044515 :          IF (row .GT. 1) CPABORT("BUG")
     997             : 
     998             :          ! go through the blocks of the vector
     999     1044515 :          iblock = 0
    1000     1044515 :          CALL dbcsr_iterator_start(iter, vec)
    1001     6409860 :          DO WHILE (dbcsr_iterator_blocks_left(iter))
    1002     5365345 :             CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
    1003     5365345 :             iblock = iblock + 1
    1004     5365345 :             CALL hash_table_add(fast_vec_access%hash_table, col, iblock)
    1005     5365345 :             fast_vec_access%blk_map_${nametype}$ (iblock)%ptr => vec_bl
    1006     5365345 :             fast_vec_access%blk_map_${nametype}$ (iblock)%assigned_thread = MOD(iblock, nthreads)
    1007             :          END DO
    1008     1044515 :          CALL dbcsr_iterator_stop(iter)
    1009             : 
    1010     1044515 :          CALL timestop(handle)
    1011             : 
    1012     2089030 :       END SUBROUTINE create_fast_row_vec_access_${nametype}$
    1013             : 
    1014             : ! **************************************************************************************************
    1015             : !> \brief ...
    1016             : !> \param matrix ...
    1017             : !> \param vec_in ...
    1018             : !> \param vec_out ...
    1019             : !> \param alpha ...
    1020             : !> \param beta ...
    1021             : !> \param work_row ...
    1022             : !> \param work_col ...
    1023             : ! **************************************************************************************************
    1024      273409 :       SUBROUTINE dbcsr_sym_matrix_vector_mult_${nametype}$ (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
    1025             :          TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
    1026             :          ${type}$                                  :: alpha, beta
    1027             :          TYPE(dbcsr_type)                          :: work_row, work_col
    1028             : 
    1029             :          CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_sym_m_v_mult'
    1030             : 
    1031             :          INTEGER                                  :: col, mypcol, &
    1032             :                                                      myprow, &
    1033             :                                                      nrows, ncols, &
    1034             :                                                      row, pcol_handle, &
    1035             :                                                      handle, handle1, ithread, vec_dim
    1036      273409 :          ${type}$, DIMENSION(:), POINTER          :: data_vec
    1037      273409 :          ${type}$, DIMENSION(:, :), POINTER       :: data_d, vec_res
    1038             :          TYPE(dbcsr_distribution_type)            :: dist
    1039             :          TYPE(dbcsr_iterator_type)                :: iter
    1040             :          TYPE(dbcsr_type)                         :: result_row, result_col
    1041             :          TYPE(mp_comm_type) :: pcol_group
    1042             : 
    1043      273409 :          TYPE(fast_vec_access_type)               :: fast_vec_row, fast_vec_col, res_fast_vec_row, res_fast_vec_col
    1044             :          INTEGER                                  :: prow, pcol, rprow, rpcol
    1045             : 
    1046      273409 :          CALL timeset(routineN, handle)
    1047      273409 :          ithread = 0
    1048             : ! We need some work matrices as we try to exploit operations on the replicated vectors which are duplicated otherwise
    1049      273409 :          CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)
    1050             : ! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive
    1051      273409 :          CALL dbcsr_set(work_col, ${zero}$)
    1052      273409 :          CALL dbcsr_copy(result_col, work_col)
    1053      273409 :          CALL dbcsr_set(work_row, ${zero}$)
    1054      273409 :          CALL dbcsr_copy(result_row, work_row)
    1055             : 
    1056             : ! Collect some data about the parallel environment. We will use them later to move the vector around
    1057      273409 :          CALL dbcsr_get_info(matrix=matrix, distribution=dist)
    1058      273409 :          CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
    1059             : 
    1060      273409 :          CALL pcol_group%set_handle(pcol_handle)
    1061             : 
    1062      273409 :          CALL create_fast_row_vec_access(work_row, fast_vec_row)
    1063      273409 :          CALL create_fast_col_vec_access(work_col, fast_vec_col)
    1064      273409 :          CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
    1065      273409 :          CALL create_fast_col_vec_access(result_col, res_fast_vec_col)
    1066             : 
    1067             : ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
    1068      273409 :          CALL dbcsr_col_vec_to_rep_row_${nametype}$ (vec_in, work_col, work_row, fast_vec_col)
    1069             : 
    1070             : ! Probably I should rename the routine above as it delivers both the replicated row and column vector
    1071             : 
    1072             : ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
    1073             : ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
    1074      273409 :          CALL timeset(routineN//"_local_mm", handle1)
    1075             : 
    1076             : !------ perform the multiplication, we have to take car to take the correct blocks ----------
    1077             : 
    1078             : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
    1079      273409 : !$OMP          SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
    1080             : !$       ithread = omp_get_thread_num()
    1081             :          CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
    1082             :          DO WHILE (dbcsr_iterator_blocks_left(iter))
    1083             :             CALL dbcsr_iterator_next_block(iter, row, col, data_d)
    1084             :             pcol = hash_table_get(fast_vec_row%hash_table, col)
    1085             :             rprow = hash_table_get(res_fast_vec_col%hash_table, row)
    1086             :             IF (ASSOCIATED(fast_vec_row%blk_map_${nametype}$ (pcol)%ptr) .AND. &
    1087             :                 ASSOCIATED(res_fast_vec_col%blk_map_${nametype}$ (rprow)%ptr)) THEN
    1088             :                IF (res_fast_vec_col%blk_map_${nametype}$ (rprow)%assigned_thread .EQ. ithread) THEN
    1089             :                   res_fast_vec_col%blk_map_${nametype}$ (rprow)%ptr = res_fast_vec_col%blk_map_${nametype}$ (rprow)%ptr + &
    1090             :                                                              MATMUL(data_d, TRANSPOSE(fast_vec_row%blk_map_${nametype}$ (pcol)%ptr))
    1091             :                END IF
    1092             :                prow = hash_table_get(fast_vec_col%hash_table, row)
    1093             :                rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
    1094             :                IF (res_fast_vec_row%blk_map_${nametype}$ (rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
    1095             :                   res_fast_vec_row%blk_map_${nametype}$ (rpcol)%ptr = res_fast_vec_row%blk_map_${nametype}$ (rpcol)%ptr + &
    1096             :                                                              MATMUL(TRANSPOSE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr), data_d)
    1097             :                END IF
    1098             :             ELSE
    1099             :                rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
    1100             :                prow = hash_table_get(fast_vec_row%hash_table, row)
    1101             :                IF (res_fast_vec_col%blk_map_${nametype}$ (rpcol)%assigned_thread .EQ. ithread) THEN
    1102             :                   res_fast_vec_col%blk_map_${nametype}$ (rpcol)%ptr = res_fast_vec_col%blk_map_${nametype}$ (rpcol)%ptr + &
    1103             :                                                              TRANSPOSE(MATMUL(fast_vec_row%blk_map_${nametype}$ (prow)%ptr, data_d))
    1104             :                END IF
    1105             :                rprow = hash_table_get(res_fast_vec_row%hash_table, row)
    1106             :                pcol = hash_table_get(fast_vec_col%hash_table, col)
    1107             :                IF (res_fast_vec_row%blk_map_${nametype}$ (rprow)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
    1108             :                   res_fast_vec_row%blk_map_${nametype}$ (rprow)%ptr = res_fast_vec_row%blk_map_${nametype}$ (rprow)%ptr + &
    1109             :                                                              TRANSPOSE(MATMUL(data_d, fast_vec_col%blk_map_${nametype}$ (pcol)%ptr))
    1110             :                END IF
    1111             :             END IF
    1112             :          END DO
    1113             :          CALL dbcsr_iterator_stop(iter)
    1114             : !$OMP END PARALLEL
    1115             : 
    1116      273409 :          CALL timestop(handle1)
    1117             : 
    1118             :          ! sum all the data within a processor column to obtain the replicated result from lower
    1119      273409 :          data_vec => dbcsr_get_data_p(result_row, select_data_type=${zero}$)
    1120      273409 :          CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)
    1121             : 
    1122    26449963 :          CALL pcol_group%sum(data_vec(1:nrows*ncols))
    1123             : !
    1124             : !! Convert the results to a column wise distribution, this is a bit involved as the result_row is fully replicated
    1125             : !! While the result_col still has the partial results in parallel. The routine below takes care of that and saves a
    1126             : !! parallel summation. Of the res_row vectors are created only taking the appropriate element (0 otherwise) while the res_col
    1127             : !! parallel bits are locally added. The sum magically creates the correct vector
    1128      273409 :          CALL dbcsr_rep_row_to_rep_col_vec_${nametype}$ (work_col, result_row, res_fast_vec_row, res_fast_vec_col)
    1129             : 
    1130             : !    ! Create_the final vector by summing it to the result vector which lives on proc_col 0 lower
    1131      273409 :          CALL dbcsr_iterator_start(iter, vec_out)
    1132     1341162 :          DO WHILE (dbcsr_iterator_blocks_left(iter))
    1133     1067753 :             CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
    1134     1067753 :             prow = hash_table_get(fast_vec_col%hash_table, row)
    1135     1341162 :             IF (ASSOCIATED(fast_vec_col%blk_map_${nametype}$ (prow)%ptr)) THEN
    1136     8707124 :                vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map_${nametype}$ (prow)%ptr(:, :))
    1137             :             ELSE
    1138           0 :                vec_res(:, :) = beta*vec_res(:, :)
    1139             :             END IF
    1140             :          END DO
    1141      273409 :          CALL dbcsr_iterator_stop(iter)
    1142             : 
    1143      273409 :          CALL release_fast_vec_access(fast_vec_row)
    1144      273409 :          CALL release_fast_vec_access(fast_vec_col)
    1145      273409 :          CALL release_fast_vec_access(res_fast_vec_row)
    1146      273409 :          CALL release_fast_vec_access(res_fast_vec_col)
    1147             : 
    1148      273409 :          CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)
    1149             : 
    1150      273409 :          CALL timestop(handle)
    1151             : 
    1152      546818 :       END SUBROUTINE dbcsr_sym_matrix_vector_mult_${nametype}$
    1153             : 
    1154             :    #:endfor
    1155             : 
    1156           0 : END MODULE dbcsr_vector

Generated by: LCOV version 1.15