Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2025 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief operations for skinny matrices/vectors expressed in dbcsr form
10 : !> \par History
11 : !> 2014.10 created [Florian Schiffmann]
12 : !> 2023.12 Removed support for single-precision [Ole Schuett]
13 : !> 2024.12 Removed support for complex input matrices [Ole Schuett]
14 : !> \author Florian Schiffmann
15 : ! **************************************************************************************************
16 : MODULE arnoldi_vector
17 : USE cp_dbcsr_api, ONLY: &
18 : dbcsr_copy, dbcsr_create, dbcsr_distribution_get, dbcsr_distribution_new, &
19 : dbcsr_distribution_release, dbcsr_distribution_type, dbcsr_get_data_p, dbcsr_get_info, &
20 : dbcsr_iterator_blocks_left, dbcsr_iterator_next_block, dbcsr_iterator_start, &
21 : dbcsr_iterator_stop, dbcsr_iterator_type, dbcsr_release, dbcsr_set, dbcsr_type, &
22 : dbcsr_type_antisymmetric, dbcsr_type_no_symmetry, dbcsr_type_symmetric
23 : USE cp_dbcsr_contrib, ONLY: dbcsr_reserve_all_blocks
24 : USE kinds, ONLY: dp,&
25 : real_8
26 : USE message_passing, ONLY: mp_comm_type
27 : #include "../base/base_uses.f90"
28 :
29 : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
30 :
31 : IMPLICIT NONE
32 :
33 : PRIVATE
34 :
35 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'arnoldi_vector'
36 :
37 : ! **************************************************************************************************
38 : !> \brief Types needed for the hashtable.
39 : ! **************************************************************************************************
40 : TYPE ele_type
41 : INTEGER :: c = 0
42 : INTEGER :: p = 0
43 : END TYPE ele_type
44 :
45 : TYPE hash_table_type
46 : TYPE(ele_type), DIMENSION(:), POINTER :: table => NULL()
47 : INTEGER :: nele = 0
48 : INTEGER :: nmax = 0
49 : INTEGER :: prime = 0
50 : END TYPE hash_table_type
51 :
52 : ! **************************************************************************************************
53 : !> \brief Types needed for fast access to distributed dbcsr vectors.
54 : ! **************************************************************************************************
55 : TYPE block_ptr
56 : REAL(real_8), DIMENSION(:, :), POINTER :: ptr => NULL()
57 : INTEGER :: assigned_thread = -1
58 : END TYPE block_ptr
59 :
60 : TYPE fast_vec_access_type
61 : TYPE(hash_table_type) :: hash_table = hash_table_type()
62 : TYPE(block_ptr), DIMENSION(:), ALLOCATABLE :: blk_map
63 : END TYPE fast_vec_access_type
64 :
65 : PUBLIC :: dbcsr_matrix_colvec_multiply, &
66 : create_col_vec_from_matrix, &
67 : create_row_vec_from_matrix, &
68 : create_replicated_col_vec_from_matrix, &
69 : create_replicated_row_vec_from_matrix
70 :
71 : CONTAINS
72 :
73 : ! **************************************************************************************************
74 : !> \brief creates a dbcsr col vector like object which lives on proc_col 0
75 : !> and has the same row dist as the template matrix
76 : !> the returned matrix is fully allocated and all blocks are set to 0
77 : !> this is not a sparse object (and must never be)
78 : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
79 : !> \param matrix a dbcsr matrix used as template
80 : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
81 : ! **************************************************************************************************
82 279169 : SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
83 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
84 : INTEGER :: ncol
85 :
86 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_col_vec_from_matrix'
87 :
88 : INTEGER :: handle, npcols
89 279169 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
90 279169 : row_dist
91 : TYPE(dbcsr_distribution_type) :: dist, dist_col_vec
92 :
93 279169 : CALL timeset(routineN, handle)
94 :
95 279169 : CALL dbcsr_get_info(matrix, row_blk_size=row_blk_size, distribution=dist)
96 279169 : CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
97 :
98 279169 : ALLOCATE (col_dist(1), col_blk_size(1))
99 279169 : col_dist(1) = 0
100 279169 : col_blk_size(1) = ncol
101 279169 : CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
102 :
103 : CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, matrix_type=dbcsr_type_no_symmetry, &
104 279169 : row_blk_size=row_blk_size, col_blk_size=col_blk_size)
105 279169 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
106 :
107 279169 : CALL dbcsr_distribution_release(dist_col_vec)
108 279169 : DEALLOCATE (col_dist, col_blk_size)
109 279169 : CALL timestop(handle)
110 :
111 558338 : END SUBROUTINE create_col_vec_from_matrix
112 :
113 : ! **************************************************************************************************
114 : !> \brief creates a dbcsr row vector like object which lives on proc_row 0
115 : !> and has the same row dist as the template matrix
116 : !> the returned matrix is fully allocated and all blocks are set to 0
117 : !> this is not a sparse object (and must never be)
118 : !> \param dbcsr_vec ...
119 : !> \param matrix a dbcsr matrix used as template
120 : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
121 : ! **************************************************************************************************
122 0 : SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
123 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
124 : INTEGER :: nrow
125 :
126 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_row_vec_from_matrix'
127 :
128 : INTEGER :: handle, nprows
129 0 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
130 0 : row_dist
131 : TYPE(dbcsr_distribution_type) :: dist, dist_row_vec
132 :
133 0 : CALL timeset(routineN, handle)
134 :
135 0 : CALL dbcsr_get_info(matrix, col_blk_size=col_blk_size, distribution=dist)
136 0 : CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
137 :
138 0 : ALLOCATE (row_dist(1), row_blk_size(1))
139 0 : row_dist(1) = 0
140 0 : row_blk_size(1) = nrow
141 0 : CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
142 :
143 : CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, matrix_type=dbcsr_type_no_symmetry, &
144 0 : row_blk_size=row_blk_size, col_blk_size=col_blk_size)
145 0 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
146 :
147 0 : CALL dbcsr_distribution_release(dist_row_vec)
148 0 : DEALLOCATE (row_dist, row_blk_size)
149 0 : CALL timestop(handle)
150 :
151 0 : END SUBROUTINE create_row_vec_from_matrix
152 :
153 : ! **************************************************************************************************
154 : !> \brief creates a col vector like object whose blocks can be replicated
155 : !> along the processor row and has the same row dist as the template matrix
156 : !> the returned matrix is fully allocated and all blocks are set to 0
157 : !> this is not a sparse object (and must never be)
158 : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
159 : !> \param matrix a dbcsr matrix used as template
160 : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
161 : ! **************************************************************************************************
162 137255 : SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
163 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
164 : INTEGER :: ncol
165 :
166 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_col_vec_from_matrix'
167 :
168 : INTEGER :: handle, i, npcols
169 137255 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
170 137255 : row_dist
171 : TYPE(dbcsr_distribution_type) :: dist, dist_col_vec
172 :
173 137255 : CALL timeset(routineN, handle)
174 :
175 137255 : CALL dbcsr_get_info(matrix, row_blk_size=row_blk_size, distribution=dist)
176 137255 : CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
177 :
178 686275 : ALLOCATE (col_dist(npcols), col_blk_size(npcols))
179 274510 : col_blk_size(:) = ncol
180 274510 : DO i = 0, npcols - 1
181 274510 : col_dist(i + 1) = i
182 : END DO
183 137255 : CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
184 :
185 : CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, matrix_type=dbcsr_type_no_symmetry, &
186 137255 : row_blk_size=row_blk_size, col_blk_size=col_blk_size)
187 137255 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
188 :
189 137255 : CALL dbcsr_distribution_release(dist_col_vec)
190 137255 : DEALLOCATE (col_dist, col_blk_size)
191 137255 : CALL timestop(handle)
192 :
193 274510 : END SUBROUTINE create_replicated_col_vec_from_matrix
194 :
195 : ! **************************************************************************************************
196 : !> \brief creates a row vector like object whose blocks can be replicated
197 : !> along the processor col and has the same col dist as the template matrix
198 : !> the returned matrix is fully allocated and all blocks are set to 0
199 : !> this is not a sparse object (and must never be)
200 : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
201 : !> \param matrix a dbcsr matrix used as template
202 : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
203 : ! **************************************************************************************************
204 137255 : SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
205 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
206 : INTEGER :: nrow
207 :
208 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_row_vec_from_matrix'
209 :
210 : INTEGER :: handle, i, nprows
211 137255 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
212 137255 : row_dist
213 : TYPE(dbcsr_distribution_type) :: dist, dist_row_vec
214 :
215 137255 : CALL timeset(routineN, handle)
216 :
217 137255 : CALL dbcsr_get_info(matrix, distribution=dist, col_blk_size=col_blk_size)
218 137255 : CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
219 :
220 686275 : ALLOCATE (row_dist(nprows), row_blk_size(nprows))
221 409374 : row_blk_size(:) = nrow
222 409374 : DO i = 0, nprows - 1
223 409374 : row_dist(i + 1) = i
224 : END DO
225 137255 : CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
226 :
227 : CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, dbcsr_type_no_symmetry, &
228 137255 : row_blk_size=row_blk_size, col_blk_size=col_blk_size)
229 137255 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
230 :
231 137255 : CALL dbcsr_distribution_release(dist_row_vec)
232 137255 : DEALLOCATE (row_dist, row_blk_Size)
233 137255 : CALL timestop(handle)
234 :
235 274510 : END SUBROUTINE create_replicated_row_vec_from_matrix
236 :
237 : ! **************************************************************************************************
238 : !> \brief given a column vector, prepare the fast_vec_access container
239 : !> \param vec ...
240 : !> \param fast_vec_access ...
241 : ! **************************************************************************************************
242 1095375 : SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access)
243 : TYPE(dbcsr_type) :: vec
244 : TYPE(fast_vec_access_type) :: fast_vec_access
245 :
246 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access'
247 :
248 : INTEGER :: col, handle, iblock, nblk_local, &
249 : nthreads, row
250 1095375 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
251 : TYPE(dbcsr_iterator_type) :: iter
252 :
253 1095375 : CALL timeset(routineN, handle)
254 :
255 : ! figure out the number of threads
256 1095375 : nthreads = 1
257 1095375 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
258 : !$OMP MASTER
259 : !$ nthreads = OMP_GET_NUM_THREADS()
260 : !$OMP END MASTER
261 : !$OMP END PARALLEL
262 :
263 1095375 : CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
264 : ! 4 times makes sure the table is big enough to limit collisions.
265 1095375 : CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
266 : ! include zero for effective dealing with values not in the hash table (will return 0)
267 7099106 : ALLOCATE (fast_vec_access%blk_map(0:nblk_local))
268 :
269 1095375 : CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
270 1095375 : IF (col > 1) CPABORT("BUG")
271 :
272 : ! go through the blocks of the vector
273 1095375 : iblock = 0
274 1095375 : CALL dbcsr_iterator_start(iter, vec)
275 3812981 : DO WHILE (dbcsr_iterator_blocks_left(iter))
276 2717606 : CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
277 2717606 : iblock = iblock + 1
278 2717606 : CALL hash_table_add(fast_vec_access%hash_table, row, iblock)
279 2717606 : fast_vec_access%blk_map(iblock)%ptr => vec_bl
280 2717606 : fast_vec_access%blk_map(iblock)%assigned_thread = MOD(iblock, nthreads)
281 : END DO
282 1095375 : CALL dbcsr_iterator_stop(iter)
283 1095375 : CALL timestop(handle)
284 :
285 3286125 : END SUBROUTINE create_fast_col_vec_access
286 :
287 : ! **************************************************************************************************
288 : !> \brief given a row vector, prepare the fast_vec_access_container
289 : !> \param vec ...
290 : !> \param fast_vec_access ...
291 : ! **************************************************************************************************
292 1095375 : SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access)
293 : TYPE(dbcsr_type) :: vec
294 : TYPE(fast_vec_access_type) :: fast_vec_access
295 :
296 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access'
297 :
298 : INTEGER :: col, handle, iblock, nblk_local, &
299 : nthreads, row
300 1095375 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
301 : TYPE(dbcsr_iterator_type) :: iter
302 :
303 1095375 : CALL timeset(routineN, handle)
304 :
305 : ! figure out the number of threads
306 1095375 : nthreads = 1
307 1095375 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
308 : !$OMP MASTER
309 : !$ nthreads = OMP_GET_NUM_THREADS()
310 : !$OMP END MASTER
311 : !$OMP END PARALLEL
312 :
313 1095375 : CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
314 : ! 4 times makes sure the table is big enough to limit collisions.
315 1095375 : CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
316 : ! include zero for effective dealing with values not in the hash table (will return 0)
317 9791439 : ALLOCATE (fast_vec_access%blk_map(0:nblk_local))
318 :
319 : ! sanity check
320 1095375 : CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
321 1095375 : IF (row > 1) CPABORT("BUG")
322 :
323 : ! go through the blocks of the vector
324 1095375 : iblock = 0
325 1095375 : CALL dbcsr_iterator_start(iter, vec)
326 6505314 : DO WHILE (dbcsr_iterator_blocks_left(iter))
327 5409939 : CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
328 5409939 : iblock = iblock + 1
329 5409939 : CALL hash_table_add(fast_vec_access%hash_table, col, iblock)
330 5409939 : fast_vec_access%blk_map(iblock)%ptr => vec_bl
331 5409939 : fast_vec_access%blk_map(iblock)%assigned_thread = MOD(iblock, nthreads)
332 : END DO
333 1095375 : CALL dbcsr_iterator_stop(iter)
334 :
335 1095375 : CALL timestop(handle)
336 :
337 3286125 : END SUBROUTINE create_fast_row_vec_access
338 :
339 : ! **************************************************************************************************
340 : !> \brief release all memory associated with the fast_vec_access type
341 : !> \param fast_vec_access ...
342 : ! **************************************************************************************************
343 2190750 : SUBROUTINE release_fast_vec_access(fast_vec_access)
344 : TYPE(fast_vec_access_type) :: fast_vec_access
345 :
346 : CHARACTER(LEN=*), PARAMETER :: routineN = 'release_fast_vec_access'
347 :
348 : INTEGER :: handle
349 :
350 2190750 : CALL timeset(routineN, handle)
351 :
352 2190750 : CALL hash_table_release(fast_vec_access%hash_table)
353 2190750 : DEALLOCATE (fast_vec_access%blk_map)
354 :
355 2190750 : CALL timestop(handle)
356 :
357 2190750 : END SUBROUTINE release_fast_vec_access
358 :
359 : ! --------------------------------------------------------------------------------------------------
360 : ! Beginning of hashtable.
361 : ! this file can be 'INCLUDE'ed verbatim in various place, where it needs to be
362 : ! part of the module to guarantee inlining
363 : ! hashes (c,p) pairs, where p is assumed to be >0
364 : ! on return (0 is used as a flag for not present)
365 : !
366 : !
367 : ! **************************************************************************************************
368 : !> \brief finds a prime equal or larger than i, needed at creation
369 : !> \param i ...
370 : !> \return ...
371 : ! **************************************************************************************************
372 2190750 : FUNCTION matching_prime(i) RESULT(res)
373 : INTEGER, INTENT(IN) :: i
374 : INTEGER :: res
375 :
376 : INTEGER :: j
377 :
378 2190750 : res = i
379 2190750 : j = 0
380 6687606 : DO WHILE (j < res)
381 55565517 : DO j = 2, res - 1
382 53374767 : IF (MOD(res, j) == 0) THEN
383 2306106 : res = res + 1
384 2306106 : EXIT
385 : END IF
386 : END DO
387 : END DO
388 2190750 : END FUNCTION matching_prime
389 :
390 : ! **************************************************************************************************
391 : !> \brief create a hash_table of given initial size.
392 : !> the hash table will expand as needed (but this requires rehashing)
393 : !> \param hash_table ...
394 : !> \param table_size ...
395 : ! **************************************************************************************************
396 2190750 : SUBROUTINE hash_table_create(hash_table, table_size)
397 : TYPE(hash_table_type) :: hash_table
398 : INTEGER, INTENT(IN) :: table_size
399 :
400 : INTEGER :: j
401 :
402 : ! guarantee a minimal hash table size (8), so that expansion works
403 :
404 2190750 : j = 3
405 4206016 : DO WHILE (2**j - 1 < table_size)
406 2015266 : j = j + 1
407 : END DO
408 2190750 : hash_table%nmax = 2**j - 1
409 2190750 : hash_table%prime = matching_prime(hash_table%nmax)
410 2190750 : hash_table%nele = 0
411 58195890 : ALLOCATE (hash_table%table(0:hash_table%nmax))
412 2190750 : END SUBROUTINE hash_table_create
413 :
414 : ! **************************************************************************************************
415 : !> \brief ...
416 : !> \param hash_table ...
417 : ! **************************************************************************************************
418 2190750 : SUBROUTINE hash_table_release(hash_table)
419 : TYPE(hash_table_type) :: hash_table
420 :
421 2190750 : hash_table%nmax = 0
422 2190750 : hash_table%nele = 0
423 2190750 : DEALLOCATE (hash_table%table)
424 :
425 2190750 : END SUBROUTINE hash_table_release
426 :
427 : ! **************************************************************************************************
428 : !> \brief add a pair (c,p) to the hash table
429 : !> \param hash_table ...
430 : !> \param c this value is being hashed
431 : !> \param p this is being stored
432 : ! **************************************************************************************************
433 8127545 : RECURSIVE SUBROUTINE hash_table_add(hash_table, c, p)
434 : TYPE(hash_table_type), INTENT(INOUT) :: hash_table
435 : INTEGER, INTENT(IN) :: c, p
436 :
437 : REAL(KIND=real_8), PARAMETER :: hash_table_expand = 1.5_real_8, &
438 : inv_hash_table_fill = 2.5_real_8
439 :
440 : INTEGER :: i, j
441 8127545 : TYPE(ele_type), ALLOCATABLE, DIMENSION(:) :: tmp_hash
442 :
443 : ! if too small, make a copy and rehash in a larger table
444 :
445 8127545 : IF (hash_table%nele*inv_hash_table_fill > hash_table%nmax) THEN
446 0 : ALLOCATE (tmp_hash(LBOUND(hash_table%table, 1):UBOUND(hash_table%table, 1)))
447 0 : tmp_hash(:) = hash_table%table
448 0 : CALL hash_table_release(hash_table)
449 0 : CALL hash_table_create(hash_table, INT((UBOUND(tmp_hash, 1) + 8)*hash_table_expand))
450 0 : DO i = LBOUND(tmp_hash, 1), UBOUND(tmp_hash, 1)
451 0 : IF (tmp_hash(i)%c /= 0) THEN
452 0 : CALL hash_table_add(hash_table, tmp_hash(i)%c, tmp_hash(i)%p)
453 : END IF
454 : END DO
455 0 : DEALLOCATE (tmp_hash)
456 : END IF
457 :
458 8127545 : hash_table%nele = hash_table%nele + 1
459 8127545 : i = IAND(c*hash_table%prime, hash_table%nmax)
460 :
461 8127545 : DO j = i, hash_table%nmax
462 8127545 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
463 8127545 : hash_table%table(j)%c = c
464 8127545 : hash_table%table(j)%p = p
465 8127545 : RETURN
466 : END IF
467 : END DO
468 0 : DO j = 0, i - 1
469 0 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
470 0 : hash_table%table(j)%c = c
471 0 : hash_table%table(j)%p = p
472 0 : RETURN
473 : END IF
474 : END DO
475 :
476 : END SUBROUTINE hash_table_add
477 :
478 : ! **************************************************************************************************
479 : !> \brief ...
480 : !> \param hash_table ...
481 : !> \param c ...
482 : !> \return ...
483 : ! **************************************************************************************************
484 121067472 : PURE FUNCTION hash_table_get(hash_table, c) RESULT(p)
485 : TYPE(hash_table_type), INTENT(IN) :: hash_table
486 : INTEGER, INTENT(IN) :: c
487 : INTEGER :: p
488 :
489 : INTEGER :: i, j
490 :
491 121067472 : i = IAND(c*hash_table%prime, hash_table%nmax)
492 :
493 : ! catch the likely case first
494 121067472 : IF (hash_table%table(i)%c == c) THEN
495 116828891 : p = hash_table%table(i)%p
496 116828891 : RETURN
497 : END IF
498 :
499 4238581 : DO j = i, hash_table%nmax
500 4238581 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
501 4238581 : p = hash_table%table(j)%p
502 4238581 : RETURN
503 : END IF
504 : END DO
505 0 : DO j = 0, i - 1
506 0 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
507 0 : p = hash_table%table(j)%p
508 0 : RETURN
509 : END IF
510 : END DO
511 :
512 : ! we should never reach this point.
513 121067472 : p = HUGE(p)
514 :
515 : END FUNCTION hash_table_get
516 :
517 : ! End of hashtable
518 : ! --------------------------------------------------------------------------------------------------
519 :
520 : ! **************************************************************************************************
521 : !> \brief the real driver routine for the multiply, not all symmetries implemented yet
522 : !> \param matrix ...
523 : !> \param vec_in ...
524 : !> \param vec_out ...
525 : !> \param alpha ...
526 : !> \param beta ...
527 : !> \param work_row ...
528 : !> \param work_col ...
529 : ! **************************************************************************************************
530 817722 : SUBROUTINE dbcsr_matrix_colvec_multiply(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
531 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
532 : REAL(kind=real_8) :: alpha, beta
533 : TYPE(dbcsr_type) :: work_row, work_col
534 :
535 : CHARACTER :: matrix_type
536 :
537 817722 : CALL dbcsr_get_info(matrix, matrix_type=matrix_type)
538 :
539 540069 : SELECT CASE (matrix_type)
540 : CASE (dbcsr_type_no_symmetry)
541 540069 : CALL dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
542 : CASE (dbcsr_type_symmetric)
543 277653 : CALL dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
544 : CASE (dbcsr_type_antisymmetric)
545 : ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
546 0 : CPABORT("NYI, antisymmetric matrix not permitted")
547 : CASE DEFAULT
548 817722 : CPABORT("Unknown matrix type, ...")
549 : END SELECT
550 :
551 817722 : END SUBROUTINE dbcsr_matrix_colvec_multiply
552 :
553 : ! **************************************************************************************************
554 : !> \brief low level routines for matrix vector multiplies
555 : !> \param matrix ...
556 : !> \param vec_in ...
557 : !> \param vec_out ...
558 : !> \param alpha ...
559 : !> \param beta ...
560 : !> \param work_row ...
561 : !> \param work_col ...
562 : ! **************************************************************************************************
563 540069 : SUBROUTINE dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
564 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
565 : REAL(kind=real_8) :: alpha, beta
566 : TYPE(dbcsr_type) :: work_row, work_col
567 :
568 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_vector_mult'
569 :
570 : INTEGER :: col, handle, handle1, ithread, k, m, &
571 : mypcol, myprow, n, ncols, nrows, pcol, &
572 : prow, prow_handle, row
573 540069 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
574 540069 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
575 : TYPE(dbcsr_distribution_type) :: dist
576 : TYPE(dbcsr_iterator_type) :: iter
577 540069 : TYPE(fast_vec_access_type) :: fast_vec_col, fast_vec_row
578 : TYPE(mp_comm_type) :: prow_group
579 :
580 540069 : CALL timeset(routineN, handle)
581 540069 : ithread = 0
582 :
583 : ! Collect some data about the parallel environment. We will use them later to move the vector around
584 540069 : CALL dbcsr_get_info(matrix, distribution=dist)
585 540069 : CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)
586 :
587 540069 : CALL prow_group%set_handle(prow_handle)
588 :
589 540069 : CALL create_fast_row_vec_access(work_row, fast_vec_row)
590 540069 : CALL create_fast_col_vec_access(work_col, fast_vec_col)
591 :
592 : ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
593 540069 : CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)
594 :
595 : ! Set the work vector for the results to 0
596 540069 : CALL dbcsr_set(work_col, 0.0_real_8)
597 :
598 : ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
599 : ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
600 540069 : CALL timeset(routineN//"_local_mm", handle1)
601 :
602 : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,m,n,k) &
603 540069 : !$OMP SHARED(matrix,fast_vec_col,fast_vec_row)
604 : !$ ithread = omp_get_thread_num()
605 : CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
606 : DO WHILE (dbcsr_iterator_blocks_left(iter))
607 : CALL dbcsr_iterator_next_block(iter, row, col, data_d)
608 : prow = hash_table_get(fast_vec_col%hash_table, row)
609 : IF (fast_vec_col%blk_map(prow)%assigned_thread /= ithread) CYCLE
610 : m = SIZE(fast_vec_col%blk_map(prow)%ptr, 1)
611 : n = SIZE(fast_vec_col%blk_map(prow)%ptr, 2)
612 : k = SIZE(data_d, 2)
613 : IF ((m == 0) .OR. (n == 0) .OR. (k == 0)) CYCLE
614 : CPASSERT(n == 1)
615 : pcol = hash_table_get(fast_vec_row%hash_table, col)
616 : CALL dgemm('N', 'T', m, n, k, &
617 : 1.0_dp, data_d, m, &
618 : fast_vec_row%blk_map(pcol)%ptr, n, &
619 : 1.0_dp, &
620 : fast_vec_col%blk_map(prow)%ptr, m)
621 : END DO
622 : CALL dbcsr_iterator_stop(iter)
623 : !$OMP END PARALLEL
624 :
625 540069 : CALL timestop(handle1)
626 :
627 : ! sum all the data onto the first processor col where the original vector is stored
628 540069 : data_vec => dbcsr_get_data_p(work_col)
629 540069 : CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
630 11787241 : CALL prow_group%sum(data_vec(1:nrows*ncols))
631 :
632 : ! Local copy on the first mpi col (as this is the localtion of the vec_res blocks) of the result vector
633 : ! from the replicated to the original vector. Let's play it safe and use the iterator
634 540069 : CALL dbcsr_iterator_start(iter, vec_out)
635 1140563 : DO WHILE (dbcsr_iterator_blocks_left(iter))
636 600494 : CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
637 600494 : prow = hash_table_get(fast_vec_col%hash_table, row)
638 1140563 : IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
639 6824574 : vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
640 : ELSE
641 0 : vec_res(:, :) = beta*vec_res(:, :)
642 : END IF
643 : END DO
644 540069 : CALL dbcsr_iterator_stop(iter)
645 :
646 540069 : CALL release_fast_vec_access(fast_vec_row)
647 540069 : CALL release_fast_vec_access(fast_vec_col)
648 :
649 540069 : CALL timestop(handle)
650 :
651 1080138 : END SUBROUTINE dbcsr_matrix_vector_mult
652 :
653 : ! **************************************************************************************************
654 : !> \brief ...
655 : !> \param matrix ...
656 : !> \param vec_in ...
657 : !> \param vec_out ...
658 : !> \param alpha ...
659 : !> \param beta ...
660 : !> \param work_row ...
661 : !> \param work_col ...
662 : !> \param skip_diag ...
663 : ! **************************************************************************************************
664 0 : SUBROUTINE dbcsr_matrixT_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
665 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
666 : REAL(kind=real_8) :: alpha, beta
667 : TYPE(dbcsr_type) :: work_row, work_col
668 : LOGICAL :: skip_diag
669 :
670 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrixT_vector_mult'
671 :
672 : INTEGER :: col, col_size, handle, handle1, ithread, &
673 : mypcol, myprow, ncols, nrows, pcol, &
674 : pcol_handle, prow, prow_handle, row, &
675 : row_size
676 0 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
677 0 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_bl, vec_res
678 : TYPE(dbcsr_distribution_type) :: dist
679 : TYPE(dbcsr_iterator_type) :: iter
680 0 : TYPE(fast_vec_access_type) :: fast_vec_col, fast_vec_row
681 : TYPE(mp_comm_type) :: pcol_group, prow_group
682 :
683 0 : CALL timeset(routineN, handle)
684 0 : ithread = 0
685 :
686 : ! Collect some data about the parallel environment. We will use them later to move the vector around
687 0 : CALL dbcsr_get_info(matrix, distribution=dist)
688 0 : CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
689 :
690 0 : CALL prow_group%set_handle(prow_handle)
691 0 : CALL pcol_group%set_handle(pcol_handle)
692 :
693 0 : CALL create_fast_row_vec_access(work_row, fast_vec_row)
694 0 : CALL create_fast_col_vec_access(work_col, fast_vec_col)
695 :
696 : ! Set the work vector for the results to 0
697 0 : CALL dbcsr_set(work_row, 0.0_real_8)
698 :
699 : ! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
700 0 : CALL dbcsr_iterator_start(iter, vec_in)
701 0 : DO WHILE (dbcsr_iterator_blocks_left(iter))
702 0 : CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
703 0 : prow = hash_table_get(fast_vec_col%hash_table, row)
704 0 : fast_vec_col%blk_map(prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
705 : END DO
706 0 : CALL dbcsr_iterator_stop(iter)
707 : ! Replicate the data on all processore in the row
708 0 : data_vec => dbcsr_get_data_p(work_col)
709 0 : CALL prow_group%bcast(data_vec, 0)
710 :
711 : ! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
712 0 : CALL timeset(routineN//"local_mm", handle1)
713 0 : CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
714 : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
715 0 : !$OMP SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
716 : !$ ithread = omp_get_thread_num()
717 : CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
718 : DO WHILE (dbcsr_iterator_blocks_left(iter))
719 : CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
720 : IF (skip_diag .AND. col == row) CYCLE
721 : prow = hash_table_get(fast_vec_col%hash_table, row)
722 : pcol = hash_table_get(fast_vec_row%hash_table, col)
723 : IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
724 : ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
725 : IF (fast_vec_row%blk_map(pcol)%assigned_thread /= ithread) CYCLE
726 : fast_vec_row%blk_map(pcol)%ptr = fast_vec_row%blk_map(pcol)%ptr + &
727 : MATMUL(TRANSPOSE(fast_vec_col%blk_map(prow)%ptr), data_d)
728 : ELSE
729 : prow = hash_table_get(fast_vec_row%hash_table, row)
730 : pcol = hash_table_get(fast_vec_col%hash_table, col)
731 : IF (fast_vec_row%blk_map(prow)%assigned_thread /= ithread) CYCLE
732 : fast_vec_row%blk_map(prow)%ptr = fast_vec_row%blk_map(prow)%ptr + &
733 : MATMUL(TRANSPOSE(fast_vec_col%blk_map(pcol)%ptr), TRANSPOSE(data_d))
734 : END IF
735 : END DO
736 : CALL dbcsr_iterator_stop(iter)
737 : !$OMP END PARALLEL
738 :
739 0 : CALL timestop(handle1)
740 :
741 : ! sum all the data within a processor column to obtain the replicated result
742 0 : data_vec => dbcsr_get_data_p(work_row)
743 : ! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
744 0 : CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
745 0 : CALL pcol_group%sum(data_vec(1:nrows*ncols))
746 :
747 : ! Convert the result to a column wise distribution
748 0 : CALL dbcsr_rep_row_to_rep_col_vec(work_col, work_row, fast_vec_row)
749 :
750 : ! Create_the final vector by summing it to the result vector which lives on proc_col 0
751 0 : CALL dbcsr_iterator_start(iter, vec_out)
752 0 : DO WHILE (dbcsr_iterator_blocks_left(iter))
753 0 : CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
754 0 : prow = hash_table_get(fast_vec_col%hash_table, row)
755 0 : IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
756 0 : vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
757 : ELSE
758 0 : vec_res(:, :) = beta*vec_res(:, :)
759 : END IF
760 : END DO
761 0 : CALL dbcsr_iterator_stop(iter)
762 :
763 0 : CALL timestop(handle)
764 :
765 0 : END SUBROUTINE dbcsr_matrixT_vector_mult
766 :
767 : ! **************************************************************************************************
768 : !> \brief ...
769 : !> \param vec_in ...
770 : !> \param rep_col_vec ...
771 : !> \param rep_row_vec ...
772 : !> \param fast_vec_col ...
773 : ! **************************************************************************************************
774 6541776 : SUBROUTINE dbcsr_col_vec_to_rep_row(vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
775 : TYPE(dbcsr_type) :: vec_in, rep_col_vec, rep_row_vec
776 : TYPE(fast_vec_access_type), INTENT(IN) :: fast_vec_col
777 :
778 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_col_vec_to_rep_row'
779 :
780 : INTEGER :: col, handle, mypcol, myprow, ncols, &
781 : nrows, pcol_handle, prow_handle, row
782 817722 : INTEGER, DIMENSION(:), POINTER :: local_cols, row_dist
783 817722 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec, data_vec_rep
784 817722 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_row
785 : TYPE(dbcsr_distribution_type) :: dist_in, dist_rep_col
786 : TYPE(dbcsr_iterator_type) :: iter
787 : TYPE(mp_comm_type) :: pcol_group, prow_group
788 :
789 817722 : CALL timeset(routineN, handle)
790 :
791 : ! get information about the parallel environment
792 817722 : CALL dbcsr_get_info(vec_in, distribution=dist_in)
793 : CALL dbcsr_distribution_get(dist_in, &
794 : prow_group=prow_handle, &
795 : pcol_group=pcol_handle, &
796 : myprow=myprow, &
797 817722 : mypcol=mypcol)
798 :
799 817722 : CALL prow_group%set_handle(prow_handle)
800 817722 : CALL pcol_group%set_handle(pcol_handle)
801 :
802 : ! Get the vector which tells us which blocks are local to which processor row in the col vec
803 817722 : CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
804 817722 : CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)
805 :
806 : ! Copy the local vector to the replicated on the first processor column (this is where vec_in lives)
807 817722 : CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
808 817722 : data_vec_rep => dbcsr_get_data_p(rep_col_vec)
809 817722 : data_vec => dbcsr_get_data_p(vec_in)
810 24979998 : IF (mypcol == 0) data_vec_rep(1:nrows*ncols) = data_vec(1:nrows*ncols)
811 : ! Replicate the data along the row
812 24979998 : CALL prow_group%bcast(data_vec_rep(1:nrows*ncols), 0)
813 :
814 : ! Here it gets a bit tricky as we are dealing with two different parallel layouts:
815 : ! The rep_col_vec contains all blocks local to the row distribution of the vector.
816 : ! The rep_row_vec only needs the fraction which is local to the col distribution.
817 : ! However in most cases this won't the complete set of block which can be obtained from col_vector p_row i
818 : ! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available
819 : ! on every replica in the processor column, by summing along the column we end up with the complete vector everywhere
820 : ! Hope this clarifies the idea
821 817722 : CALL dbcsr_set(rep_row_vec, 0.0_real_8)
822 817722 : CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
823 817722 : CALL dbcsr_iterator_start(iter, rep_row_vec)
824 4121394 : DO WHILE (dbcsr_iterator_blocks_left(iter))
825 3303672 : CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
826 4121394 : IF (row_dist(col) == myprow) THEN
827 25821326 : vec_row = TRANSPOSE(fast_vec_col%blk_map(hash_table_get(fast_vec_col%hash_table, col))%ptr)
828 : END IF
829 : END DO
830 817722 : CALL dbcsr_iterator_stop(iter)
831 817722 : CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
832 817722 : data_vec_rep => dbcsr_get_data_p(rep_row_vec)
833 49003236 : CALL pcol_group%sum(data_vec_rep(1:ncols*nrows))
834 :
835 817722 : CALL timestop(handle)
836 :
837 817722 : END SUBROUTINE dbcsr_col_vec_to_rep_row
838 :
839 : ! **************************************************************************************************
840 : !> \brief ...
841 : !> \param rep_col_vec ...
842 : !> \param rep_row_vec ...
843 : !> \param fast_vec_row ...
844 : !> \param fast_vec_col_add ...
845 : ! **************************************************************************************************
846 1665918 : SUBROUTINE dbcsr_rep_row_to_rep_col_vec(rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
847 : TYPE(dbcsr_type) :: rep_col_vec, rep_row_vec
848 : TYPE(fast_vec_access_type) :: fast_vec_row
849 : TYPE(fast_vec_access_type), OPTIONAL :: fast_vec_col_add
850 :
851 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_rep_row_to_rep_col_vec'
852 :
853 : INTEGER :: col, handle, mypcol, myprow, ncols, &
854 : nrows, prow_handle, row
855 277653 : INTEGER, DIMENSION(:), POINTER :: col_dist
856 277653 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec_rep
857 277653 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_col
858 : TYPE(dbcsr_distribution_type) :: dist_rep_col, dist_rep_row
859 : TYPE(dbcsr_iterator_type) :: iter
860 : TYPE(mp_comm_type) :: prow_group
861 :
862 277653 : CALL timeset(routineN, handle)
863 :
864 : ! get information about the parallel environment
865 277653 : CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
866 : CALL dbcsr_distribution_get(dist_rep_col, &
867 : prow_group=prow_handle, &
868 : myprow=myprow, &
869 277653 : mypcol=mypcol)
870 :
871 277653 : CALL prow_group%set_handle(prow_handle)
872 :
873 : ! Get the vector which tells us which blocks are local to which processor col in the row vec
874 277653 : CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
875 277653 : CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)
876 :
877 : ! The same trick as described above with opposite direction
878 277653 : CALL dbcsr_set(rep_col_vec, 0.0_real_8)
879 277653 : CALL dbcsr_iterator_start(iter, rep_col_vec)
880 1336209 : DO WHILE (dbcsr_iterator_blocks_left(iter))
881 1058556 : CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
882 1058556 : IF (col_dist(row) == mypcol) THEN
883 8574664 : vec_col = TRANSPOSE(fast_vec_row%blk_map(hash_table_get(fast_vec_row%hash_table, row))%ptr)
884 : END IF
885 : ! this one is special and allows to add the elements of a not yet summed replicated
886 : ! column vector as it appears in M*V(row_rep) as result. Save an parallel summation in the symmetric case
887 1336209 : IF (PRESENT(fast_vec_col_add)) THEN
888 8574664 : vec_col = vec_col + fast_vec_col_add%blk_map(hash_table_get(fast_vec_col_add%hash_table, row))%ptr(:, :)
889 : END IF
890 : END DO
891 277653 : CALL dbcsr_iterator_stop(iter)
892 277653 : CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
893 277653 : data_vec_rep => dbcsr_get_data_p(rep_col_vec)
894 13192757 : CALL prow_group%sum(data_vec_rep(1:nrows*ncols))
895 :
896 277653 : CALL timestop(handle)
897 :
898 277653 : END SUBROUTINE dbcsr_rep_row_to_rep_col_vec
899 :
900 : ! **************************************************************************************************
901 : !> \brief ...
902 : !> \param matrix ...
903 : !> \param vec_in ...
904 : !> \param vec_out ...
905 : !> \param alpha ...
906 : !> \param beta ...
907 : !> \param work_row ...
908 : !> \param work_col ...
909 : ! **************************************************************************************************
910 277653 : SUBROUTINE dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
911 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
912 : REAL(kind=real_8) :: alpha, beta
913 : TYPE(dbcsr_type) :: work_row, work_col
914 :
915 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_sym_matrix_vector_mult'
916 :
917 : INTEGER :: col, handle, handle1, ithread, mypcol, &
918 : myprow, ncols, nrows, pcol, &
919 : pcol_handle, prow, row, rpcol, rprow, &
920 : vec_dim
921 277653 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
922 277653 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
923 : TYPE(dbcsr_distribution_type) :: dist
924 : TYPE(dbcsr_iterator_type) :: iter
925 : TYPE(dbcsr_type) :: result_col, result_row
926 277653 : TYPE(fast_vec_access_type) :: fast_vec_col, fast_vec_row, &
927 277653 : res_fast_vec_col, res_fast_vec_row
928 : TYPE(mp_comm_type) :: pcol_group
929 :
930 277653 : CALL timeset(routineN, handle)
931 277653 : ithread = 0
932 : ! We need some work matrices as we try to exploit operations on the replicated vectors which are duplicated otherwise
933 277653 : CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)
934 : ! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive
935 277653 : CALL dbcsr_set(work_col, 0.0_real_8)
936 277653 : CALL dbcsr_copy(result_col, work_col)
937 277653 : CALL dbcsr_set(work_row, 0.0_real_8)
938 277653 : CALL dbcsr_copy(result_row, work_row)
939 :
940 : ! Collect some data about the parallel environment. We will use them later to move the vector around
941 277653 : CALL dbcsr_get_info(matrix=matrix, distribution=dist)
942 277653 : CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
943 :
944 277653 : CALL pcol_group%set_handle(pcol_handle)
945 :
946 277653 : CALL create_fast_row_vec_access(work_row, fast_vec_row)
947 277653 : CALL create_fast_col_vec_access(work_col, fast_vec_col)
948 277653 : CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
949 277653 : CALL create_fast_col_vec_access(result_col, res_fast_vec_col)
950 :
951 : ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
952 277653 : CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)
953 :
954 : ! Probably I should rename the routine above as it delivers both the replicated row and column vector
955 :
956 : ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
957 : ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
958 277653 : CALL timeset(routineN//"_local_mm", handle1)
959 :
960 : !------ perform the multiplication, we have to take car to take the correct blocks ----------
961 :
962 : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
963 277653 : !$OMP SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
964 : !$ ithread = omp_get_thread_num()
965 : CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
966 : DO WHILE (dbcsr_iterator_blocks_left(iter))
967 : CALL dbcsr_iterator_next_block(iter, row, col, data_d)
968 : pcol = hash_table_get(fast_vec_row%hash_table, col)
969 : rprow = hash_table_get(res_fast_vec_col%hash_table, row)
970 : IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
971 : ASSOCIATED(res_fast_vec_col%blk_map(rprow)%ptr)) THEN
972 : IF (res_fast_vec_col%blk_map(rprow)%assigned_thread == ithread) THEN
973 : res_fast_vec_col%blk_map(rprow)%ptr = res_fast_vec_col%blk_map(rprow)%ptr + &
974 : MATMUL(data_d, TRANSPOSE(fast_vec_row%blk_map(pcol)%ptr))
975 : END IF
976 : prow = hash_table_get(fast_vec_col%hash_table, row)
977 : rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
978 : IF (res_fast_vec_row%blk_map(rpcol)%assigned_thread == ithread .AND. row /= col) THEN
979 : res_fast_vec_row%blk_map(rpcol)%ptr = res_fast_vec_row%blk_map(rpcol)%ptr + &
980 : MATMUL(TRANSPOSE(fast_vec_col%blk_map(prow)%ptr), data_d)
981 : END IF
982 : ELSE
983 : rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
984 : prow = hash_table_get(fast_vec_row%hash_table, row)
985 : IF (res_fast_vec_col%blk_map(rpcol)%assigned_thread == ithread) THEN
986 : res_fast_vec_col%blk_map(rpcol)%ptr = res_fast_vec_col%blk_map(rpcol)%ptr + &
987 : TRANSPOSE(MATMUL(fast_vec_row%blk_map(prow)%ptr, data_d))
988 : END IF
989 : rprow = hash_table_get(res_fast_vec_row%hash_table, row)
990 : pcol = hash_table_get(fast_vec_col%hash_table, col)
991 : IF (res_fast_vec_row%blk_map(rprow)%assigned_thread == ithread .AND. row /= col) THEN
992 : res_fast_vec_row%blk_map(rprow)%ptr = res_fast_vec_row%blk_map(rprow)%ptr + &
993 : TRANSPOSE(MATMUL(data_d, fast_vec_col%blk_map(pcol)%ptr))
994 : END IF
995 : END IF
996 : END DO
997 : CALL dbcsr_iterator_stop(iter)
998 : !$OMP END PARALLEL
999 :
1000 277653 : CALL timestop(handle1)
1001 :
1002 : ! sum all the data within a processor column to obtain the replicated result from lower
1003 277653 : data_vec => dbcsr_get_data_p(result_row)
1004 277653 : CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)
1005 :
1006 25998067 : CALL pcol_group%sum(data_vec(1:nrows*ncols))
1007 :
1008 : ! Convert the results to a column wise distribution, this is a bit involved as the result_row is fully replicated
1009 : ! While the result_col still has the partial results in parallel. The routine below takes care of that and saves a
1010 : ! parallel summation. Of the res_row vectors are created only taking the appropriate element (0 otherwise) while the res_col
1011 : ! parallel bits are locally added. The sum magically creates the correct vector
1012 277653 : CALL dbcsr_rep_row_to_rep_col_vec(work_col, result_row, res_fast_vec_row, res_fast_vec_col)
1013 :
1014 : ! Create_the final vector by summing it to the result vector which lives on proc_col 0 lower
1015 277653 : CALL dbcsr_iterator_start(iter, vec_out)
1016 1336209 : DO WHILE (dbcsr_iterator_blocks_left(iter))
1017 1058556 : CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
1018 1058556 : prow = hash_table_get(fast_vec_col%hash_table, row)
1019 1336209 : IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
1020 8574664 : vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map(prow)%ptr(:, :))
1021 : ELSE
1022 0 : vec_res(:, :) = beta*vec_res(:, :)
1023 : END IF
1024 : END DO
1025 277653 : CALL dbcsr_iterator_stop(iter)
1026 :
1027 277653 : CALL release_fast_vec_access(fast_vec_row)
1028 277653 : CALL release_fast_vec_access(fast_vec_col)
1029 277653 : CALL release_fast_vec_access(res_fast_vec_row)
1030 277653 : CALL release_fast_vec_access(res_fast_vec_col)
1031 :
1032 277653 : CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)
1033 :
1034 277653 : CALL timestop(handle)
1035 :
1036 555306 : END SUBROUTINE dbcsr_sym_matrix_vector_mult
1037 :
1038 0 : END MODULE arnoldi_vector
|