Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \note
10 : !> This module contains routines necessary to operate on plane waves on GPUs
11 : ! > independently of the GPU platform.
12 : !> \par History
13 : !> BGL (06-Mar-2008) : Created
14 : !> AG (18-May-2012) : Refacturing:
15 : !> - added explicit interfaces to C routines
16 : !> - enable double precision complex transformations
17 : !> AG (11-Sept-2012) : Modifications:
18 : !> - use pointers if precision mapping is not required
19 : !> - use OMP for mapping
20 : !> MT (Jan 2022) : Modifications
21 : !> - use a generic interface for fft calls to GPUs
22 : !> - Support both Nvidia and AMD GPUs. Other GPUs manufacturers
23 : !> can be added easily.
24 : !> \author Benjamin G. Levine
25 : ! **************************************************************************************************
26 : MODULE pw_gpu
27 : USE ISO_C_BINDING, ONLY: C_DOUBLE,&
28 : C_INT,&
29 : C_LOC,&
30 : C_PTR
31 : USE fft_tools, ONLY: &
32 : cube_transpose_1, cube_transpose_2, fft_scratch_sizes, fft_scratch_type, get_fft_scratch, &
33 : release_fft_scratch, x_to_yz, xz_to_yz, yz_to_x, yz_to_xz
34 : USE kinds, ONLY: dp
35 : USE mathconstants, ONLY: z_zero
36 : USE message_passing, ONLY: mp_cart_type,&
37 : mp_comm_type
38 : USE pw_grid_types, ONLY: FULLSPACE
39 : USE pw_types, ONLY: pw_c1d_gs_type,&
40 : pw_r3d_rs_type
41 : #include "../base/base_uses.f90"
42 :
43 : IMPLICIT NONE
44 :
45 : PRIVATE
46 :
47 : PUBLIC :: pw_gpu_r3dc1d_3d
48 : PUBLIC :: pw_gpu_c1dr3d_3d
49 : PUBLIC :: pw_gpu_r3dc1d_3d_ps
50 : PUBLIC :: pw_gpu_c1dr3d_3d_ps
51 : PUBLIC :: pw_gpu_init, pw_gpu_finalize
52 :
53 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'pw_gpu'
54 : LOGICAL, PARAMETER, PRIVATE :: debug_this_module = .FALSE.
55 :
56 : CONTAINS
57 :
58 : ! **************************************************************************************************
59 : !> \brief Allocates resources on the gpu device for gpu fft acceleration
60 : !> \author Ole Schuett
61 : ! **************************************************************************************************
62 16926 : SUBROUTINE pw_gpu_init()
63 : INTEGER :: dummy
64 : INTERFACE
65 : SUBROUTINE pw_gpu_init_c() BIND(C, name="pw_gpu_init")
66 : END SUBROUTINE pw_gpu_init_c
67 : END INTERFACE
68 :
69 : MARK_USED(dummy) ! TODO: fix fpretty
70 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
71 : CALL pw_gpu_init_c()
72 : #else
73 : ! Nothing to do.
74 : #endif
75 16926 : END SUBROUTINE pw_gpu_init
76 :
77 : ! **************************************************************************************************
78 : !> \brief Releases resources on the gpu device for gpu fft acceleration
79 : !> \author Ole Schuett
80 : ! **************************************************************************************************
81 16926 : SUBROUTINE pw_gpu_finalize()
82 : INTEGER :: dummy
83 : INTERFACE
84 : SUBROUTINE pw_gpu_finalize_c() BIND(C, name="pw_gpu_finalize")
85 : END SUBROUTINE pw_gpu_finalize_c
86 : END INTERFACE
87 :
88 : MARK_USED(dummy) ! TODO: fix fpretty
89 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
90 : CALL pw_gpu_finalize_c()
91 : #else
92 : ! Nothing to do.
93 : #endif
94 16926 : END SUBROUTINE pw_gpu_finalize
95 :
96 : ! **************************************************************************************************
97 : !> \brief perform an fft followed by a gather on the gpu
98 : !> \param pw1 ...
99 : !> \param pw2 ...
100 : !> \param scale ...
101 : !> \author Benjamin G Levine
102 : ! **************************************************************************************************
103 0 : SUBROUTINE pw_gpu_r3dc1d_3d(pw1, pw2, scale)
104 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
105 : TYPE(pw_c1d_gs_type), INTENT(INOUT) :: pw2
106 : REAL(KIND=dp), INTENT(IN) :: scale
107 :
108 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_r3dc1d_3d'
109 :
110 : COMPLEX(KIND=dp), POINTER :: ptr_pwout
111 : INTEGER :: handle, l1, l2, l3, ngpts
112 0 : INTEGER, DIMENSION(:), POINTER :: npts
113 : INTEGER, POINTER :: ptr_ghatmap
114 : REAL(KIND=dp), POINTER :: ptr_pwin
115 : INTERFACE
116 : SUBROUTINE pw_gpu_cfffg_c(din, zout, ghatmap, npts, ngpts, scale) BIND(C, name="pw_gpu_cfffg")
117 : IMPORT
118 : TYPE(C_PTR), INTENT(IN), VALUE :: din
119 : TYPE(C_PTR), VALUE :: zout
120 : TYPE(C_PTR), INTENT(IN), VALUE :: ghatmap
121 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
122 : INTEGER(KIND=C_INT), INTENT(IN), VALUE :: ngpts
123 : REAL(KIND=C_DOUBLE), INTENT(IN), VALUE :: scale
124 :
125 : END SUBROUTINE pw_gpu_cfffg_c
126 : END INTERFACE
127 :
128 0 : CALL timeset(routineN, handle)
129 :
130 0 : ngpts = SIZE(pw2%pw_grid%gsq)
131 0 : l1 = LBOUND(pw1%array, 1)
132 0 : l2 = LBOUND(pw1%array, 2)
133 0 : l3 = LBOUND(pw1%array, 3)
134 0 : npts => pw1%pw_grid%npts
135 :
136 : ! pointers to data arrays
137 0 : ptr_pwin => pw1%array(l1, l2, l3)
138 0 : ptr_pwout => pw2%array(1)
139 :
140 : ! pointer to map array
141 0 : ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
142 :
143 : ! invoke the combined transformation
144 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
145 : CALL pw_gpu_cfffg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, scale)
146 : #else
147 : MARK_USED(scale)
148 0 : CPABORT("Compiled without pw offloading.")
149 : #endif
150 :
151 0 : CALL timestop(handle)
152 0 : END SUBROUTINE pw_gpu_r3dc1d_3d
153 :
154 : ! **************************************************************************************************
155 : !> \brief perform an scatter followed by a fft on the gpu
156 : !> \param pw1 ...
157 : !> \param pw2 ...
158 : !> \param scale ...
159 : !> \author Benjamin G Levine
160 : ! **************************************************************************************************
161 0 : SUBROUTINE pw_gpu_c1dr3d_3d(pw1, pw2, scale)
162 : TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
163 : TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw2
164 : REAL(KIND=dp), INTENT(IN) :: scale
165 :
166 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_c1dr3d_3d'
167 :
168 : COMPLEX(KIND=dp), POINTER :: ptr_pwin
169 : INTEGER :: handle, l1, l2, l3, ngpts, nmaps
170 0 : INTEGER, DIMENSION(:), POINTER :: npts
171 : INTEGER, POINTER :: ptr_ghatmap
172 : REAL(KIND=dp), POINTER :: ptr_pwout
173 : INTERFACE
174 : SUBROUTINE pw_gpu_sfffc_c(zin, dout, ghatmap, npts, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sfffc")
175 : IMPORT
176 : TYPE(C_PTR), INTENT(IN), VALUE :: zin
177 : TYPE(C_PTR), VALUE :: dout
178 : TYPE(C_PTR), INTENT(IN), VALUE :: ghatmap
179 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
180 : INTEGER(KIND=C_INT), INTENT(IN), VALUE :: ngpts, nmaps
181 : REAL(KIND=C_DOUBLE), INTENT(IN), VALUE :: scale
182 : END SUBROUTINE pw_gpu_sfffc_c
183 : END INTERFACE
184 :
185 0 : CALL timeset(routineN, handle)
186 :
187 0 : ngpts = SIZE(pw1%pw_grid%gsq)
188 0 : l1 = LBOUND(pw2%array, 1)
189 0 : l2 = LBOUND(pw2%array, 2)
190 0 : l3 = LBOUND(pw2%array, 3)
191 0 : npts => pw1%pw_grid%npts
192 :
193 : ! pointers to data arrays
194 0 : ptr_pwin => pw1%array(1)
195 0 : ptr_pwout => pw2%array(l1, l2, l3)
196 :
197 : ! pointer to map array
198 0 : nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
199 0 : ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
200 :
201 : ! invoke the combined transformation
202 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
203 : CALL pw_gpu_sfffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, nmaps, scale)
204 : #else
205 : MARK_USED(scale)
206 0 : CPABORT("Compiled without pw offloading")
207 : #endif
208 :
209 0 : CALL timestop(handle)
210 0 : END SUBROUTINE pw_gpu_c1dr3d_3d
211 :
212 : ! **************************************************************************************************
213 : !> \brief perform an parallel fft followed by a gather on the gpu
214 : !> \param pw1 ...
215 : !> \param pw2 ...
216 : !> \param scale ...
217 : !> \author Andreas Gloess
218 : ! **************************************************************************************************
219 0 : SUBROUTINE pw_gpu_r3dc1d_3d_ps(pw1, pw2, scale)
220 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
221 : TYPE(pw_c1d_gs_type), INTENT(INOUT) :: pw2
222 : REAL(KIND=dp), INTENT(IN) :: scale
223 :
224 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_r3dc1d_3d_ps'
225 :
226 0 : COMPLEX(KIND=dp), DIMENSION(:, :), POINTER :: grays, pbuf, qbuf, rbuf, sbuf
227 0 : COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER :: tbuf
228 : INTEGER :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
229 : mz2, n1, n2, ngpts, nmax, numtask, &
230 : numtask_g, numtask_r, rp
231 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: p2p
232 : INTEGER, DIMENSION(2) :: r_dim, r_pos
233 0 : INTEGER, DIMENSION(:), POINTER :: n, nloc, nyzray
234 0 : INTEGER, DIMENSION(:, :, :, :), POINTER :: bo
235 0 : TYPE(fft_scratch_sizes) :: fft_scratch_size
236 : TYPE(fft_scratch_type), POINTER :: fft_scratch
237 0 : TYPE(mp_cart_type) :: rs_group
238 : TYPE(mp_comm_type) :: gs_group
239 :
240 0 : CALL timeset(routineN, handle)
241 :
242 : ! dimensions
243 0 : n => pw1%pw_grid%npts
244 0 : nloc => pw1%pw_grid%npts_local
245 0 : grays => pw1%pw_grid%grays
246 0 : ngpts = nloc(1)*nloc(2)*nloc(3)
247 :
248 : !..transform
249 0 : IF (pw1%pw_grid%para%ray_distribution) THEN
250 0 : gs_group = pw1%pw_grid%para%group
251 0 : rs_group = pw1%pw_grid%para%rs_group
252 0 : nyzray => pw1%pw_grid%para%nyzray
253 0 : bo => pw1%pw_grid%para%bo
254 :
255 0 : numtask_g = gs_group%num_pe
256 0 : g_pos = gs_group%mepos
257 0 : numtask_r = rs_group%num_pe
258 0 : r_dim = rs_group%num_pe_cart
259 0 : r_pos = rs_group%mepos_cart
260 0 : IF (numtask_g /= numtask_r) THEN
261 0 : CPABORT("Real space and G space groups are different.")
262 : END IF
263 0 : numtask = numtask_r
264 :
265 0 : lg = SIZE(grays, 1)
266 0 : mg = SIZE(grays, 2)
267 0 : mmax = MAX(mg, 1)
268 0 : lmax = MAX(lg, (ngpts/mmax + 1))
269 :
270 0 : ALLOCATE (p2p(0:numtask - 1))
271 :
272 0 : CALL gs_group%rank_compare(rs_group, p2p)
273 :
274 0 : rp = p2p(g_pos)
275 0 : mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
276 0 : mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
277 0 : n1 = MAXVAL(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
278 0 : n2 = MAXVAL(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
279 0 : nmax = MAX((2*n2)/numtask, 2)*mx2*mz2
280 0 : nmax = MAX(nmax, n1*MAXVAL(nyzray))
281 :
282 0 : fft_scratch_size%nx = nloc(1)
283 0 : fft_scratch_size%ny = nloc(2)
284 0 : fft_scratch_size%nz = nloc(3)
285 0 : fft_scratch_size%lmax = lmax
286 0 : fft_scratch_size%mmax = mmax
287 0 : fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
288 0 : fft_scratch_size%mx2 = mx2
289 0 : fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
290 0 : fft_scratch_size%mz2 = mz2
291 0 : fft_scratch_size%lg = lg
292 0 : fft_scratch_size%mg = mg
293 0 : fft_scratch_size%nbx = MAXVAL(bo(2, 1, :, 2))
294 0 : fft_scratch_size%nbz = MAXVAL(bo(2, 3, :, 2))
295 0 : fft_scratch_size%mcz1 = MAXVAL(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
296 0 : fft_scratch_size%mcx2 = MAXVAL(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
297 0 : fft_scratch_size%mcz2 = MAXVAL(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
298 0 : fft_scratch_size%nmax = nmax
299 0 : fft_scratch_size%nmray = MAXVAL(nyzray)
300 0 : fft_scratch_size%nyzray = nyzray(g_pos)
301 0 : fft_scratch_size%gs_group = gs_group
302 0 : fft_scratch_size%rs_group = rs_group
303 0 : fft_scratch_size%g_pos = g_pos
304 0 : fft_scratch_size%r_pos = r_pos
305 0 : fft_scratch_size%r_dim = r_dim
306 0 : fft_scratch_size%numtask = numtask
307 :
308 0 : IF (r_dim(2) > 1) THEN
309 : !
310 : ! real space is distributed over x and y coordinate
311 : ! we have two stages of communication
312 : !
313 0 : IF (r_dim(1) == 1) &
314 0 : CPABORT("This processor distribution is not supported.")
315 :
316 0 : CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
317 :
318 : ! assign buffers
319 0 : qbuf => fft_scratch%p2buf
320 0 : rbuf => fft_scratch%p3buf
321 0 : pbuf => fft_scratch%p4buf
322 0 : sbuf => fft_scratch%p5buf
323 :
324 : ! FFT along z
325 0 : CALL pw_gpu_cf(pw1, qbuf)
326 :
327 : ! Exchange data ( transpose of matrix )
328 0 : CALL cube_transpose_2(qbuf, bo(:, :, :, 1), bo(:, :, :, 2), rbuf, fft_scratch)
329 :
330 : ! FFT along y
331 : ! use the inbuild fft-lib
332 : ! CALL fft_1dm(fft_scratch%fft_plan(2), rbuf, pbuf, 1.0_dp, stat)
333 : ! or cufft (works faster, but is only faster if plans are stored)
334 0 : CALL pw_gpu_f(rbuf, pbuf, +1, n(2), mx2*mz2)
335 :
336 : ! Exchange data ( transpose of matrix ) and sort
337 : CALL xz_to_yz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
338 0 : bo(:, :, :, 2), sbuf, fft_scratch)
339 :
340 : ! FFT along x
341 0 : CALL pw_gpu_fg(sbuf, pw2, scale)
342 :
343 0 : CALL release_fft_scratch(fft_scratch)
344 :
345 : ELSE
346 : !
347 : ! real space is only distributed over x coordinate
348 : ! we have one stage of communication, after the transform of
349 : ! direction x
350 : !
351 :
352 0 : CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
353 :
354 : ! assign buffers
355 0 : tbuf => fft_scratch%tbuf
356 0 : sbuf => fft_scratch%r1buf
357 :
358 : ! FFT along y and z
359 0 : CALL pw_gpu_cff(pw1, tbuf)
360 :
361 : ! Exchange data ( transpose of matrix ) and sort
362 : CALL yz_to_x(tbuf, gs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
363 0 : bo(:, :, :, 2), sbuf, fft_scratch)
364 :
365 : ! FFT along x
366 0 : CALL pw_gpu_fg(sbuf, pw2, scale)
367 :
368 0 : CALL release_fft_scratch(fft_scratch)
369 :
370 : END IF
371 :
372 0 : DEALLOCATE (p2p)
373 :
374 : !--------------------------------------------------------------------------
375 : ELSE
376 0 : CPABORT("Not implemented (no ray_distr.) in: pw_gpu_r3dc1d_3d_ps.")
377 : !CALL fft3d ( dir, n, pwin, grays, pw1%pw_grid%para%rs_group, &
378 : ! pw1%pw_grid%para%bo, scale = scale, debug=test )
379 : END IF
380 :
381 0 : CALL timestop(handle)
382 0 : END SUBROUTINE pw_gpu_r3dc1d_3d_ps
383 :
384 : ! **************************************************************************************************
385 : !> \brief perform an parallel scatter followed by a fft on the gpu
386 : !> \param pw1 ...
387 : !> \param pw2 ...
388 : !> \param scale ...
389 : !> \author Andreas Gloess
390 : ! **************************************************************************************************
391 0 : SUBROUTINE pw_gpu_c1dr3d_3d_ps(pw1, pw2, scale)
392 : TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
393 : TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw2
394 : REAL(KIND=dp), INTENT(IN) :: scale
395 :
396 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_c1dr3d_3d_ps'
397 :
398 0 : COMPLEX(KIND=dp), DIMENSION(:, :), POINTER :: grays, pbuf, qbuf, rbuf, sbuf
399 0 : COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER :: tbuf
400 : INTEGER :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
401 : mz2, n1, n2, ngpts, nmax, numtask, &
402 : numtask_g, numtask_r, rp
403 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: p2p
404 : INTEGER, DIMENSION(2) :: r_dim, r_pos
405 0 : INTEGER, DIMENSION(:), POINTER :: n, nloc, nyzray
406 0 : INTEGER, DIMENSION(:, :, :, :), POINTER :: bo
407 0 : TYPE(fft_scratch_sizes) :: fft_scratch_size
408 : TYPE(fft_scratch_type), POINTER :: fft_scratch
409 0 : TYPE(mp_cart_type) :: rs_group
410 : TYPE(mp_comm_type) :: gs_group
411 :
412 0 : CALL timeset(routineN, handle)
413 :
414 : ! dimensions
415 0 : n => pw1%pw_grid%npts
416 0 : nloc => pw1%pw_grid%npts_local
417 0 : grays => pw1%pw_grid%grays
418 0 : ngpts = nloc(1)*nloc(2)*nloc(3)
419 :
420 : !..transform
421 0 : IF (pw1%pw_grid%para%ray_distribution) THEN
422 0 : gs_group = pw1%pw_grid%para%group
423 0 : rs_group = pw1%pw_grid%para%rs_group
424 0 : nyzray => pw1%pw_grid%para%nyzray
425 0 : bo => pw1%pw_grid%para%bo
426 :
427 0 : numtask_g = gs_group%num_pe
428 0 : g_pos = gs_group%mepos
429 0 : numtask_r = rs_group%num_pe
430 0 : r_dim = rs_group%num_pe_cart
431 0 : r_pos = rs_group%mepos_cart
432 0 : IF (numtask_g /= numtask_r) THEN
433 0 : CPABORT("Real space and G space groups are different.")
434 : END IF
435 0 : numtask = numtask_r
436 :
437 0 : lg = SIZE(grays, 1)
438 0 : mg = SIZE(grays, 2)
439 0 : mmax = MAX(mg, 1)
440 0 : lmax = MAX(lg, (ngpts/mmax + 1))
441 :
442 0 : ALLOCATE (p2p(0:numtask - 1))
443 :
444 0 : CALL gs_group%rank_compare(rs_group, p2p)
445 :
446 0 : rp = p2p(g_pos)
447 0 : mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
448 0 : mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
449 0 : n1 = MAXVAL(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
450 0 : n2 = MAXVAL(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
451 0 : nmax = MAX((2*n2)/numtask, 2)*mx2*mz2
452 0 : nmax = MAX(nmax, n1*MAXVAL(nyzray))
453 :
454 0 : fft_scratch_size%nx = nloc(1)
455 0 : fft_scratch_size%ny = nloc(2)
456 0 : fft_scratch_size%nz = nloc(3)
457 0 : fft_scratch_size%lmax = lmax
458 0 : fft_scratch_size%mmax = mmax
459 0 : fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
460 0 : fft_scratch_size%mx2 = mx2
461 0 : fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
462 0 : fft_scratch_size%mz2 = mz2
463 0 : fft_scratch_size%lg = lg
464 0 : fft_scratch_size%mg = mg
465 0 : fft_scratch_size%nbx = MAXVAL(bo(2, 1, :, 2))
466 0 : fft_scratch_size%nbz = MAXVAL(bo(2, 3, :, 2))
467 0 : fft_scratch_size%mcz1 = MAXVAL(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
468 0 : fft_scratch_size%mcx2 = MAXVAL(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
469 0 : fft_scratch_size%mcz2 = MAXVAL(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
470 0 : fft_scratch_size%nmax = nmax
471 0 : fft_scratch_size%nmray = MAXVAL(nyzray)
472 0 : fft_scratch_size%nyzray = nyzray(g_pos)
473 0 : fft_scratch_size%gs_group = gs_group
474 0 : fft_scratch_size%rs_group = rs_group
475 0 : fft_scratch_size%g_pos = g_pos
476 0 : fft_scratch_size%r_pos = r_pos
477 0 : fft_scratch_size%r_dim = r_dim
478 0 : fft_scratch_size%numtask = numtask
479 :
480 0 : IF (r_dim(2) > 1) THEN
481 : !
482 : ! real space is distributed over x and y coordinate
483 : ! we have two stages of communication
484 : !
485 0 : IF (r_dim(1) == 1) &
486 0 : CPABORT("This processor distribution is not supported.")
487 :
488 0 : CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
489 :
490 : ! assign buffers
491 0 : pbuf => fft_scratch%p7buf
492 0 : qbuf => fft_scratch%p4buf
493 0 : rbuf => fft_scratch%p3buf
494 0 : sbuf => fft_scratch%p2buf
495 :
496 : ! FFT along x
497 0 : CALL pw_gpu_sf(pw1, pbuf, scale)
498 :
499 : ! Exchange data ( transpose of matrix ) and sort
500 0 : IF (pw1%pw_grid%grid_span /= FULLSPACE) qbuf = z_zero
501 : CALL yz_to_xz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
502 0 : bo(:, :, :, 2), qbuf, fft_scratch)
503 :
504 : ! FFT along y
505 : ! use the inbuild fft-lib
506 : ! CALL fft_1dm(fft_scratch%fft_plan(5), qbuf, rbuf, 1.0_dp, stat)
507 : ! or cufft (works faster, but is only faster if plans are stored)
508 0 : CALL pw_gpu_f(qbuf, rbuf, -1, n(2), mx2*mz2)
509 :
510 : ! Exchange data ( transpose of matrix )
511 0 : IF (pw1%pw_grid%grid_span /= FULLSPACE) sbuf = z_zero
512 :
513 0 : CALL cube_transpose_1(rbuf, bo(:, :, :, 2), bo(:, :, :, 1), sbuf, fft_scratch)
514 :
515 : ! FFT along z
516 0 : CALL pw_gpu_fc(sbuf, pw2)
517 :
518 0 : CALL release_fft_scratch(fft_scratch)
519 :
520 : ELSE
521 : !
522 : ! real space is only distributed over x coordinate
523 : ! we have one stage of communication, after the transform of
524 : ! direction x
525 : !
526 :
527 0 : CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
528 :
529 : ! assign buffers
530 0 : sbuf => fft_scratch%r1buf
531 0 : tbuf => fft_scratch%tbuf
532 :
533 : ! FFT along x
534 0 : CALL pw_gpu_sf(pw1, sbuf, scale)
535 :
536 : ! Exchange data ( transpose of matrix ) and sort
537 0 : IF (pw1%pw_grid%grid_span /= FULLSPACE) tbuf = z_zero
538 : CALL x_to_yz(sbuf, gs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
539 0 : bo(:, :, :, 2), tbuf, fft_scratch)
540 :
541 : ! FFT along y and z
542 0 : CALL pw_gpu_ffc(tbuf, pw2)
543 :
544 0 : CALL release_fft_scratch(fft_scratch)
545 :
546 : END IF
547 :
548 0 : DEALLOCATE (p2p)
549 :
550 : !--------------------------------------------------------------------------
551 : ELSE
552 0 : CPABORT("Not implemented (no ray_distr.) in: pw_gpu_c1dr3d_3d_ps.")
553 : !CALL fft3d ( dir, n, pwin, grays, pw1%pw_grid%para%rs_group, &
554 : ! pw1%pw_grid%para%bo, scale = scale, debug=test )
555 : END IF
556 :
557 0 : CALL timestop(handle)
558 0 : END SUBROUTINE pw_gpu_c1dr3d_3d_ps
559 :
560 : ! **************************************************************************************************
561 : !> \brief perform a parallel real_to_complex copy followed by a 2D-FFT on the gpu
562 : !> \param pw1 ...
563 : !> \param pwbuf ...
564 : !> \author Andreas Gloess
565 : ! **************************************************************************************************
566 0 : SUBROUTINE pw_gpu_cff(pw1, pwbuf)
567 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
568 : COMPLEX(KIND=dp), DIMENSION(:, :, :), &
569 : INTENT(INOUT), TARGET :: pwbuf
570 :
571 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_cff'
572 :
573 : COMPLEX(KIND=dp), POINTER :: ptr_pwout
574 : INTEGER :: handle, l1, l2, l3
575 0 : INTEGER, DIMENSION(:), POINTER :: npts
576 : REAL(KIND=dp), POINTER :: ptr_pwin
577 : INTERFACE
578 : SUBROUTINE pw_gpu_cff_c(din, zout, npts) BIND(C, name="pw_gpu_cff")
579 : IMPORT
580 : TYPE(C_PTR), INTENT(IN), VALUE :: din
581 : TYPE(C_PTR), VALUE :: zout
582 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
583 : END SUBROUTINE pw_gpu_cff_c
584 : END INTERFACE
585 :
586 0 : CALL timeset(routineN, handle)
587 :
588 : ! dimensions
589 0 : npts => pw1%pw_grid%npts_local
590 0 : l1 = LBOUND(pw1%array, 1)
591 0 : l2 = LBOUND(pw1%array, 2)
592 0 : l3 = LBOUND(pw1%array, 3)
593 :
594 : ! pointers to data arrays
595 0 : ptr_pwin => pw1%array(l1, l2, l3)
596 0 : ptr_pwout => pwbuf(1, 1, 1)
597 :
598 : ! invoke the combined transformation
599 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
600 : CALL pw_gpu_cff_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
601 : #else
602 0 : CPABORT("Compiled without pw offloading")
603 : #endif
604 :
605 0 : CALL timestop(handle)
606 0 : END SUBROUTINE pw_gpu_cff
607 :
608 : ! **************************************************************************************************
609 : !> \brief perform a parallel 2D-FFT followed by a complex_to_real copy on the gpu
610 : !> \param pwbuf ...
611 : !> \param pw2 ...
612 : !> \author Andreas Gloess
613 : ! **************************************************************************************************
614 0 : SUBROUTINE pw_gpu_ffc(pwbuf, pw2)
615 : COMPLEX(KIND=dp), DIMENSION(:, :, :), INTENT(IN), &
616 : TARGET :: pwbuf
617 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw2
618 :
619 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_ffc'
620 :
621 : COMPLEX(KIND=dp), POINTER :: ptr_pwin
622 : INTEGER :: handle, l1, l2, l3
623 0 : INTEGER, DIMENSION(:), POINTER :: npts
624 : REAL(KIND=dp), POINTER :: ptr_pwout
625 : INTERFACE
626 : SUBROUTINE pw_gpu_ffc_c(zin, dout, npts) BIND(C, name="pw_gpu_ffc")
627 : IMPORT
628 : TYPE(C_PTR), INTENT(IN), VALUE :: zin
629 : TYPE(C_PTR), VALUE :: dout
630 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
631 : END SUBROUTINE pw_gpu_ffc_c
632 : END INTERFACE
633 :
634 0 : CALL timeset(routineN, handle)
635 :
636 : ! dimensions
637 0 : npts => pw2%pw_grid%npts_local
638 0 : l1 = LBOUND(pw2%array, 1)
639 0 : l2 = LBOUND(pw2%array, 2)
640 0 : l3 = LBOUND(pw2%array, 3)
641 :
642 : ! pointers to data arrays
643 0 : ptr_pwin => pwbuf(1, 1, 1)
644 0 : ptr_pwout => pw2%array(l1, l2, l3)
645 :
646 : ! invoke the combined transformation
647 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
648 : CALL pw_gpu_ffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
649 : #else
650 0 : CPABORT("Compiled without pw offloading")
651 : #endif
652 :
653 0 : CALL timestop(handle)
654 0 : END SUBROUTINE pw_gpu_ffc
655 :
656 : ! **************************************************************************************************
657 : !> \brief perform a parallel real_to_complex copy followed by a 1D-FFT on the gpu
658 : !> \param pw1 ...
659 : !> \param pwbuf ...
660 : !> \author Andreas Gloess
661 : ! **************************************************************************************************
662 0 : SUBROUTINE pw_gpu_cf(pw1, pwbuf)
663 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
664 : COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
665 : TARGET :: pwbuf
666 :
667 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_cf'
668 :
669 : COMPLEX(KIND=dp), POINTER :: ptr_pwout
670 : INTEGER :: handle, l1, l2, l3
671 0 : INTEGER, DIMENSION(:), POINTER :: npts
672 : REAL(KIND=dp), POINTER :: ptr_pwin
673 : INTERFACE
674 : SUBROUTINE pw_gpu_cf_c(din, zout, npts) BIND(C, name="pw_gpu_cf")
675 : IMPORT
676 : TYPE(C_PTR), INTENT(IN), VALUE :: din
677 : TYPE(C_PTR), VALUE :: zout
678 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
679 : END SUBROUTINE pw_gpu_cf_c
680 : END INTERFACE
681 :
682 0 : CALL timeset(routineN, handle)
683 :
684 : ! dimensions
685 0 : npts => pw1%pw_grid%npts_local
686 0 : l1 = LBOUND(pw1%array, 1)
687 0 : l2 = LBOUND(pw1%array, 2)
688 0 : l3 = LBOUND(pw1%array, 3)
689 :
690 : ! pointers to data arrays
691 0 : ptr_pwin => pw1%array(l1, l2, l3)
692 0 : ptr_pwout => pwbuf(1, 1)
693 :
694 : ! invoke the combined transformation
695 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
696 : CALL pw_gpu_cf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
697 : #else
698 0 : CPABORT("Compiled without pw offloading")
699 : #endif
700 0 : CALL timestop(handle)
701 0 : END SUBROUTINE pw_gpu_cf
702 :
703 : ! **************************************************************************************************
704 : !> \brief perform a parallel 1D-FFT followed by a complex_to_real copy on the gpu
705 : !> \param pwbuf ...
706 : !> \param pw2 ...
707 : !> \author Andreas Gloess
708 : ! **************************************************************************************************
709 0 : SUBROUTINE pw_gpu_fc(pwbuf, pw2)
710 : COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
711 : TARGET :: pwbuf
712 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw2
713 :
714 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_fc'
715 :
716 : COMPLEX(KIND=dp), POINTER :: ptr_pwin
717 : INTEGER :: handle, l1, l2, l3
718 0 : INTEGER, DIMENSION(:), POINTER :: npts
719 : REAL(KIND=dp), POINTER :: ptr_pwout
720 : INTERFACE
721 : SUBROUTINE pw_gpu_fc_c(zin, dout, npts) BIND(C, name="pw_gpu_fc")
722 : IMPORT
723 : TYPE(C_PTR), INTENT(IN), VALUE :: zin
724 : TYPE(C_PTR), VALUE :: dout
725 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
726 : END SUBROUTINE pw_gpu_fc_c
727 : END INTERFACE
728 :
729 0 : CALL timeset(routineN, handle)
730 :
731 0 : npts => pw2%pw_grid%npts_local
732 0 : l1 = LBOUND(pw2%array, 1)
733 0 : l2 = LBOUND(pw2%array, 2)
734 0 : l3 = LBOUND(pw2%array, 3)
735 :
736 : ! pointers to data arrays
737 0 : ptr_pwin => pwbuf(1, 1)
738 0 : ptr_pwout => pw2%array(l1, l2, l3)
739 :
740 : ! invoke the combined transformation
741 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
742 : CALL pw_gpu_fc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
743 : #else
744 0 : CPABORT("Compiled without pw offloading")
745 : #endif
746 :
747 0 : CALL timestop(handle)
748 0 : END SUBROUTINE pw_gpu_fc
749 :
750 : ! **************************************************************************************************
751 : !> \brief perform a parallel 1D-FFT on the gpu
752 : !> \param pwbuf1 ...
753 : !> \param pwbuf2 ...
754 : !> \param dir ...
755 : !> \param n ...
756 : !> \param m ...
757 : !> \author Andreas Gloess
758 : ! **************************************************************************************************
759 0 : SUBROUTINE pw_gpu_f(pwbuf1, pwbuf2, dir, n, m)
760 : COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
761 : TARGET :: pwbuf1
762 : COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
763 : TARGET :: pwbuf2
764 : INTEGER, INTENT(IN) :: dir, n, m
765 :
766 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_f'
767 :
768 : COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
769 : INTEGER :: handle
770 : INTERFACE
771 : SUBROUTINE pw_gpu_f_c(zin, zout, dir, n, m) BIND(C, name="pw_gpu_f")
772 : IMPORT
773 : TYPE(C_PTR), INTENT(IN), VALUE :: zin
774 : TYPE(C_PTR), VALUE :: zout
775 : INTEGER(KIND=C_INT), INTENT(IN), VALUE :: dir, n, m
776 : END SUBROUTINE pw_gpu_f_c
777 : END INTERFACE
778 :
779 0 : CALL timeset(routineN, handle)
780 :
781 0 : IF (n*m /= 0) THEN
782 : ! pointers to data arrays
783 0 : ptr_pwin => pwbuf1(1, 1)
784 0 : ptr_pwout => pwbuf2(1, 1)
785 :
786 : ! invoke the combined transformation
787 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
788 : CALL pw_gpu_f_c(c_loc(ptr_pwin), c_loc(ptr_pwout), dir, n, m)
789 : #else
790 : MARK_USED(dir)
791 0 : CPABORT("Compiled without pw offloading")
792 : #endif
793 : END IF
794 :
795 0 : CALL timestop(handle)
796 0 : END SUBROUTINE pw_gpu_f
797 : ! **************************************************************************************************
798 : !> \brief perform a parallel 1D-FFT followed by a gather on the gpu
799 : !> \param pwbuf ...
800 : !> \param pw2 ...
801 : !> \param scale ...
802 : !> \author Andreas Gloess
803 : ! **************************************************************************************************
804 0 : SUBROUTINE pw_gpu_fg(pwbuf, pw2, scale)
805 : COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
806 : TARGET :: pwbuf
807 : TYPE(pw_c1d_gs_type), INTENT(IN) :: pw2
808 : REAL(KIND=dp), INTENT(IN) :: scale
809 :
810 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_fg'
811 :
812 : COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
813 : INTEGER :: handle, mg, mmax, ngpts
814 0 : INTEGER, DIMENSION(:), POINTER :: npts
815 : INTEGER, POINTER :: ptr_ghatmap
816 : INTERFACE
817 : SUBROUTINE pw_gpu_fg_c(zin, zout, ghatmap, npts, mmax, ngpts, scale) BIND(C, name="pw_gpu_fg")
818 : IMPORT
819 : TYPE(C_PTR), INTENT(IN), VALUE :: zin
820 : TYPE(C_PTR), VALUE :: zout
821 : TYPE(C_PTR), INTENT(IN), VALUE :: ghatmap
822 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
823 : INTEGER(KIND=C_INT), INTENT(IN), VALUE :: mmax, ngpts
824 : REAL(KIND=C_DOUBLE), INTENT(IN), VALUE :: scale
825 :
826 : END SUBROUTINE pw_gpu_fg_c
827 : END INTERFACE
828 :
829 0 : CALL timeset(routineN, handle)
830 :
831 0 : ngpts = SIZE(pw2%pw_grid%gsq)
832 0 : npts => pw2%pw_grid%npts
833 :
834 0 : IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
835 0 : mg = SIZE(pw2%pw_grid%grays, 2)
836 0 : mmax = MAX(mg, 1)
837 :
838 : ! pointers to data arrays
839 0 : ptr_pwin => pwbuf(1, 1)
840 0 : ptr_pwout => pw2%array(1)
841 :
842 : ! pointer to map array
843 0 : ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
844 :
845 : ! invoke the combined transformation
846 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
847 : CALL pw_gpu_fg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, scale)
848 : #else
849 : MARK_USED(scale)
850 0 : CPABORT("Compiled without pw offloading")
851 : #endif
852 : END IF
853 :
854 0 : CALL timestop(handle)
855 0 : END SUBROUTINE pw_gpu_fg
856 :
857 : ! **************************************************************************************************
858 : !> \brief perform a parallel scatter followed by a 1D-FFT on the gpu
859 : !> \param pw1 ...
860 : !> \param pwbuf ...
861 : !> \param scale ...
862 : !> \author Andreas Gloess
863 : ! **************************************************************************************************
864 0 : SUBROUTINE pw_gpu_sf(pw1, pwbuf, scale)
865 : TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
866 : COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
867 : TARGET :: pwbuf
868 : REAL(KIND=dp), INTENT(IN) :: scale
869 :
870 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_sf'
871 :
872 : COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
873 : INTEGER :: handle, mg, mmax, ngpts, nmaps
874 0 : INTEGER, DIMENSION(:), POINTER :: npts
875 : INTEGER, POINTER :: ptr_ghatmap
876 : INTERFACE
877 : SUBROUTINE pw_gpu_sf_c(zin, zout, ghatmap, npts, mmax, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sf")
878 : IMPORT
879 : TYPE(C_PTR), INTENT(IN), VALUE :: zin
880 : TYPE(C_PTR), VALUE :: zout
881 : TYPE(C_PTR), INTENT(IN), VALUE :: ghatmap
882 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
883 : INTEGER(KIND=C_INT), INTENT(IN), VALUE :: mmax, ngpts, nmaps
884 : REAL(KIND=C_DOUBLE), INTENT(IN), VALUE :: scale
885 :
886 : END SUBROUTINE pw_gpu_sf_c
887 : END INTERFACE
888 :
889 0 : CALL timeset(routineN, handle)
890 :
891 0 : ngpts = SIZE(pw1%pw_grid%gsq)
892 0 : npts => pw1%pw_grid%npts
893 :
894 0 : IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
895 0 : mg = SIZE(pw1%pw_grid%grays, 2)
896 0 : mmax = MAX(mg, 1)
897 :
898 : ! pointers to data arrays
899 0 : ptr_pwin => pw1%array(1)
900 0 : ptr_pwout => pwbuf(1, 1)
901 :
902 : ! pointer to map array
903 0 : nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
904 0 : ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
905 :
906 : ! invoke the combined transformation
907 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
908 : CALL pw_gpu_sf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, nmaps, scale)
909 : #else
910 : MARK_USED(scale)
911 0 : CPABORT("Compiled without pw offloading")
912 : #endif
913 : END IF
914 :
915 0 : CALL timestop(handle)
916 0 : END SUBROUTINE pw_gpu_sf
917 :
918 : END MODULE pw_gpu
919 :
|