LCOV - code coverage report
Current view: top level - src/pw - pw_gpu.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:1f285aa) Lines: 4 299 1.3 %
Date: 2024-04-23 06:49:27 Functions: 2 13 15.4 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \note
      10             : !> This module contains routines necessary to operate on plane waves on GPUs
      11             : !     >  independently of the GPU platform.
      12             : !> \par History
      13             : !>      BGL (06-Mar-2008)  : Created
      14             : !>      AG  (18-May-2012)  : Refacturing:
      15             : !>                           - added explicit interfaces to C routines
      16             : !>                           - enable double precision complex transformations
      17             : !>      AG  (11-Sept-2012) : Modifications:
      18             : !>                          - use pointers if precision mapping is not required
      19             : !>                          - use OMP for mapping
      20             : !>      MT  (Jan 2022)     : Modifications
      21             : !>                          - use a generic interface for fft calls to GPUs
      22             : !>                          - Support both Nvidia and AMD GPUs. Other GPUs manufacturers
      23             : !>                            can be added easily.
      24             : !> \author Benjamin G. Levine
      25             : ! **************************************************************************************************
      26             : MODULE pw_gpu
      27             :    USE ISO_C_BINDING,                   ONLY: C_DOUBLE,&
      28             :                                               C_INT,&
      29             :                                               C_LOC,&
      30             :                                               C_PTR
      31             :    USE fft_tools,                       ONLY: &
      32             :         cube_transpose_1, cube_transpose_2, fft_scratch_sizes, fft_scratch_type, get_fft_scratch, &
      33             :         release_fft_scratch, x_to_yz, xz_to_yz, yz_to_x, yz_to_xz
      34             :    USE kinds,                           ONLY: dp
      35             :    USE mathconstants,                   ONLY: z_zero
      36             :    USE message_passing,                 ONLY: mp_cart_type,&
      37             :                                               mp_comm_type
      38             :    USE pw_grid_types,                   ONLY: FULLSPACE
      39             :    USE pw_types,                        ONLY: pw_c1d_gs_type,&
      40             :                                               pw_r3d_rs_type
      41             : #include "../base/base_uses.f90"
      42             : 
      43             :    IMPLICIT NONE
      44             : 
      45             :    PRIVATE
      46             : 
      47             :    PUBLIC :: pw_gpu_r3dc1d_3d
      48             :    PUBLIC :: pw_gpu_c1dr3d_3d
      49             :    PUBLIC :: pw_gpu_r3dc1d_3d_ps
      50             :    PUBLIC :: pw_gpu_c1dr3d_3d_ps
      51             :    PUBLIC :: pw_gpu_init, pw_gpu_finalize
      52             : 
      53             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'pw_gpu'
      54             :    LOGICAL, PARAMETER, PRIVATE :: debug_this_module = .FALSE.
      55             : 
      56             : CONTAINS
      57             : 
      58             : ! **************************************************************************************************
      59             : !> \brief Allocates resources on the gpu device for gpu fft acceleration
      60             : !> \author Ole Schuett
      61             : ! **************************************************************************************************
      62       16926 :    SUBROUTINE pw_gpu_init()
      63             :       INTEGER                                            :: dummy
      64             :       INTERFACE
      65             :          SUBROUTINE pw_gpu_init_c() BIND(C, name="pw_gpu_init")
      66             :          END SUBROUTINE pw_gpu_init_c
      67             :       END INTERFACE
      68             : 
      69             :       MARK_USED(dummy) ! TODO: fix fpretty
      70             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
      71             :       CALL pw_gpu_init_c()
      72             : #else
      73             :       ! Nothing to do.
      74             : #endif
      75       16926 :    END SUBROUTINE pw_gpu_init
      76             : 
      77             : ! **************************************************************************************************
      78             : !> \brief Releases resources on the gpu device for gpu fft acceleration
      79             : !> \author Ole Schuett
      80             : ! **************************************************************************************************
      81       16926 :    SUBROUTINE pw_gpu_finalize()
      82             :       INTEGER                                            :: dummy
      83             :       INTERFACE
      84             :          SUBROUTINE pw_gpu_finalize_c() BIND(C, name="pw_gpu_finalize")
      85             :          END SUBROUTINE pw_gpu_finalize_c
      86             :       END INTERFACE
      87             : 
      88             :       MARK_USED(dummy) ! TODO: fix fpretty
      89             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
      90             :       CALL pw_gpu_finalize_c()
      91             : #else
      92             :       ! Nothing to do.
      93             : #endif
      94       16926 :    END SUBROUTINE pw_gpu_finalize
      95             : 
      96             : ! **************************************************************************************************
      97             : !> \brief perform an fft followed by a gather on the gpu
      98             : !> \param pw1 ...
      99             : !> \param pw2 ...
     100             : !> \param scale ...
     101             : !> \author Benjamin G Levine
     102             : ! **************************************************************************************************
     103           0 :    SUBROUTINE pw_gpu_r3dc1d_3d(pw1, pw2, scale)
     104             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw1
     105             :       TYPE(pw_c1d_gs_type), INTENT(INOUT)                :: pw2
     106             :       REAL(KIND=dp), INTENT(IN)                          :: scale
     107             : 
     108             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_r3dc1d_3d'
     109             : 
     110             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwout
     111             :       INTEGER                                            :: handle, l1, l2, l3, ngpts
     112           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     113             :       INTEGER, POINTER                                   :: ptr_ghatmap
     114             :       REAL(KIND=dp), POINTER                             :: ptr_pwin
     115             :       INTERFACE
     116             :          SUBROUTINE pw_gpu_cfffg_c(din, zout, ghatmap, npts, ngpts, scale) BIND(C, name="pw_gpu_cfffg")
     117             :             IMPORT
     118             :             TYPE(C_PTR), INTENT(IN), VALUE               :: din
     119             :             TYPE(C_PTR), VALUE                           :: zout
     120             :             TYPE(C_PTR), INTENT(IN), VALUE               :: ghatmap
     121             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     122             :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: ngpts
     123             :             REAL(KIND=C_DOUBLE), INTENT(IN), VALUE       :: scale
     124             : 
     125             :          END SUBROUTINE pw_gpu_cfffg_c
     126             :       END INTERFACE
     127             : 
     128           0 :       CALL timeset(routineN, handle)
     129             : 
     130           0 :       ngpts = SIZE(pw2%pw_grid%gsq)
     131           0 :       l1 = LBOUND(pw1%array, 1)
     132           0 :       l2 = LBOUND(pw1%array, 2)
     133           0 :       l3 = LBOUND(pw1%array, 3)
     134           0 :       npts => pw1%pw_grid%npts
     135             : 
     136             :       ! pointers to data arrays
     137           0 :       ptr_pwin => pw1%array(l1, l2, l3)
     138           0 :       ptr_pwout => pw2%array(1)
     139             : 
     140             :       ! pointer to map array
     141           0 :       ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
     142             : 
     143             :       ! invoke the combined transformation
     144             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     145             :       CALL pw_gpu_cfffg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, scale)
     146             : #else
     147             :       MARK_USED(scale)
     148           0 :       CPABORT("Compiled without pw offloading.")
     149             : #endif
     150             : 
     151           0 :       CALL timestop(handle)
     152           0 :    END SUBROUTINE pw_gpu_r3dc1d_3d
     153             : 
     154             : ! **************************************************************************************************
     155             : !> \brief perform an scatter followed by a fft on the gpu
     156             : !> \param pw1 ...
     157             : !> \param pw2 ...
     158             : !> \param scale ...
     159             : !> \author Benjamin G Levine
     160             : ! **************************************************************************************************
     161           0 :    SUBROUTINE pw_gpu_c1dr3d_3d(pw1, pw2, scale)
     162             :       TYPE(pw_c1d_gs_type), INTENT(IN)                   :: pw1
     163             :       TYPE(pw_r3d_rs_type), INTENT(INOUT)                :: pw2
     164             :       REAL(KIND=dp), INTENT(IN)                          :: scale
     165             : 
     166             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_c1dr3d_3d'
     167             : 
     168             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin
     169             :       INTEGER                                            :: handle, l1, l2, l3, ngpts, nmaps
     170           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     171             :       INTEGER, POINTER                                   :: ptr_ghatmap
     172             :       REAL(KIND=dp), POINTER                             :: ptr_pwout
     173             :       INTERFACE
     174             :          SUBROUTINE pw_gpu_sfffc_c(zin, dout, ghatmap, npts, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sfffc")
     175             :             IMPORT
     176             :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     177             :             TYPE(C_PTR), VALUE                           :: dout
     178             :             TYPE(C_PTR), INTENT(IN), VALUE               :: ghatmap
     179             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     180             :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: ngpts, nmaps
     181             :             REAL(KIND=C_DOUBLE), INTENT(IN), VALUE       :: scale
     182             :          END SUBROUTINE pw_gpu_sfffc_c
     183             :       END INTERFACE
     184             : 
     185           0 :       CALL timeset(routineN, handle)
     186             : 
     187           0 :       ngpts = SIZE(pw1%pw_grid%gsq)
     188           0 :       l1 = LBOUND(pw2%array, 1)
     189           0 :       l2 = LBOUND(pw2%array, 2)
     190           0 :       l3 = LBOUND(pw2%array, 3)
     191           0 :       npts => pw1%pw_grid%npts
     192             : 
     193             :       ! pointers to data arrays
     194           0 :       ptr_pwin => pw1%array(1)
     195           0 :       ptr_pwout => pw2%array(l1, l2, l3)
     196             : 
     197             :       ! pointer to map array
     198           0 :       nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
     199           0 :       ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
     200             : 
     201             :       ! invoke the combined transformation
     202             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     203             :       CALL pw_gpu_sfffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, nmaps, scale)
     204             : #else
     205             :       MARK_USED(scale)
     206           0 :       CPABORT("Compiled without pw offloading")
     207             : #endif
     208             : 
     209           0 :       CALL timestop(handle)
     210           0 :    END SUBROUTINE pw_gpu_c1dr3d_3d
     211             : 
     212             : ! **************************************************************************************************
     213             : !> \brief perform an parallel fft followed by a gather on the gpu
     214             : !> \param pw1 ...
     215             : !> \param pw2 ...
     216             : !> \param scale ...
     217             : !> \author Andreas Gloess
     218             : ! **************************************************************************************************
     219           0 :    SUBROUTINE pw_gpu_r3dc1d_3d_ps(pw1, pw2, scale)
     220             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw1
     221             :       TYPE(pw_c1d_gs_type), INTENT(INOUT)                :: pw2
     222             :       REAL(KIND=dp), INTENT(IN)                          :: scale
     223             : 
     224             :       CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_r3dc1d_3d_ps'
     225             : 
     226           0 :       COMPLEX(KIND=dp), DIMENSION(:, :), POINTER         :: grays, pbuf, qbuf, rbuf, sbuf
     227           0 :       COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER      :: tbuf
     228             :       INTEGER                                            :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
     229             :                                                             mz2, n1, n2, ngpts, nmax, numtask, &
     230             :                                                             numtask_g, numtask_r, rp
     231           0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: p2p
     232             :       INTEGER, DIMENSION(2)                              :: r_dim, r_pos
     233           0 :       INTEGER, DIMENSION(:), POINTER                     :: n, nloc, nyzray
     234           0 :       INTEGER, DIMENSION(:, :, :, :), POINTER            :: bo
     235           0 :       TYPE(fft_scratch_sizes)                            :: fft_scratch_size
     236             :       TYPE(fft_scratch_type), POINTER                    :: fft_scratch
     237           0 :       TYPE(mp_cart_type)                                 :: rs_group
     238             :       TYPE(mp_comm_type)                                 :: gs_group
     239             : 
     240           0 :       CALL timeset(routineN, handle)
     241             : 
     242             :       ! dimensions
     243           0 :       n => pw1%pw_grid%npts
     244           0 :       nloc => pw1%pw_grid%npts_local
     245           0 :       grays => pw1%pw_grid%grays
     246           0 :       ngpts = nloc(1)*nloc(2)*nloc(3)
     247             : 
     248             :       !..transform
     249           0 :       IF (pw1%pw_grid%para%ray_distribution) THEN
     250           0 :          gs_group = pw1%pw_grid%para%group
     251           0 :          rs_group = pw1%pw_grid%para%rs_group
     252           0 :          nyzray => pw1%pw_grid%para%nyzray
     253           0 :          bo => pw1%pw_grid%para%bo
     254             : 
     255           0 :          numtask_g = gs_group%num_pe
     256           0 :          g_pos = gs_group%mepos
     257           0 :          numtask_r = rs_group%num_pe
     258           0 :          r_dim = rs_group%num_pe_cart
     259           0 :          r_pos = rs_group%mepos_cart
     260           0 :          IF (numtask_g /= numtask_r) THEN
     261           0 :             CPABORT("Real space and G space groups are different.")
     262             :          END IF
     263           0 :          numtask = numtask_r
     264             : 
     265           0 :          lg = SIZE(grays, 1)
     266           0 :          mg = SIZE(grays, 2)
     267           0 :          mmax = MAX(mg, 1)
     268           0 :          lmax = MAX(lg, (ngpts/mmax + 1))
     269             : 
     270           0 :          ALLOCATE (p2p(0:numtask - 1))
     271             : 
     272           0 :          CALL gs_group%rank_compare(rs_group, p2p)
     273             : 
     274           0 :          rp = p2p(g_pos)
     275           0 :          mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
     276           0 :          mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
     277           0 :          n1 = MAXVAL(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
     278           0 :          n2 = MAXVAL(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
     279           0 :          nmax = MAX((2*n2)/numtask, 2)*mx2*mz2
     280           0 :          nmax = MAX(nmax, n1*MAXVAL(nyzray))
     281             : 
     282           0 :          fft_scratch_size%nx = nloc(1)
     283           0 :          fft_scratch_size%ny = nloc(2)
     284           0 :          fft_scratch_size%nz = nloc(3)
     285           0 :          fft_scratch_size%lmax = lmax
     286           0 :          fft_scratch_size%mmax = mmax
     287           0 :          fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
     288           0 :          fft_scratch_size%mx2 = mx2
     289           0 :          fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
     290           0 :          fft_scratch_size%mz2 = mz2
     291           0 :          fft_scratch_size%lg = lg
     292           0 :          fft_scratch_size%mg = mg
     293           0 :          fft_scratch_size%nbx = MAXVAL(bo(2, 1, :, 2))
     294           0 :          fft_scratch_size%nbz = MAXVAL(bo(2, 3, :, 2))
     295           0 :          fft_scratch_size%mcz1 = MAXVAL(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
     296           0 :          fft_scratch_size%mcx2 = MAXVAL(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
     297           0 :          fft_scratch_size%mcz2 = MAXVAL(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
     298           0 :          fft_scratch_size%nmax = nmax
     299           0 :          fft_scratch_size%nmray = MAXVAL(nyzray)
     300           0 :          fft_scratch_size%nyzray = nyzray(g_pos)
     301           0 :          fft_scratch_size%gs_group = gs_group
     302           0 :          fft_scratch_size%rs_group = rs_group
     303           0 :          fft_scratch_size%g_pos = g_pos
     304           0 :          fft_scratch_size%r_pos = r_pos
     305           0 :          fft_scratch_size%r_dim = r_dim
     306           0 :          fft_scratch_size%numtask = numtask
     307             : 
     308           0 :          IF (r_dim(2) > 1) THEN
     309             :             !
     310             :             ! real space is distributed over x and y coordinate
     311             :             ! we have two stages of communication
     312             :             !
     313           0 :             IF (r_dim(1) == 1) &
     314           0 :                CPABORT("This processor distribution is not supported.")
     315             : 
     316           0 :             CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
     317             : 
     318             :             ! assign buffers
     319           0 :             qbuf => fft_scratch%p2buf
     320           0 :             rbuf => fft_scratch%p3buf
     321           0 :             pbuf => fft_scratch%p4buf
     322           0 :             sbuf => fft_scratch%p5buf
     323             : 
     324             :             ! FFT along z
     325           0 :             CALL pw_gpu_cf(pw1, qbuf)
     326             : 
     327             :             ! Exchange data ( transpose of matrix )
     328           0 :             CALL cube_transpose_2(qbuf, bo(:, :, :, 1), bo(:, :, :, 2), rbuf, fft_scratch)
     329             : 
     330             :             ! FFT along y
     331             :             ! use the inbuild fft-lib
     332             :             ! CALL fft_1dm(fft_scratch%fft_plan(2), rbuf, pbuf, 1.0_dp, stat)
     333             :             ! or cufft (works faster, but is only faster if plans are stored)
     334           0 :             CALL pw_gpu_f(rbuf, pbuf, +1, n(2), mx2*mz2)
     335             : 
     336             :             ! Exchange data ( transpose of matrix ) and sort
     337             :             CALL xz_to_yz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
     338           0 :                           bo(:, :, :, 2), sbuf, fft_scratch)
     339             : 
     340             :             ! FFT along x
     341           0 :             CALL pw_gpu_fg(sbuf, pw2, scale)
     342             : 
     343           0 :             CALL release_fft_scratch(fft_scratch)
     344             : 
     345             :          ELSE
     346             :             !
     347             :             ! real space is only distributed over x coordinate
     348             :             ! we have one stage of communication, after the transform of
     349             :             ! direction x
     350             :             !
     351             : 
     352           0 :             CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
     353             : 
     354             :             ! assign buffers
     355           0 :             tbuf => fft_scratch%tbuf
     356           0 :             sbuf => fft_scratch%r1buf
     357             : 
     358             :             ! FFT along y and z
     359           0 :             CALL pw_gpu_cff(pw1, tbuf)
     360             : 
     361             :             ! Exchange data ( transpose of matrix ) and sort
     362             :             CALL yz_to_x(tbuf, gs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
     363           0 :                          bo(:, :, :, 2), sbuf, fft_scratch)
     364             : 
     365             :             ! FFT along x
     366           0 :             CALL pw_gpu_fg(sbuf, pw2, scale)
     367             : 
     368           0 :             CALL release_fft_scratch(fft_scratch)
     369             : 
     370             :          END IF
     371             : 
     372           0 :          DEALLOCATE (p2p)
     373             : 
     374             : !--------------------------------------------------------------------------
     375             :       ELSE
     376           0 :          CPABORT("Not implemented (no ray_distr.) in: pw_gpu_r3dc1d_3d_ps.")
     377             :          !CALL fft3d ( dir, n, pwin, grays, pw1%pw_grid%para%rs_group, &
     378             :          !     pw1%pw_grid%para%bo, scale = scale, debug=test )
     379             :       END IF
     380             : 
     381           0 :       CALL timestop(handle)
     382           0 :    END SUBROUTINE pw_gpu_r3dc1d_3d_ps
     383             : 
     384             : ! **************************************************************************************************
     385             : !> \brief perform an parallel scatter followed by a fft on the gpu
     386             : !> \param pw1 ...
     387             : !> \param pw2 ...
     388             : !> \param scale ...
     389             : !> \author Andreas Gloess
     390             : ! **************************************************************************************************
     391           0 :    SUBROUTINE pw_gpu_c1dr3d_3d_ps(pw1, pw2, scale)
     392             :       TYPE(pw_c1d_gs_type), INTENT(IN)                   :: pw1
     393             :       TYPE(pw_r3d_rs_type), INTENT(INOUT)                :: pw2
     394             :       REAL(KIND=dp), INTENT(IN)                          :: scale
     395             : 
     396             :       CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_c1dr3d_3d_ps'
     397             : 
     398           0 :       COMPLEX(KIND=dp), DIMENSION(:, :), POINTER         :: grays, pbuf, qbuf, rbuf, sbuf
     399           0 :       COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER      :: tbuf
     400             :       INTEGER                                            :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
     401             :                                                             mz2, n1, n2, ngpts, nmax, numtask, &
     402             :                                                             numtask_g, numtask_r, rp
     403           0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: p2p
     404             :       INTEGER, DIMENSION(2)                              :: r_dim, r_pos
     405           0 :       INTEGER, DIMENSION(:), POINTER                     :: n, nloc, nyzray
     406           0 :       INTEGER, DIMENSION(:, :, :, :), POINTER            :: bo
     407           0 :       TYPE(fft_scratch_sizes)                            :: fft_scratch_size
     408             :       TYPE(fft_scratch_type), POINTER                    :: fft_scratch
     409           0 :       TYPE(mp_cart_type)                                 :: rs_group
     410             :       TYPE(mp_comm_type)                                 :: gs_group
     411             : 
     412           0 :       CALL timeset(routineN, handle)
     413             : 
     414             :       ! dimensions
     415           0 :       n => pw1%pw_grid%npts
     416           0 :       nloc => pw1%pw_grid%npts_local
     417           0 :       grays => pw1%pw_grid%grays
     418           0 :       ngpts = nloc(1)*nloc(2)*nloc(3)
     419             : 
     420             :       !..transform
     421           0 :       IF (pw1%pw_grid%para%ray_distribution) THEN
     422           0 :          gs_group = pw1%pw_grid%para%group
     423           0 :          rs_group = pw1%pw_grid%para%rs_group
     424           0 :          nyzray => pw1%pw_grid%para%nyzray
     425           0 :          bo => pw1%pw_grid%para%bo
     426             : 
     427           0 :          numtask_g = gs_group%num_pe
     428           0 :          g_pos = gs_group%mepos
     429           0 :          numtask_r = rs_group%num_pe
     430           0 :          r_dim = rs_group%num_pe_cart
     431           0 :          r_pos = rs_group%mepos_cart
     432           0 :          IF (numtask_g /= numtask_r) THEN
     433           0 :             CPABORT("Real space and G space groups are different.")
     434             :          END IF
     435           0 :          numtask = numtask_r
     436             : 
     437           0 :          lg = SIZE(grays, 1)
     438           0 :          mg = SIZE(grays, 2)
     439           0 :          mmax = MAX(mg, 1)
     440           0 :          lmax = MAX(lg, (ngpts/mmax + 1))
     441             : 
     442           0 :          ALLOCATE (p2p(0:numtask - 1))
     443             : 
     444           0 :          CALL gs_group%rank_compare(rs_group, p2p)
     445             : 
     446           0 :          rp = p2p(g_pos)
     447           0 :          mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
     448           0 :          mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
     449           0 :          n1 = MAXVAL(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
     450           0 :          n2 = MAXVAL(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
     451           0 :          nmax = MAX((2*n2)/numtask, 2)*mx2*mz2
     452           0 :          nmax = MAX(nmax, n1*MAXVAL(nyzray))
     453             : 
     454           0 :          fft_scratch_size%nx = nloc(1)
     455           0 :          fft_scratch_size%ny = nloc(2)
     456           0 :          fft_scratch_size%nz = nloc(3)
     457           0 :          fft_scratch_size%lmax = lmax
     458           0 :          fft_scratch_size%mmax = mmax
     459           0 :          fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
     460           0 :          fft_scratch_size%mx2 = mx2
     461           0 :          fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
     462           0 :          fft_scratch_size%mz2 = mz2
     463           0 :          fft_scratch_size%lg = lg
     464           0 :          fft_scratch_size%mg = mg
     465           0 :          fft_scratch_size%nbx = MAXVAL(bo(2, 1, :, 2))
     466           0 :          fft_scratch_size%nbz = MAXVAL(bo(2, 3, :, 2))
     467           0 :          fft_scratch_size%mcz1 = MAXVAL(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
     468           0 :          fft_scratch_size%mcx2 = MAXVAL(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
     469           0 :          fft_scratch_size%mcz2 = MAXVAL(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
     470           0 :          fft_scratch_size%nmax = nmax
     471           0 :          fft_scratch_size%nmray = MAXVAL(nyzray)
     472           0 :          fft_scratch_size%nyzray = nyzray(g_pos)
     473           0 :          fft_scratch_size%gs_group = gs_group
     474           0 :          fft_scratch_size%rs_group = rs_group
     475           0 :          fft_scratch_size%g_pos = g_pos
     476           0 :          fft_scratch_size%r_pos = r_pos
     477           0 :          fft_scratch_size%r_dim = r_dim
     478           0 :          fft_scratch_size%numtask = numtask
     479             : 
     480           0 :          IF (r_dim(2) > 1) THEN
     481             :             !
     482             :             ! real space is distributed over x and y coordinate
     483             :             ! we have two stages of communication
     484             :             !
     485           0 :             IF (r_dim(1) == 1) &
     486           0 :                CPABORT("This processor distribution is not supported.")
     487             : 
     488           0 :             CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
     489             : 
     490             :             ! assign buffers
     491           0 :             pbuf => fft_scratch%p7buf
     492           0 :             qbuf => fft_scratch%p4buf
     493           0 :             rbuf => fft_scratch%p3buf
     494           0 :             sbuf => fft_scratch%p2buf
     495             : 
     496             :             ! FFT along x
     497           0 :             CALL pw_gpu_sf(pw1, pbuf, scale)
     498             : 
     499             :             ! Exchange data ( transpose of matrix ) and sort
     500           0 :             IF (pw1%pw_grid%grid_span /= FULLSPACE) qbuf = z_zero
     501             :             CALL yz_to_xz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
     502           0 :                           bo(:, :, :, 2), qbuf, fft_scratch)
     503             : 
     504             :             ! FFT along y
     505             :             ! use the inbuild fft-lib
     506             :             ! CALL fft_1dm(fft_scratch%fft_plan(5), qbuf, rbuf, 1.0_dp, stat)
     507             :             ! or cufft (works faster, but is only faster if plans are stored)
     508           0 :             CALL pw_gpu_f(qbuf, rbuf, -1, n(2), mx2*mz2)
     509             : 
     510             :             ! Exchange data ( transpose of matrix )
     511           0 :             IF (pw1%pw_grid%grid_span /= FULLSPACE) sbuf = z_zero
     512             : 
     513           0 :             CALL cube_transpose_1(rbuf, bo(:, :, :, 2), bo(:, :, :, 1), sbuf, fft_scratch)
     514             : 
     515             :             ! FFT along z
     516           0 :             CALL pw_gpu_fc(sbuf, pw2)
     517             : 
     518           0 :             CALL release_fft_scratch(fft_scratch)
     519             : 
     520             :          ELSE
     521             :             !
     522             :             ! real space is only distributed over x coordinate
     523             :             ! we have one stage of communication, after the transform of
     524             :             ! direction x
     525             :             !
     526             : 
     527           0 :             CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
     528             : 
     529             :             ! assign buffers
     530           0 :             sbuf => fft_scratch%r1buf
     531           0 :             tbuf => fft_scratch%tbuf
     532             : 
     533             :             ! FFT along x
     534           0 :             CALL pw_gpu_sf(pw1, sbuf, scale)
     535             : 
     536             :             ! Exchange data ( transpose of matrix ) and sort
     537           0 :             IF (pw1%pw_grid%grid_span /= FULLSPACE) tbuf = z_zero
     538             :             CALL x_to_yz(sbuf, gs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
     539           0 :                          bo(:, :, :, 2), tbuf, fft_scratch)
     540             : 
     541             :             ! FFT along y and z
     542           0 :             CALL pw_gpu_ffc(tbuf, pw2)
     543             : 
     544           0 :             CALL release_fft_scratch(fft_scratch)
     545             : 
     546             :          END IF
     547             : 
     548           0 :          DEALLOCATE (p2p)
     549             : 
     550             : !--------------------------------------------------------------------------
     551             :       ELSE
     552           0 :          CPABORT("Not implemented (no ray_distr.) in: pw_gpu_c1dr3d_3d_ps.")
     553             :          !CALL fft3d ( dir, n, pwin, grays, pw1%pw_grid%para%rs_group, &
     554             :          !     pw1%pw_grid%para%bo, scale = scale, debug=test )
     555             :       END IF
     556             : 
     557           0 :       CALL timestop(handle)
     558           0 :    END SUBROUTINE pw_gpu_c1dr3d_3d_ps
     559             : 
     560             : ! **************************************************************************************************
     561             : !> \brief perform a parallel real_to_complex copy followed by a 2D-FFT on the gpu
     562             : !> \param pw1 ...
     563             : !> \param pwbuf ...
     564             : !> \author Andreas Gloess
     565             : ! **************************************************************************************************
     566           0 :    SUBROUTINE pw_gpu_cff(pw1, pwbuf)
     567             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw1
     568             :       COMPLEX(KIND=dp), DIMENSION(:, :, :), &
     569             :          INTENT(INOUT), TARGET                           :: pwbuf
     570             : 
     571             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_cff'
     572             : 
     573             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwout
     574             :       INTEGER                                            :: handle, l1, l2, l3
     575           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     576             :       REAL(KIND=dp), POINTER                             :: ptr_pwin
     577             :       INTERFACE
     578             :          SUBROUTINE pw_gpu_cff_c(din, zout, npts) BIND(C, name="pw_gpu_cff")
     579             :             IMPORT
     580             :             TYPE(C_PTR), INTENT(IN), VALUE               :: din
     581             :             TYPE(C_PTR), VALUE                           :: zout
     582             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     583             :          END SUBROUTINE pw_gpu_cff_c
     584             :       END INTERFACE
     585             : 
     586           0 :       CALL timeset(routineN, handle)
     587             : 
     588             :       ! dimensions
     589           0 :       npts => pw1%pw_grid%npts_local
     590           0 :       l1 = LBOUND(pw1%array, 1)
     591           0 :       l2 = LBOUND(pw1%array, 2)
     592           0 :       l3 = LBOUND(pw1%array, 3)
     593             : 
     594             :       ! pointers to data arrays
     595           0 :       ptr_pwin => pw1%array(l1, l2, l3)
     596           0 :       ptr_pwout => pwbuf(1, 1, 1)
     597             : 
     598             :       ! invoke the combined transformation
     599             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     600             :       CALL pw_gpu_cff_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
     601             : #else
     602           0 :       CPABORT("Compiled without pw offloading")
     603             : #endif
     604             : 
     605           0 :       CALL timestop(handle)
     606           0 :    END SUBROUTINE pw_gpu_cff
     607             : 
     608             : ! **************************************************************************************************
     609             : !> \brief perform a parallel 2D-FFT followed by a complex_to_real copy on the gpu
     610             : !> \param pwbuf ...
     611             : !> \param pw2 ...
     612             : !> \author Andreas Gloess
     613             : ! **************************************************************************************************
     614           0 :    SUBROUTINE pw_gpu_ffc(pwbuf, pw2)
     615             :       COMPLEX(KIND=dp), DIMENSION(:, :, :), INTENT(IN), &
     616             :          TARGET                                          :: pwbuf
     617             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw2
     618             : 
     619             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_ffc'
     620             : 
     621             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin
     622             :       INTEGER                                            :: handle, l1, l2, l3
     623           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     624             :       REAL(KIND=dp), POINTER                             :: ptr_pwout
     625             :       INTERFACE
     626             :          SUBROUTINE pw_gpu_ffc_c(zin, dout, npts) BIND(C, name="pw_gpu_ffc")
     627             :             IMPORT
     628             :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     629             :             TYPE(C_PTR), VALUE                           :: dout
     630             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     631             :          END SUBROUTINE pw_gpu_ffc_c
     632             :       END INTERFACE
     633             : 
     634           0 :       CALL timeset(routineN, handle)
     635             : 
     636             :       ! dimensions
     637           0 :       npts => pw2%pw_grid%npts_local
     638           0 :       l1 = LBOUND(pw2%array, 1)
     639           0 :       l2 = LBOUND(pw2%array, 2)
     640           0 :       l3 = LBOUND(pw2%array, 3)
     641             : 
     642             :       ! pointers to data arrays
     643           0 :       ptr_pwin => pwbuf(1, 1, 1)
     644           0 :       ptr_pwout => pw2%array(l1, l2, l3)
     645             : 
     646             :       ! invoke the combined transformation
     647             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     648             :       CALL pw_gpu_ffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
     649             : #else
     650           0 :       CPABORT("Compiled without pw offloading")
     651             : #endif
     652             : 
     653           0 :       CALL timestop(handle)
     654           0 :    END SUBROUTINE pw_gpu_ffc
     655             : 
     656             : ! **************************************************************************************************
     657             : !> \brief perform a parallel real_to_complex copy followed by a 1D-FFT on the gpu
     658             : !> \param pw1 ...
     659             : !> \param pwbuf ...
     660             : !> \author Andreas Gloess
     661             : ! **************************************************************************************************
     662           0 :    SUBROUTINE pw_gpu_cf(pw1, pwbuf)
     663             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw1
     664             :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
     665             :          TARGET                                          :: pwbuf
     666             : 
     667             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_cf'
     668             : 
     669             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwout
     670             :       INTEGER                                            :: handle, l1, l2, l3
     671           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     672             :       REAL(KIND=dp), POINTER                             :: ptr_pwin
     673             :       INTERFACE
     674             :          SUBROUTINE pw_gpu_cf_c(din, zout, npts) BIND(C, name="pw_gpu_cf")
     675             :             IMPORT
     676             :             TYPE(C_PTR), INTENT(IN), VALUE               :: din
     677             :             TYPE(C_PTR), VALUE                           :: zout
     678             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     679             :          END SUBROUTINE pw_gpu_cf_c
     680             :       END INTERFACE
     681             : 
     682           0 :       CALL timeset(routineN, handle)
     683             : 
     684             :       ! dimensions
     685           0 :       npts => pw1%pw_grid%npts_local
     686           0 :       l1 = LBOUND(pw1%array, 1)
     687           0 :       l2 = LBOUND(pw1%array, 2)
     688           0 :       l3 = LBOUND(pw1%array, 3)
     689             : 
     690             :       ! pointers to data arrays
     691           0 :       ptr_pwin => pw1%array(l1, l2, l3)
     692           0 :       ptr_pwout => pwbuf(1, 1)
     693             : 
     694             :       ! invoke the combined transformation
     695             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     696             :       CALL pw_gpu_cf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
     697             : #else
     698           0 :       CPABORT("Compiled without pw offloading")
     699             : #endif
     700           0 :       CALL timestop(handle)
     701           0 :    END SUBROUTINE pw_gpu_cf
     702             : 
     703             : ! **************************************************************************************************
     704             : !> \brief perform a parallel 1D-FFT followed by a complex_to_real copy on the gpu
     705             : !> \param pwbuf ...
     706             : !> \param pw2 ...
     707             : !> \author Andreas Gloess
     708             : ! **************************************************************************************************
     709           0 :    SUBROUTINE pw_gpu_fc(pwbuf, pw2)
     710             :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
     711             :          TARGET                                          :: pwbuf
     712             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw2
     713             : 
     714             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_fc'
     715             : 
     716             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin
     717             :       INTEGER                                            :: handle, l1, l2, l3
     718           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     719             :       REAL(KIND=dp), POINTER                             :: ptr_pwout
     720             :       INTERFACE
     721             :          SUBROUTINE pw_gpu_fc_c(zin, dout, npts) BIND(C, name="pw_gpu_fc")
     722             :             IMPORT
     723             :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     724             :             TYPE(C_PTR), VALUE                           :: dout
     725             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     726             :          END SUBROUTINE pw_gpu_fc_c
     727             :       END INTERFACE
     728             : 
     729           0 :       CALL timeset(routineN, handle)
     730             : 
     731           0 :       npts => pw2%pw_grid%npts_local
     732           0 :       l1 = LBOUND(pw2%array, 1)
     733           0 :       l2 = LBOUND(pw2%array, 2)
     734           0 :       l3 = LBOUND(pw2%array, 3)
     735             : 
     736             :       ! pointers to data arrays
     737           0 :       ptr_pwin => pwbuf(1, 1)
     738           0 :       ptr_pwout => pw2%array(l1, l2, l3)
     739             : 
     740             :       ! invoke the combined transformation
     741             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     742             :       CALL pw_gpu_fc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
     743             : #else
     744           0 :       CPABORT("Compiled without pw offloading")
     745             : #endif
     746             : 
     747           0 :       CALL timestop(handle)
     748           0 :    END SUBROUTINE pw_gpu_fc
     749             : 
     750             : ! **************************************************************************************************
     751             : !> \brief perform a parallel 1D-FFT on the gpu
     752             : !> \param pwbuf1 ...
     753             : !> \param pwbuf2 ...
     754             : !> \param dir ...
     755             : !> \param n ...
     756             : !> \param m ...
     757             : !> \author Andreas Gloess
     758             : ! **************************************************************************************************
     759           0 :    SUBROUTINE pw_gpu_f(pwbuf1, pwbuf2, dir, n, m)
     760             :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
     761             :          TARGET                                          :: pwbuf1
     762             :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
     763             :          TARGET                                          :: pwbuf2
     764             :       INTEGER, INTENT(IN)                                :: dir, n, m
     765             : 
     766             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_f'
     767             : 
     768             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin, ptr_pwout
     769             :       INTEGER                                            :: handle
     770             :       INTERFACE
     771             :          SUBROUTINE pw_gpu_f_c(zin, zout, dir, n, m) BIND(C, name="pw_gpu_f")
     772             :             IMPORT
     773             :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     774             :             TYPE(C_PTR), VALUE                           :: zout
     775             :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: dir, n, m
     776             :          END SUBROUTINE pw_gpu_f_c
     777             :       END INTERFACE
     778             : 
     779           0 :       CALL timeset(routineN, handle)
     780             : 
     781           0 :       IF (n*m /= 0) THEN
     782             :          ! pointers to data arrays
     783           0 :          ptr_pwin => pwbuf1(1, 1)
     784           0 :          ptr_pwout => pwbuf2(1, 1)
     785             : 
     786             :          ! invoke the combined transformation
     787             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     788             :          CALL pw_gpu_f_c(c_loc(ptr_pwin), c_loc(ptr_pwout), dir, n, m)
     789             : #else
     790             :          MARK_USED(dir)
     791           0 :          CPABORT("Compiled without pw offloading")
     792             : #endif
     793             :       END IF
     794             : 
     795           0 :       CALL timestop(handle)
     796           0 :    END SUBROUTINE pw_gpu_f
     797             : ! **************************************************************************************************
     798             : !> \brief perform a parallel 1D-FFT followed by a gather on the gpu
     799             : !> \param pwbuf ...
     800             : !> \param pw2 ...
     801             : !> \param scale ...
     802             : !> \author Andreas Gloess
     803             : ! **************************************************************************************************
     804           0 :    SUBROUTINE pw_gpu_fg(pwbuf, pw2, scale)
     805             :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
     806             :          TARGET                                          :: pwbuf
     807             :       TYPE(pw_c1d_gs_type), INTENT(IN)                   :: pw2
     808             :       REAL(KIND=dp), INTENT(IN)                          :: scale
     809             : 
     810             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_fg'
     811             : 
     812             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin, ptr_pwout
     813             :       INTEGER                                            :: handle, mg, mmax, ngpts
     814           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     815             :       INTEGER, POINTER                                   :: ptr_ghatmap
     816             :       INTERFACE
     817             :          SUBROUTINE pw_gpu_fg_c(zin, zout, ghatmap, npts, mmax, ngpts, scale) BIND(C, name="pw_gpu_fg")
     818             :             IMPORT
     819             :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     820             :             TYPE(C_PTR), VALUE                           :: zout
     821             :             TYPE(C_PTR), INTENT(IN), VALUE               :: ghatmap
     822             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     823             :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: mmax, ngpts
     824             :             REAL(KIND=C_DOUBLE), INTENT(IN), VALUE       :: scale
     825             : 
     826             :          END SUBROUTINE pw_gpu_fg_c
     827             :       END INTERFACE
     828             : 
     829           0 :       CALL timeset(routineN, handle)
     830             : 
     831           0 :       ngpts = SIZE(pw2%pw_grid%gsq)
     832           0 :       npts => pw2%pw_grid%npts
     833             : 
     834           0 :       IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
     835           0 :          mg = SIZE(pw2%pw_grid%grays, 2)
     836           0 :          mmax = MAX(mg, 1)
     837             : 
     838             :          ! pointers to data arrays
     839           0 :          ptr_pwin => pwbuf(1, 1)
     840           0 :          ptr_pwout => pw2%array(1)
     841             : 
     842             :          ! pointer to map array
     843           0 :          ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
     844             : 
     845             :          ! invoke the combined transformation
     846             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     847             :          CALL pw_gpu_fg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, scale)
     848             : #else
     849             :          MARK_USED(scale)
     850           0 :          CPABORT("Compiled without pw offloading")
     851             : #endif
     852             :       END IF
     853             : 
     854           0 :       CALL timestop(handle)
     855           0 :    END SUBROUTINE pw_gpu_fg
     856             : 
     857             : ! **************************************************************************************************
     858             : !> \brief perform a parallel scatter followed by a 1D-FFT on the gpu
     859             : !> \param pw1 ...
     860             : !> \param pwbuf ...
     861             : !> \param scale ...
     862             : !> \author Andreas Gloess
     863             : ! **************************************************************************************************
     864           0 :    SUBROUTINE pw_gpu_sf(pw1, pwbuf, scale)
     865             :       TYPE(pw_c1d_gs_type), INTENT(IN)                   :: pw1
     866             :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
     867             :          TARGET                                          :: pwbuf
     868             :       REAL(KIND=dp), INTENT(IN)                          :: scale
     869             : 
     870             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_sf'
     871             : 
     872             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin, ptr_pwout
     873             :       INTEGER                                            :: handle, mg, mmax, ngpts, nmaps
     874           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     875             :       INTEGER, POINTER                                   :: ptr_ghatmap
     876             :       INTERFACE
     877             :          SUBROUTINE pw_gpu_sf_c(zin, zout, ghatmap, npts, mmax, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sf")
     878             :             IMPORT
     879             :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     880             :             TYPE(C_PTR), VALUE                           :: zout
     881             :             TYPE(C_PTR), INTENT(IN), VALUE               :: ghatmap
     882             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     883             :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: mmax, ngpts, nmaps
     884             :             REAL(KIND=C_DOUBLE), INTENT(IN), VALUE       :: scale
     885             : 
     886             :          END SUBROUTINE pw_gpu_sf_c
     887             :       END INTERFACE
     888             : 
     889           0 :       CALL timeset(routineN, handle)
     890             : 
     891           0 :       ngpts = SIZE(pw1%pw_grid%gsq)
     892           0 :       npts => pw1%pw_grid%npts
     893             : 
     894           0 :       IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
     895           0 :          mg = SIZE(pw1%pw_grid%grays, 2)
     896           0 :          mmax = MAX(mg, 1)
     897             : 
     898             :          ! pointers to data arrays
     899           0 :          ptr_pwin => pw1%array(1)
     900           0 :          ptr_pwout => pwbuf(1, 1)
     901             : 
     902             :          ! pointer to map array
     903           0 :          nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
     904           0 :          ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
     905             : 
     906             :          ! invoke the combined transformation
     907             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     908             :          CALL pw_gpu_sf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, nmaps, scale)
     909             : #else
     910             :          MARK_USED(scale)
     911           0 :          CPABORT("Compiled without pw offloading")
     912             : #endif
     913             :       END IF
     914             : 
     915           0 :       CALL timestop(handle)
     916           0 :    END SUBROUTINE pw_gpu_sf
     917             : 
     918             : END MODULE pw_gpu
     919             : 

Generated by: LCOV version 1.15