LCOV - code coverage report
Current view: top level - src/pw - pw_gpu.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:936074a) Lines: 1.5 % 273 4
Test Date: 2025-12-04 06:27:48 Functions: 15.4 % 13 2

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \note
      10              : !> This module contains routines necessary to operate on plane waves on GPUs
      11              : !     >  independently of the GPU platform.
      12              : !> \par History
      13              : !>      BGL (06-Mar-2008)  : Created
      14              : !>      AG  (18-May-2012)  : Refacturing:
      15              : !>                           - added explicit interfaces to C routines
      16              : !>                           - enable double precision complex transformations
      17              : !>      AG  (11-Sept-2012) : Modifications:
      18              : !>                          - use pointers if precision mapping is not required
      19              : !>                          - use OMP for mapping
      20              : !>      MT  (Jan 2022)     : Modifications
      21              : !>                          - use a generic interface for fft calls to GPUs
      22              : !>                          - Support both Nvidia and AMD GPUs. Other GPUs manufacturers
      23              : !>                            can be added easily.
      24              : !> \author Benjamin G. Levine
      25              : ! **************************************************************************************************
      26              : MODULE pw_gpu
      27              :    USE ISO_C_BINDING,                   ONLY: C_DOUBLE,&
      28              :                                               C_INT,&
      29              :                                               C_LOC,&
      30              :                                               C_PTR
      31              :    USE fft_tools,                       ONLY: &
      32              :         cube_transpose_1, cube_transpose_2, fft_scratch_sizes, fft_scratch_type, get_fft_scratch, &
      33              :         release_fft_scratch, x_to_yz, xz_to_yz, yz_to_x, yz_to_xz
      34              :    USE kinds,                           ONLY: dp
      35              :    USE mathconstants,                   ONLY: z_zero
      36              :    USE message_passing,                 ONLY: mp_cart_type
      37              :    USE pw_grid_types,                   ONLY: FULLSPACE
      38              :    USE pw_types,                        ONLY: pw_c1d_gs_type,&
      39              :                                               pw_r3d_rs_type
      40              : #include "../base/base_uses.f90"
      41              : 
      42              :    IMPLICIT NONE
      43              : 
      44              :    PRIVATE
      45              : 
      46              :    PUBLIC :: pw_gpu_r3dc1d_3d
      47              :    PUBLIC :: pw_gpu_c1dr3d_3d
      48              :    PUBLIC :: pw_gpu_r3dc1d_3d_ps
      49              :    PUBLIC :: pw_gpu_c1dr3d_3d_ps
      50              :    PUBLIC :: pw_gpu_init, pw_gpu_finalize
      51              : 
      52              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'pw_gpu'
      53              :    LOGICAL, PARAMETER, PRIVATE :: debug_this_module = .FALSE.
      54              : 
      55              : CONTAINS
      56              : 
      57              : ! **************************************************************************************************
      58              : !> \brief Allocates resources on the gpu device for gpu fft acceleration
      59              : !> \author Ole Schuett
      60              : ! **************************************************************************************************
      61         9284 :    SUBROUTINE pw_gpu_init()
      62              :       INTEGER                                            :: dummy
      63              :       INTERFACE
      64              :          SUBROUTINE pw_gpu_init_c() BIND(C, name="pw_gpu_init")
      65              :          END SUBROUTINE pw_gpu_init_c
      66              :       END INTERFACE
      67              : 
      68              :       MARK_USED(dummy) ! TODO: fix fpretty
      69              : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
      70              :       CALL pw_gpu_init_c()
      71              : #else
      72              :       ! Nothing to do.
      73              : #endif
      74         9284 :    END SUBROUTINE pw_gpu_init
      75              : 
      76              : ! **************************************************************************************************
      77              : !> \brief Releases resources on the gpu device for gpu fft acceleration
      78              : !> \author Ole Schuett
      79              : ! **************************************************************************************************
      80         9284 :    SUBROUTINE pw_gpu_finalize()
      81              :       INTEGER                                            :: dummy
      82              :       INTERFACE
      83              :          SUBROUTINE pw_gpu_finalize_c() BIND(C, name="pw_gpu_finalize")
      84              :          END SUBROUTINE pw_gpu_finalize_c
      85              :       END INTERFACE
      86              : 
      87              :       MARK_USED(dummy) ! TODO: fix fpretty
      88              : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
      89              :       CALL pw_gpu_finalize_c()
      90              : #else
      91              :       ! Nothing to do.
      92              : #endif
      93         9284 :    END SUBROUTINE pw_gpu_finalize
      94              : 
      95              : ! **************************************************************************************************
      96              : !> \brief perform an fft followed by a gather on the gpu
      97              : !> \param pw1 ...
      98              : !> \param pw2 ...
      99              : !> \author Benjamin G Levine
     100              : ! **************************************************************************************************
     101            0 :    SUBROUTINE pw_gpu_r3dc1d_3d(pw1, pw2)
     102              :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw1
     103              :       TYPE(pw_c1d_gs_type), INTENT(INOUT)                :: pw2
     104              : 
     105              :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_r3dc1d_3d'
     106              : 
     107              :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwout
     108              :       INTEGER                                            :: handle, l1, l2, l3, ngpts
     109            0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     110              :       INTEGER, POINTER                                   :: ptr_ghatmap
     111              :       REAL(KIND=dp)                                      :: scale
     112              :       REAL(KIND=dp), POINTER                             :: ptr_pwin
     113              :       INTERFACE
     114              :          SUBROUTINE pw_gpu_cfffg_c(din, zout, ghatmap, npts, ngpts, scale) BIND(C, name="pw_gpu_cfffg")
     115              :             IMPORT
     116              :             TYPE(C_PTR), INTENT(IN), VALUE               :: din
     117              :             TYPE(C_PTR), VALUE                           :: zout
     118              :             TYPE(C_PTR), INTENT(IN), VALUE               :: ghatmap
     119              :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     120              :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: ngpts
     121              :             REAL(KIND=C_DOUBLE), INTENT(IN), VALUE       :: scale
     122              : 
     123              :          END SUBROUTINE pw_gpu_cfffg_c
     124              :       END INTERFACE
     125              : 
     126            0 :       CALL timeset(routineN, handle)
     127              : 
     128            0 :       scale = 1.0_dp/REAL(pw1%pw_grid%ngpts, KIND=dp)
     129              : 
     130            0 :       ngpts = SIZE(pw2%pw_grid%gsq)
     131              :       l1 = LBOUND(pw1%array, 1)
     132              :       l2 = LBOUND(pw1%array, 2)
     133              :       l3 = LBOUND(pw1%array, 3)
     134            0 :       npts => pw1%pw_grid%npts
     135              : 
     136              :       ! pointers to data arrays
     137            0 :       ptr_pwin => pw1%array(l1, l2, l3)
     138            0 :       ptr_pwout => pw2%array(1)
     139              : 
     140              :       ! pointer to map array
     141            0 :       ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
     142              : 
     143              :       ! invoke the combined transformation
     144              : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     145              :       CALL pw_gpu_cfffg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, scale)
     146              : #else
     147            0 :       CPABORT("Compiled without pw offloading.")
     148              : #endif
     149              : 
     150            0 :       CALL timestop(handle)
     151            0 :    END SUBROUTINE pw_gpu_r3dc1d_3d
     152              : 
     153              : ! **************************************************************************************************
     154              : !> \brief perform an scatter followed by a fft on the gpu
     155              : !> \param pw1 ...
     156              : !> \param pw2 ...
     157              : !> \author Benjamin G Levine
     158              : ! **************************************************************************************************
     159            0 :    SUBROUTINE pw_gpu_c1dr3d_3d(pw1, pw2)
     160              :       TYPE(pw_c1d_gs_type), INTENT(IN)                   :: pw1
     161              :       TYPE(pw_r3d_rs_type), INTENT(INOUT)                :: pw2
     162              : 
     163              :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_c1dr3d_3d'
     164              : 
     165              :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin
     166              :       INTEGER                                            :: handle, l1, l2, l3, ngpts, nmaps
     167            0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     168              :       INTEGER, POINTER                                   :: ptr_ghatmap
     169              :       REAL(KIND=dp)                                      :: scale
     170              :       REAL(KIND=dp), POINTER                             :: ptr_pwout
     171              :       INTERFACE
     172              :          SUBROUTINE pw_gpu_sfffc_c(zin, dout, ghatmap, npts, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sfffc")
     173              :             IMPORT
     174              :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     175              :             TYPE(C_PTR), VALUE                           :: dout
     176              :             TYPE(C_PTR), INTENT(IN), VALUE               :: ghatmap
     177              :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     178              :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: ngpts, nmaps
     179              :             REAL(KIND=C_DOUBLE), INTENT(IN), VALUE       :: scale
     180              :          END SUBROUTINE pw_gpu_sfffc_c
     181              :       END INTERFACE
     182              : 
     183            0 :       CALL timeset(routineN, handle)
     184              : 
     185            0 :       scale = 1.0_dp
     186              : 
     187            0 :       ngpts = SIZE(pw1%pw_grid%gsq)
     188              :       l1 = LBOUND(pw2%array, 1)
     189              :       l2 = LBOUND(pw2%array, 2)
     190              :       l3 = LBOUND(pw2%array, 3)
     191            0 :       npts => pw1%pw_grid%npts
     192              : 
     193              :       ! pointers to data arrays
     194            0 :       ptr_pwin => pw1%array(1)
     195            0 :       ptr_pwout => pw2%array(l1, l2, l3)
     196              : 
     197              :       ! pointer to map array
     198            0 :       nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
     199            0 :       ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
     200              : 
     201              :       ! invoke the combined transformation
     202              : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     203              :       CALL pw_gpu_sfffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, nmaps, scale)
     204              : #else
     205            0 :       CPABORT("Compiled without pw offloading")
     206              : #endif
     207              : 
     208            0 :       CALL timestop(handle)
     209            0 :    END SUBROUTINE pw_gpu_c1dr3d_3d
     210              : 
     211              : ! **************************************************************************************************
     212              : !> \brief perform an parallel fft followed by a gather on the gpu
     213              : !> \param pw1 ...
     214              : !> \param pw2 ...
     215              : !> \author Andreas Gloess
     216              : ! **************************************************************************************************
     217            0 :    SUBROUTINE pw_gpu_r3dc1d_3d_ps(pw1, pw2)
     218              :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw1
     219              :       TYPE(pw_c1d_gs_type), INTENT(INOUT)                :: pw2
     220              : 
     221              :       CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_r3dc1d_3d_ps'
     222              : 
     223            0 :       COMPLEX(KIND=dp), DIMENSION(:, :), POINTER         :: grays, pbuf, qbuf, rbuf, sbuf
     224            0 :       COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER      :: tbuf
     225              :       INTEGER                                            :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
     226              :                                                             mz2, n1, n2, ngpts, nmax, numtask, rp
     227            0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: p2p
     228              :       INTEGER, DIMENSION(2)                              :: r_dim, r_pos
     229            0 :       INTEGER, DIMENSION(:), POINTER                     :: n, nloc, nyzray
     230            0 :       INTEGER, DIMENSION(:, :, :, :), POINTER            :: bo
     231              :       REAL(KIND=dp)                                      :: scale
     232            0 :       TYPE(fft_scratch_sizes)                            :: fft_scratch_size
     233              :       TYPE(fft_scratch_type), POINTER                    :: fft_scratch
     234            0 :       TYPE(mp_cart_type)                                 :: rs_group
     235              : 
     236            0 :       CALL timeset(routineN, handle)
     237              : 
     238            0 :       scale = 1.0_dp/REAL(pw1%pw_grid%ngpts, KIND=dp)
     239              : 
     240              :       ! dimensions
     241            0 :       n => pw1%pw_grid%npts
     242            0 :       nloc => pw1%pw_grid%npts_local
     243            0 :       grays => pw1%pw_grid%grays
     244            0 :       ngpts = nloc(1)*nloc(2)*nloc(3)
     245              : 
     246              :       !..transform
     247            0 :       IF (pw1%pw_grid%para%ray_distribution) THEN
     248            0 :          rs_group = pw1%pw_grid%para%group
     249            0 :          nyzray => pw1%pw_grid%para%nyzray
     250            0 :          bo => pw1%pw_grid%para%bo
     251              : 
     252            0 :          g_pos = rs_group%mepos
     253            0 :          numtask = rs_group%num_pe
     254            0 :          r_dim = rs_group%num_pe_cart
     255            0 :          r_pos = rs_group%mepos_cart
     256              : 
     257            0 :          lg = SIZE(grays, 1)
     258            0 :          mg = SIZE(grays, 2)
     259            0 :          mmax = MAX(mg, 1)
     260            0 :          lmax = MAX(lg, (ngpts/mmax + 1))
     261              : 
     262            0 :          ALLOCATE (p2p(0:numtask - 1))
     263              : 
     264            0 :          CALL rs_group%rank_compare(rs_group, p2p)
     265              : 
     266            0 :          rp = p2p(g_pos)
     267            0 :          mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
     268            0 :          mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
     269            0 :          n1 = MAXVAL(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
     270            0 :          n2 = MAXVAL(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
     271            0 :          nmax = MAX((2*n2)/numtask, 2)*mx2*mz2
     272            0 :          nmax = MAX(nmax, n1*MAXVAL(nyzray))
     273              : 
     274            0 :          fft_scratch_size%nx = nloc(1)
     275            0 :          fft_scratch_size%ny = nloc(2)
     276            0 :          fft_scratch_size%nz = nloc(3)
     277            0 :          fft_scratch_size%lmax = lmax
     278            0 :          fft_scratch_size%mmax = mmax
     279            0 :          fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
     280            0 :          fft_scratch_size%mx2 = mx2
     281            0 :          fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
     282            0 :          fft_scratch_size%mz2 = mz2
     283            0 :          fft_scratch_size%lg = lg
     284            0 :          fft_scratch_size%mg = mg
     285            0 :          fft_scratch_size%nbx = MAXVAL(bo(2, 1, :, 2))
     286            0 :          fft_scratch_size%nbz = MAXVAL(bo(2, 3, :, 2))
     287            0 :          fft_scratch_size%mcz1 = MAXVAL(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
     288            0 :          fft_scratch_size%mcx2 = MAXVAL(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
     289            0 :          fft_scratch_size%mcz2 = MAXVAL(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
     290            0 :          fft_scratch_size%nmax = nmax
     291            0 :          fft_scratch_size%nmray = MAXVAL(nyzray)
     292            0 :          fft_scratch_size%nyzray = nyzray(g_pos)
     293            0 :          fft_scratch_size%rs_group = rs_group
     294            0 :          fft_scratch_size%g_pos = g_pos
     295            0 :          fft_scratch_size%r_pos = r_pos
     296            0 :          fft_scratch_size%r_dim = r_dim
     297            0 :          fft_scratch_size%numtask = numtask
     298              : 
     299            0 :          IF (r_dim(2) > 1) THEN
     300              :             !
     301              :             ! real space is distributed over x and y coordinate
     302              :             ! we have two stages of communication
     303              :             !
     304            0 :             IF (r_dim(1) == 1) &
     305            0 :                CPABORT("This processor distribution is not supported.")
     306              : 
     307            0 :             CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
     308              : 
     309              :             ! assign buffers
     310            0 :             qbuf => fft_scratch%p2buf
     311            0 :             rbuf => fft_scratch%p3buf
     312            0 :             pbuf => fft_scratch%p4buf
     313            0 :             sbuf => fft_scratch%p5buf
     314              : 
     315              :             ! FFT along z
     316            0 :             CALL pw_gpu_cf(pw1, qbuf)
     317              : 
     318              :             ! Exchange data ( transpose of matrix )
     319            0 :             CALL cube_transpose_2(qbuf, bo(:, :, :, 1), bo(:, :, :, 2), rbuf, fft_scratch)
     320              : 
     321              :             ! FFT along y
     322              :             ! use the inbuild fft-lib
     323              :             ! CALL fft_1dm(fft_scratch%fft_plan(2), rbuf, pbuf, 1.0_dp, stat)
     324              :             ! or cufft (works faster, but is only faster if plans are stored)
     325            0 :             CALL pw_gpu_f(rbuf, pbuf, +1, n(2), mx2*mz2)
     326              : 
     327              :             ! Exchange data ( transpose of matrix ) and sort
     328              :             CALL xz_to_yz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
     329            0 :                           bo(:, :, :, 2), sbuf, fft_scratch)
     330              : 
     331              :             ! FFT along x
     332            0 :             CALL pw_gpu_fg(sbuf, pw2, scale)
     333              : 
     334            0 :             CALL release_fft_scratch(fft_scratch)
     335              : 
     336              :          ELSE
     337              :             !
     338              :             ! real space is only distributed over x coordinate
     339              :             ! we have one stage of communication, after the transform of
     340              :             ! direction x
     341              :             !
     342              : 
     343            0 :             CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
     344              : 
     345              :             ! assign buffers
     346            0 :             tbuf => fft_scratch%tbuf
     347            0 :             sbuf => fft_scratch%r1buf
     348              : 
     349              :             ! FFT along y and z
     350            0 :             CALL pw_gpu_cff(pw1, tbuf)
     351              : 
     352              :             ! Exchange data ( transpose of matrix ) and sort
     353              :             CALL yz_to_x(tbuf, rs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
     354            0 :                          bo(:, :, :, 2), sbuf, fft_scratch)
     355              : 
     356              :             ! FFT along x
     357            0 :             CALL pw_gpu_fg(sbuf, pw2, scale)
     358              : 
     359            0 :             CALL release_fft_scratch(fft_scratch)
     360              : 
     361              :          END IF
     362              : 
     363            0 :          DEALLOCATE (p2p)
     364              : 
     365              : !--------------------------------------------------------------------------
     366              :       ELSE
     367            0 :          CPABORT("Not implemented (no ray_distr.) in: pw_gpu_r3dc1d_3d_ps.")
     368              :       END IF
     369              : 
     370            0 :       CALL timestop(handle)
     371            0 :    END SUBROUTINE pw_gpu_r3dc1d_3d_ps
     372              : 
     373              : ! **************************************************************************************************
     374              : !> \brief perform an parallel scatter followed by a fft on the gpu
     375              : !> \param pw1 ...
     376              : !> \param pw2 ...
     377              : !> \author Andreas Gloess
     378              : ! **************************************************************************************************
     379            0 :    SUBROUTINE pw_gpu_c1dr3d_3d_ps(pw1, pw2)
     380              :       TYPE(pw_c1d_gs_type), INTENT(IN)                   :: pw1
     381              :       TYPE(pw_r3d_rs_type), INTENT(INOUT)                :: pw2
     382              : 
     383              :       CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_c1dr3d_3d_ps'
     384              : 
     385            0 :       COMPLEX(KIND=dp), DIMENSION(:, :), POINTER         :: grays, pbuf, qbuf, rbuf, sbuf
     386            0 :       COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER      :: tbuf
     387              :       INTEGER                                            :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
     388              :                                                             mz2, n1, n2, ngpts, nmax, numtask, rp
     389            0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: p2p
     390              :       INTEGER, DIMENSION(2)                              :: r_dim, r_pos
     391            0 :       INTEGER, DIMENSION(:), POINTER                     :: n, nloc, nyzray
     392            0 :       INTEGER, DIMENSION(:, :, :, :), POINTER            :: bo
     393              :       REAL(KIND=dp)                                      :: scale
     394            0 :       TYPE(fft_scratch_sizes)                            :: fft_scratch_size
     395              :       TYPE(fft_scratch_type), POINTER                    :: fft_scratch
     396            0 :       TYPE(mp_cart_type)                                 :: rs_group
     397              : 
     398            0 :       CALL timeset(routineN, handle)
     399              : 
     400            0 :       scale = 1.0_dp
     401              : 
     402              :       ! dimensions
     403            0 :       n => pw1%pw_grid%npts
     404            0 :       nloc => pw1%pw_grid%npts_local
     405            0 :       grays => pw1%pw_grid%grays
     406            0 :       ngpts = nloc(1)*nloc(2)*nloc(3)
     407              : 
     408              :       !..transform
     409            0 :       IF (pw1%pw_grid%para%ray_distribution) THEN
     410            0 :          rs_group = pw1%pw_grid%para%group
     411            0 :          nyzray => pw1%pw_grid%para%nyzray
     412            0 :          bo => pw1%pw_grid%para%bo
     413              : 
     414            0 :          g_pos = rs_group%mepos
     415            0 :          numtask = rs_group%num_pe
     416            0 :          r_dim = rs_group%num_pe_cart
     417            0 :          r_pos = rs_group%mepos_cart
     418              : 
     419            0 :          lg = SIZE(grays, 1)
     420            0 :          mg = SIZE(grays, 2)
     421            0 :          mmax = MAX(mg, 1)
     422            0 :          lmax = MAX(lg, (ngpts/mmax + 1))
     423              : 
     424            0 :          ALLOCATE (p2p(0:numtask - 1))
     425              : 
     426            0 :          CALL rs_group%rank_compare(rs_group, p2p)
     427              : 
     428            0 :          rp = p2p(g_pos)
     429            0 :          mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
     430            0 :          mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
     431            0 :          n1 = MAXVAL(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
     432            0 :          n2 = MAXVAL(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
     433            0 :          nmax = MAX((2*n2)/numtask, 2)*mx2*mz2
     434            0 :          nmax = MAX(nmax, n1*MAXVAL(nyzray))
     435              : 
     436            0 :          fft_scratch_size%nx = nloc(1)
     437            0 :          fft_scratch_size%ny = nloc(2)
     438            0 :          fft_scratch_size%nz = nloc(3)
     439            0 :          fft_scratch_size%lmax = lmax
     440            0 :          fft_scratch_size%mmax = mmax
     441            0 :          fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
     442            0 :          fft_scratch_size%mx2 = mx2
     443            0 :          fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
     444            0 :          fft_scratch_size%mz2 = mz2
     445            0 :          fft_scratch_size%lg = lg
     446            0 :          fft_scratch_size%mg = mg
     447            0 :          fft_scratch_size%nbx = MAXVAL(bo(2, 1, :, 2))
     448            0 :          fft_scratch_size%nbz = MAXVAL(bo(2, 3, :, 2))
     449            0 :          fft_scratch_size%mcz1 = MAXVAL(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
     450            0 :          fft_scratch_size%mcx2 = MAXVAL(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
     451            0 :          fft_scratch_size%mcz2 = MAXVAL(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
     452            0 :          fft_scratch_size%nmax = nmax
     453            0 :          fft_scratch_size%nmray = MAXVAL(nyzray)
     454            0 :          fft_scratch_size%nyzray = nyzray(g_pos)
     455            0 :          fft_scratch_size%rs_group = rs_group
     456            0 :          fft_scratch_size%g_pos = g_pos
     457            0 :          fft_scratch_size%r_pos = r_pos
     458            0 :          fft_scratch_size%r_dim = r_dim
     459            0 :          fft_scratch_size%numtask = numtask
     460              : 
     461            0 :          IF (r_dim(2) > 1) THEN
     462              :             !
     463              :             ! real space is distributed over x and y coordinate
     464              :             ! we have two stages of communication
     465              :             !
     466            0 :             IF (r_dim(1) == 1) &
     467            0 :                CPABORT("This processor distribution is not supported.")
     468              : 
     469            0 :             CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
     470              : 
     471              :             ! assign buffers
     472            0 :             pbuf => fft_scratch%p7buf
     473            0 :             qbuf => fft_scratch%p4buf
     474            0 :             rbuf => fft_scratch%p3buf
     475            0 :             sbuf => fft_scratch%p2buf
     476              : 
     477              :             ! FFT along x
     478            0 :             CALL pw_gpu_sf(pw1, pbuf, scale)
     479              : 
     480              :             ! Exchange data ( transpose of matrix ) and sort
     481            0 :             IF (pw1%pw_grid%grid_span /= FULLSPACE) qbuf = z_zero
     482              :             CALL yz_to_xz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
     483            0 :                           bo(:, :, :, 2), qbuf, fft_scratch)
     484              : 
     485              :             ! FFT along y
     486              :             ! use the inbuild fft-lib
     487              :             ! CALL fft_1dm(fft_scratch%fft_plan(5), qbuf, rbuf, 1.0_dp, stat)
     488              :             ! or cufft (works faster, but is only faster if plans are stored)
     489            0 :             CALL pw_gpu_f(qbuf, rbuf, -1, n(2), mx2*mz2)
     490              : 
     491              :             ! Exchange data ( transpose of matrix )
     492            0 :             IF (pw1%pw_grid%grid_span /= FULLSPACE) sbuf = z_zero
     493              : 
     494            0 :             CALL cube_transpose_1(rbuf, bo(:, :, :, 2), bo(:, :, :, 1), sbuf, fft_scratch)
     495              : 
     496              :             ! FFT along z
     497            0 :             CALL pw_gpu_fc(sbuf, pw2)
     498              : 
     499            0 :             CALL release_fft_scratch(fft_scratch)
     500              : 
     501              :          ELSE
     502              :             !
     503              :             ! real space is only distributed over x coordinate
     504              :             ! we have one stage of communication, after the transform of
     505              :             ! direction x
     506              :             !
     507              : 
     508            0 :             CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
     509              : 
     510              :             ! assign buffers
     511            0 :             sbuf => fft_scratch%r1buf
     512            0 :             tbuf => fft_scratch%tbuf
     513              : 
     514              :             ! FFT along x
     515            0 :             CALL pw_gpu_sf(pw1, sbuf, scale)
     516              : 
     517              :             ! Exchange data ( transpose of matrix ) and sort
     518            0 :             IF (pw1%pw_grid%grid_span /= FULLSPACE) tbuf = z_zero
     519              :             CALL x_to_yz(sbuf, rs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
     520            0 :                          bo(:, :, :, 2), tbuf, fft_scratch)
     521              : 
     522              :             ! FFT along y and z
     523            0 :             CALL pw_gpu_ffc(tbuf, pw2)
     524              : 
     525            0 :             CALL release_fft_scratch(fft_scratch)
     526              : 
     527              :          END IF
     528              : 
     529            0 :          DEALLOCATE (p2p)
     530              : 
     531              : !--------------------------------------------------------------------------
     532              :       ELSE
     533            0 :          CPABORT("Not implemented (no ray_distr.) in: pw_gpu_c1dr3d_3d_ps.")
     534              :       END IF
     535              : 
     536            0 :       CALL timestop(handle)
     537            0 :    END SUBROUTINE pw_gpu_c1dr3d_3d_ps
     538              : 
     539              : ! **************************************************************************************************
     540              : !> \brief perform a parallel real_to_complex copy followed by a 2D-FFT on the gpu
     541              : !> \param pw1 ...
     542              : !> \param pwbuf ...
     543              : !> \author Andreas Gloess
     544              : ! **************************************************************************************************
     545            0 :    SUBROUTINE pw_gpu_cff(pw1, pwbuf)
     546              :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw1
     547              :       COMPLEX(KIND=dp), DIMENSION(:, :, :), &
     548              :          INTENT(INOUT), TARGET                           :: pwbuf
     549              : 
     550              :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_cff'
     551              : 
     552              :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwout
     553              :       INTEGER                                            :: handle, l1, l2, l3
     554            0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     555              :       REAL(KIND=dp), POINTER                             :: ptr_pwin
     556              :       INTERFACE
     557              :          SUBROUTINE pw_gpu_cff_c(din, zout, npts) BIND(C, name="pw_gpu_cff")
     558              :             IMPORT
     559              :             TYPE(C_PTR), INTENT(IN), VALUE               :: din
     560              :             TYPE(C_PTR), VALUE                           :: zout
     561              :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     562              :          END SUBROUTINE pw_gpu_cff_c
     563              :       END INTERFACE
     564              : 
     565            0 :       CALL timeset(routineN, handle)
     566              : 
     567              :       ! dimensions
     568            0 :       npts => pw1%pw_grid%npts_local
     569              :       l1 = LBOUND(pw1%array, 1)
     570              :       l2 = LBOUND(pw1%array, 2)
     571              :       l3 = LBOUND(pw1%array, 3)
     572              : 
     573              :       ! pointers to data arrays
     574            0 :       ptr_pwin => pw1%array(l1, l2, l3)
     575            0 :       ptr_pwout => pwbuf(1, 1, 1)
     576              : 
     577              :       ! invoke the combined transformation
     578              : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     579              :       CALL pw_gpu_cff_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
     580              : #else
     581            0 :       CPABORT("Compiled without pw offloading")
     582              : #endif
     583              : 
     584            0 :       CALL timestop(handle)
     585            0 :    END SUBROUTINE pw_gpu_cff
     586              : 
     587              : ! **************************************************************************************************
     588              : !> \brief perform a parallel 2D-FFT followed by a complex_to_real copy on the gpu
     589              : !> \param pwbuf ...
     590              : !> \param pw2 ...
     591              : !> \author Andreas Gloess
     592              : ! **************************************************************************************************
     593            0 :    SUBROUTINE pw_gpu_ffc(pwbuf, pw2)
     594              :       COMPLEX(KIND=dp), DIMENSION(:, :, :), INTENT(IN), &
     595              :          TARGET                                          :: pwbuf
     596              :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw2
     597              : 
     598              :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_ffc'
     599              : 
     600              :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin
     601              :       INTEGER                                            :: handle, l1, l2, l3
     602            0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     603              :       REAL(KIND=dp), POINTER                             :: ptr_pwout
     604              :       INTERFACE
     605              :          SUBROUTINE pw_gpu_ffc_c(zin, dout, npts) BIND(C, name="pw_gpu_ffc")
     606              :             IMPORT
     607              :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     608              :             TYPE(C_PTR), VALUE                           :: dout
     609              :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     610              :          END SUBROUTINE pw_gpu_ffc_c
     611              :       END INTERFACE
     612              : 
     613            0 :       CALL timeset(routineN, handle)
     614              : 
     615              :       ! dimensions
     616            0 :       npts => pw2%pw_grid%npts_local
     617              :       l1 = LBOUND(pw2%array, 1)
     618              :       l2 = LBOUND(pw2%array, 2)
     619              :       l3 = LBOUND(pw2%array, 3)
     620              : 
     621              :       ! pointers to data arrays
     622            0 :       ptr_pwin => pwbuf(1, 1, 1)
     623            0 :       ptr_pwout => pw2%array(l1, l2, l3)
     624              : 
     625              :       ! invoke the combined transformation
     626              : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     627              :       CALL pw_gpu_ffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
     628              : #else
     629            0 :       CPABORT("Compiled without pw offloading")
     630              : #endif
     631              : 
     632            0 :       CALL timestop(handle)
     633            0 :    END SUBROUTINE pw_gpu_ffc
     634              : 
     635              : ! **************************************************************************************************
     636              : !> \brief perform a parallel real_to_complex copy followed by a 1D-FFT on the gpu
     637              : !> \param pw1 ...
     638              : !> \param pwbuf ...
     639              : !> \author Andreas Gloess
     640              : ! **************************************************************************************************
     641            0 :    SUBROUTINE pw_gpu_cf(pw1, pwbuf)
     642              :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw1
     643              :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
     644              :          TARGET                                          :: pwbuf
     645              : 
     646              :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_cf'
     647              : 
     648              :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwout
     649              :       INTEGER                                            :: handle, l1, l2, l3
     650            0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     651              :       REAL(KIND=dp), POINTER                             :: ptr_pwin
     652              :       INTERFACE
     653              :          SUBROUTINE pw_gpu_cf_c(din, zout, npts) BIND(C, name="pw_gpu_cf")
     654              :             IMPORT
     655              :             TYPE(C_PTR), INTENT(IN), VALUE               :: din
     656              :             TYPE(C_PTR), VALUE                           :: zout
     657              :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     658              :          END SUBROUTINE pw_gpu_cf_c
     659              :       END INTERFACE
     660              : 
     661            0 :       CALL timeset(routineN, handle)
     662              : 
     663              :       ! dimensions
     664            0 :       npts => pw1%pw_grid%npts_local
     665              :       l1 = LBOUND(pw1%array, 1)
     666              :       l2 = LBOUND(pw1%array, 2)
     667              :       l3 = LBOUND(pw1%array, 3)
     668              : 
     669              :       ! pointers to data arrays
     670            0 :       ptr_pwin => pw1%array(l1, l2, l3)
     671            0 :       ptr_pwout => pwbuf(1, 1)
     672              : 
     673              :       ! invoke the combined transformation
     674              : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     675              :       CALL pw_gpu_cf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
     676              : #else
     677            0 :       CPABORT("Compiled without pw offloading")
     678              : #endif
     679            0 :       CALL timestop(handle)
     680            0 :    END SUBROUTINE pw_gpu_cf
     681              : 
     682              : ! **************************************************************************************************
     683              : !> \brief perform a parallel 1D-FFT followed by a complex_to_real copy on the gpu
     684              : !> \param pwbuf ...
     685              : !> \param pw2 ...
     686              : !> \author Andreas Gloess
     687              : ! **************************************************************************************************
     688            0 :    SUBROUTINE pw_gpu_fc(pwbuf, pw2)
     689              :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
     690              :          TARGET                                          :: pwbuf
     691              :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw2
     692              : 
     693              :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_fc'
     694              : 
     695              :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin
     696              :       INTEGER                                            :: handle, l1, l2, l3
     697            0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     698              :       REAL(KIND=dp), POINTER                             :: ptr_pwout
     699              :       INTERFACE
     700              :          SUBROUTINE pw_gpu_fc_c(zin, dout, npts) BIND(C, name="pw_gpu_fc")
     701              :             IMPORT
     702              :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     703              :             TYPE(C_PTR), VALUE                           :: dout
     704              :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     705              :          END SUBROUTINE pw_gpu_fc_c
     706              :       END INTERFACE
     707              : 
     708            0 :       CALL timeset(routineN, handle)
     709              : 
     710            0 :       npts => pw2%pw_grid%npts_local
     711              :       l1 = LBOUND(pw2%array, 1)
     712              :       l2 = LBOUND(pw2%array, 2)
     713              :       l3 = LBOUND(pw2%array, 3)
     714              : 
     715              :       ! pointers to data arrays
     716            0 :       ptr_pwin => pwbuf(1, 1)
     717            0 :       ptr_pwout => pw2%array(l1, l2, l3)
     718              : 
     719              :       ! invoke the combined transformation
     720              : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     721              :       CALL pw_gpu_fc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
     722              : #else
     723            0 :       CPABORT("Compiled without pw offloading")
     724              : #endif
     725              : 
     726            0 :       CALL timestop(handle)
     727            0 :    END SUBROUTINE pw_gpu_fc
     728              : 
     729              : ! **************************************************************************************************
     730              : !> \brief perform a parallel 1D-FFT on the gpu
     731              : !> \param pwbuf1 ...
     732              : !> \param pwbuf2 ...
     733              : !> \param dir ...
     734              : !> \param n ...
     735              : !> \param m ...
     736              : !> \author Andreas Gloess
     737              : ! **************************************************************************************************
     738            0 :    SUBROUTINE pw_gpu_f(pwbuf1, pwbuf2, dir, n, m)
     739              :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
     740              :          TARGET                                          :: pwbuf1
     741              :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
     742              :          TARGET                                          :: pwbuf2
     743              :       INTEGER, INTENT(IN)                                :: dir, n, m
     744              : 
     745              :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_f'
     746              : 
     747              :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin, ptr_pwout
     748              :       INTEGER                                            :: handle
     749              :       INTERFACE
     750              :          SUBROUTINE pw_gpu_f_c(zin, zout, dir, n, m) BIND(C, name="pw_gpu_f")
     751              :             IMPORT
     752              :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     753              :             TYPE(C_PTR), VALUE                           :: zout
     754              :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: dir, n, m
     755              :          END SUBROUTINE pw_gpu_f_c
     756              :       END INTERFACE
     757              : 
     758            0 :       CALL timeset(routineN, handle)
     759              : 
     760            0 :       IF (n*m /= 0) THEN
     761              :          ! pointers to data arrays
     762            0 :          ptr_pwin => pwbuf1(1, 1)
     763            0 :          ptr_pwout => pwbuf2(1, 1)
     764              : 
     765              :          ! invoke the combined transformation
     766              : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     767              :          CALL pw_gpu_f_c(c_loc(ptr_pwin), c_loc(ptr_pwout), dir, n, m)
     768              : #else
     769              :          MARK_USED(dir)
     770            0 :          CPABORT("Compiled without pw offloading")
     771              : #endif
     772              :       END IF
     773              : 
     774            0 :       CALL timestop(handle)
     775            0 :    END SUBROUTINE pw_gpu_f
     776              : ! **************************************************************************************************
     777              : !> \brief perform a parallel 1D-FFT followed by a gather on the gpu
     778              : !> \param pwbuf ...
     779              : !> \param pw2 ...
     780              : !> \param scale ...
     781              : !> \author Andreas Gloess
     782              : ! **************************************************************************************************
     783            0 :    SUBROUTINE pw_gpu_fg(pwbuf, pw2, scale)
     784              :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
     785              :          TARGET                                          :: pwbuf
     786              :       TYPE(pw_c1d_gs_type), INTENT(IN)                   :: pw2
     787              :       REAL(KIND=dp), INTENT(IN)                          :: scale
     788              : 
     789              :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_fg'
     790              : 
     791              :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin, ptr_pwout
     792              :       INTEGER                                            :: handle, mg, mmax, ngpts
     793            0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     794              :       INTEGER, POINTER                                   :: ptr_ghatmap
     795              :       INTERFACE
     796              :          SUBROUTINE pw_gpu_fg_c(zin, zout, ghatmap, npts, mmax, ngpts, scale) BIND(C, name="pw_gpu_fg")
     797              :             IMPORT
     798              :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     799              :             TYPE(C_PTR), VALUE                           :: zout
     800              :             TYPE(C_PTR), INTENT(IN), VALUE               :: ghatmap
     801              :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     802              :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: mmax, ngpts
     803              :             REAL(KIND=C_DOUBLE), INTENT(IN), VALUE       :: scale
     804              : 
     805              :          END SUBROUTINE pw_gpu_fg_c
     806              :       END INTERFACE
     807              : 
     808            0 :       CALL timeset(routineN, handle)
     809              : 
     810            0 :       ngpts = SIZE(pw2%pw_grid%gsq)
     811            0 :       npts => pw2%pw_grid%npts
     812              : 
     813            0 :       IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
     814            0 :          mg = SIZE(pw2%pw_grid%grays, 2)
     815            0 :          mmax = MAX(mg, 1)
     816              : 
     817              :          ! pointers to data arrays
     818            0 :          ptr_pwin => pwbuf(1, 1)
     819            0 :          ptr_pwout => pw2%array(1)
     820              : 
     821              :          ! pointer to map array
     822            0 :          ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
     823              : 
     824              :          ! invoke the combined transformation
     825              : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     826              :          CALL pw_gpu_fg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, scale)
     827              : #else
     828              :          MARK_USED(scale)
     829            0 :          CPABORT("Compiled without pw offloading")
     830              : #endif
     831              :       END IF
     832              : 
     833            0 :       CALL timestop(handle)
     834            0 :    END SUBROUTINE pw_gpu_fg
     835              : 
     836              : ! **************************************************************************************************
     837              : !> \brief perform a parallel scatter followed by a 1D-FFT on the gpu
     838              : !> \param pw1 ...
     839              : !> \param pwbuf ...
     840              : !> \param scale ...
     841              : !> \author Andreas Gloess
     842              : ! **************************************************************************************************
     843            0 :    SUBROUTINE pw_gpu_sf(pw1, pwbuf, scale)
     844              :       TYPE(pw_c1d_gs_type), INTENT(IN)                   :: pw1
     845              :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
     846              :          TARGET                                          :: pwbuf
     847              :       REAL(KIND=dp), INTENT(IN)                          :: scale
     848              : 
     849              :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_sf'
     850              : 
     851              :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin, ptr_pwout
     852              :       INTEGER                                            :: handle, mg, mmax, ngpts, nmaps
     853            0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     854              :       INTEGER, POINTER                                   :: ptr_ghatmap
     855              :       INTERFACE
     856              :          SUBROUTINE pw_gpu_sf_c(zin, zout, ghatmap, npts, mmax, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sf")
     857              :             IMPORT
     858              :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     859              :             TYPE(C_PTR), VALUE                           :: zout
     860              :             TYPE(C_PTR), INTENT(IN), VALUE               :: ghatmap
     861              :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     862              :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: mmax, ngpts, nmaps
     863              :             REAL(KIND=C_DOUBLE), INTENT(IN), VALUE       :: scale
     864              : 
     865              :          END SUBROUTINE pw_gpu_sf_c
     866              :       END INTERFACE
     867              : 
     868            0 :       CALL timeset(routineN, handle)
     869              : 
     870            0 :       ngpts = SIZE(pw1%pw_grid%gsq)
     871            0 :       npts => pw1%pw_grid%npts
     872              : 
     873            0 :       IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
     874            0 :          mg = SIZE(pw1%pw_grid%grays, 2)
     875            0 :          mmax = MAX(mg, 1)
     876              : 
     877              :          ! pointers to data arrays
     878            0 :          ptr_pwin => pw1%array(1)
     879            0 :          ptr_pwout => pwbuf(1, 1)
     880              : 
     881              :          ! pointer to map array
     882            0 :          nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
     883            0 :          ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
     884              : 
     885              :          ! invoke the combined transformation
     886              : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     887              :          CALL pw_gpu_sf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, nmaps, scale)
     888              : #else
     889              :          MARK_USED(scale)
     890            0 :          CPABORT("Compiled without pw offloading")
     891              : #endif
     892              :       END IF
     893              : 
     894            0 :       CALL timestop(handle)
     895            0 :    END SUBROUTINE pw_gpu_sf
     896              : 
     897              : END MODULE pw_gpu
     898              : 
        

Generated by: LCOV version 2.0-1