LCOV - code coverage report
Current view: top level - src/pw - ps_wavelet_base.F (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:c24029e) Lines: 97.1 % 763 741
Test Date: 2026-07-04 06:36:57 Functions: 96.2 % 26 25

            Line data    Source code
       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2026 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief Creates the wavelet kernel for the wavelet based poisson solver.
      10              : !> \author Florian Schiffmann (09.2007,fschiff)
      11              : ! **************************************************************************************************
      12              : MODULE ps_wavelet_base
      13              : 
      14              :    USE kinds,                           ONLY: dp
      15              :    USE message_passing,                 ONLY: mp_comm_type
      16              :    USE ps_wavelet_fft3d,                ONLY: ctrig,&
      17              :                                               ctrig_length,&
      18              :                                               fftstp
      19              : #include "../base/base_uses.f90"
      20              : 
      21              :    IMPLICIT NONE
      22              : 
      23              :    PRIVATE
      24              : 
      25              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'ps_wavelet_base'
      26              : 
      27              :    PUBLIC :: scramble_unpack, p_poissonsolver, s_poissonsolver, f_poissonsolver
      28              : 
      29              : CONTAINS
      30              : 
      31              : ! **************************************************************************************************
      32              : !> \brief ...
      33              : !> \param n1 ...
      34              : !> \param n2 ...
      35              : !> \param n3 ...
      36              : !> \param nd1 ...
      37              : !> \param nd2 ...
      38              : !> \param nd3 ...
      39              : !> \param md1 ...
      40              : !> \param md2 ...
      41              : !> \param md3 ...
      42              : !> \param nproc ...
      43              : !> \param iproc ...
      44              : !> \param zf ...
      45              : !> \param scal ...
      46              : !> \param hx ...
      47              : !> \param hy ...
      48              : !> \param hz ...
      49              : !> \param mpi_group ...
      50              : ! **************************************************************************************************
      51        17323 :    SUBROUTINE P_PoissonSolver(n1, n2, n3, nd1, nd2, nd3, md1, md2, md3, nproc, iproc, zf &
      52              :                               , scal, hx, hy, hz, mpi_group)
      53              :       INTEGER, INTENT(in)                                :: n1, n2, n3, nd1, nd2, nd3, md1, md2, &
      54              :                                                             md3, nproc, iproc
      55              :       REAL(KIND=dp), DIMENSION(md1, md3, md2/nproc), &
      56              :          INTENT(inout)                                   :: zf
      57              :       REAL(KIND=dp), INTENT(in)                          :: scal, hx, hy, hz
      58              : 
      59              :       CLASS(mp_comm_type), INTENT(in)                     :: mpi_group
      60              : 
      61              :       INTEGER, PARAMETER                                 :: ncache_optimal = 8*1024
      62              : 
      63              :       INTEGER                                            :: i, i1, i3, ic1, ic2, ic3, inzee, j, j2, &
      64              :                                                             J2stb, J2stf, j3, Jp2stb, Jp2stf, lot, &
      65              :                                                             lzt, ma, mb, ncache, nfft
      66        17323 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: after1, after2, after3, before1, &
      67        17323 :                                                             before2, before3, now1, now2, now3
      68        17323 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: btrig1, btrig2, btrig3, ftrig1, ftrig2, &
      69        17323 :                                                             ftrig3
      70        17323 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: zt, zw
      71              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :)  :: zmpi2
      72              :       REAL(KIND=dp), ALLOCATABLE, &
      73        17323 :          DIMENSION(:, :, :, :, :)                        :: zmpi1
      74              : 
      75        17323 :       IF (nd1 < n1/2 + 1) CPABORT("Parallel convolution:ERROR:nd1")
      76        17323 :       IF (nd2 < n2/2 + 1) CPABORT("Parallel convolution:ERROR:nd2")
      77        17323 :       IF (nd3 < n3/2 + 1) CPABORT("Parallel convolution:ERROR:nd3")
      78        17323 :       IF (md1 < n1) CPABORT("Parallel convolution:ERROR:md1")
      79        17323 :       IF (md2 < n2) CPABORT("Parallel convolution:ERROR:md2")
      80        17323 :       IF (md3 < n3) CPABORT("Parallel convolution:ERROR:md3")
      81        17323 :       IF (MOD(nd3, nproc) /= 0) CPABORT("Parallel convolution:ERROR:nd3")
      82        17323 :       IF (MOD(md2, nproc) /= 0) CPABORT("Parallel convolution:ERROR:md2")
      83              : 
      84              :       !defining work arrays dimensions
      85        17323 :       ncache = ncache_optimal
      86        17323 :       IF (ncache <= MAX(n1, n2, n3)*4) ncache = MAX(n1, n2, n3)*4
      87              : 
      88        17323 :       lzt = n2
      89        17323 :       IF (MOD(n2, 2) == 0) lzt = lzt + 1
      90        17323 :       IF (MOD(n2, 4) == 0) lzt = lzt + 1 !maybe this is useless
      91              : 
      92              :       !Allocations
      93        17323 :       ALLOCATE (btrig1(2, ctrig_length))
      94        17323 :       ALLOCATE (ftrig1(2, ctrig_length))
      95        17323 :       ALLOCATE (after1(7))
      96        17323 :       ALLOCATE (now1(7))
      97        17323 :       ALLOCATE (before1(7))
      98        17323 :       ALLOCATE (btrig2(2, ctrig_length))
      99        17323 :       ALLOCATE (ftrig2(2, ctrig_length))
     100        17323 :       ALLOCATE (after2(7))
     101        17323 :       ALLOCATE (now2(7))
     102        17323 :       ALLOCATE (before2(7))
     103        17323 :       ALLOCATE (btrig3(2, ctrig_length))
     104        17323 :       ALLOCATE (ftrig3(2, ctrig_length))
     105        17323 :       ALLOCATE (after3(7))
     106        17323 :       ALLOCATE (now3(7))
     107        17323 :       ALLOCATE (before3(7))
     108        69292 :       ALLOCATE (zw(2, ncache/4, 2))
     109        69292 :       ALLOCATE (zt(2, lzt, n1))
     110        86615 :       ALLOCATE (zmpi2(2, n1, md2/nproc, nd3))
     111        41353 :       IF (nproc > 1) ALLOCATE (zmpi1(2, n1, md2/nproc, nd3/nproc, nproc))
     112              : 
     113              :       !calculating the FFT work arrays (beware on the HalFFT in n3 dimension)
     114        17323 :       CALL ctrig(n3, btrig3, after3, before3, now3, 1, ic3)
     115        17323 :       CALL ctrig(n1, btrig1, after1, before1, now1, 1, ic1)
     116        17323 :       CALL ctrig(n2, btrig2, after2, before2, now2, 1, ic2)
     117       375192 :       DO j = 1, n1
     118       357869 :          ftrig1(1, j) = btrig1(1, j)
     119       375192 :          ftrig1(2, j) = -btrig1(2, j)
     120              :       END DO
     121       375192 :       DO j = 1, n2
     122       357869 :          ftrig2(1, j) = btrig2(1, j)
     123       375192 :          ftrig2(2, j) = -btrig2(2, j)
     124              :       END DO
     125       375192 :       DO j = 1, n3
     126       357869 :          ftrig3(1, j) = btrig3(1, j)
     127       375192 :          ftrig3(2, j) = -btrig3(2, j)
     128              :       END DO
     129              : 
     130              :       ! transform along z axis
     131        17323 :       lot = ncache/(4*n3)
     132        17323 :       IF (lot < 1) THEN
     133              :          CALL cp_abort(__LOCATION__, &
     134              :                        'convolxc_off:ncache has to be enlarged to be able to hold at '// &
     135              :                        'least one 1-d FFT of this size even though this will '// &
     136            0 :                        'reduce the performance for shorter transform lengths')
     137              :       END IF
     138       320326 :       DO j2 = 1, md2/nproc
     139              :          !this condition ensures that we manage only the interesting part for the FFT
     140       320326 :          IF (iproc*(md2/nproc) + j2 <= n2) THEN
     141       613678 :             DO i1 = 1, n1, lot
     142       311348 :                ma = i1
     143       311348 :                mb = MIN(i1 + (lot - 1), n1)
     144       311348 :                nfft = mb - ma + 1
     145              :                !inserting real data into complex array of half length
     146       311348 :                CALL P_fill_upcorn(md1, md3, lot, nfft, n3, zf(i1, 1, j2), zw(1, 1, 1))
     147              : 
     148              :                !performing FFT
     149              :                !input: I1,I3,J2,(Jp2)
     150       311348 :                inzee = 1
     151       971439 :                DO i = 1, ic3
     152              :                   CALL fftstp(lot, nfft, n3, lot, n3, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
     153       660091 :                               btrig3, after3(i), now3(i), before3(i), 1)
     154       971439 :                   inzee = 3 - inzee
     155              :                END DO
     156              : 
     157              :                !output: I1,i3,J2,(Jp2)
     158              :                !exchanging components
     159              :                !input: I1,i3,J2,(Jp2)
     160       613678 :                CALL scramble_P(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zw(1, 1, inzee), zmpi2)
     161              :                !output: I1,J2,i3,(Jp2)
     162              :             END DO
     163              :          END IF
     164              :       END DO
     165              : 
     166              :       !Interprocessor data transposition
     167              :       !input: I1,J2,j3,jp3,(Jp2)
     168        17323 :       IF (nproc > 1) THEN
     169              :          !communication scheduling
     170         4806 :          CALL mpi_group%alltoall(zmpi2, zmpi1, 2*n1*(md2/nproc)*(nd3/nproc))
     171              :       END IF
     172              :       !output: I1,J2,j3,Jp2,(jp3)
     173              : 
     174              :       !now each process perform complete convolution of its planes
     175       182731 :       DO j3 = 1, nd3/nproc
     176              :          !this condition ensures that we manage only the interesting part for the FFT
     177       182731 :          IF (iproc*(nd3/nproc) + j3 <= n3/2 + 1) THEN
     178       164470 :             Jp2stb = 1
     179       164470 :             J2stb = 1
     180       164470 :             Jp2stf = 1
     181       164470 :             J2stf = 1
     182              : 
     183              :             ! transform along x axis
     184       164470 :             lot = ncache/(4*n1)
     185       164470 :             IF (lot < 1) THEN
     186              :                CALL cp_abort(__LOCATION__, &
     187              :                              'convolxc_off:ncache has to be enlarged to be able to hold at '// &
     188              :                              'least one 1-d FFT of this size even though this will '// &
     189            0 :                              'reduce the performance for shorter transform lengths')
     190              :             END IF
     191              : 
     192       333616 :             DO j = 1, n2, lot
     193       169146 :                ma = j
     194       169146 :                mb = MIN(j + (lot - 1), n2)
     195       169146 :                nfft = mb - ma + 1
     196              : 
     197              :                !reverse index ordering, leaving the planes to be transformed at the end
     198              :                !input: I1,J2,j3,Jp2,(jp3)
     199       169146 :                IF (nproc == 1) THEN
     200       136958 :                   CALL P_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi2, zw(1, 1, 1))
     201              :                ELSE
     202        32188 :                   CALL P_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw(1, 1, 1))
     203              :                END IF
     204              :                !output: J2,Jp2,I1,j3,(jp3)
     205              : 
     206              :                !performing FFT
     207              :                !input: I2,I1,j3,(jp3)
     208       169146 :                inzee = 1
     209       357848 :                DO i = 1, ic1 - 1
     210              :                   CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
     211       188702 :                               btrig1, after1(i), now1(i), before1(i), 1)
     212       357848 :                   inzee = 3 - inzee
     213              :                END DO
     214              : 
     215              :                !storing the last step into zt array
     216       169146 :                i = ic1
     217              :                CALL fftstp(lot, nfft, n1, lzt, n1, zw(1, 1, inzee), zt(1, j, 1), &
     218       333616 :                            btrig1, after1(i), now1(i), before1(i), 1)
     219              :                !output: I2,i1,j3,(jp3)
     220              :             END DO
     221              : 
     222              :             !transform along y axis
     223       164470 :             lot = ncache/(4*n2)
     224       164470 :             IF (lot < 1) THEN
     225              :                CALL cp_abort(__LOCATION__, &
     226              :                              'convolxc_off:ncache has to be enlarged to be able to hold at '// &
     227              :                              'least one 1-d FFT of this size even though this will '// &
     228            0 :                              'reduce the performance for shorter transform lengths')
     229              :             END IF
     230              : 
     231       333616 :             DO j = 1, n1, lot
     232       169146 :                ma = j
     233       169146 :                mb = MIN(j + (lot - 1), n1)
     234       169146 :                nfft = mb - ma + 1
     235              : 
     236              :                !reverse ordering
     237              :                !input: I2,i1,j3,(jp3)
     238       169146 :                CALL P_switch_upcorn(nfft, n2, lot, n1, lzt, zt(1, 1, j), zw(1, 1, 1))
     239              :                !output: i1,I2,j3,(jp3)
     240              : 
     241              :                !performing FFT
     242              :                !input: i1,I2,j3,(jp3)
     243       169146 :                inzee = 1
     244       526994 :                DO i = 1, ic2
     245              :                   CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
     246       357848 :                               btrig2, after2(i), now2(i), before2(i), 1)
     247       526994 :                   inzee = 3 - inzee
     248              :                END DO
     249              :                !output: i1,i2,j3,(jp3)
     250              : 
     251              :                !Multiply with kernel in fourier space
     252       169146 :                i3 = iproc*(nd3/nproc) + j3
     253       169146 :                CALL P_multkernel(n1, n2, n3, lot, nfft, j, i3, zw(1, 1, inzee), hx, hy, hz)
     254              : 
     255              :                !TRANSFORM BACK IN REAL SPACE
     256              : 
     257              :                !transform along y axis
     258              :                !input: i1,i2,j3,(jp3)
     259       526994 :                DO i = 1, ic2
     260              :                   CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
     261       357848 :                               ftrig2, after2(i), now2(i), before2(i), -1)
     262       526994 :                   inzee = 3 - inzee
     263              :                END DO
     264              : 
     265              :                !reverse ordering
     266              :                !input: i1,I2,j3,(jp3)
     267       333616 :                CALL P_unswitch_downcorn(nfft, n2, lot, n1, lzt, zw(1, 1, inzee), zt(1, 1, j))
     268              :                !output: I2,i1,j3,(jp3)
     269              :             END DO
     270              : 
     271              :             !transform along x axis
     272              :             !input: I2,i1,j3,(jp3)
     273       164470 :             lot = ncache/(4*n1)
     274       333616 :             DO j = 1, n2, lot
     275       169146 :                ma = j
     276       169146 :                mb = MIN(j + (lot - 1), n2)
     277       169146 :                nfft = mb - ma + 1
     278              : 
     279              :                !performing FFT
     280       169146 :                i = 1
     281              :                CALL fftstp(lzt, nfft, n1, lot, n1, zt(1, j, 1), zw(1, 1, 1), &
     282       169146 :                            ftrig1, after1(i), now1(i), before1(i), -1)
     283              : 
     284       169146 :                inzee = 1
     285       357848 :                DO i = 2, ic1
     286              :                   CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
     287       188702 :                               ftrig1, after1(i), now1(i), before1(i), -1)
     288       357848 :                   inzee = 3 - inzee
     289              :                END DO
     290              :                !output: I2,I1,j3,(jp3)
     291              : 
     292              :                !reverse ordering
     293              :                !input: J2,Jp2,I1,j3,(jp3)
     294       333616 :                IF (nproc == 1) THEN
     295       136958 :                   CALL P_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi2)
     296              :                ELSE
     297        32188 :                   CALL P_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi1)
     298              :                END IF
     299              :                ! output: I1,J2,j3,Jp2,(jp3)
     300              :             END DO
     301              :          END IF
     302              :       END DO
     303              : 
     304              :       !Interprocessor data transposition
     305              :       !input: I1,J2,j3,Jp2,(jp3)
     306        17323 :       IF (nproc > 1) THEN
     307              :          !communication scheduling
     308         4806 :          CALL mpi_group%alltoall(zmpi1, zmpi2, 2*n1*(md2/nproc)*(nd3/nproc))
     309              :       END IF
     310              :       !output: I1,J2,j3,jp3,(Jp2)
     311              :       !transform along z axis
     312              :       !input: I1,J2,i3,(Jp2)
     313        17323 :       lot = ncache/(4*n3)
     314       320326 :       DO j2 = 1, md2/nproc
     315              :          !this condition ensures that we manage only the interesting part for the FFT
     316       320326 :          IF (iproc*(md2/nproc) + j2 <= n2) THEN
     317       613678 :             DO i1 = 1, n1, lot
     318       311348 :                ma = i1
     319       311348 :                mb = MIN(i1 + (lot - 1), n1)
     320       311348 :                nfft = mb - ma + 1
     321              : 
     322              :                !reverse ordering
     323              :                !input: I1,J2,i3,(Jp2)
     324       311348 :                CALL unscramble_P(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zmpi2, zw(1, 1, 1))
     325              :                !output: I1,i3,J2,(Jp2)
     326              : 
     327              :                !performing FFT
     328              :                !input: I1,i3,J2,(Jp2)
     329       311348 :                inzee = 1
     330       971439 :                DO i = 1, ic3
     331              :                   CALL fftstp(lot, nfft, n3, lot, n3, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
     332       660091 :                               ftrig3, after3(i), now3(i), before3(i), -1)
     333       971439 :                   inzee = 3 - inzee
     334              :                END DO
     335              :                !output: I1,I3,J2,(Jp2)
     336              : 
     337              :                !rebuild the output array
     338       613678 :                CALL P_unfill_downcorn(md1, md3, lot, nfft, n3, zw(1, 1, inzee), zf(i1, 1, j2), scal)
     339              : 
     340              :             END DO
     341              :          END IF
     342              :       END DO
     343              : 
     344              :       !De-allocations
     345        17323 :       DEALLOCATE (btrig1)
     346        17323 :       DEALLOCATE (ftrig1)
     347        17323 :       DEALLOCATE (after1)
     348        17323 :       DEALLOCATE (now1)
     349        17323 :       DEALLOCATE (before1)
     350        17323 :       DEALLOCATE (btrig2)
     351        17323 :       DEALLOCATE (ftrig2)
     352        17323 :       DEALLOCATE (after2)
     353        17323 :       DEALLOCATE (now2)
     354        17323 :       DEALLOCATE (before2)
     355        17323 :       DEALLOCATE (btrig3)
     356        17323 :       DEALLOCATE (ftrig3)
     357        17323 :       DEALLOCATE (after3)
     358        17323 :       DEALLOCATE (now3)
     359        17323 :       DEALLOCATE (before3)
     360        17323 :       DEALLOCATE (zmpi2)
     361        17323 :       DEALLOCATE (zw)
     362        17323 :       DEALLOCATE (zt)
     363        17323 :       IF (nproc > 1) DEALLOCATE (zmpi1)
     364              :       !  call timing(iproc,'PSolv_comput  ','OF')
     365        17323 :    END SUBROUTINE P_PoissonSolver
     366              : 
     367              : ! **************************************************************************************************
     368              : !> \brief ...
     369              : !> \param j3 ...
     370              : !> \param nfft ...
     371              : !> \param Jp2stb ...
     372              : !> \param J2stb ...
     373              : !> \param lot ...
     374              : !> \param n1 ...
     375              : !> \param md2 ...
     376              : !> \param nd3 ...
     377              : !> \param nproc ...
     378              : !> \param zmpi1 ...
     379              : !> \param zw ...
     380              : ! **************************************************************************************************
     381       169146 :    SUBROUTINE P_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw)
     382              :       INTEGER, INTENT(in)                                :: j3, nfft
     383              :       INTEGER, INTENT(inout)                             :: Jp2stb, J2stb
     384              :       INTEGER, INTENT(in)                                :: lot, n1, md2, nd3, nproc
     385              :       REAL(KIND=dp), &
     386              :          DIMENSION(2, n1, md2/nproc, nd3/nproc, nproc), &
     387              :          INTENT(in)                                      :: zmpi1
     388              :       REAL(KIND=dp), DIMENSION(2, lot, n1), &
     389              :          INTENT(inout)                                   :: zw
     390              : 
     391              :       INTEGER                                            :: I1, J2, Jp2, mfft
     392              : 
     393       169146 :       mfft = 0
     394       354650 :       DO Jp2 = Jp2stb, nproc
     395      3840810 :          DO J2 = J2stb, md2/nproc
     396      3655306 :             mfft = mfft + 1
     397      3655306 :             IF (mfft > nfft) THEN
     398        13478 :                Jp2stb = Jp2
     399        13478 :                J2stb = J2
     400        13478 :                RETURN
     401              :             END IF
     402     95925808 :             DO I1 = 1, n1
     403     92098476 :                zw(1, mfft, I1) = zmpi1(1, I1, J2, j3, Jp2)
     404     95740304 :                zw(2, mfft, I1) = zmpi1(2, I1, J2, j3, Jp2)
     405              :             END DO
     406              :          END DO
     407       341172 :          J2stb = 1
     408              :       END DO
     409              :    END SUBROUTINE P_mpiswitch_upcorn
     410              : 
     411              : ! **************************************************************************************************
     412              : !> \brief ...
     413              : !> \param nfft ...
     414              : !> \param n2 ...
     415              : !> \param lot ...
     416              : !> \param n1 ...
     417              : !> \param lzt ...
     418              : !> \param zt ...
     419              : !> \param zw ...
     420              : ! **************************************************************************************************
     421       169146 :    SUBROUTINE P_switch_upcorn(nfft, n2, lot, n1, lzt, zt, zw)
     422              :       INTEGER, INTENT(in)                                :: nfft, n2, lot, n1, lzt
     423              :       REAL(KIND=dp), DIMENSION(2, lzt, n1), INTENT(in)   :: zt
     424              :       REAL(KIND=dp), DIMENSION(2, lot, n2), &
     425              :          INTENT(inout)                                   :: zw
     426              : 
     427              :       INTEGER                                            :: i, j
     428              : 
     429      3810974 :       DO j = 1, nfft
     430     95909450 :          DO i = 1, n2
     431     92098476 :             zw(1, j, i) = zt(1, i, j)
     432     95740304 :             zw(2, j, i) = zt(2, i, j)
     433              :          END DO
     434              :       END DO
     435              : 
     436       169146 :    END SUBROUTINE P_switch_upcorn
     437              : 
     438              : ! **************************************************************************************************
     439              : !> \brief ...
     440              : !> \param nfft ...
     441              : !> \param n2 ...
     442              : !> \param lot ...
     443              : !> \param n1 ...
     444              : !> \param lzt ...
     445              : !> \param zw ...
     446              : !> \param zt ...
     447              : ! **************************************************************************************************
     448       169146 :    SUBROUTINE P_unswitch_downcorn(nfft, n2, lot, n1, lzt, zw, zt)
     449              :       INTEGER, INTENT(in)                                :: nfft, n2, lot, n1, lzt
     450              :       REAL(KIND=dp), DIMENSION(2, lot, n2), INTENT(in)   :: zw
     451              :       REAL(KIND=dp), DIMENSION(2, lzt, n1), &
     452              :          INTENT(inout)                                   :: zt
     453              : 
     454              :       INTEGER                                            :: i, j
     455              : 
     456      3810974 :       DO j = 1, nfft
     457     95909450 :          DO i = 1, n2
     458     92098476 :             zt(1, i, j) = zw(1, j, i)
     459     95740304 :             zt(2, i, j) = zw(2, j, i)
     460              :          END DO
     461              :       END DO
     462              : 
     463       169146 :    END SUBROUTINE P_unswitch_downcorn
     464              : 
     465              : ! **************************************************************************************************
     466              : !> \brief ...
     467              : !> \param j3 ...
     468              : !> \param nfft ...
     469              : !> \param Jp2stf ...
     470              : !> \param J2stf ...
     471              : !> \param lot ...
     472              : !> \param n1 ...
     473              : !> \param md2 ...
     474              : !> \param nd3 ...
     475              : !> \param nproc ...
     476              : !> \param zw ...
     477              : !> \param zmpi1 ...
     478              : ! **************************************************************************************************
     479       169146 :    SUBROUTINE P_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw, zmpi1)
     480              :       INTEGER, INTENT(in)                                :: j3, nfft
     481              :       INTEGER, INTENT(inout)                             :: Jp2stf, J2stf
     482              :       INTEGER, INTENT(in)                                :: lot, n1, md2, nd3, nproc
     483              :       REAL(KIND=dp), DIMENSION(2, lot, n1), INTENT(in)   :: zw
     484              :       REAL(KIND=dp), &
     485              :          DIMENSION(2, n1, md2/nproc, nd3/nproc, nproc), &
     486              :          INTENT(inout)                                   :: zmpi1
     487              : 
     488              :       INTEGER                                            :: I1, J2, Jp2, mfft
     489              : 
     490       169146 :       mfft = 0
     491       354650 :       DO Jp2 = Jp2stf, nproc
     492      3840810 :          DO J2 = J2stf, md2/nproc
     493      3655306 :             mfft = mfft + 1
     494      3655306 :             IF (mfft > nfft) THEN
     495        13478 :                Jp2stf = Jp2
     496        13478 :                J2stf = J2
     497        13478 :                RETURN
     498              :             END IF
     499     95925808 :             DO I1 = 1, n1
     500     92098476 :                zmpi1(1, I1, J2, j3, Jp2) = zw(1, mfft, I1)
     501     95740304 :                zmpi1(2, I1, J2, j3, Jp2) = zw(2, mfft, I1)
     502              :             END DO
     503              :          END DO
     504       341172 :          J2stf = 1
     505              :       END DO
     506              :    END SUBROUTINE P_unmpiswitch_downcorn
     507              : 
     508              : ! **************************************************************************************************
     509              : !> \brief (Based on suitable modifications of S.Goedecker routines)
     510              : !>      Restore data into output array
     511              : !> \param md1 Dimensions of the undistributed part of the real grid
     512              : !> \param md3 Dimensions of the undistributed part of the real grid
     513              : !> \param lot ...
     514              : !> \param nfft number of planes
     515              : !> \param n3 (twice the) dimension of the last FFTtransform
     516              : !> \param zw FFT work array
     517              : !> \param zf Original distributed density as well as
     518              : !>                   Distributed solution of the poisson equation (inout)
     519              : !> \param scal Needed to achieve unitarity and correct dimensions
     520              : !> \date February 2006
     521              : !> \author S. Goedecker, L. Genovese
     522              : !> \note Assuming that high frequencies are in the corners
     523              : !>      and that n3 is multiple of 4
     524              : !>
     525              : !>  RESTRICTIONS on USAGE
     526              : !>      Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
     527              : !>      Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
     528              : !>      Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
     529              : !>      This file is distributed under the terms of the
     530              : !>      GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
     531              : ! **************************************************************************************************
     532       311348 :    SUBROUTINE P_unfill_downcorn(md1, md3, lot, nfft, n3, zw, zf, scal)
     533              :       INTEGER, INTENT(in)                                :: md1, md3, lot, nfft, n3
     534              :       REAL(KIND=dp), DIMENSION(2, lot, n3), INTENT(in)   :: zw
     535              :       REAL(KIND=dp), DIMENSION(md1, md3), INTENT(inout)  :: zf
     536              :       REAL(KIND=dp), INTENT(in)                          :: scal
     537              : 
     538              :       INTEGER                                            :: i1, i3
     539              :       REAL(KIND=dp)                                      :: pot1
     540              : 
     541      7534542 :       DO i3 = 1, n3
     542    179338112 :          DO i1 = 1, nfft
     543    171803570 :             pot1 = scal*zw(1, i1, i3)
     544    179026764 :             zf(i1, i3) = pot1
     545              :          END DO
     546              :       END DO
     547              : 
     548       311348 :    END SUBROUTINE P_unfill_downcorn
     549              : 
     550              : ! **************************************************************************************************
     551              : !> \brief ...
     552              : !> \param md1 ...
     553              : !> \param md3 ...
     554              : !> \param lot ...
     555              : !> \param nfft ...
     556              : !> \param n3 ...
     557              : !> \param zf ...
     558              : !> \param zw ...
     559              : ! **************************************************************************************************
     560       311348 :    SUBROUTINE P_fill_upcorn(md1, md3, lot, nfft, n3, zf, zw)
     561              :       INTEGER, INTENT(in)                                :: md1, md3, lot, nfft, n3
     562              :       REAL(KIND=dp), DIMENSION(md1, md3), INTENT(in)     :: zf
     563              :       REAL(KIND=dp), DIMENSION(2, lot, n3), &
     564              :          INTENT(inout)                                   :: zw
     565              : 
     566              :       INTEGER                                            :: i1, i3
     567              : 
     568      7534542 :       DO i3 = 1, n3
     569    179338112 :          DO i1 = 1, nfft
     570    171803570 :             zw(1, i1, i3) = zf(i1, i3)
     571    179026764 :             zw(2, i1, i3) = 0._dp
     572              :          END DO
     573              :       END DO
     574              : 
     575       311348 :    END SUBROUTINE P_fill_upcorn
     576              : 
     577              : ! **************************************************************************************************
     578              : !> \brief (Based on suitable modifications of S.Goedecker routines)
     579              : !>      Assign the correct planes to the work array zmpi2
     580              : !>      in order to prepare for interprocessor data transposition.
     581              : !> \param i1 Starting points of the plane and number of remaining lines
     582              : !> \param j2 Starting points of the plane and number of remaining lines
     583              : !> \param lot Starting points of the plane and number of remaining lines
     584              : !> \param nfft Starting points of the plane and number of remaining lines
     585              : !> \param n1 logical dimension of the FFT transform, reference for work arrays
     586              : !> \param n3 logical dimension of the FFT transform, reference for work arrays
     587              : !> \param md2 Dimensions of real grid
     588              : !> \param nproc ...
     589              : !> \param nd3 Dimensions of the kernel
     590              : !> \param zw Work array (input)
     591              : !> \param zmpi2 Work array for multiprocessor manipulation (output)
     592              : !> \date February 2006
     593              : !> \author S. Goedecker, L. Genovese
     594              : !> \note
     595              : !>  RESTRICTIONS on USAGE
     596              : !>      Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
     597              : !>      Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
     598              : !>      Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
     599              : !>      This file is distributed under the terms of the
     600              : !>      GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
     601              : ! **************************************************************************************************
     602       311348 :    SUBROUTINE scramble_P(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zw, zmpi2)
     603              :       INTEGER, INTENT(in)                                :: i1, j2, lot, nfft, n1, n3, md2, nproc, &
     604              :                                                             nd3
     605              :       REAL(KIND=dp), DIMENSION(2, lot, n3), INTENT(in)   :: zw
     606              :       REAL(KIND=dp), DIMENSION(2, n1, md2/nproc, nd3), &
     607              :          INTENT(inout)                                   :: zmpi2
     608              : 
     609              :       INTEGER                                            :: i, i3
     610              : 
     611      4205680 :       DO i3 = 1, n3/2 + 1
     612     96304156 :          DO i = 0, nfft - 1
     613     92098476 :             zmpi2(1, i1 + i, j2, i3) = zw(1, i + 1, i3)
     614     95992808 :             zmpi2(2, i1 + i, j2, i3) = zw(2, i + 1, i3)
     615              :          END DO
     616              :       END DO
     617              : 
     618       311348 :    END SUBROUTINE scramble_P
     619              : 
     620              : ! **************************************************************************************************
     621              : !> \brief (Based on suitable modifications of S.Goedecker routines)
     622              : !>      Insert the correct planes of the work array zmpi2
     623              : !>      in order to prepare for backward FFT transform
     624              : !> \param i1 Starting points of the plane and number of remaining lines
     625              : !> \param j2 Starting points of the plane and number of remaining lines
     626              : !> \param lot Starting points of the plane and number of remaining lines
     627              : !> \param nfft Starting points of the plane and number of remaining lines
     628              : !> \param n1 logical dimension of the FFT transform, reference for work arrays
     629              : !> \param n3 logical dimension of the FFT transform, reference for work arrays
     630              : !> \param md2 Dimensions of real grid
     631              : !> \param nproc ...
     632              : !> \param nd3 Dimensions of the kernel
     633              : !> \param zmpi2 Work array for multiprocessor manipulation (output)
     634              : !> \param zw Work array (input)
     635              : !> \date February 2006
     636              : !> \author S. Goedecker, L. Genovese
     637              : !> \note
     638              : !>  RESTRICTIONS on USAGE
     639              : !>      Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
     640              : !>      Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
     641              : !>      Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
     642              : !>      This file is distributed under the terms of the
     643              : !>      GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
     644              : ! **************************************************************************************************
     645       311348 :    SUBROUTINE unscramble_P(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zmpi2, zw)
     646              :       INTEGER, INTENT(in)                                :: i1, j2, lot, nfft, n1, n3, md2, nproc, &
     647              :                                                             nd3
     648              :       REAL(KIND=dp), DIMENSION(2, n1, md2/nproc, nd3), &
     649              :          INTENT(in)                                      :: zmpi2
     650              :       REAL(KIND=dp), DIMENSION(2, lot, n3), &
     651              :          INTENT(inout)                                   :: zw
     652              : 
     653              :       INTEGER                                            :: i, i3, j3
     654              : 
     655       311348 :       i3 = 1
     656      7047570 :       DO i = 0, nfft - 1
     657      6736222 :          zw(1, i + 1, i3) = zmpi2(1, i1 + i, j2, i3)
     658      7047570 :          zw(2, i + 1, i3) = zmpi2(2, i1 + i, j2, i3)
     659              :       END DO
     660              : 
     661      3894332 :       DO i3 = 2, n3/2 + 1
     662      3582984 :          j3 = n3 + 2 - i3
     663     89256586 :          DO i = 0, nfft - 1
     664     85362254 :             zw(1, i + 1, j3) = zmpi2(1, i1 + i, j2, i3)
     665     85362254 :             zw(2, i + 1, j3) = -zmpi2(2, i1 + i, j2, i3)
     666     85362254 :             zw(1, i + 1, i3) = zmpi2(1, i1 + i, j2, i3)
     667     88945238 :             zw(2, i + 1, i3) = zmpi2(2, i1 + i, j2, i3)
     668              :          END DO
     669              :       END DO
     670              : 
     671       311348 :    END SUBROUTINE unscramble_P
     672              : 
     673              : ! **************************************************************************************************
     674              : !> \brief (Based on suitable modifications of S.Goedecker routines)
     675              : !>      Multiply with the kernel taking into account its symmetry
     676              : !>      Conceived to be used into convolution loops
     677              : !> \param n1 ...
     678              : !> \param n2 ...
     679              : !> \param n3 ...
     680              : !> \param lot ...
     681              : !> \param nfft ...
     682              : !> \param jS ...
     683              : !> \param i3 ...
     684              : !> \param zw Work array (input/output)
     685              : !>      n1,n2:      logical dimension of the FFT transform, reference for zw
     686              : !>      nd1,nd2:    Dimensions of POT
     687              : !>      jS,j3,nfft: starting point of the plane and number of remaining lines
     688              : !>
     689              : !>  RESTRICTIONS on USAGE
     690              : !>      Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
     691              : !>      Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
     692              : !>      Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
     693              : !>      This file is distributed under the terms of the
     694              : !>       GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
     695              : !> \param hx ...
     696              : !> \param hy ...
     697              : !> \param hz ...
     698              : !> \date February 2006
     699              : !> \author S. Goedecker, L. Genovese
     700              : ! **************************************************************************************************
     701       169146 :    SUBROUTINE P_multkernel(n1, n2, n3, lot, nfft, jS, i3, zw, hx, hy, hz)
     702              :       INTEGER, INTENT(in)                                :: n1, n2, n3, lot, nfft, jS, i3
     703              :       REAL(KIND=dp), DIMENSION(2, lot, n2), &
     704              :          INTENT(inout)                                   :: zw
     705              :       REAL(KIND=dp), INTENT(in)                          :: hx, hy, hz
     706              : 
     707              :       INTEGER                                            :: i1, i2, j1, j2, j3
     708              :       REAL(KIND=dp)                                      :: fourpi2, ker, mu3, p1, p2, pi
     709              : 
     710       169146 :       pi = 4._dp*ATAN(1._dp)
     711       169146 :       fourpi2 = 4._dp*pi**2
     712       169146 :       j3 = i3 !n3/2+1-abs(n3/2+2-i3)
     713       169146 :       mu3 = REAL(j3 - 1, KIND=dp)/REAL(n3, KIND=dp)
     714       169146 :       mu3 = (mu3/hy)**2 !beware of the exchanged dimension
     715              :       !Body
     716              :       !generic case
     717      4063478 :       DO i2 = 1, n2
     718     96161954 :          DO i1 = 1, nfft
     719     92098476 :             j1 = i1 + jS - 1
     720     92098476 :             j1 = j1 - (j1/(n1/2 + 2))*n1 !n1/2+1-abs(n1/2+2-jS-i1)
     721     92098476 :             j2 = i2 - (i2/(n2/2 + 2))*n2 !n2/2+1-abs(n2/2+1-i2)
     722     92098476 :             p1 = REAL(j1 - 1, KIND=dp)/REAL(n1, KIND=dp)
     723     92098476 :             p2 = REAL(j2 - 1, KIND=dp)/REAL(n2, KIND=dp)
     724     92098476 :             ker = -fourpi2*((p1/hx)**2 + (p2/hz)**2 + mu3) !beware of the exchanged dimension
     725     92098476 :             IF (ker /= 0._dp) ker = 1._dp/ker
     726     92098476 :             zw(1, i1, i2) = zw(1, i1, i2)*ker
     727     95992808 :             zw(2, i1, i2) = zw(2, i1, i2)*ker
     728              :          END DO
     729              :       END DO
     730              : 
     731       169146 :    END SUBROUTINE P_multkernel
     732              : 
     733              : ! **************************************************************************************************
     734              : !> \brief (Based on suitable modifications of S.Goedecker routines)
     735              : !>      Multiply with the kernel taking into account its symmetry
     736              : !>      Conceived to be used into convolution loops
     737              : !> \param nd1 ...
     738              : !> \param nd2 ...
     739              : !> \param n1 ...
     740              : !> \param n2 ...
     741              : !> \param lot ...
     742              : !> \param nfft ...
     743              : !> \param jS ...
     744              : !> \param pot Kernel, symmetric and real, half the length
     745              : !> \param zw Work array (input/output)
     746              : !>      n1,n2:    logical dimension of the FFT transform, reference for zw
     747              : !>      nd1,nd2:  Dimensions of POT
     748              : !>      jS, nfft: starting point of the plane and number of remaining lines
     749              : !>
     750              : !>  RESTRICTIONS on USAGE
     751              : !>      Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
     752              : !>      Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
     753              : !>      Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
     754              : !>      This file is distributed under the terms of the
     755              : !>       GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
     756              : !> \date February 2006
     757              : !> \author S. Goedecker, L. Genovese
     758              : ! **************************************************************************************************
     759      1838763 :    SUBROUTINE multkernel(nd1, nd2, n1, n2, lot, nfft, jS, pot, zw)
     760              :       INTEGER, INTENT(in)                                :: nd1, nd2, n1, n2, lot, nfft, jS
     761              :       REAL(KIND=dp), DIMENSION(nd1, nd2), INTENT(in)     :: pot
     762              :       REAL(KIND=dp), DIMENSION(2, lot, n2), &
     763              :          INTENT(inout)                                   :: zw
     764              : 
     765              :       INTEGER                                            :: i2, j, j1, j2
     766              : 
     767     36647123 :       DO j = 1, nfft
     768     34808360 :          j1 = j + jS - 1
     769     34808360 :          j1 = j1 + (j1/(n1/2 + 2))*(n1 + 2 - 2*j1)
     770     34808360 :          zw(1, j, 1) = zw(1, j, 1)*pot(j1, 1)
     771     36647123 :          zw(2, j, 1) = zw(2, j, 1)*pot(j1, 1)
     772              :       END DO
     773              : 
     774              :       !generic case
     775     93668907 :       DO i2 = 2, n2/2
     776   1656450975 :          DO j = 1, nfft
     777   1562782068 :             j1 = j + jS - 1
     778   1562782068 :             j1 = j1 + (j1/(n1/2 + 2))*(n1 + 2 - 2*j1)
     779   1562782068 :             j2 = n2 + 2 - i2
     780   1562782068 :             zw(1, j, i2) = zw(1, j, i2)*pot(j1, i2)
     781   1562782068 :             zw(2, j, i2) = zw(2, j, i2)*pot(j1, i2)
     782   1562782068 :             zw(1, j, j2) = zw(1, j, j2)*pot(j1, i2)
     783   1654612212 :             zw(2, j, j2) = zw(2, j, j2)*pot(j1, i2)
     784              :          END DO
     785              :       END DO
     786              : 
     787              :       !case i2=n2/2+1
     788     36647123 :       DO j = 1, nfft
     789     34808360 :          j1 = j + jS - 1
     790     34808360 :          j1 = j1 + (j1/(n1/2 + 2))*(n1 + 2 - 2*j1)
     791     34808360 :          j2 = n2/2 + 1
     792     34808360 :          zw(1, j, j2) = zw(1, j, j2)*pot(j1, j2)
     793     36647123 :          zw(2, j, j2) = zw(2, j, j2)*pot(j1, j2)
     794              :       END DO
     795              : 
     796      1838763 :    END SUBROUTINE multkernel
     797              : 
     798              : ! **************************************************************************************************
     799              : !> \brief !HERE POT MUST BE THE KERNEL (BEWARE THE HALF DIMENSION)
     800              : !> ****h* BigDFT/S_PoissonSolver
     801              : !>      (Based on suitable modifications of S.Goedecker routines)
     802              : !>      Applies the local FFT space Kernel to the density in Real space.
     803              : !>      Does NOT calculate the LDA exchange-correlation terms
     804              : !> \param n1 logical dimension of the transform.
     805              : !> \param n2 logical dimension of the transform.
     806              : !> \param n3 logical dimension of the transform.
     807              : !> \param nd1 Dimension of POT
     808              : !> \param nd2 Dimension of POT
     809              : !> \param nd3 Dimension of POT
     810              : !> \param md1 Dimension of ZF
     811              : !> \param md2 Dimension of ZF
     812              : !> \param md3 Dimension of ZF
     813              : !> \param nproc number of processors used as returned by MPI_COMM_SIZE
     814              : !> \param iproc [0:nproc-1] number of processor as returned by MPI_COMM_RANK
     815              : !> \param pot Kernel, only the distributed part (REAL)
     816              : !>                   POT(i1,i2,i3)
     817              : !>                   i1=1,nd1 , i2=1,nd2 , i3=1,nd3/nproc
     818              : !> \param zf Density (input/output)
     819              : !>                   ZF(i1,i3,i2)
     820              : !>                   i1=1,md1 , i2=1,md2/nproc , i3=1,md3
     821              : !> \param scal factor of renormalization of the FFT in order to acheve unitarity
     822              : !>                   and the correct dimension
     823              : !> \param mpi_group ...
     824              : !> \date October 2006
     825              : !> \author S. Goedecker, L. Genovese
     826              : !> \note   As transform lengths most products of the prime factors 2,3,5 are allowed.
     827              : !>                   The detailed table with allowed transform lengths can
     828              : !>                   be found in subroutine CTRIG
     829              : !> \note
     830              : !>  RESTRICTIONS on USAGE
     831              : !>      Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
     832              : !>      Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
     833              : !>      Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
     834              : !>      This file is distributed under the terms of the
     835              : !>       GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
     836              : ! **************************************************************************************************
     837           54 :    SUBROUTINE S_PoissonSolver(n1, n2, n3, nd1, nd2, nd3, md1, md2, md3, nproc, iproc, pot, zf, &
     838              :                               scal, mpi_group)
     839              :       INTEGER, INTENT(in)                                :: n1, n2, n3, nd1, nd2, nd3, md1, md2, &
     840              :                                                             md3, nproc, iproc
     841              :       REAL(KIND=dp), DIMENSION(nd1, nd2, nd3/nproc), &
     842              :          INTENT(in)                                      :: pot
     843              :       REAL(KIND=dp), DIMENSION(md1, md3, md2/nproc), &
     844              :          INTENT(inout)                                   :: zf
     845              :       REAL(KIND=dp), INTENT(in)                          :: scal
     846              : 
     847              :       CLASS(mp_comm_type), INTENT(in)                     :: mpi_group
     848              : 
     849              :       CHARACTER(len=*), PARAMETER                        :: routineN = 'S_PoissonSolver'
     850              :       INTEGER, PARAMETER                                 :: ncache_optimal = 8*1024
     851              : 
     852              :       INTEGER                                            :: handle, i, i1, i3, ic1, ic2, ic3, inzee, &
     853              :                                                             j, j2, J2stb, J2stf, j3, Jp2stb, &
     854              :                                                             Jp2stf, lot, lzt, ma, mb, ncache, nfft
     855           54 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: after1, after2, after3, before1, &
     856           54 :                                                             before2, before3, now1, now2, now3
     857              :       REAL(kind=dp)                                      :: twopion
     858           54 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: btrig1, btrig2, btrig3, cosinarr, &
     859           54 :                                                             ftrig1, ftrig2, ftrig3
     860           54 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: zt, zw
     861              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :)  :: zmpi2
     862              :       REAL(KIND=dp), ALLOCATABLE, &
     863           54 :          DIMENSION(:, :, :, :, :)                        :: zmpi1
     864              : 
     865           54 :       CALL timeset(routineN, handle)
     866              :       ! check input
     867           54 :       IF (MOD(n3, 2) /= 0) CPABORT("Parallel convolution:ERROR:n3")
     868           54 :       IF (nd1 < n1/2 + 1) CPABORT("Parallel convolution:ERROR:nd1")
     869           54 :       IF (nd2 < n2/2 + 1) CPABORT("Parallel convolution:ERROR:nd2")
     870           54 :       IF (nd3 < n3/2 + 1) CPABORT("Parallel convolution:ERROR:nd3")
     871           54 :       IF (md1 < n1) CPABORT("Parallel convolution:ERROR:md1")
     872           54 :       IF (md2 < n2) CPABORT("Parallel convolution:ERROR:md2")
     873           54 :       IF (md3 < n3/2) CPABORT("Parallel convolution:ERROR:md3")
     874           54 :       IF (MOD(nd3, nproc) /= 0) CPABORT("Parallel convolution:ERROR:nd3")
     875           54 :       IF (MOD(md2, nproc) /= 0) CPABORT("Parallel convolution:ERROR:md2")
     876              : 
     877              :       !defining work arrays dimensions
     878           54 :       ncache = ncache_optimal
     879           54 :       IF (ncache <= MAX(n1, n2, n3/2)*4) ncache = MAX(n1, n2, n3/2)*4
     880              : 
     881              :       !  if (timing_flag == 1 .and. iproc ==0) print *,'parallel ncache=',ncache
     882              : 
     883           54 :       lzt = n2
     884           54 :       IF (MOD(n2, 2) == 0) lzt = lzt + 1
     885           54 :       IF (MOD(n2, 4) == 0) lzt = lzt + 1 !maybe this is useless
     886              : 
     887              :       !Allocations
     888           54 :       ALLOCATE (btrig1(2, ctrig_length))
     889           54 :       ALLOCATE (ftrig1(2, ctrig_length))
     890           54 :       ALLOCATE (after1(7))
     891           54 :       ALLOCATE (now1(7))
     892           54 :       ALLOCATE (before1(7))
     893           54 :       ALLOCATE (btrig2(2, ctrig_length))
     894           54 :       ALLOCATE (ftrig2(2, ctrig_length))
     895           54 :       ALLOCATE (after2(7))
     896           54 :       ALLOCATE (now2(7))
     897           54 :       ALLOCATE (before2(7))
     898           54 :       ALLOCATE (btrig3(2, ctrig_length))
     899           54 :       ALLOCATE (ftrig3(2, ctrig_length))
     900           54 :       ALLOCATE (after3(7))
     901           54 :       ALLOCATE (now3(7))
     902           54 :       ALLOCATE (before3(7))
     903          216 :       ALLOCATE (zw(2, ncache/4, 2))
     904          216 :       ALLOCATE (zt(2, lzt, n1))
     905          270 :       ALLOCATE (zmpi2(2, n1, md2/nproc, nd3))
     906          216 :       ALLOCATE (cosinarr(2, n3/2))
     907           54 :       IF (nproc > 1) THEN
     908          324 :          ALLOCATE (zmpi1(2, n1, md2/nproc, nd3/nproc, nproc))
     909           54 :          zmpi1 = 0.0_dp
     910              :       END IF
     911           54 :       zmpi2 = 0.0_dp
     912           54 :       zw = 0.0_dp
     913           54 :       zt = 0.0_dp
     914              : 
     915              :       !calculating the FFT work arrays (beware on the HalFFT in n3 dimension)
     916           54 :       CALL ctrig(n3/2, btrig3, after3, before3, now3, 1, ic3)
     917           54 :       CALL ctrig(n1, btrig1, after1, before1, now1, 1, ic1)
     918           54 :       CALL ctrig(n2, btrig2, after2, before2, now2, 1, ic2)
     919         2970 :       DO j = 1, n1
     920         2916 :          ftrig1(1, j) = btrig1(1, j)
     921         2970 :          ftrig1(2, j) = -btrig1(2, j)
     922              :       END DO
     923         2970 :       DO j = 1, n2
     924         2916 :          ftrig2(1, j) = btrig2(1, j)
     925         2970 :          ftrig2(2, j) = -btrig2(2, j)
     926              :       END DO
     927         2970 :       DO j = 1, n3/2
     928         2916 :          ftrig3(1, j) = btrig3(1, j)
     929         2970 :          ftrig3(2, j) = -btrig3(2, j)
     930              :       END DO
     931              : 
     932              :       !Calculating array of phases for HalFFT decoding
     933           54 :       twopion = 8._dp*ATAN(1._dp)/REAL(n3, KIND=dp)
     934         2970 :       DO i3 = 1, n3/2
     935         2916 :          cosinarr(1, i3) = COS(twopion*(i3 - 1))
     936         2970 :          cosinarr(2, i3) = -SIN(twopion*(i3 - 1))
     937              :       END DO
     938              : 
     939              :       !initializing integral
     940              :       !ehartree=0._dp
     941              : 
     942              :       ! transform along z axis
     943           54 :       lot = ncache/(2*n3)
     944           54 :       IF (lot < 1) THEN
     945              :          CALL cp_abort(__LOCATION__, &
     946              :                        'convolxc_off:ncache has to be enlarged to be able to hold at '// &
     947              :                        'least one 1-d FFT of this size even though this will '// &
     948            0 :                        'reduce the performance for shorter transform lengths')
     949              :       END IF
     950              : 
     951         1512 :       DO j2 = 1, md2/nproc
     952              :          !this condition ensures that we manage only the interesting part for the FFT
     953         1512 :          IF (iproc*(md2/nproc) + j2 <= n2) THEN
     954         4374 :             DO i1 = 1, n1, lot
     955         2916 :                ma = i1
     956         2916 :                mb = MIN(i1 + (lot - 1), n1)
     957         2916 :                nfft = mb - ma + 1
     958              : 
     959              :                !inserting real data into complex array of half length
     960         2916 :                CALL halfill_upcorn(md1, md3, lot, nfft, n3, zf(i1, 1, j2), zw(1, 1, 1))
     961              : 
     962              :                !performing FFT
     963              :                !input: I1,I3,J2,(Jp2)
     964         2916 :                inzee = 1
     965        11664 :                DO i = 1, ic3
     966              :                   CALL fftstp(lot, nfft, n3/2, lot, n3/2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
     967         8748 :                               btrig3, after3(i), now3(i), before3(i), 1)
     968        11664 :                   inzee = 3 - inzee
     969              :                END DO
     970              :                !output: I1,i3,J2,(Jp2)
     971              :                !unpacking FFT in order to restore correct result,
     972              :                !while exchanging components
     973              :                !input: I1,i3,J2,(Jp2)
     974         4374 :                CALL scramble_unpack(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zw(1, 1, inzee), zmpi2, cosinarr)
     975              :                !output: I1,J2,i3,(Jp2)
     976              :             END DO
     977              :          END IF
     978              :       END DO
     979              :       !Interprocessor data transposition
     980              :       !input: I1,J2,j3,jp3,(Jp2)
     981           54 :       IF (nproc > 1) THEN
     982           54 :          CALL mpi_group%alltoall(zmpi2, zmpi1, 2*n1*(md2/nproc)*(nd3/nproc))
     983              :       END IF
     984              :       !output: I1,J2,j3,Jp2,(jp3)
     985              : 
     986              :       !now each process perform complete convolution of its planes
     987         1566 :       DO j3 = 1, nd3/nproc
     988              :          !this condition ensures that we manage only the interesting part for the FFT
     989         1566 :          IF (iproc*(nd3/nproc) + j3 <= n3/2 + 1) THEN
     990         1485 :             Jp2stb = 1
     991         1485 :             J2stb = 1
     992         1485 :             Jp2stf = 1
     993         1485 :             J2stf = 1
     994              : 
     995              :             ! transform along x axis
     996         1485 :             lot = ncache/(4*n1)
     997         1485 :             IF (lot < 1) THEN
     998              :                CALL cp_abort(__LOCATION__, &
     999              :                              'convolxc_off:ncache has to be enlarged to be able to hold at '// &
    1000              :                              'least one 1-d FFT of this size even though this will '// &
    1001            0 :                              'reduce the performance for shorter transform lengths')
    1002              :             END IF
    1003              : 
    1004         4455 :             DO j = 1, n2, lot
    1005         2970 :                ma = j
    1006         2970 :                mb = MIN(j + (lot - 1), n2)
    1007         2970 :                nfft = mb - ma + 1
    1008              : 
    1009              :                !reverse index ordering, leaving the planes to be transformed at the end
    1010              :                !input: I1,J2,j3,Jp2,(jp3)
    1011         2970 :                IF (nproc == 1) THEN
    1012            0 :                   CALL S_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi2, zw(1, 1, 1))
    1013              :                ELSE
    1014         2970 :                   CALL S_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw(1, 1, 1))
    1015              :                END IF
    1016              :                !output: J2,Jp2,I1,j3,(jp3)
    1017              : 
    1018              :                !performing FFT
    1019              :                !input: I2,I1,j3,(jp3)
    1020         2970 :                inzee = 1
    1021         8910 :                DO i = 1, ic1 - 1
    1022              :                   CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
    1023         5940 :                               btrig1, after1(i), now1(i), before1(i), 1)
    1024         8910 :                   inzee = 3 - inzee
    1025              :                END DO
    1026              : 
    1027              :                !storing the last step into zt array
    1028         2970 :                i = ic1
    1029              :                CALL fftstp(lot, nfft, n1, lzt, n1, zw(1, 1, inzee), zt(1, j, 1), &
    1030         4455 :                            btrig1, after1(i), now1(i), before1(i), 1)
    1031              :                !output: I2,i1,j3,(jp3)
    1032              :             END DO
    1033              : 
    1034              :             !transform along y axis
    1035         1485 :             lot = ncache/(4*n2)
    1036         1485 :             IF (lot < 1) THEN
    1037              :                CALL cp_abort(__LOCATION__, &
    1038              :                              'convolxc_off:ncache has to be enlarged to be able to hold at '// &
    1039              :                              'least one 1-d FFT of this size even though this will '// &
    1040            0 :                              'reduce the performance for shorter transform lengths')
    1041              :             END IF
    1042              : 
    1043         4455 :             DO j = 1, n1, lot
    1044         2970 :                ma = j
    1045         2970 :                mb = MIN(j + (lot - 1), n1)
    1046         2970 :                nfft = mb - ma + 1
    1047              : 
    1048              :                !reverse ordering
    1049              :                !input: I2,i1,j3,(jp3)
    1050         2970 :                CALL S_switch_upcorn(nfft, n2, lot, n1, lzt, zt(1, 1, j), zw(1, 1, 1))
    1051              :                !output: i1,I2,j3,(jp3)
    1052              : 
    1053              :                !performing FFT
    1054              :                !input: i1,I2,j3,(jp3)
    1055         2970 :                inzee = 1
    1056        11880 :                DO i = 1, ic2
    1057              :                   CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
    1058         8910 :                               btrig2, after2(i), now2(i), before2(i), 1)
    1059        11880 :                   inzee = 3 - inzee
    1060              :                END DO
    1061              :                !output: i1,i2,j3,(jp3)
    1062              : 
    1063              :                !Multiply with kernel in fourier space
    1064         2970 :                CALL multkernel(nd1, nd2, n1, n2, lot, nfft, j, pot(1, 1, j3), zw(1, 1, inzee))
    1065              : 
    1066              :                !TRANSFORM BACK IN REAL SPACE
    1067              : 
    1068              :                !transform along y axis
    1069              :                !input: i1,i2,j3,(jp3)
    1070        11880 :                DO i = 1, ic2
    1071              :                   CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
    1072         8910 :                               ftrig2, after2(i), now2(i), before2(i), -1)
    1073        11880 :                   inzee = 3 - inzee
    1074              :                END DO
    1075              : 
    1076              :                !reverse ordering
    1077              :                !input: i1,I2,j3,(jp3)
    1078         4455 :                CALL S_unswitch_downcorn(nfft, n2, lot, n1, lzt, zw(1, 1, inzee), zt(1, 1, j))
    1079              :                !output: I2,i1,j3,(jp3)
    1080              :             END DO
    1081              : 
    1082              :             !transform along x axis
    1083              :             !input: I2,i1,j3,(jp3)
    1084         1485 :             lot = ncache/(4*n1)
    1085         4455 :             DO j = 1, n2, lot
    1086         2970 :                ma = j
    1087         2970 :                mb = MIN(j + (lot - 1), n2)
    1088         2970 :                nfft = mb - ma + 1
    1089              : 
    1090              :                !performing FFT
    1091         2970 :                i = 1
    1092              :                CALL fftstp(lzt, nfft, n1, lot, n1, zt(1, j, 1), zw(1, 1, 1), &
    1093         2970 :                            ftrig1, after1(i), now1(i), before1(i), -1)
    1094              : 
    1095         2970 :                inzee = 1
    1096         8910 :                DO i = 2, ic1
    1097              :                   CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
    1098         5940 :                               ftrig1, after1(i), now1(i), before1(i), -1)
    1099         8910 :                   inzee = 3 - inzee
    1100              :                END DO
    1101              :                !output: I2,I1,j3,(jp3)
    1102              : 
    1103              :                !reverse ordering
    1104              :                !input: J2,Jp2,I1,j3,(jp3)
    1105         4455 :                IF (nproc == 1) THEN
    1106            0 :                   CALL S_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi2)
    1107              :                ELSE
    1108         2970 :                   CALL S_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi1)
    1109              :                END IF
    1110              :                ! output: I1,J2,j3,Jp2,(jp3)
    1111              :             END DO
    1112              :          END IF
    1113              :       END DO
    1114              : 
    1115              :       !Interprocessor data transposition
    1116              :       !input: I1,J2,j3,Jp2,(jp3)
    1117           54 :       IF (nproc > 1) THEN
    1118              :          !communication scheduling
    1119           54 :          CALL mpi_group%alltoall(zmpi1, zmpi2, 2*n1*(md2/nproc)*(nd3/nproc))
    1120              :       END IF
    1121              : 
    1122              :       !output: I1,J2,j3,jp3,(Jp2)
    1123              : 
    1124              :       !transform along z axis
    1125              :       !input: I1,J2,i3,(Jp2)
    1126           54 :       lot = ncache/(2*n3)
    1127         1512 :       DO j2 = 1, md2/nproc
    1128              :          !this condition ensures that we manage only the interesting part for the FFT
    1129         1512 :          IF (iproc*(md2/nproc) + j2 <= n2) THEN
    1130         4374 :             DO i1 = 1, n1, lot
    1131         2916 :                ma = i1
    1132         2916 :                mb = MIN(i1 + (lot - 1), n1)
    1133         2916 :                nfft = mb - ma + 1
    1134              : 
    1135              :                !reverse ordering and repack the FFT data in order to be backward HalFFT transformed
    1136              :                !input: I1,J2,i3,(Jp2)
    1137         2916 :                CALL unscramble_pack(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zmpi2, zw(1, 1, 1), cosinarr)
    1138              :                !output: I1,i3,J2,(Jp2)
    1139              : 
    1140              :                !performing FFT
    1141              :                !input: I1,i3,J2,(Jp2)
    1142         2916 :                inzee = 1
    1143        11664 :                DO i = 1, ic3
    1144              :                   CALL fftstp(lot, nfft, n3/2, lot, n3/2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
    1145         8748 :                               ftrig3, after3(i), now3(i), before3(i), -1)
    1146        11664 :                   inzee = 3 - inzee
    1147              :                END DO
    1148              :                !output: I1,I3,J2,(Jp2)
    1149              : 
    1150              :                !rebuild the output array
    1151              :                CALL unfill_downcorn(md1, md3, lot, nfft, n3, zw(1, 1, inzee), zf(i1, 1, j2) &
    1152         4374 :                                     , scal)
    1153              : 
    1154              :                !integrate local pieces together
    1155              :                !ehartree=ehartree+0.5_dp*ehartreetmp*hx*hy*hz
    1156              :             END DO
    1157              :          END IF
    1158              :       END DO
    1159              : 
    1160              :       !De-allocations
    1161           54 :       DEALLOCATE (btrig1)
    1162           54 :       DEALLOCATE (ftrig1)
    1163           54 :       DEALLOCATE (after1)
    1164           54 :       DEALLOCATE (now1)
    1165           54 :       DEALLOCATE (before1)
    1166           54 :       DEALLOCATE (btrig2)
    1167           54 :       DEALLOCATE (ftrig2)
    1168           54 :       DEALLOCATE (after2)
    1169           54 :       DEALLOCATE (now2)
    1170           54 :       DEALLOCATE (before2)
    1171           54 :       DEALLOCATE (btrig3)
    1172           54 :       DEALLOCATE (ftrig3)
    1173           54 :       DEALLOCATE (after3)
    1174           54 :       DEALLOCATE (now3)
    1175           54 :       DEALLOCATE (before3)
    1176           54 :       DEALLOCATE (zmpi2)
    1177           54 :       DEALLOCATE (zw)
    1178           54 :       DEALLOCATE (zt)
    1179           54 :       DEALLOCATE (cosinarr)
    1180           54 :       IF (nproc > 1) DEALLOCATE (zmpi1)
    1181              : 
    1182              :       !  call timing(iproc,'PSolv_comput  ','OF')
    1183           54 :       CALL timestop(handle)
    1184          108 :    END SUBROUTINE S_PoissonSolver
    1185              : 
    1186              : ! **************************************************************************************************
    1187              : !> \brief ...
    1188              : !> \param j3 ...
    1189              : !> \param nfft ...
    1190              : !> \param Jp2stb ...
    1191              : !> \param J2stb ...
    1192              : !> \param lot ...
    1193              : !> \param n1 ...
    1194              : !> \param md2 ...
    1195              : !> \param nd3 ...
    1196              : !> \param nproc ...
    1197              : !> \param zmpi1 ...
    1198              : !> \param zw ...
    1199              : ! **************************************************************************************************
    1200         2970 :    SUBROUTINE S_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw)
    1201              :       INTEGER, INTENT(in)                                :: j3, nfft
    1202              :       INTEGER, INTENT(inout)                             :: Jp2stb, J2stb
    1203              :       INTEGER, INTENT(in)                                :: lot, n1, md2, nd3, nproc
    1204              :       REAL(KIND=dp), &
    1205              :          DIMENSION(2, n1, md2/nproc, nd3/nproc, nproc), &
    1206              :          INTENT(in)                                      :: zmpi1
    1207              :       REAL(KIND=dp), DIMENSION(2, lot, n1), &
    1208              :          INTENT(inout)                                   :: zw
    1209              : 
    1210              :       INTEGER                                            :: I1, J2, Jp2, mfft
    1211              : 
    1212         2970 :       mfft = 0
    1213         5940 :       DO Jp2 = Jp2stb, nproc
    1214        84645 :          DO J2 = J2stb, md2/nproc
    1215        81675 :             mfft = mfft + 1
    1216        81675 :             IF (mfft > nfft) THEN
    1217         1485 :                Jp2stb = Jp2
    1218         1485 :                J2stb = J2
    1219         1485 :                RETURN
    1220              :             END IF
    1221      4413420 :             DO I1 = 1, n1
    1222      4330260 :                zw(1, mfft, I1) = zmpi1(1, I1, J2, j3, Jp2)
    1223      4410450 :                zw(2, mfft, I1) = zmpi1(2, I1, J2, j3, Jp2)
    1224              :             END DO
    1225              :          END DO
    1226         4455 :          J2stb = 1
    1227              :       END DO
    1228              :    END SUBROUTINE S_mpiswitch_upcorn
    1229              : 
    1230              : ! **************************************************************************************************
    1231              : !> \brief ...
    1232              : !> \param nfft ...
    1233              : !> \param n2 ...
    1234              : !> \param lot ...
    1235              : !> \param n1 ...
    1236              : !> \param lzt ...
    1237              : !> \param zt ...
    1238              : !> \param zw ...
    1239              : ! **************************************************************************************************
    1240         2970 :    SUBROUTINE S_switch_upcorn(nfft, n2, lot, n1, lzt, zt, zw)
    1241              :       INTEGER, INTENT(in)                                :: nfft, n2, lot, n1, lzt
    1242              :       REAL(KIND=dp), DIMENSION(2, lzt, n1), INTENT(in)   :: zt
    1243              :       REAL(KIND=dp), DIMENSION(2, lot, n2), &
    1244              :          INTENT(inout)                                   :: zw
    1245              : 
    1246              :       INTEGER                                            :: i, j
    1247              : 
    1248        83160 :       DO j = 1, nfft
    1249      4413420 :          DO i = 1, n2
    1250      4330260 :             zw(1, j, i) = zt(1, i, j)
    1251      4410450 :             zw(2, j, i) = zt(2, i, j)
    1252              :          END DO
    1253              :       END DO
    1254         2970 :    END SUBROUTINE S_switch_upcorn
    1255              : 
    1256              : ! **************************************************************************************************
    1257              : !> \brief ...
    1258              : !> \param nfft ...
    1259              : !> \param n2 ...
    1260              : !> \param lot ...
    1261              : !> \param n1 ...
    1262              : !> \param lzt ...
    1263              : !> \param zw ...
    1264              : !> \param zt ...
    1265              : ! **************************************************************************************************
    1266         2970 :    SUBROUTINE S_unswitch_downcorn(nfft, n2, lot, n1, lzt, zw, zt)
    1267              :       INTEGER, INTENT(in)                                :: nfft, n2, lot, n1, lzt
    1268              :       REAL(KIND=dp), DIMENSION(2, lot, n2), INTENT(in)   :: zw
    1269              :       REAL(KIND=dp), DIMENSION(2, lzt, n1), &
    1270              :          INTENT(inout)                                   :: zt
    1271              : 
    1272              :       INTEGER                                            :: i, j
    1273              : 
    1274        83160 :       DO j = 1, nfft
    1275      4413420 :          DO i = 1, n2
    1276      4330260 :             zt(1, i, j) = zw(1, j, i)
    1277      4410450 :             zt(2, i, j) = zw(2, j, i)
    1278              :          END DO
    1279              :       END DO
    1280         2970 :    END SUBROUTINE S_unswitch_downcorn
    1281              : 
    1282              : ! **************************************************************************************************
    1283              : !> \brief ...
    1284              : !> \param j3 ...
    1285              : !> \param nfft ...
    1286              : !> \param Jp2stf ...
    1287              : !> \param J2stf ...
    1288              : !> \param lot ...
    1289              : !> \param n1 ...
    1290              : !> \param md2 ...
    1291              : !> \param nd3 ...
    1292              : !> \param nproc ...
    1293              : !> \param zw ...
    1294              : !> \param zmpi1 ...
    1295              : ! **************************************************************************************************
    1296         2970 :    SUBROUTINE S_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw, zmpi1)
    1297              :       INTEGER, INTENT(in)                                :: j3, nfft
    1298              :       INTEGER, INTENT(inout)                             :: Jp2stf, J2stf
    1299              :       INTEGER, INTENT(in)                                :: lot, n1, md2, nd3, nproc
    1300              :       REAL(KIND=dp), DIMENSION(2, lot, n1), INTENT(in)   :: zw
    1301              :       REAL(KIND=dp), &
    1302              :          DIMENSION(2, n1, md2/nproc, nd3/nproc, nproc), &
    1303              :          INTENT(inout)                                   :: zmpi1
    1304              : 
    1305              :       INTEGER                                            :: I1, J2, Jp2, mfft
    1306              : 
    1307         2970 :       mfft = 0
    1308         5940 :       DO Jp2 = Jp2stf, nproc
    1309        84645 :          DO J2 = J2stf, md2/nproc
    1310        81675 :             mfft = mfft + 1
    1311        81675 :             IF (mfft > nfft) THEN
    1312         1485 :                Jp2stf = Jp2
    1313         1485 :                J2stf = J2
    1314         1485 :                RETURN
    1315              :             END IF
    1316      4413420 :             DO I1 = 1, n1
    1317      4330260 :                zmpi1(1, I1, J2, j3, Jp2) = zw(1, mfft, I1)
    1318      4410450 :                zmpi1(2, I1, J2, j3, Jp2) = zw(2, mfft, I1)
    1319              :             END DO
    1320              :          END DO
    1321         4455 :          J2stf = 1
    1322              :       END DO
    1323              :    END SUBROUTINE S_unmpiswitch_downcorn
    1324              : 
    1325              : ! **************************************************************************************************
    1326              : !> \brief (Based on suitable modifications of S.Goedecker routines)
    1327              : !>      Restore data into output array, calculating in the meanwhile
    1328              : !>      Hartree energy of the potential
    1329              : !> \param md1 Dimensions of the undistributed part of the real grid
    1330              : !> \param md3 Dimensions of the undistributed part of the real grid
    1331              : !> \param lot ...
    1332              : !> \param nfft number of planes
    1333              : !> \param n3 (twice the) dimension of the last FFTtransform.
    1334              : !> \param zw FFT work array
    1335              : !> \param zf Original distributed density as well as
    1336              : !>                   Distributed solution of the poisson equation (inout)
    1337              : !> \param scal Needed to achieve unitarity and correct dimensions
    1338              : !> \date February 2006
    1339              : !> \author S. Goedecker, L. Genovese
    1340              : !> \note Assuming that high frequencies are in the corners
    1341              : !>      and that n3 is multiple of 4
    1342              : !>
    1343              : !>  RESTRICTIONS on USAGE
    1344              : !>      Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
    1345              : !>      Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
    1346              : !>      Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
    1347              : !>      This file is distributed under the terms of the
    1348              : !>      GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
    1349              : ! **************************************************************************************************
    1350       579295 :    SUBROUTINE unfill_downcorn(md1, md3, lot, nfft, n3, zw, zf, scal)
    1351              :       INTEGER, INTENT(in)                                :: md1, md3, lot, nfft, n3
    1352              :       REAL(KIND=dp), DIMENSION(2, lot, n3/2), INTENT(in) :: zw
    1353              :       REAL(KIND=dp), DIMENSION(md1, md3), INTENT(inout)  :: zf
    1354              :       REAL(KIND=dp), INTENT(in)                          :: scal
    1355              : 
    1356              :       INTEGER                                            :: i1, i3
    1357              :       REAL(KIND=dp)                                      :: pot1
    1358              : 
    1359     14639933 :       DO i3 = 1, n3/4
    1360    407286462 :          DO i1 = 1, nfft
    1361    392646529 :             pot1 = scal*zw(1, i1, i3)
    1362              :             !ehartreetmp =ehartreetmp + pot1* zf(i1,2*i3-1)
    1363    392646529 :             zf(i1, 2*i3 - 1) = pot1
    1364    392646529 :             pot1 = scal*zw(2, i1, i3)
    1365              :             !ehartreetmp =ehartreetmp + pot1* zf(i1,2*i3)
    1366    406707167 :             zf(i1, 2*i3) = pot1
    1367              :          END DO
    1368              :       END DO
    1369       579295 :    END SUBROUTINE unfill_downcorn
    1370              : 
    1371              : ! **************************************************************************************************
    1372              : !> \brief ...
    1373              : !> \param md1 ...
    1374              : !> \param md3 ...
    1375              : !> \param lot ...
    1376              : !> \param nfft ...
    1377              : !> \param n3 ...
    1378              : !> \param zf ...
    1379              : !> \param zw ...
    1380              : ! **************************************************************************************************
    1381       579295 :    SUBROUTINE halfill_upcorn(md1, md3, lot, nfft, n3, zf, zw)
    1382              :       INTEGER                                            :: md1, md3, lot, nfft, n3
    1383              :       REAL(KIND=dp)                                      :: zf(md1, md3), zw(2, lot, n3/2)
    1384              : 
    1385              :       INTEGER                                            :: i1, i3
    1386              : 
    1387     14639933 :       DO i3 = 1, n3/4
    1388              :          ! WARNING: Assuming that high frequencies are in the corners
    1389              :          !          and that n3 is multiple of 4
    1390              :          !in principle we can relax this condition
    1391    407286462 :          DO i1 = 1, nfft
    1392    392646529 :             zw(1, i1, i3) = 0._dp
    1393    406707167 :             zw(2, i1, i3) = 0._dp
    1394              :          END DO
    1395              :       END DO
    1396     14639933 :       DO i3 = n3/4 + 1, n3/2
    1397    407286462 :          DO i1 = 1, nfft
    1398    392646529 :             zw(1, i1, i3) = zf(i1, 2*i3 - 1 - n3/2)
    1399    406707167 :             zw(2, i1, i3) = zf(i1, 2*i3 - n3/2)
    1400              :          END DO
    1401              :       END DO
    1402              : 
    1403       579295 :    END SUBROUTINE halfill_upcorn
    1404              : 
    1405              : ! **************************************************************************************************
    1406              : !> \brief (Based on suitable modifications of S.Goedecker routines)
    1407              : !>      Assign the correct planes to the work array zmpi2
    1408              : !>      in order to prepare for interprocessor data transposition.
    1409              : !>      In the meanwhile, it unpacks the data of the HalFFT in order to prepare for
    1410              : !>      multiplication with the kernel
    1411              : !> \param i1 Starting points of the plane and number of remaining lines
    1412              : !> \param j2 Starting points of the plane and number of remaining lines
    1413              : !> \param lot Starting points of the plane and number of remaining lines
    1414              : !> \param nfft Starting points of the plane and number of remaining lines
    1415              : !> \param n1 logical dimension of the FFT transform, reference for work arrays
    1416              : !> \param n3 logical dimension of the FFT transform, reference for work arrays
    1417              : !> \param md2 Dimensions of real grid
    1418              : !> \param nproc ...
    1419              : !> \param nd3 Dimensions of the kernel
    1420              : !> \param zw Work array (input)
    1421              : !> \param zmpi2 Work array for multiprocessor manipulation (output)
    1422              : !> \param cosinarr Array of the phases needed for unpacking
    1423              : !> \date February 2006
    1424              : !> \author S. Goedecker, L. Genovese
    1425              : !> \note
    1426              : !>  RESTRICTIONS on USAGE
    1427              : !>      Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
    1428              : !>      Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
    1429              : !>      Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
    1430              : !>      This file is distributed under the terms of the
    1431              : !>      GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
    1432              : ! **************************************************************************************************
    1433       658079 :    SUBROUTINE scramble_unpack(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zw, zmpi2, cosinarr)
    1434              :       INTEGER, INTENT(in)                                :: i1, j2, lot, nfft, n1, n3, md2, nproc, &
    1435              :                                                             nd3
    1436              :       REAL(KIND=dp), DIMENSION(2, lot, n3/2), INTENT(in) :: zw
    1437              :       REAL(KIND=dp), DIMENSION(2, n1, md2/nproc, nd3), &
    1438              :          INTENT(inout)                                   :: zmpi2
    1439              :       REAL(KIND=dp), DIMENSION(2, n3/2), INTENT(in)      :: cosinarr
    1440              : 
    1441              :       INTEGER                                            :: i, i3, ind1, ind2
    1442              :       REAL(KIND=dp)                                      :: a, b, c, cp, d, feI, feR, fI, foI, foR, &
    1443              :                                                             fR, sp
    1444              : 
    1445              : !case i3=1 and i3=n3/2+1
    1446              : 
    1447     19971566 :       DO i = 0, nfft - 1
    1448     19313487 :          a = zw(1, i + 1, 1)
    1449     19313487 :          b = zw(2, i + 1, 1)
    1450     19313487 :          zmpi2(1, i1 + i, j2, 1) = a + b
    1451     19313487 :          zmpi2(2, i1 + i, j2, 1) = 0._dp
    1452     19313487 :          zmpi2(1, i1 + i, j2, n3/2 + 1) = a - b
    1453     19971566 :          zmpi2(2, i1 + i, j2, n3/2 + 1) = 0._dp
    1454              :       END DO
    1455              :       !case 2<=i3<=n3/2
    1456     32894636 :       DO i3 = 2, n3/2
    1457     32236557 :          ind1 = i3
    1458     32236557 :          ind2 = n3/2 - i3 + 2
    1459     32236557 :          cp = cosinarr(1, i3)
    1460     32236557 :          sp = cosinarr(2, i3)
    1461    936638287 :          DO i = 0, nfft - 1
    1462    903743651 :             a = zw(1, i + 1, ind1)
    1463    903743651 :             b = zw(2, i + 1, ind1)
    1464    903743651 :             c = zw(1, i + 1, ind2)
    1465    903743651 :             d = zw(2, i + 1, ind2)
    1466    903743651 :             feR = .5_dp*(a + c)
    1467    903743651 :             feI = .5_dp*(b - d)
    1468    903743651 :             foR = .5_dp*(a - c)
    1469    903743651 :             foI = .5_dp*(b + d)
    1470    903743651 :             fR = feR + cp*foI - sp*foR
    1471    903743651 :             fI = feI - cp*foR - sp*foI
    1472    903743651 :             zmpi2(1, i1 + i, j2, ind1) = fR
    1473    935980208 :             zmpi2(2, i1 + i, j2, ind1) = fI
    1474              :          END DO
    1475              :       END DO
    1476              : 
    1477       658079 :    END SUBROUTINE scramble_unpack
    1478              : 
    1479              : ! **************************************************************************************************
    1480              : !> \brief (Based on suitable modifications of S.Goedecker routines)
    1481              : !>      Insert the correct planes of the work array zmpi2
    1482              : !>      in order to prepare for backward FFT transform
    1483              : !>      In the meanwhile, it packs the data in order to be transformed with the HalFFT
    1484              : !>      procedure
    1485              : !> \param i1 Starting points of the plane and number of remaining lines
    1486              : !> \param j2 Starting points of the plane and number of remaining lines
    1487              : !> \param lot Starting points of the plane and number of remaining lines
    1488              : !> \param nfft Starting points of the plane and number of remaining lines
    1489              : !> \param n1 logical dimension of the FFT transform, reference for work arrays
    1490              : !> \param n3 logical dimension of the FFT transform, reference for work arrays
    1491              : !> \param md2 Dimensions of real grid
    1492              : !> \param nproc ...
    1493              : !> \param nd3 Dimensions of the kernel
    1494              : !> \param zmpi2 Work array for multiprocessor manipulation (output)
    1495              : !> \param zw Work array (inout)
    1496              : !> \param cosinarr Array of the phases needed for packing
    1497              : !> \date February 2006
    1498              : !> \author S. Goedecker, L. Genovese
    1499              : !> \note
    1500              : !>  RESTRICTIONS on USAGE
    1501              : !>      Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
    1502              : !>      Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
    1503              : !>      Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
    1504              : !>      This file is distributed under the terms of the
    1505              : !>      GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
    1506              : ! **************************************************************************************************
    1507       579295 :    SUBROUTINE unscramble_pack(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zmpi2, zw, cosinarr)
    1508              :       INTEGER, INTENT(in)                                :: i1, j2, lot, nfft, n1, n3, md2, nproc, &
    1509              :                                                             nd3
    1510              :       REAL(KIND=dp), DIMENSION(2, n1, md2/nproc, nd3), &
    1511              :          INTENT(in)                                      :: zmpi2
    1512              :       REAL(KIND=dp), DIMENSION(2, lot, n3/2), &
    1513              :          INTENT(inout)                                   :: zw
    1514              :       REAL(KIND=dp), DIMENSION(2, n3/2), INTENT(in)      :: cosinarr
    1515              : 
    1516              :       INTEGER                                            :: i, i3, indA, indB
    1517              :       REAL(KIND=dp)                                      :: a, b, c, cp, d, ie, ih, io, re, rh, ro, &
    1518              :                                                             sp
    1519              : 
    1520     28700571 :       DO i3 = 1, n3/2
    1521     28121276 :          indA = i3
    1522     28121276 :          indB = n3/2 + 2 - i3
    1523     28121276 :          cp = cosinarr(1, i3)
    1524     28121276 :          sp = cosinarr(2, i3)
    1525    813993629 :          DO i = 0, nfft - 1
    1526    785293058 :             a = zmpi2(1, i1 + i, j2, indA)
    1527    785293058 :             b = zmpi2(2, i1 + i, j2, indA)
    1528    785293058 :             c = zmpi2(1, i1 + i, j2, indB)
    1529    785293058 :             d = -zmpi2(2, i1 + i, j2, indB)
    1530    785293058 :             re = (a + c)
    1531    785293058 :             ie = (b + d)
    1532    785293058 :             ro = (a - c)*cp - (b - d)*sp
    1533    785293058 :             io = (a - c)*sp + (b - d)*cp
    1534    785293058 :             rh = re - io
    1535    785293058 :             ih = ie + ro
    1536    785293058 :             zw(1, i + 1, indA) = rh
    1537    813414334 :             zw(2, i + 1, indA) = ih
    1538              :          END DO
    1539              :       END DO
    1540              : 
    1541       579295 :    END SUBROUTINE unscramble_pack
    1542              : 
    1543              : ! **************************************************************************************************
    1544              : !> \brief (Based on suitable modifications of S.Goedecker routines)
    1545              : !>      Applies the local FFT space Kernel to the density in Real space.
    1546              : !>      Calculates also the LDA exchange-correlation terms
    1547              : !> \param n1 logical dimension of the transform.
    1548              : !> \param n2 logical dimension of the transform.
    1549              : !> \param n3 logical dimension of the transform.
    1550              : !> \param nd1 Dimension of POT
    1551              : !> \param nd2 Dimension of POT
    1552              : !> \param nd3 Dimension of POT
    1553              : !> \param md1 Dimension of ZF
    1554              : !> \param md2 Dimension of ZF
    1555              : !> \param md3 Dimension of ZF
    1556              : !> \param nproc number of processors used as returned by MPI_COMM_SIZE
    1557              : !> \param iproc [0:nproc-1] number of processor as returned by MPI_COMM_RANK
    1558              : !> \param pot Kernel, only the distributed part (REAL)
    1559              : !>                   POT(i1,i2,i3)
    1560              : !>                   i1=1,nd1 , i2=1,nd2 , i3=1,nd3/nproc
    1561              : !> \param zf Density (input/output)
    1562              : !>                   ZF(i1,i3,i2)
    1563              : !>                   i1=1,md1 , i2=1,md2/nproc , i3=1,md3
    1564              : !> \param scal factor of renormalization of the FFT in order to acheve unitarity
    1565              : !>                   and the correct dimension
    1566              : !> \param mpi_group ...
    1567              : !> \date February 2006
    1568              : !> \author S. Goedecker, L. Genovese
    1569              : !> \note As transform lengths
    1570              : !>                     most products of the prime factors 2,3,5 are allowed.
    1571              : !>                   The detailed table with allowed transform lengths can
    1572              : !>                   be found in subroutine CTRIG
    1573              : !> \note
    1574              : !>  RESTRICTIONS on USAGE
    1575              : !>      Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
    1576              : !>      Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
    1577              : !>      Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
    1578              : !>      This file is distributed under the terms of the
    1579              : !>       GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
    1580              : ! **************************************************************************************************
    1581        15918 :    SUBROUTINE F_PoissonSolver(n1, n2, n3, nd1, nd2, nd3, md1, md2, md3, nproc, iproc, pot, zf, &
    1582              :                               scal, mpi_group)
    1583              :       INTEGER, INTENT(in)                                :: n1, n2, n3, nd1, nd2, nd3, md1, md2, &
    1584              :                                                             md3, nproc, iproc
    1585              :       REAL(KIND=dp), DIMENSION(nd1, nd2, nd3/nproc), &
    1586              :          INTENT(in)                                      :: pot
    1587              :       REAL(KIND=dp), DIMENSION(md1, md3, md2/nproc), &
    1588              :          INTENT(inout)                                   :: zf
    1589              :       REAL(KIND=dp), INTENT(in)                          :: scal
    1590              : 
    1591              :       CLASS(mp_comm_type), INTENT(in)                     :: mpi_group
    1592              : 
    1593              :       INTEGER, PARAMETER                                 :: ncache_optimal = 8*1024
    1594              : 
    1595              :       INTEGER                                            :: i, i1, i3, ic1, ic2, ic3, inzee, j, j2, &
    1596              :                                                             J2stb, J2stf, j3, Jp2stb, Jp2stf, lot, &
    1597              :                                                             lzt, ma, mb, ncache, nfft
    1598        15918 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: after1, after2, after3, before1, &
    1599        15918 :                                                             before2, before3, now1, now2, now3
    1600              :       REAL(kind=dp)                                      :: twopion
    1601        15918 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: btrig1, btrig2, btrig3, cosinarr, &
    1602        15918 :                                                             ftrig1, ftrig2, ftrig3
    1603        15918 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: zt, zw
    1604              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :)  :: zmpi2
    1605              :       REAL(KIND=dp), ALLOCATABLE, &
    1606        15918 :          DIMENSION(:, :, :, :, :)                        :: zmpi1
    1607              : 
    1608        15918 :       IF (MOD(n1, 2) /= 0) CPABORT("Parallel convolution:ERROR:n1")
    1609        15918 :       IF (MOD(n2, 2) /= 0) CPABORT("Parallel convolution:ERROR:n2")
    1610        15918 :       IF (MOD(n3, 2) /= 0) CPABORT("Parallel convolution:ERROR:n3")
    1611        15918 :       IF (nd1 < n1/2 + 1) CPABORT("Parallel convolution:ERROR:nd1")
    1612        15918 :       IF (nd2 < n2/2 + 1) CPABORT("Parallel convolution:ERROR:nd2")
    1613        15918 :       IF (nd3 < n3/2 + 1) CPABORT("Parallel convolution:ERROR:nd3")
    1614        15918 :       IF (md1 < n1/2) CPABORT("Parallel convolution:ERROR:md1")
    1615        15918 :       IF (md2 < n2/2) CPABORT("Parallel convolution:ERROR:md2")
    1616        15918 :       IF (md3 < n3/2) CPABORT("Parallel convolution:ERROR:md3")
    1617        15918 :       IF (MOD(nd3, nproc) /= 0) CPABORT("Parallel convolution:ERROR:nd3")
    1618        15918 :       IF (MOD(md2, nproc) /= 0) CPABORT("Parallel convolution:ERROR:md2")
    1619              : 
    1620              :       !defining work arrays dimensions
    1621              : 
    1622        15918 :       ncache = ncache_optimal
    1623        15918 :       IF (ncache <= MAX(n1, n2, n3/2)*4) ncache = MAX(n1, n2, n3/2)*4
    1624        15918 :       lzt = n2/2
    1625        15918 :       IF (MOD(n2/2, 2) == 0) lzt = lzt + 1
    1626        15918 :       IF (MOD(n2/2, 4) == 0) lzt = lzt + 1
    1627              : 
    1628              :       !Allocations
    1629        15918 :       ALLOCATE (btrig1(2, ctrig_length))
    1630        15918 :       ALLOCATE (ftrig1(2, ctrig_length))
    1631        15918 :       ALLOCATE (after1(7))
    1632        15918 :       ALLOCATE (now1(7))
    1633        15918 :       ALLOCATE (before1(7))
    1634        15918 :       ALLOCATE (btrig2(2, ctrig_length))
    1635        15918 :       ALLOCATE (ftrig2(2, ctrig_length))
    1636        15918 :       ALLOCATE (after2(7))
    1637        15918 :       ALLOCATE (now2(7))
    1638        15918 :       ALLOCATE (before2(7))
    1639        15918 :       ALLOCATE (btrig3(2, ctrig_length))
    1640        15918 :       ALLOCATE (ftrig3(2, ctrig_length))
    1641        15918 :       ALLOCATE (after3(7))
    1642        15918 :       ALLOCATE (now3(7))
    1643        15918 :       ALLOCATE (before3(7))
    1644        63672 :       ALLOCATE (zw(2, ncache/4, 2))
    1645        63672 :       ALLOCATE (zt(2, lzt, n1))
    1646        79590 :       ALLOCATE (zmpi2(2, n1, md2/nproc, nd3))
    1647   4868387730 :       zmpi2 = 1.0E99_dp ! init mpi'ed buffers to silence warnings under valgrind
    1648        63672 :       ALLOCATE (cosinarr(2, n3/2))
    1649        57678 :       IF (nproc > 1) ALLOCATE (zmpi1(2, n1, md2/nproc, nd3/nproc, nproc))
    1650              : 
    1651              :       !calculating the FFT work arrays (beware on the HalFFT in n3 dimension)
    1652        15918 :       CALL ctrig(n3/2, btrig3, after3, before3, now3, 1, ic3)
    1653        15918 :       CALL ctrig(n1, btrig1, after1, before1, now1, 1, ic1)
    1654        15918 :       CALL ctrig(n2, btrig2, after2, before2, now2, 1, ic2)
    1655      1207786 :       DO j = 1, n1
    1656      1191868 :          ftrig1(1, j) = btrig1(1, j)
    1657      1207786 :          ftrig1(2, j) = -btrig1(2, j)
    1658              :       END DO
    1659      1207786 :       DO j = 1, n2
    1660      1191868 :          ftrig2(1, j) = btrig2(1, j)
    1661      1207786 :          ftrig2(2, j) = -btrig2(2, j)
    1662              :       END DO
    1663      1231790 :       DO j = 1, n3
    1664      1215872 :          ftrig3(1, j) = btrig3(1, j)
    1665      1231790 :          ftrig3(2, j) = -btrig3(2, j)
    1666              :       END DO
    1667              : 
    1668              :       !Calculating array of phases for HalFFT decoding
    1669        15918 :       twopion = 8._dp*ATAN(1._dp)/REAL(n3, KIND=dp)
    1670       623854 :       DO i3 = 1, n3/2
    1671       607936 :          cosinarr(1, i3) = COS(twopion*(i3 - 1))
    1672       623854 :          cosinarr(2, i3) = -SIN(twopion*(i3 - 1))
    1673              :       END DO
    1674              : 
    1675              :       ! transform along z axis
    1676        15918 :       lot = ncache/(2*n3)
    1677        15918 :       IF (lot < 1) THEN
    1678              :          CALL cp_abort(__LOCATION__, &
    1679              :                        'convolxc_on:ncache has to be enlarged to be able to hold at '// &
    1680              :                        'least one 1-d FFT of this size even though this will '// &
    1681            0 :                        'reduce the performance for shorter transform lengths')
    1682              :       END IF
    1683              : 
    1684       428288 :       DO j2 = 1, md2/nproc
    1685              :          !this condition ensures that we manage only the interesting part for the FFT
    1686       428288 :          IF (iproc*(md2/nproc) + j2 <= n2/2) THEN
    1687       987642 :             DO i1 = 1, (n1/2), lot
    1688       576379 :                ma = i1
    1689       576379 :                mb = MIN(i1 + (lot - 1), (n1/2))
    1690       576379 :                nfft = mb - ma + 1
    1691              : 
    1692              :                !inserting real data into complex array of half length
    1693       576379 :                CALL halfill_upcorn(md1, md3, lot, nfft, n3, zf(i1, 1, j2), zw(1, 1, 1))
    1694              : 
    1695              :                !performing FFT
    1696              :                !input: I1,I3,J2,(Jp2)
    1697       576379 :                inzee = 1
    1698      2039390 :                DO i = 1, ic3
    1699              :                   CALL fftstp(lot, nfft, n3/2, lot, n3/2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
    1700      1463011 :                               btrig3, after3(i), now3(i), before3(i), 1)
    1701      2039390 :                   inzee = 3 - inzee
    1702              :                END DO
    1703              :                !output: I1,i3,J2,(Jp2)
    1704              : 
    1705              :                !unpacking FFT in order to restore correct result,
    1706              :                !while exchanging components
    1707              :                !input: I1,i3,J2,(Jp2)
    1708       987642 :                CALL scramble_unpack(i1, j2, lot, nfft, n1/2, n3, md2, nproc, nd3, zw(1, 1, inzee), zmpi2, cosinarr)
    1709              :                !output: I1,J2,i3,(Jp2)
    1710              :             END DO
    1711              :          END IF
    1712              :       END DO
    1713              : 
    1714              :       !Interprocessor data transposition
    1715              :       !input: I1,J2,j3,jp3,(Jp2)
    1716        15918 :       IF (nproc > 1) THEN
    1717              :          !communication scheduling
    1718         8352 :          CALL mpi_group%alltoall(zmpi2, zmpi1, n1*(md2/nproc)*(nd3/nproc))
    1719              :       END IF
    1720              :       !output: I1,J2,j3,Jp2,(jp3)
    1721              : 
    1722              :       !now each process perform complete convolution of its planes
    1723       451524 :       DO j3 = 1, nd3/nproc
    1724              :          !this condition ensures that we manage only the interesting part for the FFT
    1725       451524 :          IF (iproc*(nd3/nproc) + j3 <= n3/2 + 1) THEN
    1726       431430 :             Jp2stb = 1
    1727       431430 :             J2stb = 1
    1728       431430 :             Jp2stf = 1
    1729       431430 :             J2stf = 1
    1730              : 
    1731              :             ! transform along x axis
    1732       431430 :             lot = ncache/(4*n1)
    1733       431430 :             IF (lot < 1) THEN
    1734              :                CALL cp_abort(__LOCATION__, &
    1735              :                              'convolxc_on:ncache has to be enlarged to be able to hold at '// &
    1736              :                              'least one 1-d FFT of this size even though this will '// &
    1737            0 :                              'reduce the performance for shorter transform lengths')
    1738              :             END IF
    1739              : 
    1740      1414014 :             DO j = 1, n2/2, lot
    1741       982584 :                ma = j
    1742       982584 :                mb = MIN(j + (lot - 1), n2/2)
    1743       982584 :                nfft = mb - ma + 1
    1744              : 
    1745              :                !reverse index ordering, leaving the planes to be transformed at the end
    1746              :                !input: I1,J2,j3,Jp2,(jp3)
    1747       982584 :                IF (nproc == 1) THEN
    1748       421221 :                   CALL mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi2, zw(1, 1, 1))
    1749              :                ELSE
    1750       561363 :                   CALL mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw(1, 1, 1))
    1751              :                END IF
    1752              :                !output: J2,Jp2,I1,j3,(jp3)
    1753              : 
    1754              :                !performing FFT
    1755              :                !input: I2,I1,j3,(jp3)
    1756       982584 :                inzee = 1
    1757      3123492 :                DO i = 1, ic1 - 1
    1758              :                   CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
    1759      2140908 :                               btrig1, after1(i), now1(i), before1(i), 1)
    1760      3123492 :                   inzee = 3 - inzee
    1761              :                END DO
    1762              : 
    1763              :                !storing the last step into zt array
    1764       982584 :                i = ic1
    1765              :                CALL fftstp(lot, nfft, n1, lzt, n1, zw(1, 1, inzee), zt(1, j, 1), &
    1766      1414014 :                            btrig1, after1(i), now1(i), before1(i), 1)
    1767              :                !output: I2,i1,j3,(jp3)
    1768              :             END DO
    1769              : 
    1770              :             !transform along y axis
    1771       431430 :             lot = ncache/(4*n2)
    1772       431430 :             IF (lot < 1) THEN
    1773              :                CALL cp_abort(__LOCATION__, &
    1774              :                              'convolxc_on:ncache has to be enlarged to be able to hold at '// &
    1775              :                              'least one 1-d FFT of this size even though this will '// &
    1776            0 :                              'reduce the performance for shorter transform lengths')
    1777              :             END IF
    1778              : 
    1779      2267223 :             DO j = 1, n1, lot
    1780      1835793 :                ma = j
    1781      1835793 :                mb = MIN(j + (lot - 1), n1)
    1782      1835793 :                nfft = mb - ma + 1
    1783              : 
    1784              :                !reverse ordering
    1785              :                !input: I2,i1,j3,(jp3)
    1786      1835793 :                CALL switch_upcorn(nfft, n2, lot, n1, lzt, zt(1, 1, j), zw(1, 1, 1))
    1787              :                !output: i1,I2,j3,(jp3)
    1788              : 
    1789              :                !performing FFT
    1790              :                !input: i1,I2,j3,(jp3)
    1791      1835793 :                inzee = 1
    1792      7704036 :                DO i = 1, ic2
    1793              :                   CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
    1794      5868243 :                               btrig2, after2(i), now2(i), before2(i), 1)
    1795      7704036 :                   inzee = 3 - inzee
    1796              :                END DO
    1797              :                !output: i1,i2,j3,(jp3)
    1798              : 
    1799              :                !Multiply with kernel in fourier space
    1800      1835793 :                CALL multkernel(nd1, nd2, n1, n2, lot, nfft, j, pot(1, 1, j3), zw(1, 1, inzee))
    1801              : 
    1802              :                !TRANSFORM BACK IN REAL SPACE
    1803              : 
    1804              :                !transform along y axis
    1805              :                !input: i1,i2,j3,(jp3)
    1806      7704036 :                DO i = 1, ic2
    1807              :                   CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
    1808      5868243 :                               ftrig2, after2(i), now2(i), before2(i), -1)
    1809      7704036 :                   inzee = 3 - inzee
    1810              :                END DO
    1811              : 
    1812              :                !reverse ordering
    1813              :                !input: i1,I2,j3,(jp3)
    1814      2267223 :                CALL unswitch_downcorn(nfft, n2, lot, n1, lzt, zw(1, 1, inzee), zt(1, 1, j))
    1815              :                !output: I2,i1,j3,(jp3)
    1816              :             END DO
    1817              : 
    1818              :             !transform along x axis
    1819              :             !input: I2,i1,j3,(jp3)
    1820       431430 :             lot = ncache/(4*n1)
    1821      1414014 :             DO j = 1, n2/2, lot
    1822       982584 :                ma = j
    1823       982584 :                mb = MIN(j + (lot - 1), n2/2)
    1824       982584 :                nfft = mb - ma + 1
    1825              : 
    1826              :                !performing FFT
    1827       982584 :                i = 1
    1828              :                CALL fftstp(lzt, nfft, n1, lot, n1, zt(1, j, 1), zw(1, 1, 1), &
    1829       982584 :                            ftrig1, after1(i), now1(i), before1(i), -1)
    1830              : 
    1831       982584 :                inzee = 1
    1832      3123492 :                DO i = 2, ic1
    1833              :                   CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
    1834      2140908 :                               ftrig1, after1(i), now1(i), before1(i), -1)
    1835      3123492 :                   inzee = 3 - inzee
    1836              :                END DO
    1837              :                !output: I2,I1,j3,(jp3)
    1838              : 
    1839              :                !reverse ordering
    1840              :                !input: J2,Jp2,I1,j3,(jp3)
    1841      1414014 :                IF (nproc == 1) THEN
    1842       421221 :                   CALL unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi2)
    1843              :                ELSE
    1844       561363 :                   CALL unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi1)
    1845              :                END IF
    1846              :                ! output: I1,J2,j3,Jp2,(jp3)
    1847              :             END DO
    1848              :          END IF
    1849              :       END DO
    1850              : 
    1851              :       !Interprocessor data transposition
    1852              :       !input: I1,J2,j3,Jp2,(jp3)
    1853        15918 :       IF (nproc > 1) THEN
    1854              :          !communication scheduling
    1855         8352 :          CALL mpi_group%alltoall(zmpi1, zmpi2, n1*(md2/nproc)*(nd3/nproc))
    1856              :          !output: I1,J2,j3,jp3,(Jp2)
    1857              :       END IF
    1858              : 
    1859              :       !transform along z axis
    1860              :       !input: I1,J2,i3,(Jp2)
    1861        15918 :       lot = ncache/(2*n3)
    1862       428288 :       DO j2 = 1, md2/nproc
    1863              :          !this condition ensures that we manage only the interesting part for the FFT
    1864       428288 :          IF (iproc*(md2/nproc) + j2 <= n2/2) THEN
    1865       987642 :             DO i1 = 1, (n1/2), lot
    1866       576379 :                ma = i1
    1867       576379 :                mb = MIN(i1 + (lot - 1), (n1/2))
    1868       576379 :                nfft = mb - ma + 1
    1869              : 
    1870              :                !reverse ordering and repack the FFT data in order to be backward HalFFT transformed
    1871              :                !input: I1,J2,i3,(Jp2)
    1872       576379 :                CALL unscramble_pack(i1, j2, lot, nfft, n1/2, n3, md2, nproc, nd3, zmpi2, zw(1, 1, 1), cosinarr)
    1873              :                !output: I1,i3,J2,(Jp2)
    1874              : 
    1875              :                !performing FFT
    1876              :                !input: I1,i3,J2,(Jp2)
    1877       576379 :                inzee = 1
    1878      2039390 :                DO i = 1, ic3
    1879              :                   CALL fftstp(lot, nfft, n3/2, lot, n3/2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
    1880      1463011 :                               ftrig3, after3(i), now3(i), before3(i), -1)
    1881      2039390 :                   inzee = 3 - inzee
    1882              :                END DO
    1883              :                !output: I1,I3,J2,(Jp2)
    1884              : 
    1885              :                !calculates the exchange correlation terms locally and rebuild the output array
    1886              :                CALL unfill_downcorn(md1, md3, lot, nfft, n3, zw(1, 1, inzee), zf(i1, 1, j2) &
    1887       987642 :                                     , scal) !,ehartreetmp)
    1888              : 
    1889              :                !integrate local pieces together
    1890              :                !ehartree=ehartree+0.5_dp*ehartreetmp*hgrid**3
    1891              :             END DO
    1892              :          END IF
    1893              :       END DO
    1894              : 
    1895              :       !De-allocations
    1896        15918 :       DEALLOCATE (btrig1)
    1897        15918 :       DEALLOCATE (ftrig1)
    1898        15918 :       DEALLOCATE (after1)
    1899        15918 :       DEALLOCATE (now1)
    1900        15918 :       DEALLOCATE (before1)
    1901        15918 :       DEALLOCATE (btrig2)
    1902        15918 :       DEALLOCATE (ftrig2)
    1903        15918 :       DEALLOCATE (after2)
    1904        15918 :       DEALLOCATE (now2)
    1905        15918 :       DEALLOCATE (before2)
    1906        15918 :       DEALLOCATE (btrig3)
    1907        15918 :       DEALLOCATE (ftrig3)
    1908        15918 :       DEALLOCATE (after3)
    1909        15918 :       DEALLOCATE (now3)
    1910        15918 :       DEALLOCATE (before3)
    1911        15918 :       DEALLOCATE (zmpi2)
    1912        15918 :       DEALLOCATE (zw)
    1913        15918 :       DEALLOCATE (zt)
    1914        15918 :       DEALLOCATE (cosinarr)
    1915        15918 :       IF (nproc > 1) DEALLOCATE (zmpi1)
    1916              : 
    1917        15918 :    END SUBROUTINE F_PoissonSolver
    1918              : 
    1919              : ! **************************************************************************************************
    1920              : !> \brief ...
    1921              : !> \param nfft ...
    1922              : !> \param n2 ...
    1923              : !> \param lot ...
    1924              : !> \param n1 ...
    1925              : !> \param lzt ...
    1926              : !> \param zt ...
    1927              : !> \param zw ...
    1928              : ! **************************************************************************************************
    1929      1835793 :    SUBROUTINE switch_upcorn(nfft, n2, lot, n1, lzt, zt, zw)
    1930              :       INTEGER                                            :: nfft, n2, lot, n1, lzt
    1931              :       REAL(KIND=dp)                                      :: zt(2, lzt, n1), zw(2, lot, n2)
    1932              : 
    1933              :       INTEGER                                            :: i, j
    1934              : 
    1935              : ! WARNING: Assuming that high frequencies are in the corners
    1936              : !          and that n2 is multiple of 2
    1937              : ! Low frequencies
    1938              : 
    1939     36563963 :       DO j = 1, nfft
    1940   1631989261 :          DO i = n2/2 + 1, n2
    1941   1595425298 :             zw(1, j, i) = zt(1, i - n2/2, j)
    1942   1630153468 :             zw(2, j, i) = zt(2, i - n2/2, j)
    1943              :          END DO
    1944              :       END DO
    1945              :       ! High frequencies
    1946     95424510 :       DO i = 1, n2/2
    1947   1690849808 :          DO j = 1, nfft
    1948   1595425298 :             zw(1, j, i) = 0._dp
    1949   1689014015 :             zw(2, j, i) = 0._dp
    1950              :          END DO
    1951              :       END DO
    1952      1835793 :    END SUBROUTINE switch_upcorn
    1953              : 
    1954              : ! **************************************************************************************************
    1955              : !> \brief ...
    1956              : !> \param j3 ...
    1957              : !> \param nfft ...
    1958              : !> \param Jp2stb ...
    1959              : !> \param J2stb ...
    1960              : !> \param lot ...
    1961              : !> \param n1 ...
    1962              : !> \param md2 ...
    1963              : !> \param nd3 ...
    1964              : !> \param nproc ...
    1965              : !> \param zmpi1 ...
    1966              : !> \param zw ...
    1967              : ! **************************************************************************************************
    1968       982584 :    SUBROUTINE mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw)
    1969              :       INTEGER                                            :: j3, nfft, Jp2stb, J2stb, lot, n1, md2, &
    1970              :                                                             nd3, nproc
    1971              :       REAL(KIND=dp) :: zmpi1(2, n1/2, md2/nproc, nd3/nproc, nproc), zw(2, lot, n1)
    1972              : 
    1973              :       INTEGER                                            :: i1, j2, jp2, mfft
    1974              : 
    1975              : ! WARNING: Assuming that high frequencies are in the corners
    1976              : !          and that n1 is multiple of 2
    1977              : 
    1978       982584 :       mfft = 0
    1979      1563579 :       Main: DO Jp2 = Jp2stb, nproc
    1980     18539093 :          DO J2 = J2stb, md2/nproc
    1981     17958098 :             mfft = mfft + 1
    1982     17958098 :             IF (mfft > nfft) THEN
    1983       594013 :                Jp2stb = Jp2
    1984       594013 :                J2stb = J2
    1985       594013 :                EXIT Main
    1986              :             END IF
    1987    815076734 :             DO I1 = 1, n1/2
    1988    797712649 :                zw(1, mfft, I1) = 0._dp
    1989    815076734 :                zw(2, mfft, I1) = 0._dp
    1990              :             END DO
    1991    815657729 :             DO I1 = n1/2 + 1, n1
    1992    797712649 :                zw(1, mfft, I1) = zmpi1(1, I1 - n1/2, J2, j3, Jp2)
    1993    815076734 :                zw(2, mfft, I1) = zmpi1(2, I1 - n1/2, J2, j3, Jp2)
    1994              :             END DO
    1995              :          END DO
    1996       969566 :          J2stb = 1
    1997              :       END DO Main
    1998       982584 :    END SUBROUTINE mpiswitch_upcorn
    1999              : 
    2000              : ! **************************************************************************************************
    2001              : !> \brief ...
    2002              : !> \param nfft ...
    2003              : !> \param n2 ...
    2004              : !> \param lot ...
    2005              : !> \param n1 ...
    2006              : !> \param lzt ...
    2007              : !> \param zw ...
    2008              : !> \param zt ...
    2009              : ! **************************************************************************************************
    2010      1835793 :    SUBROUTINE unswitch_downcorn(nfft, n2, lot, n1, lzt, zw, zt)
    2011              :       INTEGER                                            :: nfft, n2, lot, n1, lzt
    2012              :       REAL(KIND=dp)                                      :: zw(2, lot, n2), zt(2, lzt, n1)
    2013              : 
    2014              :       INTEGER                                            :: i, j
    2015              : 
    2016              : ! WARNING: Assuming that high frequencies are in the corners
    2017              : !          and that n2 is multiple of 2
    2018              : ! Low frequencies
    2019              : 
    2020     36563963 :       DO j = 1, nfft
    2021   1631989261 :          DO i = 1, n2/2
    2022   1595425298 :             zt(1, i, j) = zw(1, j, i)
    2023   1630153468 :             zt(2, i, j) = zw(2, j, i)
    2024              :          END DO
    2025              :       END DO
    2026      1835793 :       RETURN
    2027              :    END SUBROUTINE unswitch_downcorn
    2028              : 
    2029              : ! **************************************************************************************************
    2030              : !> \brief ...
    2031              : !> \param j3 ...
    2032              : !> \param nfft ...
    2033              : !> \param Jp2stf ...
    2034              : !> \param J2stf ...
    2035              : !> \param lot ...
    2036              : !> \param n1 ...
    2037              : !> \param md2 ...
    2038              : !> \param nd3 ...
    2039              : !> \param nproc ...
    2040              : !> \param zw ...
    2041              : !> \param zmpi1 ...
    2042              : ! **************************************************************************************************
    2043       982584 :    SUBROUTINE unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw, zmpi1)
    2044              :       INTEGER                                            :: j3, nfft, Jp2stf, J2stf, lot, n1, md2, &
    2045              :                                                             nd3, nproc
    2046              :       REAL(KIND=dp) :: zw(2, lot, n1), zmpi1(2, n1/2, md2/nproc, nd3/nproc, nproc)
    2047              : 
    2048              :       INTEGER                                            :: i1, j2, jp2, mfft
    2049              : 
    2050              : ! WARNING: Assuming that high frequencies are in the corners
    2051              : !          and that n1 is multiple of 2
    2052              : 
    2053       982584 :       mfft = 0
    2054      1563579 :       Main: DO Jp2 = Jp2stf, nproc
    2055     18539093 :          DO J2 = J2stf, md2/nproc
    2056     17958098 :             mfft = mfft + 1
    2057     17958098 :             IF (mfft > nfft) THEN
    2058       594013 :                Jp2stf = Jp2
    2059       594013 :                J2stf = J2
    2060       594013 :                EXIT Main
    2061              :             END IF
    2062    815657729 :             DO I1 = 1, n1/2
    2063    797712649 :                zmpi1(1, I1, J2, j3, Jp2) = zw(1, mfft, I1)
    2064    815076734 :                zmpi1(2, I1, J2, j3, Jp2) = zw(2, mfft, I1)
    2065              :             END DO
    2066              :          END DO
    2067       969566 :          J2stf = 1
    2068              :       END DO Main
    2069       982584 :    END SUBROUTINE unmpiswitch_downcorn
    2070              : 
    2071              : ! **************************************************************************************************
    2072              : !> \brief (Based on suitable modifications of S.Goedecker routines)
    2073              : !>      Restore data into output array, calculating in the meanwhile
    2074              : !>      Hartree energy of the potential
    2075              : !> \param md1 Dimensions of the undistributed part of the real grid
    2076              : !> \param md3 Dimensions of the undistributed part of the real grid
    2077              : !> \param lot ...
    2078              : !> \param nfft number of planes
    2079              : !> \param n3 (twice the) dimension of the last FFTtransform.
    2080              : !> \param zw FFT work array
    2081              : !> \param zf Original distributed density as well as
    2082              : !>                   Distributed solution of the poisson equation (inout)
    2083              : !> \param scal Needed to achieve unitarity and correct dimensions
    2084              : !> \param ehartreetmp Hartree energy
    2085              : !> \date February 2006
    2086              : !> \author S. Goedecker, L. Genovese
    2087              : !> \note Assuming that high frequencies are in the corners
    2088              : !>      and that n3 is multiple of 4
    2089              : !>
    2090              : !>  RESTRICTIONS on USAGE
    2091              : !>      Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
    2092              : !>      Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
    2093              : !>      Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
    2094              : !>      This file is distributed under the terms of the
    2095              : !>      GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
    2096              : ! **************************************************************************************************
    2097            0 :    SUBROUTINE F_unfill_downcorn(md1, md3, lot, nfft, n3, zw, zf, scal, ehartreetmp)
    2098              :       INTEGER, INTENT(in)                                :: md1, md3, lot, nfft, n3
    2099              :       REAL(KIND=dp), DIMENSION(2, lot, n3/2), INTENT(in) :: zw
    2100              :       REAL(KIND=dp), DIMENSION(md1, md3), INTENT(inout)  :: zf
    2101              :       REAL(KIND=dp), INTENT(in)                          :: scal
    2102              :       REAL(KIND=dp), INTENT(out)                         :: ehartreetmp
    2103              : 
    2104              :       INTEGER                                            :: i1, i3
    2105              :       REAL(KIND=dp)                                      :: pot1
    2106              : 
    2107            0 :       ehartreetmp = 0._dp
    2108            0 :       DO i3 = 1, n3/4
    2109            0 :          DO i1 = 1, nfft
    2110            0 :             pot1 = scal*zw(1, i1, i3)
    2111            0 :             ehartreetmp = ehartreetmp + pot1*zf(i1, 2*i3 - 1)
    2112            0 :             zf(i1, 2*i3 - 1) = pot1
    2113            0 :             pot1 = scal*zw(2, i1, i3)
    2114            0 :             ehartreetmp = ehartreetmp + pot1*zf(i1, 2*i3)
    2115            0 :             zf(i1, 2*i3) = pot1
    2116              :          END DO
    2117              :       END DO
    2118            0 :    END SUBROUTINE F_unfill_downcorn
    2119              : 
    2120              : END MODULE ps_wavelet_base
        

Generated by: LCOV version 2.0-1