Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2026 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief Creates the wavelet kernel for the wavelet based poisson solver.
10 : !> \author Florian Schiffmann (09.2007,fschiff)
11 : ! **************************************************************************************************
12 : MODULE ps_wavelet_base
13 :
14 : USE kinds, ONLY: dp
15 : USE message_passing, ONLY: mp_comm_type
16 : USE ps_wavelet_fft3d, ONLY: ctrig,&
17 : ctrig_length,&
18 : fftstp
19 : #include "../base/base_uses.f90"
20 :
21 : IMPLICIT NONE
22 :
23 : PRIVATE
24 :
25 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'ps_wavelet_base'
26 :
27 : PUBLIC :: scramble_unpack, p_poissonsolver, s_poissonsolver, f_poissonsolver
28 :
29 : CONTAINS
30 :
31 : ! **************************************************************************************************
32 : !> \brief ...
33 : !> \param n1 ...
34 : !> \param n2 ...
35 : !> \param n3 ...
36 : !> \param nd1 ...
37 : !> \param nd2 ...
38 : !> \param nd3 ...
39 : !> \param md1 ...
40 : !> \param md2 ...
41 : !> \param md3 ...
42 : !> \param nproc ...
43 : !> \param iproc ...
44 : !> \param zf ...
45 : !> \param scal ...
46 : !> \param hx ...
47 : !> \param hy ...
48 : !> \param hz ...
49 : !> \param mpi_group ...
50 : ! **************************************************************************************************
51 17323 : SUBROUTINE P_PoissonSolver(n1, n2, n3, nd1, nd2, nd3, md1, md2, md3, nproc, iproc, zf &
52 : , scal, hx, hy, hz, mpi_group)
53 : INTEGER, INTENT(in) :: n1, n2, n3, nd1, nd2, nd3, md1, md2, &
54 : md3, nproc, iproc
55 : REAL(KIND=dp), DIMENSION(md1, md3, md2/nproc), &
56 : INTENT(inout) :: zf
57 : REAL(KIND=dp), INTENT(in) :: scal, hx, hy, hz
58 :
59 : CLASS(mp_comm_type), INTENT(in) :: mpi_group
60 :
61 : INTEGER, PARAMETER :: ncache_optimal = 8*1024
62 :
63 : INTEGER :: i, i1, i3, ic1, ic2, ic3, inzee, j, j2, &
64 : J2stb, J2stf, j3, Jp2stb, Jp2stf, lot, &
65 : lzt, ma, mb, ncache, nfft
66 17323 : INTEGER, ALLOCATABLE, DIMENSION(:) :: after1, after2, after3, before1, &
67 17323 : before2, before3, now1, now2, now3
68 17323 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: btrig1, btrig2, btrig3, ftrig1, ftrig2, &
69 17323 : ftrig3
70 17323 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: zt, zw
71 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :) :: zmpi2
72 : REAL(KIND=dp), ALLOCATABLE, &
73 17323 : DIMENSION(:, :, :, :, :) :: zmpi1
74 :
75 17323 : IF (nd1 < n1/2 + 1) CPABORT("Parallel convolution:ERROR:nd1")
76 17323 : IF (nd2 < n2/2 + 1) CPABORT("Parallel convolution:ERROR:nd2")
77 17323 : IF (nd3 < n3/2 + 1) CPABORT("Parallel convolution:ERROR:nd3")
78 17323 : IF (md1 < n1) CPABORT("Parallel convolution:ERROR:md1")
79 17323 : IF (md2 < n2) CPABORT("Parallel convolution:ERROR:md2")
80 17323 : IF (md3 < n3) CPABORT("Parallel convolution:ERROR:md3")
81 17323 : IF (MOD(nd3, nproc) /= 0) CPABORT("Parallel convolution:ERROR:nd3")
82 17323 : IF (MOD(md2, nproc) /= 0) CPABORT("Parallel convolution:ERROR:md2")
83 :
84 : !defining work arrays dimensions
85 17323 : ncache = ncache_optimal
86 17323 : IF (ncache <= MAX(n1, n2, n3)*4) ncache = MAX(n1, n2, n3)*4
87 :
88 17323 : lzt = n2
89 17323 : IF (MOD(n2, 2) == 0) lzt = lzt + 1
90 17323 : IF (MOD(n2, 4) == 0) lzt = lzt + 1 !maybe this is useless
91 :
92 : !Allocations
93 17323 : ALLOCATE (btrig1(2, ctrig_length))
94 17323 : ALLOCATE (ftrig1(2, ctrig_length))
95 17323 : ALLOCATE (after1(7))
96 17323 : ALLOCATE (now1(7))
97 17323 : ALLOCATE (before1(7))
98 17323 : ALLOCATE (btrig2(2, ctrig_length))
99 17323 : ALLOCATE (ftrig2(2, ctrig_length))
100 17323 : ALLOCATE (after2(7))
101 17323 : ALLOCATE (now2(7))
102 17323 : ALLOCATE (before2(7))
103 17323 : ALLOCATE (btrig3(2, ctrig_length))
104 17323 : ALLOCATE (ftrig3(2, ctrig_length))
105 17323 : ALLOCATE (after3(7))
106 17323 : ALLOCATE (now3(7))
107 17323 : ALLOCATE (before3(7))
108 69292 : ALLOCATE (zw(2, ncache/4, 2))
109 69292 : ALLOCATE (zt(2, lzt, n1))
110 86615 : ALLOCATE (zmpi2(2, n1, md2/nproc, nd3))
111 41353 : IF (nproc > 1) ALLOCATE (zmpi1(2, n1, md2/nproc, nd3/nproc, nproc))
112 :
113 : !calculating the FFT work arrays (beware on the HalFFT in n3 dimension)
114 17323 : CALL ctrig(n3, btrig3, after3, before3, now3, 1, ic3)
115 17323 : CALL ctrig(n1, btrig1, after1, before1, now1, 1, ic1)
116 17323 : CALL ctrig(n2, btrig2, after2, before2, now2, 1, ic2)
117 375192 : DO j = 1, n1
118 357869 : ftrig1(1, j) = btrig1(1, j)
119 375192 : ftrig1(2, j) = -btrig1(2, j)
120 : END DO
121 375192 : DO j = 1, n2
122 357869 : ftrig2(1, j) = btrig2(1, j)
123 375192 : ftrig2(2, j) = -btrig2(2, j)
124 : END DO
125 375192 : DO j = 1, n3
126 357869 : ftrig3(1, j) = btrig3(1, j)
127 375192 : ftrig3(2, j) = -btrig3(2, j)
128 : END DO
129 :
130 : ! transform along z axis
131 17323 : lot = ncache/(4*n3)
132 17323 : IF (lot < 1) THEN
133 : CALL cp_abort(__LOCATION__, &
134 : 'convolxc_off:ncache has to be enlarged to be able to hold at '// &
135 : 'least one 1-d FFT of this size even though this will '// &
136 0 : 'reduce the performance for shorter transform lengths')
137 : END IF
138 320326 : DO j2 = 1, md2/nproc
139 : !this condition ensures that we manage only the interesting part for the FFT
140 320326 : IF (iproc*(md2/nproc) + j2 <= n2) THEN
141 613678 : DO i1 = 1, n1, lot
142 311348 : ma = i1
143 311348 : mb = MIN(i1 + (lot - 1), n1)
144 311348 : nfft = mb - ma + 1
145 : !inserting real data into complex array of half length
146 311348 : CALL P_fill_upcorn(md1, md3, lot, nfft, n3, zf(i1, 1, j2), zw(1, 1, 1))
147 :
148 : !performing FFT
149 : !input: I1,I3,J2,(Jp2)
150 311348 : inzee = 1
151 971439 : DO i = 1, ic3
152 : CALL fftstp(lot, nfft, n3, lot, n3, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
153 660091 : btrig3, after3(i), now3(i), before3(i), 1)
154 971439 : inzee = 3 - inzee
155 : END DO
156 :
157 : !output: I1,i3,J2,(Jp2)
158 : !exchanging components
159 : !input: I1,i3,J2,(Jp2)
160 613678 : CALL scramble_P(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zw(1, 1, inzee), zmpi2)
161 : !output: I1,J2,i3,(Jp2)
162 : END DO
163 : END IF
164 : END DO
165 :
166 : !Interprocessor data transposition
167 : !input: I1,J2,j3,jp3,(Jp2)
168 17323 : IF (nproc > 1) THEN
169 : !communication scheduling
170 4806 : CALL mpi_group%alltoall(zmpi2, zmpi1, 2*n1*(md2/nproc)*(nd3/nproc))
171 : END IF
172 : !output: I1,J2,j3,Jp2,(jp3)
173 :
174 : !now each process perform complete convolution of its planes
175 182731 : DO j3 = 1, nd3/nproc
176 : !this condition ensures that we manage only the interesting part for the FFT
177 182731 : IF (iproc*(nd3/nproc) + j3 <= n3/2 + 1) THEN
178 164470 : Jp2stb = 1
179 164470 : J2stb = 1
180 164470 : Jp2stf = 1
181 164470 : J2stf = 1
182 :
183 : ! transform along x axis
184 164470 : lot = ncache/(4*n1)
185 164470 : IF (lot < 1) THEN
186 : CALL cp_abort(__LOCATION__, &
187 : 'convolxc_off:ncache has to be enlarged to be able to hold at '// &
188 : 'least one 1-d FFT of this size even though this will '// &
189 0 : 'reduce the performance for shorter transform lengths')
190 : END IF
191 :
192 333616 : DO j = 1, n2, lot
193 169146 : ma = j
194 169146 : mb = MIN(j + (lot - 1), n2)
195 169146 : nfft = mb - ma + 1
196 :
197 : !reverse index ordering, leaving the planes to be transformed at the end
198 : !input: I1,J2,j3,Jp2,(jp3)
199 169146 : IF (nproc == 1) THEN
200 136958 : CALL P_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi2, zw(1, 1, 1))
201 : ELSE
202 32188 : CALL P_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw(1, 1, 1))
203 : END IF
204 : !output: J2,Jp2,I1,j3,(jp3)
205 :
206 : !performing FFT
207 : !input: I2,I1,j3,(jp3)
208 169146 : inzee = 1
209 357848 : DO i = 1, ic1 - 1
210 : CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
211 188702 : btrig1, after1(i), now1(i), before1(i), 1)
212 357848 : inzee = 3 - inzee
213 : END DO
214 :
215 : !storing the last step into zt array
216 169146 : i = ic1
217 : CALL fftstp(lot, nfft, n1, lzt, n1, zw(1, 1, inzee), zt(1, j, 1), &
218 333616 : btrig1, after1(i), now1(i), before1(i), 1)
219 : !output: I2,i1,j3,(jp3)
220 : END DO
221 :
222 : !transform along y axis
223 164470 : lot = ncache/(4*n2)
224 164470 : IF (lot < 1) THEN
225 : CALL cp_abort(__LOCATION__, &
226 : 'convolxc_off:ncache has to be enlarged to be able to hold at '// &
227 : 'least one 1-d FFT of this size even though this will '// &
228 0 : 'reduce the performance for shorter transform lengths')
229 : END IF
230 :
231 333616 : DO j = 1, n1, lot
232 169146 : ma = j
233 169146 : mb = MIN(j + (lot - 1), n1)
234 169146 : nfft = mb - ma + 1
235 :
236 : !reverse ordering
237 : !input: I2,i1,j3,(jp3)
238 169146 : CALL P_switch_upcorn(nfft, n2, lot, n1, lzt, zt(1, 1, j), zw(1, 1, 1))
239 : !output: i1,I2,j3,(jp3)
240 :
241 : !performing FFT
242 : !input: i1,I2,j3,(jp3)
243 169146 : inzee = 1
244 526994 : DO i = 1, ic2
245 : CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
246 357848 : btrig2, after2(i), now2(i), before2(i), 1)
247 526994 : inzee = 3 - inzee
248 : END DO
249 : !output: i1,i2,j3,(jp3)
250 :
251 : !Multiply with kernel in fourier space
252 169146 : i3 = iproc*(nd3/nproc) + j3
253 169146 : CALL P_multkernel(n1, n2, n3, lot, nfft, j, i3, zw(1, 1, inzee), hx, hy, hz)
254 :
255 : !TRANSFORM BACK IN REAL SPACE
256 :
257 : !transform along y axis
258 : !input: i1,i2,j3,(jp3)
259 526994 : DO i = 1, ic2
260 : CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
261 357848 : ftrig2, after2(i), now2(i), before2(i), -1)
262 526994 : inzee = 3 - inzee
263 : END DO
264 :
265 : !reverse ordering
266 : !input: i1,I2,j3,(jp3)
267 333616 : CALL P_unswitch_downcorn(nfft, n2, lot, n1, lzt, zw(1, 1, inzee), zt(1, 1, j))
268 : !output: I2,i1,j3,(jp3)
269 : END DO
270 :
271 : !transform along x axis
272 : !input: I2,i1,j3,(jp3)
273 164470 : lot = ncache/(4*n1)
274 333616 : DO j = 1, n2, lot
275 169146 : ma = j
276 169146 : mb = MIN(j + (lot - 1), n2)
277 169146 : nfft = mb - ma + 1
278 :
279 : !performing FFT
280 169146 : i = 1
281 : CALL fftstp(lzt, nfft, n1, lot, n1, zt(1, j, 1), zw(1, 1, 1), &
282 169146 : ftrig1, after1(i), now1(i), before1(i), -1)
283 :
284 169146 : inzee = 1
285 357848 : DO i = 2, ic1
286 : CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
287 188702 : ftrig1, after1(i), now1(i), before1(i), -1)
288 357848 : inzee = 3 - inzee
289 : END DO
290 : !output: I2,I1,j3,(jp3)
291 :
292 : !reverse ordering
293 : !input: J2,Jp2,I1,j3,(jp3)
294 333616 : IF (nproc == 1) THEN
295 136958 : CALL P_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi2)
296 : ELSE
297 32188 : CALL P_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi1)
298 : END IF
299 : ! output: I1,J2,j3,Jp2,(jp3)
300 : END DO
301 : END IF
302 : END DO
303 :
304 : !Interprocessor data transposition
305 : !input: I1,J2,j3,Jp2,(jp3)
306 17323 : IF (nproc > 1) THEN
307 : !communication scheduling
308 4806 : CALL mpi_group%alltoall(zmpi1, zmpi2, 2*n1*(md2/nproc)*(nd3/nproc))
309 : END IF
310 : !output: I1,J2,j3,jp3,(Jp2)
311 : !transform along z axis
312 : !input: I1,J2,i3,(Jp2)
313 17323 : lot = ncache/(4*n3)
314 320326 : DO j2 = 1, md2/nproc
315 : !this condition ensures that we manage only the interesting part for the FFT
316 320326 : IF (iproc*(md2/nproc) + j2 <= n2) THEN
317 613678 : DO i1 = 1, n1, lot
318 311348 : ma = i1
319 311348 : mb = MIN(i1 + (lot - 1), n1)
320 311348 : nfft = mb - ma + 1
321 :
322 : !reverse ordering
323 : !input: I1,J2,i3,(Jp2)
324 311348 : CALL unscramble_P(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zmpi2, zw(1, 1, 1))
325 : !output: I1,i3,J2,(Jp2)
326 :
327 : !performing FFT
328 : !input: I1,i3,J2,(Jp2)
329 311348 : inzee = 1
330 971439 : DO i = 1, ic3
331 : CALL fftstp(lot, nfft, n3, lot, n3, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
332 660091 : ftrig3, after3(i), now3(i), before3(i), -1)
333 971439 : inzee = 3 - inzee
334 : END DO
335 : !output: I1,I3,J2,(Jp2)
336 :
337 : !rebuild the output array
338 613678 : CALL P_unfill_downcorn(md1, md3, lot, nfft, n3, zw(1, 1, inzee), zf(i1, 1, j2), scal)
339 :
340 : END DO
341 : END IF
342 : END DO
343 :
344 : !De-allocations
345 17323 : DEALLOCATE (btrig1)
346 17323 : DEALLOCATE (ftrig1)
347 17323 : DEALLOCATE (after1)
348 17323 : DEALLOCATE (now1)
349 17323 : DEALLOCATE (before1)
350 17323 : DEALLOCATE (btrig2)
351 17323 : DEALLOCATE (ftrig2)
352 17323 : DEALLOCATE (after2)
353 17323 : DEALLOCATE (now2)
354 17323 : DEALLOCATE (before2)
355 17323 : DEALLOCATE (btrig3)
356 17323 : DEALLOCATE (ftrig3)
357 17323 : DEALLOCATE (after3)
358 17323 : DEALLOCATE (now3)
359 17323 : DEALLOCATE (before3)
360 17323 : DEALLOCATE (zmpi2)
361 17323 : DEALLOCATE (zw)
362 17323 : DEALLOCATE (zt)
363 17323 : IF (nproc > 1) DEALLOCATE (zmpi1)
364 : ! call timing(iproc,'PSolv_comput ','OF')
365 17323 : END SUBROUTINE P_PoissonSolver
366 :
367 : ! **************************************************************************************************
368 : !> \brief ...
369 : !> \param j3 ...
370 : !> \param nfft ...
371 : !> \param Jp2stb ...
372 : !> \param J2stb ...
373 : !> \param lot ...
374 : !> \param n1 ...
375 : !> \param md2 ...
376 : !> \param nd3 ...
377 : !> \param nproc ...
378 : !> \param zmpi1 ...
379 : !> \param zw ...
380 : ! **************************************************************************************************
381 169146 : SUBROUTINE P_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw)
382 : INTEGER, INTENT(in) :: j3, nfft
383 : INTEGER, INTENT(inout) :: Jp2stb, J2stb
384 : INTEGER, INTENT(in) :: lot, n1, md2, nd3, nproc
385 : REAL(KIND=dp), &
386 : DIMENSION(2, n1, md2/nproc, nd3/nproc, nproc), &
387 : INTENT(in) :: zmpi1
388 : REAL(KIND=dp), DIMENSION(2, lot, n1), &
389 : INTENT(inout) :: zw
390 :
391 : INTEGER :: I1, J2, Jp2, mfft
392 :
393 169146 : mfft = 0
394 354650 : DO Jp2 = Jp2stb, nproc
395 3840810 : DO J2 = J2stb, md2/nproc
396 3655306 : mfft = mfft + 1
397 3655306 : IF (mfft > nfft) THEN
398 13478 : Jp2stb = Jp2
399 13478 : J2stb = J2
400 13478 : RETURN
401 : END IF
402 95925808 : DO I1 = 1, n1
403 92098476 : zw(1, mfft, I1) = zmpi1(1, I1, J2, j3, Jp2)
404 95740304 : zw(2, mfft, I1) = zmpi1(2, I1, J2, j3, Jp2)
405 : END DO
406 : END DO
407 341172 : J2stb = 1
408 : END DO
409 : END SUBROUTINE P_mpiswitch_upcorn
410 :
411 : ! **************************************************************************************************
412 : !> \brief ...
413 : !> \param nfft ...
414 : !> \param n2 ...
415 : !> \param lot ...
416 : !> \param n1 ...
417 : !> \param lzt ...
418 : !> \param zt ...
419 : !> \param zw ...
420 : ! **************************************************************************************************
421 169146 : SUBROUTINE P_switch_upcorn(nfft, n2, lot, n1, lzt, zt, zw)
422 : INTEGER, INTENT(in) :: nfft, n2, lot, n1, lzt
423 : REAL(KIND=dp), DIMENSION(2, lzt, n1), INTENT(in) :: zt
424 : REAL(KIND=dp), DIMENSION(2, lot, n2), &
425 : INTENT(inout) :: zw
426 :
427 : INTEGER :: i, j
428 :
429 3810974 : DO j = 1, nfft
430 95909450 : DO i = 1, n2
431 92098476 : zw(1, j, i) = zt(1, i, j)
432 95740304 : zw(2, j, i) = zt(2, i, j)
433 : END DO
434 : END DO
435 :
436 169146 : END SUBROUTINE P_switch_upcorn
437 :
438 : ! **************************************************************************************************
439 : !> \brief ...
440 : !> \param nfft ...
441 : !> \param n2 ...
442 : !> \param lot ...
443 : !> \param n1 ...
444 : !> \param lzt ...
445 : !> \param zw ...
446 : !> \param zt ...
447 : ! **************************************************************************************************
448 169146 : SUBROUTINE P_unswitch_downcorn(nfft, n2, lot, n1, lzt, zw, zt)
449 : INTEGER, INTENT(in) :: nfft, n2, lot, n1, lzt
450 : REAL(KIND=dp), DIMENSION(2, lot, n2), INTENT(in) :: zw
451 : REAL(KIND=dp), DIMENSION(2, lzt, n1), &
452 : INTENT(inout) :: zt
453 :
454 : INTEGER :: i, j
455 :
456 3810974 : DO j = 1, nfft
457 95909450 : DO i = 1, n2
458 92098476 : zt(1, i, j) = zw(1, j, i)
459 95740304 : zt(2, i, j) = zw(2, j, i)
460 : END DO
461 : END DO
462 :
463 169146 : END SUBROUTINE P_unswitch_downcorn
464 :
465 : ! **************************************************************************************************
466 : !> \brief ...
467 : !> \param j3 ...
468 : !> \param nfft ...
469 : !> \param Jp2stf ...
470 : !> \param J2stf ...
471 : !> \param lot ...
472 : !> \param n1 ...
473 : !> \param md2 ...
474 : !> \param nd3 ...
475 : !> \param nproc ...
476 : !> \param zw ...
477 : !> \param zmpi1 ...
478 : ! **************************************************************************************************
479 169146 : SUBROUTINE P_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw, zmpi1)
480 : INTEGER, INTENT(in) :: j3, nfft
481 : INTEGER, INTENT(inout) :: Jp2stf, J2stf
482 : INTEGER, INTENT(in) :: lot, n1, md2, nd3, nproc
483 : REAL(KIND=dp), DIMENSION(2, lot, n1), INTENT(in) :: zw
484 : REAL(KIND=dp), &
485 : DIMENSION(2, n1, md2/nproc, nd3/nproc, nproc), &
486 : INTENT(inout) :: zmpi1
487 :
488 : INTEGER :: I1, J2, Jp2, mfft
489 :
490 169146 : mfft = 0
491 354650 : DO Jp2 = Jp2stf, nproc
492 3840810 : DO J2 = J2stf, md2/nproc
493 3655306 : mfft = mfft + 1
494 3655306 : IF (mfft > nfft) THEN
495 13478 : Jp2stf = Jp2
496 13478 : J2stf = J2
497 13478 : RETURN
498 : END IF
499 95925808 : DO I1 = 1, n1
500 92098476 : zmpi1(1, I1, J2, j3, Jp2) = zw(1, mfft, I1)
501 95740304 : zmpi1(2, I1, J2, j3, Jp2) = zw(2, mfft, I1)
502 : END DO
503 : END DO
504 341172 : J2stf = 1
505 : END DO
506 : END SUBROUTINE P_unmpiswitch_downcorn
507 :
508 : ! **************************************************************************************************
509 : !> \brief (Based on suitable modifications of S.Goedecker routines)
510 : !> Restore data into output array
511 : !> \param md1 Dimensions of the undistributed part of the real grid
512 : !> \param md3 Dimensions of the undistributed part of the real grid
513 : !> \param lot ...
514 : !> \param nfft number of planes
515 : !> \param n3 (twice the) dimension of the last FFTtransform
516 : !> \param zw FFT work array
517 : !> \param zf Original distributed density as well as
518 : !> Distributed solution of the poisson equation (inout)
519 : !> \param scal Needed to achieve unitarity and correct dimensions
520 : !> \date February 2006
521 : !> \author S. Goedecker, L. Genovese
522 : !> \note Assuming that high frequencies are in the corners
523 : !> and that n3 is multiple of 4
524 : !>
525 : !> RESTRICTIONS on USAGE
526 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
527 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
528 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
529 : !> This file is distributed under the terms of the
530 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
531 : ! **************************************************************************************************
532 311348 : SUBROUTINE P_unfill_downcorn(md1, md3, lot, nfft, n3, zw, zf, scal)
533 : INTEGER, INTENT(in) :: md1, md3, lot, nfft, n3
534 : REAL(KIND=dp), DIMENSION(2, lot, n3), INTENT(in) :: zw
535 : REAL(KIND=dp), DIMENSION(md1, md3), INTENT(inout) :: zf
536 : REAL(KIND=dp), INTENT(in) :: scal
537 :
538 : INTEGER :: i1, i3
539 : REAL(KIND=dp) :: pot1
540 :
541 7534542 : DO i3 = 1, n3
542 179338112 : DO i1 = 1, nfft
543 171803570 : pot1 = scal*zw(1, i1, i3)
544 179026764 : zf(i1, i3) = pot1
545 : END DO
546 : END DO
547 :
548 311348 : END SUBROUTINE P_unfill_downcorn
549 :
550 : ! **************************************************************************************************
551 : !> \brief ...
552 : !> \param md1 ...
553 : !> \param md3 ...
554 : !> \param lot ...
555 : !> \param nfft ...
556 : !> \param n3 ...
557 : !> \param zf ...
558 : !> \param zw ...
559 : ! **************************************************************************************************
560 311348 : SUBROUTINE P_fill_upcorn(md1, md3, lot, nfft, n3, zf, zw)
561 : INTEGER, INTENT(in) :: md1, md3, lot, nfft, n3
562 : REAL(KIND=dp), DIMENSION(md1, md3), INTENT(in) :: zf
563 : REAL(KIND=dp), DIMENSION(2, lot, n3), &
564 : INTENT(inout) :: zw
565 :
566 : INTEGER :: i1, i3
567 :
568 7534542 : DO i3 = 1, n3
569 179338112 : DO i1 = 1, nfft
570 171803570 : zw(1, i1, i3) = zf(i1, i3)
571 179026764 : zw(2, i1, i3) = 0._dp
572 : END DO
573 : END DO
574 :
575 311348 : END SUBROUTINE P_fill_upcorn
576 :
577 : ! **************************************************************************************************
578 : !> \brief (Based on suitable modifications of S.Goedecker routines)
579 : !> Assign the correct planes to the work array zmpi2
580 : !> in order to prepare for interprocessor data transposition.
581 : !> \param i1 Starting points of the plane and number of remaining lines
582 : !> \param j2 Starting points of the plane and number of remaining lines
583 : !> \param lot Starting points of the plane and number of remaining lines
584 : !> \param nfft Starting points of the plane and number of remaining lines
585 : !> \param n1 logical dimension of the FFT transform, reference for work arrays
586 : !> \param n3 logical dimension of the FFT transform, reference for work arrays
587 : !> \param md2 Dimensions of real grid
588 : !> \param nproc ...
589 : !> \param nd3 Dimensions of the kernel
590 : !> \param zw Work array (input)
591 : !> \param zmpi2 Work array for multiprocessor manipulation (output)
592 : !> \date February 2006
593 : !> \author S. Goedecker, L. Genovese
594 : !> \note
595 : !> RESTRICTIONS on USAGE
596 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
597 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
598 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
599 : !> This file is distributed under the terms of the
600 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
601 : ! **************************************************************************************************
602 311348 : SUBROUTINE scramble_P(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zw, zmpi2)
603 : INTEGER, INTENT(in) :: i1, j2, lot, nfft, n1, n3, md2, nproc, &
604 : nd3
605 : REAL(KIND=dp), DIMENSION(2, lot, n3), INTENT(in) :: zw
606 : REAL(KIND=dp), DIMENSION(2, n1, md2/nproc, nd3), &
607 : INTENT(inout) :: zmpi2
608 :
609 : INTEGER :: i, i3
610 :
611 4205680 : DO i3 = 1, n3/2 + 1
612 96304156 : DO i = 0, nfft - 1
613 92098476 : zmpi2(1, i1 + i, j2, i3) = zw(1, i + 1, i3)
614 95992808 : zmpi2(2, i1 + i, j2, i3) = zw(2, i + 1, i3)
615 : END DO
616 : END DO
617 :
618 311348 : END SUBROUTINE scramble_P
619 :
620 : ! **************************************************************************************************
621 : !> \brief (Based on suitable modifications of S.Goedecker routines)
622 : !> Insert the correct planes of the work array zmpi2
623 : !> in order to prepare for backward FFT transform
624 : !> \param i1 Starting points of the plane and number of remaining lines
625 : !> \param j2 Starting points of the plane and number of remaining lines
626 : !> \param lot Starting points of the plane and number of remaining lines
627 : !> \param nfft Starting points of the plane and number of remaining lines
628 : !> \param n1 logical dimension of the FFT transform, reference for work arrays
629 : !> \param n3 logical dimension of the FFT transform, reference for work arrays
630 : !> \param md2 Dimensions of real grid
631 : !> \param nproc ...
632 : !> \param nd3 Dimensions of the kernel
633 : !> \param zmpi2 Work array for multiprocessor manipulation (output)
634 : !> \param zw Work array (input)
635 : !> \date February 2006
636 : !> \author S. Goedecker, L. Genovese
637 : !> \note
638 : !> RESTRICTIONS on USAGE
639 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
640 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
641 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
642 : !> This file is distributed under the terms of the
643 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
644 : ! **************************************************************************************************
645 311348 : SUBROUTINE unscramble_P(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zmpi2, zw)
646 : INTEGER, INTENT(in) :: i1, j2, lot, nfft, n1, n3, md2, nproc, &
647 : nd3
648 : REAL(KIND=dp), DIMENSION(2, n1, md2/nproc, nd3), &
649 : INTENT(in) :: zmpi2
650 : REAL(KIND=dp), DIMENSION(2, lot, n3), &
651 : INTENT(inout) :: zw
652 :
653 : INTEGER :: i, i3, j3
654 :
655 311348 : i3 = 1
656 7047570 : DO i = 0, nfft - 1
657 6736222 : zw(1, i + 1, i3) = zmpi2(1, i1 + i, j2, i3)
658 7047570 : zw(2, i + 1, i3) = zmpi2(2, i1 + i, j2, i3)
659 : END DO
660 :
661 3894332 : DO i3 = 2, n3/2 + 1
662 3582984 : j3 = n3 + 2 - i3
663 89256586 : DO i = 0, nfft - 1
664 85362254 : zw(1, i + 1, j3) = zmpi2(1, i1 + i, j2, i3)
665 85362254 : zw(2, i + 1, j3) = -zmpi2(2, i1 + i, j2, i3)
666 85362254 : zw(1, i + 1, i3) = zmpi2(1, i1 + i, j2, i3)
667 88945238 : zw(2, i + 1, i3) = zmpi2(2, i1 + i, j2, i3)
668 : END DO
669 : END DO
670 :
671 311348 : END SUBROUTINE unscramble_P
672 :
673 : ! **************************************************************************************************
674 : !> \brief (Based on suitable modifications of S.Goedecker routines)
675 : !> Multiply with the kernel taking into account its symmetry
676 : !> Conceived to be used into convolution loops
677 : !> \param n1 ...
678 : !> \param n2 ...
679 : !> \param n3 ...
680 : !> \param lot ...
681 : !> \param nfft ...
682 : !> \param jS ...
683 : !> \param i3 ...
684 : !> \param zw Work array (input/output)
685 : !> n1,n2: logical dimension of the FFT transform, reference for zw
686 : !> nd1,nd2: Dimensions of POT
687 : !> jS,j3,nfft: starting point of the plane and number of remaining lines
688 : !>
689 : !> RESTRICTIONS on USAGE
690 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
691 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
692 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
693 : !> This file is distributed under the terms of the
694 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
695 : !> \param hx ...
696 : !> \param hy ...
697 : !> \param hz ...
698 : !> \date February 2006
699 : !> \author S. Goedecker, L. Genovese
700 : ! **************************************************************************************************
701 169146 : SUBROUTINE P_multkernel(n1, n2, n3, lot, nfft, jS, i3, zw, hx, hy, hz)
702 : INTEGER, INTENT(in) :: n1, n2, n3, lot, nfft, jS, i3
703 : REAL(KIND=dp), DIMENSION(2, lot, n2), &
704 : INTENT(inout) :: zw
705 : REAL(KIND=dp), INTENT(in) :: hx, hy, hz
706 :
707 : INTEGER :: i1, i2, j1, j2, j3
708 : REAL(KIND=dp) :: fourpi2, ker, mu3, p1, p2, pi
709 :
710 169146 : pi = 4._dp*ATAN(1._dp)
711 169146 : fourpi2 = 4._dp*pi**2
712 169146 : j3 = i3 !n3/2+1-abs(n3/2+2-i3)
713 169146 : mu3 = REAL(j3 - 1, KIND=dp)/REAL(n3, KIND=dp)
714 169146 : mu3 = (mu3/hy)**2 !beware of the exchanged dimension
715 : !Body
716 : !generic case
717 4063478 : DO i2 = 1, n2
718 96161954 : DO i1 = 1, nfft
719 92098476 : j1 = i1 + jS - 1
720 92098476 : j1 = j1 - (j1/(n1/2 + 2))*n1 !n1/2+1-abs(n1/2+2-jS-i1)
721 92098476 : j2 = i2 - (i2/(n2/2 + 2))*n2 !n2/2+1-abs(n2/2+1-i2)
722 92098476 : p1 = REAL(j1 - 1, KIND=dp)/REAL(n1, KIND=dp)
723 92098476 : p2 = REAL(j2 - 1, KIND=dp)/REAL(n2, KIND=dp)
724 92098476 : ker = -fourpi2*((p1/hx)**2 + (p2/hz)**2 + mu3) !beware of the exchanged dimension
725 92098476 : IF (ker /= 0._dp) ker = 1._dp/ker
726 92098476 : zw(1, i1, i2) = zw(1, i1, i2)*ker
727 95992808 : zw(2, i1, i2) = zw(2, i1, i2)*ker
728 : END DO
729 : END DO
730 :
731 169146 : END SUBROUTINE P_multkernel
732 :
733 : ! **************************************************************************************************
734 : !> \brief (Based on suitable modifications of S.Goedecker routines)
735 : !> Multiply with the kernel taking into account its symmetry
736 : !> Conceived to be used into convolution loops
737 : !> \param nd1 ...
738 : !> \param nd2 ...
739 : !> \param n1 ...
740 : !> \param n2 ...
741 : !> \param lot ...
742 : !> \param nfft ...
743 : !> \param jS ...
744 : !> \param pot Kernel, symmetric and real, half the length
745 : !> \param zw Work array (input/output)
746 : !> n1,n2: logical dimension of the FFT transform, reference for zw
747 : !> nd1,nd2: Dimensions of POT
748 : !> jS, nfft: starting point of the plane and number of remaining lines
749 : !>
750 : !> RESTRICTIONS on USAGE
751 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
752 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
753 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
754 : !> This file is distributed under the terms of the
755 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
756 : !> \date February 2006
757 : !> \author S. Goedecker, L. Genovese
758 : ! **************************************************************************************************
759 1838763 : SUBROUTINE multkernel(nd1, nd2, n1, n2, lot, nfft, jS, pot, zw)
760 : INTEGER, INTENT(in) :: nd1, nd2, n1, n2, lot, nfft, jS
761 : REAL(KIND=dp), DIMENSION(nd1, nd2), INTENT(in) :: pot
762 : REAL(KIND=dp), DIMENSION(2, lot, n2), &
763 : INTENT(inout) :: zw
764 :
765 : INTEGER :: i2, j, j1, j2
766 :
767 36647123 : DO j = 1, nfft
768 34808360 : j1 = j + jS - 1
769 34808360 : j1 = j1 + (j1/(n1/2 + 2))*(n1 + 2 - 2*j1)
770 34808360 : zw(1, j, 1) = zw(1, j, 1)*pot(j1, 1)
771 36647123 : zw(2, j, 1) = zw(2, j, 1)*pot(j1, 1)
772 : END DO
773 :
774 : !generic case
775 93668907 : DO i2 = 2, n2/2
776 1656450975 : DO j = 1, nfft
777 1562782068 : j1 = j + jS - 1
778 1562782068 : j1 = j1 + (j1/(n1/2 + 2))*(n1 + 2 - 2*j1)
779 1562782068 : j2 = n2 + 2 - i2
780 1562782068 : zw(1, j, i2) = zw(1, j, i2)*pot(j1, i2)
781 1562782068 : zw(2, j, i2) = zw(2, j, i2)*pot(j1, i2)
782 1562782068 : zw(1, j, j2) = zw(1, j, j2)*pot(j1, i2)
783 1654612212 : zw(2, j, j2) = zw(2, j, j2)*pot(j1, i2)
784 : END DO
785 : END DO
786 :
787 : !case i2=n2/2+1
788 36647123 : DO j = 1, nfft
789 34808360 : j1 = j + jS - 1
790 34808360 : j1 = j1 + (j1/(n1/2 + 2))*(n1 + 2 - 2*j1)
791 34808360 : j2 = n2/2 + 1
792 34808360 : zw(1, j, j2) = zw(1, j, j2)*pot(j1, j2)
793 36647123 : zw(2, j, j2) = zw(2, j, j2)*pot(j1, j2)
794 : END DO
795 :
796 1838763 : END SUBROUTINE multkernel
797 :
798 : ! **************************************************************************************************
799 : !> \brief !HERE POT MUST BE THE KERNEL (BEWARE THE HALF DIMENSION)
800 : !> ****h* BigDFT/S_PoissonSolver
801 : !> (Based on suitable modifications of S.Goedecker routines)
802 : !> Applies the local FFT space Kernel to the density in Real space.
803 : !> Does NOT calculate the LDA exchange-correlation terms
804 : !> \param n1 logical dimension of the transform.
805 : !> \param n2 logical dimension of the transform.
806 : !> \param n3 logical dimension of the transform.
807 : !> \param nd1 Dimension of POT
808 : !> \param nd2 Dimension of POT
809 : !> \param nd3 Dimension of POT
810 : !> \param md1 Dimension of ZF
811 : !> \param md2 Dimension of ZF
812 : !> \param md3 Dimension of ZF
813 : !> \param nproc number of processors used as returned by MPI_COMM_SIZE
814 : !> \param iproc [0:nproc-1] number of processor as returned by MPI_COMM_RANK
815 : !> \param pot Kernel, only the distributed part (REAL)
816 : !> POT(i1,i2,i3)
817 : !> i1=1,nd1 , i2=1,nd2 , i3=1,nd3/nproc
818 : !> \param zf Density (input/output)
819 : !> ZF(i1,i3,i2)
820 : !> i1=1,md1 , i2=1,md2/nproc , i3=1,md3
821 : !> \param scal factor of renormalization of the FFT in order to acheve unitarity
822 : !> and the correct dimension
823 : !> \param mpi_group ...
824 : !> \date October 2006
825 : !> \author S. Goedecker, L. Genovese
826 : !> \note As transform lengths most products of the prime factors 2,3,5 are allowed.
827 : !> The detailed table with allowed transform lengths can
828 : !> be found in subroutine CTRIG
829 : !> \note
830 : !> RESTRICTIONS on USAGE
831 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
832 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
833 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
834 : !> This file is distributed under the terms of the
835 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
836 : ! **************************************************************************************************
837 54 : SUBROUTINE S_PoissonSolver(n1, n2, n3, nd1, nd2, nd3, md1, md2, md3, nproc, iproc, pot, zf, &
838 : scal, mpi_group)
839 : INTEGER, INTENT(in) :: n1, n2, n3, nd1, nd2, nd3, md1, md2, &
840 : md3, nproc, iproc
841 : REAL(KIND=dp), DIMENSION(nd1, nd2, nd3/nproc), &
842 : INTENT(in) :: pot
843 : REAL(KIND=dp), DIMENSION(md1, md3, md2/nproc), &
844 : INTENT(inout) :: zf
845 : REAL(KIND=dp), INTENT(in) :: scal
846 :
847 : CLASS(mp_comm_type), INTENT(in) :: mpi_group
848 :
849 : CHARACTER(len=*), PARAMETER :: routineN = 'S_PoissonSolver'
850 : INTEGER, PARAMETER :: ncache_optimal = 8*1024
851 :
852 : INTEGER :: handle, i, i1, i3, ic1, ic2, ic3, inzee, &
853 : j, j2, J2stb, J2stf, j3, Jp2stb, &
854 : Jp2stf, lot, lzt, ma, mb, ncache, nfft
855 54 : INTEGER, ALLOCATABLE, DIMENSION(:) :: after1, after2, after3, before1, &
856 54 : before2, before3, now1, now2, now3
857 : REAL(kind=dp) :: twopion
858 54 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: btrig1, btrig2, btrig3, cosinarr, &
859 54 : ftrig1, ftrig2, ftrig3
860 54 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: zt, zw
861 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :) :: zmpi2
862 : REAL(KIND=dp), ALLOCATABLE, &
863 54 : DIMENSION(:, :, :, :, :) :: zmpi1
864 :
865 54 : CALL timeset(routineN, handle)
866 : ! check input
867 54 : IF (MOD(n3, 2) /= 0) CPABORT("Parallel convolution:ERROR:n3")
868 54 : IF (nd1 < n1/2 + 1) CPABORT("Parallel convolution:ERROR:nd1")
869 54 : IF (nd2 < n2/2 + 1) CPABORT("Parallel convolution:ERROR:nd2")
870 54 : IF (nd3 < n3/2 + 1) CPABORT("Parallel convolution:ERROR:nd3")
871 54 : IF (md1 < n1) CPABORT("Parallel convolution:ERROR:md1")
872 54 : IF (md2 < n2) CPABORT("Parallel convolution:ERROR:md2")
873 54 : IF (md3 < n3/2) CPABORT("Parallel convolution:ERROR:md3")
874 54 : IF (MOD(nd3, nproc) /= 0) CPABORT("Parallel convolution:ERROR:nd3")
875 54 : IF (MOD(md2, nproc) /= 0) CPABORT("Parallel convolution:ERROR:md2")
876 :
877 : !defining work arrays dimensions
878 54 : ncache = ncache_optimal
879 54 : IF (ncache <= MAX(n1, n2, n3/2)*4) ncache = MAX(n1, n2, n3/2)*4
880 :
881 : ! if (timing_flag == 1 .and. iproc ==0) print *,'parallel ncache=',ncache
882 :
883 54 : lzt = n2
884 54 : IF (MOD(n2, 2) == 0) lzt = lzt + 1
885 54 : IF (MOD(n2, 4) == 0) lzt = lzt + 1 !maybe this is useless
886 :
887 : !Allocations
888 54 : ALLOCATE (btrig1(2, ctrig_length))
889 54 : ALLOCATE (ftrig1(2, ctrig_length))
890 54 : ALLOCATE (after1(7))
891 54 : ALLOCATE (now1(7))
892 54 : ALLOCATE (before1(7))
893 54 : ALLOCATE (btrig2(2, ctrig_length))
894 54 : ALLOCATE (ftrig2(2, ctrig_length))
895 54 : ALLOCATE (after2(7))
896 54 : ALLOCATE (now2(7))
897 54 : ALLOCATE (before2(7))
898 54 : ALLOCATE (btrig3(2, ctrig_length))
899 54 : ALLOCATE (ftrig3(2, ctrig_length))
900 54 : ALLOCATE (after3(7))
901 54 : ALLOCATE (now3(7))
902 54 : ALLOCATE (before3(7))
903 216 : ALLOCATE (zw(2, ncache/4, 2))
904 216 : ALLOCATE (zt(2, lzt, n1))
905 270 : ALLOCATE (zmpi2(2, n1, md2/nproc, nd3))
906 216 : ALLOCATE (cosinarr(2, n3/2))
907 54 : IF (nproc > 1) THEN
908 324 : ALLOCATE (zmpi1(2, n1, md2/nproc, nd3/nproc, nproc))
909 54 : zmpi1 = 0.0_dp
910 : END IF
911 54 : zmpi2 = 0.0_dp
912 54 : zw = 0.0_dp
913 54 : zt = 0.0_dp
914 :
915 : !calculating the FFT work arrays (beware on the HalFFT in n3 dimension)
916 54 : CALL ctrig(n3/2, btrig3, after3, before3, now3, 1, ic3)
917 54 : CALL ctrig(n1, btrig1, after1, before1, now1, 1, ic1)
918 54 : CALL ctrig(n2, btrig2, after2, before2, now2, 1, ic2)
919 2970 : DO j = 1, n1
920 2916 : ftrig1(1, j) = btrig1(1, j)
921 2970 : ftrig1(2, j) = -btrig1(2, j)
922 : END DO
923 2970 : DO j = 1, n2
924 2916 : ftrig2(1, j) = btrig2(1, j)
925 2970 : ftrig2(2, j) = -btrig2(2, j)
926 : END DO
927 2970 : DO j = 1, n3/2
928 2916 : ftrig3(1, j) = btrig3(1, j)
929 2970 : ftrig3(2, j) = -btrig3(2, j)
930 : END DO
931 :
932 : !Calculating array of phases for HalFFT decoding
933 54 : twopion = 8._dp*ATAN(1._dp)/REAL(n3, KIND=dp)
934 2970 : DO i3 = 1, n3/2
935 2916 : cosinarr(1, i3) = COS(twopion*(i3 - 1))
936 2970 : cosinarr(2, i3) = -SIN(twopion*(i3 - 1))
937 : END DO
938 :
939 : !initializing integral
940 : !ehartree=0._dp
941 :
942 : ! transform along z axis
943 54 : lot = ncache/(2*n3)
944 54 : IF (lot < 1) THEN
945 : CALL cp_abort(__LOCATION__, &
946 : 'convolxc_off:ncache has to be enlarged to be able to hold at '// &
947 : 'least one 1-d FFT of this size even though this will '// &
948 0 : 'reduce the performance for shorter transform lengths')
949 : END IF
950 :
951 1512 : DO j2 = 1, md2/nproc
952 : !this condition ensures that we manage only the interesting part for the FFT
953 1512 : IF (iproc*(md2/nproc) + j2 <= n2) THEN
954 4374 : DO i1 = 1, n1, lot
955 2916 : ma = i1
956 2916 : mb = MIN(i1 + (lot - 1), n1)
957 2916 : nfft = mb - ma + 1
958 :
959 : !inserting real data into complex array of half length
960 2916 : CALL halfill_upcorn(md1, md3, lot, nfft, n3, zf(i1, 1, j2), zw(1, 1, 1))
961 :
962 : !performing FFT
963 : !input: I1,I3,J2,(Jp2)
964 2916 : inzee = 1
965 11664 : DO i = 1, ic3
966 : CALL fftstp(lot, nfft, n3/2, lot, n3/2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
967 8748 : btrig3, after3(i), now3(i), before3(i), 1)
968 11664 : inzee = 3 - inzee
969 : END DO
970 : !output: I1,i3,J2,(Jp2)
971 : !unpacking FFT in order to restore correct result,
972 : !while exchanging components
973 : !input: I1,i3,J2,(Jp2)
974 4374 : CALL scramble_unpack(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zw(1, 1, inzee), zmpi2, cosinarr)
975 : !output: I1,J2,i3,(Jp2)
976 : END DO
977 : END IF
978 : END DO
979 : !Interprocessor data transposition
980 : !input: I1,J2,j3,jp3,(Jp2)
981 54 : IF (nproc > 1) THEN
982 54 : CALL mpi_group%alltoall(zmpi2, zmpi1, 2*n1*(md2/nproc)*(nd3/nproc))
983 : END IF
984 : !output: I1,J2,j3,Jp2,(jp3)
985 :
986 : !now each process perform complete convolution of its planes
987 1566 : DO j3 = 1, nd3/nproc
988 : !this condition ensures that we manage only the interesting part for the FFT
989 1566 : IF (iproc*(nd3/nproc) + j3 <= n3/2 + 1) THEN
990 1485 : Jp2stb = 1
991 1485 : J2stb = 1
992 1485 : Jp2stf = 1
993 1485 : J2stf = 1
994 :
995 : ! transform along x axis
996 1485 : lot = ncache/(4*n1)
997 1485 : IF (lot < 1) THEN
998 : CALL cp_abort(__LOCATION__, &
999 : 'convolxc_off:ncache has to be enlarged to be able to hold at '// &
1000 : 'least one 1-d FFT of this size even though this will '// &
1001 0 : 'reduce the performance for shorter transform lengths')
1002 : END IF
1003 :
1004 4455 : DO j = 1, n2, lot
1005 2970 : ma = j
1006 2970 : mb = MIN(j + (lot - 1), n2)
1007 2970 : nfft = mb - ma + 1
1008 :
1009 : !reverse index ordering, leaving the planes to be transformed at the end
1010 : !input: I1,J2,j3,Jp2,(jp3)
1011 2970 : IF (nproc == 1) THEN
1012 0 : CALL S_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi2, zw(1, 1, 1))
1013 : ELSE
1014 2970 : CALL S_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw(1, 1, 1))
1015 : END IF
1016 : !output: J2,Jp2,I1,j3,(jp3)
1017 :
1018 : !performing FFT
1019 : !input: I2,I1,j3,(jp3)
1020 2970 : inzee = 1
1021 8910 : DO i = 1, ic1 - 1
1022 : CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1023 5940 : btrig1, after1(i), now1(i), before1(i), 1)
1024 8910 : inzee = 3 - inzee
1025 : END DO
1026 :
1027 : !storing the last step into zt array
1028 2970 : i = ic1
1029 : CALL fftstp(lot, nfft, n1, lzt, n1, zw(1, 1, inzee), zt(1, j, 1), &
1030 4455 : btrig1, after1(i), now1(i), before1(i), 1)
1031 : !output: I2,i1,j3,(jp3)
1032 : END DO
1033 :
1034 : !transform along y axis
1035 1485 : lot = ncache/(4*n2)
1036 1485 : IF (lot < 1) THEN
1037 : CALL cp_abort(__LOCATION__, &
1038 : 'convolxc_off:ncache has to be enlarged to be able to hold at '// &
1039 : 'least one 1-d FFT of this size even though this will '// &
1040 0 : 'reduce the performance for shorter transform lengths')
1041 : END IF
1042 :
1043 4455 : DO j = 1, n1, lot
1044 2970 : ma = j
1045 2970 : mb = MIN(j + (lot - 1), n1)
1046 2970 : nfft = mb - ma + 1
1047 :
1048 : !reverse ordering
1049 : !input: I2,i1,j3,(jp3)
1050 2970 : CALL S_switch_upcorn(nfft, n2, lot, n1, lzt, zt(1, 1, j), zw(1, 1, 1))
1051 : !output: i1,I2,j3,(jp3)
1052 :
1053 : !performing FFT
1054 : !input: i1,I2,j3,(jp3)
1055 2970 : inzee = 1
1056 11880 : DO i = 1, ic2
1057 : CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1058 8910 : btrig2, after2(i), now2(i), before2(i), 1)
1059 11880 : inzee = 3 - inzee
1060 : END DO
1061 : !output: i1,i2,j3,(jp3)
1062 :
1063 : !Multiply with kernel in fourier space
1064 2970 : CALL multkernel(nd1, nd2, n1, n2, lot, nfft, j, pot(1, 1, j3), zw(1, 1, inzee))
1065 :
1066 : !TRANSFORM BACK IN REAL SPACE
1067 :
1068 : !transform along y axis
1069 : !input: i1,i2,j3,(jp3)
1070 11880 : DO i = 1, ic2
1071 : CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1072 8910 : ftrig2, after2(i), now2(i), before2(i), -1)
1073 11880 : inzee = 3 - inzee
1074 : END DO
1075 :
1076 : !reverse ordering
1077 : !input: i1,I2,j3,(jp3)
1078 4455 : CALL S_unswitch_downcorn(nfft, n2, lot, n1, lzt, zw(1, 1, inzee), zt(1, 1, j))
1079 : !output: I2,i1,j3,(jp3)
1080 : END DO
1081 :
1082 : !transform along x axis
1083 : !input: I2,i1,j3,(jp3)
1084 1485 : lot = ncache/(4*n1)
1085 4455 : DO j = 1, n2, lot
1086 2970 : ma = j
1087 2970 : mb = MIN(j + (lot - 1), n2)
1088 2970 : nfft = mb - ma + 1
1089 :
1090 : !performing FFT
1091 2970 : i = 1
1092 : CALL fftstp(lzt, nfft, n1, lot, n1, zt(1, j, 1), zw(1, 1, 1), &
1093 2970 : ftrig1, after1(i), now1(i), before1(i), -1)
1094 :
1095 2970 : inzee = 1
1096 8910 : DO i = 2, ic1
1097 : CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1098 5940 : ftrig1, after1(i), now1(i), before1(i), -1)
1099 8910 : inzee = 3 - inzee
1100 : END DO
1101 : !output: I2,I1,j3,(jp3)
1102 :
1103 : !reverse ordering
1104 : !input: J2,Jp2,I1,j3,(jp3)
1105 4455 : IF (nproc == 1) THEN
1106 0 : CALL S_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi2)
1107 : ELSE
1108 2970 : CALL S_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi1)
1109 : END IF
1110 : ! output: I1,J2,j3,Jp2,(jp3)
1111 : END DO
1112 : END IF
1113 : END DO
1114 :
1115 : !Interprocessor data transposition
1116 : !input: I1,J2,j3,Jp2,(jp3)
1117 54 : IF (nproc > 1) THEN
1118 : !communication scheduling
1119 54 : CALL mpi_group%alltoall(zmpi1, zmpi2, 2*n1*(md2/nproc)*(nd3/nproc))
1120 : END IF
1121 :
1122 : !output: I1,J2,j3,jp3,(Jp2)
1123 :
1124 : !transform along z axis
1125 : !input: I1,J2,i3,(Jp2)
1126 54 : lot = ncache/(2*n3)
1127 1512 : DO j2 = 1, md2/nproc
1128 : !this condition ensures that we manage only the interesting part for the FFT
1129 1512 : IF (iproc*(md2/nproc) + j2 <= n2) THEN
1130 4374 : DO i1 = 1, n1, lot
1131 2916 : ma = i1
1132 2916 : mb = MIN(i1 + (lot - 1), n1)
1133 2916 : nfft = mb - ma + 1
1134 :
1135 : !reverse ordering and repack the FFT data in order to be backward HalFFT transformed
1136 : !input: I1,J2,i3,(Jp2)
1137 2916 : CALL unscramble_pack(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zmpi2, zw(1, 1, 1), cosinarr)
1138 : !output: I1,i3,J2,(Jp2)
1139 :
1140 : !performing FFT
1141 : !input: I1,i3,J2,(Jp2)
1142 2916 : inzee = 1
1143 11664 : DO i = 1, ic3
1144 : CALL fftstp(lot, nfft, n3/2, lot, n3/2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1145 8748 : ftrig3, after3(i), now3(i), before3(i), -1)
1146 11664 : inzee = 3 - inzee
1147 : END DO
1148 : !output: I1,I3,J2,(Jp2)
1149 :
1150 : !rebuild the output array
1151 : CALL unfill_downcorn(md1, md3, lot, nfft, n3, zw(1, 1, inzee), zf(i1, 1, j2) &
1152 4374 : , scal)
1153 :
1154 : !integrate local pieces together
1155 : !ehartree=ehartree+0.5_dp*ehartreetmp*hx*hy*hz
1156 : END DO
1157 : END IF
1158 : END DO
1159 :
1160 : !De-allocations
1161 54 : DEALLOCATE (btrig1)
1162 54 : DEALLOCATE (ftrig1)
1163 54 : DEALLOCATE (after1)
1164 54 : DEALLOCATE (now1)
1165 54 : DEALLOCATE (before1)
1166 54 : DEALLOCATE (btrig2)
1167 54 : DEALLOCATE (ftrig2)
1168 54 : DEALLOCATE (after2)
1169 54 : DEALLOCATE (now2)
1170 54 : DEALLOCATE (before2)
1171 54 : DEALLOCATE (btrig3)
1172 54 : DEALLOCATE (ftrig3)
1173 54 : DEALLOCATE (after3)
1174 54 : DEALLOCATE (now3)
1175 54 : DEALLOCATE (before3)
1176 54 : DEALLOCATE (zmpi2)
1177 54 : DEALLOCATE (zw)
1178 54 : DEALLOCATE (zt)
1179 54 : DEALLOCATE (cosinarr)
1180 54 : IF (nproc > 1) DEALLOCATE (zmpi1)
1181 :
1182 : ! call timing(iproc,'PSolv_comput ','OF')
1183 54 : CALL timestop(handle)
1184 108 : END SUBROUTINE S_PoissonSolver
1185 :
1186 : ! **************************************************************************************************
1187 : !> \brief ...
1188 : !> \param j3 ...
1189 : !> \param nfft ...
1190 : !> \param Jp2stb ...
1191 : !> \param J2stb ...
1192 : !> \param lot ...
1193 : !> \param n1 ...
1194 : !> \param md2 ...
1195 : !> \param nd3 ...
1196 : !> \param nproc ...
1197 : !> \param zmpi1 ...
1198 : !> \param zw ...
1199 : ! **************************************************************************************************
1200 2970 : SUBROUTINE S_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw)
1201 : INTEGER, INTENT(in) :: j3, nfft
1202 : INTEGER, INTENT(inout) :: Jp2stb, J2stb
1203 : INTEGER, INTENT(in) :: lot, n1, md2, nd3, nproc
1204 : REAL(KIND=dp), &
1205 : DIMENSION(2, n1, md2/nproc, nd3/nproc, nproc), &
1206 : INTENT(in) :: zmpi1
1207 : REAL(KIND=dp), DIMENSION(2, lot, n1), &
1208 : INTENT(inout) :: zw
1209 :
1210 : INTEGER :: I1, J2, Jp2, mfft
1211 :
1212 2970 : mfft = 0
1213 5940 : DO Jp2 = Jp2stb, nproc
1214 84645 : DO J2 = J2stb, md2/nproc
1215 81675 : mfft = mfft + 1
1216 81675 : IF (mfft > nfft) THEN
1217 1485 : Jp2stb = Jp2
1218 1485 : J2stb = J2
1219 1485 : RETURN
1220 : END IF
1221 4413420 : DO I1 = 1, n1
1222 4330260 : zw(1, mfft, I1) = zmpi1(1, I1, J2, j3, Jp2)
1223 4410450 : zw(2, mfft, I1) = zmpi1(2, I1, J2, j3, Jp2)
1224 : END DO
1225 : END DO
1226 4455 : J2stb = 1
1227 : END DO
1228 : END SUBROUTINE S_mpiswitch_upcorn
1229 :
1230 : ! **************************************************************************************************
1231 : !> \brief ...
1232 : !> \param nfft ...
1233 : !> \param n2 ...
1234 : !> \param lot ...
1235 : !> \param n1 ...
1236 : !> \param lzt ...
1237 : !> \param zt ...
1238 : !> \param zw ...
1239 : ! **************************************************************************************************
1240 2970 : SUBROUTINE S_switch_upcorn(nfft, n2, lot, n1, lzt, zt, zw)
1241 : INTEGER, INTENT(in) :: nfft, n2, lot, n1, lzt
1242 : REAL(KIND=dp), DIMENSION(2, lzt, n1), INTENT(in) :: zt
1243 : REAL(KIND=dp), DIMENSION(2, lot, n2), &
1244 : INTENT(inout) :: zw
1245 :
1246 : INTEGER :: i, j
1247 :
1248 83160 : DO j = 1, nfft
1249 4413420 : DO i = 1, n2
1250 4330260 : zw(1, j, i) = zt(1, i, j)
1251 4410450 : zw(2, j, i) = zt(2, i, j)
1252 : END DO
1253 : END DO
1254 2970 : END SUBROUTINE S_switch_upcorn
1255 :
1256 : ! **************************************************************************************************
1257 : !> \brief ...
1258 : !> \param nfft ...
1259 : !> \param n2 ...
1260 : !> \param lot ...
1261 : !> \param n1 ...
1262 : !> \param lzt ...
1263 : !> \param zw ...
1264 : !> \param zt ...
1265 : ! **************************************************************************************************
1266 2970 : SUBROUTINE S_unswitch_downcorn(nfft, n2, lot, n1, lzt, zw, zt)
1267 : INTEGER, INTENT(in) :: nfft, n2, lot, n1, lzt
1268 : REAL(KIND=dp), DIMENSION(2, lot, n2), INTENT(in) :: zw
1269 : REAL(KIND=dp), DIMENSION(2, lzt, n1), &
1270 : INTENT(inout) :: zt
1271 :
1272 : INTEGER :: i, j
1273 :
1274 83160 : DO j = 1, nfft
1275 4413420 : DO i = 1, n2
1276 4330260 : zt(1, i, j) = zw(1, j, i)
1277 4410450 : zt(2, i, j) = zw(2, j, i)
1278 : END DO
1279 : END DO
1280 2970 : END SUBROUTINE S_unswitch_downcorn
1281 :
1282 : ! **************************************************************************************************
1283 : !> \brief ...
1284 : !> \param j3 ...
1285 : !> \param nfft ...
1286 : !> \param Jp2stf ...
1287 : !> \param J2stf ...
1288 : !> \param lot ...
1289 : !> \param n1 ...
1290 : !> \param md2 ...
1291 : !> \param nd3 ...
1292 : !> \param nproc ...
1293 : !> \param zw ...
1294 : !> \param zmpi1 ...
1295 : ! **************************************************************************************************
1296 2970 : SUBROUTINE S_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw, zmpi1)
1297 : INTEGER, INTENT(in) :: j3, nfft
1298 : INTEGER, INTENT(inout) :: Jp2stf, J2stf
1299 : INTEGER, INTENT(in) :: lot, n1, md2, nd3, nproc
1300 : REAL(KIND=dp), DIMENSION(2, lot, n1), INTENT(in) :: zw
1301 : REAL(KIND=dp), &
1302 : DIMENSION(2, n1, md2/nproc, nd3/nproc, nproc), &
1303 : INTENT(inout) :: zmpi1
1304 :
1305 : INTEGER :: I1, J2, Jp2, mfft
1306 :
1307 2970 : mfft = 0
1308 5940 : DO Jp2 = Jp2stf, nproc
1309 84645 : DO J2 = J2stf, md2/nproc
1310 81675 : mfft = mfft + 1
1311 81675 : IF (mfft > nfft) THEN
1312 1485 : Jp2stf = Jp2
1313 1485 : J2stf = J2
1314 1485 : RETURN
1315 : END IF
1316 4413420 : DO I1 = 1, n1
1317 4330260 : zmpi1(1, I1, J2, j3, Jp2) = zw(1, mfft, I1)
1318 4410450 : zmpi1(2, I1, J2, j3, Jp2) = zw(2, mfft, I1)
1319 : END DO
1320 : END DO
1321 4455 : J2stf = 1
1322 : END DO
1323 : END SUBROUTINE S_unmpiswitch_downcorn
1324 :
1325 : ! **************************************************************************************************
1326 : !> \brief (Based on suitable modifications of S.Goedecker routines)
1327 : !> Restore data into output array, calculating in the meanwhile
1328 : !> Hartree energy of the potential
1329 : !> \param md1 Dimensions of the undistributed part of the real grid
1330 : !> \param md3 Dimensions of the undistributed part of the real grid
1331 : !> \param lot ...
1332 : !> \param nfft number of planes
1333 : !> \param n3 (twice the) dimension of the last FFTtransform.
1334 : !> \param zw FFT work array
1335 : !> \param zf Original distributed density as well as
1336 : !> Distributed solution of the poisson equation (inout)
1337 : !> \param scal Needed to achieve unitarity and correct dimensions
1338 : !> \date February 2006
1339 : !> \author S. Goedecker, L. Genovese
1340 : !> \note Assuming that high frequencies are in the corners
1341 : !> and that n3 is multiple of 4
1342 : !>
1343 : !> RESTRICTIONS on USAGE
1344 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
1345 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
1346 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
1347 : !> This file is distributed under the terms of the
1348 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
1349 : ! **************************************************************************************************
1350 579295 : SUBROUTINE unfill_downcorn(md1, md3, lot, nfft, n3, zw, zf, scal)
1351 : INTEGER, INTENT(in) :: md1, md3, lot, nfft, n3
1352 : REAL(KIND=dp), DIMENSION(2, lot, n3/2), INTENT(in) :: zw
1353 : REAL(KIND=dp), DIMENSION(md1, md3), INTENT(inout) :: zf
1354 : REAL(KIND=dp), INTENT(in) :: scal
1355 :
1356 : INTEGER :: i1, i3
1357 : REAL(KIND=dp) :: pot1
1358 :
1359 14639933 : DO i3 = 1, n3/4
1360 407286462 : DO i1 = 1, nfft
1361 392646529 : pot1 = scal*zw(1, i1, i3)
1362 : !ehartreetmp =ehartreetmp + pot1* zf(i1,2*i3-1)
1363 392646529 : zf(i1, 2*i3 - 1) = pot1
1364 392646529 : pot1 = scal*zw(2, i1, i3)
1365 : !ehartreetmp =ehartreetmp + pot1* zf(i1,2*i3)
1366 406707167 : zf(i1, 2*i3) = pot1
1367 : END DO
1368 : END DO
1369 579295 : END SUBROUTINE unfill_downcorn
1370 :
1371 : ! **************************************************************************************************
1372 : !> \brief ...
1373 : !> \param md1 ...
1374 : !> \param md3 ...
1375 : !> \param lot ...
1376 : !> \param nfft ...
1377 : !> \param n3 ...
1378 : !> \param zf ...
1379 : !> \param zw ...
1380 : ! **************************************************************************************************
1381 579295 : SUBROUTINE halfill_upcorn(md1, md3, lot, nfft, n3, zf, zw)
1382 : INTEGER :: md1, md3, lot, nfft, n3
1383 : REAL(KIND=dp) :: zf(md1, md3), zw(2, lot, n3/2)
1384 :
1385 : INTEGER :: i1, i3
1386 :
1387 14639933 : DO i3 = 1, n3/4
1388 : ! WARNING: Assuming that high frequencies are in the corners
1389 : ! and that n3 is multiple of 4
1390 : !in principle we can relax this condition
1391 407286462 : DO i1 = 1, nfft
1392 392646529 : zw(1, i1, i3) = 0._dp
1393 406707167 : zw(2, i1, i3) = 0._dp
1394 : END DO
1395 : END DO
1396 14639933 : DO i3 = n3/4 + 1, n3/2
1397 407286462 : DO i1 = 1, nfft
1398 392646529 : zw(1, i1, i3) = zf(i1, 2*i3 - 1 - n3/2)
1399 406707167 : zw(2, i1, i3) = zf(i1, 2*i3 - n3/2)
1400 : END DO
1401 : END DO
1402 :
1403 579295 : END SUBROUTINE halfill_upcorn
1404 :
1405 : ! **************************************************************************************************
1406 : !> \brief (Based on suitable modifications of S.Goedecker routines)
1407 : !> Assign the correct planes to the work array zmpi2
1408 : !> in order to prepare for interprocessor data transposition.
1409 : !> In the meanwhile, it unpacks the data of the HalFFT in order to prepare for
1410 : !> multiplication with the kernel
1411 : !> \param i1 Starting points of the plane and number of remaining lines
1412 : !> \param j2 Starting points of the plane and number of remaining lines
1413 : !> \param lot Starting points of the plane and number of remaining lines
1414 : !> \param nfft Starting points of the plane and number of remaining lines
1415 : !> \param n1 logical dimension of the FFT transform, reference for work arrays
1416 : !> \param n3 logical dimension of the FFT transform, reference for work arrays
1417 : !> \param md2 Dimensions of real grid
1418 : !> \param nproc ...
1419 : !> \param nd3 Dimensions of the kernel
1420 : !> \param zw Work array (input)
1421 : !> \param zmpi2 Work array for multiprocessor manipulation (output)
1422 : !> \param cosinarr Array of the phases needed for unpacking
1423 : !> \date February 2006
1424 : !> \author S. Goedecker, L. Genovese
1425 : !> \note
1426 : !> RESTRICTIONS on USAGE
1427 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
1428 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
1429 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
1430 : !> This file is distributed under the terms of the
1431 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
1432 : ! **************************************************************************************************
1433 658079 : SUBROUTINE scramble_unpack(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zw, zmpi2, cosinarr)
1434 : INTEGER, INTENT(in) :: i1, j2, lot, nfft, n1, n3, md2, nproc, &
1435 : nd3
1436 : REAL(KIND=dp), DIMENSION(2, lot, n3/2), INTENT(in) :: zw
1437 : REAL(KIND=dp), DIMENSION(2, n1, md2/nproc, nd3), &
1438 : INTENT(inout) :: zmpi2
1439 : REAL(KIND=dp), DIMENSION(2, n3/2), INTENT(in) :: cosinarr
1440 :
1441 : INTEGER :: i, i3, ind1, ind2
1442 : REAL(KIND=dp) :: a, b, c, cp, d, feI, feR, fI, foI, foR, &
1443 : fR, sp
1444 :
1445 : !case i3=1 and i3=n3/2+1
1446 :
1447 19971566 : DO i = 0, nfft - 1
1448 19313487 : a = zw(1, i + 1, 1)
1449 19313487 : b = zw(2, i + 1, 1)
1450 19313487 : zmpi2(1, i1 + i, j2, 1) = a + b
1451 19313487 : zmpi2(2, i1 + i, j2, 1) = 0._dp
1452 19313487 : zmpi2(1, i1 + i, j2, n3/2 + 1) = a - b
1453 19971566 : zmpi2(2, i1 + i, j2, n3/2 + 1) = 0._dp
1454 : END DO
1455 : !case 2<=i3<=n3/2
1456 32894636 : DO i3 = 2, n3/2
1457 32236557 : ind1 = i3
1458 32236557 : ind2 = n3/2 - i3 + 2
1459 32236557 : cp = cosinarr(1, i3)
1460 32236557 : sp = cosinarr(2, i3)
1461 936638287 : DO i = 0, nfft - 1
1462 903743651 : a = zw(1, i + 1, ind1)
1463 903743651 : b = zw(2, i + 1, ind1)
1464 903743651 : c = zw(1, i + 1, ind2)
1465 903743651 : d = zw(2, i + 1, ind2)
1466 903743651 : feR = .5_dp*(a + c)
1467 903743651 : feI = .5_dp*(b - d)
1468 903743651 : foR = .5_dp*(a - c)
1469 903743651 : foI = .5_dp*(b + d)
1470 903743651 : fR = feR + cp*foI - sp*foR
1471 903743651 : fI = feI - cp*foR - sp*foI
1472 903743651 : zmpi2(1, i1 + i, j2, ind1) = fR
1473 935980208 : zmpi2(2, i1 + i, j2, ind1) = fI
1474 : END DO
1475 : END DO
1476 :
1477 658079 : END SUBROUTINE scramble_unpack
1478 :
1479 : ! **************************************************************************************************
1480 : !> \brief (Based on suitable modifications of S.Goedecker routines)
1481 : !> Insert the correct planes of the work array zmpi2
1482 : !> in order to prepare for backward FFT transform
1483 : !> In the meanwhile, it packs the data in order to be transformed with the HalFFT
1484 : !> procedure
1485 : !> \param i1 Starting points of the plane and number of remaining lines
1486 : !> \param j2 Starting points of the plane and number of remaining lines
1487 : !> \param lot Starting points of the plane and number of remaining lines
1488 : !> \param nfft Starting points of the plane and number of remaining lines
1489 : !> \param n1 logical dimension of the FFT transform, reference for work arrays
1490 : !> \param n3 logical dimension of the FFT transform, reference for work arrays
1491 : !> \param md2 Dimensions of real grid
1492 : !> \param nproc ...
1493 : !> \param nd3 Dimensions of the kernel
1494 : !> \param zmpi2 Work array for multiprocessor manipulation (output)
1495 : !> \param zw Work array (inout)
1496 : !> \param cosinarr Array of the phases needed for packing
1497 : !> \date February 2006
1498 : !> \author S. Goedecker, L. Genovese
1499 : !> \note
1500 : !> RESTRICTIONS on USAGE
1501 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
1502 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
1503 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
1504 : !> This file is distributed under the terms of the
1505 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
1506 : ! **************************************************************************************************
1507 579295 : SUBROUTINE unscramble_pack(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zmpi2, zw, cosinarr)
1508 : INTEGER, INTENT(in) :: i1, j2, lot, nfft, n1, n3, md2, nproc, &
1509 : nd3
1510 : REAL(KIND=dp), DIMENSION(2, n1, md2/nproc, nd3), &
1511 : INTENT(in) :: zmpi2
1512 : REAL(KIND=dp), DIMENSION(2, lot, n3/2), &
1513 : INTENT(inout) :: zw
1514 : REAL(KIND=dp), DIMENSION(2, n3/2), INTENT(in) :: cosinarr
1515 :
1516 : INTEGER :: i, i3, indA, indB
1517 : REAL(KIND=dp) :: a, b, c, cp, d, ie, ih, io, re, rh, ro, &
1518 : sp
1519 :
1520 28700571 : DO i3 = 1, n3/2
1521 28121276 : indA = i3
1522 28121276 : indB = n3/2 + 2 - i3
1523 28121276 : cp = cosinarr(1, i3)
1524 28121276 : sp = cosinarr(2, i3)
1525 813993629 : DO i = 0, nfft - 1
1526 785293058 : a = zmpi2(1, i1 + i, j2, indA)
1527 785293058 : b = zmpi2(2, i1 + i, j2, indA)
1528 785293058 : c = zmpi2(1, i1 + i, j2, indB)
1529 785293058 : d = -zmpi2(2, i1 + i, j2, indB)
1530 785293058 : re = (a + c)
1531 785293058 : ie = (b + d)
1532 785293058 : ro = (a - c)*cp - (b - d)*sp
1533 785293058 : io = (a - c)*sp + (b - d)*cp
1534 785293058 : rh = re - io
1535 785293058 : ih = ie + ro
1536 785293058 : zw(1, i + 1, indA) = rh
1537 813414334 : zw(2, i + 1, indA) = ih
1538 : END DO
1539 : END DO
1540 :
1541 579295 : END SUBROUTINE unscramble_pack
1542 :
1543 : ! **************************************************************************************************
1544 : !> \brief (Based on suitable modifications of S.Goedecker routines)
1545 : !> Applies the local FFT space Kernel to the density in Real space.
1546 : !> Calculates also the LDA exchange-correlation terms
1547 : !> \param n1 logical dimension of the transform.
1548 : !> \param n2 logical dimension of the transform.
1549 : !> \param n3 logical dimension of the transform.
1550 : !> \param nd1 Dimension of POT
1551 : !> \param nd2 Dimension of POT
1552 : !> \param nd3 Dimension of POT
1553 : !> \param md1 Dimension of ZF
1554 : !> \param md2 Dimension of ZF
1555 : !> \param md3 Dimension of ZF
1556 : !> \param nproc number of processors used as returned by MPI_COMM_SIZE
1557 : !> \param iproc [0:nproc-1] number of processor as returned by MPI_COMM_RANK
1558 : !> \param pot Kernel, only the distributed part (REAL)
1559 : !> POT(i1,i2,i3)
1560 : !> i1=1,nd1 , i2=1,nd2 , i3=1,nd3/nproc
1561 : !> \param zf Density (input/output)
1562 : !> ZF(i1,i3,i2)
1563 : !> i1=1,md1 , i2=1,md2/nproc , i3=1,md3
1564 : !> \param scal factor of renormalization of the FFT in order to acheve unitarity
1565 : !> and the correct dimension
1566 : !> \param mpi_group ...
1567 : !> \date February 2006
1568 : !> \author S. Goedecker, L. Genovese
1569 : !> \note As transform lengths
1570 : !> most products of the prime factors 2,3,5 are allowed.
1571 : !> The detailed table with allowed transform lengths can
1572 : !> be found in subroutine CTRIG
1573 : !> \note
1574 : !> RESTRICTIONS on USAGE
1575 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
1576 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
1577 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
1578 : !> This file is distributed under the terms of the
1579 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
1580 : ! **************************************************************************************************
1581 15918 : SUBROUTINE F_PoissonSolver(n1, n2, n3, nd1, nd2, nd3, md1, md2, md3, nproc, iproc, pot, zf, &
1582 : scal, mpi_group)
1583 : INTEGER, INTENT(in) :: n1, n2, n3, nd1, nd2, nd3, md1, md2, &
1584 : md3, nproc, iproc
1585 : REAL(KIND=dp), DIMENSION(nd1, nd2, nd3/nproc), &
1586 : INTENT(in) :: pot
1587 : REAL(KIND=dp), DIMENSION(md1, md3, md2/nproc), &
1588 : INTENT(inout) :: zf
1589 : REAL(KIND=dp), INTENT(in) :: scal
1590 :
1591 : CLASS(mp_comm_type), INTENT(in) :: mpi_group
1592 :
1593 : INTEGER, PARAMETER :: ncache_optimal = 8*1024
1594 :
1595 : INTEGER :: i, i1, i3, ic1, ic2, ic3, inzee, j, j2, &
1596 : J2stb, J2stf, j3, Jp2stb, Jp2stf, lot, &
1597 : lzt, ma, mb, ncache, nfft
1598 15918 : INTEGER, ALLOCATABLE, DIMENSION(:) :: after1, after2, after3, before1, &
1599 15918 : before2, before3, now1, now2, now3
1600 : REAL(kind=dp) :: twopion
1601 15918 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: btrig1, btrig2, btrig3, cosinarr, &
1602 15918 : ftrig1, ftrig2, ftrig3
1603 15918 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: zt, zw
1604 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :) :: zmpi2
1605 : REAL(KIND=dp), ALLOCATABLE, &
1606 15918 : DIMENSION(:, :, :, :, :) :: zmpi1
1607 :
1608 15918 : IF (MOD(n1, 2) /= 0) CPABORT("Parallel convolution:ERROR:n1")
1609 15918 : IF (MOD(n2, 2) /= 0) CPABORT("Parallel convolution:ERROR:n2")
1610 15918 : IF (MOD(n3, 2) /= 0) CPABORT("Parallel convolution:ERROR:n3")
1611 15918 : IF (nd1 < n1/2 + 1) CPABORT("Parallel convolution:ERROR:nd1")
1612 15918 : IF (nd2 < n2/2 + 1) CPABORT("Parallel convolution:ERROR:nd2")
1613 15918 : IF (nd3 < n3/2 + 1) CPABORT("Parallel convolution:ERROR:nd3")
1614 15918 : IF (md1 < n1/2) CPABORT("Parallel convolution:ERROR:md1")
1615 15918 : IF (md2 < n2/2) CPABORT("Parallel convolution:ERROR:md2")
1616 15918 : IF (md3 < n3/2) CPABORT("Parallel convolution:ERROR:md3")
1617 15918 : IF (MOD(nd3, nproc) /= 0) CPABORT("Parallel convolution:ERROR:nd3")
1618 15918 : IF (MOD(md2, nproc) /= 0) CPABORT("Parallel convolution:ERROR:md2")
1619 :
1620 : !defining work arrays dimensions
1621 :
1622 15918 : ncache = ncache_optimal
1623 15918 : IF (ncache <= MAX(n1, n2, n3/2)*4) ncache = MAX(n1, n2, n3/2)*4
1624 15918 : lzt = n2/2
1625 15918 : IF (MOD(n2/2, 2) == 0) lzt = lzt + 1
1626 15918 : IF (MOD(n2/2, 4) == 0) lzt = lzt + 1
1627 :
1628 : !Allocations
1629 15918 : ALLOCATE (btrig1(2, ctrig_length))
1630 15918 : ALLOCATE (ftrig1(2, ctrig_length))
1631 15918 : ALLOCATE (after1(7))
1632 15918 : ALLOCATE (now1(7))
1633 15918 : ALLOCATE (before1(7))
1634 15918 : ALLOCATE (btrig2(2, ctrig_length))
1635 15918 : ALLOCATE (ftrig2(2, ctrig_length))
1636 15918 : ALLOCATE (after2(7))
1637 15918 : ALLOCATE (now2(7))
1638 15918 : ALLOCATE (before2(7))
1639 15918 : ALLOCATE (btrig3(2, ctrig_length))
1640 15918 : ALLOCATE (ftrig3(2, ctrig_length))
1641 15918 : ALLOCATE (after3(7))
1642 15918 : ALLOCATE (now3(7))
1643 15918 : ALLOCATE (before3(7))
1644 63672 : ALLOCATE (zw(2, ncache/4, 2))
1645 63672 : ALLOCATE (zt(2, lzt, n1))
1646 79590 : ALLOCATE (zmpi2(2, n1, md2/nproc, nd3))
1647 4868387730 : zmpi2 = 1.0E99_dp ! init mpi'ed buffers to silence warnings under valgrind
1648 63672 : ALLOCATE (cosinarr(2, n3/2))
1649 57678 : IF (nproc > 1) ALLOCATE (zmpi1(2, n1, md2/nproc, nd3/nproc, nproc))
1650 :
1651 : !calculating the FFT work arrays (beware on the HalFFT in n3 dimension)
1652 15918 : CALL ctrig(n3/2, btrig3, after3, before3, now3, 1, ic3)
1653 15918 : CALL ctrig(n1, btrig1, after1, before1, now1, 1, ic1)
1654 15918 : CALL ctrig(n2, btrig2, after2, before2, now2, 1, ic2)
1655 1207786 : DO j = 1, n1
1656 1191868 : ftrig1(1, j) = btrig1(1, j)
1657 1207786 : ftrig1(2, j) = -btrig1(2, j)
1658 : END DO
1659 1207786 : DO j = 1, n2
1660 1191868 : ftrig2(1, j) = btrig2(1, j)
1661 1207786 : ftrig2(2, j) = -btrig2(2, j)
1662 : END DO
1663 1231790 : DO j = 1, n3
1664 1215872 : ftrig3(1, j) = btrig3(1, j)
1665 1231790 : ftrig3(2, j) = -btrig3(2, j)
1666 : END DO
1667 :
1668 : !Calculating array of phases for HalFFT decoding
1669 15918 : twopion = 8._dp*ATAN(1._dp)/REAL(n3, KIND=dp)
1670 623854 : DO i3 = 1, n3/2
1671 607936 : cosinarr(1, i3) = COS(twopion*(i3 - 1))
1672 623854 : cosinarr(2, i3) = -SIN(twopion*(i3 - 1))
1673 : END DO
1674 :
1675 : ! transform along z axis
1676 15918 : lot = ncache/(2*n3)
1677 15918 : IF (lot < 1) THEN
1678 : CALL cp_abort(__LOCATION__, &
1679 : 'convolxc_on:ncache has to be enlarged to be able to hold at '// &
1680 : 'least one 1-d FFT of this size even though this will '// &
1681 0 : 'reduce the performance for shorter transform lengths')
1682 : END IF
1683 :
1684 428288 : DO j2 = 1, md2/nproc
1685 : !this condition ensures that we manage only the interesting part for the FFT
1686 428288 : IF (iproc*(md2/nproc) + j2 <= n2/2) THEN
1687 987642 : DO i1 = 1, (n1/2), lot
1688 576379 : ma = i1
1689 576379 : mb = MIN(i1 + (lot - 1), (n1/2))
1690 576379 : nfft = mb - ma + 1
1691 :
1692 : !inserting real data into complex array of half length
1693 576379 : CALL halfill_upcorn(md1, md3, lot, nfft, n3, zf(i1, 1, j2), zw(1, 1, 1))
1694 :
1695 : !performing FFT
1696 : !input: I1,I3,J2,(Jp2)
1697 576379 : inzee = 1
1698 2039390 : DO i = 1, ic3
1699 : CALL fftstp(lot, nfft, n3/2, lot, n3/2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1700 1463011 : btrig3, after3(i), now3(i), before3(i), 1)
1701 2039390 : inzee = 3 - inzee
1702 : END DO
1703 : !output: I1,i3,J2,(Jp2)
1704 :
1705 : !unpacking FFT in order to restore correct result,
1706 : !while exchanging components
1707 : !input: I1,i3,J2,(Jp2)
1708 987642 : CALL scramble_unpack(i1, j2, lot, nfft, n1/2, n3, md2, nproc, nd3, zw(1, 1, inzee), zmpi2, cosinarr)
1709 : !output: I1,J2,i3,(Jp2)
1710 : END DO
1711 : END IF
1712 : END DO
1713 :
1714 : !Interprocessor data transposition
1715 : !input: I1,J2,j3,jp3,(Jp2)
1716 15918 : IF (nproc > 1) THEN
1717 : !communication scheduling
1718 8352 : CALL mpi_group%alltoall(zmpi2, zmpi1, n1*(md2/nproc)*(nd3/nproc))
1719 : END IF
1720 : !output: I1,J2,j3,Jp2,(jp3)
1721 :
1722 : !now each process perform complete convolution of its planes
1723 451524 : DO j3 = 1, nd3/nproc
1724 : !this condition ensures that we manage only the interesting part for the FFT
1725 451524 : IF (iproc*(nd3/nproc) + j3 <= n3/2 + 1) THEN
1726 431430 : Jp2stb = 1
1727 431430 : J2stb = 1
1728 431430 : Jp2stf = 1
1729 431430 : J2stf = 1
1730 :
1731 : ! transform along x axis
1732 431430 : lot = ncache/(4*n1)
1733 431430 : IF (lot < 1) THEN
1734 : CALL cp_abort(__LOCATION__, &
1735 : 'convolxc_on:ncache has to be enlarged to be able to hold at '// &
1736 : 'least one 1-d FFT of this size even though this will '// &
1737 0 : 'reduce the performance for shorter transform lengths')
1738 : END IF
1739 :
1740 1414014 : DO j = 1, n2/2, lot
1741 982584 : ma = j
1742 982584 : mb = MIN(j + (lot - 1), n2/2)
1743 982584 : nfft = mb - ma + 1
1744 :
1745 : !reverse index ordering, leaving the planes to be transformed at the end
1746 : !input: I1,J2,j3,Jp2,(jp3)
1747 982584 : IF (nproc == 1) THEN
1748 421221 : CALL mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi2, zw(1, 1, 1))
1749 : ELSE
1750 561363 : CALL mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw(1, 1, 1))
1751 : END IF
1752 : !output: J2,Jp2,I1,j3,(jp3)
1753 :
1754 : !performing FFT
1755 : !input: I2,I1,j3,(jp3)
1756 982584 : inzee = 1
1757 3123492 : DO i = 1, ic1 - 1
1758 : CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1759 2140908 : btrig1, after1(i), now1(i), before1(i), 1)
1760 3123492 : inzee = 3 - inzee
1761 : END DO
1762 :
1763 : !storing the last step into zt array
1764 982584 : i = ic1
1765 : CALL fftstp(lot, nfft, n1, lzt, n1, zw(1, 1, inzee), zt(1, j, 1), &
1766 1414014 : btrig1, after1(i), now1(i), before1(i), 1)
1767 : !output: I2,i1,j3,(jp3)
1768 : END DO
1769 :
1770 : !transform along y axis
1771 431430 : lot = ncache/(4*n2)
1772 431430 : IF (lot < 1) THEN
1773 : CALL cp_abort(__LOCATION__, &
1774 : 'convolxc_on:ncache has to be enlarged to be able to hold at '// &
1775 : 'least one 1-d FFT of this size even though this will '// &
1776 0 : 'reduce the performance for shorter transform lengths')
1777 : END IF
1778 :
1779 2267223 : DO j = 1, n1, lot
1780 1835793 : ma = j
1781 1835793 : mb = MIN(j + (lot - 1), n1)
1782 1835793 : nfft = mb - ma + 1
1783 :
1784 : !reverse ordering
1785 : !input: I2,i1,j3,(jp3)
1786 1835793 : CALL switch_upcorn(nfft, n2, lot, n1, lzt, zt(1, 1, j), zw(1, 1, 1))
1787 : !output: i1,I2,j3,(jp3)
1788 :
1789 : !performing FFT
1790 : !input: i1,I2,j3,(jp3)
1791 1835793 : inzee = 1
1792 7704036 : DO i = 1, ic2
1793 : CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1794 5868243 : btrig2, after2(i), now2(i), before2(i), 1)
1795 7704036 : inzee = 3 - inzee
1796 : END DO
1797 : !output: i1,i2,j3,(jp3)
1798 :
1799 : !Multiply with kernel in fourier space
1800 1835793 : CALL multkernel(nd1, nd2, n1, n2, lot, nfft, j, pot(1, 1, j3), zw(1, 1, inzee))
1801 :
1802 : !TRANSFORM BACK IN REAL SPACE
1803 :
1804 : !transform along y axis
1805 : !input: i1,i2,j3,(jp3)
1806 7704036 : DO i = 1, ic2
1807 : CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1808 5868243 : ftrig2, after2(i), now2(i), before2(i), -1)
1809 7704036 : inzee = 3 - inzee
1810 : END DO
1811 :
1812 : !reverse ordering
1813 : !input: i1,I2,j3,(jp3)
1814 2267223 : CALL unswitch_downcorn(nfft, n2, lot, n1, lzt, zw(1, 1, inzee), zt(1, 1, j))
1815 : !output: I2,i1,j3,(jp3)
1816 : END DO
1817 :
1818 : !transform along x axis
1819 : !input: I2,i1,j3,(jp3)
1820 431430 : lot = ncache/(4*n1)
1821 1414014 : DO j = 1, n2/2, lot
1822 982584 : ma = j
1823 982584 : mb = MIN(j + (lot - 1), n2/2)
1824 982584 : nfft = mb - ma + 1
1825 :
1826 : !performing FFT
1827 982584 : i = 1
1828 : CALL fftstp(lzt, nfft, n1, lot, n1, zt(1, j, 1), zw(1, 1, 1), &
1829 982584 : ftrig1, after1(i), now1(i), before1(i), -1)
1830 :
1831 982584 : inzee = 1
1832 3123492 : DO i = 2, ic1
1833 : CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1834 2140908 : ftrig1, after1(i), now1(i), before1(i), -1)
1835 3123492 : inzee = 3 - inzee
1836 : END DO
1837 : !output: I2,I1,j3,(jp3)
1838 :
1839 : !reverse ordering
1840 : !input: J2,Jp2,I1,j3,(jp3)
1841 1414014 : IF (nproc == 1) THEN
1842 421221 : CALL unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi2)
1843 : ELSE
1844 561363 : CALL unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi1)
1845 : END IF
1846 : ! output: I1,J2,j3,Jp2,(jp3)
1847 : END DO
1848 : END IF
1849 : END DO
1850 :
1851 : !Interprocessor data transposition
1852 : !input: I1,J2,j3,Jp2,(jp3)
1853 15918 : IF (nproc > 1) THEN
1854 : !communication scheduling
1855 8352 : CALL mpi_group%alltoall(zmpi1, zmpi2, n1*(md2/nproc)*(nd3/nproc))
1856 : !output: I1,J2,j3,jp3,(Jp2)
1857 : END IF
1858 :
1859 : !transform along z axis
1860 : !input: I1,J2,i3,(Jp2)
1861 15918 : lot = ncache/(2*n3)
1862 428288 : DO j2 = 1, md2/nproc
1863 : !this condition ensures that we manage only the interesting part for the FFT
1864 428288 : IF (iproc*(md2/nproc) + j2 <= n2/2) THEN
1865 987642 : DO i1 = 1, (n1/2), lot
1866 576379 : ma = i1
1867 576379 : mb = MIN(i1 + (lot - 1), (n1/2))
1868 576379 : nfft = mb - ma + 1
1869 :
1870 : !reverse ordering and repack the FFT data in order to be backward HalFFT transformed
1871 : !input: I1,J2,i3,(Jp2)
1872 576379 : CALL unscramble_pack(i1, j2, lot, nfft, n1/2, n3, md2, nproc, nd3, zmpi2, zw(1, 1, 1), cosinarr)
1873 : !output: I1,i3,J2,(Jp2)
1874 :
1875 : !performing FFT
1876 : !input: I1,i3,J2,(Jp2)
1877 576379 : inzee = 1
1878 2039390 : DO i = 1, ic3
1879 : CALL fftstp(lot, nfft, n3/2, lot, n3/2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1880 1463011 : ftrig3, after3(i), now3(i), before3(i), -1)
1881 2039390 : inzee = 3 - inzee
1882 : END DO
1883 : !output: I1,I3,J2,(Jp2)
1884 :
1885 : !calculates the exchange correlation terms locally and rebuild the output array
1886 : CALL unfill_downcorn(md1, md3, lot, nfft, n3, zw(1, 1, inzee), zf(i1, 1, j2) &
1887 987642 : , scal) !,ehartreetmp)
1888 :
1889 : !integrate local pieces together
1890 : !ehartree=ehartree+0.5_dp*ehartreetmp*hgrid**3
1891 : END DO
1892 : END IF
1893 : END DO
1894 :
1895 : !De-allocations
1896 15918 : DEALLOCATE (btrig1)
1897 15918 : DEALLOCATE (ftrig1)
1898 15918 : DEALLOCATE (after1)
1899 15918 : DEALLOCATE (now1)
1900 15918 : DEALLOCATE (before1)
1901 15918 : DEALLOCATE (btrig2)
1902 15918 : DEALLOCATE (ftrig2)
1903 15918 : DEALLOCATE (after2)
1904 15918 : DEALLOCATE (now2)
1905 15918 : DEALLOCATE (before2)
1906 15918 : DEALLOCATE (btrig3)
1907 15918 : DEALLOCATE (ftrig3)
1908 15918 : DEALLOCATE (after3)
1909 15918 : DEALLOCATE (now3)
1910 15918 : DEALLOCATE (before3)
1911 15918 : DEALLOCATE (zmpi2)
1912 15918 : DEALLOCATE (zw)
1913 15918 : DEALLOCATE (zt)
1914 15918 : DEALLOCATE (cosinarr)
1915 15918 : IF (nproc > 1) DEALLOCATE (zmpi1)
1916 :
1917 15918 : END SUBROUTINE F_PoissonSolver
1918 :
1919 : ! **************************************************************************************************
1920 : !> \brief ...
1921 : !> \param nfft ...
1922 : !> \param n2 ...
1923 : !> \param lot ...
1924 : !> \param n1 ...
1925 : !> \param lzt ...
1926 : !> \param zt ...
1927 : !> \param zw ...
1928 : ! **************************************************************************************************
1929 1835793 : SUBROUTINE switch_upcorn(nfft, n2, lot, n1, lzt, zt, zw)
1930 : INTEGER :: nfft, n2, lot, n1, lzt
1931 : REAL(KIND=dp) :: zt(2, lzt, n1), zw(2, lot, n2)
1932 :
1933 : INTEGER :: i, j
1934 :
1935 : ! WARNING: Assuming that high frequencies are in the corners
1936 : ! and that n2 is multiple of 2
1937 : ! Low frequencies
1938 :
1939 36563963 : DO j = 1, nfft
1940 1631989261 : DO i = n2/2 + 1, n2
1941 1595425298 : zw(1, j, i) = zt(1, i - n2/2, j)
1942 1630153468 : zw(2, j, i) = zt(2, i - n2/2, j)
1943 : END DO
1944 : END DO
1945 : ! High frequencies
1946 95424510 : DO i = 1, n2/2
1947 1690849808 : DO j = 1, nfft
1948 1595425298 : zw(1, j, i) = 0._dp
1949 1689014015 : zw(2, j, i) = 0._dp
1950 : END DO
1951 : END DO
1952 1835793 : END SUBROUTINE switch_upcorn
1953 :
1954 : ! **************************************************************************************************
1955 : !> \brief ...
1956 : !> \param j3 ...
1957 : !> \param nfft ...
1958 : !> \param Jp2stb ...
1959 : !> \param J2stb ...
1960 : !> \param lot ...
1961 : !> \param n1 ...
1962 : !> \param md2 ...
1963 : !> \param nd3 ...
1964 : !> \param nproc ...
1965 : !> \param zmpi1 ...
1966 : !> \param zw ...
1967 : ! **************************************************************************************************
1968 982584 : SUBROUTINE mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw)
1969 : INTEGER :: j3, nfft, Jp2stb, J2stb, lot, n1, md2, &
1970 : nd3, nproc
1971 : REAL(KIND=dp) :: zmpi1(2, n1/2, md2/nproc, nd3/nproc, nproc), zw(2, lot, n1)
1972 :
1973 : INTEGER :: i1, j2, jp2, mfft
1974 :
1975 : ! WARNING: Assuming that high frequencies are in the corners
1976 : ! and that n1 is multiple of 2
1977 :
1978 982584 : mfft = 0
1979 1563579 : Main: DO Jp2 = Jp2stb, nproc
1980 18539093 : DO J2 = J2stb, md2/nproc
1981 17958098 : mfft = mfft + 1
1982 17958098 : IF (mfft > nfft) THEN
1983 594013 : Jp2stb = Jp2
1984 594013 : J2stb = J2
1985 594013 : EXIT Main
1986 : END IF
1987 815076734 : DO I1 = 1, n1/2
1988 797712649 : zw(1, mfft, I1) = 0._dp
1989 815076734 : zw(2, mfft, I1) = 0._dp
1990 : END DO
1991 815657729 : DO I1 = n1/2 + 1, n1
1992 797712649 : zw(1, mfft, I1) = zmpi1(1, I1 - n1/2, J2, j3, Jp2)
1993 815076734 : zw(2, mfft, I1) = zmpi1(2, I1 - n1/2, J2, j3, Jp2)
1994 : END DO
1995 : END DO
1996 969566 : J2stb = 1
1997 : END DO Main
1998 982584 : END SUBROUTINE mpiswitch_upcorn
1999 :
2000 : ! **************************************************************************************************
2001 : !> \brief ...
2002 : !> \param nfft ...
2003 : !> \param n2 ...
2004 : !> \param lot ...
2005 : !> \param n1 ...
2006 : !> \param lzt ...
2007 : !> \param zw ...
2008 : !> \param zt ...
2009 : ! **************************************************************************************************
2010 1835793 : SUBROUTINE unswitch_downcorn(nfft, n2, lot, n1, lzt, zw, zt)
2011 : INTEGER :: nfft, n2, lot, n1, lzt
2012 : REAL(KIND=dp) :: zw(2, lot, n2), zt(2, lzt, n1)
2013 :
2014 : INTEGER :: i, j
2015 :
2016 : ! WARNING: Assuming that high frequencies are in the corners
2017 : ! and that n2 is multiple of 2
2018 : ! Low frequencies
2019 :
2020 36563963 : DO j = 1, nfft
2021 1631989261 : DO i = 1, n2/2
2022 1595425298 : zt(1, i, j) = zw(1, j, i)
2023 1630153468 : zt(2, i, j) = zw(2, j, i)
2024 : END DO
2025 : END DO
2026 1835793 : RETURN
2027 : END SUBROUTINE unswitch_downcorn
2028 :
2029 : ! **************************************************************************************************
2030 : !> \brief ...
2031 : !> \param j3 ...
2032 : !> \param nfft ...
2033 : !> \param Jp2stf ...
2034 : !> \param J2stf ...
2035 : !> \param lot ...
2036 : !> \param n1 ...
2037 : !> \param md2 ...
2038 : !> \param nd3 ...
2039 : !> \param nproc ...
2040 : !> \param zw ...
2041 : !> \param zmpi1 ...
2042 : ! **************************************************************************************************
2043 982584 : SUBROUTINE unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw, zmpi1)
2044 : INTEGER :: j3, nfft, Jp2stf, J2stf, lot, n1, md2, &
2045 : nd3, nproc
2046 : REAL(KIND=dp) :: zw(2, lot, n1), zmpi1(2, n1/2, md2/nproc, nd3/nproc, nproc)
2047 :
2048 : INTEGER :: i1, j2, jp2, mfft
2049 :
2050 : ! WARNING: Assuming that high frequencies are in the corners
2051 : ! and that n1 is multiple of 2
2052 :
2053 982584 : mfft = 0
2054 1563579 : Main: DO Jp2 = Jp2stf, nproc
2055 18539093 : DO J2 = J2stf, md2/nproc
2056 17958098 : mfft = mfft + 1
2057 17958098 : IF (mfft > nfft) THEN
2058 594013 : Jp2stf = Jp2
2059 594013 : J2stf = J2
2060 594013 : EXIT Main
2061 : END IF
2062 815657729 : DO I1 = 1, n1/2
2063 797712649 : zmpi1(1, I1, J2, j3, Jp2) = zw(1, mfft, I1)
2064 815076734 : zmpi1(2, I1, J2, j3, Jp2) = zw(2, mfft, I1)
2065 : END DO
2066 : END DO
2067 969566 : J2stf = 1
2068 : END DO Main
2069 982584 : END SUBROUTINE unmpiswitch_downcorn
2070 :
2071 : ! **************************************************************************************************
2072 : !> \brief (Based on suitable modifications of S.Goedecker routines)
2073 : !> Restore data into output array, calculating in the meanwhile
2074 : !> Hartree energy of the potential
2075 : !> \param md1 Dimensions of the undistributed part of the real grid
2076 : !> \param md3 Dimensions of the undistributed part of the real grid
2077 : !> \param lot ...
2078 : !> \param nfft number of planes
2079 : !> \param n3 (twice the) dimension of the last FFTtransform.
2080 : !> \param zw FFT work array
2081 : !> \param zf Original distributed density as well as
2082 : !> Distributed solution of the poisson equation (inout)
2083 : !> \param scal Needed to achieve unitarity and correct dimensions
2084 : !> \param ehartreetmp Hartree energy
2085 : !> \date February 2006
2086 : !> \author S. Goedecker, L. Genovese
2087 : !> \note Assuming that high frequencies are in the corners
2088 : !> and that n3 is multiple of 4
2089 : !>
2090 : !> RESTRICTIONS on USAGE
2091 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
2092 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
2093 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
2094 : !> This file is distributed under the terms of the
2095 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
2096 : ! **************************************************************************************************
2097 0 : SUBROUTINE F_unfill_downcorn(md1, md3, lot, nfft, n3, zw, zf, scal, ehartreetmp)
2098 : INTEGER, INTENT(in) :: md1, md3, lot, nfft, n3
2099 : REAL(KIND=dp), DIMENSION(2, lot, n3/2), INTENT(in) :: zw
2100 : REAL(KIND=dp), DIMENSION(md1, md3), INTENT(inout) :: zf
2101 : REAL(KIND=dp), INTENT(in) :: scal
2102 : REAL(KIND=dp), INTENT(out) :: ehartreetmp
2103 :
2104 : INTEGER :: i1, i3
2105 : REAL(KIND=dp) :: pot1
2106 :
2107 0 : ehartreetmp = 0._dp
2108 0 : DO i3 = 1, n3/4
2109 0 : DO i1 = 1, nfft
2110 0 : pot1 = scal*zw(1, i1, i3)
2111 0 : ehartreetmp = ehartreetmp + pot1*zf(i1, 2*i3 - 1)
2112 0 : zf(i1, 2*i3 - 1) = pot1
2113 0 : pot1 = scal*zw(2, i1, i3)
2114 0 : ehartreetmp = ehartreetmp + pot1*zf(i1, 2*i3)
2115 0 : zf(i1, 2*i3) = pot1
2116 : END DO
2117 : END DO
2118 0 : END SUBROUTINE F_unfill_downcorn
2119 :
2120 : END MODULE ps_wavelet_base
|