Line data Source code
1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 :
8 : #include <assert.h>
9 : #include <limits.h>
10 : #include <math.h>
11 : #include <omp.h>
12 : #include <stdbool.h>
13 : #include <stdio.h>
14 : #include <stdlib.h>
15 : #include <string.h>
16 :
17 : #include "../common/grid_basis_set.h"
18 : #include "../common/grid_common.h"
19 : #include "../common/grid_constants.h"
20 : #include "grid_dgemm_coefficients.h"
21 : #include "grid_dgemm_collocate.h"
22 : #include "grid_dgemm_collocation_integration.h"
23 : #include "grid_dgemm_non_orthorombic_corrections.h"
24 : #include "grid_dgemm_prepare_pab.h"
25 : #include "grid_dgemm_private_header.h"
26 : #include "grid_dgemm_task_list.h"
27 : #include "grid_dgemm_tensor_local.h"
28 :
29 : void collocate_l0(double *scratch, const double alpha, const bool orthogonal,
30 : const struct tensor_ *exp_xy,
31 : const struct tensor_ *p_alpha_beta_reduced_,
32 : struct tensor_ *cube);
33 :
34 5100 : void rotate_to_cartesian_harmonics(const grid_basis_set *ibasis,
35 : const grid_basis_set *jbasis,
36 : const int iatom, const int jatom,
37 : const int iset, const int jset,
38 : double *const block, tensor *work,
39 : tensor *pab) {
40 5100 : dgemm_params m1, m2;
41 5100 : memset(&m1, 0, sizeof(dgemm_params));
42 5100 : memset(&m2, 0, sizeof(dgemm_params));
43 :
44 : // Define some more convenient aliases.
45 5100 : const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
46 5100 : const int nsgf_setb = jbasis->nsgf_set[jset];
47 5100 : const int nsgfa = ibasis->nsgf; // size of entire spherical basis
48 5100 : const int nsgfb = jbasis->nsgf;
49 5100 : const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
50 5100 : const int sgfb = jbasis->first_sgf[jset] - 1;
51 5100 : const int maxcoa = ibasis->maxco;
52 5100 : const int maxcob = jbasis->maxco;
53 5100 : const int ncoseta = ncoset(ibasis->lmax[iset]);
54 5100 : const int ncosetb = ncoset(jbasis->lmax[jset]);
55 5100 : const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
56 5100 : const int ncob = jbasis->npgf[jset] * ncosetb;
57 :
58 5100 : initialize_tensor_2(work, nsgf_setb, ncoa);
59 5100 : realloc_tensor(work);
60 :
61 5100 : initialize_tensor_2(pab, ncob, ncoa);
62 5100 : realloc_tensor(pab);
63 :
64 : // the rotations happen here.
65 5100 : if (iatom <= jatom) {
66 3388 : m1.op1 = 'N';
67 3388 : m1.op2 = 'N';
68 3388 : m1.m = work->size[0];
69 3388 : m1.n = work->size[1];
70 3388 : m1.k = nsgf_seta;
71 3388 : m1.alpha = 1.0;
72 3388 : m1.beta = 0.0;
73 3388 : m1.a = block + sgfb * nsgfa + sgfa;
74 3388 : m1.lda = nsgfa;
75 3388 : m1.b = &ibasis->sphi[sgfa * maxcoa];
76 3388 : m1.ldb = maxcoa;
77 3388 : m1.c = work->data;
78 3388 : m1.ldc = work->ld_;
79 : } else {
80 1712 : m1.op1 = 'T';
81 1712 : m1.op2 = 'N';
82 1712 : m1.m = work->size[0];
83 1712 : m1.n = work->size[1];
84 1712 : m1.k = nsgf_seta;
85 1712 : m1.alpha = 1.0;
86 1712 : m1.beta = 0.0;
87 1712 : m1.a = block + sgfa * nsgfb + sgfb;
88 1712 : m1.lda = nsgfb;
89 1712 : m1.b = &ibasis->sphi[sgfa * maxcoa];
90 1712 : m1.ldb = maxcoa;
91 1712 : m1.c = work->data;
92 1712 : m1.ldc = work->ld_;
93 : }
94 :
95 5100 : dgemm_simplified(&m1);
96 :
97 5100 : m2.op1 = 'T';
98 5100 : m2.op2 = 'N';
99 5100 : m2.m = pab->size[0];
100 5100 : m2.n = pab->size[1];
101 5100 : m2.k = work->size[0];
102 5100 : m2.alpha = 1.0;
103 5100 : m2.beta = 0.0;
104 5100 : m2.a = &jbasis->sphi[sgfb * maxcob];
105 5100 : m2.lda = maxcob;
106 5100 : m2.b = work->data;
107 5100 : m2.ldb = work->ld_;
108 5100 : m2.c = pab->data;
109 5100 : m2.ldc = pab->ld_;
110 :
111 5100 : dgemm_simplified(&m2);
112 5100 : }
113 :
114 : void tensor_reduction_for_collocate_integrate(
115 : double *scratch, const double alpha, const bool *const orthogonal,
116 : const struct tensor_ *Exp, const struct tensor_ *co,
117 : const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube);
118 :
119 : /* compute the functions (x - x_i)^l exp (-eta (x - x_i)^2) for l = 0..lp using
120 : * a recursive relation to avoid computing the exponential on each grid point. I
121 : * think it is not really necessary anymore since it is *not* the dominating
122 : * contribution to computation of collocate and integrate */
123 :
124 271002 : void grid_fill_pol_dgemm(const bool transpose, const double dr,
125 : const double roffset, const int pol_offset,
126 : const int xmin, const int xmax, const int lp,
127 : const int cmax, const double zetp, double *pol_) {
128 271002 : tensor pol;
129 271002 : const double t_exp_1 = exp(-zetp * dr * dr);
130 271002 : const double t_exp_2 = t_exp_1 * t_exp_1;
131 :
132 271002 : double t_exp_min_1 = exp(-zetp * (dr - roffset) * (dr - roffset));
133 271002 : double t_exp_min_2 = exp(2.0 * zetp * (dr - roffset) * dr);
134 :
135 271002 : double t_exp_plus_1 = exp(-zetp * roffset * roffset);
136 271002 : double t_exp_plus_2 = exp(2.0 * zetp * roffset * dr);
137 :
138 271002 : if (transpose) {
139 105327 : initialize_tensor_2(&pol, cmax, lp + 1);
140 105327 : pol.data = pol_;
141 : /* It is original Ole code. I need to transpose the polynomials for the
142 : * integration routine and Ole code already does it. */
143 1416424 : for (int ig = 0; ig >= xmin; ig--) {
144 1311097 : const double rpg = ig * dr - roffset;
145 1311097 : t_exp_min_1 *= t_exp_min_2 * t_exp_1;
146 1311097 : t_exp_min_2 *= t_exp_2;
147 1311097 : double pg = t_exp_min_1;
148 5369183 : for (int icoef = 0; icoef <= lp; icoef++) {
149 4058086 : idx2(pol, pol_offset + ig - xmin, icoef) = pg;
150 4058086 : pg *= rpg;
151 : }
152 : }
153 :
154 105327 : double t_exp_plus_1 = exp(-zetp * roffset * roffset);
155 105327 : double t_exp_plus_2 = exp(2 * zetp * roffset * dr);
156 1360432 : for (int ig = 1; ig <= xmax; ig++) {
157 1255105 : const double rpg = ig * dr - roffset;
158 1255105 : t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
159 1255105 : t_exp_plus_2 *= t_exp_2;
160 1255105 : double pg = t_exp_plus_1;
161 5140739 : for (int icoef = 0; icoef <= lp; icoef++) {
162 3885634 : idx2(pol, pol_offset + ig - xmin, icoef) = pg;
163 3885634 : pg *= rpg;
164 : }
165 : }
166 :
167 : } else {
168 165675 : initialize_tensor_2(&pol, lp + 1, cmax);
169 165675 : pol.data = pol_;
170 : /* memset(pol.data, 0, sizeof(double) * pol.alloc_size_); */
171 : /*
172 : * compute the values of all (x-xp)**lp*exp(..)
173 : *
174 : * still requires the old trick:
175 : * new trick to avoid to many exps (reuse the result from the previous
176 : * gridpoint): exp( -a*(x+d)**2)=exp(-a*x**2)*exp(-2*a*x*d)*exp(-a*d**2)
177 : * exp(-2*a*(x+d)*d)=exp(-2*a*x*d)*exp(-2*a*d**2)
178 : */
179 :
180 : /* compute the exponential recursively and store the polynomial prefactors
181 : * as well */
182 2205408 : for (int ig = 0; ig >= xmin; ig--) {
183 2039733 : const double rpg = ig * dr - roffset;
184 2039733 : t_exp_min_1 *= t_exp_min_2 * t_exp_1;
185 2039733 : t_exp_min_2 *= t_exp_2;
186 2039733 : double pg = t_exp_min_1;
187 2039733 : idx2(pol, 0, pol_offset + ig - xmin) = pg;
188 2039733 : if (lp > 0)
189 1312089 : idx2(pol, 1, pol_offset + ig - xmin) = rpg;
190 : }
191 :
192 2117082 : for (int ig = 1; ig <= xmax; ig++) {
193 1951407 : const double rpg = ig * dr - roffset;
194 1951407 : t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
195 1951407 : t_exp_plus_2 *= t_exp_2;
196 1951407 : double pg = t_exp_plus_1;
197 1951407 : idx2(pol, 0, pol_offset + ig - xmin) = pg;
198 1951407 : if (lp > 0)
199 1253337 : idx2(pol, 1, pol_offset + ig - xmin) = rpg;
200 : }
201 :
202 : /* compute the remaining powers using previously computed stuff */
203 165675 : if (lp >= 2) {
204 61209 : double *__restrict__ poly = &idx2(pol, 1, 0);
205 61209 : double *__restrict__ src1 = &idx2(pol, 0, 0);
206 61209 : double *__restrict__ dst = &idx2(pol, 2, 0);
207 : // #pragma omp simd linear(dst, src1, poly) simdlen(8)
208 240150 : GRID_PRAGMA_SIMD((dst, src1, poly), 8)
209 61209 : for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++)
210 1477752 : dst[ig] = src1[ig] * poly[ig] * poly[ig];
211 : }
212 :
213 178941 : for (int icoef = 3; icoef <= lp; icoef++) {
214 13266 : double *__restrict__ poly = &idx2(pol, 1, 0);
215 13266 : double *__restrict__ src1 = &idx2(pol, icoef - 1, 0);
216 13266 : double *__restrict__ dst = &idx2(pol, icoef, 0);
217 13266 : GRID_PRAGMA_SIMD((dst, src1, poly), 8)
218 13266 : for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++) {
219 372702 : dst[ig] = src1[ig] * poly[ig];
220 : }
221 : }
222 :
223 : //
224 165675 : if (lp > 0) {
225 : // I can not declare src__ variable constant because it breaks openmp
226 : // standard.
227 106155 : double *__restrict__ dst = &idx2(pol, 1, 0);
228 106155 : double *__restrict__ src = &idx2(pol, 0, 0);
229 377157 : GRID_PRAGMA_SIMD((dst, src), 8)
230 106155 : for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++) {
231 2565426 : dst[ig] *= src[ig];
232 : }
233 : }
234 : }
235 271002 : }
236 :
237 0 : void apply_sphere_cutoff_ortho(struct collocation_integration_ *const handler,
238 : const double disr_radius, const int cmax,
239 : const int *const lb_cube,
240 : const int *const cube_center) {
241 : // a mapping so that the ig corresponds to the right grid point
242 0 : int **map = handler->map;
243 0 : map[1] = map[0] + 2 * cmax + 1;
244 0 : map[2] = map[1] + 2 * cmax + 1;
245 : // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
246 :
247 0 : for (int i = 0; i < 3; i++) {
248 0 : for (int ig = 0; ig < handler->cube.size[i]; ig++) {
249 0 : map[i][ig] = modulo(cube_center[i] + lb_cube[i] + ig -
250 0 : handler->grid.lower_corner[i],
251 : handler->grid.full_size[i]);
252 : }
253 : }
254 :
255 0 : const Interval zwindow = {.xmin = handler->grid.window_shift[0],
256 0 : .xmax = handler->grid.window_size[0]};
257 0 : const Interval ywindow = {.xmin = handler->grid.window_shift[1],
258 0 : .xmax = handler->grid.window_size[1]};
259 0 : const Interval xwindow = {.xmin = handler->grid.window_shift[2],
260 0 : .xmax = handler->grid.window_size[2]};
261 :
262 0 : for (int kg = 0; kg < handler->cube.size[0]; kg++) {
263 0 : const int k = map[0][kg];
264 0 : const int kd =
265 0 : (2 * (kg + lb_cube[0]) - 1) / 2; // distance from center in grid points
266 0 : const double kr = kd * handler->dh[2][2]; // distance from center in a.u.
267 0 : const double kremain = disr_radius * disr_radius - kr * kr;
268 :
269 0 : if ((kremain >= 0.0) && is_point_in_interval(k, zwindow)) {
270 :
271 0 : const int jgmin = ceil(-1e-8 - sqrt(kremain) * handler->dh_inv[1][1]);
272 0 : for (int jg = jgmin; jg <= (1 - jgmin); jg++) {
273 0 : const int j = map[1][jg - lb_cube[1]];
274 0 : const double jr = ((2 * jg - 1) >> 1) *
275 0 : handler->dh[1][1]; // distance from center in a.u.
276 0 : const double jremain = kremain - jr * jr;
277 :
278 0 : if ((jremain >= 0.0) && is_point_in_interval(j, ywindow)) {
279 0 : const int xmin = ceil(-1e-8 - sqrt(jremain) * handler->dh_inv[0][0]);
280 0 : const int xmax = 1 - xmin;
281 :
282 : // printf("xmin %d, xmax %d\n", xmin, xmax);
283 0 : for (int x = xmin - lb_cube[2];
284 0 : x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
285 0 : const int x1 = map[2][x];
286 :
287 0 : if (!is_point_in_interval(x1, xwindow))
288 0 : continue;
289 :
290 0 : int lower_corner[3] = {k, j, x1};
291 0 : int upper_corner[3] = {k + 1, j + 1, x1 + 1};
292 :
293 0 : compute_interval(map[2], handler->grid.full_size[2],
294 0 : handler->grid.size[2], xmax - lb_cube[2], x1, &x,
295 : lower_corner + 2, upper_corner + 2, xwindow);
296 :
297 0 : if (upper_corner[2] - lower_corner[2]) {
298 0 : const int position1[3] = {kg, jg - lb_cube[1], x};
299 :
300 0 : double *restrict dst = &idx3(handler->grid, lower_corner[0],
301 : lower_corner[1], lower_corner[2]);
302 0 : double *restrict src = &idx3(handler->cube, position1[0],
303 : position1[1], position1[2]);
304 :
305 0 : const int sizex = upper_corner[2] - lower_corner[2];
306 0 : GRID_PRAGMA_SIMD((dst, src), 8)
307 0 : for (int x = 0; x < sizex; x++) {
308 0 : dst[x] += src[x];
309 : }
310 : }
311 :
312 0 : if (handler->grid.size[2] == handler->grid.full_size[2])
313 0 : update_loop_index(handler->grid.full_size[2], x1, &x);
314 : else
315 0 : x += upper_corner[2] - lower_corner[2] - 1;
316 : }
317 : }
318 : }
319 : }
320 : }
321 0 : }
322 :
323 0 : void apply_spherical_cutoff_generic(
324 : struct collocation_integration_ *const handler, const double disr_radius,
325 : const int cmax, const int *const lb_cube, const int *const ub_cube,
326 : const double *const roffset, const int *const cube_center) {
327 :
328 0 : const double a = handler->dh[0][0] * handler->dh[0][0] +
329 0 : handler->dh[0][1] * handler->dh[0][1] +
330 0 : handler->dh[0][2] * handler->dh[0][2];
331 0 : const double a_inv = 1.0 / a;
332 : // a mapping so that the ig corresponds to the right grid point
333 0 : int **map = handler->map;
334 0 : map[1] = map[0] + 2 * cmax + 1;
335 0 : map[2] = map[1] + 2 * cmax + 1;
336 : // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
337 :
338 0 : for (int i = 0; i < 3; i++) {
339 0 : for (int ig = 0; ig < handler->cube.size[i]; ig++) {
340 0 : map[i][ig] = modulo(cube_center[i] + ig + lb_cube[i] -
341 0 : handler->grid.lower_corner[i],
342 : handler->grid.full_size[i]);
343 : }
344 : }
345 :
346 0 : const Interval zwindow = {.xmin = handler->grid.window_shift[0],
347 0 : .xmax = handler->grid.window_size[0]};
348 0 : const Interval ywindow = {.xmin = handler->grid.window_shift[1],
349 0 : .xmax = handler->grid.window_size[1]};
350 0 : const Interval xwindow = {.xmin = handler->grid.window_shift[2],
351 0 : .xmax = handler->grid.window_size[2]};
352 :
353 0 : for (int k = 0; k < handler->cube.size[0]; k++) {
354 0 : const int iz = map[0][k];
355 :
356 0 : if (!is_point_in_interval(iz, zwindow))
357 0 : continue;
358 :
359 0 : const double tz = (k + lb_cube[0] - roffset[0]);
360 0 : const double z[3] = {tz * handler->dh[2][0], tz * handler->dh[2][1],
361 0 : tz * handler->dh[2][2]};
362 :
363 0 : for (int j = 0; j < handler->cube.size[1]; j++) {
364 0 : const int iy = map[1][j];
365 :
366 0 : if (!is_point_in_interval(iy, ywindow))
367 0 : continue;
368 :
369 0 : const double ty = (j - roffset[1] + lb_cube[1]);
370 0 : const double y[3] = {z[0] + ty * handler->dh[1][0],
371 0 : z[1] + ty * handler->dh[1][1],
372 0 : z[2] + ty * handler->dh[1][2]};
373 :
374 : /* Sqrt[(-2 x1 \[Alpha] - 2 y1 \[Beta] - 2 z1 \[Gamma])^2 - */
375 : /* 4 (x1^2 + y1^2 + z1^2)
376 : * (\[Alpha]^2 + \[Beta]^2 + \[Gamma]^2)] */
377 :
378 0 : const double b =
379 0 : -2.0 * (handler->dh[0][0] * (roffset[2] * handler->dh[0][0] - y[0]) +
380 0 : handler->dh[0][1] * (roffset[2] * handler->dh[0][1] - y[1]) +
381 0 : handler->dh[0][2] * (roffset[2] * handler->dh[0][2] - y[2]));
382 :
383 0 : const double c = (roffset[2] * handler->dh[0][0] - y[0]) *
384 0 : (roffset[2] * handler->dh[0][0] - y[0]) +
385 0 : (roffset[2] * handler->dh[0][1] - y[1]) *
386 0 : (roffset[2] * handler->dh[0][1] - y[1]) +
387 0 : (roffset[2] * handler->dh[0][2] - y[2]) *
388 : (roffset[2] * handler->dh[0][2] - y[2]) -
389 0 : disr_radius * disr_radius;
390 :
391 0 : double delta = b * b - 4.0 * a * c;
392 :
393 0 : if (delta < 0.0)
394 0 : continue;
395 :
396 0 : delta = sqrt(delta);
397 :
398 0 : const int xmin = imax(ceil((-b - delta) * 0.5 * a_inv), lb_cube[2]);
399 0 : const int xmax = imin(floor((-b + delta) * 0.5 * a_inv), ub_cube[2]);
400 :
401 0 : int lower_corner[3] = {iz, iy, xmin};
402 0 : int upper_corner[3] = {iz + 1, iy + 1, xmin};
403 :
404 0 : for (int x = xmin - lb_cube[2];
405 0 : x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
406 0 : const int x1 = map[2][x];
407 :
408 0 : if (!is_point_in_interval(x1, xwindow))
409 0 : continue;
410 :
411 0 : compute_interval(map[2], handler->grid.full_size[2],
412 0 : handler->grid.size[2], xmax - lb_cube[2], x1, &x,
413 : lower_corner + 2, upper_corner + 2, xwindow);
414 :
415 0 : if (upper_corner[2] - lower_corner[2]) {
416 0 : const int position1[3] = {k, j, x};
417 :
418 : /* the function will internally take care of the local vs global
419 : * grid */
420 :
421 0 : double *__restrict__ dst = &idx3(handler->grid, lower_corner[0],
422 : lower_corner[1], lower_corner[2]);
423 0 : double *__restrict__ src =
424 0 : &idx3(handler->cube, position1[0], position1[1], position1[2]);
425 :
426 0 : const int sizex = upper_corner[2] - lower_corner[2];
427 0 : GRID_PRAGMA_SIMD((dst, src), 8)
428 0 : for (int x = 0; x < sizex; x++) {
429 0 : dst[x] += src[x];
430 : }
431 :
432 0 : if (handler->grid.size[0] == handler->grid.full_size[0])
433 0 : update_loop_index(handler->grid.full_size[2], x1, &x);
434 : else
435 0 : x += upper_corner[2] - lower_corner[2] - 1;
436 : }
437 : }
438 : }
439 : }
440 0 : }
441 :
442 10560 : void collocate_l0(double *scratch, const double alpha, const bool orthogonal_xy,
443 : const struct tensor_ *exp_xy,
444 : const struct tensor_ *p_alpha_beta_reduced_,
445 : struct tensor_ *cube) {
446 10560 : const double *__restrict pz =
447 : &idx3(p_alpha_beta_reduced_[0], 0, 0, 0); /* k indice */
448 10560 : const double *__restrict py =
449 10560 : &idx3(p_alpha_beta_reduced_[0], 1, 0, 0); /* j indice */
450 10560 : const double *__restrict px =
451 10560 : &idx3(p_alpha_beta_reduced_[0], 2, 0, 0); /* i indice */
452 :
453 10560 : memset(&idx3(cube[0], 0, 0, 0), 0, sizeof(double) * cube->alloc_size_);
454 10560 : memset(scratch, 0, sizeof(double) * cube->size[1] * cube->ld_);
455 :
456 10560 : cblas_dger(CblasRowMajor, cube->size[1], cube->size[2], alpha, py, 1, px, 1,
457 10560 : scratch, cube->ld_);
458 :
459 10560 : if (exp_xy && !orthogonal_xy) {
460 0 : for (int y = 0; y < cube->size[1]; y++) {
461 0 : const double *__restrict src = &idx2(exp_xy[0], y, 0);
462 0 : double *__restrict dst = &scratch[y * cube->ld_];
463 : // #pragma omp simd linear(dst, src) simdlen(8)
464 0 : GRID_PRAGMA_SIMD((dst, src), 8)
465 0 : for (int x = 0; x < cube->size[2]; x++) {
466 0 : dst[x] *= src[x];
467 : }
468 : }
469 : }
470 :
471 10560 : cblas_dger(CblasRowMajor, cube->size[0], cube->size[1] * cube->ld_, 1.0, pz,
472 10560 : 1, scratch, 1, &idx3(cube[0], 0, 0, 0), cube->size[1] * cube->ld_);
473 10560 : }
474 :
475 : /* compute the following operation (variant) it is a tensor contraction
476 :
477 : V_{kji} = \sum_{\alpha\beta\gamma}
478 : C_{\alpha\gamma\beta} T_{2,\alpha,i} T_{1,\beta,j} T_{0,\gamma,k}
479 :
480 : */
481 81054 : void tensor_reduction_for_collocate_integrate(
482 : double *scratch, const double alpha, const bool *const orthogonal,
483 : const struct tensor_ *Exp, const struct tensor_ *co,
484 : const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube) {
485 81054 : if (co->size[0] > 1) {
486 70494 : dgemm_params m1, m2, m3;
487 :
488 70494 : memset(&m1, 0, sizeof(dgemm_params));
489 70494 : memset(&m2, 0, sizeof(dgemm_params));
490 70494 : memset(&m3, 0, sizeof(dgemm_params));
491 70494 : tensor T;
492 70494 : tensor W;
493 :
494 70494 : double *__restrict const pz =
495 : &idx3(p_alpha_beta_reduced_[0], 0, 0, 0); /* k indice */
496 70494 : double *__restrict const py =
497 70494 : &idx3(p_alpha_beta_reduced_[0], 1, 0, 0); /* j indice */
498 70494 : double *__restrict const px =
499 70494 : &idx3(p_alpha_beta_reduced_[0], 2, 0, 0); /* i indice */
500 :
501 70494 : initialize_tensor_3(&T, co->size[0] /* alpha */, co->size[1] /* gamma */,
502 : cube->size[1] /* j */);
503 70494 : initialize_tensor_3(&W, co->size[1] /* gamma */, cube->size[1] /* j */,
504 : cube->size[2] /* i */);
505 :
506 70494 : T.data = scratch;
507 70494 : W.data = scratch + T.alloc_size_;
508 : /* WARNING we are in row major layout. cblas allows it and it is more
509 : * natural to read left to right than top to bottom
510 : *
511 : * we do first T_{\alpha,\gamma,j} = \sum_beta C_{alpha\gamma\beta}
512 : * Y_{\beta, j}
513 : *
514 : * keep in mind that Y_{\beta, j} = p_alpha_beta_reduced(1, \beta, j)
515 : * and the order of indices is also important. the last indice is the
516 : * fastest one. it can be done with one dgemm.
517 : */
518 :
519 70494 : m1.op1 = 'N';
520 70494 : m1.op2 = 'N';
521 70494 : m1.alpha = alpha;
522 70494 : m1.beta = 0.0;
523 70494 : m1.m = co->size[0] * co->size[1]; /* alpha gamma */
524 70494 : m1.n = cube->size[1]; /* j */
525 70494 : m1.k = co->size[2]; /* beta */
526 70494 : m1.a = co->data; // Coef_{alpha,gamma,beta} Coef_xzy
527 70494 : m1.lda = co->ld_;
528 70494 : m1.b = py; // Y_{beta, j} = p_alpha_beta_reduced(1, beta, j)
529 70494 : m1.ldb = p_alpha_beta_reduced_->ld_;
530 70494 : m1.c = T.data; // T_{\alpha, \gamma, j} = T(alpha, gamma, j)
531 70494 : m1.ldc = T.ld_;
532 :
533 : /*
534 : * the next step is a reduction along the alpha index.
535 : *
536 : * We compute then
537 : *
538 : * W_{gamma, j, i} = sum_{\alpha} T_{\gamma, j, alpha} X_{\alpha, i}
539 : *
540 : * which means we need to transpose T_{\alpha, \gamma, j} to get
541 : * T_{\gamma, j, \alpha}. Fortunately we can do it while doing the
542 : * matrix - matrix multiplication
543 : */
544 :
545 70494 : m2.op1 = 'T';
546 70494 : m2.op2 = 'N';
547 70494 : m2.alpha = 1.0;
548 70494 : m2.beta = 0.0;
549 70494 : m2.m = cube->size[1] * co->size[1]; // (\gamma j) direction
550 70494 : m2.n = cube->size[2]; // i
551 70494 : m2.k = co->size[0]; // alpha
552 70494 : m2.a = T.data; // T_{\alpha, \gamma, j}
553 70494 : m2.lda = T.ld_ * co->size[1];
554 70494 : m2.b = px; // X_{alpha, i} = p_alpha_beta_reduced(0, alpha, i)
555 70494 : m2.ldb = p_alpha_beta_reduced_->ld_;
556 70494 : m2.c = W.data; // W_{\gamma, j, i}
557 70494 : m2.ldc = W.ld_;
558 :
559 : /* the final step is again a reduction along the gamma indice. It can
560 : * again be done with one dgemm. The operation is simply
561 : *
562 : * Cube_{k, j, i} = \sum_{alpha} Z_{k, \gamma} W_{\gamma, j, i}
563 : *
564 : * which means we need to transpose Z_{\gamma, k}.
565 : */
566 :
567 70494 : m3.op1 = 'T';
568 70494 : m3.op2 = 'N';
569 70494 : m3.alpha = alpha;
570 70494 : m3.beta = 0.0;
571 70494 : m3.m = cube->size[0]; // Z_{k \gamma}
572 70494 : m3.n = cube->size[1] * cube->size[2]; // (ji) direction
573 70494 : m3.k = co->size[1]; // \gamma
574 70494 : m3.a = pz; // p_alpha_beta_reduced(0, gamma, i)
575 70494 : m3.lda = p_alpha_beta_reduced_->ld_;
576 70494 : m3.b = &idx3(W, 0, 0, 0); // W_{\gamma, j, i}
577 70494 : m3.ldb = W.size[1] * W.ld_;
578 70494 : m3.c = &idx3(cube[0], 0, 0, 0); // cube_{kji}
579 70494 : m3.ldc = cube->ld_ * cube->size[1];
580 :
581 70494 : dgemm_simplified(&m1);
582 70494 : dgemm_simplified(&m2);
583 :
584 : // apply the non orthorombic corrections in the xy plane
585 70494 : if (Exp && !orthogonal[2]) {
586 4668 : tensor exp_xy;
587 4668 : initialize_tensor_2(&exp_xy, Exp->size[1], Exp->size[2]);
588 4668 : exp_xy.data = &idx3(Exp[0], 2, 0, 0);
589 4668 : apply_non_orthorombic_corrections_xy_blocked(&exp_xy, &W);
590 : }
591 :
592 70494 : dgemm_simplified(&m3);
593 : } else {
594 10560 : if (Exp && !orthogonal[2]) {
595 0 : tensor exp_xy;
596 0 : initialize_tensor_2(&exp_xy, Exp->size[1], Exp->size[2]);
597 :
598 0 : exp_xy.data = &idx3(Exp[0], 2, 0, 0);
599 0 : collocate_l0(scratch, co->data[0] * alpha, orthogonal[2], &exp_xy,
600 : p_alpha_beta_reduced_, cube);
601 : } else {
602 10560 : collocate_l0(scratch, co->data[0] * alpha, true, NULL,
603 : p_alpha_beta_reduced_, cube);
604 : }
605 : }
606 :
607 81054 : if (Exp == NULL)
608 : return;
609 :
610 45945 : if (Exp && (!orthogonal[0] && !orthogonal[1])) {
611 0 : tensor exp_yz;
612 0 : tensor exp_xz;
613 0 : initialize_tensor_2(&exp_xz, Exp->size[1], Exp->size[2]);
614 0 : initialize_tensor_2(&exp_yz, Exp->size[1], Exp->size[2]);
615 0 : exp_xz.data = &idx3(Exp[0], 0, 0, 0);
616 0 : exp_yz.data = &idx3(Exp[0], 1, 0, 0);
617 0 : apply_non_orthorombic_corrections_xz_yz_blocked(&exp_xz, &exp_yz, cube);
618 0 : return;
619 : }
620 :
621 45945 : if (Exp && (!orthogonal[0] || !orthogonal[1])) {
622 20163 : if (!orthogonal[0]) {
623 20163 : tensor exp_xz;
624 20163 : initialize_tensor_2(&exp_xz, Exp->size[1], Exp->size[2]);
625 20163 : exp_xz.data = &idx3(Exp[0], 0, 0, 0);
626 20163 : apply_non_orthorombic_corrections_xz_blocked(&exp_xz, cube);
627 : }
628 :
629 20163 : if (!orthogonal[1]) {
630 0 : tensor exp_yz;
631 0 : initialize_tensor_2(&exp_yz, Exp->size[1], Exp->size[2]);
632 0 : exp_yz.data = &idx3(Exp[0], 1, 0, 0);
633 0 : apply_non_orthorombic_corrections_yz_blocked(&exp_yz, cube);
634 : }
635 : }
636 :
637 : return;
638 : }
639 :
640 : /* It is a sub-optimal version of the mapping in case of a cubic cutoff. But is
641 : * very general and does not depend on the orthorombic nature of the grid. for
642 : * orthorombic cases, it is faster to apply PCB directly on the polynomials. */
643 :
644 45945 : void apply_mapping_cubic(struct collocation_integration_ *handler,
645 : const int cmax, const int *const lower_boundaries_cube,
646 : const int *const cube_center) {
647 :
648 : // a mapping so that the ig corresponds to the right grid point
649 45945 : int **map = handler->map;
650 45945 : map[1] = map[0] + 2 * cmax + 1;
651 45945 : map[2] = map[1] + 2 * cmax + 1;
652 :
653 183780 : for (int i = 0; i < 3; i++) {
654 3462100 : for (int ig = 0; ig < handler->cube.size[i]; ig++) {
655 3324265 : map[i][ig] = modulo(cube_center[i] + ig + lower_boundaries_cube[i] -
656 3324265 : handler->grid.lower_corner[i],
657 : handler->grid.full_size[i]);
658 : }
659 : }
660 :
661 45945 : int lower_corner[3];
662 45945 : int upper_corner[3];
663 :
664 45945 : const Interval zwindow = {.xmin = handler->grid.window_shift[0],
665 45945 : .xmax = handler->grid.window_size[0]};
666 45945 : const Interval ywindow = {.xmin = handler->grid.window_shift[1],
667 45945 : .xmax = handler->grid.window_size[1]};
668 45945 : const Interval xwindow = {.xmin = handler->grid.window_shift[2],
669 45945 : .xmax = handler->grid.window_size[2]};
670 :
671 : /* this code makes a decomposition of the cube such that we can add block of
672 : * datas in a vectorized way. */
673 :
674 : /* the decomposition depends unfortunately on the way the grid is split over
675 : * mpi ranks. If each mpi rank has the full grid then it is simple. A 1D
676 : * example of the decomposition will explain it better. We have an interval
677 : * [x1, x1 + cube_size - 1] (and a second index x [0, cube_size -1]) and a
678 : * grid that goes from [0.. grid_size - 1].
679 : *
680 : * We start from x1 and compute the largest interval [x1.. x1 + diff] that fit
681 : * to [0.. grid_size - 1]. Computing the difference diff is simply
682 : * min(grid_size - x1, cube_size - x). then we add the result in a vectorized
683 : * fashion. we itterate the processus by reducing the interval [x1, x1 +
684 : * cube_size - 1] until it is empty. */
685 :
686 134487 : for (int z = 0; (z < handler->cube.size[0]); z++) {
687 88542 : const int z1 = map[0][z];
688 :
689 88542 : if (!is_point_in_interval(z1, zwindow))
690 0 : continue;
691 :
692 88542 : compute_interval(map[0], handler->grid.full_size[0], handler->grid.size[0],
693 : handler->cube.size[0], z1, &z, lower_corner, upper_corner,
694 : zwindow);
695 :
696 : /* // We have a full plane. */
697 88542 : if (upper_corner[0] - lower_corner[0]) {
698 268410 : for (int y = 0; y < handler->cube.size[1]; y++) {
699 179868 : const int y1 = map[1][y];
700 :
701 : // this check is completely irrelevant when running without MPI.
702 179868 : if (!is_point_in_interval(y1, ywindow))
703 0 : continue;
704 :
705 179868 : compute_interval(map[1], handler->grid.full_size[1],
706 179868 : handler->grid.size[1], handler->cube.size[1], y1, &y,
707 : lower_corner + 1, upper_corner + 1, ywindow);
708 :
709 179868 : if (upper_corner[1] - lower_corner[1]) {
710 594034 : for (int x = 0; x < handler->cube.size[2]; x++) {
711 414166 : const int x1 = map[2][x];
712 :
713 414166 : if (!is_point_in_interval(x1, xwindow))
714 0 : continue;
715 :
716 414166 : compute_interval(map[2], handler->grid.full_size[2],
717 414166 : handler->grid.size[2], handler->cube.size[2], x1,
718 : &x, lower_corner + 2, upper_corner + 2, xwindow);
719 :
720 414166 : if (upper_corner[2] - lower_corner[2]) {
721 414166 : const int position1[3] = {z, y, x};
722 : /* the function will internally take care of the local vx global
723 : * grid */
724 414166 : add_sub_grid(
725 : lower_corner, // lower corner of the portion of cube (in the
726 : // full grid)
727 : upper_corner, // upper corner of the portion of cube (in the
728 : // full grid)
729 : position1, // starting position subblock inside the cube
730 414166 : &handler->cube, // the cube to extract data from
731 : &handler->grid); // the grid to add data from
732 414166 : if (handler->grid.size[2] == handler->grid.full_size[2])
733 414166 : update_loop_index(handler->grid.full_size[2], x1, &x);
734 : else
735 0 : x += upper_corner[2] - lower_corner[2] - 1;
736 : }
737 : }
738 179868 : if (handler->grid.size[1] == handler->grid.full_size[1])
739 179868 : update_loop_index(handler->grid.full_size[1], y1, &y);
740 : else
741 0 : y += upper_corner[1] - lower_corner[1] - 1;
742 : }
743 : }
744 88542 : if (handler->grid.size[0] == handler->grid.full_size[0])
745 88542 : update_loop_index(handler->grid.full_size[0], z1, &z);
746 : else
747 0 : z += upper_corner[0] - lower_corner[0] - 1;
748 : }
749 : }
750 45945 : }
751 :
752 : // *****************************************************************************
753 45945 : void grid_collocate(collocation_integration *const handler,
754 : const bool use_ortho, const double zetp, const double rp[3],
755 : const double radius) {
756 : // *** position of the gaussian product
757 : //
758 : // this is the actual definition of the position on the grid
759 : // i.e. a point rp(:) gets here grid coordinates
760 : // MODULO(rp(:)/dr(:),npts(:))+1
761 : // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid ((1,1,1) on
762 : // grid)
763 :
764 : // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
765 45945 : int cubecenter[3];
766 45945 : int cube_size[3];
767 45945 : int lb_cube[3], ub_cube[3];
768 45945 : int pol_offset[3] = {0, 0, 0};
769 45945 : double roffset[3];
770 45945 : double disr_radius;
771 : /* cube : grid containing pointlike product between polynomials
772 : *
773 : * pol : grid containing the polynomials in all three directions
774 : *
775 : * pol_folded : grid containing the polynomials after folding for periodic
776 : * boundaries conditions
777 : */
778 :
779 : /* seting up the cube parameters */
780 91890 : int cmax = compute_cube_properties(
781 45945 : use_ortho, radius, (const double(*)[3])handler->dh,
782 45945 : (const double(*)[3])handler->dh_inv, rp, &disr_radius, roffset,
783 : cubecenter, lb_cube, ub_cube, cube_size);
784 :
785 : /* initialize the multidimensional array containing the polynomials */
786 45945 : initialize_tensor_3(&handler->pol, 3, handler->coef.size[0], cmax);
787 45945 : realloc_tensor(&handler->pol);
788 45945 : memset(handler->pol.data, 0, sizeof(double) * handler->pol.alloc_size_);
789 :
790 : /* compute the polynomials */
791 :
792 : // WARNING : do not reverse the order in pol otherwise you will have to
793 : // reverse the order in collocate_dgemm as well.
794 :
795 45945 : if (use_ortho) {
796 21114 : grid_fill_pol_dgemm(false, handler->dh[0][0], roffset[2], pol_offset[2],
797 21114 : lb_cube[2], ub_cube[2], handler->coef.size[2] - 1, cmax,
798 21114 : zetp, &idx3(handler->pol, 2, 0, 0)); /* i indice */
799 21114 : grid_fill_pol_dgemm(false, handler->dh[1][1], roffset[1], pol_offset[1],
800 21114 : lb_cube[1], ub_cube[1], handler->coef.size[1] - 1, cmax,
801 21114 : zetp, &idx3(handler->pol, 1, 0, 0)); /* j indice */
802 21114 : grid_fill_pol_dgemm(false, handler->dh[2][2], roffset[0], pol_offset[0],
803 21114 : lb_cube[0], ub_cube[0], handler->coef.size[0] - 1, cmax,
804 : zetp, &idx3(handler->pol, 0, 0, 0)); /* k indice */
805 : } else {
806 24831 : grid_fill_pol_dgemm(false, 1.0, roffset[0], pol_offset[2], lb_cube[0],
807 24831 : ub_cube[0], handler->coef.size[0] - 1, cmax,
808 24831 : zetp * handler->dx[0],
809 : &idx3(handler->pol, 0, 0, 0)); /* k indice */
810 24831 : grid_fill_pol_dgemm(false, 1.0, roffset[1], pol_offset[1], lb_cube[1],
811 24831 : ub_cube[1], handler->coef.size[1] - 1, cmax,
812 24831 : zetp * handler->dx[1],
813 24831 : &idx3(handler->pol, 1, 0, 0)); /* j indice */
814 24831 : grid_fill_pol_dgemm(false, 1.0, roffset[2], pol_offset[0], lb_cube[2],
815 24831 : ub_cube[2], handler->coef.size[2] - 1, cmax,
816 24831 : zetp * handler->dx[2],
817 24831 : &idx3(handler->pol, 2, 0, 0)); /* i indice */
818 :
819 24831 : calculate_non_orthorombic_corrections_tensor(
820 : zetp, roffset, (const double(*)[3])handler->dh, lb_cube, ub_cube,
821 24831 : handler->orthogonal, &handler->Exp);
822 :
823 : /* Use a slightly modified version of Ole code */
824 24831 : grid_transform_coef_xzy_to_ikj((const double(*)[3])handler->dh,
825 24831 : &handler->coef);
826 : }
827 :
828 : /* allocate memory for the polynomial and the cube */
829 :
830 45945 : initialize_tensor_3(&handler->cube, cube_size[0], cube_size[1], cube_size[2]);
831 :
832 45945 : realloc_tensor(&handler->cube);
833 45945 : initialize_W_and_T(handler, &handler->cube, &handler->coef);
834 :
835 45945 : tensor_reduction_for_collocate_integrate(
836 45945 : handler->scratch, // pointer to scratch memory
837 45945 : 1.0, handler->orthogonal, &handler->Exp, &handler->coef, &handler->pol,
838 : &handler->cube);
839 :
840 45945 : if (handler->apply_cutoff) {
841 0 : if (use_ortho) {
842 0 : apply_sphere_cutoff_ortho(handler, disr_radius, cmax, lb_cube,
843 : cubecenter);
844 : } else {
845 0 : apply_spherical_cutoff_generic(handler, radius, cmax, lb_cube, ub_cube,
846 : roffset, cubecenter);
847 : }
848 0 : return;
849 : }
850 45945 : apply_mapping_cubic(handler, cmax, lb_cube, cubecenter);
851 : }
852 :
853 : //******************************************************************************
854 0 : void grid_dgemm_collocate_pgf_product(
855 : const bool use_ortho, const int border_mask, const enum grid_func func,
856 : const int la_max, const int la_min, const int lb_max, const int lb_min,
857 : const double zeta, const double zetb, const double rscale,
858 : const double dh[3][3], const double dh_inv[3][3], const double ra[3],
859 : const double rab[3], const int grid_global_size[3],
860 : const int grid_local_size[3], const int shift_local[3],
861 : const int border_width[3], const double radius, const int o1, const int o2,
862 0 : const int n1, const int n2, const double pab_[n2][n1],
863 : double *const grid_) {
864 0 : collocation_integration *handler = collocate_create_handle();
865 :
866 0 : tensor pab;
867 0 : initialize_tensor_2(&pab, n2, n1);
868 0 : alloc_tensor(&pab);
869 0 : memcpy(pab.data, pab_, sizeof(double) * n1 * n2);
870 : // Uncomment this to dump all tasks to file.
871 : // #define __GRID_DUMP_TASKS
872 0 : int offset[2] = {o1, o2};
873 :
874 0 : int lmax[2] = {la_max, lb_max};
875 0 : int lmin[2] = {la_min, lb_min};
876 :
877 0 : const double zetp = zeta + zetb;
878 0 : const double f = zetb / zetp;
879 0 : const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
880 0 : const double prefactor = rscale * exp(-zeta * f * rab2);
881 0 : const double zeta_pair[2] = {zeta, zetb};
882 0 : initialize_basis_vectors(handler, dh, dh_inv);
883 0 : verify_orthogonality((const double(*)[3])dh, handler->orthogonal);
884 :
885 0 : initialize_tensor_3(&(handler->grid), grid_local_size[2], grid_local_size[1],
886 : grid_local_size[0]);
887 :
888 0 : handler->grid.ld_ = grid_local_size[0];
889 0 : handler->grid.data = grid_;
890 :
891 0 : setup_global_grid_size(&handler->grid, (const int *)grid_global_size);
892 :
893 0 : initialize_tensor_3(&handler->grid, grid_local_size[2], grid_local_size[1],
894 : grid_local_size[0]);
895 0 : handler->grid.ld_ = grid_local_size[0];
896 :
897 0 : setup_grid_window(&handler->grid, shift_local, border_width, border_mask);
898 :
899 0 : double rp[3], rb[3];
900 0 : for (int i = 0; i < 3; i++) {
901 0 : rp[i] = ra[i] + f * rab[i];
902 0 : rb[i] = ra[i] + rab[i];
903 : }
904 :
905 0 : int lmin_diff[2], lmax_diff[2];
906 0 : grid_prepare_get_ldiffs_dgemm(func, lmin_diff, lmax_diff);
907 :
908 0 : int lmin_prep[2];
909 0 : int lmax_prep[2];
910 :
911 0 : lmin_prep[0] = imax(lmin[0] + lmin_diff[0], 0);
912 0 : lmin_prep[1] = imax(lmin[1] + lmin_diff[1], 0);
913 :
914 0 : lmax_prep[0] = lmax[0] + lmax_diff[0];
915 0 : lmax_prep[1] = lmax[1] + lmax_diff[1];
916 :
917 0 : const int n1_prep = ncoset(lmax_prep[0]);
918 0 : const int n2_prep = ncoset(lmax_prep[1]);
919 :
920 : /* I really do not like this. This will disappear */
921 0 : tensor pab_prep;
922 0 : initialize_tensor_2(&pab_prep, n2_prep, n1_prep);
923 0 : alloc_tensor(&pab_prep);
924 0 : memset(pab_prep.data, 0, pab_prep.alloc_size_ * sizeof(double));
925 :
926 0 : grid_prepare_pab_dgemm(func, offset, lmax, lmin, &zeta_pair[0], &pab,
927 : &pab_prep);
928 :
929 : // *** initialise the coefficient matrix, we transform the sum
930 : //
931 : // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
932 : // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
933 : // (z-a_z)**lza
934 : //
935 : // into
936 : //
937 : // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
938 : //
939 : // where p is center of the product gaussian, and lp = la_max + lb_max
940 : // (current implementation is l**7)
941 : //
942 :
943 : /* precautionary tail since I probably intitialize data to NULL when I
944 : * initialize a new tensor. I want to keep the memory (I put a ridiculous
945 : * amount already) */
946 :
947 0 : initialize_tensor_4(&(handler->alpha), 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
948 0 : lmax_prep[0] + lmax_prep[1] + 1);
949 0 : realloc_tensor(&(handler->alpha));
950 :
951 0 : const int lp = lmax_prep[0] + lmax_prep[1];
952 :
953 0 : initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
954 0 : realloc_tensor(&(handler->coef));
955 :
956 : // initialy cp2k stores coef_xyz as coef[z][y][x]. this is fine but I
957 : // need them to be stored as
958 :
959 0 : grid_prepare_alpha_dgemm(ra, rb, rp, lmax_prep, &(handler->alpha));
960 :
961 : //
962 : // compute P_{lxp,lyp,lzp} given P_{lxa,lya,lza,lxb,lyb,lzb} and
963 : // alpha(ls,lxa,lxb,1) use a three step procedure we don't store zeros, so
964 : // counting is done using lxyz,lxy in order to have contiguous memory access
965 : // in collocate_fast.F
966 : //
967 :
968 : // coef[x][z][y]
969 0 : grid_compute_coefficients_dgemm(lmin_prep, lmax_prep, lp, prefactor,
970 : &(handler->alpha), &pab_prep,
971 : &(handler->coef));
972 :
973 0 : grid_collocate(handler, use_ortho, zetp, rp, radius);
974 :
975 0 : collocate_destroy_handle(handler);
976 0 : free(pab_prep.data);
977 0 : }
978 :
979 5100 : void extract_blocks(grid_context *const ctx, const _task *const task,
980 : const offload_buffer *pab_blocks, tensor *const work,
981 : tensor *const pab) {
982 5100 : const int iatom = task->iatom;
983 5100 : const int jatom = task->jatom;
984 5100 : const int iset = task->iset;
985 5100 : const int jset = task->jset;
986 5100 : const int ikind = ctx->atom_kinds[iatom];
987 5100 : const int jkind = ctx->atom_kinds[jatom];
988 5100 : const grid_basis_set *ibasis = ctx->basis_sets[ikind];
989 5100 : const grid_basis_set *jbasis = ctx->basis_sets[jkind];
990 :
991 5100 : const int block_num = task->block_num;
992 :
993 : // Locate current matrix block within the buffer. This block
994 : // contains the weights of the gaussian pairs in the spherical
995 : // harmonic basis, but we do computation in the cartesian
996 : // harmonic basis so we have to rotate the coefficients. It is nothing
997 : // else than a basis change and it done with two dgemm.
998 :
999 5100 : const int block_offset = ctx->block_offsets[block_num]; // zero based
1000 5100 : double *const block = &pab_blocks->host_buffer[block_offset];
1001 :
1002 5100 : rotate_to_cartesian_harmonics(ibasis, jbasis, iatom, jatom, iset, jset, block,
1003 : work, pab);
1004 5100 : }
1005 :
1006 45945 : void compute_coefficients(grid_context *const ctx,
1007 : struct collocation_integration_ *handler,
1008 : const _task *const previous_task,
1009 : const _task *const task,
1010 : const offload_buffer *pab_blocks, tensor *const pab,
1011 : tensor *const work, tensor *const pab_prep) {
1012 : // Load subblock from buffer and decontract into Cartesian sublock pab.
1013 : // The previous pab can be reused when only ipgf or jpgf has changed.
1014 45945 : if (task->update_block_ || (previous_task == NULL)) {
1015 4542 : extract_blocks(ctx, task, pab_blocks, work, pab);
1016 : }
1017 :
1018 45945 : int lmin_prep[2];
1019 45945 : int lmax_prep[2];
1020 :
1021 45945 : lmin_prep[0] = imax(task->lmin[0] + handler->lmin_diff[0], 0);
1022 45945 : lmin_prep[1] = imax(task->lmin[1] + handler->lmin_diff[1], 0);
1023 :
1024 45945 : lmax_prep[0] = task->lmax[0] + handler->lmax_diff[0];
1025 45945 : lmax_prep[1] = task->lmax[1] + handler->lmax_diff[1];
1026 :
1027 45945 : const int n1_prep = ncoset(lmax_prep[0]);
1028 45945 : const int n2_prep = ncoset(lmax_prep[1]);
1029 :
1030 : /* we do not reallocate memory. We initialized the structure with the
1031 : * maximum lmax of the all list already.
1032 : */
1033 45945 : initialize_tensor_2(pab_prep, n2_prep, n1_prep);
1034 45945 : realloc_tensor(pab_prep);
1035 :
1036 45945 : grid_prepare_pab_dgemm(handler->func, task->offset, task->lmin, task->lmax,
1037 45945 : &task->zeta[0], pab, pab_prep);
1038 :
1039 : // *** initialise the coefficient matrix, we transform the sum
1040 : //
1041 : // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
1042 : // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb
1043 : // (y-a_y)**lya (z-a_z)**lza
1044 : //
1045 : // into
1046 : //
1047 : // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp
1048 : // (z-p_z)**lzp
1049 : //
1050 : // where p is center of the product gaussian, and lp = la_max + lb_max
1051 : // (current implementation is l**7)
1052 : //
1053 :
1054 : /* precautionary tail since I probably intitialize data to NULL when I
1055 : * initialize a new tensor. I want to keep the memory (I put a ridiculous
1056 : * amount already) */
1057 :
1058 45945 : initialize_tensor_4(&handler->alpha, 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
1059 45945 : lmax_prep[0] + lmax_prep[1] + 1);
1060 45945 : realloc_tensor(&handler->alpha);
1061 :
1062 45945 : const int lp = lmax_prep[0] + lmax_prep[1];
1063 :
1064 45945 : initialize_tensor_3(&handler->coef, lp + 1, lp + 1, lp + 1);
1065 45945 : realloc_tensor(&handler->coef);
1066 :
1067 : // these two functions can be done with dgemm again....
1068 :
1069 45945 : grid_prepare_alpha_dgemm(task->ra, task->rb, task->rp, lmax_prep,
1070 : &handler->alpha);
1071 :
1072 : // compute the coefficients after applying the function of interest
1073 : // coef[x][z][y]
1074 91890 : grid_compute_coefficients_dgemm(
1075 : lmin_prep, lmax_prep, lp,
1076 69201 : task->prefactor * ((task->iatom == task->jatom) ? 1.0 : 2.0),
1077 : &handler->alpha, pab_prep, &handler->coef);
1078 45945 : }
1079 :
1080 612 : void collocate_one_grid_level_dgemm(grid_context *const ctx,
1081 : const int *const border_width,
1082 : const int *const shift_local,
1083 : const enum grid_func func, const int level,
1084 : const offload_buffer *pab_blocks) {
1085 612 : tensor *const grid = &ctx->grid[level];
1086 : // Using default(shared) because with GCC 9 the behavior around const changed:
1087 : // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
1088 612 : #pragma omp parallel default(shared)
1089 : {
1090 : const int num_threads = omp_get_num_threads();
1091 : const int thread_id = omp_get_thread_num();
1092 :
1093 : tensor work, pab, pab_prep;
1094 :
1095 : struct collocation_integration_ *handler = ctx->handler[thread_id];
1096 :
1097 : handler->func = func;
1098 : grid_prepare_get_ldiffs_dgemm(handler->func, handler->lmin_diff,
1099 : handler->lmax_diff);
1100 :
1101 : handler->apply_cutoff = ctx->apply_cutoff;
1102 :
1103 : // Allocate pab matrix for re-use across tasks.
1104 : initialize_tensor_2(&pab, ctx->maxco, ctx->maxco);
1105 : alloc_tensor(&pab);
1106 :
1107 : initialize_tensor_2(&work, ctx->maxco, ctx->maxco);
1108 : alloc_tensor(&work);
1109 :
1110 : initialize_tensor_2(&pab_prep, ctx->maxco, ctx->maxco);
1111 : alloc_tensor(&pab_prep);
1112 :
1113 : initialize_basis_vectors(handler, (const double(*)[3])grid->dh,
1114 : (const double(*)[3])grid->dh_inv);
1115 :
1116 : /* setup the grid parameters, window parameters (if the grid is split), etc
1117 : */
1118 :
1119 : tensor_copy(&handler->grid, grid);
1120 :
1121 : for (int d = 0; d < 3; d++)
1122 : handler->orthogonal[d] = handler->grid.orthogonal[d];
1123 :
1124 : if ((thread_id == 0) || (num_threads == 1)) {
1125 : // thread id 0 directly store the results in the final storage space
1126 : handler->grid.data = ctx->grid[level].data;
1127 : }
1128 :
1129 : if ((num_threads > 1) && (thread_id > 0)) {
1130 : handler->grid.data = ((double *)ctx->scratch) +
1131 : (thread_id - 1) * handler->grid.alloc_size_;
1132 : memset(handler->grid.data, 0, sizeof(double) * grid->alloc_size_);
1133 : }
1134 :
1135 : /* it is only useful when we split the list over multiple threads. The first
1136 : * iteration should load the block whatever status the task->block_update_
1137 : * has */
1138 : const _task *prev_task = NULL;
1139 : #pragma omp for schedule(static)
1140 : for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
1141 : // Define some convenient aliases.
1142 : const _task *task = &ctx->tasks[level][itask];
1143 :
1144 : if (task->level != level) {
1145 : printf("level %d, %d\n", task->level, level);
1146 : abort();
1147 : }
1148 : /* the grid is divided over several ranks or not periodic */
1149 : if ((handler->grid.size[0] != handler->grid.full_size[0]) ||
1150 : (handler->grid.size[1] != handler->grid.full_size[1]) ||
1151 : (handler->grid.size[2] != handler->grid.full_size[2])) {
1152 : /* unfortunately the window where the gaussian should be added depends
1153 : * on the bonds. So I have to adjust the window all the time. */
1154 :
1155 : setup_grid_window(&handler->grid, shift_local, border_width,
1156 : task->border_mask);
1157 : }
1158 :
1159 : /* this is a three steps procedure. pab_blocks contains the coefficients
1160 : * of the operator in the spherical harmonic basis while we do computation
1161 : * in the cartesian harmonic basis.
1162 : *
1163 : * step 1 : rotate the coefficients from the harmonic to the cartesian
1164 : * basis
1165 :
1166 : * step 2 : extract the subblock and apply additional transformations
1167 : * corresponding the spatial derivatives of the operator (the last is not
1168 : * always done)
1169 :
1170 : * step 3 : change from (x - x_1)^\alpha (x - x_2)^\beta to (x -
1171 : * x_{12})^k. It is a transformation which involves binomial
1172 : * coefficients.
1173 : *
1174 : * \f[ (x - x_1) ^\alpha (x - x_2) ^ beta = \sum_{k_{1} k_{2}} ^
1175 : * {\alpha\beta} \text{Binomial}(\alpha,k_1)
1176 : * \text{Binomial}(\beta,k_2) (x - x_{12})^{k_1 + k_2} (x_12 - x_1)
1177 : * ^{\alpha - k_1} (x_12 - x_2) ^{\beta - k_2} ]
1178 : *
1179 : * step 1 is done only when necessary, the two remaining steps are done
1180 : * for each bond.
1181 : */
1182 :
1183 : compute_coefficients(ctx, handler, prev_task, task, pab_blocks, &pab,
1184 : &work, &pab_prep);
1185 :
1186 : grid_collocate(handler, ctx->orthorhombic, task->zetp, task->rp,
1187 : task->radius);
1188 : prev_task = task;
1189 : }
1190 :
1191 : // Merge thread local grids into shared grid. Could be improved though....
1192 :
1193 : // thread 0 does nothing since the data are already placed in the final
1194 : // destination
1195 : if (num_threads > 1) {
1196 : if ((grid->alloc_size_ / (num_threads - 1)) >= 2) {
1197 : const int block_size = grid->alloc_size_ / (num_threads - 1) +
1198 : (grid->alloc_size_ % (num_threads - 1));
1199 :
1200 : for (int bk = 0; bk < num_threads; bk++) {
1201 : if (thread_id > 0) {
1202 : int bk_id = (bk + thread_id - 1) % num_threads;
1203 : int begin = bk_id * block_size;
1204 : int end = imin((bk_id + 1) * block_size, grid->alloc_size_);
1205 : cblas_daxpy(end - begin, 1.0, handler->grid.data + begin, 1,
1206 : grid->data + begin, 1);
1207 : }
1208 : #pragma omp barrier
1209 : }
1210 : }
1211 : } else {
1212 : #pragma omp critical
1213 : if (thread_id > 0)
1214 : cblas_daxpy(handler->grid.alloc_size_, 1.0,
1215 : &idx3(handler->grid, 0, 0, 0), 1, &idx3(grid[0], 0, 0, 0),
1216 : 1);
1217 : }
1218 : handler->grid.data = NULL;
1219 : free(pab.data);
1220 : free(pab_prep.data);
1221 : free(work.data);
1222 : }
1223 612 : }
1224 :
1225 : /*******************************************************************************
1226 : * \brief Collocate all tasks of a given list onto given grids.
1227 : * See grid_task_list.h for details.
1228 : ******************************************************************************/
1229 153 : void grid_dgemm_collocate_task_list(grid_dgemm_task_list *const ptr,
1230 : const enum grid_func func,
1231 : const int nlevels,
1232 : const offload_buffer *pab_blocks,
1233 : offload_buffer *grids[nlevels]) {
1234 :
1235 153 : grid_context *const ctx = (grid_context *)ptr;
1236 :
1237 153 : assert(ctx->checksum == ctx_checksum);
1238 :
1239 153 : const int max_threads = omp_get_max_threads();
1240 :
1241 153 : assert(ctx->nlevels == nlevels);
1242 :
1243 : // #pragma omp parallel for
1244 765 : for (int level = 0; level < ctx->nlevels; level++) {
1245 612 : const _layout *layout = &ctx->layouts[level];
1246 612 : set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
1247 612 : layout->npts_global, layout->npts_local,
1248 612 : layout->shift_local, layout->border_width, layout->dh,
1249 612 : layout->dh_inv, grids[level]);
1250 612 : memset(ctx->grid[level].data, 0,
1251 612 : sizeof(double) * ctx->grid[level].alloc_size_);
1252 : }
1253 :
1254 153 : if (ctx->scratch == NULL) {
1255 153 : int max_size = ctx->grid[0].alloc_size_;
1256 :
1257 : /* compute the size of the largest grid. It is used afterwards to allocate
1258 : * scratch memory for the grid on each omp thread */
1259 612 : for (int x = 1; x < nlevels; x++) {
1260 459 : max_size = imax(ctx->grid[x].alloc_size_, max_size);
1261 : }
1262 :
1263 153 : max_size = ((max_size / 4096) + (max_size % 4096 != 0)) * 4096;
1264 :
1265 : /* scratch is a void pointer !!!!! */
1266 153 : ctx->scratch =
1267 153 : grid_allocate_scratch(max_size * max_threads * sizeof(double));
1268 : }
1269 :
1270 765 : for (int level = 0; level < ctx->nlevels; level++) {
1271 612 : const _layout *layout = &ctx->layouts[level];
1272 612 : collocate_one_grid_level_dgemm(ctx, layout->border_width,
1273 612 : layout->shift_local, func, level,
1274 : pab_blocks);
1275 : }
1276 :
1277 153 : grid_free_scratch(ctx->scratch);
1278 153 : ctx->scratch = NULL;
1279 153 : }
|