Line data Source code
1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 :
8 : #include <assert.h>
9 : #include <limits.h>
10 : #include <math.h>
11 : #include <stdbool.h>
12 : #include <stdio.h>
13 : #include <stdlib.h>
14 : #include <string.h>
15 : #include <unistd.h>
16 :
17 : #include <omp.h>
18 :
19 : #include "../common/grid_common.h"
20 : #include "grid_dgemm_coefficients.h"
21 : #include "grid_dgemm_collocate.h"
22 : #include "grid_dgemm_collocation_integration.h"
23 : #include "grid_dgemm_non_orthorombic_corrections.h"
24 : #include "grid_dgemm_private_header.h"
25 : #include "grid_dgemm_tensor_local.h"
26 : #include "grid_dgemm_utils.h"
27 :
28 0 : void extract_cube_within_spherical_cutoff_ortho(
29 : struct collocation_integration_ *const handler, const double disr_radius,
30 : const int cmax, const int *const lb_cube, const int *const cube_center) {
31 : // a mapping so that the ig corresponds to the right grid point
32 0 : int **map = handler->map;
33 0 : map[1] = map[0] + 2 * cmax + 1;
34 0 : map[2] = map[1] + 2 * cmax + 1;
35 : // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
36 :
37 0 : for (int i = 0; i < 3; i++) {
38 0 : for (int ig = 0; ig < handler->cube.size[i]; ig++) {
39 0 : map[i][ig] = modulo(cube_center[i] + lb_cube[i] + ig -
40 0 : handler->grid.lower_corner[i],
41 : handler->grid.full_size[i]);
42 : }
43 : }
44 :
45 0 : const Interval zwindow = {.xmin = handler->grid.window_shift[0],
46 0 : .xmax = handler->grid.window_size[0]};
47 0 : const Interval ywindow = {.xmin = handler->grid.window_shift[1],
48 0 : .xmax = handler->grid.window_size[1]};
49 0 : const Interval xwindow = {.xmin = handler->grid.window_shift[2],
50 0 : .xmax = handler->grid.window_size[2]};
51 :
52 0 : for (int kg = 0; kg < handler->cube.size[0]; kg++) {
53 0 : const int k = map[0][kg];
54 0 : const int kd =
55 0 : (2 * (kg + lb_cube[0]) - 1) / 2; // distance from center in grid points
56 0 : const double kr = kd * handler->dh[2][2]; // distance from center in a.u.
57 0 : const double kremain = disr_radius * disr_radius - kr * kr;
58 :
59 0 : if ((kremain >= 0.0) && is_point_in_interval(k, zwindow)) {
60 :
61 0 : const int jgmin = ceil(-1e-8 - sqrt(kremain) * handler->dh_inv[1][1]);
62 0 : for (int jg = jgmin; jg <= (1 - jgmin); jg++) {
63 0 : const int j = map[1][jg - lb_cube[1]];
64 0 : const double jr = ((2 * jg - 1) >> 1) *
65 0 : handler->dh[1][1]; // distance from center in a.u.
66 0 : const double jremain = kremain - jr * jr;
67 :
68 0 : if ((jremain >= 0.0) && is_point_in_interval(j, ywindow)) {
69 0 : const int xmin = ceil(-1e-8 - sqrt(jremain) * handler->dh_inv[0][0]);
70 0 : const int xmax = 1 - xmin;
71 :
72 : // printf("xmin %d, xmax %d\n", xmin, xmax);
73 0 : for (int x = xmin - lb_cube[2];
74 0 : x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
75 0 : const int x1 = map[2][x];
76 :
77 0 : if (!is_point_in_interval(x1, xwindow))
78 0 : continue;
79 :
80 0 : int lower_corner[3] = {k, j, x1};
81 0 : int upper_corner[3] = {k + 1, j + 1, x1 + 1};
82 :
83 0 : compute_interval(map[2], handler->grid.full_size[2],
84 0 : handler->grid.size[2], handler->cube.size[2], x1,
85 : &x, lower_corner + 2, upper_corner + 2, xwindow);
86 :
87 0 : if (upper_corner[2] - lower_corner[2]) {
88 0 : const int position1[3] = {kg, jg - lb_cube[1], x};
89 :
90 : /* the function will internally take care of the local vs global
91 : * grid */
92 :
93 0 : double *restrict dst = &idx3(handler->cube, position1[0],
94 : position1[1], position1[2]);
95 :
96 0 : double *restrict src = &idx3(handler->grid, lower_corner[0],
97 : lower_corner[1], lower_corner[2]);
98 :
99 0 : const int sizex = upper_corner[2] - lower_corner[2];
100 0 : GRID_PRAGMA_SIMD((dst, src), 8)
101 0 : for (int x = 0; x < sizex; x++) {
102 0 : dst[x] = src[x];
103 : }
104 : }
105 :
106 0 : if (handler->grid.size[2] == handler->grid.full_size[2])
107 0 : update_loop_index(handler->grid.full_size[2], x1, &x);
108 : else
109 0 : x += upper_corner[2] - lower_corner[2] - 1;
110 : }
111 : }
112 : }
113 : }
114 : }
115 0 : }
116 :
117 0 : void extract_cube_within_spherical_cutoff_generic(
118 : struct collocation_integration_ *const handler, const double disr_radius,
119 : const int cmax, const int *const lb_cube, const int *const ub_cube,
120 : const double *const roffset, const int *const cube_center) {
121 :
122 0 : const double a = handler->dh[0][0] * handler->dh[0][0] +
123 0 : handler->dh[0][1] * handler->dh[0][1] +
124 0 : handler->dh[0][2] * handler->dh[0][2];
125 0 : const double a_inv = 1.0 / a;
126 : // a mapping so that the ig corresponds to the right grid point
127 0 : int **map = handler->map;
128 0 : map[1] = map[0] + 2 * cmax + 1;
129 0 : map[2] = map[1] + 2 * cmax + 1;
130 : // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
131 :
132 0 : for (int i = 0; i < 3; i++) {
133 0 : for (int ig = 0; ig < handler->cube.size[i]; ig++) {
134 0 : map[i][ig] = modulo(cube_center[i] + ig + lb_cube[i] -
135 0 : handler->grid.lower_corner[i],
136 : handler->grid.full_size[i]);
137 : }
138 : }
139 :
140 0 : const Interval zwindow = {.xmin = handler->grid.window_shift[0],
141 0 : .xmax = handler->grid.window_size[0]};
142 0 : const Interval ywindow = {.xmin = handler->grid.window_shift[1],
143 0 : .xmax = handler->grid.window_size[1]};
144 0 : const Interval xwindow = {.xmin = handler->grid.window_shift[2],
145 0 : .xmax = handler->grid.window_size[2]};
146 :
147 0 : for (int k = 0; k < handler->cube.size[0]; k++) {
148 0 : const int iz = map[0][k];
149 :
150 0 : if (!is_point_in_interval(iz, zwindow))
151 0 : continue;
152 :
153 0 : const double tz = (k + lb_cube[0] - roffset[0]);
154 0 : const double z[3] = {tz * handler->dh[2][0], tz * handler->dh[2][1],
155 0 : tz * handler->dh[2][2]};
156 :
157 0 : for (int j = 0; j < handler->cube.size[1]; j++) {
158 0 : const int iy = map[1][j];
159 :
160 0 : if (!is_point_in_interval(iy, ywindow))
161 0 : continue;
162 :
163 0 : const double ty = (j - roffset[1] + lb_cube[1]);
164 0 : const double y[3] = {z[0] + ty * handler->dh[1][0],
165 0 : z[1] + ty * handler->dh[1][1],
166 0 : z[2] + ty * handler->dh[1][2]};
167 :
168 : /* Sqrt[(-2 x1 \[Alpha] - 2 y1 \[Beta] - 2 z1 \[Gamma])^2 - */
169 : /* 4 (x1^2 + y1^2 + z1^2)
170 : * (\[Alpha]^2 + \[Beta]^2 + \[Gamma]^2)] */
171 :
172 0 : const double b =
173 0 : -2.0 * (handler->dh[0][0] * (roffset[2] * handler->dh[0][0] - y[0]) +
174 0 : handler->dh[0][1] * (roffset[2] * handler->dh[0][1] - y[1]) +
175 0 : handler->dh[0][2] * (roffset[2] * handler->dh[0][2] - y[2]));
176 :
177 0 : const double c = (roffset[2] * handler->dh[0][0] - y[0]) *
178 0 : (roffset[2] * handler->dh[0][0] - y[0]) +
179 0 : (roffset[2] * handler->dh[0][1] - y[1]) *
180 0 : (roffset[2] * handler->dh[0][1] - y[1]) +
181 0 : (roffset[2] * handler->dh[0][2] - y[2]) *
182 : (roffset[2] * handler->dh[0][2] - y[2]) -
183 0 : disr_radius * disr_radius;
184 :
185 0 : double delta = b * b - 4.0 * a * c;
186 :
187 0 : if (delta < 0.0)
188 0 : continue;
189 :
190 0 : delta = sqrt(delta);
191 :
192 0 : const int xmin = imax(ceil((-b - delta) * 0.5 * a_inv), lb_cube[2]);
193 0 : const int xmax = imin(floor((-b + delta) * 0.5 * a_inv), ub_cube[2]);
194 :
195 0 : int lower_corner[3] = {iz, iy, xmin};
196 0 : int upper_corner[3] = {iz + 1, iy + 1, xmin};
197 :
198 0 : for (int x = xmin - lb_cube[2];
199 0 : x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
200 0 : const int x1 = map[2][x];
201 :
202 0 : if (!is_point_in_interval(x1, xwindow))
203 0 : continue;
204 :
205 0 : compute_interval(map[2], handler->grid.full_size[2],
206 0 : handler->grid.size[2], handler->cube.size[2], x1, &x,
207 : lower_corner + 2, upper_corner + 2, xwindow);
208 :
209 0 : if (upper_corner[2] - lower_corner[2]) {
210 0 : const int position1[3] = {k, j, x};
211 :
212 : /* the function will internally take care of the local vs global
213 : * grid */
214 :
215 0 : double *__restrict__ src = &idx3(handler->grid, lower_corner[0],
216 : lower_corner[1], lower_corner[2]);
217 0 : double *__restrict__ dst =
218 0 : &idx3(handler->cube, position1[0], position1[1], position1[2]);
219 :
220 0 : const int sizex = upper_corner[2] - lower_corner[2];
221 :
222 : // #pragma omp simd linear(dst, src) simdlen(8)
223 0 : GRID_PRAGMA_SIMD((dst, src), 8)
224 0 : for (int x = 0; x < sizex; x++) {
225 0 : dst[x] = src[x];
226 : }
227 :
228 0 : if (handler->grid.size[2] == handler->grid.full_size[2])
229 0 : update_loop_index(handler->grid.full_size[2], x1, &x);
230 : else
231 0 : x += upper_corner[2] - lower_corner[2] - 1;
232 : }
233 : }
234 : }
235 : }
236 0 : }
237 :
238 5078 : static void rotate_and_store_coefficients(grid_context *const ctx,
239 : const _task *prev_task,
240 : const _task *task, tensor *const hab,
241 : tensor *work, // some scratch matrix
242 : double *blocks) {
243 5078 : if (prev_task != NULL) {
244 : /* prev_task is NULL when we deal with the first iteration. In that case
245 : * we just need to initialize the hab matrix to the proper size. Note
246 : * that resizing does not really occurs since memory allocation is done
247 : * for the maximum size the matrix can have. */
248 4482 : const int iatom = prev_task->iatom;
249 4482 : const int jatom = prev_task->jatom;
250 4482 : const int iset = prev_task->iset;
251 4482 : const int jset = prev_task->jset;
252 4482 : const int ikind = ctx->atom_kinds[iatom];
253 4482 : const int jkind = ctx->atom_kinds[jatom];
254 4482 : const grid_basis_set *ibasis = ctx->basis_sets[ikind];
255 4482 : const grid_basis_set *jbasis = ctx->basis_sets[jkind];
256 :
257 4482 : const int block_num = prev_task->block_num;
258 4482 : double *const block = &blocks[ctx->block_offsets[block_num]];
259 :
260 4482 : const int ncoseta = ncoset(ibasis->lmax[iset]);
261 4482 : const int ncosetb = ncoset(jbasis->lmax[jset]);
262 :
263 4482 : const int ncoa = ibasis->npgf[iset] * ncoseta;
264 4482 : const int ncob = jbasis->npgf[jset] * ncosetb;
265 :
266 4482 : const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set */
267 4482 : const int nsgf_setb = jbasis->nsgf_set[jset];
268 4482 : const int nsgfa = ibasis->nsgf; // size of entire spherical basis
269 4482 : const int nsgfb = jbasis->nsgf;
270 4482 : const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
271 4482 : const int sgfb = jbasis->first_sgf[jset] - 1;
272 4482 : const int maxcoa = ibasis->maxco;
273 4482 : const int maxcob = jbasis->maxco;
274 :
275 4482 : initialize_tensor_2(work, nsgf_setb, ncoa);
276 4482 : realloc_tensor(work);
277 :
278 : // Warning these matrices are row major....
279 :
280 4482 : dgemm_params m1, m2;
281 4482 : memset(&m1, 0, sizeof(dgemm_params));
282 4482 : memset(&m2, 0, sizeof(dgemm_params));
283 :
284 4482 : m1.op1 = 'N';
285 4482 : m1.op2 = 'N';
286 4482 : m1.m = nsgf_setb;
287 4482 : m1.n = ncoa;
288 4482 : m1.k = ncob;
289 4482 : m1.alpha = 1.0;
290 4482 : m1.beta = 0.0;
291 4482 : m1.a = &jbasis->sphi[sgfb * maxcob];
292 4482 : m1.lda = maxcob;
293 4482 : m1.b = hab->data;
294 4482 : m1.ldb = ncoa;
295 4482 : m1.c = work->data;
296 4482 : m1.ldc = work->ld_;
297 :
298 : // phi[b][ncob]
299 : // work[b][ncoa] = phi[b][ncob] * hab[ncob][ncoa]
300 :
301 4482 : if (iatom <= jatom) {
302 : // I need to have the final result in the form
303 :
304 : // block[b][a] = work[b][ncoa] transpose(phi[a][ncoa])
305 2982 : m2.op1 = 'N';
306 2982 : m2.op2 = 'T';
307 2982 : m2.m = nsgf_setb;
308 2982 : m2.n = nsgf_seta;
309 2982 : m2.k = ncoa;
310 2982 : m2.a = work->data;
311 2982 : m2.lda = work->ld_;
312 2982 : m2.b = &ibasis->sphi[sgfa * maxcoa];
313 2982 : m2.ldb = maxcoa;
314 2982 : m2.c = block + sgfb * nsgfa + sgfa;
315 2982 : m2.ldc = nsgfa;
316 2982 : m2.alpha = 1.0;
317 2982 : m2.beta = 1.0;
318 : } else {
319 : // block[a][b] = phi[a][ncoa] Transpose(work[b][ncoa])
320 1500 : m2.op1 = 'N';
321 1500 : m2.op2 = 'T';
322 1500 : m2.m = nsgf_seta;
323 1500 : m2.n = nsgf_setb;
324 1500 : m2.k = ncoa;
325 1500 : m2.a = &ibasis->sphi[sgfa * maxcoa];
326 1500 : m2.lda = maxcoa;
327 1500 : m2.b = work->data;
328 1500 : m2.ldb = work->ld_;
329 1500 : m2.c = block + sgfa * nsgfb + sgfb;
330 1500 : m2.ldc = nsgfb;
331 1500 : m2.alpha = 1.0;
332 1500 : m2.beta = 1.0;
333 : }
334 :
335 : /* these dgemm are *row* major */
336 4482 : dgemm_simplified(&m1);
337 4482 : dgemm_simplified(&m2);
338 : }
339 :
340 5078 : if (task != NULL) {
341 4482 : const int iatom = task->iatom;
342 4482 : const int jatom = task->jatom;
343 4482 : const int ikind = ctx->atom_kinds[iatom];
344 4482 : const int jkind = ctx->atom_kinds[jatom];
345 4482 : const int iset = task->iset;
346 4482 : const int jset = task->jset;
347 4482 : const grid_basis_set *ibasis = ctx->basis_sets[ikind];
348 4482 : const grid_basis_set *jbasis = ctx->basis_sets[jkind];
349 4482 : const int ncoseta = ncoset(ibasis->lmax[iset]);
350 4482 : const int ncosetb = ncoset(jbasis->lmax[jset]);
351 :
352 4482 : const int ncoa = ibasis->npgf[iset] * ncoseta;
353 4482 : const int ncob = jbasis->npgf[jset] * ncosetb;
354 :
355 4482 : initialize_tensor_2(hab, ncob, ncoa);
356 4482 : realloc_tensor(hab);
357 : }
358 5078 : }
359 :
360 49036 : void update_force_pair(orbital a, orbital b, const double pab,
361 : const double ftz[2], const double *const rab,
362 : const tensor *const vab, tensor *force) {
363 49036 : const double axpm0 = idx2(vab[0], idx(b), idx(a));
364 196144 : for (int i = 0; i < 3; i++) {
365 147108 : const double aip1 = idx2(vab[0], idx(b), idx(up(i, a)));
366 147108 : const double aim1 = idx2(vab[0], idx(b), idx(down(i, a)));
367 147108 : const double bim1 = idx2(vab[0], idx(down(i, b)), idx(a));
368 147108 : idx2(force[0], 0, i) += pab * (ftz[0] * aip1 - a.l[i] * aim1);
369 147108 : idx2(force[0], 1, i) +=
370 147108 : pab * (ftz[1] * (aip1 - rab[i] * axpm0) - b.l[i] * bim1);
371 : }
372 49036 : }
373 :
374 34952 : void update_virial_pair(orbital a, orbital b, const double pab,
375 : const double ftz[2], const double *const rab,
376 : const tensor *const vab, tensor *virial) {
377 139808 : for (int i = 0; i < 3; i++) {
378 419424 : for (int j = 0; j < 3; j++) {
379 314568 : idx3(virial[0], 0, i, j) +=
380 314568 : pab * ftz[0] * idx2(vab[0], idx(b), idx(up(i, up(j, a)))) -
381 314568 : pab * a.l[j] * idx2(vab[0], idx(b), idx(up(i, down(j, a))));
382 : }
383 : }
384 :
385 139808 : for (int i = 0; i < 3; i++) {
386 419424 : for (int j = 0; j < 3; j++) {
387 314568 : idx3(virial[0], 1, i, j) +=
388 314568 : pab * ftz[1] *
389 314568 : (idx2(vab[0], idx(b), idx(up(i, up(j, a)))) -
390 314568 : idx2(vab[0], idx(b), idx(up(i, a))) * rab[j] -
391 314568 : idx2(vab[0], idx(b), idx(up(j, a))) * rab[i] +
392 314568 : idx2(vab[0], idx(b), idx(a)) * rab[j] * rab[i]) -
393 314568 : pab * b.l[j] * idx2(vab[0], idx(up(i, down(j, b))), idx(a));
394 : }
395 : }
396 34952 : }
397 :
398 352870 : void update_all(const bool compute_forces, const bool compute_virial,
399 : const orbital a, const orbital b, const double f,
400 : const double *const ftz, const double *rab, const tensor *vab,
401 : const double pab, double *hab, tensor *forces,
402 : tensor *virials) {
403 :
404 352870 : *hab += f * idx2(vab[0], idx(b), idx(a));
405 :
406 352870 : if (compute_forces) {
407 49036 : update_force_pair(a, b, f * pab, ftz, rab, vab, forces);
408 : }
409 :
410 352870 : if (compute_virial) {
411 34952 : update_virial_pair(a, b, f * pab, ftz, rab, vab, virials);
412 : }
413 352870 : }
414 :
415 0 : static void update_tau(const bool compute_forces, const bool compute_virial,
416 : const orbital a, const orbital b, const double ftz[2],
417 : const double *const rab, const tensor *const vab,
418 : const double pab, double *const hab, tensor *forces,
419 : tensor *virials) {
420 :
421 0 : for (int i = 0; i < 3; i++) {
422 0 : update_all(compute_forces, compute_virial, down(i, a), down(i, b),
423 0 : 0.5 * a.l[i] * b.l[i], ftz, rab, vab, pab, hab, forces, virials);
424 0 : update_all(compute_forces, compute_virial, up(i, a), down(i, b),
425 0 : -0.5 * ftz[0] * b.l[i], ftz, rab, vab, pab, hab, forces,
426 : virials);
427 0 : update_all(compute_forces, compute_virial, down(i, a), up(i, b),
428 0 : -0.5 * a.l[i] * ftz[1], ftz, rab, vab, pab, hab, forces,
429 : virials);
430 0 : update_all(compute_forces, compute_virial, up(i, a), up(i, b),
431 0 : 0.5 * ftz[0] * ftz[1], ftz, rab, vab, pab, hab, forces, virials);
432 : }
433 0 : }
434 :
435 : static void
436 44389 : update_hab_forces_and_stress(const _task *task, const tensor *const vab,
437 : const tensor *const pab, const bool compute_tau,
438 : const bool compute_forces,
439 : const bool compute_virial, tensor *const forces,
440 : tensor *const virial, tensor *const hab) {
441 44389 : double zeta[2] = {task->zeta[0] * 2.0, task->zeta[1] * 2.0};
442 104127 : for (int lb = task->lmin[1]; lb <= task->lmax[1]; lb++) {
443 142989 : for (int la = task->lmin[0]; la <= task->lmax[0]; la++) {
444 207817 : for (int bx = 0; bx <= lb; bx++) {
445 295454 : for (int by = 0; by <= lb - bx; by++) {
446 170888 : const int bz = lb - bx - by;
447 170888 : const orbital b = {{bx, by, bz}};
448 170888 : const int idx_b = task->offset[1] + idx(b);
449 427643 : for (int ax = 0; ax <= la; ax++) {
450 609625 : for (int ay = 0; ay <= la - ax; ay++) {
451 352870 : const int az = la - ax - ay;
452 352870 : const orbital a = {{ax, ay, az}};
453 352870 : const int idx_a = task->offset[0] + idx(a);
454 352870 : double *habval = &idx2(hab[0], idx_b, idx_a);
455 352870 : const double prefactor = idx2(pab[0], idx_b, idx_a);
456 :
457 : // now compute the forces
458 352870 : if (compute_tau) {
459 0 : update_tau(compute_forces, compute_virial, a, b, zeta,
460 0 : task->rab, vab, prefactor, habval, forces, virial);
461 : } else {
462 352870 : update_all(compute_forces, compute_virial, a, b, 1.0, zeta,
463 352870 : task->rab, vab, prefactor, habval, forces, virial);
464 : }
465 : }
466 : }
467 : }
468 : }
469 : }
470 : }
471 44389 : }
472 :
473 : /* It is a sub-optimal version of the mapping in case of a cubic cutoff. But is
474 : * very general and does not depend on the orthorombic nature of the grid. for
475 : * orthorombic cases, it is faster to apply PCB directly on the polynomials. */
476 :
477 44389 : void extract_cube(struct collocation_integration_ *handler, const int cmax,
478 : const int *lower_boundaries_cube, const int *cube_center) {
479 :
480 : // a mapping so that the ig corresponds to the right grid point
481 44389 : int **map = handler->map;
482 44389 : map[1] = map[0] + 2 * cmax + 1;
483 44389 : map[2] = map[1] + 2 * cmax + 1;
484 : // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
485 177556 : for (int i = 0; i < 3; i++) {
486 3366244 : for (int ig = 0; ig < handler->cube.size[i]; ig++) {
487 3233077 : map[i][ig] = modulo(cube_center[i] + ig + lower_boundaries_cube[i] -
488 3233077 : handler->grid.lower_corner[i],
489 : handler->grid.full_size[i]);
490 : }
491 : }
492 :
493 44389 : int lower_corner[3];
494 44389 : int upper_corner[3];
495 :
496 44389 : const Interval zwindow = {.xmin = handler->grid.window_shift[0],
497 44389 : .xmax = handler->grid.window_size[0]};
498 44389 : const Interval ywindow = {.xmin = handler->grid.window_shift[1],
499 44389 : .xmax = handler->grid.window_size[1]};
500 44389 : const Interval xwindow = {.xmin = handler->grid.window_shift[2],
501 44389 : .xmax = handler->grid.window_size[2]};
502 :
503 130515 : for (int z = 0; (z < handler->cube.size[0]); z++) {
504 86126 : const int z1 = map[0][z];
505 :
506 86126 : if (!is_point_in_interval(z1, zwindow))
507 0 : continue;
508 :
509 86126 : compute_interval(map[0], handler->grid.full_size[0], handler->grid.size[0],
510 : handler->cube.size[0], z1, &z, lower_corner, upper_corner,
511 : zwindow);
512 :
513 : /* // We have a full plane. */
514 86126 : if (upper_corner[0] - lower_corner[0]) {
515 258230 : for (int y = 0; y < handler->cube.size[1]; y++) {
516 172104 : const int y1 = map[1][y];
517 :
518 : // this check is completely irrelevant when running without MPI.
519 172104 : if (!is_point_in_interval(y1, ywindow))
520 0 : continue;
521 :
522 172104 : compute_interval(map[1], handler->grid.full_size[1],
523 172104 : handler->grid.size[1], handler->cube.size[1], y1, &y,
524 : lower_corner + 1, upper_corner + 1, ywindow);
525 :
526 172104 : if (upper_corner[1] - lower_corner[1]) {
527 552924 : for (int x = 0; x < handler->cube.size[2]; x++) {
528 380820 : const int x1 = map[2][x];
529 :
530 380820 : if (!is_point_in_interval(x1, xwindow))
531 0 : continue;
532 :
533 380820 : compute_interval(map[2], handler->grid.full_size[2],
534 380820 : handler->grid.size[2], handler->cube.size[2], x1,
535 : &x, lower_corner + 2, upper_corner + 2, xwindow);
536 :
537 380820 : if (upper_corner[2] - lower_corner[2]) { // should be non zero
538 380820 : const int position1[3] = {z, y, x};
539 :
540 : /* the function will internally take care of the local vx global
541 : * grid */
542 380820 : extract_sub_grid(
543 : lower_corner, // lower corner of the portion of cube (in the
544 : // full grid)
545 : upper_corner, // upper corner of the portion of cube (in the
546 : // full grid)
547 : position1, // starting position subblock inside the cube
548 380820 : &handler->grid,
549 380820 : &handler->cube); // the grid to add data from
550 :
551 380820 : if (handler->grid.size[2] == handler->grid.full_size[2])
552 380820 : update_loop_index(handler->grid.full_size[2], x1, &x);
553 : else
554 0 : x += upper_corner[2] - lower_corner[2] - 1;
555 : }
556 : }
557 172104 : if (handler->grid.size[1] == handler->grid.full_size[1])
558 172104 : update_loop_index(handler->grid.full_size[1], y1, &y);
559 : else
560 0 : y += upper_corner[1] - lower_corner[1] - 1;
561 : }
562 : }
563 86126 : if (handler->grid.size[0] == handler->grid.full_size[0])
564 86126 : update_loop_index(handler->grid.full_size[0], z1, &z);
565 : else
566 0 : z += upper_corner[0] - lower_corner[0] - 1;
567 : }
568 : }
569 44389 : }
570 :
571 44389 : void grid_integrate(collocation_integration *const handler,
572 : const bool use_ortho, const double zetp, const double rp[3],
573 : const double radius) {
574 44389 : if (handler == NULL)
575 0 : abort();
576 :
577 44389 : const int lp = handler->coef.size[0] - 1;
578 :
579 44389 : int cubecenter[3];
580 44389 : int cube_size[3];
581 44389 : int lb_cube[3], ub_cube[3];
582 44389 : double roffset[3];
583 44389 : double disr_radius;
584 :
585 : /* cube : grid comtaining pointlike product between polynomials
586 : *
587 : * pol : grid containing the polynomials in all three directions
588 : */
589 :
590 : /* seting up the cube parameters */
591 88778 : int cmax = compute_cube_properties(
592 44389 : use_ortho, radius, (const double(*)[3])handler->dh,
593 44389 : (const double(*)[3])handler->dh_inv, rp, &disr_radius, roffset,
594 : cubecenter, lb_cube, ub_cube, cube_size);
595 :
596 : /* initialize the multidimensional array containing the polynomials */
597 44389 : if (lp != 0) {
598 35109 : initialize_tensor_3(&handler->pol, 3, cmax, handler->coef.size[0]);
599 : } else {
600 9280 : initialize_tensor_3(&handler->pol, 3, handler->coef.size[0], cmax);
601 : }
602 44389 : handler->pol_alloc_size = realloc_tensor(&handler->pol);
603 :
604 : /* allocate memory for the polynomial and the cube */
605 :
606 44389 : initialize_tensor_3(&handler->cube, cube_size[0], cube_size[1], cube_size[2]);
607 :
608 44389 : handler->cube_alloc_size = realloc_tensor(&handler->cube);
609 :
610 44389 : initialize_W_and_T(handler, &handler->coef, &handler->cube);
611 :
612 : /* compute the polynomials */
613 :
614 : // WARNING : do not reverse the order in pol otherwise you will have to
615 : // reverse the order in collocate_dgemm as well.
616 :
617 : /* the tensor contraction is done for a given order so I have to be careful
618 : * how the tensors X, Y, Z are stored. In collocate, they are stored
619 : * normally 0 (Z), (1) Y, (2) X in the table pol. but in integrate (which
620 : * uses the same tensor reduction), I have a special treatment for l = 0. In
621 : * that case the order *should* be the same than for collocate. For l > 0,
622 : * we need a different storage order which is (X) 2, (Y) 0, and (Z) 1.
623 : *
624 : * the reason for this is that the cube is stored as cube[z][y][x] so the
625 : * first direction taken for the dgemm is along x.
626 : */
627 :
628 44389 : int perm[3] = {2, 0, 1};
629 :
630 44389 : if (lp == 0) {
631 : /* I need to restore the same order than for collocate */
632 9280 : perm[0] = 0;
633 9280 : perm[1] = 1;
634 9280 : perm[2] = 2;
635 : }
636 :
637 44389 : bool use_ortho_forced = handler->orthogonal[0] && handler->orthogonal[1] &&
638 24226 : handler->orthogonal[2];
639 44389 : if (use_ortho) {
640 21114 : grid_fill_pol_dgemm((lp != 0), handler->dh[0][0], roffset[2], 0, lb_cube[2],
641 21114 : ub_cube[2], lp, cmax, zetp,
642 21114 : &idx3(handler->pol, perm[2], 0, 0)); /* i indice */
643 21114 : grid_fill_pol_dgemm((lp != 0), handler->dh[1][1], roffset[1], 0, lb_cube[1],
644 21114 : ub_cube[1], lp, cmax, zetp,
645 21114 : &idx3(handler->pol, perm[1], 0, 0)); /* j indice */
646 21114 : grid_fill_pol_dgemm((lp != 0), handler->dh[2][2], roffset[0], 0, lb_cube[0],
647 21114 : ub_cube[0], lp, cmax, zetp,
648 21114 : &idx3(handler->pol, perm[0], 0, 0)); /* k indice */
649 : } else {
650 23275 : double dx[3];
651 :
652 23275 : dx[2] = handler->dh[0][0] * handler->dh[0][0] +
653 23275 : handler->dh[0][1] * handler->dh[0][1] +
654 23275 : handler->dh[0][2] * handler->dh[0][2];
655 :
656 23275 : dx[1] = handler->dh[1][0] * handler->dh[1][0] +
657 23275 : handler->dh[1][1] * handler->dh[1][1] +
658 23275 : handler->dh[1][2] * handler->dh[1][2];
659 :
660 23275 : dx[0] = handler->dh[2][0] * handler->dh[2][0] +
661 23275 : handler->dh[2][1] * handler->dh[2][1] +
662 23275 : handler->dh[2][2] * handler->dh[2][2];
663 :
664 23275 : grid_fill_pol_dgemm((lp != 0), 1.0, roffset[2], 0, lb_cube[2], ub_cube[2],
665 23275 : lp, cmax, zetp * dx[2],
666 23275 : &idx3(handler->pol, perm[2], 0, 0)); /* i indice */
667 23275 : grid_fill_pol_dgemm((lp != 0), 1.0, roffset[1], 0, lb_cube[1], ub_cube[1],
668 23275 : lp, cmax, zetp * dx[1],
669 23275 : &idx3(handler->pol, perm[1], 0, 0)); /* j indice */
670 23275 : grid_fill_pol_dgemm((lp != 0), 1.0, roffset[0], 0, lb_cube[0], ub_cube[0],
671 23275 : lp, cmax, zetp * dx[0],
672 23275 : &idx3(handler->pol, perm[0], 0, 0)); /* k indice */
673 :
674 : /* the three remaining tensors are initialized in the function */
675 23275 : calculate_non_orthorombic_corrections_tensor(
676 : zetp, roffset, (const double(*)[3])handler->dh, lb_cube, ub_cube,
677 23275 : handler->orthogonal, &handler->Exp);
678 : }
679 :
680 44389 : if (handler->apply_cutoff) {
681 0 : memset(handler->cube.data, 0, sizeof(double) * handler->cube.alloc_size_);
682 0 : if (!use_ortho && !use_ortho_forced) {
683 0 : extract_cube_within_spherical_cutoff_generic(
684 : handler, disr_radius, cmax, lb_cube, ub_cube, roffset, cubecenter);
685 : } else {
686 0 : extract_cube_within_spherical_cutoff_ortho(handler, disr_radius, cmax,
687 : lb_cube, cubecenter);
688 : }
689 : } else {
690 44389 : extract_cube(handler, cmax, lb_cube, cubecenter);
691 : }
692 :
693 44389 : if (!use_ortho && !use_ortho_forced)
694 23275 : apply_non_orthorombic_corrections(handler->orthogonal, &handler->Exp,
695 : &handler->cube);
696 :
697 44389 : if (lp != 0) {
698 35109 : tensor_reduction_for_collocate_integrate(handler->scratch,
699 : // pointer to scratch memory
700 35109 : 1.0, handler->orthogonal, NULL,
701 : &handler->cube, &handler->pol,
702 : &handler->coef);
703 : } else {
704 : /* it is very specific to integrate because we might end up with a
705 : * single element after the tensor product/contraction. In that case, I
706 : * compute the cube and then do a scalar product between the two. */
707 :
708 : /* we could also do this with 2 matrix-vector multiplications and a scalar
709 : * product
710 : *
711 : * H_{jk} = C_{ijk} . P_i (along x) C_{ijk} is *stored C[k][j][i]* !!!!!!
712 : * L_{k} = H_{jk} . P_j (along y)
713 : * v_{ab} = L_k . P_k (along z)
714 : */
715 :
716 9280 : tensor cube_tmp;
717 9280 : initialize_tensor_2(&cube_tmp, cube_size[0], cube_size[1]);
718 9280 : alloc_tensor(&cube_tmp);
719 :
720 : /* first along x */
721 9280 : cblas_dgemv(CblasRowMajor, CblasNoTrans,
722 9280 : handler->cube.size[0] * handler->cube.size[1],
723 9280 : handler->cube.size[2], 1.0, handler->cube.data,
724 9280 : handler->cube.ld_, &idx3(handler->pol, 2, 0, 0), 1, 0.0,
725 : cube_tmp.data, 1);
726 :
727 : /* second along j */
728 9280 : cblas_dgemv(CblasRowMajor, CblasNoTrans, handler->cube.size[0],
729 9280 : handler->cube.size[1], 1.0, cube_tmp.data, cube_tmp.ld_,
730 9280 : &idx3(handler->pol, 1, 0, 0), 1, 0.0, handler->scratch, 1);
731 :
732 : /* finally along k, it is a scalar product.... */
733 18560 : handler->coef.data[0] = cblas_ddot(handler->cube.size[0], handler->scratch,
734 9280 : 1, &idx3(handler->pol, 0, 0, 0), 1);
735 :
736 9280 : free(cube_tmp.data);
737 : }
738 :
739 : /* go from ijk -> xyz */
740 44389 : if (!use_ortho)
741 23275 : grid_transform_coef_jik_to_yxz((const double(*)[3])handler->dh,
742 : &handler->coef);
743 44389 : }
744 :
745 596 : void integrate_one_grid_level_dgemm(
746 : grid_context *const ctx, const int level, const bool calculate_tau,
747 : const bool calculate_forces, const bool calculate_virial,
748 : const int *const shift_local, const int *const border_width,
749 : const offload_buffer *const pab_blocks, offload_buffer *const hab_blocks,
750 : tensor *forces_, tensor *virial_) {
751 596 : tensor *const grid = &ctx->grid[level];
752 :
753 : // Using default(shared) because with GCC 9 the behavior around const changed:
754 : // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
755 596 : #pragma omp parallel default(shared)
756 : {
757 : const int num_threads = omp_get_num_threads();
758 : const int thread_id = omp_get_thread_num();
759 :
760 : double *hab_block_local = NULL;
761 :
762 : if (num_threads == 1) {
763 : hab_block_local = hab_blocks->host_buffer;
764 : } else {
765 : hab_block_local = ((double *)ctx->scratch) +
766 : thread_id * (hab_blocks->size / sizeof(double));
767 : memset(hab_block_local, 0, hab_blocks->size);
768 : }
769 :
770 : tensor work, pab, hab, vab, forces_local_, virial_local_,
771 : forces_local_pair_, virial_local_pair_;
772 : memset(&work, 0, sizeof(tensor));
773 : memset(&pab, 0, sizeof(tensor));
774 : memset(&hab, 0, sizeof(tensor));
775 : memset(&vab, 0, sizeof(tensor));
776 : memset(&forces_local_, 0, sizeof(tensor));
777 : memset(&virial_local_, 0, sizeof(tensor));
778 : memset(&forces_local_pair_, 0, sizeof(tensor));
779 : memset(&virial_local_pair_, 0, sizeof(tensor));
780 :
781 : struct collocation_integration_ *handler = ctx->handler[thread_id];
782 : handler->apply_cutoff = ctx->apply_cutoff;
783 : handler->lmax_diff[0] = 0;
784 : handler->lmax_diff[1] = 0;
785 : handler->lmin_diff[0] = 0;
786 : handler->lmin_diff[1] = 0;
787 :
788 : if (calculate_tau || calculate_forces || calculate_virial) {
789 : handler->lmax_diff[0] = 1;
790 : handler->lmax_diff[1] = 0;
791 : handler->lmin_diff[0] = -1;
792 : handler->lmin_diff[1] = -1;
793 : }
794 :
795 : if (calculate_virial) {
796 : handler->lmax_diff[0]++;
797 : handler->lmax_diff[1]++;
798 : }
799 :
800 : if (calculate_tau) {
801 : handler->lmax_diff[0]++;
802 : handler->lmax_diff[1]++;
803 : handler->lmin_diff[0]--;
804 : handler->lmin_diff[1]--;
805 : }
806 :
807 : // Allocate pab matrix for re-use across tasks.
808 : initialize_tensor_2(&pab, ctx->maxco, ctx->maxco);
809 : alloc_tensor(&pab);
810 :
811 : initialize_tensor_2(&vab, ctx->maxco, ctx->maxco);
812 : alloc_tensor(&vab);
813 :
814 : initialize_tensor_2(&work, ctx->maxco, ctx->maxco);
815 : alloc_tensor(&work);
816 :
817 : initialize_tensor_2(&hab, ctx->maxco, ctx->maxco);
818 : alloc_tensor(&hab);
819 :
820 : if (calculate_forces) {
821 : initialize_tensor_2(&forces_local_, forces_->size[0], forces_->size[1]);
822 : alloc_tensor(&forces_local_);
823 : initialize_tensor_2(&virial_local_, 3, 3);
824 : alloc_tensor(&virial_local_);
825 : memset(forces_local_.data, 0, sizeof(double) * forces_local_.alloc_size_);
826 : memset(virial_local_.data, 0, sizeof(double) * virial_local_.alloc_size_);
827 : initialize_tensor_2(&forces_local_pair_, 2, 3);
828 : alloc_tensor(&forces_local_pair_);
829 : initialize_tensor_3(&virial_local_pair_, 2, 3, 3);
830 : alloc_tensor(&virial_local_pair_);
831 : }
832 :
833 : initialize_basis_vectors(handler, (const double(*)[3])grid->dh,
834 : (const double(*)[3])grid->dh_inv);
835 :
836 : tensor_copy(&handler->grid, grid);
837 : handler->grid.data = grid->data;
838 : for (int d = 0; d < 3; d++)
839 : handler->orthogonal[d] = grid->orthogonal[d];
840 :
841 : /* it is only useful when we split the list over multiple threads. The
842 : * first iteration should load the block whatever status the
843 : * task->block_update_ variable has */
844 : const _task *prev_task = NULL;
845 : #pragma omp for schedule(static)
846 : for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
847 : // Define some convenient aliases.
848 : const _task *task = &ctx->tasks[level][itask];
849 :
850 : if (task->level != level) {
851 : printf("level %d, %d\n", task->level, level);
852 : abort();
853 : }
854 :
855 : if (task->update_block_ || (prev_task == NULL)) {
856 : /* need to load pab if forces are needed */
857 : if (calculate_forces) {
858 : extract_blocks(ctx, task, pab_blocks, &work, &pab);
859 : }
860 : /* store the coefficients of the operator after rotation to
861 : * the spherical harmonic basis */
862 :
863 : rotate_and_store_coefficients(ctx, prev_task, task, &hab, &work,
864 : hab_block_local);
865 :
866 : /* There is a difference here between collocate and integrate.
867 : * For collocate, I do not need to know the task where blocks
868 : * have been updated the last time. For integrate this
869 : * information is crucial to update the coefficients of the
870 : * potential */
871 : prev_task = task;
872 : memset(hab.data, 0, sizeof(double) * hab.alloc_size_);
873 : }
874 :
875 : /* the grid is divided over several ranks or not periodic */
876 : if ((handler->grid.size[0] != handler->grid.full_size[0]) ||
877 : (handler->grid.size[1] != handler->grid.full_size[1]) ||
878 : (handler->grid.size[2] != handler->grid.full_size[2])) {
879 : /* unfortunately the window where the gaussian should be added depends
880 : * on the bonds. So I have to adjust the window all the time. */
881 :
882 : setup_grid_window(&handler->grid, shift_local, border_width,
883 : task->border_mask);
884 : }
885 :
886 : int lmax[2] = {task->lmax[0] + handler->lmax_diff[0],
887 : task->lmax[1] + handler->lmax_diff[1]};
888 : int lmin[2] = {task->lmin[0] + handler->lmin_diff[0],
889 : task->lmin[1] + handler->lmin_diff[1]};
890 :
891 : lmin[0] = imax(lmin[0], 0);
892 : lmin[1] = imax(lmin[1], 0);
893 :
894 : initialize_tensor_4(&(handler->alpha), 3, lmax[1] + 1, lmax[0] + 1,
895 : lmax[0] + lmax[1] + 1);
896 : realloc_tensor(&(handler->alpha));
897 :
898 : const int lp = lmax[0] + lmax[1];
899 :
900 : initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
901 : realloc_tensor(&(handler->coef));
902 : grid_integrate(handler, ctx->orthorhombic, task->zetp, task->rp,
903 : task->radius);
904 : /*
905 : handler->coef contains coefficients in the (x - x_12) basis. now
906 : we need to rotate them in the (x - x_1) (x - x_2) basis
907 : */
908 :
909 : /* compute the transformation matrix */
910 : grid_prepare_alpha_dgemm(task->ra, task->rb, task->rp, lmax,
911 : &(handler->alpha));
912 :
913 : initialize_tensor_2(&vab, ncoset(lmax[1]), ncoset(lmax[0]));
914 : realloc_tensor(&vab);
915 :
916 : grid_compute_vab(lmin, lmax, lp, task->prefactor,
917 : &handler->alpha, // contains the matrix for the rotation
918 : &handler->coef, // contains the coefficients of
919 : // the potential in the
920 : // (x_x_12) basis
921 : &vab); // contains the coefficients of the potential
922 : // in the (x - x_1)(x - x_2) basis
923 :
924 : if (calculate_forces) {
925 : memset(forces_local_pair_.data, 0,
926 : sizeof(double) * forces_local_pair_.alloc_size_);
927 : memset(virial_local_pair_.data, 0,
928 : sizeof(double) * virial_local_pair_.alloc_size_);
929 : }
930 :
931 : update_hab_forces_and_stress(
932 : task, &vab, &pab, calculate_tau, calculate_forces, calculate_virial,
933 : &forces_local_pair_, /* matrix
934 : * containing the
935 : * contribution of
936 : * the gaussian
937 : * pair for each
938 : * atom */
939 : &virial_local_pair_, /* same but for the virial term (stress tensor)
940 : */
941 : &hab);
942 :
943 : if (calculate_forces) {
944 : const double scaling = (task->iatom == task->jatom) ? 1.0 : 2.0;
945 : idx2(forces_local_, task->iatom, 0) +=
946 : scaling * idx2(forces_local_pair_, 0, 0);
947 : idx2(forces_local_, task->iatom, 1) +=
948 : scaling * idx2(forces_local_pair_, 0, 1);
949 : idx2(forces_local_, task->iatom, 2) +=
950 : scaling * idx2(forces_local_pair_, 0, 2);
951 :
952 : idx2(forces_local_, task->jatom, 0) +=
953 : scaling * idx2(forces_local_pair_, 1, 0);
954 : idx2(forces_local_, task->jatom, 1) +=
955 : scaling * idx2(forces_local_pair_, 1, 1);
956 : idx2(forces_local_, task->jatom, 2) +=
957 : scaling * idx2(forces_local_pair_, 1, 2);
958 : if (calculate_virial) {
959 : for (int i = 0; i < 3; i++) {
960 : for (int j = 0; j < 3; j++) {
961 : idx2(virial_local_, i, j) +=
962 : scaling * (idx3(virial_local_pair_, 0, i, j) +
963 : idx3(virial_local_pair_, 1, i, j));
964 : }
965 : }
966 : }
967 : }
968 : }
969 :
970 : rotate_and_store_coefficients(ctx, prev_task, NULL, &hab, &work,
971 : hab_block_local);
972 :
973 : // now reduction over the hab blocks
974 : if (num_threads > 1) {
975 : // does not store the number of elements but the amount of memory
976 : // occupied. That's a strange choice.
977 : const int hab_size = hab_blocks->size / sizeof(double);
978 : if ((hab_size / num_threads) >= 2) {
979 : const int block_size =
980 : hab_size / num_threads + (hab_size % num_threads);
981 :
982 : for (int bk = 0; bk < num_threads; bk++) {
983 : int bk_id = (bk + thread_id) % num_threads;
984 : size_t begin = bk_id * block_size;
985 : size_t end = imin((bk_id + 1) * block_size, hab_size);
986 : cblas_daxpy(end - begin, 1.0, hab_block_local + begin, 1,
987 : hab_blocks->host_buffer + begin, 1);
988 : #pragma omp barrier
989 : }
990 : } else {
991 : const int hab_size = hab_blocks->size / sizeof(double);
992 : #pragma omp critical
993 : cblas_daxpy(hab_size, 1.0, hab_block_local, 1, hab_blocks->host_buffer,
994 : 1);
995 : }
996 : }
997 :
998 : if (calculate_forces) {
999 : if (num_threads > 1) {
1000 : if ((forces_->alloc_size_ / num_threads) >= 2) {
1001 : const int block_size = forces_->alloc_size_ / num_threads +
1002 : (forces_->alloc_size_ % num_threads);
1003 :
1004 : for (int bk = 0; bk < num_threads; bk++) {
1005 : int bk_id = (bk + thread_id) % num_threads;
1006 : int begin = bk_id * block_size;
1007 : int end = imin((bk_id + 1) * block_size, forces_->alloc_size_);
1008 : cblas_daxpy(end - begin, 1.0, forces_local_.data + begin, 1,
1009 : forces_->data + begin, 1);
1010 : #pragma omp barrier
1011 : }
1012 : } else {
1013 : #pragma omp critical
1014 : cblas_daxpy(forces_local_.alloc_size_, 1.0, forces_local_.data, 1,
1015 : forces_->data, 1);
1016 : }
1017 : } else {
1018 : // we are running with OMP_NUM_THREADS=1
1019 : cblas_daxpy(forces_local_.alloc_size_, 1.0, forces_local_.data, 1,
1020 : forces_->data, 1);
1021 : }
1022 : }
1023 :
1024 : if (calculate_virial) {
1025 : #pragma omp critical
1026 : for (int i = 0; i < 3; i++) {
1027 : for (int j = 0; j < 3; j++) {
1028 : idx2(virial_[0], i, j) += idx2(virial_local_, i, j);
1029 : }
1030 : }
1031 : }
1032 :
1033 : handler->grid.data = NULL;
1034 : free(vab.data);
1035 : free(work.data);
1036 : free(pab.data);
1037 : free(hab.data);
1038 :
1039 : if (calculate_forces) {
1040 : free(forces_local_.data);
1041 : free(virial_local_.data);
1042 : free(virial_local_pair_.data);
1043 : free(forces_local_pair_.data);
1044 : }
1045 : }
1046 596 : }
1047 :
1048 : /*******************************************************************************
1049 : * \brief Integrate all tasks of in given list from given grids using matrix -
1050 : * matrix multiplication
1051 : ******************************************************************************/
1052 149 : void grid_dgemm_integrate_task_list(
1053 : void *ptr, const bool compute_tau, const int natoms, const int nlevels,
1054 : const offload_buffer *const pab_blocks, offload_buffer *grids[nlevels],
1055 : offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
1056 :
1057 149 : grid_context *const ctx = (grid_context *)ptr;
1058 :
1059 149 : assert(ctx->checksum == ctx_checksum);
1060 149 : assert(ctx->nlevels == nlevels);
1061 149 : assert(ctx->natoms == natoms);
1062 :
1063 : // Zero result arrays.
1064 149 : memset(hab_blocks->host_buffer, 0, hab_blocks->size);
1065 :
1066 149 : const int max_threads = omp_get_max_threads();
1067 :
1068 149 : if (ctx->scratch == NULL)
1069 149 : ctx->scratch = malloc(hab_blocks->size * max_threads);
1070 149 : assert(ctx->scratch != NULL);
1071 :
1072 : // #pragma omp parallel for
1073 745 : for (int level = 0; level < ctx->nlevels; level++) {
1074 596 : const _layout *layout = &ctx->layouts[level];
1075 596 : set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
1076 596 : layout->npts_global, layout->npts_local,
1077 596 : layout->shift_local, layout->border_width, layout->dh,
1078 596 : layout->dh_inv, grids[level]);
1079 596 : ctx->grid[level].data = grids[level]->host_buffer;
1080 : }
1081 :
1082 149 : bool calculate_virial = (virial != NULL);
1083 149 : bool calculate_forces = (forces != NULL);
1084 :
1085 149 : tensor forces_, virial_;
1086 149 : if (calculate_forces) {
1087 19 : initialize_tensor_2(&forces_, natoms, 3);
1088 19 : alloc_tensor(&forces_);
1089 19 : initialize_tensor_2(&virial_, 3, 3);
1090 19 : alloc_tensor(&virial_);
1091 19 : memset(forces_.data, 0, sizeof(double) * forces_.alloc_size_);
1092 19 : memset(virial_.data, 0, sizeof(double) * virial_.alloc_size_);
1093 : }
1094 :
1095 745 : for (int level = 0; level < ctx->nlevels; level++) {
1096 596 : const _layout *layout = &ctx->layouts[level];
1097 596 : integrate_one_grid_level_dgemm(ctx, level, compute_tau, calculate_forces,
1098 596 : calculate_virial, layout->shift_local,
1099 596 : layout->border_width, pab_blocks, hab_blocks,
1100 : &forces_, &virial_);
1101 596 : ctx->grid[level].data = NULL;
1102 : }
1103 149 : if (calculate_forces) {
1104 19 : if (calculate_virial) {
1105 16 : virial[0][0] = idx2(virial_, 0, 0);
1106 16 : virial[0][1] = idx2(virial_, 0, 1);
1107 16 : virial[0][2] = idx2(virial_, 0, 2);
1108 16 : virial[1][0] = idx2(virial_, 1, 0);
1109 16 : virial[1][1] = idx2(virial_, 1, 1);
1110 16 : virial[1][2] = idx2(virial_, 1, 2);
1111 16 : virial[2][0] = idx2(virial_, 2, 0);
1112 16 : virial[2][1] = idx2(virial_, 2, 1);
1113 16 : virial[2][2] = idx2(virial_, 2, 2);
1114 : }
1115 :
1116 19 : memcpy(forces[0], forces_.data, sizeof(double) * forces_.alloc_size_);
1117 19 : free(forces_.data);
1118 19 : free(virial_.data);
1119 : }
1120 :
1121 149 : free(ctx->scratch);
1122 149 : ctx->scratch = NULL;
1123 149 : }
|