Line data Source code
1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 :
8 : #include "grid_dgemm_non_orthorombic_corrections.h"
9 :
10 : #include <math.h>
11 : #include <stdio.h>
12 : #include <stdlib.h>
13 : #include <string.h>
14 :
15 : #include "../common/grid_common.h"
16 : #include "grid_dgemm_utils.h"
17 :
18 1108868 : double exp_recursive(const double c_exp, const double c_exp_minus_1,
19 : const int index) {
20 1108868 : if (index == -1)
21 : return c_exp_minus_1;
22 :
23 1108868 : if (index == 1)
24 : return c_exp;
25 :
26 1108868 : double res = 1.0;
27 :
28 1108868 : if (index < 0) {
29 15250368 : for (int i = 0; i < -index; i++) {
30 14141500 : res *= c_exp_minus_1;
31 : }
32 : return res;
33 : }
34 :
35 0 : if (index > 0) {
36 0 : for (int i = 0; i < index; i++) {
37 0 : res *= c_exp;
38 : }
39 : return res;
40 : }
41 :
42 : return 1.0;
43 : }
44 :
45 91544 : void exp_i(const double alpha, const int imin, const int imax,
46 : double *restrict const res) {
47 91544 : const double c_exp_co = exp(alpha);
48 : /* const double c_exp_minus_1 = 1/ c_exp; */
49 91544 : res[0] = exp(imin * alpha);
50 2245864 : for (int i = 1; i < (imax - imin); i++) {
51 2154320 : res[i] = res[i - 1] *
52 : c_exp_co; // exp_recursive(c_exp_co, 1.0 / c_exp_co, i + imin);
53 : }
54 91544 : }
55 :
56 45772 : void exp_ij(const double alpha, const int offset_i, const int imin,
57 : const int imax, const int offset_j, const int jmin, const int jmax,
58 : tensor *exp_ij_) {
59 45772 : double c_exp = exp(alpha * imin);
60 45772 : const double c_exp_co = exp(alpha);
61 :
62 1154640 : for (int i = 0; i < (imax - imin); i++) {
63 1108868 : double *restrict dst = &idx2(exp_ij_[0], i + offset_i, offset_j);
64 1108868 : double ctmp = exp_recursive(c_exp, 1.0 / c_exp, jmin);
65 :
66 30500736 : for (int j = 0; j < (jmax - jmin); j++) {
67 29391868 : dst[j] *= ctmp;
68 29391868 : ctmp *= c_exp;
69 : }
70 1108868 : c_exp *= c_exp_co;
71 : }
72 45772 : }
73 :
74 45772 : void calculate_non_orthorombic_corrections_tensor(
75 : const double mu_mean, const double *r_ab, const double basis[3][3],
76 : const int *const xmin, const int *const xmax, bool plane[3],
77 : tensor *const Exp) {
78 : // zx, zy, yx
79 45772 : const int n[3][2] = {{0, 2}, {0, 1}, {1, 2}};
80 :
81 : // need to review this
82 45772 : const double c[3] = {
83 : /* alpha gamma */
84 45772 : -2.0 * mu_mean *
85 45772 : (basis[0][0] * basis[2][0] + basis[0][1] * basis[2][1] +
86 45772 : basis[0][2] * basis[2][2]),
87 : /* beta gamma */
88 45772 : -2.0 * mu_mean *
89 45772 : (basis[1][0] * basis[2][0] + basis[1][1] * basis[2][1] +
90 45772 : basis[1][2] * basis[2][2]),
91 : /* alpha beta */
92 45772 : -2.0 * mu_mean *
93 45772 : (basis[0][0] * basis[1][0] + basis[0][1] * basis[1][1] +
94 45772 : basis[0][2] * basis[1][2])};
95 :
96 : /* a naive implementation of the computation of exp(-2 (v_i . v_j) (i
97 : * - r_i) (j _ r_j)) requires n m exponentials but we can do much much
98 : * better with only 7 exponentials
99 : *
100 : * first expand it. we get exp(2 (v_i . v_j) i j) exp(2 (v_i . v_j) i r_j)
101 : * exp(2 (v_i . v_j) j r_i) exp(2 (v_i . v_j) r_i r_j). we can use the fact
102 : * that the sum of two terms in an exponential is equal to the product of
103 : * the two exponentials.
104 : *
105 : * this means that exp (a i) with i integer can be computed recursively with
106 : * one exponential only
107 : */
108 :
109 : /* we have a orthorombic case */
110 45772 : if (plane[0] && plane[1] && plane[2])
111 0 : return;
112 :
113 45772 : tensor exp_tmp;
114 45772 : double *x1, *x2;
115 : /* printf("%d %d %d\n", plane[0], plane[1], plane[2]); */
116 45772 : const int max_elem =
117 45772 : imax(imax(xmax[0] - xmin[0], xmax[1] - xmin[1]), xmax[2] - xmin[2]) + 1;
118 45772 : initialize_tensor_3(Exp, 3, max_elem, max_elem);
119 45772 : realloc_tensor(Exp);
120 :
121 45772 : x1 = grid_allocate_scratch(sizeof(double) * max_elem);
122 45772 : x2 = grid_allocate_scratch(sizeof(double) * max_elem);
123 45772 : initialize_tensor_2(&exp_tmp, Exp->size[1], Exp->size[2]);
124 :
125 45772 : memset(&idx3(Exp[0], 0, 0, 0), 0, sizeof(double) * Exp->alloc_size_);
126 183088 : for (int dir = 0; dir < 3; dir++) {
127 137316 : int d1 = n[dir][0];
128 137316 : int d2 = n[dir][1];
129 :
130 137316 : if (!plane[dir]) {
131 45772 : const double c_exp_const = exp(c[dir] * r_ab[d1] * r_ab[d2]);
132 :
133 45772 : exp_i(-r_ab[d2] * c[dir], xmin[d1], xmax[d1] + 1, x1);
134 45772 : exp_i(-r_ab[d1] * c[dir], xmin[d2], xmax[d2] + 1, x2);
135 :
136 45772 : exp_tmp.data = &idx3(Exp[0], dir, 0, 0);
137 45772 : cblas_dger(CblasRowMajor, xmax[d1] - xmin[d1] + 1,
138 45772 : xmax[d2] - xmin[d2] + 1, c_exp_const, x1, 1, x2, 1,
139 : &idx2(exp_tmp, 0, 0), exp_tmp.ld_);
140 45772 : exp_ij(c[dir], 0, xmin[d1], xmax[d1] + 1, 0, xmin[d2], xmax[d2] + 1,
141 : &exp_tmp);
142 : }
143 : }
144 45772 : grid_free_scratch(x1);
145 45772 : grid_free_scratch(x2);
146 : }
147 :
148 0 : void calculate_non_orthorombic_corrections_tensor_blocked(
149 : const double mu_mean, const double *r_ab, const double basis[3][3],
150 : const int *const lower_corner, const int *const upper_corner,
151 : const int *const block_size, const int *const offset, const int *const xmin,
152 : const int *const xmax, bool *plane, tensor *const Exp) {
153 : // zx, zy, yx
154 0 : const int n[3][2] = {{0, 2}, {0, 1}, {1, 2}};
155 :
156 : // need to review this
157 0 : const double c[3] = {
158 : /* alpha gamma */
159 0 : -2.0 * mu_mean *
160 0 : (basis[0][0] * basis[2][0] + basis[0][1] * basis[2][1] +
161 0 : basis[0][2] * basis[2][2]),
162 : /* beta gamma */
163 0 : -2.0 * mu_mean *
164 0 : (basis[1][0] * basis[2][0] + basis[1][1] * basis[2][1] +
165 0 : basis[1][2] * basis[2][2]),
166 : /* alpha beta */
167 0 : -2.0 * mu_mean *
168 0 : (basis[0][0] * basis[1][0] + basis[0][1] * basis[1][1] +
169 0 : basis[0][2] * basis[1][2])};
170 :
171 : /* a naive implementation of the computation of exp(-2 (v_i . v_j) (i
172 : * - r_i) (j _ r_j)) requires n m exponentials but we can do much much
173 : * better with only 7 exponentials
174 : *
175 : * first expand it. we get exp(2 (v_i . v_j) i j) exp(2 (v_i . v_j) i r_j)
176 : * exp(2 (v_i . v_j) j r_i) exp(2 (v_i . v_j) r_i r_j). we can use the fact
177 : * that the sum of two terms in an exponential is equal to the product of
178 : * the two exponentials.
179 : *
180 : * this means that exp (a i) with i integer can be computed recursively with
181 : * one exponential only
182 : */
183 :
184 : /* we have a orthorombic case */
185 0 : if (plane[0] && plane[1] && plane[2])
186 0 : return;
187 :
188 0 : tensor exp_blocked;
189 0 : double *x1, *x2;
190 : /* printf("%d %d %d\n", plane[0], plane[1], plane[2]); */
191 0 : initialize_tensor_2(&exp_blocked, imax(block_size[0], block_size[1]),
192 : imax(block_size[1], block_size[2]));
193 :
194 0 : const int cube_size[3] = {(upper_corner[0] - lower_corner[0]) * block_size[0],
195 0 : (upper_corner[1] - lower_corner[1]) * block_size[1],
196 0 : (upper_corner[2] - lower_corner[2]) *
197 : block_size[2]};
198 :
199 0 : const int max_elem = imax(imax(cube_size[0], cube_size[1]), cube_size[2]);
200 0 : x1 = grid_allocate_scratch(sizeof(double) * max_elem);
201 0 : x2 = grid_allocate_scratch(sizeof(double) * max_elem);
202 :
203 0 : initialize_tensor_4(Exp, 3,
204 0 : imax(upper_corner[0] - lower_corner[0],
205 : upper_corner[1] - lower_corner[1]),
206 0 : imax(upper_corner[2] - lower_corner[2],
207 0 : upper_corner[1] - lower_corner[1]),
208 0 : exp_blocked.alloc_size_);
209 :
210 0 : realloc_tensor(Exp);
211 : /* memset(Exp->data, 0, sizeof(double) * Exp->alloc_size_); */
212 :
213 0 : for (int dir = 0; dir < 3; dir++) {
214 0 : int d1 = n[dir][0];
215 0 : int d2 = n[dir][1];
216 :
217 0 : if (!plane[dir]) {
218 0 : const double c_exp_const = exp(c[dir] * r_ab[d1] * r_ab[d2]);
219 0 : memset(x1, 0, sizeof(double) * max_elem);
220 0 : memset(x2, 0, sizeof(double) * max_elem);
221 : /* memset(exp_tmp.data, 0, sizeof(double) * exp_tmp.alloc_size_); */
222 0 : exp_i(-r_ab[d2] * c[dir], xmin[d1], xmax[d1] + 1, x1 + offset[d1]);
223 0 : exp_i(-r_ab[d1] * c[dir], xmin[d2], xmax[d2] + 1, x2 + offset[d2]);
224 :
225 0 : for (int y = 0; y < (upper_corner[d1] - lower_corner[d1]); y++) {
226 0 : const int y_1 = y * block_size[d1];
227 0 : for (int x = 0; x < (upper_corner[d2] - lower_corner[d2]); x++) {
228 0 : const int x_1 = x * block_size[d2];
229 0 : exp_blocked.data = &idx4(Exp[0], dir, y, x, 0);
230 :
231 0 : for (int y2 = 0; y2 < block_size[d1]; y2++) {
232 0 : double *restrict dst = &idx2(exp_blocked, y2, 0);
233 0 : const double scal = x1[y_1 + y2] * c_exp_const;
234 0 : double *restrict src = &x2[x_1];
235 : // #pragma omp simd linear(dst, src) simdlen(8)
236 0 : GRID_PRAGMA_SIMD((dst, src), 8)
237 0 : for (int x3 = 0; x3 < block_size[d2]; x3++) {
238 0 : dst[x3] = scal * src[x3];
239 : }
240 : }
241 :
242 0 : exp_ij(c[dir], 0, xmin[d1] - offset[d1] + y_1,
243 0 : xmin[d1] - offset[d1] + y_1 + block_size[d1], 0,
244 : xmin[d2] - offset[d2] + x_1,
245 0 : xmin[d2] - offset[d2] + x_1 + block_size[d2], &exp_blocked);
246 : }
247 : }
248 : }
249 : }
250 :
251 0 : grid_free_scratch(x1);
252 0 : grid_free_scratch(x2);
253 : /* free(exp_tmp.data); */
254 : }
255 :
256 22497 : void apply_non_orthorombic_corrections(const bool plane[restrict 3],
257 : const tensor *const Exp,
258 : tensor *const cube) {
259 : // Well we should never call non orthorombic corrections if everything is
260 : // orthorombic
261 22497 : if (plane[0] && plane[1] && plane[2])
262 : return;
263 :
264 : /*k and i are orthogonal, k and j as well */
265 22497 : if (plane[0] && plane[1]) {
266 39732 : for (int z = 0; z < cube->size[0]; z++) {
267 983748 : for (int y = 0; y < cube->size[1]; y++) {
268 946350 : double *restrict yx = &idx3(Exp[0], 2, y, 0);
269 946350 : double *restrict dst = &idx3(cube[0], z, y, 0);
270 946350 : GRID_PRAGMA_SIMD((dst, yx), 8)
271 946350 : for (int x = 0; x < cube->size[2]; x++) {
272 24745974 : dst[x] *= yx[x];
273 : }
274 : }
275 : }
276 : return;
277 : }
278 :
279 : /* k and i are orhogonal, i and j as well */
280 20163 : if (plane[0] && plane[2]) {
281 0 : for (int z = 0; z < cube->size[0]; z++) {
282 0 : for (int y = 0; y < cube->size[1]; y++) {
283 0 : const double zy = idx3(Exp[0], 1, z, y);
284 0 : double *restrict dst = &idx3(cube[0], z, y, 0);
285 :
286 : // #pragma omp simd linear(dst) simdlen(8)
287 0 : GRID_PRAGMA_SIMD((dst), 8)
288 0 : for (int x = 0; x < cube->size[2]; x++) {
289 0 : dst[x] *= zy;
290 : }
291 : }
292 : }
293 : return;
294 : }
295 :
296 : /* j, k are orthognal, i and j are orthognal */
297 20163 : if (plane[1] && plane[2]) {
298 508286 : for (int z = 0; z < cube->size[0]; z++) {
299 488123 : double *restrict zx = &idx3(Exp[0], 0, z, 0);
300 13139192 : for (int y = 0; y < cube->size[1]; y++) {
301 12651069 : double *restrict dst = &idx3(cube[0], z, y, 0);
302 : // #pragma omp simd linear(dst, zx) simdlen(8)
303 12651069 : GRID_PRAGMA_SIMD((dst, zx), 8)
304 12651069 : for (int x = 0; x < cube->size[2]; x++) {
305 358329545 : dst[x] *= zx[x];
306 : }
307 : }
308 : }
309 : return;
310 : }
311 :
312 0 : if (plane[0]) {
313 : // z perpendicular to x. but y non perpendicular to any
314 0 : for (int z = 0; z < cube->size[0]; z++) {
315 0 : for (int y = 0; y < cube->size[1]; y++) {
316 0 : const double zy = idx3(Exp[0], 1, z, y);
317 0 : double *restrict yx = &idx3(Exp[0], 2, y, 0);
318 0 : double *restrict dst = &idx3(cube[0], z, y, 0);
319 :
320 : // #pragma omp simd linear(dst, yx) simdlen(8)
321 0 : GRID_PRAGMA_SIMD((dst, yx), 8)
322 0 : for (int x = 0; x < cube->size[2]; x++) {
323 0 : dst[x] *= zy * yx[x];
324 : }
325 : }
326 : }
327 : return;
328 : }
329 :
330 0 : if (plane[1]) {
331 : // z perpendicular to y, but x and z are not and y and x neither
332 0 : for (int z = 0; z < cube->size[0]; z++) {
333 0 : double *restrict zx = &idx3(Exp[0], 0, z, 0);
334 0 : for (int y = 0; y < cube->size[1]; y++) {
335 0 : double *restrict yx = &idx3(Exp[0], 2, y, 0);
336 0 : double *restrict dst = &idx3(cube[0], z, y, 0);
337 : // #pragma omp simd linear(dst, yx) simdlen(8)
338 0 : GRID_PRAGMA_SIMD((dst, yx), 8)
339 0 : for (int x = 0; x < cube->size[2]; x++) {
340 0 : dst[x] *= zx[x] * yx[x];
341 : }
342 : }
343 : }
344 : return;
345 : }
346 :
347 0 : if (plane[2]) {
348 : // x perpendicular to y, but x and z are not and y and z neither
349 0 : for (int z = 0; z < cube->size[0]; z++) {
350 0 : double *restrict zx = &idx3(Exp[0], 0, z, 0);
351 0 : for (int y = 0; y < cube->size[1]; y++) {
352 0 : const double zy = idx3(Exp[0], 1, z, y);
353 0 : double *restrict dst = &idx3(cube[0], z, y, 0);
354 :
355 : // #pragma omp simd linear(dst) simdlen(8)
356 0 : GRID_PRAGMA_SIMD((dst), 8)
357 0 : for (int x = 0; x < cube->size[2]; x++) {
358 0 : dst[x] *= zx[x] * zy;
359 : }
360 : }
361 : }
362 : return;
363 : }
364 :
365 : /* generic case */
366 :
367 0 : for (int z = 0; z < cube->size[0]; z++) {
368 0 : double *restrict zx = &idx3(Exp[0], 0, z, 0);
369 0 : for (int y = 0; y < cube->size[1]; y++) {
370 0 : const double zy = idx3(Exp[0], 1, z, y);
371 0 : const double *restrict yx = &idx3(Exp[0], 2, y, 0);
372 0 : double *restrict dst = &idx3(cube[0], z, y, 0);
373 :
374 : // #pragma omp simd linear(dst, zx, yx) simdlen(8)
375 0 : GRID_PRAGMA_SIMD((dst, zx), 8)
376 0 : for (int x = 0; x < cube->size[2]; x++) {
377 0 : dst[x] *= zx[x] * zy * yx[x];
378 : }
379 : }
380 : }
381 : return;
382 : }
383 :
384 3112 : void apply_non_orthorombic_corrections_xy_blocked(
385 : const struct tensor_ *const Exp, struct tensor_ *const m) {
386 12448 : for (int gamma = 0; gamma < m->size[0]; gamma++) {
387 236688 : for (int y1 = 0; y1 < m->size[1]; y1++) {
388 227352 : double *restrict dst = &idx3(m[0], gamma, y1, 0);
389 227352 : double *restrict src = &idx2(Exp[0], y1, 0);
390 :
391 : // #pragma omp simd linear(dst, src) simdlen(8)
392 227352 : GRID_PRAGMA_SIMD((dst, src), 8)
393 227352 : for (int x1 = 0; x1 < m->size[2]; x1++) {
394 5777400 : dst[x1] *= src[x1];
395 : }
396 : }
397 : }
398 3112 : }
399 :
400 20163 : void apply_non_orthorombic_corrections_xz_blocked(
401 : const struct tensor_ *const Exp, struct tensor_ *const m) {
402 508286 : for (int z1 = 0; z1 < m->size[0]; z1++) {
403 488123 : double *restrict src = &idx2(Exp[0], z1, 0);
404 13139192 : for (int y1 = 0; y1 < m->size[1]; y1++) {
405 12651069 : double *restrict dst = &idx3(m[0], z1, y1, 0);
406 : // #pragma omp simd linear(dst, src) simdlen(8)
407 12651069 : GRID_PRAGMA_SIMD((dst, src), 8)
408 12651069 : for (int x1 = 0; x1 < m->size[2]; x1++) {
409 358329545 : dst[x1] *= src[x1];
410 : }
411 : }
412 : }
413 20163 : }
414 :
415 0 : void apply_non_orthorombic_corrections_yz_blocked(
416 : const struct tensor_ *const Exp, struct tensor_ *const m) {
417 0 : for (int z1 = 0; z1 < m->size[0]; z1++) {
418 0 : for (int y1 = 0; y1 < m->size[1]; y1++) {
419 0 : const double src = idx2(Exp[0], z1, y1);
420 0 : double *restrict dst = &idx3(m[0], z1, y1, 0);
421 : // #pragma omp simd linear(dst) simdlen(8)
422 0 : GRID_PRAGMA_SIMD((dst), 8)
423 0 : for (int x1 = 0; x1 < m->size[2]; x1++) {
424 0 : dst[x1] *= src;
425 : }
426 : }
427 : }
428 0 : }
429 :
430 0 : void apply_non_orthorombic_corrections_xz_yz_blocked(
431 : const struct tensor_ *const Exp_xz, const struct tensor_ *const Exp_yz,
432 : struct tensor_ *const m) {
433 0 : for (int z1 = 0; z1 < m->size[0]; z1++) {
434 0 : const double *restrict src_xz = &idx2(Exp_xz[0], z1, 0);
435 0 : for (int y1 = 0; y1 < m->size[1]; y1++) {
436 0 : const double src = idx2(Exp_yz[0], z1, y1);
437 0 : double *restrict dst = &idx3(m[0], z1, y1, 0);
438 : // #pragma omp simd linear(dst) simdlen(8)
439 0 : GRID_PRAGMA_SIMD((dst), 8)
440 0 : for (int x1 = 0; x1 < m->size[2]; x1++) {
441 0 : dst[x1] *= src * src_xz[x1];
442 : }
443 : }
444 : }
445 0 : }
|