LCOV - code coverage report
Current view: top level - src/grid/dgemm - grid_dgemm_integrate.c (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:42dac4a) Lines: 68.3 % 454 310
Test Date: 2025-07-25 12:55:17 Functions: 75.0 % 12 9

            Line data    Source code
       1              : /*----------------------------------------------------------------------------*/
       2              : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3              : /*  Copyright 2000-2025 CP2K developers group <https://cp2k.org>              */
       4              : /*                                                                            */
       5              : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6              : /*----------------------------------------------------------------------------*/
       7              : 
       8              : #include <assert.h>
       9              : #include <limits.h>
      10              : #include <math.h>
      11              : #include <stdbool.h>
      12              : #include <stdio.h>
      13              : #include <stdlib.h>
      14              : #include <string.h>
      15              : #include <unistd.h>
      16              : 
      17              : #include <omp.h>
      18              : 
      19              : #ifdef __LIBXSMM
      20              : #include <libxsmm.h>
      21              : #endif
      22              : 
      23              : #include "../common/grid_common.h"
      24              : #include "grid_dgemm_coefficients.h"
      25              : #include "grid_dgemm_collocate.h"
      26              : #include "grid_dgemm_collocation_integration.h"
      27              : #include "grid_dgemm_non_orthorombic_corrections.h"
      28              : #include "grid_dgemm_private_header.h"
      29              : #include "grid_dgemm_tensor_local.h"
      30              : #include "grid_dgemm_utils.h"
      31              : 
      32            0 : void extract_cube_within_spherical_cutoff_ortho(
      33              :     struct collocation_integration_ *const handler, const double disr_radius,
      34              :     const int cmax, const int *const lb_cube, const int *const cube_center) {
      35              :   // a mapping so that the ig corresponds to the right grid point
      36            0 :   int **map = handler->map;
      37            0 :   map[1] = map[0] + 2 * cmax + 1;
      38            0 :   map[2] = map[1] + 2 * cmax + 1;
      39              :   // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
      40              : 
      41            0 :   for (int i = 0; i < 3; i++) {
      42            0 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
      43            0 :       map[i][ig] = modulo(cube_center[i] + lb_cube[i] + ig -
      44            0 :                               handler->grid.lower_corner[i],
      45              :                           handler->grid.full_size[i]);
      46              :     }
      47              :   }
      48              : 
      49            0 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
      50            0 :                             .xmax = handler->grid.window_size[0]};
      51            0 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
      52            0 :                             .xmax = handler->grid.window_size[1]};
      53            0 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
      54            0 :                             .xmax = handler->grid.window_size[2]};
      55              : 
      56            0 :   for (int kg = 0; kg < handler->cube.size[0]; kg++) {
      57            0 :     const int k = map[0][kg];
      58            0 :     const int kd =
      59            0 :         (2 * (kg + lb_cube[0]) - 1) / 2; // distance from center in grid points
      60            0 :     const double kr = kd * handler->dh[2][2]; // distance from center in a.u.
      61            0 :     const double kremain = disr_radius * disr_radius - kr * kr;
      62              : 
      63            0 :     if ((kremain >= 0.0) && is_point_in_interval(k, zwindow)) {
      64              : 
      65            0 :       const int jgmin = ceil(-1e-8 - sqrt(kremain) * handler->dh_inv[1][1]);
      66            0 :       for (int jg = jgmin; jg <= (1 - jgmin); jg++) {
      67            0 :         const int j = map[1][jg - lb_cube[1]];
      68            0 :         const double jr = ((2 * jg - 1) >> 1) *
      69            0 :                           handler->dh[1][1]; // distance from center in a.u.
      70            0 :         const double jremain = kremain - jr * jr;
      71              : 
      72            0 :         if ((jremain >= 0.0) && is_point_in_interval(j, ywindow)) {
      73            0 :           const int xmin = ceil(-1e-8 - sqrt(jremain) * handler->dh_inv[0][0]);
      74            0 :           const int xmax = 1 - xmin;
      75              : 
      76              :           // printf("xmin %d, xmax %d\n", xmin, xmax);
      77            0 :           for (int x = xmin - lb_cube[2];
      78            0 :                x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
      79            0 :             const int x1 = map[2][x];
      80              : 
      81            0 :             if (!is_point_in_interval(x1, xwindow))
      82            0 :               continue;
      83              : 
      84            0 :             int lower_corner[3] = {k, j, x1};
      85            0 :             int upper_corner[3] = {k + 1, j + 1, x1 + 1};
      86              : 
      87            0 :             compute_interval(map[2], handler->grid.full_size[2],
      88              :                              handler->grid.size[2], handler->cube.size[2], x1,
      89              :                              &x, lower_corner + 2, upper_corner + 2, xwindow);
      90              : 
      91            0 :             if (upper_corner[2] - lower_corner[2]) {
      92            0 :               const int position1[3] = {kg, jg - lb_cube[1], x};
      93              : 
      94              :               /* the function will internally take care of the local vs global
      95              :                * grid */
      96              : 
      97            0 :               double *restrict dst = &idx3(handler->cube, position1[0],
      98              :                                            position1[1], position1[2]);
      99              : 
     100            0 :               double *restrict src = &idx3(handler->grid, lower_corner[0],
     101              :                                            lower_corner[1], lower_corner[2]);
     102              : 
     103            0 :               const int sizex = upper_corner[2] - lower_corner[2];
     104            0 :               GRID_PRAGMA_SIMD((dst, src), 8)
     105            0 :               for (int x = 0; x < sizex; x++) {
     106            0 :                 dst[x] = src[x];
     107              :               }
     108              :             }
     109              : 
     110            0 :             if (handler->grid.size[2] == handler->grid.full_size[2])
     111            0 :               update_loop_index(handler->grid.full_size[2], x1, &x);
     112              :             else
     113            0 :               x += upper_corner[2] - lower_corner[2] - 1;
     114              :           }
     115              :         }
     116              :       }
     117              :     }
     118              :   }
     119            0 : }
     120              : 
     121            0 : void extract_cube_within_spherical_cutoff_generic(
     122              :     struct collocation_integration_ *const handler, const double disr_radius,
     123              :     const int cmax, const int *const lb_cube, const int *const ub_cube,
     124              :     const double *const roffset, const int *const cube_center) {
     125              : 
     126            0 :   const double a = handler->dh[0][0] * handler->dh[0][0] +
     127            0 :                    handler->dh[0][1] * handler->dh[0][1] +
     128            0 :                    handler->dh[0][2] * handler->dh[0][2];
     129            0 :   const double a_inv = 1.0 / a;
     130              :   // a mapping so that the ig corresponds to the right grid point
     131            0 :   int **map = handler->map;
     132            0 :   map[1] = map[0] + 2 * cmax + 1;
     133            0 :   map[2] = map[1] + 2 * cmax + 1;
     134              :   // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
     135              : 
     136            0 :   for (int i = 0; i < 3; i++) {
     137            0 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     138            0 :       map[i][ig] = modulo(cube_center[i] + ig + lb_cube[i] -
     139            0 :                               handler->grid.lower_corner[i],
     140              :                           handler->grid.full_size[i]);
     141              :     }
     142              :   }
     143              : 
     144            0 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     145            0 :                             .xmax = handler->grid.window_size[0]};
     146            0 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     147            0 :                             .xmax = handler->grid.window_size[1]};
     148            0 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     149            0 :                             .xmax = handler->grid.window_size[2]};
     150              : 
     151            0 :   for (int k = 0; k < handler->cube.size[0]; k++) {
     152            0 :     const int iz = map[0][k];
     153              : 
     154            0 :     if (!is_point_in_interval(iz, zwindow))
     155            0 :       continue;
     156              : 
     157            0 :     const double tz = (k + lb_cube[0] - roffset[0]);
     158            0 :     const double z[3] = {tz * handler->dh[2][0], tz * handler->dh[2][1],
     159            0 :                          tz * handler->dh[2][2]};
     160              : 
     161            0 :     for (int j = 0; j < handler->cube.size[1]; j++) {
     162            0 :       const int iy = map[1][j];
     163              : 
     164            0 :       if (!is_point_in_interval(iy, ywindow))
     165            0 :         continue;
     166              : 
     167            0 :       const double ty = (j - roffset[1] + lb_cube[1]);
     168            0 :       const double y[3] = {z[0] + ty * handler->dh[1][0],
     169            0 :                            z[1] + ty * handler->dh[1][1],
     170            0 :                            z[2] + ty * handler->dh[1][2]};
     171              : 
     172              :       /* Sqrt[(-2 x1 \[Alpha] - 2 y1 \[Beta] - 2 z1 \[Gamma])^2 - */
     173              :       /*                                            4 (x1^2 + y1^2 + z1^2)
     174              :        * (\[Alpha]^2 + \[Beta]^2 + \[Gamma]^2)] */
     175              : 
     176            0 :       const double b =
     177            0 :           -2.0 * (handler->dh[0][0] * (roffset[2] * handler->dh[0][0] - y[0]) +
     178            0 :                   handler->dh[0][1] * (roffset[2] * handler->dh[0][1] - y[1]) +
     179            0 :                   handler->dh[0][2] * (roffset[2] * handler->dh[0][2] - y[2]));
     180              : 
     181            0 :       const double c = (roffset[2] * handler->dh[0][0] - y[0]) *
     182            0 :                            (roffset[2] * handler->dh[0][0] - y[0]) +
     183            0 :                        (roffset[2] * handler->dh[0][1] - y[1]) *
     184            0 :                            (roffset[2] * handler->dh[0][1] - y[1]) +
     185            0 :                        (roffset[2] * handler->dh[0][2] - y[2]) *
     186              :                            (roffset[2] * handler->dh[0][2] - y[2]) -
     187            0 :                        disr_radius * disr_radius;
     188              : 
     189            0 :       double delta = b * b - 4.0 * a * c;
     190              : 
     191            0 :       if (delta < 0.0)
     192            0 :         continue;
     193              : 
     194            0 :       delta = sqrt(delta);
     195              : 
     196            0 :       const int xmin = imax(ceil((-b - delta) * 0.5 * a_inv), lb_cube[2]);
     197            0 :       const int xmax = imin(floor((-b + delta) * 0.5 * a_inv), ub_cube[2]);
     198              : 
     199            0 :       int lower_corner[3] = {iz, iy, xmin};
     200            0 :       int upper_corner[3] = {iz + 1, iy + 1, xmin};
     201              : 
     202            0 :       for (int x = xmin - lb_cube[2];
     203            0 :            x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
     204            0 :         const int x1 = map[2][x];
     205              : 
     206            0 :         if (!is_point_in_interval(x1, xwindow))
     207            0 :           continue;
     208              : 
     209            0 :         compute_interval(map[2], handler->grid.full_size[2],
     210              :                          handler->grid.size[2], handler->cube.size[2], x1, &x,
     211              :                          lower_corner + 2, upper_corner + 2, xwindow);
     212              : 
     213            0 :         if (upper_corner[2] - lower_corner[2]) {
     214            0 :           const int position1[3] = {k, j, x};
     215              : 
     216              :           /* the function will internally take care of the local vs global
     217              :            * grid */
     218              : 
     219            0 :           double *__restrict__ src = &idx3(handler->grid, lower_corner[0],
     220              :                                            lower_corner[1], lower_corner[2]);
     221            0 :           double *__restrict__ dst =
     222            0 :               &idx3(handler->cube, position1[0], position1[1], position1[2]);
     223              : 
     224            0 :           const int sizex = upper_corner[2] - lower_corner[2];
     225              : 
     226              :           // #pragma omp simd linear(dst, src) simdlen(8)
     227            0 :           GRID_PRAGMA_SIMD((dst, src), 8)
     228            0 :           for (int x = 0; x < sizex; x++) {
     229            0 :             dst[x] = src[x];
     230              :           }
     231              : 
     232            0 :           if (handler->grid.size[2] == handler->grid.full_size[2])
     233            0 :             update_loop_index(handler->grid.full_size[2], x1, &x);
     234              :           else
     235            0 :             x += upper_corner[2] - lower_corner[2] - 1;
     236              :         }
     237              :       }
     238              :     }
     239              :   }
     240            0 : }
     241              : 
     242         5114 : static void rotate_and_store_coefficients(grid_context *const ctx,
     243              :                                           const _task *prev_task,
     244              :                                           const _task *task, tensor *const hab,
     245              :                                           tensor *work, // some scratch matrix
     246              :                                           double *blocks) {
     247         5114 :   if (prev_task != NULL) {
     248              :     /* prev_task is NULL when we deal with the first iteration. In that case
     249              :      * we just need to initialize the hab matrix to the proper size. Note
     250              :      * that resizing does not really occurs since memory allocation is done
     251              :      * for the maximum size the matrix can have. */
     252         4482 :     const int iatom = prev_task->iatom;
     253         4482 :     const int jatom = prev_task->jatom;
     254         4482 :     const int iset = prev_task->iset;
     255         4482 :     const int jset = prev_task->jset;
     256         4482 :     const int ikind = ctx->atom_kinds[iatom];
     257         4482 :     const int jkind = ctx->atom_kinds[jatom];
     258         4482 :     const grid_basis_set *ibasis = ctx->basis_sets[ikind];
     259         4482 :     const grid_basis_set *jbasis = ctx->basis_sets[jkind];
     260              : 
     261         4482 :     const int block_num = prev_task->block_num;
     262         4482 :     double *const block = &blocks[ctx->block_offsets[block_num]];
     263              : 
     264         4482 :     const int ncoseta = ncoset(ibasis->lmax[iset]);
     265         4482 :     const int ncosetb = ncoset(jbasis->lmax[jset]);
     266              : 
     267         4482 :     const int ncoa = ibasis->npgf[iset] * ncoseta;
     268         4482 :     const int ncob = jbasis->npgf[jset] * ncosetb;
     269              : 
     270         4482 :     const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set */
     271         4482 :     const int nsgf_setb = jbasis->nsgf_set[jset];
     272         4482 :     const int nsgfa = ibasis->nsgf; // size of entire spherical basis
     273         4482 :     const int nsgfb = jbasis->nsgf;
     274         4482 :     const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
     275         4482 :     const int sgfb = jbasis->first_sgf[jset] - 1;
     276         4482 :     const int maxcoa = ibasis->maxco;
     277         4482 :     const int maxcob = jbasis->maxco;
     278              : 
     279         4482 :     initialize_tensor_2(work, nsgf_setb, ncoa);
     280         4482 :     realloc_tensor(work);
     281              : 
     282              :     // Warning these matrices are row major....
     283              : 
     284         4482 :     dgemm_params m1, m2;
     285         4482 :     memset(&m1, 0, sizeof(dgemm_params));
     286         4482 :     memset(&m2, 0, sizeof(dgemm_params));
     287              : 
     288         4482 :     m1.op1 = 'N';
     289         4482 :     m1.op2 = 'N';
     290         4482 :     m1.m = nsgf_setb;
     291         4482 :     m1.n = ncoa;
     292         4482 :     m1.k = ncob;
     293         4482 :     m1.alpha = 1.0;
     294         4482 :     m1.beta = 0.0;
     295         4482 :     m1.a = &jbasis->sphi[sgfb * maxcob];
     296         4482 :     m1.lda = maxcob;
     297         4482 :     m1.b = hab->data;
     298         4482 :     m1.ldb = ncoa;
     299         4482 :     m1.c = work->data;
     300         4482 :     m1.ldc = work->ld_;
     301              : 
     302              :     // phi[b][ncob]
     303              :     // work[b][ncoa] = phi[b][ncob] * hab[ncob][ncoa]
     304              : 
     305         4482 :     if (iatom <= jatom) {
     306              :       // I need to have the final result in the form
     307              : 
     308              :       // block[b][a] = work[b][ncoa] transpose(phi[a][ncoa])
     309         2982 :       m2.op1 = 'N';
     310         2982 :       m2.op2 = 'T';
     311         2982 :       m2.m = nsgf_setb;
     312         2982 :       m2.n = nsgf_seta;
     313         2982 :       m2.k = ncoa;
     314         2982 :       m2.a = work->data;
     315         2982 :       m2.lda = work->ld_;
     316         2982 :       m2.b = &ibasis->sphi[sgfa * maxcoa];
     317         2982 :       m2.ldb = maxcoa;
     318         2982 :       m2.c = block + sgfb * nsgfa + sgfa;
     319         2982 :       m2.ldc = nsgfa;
     320         2982 :       m2.alpha = 1.0;
     321         2982 :       m2.beta = 1.0;
     322              :     } else {
     323              :       // block[a][b] = phi[a][ncoa] Transpose(work[b][ncoa])
     324         1500 :       m2.op1 = 'N';
     325         1500 :       m2.op2 = 'T';
     326         1500 :       m2.m = nsgf_seta;
     327         1500 :       m2.n = nsgf_setb;
     328         1500 :       m2.k = ncoa;
     329         1500 :       m2.a = &ibasis->sphi[sgfa * maxcoa];
     330         1500 :       m2.lda = maxcoa;
     331         1500 :       m2.b = work->data;
     332         1500 :       m2.ldb = work->ld_;
     333         1500 :       m2.c = block + sgfa * nsgfb + sgfb;
     334         1500 :       m2.ldc = nsgfb;
     335         1500 :       m2.alpha = 1.0;
     336         1500 :       m2.beta = 1.0;
     337              :     }
     338              : 
     339         4482 :     m1.use_libxsmm = true;
     340         4482 :     m2.use_libxsmm = true;
     341              : 
     342              :     /* these dgemm are *row* major */
     343         4482 :     dgemm_simplified(&m1);
     344         4482 :     dgemm_simplified(&m2);
     345              :   }
     346              : 
     347         5114 :   if (task != NULL) {
     348         4482 :     const int iatom = task->iatom;
     349         4482 :     const int jatom = task->jatom;
     350         4482 :     const int ikind = ctx->atom_kinds[iatom];
     351         4482 :     const int jkind = ctx->atom_kinds[jatom];
     352         4482 :     const int iset = task->iset;
     353         4482 :     const int jset = task->jset;
     354         4482 :     const grid_basis_set *ibasis = ctx->basis_sets[ikind];
     355         4482 :     const grid_basis_set *jbasis = ctx->basis_sets[jkind];
     356         4482 :     const int ncoseta = ncoset(ibasis->lmax[iset]);
     357         4482 :     const int ncosetb = ncoset(jbasis->lmax[jset]);
     358              : 
     359         4482 :     const int ncoa = ibasis->npgf[iset] * ncoseta;
     360         4482 :     const int ncob = jbasis->npgf[jset] * ncosetb;
     361              : 
     362         4482 :     initialize_tensor_2(hab, ncob, ncoa);
     363         4482 :     realloc_tensor(hab);
     364              :   }
     365         5114 : }
     366              : 
     367        49036 : void update_force_pair(orbital a, orbital b, const double pab,
     368              :                        const double ftz[2], const double *const rab,
     369              :                        const tensor *const vab, tensor *force) {
     370        49036 :   const double axpm0 = idx2(vab[0], idx(b), idx(a));
     371       196144 :   for (int i = 0; i < 3; i++) {
     372       147108 :     const double aip1 = idx2(vab[0], idx(b), idx(up(i, a)));
     373       147108 :     const double aim1 = idx2(vab[0], idx(b), idx(down(i, a)));
     374       147108 :     const double bim1 = idx2(vab[0], idx(down(i, b)), idx(a));
     375       147108 :     idx2(force[0], 0, i) += pab * (ftz[0] * aip1 - a.l[i] * aim1);
     376       147108 :     idx2(force[0], 1, i) +=
     377       147108 :         pab * (ftz[1] * (aip1 - rab[i] * axpm0) - b.l[i] * bim1);
     378              :   }
     379        49036 : }
     380              : 
     381        34952 : void update_virial_pair(orbital a, orbital b, const double pab,
     382              :                         const double ftz[2], const double *const rab,
     383              :                         const tensor *const vab, tensor *virial) {
     384       139808 :   for (int i = 0; i < 3; i++) {
     385       419424 :     for (int j = 0; j < 3; j++) {
     386       314568 :       idx3(virial[0], 0, i, j) +=
     387       314568 :           pab * ftz[0] * idx2(vab[0], idx(b), idx(up(i, up(j, a)))) -
     388       314568 :           pab * a.l[j] * idx2(vab[0], idx(b), idx(up(i, down(j, a))));
     389              :     }
     390              :   }
     391              : 
     392       139808 :   for (int i = 0; i < 3; i++) {
     393       419424 :     for (int j = 0; j < 3; j++) {
     394       314568 :       idx3(virial[0], 1, i, j) +=
     395       314568 :           pab * ftz[1] *
     396       314568 :               (idx2(vab[0], idx(b), idx(up(i, up(j, a)))) -
     397       314568 :                idx2(vab[0], idx(b), idx(up(i, a))) * rab[j] -
     398       314568 :                idx2(vab[0], idx(b), idx(up(j, a))) * rab[i] +
     399       314568 :                idx2(vab[0], idx(b), idx(a)) * rab[j] * rab[i]) -
     400       314568 :           pab * b.l[j] * idx2(vab[0], idx(up(i, down(j, b))), idx(a));
     401              :     }
     402              :   }
     403        34952 : }
     404              : 
     405       352870 : void update_all(const bool compute_forces, const bool compute_virial,
     406              :                 const orbital a, const orbital b, const double f,
     407              :                 const double *const ftz, const double *rab, const tensor *vab,
     408              :                 const double pab, double *hab, tensor *forces,
     409              :                 tensor *virials) {
     410              : 
     411       352870 :   *hab += f * idx2(vab[0], idx(b), idx(a));
     412              : 
     413       352870 :   if (compute_forces) {
     414        49036 :     update_force_pair(a, b, f * pab, ftz, rab, vab, forces);
     415              :   }
     416              : 
     417       352870 :   if (compute_virial) {
     418        34952 :     update_virial_pair(a, b, f * pab, ftz, rab, vab, virials);
     419              :   }
     420       352870 : }
     421              : 
     422            0 : static void update_tau(const bool compute_forces, const bool compute_virial,
     423              :                        const orbital a, const orbital b, const double ftz[2],
     424              :                        const double *const rab, const tensor *const vab,
     425              :                        const double pab, double *const hab, tensor *forces,
     426              :                        tensor *virials) {
     427              : 
     428            0 :   for (int i = 0; i < 3; i++) {
     429            0 :     update_all(compute_forces, compute_virial, down(i, a), down(i, b),
     430            0 :                0.5 * a.l[i] * b.l[i], ftz, rab, vab, pab, hab, forces, virials);
     431            0 :     update_all(compute_forces, compute_virial, up(i, a), down(i, b),
     432            0 :                -0.5 * ftz[0] * b.l[i], ftz, rab, vab, pab, hab, forces,
     433              :                virials);
     434            0 :     update_all(compute_forces, compute_virial, down(i, a), up(i, b),
     435            0 :                -0.5 * a.l[i] * ftz[1], ftz, rab, vab, pab, hab, forces,
     436              :                virials);
     437            0 :     update_all(compute_forces, compute_virial, up(i, a), up(i, b),
     438            0 :                0.5 * ftz[0] * ftz[1], ftz, rab, vab, pab, hab, forces, virials);
     439              :   }
     440            0 : }
     441              : 
     442              : static void
     443        44389 : update_hab_forces_and_stress(const _task *task, const tensor *const vab,
     444              :                              const tensor *const pab, const bool compute_tau,
     445              :                              const bool compute_forces,
     446              :                              const bool compute_virial, tensor *const forces,
     447              :                              tensor *const virial, tensor *const hab) {
     448        44389 :   double zeta[2] = {task->zeta[0] * 2.0, task->zeta[1] * 2.0};
     449       104127 :   for (int lb = task->lmin[1]; lb <= task->lmax[1]; lb++) {
     450       142989 :     for (int la = task->lmin[0]; la <= task->lmax[0]; la++) {
     451       207817 :       for (int bx = 0; bx <= lb; bx++) {
     452       295454 :         for (int by = 0; by <= lb - bx; by++) {
     453       170888 :           const int bz = lb - bx - by;
     454       170888 :           const orbital b = {{bx, by, bz}};
     455       170888 :           const int idx_b = task->offset[1] + idx(b);
     456       427643 :           for (int ax = 0; ax <= la; ax++) {
     457       609625 :             for (int ay = 0; ay <= la - ax; ay++) {
     458       352870 :               const int az = la - ax - ay;
     459       352870 :               const orbital a = {{ax, ay, az}};
     460       352870 :               const int idx_a = task->offset[0] + idx(a);
     461       352870 :               double *habval = &idx2(hab[0], idx_b, idx_a);
     462       352870 :               const double prefactor = idx2(pab[0], idx_b, idx_a);
     463              : 
     464              :               // now compute the forces
     465       352870 :               if (compute_tau) {
     466            0 :                 update_tau(compute_forces, compute_virial, a, b, zeta,
     467            0 :                            task->rab, vab, prefactor, habval, forces, virial);
     468              :               } else {
     469       352870 :                 update_all(compute_forces, compute_virial, a, b, 1.0, zeta,
     470       352870 :                            task->rab, vab, prefactor, habval, forces, virial);
     471              :               }
     472              :             }
     473              :           }
     474              :         }
     475              :       }
     476              :     }
     477              :   }
     478        44389 : }
     479              : 
     480              : /* It is a sub-optimal version of the mapping in case of a cubic cutoff. But is
     481              :  * very general and does not depend on the orthorombic nature of the grid. for
     482              :  * orthorombic cases, it is faster to apply PCB directly on the polynomials. */
     483              : 
     484        44389 : void extract_cube(struct collocation_integration_ *handler, const int cmax,
     485              :                   const int *lower_boundaries_cube, const int *cube_center) {
     486              : 
     487              :   // a mapping so that the ig corresponds to the right grid point
     488        44389 :   int **map = handler->map;
     489        44389 :   map[1] = map[0] + 2 * cmax + 1;
     490        44389 :   map[2] = map[1] + 2 * cmax + 1;
     491              :   //  memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
     492       177556 :   for (int i = 0; i < 3; i++) {
     493      3385300 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     494      3252133 :       map[i][ig] = modulo(cube_center[i] + ig + lower_boundaries_cube[i] -
     495      3252133 :                               handler->grid.lower_corner[i],
     496              :                           handler->grid.full_size[i]);
     497              :     }
     498              :   }
     499              : 
     500        44389 :   int lower_corner[3];
     501        44389 :   int upper_corner[3];
     502              : 
     503        44389 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     504        44389 :                             .xmax = handler->grid.window_size[0]};
     505        44389 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     506        44389 :                             .xmax = handler->grid.window_size[1]};
     507        44389 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     508        44389 :                             .xmax = handler->grid.window_size[2]};
     509              : 
     510       130455 :   for (int z = 0; (z < handler->cube.size[0]); z++) {
     511        86066 :     const int z1 = map[0][z];
     512              : 
     513       172132 :     if (!is_point_in_interval(z1, zwindow))
     514            0 :       continue;
     515              : 
     516        86066 :     compute_interval(map[0], handler->grid.full_size[0], handler->grid.size[0],
     517              :                      handler->cube.size[0], z1, &z, lower_corner, upper_corner,
     518              :                      zwindow);
     519              : 
     520              :     /* // We have a full plane. */
     521        86066 :     if (upper_corner[0] - lower_corner[0]) {
     522       261386 :       for (int y = 0; y < handler->cube.size[1]; y++) {
     523       175320 :         const int y1 = map[1][y];
     524              : 
     525              :         // this check is completely irrelevant when running without MPI.
     526       350640 :         if (!is_point_in_interval(y1, ywindow))
     527            0 :           continue;
     528              : 
     529       175320 :         compute_interval(map[1], handler->grid.full_size[1],
     530              :                          handler->grid.size[1], handler->cube.size[1], y1, &y,
     531              :                          lower_corner + 1, upper_corner + 1, ywindow);
     532              : 
     533       175320 :         if (upper_corner[1] - lower_corner[1]) {
     534       570824 :           for (int x = 0; x < handler->cube.size[2]; x++) {
     535       395504 :             const int x1 = map[2][x];
     536              : 
     537       791008 :             if (!is_point_in_interval(x1, xwindow))
     538            0 :               continue;
     539              : 
     540       395504 :             compute_interval(map[2], handler->grid.full_size[2],
     541              :                              handler->grid.size[2], handler->cube.size[2], x1,
     542              :                              &x, lower_corner + 2, upper_corner + 2, xwindow);
     543              : 
     544       395504 :             if (upper_corner[2] - lower_corner[2]) { // should be non zero
     545       395504 :               const int position1[3] = {z, y, x};
     546              : 
     547              :               /* the function will internally take care of the local vx global
     548              :                * grid */
     549       395504 :               extract_sub_grid(
     550              :                   lower_corner, // lower corner of the portion of cube (in the
     551              :                   // full grid)
     552              :                   upper_corner, // upper corner of the portion of cube (in the
     553              :                   // full grid)
     554              :                   position1, // starting position subblock inside the cube
     555       395504 :                   &handler->grid,
     556              :                   &handler->cube); // the grid to add data from
     557              : 
     558       395504 :               if (handler->grid.size[2] == handler->grid.full_size[2])
     559       395504 :                 update_loop_index(handler->grid.full_size[2], x1, &x);
     560              :               else
     561            0 :                 x += upper_corner[2] - lower_corner[2] - 1;
     562              :             }
     563              :           }
     564       175320 :           if (handler->grid.size[1] == handler->grid.full_size[1])
     565       175320 :             update_loop_index(handler->grid.full_size[1], y1, &y);
     566              :           else
     567            0 :             y += upper_corner[1] - lower_corner[1] - 1;
     568              :         }
     569              :       }
     570        86066 :       if (handler->grid.size[0] == handler->grid.full_size[0])
     571        86066 :         update_loop_index(handler->grid.full_size[0], z1, &z);
     572              :       else
     573            0 :         z += upper_corner[0] - lower_corner[0] - 1;
     574              :     }
     575              :   }
     576        44389 : }
     577              : 
     578        44389 : void grid_integrate(collocation_integration *const handler,
     579              :                     const bool use_ortho, const double zetp, const double rp[3],
     580              :                     const double radius) {
     581        44389 :   if (handler == NULL)
     582            0 :     abort();
     583              : 
     584        44389 :   const int lp = handler->coef.size[0] - 1;
     585              : 
     586        44389 :   int cubecenter[3];
     587        44389 :   int cube_size[3];
     588        44389 :   int lb_cube[3], ub_cube[3];
     589        44389 :   double roffset[3];
     590        44389 :   double disr_radius;
     591              : 
     592              :   /* cube : grid comtaining pointlike product between polynomials
     593              :    *
     594              :    * pol : grid  containing the polynomials in all three directions
     595              :    */
     596              : 
     597              :   /* seting up the cube parameters */
     598        88778 :   int cmax = compute_cube_properties(
     599        44389 :       use_ortho, radius, (const double(*)[3])handler->dh,
     600        44389 :       (const double(*)[3])handler->dh_inv, rp, &disr_radius, roffset,
     601              :       cubecenter, lb_cube, ub_cube, cube_size);
     602              : 
     603              :   /* initialize the multidimensional array containing the polynomials */
     604        44389 :   if (lp != 0) {
     605        35109 :     initialize_tensor_3(&handler->pol, 3, cmax, handler->coef.size[0]);
     606              :   } else {
     607         9280 :     initialize_tensor_3(&handler->pol, 3, handler->coef.size[0], cmax);
     608              :   }
     609        44389 :   handler->pol_alloc_size = realloc_tensor(&handler->pol);
     610              : 
     611              :   /* allocate memory for the polynomial and the cube */
     612              : 
     613        44389 :   initialize_tensor_3(&handler->cube, cube_size[0], cube_size[1], cube_size[2]);
     614              : 
     615        44389 :   handler->cube_alloc_size = realloc_tensor(&handler->cube);
     616              : 
     617        44389 :   initialize_W_and_T(handler, &handler->coef, &handler->cube);
     618              : 
     619              :   /* compute the polynomials */
     620              : 
     621              :   // WARNING : do not reverse the order in pol otherwise you will have to
     622              :   // reverse the order in collocate_dgemm as well.
     623              : 
     624              :   /* the tensor contraction is done for a given order so I have to be careful
     625              :    * how the tensors X, Y, Z are stored. In collocate, they are stored
     626              :    * normally 0 (Z), (1) Y, (2) X in the table pol. but in integrate (which
     627              :    * uses the same tensor reduction), I have a special treatment for l = 0. In
     628              :    * that case the order *should* be the same than for collocate. For l > 0,
     629              :    * we need a different storage order which is (X) 2, (Y) 0, and (Z) 1.
     630              :    *
     631              :    * the reason for this is that the cube is stored as cube[z][y][x] so the
     632              :    * first direction taken for the dgemm is along x.
     633              :    */
     634              : 
     635        44389 :   int perm[3] = {2, 0, 1};
     636              : 
     637        44389 :   if (lp == 0) {
     638              :     /* I need to restore the same order than for collocate */
     639         9280 :     perm[0] = 0;
     640         9280 :     perm[1] = 1;
     641         9280 :     perm[2] = 2;
     642              :   }
     643              : 
     644        44389 :   bool use_ortho_forced = handler->orthogonal[0] && handler->orthogonal[1] &&
     645        24226 :                           handler->orthogonal[2];
     646        44389 :   if (use_ortho) {
     647        21114 :     grid_fill_pol_dgemm((lp != 0), handler->dh[0][0], roffset[2], 0, lb_cube[2],
     648              :                         ub_cube[2], lp, cmax, zetp,
     649        21114 :                         &idx3(handler->pol, perm[2], 0, 0)); /* i indice */
     650        21114 :     grid_fill_pol_dgemm((lp != 0), handler->dh[1][1], roffset[1], 0, lb_cube[1],
     651              :                         ub_cube[1], lp, cmax, zetp,
     652        21114 :                         &idx3(handler->pol, perm[1], 0, 0)); /* j indice */
     653        21114 :     grid_fill_pol_dgemm((lp != 0), handler->dh[2][2], roffset[0], 0, lb_cube[0],
     654              :                         ub_cube[0], lp, cmax, zetp,
     655        21114 :                         &idx3(handler->pol, perm[0], 0, 0)); /* k indice */
     656              :   } else {
     657        23275 :     double dx[3];
     658              : 
     659        23275 :     dx[2] = handler->dh[0][0] * handler->dh[0][0] +
     660        23275 :             handler->dh[0][1] * handler->dh[0][1] +
     661        23275 :             handler->dh[0][2] * handler->dh[0][2];
     662              : 
     663        23275 :     dx[1] = handler->dh[1][0] * handler->dh[1][0] +
     664        23275 :             handler->dh[1][1] * handler->dh[1][1] +
     665        23275 :             handler->dh[1][2] * handler->dh[1][2];
     666              : 
     667        23275 :     dx[0] = handler->dh[2][0] * handler->dh[2][0] +
     668        23275 :             handler->dh[2][1] * handler->dh[2][1] +
     669        23275 :             handler->dh[2][2] * handler->dh[2][2];
     670              : 
     671        23275 :     grid_fill_pol_dgemm((lp != 0), 1.0, roffset[2], 0, lb_cube[2], ub_cube[2],
     672              :                         lp, cmax, zetp * dx[2],
     673        23275 :                         &idx3(handler->pol, perm[2], 0, 0)); /* i indice */
     674        23275 :     grid_fill_pol_dgemm((lp != 0), 1.0, roffset[1], 0, lb_cube[1], ub_cube[1],
     675              :                         lp, cmax, zetp * dx[1],
     676        23275 :                         &idx3(handler->pol, perm[1], 0, 0)); /* j indice */
     677        23275 :     grid_fill_pol_dgemm((lp != 0), 1.0, roffset[0], 0, lb_cube[0], ub_cube[0],
     678              :                         lp, cmax, zetp * dx[0],
     679        23275 :                         &idx3(handler->pol, perm[0], 0, 0)); /* k indice */
     680              : 
     681              :     /* the three remaining tensors are initialized in the function */
     682        23275 :     calculate_non_orthorombic_corrections_tensor(
     683              :         zetp, roffset, (const double(*)[3])handler->dh, lb_cube, ub_cube,
     684        23275 :         handler->orthogonal, &handler->Exp);
     685              :   }
     686              : 
     687        44389 :   if (handler->apply_cutoff) {
     688            0 :     memset(handler->cube.data, 0, sizeof(double) * handler->cube.alloc_size_);
     689            0 :     if (!use_ortho && !use_ortho_forced) {
     690            0 :       extract_cube_within_spherical_cutoff_generic(
     691              :           handler, disr_radius, cmax, lb_cube, ub_cube, roffset, cubecenter);
     692              :     } else {
     693            0 :       extract_cube_within_spherical_cutoff_ortho(handler, disr_radius, cmax,
     694              :                                                  lb_cube, cubecenter);
     695              :     }
     696              :   } else {
     697        44389 :     extract_cube(handler, cmax, lb_cube, cubecenter);
     698              :   }
     699              : 
     700        44389 :   if (!use_ortho && !use_ortho_forced)
     701        23275 :     apply_non_orthorombic_corrections(handler->orthogonal, &handler->Exp,
     702              :                                       &handler->cube);
     703              : 
     704        44389 :   if (lp != 0) {
     705        35109 :     tensor_reduction_for_collocate_integrate(handler->scratch,
     706              :                                              // pointer to scratch memory
     707        35109 :                                              1.0, handler->orthogonal, NULL,
     708              :                                              &handler->cube, &handler->pol,
     709              :                                              &handler->coef);
     710              :   } else {
     711              :     /* it is very specific to integrate because we might end up with a
     712              :      * single element after the tensor product/contraction. In that case, I
     713              :      * compute the cube and then do a scalar product between the two. */
     714              : 
     715              :     /* we could also do this with 2 matrix-vector multiplications and a scalar
     716              :      * product
     717              :      *
     718              :      * H_{jk} = C_{ijk} . P_i (along x) C_{ijk} is *stored C[k][j][i]* !!!!!!
     719              :      * L_{k} = H_{jk} . P_j (along y)
     720              :      * v_{ab} = L_k . P_k (along z)
     721              :      */
     722              : 
     723         9280 :     tensor cube_tmp;
     724         9280 :     initialize_tensor_2(&cube_tmp, cube_size[0], cube_size[1]);
     725         9280 :     alloc_tensor(&cube_tmp);
     726              : 
     727              :     /* first along x */
     728         9280 :     cblas_dgemv(CblasRowMajor, CblasNoTrans,
     729         9280 :                 handler->cube.size[0] * handler->cube.size[1],
     730         9280 :                 handler->cube.size[2], 1.0, handler->cube.data,
     731         9280 :                 handler->cube.ld_, &idx3(handler->pol, 2, 0, 0), 1, 0.0,
     732              :                 cube_tmp.data, 1);
     733              : 
     734              :     /* second along j */
     735         9280 :     cblas_dgemv(CblasRowMajor, CblasNoTrans, handler->cube.size[0],
     736         9280 :                 handler->cube.size[1], 1.0, cube_tmp.data, cube_tmp.ld_,
     737         9280 :                 &idx3(handler->pol, 1, 0, 0), 1, 0.0, handler->scratch, 1);
     738              : 
     739              :     /* finally along k, it is a scalar product.... */
     740        18560 :     handler->coef.data[0] = cblas_ddot(handler->cube.size[0], handler->scratch,
     741         9280 :                                        1, &idx3(handler->pol, 0, 0, 0), 1);
     742              : 
     743         9280 :     free(cube_tmp.data);
     744              :   }
     745              : 
     746              :   /* go from ijk -> xyz */
     747        44389 :   if (!use_ortho)
     748        23275 :     grid_transform_coef_jik_to_yxz((const double(*)[3])handler->dh,
     749              :                                    &handler->coef);
     750        44389 : }
     751              : 
     752          632 : void integrate_one_grid_level_dgemm(
     753              :     grid_context *const ctx, const int level, const bool calculate_tau,
     754              :     const bool calculate_forces, const bool calculate_virial,
     755              :     const int *const shift_local, const int *const border_width,
     756              :     const offload_buffer *const pab_blocks, offload_buffer *const hab_blocks,
     757              :     tensor *forces_, tensor *virial_) {
     758          632 :   tensor *const grid = &ctx->grid[level];
     759              : 
     760              :   // Using default(shared) because with GCC 9 the behavior around const changed:
     761              :   // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
     762          632 : #pragma omp parallel default(shared)
     763              :   {
     764              :     const int num_threads = omp_get_num_threads();
     765              :     const int thread_id = omp_get_thread_num();
     766              : 
     767              :     double *hab_block_local = NULL;
     768              : 
     769              :     if (num_threads == 1) {
     770              :       hab_block_local = hab_blocks->host_buffer;
     771              :     } else {
     772              :       hab_block_local = ((double *)ctx->scratch) +
     773              :                         thread_id * (hab_blocks->size / sizeof(double));
     774              :       memset(hab_block_local, 0, hab_blocks->size);
     775              :     }
     776              : 
     777              :     tensor work, pab, hab, vab, forces_local_, virial_local_,
     778              :         forces_local_pair_, virial_local_pair_;
     779              :     memset(&work, 0, sizeof(tensor));
     780              :     memset(&pab, 0, sizeof(tensor));
     781              :     memset(&hab, 0, sizeof(tensor));
     782              :     memset(&vab, 0, sizeof(tensor));
     783              :     memset(&forces_local_, 0, sizeof(tensor));
     784              :     memset(&virial_local_, 0, sizeof(tensor));
     785              :     memset(&forces_local_pair_, 0, sizeof(tensor));
     786              :     memset(&virial_local_pair_, 0, sizeof(tensor));
     787              : 
     788              :     struct collocation_integration_ *handler = ctx->handler[thread_id];
     789              :     handler->apply_cutoff = ctx->apply_cutoff;
     790              :     handler->lmax_diff[0] = 0;
     791              :     handler->lmax_diff[1] = 0;
     792              :     handler->lmin_diff[0] = 0;
     793              :     handler->lmin_diff[1] = 0;
     794              : 
     795              :     if (calculate_tau || calculate_forces || calculate_virial) {
     796              :       handler->lmax_diff[0] = 1;
     797              :       handler->lmax_diff[1] = 0;
     798              :       handler->lmin_diff[0] = -1;
     799              :       handler->lmin_diff[1] = -1;
     800              :     }
     801              : 
     802              :     if (calculate_virial) {
     803              :       handler->lmax_diff[0]++;
     804              :       handler->lmax_diff[1]++;
     805              :     }
     806              : 
     807              :     if (calculate_tau) {
     808              :       handler->lmax_diff[0]++;
     809              :       handler->lmax_diff[1]++;
     810              :       handler->lmin_diff[0]--;
     811              :       handler->lmin_diff[1]--;
     812              :     }
     813              : 
     814              :     // Allocate pab matrix for re-use across tasks.
     815              :     initialize_tensor_2(&pab, ctx->maxco, ctx->maxco);
     816              :     alloc_tensor(&pab);
     817              : 
     818              :     initialize_tensor_2(&vab, ctx->maxco, ctx->maxco);
     819              :     alloc_tensor(&vab);
     820              : 
     821              :     initialize_tensor_2(&work, ctx->maxco, ctx->maxco);
     822              :     alloc_tensor(&work);
     823              : 
     824              :     initialize_tensor_2(&hab, ctx->maxco, ctx->maxco);
     825              :     alloc_tensor(&hab);
     826              : 
     827              :     if (calculate_forces) {
     828              :       initialize_tensor_2(&forces_local_, forces_->size[0], forces_->size[1]);
     829              :       alloc_tensor(&forces_local_);
     830              :       initialize_tensor_2(&virial_local_, 3, 3);
     831              :       alloc_tensor(&virial_local_);
     832              :       memset(forces_local_.data, 0, sizeof(double) * forces_local_.alloc_size_);
     833              :       memset(virial_local_.data, 0, sizeof(double) * virial_local_.alloc_size_);
     834              :       initialize_tensor_2(&forces_local_pair_, 2, 3);
     835              :       alloc_tensor(&forces_local_pair_);
     836              :       initialize_tensor_3(&virial_local_pair_, 2, 3, 3);
     837              :       alloc_tensor(&virial_local_pair_);
     838              :     }
     839              : 
     840              :     initialize_basis_vectors(handler, (const double(*)[3])grid->dh,
     841              :                              (const double(*)[3])grid->dh_inv);
     842              : 
     843              :     tensor_copy(&handler->grid, grid);
     844              :     handler->grid.data = grid->data;
     845              :     for (int d = 0; d < 3; d++)
     846              :       handler->orthogonal[d] = grid->orthogonal[d];
     847              : 
     848              :     /* it is only useful when we split the list over multiple threads. The
     849              :      * first iteration should load the block whatever status the
     850              :      * task->block_update_ variable has */
     851              :     const _task *prev_task = NULL;
     852              : #pragma omp for schedule(static)
     853              :     for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
     854              :       // Define some convenient aliases.
     855              :       const _task *task = &ctx->tasks[level][itask];
     856              : 
     857              :       if (task->level != level) {
     858              :         printf("level %d, %d\n", task->level, level);
     859              :         abort();
     860              :       }
     861              : 
     862              :       if (task->update_block_ || (prev_task == NULL)) {
     863              :         /* need to load pab if forces are needed */
     864              :         if (calculate_forces) {
     865              :           extract_blocks(ctx, task, pab_blocks, &work, &pab);
     866              :         }
     867              :         /* store the coefficients of the operator after rotation to
     868              :          * the spherical harmonic basis */
     869              : 
     870              :         rotate_and_store_coefficients(ctx, prev_task, task, &hab, &work,
     871              :                                       hab_block_local);
     872              : 
     873              :         /* There is a difference here between collocate and integrate.
     874              :          * For collocate, I do not need to know the task where blocks
     875              :          * have been updated the last time. For integrate this
     876              :          * information is crucial to update the coefficients of the
     877              :          * potential */
     878              :         prev_task = task;
     879              :         memset(hab.data, 0, sizeof(double) * hab.alloc_size_);
     880              :       }
     881              : 
     882              :       /* the grid is divided over several ranks or not periodic */
     883              :       if ((handler->grid.size[0] != handler->grid.full_size[0]) ||
     884              :           (handler->grid.size[1] != handler->grid.full_size[1]) ||
     885              :           (handler->grid.size[2] != handler->grid.full_size[2])) {
     886              :         /* unfortunately the window where the gaussian should be added depends
     887              :          * on the bonds. So I have to adjust the window all the time. */
     888              : 
     889              :         setup_grid_window(&handler->grid, shift_local, border_width,
     890              :                           task->border_mask);
     891              :       }
     892              : 
     893              :       int lmax[2] = {task->lmax[0] + handler->lmax_diff[0],
     894              :                      task->lmax[1] + handler->lmax_diff[1]};
     895              :       int lmin[2] = {task->lmin[0] + handler->lmin_diff[0],
     896              :                      task->lmin[1] + handler->lmin_diff[1]};
     897              : 
     898              :       lmin[0] = imax(lmin[0], 0);
     899              :       lmin[1] = imax(lmin[1], 0);
     900              : 
     901              :       initialize_tensor_4(&(handler->alpha), 3, lmax[1] + 1, lmax[0] + 1,
     902              :                           lmax[0] + lmax[1] + 1);
     903              :       realloc_tensor(&(handler->alpha));
     904              : 
     905              :       const int lp = lmax[0] + lmax[1];
     906              : 
     907              :       initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
     908              :       realloc_tensor(&(handler->coef));
     909              :       grid_integrate(handler, ctx->orthorhombic, task->zetp, task->rp,
     910              :                      task->radius);
     911              :       /*
     912              :               handler->coef contains coefficients in the (x - x_12) basis. now
     913              :               we need to rotate them in the (x - x_1) (x - x_2) basis
     914              :       */
     915              : 
     916              :       /* compute the transformation matrix */
     917              :       grid_prepare_alpha_dgemm(task->ra, task->rb, task->rp, lmax,
     918              :                                &(handler->alpha));
     919              : 
     920              :       initialize_tensor_2(&vab, ncoset(lmax[1]), ncoset(lmax[0]));
     921              :       realloc_tensor(&vab);
     922              : 
     923              :       grid_compute_vab(lmin, lmax, lp, task->prefactor,
     924              :                        &handler->alpha, // contains the matrix for the rotation
     925              :                        &handler->coef,  // contains the coefficients of
     926              :                        // the potential in the
     927              :                        // (x_x_12) basis
     928              :                        &vab); // contains the coefficients of the potential
     929              :       // in the (x - x_1)(x - x_2) basis
     930              : 
     931              :       if (calculate_forces) {
     932              :         memset(forces_local_pair_.data, 0,
     933              :                sizeof(double) * forces_local_pair_.alloc_size_);
     934              :         memset(virial_local_pair_.data, 0,
     935              :                sizeof(double) * virial_local_pair_.alloc_size_);
     936              :       }
     937              : 
     938              :       update_hab_forces_and_stress(
     939              :           task, &vab, &pab, calculate_tau, calculate_forces, calculate_virial,
     940              :           &forces_local_pair_, /* matrix
     941              :                                 * containing the
     942              :                                 * contribution of
     943              :                                 * the gaussian
     944              :                                 * pair for each
     945              :                                 * atom */
     946              :           &virial_local_pair_, /* same but for the virial term (stress tensor)
     947              :                                 */
     948              :           &hab);
     949              : 
     950              :       if (calculate_forces) {
     951              :         const double scaling = (task->iatom == task->jatom) ? 1.0 : 2.0;
     952              :         idx2(forces_local_, task->iatom, 0) +=
     953              :             scaling * idx2(forces_local_pair_, 0, 0);
     954              :         idx2(forces_local_, task->iatom, 1) +=
     955              :             scaling * idx2(forces_local_pair_, 0, 1);
     956              :         idx2(forces_local_, task->iatom, 2) +=
     957              :             scaling * idx2(forces_local_pair_, 0, 2);
     958              : 
     959              :         idx2(forces_local_, task->jatom, 0) +=
     960              :             scaling * idx2(forces_local_pair_, 1, 0);
     961              :         idx2(forces_local_, task->jatom, 1) +=
     962              :             scaling * idx2(forces_local_pair_, 1, 1);
     963              :         idx2(forces_local_, task->jatom, 2) +=
     964              :             scaling * idx2(forces_local_pair_, 1, 2);
     965              :         if (calculate_virial) {
     966              :           for (int i = 0; i < 3; i++) {
     967              :             for (int j = 0; j < 3; j++) {
     968              :               idx2(virial_local_, i, j) +=
     969              :                   scaling * (idx3(virial_local_pair_, 0, i, j) +
     970              :                              idx3(virial_local_pair_, 1, i, j));
     971              :             }
     972              :           }
     973              :         }
     974              :       }
     975              :     }
     976              : 
     977              :     rotate_and_store_coefficients(ctx, prev_task, NULL, &hab, &work,
     978              :                                   hab_block_local);
     979              : 
     980              :     // now reduction over the hab blocks
     981              :     if (num_threads > 1) {
     982              :       // does not store the number of elements but the amount of memory
     983              :       // occupied. That's a strange choice.
     984              :       const int hab_size = hab_blocks->size / sizeof(double);
     985              :       if ((hab_size / num_threads) >= 2) {
     986              :         const int block_size =
     987              :             hab_size / num_threads + (hab_size % num_threads);
     988              : 
     989              :         for (int bk = 0; bk < num_threads; bk++) {
     990              :           int bk_id = (bk + thread_id) % num_threads;
     991              :           size_t begin = bk_id * block_size;
     992              :           size_t end = imin((bk_id + 1) * block_size, hab_size);
     993              :           cblas_daxpy(end - begin, 1.0, hab_block_local + begin, 1,
     994              :                       hab_blocks->host_buffer + begin, 1);
     995              : #pragma omp barrier
     996              :         }
     997              :       } else {
     998              :         const int hab_size = hab_blocks->size / sizeof(double);
     999              : #pragma omp critical
    1000              :         cblas_daxpy(hab_size, 1.0, hab_block_local, 1, hab_blocks->host_buffer,
    1001              :                     1);
    1002              :       }
    1003              :     }
    1004              : 
    1005              :     if (calculate_forces) {
    1006              :       if (num_threads > 1) {
    1007              :         if ((forces_->alloc_size_ / num_threads) >= 2) {
    1008              :           const int block_size = forces_->alloc_size_ / num_threads +
    1009              :                                  (forces_->alloc_size_ % num_threads);
    1010              : 
    1011              :           for (int bk = 0; bk < num_threads; bk++) {
    1012              :             int bk_id = (bk + thread_id) % num_threads;
    1013              :             int begin = bk_id * block_size;
    1014              :             int end = imin((bk_id + 1) * block_size, forces_->alloc_size_);
    1015              :             cblas_daxpy(end - begin, 1.0, forces_local_.data + begin, 1,
    1016              :                         forces_->data + begin, 1);
    1017              : #pragma omp barrier
    1018              :           }
    1019              :         } else {
    1020              : #pragma omp critical
    1021              :           cblas_daxpy(forces_local_.alloc_size_, 1.0, forces_local_.data, 1,
    1022              :                       forces_->data, 1);
    1023              :         }
    1024              :       } else {
    1025              :         // we are running with OMP_NUM_THREADS=1
    1026              :         cblas_daxpy(forces_local_.alloc_size_, 1.0, forces_local_.data, 1,
    1027              :                     forces_->data, 1);
    1028              :       }
    1029              :     }
    1030              : 
    1031              :     if (calculate_virial) {
    1032              : #pragma omp critical
    1033              :       for (int i = 0; i < 3; i++) {
    1034              :         for (int j = 0; j < 3; j++) {
    1035              :           idx2(virial_[0], i, j) += idx2(virial_local_, i, j);
    1036              :         }
    1037              :       }
    1038              :     }
    1039              : 
    1040              :     handler->grid.data = NULL;
    1041              :     free(vab.data);
    1042              :     free(work.data);
    1043              :     free(pab.data);
    1044              :     free(hab.data);
    1045              : 
    1046              :     if (calculate_forces) {
    1047              :       free(forces_local_.data);
    1048              :       free(virial_local_.data);
    1049              :       free(virial_local_pair_.data);
    1050              :       free(forces_local_pair_.data);
    1051              :     }
    1052              :   }
    1053          632 : }
    1054              : 
    1055              : /*******************************************************************************
    1056              :  * \brief Integrate all tasks of in given list from given grids using matrix -
    1057              :  * matrix multiplication
    1058              :  ******************************************************************************/
    1059          158 : void grid_dgemm_integrate_task_list(
    1060              :     void *ptr, const bool compute_tau, const int natoms, const int nlevels,
    1061              :     const offload_buffer *const pab_blocks, offload_buffer *grids[nlevels],
    1062              :     offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
    1063              : 
    1064          158 :   grid_context *const ctx = (grid_context *)ptr;
    1065              : 
    1066          158 :   assert(ctx->checksum == ctx_checksum);
    1067          158 :   assert(ctx->nlevels == nlevels);
    1068          158 :   assert(ctx->natoms == natoms);
    1069              : 
    1070              :   // Zero result arrays.
    1071          158 :   memset(hab_blocks->host_buffer, 0, hab_blocks->size);
    1072              : 
    1073          158 :   const int max_threads = omp_get_max_threads();
    1074              : 
    1075          158 :   if (ctx->scratch == NULL)
    1076          158 :     ctx->scratch = malloc(hab_blocks->size * max_threads);
    1077          158 :   assert(ctx->scratch != NULL);
    1078              : 
    1079              :   // #pragma omp parallel for
    1080          790 :   for (int level = 0; level < ctx->nlevels; level++) {
    1081          632 :     const _layout *layout = &ctx->layouts[level];
    1082          632 :     set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
    1083          632 :                         layout->npts_global, layout->npts_local,
    1084          632 :                         layout->shift_local, layout->border_width, layout->dh,
    1085          632 :                         layout->dh_inv, grids[level]);
    1086          632 :     ctx->grid[level].data = grids[level]->host_buffer;
    1087              :   }
    1088              : 
    1089          158 :   bool calculate_virial = (virial != NULL);
    1090          158 :   bool calculate_forces = (forces != NULL);
    1091              : 
    1092          158 :   tensor forces_, virial_;
    1093          158 :   if (calculate_forces) {
    1094           20 :     initialize_tensor_2(&forces_, natoms, 3);
    1095           20 :     alloc_tensor(&forces_);
    1096           20 :     initialize_tensor_2(&virial_, 3, 3);
    1097           20 :     alloc_tensor(&virial_);
    1098           20 :     memset(forces_.data, 0, sizeof(double) * forces_.alloc_size_);
    1099           20 :     memset(virial_.data, 0, sizeof(double) * virial_.alloc_size_);
    1100              :   }
    1101              : 
    1102          790 :   for (int level = 0; level < ctx->nlevels; level++) {
    1103          632 :     const _layout *layout = &ctx->layouts[level];
    1104          632 :     integrate_one_grid_level_dgemm(ctx, level, compute_tau, calculate_forces,
    1105          632 :                                    calculate_virial, layout->shift_local,
    1106          632 :                                    layout->border_width, pab_blocks, hab_blocks,
    1107              :                                    &forces_, &virial_);
    1108          632 :     ctx->grid[level].data = NULL;
    1109              :   }
    1110          158 :   if (calculate_forces) {
    1111           20 :     if (calculate_virial) {
    1112           16 :       virial[0][0] = idx2(virial_, 0, 0);
    1113           16 :       virial[0][1] = idx2(virial_, 0, 1);
    1114           16 :       virial[0][2] = idx2(virial_, 0, 2);
    1115           16 :       virial[1][0] = idx2(virial_, 1, 0);
    1116           16 :       virial[1][1] = idx2(virial_, 1, 1);
    1117           16 :       virial[1][2] = idx2(virial_, 1, 2);
    1118           16 :       virial[2][0] = idx2(virial_, 2, 0);
    1119           16 :       virial[2][1] = idx2(virial_, 2, 1);
    1120           16 :       virial[2][2] = idx2(virial_, 2, 2);
    1121              :     }
    1122              : 
    1123           20 :     memcpy(forces[0], forces_.data, sizeof(double) * forces_.alloc_size_);
    1124           20 :     free(forces_.data);
    1125           20 :     free(virial_.data);
    1126              :   }
    1127              : 
    1128          158 :   free(ctx->scratch);
    1129          158 :   ctx->scratch = NULL;
    1130          158 : }
        

Generated by: LCOV version 2.0-1