LCOV - code coverage report
Current view: top level - src/grid/dgemm - grid_dgemm_integrate.c (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:85b8a9b) Lines: 68.5 % 463 317
Test Date: 2026-06-14 06:48:14 Functions: 75.0 % 12 9

            Line data    Source code
       1              : /*----------------------------------------------------------------------------*/
       2              : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3              : /*  Copyright 2000-2026 CP2K developers group <https://cp2k.org>              */
       4              : /*                                                                            */
       5              : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6              : /*----------------------------------------------------------------------------*/
       7              : 
       8              : #include <assert.h>
       9              : #include <limits.h>
      10              : #include <math.h>
      11              : #include <stdbool.h>
      12              : #include <stdio.h>
      13              : #include <stdlib.h>
      14              : #include <string.h>
      15              : #include <unistd.h>
      16              : 
      17              : #include <omp.h>
      18              : 
      19              : #include "../common/grid_common.h"
      20              : #include "grid_dgemm_coefficients.h"
      21              : #include "grid_dgemm_collocate.h"
      22              : #include "grid_dgemm_collocation_integration.h"
      23              : #include "grid_dgemm_non_orthorombic_corrections.h"
      24              : #include "grid_dgemm_private_header.h"
      25              : #include "grid_dgemm_tensor_local.h"
      26              : #include "grid_dgemm_utils.h"
      27              : 
      28            0 : void extract_cube_within_spherical_cutoff_ortho(
      29              :     struct collocation_integration_ *const handler, const double disr_radius,
      30              :     const int cmax, const int *const lb_cube, const int *const cube_center) {
      31              :   // a mapping so that the ig corresponds to the right grid point
      32            0 :   int **map = handler->map;
      33            0 :   map[1] = map[0] + 2 * cmax + 1;
      34            0 :   map[2] = map[1] + 2 * cmax + 1;
      35              :   // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
      36              : 
      37            0 :   for (int i = 0; i < 3; i++) {
      38            0 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
      39            0 :       map[i][ig] = modulo(cube_center[i] + lb_cube[i] + ig -
      40            0 :                               handler->grid.lower_corner[i],
      41              :                           handler->grid.full_size[i]);
      42              :     }
      43              :   }
      44              : 
      45            0 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
      46            0 :                             .xmax = handler->grid.window_size[0]};
      47            0 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
      48            0 :                             .xmax = handler->grid.window_size[1]};
      49            0 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
      50            0 :                             .xmax = handler->grid.window_size[2]};
      51              : 
      52            0 :   for (int kg = 0; kg < handler->cube.size[0]; kg++) {
      53            0 :     const int k = map[0][kg];
      54            0 :     const int kd =
      55            0 :         (2 * (kg + lb_cube[0]) - 1) / 2; // distance from center in grid points
      56            0 :     const double kr = kd * handler->dh[2][2]; // distance from center in a.u.
      57            0 :     const double kremain = disr_radius * disr_radius - kr * kr;
      58              : 
      59            0 :     if ((kremain >= 0.0) && is_point_in_interval(k, zwindow)) {
      60              : 
      61            0 :       const int jgmin = ceil(-1e-8 - sqrt(kremain) * handler->dh_inv[1][1]);
      62            0 :       for (int jg = jgmin; jg <= (1 - jgmin); jg++) {
      63            0 :         const int j = map[1][jg - lb_cube[1]];
      64            0 :         const double jr = ((2 * jg - 1) >> 1) *
      65            0 :                           handler->dh[1][1]; // distance from center in a.u.
      66            0 :         const double jremain = kremain - jr * jr;
      67              : 
      68            0 :         if ((jremain >= 0.0) && is_point_in_interval(j, ywindow)) {
      69            0 :           const int xmin = ceil(-1e-8 - sqrt(jremain) * handler->dh_inv[0][0]);
      70            0 :           const int xmax = 1 - xmin;
      71              : 
      72              :           // printf("xmin %d, xmax %d\n", xmin, xmax);
      73            0 :           for (int x = xmin - lb_cube[2];
      74            0 :                x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
      75            0 :             const int x1 = map[2][x];
      76              : 
      77            0 :             if (!is_point_in_interval(x1, xwindow))
      78            0 :               continue;
      79              : 
      80            0 :             int lower_corner[3] = {k, j, x1};
      81            0 :             int upper_corner[3] = {k + 1, j + 1, x1 + 1};
      82              : 
      83            0 :             compute_interval(map[2], handler->grid.full_size[2],
      84            0 :                              handler->grid.size[2], handler->cube.size[2], x1,
      85              :                              &x, lower_corner + 2, upper_corner + 2, xwindow);
      86              : 
      87            0 :             if (upper_corner[2] - lower_corner[2]) {
      88            0 :               const int position1[3] = {kg, jg - lb_cube[1], x};
      89              : 
      90              :               /* the function will internally take care of the local vs global
      91              :                * grid */
      92              : 
      93            0 :               double *restrict dst = &idx3(handler->cube, position1[0],
      94              :                                            position1[1], position1[2]);
      95              : 
      96            0 :               double *restrict src = &idx3(handler->grid, lower_corner[0],
      97              :                                            lower_corner[1], lower_corner[2]);
      98              : 
      99            0 :               const int sizex = upper_corner[2] - lower_corner[2];
     100            0 :               GRID_PRAGMA_SIMD((dst, src), 8)
     101            0 :               for (int x = 0; x < sizex; x++) {
     102            0 :                 dst[x] = src[x];
     103              :               }
     104              :             }
     105              : 
     106            0 :             if (handler->grid.size[2] == handler->grid.full_size[2])
     107            0 :               update_loop_index(handler->grid.full_size[2], x1, &x);
     108              :             else
     109            0 :               x += upper_corner[2] - lower_corner[2] - 1;
     110              :           }
     111              :         }
     112              :       }
     113              :     }
     114              :   }
     115            0 : }
     116              : 
     117            0 : void extract_cube_within_spherical_cutoff_generic(
     118              :     struct collocation_integration_ *const handler, const double disr_radius,
     119              :     const int cmax, const int *const lb_cube, const int *const ub_cube,
     120              :     const double *const roffset, const int *const cube_center) {
     121              : 
     122            0 :   const double a = handler->dh[0][0] * handler->dh[0][0] +
     123            0 :                    handler->dh[0][1] * handler->dh[0][1] +
     124            0 :                    handler->dh[0][2] * handler->dh[0][2];
     125            0 :   const double a_inv = 1.0 / a;
     126              :   // a mapping so that the ig corresponds to the right grid point
     127            0 :   int **map = handler->map;
     128            0 :   map[1] = map[0] + 2 * cmax + 1;
     129            0 :   map[2] = map[1] + 2 * cmax + 1;
     130              :   // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
     131              : 
     132            0 :   for (int i = 0; i < 3; i++) {
     133            0 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     134            0 :       map[i][ig] = modulo(cube_center[i] + ig + lb_cube[i] -
     135            0 :                               handler->grid.lower_corner[i],
     136              :                           handler->grid.full_size[i]);
     137              :     }
     138              :   }
     139              : 
     140            0 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     141            0 :                             .xmax = handler->grid.window_size[0]};
     142            0 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     143            0 :                             .xmax = handler->grid.window_size[1]};
     144            0 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     145            0 :                             .xmax = handler->grid.window_size[2]};
     146              : 
     147            0 :   for (int k = 0; k < handler->cube.size[0]; k++) {
     148            0 :     const int iz = map[0][k];
     149              : 
     150            0 :     if (!is_point_in_interval(iz, zwindow))
     151            0 :       continue;
     152              : 
     153            0 :     const double tz = (k + lb_cube[0] - roffset[0]);
     154            0 :     const double z[3] = {tz * handler->dh[2][0], tz * handler->dh[2][1],
     155            0 :                          tz * handler->dh[2][2]};
     156              : 
     157            0 :     for (int j = 0; j < handler->cube.size[1]; j++) {
     158            0 :       const int iy = map[1][j];
     159              : 
     160            0 :       if (!is_point_in_interval(iy, ywindow))
     161            0 :         continue;
     162              : 
     163            0 :       const double ty = (j - roffset[1] + lb_cube[1]);
     164            0 :       const double y[3] = {z[0] + ty * handler->dh[1][0],
     165            0 :                            z[1] + ty * handler->dh[1][1],
     166            0 :                            z[2] + ty * handler->dh[1][2]};
     167              : 
     168              :       /* Sqrt[(-2 x1 \[Alpha] - 2 y1 \[Beta] - 2 z1 \[Gamma])^2 - */
     169              :       /*                                            4 (x1^2 + y1^2 + z1^2)
     170              :        * (\[Alpha]^2 + \[Beta]^2 + \[Gamma]^2)] */
     171              : 
     172            0 :       const double b =
     173            0 :           -2.0 * (handler->dh[0][0] * (roffset[2] * handler->dh[0][0] - y[0]) +
     174            0 :                   handler->dh[0][1] * (roffset[2] * handler->dh[0][1] - y[1]) +
     175            0 :                   handler->dh[0][2] * (roffset[2] * handler->dh[0][2] - y[2]));
     176              : 
     177            0 :       const double c = (roffset[2] * handler->dh[0][0] - y[0]) *
     178            0 :                            (roffset[2] * handler->dh[0][0] - y[0]) +
     179            0 :                        (roffset[2] * handler->dh[0][1] - y[1]) *
     180            0 :                            (roffset[2] * handler->dh[0][1] - y[1]) +
     181            0 :                        (roffset[2] * handler->dh[0][2] - y[2]) *
     182              :                            (roffset[2] * handler->dh[0][2] - y[2]) -
     183            0 :                        disr_radius * disr_radius;
     184              : 
     185            0 :       double delta = b * b - 4.0 * a * c;
     186              : 
     187            0 :       if (delta < 0.0)
     188            0 :         continue;
     189              : 
     190            0 :       delta = sqrt(delta);
     191              : 
     192            0 :       const int xmin = imax(ceil((-b - delta) * 0.5 * a_inv), lb_cube[2]);
     193            0 :       const int xmax = imin(floor((-b + delta) * 0.5 * a_inv), ub_cube[2]);
     194              : 
     195            0 :       int lower_corner[3] = {iz, iy, xmin};
     196            0 :       int upper_corner[3] = {iz + 1, iy + 1, xmin};
     197              : 
     198            0 :       for (int x = xmin - lb_cube[2];
     199            0 :            x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
     200            0 :         const int x1 = map[2][x];
     201              : 
     202            0 :         if (!is_point_in_interval(x1, xwindow))
     203            0 :           continue;
     204              : 
     205            0 :         compute_interval(map[2], handler->grid.full_size[2],
     206            0 :                          handler->grid.size[2], handler->cube.size[2], x1, &x,
     207              :                          lower_corner + 2, upper_corner + 2, xwindow);
     208              : 
     209            0 :         if (upper_corner[2] - lower_corner[2]) {
     210            0 :           const int position1[3] = {k, j, x};
     211              : 
     212              :           /* the function will internally take care of the local vs global
     213              :            * grid */
     214              : 
     215            0 :           double *__restrict__ src = &idx3(handler->grid, lower_corner[0],
     216              :                                            lower_corner[1], lower_corner[2]);
     217            0 :           double *__restrict__ dst =
     218            0 :               &idx3(handler->cube, position1[0], position1[1], position1[2]);
     219              : 
     220            0 :           const int sizex = upper_corner[2] - lower_corner[2];
     221              : 
     222              :           // #pragma omp simd linear(dst, src) simdlen(8)
     223            0 :           GRID_PRAGMA_SIMD((dst, src), 8)
     224            0 :           for (int x = 0; x < sizex; x++) {
     225            0 :             dst[x] = src[x];
     226              :           }
     227              : 
     228            0 :           if (handler->grid.size[2] == handler->grid.full_size[2])
     229            0 :             update_loop_index(handler->grid.full_size[2], x1, &x);
     230              :           else
     231            0 :             x += upper_corner[2] - lower_corner[2] - 1;
     232              :         }
     233              :       }
     234              :     }
     235              :   }
     236            0 : }
     237              : 
     238         5078 : static void rotate_and_store_coefficients(grid_context *const ctx,
     239              :                                           const _task *prev_task,
     240              :                                           const _task *task, tensor *const hab,
     241              :                                           tensor *work, // some scratch matrix
     242              :                                           double *blocks) {
     243         5078 :   if (prev_task != NULL) {
     244              :     /* prev_task is NULL when we deal with the first iteration. In that case
     245              :      * we just need to initialize the hab matrix to the proper size. Note
     246              :      * that resizing does not really occurs since memory allocation is done
     247              :      * for the maximum size the matrix can have. */
     248         4482 :     const int iatom = prev_task->iatom;
     249         4482 :     const int jatom = prev_task->jatom;
     250         4482 :     const int iset = prev_task->iset;
     251         4482 :     const int jset = prev_task->jset;
     252         4482 :     const int ikind = ctx->atom_kinds[iatom];
     253         4482 :     const int jkind = ctx->atom_kinds[jatom];
     254         4482 :     const grid_basis_set *ibasis = ctx->basis_sets[ikind];
     255         4482 :     const grid_basis_set *jbasis = ctx->basis_sets[jkind];
     256              : 
     257         4482 :     const int block_num = prev_task->block_num;
     258         4482 :     double *const block = &blocks[ctx->block_offsets[block_num]];
     259              : 
     260         4482 :     const int ncoseta = ncoset(ibasis->lmax[iset]);
     261         4482 :     const int ncosetb = ncoset(jbasis->lmax[jset]);
     262              : 
     263         4482 :     const int ncoa = ibasis->npgf[iset] * ncoseta;
     264         4482 :     const int ncob = jbasis->npgf[jset] * ncosetb;
     265              : 
     266         4482 :     const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set */
     267         4482 :     const int nsgf_setb = jbasis->nsgf_set[jset];
     268         4482 :     const int nsgfa = ibasis->nsgf; // size of entire spherical basis
     269         4482 :     const int nsgfb = jbasis->nsgf;
     270         4482 :     const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
     271         4482 :     const int sgfb = jbasis->first_sgf[jset] - 1;
     272         4482 :     const int maxcoa = ibasis->maxco;
     273         4482 :     const int maxcob = jbasis->maxco;
     274              : 
     275         4482 :     initialize_tensor_2(work, nsgf_setb, ncoa);
     276         4482 :     realloc_tensor(work);
     277              : 
     278              :     // Warning these matrices are row major....
     279              : 
     280         4482 :     dgemm_params m1, m2;
     281         4482 :     memset(&m1, 0, sizeof(dgemm_params));
     282         4482 :     memset(&m2, 0, sizeof(dgemm_params));
     283              : 
     284         4482 :     m1.op1 = 'N';
     285         4482 :     m1.op2 = 'N';
     286         4482 :     m1.m = nsgf_setb;
     287         4482 :     m1.n = ncoa;
     288         4482 :     m1.k = ncob;
     289         4482 :     m1.alpha = 1.0;
     290         4482 :     m1.beta = 0.0;
     291         4482 :     m1.a = &jbasis->sphi[sgfb * maxcob];
     292         4482 :     m1.lda = maxcob;
     293         4482 :     m1.b = hab->data;
     294         4482 :     m1.ldb = ncoa;
     295         4482 :     m1.c = work->data;
     296         4482 :     m1.ldc = work->ld_;
     297              : 
     298              :     // phi[b][ncob]
     299              :     // work[b][ncoa] = phi[b][ncob] * hab[ncob][ncoa]
     300              : 
     301         4482 :     if (iatom <= jatom) {
     302              :       // I need to have the final result in the form
     303              : 
     304              :       // block[b][a] = work[b][ncoa] transpose(phi[a][ncoa])
     305         2982 :       m2.op1 = 'N';
     306         2982 :       m2.op2 = 'T';
     307         2982 :       m2.m = nsgf_setb;
     308         2982 :       m2.n = nsgf_seta;
     309         2982 :       m2.k = ncoa;
     310         2982 :       m2.a = work->data;
     311         2982 :       m2.lda = work->ld_;
     312         2982 :       m2.b = &ibasis->sphi[sgfa * maxcoa];
     313         2982 :       m2.ldb = maxcoa;
     314         2982 :       m2.c = block + sgfb * nsgfa + sgfa;
     315         2982 :       m2.ldc = nsgfa;
     316         2982 :       m2.alpha = 1.0;
     317         2982 :       m2.beta = 1.0;
     318              :     } else {
     319              :       // block[a][b] = phi[a][ncoa] Transpose(work[b][ncoa])
     320         1500 :       m2.op1 = 'N';
     321         1500 :       m2.op2 = 'T';
     322         1500 :       m2.m = nsgf_seta;
     323         1500 :       m2.n = nsgf_setb;
     324         1500 :       m2.k = ncoa;
     325         1500 :       m2.a = &ibasis->sphi[sgfa * maxcoa];
     326         1500 :       m2.lda = maxcoa;
     327         1500 :       m2.b = work->data;
     328         1500 :       m2.ldb = work->ld_;
     329         1500 :       m2.c = block + sgfa * nsgfb + sgfb;
     330         1500 :       m2.ldc = nsgfb;
     331         1500 :       m2.alpha = 1.0;
     332         1500 :       m2.beta = 1.0;
     333              :     }
     334              : 
     335              :     /* these dgemm are *row* major */
     336         4482 :     dgemm_simplified(&m1);
     337         4482 :     dgemm_simplified(&m2);
     338              :   }
     339              : 
     340         5078 :   if (task != NULL) {
     341         4482 :     const int iatom = task->iatom;
     342         4482 :     const int jatom = task->jatom;
     343         4482 :     const int ikind = ctx->atom_kinds[iatom];
     344         4482 :     const int jkind = ctx->atom_kinds[jatom];
     345         4482 :     const int iset = task->iset;
     346         4482 :     const int jset = task->jset;
     347         4482 :     const grid_basis_set *ibasis = ctx->basis_sets[ikind];
     348         4482 :     const grid_basis_set *jbasis = ctx->basis_sets[jkind];
     349         4482 :     const int ncoseta = ncoset(ibasis->lmax[iset]);
     350         4482 :     const int ncosetb = ncoset(jbasis->lmax[jset]);
     351              : 
     352         4482 :     const int ncoa = ibasis->npgf[iset] * ncoseta;
     353         4482 :     const int ncob = jbasis->npgf[jset] * ncosetb;
     354              : 
     355         4482 :     initialize_tensor_2(hab, ncob, ncoa);
     356         4482 :     realloc_tensor(hab);
     357              :   }
     358         5078 : }
     359              : 
     360        49036 : void update_force_pair(orbital a, orbital b, const double pab,
     361              :                        const double ftz[2], const double *const rab,
     362              :                        const tensor *const vab, tensor *force) {
     363        49036 :   const double axpm0 = idx2(vab[0], idx(b), idx(a));
     364       196144 :   for (int i = 0; i < 3; i++) {
     365       147108 :     const double aip1 = idx2(vab[0], idx(b), idx(up(i, a)));
     366       147108 :     const double aim1 = idx2(vab[0], idx(b), idx(down(i, a)));
     367       147108 :     const double bim1 = idx2(vab[0], idx(down(i, b)), idx(a));
     368       147108 :     idx2(force[0], 0, i) += pab * (ftz[0] * aip1 - a.l[i] * aim1);
     369       147108 :     idx2(force[0], 1, i) +=
     370       147108 :         pab * (ftz[1] * (aip1 - rab[i] * axpm0) - b.l[i] * bim1);
     371              :   }
     372        49036 : }
     373              : 
     374        34952 : void update_virial_pair(orbital a, orbital b, const double pab,
     375              :                         const double ftz[2], const double *const rab,
     376              :                         const tensor *const vab, tensor *virial) {
     377       139808 :   for (int i = 0; i < 3; i++) {
     378       419424 :     for (int j = 0; j < 3; j++) {
     379       314568 :       idx3(virial[0], 0, i, j) +=
     380       314568 :           pab * ftz[0] * idx2(vab[0], idx(b), idx(up(i, up(j, a)))) -
     381       314568 :           pab * a.l[j] * idx2(vab[0], idx(b), idx(up(i, down(j, a))));
     382              :     }
     383              :   }
     384              : 
     385       139808 :   for (int i = 0; i < 3; i++) {
     386       419424 :     for (int j = 0; j < 3; j++) {
     387       314568 :       idx3(virial[0], 1, i, j) +=
     388       314568 :           pab * ftz[1] *
     389       314568 :               (idx2(vab[0], idx(b), idx(up(i, up(j, a)))) -
     390       314568 :                idx2(vab[0], idx(b), idx(up(i, a))) * rab[j] -
     391       314568 :                idx2(vab[0], idx(b), idx(up(j, a))) * rab[i] +
     392       314568 :                idx2(vab[0], idx(b), idx(a)) * rab[j] * rab[i]) -
     393       314568 :           pab * b.l[j] * idx2(vab[0], idx(up(i, down(j, b))), idx(a));
     394              :     }
     395              :   }
     396        34952 : }
     397              : 
     398       352870 : void update_all(const bool compute_forces, const bool compute_virial,
     399              :                 const orbital a, const orbital b, const double f,
     400              :                 const double *const ftz, const double *rab, const tensor *vab,
     401              :                 const double pab, double *hab, tensor *forces,
     402              :                 tensor *virials) {
     403              : 
     404       352870 :   *hab += f * idx2(vab[0], idx(b), idx(a));
     405              : 
     406       352870 :   if (compute_forces) {
     407        49036 :     update_force_pair(a, b, f * pab, ftz, rab, vab, forces);
     408              :   }
     409              : 
     410       352870 :   if (compute_virial) {
     411        34952 :     update_virial_pair(a, b, f * pab, ftz, rab, vab, virials);
     412              :   }
     413       352870 : }
     414              : 
     415            0 : static void update_tau(const bool compute_forces, const bool compute_virial,
     416              :                        const orbital a, const orbital b, const double ftz[2],
     417              :                        const double *const rab, const tensor *const vab,
     418              :                        const double pab, double *const hab, tensor *forces,
     419              :                        tensor *virials) {
     420              : 
     421            0 :   for (int i = 0; i < 3; i++) {
     422            0 :     update_all(compute_forces, compute_virial, down(i, a), down(i, b),
     423            0 :                0.5 * a.l[i] * b.l[i], ftz, rab, vab, pab, hab, forces, virials);
     424            0 :     update_all(compute_forces, compute_virial, up(i, a), down(i, b),
     425            0 :                -0.5 * ftz[0] * b.l[i], ftz, rab, vab, pab, hab, forces,
     426              :                virials);
     427            0 :     update_all(compute_forces, compute_virial, down(i, a), up(i, b),
     428            0 :                -0.5 * a.l[i] * ftz[1], ftz, rab, vab, pab, hab, forces,
     429              :                virials);
     430            0 :     update_all(compute_forces, compute_virial, up(i, a), up(i, b),
     431            0 :                0.5 * ftz[0] * ftz[1], ftz, rab, vab, pab, hab, forces, virials);
     432              :   }
     433            0 : }
     434              : 
     435              : static void
     436        44389 : update_hab_forces_and_stress(const _task *task, const tensor *const vab,
     437              :                              const tensor *const pab, const bool compute_tau,
     438              :                              const bool compute_forces,
     439              :                              const bool compute_virial, tensor *const forces,
     440              :                              tensor *const virial, tensor *const hab) {
     441        44389 :   double zeta[2] = {task->zeta[0] * 2.0, task->zeta[1] * 2.0};
     442       104127 :   for (int lb = task->lmin[1]; lb <= task->lmax[1]; lb++) {
     443       142989 :     for (int la = task->lmin[0]; la <= task->lmax[0]; la++) {
     444       207817 :       for (int bx = 0; bx <= lb; bx++) {
     445       295454 :         for (int by = 0; by <= lb - bx; by++) {
     446       170888 :           const int bz = lb - bx - by;
     447       170888 :           const orbital b = {{bx, by, bz}};
     448       170888 :           const int idx_b = task->offset[1] + idx(b);
     449       427643 :           for (int ax = 0; ax <= la; ax++) {
     450       609625 :             for (int ay = 0; ay <= la - ax; ay++) {
     451       352870 :               const int az = la - ax - ay;
     452       352870 :               const orbital a = {{ax, ay, az}};
     453       352870 :               const int idx_a = task->offset[0] + idx(a);
     454       352870 :               double *habval = &idx2(hab[0], idx_b, idx_a);
     455       352870 :               const double prefactor = idx2(pab[0], idx_b, idx_a);
     456              : 
     457              :               // now compute the forces
     458       352870 :               if (compute_tau) {
     459            0 :                 update_tau(compute_forces, compute_virial, a, b, zeta,
     460            0 :                            task->rab, vab, prefactor, habval, forces, virial);
     461              :               } else {
     462       352870 :                 update_all(compute_forces, compute_virial, a, b, 1.0, zeta,
     463       352870 :                            task->rab, vab, prefactor, habval, forces, virial);
     464              :               }
     465              :             }
     466              :           }
     467              :         }
     468              :       }
     469              :     }
     470              :   }
     471        44389 : }
     472              : 
     473              : /* It is a sub-optimal version of the mapping in case of a cubic cutoff. But is
     474              :  * very general and does not depend on the orthorombic nature of the grid. for
     475              :  * orthorombic cases, it is faster to apply PCB directly on the polynomials. */
     476              : 
     477        44389 : void extract_cube(struct collocation_integration_ *handler, const int cmax,
     478              :                   const int *lower_boundaries_cube, const int *cube_center) {
     479              : 
     480              :   // a mapping so that the ig corresponds to the right grid point
     481        44389 :   int **map = handler->map;
     482        44389 :   map[1] = map[0] + 2 * cmax + 1;
     483        44389 :   map[2] = map[1] + 2 * cmax + 1;
     484              :   //  memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
     485       177556 :   for (int i = 0; i < 3; i++) {
     486      3366244 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     487      3233077 :       map[i][ig] = modulo(cube_center[i] + ig + lower_boundaries_cube[i] -
     488      3233077 :                               handler->grid.lower_corner[i],
     489              :                           handler->grid.full_size[i]);
     490              :     }
     491              :   }
     492              : 
     493        44389 :   int lower_corner[3];
     494        44389 :   int upper_corner[3];
     495              : 
     496        44389 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     497        44389 :                             .xmax = handler->grid.window_size[0]};
     498        44389 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     499        44389 :                             .xmax = handler->grid.window_size[1]};
     500        44389 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     501        44389 :                             .xmax = handler->grid.window_size[2]};
     502              : 
     503       130515 :   for (int z = 0; (z < handler->cube.size[0]); z++) {
     504        86126 :     const int z1 = map[0][z];
     505              : 
     506        86126 :     if (!is_point_in_interval(z1, zwindow))
     507            0 :       continue;
     508              : 
     509        86126 :     compute_interval(map[0], handler->grid.full_size[0], handler->grid.size[0],
     510              :                      handler->cube.size[0], z1, &z, lower_corner, upper_corner,
     511              :                      zwindow);
     512              : 
     513              :     /* // We have a full plane. */
     514        86126 :     if (upper_corner[0] - lower_corner[0]) {
     515       258230 :       for (int y = 0; y < handler->cube.size[1]; y++) {
     516       172104 :         const int y1 = map[1][y];
     517              : 
     518              :         // this check is completely irrelevant when running without MPI.
     519       172104 :         if (!is_point_in_interval(y1, ywindow))
     520            0 :           continue;
     521              : 
     522       172104 :         compute_interval(map[1], handler->grid.full_size[1],
     523       172104 :                          handler->grid.size[1], handler->cube.size[1], y1, &y,
     524              :                          lower_corner + 1, upper_corner + 1, ywindow);
     525              : 
     526       172104 :         if (upper_corner[1] - lower_corner[1]) {
     527       552924 :           for (int x = 0; x < handler->cube.size[2]; x++) {
     528       380820 :             const int x1 = map[2][x];
     529              : 
     530       380820 :             if (!is_point_in_interval(x1, xwindow))
     531            0 :               continue;
     532              : 
     533       380820 :             compute_interval(map[2], handler->grid.full_size[2],
     534       380820 :                              handler->grid.size[2], handler->cube.size[2], x1,
     535              :                              &x, lower_corner + 2, upper_corner + 2, xwindow);
     536              : 
     537       380820 :             if (upper_corner[2] - lower_corner[2]) { // should be non zero
     538       380820 :               const int position1[3] = {z, y, x};
     539              : 
     540              :               /* the function will internally take care of the local vx global
     541              :                * grid */
     542       380820 :               extract_sub_grid(
     543              :                   lower_corner, // lower corner of the portion of cube (in the
     544              :                   // full grid)
     545              :                   upper_corner, // upper corner of the portion of cube (in the
     546              :                   // full grid)
     547              :                   position1, // starting position subblock inside the cube
     548       380820 :                   &handler->grid,
     549       380820 :                   &handler->cube); // the grid to add data from
     550              : 
     551       380820 :               if (handler->grid.size[2] == handler->grid.full_size[2])
     552       380820 :                 update_loop_index(handler->grid.full_size[2], x1, &x);
     553              :               else
     554            0 :                 x += upper_corner[2] - lower_corner[2] - 1;
     555              :             }
     556              :           }
     557       172104 :           if (handler->grid.size[1] == handler->grid.full_size[1])
     558       172104 :             update_loop_index(handler->grid.full_size[1], y1, &y);
     559              :           else
     560            0 :             y += upper_corner[1] - lower_corner[1] - 1;
     561              :         }
     562              :       }
     563        86126 :       if (handler->grid.size[0] == handler->grid.full_size[0])
     564        86126 :         update_loop_index(handler->grid.full_size[0], z1, &z);
     565              :       else
     566            0 :         z += upper_corner[0] - lower_corner[0] - 1;
     567              :     }
     568              :   }
     569        44389 : }
     570              : 
     571        44389 : void grid_integrate(collocation_integration *const handler,
     572              :                     const bool use_ortho, const double zetp, const double rp[3],
     573              :                     const double radius) {
     574        44389 :   if (handler == NULL)
     575            0 :     abort();
     576              : 
     577        44389 :   const int lp = handler->coef.size[0] - 1;
     578              : 
     579        44389 :   int cubecenter[3];
     580        44389 :   int cube_size[3];
     581        44389 :   int lb_cube[3], ub_cube[3];
     582        44389 :   double roffset[3];
     583        44389 :   double disr_radius;
     584              : 
     585              :   /* cube : grid comtaining pointlike product between polynomials
     586              :    *
     587              :    * pol : grid  containing the polynomials in all three directions
     588              :    */
     589              : 
     590              :   /* seting up the cube parameters */
     591        88778 :   int cmax = compute_cube_properties(
     592        44389 :       use_ortho, radius, (const double(*)[3])handler->dh,
     593        44389 :       (const double(*)[3])handler->dh_inv, rp, &disr_radius, roffset,
     594              :       cubecenter, lb_cube, ub_cube, cube_size);
     595              : 
     596              :   /* initialize the multidimensional array containing the polynomials */
     597        44389 :   if (lp != 0) {
     598        35109 :     initialize_tensor_3(&handler->pol, 3, cmax, handler->coef.size[0]);
     599              :   } else {
     600         9280 :     initialize_tensor_3(&handler->pol, 3, handler->coef.size[0], cmax);
     601              :   }
     602        44389 :   handler->pol_alloc_size = realloc_tensor(&handler->pol);
     603              : 
     604              :   /* allocate memory for the polynomial and the cube */
     605              : 
     606        44389 :   initialize_tensor_3(&handler->cube, cube_size[0], cube_size[1], cube_size[2]);
     607              : 
     608        44389 :   handler->cube_alloc_size = realloc_tensor(&handler->cube);
     609              : 
     610        44389 :   initialize_W_and_T(handler, &handler->coef, &handler->cube);
     611              : 
     612              :   /* compute the polynomials */
     613              : 
     614              :   // WARNING : do not reverse the order in pol otherwise you will have to
     615              :   // reverse the order in collocate_dgemm as well.
     616              : 
     617              :   /* the tensor contraction is done for a given order so I have to be careful
     618              :    * how the tensors X, Y, Z are stored. In collocate, they are stored
     619              :    * normally 0 (Z), (1) Y, (2) X in the table pol. but in integrate (which
     620              :    * uses the same tensor reduction), I have a special treatment for l = 0. In
     621              :    * that case the order *should* be the same than for collocate. For l > 0,
     622              :    * we need a different storage order which is (X) 2, (Y) 0, and (Z) 1.
     623              :    *
     624              :    * the reason for this is that the cube is stored as cube[z][y][x] so the
     625              :    * first direction taken for the dgemm is along x.
     626              :    */
     627              : 
     628        44389 :   int perm[3] = {2, 0, 1};
     629              : 
     630        44389 :   if (lp == 0) {
     631              :     /* I need to restore the same order than for collocate */
     632         9280 :     perm[0] = 0;
     633         9280 :     perm[1] = 1;
     634         9280 :     perm[2] = 2;
     635              :   }
     636              : 
     637        44389 :   bool use_ortho_forced = handler->orthogonal[0] && handler->orthogonal[1] &&
     638        24226 :                           handler->orthogonal[2];
     639        44389 :   if (use_ortho) {
     640        21114 :     grid_fill_pol_dgemm((lp != 0), handler->dh[0][0], roffset[2], 0, lb_cube[2],
     641        21114 :                         ub_cube[2], lp, cmax, zetp,
     642        21114 :                         &idx3(handler->pol, perm[2], 0, 0)); /* i indice */
     643        21114 :     grid_fill_pol_dgemm((lp != 0), handler->dh[1][1], roffset[1], 0, lb_cube[1],
     644        21114 :                         ub_cube[1], lp, cmax, zetp,
     645        21114 :                         &idx3(handler->pol, perm[1], 0, 0)); /* j indice */
     646        21114 :     grid_fill_pol_dgemm((lp != 0), handler->dh[2][2], roffset[0], 0, lb_cube[0],
     647        21114 :                         ub_cube[0], lp, cmax, zetp,
     648        21114 :                         &idx3(handler->pol, perm[0], 0, 0)); /* k indice */
     649              :   } else {
     650        23275 :     double dx[3];
     651              : 
     652        23275 :     dx[2] = handler->dh[0][0] * handler->dh[0][0] +
     653        23275 :             handler->dh[0][1] * handler->dh[0][1] +
     654        23275 :             handler->dh[0][2] * handler->dh[0][2];
     655              : 
     656        23275 :     dx[1] = handler->dh[1][0] * handler->dh[1][0] +
     657        23275 :             handler->dh[1][1] * handler->dh[1][1] +
     658        23275 :             handler->dh[1][2] * handler->dh[1][2];
     659              : 
     660        23275 :     dx[0] = handler->dh[2][0] * handler->dh[2][0] +
     661        23275 :             handler->dh[2][1] * handler->dh[2][1] +
     662        23275 :             handler->dh[2][2] * handler->dh[2][2];
     663              : 
     664        23275 :     grid_fill_pol_dgemm((lp != 0), 1.0, roffset[2], 0, lb_cube[2], ub_cube[2],
     665        23275 :                         lp, cmax, zetp * dx[2],
     666        23275 :                         &idx3(handler->pol, perm[2], 0, 0)); /* i indice */
     667        23275 :     grid_fill_pol_dgemm((lp != 0), 1.0, roffset[1], 0, lb_cube[1], ub_cube[1],
     668        23275 :                         lp, cmax, zetp * dx[1],
     669        23275 :                         &idx3(handler->pol, perm[1], 0, 0)); /* j indice */
     670        23275 :     grid_fill_pol_dgemm((lp != 0), 1.0, roffset[0], 0, lb_cube[0], ub_cube[0],
     671        23275 :                         lp, cmax, zetp * dx[0],
     672        23275 :                         &idx3(handler->pol, perm[0], 0, 0)); /* k indice */
     673              : 
     674              :     /* the three remaining tensors are initialized in the function */
     675        23275 :     calculate_non_orthorombic_corrections_tensor(
     676              :         zetp, roffset, (const double(*)[3])handler->dh, lb_cube, ub_cube,
     677        23275 :         handler->orthogonal, &handler->Exp);
     678              :   }
     679              : 
     680        44389 :   if (handler->apply_cutoff) {
     681            0 :     memset(handler->cube.data, 0, sizeof(double) * handler->cube.alloc_size_);
     682            0 :     if (!use_ortho && !use_ortho_forced) {
     683            0 :       extract_cube_within_spherical_cutoff_generic(
     684              :           handler, disr_radius, cmax, lb_cube, ub_cube, roffset, cubecenter);
     685              :     } else {
     686            0 :       extract_cube_within_spherical_cutoff_ortho(handler, disr_radius, cmax,
     687              :                                                  lb_cube, cubecenter);
     688              :     }
     689              :   } else {
     690        44389 :     extract_cube(handler, cmax, lb_cube, cubecenter);
     691              :   }
     692              : 
     693        44389 :   if (!use_ortho && !use_ortho_forced)
     694        23275 :     apply_non_orthorombic_corrections(handler->orthogonal, &handler->Exp,
     695              :                                       &handler->cube);
     696              : 
     697        44389 :   if (lp != 0) {
     698        35109 :     tensor_reduction_for_collocate_integrate(handler->scratch,
     699              :                                              // pointer to scratch memory
     700        35109 :                                              1.0, handler->orthogonal, NULL,
     701              :                                              &handler->cube, &handler->pol,
     702              :                                              &handler->coef);
     703              :   } else {
     704              :     /* it is very specific to integrate because we might end up with a
     705              :      * single element after the tensor product/contraction. In that case, I
     706              :      * compute the cube and then do a scalar product between the two. */
     707              : 
     708              :     /* we could also do this with 2 matrix-vector multiplications and a scalar
     709              :      * product
     710              :      *
     711              :      * H_{jk} = C_{ijk} . P_i (along x) C_{ijk} is *stored C[k][j][i]* !!!!!!
     712              :      * L_{k} = H_{jk} . P_j (along y)
     713              :      * v_{ab} = L_k . P_k (along z)
     714              :      */
     715              : 
     716         9280 :     tensor cube_tmp;
     717         9280 :     initialize_tensor_2(&cube_tmp, cube_size[0], cube_size[1]);
     718         9280 :     alloc_tensor(&cube_tmp);
     719              : 
     720              :     /* first along x */
     721         9280 :     cblas_dgemv(CblasRowMajor, CblasNoTrans,
     722         9280 :                 handler->cube.size[0] * handler->cube.size[1],
     723         9280 :                 handler->cube.size[2], 1.0, handler->cube.data,
     724         9280 :                 handler->cube.ld_, &idx3(handler->pol, 2, 0, 0), 1, 0.0,
     725              :                 cube_tmp.data, 1);
     726              : 
     727              :     /* second along j */
     728         9280 :     cblas_dgemv(CblasRowMajor, CblasNoTrans, handler->cube.size[0],
     729         9280 :                 handler->cube.size[1], 1.0, cube_tmp.data, cube_tmp.ld_,
     730         9280 :                 &idx3(handler->pol, 1, 0, 0), 1, 0.0, handler->scratch, 1);
     731              : 
     732              :     /* finally along k, it is a scalar product.... */
     733        18560 :     handler->coef.data[0] = cblas_ddot(handler->cube.size[0], handler->scratch,
     734         9280 :                                        1, &idx3(handler->pol, 0, 0, 0), 1);
     735              : 
     736         9280 :     free(cube_tmp.data);
     737              :   }
     738              : 
     739              :   /* go from ijk -> xyz */
     740        44389 :   if (!use_ortho)
     741        23275 :     grid_transform_coef_jik_to_yxz((const double(*)[3])handler->dh,
     742              :                                    &handler->coef);
     743        44389 : }
     744              : 
     745          596 : void integrate_one_grid_level_dgemm(
     746              :     grid_context *const ctx, const int level, const bool calculate_tau,
     747              :     const bool calculate_forces, const bool calculate_virial,
     748              :     const int *const shift_local, const int *const border_width,
     749              :     const offload_buffer *const pab_blocks, offload_buffer *const hab_blocks,
     750              :     tensor *forces_, tensor *virial_) {
     751          596 :   tensor *const grid = &ctx->grid[level];
     752              : 
     753              :   // Using default(shared) because with GCC 9 the behavior around const changed:
     754              :   // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
     755          596 : #pragma omp parallel default(shared)
     756              :   {
     757              :     const int num_threads = omp_get_num_threads();
     758              :     const int thread_id = omp_get_thread_num();
     759              : 
     760              :     double *hab_block_local = NULL;
     761              : 
     762              :     if (num_threads == 1) {
     763              :       hab_block_local = hab_blocks->host_buffer;
     764              :     } else {
     765              :       hab_block_local = ((double *)ctx->scratch) +
     766              :                         thread_id * (hab_blocks->size / sizeof(double));
     767              :       memset(hab_block_local, 0, hab_blocks->size);
     768              :     }
     769              : 
     770              :     tensor work, pab, hab, vab, forces_local_, virial_local_,
     771              :         forces_local_pair_, virial_local_pair_;
     772              :     memset(&work, 0, sizeof(tensor));
     773              :     memset(&pab, 0, sizeof(tensor));
     774              :     memset(&hab, 0, sizeof(tensor));
     775              :     memset(&vab, 0, sizeof(tensor));
     776              :     memset(&forces_local_, 0, sizeof(tensor));
     777              :     memset(&virial_local_, 0, sizeof(tensor));
     778              :     memset(&forces_local_pair_, 0, sizeof(tensor));
     779              :     memset(&virial_local_pair_, 0, sizeof(tensor));
     780              : 
     781              :     struct collocation_integration_ *handler = ctx->handler[thread_id];
     782              :     handler->apply_cutoff = ctx->apply_cutoff;
     783              :     handler->lmax_diff[0] = 0;
     784              :     handler->lmax_diff[1] = 0;
     785              :     handler->lmin_diff[0] = 0;
     786              :     handler->lmin_diff[1] = 0;
     787              : 
     788              :     if (calculate_tau || calculate_forces || calculate_virial) {
     789              :       handler->lmax_diff[0] = 1;
     790              :       handler->lmax_diff[1] = 0;
     791              :       handler->lmin_diff[0] = -1;
     792              :       handler->lmin_diff[1] = -1;
     793              :     }
     794              : 
     795              :     if (calculate_virial) {
     796              :       handler->lmax_diff[0]++;
     797              :       handler->lmax_diff[1]++;
     798              :     }
     799              : 
     800              :     if (calculate_tau) {
     801              :       handler->lmax_diff[0]++;
     802              :       handler->lmax_diff[1]++;
     803              :       handler->lmin_diff[0]--;
     804              :       handler->lmin_diff[1]--;
     805              :     }
     806              : 
     807              :     // Allocate pab matrix for re-use across tasks.
     808              :     initialize_tensor_2(&pab, ctx->maxco, ctx->maxco);
     809              :     alloc_tensor(&pab);
     810              : 
     811              :     initialize_tensor_2(&vab, ctx->maxco, ctx->maxco);
     812              :     alloc_tensor(&vab);
     813              : 
     814              :     initialize_tensor_2(&work, ctx->maxco, ctx->maxco);
     815              :     alloc_tensor(&work);
     816              : 
     817              :     initialize_tensor_2(&hab, ctx->maxco, ctx->maxco);
     818              :     alloc_tensor(&hab);
     819              : 
     820              :     if (calculate_forces) {
     821              :       initialize_tensor_2(&forces_local_, forces_->size[0], forces_->size[1]);
     822              :       alloc_tensor(&forces_local_);
     823              :       initialize_tensor_2(&virial_local_, 3, 3);
     824              :       alloc_tensor(&virial_local_);
     825              :       memset(forces_local_.data, 0, sizeof(double) * forces_local_.alloc_size_);
     826              :       memset(virial_local_.data, 0, sizeof(double) * virial_local_.alloc_size_);
     827              :       initialize_tensor_2(&forces_local_pair_, 2, 3);
     828              :       alloc_tensor(&forces_local_pair_);
     829              :       initialize_tensor_3(&virial_local_pair_, 2, 3, 3);
     830              :       alloc_tensor(&virial_local_pair_);
     831              :     }
     832              : 
     833              :     initialize_basis_vectors(handler, (const double(*)[3])grid->dh,
     834              :                              (const double(*)[3])grid->dh_inv);
     835              : 
     836              :     tensor_copy(&handler->grid, grid);
     837              :     handler->grid.data = grid->data;
     838              :     for (int d = 0; d < 3; d++)
     839              :       handler->orthogonal[d] = grid->orthogonal[d];
     840              : 
     841              :     /* it is only useful when we split the list over multiple threads. The
     842              :      * first iteration should load the block whatever status the
     843              :      * task->block_update_ variable has */
     844              :     const _task *prev_task = NULL;
     845              : #pragma omp for schedule(static)
     846              :     for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
     847              :       // Define some convenient aliases.
     848              :       const _task *task = &ctx->tasks[level][itask];
     849              : 
     850              :       if (task->level != level) {
     851              :         printf("level %d, %d\n", task->level, level);
     852              :         abort();
     853              :       }
     854              : 
     855              :       if (task->update_block_ || (prev_task == NULL)) {
     856              :         /* need to load pab if forces are needed */
     857              :         if (calculate_forces) {
     858              :           extract_blocks(ctx, task, pab_blocks, &work, &pab);
     859              :         }
     860              :         /* store the coefficients of the operator after rotation to
     861              :          * the spherical harmonic basis */
     862              : 
     863              :         rotate_and_store_coefficients(ctx, prev_task, task, &hab, &work,
     864              :                                       hab_block_local);
     865              : 
     866              :         /* There is a difference here between collocate and integrate.
     867              :          * For collocate, I do not need to know the task where blocks
     868              :          * have been updated the last time. For integrate this
     869              :          * information is crucial to update the coefficients of the
     870              :          * potential */
     871              :         prev_task = task;
     872              :         memset(hab.data, 0, sizeof(double) * hab.alloc_size_);
     873              :       }
     874              : 
     875              :       /* the grid is divided over several ranks or not periodic */
     876              :       if ((handler->grid.size[0] != handler->grid.full_size[0]) ||
     877              :           (handler->grid.size[1] != handler->grid.full_size[1]) ||
     878              :           (handler->grid.size[2] != handler->grid.full_size[2])) {
     879              :         /* unfortunately the window where the gaussian should be added depends
     880              :          * on the bonds. So I have to adjust the window all the time. */
     881              : 
     882              :         setup_grid_window(&handler->grid, shift_local, border_width,
     883              :                           task->border_mask);
     884              :       }
     885              : 
     886              :       int lmax[2] = {task->lmax[0] + handler->lmax_diff[0],
     887              :                      task->lmax[1] + handler->lmax_diff[1]};
     888              :       int lmin[2] = {task->lmin[0] + handler->lmin_diff[0],
     889              :                      task->lmin[1] + handler->lmin_diff[1]};
     890              : 
     891              :       lmin[0] = imax(lmin[0], 0);
     892              :       lmin[1] = imax(lmin[1], 0);
     893              : 
     894              :       initialize_tensor_4(&(handler->alpha), 3, lmax[1] + 1, lmax[0] + 1,
     895              :                           lmax[0] + lmax[1] + 1);
     896              :       realloc_tensor(&(handler->alpha));
     897              : 
     898              :       const int lp = lmax[0] + lmax[1];
     899              : 
     900              :       initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
     901              :       realloc_tensor(&(handler->coef));
     902              :       grid_integrate(handler, ctx->orthorhombic, task->zetp, task->rp,
     903              :                      task->radius);
     904              :       /*
     905              :               handler->coef contains coefficients in the (x - x_12) basis. now
     906              :               we need to rotate them in the (x - x_1) (x - x_2) basis
     907              :       */
     908              : 
     909              :       /* compute the transformation matrix */
     910              :       grid_prepare_alpha_dgemm(task->ra, task->rb, task->rp, lmax,
     911              :                                &(handler->alpha));
     912              : 
     913              :       initialize_tensor_2(&vab, ncoset(lmax[1]), ncoset(lmax[0]));
     914              :       realloc_tensor(&vab);
     915              : 
     916              :       grid_compute_vab(lmin, lmax, lp, task->prefactor,
     917              :                        &handler->alpha, // contains the matrix for the rotation
     918              :                        &handler->coef,  // contains the coefficients of
     919              :                        // the potential in the
     920              :                        // (x_x_12) basis
     921              :                        &vab); // contains the coefficients of the potential
     922              :       // in the (x - x_1)(x - x_2) basis
     923              : 
     924              :       if (calculate_forces) {
     925              :         memset(forces_local_pair_.data, 0,
     926              :                sizeof(double) * forces_local_pair_.alloc_size_);
     927              :         memset(virial_local_pair_.data, 0,
     928              :                sizeof(double) * virial_local_pair_.alloc_size_);
     929              :       }
     930              : 
     931              :       update_hab_forces_and_stress(
     932              :           task, &vab, &pab, calculate_tau, calculate_forces, calculate_virial,
     933              :           &forces_local_pair_, /* matrix
     934              :                                 * containing the
     935              :                                 * contribution of
     936              :                                 * the gaussian
     937              :                                 * pair for each
     938              :                                 * atom */
     939              :           &virial_local_pair_, /* same but for the virial term (stress tensor)
     940              :                                 */
     941              :           &hab);
     942              : 
     943              :       if (calculate_forces) {
     944              :         const double scaling = (task->iatom == task->jatom) ? 1.0 : 2.0;
     945              :         idx2(forces_local_, task->iatom, 0) +=
     946              :             scaling * idx2(forces_local_pair_, 0, 0);
     947              :         idx2(forces_local_, task->iatom, 1) +=
     948              :             scaling * idx2(forces_local_pair_, 0, 1);
     949              :         idx2(forces_local_, task->iatom, 2) +=
     950              :             scaling * idx2(forces_local_pair_, 0, 2);
     951              : 
     952              :         idx2(forces_local_, task->jatom, 0) +=
     953              :             scaling * idx2(forces_local_pair_, 1, 0);
     954              :         idx2(forces_local_, task->jatom, 1) +=
     955              :             scaling * idx2(forces_local_pair_, 1, 1);
     956              :         idx2(forces_local_, task->jatom, 2) +=
     957              :             scaling * idx2(forces_local_pair_, 1, 2);
     958              :         if (calculate_virial) {
     959              :           for (int i = 0; i < 3; i++) {
     960              :             for (int j = 0; j < 3; j++) {
     961              :               idx2(virial_local_, i, j) +=
     962              :                   scaling * (idx3(virial_local_pair_, 0, i, j) +
     963              :                              idx3(virial_local_pair_, 1, i, j));
     964              :             }
     965              :           }
     966              :         }
     967              :       }
     968              :     }
     969              : 
     970              :     rotate_and_store_coefficients(ctx, prev_task, NULL, &hab, &work,
     971              :                                   hab_block_local);
     972              : 
     973              :     // now reduction over the hab blocks
     974              :     if (num_threads > 1) {
     975              :       // does not store the number of elements but the amount of memory
     976              :       // occupied. That's a strange choice.
     977              :       const int hab_size = hab_blocks->size / sizeof(double);
     978              :       if ((hab_size / num_threads) >= 2) {
     979              :         const int block_size =
     980              :             hab_size / num_threads + (hab_size % num_threads);
     981              : 
     982              :         for (int bk = 0; bk < num_threads; bk++) {
     983              :           int bk_id = (bk + thread_id) % num_threads;
     984              :           size_t begin = bk_id * block_size;
     985              :           size_t end = imin((bk_id + 1) * block_size, hab_size);
     986              :           cblas_daxpy(end - begin, 1.0, hab_block_local + begin, 1,
     987              :                       hab_blocks->host_buffer + begin, 1);
     988              : #pragma omp barrier
     989              :         }
     990              :       } else {
     991              :         const int hab_size = hab_blocks->size / sizeof(double);
     992              : #pragma omp critical
     993              :         cblas_daxpy(hab_size, 1.0, hab_block_local, 1, hab_blocks->host_buffer,
     994              :                     1);
     995              :       }
     996              :     }
     997              : 
     998              :     if (calculate_forces) {
     999              :       if (num_threads > 1) {
    1000              :         if ((forces_->alloc_size_ / num_threads) >= 2) {
    1001              :           const int block_size = forces_->alloc_size_ / num_threads +
    1002              :                                  (forces_->alloc_size_ % num_threads);
    1003              : 
    1004              :           for (int bk = 0; bk < num_threads; bk++) {
    1005              :             int bk_id = (bk + thread_id) % num_threads;
    1006              :             int begin = bk_id * block_size;
    1007              :             int end = imin((bk_id + 1) * block_size, forces_->alloc_size_);
    1008              :             cblas_daxpy(end - begin, 1.0, forces_local_.data + begin, 1,
    1009              :                         forces_->data + begin, 1);
    1010              : #pragma omp barrier
    1011              :           }
    1012              :         } else {
    1013              : #pragma omp critical
    1014              :           cblas_daxpy(forces_local_.alloc_size_, 1.0, forces_local_.data, 1,
    1015              :                       forces_->data, 1);
    1016              :         }
    1017              :       } else {
    1018              :         // we are running with OMP_NUM_THREADS=1
    1019              :         cblas_daxpy(forces_local_.alloc_size_, 1.0, forces_local_.data, 1,
    1020              :                     forces_->data, 1);
    1021              :       }
    1022              :     }
    1023              : 
    1024              :     if (calculate_virial) {
    1025              : #pragma omp critical
    1026              :       for (int i = 0; i < 3; i++) {
    1027              :         for (int j = 0; j < 3; j++) {
    1028              :           idx2(virial_[0], i, j) += idx2(virial_local_, i, j);
    1029              :         }
    1030              :       }
    1031              :     }
    1032              : 
    1033              :     handler->grid.data = NULL;
    1034              :     free(vab.data);
    1035              :     free(work.data);
    1036              :     free(pab.data);
    1037              :     free(hab.data);
    1038              : 
    1039              :     if (calculate_forces) {
    1040              :       free(forces_local_.data);
    1041              :       free(virial_local_.data);
    1042              :       free(virial_local_pair_.data);
    1043              :       free(forces_local_pair_.data);
    1044              :     }
    1045              :   }
    1046          596 : }
    1047              : 
    1048              : /*******************************************************************************
    1049              :  * \brief Integrate all tasks of in given list from given grids using matrix -
    1050              :  * matrix multiplication
    1051              :  ******************************************************************************/
    1052          149 : void grid_dgemm_integrate_task_list(
    1053              :     void *ptr, const bool compute_tau, const int natoms, const int nlevels,
    1054              :     const offload_buffer *const pab_blocks, offload_buffer *grids[nlevels],
    1055              :     offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
    1056              : 
    1057          149 :   grid_context *const ctx = (grid_context *)ptr;
    1058              : 
    1059          149 :   assert(ctx->checksum == ctx_checksum);
    1060          149 :   assert(ctx->nlevels == nlevels);
    1061          149 :   assert(ctx->natoms == natoms);
    1062              : 
    1063              :   // Zero result arrays.
    1064          149 :   memset(hab_blocks->host_buffer, 0, hab_blocks->size);
    1065              : 
    1066          149 :   const int max_threads = omp_get_max_threads();
    1067              : 
    1068          149 :   if (ctx->scratch == NULL)
    1069          149 :     ctx->scratch = malloc(hab_blocks->size * max_threads);
    1070          149 :   assert(ctx->scratch != NULL);
    1071              : 
    1072              :   // #pragma omp parallel for
    1073          745 :   for (int level = 0; level < ctx->nlevels; level++) {
    1074          596 :     const _layout *layout = &ctx->layouts[level];
    1075          596 :     set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
    1076          596 :                         layout->npts_global, layout->npts_local,
    1077          596 :                         layout->shift_local, layout->border_width, layout->dh,
    1078          596 :                         layout->dh_inv, grids[level]);
    1079          596 :     ctx->grid[level].data = grids[level]->host_buffer;
    1080              :   }
    1081              : 
    1082          149 :   bool calculate_virial = (virial != NULL);
    1083          149 :   bool calculate_forces = (forces != NULL);
    1084              : 
    1085          149 :   tensor forces_, virial_;
    1086          149 :   if (calculate_forces) {
    1087           19 :     initialize_tensor_2(&forces_, natoms, 3);
    1088           19 :     alloc_tensor(&forces_);
    1089           19 :     initialize_tensor_2(&virial_, 3, 3);
    1090           19 :     alloc_tensor(&virial_);
    1091           19 :     memset(forces_.data, 0, sizeof(double) * forces_.alloc_size_);
    1092           19 :     memset(virial_.data, 0, sizeof(double) * virial_.alloc_size_);
    1093              :   }
    1094              : 
    1095          745 :   for (int level = 0; level < ctx->nlevels; level++) {
    1096          596 :     const _layout *layout = &ctx->layouts[level];
    1097          596 :     integrate_one_grid_level_dgemm(ctx, level, compute_tau, calculate_forces,
    1098          596 :                                    calculate_virial, layout->shift_local,
    1099          596 :                                    layout->border_width, pab_blocks, hab_blocks,
    1100              :                                    &forces_, &virial_);
    1101          596 :     ctx->grid[level].data = NULL;
    1102              :   }
    1103          149 :   if (calculate_forces) {
    1104           19 :     if (calculate_virial) {
    1105           16 :       virial[0][0] = idx2(virial_, 0, 0);
    1106           16 :       virial[0][1] = idx2(virial_, 0, 1);
    1107           16 :       virial[0][2] = idx2(virial_, 0, 2);
    1108           16 :       virial[1][0] = idx2(virial_, 1, 0);
    1109           16 :       virial[1][1] = idx2(virial_, 1, 1);
    1110           16 :       virial[1][2] = idx2(virial_, 1, 2);
    1111           16 :       virial[2][0] = idx2(virial_, 2, 0);
    1112           16 :       virial[2][1] = idx2(virial_, 2, 1);
    1113           16 :       virial[2][2] = idx2(virial_, 2, 2);
    1114              :     }
    1115              : 
    1116           19 :     memcpy(forces[0], forces_.data, sizeof(double) * forces_.alloc_size_);
    1117           19 :     free(forces_.data);
    1118           19 :     free(virial_.data);
    1119              :   }
    1120              : 
    1121          149 :   free(ctx->scratch);
    1122          149 :   ctx->scratch = NULL;
    1123          149 : }
        

Generated by: LCOV version 2.0-1