LCOV - code coverage report
Current view: top level - src/grid/dgemm - grid_dgemm_integrate.c (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:c5411e0) Lines: 309 453 68.2 %
Date: 2024-05-16 06:48:40 Functions: 9 12 75.0 %

          Line data    Source code
       1             : /*----------------------------------------------------------------------------*/
       2             : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3             : /*  Copyright 2000-2024 CP2K developers group <https://cp2k.org>              */
       4             : /*                                                                            */
       5             : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6             : /*----------------------------------------------------------------------------*/
       7             : 
       8             : #include <assert.h>
       9             : #include <limits.h>
      10             : #include <math.h>
      11             : #include <stdbool.h>
      12             : #include <stdio.h>
      13             : #include <stdlib.h>
      14             : #include <string.h>
      15             : #include <unistd.h>
      16             : 
      17             : #include <omp.h>
      18             : 
      19             : #ifdef __LIBXSMM
      20             : #include <libxsmm.h>
      21             : #endif
      22             : 
      23             : #include "../common/grid_common.h"
      24             : #include "grid_dgemm_coefficients.h"
      25             : #include "grid_dgemm_collocate.h"
      26             : #include "grid_dgemm_collocation_integration.h"
      27             : #include "grid_dgemm_non_orthorombic_corrections.h"
      28             : #include "grid_dgemm_private_header.h"
      29             : #include "grid_dgemm_tensor_local.h"
      30             : #include "grid_dgemm_utils.h"
      31             : 
      32           0 : void extract_cube_within_spherical_cutoff_ortho(
      33             :     struct collocation_integration_ *const handler, const double disr_radius,
      34             :     const int cmax, const int *const lb_cube, const int *const cube_center) {
      35             :   // a mapping so that the ig corresponds to the right grid point
      36           0 :   int **map = handler->map;
      37           0 :   map[1] = map[0] + 2 * cmax + 1;
      38           0 :   map[2] = map[1] + 2 * cmax + 1;
      39             :   // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
      40             : 
      41           0 :   for (int i = 0; i < 3; i++) {
      42           0 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
      43           0 :       map[i][ig] = modulo(cube_center[i] + lb_cube[i] + ig -
      44           0 :                               handler->grid.lower_corner[i],
      45             :                           handler->grid.full_size[i]);
      46             :     }
      47             :   }
      48             : 
      49           0 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
      50           0 :                             .xmax = handler->grid.window_size[0]};
      51           0 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
      52           0 :                             .xmax = handler->grid.window_size[1]};
      53           0 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
      54           0 :                             .xmax = handler->grid.window_size[2]};
      55             : 
      56           0 :   for (int kg = 0; kg < handler->cube.size[0]; kg++) {
      57           0 :     const int k = map[0][kg];
      58           0 :     const int kd =
      59           0 :         (2 * (kg + lb_cube[0]) - 1) / 2; // distance from center in grid points
      60           0 :     const double kr = kd * handler->dh[2][2]; // distance from center in a.u.
      61           0 :     const double kremain = disr_radius * disr_radius - kr * kr;
      62             : 
      63           0 :     if ((kremain >= 0.0) && is_point_in_interval(k, zwindow)) {
      64             : 
      65           0 :       const int jgmin = ceil(-1e-8 - sqrt(kremain) * handler->dh_inv[1][1]);
      66           0 :       for (int jg = jgmin; jg <= (1 - jgmin); jg++) {
      67           0 :         const int j = map[1][jg - lb_cube[1]];
      68           0 :         const double jr = ((2 * jg - 1) >> 1) *
      69           0 :                           handler->dh[1][1]; // distance from center in a.u.
      70           0 :         const double jremain = kremain - jr * jr;
      71             : 
      72           0 :         if ((jremain >= 0.0) && is_point_in_interval(j, ywindow)) {
      73           0 :           const int xmin = ceil(-1e-8 - sqrt(jremain) * handler->dh_inv[0][0]);
      74           0 :           const int xmax = 1 - xmin;
      75             : 
      76             :           // printf("xmin %d, xmax %d\n", xmin, xmax);
      77           0 :           for (int x = xmin - lb_cube[2];
      78           0 :                x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
      79           0 :             const int x1 = map[2][x];
      80             : 
      81           0 :             if (!is_point_in_interval(x1, xwindow))
      82           0 :               continue;
      83             : 
      84           0 :             int lower_corner[3] = {k, j, x1};
      85           0 :             int upper_corner[3] = {k + 1, j + 1, x1 + 1};
      86             : 
      87           0 :             compute_interval(map[2], handler->grid.full_size[2],
      88             :                              handler->grid.size[2], handler->cube.size[2], x1,
      89             :                              &x, lower_corner + 2, upper_corner + 2, xwindow);
      90             : 
      91           0 :             if (upper_corner[2] - lower_corner[2]) {
      92           0 :               const int position1[3] = {kg, jg - lb_cube[1], x};
      93             : 
      94             :               /* the function will internally take care of the local vs global
      95             :                * grid */
      96             : 
      97           0 :               double *restrict dst = &idx3(handler->cube, position1[0],
      98             :                                            position1[1], position1[2]);
      99             : 
     100           0 :               double *restrict src = &idx3(handler->grid, lower_corner[0],
     101             :                                            lower_corner[1], lower_corner[2]);
     102             : 
     103           0 :               const int sizex = upper_corner[2] - lower_corner[2];
     104           0 :               GRID_PRAGMA_SIMD((dst, src), 8)
     105           0 :               for (int x = 0; x < sizex; x++) {
     106           0 :                 dst[x] = src[x];
     107             :               }
     108             :             }
     109             : 
     110           0 :             if (handler->grid.size[2] == handler->grid.full_size[2])
     111           0 :               update_loop_index(handler->grid.full_size[2], x1, &x);
     112             :             else
     113           0 :               x += upper_corner[2] - lower_corner[2] - 1;
     114             :           }
     115             :         }
     116             :       }
     117             :     }
     118             :   }
     119           0 : }
     120             : 
     121           0 : void extract_cube_within_spherical_cutoff_generic(
     122             :     struct collocation_integration_ *const handler, const double disr_radius,
     123             :     const int cmax, const int *const lb_cube, const int *const ub_cube,
     124             :     const double *const roffset, const int *const cube_center) {
     125             : 
     126           0 :   const double a = handler->dh[0][0] * handler->dh[0][0] +
     127           0 :                    handler->dh[0][1] * handler->dh[0][1] +
     128           0 :                    handler->dh[0][2] * handler->dh[0][2];
     129           0 :   const double a_inv = 1.0 / a;
     130             :   // a mapping so that the ig corresponds to the right grid point
     131           0 :   int **map = handler->map;
     132           0 :   map[1] = map[0] + 2 * cmax + 1;
     133           0 :   map[2] = map[1] + 2 * cmax + 1;
     134             :   // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
     135             : 
     136           0 :   for (int i = 0; i < 3; i++) {
     137           0 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     138           0 :       map[i][ig] = modulo(cube_center[i] + ig + lb_cube[i] -
     139           0 :                               handler->grid.lower_corner[i],
     140             :                           handler->grid.full_size[i]);
     141             :     }
     142             :   }
     143             : 
     144           0 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     145           0 :                             .xmax = handler->grid.window_size[0]};
     146           0 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     147           0 :                             .xmax = handler->grid.window_size[1]};
     148           0 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     149           0 :                             .xmax = handler->grid.window_size[2]};
     150             : 
     151           0 :   for (int k = 0; k < handler->cube.size[0]; k++) {
     152           0 :     const int iz = map[0][k];
     153             : 
     154           0 :     if (!is_point_in_interval(iz, zwindow))
     155           0 :       continue;
     156             : 
     157           0 :     const double tz = (k + lb_cube[0] - roffset[0]);
     158           0 :     const double z[3] = {tz * handler->dh[2][0], tz * handler->dh[2][1],
     159           0 :                          tz * handler->dh[2][2]};
     160             : 
     161           0 :     for (int j = 0; j < handler->cube.size[1]; j++) {
     162           0 :       const int iy = map[1][j];
     163             : 
     164           0 :       if (!is_point_in_interval(iy, ywindow))
     165           0 :         continue;
     166             : 
     167           0 :       const double ty = (j - roffset[1] + lb_cube[1]);
     168           0 :       const double y[3] = {z[0] + ty * handler->dh[1][0],
     169           0 :                            z[1] + ty * handler->dh[1][1],
     170           0 :                            z[2] + ty * handler->dh[1][2]};
     171             : 
     172             :       /* Sqrt[(-2 x1 \[Alpha] - 2 y1 \[Beta] - 2 z1 \[Gamma])^2 - */
     173             :       /*                                            4 (x1^2 + y1^2 + z1^2)
     174             :        * (\[Alpha]^2 + \[Beta]^2 + \[Gamma]^2)] */
     175             : 
     176           0 :       const double b =
     177           0 :           -2.0 * (handler->dh[0][0] * (roffset[2] * handler->dh[0][0] - y[0]) +
     178           0 :                   handler->dh[0][1] * (roffset[2] * handler->dh[0][1] - y[1]) +
     179           0 :                   handler->dh[0][2] * (roffset[2] * handler->dh[0][2] - y[2]));
     180             : 
     181           0 :       const double c = (roffset[2] * handler->dh[0][0] - y[0]) *
     182           0 :                            (roffset[2] * handler->dh[0][0] - y[0]) +
     183           0 :                        (roffset[2] * handler->dh[0][1] - y[1]) *
     184           0 :                            (roffset[2] * handler->dh[0][1] - y[1]) +
     185           0 :                        (roffset[2] * handler->dh[0][2] - y[2]) *
     186             :                            (roffset[2] * handler->dh[0][2] - y[2]) -
     187           0 :                        disr_radius * disr_radius;
     188             : 
     189           0 :       double delta = b * b - 4.0 * a * c;
     190             : 
     191           0 :       if (delta < 0.0)
     192           0 :         continue;
     193             : 
     194           0 :       delta = sqrt(delta);
     195             : 
     196           0 :       const int xmin = imax(ceil((-b - delta) * 0.5 * a_inv), lb_cube[2]);
     197           0 :       const int xmax = imin(floor((-b + delta) * 0.5 * a_inv), ub_cube[2]);
     198             : 
     199           0 :       int lower_corner[3] = {iz, iy, xmin};
     200           0 :       int upper_corner[3] = {iz + 1, iy + 1, xmin};
     201             : 
     202           0 :       for (int x = xmin - lb_cube[2];
     203           0 :            x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
     204           0 :         const int x1 = map[2][x];
     205             : 
     206           0 :         if (!is_point_in_interval(x1, xwindow))
     207           0 :           continue;
     208             : 
     209           0 :         compute_interval(map[2], handler->grid.full_size[2],
     210             :                          handler->grid.size[2], handler->cube.size[2], x1, &x,
     211             :                          lower_corner + 2, upper_corner + 2, xwindow);
     212             : 
     213           0 :         if (upper_corner[2] - lower_corner[2]) {
     214           0 :           const int position1[3] = {k, j, x};
     215             : 
     216             :           /* the function will internally take care of the local vs global
     217             :            * grid */
     218             : 
     219           0 :           double *__restrict__ src = &idx3(handler->grid, lower_corner[0],
     220             :                                            lower_corner[1], lower_corner[2]);
     221           0 :           double *__restrict__ dst =
     222           0 :               &idx3(handler->cube, position1[0], position1[1], position1[2]);
     223             : 
     224           0 :           const int sizex = upper_corner[2] - lower_corner[2];
     225             : 
     226             :           // #pragma omp simd linear(dst, src) simdlen(8)
     227           0 :           GRID_PRAGMA_SIMD((dst, src), 8)
     228           0 :           for (int x = 0; x < sizex; x++) {
     229           0 :             dst[x] = src[x];
     230             :           }
     231             : 
     232           0 :           if (handler->grid.size[2] == handler->grid.full_size[2])
     233           0 :             update_loop_index(handler->grid.full_size[2], x1, &x);
     234             :           else
     235           0 :             x += upper_corner[2] - lower_corner[2] - 1;
     236             :         }
     237             :       }
     238             :     }
     239             :   }
     240           0 : }
     241             : 
     242        5076 : static void rotate_and_store_coefficients(grid_context *const ctx,
     243             :                                           const _task *prev_task,
     244             :                                           const _task *task, tensor *const hab,
     245             :                                           tensor *work, // some scratch matrix
     246             :                                           double *blocks) {
     247        5076 :   if (prev_task != NULL) {
     248             :     /* prev_task is NULL when we deal with the first iteration. In that case
     249             :      * we just need to initialize the hab matrix to the proper size. Note
     250             :      * that resizing does not really occurs since memory allocation is done
     251             :      * for the maximum size the matrix can have. */
     252        4452 :     const int iatom = prev_task->iatom;
     253        4452 :     const int jatom = prev_task->jatom;
     254        4452 :     const int iset = prev_task->iset;
     255        4452 :     const int jset = prev_task->jset;
     256        4452 :     const int ikind = ctx->atom_kinds[iatom];
     257        4452 :     const int jkind = ctx->atom_kinds[jatom];
     258        4452 :     const grid_basis_set *ibasis = ctx->basis_sets[ikind];
     259        4452 :     const grid_basis_set *jbasis = ctx->basis_sets[jkind];
     260             : 
     261        4452 :     const int block_num = prev_task->block_num;
     262        4452 :     double *const block = &blocks[ctx->block_offsets[block_num]];
     263             : 
     264        4452 :     const int ncoseta = ncoset(ibasis->lmax[iset]);
     265        4452 :     const int ncosetb = ncoset(jbasis->lmax[jset]);
     266             : 
     267        4452 :     const int ncoa = ibasis->npgf[iset] * ncoseta;
     268        4452 :     const int ncob = jbasis->npgf[jset] * ncosetb;
     269             : 
     270        4452 :     const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set */
     271        4452 :     const int nsgf_setb = jbasis->nsgf_set[jset];
     272        4452 :     const int nsgfa = ibasis->nsgf; // size of entire spherical basis
     273        4452 :     const int nsgfb = jbasis->nsgf;
     274        4452 :     const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
     275        4452 :     const int sgfb = jbasis->first_sgf[jset] - 1;
     276        4452 :     const int maxcoa = ibasis->maxco;
     277        4452 :     const int maxcob = jbasis->maxco;
     278             : 
     279        4452 :     initialize_tensor_2(work, nsgf_setb, ncoa);
     280        4452 :     realloc_tensor(work);
     281             : 
     282             :     // Warning these matrices are row major....
     283             : 
     284        4452 :     dgemm_params m1, m2;
     285        4452 :     memset(&m1, 0, sizeof(dgemm_params));
     286        4452 :     memset(&m2, 0, sizeof(dgemm_params));
     287             : 
     288        4452 :     m1.op1 = 'N';
     289        4452 :     m1.op2 = 'N';
     290        4452 :     m1.m = nsgf_setb;
     291        4452 :     m1.n = ncoa;
     292        4452 :     m1.k = ncob;
     293        4452 :     m1.alpha = 1.0;
     294        4452 :     m1.beta = 0.0;
     295        4452 :     m1.a = &jbasis->sphi[sgfb * maxcob];
     296        4452 :     m1.lda = maxcob;
     297        4452 :     m1.b = hab->data;
     298        4452 :     m1.ldb = ncoa;
     299        4452 :     m1.c = work->data;
     300        4452 :     m1.ldc = work->ld_;
     301             : 
     302             :     // phi[b][ncob]
     303             :     // work[b][ncoa] = phi[b][ncob] * hab[ncob][ncoa]
     304             : 
     305        4452 :     if (iatom <= jatom) {
     306             :       // I need to have the final result in the form
     307             : 
     308             :       // block[b][a] = work[b][ncoa] transpose(phi[a][ncoa])
     309        2964 :       m2.op1 = 'N';
     310        2964 :       m2.op2 = 'T';
     311        2964 :       m2.m = nsgf_setb;
     312        2964 :       m2.n = nsgf_seta;
     313        2964 :       m2.k = ncoa;
     314        2964 :       m2.a = work->data;
     315        2964 :       m2.lda = work->ld_;
     316        2964 :       m2.b = &ibasis->sphi[sgfa * maxcoa];
     317        2964 :       m2.ldb = maxcoa;
     318        2964 :       m2.c = block + sgfb * nsgfa + sgfa;
     319        2964 :       m2.ldc = nsgfa;
     320        2964 :       m2.alpha = 1.0;
     321        2964 :       m2.beta = 1.0;
     322             :     } else {
     323             :       // block[a][b] = phi[a][ncoa] Transpose(work[b][ncoa])
     324        1488 :       m2.op1 = 'N';
     325        1488 :       m2.op2 = 'T';
     326        1488 :       m2.m = nsgf_seta;
     327        1488 :       m2.n = nsgf_setb;
     328        1488 :       m2.k = ncoa;
     329        1488 :       m2.a = &ibasis->sphi[sgfa * maxcoa];
     330        1488 :       m2.lda = maxcoa;
     331        1488 :       m2.b = work->data;
     332        1488 :       m2.ldb = work->ld_;
     333        1488 :       m2.c = block + sgfa * nsgfb + sgfb;
     334        1488 :       m2.ldc = nsgfb;
     335        1488 :       m2.alpha = 1.0;
     336        1488 :       m2.beta = 1.0;
     337             :     }
     338             : 
     339        4452 :     m1.use_libxsmm = true;
     340        4452 :     m2.use_libxsmm = true;
     341             : 
     342             :     /* these dgemm are *row* major */
     343        4452 :     dgemm_simplified(&m1);
     344        4452 :     dgemm_simplified(&m2);
     345             :   }
     346             : 
     347        5076 :   if (task != NULL) {
     348        4452 :     const int iatom = task->iatom;
     349        4452 :     const int jatom = task->jatom;
     350        4452 :     const int ikind = ctx->atom_kinds[iatom];
     351        4452 :     const int jkind = ctx->atom_kinds[jatom];
     352        4452 :     const int iset = task->iset;
     353        4452 :     const int jset = task->jset;
     354        4452 :     const grid_basis_set *ibasis = ctx->basis_sets[ikind];
     355        4452 :     const grid_basis_set *jbasis = ctx->basis_sets[jkind];
     356        4452 :     const int ncoseta = ncoset(ibasis->lmax[iset]);
     357        4452 :     const int ncosetb = ncoset(jbasis->lmax[jset]);
     358             : 
     359        4452 :     const int ncoa = ibasis->npgf[iset] * ncoseta;
     360        4452 :     const int ncob = jbasis->npgf[jset] * ncosetb;
     361             : 
     362        4452 :     initialize_tensor_2(hab, ncob, ncoa);
     363        4452 :     realloc_tensor(hab);
     364             :   }
     365        5076 : }
     366             : 
     367       49036 : void update_force_pair(orbital a, orbital b, const double pab,
     368             :                        const double ftz[2], const double *const rab,
     369             :                        const tensor *const vab, tensor *force) {
     370       49036 :   const double axpm0 = idx2(vab[0], idx(b), idx(a));
     371      196144 :   for (int i = 0; i < 3; i++) {
     372      147108 :     const double aip1 = idx2(vab[0], idx(b), idx(up(i, a)));
     373      147108 :     const double aim1 = idx2(vab[0], idx(b), idx(down(i, a)));
     374      147108 :     const double bim1 = idx2(vab[0], idx(down(i, b)), idx(a));
     375      147108 :     idx2(force[0], 0, i) += pab * (ftz[0] * aip1 - a.l[i] * aim1);
     376      147108 :     idx2(force[0], 1, i) +=
     377      147108 :         pab * (ftz[1] * (aip1 - rab[i] * axpm0) - b.l[i] * bim1);
     378             :   }
     379       49036 : }
     380             : 
     381       34952 : void update_virial_pair(orbital a, orbital b, const double pab,
     382             :                         const double ftz[2], const double *const rab,
     383             :                         const tensor *const vab, tensor *virial) {
     384      139808 :   for (int i = 0; i < 3; i++) {
     385      419424 :     for (int j = 0; j < 3; j++) {
     386      314568 :       idx3(virial[0], 0, i, j) +=
     387      314568 :           pab * ftz[0] * idx2(vab[0], idx(b), idx(up(i, up(j, a)))) -
     388      314568 :           pab * a.l[j] * idx2(vab[0], idx(b), idx(up(i, down(j, a))));
     389             :     }
     390             :   }
     391             : 
     392      139808 :   for (int i = 0; i < 3; i++) {
     393      419424 :     for (int j = 0; j < 3; j++) {
     394      314568 :       idx3(virial[0], 1, i, j) +=
     395      314568 :           pab * ftz[1] *
     396      314568 :               (idx2(vab[0], idx(b), idx(up(i, up(j, a)))) -
     397      314568 :                idx2(vab[0], idx(b), idx(up(i, a))) * rab[j] -
     398      314568 :                idx2(vab[0], idx(b), idx(up(j, a))) * rab[i] +
     399      314568 :                idx2(vab[0], idx(b), idx(a)) * rab[j] * rab[i]) -
     400      314568 :           pab * b.l[j] * idx2(vab[0], idx(up(i, down(j, b))), idx(a));
     401             :     }
     402             :   }
     403       34952 : }
     404             : 
     405      340422 : void update_all(const bool compute_forces, const bool compute_virial,
     406             :                 const orbital a, const orbital b, const double f,
     407             :                 const double *const ftz, const double *rab, const tensor *vab,
     408             :                 const double pab, double *hab, tensor *forces,
     409             :                 tensor *virials) {
     410             : 
     411      340422 :   *hab += f * idx2(vab[0], idx(b), idx(a));
     412             : 
     413      340422 :   if (compute_forces) {
     414       49036 :     update_force_pair(a, b, f * pab, ftz, rab, vab, forces);
     415             :   }
     416             : 
     417      340422 :   if (compute_virial) {
     418       34952 :     update_virial_pair(a, b, f * pab, ftz, rab, vab, virials);
     419             :   }
     420      340422 : }
     421             : 
     422           0 : static void update_tau(const bool compute_forces, const bool compute_virial,
     423             :                        const orbital a, const orbital b, const double ftz[2],
     424             :                        const double *const rab, const tensor *const vab,
     425             :                        const double pab, double *const hab, tensor *forces,
     426             :                        tensor *virials) {
     427             : 
     428           0 :   for (int i = 0; i < 3; i++) {
     429           0 :     update_all(compute_forces, compute_virial, down(i, a), down(i, b),
     430           0 :                0.5 * a.l[i] * b.l[i], ftz, rab, vab, pab, hab, forces, virials);
     431           0 :     update_all(compute_forces, compute_virial, up(i, a), down(i, b),
     432           0 :                -0.5 * ftz[0] * b.l[i], ftz, rab, vab, pab, hab, forces,
     433             :                virials);
     434           0 :     update_all(compute_forces, compute_virial, down(i, a), up(i, b),
     435           0 :                -0.5 * a.l[i] * ftz[1], ftz, rab, vab, pab, hab, forces,
     436             :                virials);
     437           0 :     update_all(compute_forces, compute_virial, up(i, a), up(i, b),
     438           0 :                0.5 * ftz[0] * ftz[1], ftz, rab, vab, pab, hab, forces, virials);
     439             :   }
     440           0 : }
     441             : 
     442             : static void
     443       43611 : update_hab_forces_and_stress(const _task *task, const tensor *const vab,
     444             :                              const tensor *const pab, const bool compute_tau,
     445             :                              const bool compute_forces,
     446             :                              const bool compute_virial, tensor *const forces,
     447             :                              tensor *const virial, tensor *const hab) {
     448       43611 :   double zeta[2] = {task->zeta[0] * 2.0, task->zeta[1] * 2.0};
     449      101793 :   for (int lb = task->lmin[1]; lb <= task->lmax[1]; lb++) {
     450      138321 :     for (int la = task->lmin[0]; la <= task->lmax[0]; la++) {
     451      200037 :       for (int bx = 0; bx <= lb; bx++) {
     452      284562 :         for (int by = 0; by <= lb - bx; by++) {
     453      164664 :           const int bz = lb - bx - by;
     454      164664 :           const orbital b = {{bx, by, bz}};
     455      164664 :           const int idx_b = task->offset[1] + idx(b);
     456      412083 :           for (int ax = 0; ax <= la; ax++) {
     457      587841 :             for (int ay = 0; ay <= la - ax; ay++) {
     458      340422 :               const int az = la - ax - ay;
     459      340422 :               const orbital a = {{ax, ay, az}};
     460      340422 :               const int idx_a = task->offset[0] + idx(a);
     461      340422 :               double *habval = &idx2(hab[0], idx_b, idx_a);
     462      340422 :               const double prefactor = idx2(pab[0], idx_b, idx_a);
     463             : 
     464             :               // now compute the forces
     465      340422 :               if (compute_tau) {
     466           0 :                 update_tau(compute_forces, compute_virial, a, b, zeta,
     467           0 :                            task->rab, vab, prefactor, habval, forces, virial);
     468             :               } else {
     469      340422 :                 update_all(compute_forces, compute_virial, a, b, 1.0, zeta,
     470      340422 :                            task->rab, vab, prefactor, habval, forces, virial);
     471             :               }
     472             :             }
     473             :           }
     474             :         }
     475             :       }
     476             :     }
     477             :   }
     478       43611 : }
     479             : 
     480             : /* It is a sub-optimal version of the mapping in case of a cubic cutoff. But is
     481             :  * very general and does not depend on the orthorombic nature of the grid. for
     482             :  * orthorombic cases, it is faster to apply PCB directly on the polynomials. */
     483             : 
     484       43611 : void extract_cube(struct collocation_integration_ *handler, const int cmax,
     485             :                   const int *lower_boundaries_cube, const int *cube_center) {
     486             : 
     487             :   // a mapping so that the ig corresponds to the right grid point
     488       43611 :   int **map = handler->map;
     489       43611 :   map[1] = map[0] + 2 * cmax + 1;
     490       43611 :   map[2] = map[1] + 2 * cmax + 1;
     491             :   //  memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
     492      174444 :   for (int i = 0; i < 3; i++) {
     493     3332608 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     494     3201775 :       map[i][ig] = modulo(cube_center[i] + ig + lower_boundaries_cube[i] -
     495     3201775 :                               handler->grid.lower_corner[i],
     496             :                           handler->grid.full_size[i]);
     497             :     }
     498             :   }
     499             : 
     500       43611 :   int lower_corner[3];
     501       43611 :   int upper_corner[3];
     502             : 
     503       43611 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     504       43611 :                             .xmax = handler->grid.window_size[0]};
     505       43611 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     506       43611 :                             .xmax = handler->grid.window_size[1]};
     507       43611 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     508       43611 :                             .xmax = handler->grid.window_size[2]};
     509             : 
     510      128484 :   for (int z = 0; (z < handler->cube.size[0]); z++) {
     511       84873 :     const int z1 = map[0][z];
     512             : 
     513      169746 :     if (!is_point_in_interval(z1, zwindow))
     514           0 :       continue;
     515             : 
     516       84873 :     compute_interval(map[0], handler->grid.full_size[0], handler->grid.size[0],
     517             :                      handler->cube.size[0], z1, &z, lower_corner, upper_corner,
     518             :                      zwindow);
     519             : 
     520             :     /* // We have a full plane. */
     521       84873 :     if (upper_corner[0] - lower_corner[0]) {
     522      255507 :       for (int y = 0; y < handler->cube.size[1]; y++) {
     523      170634 :         const int y1 = map[1][y];
     524             : 
     525             :         // this check is completely irrelevant when running without MPI.
     526      341268 :         if (!is_point_in_interval(y1, ywindow))
     527           0 :           continue;
     528             : 
     529      170634 :         compute_interval(map[1], handler->grid.full_size[1],
     530             :                          handler->grid.size[1], handler->cube.size[1], y1, &y,
     531             :                          lower_corner + 1, upper_corner + 1, ywindow);
     532             : 
     533      170634 :         if (upper_corner[1] - lower_corner[1]) {
     534      545794 :           for (int x = 0; x < handler->cube.size[2]; x++) {
     535      375160 :             const int x1 = map[2][x];
     536             : 
     537      750320 :             if (!is_point_in_interval(x1, xwindow))
     538           0 :               continue;
     539             : 
     540      375160 :             compute_interval(map[2], handler->grid.full_size[2],
     541             :                              handler->grid.size[2], handler->cube.size[2], x1,
     542             :                              &x, lower_corner + 2, upper_corner + 2, xwindow);
     543             : 
     544      375160 :             if (upper_corner[2] - lower_corner[2]) { // should be non zero
     545      375160 :               const int position1[3] = {z, y, x};
     546             : 
     547             :               /* the function will internally take care of the local vx global
     548             :                * grid */
     549      375160 :               extract_sub_grid(
     550             :                   lower_corner, // lower corner of the portion of cube (in the
     551             :                   // full grid)
     552             :                   upper_corner, // upper corner of the portion of cube (in the
     553             :                   // full grid)
     554             :                   position1, // starting position subblock inside the cube
     555      375160 :                   &handler->grid,
     556             :                   &handler->cube); // the grid to add data from
     557             : 
     558      375160 :               if (handler->grid.size[2] == handler->grid.full_size[2])
     559      375160 :                 update_loop_index(handler->grid.full_size[2], x1, &x);
     560             :               else
     561           0 :                 x += upper_corner[2] - lower_corner[2] - 1;
     562             :             }
     563             :           }
     564      170634 :           if (handler->grid.size[1] == handler->grid.full_size[1])
     565      170634 :             update_loop_index(handler->grid.full_size[1], y1, &y);
     566             :           else
     567           0 :             y += upper_corner[1] - lower_corner[1] - 1;
     568             :         }
     569             :       }
     570       84873 :       if (handler->grid.size[0] == handler->grid.full_size[0])
     571       84873 :         update_loop_index(handler->grid.full_size[0], z1, &z);
     572             :       else
     573           0 :         z += upper_corner[0] - lower_corner[0] - 1;
     574             :     }
     575             :   }
     576       43611 : }
     577             : 
     578       43611 : void grid_integrate(collocation_integration *const handler,
     579             :                     const bool use_ortho, const double zetp, const double rp[3],
     580             :                     const double radius) {
     581       43611 :   if (handler == NULL)
     582           0 :     abort();
     583             : 
     584       43611 :   const int lp = handler->coef.size[0] - 1;
     585             : 
     586       43611 :   int cubecenter[3];
     587       43611 :   int cube_size[3];
     588       43611 :   int lb_cube[3], ub_cube[3];
     589       43611 :   double roffset[3];
     590       43611 :   double disr_radius;
     591             : 
     592             :   /* cube : grid comtaining pointlike product between polynomials
     593             :    *
     594             :    * pol : grid  containing the polynomials in all three directions
     595             :    */
     596             : 
     597             :   /* seting up the cube parameters */
     598       87222 :   int cmax = compute_cube_properties(
     599       43611 :       use_ortho, radius, (const double(*)[3])handler->dh,
     600       43611 :       (const double(*)[3])handler->dh_inv, rp, &disr_radius, roffset,
     601             :       cubecenter, lb_cube, ub_cube, cube_size);
     602             : 
     603             :   /* initialize the multidimensional array containing the polynomials */
     604       43611 :   if (lp != 0) {
     605       34331 :     initialize_tensor_3(&handler->pol, 3, cmax, handler->coef.size[0]);
     606             :   } else {
     607        9280 :     initialize_tensor_3(&handler->pol, 3, handler->coef.size[0], cmax);
     608             :   }
     609       43611 :   handler->pol_alloc_size = realloc_tensor(&handler->pol);
     610             : 
     611             :   /* allocate memory for the polynomial and the cube */
     612             : 
     613       43611 :   initialize_tensor_3(&handler->cube, cube_size[0], cube_size[1], cube_size[2]);
     614             : 
     615       43611 :   handler->cube_alloc_size = realloc_tensor(&handler->cube);
     616             : 
     617       43611 :   initialize_W_and_T(handler, &handler->coef, &handler->cube);
     618             : 
     619             :   /* compute the polynomials */
     620             : 
     621             :   // WARNING : do not reverse the order in pol otherwise you will have to
     622             :   // reverse the order in collocate_dgemm as well.
     623             : 
     624             :   /* the tensor contraction is done for a given order so I have to be careful
     625             :    * how the tensors X, Y, Z are stored. In collocate, they are stored
     626             :    * normally 0 (Z), (1) Y, (2) X in the table pol. but in integrate (which
     627             :    * uses the same tensor reduction), I have a special treatment for l = 0. In
     628             :    * that case the order *should* be the same than for collocate. For l > 0,
     629             :    * we need a different storage order which is (X) 2, (Y) 0, and (Z) 1.
     630             :    *
     631             :    * the reason for this is that the cube is stored as cube[z][y][x] so the
     632             :    * first direction taken for the dgemm is along x.
     633             :    */
     634             : 
     635       43611 :   int perm[3] = {2, 0, 1};
     636             : 
     637       43611 :   if (lp == 0) {
     638             :     /* I need to restore the same order than for collocate */
     639        9280 :     perm[0] = 0;
     640        9280 :     perm[1] = 1;
     641        9280 :     perm[2] = 2;
     642             :   }
     643             : 
     644       43611 :   bool use_ortho_forced = handler->orthogonal[0] && handler->orthogonal[1] &&
     645       23448 :                           handler->orthogonal[2];
     646       43611 :   if (use_ortho) {
     647       21114 :     grid_fill_pol_dgemm((lp != 0), handler->dh[0][0], roffset[2], 0, lb_cube[2],
     648             :                         ub_cube[2], lp, cmax, zetp,
     649       21114 :                         &idx3(handler->pol, perm[2], 0, 0)); /* i indice */
     650       21114 :     grid_fill_pol_dgemm((lp != 0), handler->dh[1][1], roffset[1], 0, lb_cube[1],
     651             :                         ub_cube[1], lp, cmax, zetp,
     652       21114 :                         &idx3(handler->pol, perm[1], 0, 0)); /* j indice */
     653       21114 :     grid_fill_pol_dgemm((lp != 0), handler->dh[2][2], roffset[0], 0, lb_cube[0],
     654             :                         ub_cube[0], lp, cmax, zetp,
     655       21114 :                         &idx3(handler->pol, perm[0], 0, 0)); /* k indice */
     656             :   } else {
     657       22497 :     double dx[3];
     658             : 
     659       22497 :     dx[2] = handler->dh[0][0] * handler->dh[0][0] +
     660       22497 :             handler->dh[0][1] * handler->dh[0][1] +
     661       22497 :             handler->dh[0][2] * handler->dh[0][2];
     662             : 
     663       22497 :     dx[1] = handler->dh[1][0] * handler->dh[1][0] +
     664       22497 :             handler->dh[1][1] * handler->dh[1][1] +
     665       22497 :             handler->dh[1][2] * handler->dh[1][2];
     666             : 
     667       22497 :     dx[0] = handler->dh[2][0] * handler->dh[2][0] +
     668       22497 :             handler->dh[2][1] * handler->dh[2][1] +
     669       22497 :             handler->dh[2][2] * handler->dh[2][2];
     670             : 
     671       22497 :     grid_fill_pol_dgemm((lp != 0), 1.0, roffset[2], 0, lb_cube[2], ub_cube[2],
     672             :                         lp, cmax, zetp * dx[2],
     673       22497 :                         &idx3(handler->pol, perm[2], 0, 0)); /* i indice */
     674       22497 :     grid_fill_pol_dgemm((lp != 0), 1.0, roffset[1], 0, lb_cube[1], ub_cube[1],
     675             :                         lp, cmax, zetp * dx[1],
     676       22497 :                         &idx3(handler->pol, perm[1], 0, 0)); /* j indice */
     677       22497 :     grid_fill_pol_dgemm((lp != 0), 1.0, roffset[0], 0, lb_cube[0], ub_cube[0],
     678             :                         lp, cmax, zetp * dx[0],
     679       22497 :                         &idx3(handler->pol, perm[0], 0, 0)); /* k indice */
     680             : 
     681             :     /* the three remaining tensors are initialized in the function */
     682       22497 :     calculate_non_orthorombic_corrections_tensor(
     683             :         zetp, roffset, (const double(*)[3])handler->dh, lb_cube, ub_cube,
     684       22497 :         handler->orthogonal, &handler->Exp);
     685             :   }
     686             : 
     687       43611 :   if (handler->apply_cutoff) {
     688           0 :     memset(handler->cube.data, 0, sizeof(double) * handler->cube.alloc_size_);
     689           0 :     if (!use_ortho && !use_ortho_forced) {
     690           0 :       extract_cube_within_spherical_cutoff_generic(
     691             :           handler, disr_radius, cmax, lb_cube, ub_cube, roffset, cubecenter);
     692             :     } else {
     693           0 :       extract_cube_within_spherical_cutoff_ortho(handler, disr_radius, cmax,
     694             :                                                  lb_cube, cubecenter);
     695             :     }
     696             :   } else {
     697       43611 :     extract_cube(handler, cmax, lb_cube, cubecenter);
     698             :   }
     699             : 
     700       43611 :   if (!use_ortho && !use_ortho_forced)
     701       22497 :     apply_non_orthorombic_corrections(handler->orthogonal, &handler->Exp,
     702             :                                       &handler->cube);
     703             : 
     704       43611 :   if (lp != 0) {
     705       34331 :     tensor_reduction_for_collocate_integrate(handler->scratch,
     706             :                                              // pointer to scratch memory
     707       34331 :                                              1.0, handler->orthogonal, NULL,
     708             :                                              &handler->cube, &handler->pol,
     709             :                                              &handler->coef);
     710             :   } else {
     711             :     /* it is very specific to integrate because we might end up with a
     712             :      * single element after the tensor product/contraction. In that case, I
     713             :      * compute the cube and then do a scalar product between the two. */
     714             : 
     715             :     /* we could also do this with 2 matrix-vector multiplications and a scalar
     716             :      * product
     717             :      *
     718             :      * H_{jk} = C_{ijk} . P_i (along x) C_{ijk} is *stored C[k][j][i]* !!!!!!
     719             :      * L_{k} = H_{jk} . P_j (along y)
     720             :      * v_{ab} = L_k . P_k (along z)
     721             :      */
     722             : 
     723        9280 :     tensor cube_tmp;
     724        9280 :     initialize_tensor_2(&cube_tmp, cube_size[0], cube_size[1]);
     725        9280 :     alloc_tensor(&cube_tmp);
     726             : 
     727             :     /* first along x */
     728        9280 :     cblas_dgemv(CblasRowMajor, CblasNoTrans,
     729        9280 :                 handler->cube.size[0] * handler->cube.size[1],
     730        9280 :                 handler->cube.size[2], 1.0, handler->cube.data,
     731        9280 :                 handler->cube.ld_, &idx3(handler->pol, 2, 0, 0), 1, 0.0,
     732             :                 cube_tmp.data, 1);
     733             : 
     734             :     /* second along j */
     735        9280 :     cblas_dgemv(CblasRowMajor, CblasNoTrans, handler->cube.size[0],
     736        9280 :                 handler->cube.size[1], 1.0, cube_tmp.data, cube_tmp.ld_,
     737        9280 :                 &idx3(handler->pol, 1, 0, 0), 1, 0.0, handler->scratch, 1);
     738             : 
     739             :     /* finally along k, it is a scalar product.... */
     740       18560 :     handler->coef.data[0] = cblas_ddot(handler->cube.size[0], handler->scratch,
     741        9280 :                                        1, &idx3(handler->pol, 0, 0, 0), 1);
     742             : 
     743        9280 :     free(cube_tmp.data);
     744             :   }
     745             : 
     746             :   /* go from ijk -> xyz */
     747       43611 :   if (!use_ortho)
     748       22497 :     grid_transform_coef_jik_to_yxz((const double(*)[3])handler->dh,
     749             :                                    &handler->coef);
     750       43611 : }
     751             : 
     752         624 : void integrate_one_grid_level_dgemm(
     753             :     grid_context *const ctx, const int level, const bool calculate_tau,
     754             :     const bool calculate_forces, const bool calculate_virial,
     755             :     const int *const shift_local, const int *const border_width,
     756             :     const offload_buffer *const pab_blocks, offload_buffer *const hab_blocks,
     757             :     tensor *forces_, tensor *virial_) {
     758         624 :   tensor *const grid = &ctx->grid[level];
     759             : 
     760             :   // Using default(shared) because with GCC 9 the behavior around const changed:
     761             :   // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
     762         624 : #pragma omp parallel default(shared)
     763             :   {
     764             :     const int num_threads = omp_get_num_threads();
     765             :     const int thread_id = omp_get_thread_num();
     766             : 
     767             :     double *hab_block_local = NULL;
     768             : 
     769             :     if (num_threads == 1) {
     770             :       hab_block_local = hab_blocks->host_buffer;
     771             :     } else {
     772             :       hab_block_local = ((double *)ctx->scratch) +
     773             :                         thread_id * (hab_blocks->size / sizeof(double));
     774             :       memset(hab_block_local, 0, hab_blocks->size);
     775             :     }
     776             : 
     777             :     tensor work, pab, hab, vab, forces_local_, virial_local_,
     778             :         forces_local_pair_, virial_local_pair_;
     779             :     memset(&work, 0, sizeof(tensor));
     780             :     memset(&pab, 0, sizeof(tensor));
     781             :     memset(&hab, 0, sizeof(tensor));
     782             :     memset(&vab, 0, sizeof(tensor));
     783             :     memset(&forces_local_, 0, sizeof(tensor));
     784             :     memset(&virial_local_, 0, sizeof(tensor));
     785             :     memset(&forces_local_pair_, 0, sizeof(tensor));
     786             :     memset(&virial_local_pair_, 0, sizeof(tensor));
     787             : 
     788             :     struct collocation_integration_ *handler = ctx->handler[thread_id];
     789             :     handler->apply_cutoff = ctx->apply_cutoff;
     790             :     handler->lmax_diff[0] = 0;
     791             :     handler->lmax_diff[1] = 0;
     792             :     handler->lmin_diff[0] = 0;
     793             :     handler->lmin_diff[1] = 0;
     794             : 
     795             :     if (calculate_tau || calculate_forces || calculate_virial) {
     796             :       handler->lmax_diff[0] = 1;
     797             :       handler->lmax_diff[1] = 0;
     798             :       handler->lmin_diff[0] = -1;
     799             :       handler->lmin_diff[1] = -1;
     800             :     }
     801             : 
     802             :     if (calculate_virial) {
     803             :       handler->lmax_diff[0]++;
     804             :       handler->lmax_diff[1]++;
     805             :     }
     806             : 
     807             :     if (calculate_tau) {
     808             :       handler->lmax_diff[0]++;
     809             :       handler->lmax_diff[1]++;
     810             :       handler->lmin_diff[0]--;
     811             :       handler->lmin_diff[1]--;
     812             :     }
     813             : 
     814             :     // Allocate pab matrix for re-use across tasks.
     815             :     initialize_tensor_2(&pab, ctx->maxco, ctx->maxco);
     816             :     alloc_tensor(&pab);
     817             : 
     818             :     initialize_tensor_2(&vab, ctx->maxco, ctx->maxco);
     819             :     alloc_tensor(&vab);
     820             : 
     821             :     initialize_tensor_2(&work, ctx->maxco, ctx->maxco);
     822             :     alloc_tensor(&work);
     823             : 
     824             :     initialize_tensor_2(&hab, ctx->maxco, ctx->maxco);
     825             :     alloc_tensor(&hab);
     826             : 
     827             :     if (calculate_forces) {
     828             :       initialize_tensor_2(&forces_local_, forces_->size[0], forces_->size[1]);
     829             :       alloc_tensor(&forces_local_);
     830             :       initialize_tensor_2(&virial_local_, 3, 3);
     831             :       alloc_tensor(&virial_local_);
     832             :       memset(forces_local_.data, 0, sizeof(double) * forces_local_.alloc_size_);
     833             :       memset(virial_local_.data, 0, sizeof(double) * virial_local_.alloc_size_);
     834             :       initialize_tensor_2(&forces_local_pair_, 2, 3);
     835             :       alloc_tensor(&forces_local_pair_);
     836             :       initialize_tensor_3(&virial_local_pair_, 2, 3, 3);
     837             :       alloc_tensor(&virial_local_pair_);
     838             :     }
     839             : 
     840             :     initialize_basis_vectors(handler, (const double(*)[3])grid->dh,
     841             :                              (const double(*)[3])grid->dh_inv);
     842             : 
     843             :     tensor_copy(&handler->grid, grid);
     844             :     handler->grid.data = grid->data;
     845             :     for (int d = 0; d < 3; d++)
     846             :       handler->orthogonal[d] = grid->orthogonal[d];
     847             : 
     848             :     /* it is only useful when we split the list over multiple threads. The
     849             :      * first iteration should load the block whatever status the
     850             :      * task->block_update_ variable has */
     851             :     const _task *prev_task = NULL;
     852             : #pragma omp for schedule(static)
     853             :     for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
     854             :       // Define some convenient aliases.
     855             :       const _task *task = &ctx->tasks[level][itask];
     856             : 
     857             :       if (task->level != level) {
     858             :         printf("level %d, %d\n", task->level, level);
     859             :         abort();
     860             :       }
     861             : 
     862             :       if (task->update_block_ || (prev_task == NULL)) {
     863             :         /* need to load pab if forces are needed */
     864             :         if (calculate_forces) {
     865             :           extract_blocks(ctx, task, pab_blocks, &work, &pab);
     866             :         }
     867             :         /* store the coefficients of the operator after rotation to
     868             :          * the spherical harmonic basis */
     869             : 
     870             :         rotate_and_store_coefficients(ctx, prev_task, task, &hab, &work,
     871             :                                       hab_block_local);
     872             : 
     873             :         /* There is a difference here between collocate and integrate.
     874             :          * For collocate, I do not need to know the task where blocks
     875             :          * have been updated the last time. For integrate this
     876             :          * information is crucial to update the coefficients of the
     877             :          * potential */
     878             :         prev_task = task;
     879             :         memset(hab.data, 0, sizeof(double) * hab.alloc_size_);
     880             :       }
     881             : 
     882             :       /* the grid is divided over several ranks or not periodic */
     883             :       if ((handler->grid.size[0] != handler->grid.full_size[0]) ||
     884             :           (handler->grid.size[1] != handler->grid.full_size[1]) ||
     885             :           (handler->grid.size[2] != handler->grid.full_size[2])) {
     886             :         /* unfortunately the window where the gaussian should be added depends
     887             :          * on the bonds. So I have to adjust the window all the time. */
     888             : 
     889             :         setup_grid_window(&handler->grid, shift_local, border_width,
     890             :                           task->border_mask);
     891             :       }
     892             : 
     893             :       int lmax[2] = {task->lmax[0] + handler->lmax_diff[0],
     894             :                      task->lmax[1] + handler->lmax_diff[1]};
     895             :       int lmin[2] = {task->lmin[0] + handler->lmin_diff[0],
     896             :                      task->lmin[1] + handler->lmin_diff[1]};
     897             : 
     898             :       lmin[0] = imax(lmin[0], 0);
     899             :       lmin[1] = imax(lmin[1], 0);
     900             : 
     901             :       initialize_tensor_4(&(handler->alpha), 3, lmax[1] + 1, lmax[0] + 1,
     902             :                           lmax[0] + lmax[1] + 1);
     903             :       realloc_tensor(&(handler->alpha));
     904             : 
     905             :       const int lp = lmax[0] + lmax[1];
     906             : 
     907             :       initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
     908             :       realloc_tensor(&(handler->coef));
     909             :       grid_integrate(handler, ctx->orthorhombic, task->zetp, task->rp,
     910             :                      task->radius);
     911             :       /*
     912             :               handler->coef contains coefficients in the (x - x_12) basis. now
     913             :               we need to rotate them in the (x - x_1) (x - x_2) basis
     914             :       */
     915             : 
     916             :       /* compute the transformation matrix */
     917             :       grid_prepare_alpha_dgemm(task->ra, task->rb, task->rp, lmax,
     918             :                                &(handler->alpha));
     919             : 
     920             :       initialize_tensor_2(&vab, ncoset(lmax[1]), ncoset(lmax[0]));
     921             :       realloc_tensor(&vab);
     922             : 
     923             :       grid_compute_vab(lmin, lmax, lp, task->prefactor,
     924             :                        &handler->alpha, // contains the matrix for the rotation
     925             :                        &handler->coef,  // contains the coefficients of
     926             :                        // the potential in the
     927             :                        // (x_x_12) basis
     928             :                        &vab); // contains the coefficients of the potential
     929             :       // in the (x - x_1)(x - x_2) basis
     930             : 
     931             :       if (calculate_forces) {
     932             :         memset(forces_local_pair_.data, 0,
     933             :                sizeof(double) * forces_local_pair_.alloc_size_);
     934             :         memset(virial_local_pair_.data, 0,
     935             :                sizeof(double) * virial_local_pair_.alloc_size_);
     936             :       }
     937             : 
     938             :       update_hab_forces_and_stress(
     939             :           task, &vab, &pab, calculate_tau, calculate_forces, calculate_virial,
     940             :           &forces_local_pair_, /* matrix
     941             :                                 * containing the
     942             :                                 * contribution of
     943             :                                 * the gaussian
     944             :                                 * pair for each
     945             :                                 * atom */
     946             :           &virial_local_pair_, /* same but for the virial term (stress tensor)
     947             :                                 */
     948             :           &hab);
     949             : 
     950             :       if (calculate_forces) {
     951             :         const double scaling = (task->iatom == task->jatom) ? 1.0 : 2.0;
     952             :         idx2(forces_local_, task->iatom, 0) +=
     953             :             scaling * idx2(forces_local_pair_, 0, 0);
     954             :         idx2(forces_local_, task->iatom, 1) +=
     955             :             scaling * idx2(forces_local_pair_, 0, 1);
     956             :         idx2(forces_local_, task->iatom, 2) +=
     957             :             scaling * idx2(forces_local_pair_, 0, 2);
     958             : 
     959             :         idx2(forces_local_, task->jatom, 0) +=
     960             :             scaling * idx2(forces_local_pair_, 1, 0);
     961             :         idx2(forces_local_, task->jatom, 1) +=
     962             :             scaling * idx2(forces_local_pair_, 1, 1);
     963             :         idx2(forces_local_, task->jatom, 2) +=
     964             :             scaling * idx2(forces_local_pair_, 1, 2);
     965             :         if (calculate_virial) {
     966             :           for (int i = 0; i < 3; i++) {
     967             :             for (int j = 0; j < 3; j++) {
     968             :               idx2(virial_local_, i, j) +=
     969             :                   scaling * (idx3(virial_local_pair_, 0, i, j) +
     970             :                              idx3(virial_local_pair_, 1, i, j));
     971             :             }
     972             :           }
     973             :         }
     974             :       }
     975             :     }
     976             : 
     977             :     rotate_and_store_coefficients(ctx, prev_task, NULL, &hab, &work,
     978             :                                   hab_block_local);
     979             : 
     980             :     // now reduction over the hab blocks
     981             :     if (num_threads > 1) {
     982             :       // does not store the number of elements but the amount of memory
     983             :       // occupied. That's a strange choice.
     984             :       const int hab_size = hab_blocks->size / sizeof(double);
     985             :       if ((hab_size / num_threads) >= 2) {
     986             :         const int block_size =
     987             :             hab_size / num_threads + (hab_size % num_threads);
     988             : 
     989             :         for (int bk = 0; bk < num_threads; bk++) {
     990             :           int bk_id = (bk + thread_id) % num_threads;
     991             :           size_t begin = bk_id * block_size;
     992             :           size_t end = imin((bk_id + 1) * block_size, hab_size);
     993             :           cblas_daxpy(end - begin, 1.0, hab_block_local + begin, 1,
     994             :                       hab_blocks->host_buffer + begin, 1);
     995             : #pragma omp barrier
     996             :         }
     997             :       } else {
     998             :         const int hab_size = hab_blocks->size / sizeof(double);
     999             : #pragma omp critical
    1000             :         cblas_daxpy(hab_size, 1.0, hab_block_local, 1, hab_blocks->host_buffer,
    1001             :                     1);
    1002             :       }
    1003             :     }
    1004             : 
    1005             :     if (calculate_forces) {
    1006             :       if (num_threads > 1) {
    1007             :         if ((forces_->alloc_size_ / num_threads) >= 2) {
    1008             :           const int block_size = forces_->alloc_size_ / num_threads +
    1009             :                                  (forces_->alloc_size_ % num_threads);
    1010             : 
    1011             :           for (int bk = 0; bk < num_threads; bk++) {
    1012             :             int bk_id = (bk + thread_id) % num_threads;
    1013             :             int begin = bk_id * block_size;
    1014             :             int end = imin((bk_id + 1) * block_size, forces_->alloc_size_);
    1015             :             cblas_daxpy(end - begin, 1.0, forces_local_.data + begin, 1,
    1016             :                         forces_->data + begin, 1);
    1017             : #pragma omp barrier
    1018             :           }
    1019             :         } else {
    1020             : #pragma omp critical
    1021             :           cblas_daxpy(forces_local_.alloc_size_, 1.0, forces_local_.data, 1,
    1022             :                       forces_->data, 1);
    1023             :         }
    1024             :       } else {
    1025             :         // we are running with OMP_NUM_THREADS=1
    1026             :         cblas_daxpy(forces_local_.alloc_size_, 1.0, forces_local_.data, 1,
    1027             :                     forces_->data, 1);
    1028             :       }
    1029             :     }
    1030             : 
    1031             :     if (calculate_virial) {
    1032             : #pragma omp critical
    1033             :       for (int i = 0; i < 3; i++) {
    1034             :         for (int j = 0; j < 3; j++) {
    1035             :           idx2(virial_[0], i, j) += idx2(virial_local_, i, j);
    1036             :         }
    1037             :       }
    1038             :     }
    1039             : 
    1040             :     handler->grid.data = NULL;
    1041             :     free(vab.data);
    1042             :     free(work.data);
    1043             :     free(pab.data);
    1044             :     free(hab.data);
    1045             : 
    1046             :     if (calculate_forces) {
    1047             :       free(forces_local_.data);
    1048             :       free(virial_local_.data);
    1049             :       free(virial_local_pair_.data);
    1050             :       free(forces_local_pair_.data);
    1051             :     }
    1052             :   }
    1053         624 : }
    1054             : 
    1055             : /*******************************************************************************
    1056             :  * \brief Integrate all tasks of in given list from given grids using matrix -
    1057             :  * matrix multiplication
    1058             :  ******************************************************************************/
    1059         156 : void grid_dgemm_integrate_task_list(
    1060             :     void *ptr, const bool compute_tau, const int natoms, const int nlevels,
    1061             :     const offload_buffer *const pab_blocks, offload_buffer *grids[nlevels],
    1062             :     offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
    1063             : 
    1064         156 :   grid_context *const ctx = (grid_context *)ptr;
    1065             : 
    1066         156 :   assert(ctx->checksum == ctx_checksum);
    1067         156 :   assert(ctx->nlevels == nlevels);
    1068         156 :   assert(ctx->natoms == natoms);
    1069             : 
    1070             :   // Zero result arrays.
    1071         156 :   memset(hab_blocks->host_buffer, 0, hab_blocks->size);
    1072             : 
    1073         156 :   const int max_threads = omp_get_max_threads();
    1074             : 
    1075         156 :   if (ctx->scratch == NULL)
    1076         156 :     ctx->scratch = malloc(hab_blocks->size * max_threads);
    1077             : 
    1078             :   // #pragma omp parallel for
    1079         780 :   for (int level = 0; level < ctx->nlevels; level++) {
    1080         624 :     const _layout *layout = &ctx->layouts[level];
    1081         624 :     set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
    1082         624 :                         layout->npts_global, layout->npts_local,
    1083         624 :                         layout->shift_local, layout->border_width, layout->dh,
    1084         624 :                         layout->dh_inv, grids[level]);
    1085         624 :     ctx->grid[level].data = grids[level]->host_buffer;
    1086             :   }
    1087             : 
    1088         156 :   bool calculate_virial = (virial != NULL);
    1089         156 :   bool calculate_forces = (forces != NULL);
    1090             : 
    1091         156 :   tensor forces_, virial_;
    1092         156 :   if (calculate_forces) {
    1093          20 :     initialize_tensor_2(&forces_, natoms, 3);
    1094          20 :     alloc_tensor(&forces_);
    1095          20 :     initialize_tensor_2(&virial_, 3, 3);
    1096          20 :     alloc_tensor(&virial_);
    1097          20 :     memset(forces_.data, 0, sizeof(double) * forces_.alloc_size_);
    1098          20 :     memset(virial_.data, 0, sizeof(double) * virial_.alloc_size_);
    1099             :   }
    1100             : 
    1101         780 :   for (int level = 0; level < ctx->nlevels; level++) {
    1102         624 :     const _layout *layout = &ctx->layouts[level];
    1103         624 :     integrate_one_grid_level_dgemm(ctx, level, compute_tau, calculate_forces,
    1104         624 :                                    calculate_virial, layout->shift_local,
    1105         624 :                                    layout->border_width, pab_blocks, hab_blocks,
    1106             :                                    &forces_, &virial_);
    1107         624 :     ctx->grid[level].data = NULL;
    1108             :   }
    1109         156 :   if (calculate_forces) {
    1110          20 :     if (calculate_virial) {
    1111          16 :       virial[0][0] = idx2(virial_, 0, 0);
    1112          16 :       virial[0][1] = idx2(virial_, 0, 1);
    1113          16 :       virial[0][2] = idx2(virial_, 0, 2);
    1114          16 :       virial[1][0] = idx2(virial_, 1, 0);
    1115          16 :       virial[1][1] = idx2(virial_, 1, 1);
    1116          16 :       virial[1][2] = idx2(virial_, 1, 2);
    1117          16 :       virial[2][0] = idx2(virial_, 2, 0);
    1118          16 :       virial[2][1] = idx2(virial_, 2, 1);
    1119          16 :       virial[2][2] = idx2(virial_, 2, 2);
    1120             :     }
    1121             : 
    1122          20 :     memcpy(forces[0], forces_.data, sizeof(double) * forces_.alloc_size_);
    1123          20 :     free(forces_.data);
    1124          20 :     free(virial_.data);
    1125             :   }
    1126             : 
    1127         156 :   free(ctx->scratch);
    1128         156 :   ctx->scratch = NULL;
    1129         156 : }

Generated by: LCOV version 1.15