LCOV - code coverage report
Current view: top level - src/grid/dgemm - grid_dgemm_collocate.c (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:42dac4a) Lines: 64.7 % 586 379
Test Date: 2025-07-25 12:55:17 Functions: 76.9 % 13 10

            Line data    Source code
       1              : /*----------------------------------------------------------------------------*/
       2              : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3              : /*  Copyright 2000-2025 CP2K developers group <https://cp2k.org>              */
       4              : /*                                                                            */
       5              : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6              : /*----------------------------------------------------------------------------*/
       7              : 
       8              : #include <assert.h>
       9              : #include <limits.h>
      10              : #include <math.h>
      11              : #include <omp.h>
      12              : #include <stdbool.h>
      13              : #include <stdio.h>
      14              : #include <stdlib.h>
      15              : #include <string.h>
      16              : 
      17              : #ifdef __LIBXSMM
      18              : #include <libxsmm.h>
      19              : #endif
      20              : 
      21              : #include "../common/grid_basis_set.h"
      22              : #include "../common/grid_common.h"
      23              : #include "../common/grid_constants.h"
      24              : #include "grid_dgemm_coefficients.h"
      25              : #include "grid_dgemm_collocate.h"
      26              : #include "grid_dgemm_collocation_integration.h"
      27              : #include "grid_dgemm_non_orthorombic_corrections.h"
      28              : #include "grid_dgemm_prepare_pab.h"
      29              : #include "grid_dgemm_private_header.h"
      30              : #include "grid_dgemm_task_list.h"
      31              : #include "grid_dgemm_tensor_local.h"
      32              : 
      33              : void collocate_l0(double *scratch, const double alpha, const bool orthogonal,
      34              :                   const struct tensor_ *exp_xy,
      35              :                   const struct tensor_ *p_alpha_beta_reduced_,
      36              :                   struct tensor_ *cube);
      37              : 
      38         5100 : void rotate_to_cartesian_harmonics(const grid_basis_set *ibasis,
      39              :                                    const grid_basis_set *jbasis,
      40              :                                    const int iatom, const int jatom,
      41              :                                    const int iset, const int jset,
      42              :                                    double *const block, tensor *work,
      43              :                                    tensor *pab) {
      44         5100 :   dgemm_params m1, m2;
      45         5100 :   memset(&m1, 0, sizeof(dgemm_params));
      46         5100 :   memset(&m2, 0, sizeof(dgemm_params));
      47              : 
      48              :   // Define some more convenient aliases.
      49         5100 :   const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
      50         5100 :   const int nsgf_setb = jbasis->nsgf_set[jset];
      51         5100 :   const int nsgfa = ibasis->nsgf; // size of entire spherical basis
      52         5100 :   const int nsgfb = jbasis->nsgf;
      53         5100 :   const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
      54         5100 :   const int sgfb = jbasis->first_sgf[jset] - 1;
      55         5100 :   const int maxcoa = ibasis->maxco;
      56         5100 :   const int maxcob = jbasis->maxco;
      57         5100 :   const int ncoseta = ncoset(ibasis->lmax[iset]);
      58         5100 :   const int ncosetb = ncoset(jbasis->lmax[jset]);
      59         5100 :   const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
      60         5100 :   const int ncob = jbasis->npgf[jset] * ncosetb;
      61              : 
      62         5100 :   initialize_tensor_2(work, nsgf_setb, ncoa);
      63         5100 :   realloc_tensor(work);
      64              : 
      65         5100 :   initialize_tensor_2(pab, ncob, ncoa);
      66         5100 :   realloc_tensor(pab);
      67              : 
      68              :   // the rotations happen here.
      69         5100 :   if (iatom <= jatom) {
      70         3388 :     m1.op1 = 'N';
      71         3388 :     m1.op2 = 'N';
      72         3388 :     m1.m = work->size[0];
      73         3388 :     m1.n = work->size[1];
      74         3388 :     m1.k = nsgf_seta;
      75         3388 :     m1.alpha = 1.0;
      76         3388 :     m1.beta = 0.0;
      77         3388 :     m1.a = block + sgfb * nsgfa + sgfa;
      78         3388 :     m1.lda = nsgfa;
      79         3388 :     m1.b = &ibasis->sphi[sgfa * maxcoa];
      80         3388 :     m1.ldb = maxcoa;
      81         3388 :     m1.c = work->data;
      82         3388 :     m1.ldc = work->ld_;
      83              :   } else {
      84         1712 :     m1.op1 = 'T';
      85         1712 :     m1.op2 = 'N';
      86         1712 :     m1.m = work->size[0];
      87         1712 :     m1.n = work->size[1];
      88         1712 :     m1.k = nsgf_seta;
      89         1712 :     m1.alpha = 1.0;
      90         1712 :     m1.beta = 0.0;
      91         1712 :     m1.a = block + sgfa * nsgfb + sgfb;
      92         1712 :     m1.lda = nsgfb;
      93         1712 :     m1.b = &ibasis->sphi[sgfa * maxcoa];
      94         1712 :     m1.ldb = maxcoa;
      95         1712 :     m1.c = work->data;
      96         1712 :     m1.ldc = work->ld_;
      97              :   }
      98         5100 :   m1.use_libxsmm = true;
      99         5100 :   dgemm_simplified(&m1);
     100              : 
     101         5100 :   m2.op1 = 'T';
     102         5100 :   m2.op2 = 'N';
     103         5100 :   m2.m = pab->size[0];
     104         5100 :   m2.n = pab->size[1];
     105         5100 :   m2.k = work->size[0];
     106         5100 :   m2.alpha = 1.0;
     107         5100 :   m2.beta = 0.0;
     108         5100 :   m2.a = &jbasis->sphi[sgfb * maxcob];
     109         5100 :   m2.lda = maxcob;
     110         5100 :   m2.b = work->data;
     111         5100 :   m2.ldb = work->ld_;
     112         5100 :   m2.c = pab->data;
     113         5100 :   m2.ldc = pab->ld_;
     114         5100 :   m2.use_libxsmm = true;
     115         5100 :   dgemm_simplified(&m2);
     116         5100 : }
     117              : 
     118              : void tensor_reduction_for_collocate_integrate(
     119              :     double *scratch, const double alpha, const bool *const orthogonal,
     120              :     const struct tensor_ *Exp, const struct tensor_ *co,
     121              :     const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube);
     122              : 
     123              : /* compute the functions (x - x_i)^l exp (-eta (x - x_i)^2) for l = 0..lp using
     124              :  * a recursive relation to avoid computing the exponential on each grid point. I
     125              :  * think it is not really necessary anymore since it is *not* the dominating
     126              :  * contribution to computation of collocate and integrate */
     127              : 
     128       271002 : void grid_fill_pol_dgemm(const bool transpose, const double dr,
     129              :                          const double roffset, const int pol_offset,
     130              :                          const int xmin, const int xmax, const int lp,
     131              :                          const int cmax, const double zetp, double *pol_) {
     132       271002 :   tensor pol;
     133       271002 :   const double t_exp_1 = exp(-zetp * dr * dr);
     134       271002 :   const double t_exp_2 = t_exp_1 * t_exp_1;
     135              : 
     136       271002 :   double t_exp_min_1 = exp(-zetp * (dr - roffset) * (dr - roffset));
     137       271002 :   double t_exp_min_2 = exp(2.0 * zetp * (dr - roffset) * dr);
     138              : 
     139       271002 :   double t_exp_plus_1 = exp(-zetp * roffset * roffset);
     140       271002 :   double t_exp_plus_2 = exp(2.0 * zetp * roffset * dr);
     141              : 
     142       271002 :   if (transpose) {
     143       105327 :     initialize_tensor_2(&pol, cmax, lp + 1);
     144       105327 :     pol.data = pol_;
     145              :     /* It is original Ole code. I need to transpose the polynomials for the
     146              :      * integration routine and Ole code already does it. */
     147      1425952 :     for (int ig = 0; ig >= xmin; ig--) {
     148      1320625 :       const double rpg = ig * dr - roffset;
     149      1320625 :       t_exp_min_1 *= t_exp_min_2 * t_exp_1;
     150      1320625 :       t_exp_min_2 *= t_exp_2;
     151      1320625 :       double pg = t_exp_min_1;
     152      5409677 :       for (int icoef = 0; icoef <= lp; icoef++) {
     153      4089052 :         idx2(pol, pol_offset + ig - xmin, icoef) = pg;
     154      4089052 :         pg *= rpg;
     155              :       }
     156              :     }
     157              : 
     158       105327 :     double t_exp_plus_1 = exp(-zetp * roffset * roffset);
     159       105327 :     double t_exp_plus_2 = exp(2 * zetp * roffset * dr);
     160      1369960 :     for (int ig = 1; ig <= xmax; ig++) {
     161      1264633 :       const double rpg = ig * dr - roffset;
     162      1264633 :       t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
     163      1264633 :       t_exp_plus_2 *= t_exp_2;
     164      1264633 :       double pg = t_exp_plus_1;
     165      5181233 :       for (int icoef = 0; icoef <= lp; icoef++) {
     166      3916600 :         idx2(pol, pol_offset + ig - xmin, icoef) = pg;
     167      3916600 :         pg *= rpg;
     168              :       }
     169              :     }
     170              : 
     171              :   } else {
     172       165675 :     initialize_tensor_2(&pol, lp + 1, cmax);
     173       165675 :     pol.data = pol_;
     174              :     /* memset(pol.data, 0, sizeof(double) * pol.alloc_size_); */
     175              :     /*
     176              :      *   compute the values of all (x-xp)**lp*exp(..)
     177              :      *
     178              :      *  still requires the old trick:
     179              :      *  new trick to avoid to many exps (reuse the result from the previous
     180              :      * gridpoint): exp( -a*(x+d)**2)=exp(-a*x**2)*exp(-2*a*x*d)*exp(-a*d**2)
     181              :      *  exp(-2*a*(x+d)*d)=exp(-2*a*x*d)*exp(-2*a*d**2)
     182              :      */
     183              : 
     184              :     /* compute the exponential recursively and store the polynomial prefactors
     185              :      * as well */
     186      2219700 :     for (int ig = 0; ig >= xmin; ig--) {
     187      2054025 :       const double rpg = ig * dr - roffset;
     188      2054025 :       t_exp_min_1 *= t_exp_min_2 * t_exp_1;
     189      2054025 :       t_exp_min_2 *= t_exp_2;
     190      2054025 :       double pg = t_exp_min_1;
     191      2054025 :       idx2(pol, 0, pol_offset + ig - xmin) = pg;
     192      2054025 :       if (lp > 0)
     193      1326381 :         idx2(pol, 1, pol_offset + ig - xmin) = rpg;
     194              :     }
     195              : 
     196      2131374 :     for (int ig = 1; ig <= xmax; ig++) {
     197      1965699 :       const double rpg = ig * dr - roffset;
     198      1965699 :       t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
     199      1965699 :       t_exp_plus_2 *= t_exp_2;
     200      1965699 :       double pg = t_exp_plus_1;
     201      1965699 :       idx2(pol, 0, pol_offset + ig - xmin) = pg;
     202      1965699 :       if (lp > 0)
     203      1267629 :         idx2(pol, 1, pol_offset + ig - xmin) = rpg;
     204              :     }
     205              : 
     206              :     /* compute the remaining powers using previously computed stuff */
     207       165675 :     if (lp >= 2) {
     208        61209 :       double *__restrict__ poly = &idx2(pol, 1, 0);
     209        61209 :       double *__restrict__ src1 = &idx2(pol, 0, 0);
     210        61209 :       double *__restrict__ dst = &idx2(pol, 2, 0);
     211              :       // #pragma omp simd linear(dst, src1, poly) simdlen(8)
     212       240150 :       GRID_PRAGMA_SIMD((dst, src1, poly), 8)
     213        61209 :       for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++)
     214      1506336 :         dst[ig] = src1[ig] * poly[ig] * poly[ig];
     215              :     }
     216              : 
     217       178941 :     for (int icoef = 3; icoef <= lp; icoef++) {
     218        13266 :       double *__restrict__ poly = &idx2(pol, 1, 0);
     219        13266 :       double *__restrict__ src1 = &idx2(pol, icoef - 1, 0);
     220        13266 :       double *__restrict__ dst = &idx2(pol, icoef, 0);
     221        13266 :       GRID_PRAGMA_SIMD((dst, src1, poly), 8)
     222        13266 :       for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++) {
     223       372702 :         dst[ig] = src1[ig] * poly[ig];
     224              :       }
     225              :     }
     226              : 
     227              :     //
     228       165675 :     if (lp > 0) {
     229              :       // I can not declare src__ variable constant because it breaks openmp
     230              :       // standard.
     231       106155 :       double *__restrict__ dst = &idx2(pol, 1, 0);
     232       106155 :       double *__restrict__ src = &idx2(pol, 0, 0);
     233       377157 :       GRID_PRAGMA_SIMD((dst, src), 8)
     234       106155 :       for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++) {
     235      2594010 :         dst[ig] *= src[ig];
     236              :       }
     237              :     }
     238              :   }
     239       271002 : }
     240              : 
     241            0 : void apply_sphere_cutoff_ortho(struct collocation_integration_ *const handler,
     242              :                                const double disr_radius, const int cmax,
     243              :                                const int *const lb_cube,
     244              :                                const int *const cube_center) {
     245              :   // a mapping so that the ig corresponds to the right grid point
     246            0 :   int **map = handler->map;
     247            0 :   map[1] = map[0] + 2 * cmax + 1;
     248            0 :   map[2] = map[1] + 2 * cmax + 1;
     249              :   // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
     250              : 
     251            0 :   for (int i = 0; i < 3; i++) {
     252            0 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     253            0 :       map[i][ig] = modulo(cube_center[i] + lb_cube[i] + ig -
     254            0 :                               handler->grid.lower_corner[i],
     255              :                           handler->grid.full_size[i]);
     256              :     }
     257              :   }
     258              : 
     259            0 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     260            0 :                             .xmax = handler->grid.window_size[0]};
     261            0 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     262            0 :                             .xmax = handler->grid.window_size[1]};
     263            0 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     264            0 :                             .xmax = handler->grid.window_size[2]};
     265              : 
     266            0 :   for (int kg = 0; kg < handler->cube.size[0]; kg++) {
     267            0 :     const int k = map[0][kg];
     268            0 :     const int kd =
     269            0 :         (2 * (kg + lb_cube[0]) - 1) / 2; // distance from center in grid points
     270            0 :     const double kr = kd * handler->dh[2][2]; // distance from center in a.u.
     271            0 :     const double kremain = disr_radius * disr_radius - kr * kr;
     272              : 
     273            0 :     if ((kremain >= 0.0) && is_point_in_interval(k, zwindow)) {
     274              : 
     275            0 :       const int jgmin = ceil(-1e-8 - sqrt(kremain) * handler->dh_inv[1][1]);
     276            0 :       for (int jg = jgmin; jg <= (1 - jgmin); jg++) {
     277            0 :         const int j = map[1][jg - lb_cube[1]];
     278            0 :         const double jr = ((2 * jg - 1) >> 1) *
     279            0 :                           handler->dh[1][1]; // distance from center in a.u.
     280            0 :         const double jremain = kremain - jr * jr;
     281              : 
     282            0 :         if ((jremain >= 0.0) && is_point_in_interval(j, ywindow)) {
     283            0 :           const int xmin = ceil(-1e-8 - sqrt(jremain) * handler->dh_inv[0][0]);
     284            0 :           const int xmax = 1 - xmin;
     285              : 
     286              :           // printf("xmin %d, xmax %d\n", xmin, xmax);
     287            0 :           for (int x = xmin - lb_cube[2];
     288            0 :                x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
     289            0 :             const int x1 = map[2][x];
     290              : 
     291            0 :             if (!is_point_in_interval(x1, xwindow))
     292            0 :               continue;
     293              : 
     294            0 :             int lower_corner[3] = {k, j, x1};
     295            0 :             int upper_corner[3] = {k + 1, j + 1, x1 + 1};
     296              : 
     297            0 :             compute_interval(map[2], handler->grid.full_size[2],
     298              :                              handler->grid.size[2], xmax - lb_cube[2], x1, &x,
     299              :                              lower_corner + 2, upper_corner + 2, xwindow);
     300              : 
     301            0 :             if (upper_corner[2] - lower_corner[2]) {
     302            0 :               const int position1[3] = {kg, jg - lb_cube[1], x};
     303              : 
     304            0 :               double *restrict dst = &idx3(handler->grid, lower_corner[0],
     305              :                                            lower_corner[1], lower_corner[2]);
     306            0 :               double *restrict src = &idx3(handler->cube, position1[0],
     307              :                                            position1[1], position1[2]);
     308              : 
     309            0 :               const int sizex = upper_corner[2] - lower_corner[2];
     310            0 :               GRID_PRAGMA_SIMD((dst, src), 8)
     311            0 :               for (int x = 0; x < sizex; x++) {
     312            0 :                 dst[x] += src[x];
     313              :               }
     314              :             }
     315              : 
     316            0 :             if (handler->grid.size[2] == handler->grid.full_size[2])
     317            0 :               update_loop_index(handler->grid.full_size[2], x1, &x);
     318              :             else
     319            0 :               x += upper_corner[2] - lower_corner[2] - 1;
     320              :           }
     321              :         }
     322              :       }
     323              :     }
     324              :   }
     325            0 : }
     326              : 
     327            0 : void apply_spherical_cutoff_generic(
     328              :     struct collocation_integration_ *const handler, const double disr_radius,
     329              :     const int cmax, const int *const lb_cube, const int *const ub_cube,
     330              :     const double *const roffset, const int *const cube_center) {
     331              : 
     332            0 :   const double a = handler->dh[0][0] * handler->dh[0][0] +
     333            0 :                    handler->dh[0][1] * handler->dh[0][1] +
     334            0 :                    handler->dh[0][2] * handler->dh[0][2];
     335            0 :   const double a_inv = 1.0 / a;
     336              :   // a mapping so that the ig corresponds to the right grid point
     337            0 :   int **map = handler->map;
     338            0 :   map[1] = map[0] + 2 * cmax + 1;
     339            0 :   map[2] = map[1] + 2 * cmax + 1;
     340              :   // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
     341              : 
     342            0 :   for (int i = 0; i < 3; i++) {
     343            0 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     344            0 :       map[i][ig] = modulo(cube_center[i] + ig + lb_cube[i] -
     345            0 :                               handler->grid.lower_corner[i],
     346              :                           handler->grid.full_size[i]);
     347              :     }
     348              :   }
     349              : 
     350            0 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     351            0 :                             .xmax = handler->grid.window_size[0]};
     352            0 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     353            0 :                             .xmax = handler->grid.window_size[1]};
     354            0 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     355            0 :                             .xmax = handler->grid.window_size[2]};
     356              : 
     357            0 :   for (int k = 0; k < handler->cube.size[0]; k++) {
     358            0 :     const int iz = map[0][k];
     359              : 
     360            0 :     if (!is_point_in_interval(iz, zwindow))
     361            0 :       continue;
     362              : 
     363            0 :     const double tz = (k + lb_cube[0] - roffset[0]);
     364            0 :     const double z[3] = {tz * handler->dh[2][0], tz * handler->dh[2][1],
     365            0 :                          tz * handler->dh[2][2]};
     366              : 
     367            0 :     for (int j = 0; j < handler->cube.size[1]; j++) {
     368            0 :       const int iy = map[1][j];
     369              : 
     370            0 :       if (!is_point_in_interval(iy, ywindow))
     371            0 :         continue;
     372              : 
     373            0 :       const double ty = (j - roffset[1] + lb_cube[1]);
     374            0 :       const double y[3] = {z[0] + ty * handler->dh[1][0],
     375            0 :                            z[1] + ty * handler->dh[1][1],
     376            0 :                            z[2] + ty * handler->dh[1][2]};
     377              : 
     378              :       /* Sqrt[(-2 x1 \[Alpha] - 2 y1 \[Beta] - 2 z1 \[Gamma])^2 - */
     379              :       /*                                            4 (x1^2 + y1^2 + z1^2)
     380              :        * (\[Alpha]^2 + \[Beta]^2 + \[Gamma]^2)] */
     381              : 
     382            0 :       const double b =
     383            0 :           -2.0 * (handler->dh[0][0] * (roffset[2] * handler->dh[0][0] - y[0]) +
     384            0 :                   handler->dh[0][1] * (roffset[2] * handler->dh[0][1] - y[1]) +
     385            0 :                   handler->dh[0][2] * (roffset[2] * handler->dh[0][2] - y[2]));
     386              : 
     387            0 :       const double c = (roffset[2] * handler->dh[0][0] - y[0]) *
     388            0 :                            (roffset[2] * handler->dh[0][0] - y[0]) +
     389            0 :                        (roffset[2] * handler->dh[0][1] - y[1]) *
     390            0 :                            (roffset[2] * handler->dh[0][1] - y[1]) +
     391            0 :                        (roffset[2] * handler->dh[0][2] - y[2]) *
     392              :                            (roffset[2] * handler->dh[0][2] - y[2]) -
     393            0 :                        disr_radius * disr_radius;
     394              : 
     395            0 :       double delta = b * b - 4.0 * a * c;
     396              : 
     397            0 :       if (delta < 0.0)
     398            0 :         continue;
     399              : 
     400            0 :       delta = sqrt(delta);
     401              : 
     402            0 :       const int xmin = imax(ceil((-b - delta) * 0.5 * a_inv), lb_cube[2]);
     403            0 :       const int xmax = imin(floor((-b + delta) * 0.5 * a_inv), ub_cube[2]);
     404              : 
     405            0 :       int lower_corner[3] = {iz, iy, xmin};
     406            0 :       int upper_corner[3] = {iz + 1, iy + 1, xmin};
     407              : 
     408            0 :       for (int x = xmin - lb_cube[2];
     409            0 :            x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
     410            0 :         const int x1 = map[2][x];
     411              : 
     412            0 :         if (!is_point_in_interval(x1, xwindow))
     413            0 :           continue;
     414              : 
     415            0 :         compute_interval(map[2], handler->grid.full_size[2],
     416              :                          handler->grid.size[2], xmax - lb_cube[2], x1, &x,
     417              :                          lower_corner + 2, upper_corner + 2, xwindow);
     418              : 
     419            0 :         if (upper_corner[2] - lower_corner[2]) {
     420            0 :           const int position1[3] = {k, j, x};
     421              : 
     422              :           /* the function will internally take care of the local vs global
     423              :            * grid */
     424              : 
     425            0 :           double *__restrict__ dst = &idx3(handler->grid, lower_corner[0],
     426              :                                            lower_corner[1], lower_corner[2]);
     427            0 :           double *__restrict__ src =
     428            0 :               &idx3(handler->cube, position1[0], position1[1], position1[2]);
     429              : 
     430            0 :           const int sizex = upper_corner[2] - lower_corner[2];
     431            0 :           GRID_PRAGMA_SIMD((dst, src), 8)
     432            0 :           for (int x = 0; x < sizex; x++) {
     433            0 :             dst[x] += src[x];
     434              :           }
     435              : 
     436            0 :           if (handler->grid.size[0] == handler->grid.full_size[0])
     437            0 :             update_loop_index(handler->grid.full_size[2], x1, &x);
     438              :           else
     439            0 :             x += upper_corner[2] - lower_corner[2] - 1;
     440              :         }
     441              :       }
     442              :     }
     443              :   }
     444            0 : }
     445              : 
     446        10560 : void collocate_l0(double *scratch, const double alpha, const bool orthogonal_xy,
     447              :                   const struct tensor_ *exp_xy,
     448              :                   const struct tensor_ *p_alpha_beta_reduced_,
     449              :                   struct tensor_ *cube) {
     450        10560 :   const double *__restrict pz =
     451              :       &idx3(p_alpha_beta_reduced_[0], 0, 0, 0); /* k indice */
     452        10560 :   const double *__restrict py =
     453        10560 :       &idx3(p_alpha_beta_reduced_[0], 1, 0, 0); /* j indice */
     454        10560 :   const double *__restrict px =
     455        10560 :       &idx3(p_alpha_beta_reduced_[0], 2, 0, 0); /* i indice */
     456              : 
     457        10560 :   memset(&idx3(cube[0], 0, 0, 0), 0, sizeof(double) * cube->alloc_size_);
     458        10560 :   memset(scratch, 0, sizeof(double) * cube->size[1] * cube->ld_);
     459              : 
     460        10560 :   cblas_dger(CblasRowMajor, cube->size[1], cube->size[2], alpha, py, 1, px, 1,
     461              :              scratch, cube->ld_);
     462              : 
     463        10560 :   if (exp_xy && !orthogonal_xy) {
     464            0 :     for (int y = 0; y < cube->size[1]; y++) {
     465            0 :       const double *__restrict src = &idx2(exp_xy[0], y, 0);
     466            0 :       double *__restrict dst = &scratch[y * cube->ld_];
     467              :       // #pragma omp simd linear(dst, src) simdlen(8)
     468            0 :       GRID_PRAGMA_SIMD((dst, src), 8)
     469            0 :       for (int x = 0; x < cube->size[2]; x++) {
     470            0 :         dst[x] *= src[x];
     471              :       }
     472              :     }
     473              :   }
     474              : 
     475        10560 :   cblas_dger(CblasRowMajor, cube->size[0], cube->size[1] * cube->ld_, 1.0, pz,
     476        10560 :              1, scratch, 1, &idx3(cube[0], 0, 0, 0), cube->size[1] * cube->ld_);
     477        10560 : }
     478              : 
     479              : /* compute the following operation (variant) it is a tensor contraction
     480              : 
     481              :                                  V_{kji} = \sum_{\alpha\beta\gamma}
     482              :    C_{\alpha\gamma\beta} T_{2,\alpha,i} T_{1,\beta,j} T_{0,\gamma,k}
     483              : 
     484              : */
     485        81054 : void tensor_reduction_for_collocate_integrate(
     486              :     double *scratch, const double alpha, const bool *const orthogonal,
     487              :     const struct tensor_ *Exp, const struct tensor_ *co,
     488              :     const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube) {
     489        81054 :   if (co->size[0] > 1) {
     490        70494 :     dgemm_params m1, m2, m3;
     491              : 
     492        70494 :     memset(&m1, 0, sizeof(dgemm_params));
     493        70494 :     memset(&m2, 0, sizeof(dgemm_params));
     494        70494 :     memset(&m3, 0, sizeof(dgemm_params));
     495        70494 :     tensor T;
     496        70494 :     tensor W;
     497              : 
     498        70494 :     double *__restrict const pz =
     499              :         &idx3(p_alpha_beta_reduced_[0], 0, 0, 0); /* k indice */
     500        70494 :     double *__restrict const py =
     501        70494 :         &idx3(p_alpha_beta_reduced_[0], 1, 0, 0); /* j indice */
     502        70494 :     double *__restrict const px =
     503        70494 :         &idx3(p_alpha_beta_reduced_[0], 2, 0, 0); /* i indice */
     504              : 
     505        70494 :     initialize_tensor_3(&T, co->size[0] /* alpha */, co->size[1] /* gamma */,
     506              :                         cube->size[1] /* j */);
     507        70494 :     initialize_tensor_3(&W, co->size[1] /* gamma */, cube->size[1] /* j */,
     508              :                         cube->size[2] /* i */);
     509              : 
     510        70494 :     T.data = scratch;
     511        70494 :     W.data = scratch + T.alloc_size_;
     512              :     /* WARNING we are in row major layout. cblas allows it and it is more
     513              :      * natural to read left to right than top to bottom
     514              :      *
     515              :      * we do first T_{\alpha,\gamma,j} = \sum_beta C_{alpha\gamma\beta}
     516              :      * Y_{\beta, j}
     517              :      *
     518              :      * keep in mind that Y_{\beta, j} = p_alpha_beta_reduced(1, \beta, j)
     519              :      * and the order of indices is also important. the last indice is the
     520              :      * fastest one. it can be done with one dgemm.
     521              :      */
     522              : 
     523        70494 :     m1.op1 = 'N';
     524        70494 :     m1.op2 = 'N';
     525        70494 :     m1.alpha = alpha;
     526        70494 :     m1.beta = 0.0;
     527        70494 :     m1.m = co->size[0] * co->size[1]; /* alpha gamma */
     528        70494 :     m1.n = cube->size[1];             /* j */
     529        70494 :     m1.k = co->size[2];               /* beta */
     530        70494 :     m1.a = co->data;                  // Coef_{alpha,gamma,beta} Coef_xzy
     531        70494 :     m1.lda = co->ld_;
     532        70494 :     m1.b = py; // Y_{beta, j} = p_alpha_beta_reduced(1, beta, j)
     533        70494 :     m1.ldb = p_alpha_beta_reduced_->ld_;
     534        70494 :     m1.c = T.data; // T_{\alpha, \gamma, j} = T(alpha, gamma, j)
     535        70494 :     m1.ldc = T.ld_;
     536        70494 :     m1.use_libxsmm = true;
     537              :     /*
     538              :      * the next step is a reduction along the alpha index.
     539              :      *
     540              :      * We compute then
     541              :      *
     542              :      * W_{gamma, j, i} = sum_{\alpha} T_{\gamma, j, alpha} X_{\alpha, i}
     543              :      *
     544              :      * which means we need to transpose T_{\alpha, \gamma, j} to get
     545              :      * T_{\gamma, j, \alpha}. Fortunately we can do it while doing the
     546              :      * matrix - matrix multiplication
     547              :      */
     548              : 
     549        70494 :     m2.op1 = 'T';
     550        70494 :     m2.op2 = 'N';
     551        70494 :     m2.alpha = 1.0;
     552        70494 :     m2.beta = 0.0;
     553        70494 :     m2.m = cube->size[1] * co->size[1]; // (\gamma j) direction
     554        70494 :     m2.n = cube->size[2];               // i
     555        70494 :     m2.k = co->size[0];                 // alpha
     556        70494 :     m2.a = T.data;                      // T_{\alpha, \gamma, j}
     557        70494 :     m2.lda = T.ld_ * co->size[1];
     558        70494 :     m2.b = px; // X_{alpha, i}  = p_alpha_beta_reduced(0, alpha, i)
     559        70494 :     m2.ldb = p_alpha_beta_reduced_->ld_;
     560        70494 :     m2.c = W.data; // W_{\gamma, j, i}
     561        70494 :     m2.ldc = W.ld_;
     562        70494 :     m2.use_libxsmm = true;
     563              :     /* the final step is again a reduction along the gamma indice. It can
     564              :      * again be done with one dgemm. The operation is simply
     565              :      *
     566              :      * Cube_{k, j, i} = \sum_{alpha} Z_{k, \gamma} W_{\gamma, j, i}
     567              :      *
     568              :      * which means we need to transpose Z_{\gamma, k}.
     569              :      */
     570              : 
     571        70494 :     m3.op1 = 'T';
     572        70494 :     m3.op2 = 'N';
     573        70494 :     m3.alpha = alpha;
     574        70494 :     m3.beta = 0.0;
     575        70494 :     m3.m = cube->size[0];                 // Z_{k \gamma}
     576        70494 :     m3.n = cube->size[1] * cube->size[2]; // (ji) direction
     577        70494 :     m3.k = co->size[1];                   // \gamma
     578        70494 :     m3.a = pz;                            // p_alpha_beta_reduced(0, gamma, i)
     579        70494 :     m3.lda = p_alpha_beta_reduced_->ld_;
     580        70494 :     m3.b = &idx3(W, 0, 0, 0); // W_{\gamma, j, i}
     581        70494 :     m3.ldb = W.size[1] * W.ld_;
     582        70494 :     m3.c = &idx3(cube[0], 0, 0, 0); // cube_{kji}
     583        70494 :     m3.ldc = cube->ld_ * cube->size[1];
     584        70494 :     m3.use_libxsmm = true;
     585        70494 :     dgemm_simplified(&m1);
     586        70494 :     dgemm_simplified(&m2);
     587              : 
     588              :     // apply the non orthorombic corrections in the xy plane
     589        70494 :     if (Exp && !orthogonal[2]) {
     590         4668 :       tensor exp_xy;
     591         4668 :       initialize_tensor_2(&exp_xy, Exp->size[1], Exp->size[2]);
     592         4668 :       exp_xy.data = &idx3(Exp[0], 2, 0, 0);
     593         4668 :       apply_non_orthorombic_corrections_xy_blocked(&exp_xy, &W);
     594              :     }
     595              : 
     596        70494 :     dgemm_simplified(&m3);
     597              :   } else {
     598        10560 :     if (Exp && !orthogonal[2]) {
     599            0 :       tensor exp_xy;
     600            0 :       initialize_tensor_2(&exp_xy, Exp->size[1], Exp->size[2]);
     601              : 
     602            0 :       exp_xy.data = &idx3(Exp[0], 2, 0, 0);
     603            0 :       collocate_l0(scratch, co->data[0] * alpha, orthogonal[2], &exp_xy,
     604              :                    p_alpha_beta_reduced_, cube);
     605              :     } else {
     606        10560 :       collocate_l0(scratch, co->data[0] * alpha, true, NULL,
     607              :                    p_alpha_beta_reduced_, cube);
     608              :     }
     609              :   }
     610              : 
     611        81054 :   if (Exp == NULL)
     612              :     return;
     613              : 
     614        45945 :   if (Exp && (!orthogonal[0] && !orthogonal[1])) {
     615            0 :     tensor exp_yz;
     616            0 :     tensor exp_xz;
     617            0 :     initialize_tensor_2(&exp_xz, Exp->size[1], Exp->size[2]);
     618            0 :     initialize_tensor_2(&exp_yz, Exp->size[1], Exp->size[2]);
     619            0 :     exp_xz.data = &idx3(Exp[0], 0, 0, 0);
     620            0 :     exp_yz.data = &idx3(Exp[0], 1, 0, 0);
     621            0 :     apply_non_orthorombic_corrections_xz_yz_blocked(&exp_xz, &exp_yz, cube);
     622            0 :     return;
     623              :   }
     624              : 
     625        45945 :   if (Exp && (!orthogonal[0] || !orthogonal[1])) {
     626        20163 :     if (!orthogonal[0]) {
     627        20163 :       tensor exp_xz;
     628        20163 :       initialize_tensor_2(&exp_xz, Exp->size[1], Exp->size[2]);
     629        20163 :       exp_xz.data = &idx3(Exp[0], 0, 0, 0);
     630        20163 :       apply_non_orthorombic_corrections_xz_blocked(&exp_xz, cube);
     631              :     }
     632              : 
     633        20163 :     if (!orthogonal[1]) {
     634            0 :       tensor exp_yz;
     635            0 :       initialize_tensor_2(&exp_yz, Exp->size[1], Exp->size[2]);
     636            0 :       exp_yz.data = &idx3(Exp[0], 1, 0, 0);
     637            0 :       apply_non_orthorombic_corrections_yz_blocked(&exp_yz, cube);
     638              :     }
     639              :   }
     640              : 
     641              :   return;
     642              : }
     643              : 
     644              : /* It is a sub-optimal version of the mapping in case of a cubic cutoff. But is
     645              :  * very general and does not depend on the orthorombic nature of the grid. for
     646              :  * orthorombic cases, it is faster to apply PCB directly on the polynomials. */
     647              : 
     648        45945 : void apply_mapping_cubic(struct collocation_integration_ *handler,
     649              :                          const int cmax, const int *const lower_boundaries_cube,
     650              :                          const int *const cube_center) {
     651              : 
     652              :   // a mapping so that the ig corresponds to the right grid point
     653        45945 :   int **map = handler->map;
     654        45945 :   map[1] = map[0] + 2 * cmax + 1;
     655        45945 :   map[2] = map[1] + 2 * cmax + 1;
     656              : 
     657       183780 :   for (int i = 0; i < 3; i++) {
     658      3490684 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     659      3352849 :       map[i][ig] = modulo(cube_center[i] + ig + lower_boundaries_cube[i] -
     660      3352849 :                               handler->grid.lower_corner[i],
     661              :                           handler->grid.full_size[i]);
     662              :     }
     663              :   }
     664              : 
     665        45945 :   int lower_corner[3];
     666        45945 :   int upper_corner[3];
     667              : 
     668        45945 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     669        45945 :                             .xmax = handler->grid.window_size[0]};
     670        45945 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     671        45945 :                             .xmax = handler->grid.window_size[1]};
     672        45945 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     673        45945 :                             .xmax = handler->grid.window_size[2]};
     674              : 
     675              :   /* this code makes a decomposition of the cube such that we can add block of
     676              :    * datas in a vectorized way. */
     677              : 
     678              :   /* the decomposition depends unfortunately on the way the grid is split over
     679              :    * mpi ranks. If each mpi rank has the full grid then it is simple. A 1D
     680              :    * example of the decomposition will explain it better. We have an interval
     681              :    * [x1, x1 + cube_size - 1] (and a second index x [0, cube_size -1]) and a
     682              :    * grid that goes from [0.. grid_size - 1].
     683              :    *
     684              :    * We start from x1 and compute the largest interval [x1.. x1 + diff] that fit
     685              :    * to [0.. grid_size - 1]. Computing the difference diff is simply
     686              :    * min(grid_size - x1, cube_size - x). then we add the result in a vectorized
     687              :    * fashion. we itterate the processus by reducing the interval [x1, x1 +
     688              :    * cube_size - 1] until it is empty. */
     689              : 
     690       134397 :   for (int z = 0; (z < handler->cube.size[0]); z++) {
     691        88452 :     const int z1 = map[0][z];
     692              : 
     693       176904 :     if (!is_point_in_interval(z1, zwindow))
     694            0 :       continue;
     695              : 
     696        88452 :     compute_interval(map[0], handler->grid.full_size[0], handler->grid.size[0],
     697              :                      handler->cube.size[0], z1, &z, lower_corner, upper_corner,
     698              :                      zwindow);
     699              : 
     700              :     /* // We have a full plane. */
     701        88452 :     if (upper_corner[0] - lower_corner[0]) {
     702       273144 :       for (int y = 0; y < handler->cube.size[1]; y++) {
     703       184692 :         const int y1 = map[1][y];
     704              : 
     705              :         // this check is completely irrelevant when running without MPI.
     706       369384 :         if (!is_point_in_interval(y1, ywindow))
     707            0 :           continue;
     708              : 
     709       184692 :         compute_interval(map[1], handler->grid.full_size[1],
     710              :                          handler->grid.size[1], handler->cube.size[1], y1, &y,
     711              :                          lower_corner + 1, upper_corner + 1, ywindow);
     712              : 
     713       184692 :         if (upper_corner[1] - lower_corner[1]) {
     714       620884 :           for (int x = 0; x < handler->cube.size[2]; x++) {
     715       436192 :             const int x1 = map[2][x];
     716              : 
     717       872384 :             if (!is_point_in_interval(x1, xwindow))
     718            0 :               continue;
     719              : 
     720       436192 :             compute_interval(map[2], handler->grid.full_size[2],
     721              :                              handler->grid.size[2], handler->cube.size[2], x1,
     722              :                              &x, lower_corner + 2, upper_corner + 2, xwindow);
     723              : 
     724       436192 :             if (upper_corner[2] - lower_corner[2]) {
     725       436192 :               const int position1[3] = {z, y, x};
     726              :               /* the function will internally take care of the local vx global
     727              :                * grid */
     728       436192 :               add_sub_grid(
     729              :                   lower_corner, // lower corner of the portion of cube (in the
     730              :                   // full grid)
     731              :                   upper_corner, // upper corner of the portion of cube (in the
     732              :                   // full grid)
     733              :                   position1,       // starting position subblock inside the cube
     734       436192 :                   &handler->cube,  // the cube to extract data from
     735              :                   &handler->grid); // the grid to add data from
     736       436192 :               if (handler->grid.size[2] == handler->grid.full_size[2])
     737       436192 :                 update_loop_index(handler->grid.full_size[2], x1, &x);
     738              :               else
     739            0 :                 x += upper_corner[2] - lower_corner[2] - 1;
     740              :             }
     741              :           }
     742       184692 :           if (handler->grid.size[1] == handler->grid.full_size[1])
     743       184692 :             update_loop_index(handler->grid.full_size[1], y1, &y);
     744              :           else
     745            0 :             y += upper_corner[1] - lower_corner[1] - 1;
     746              :         }
     747              :       }
     748        88452 :       if (handler->grid.size[0] == handler->grid.full_size[0])
     749        88452 :         update_loop_index(handler->grid.full_size[0], z1, &z);
     750              :       else
     751            0 :         z += upper_corner[0] - lower_corner[0] - 1;
     752              :     }
     753              :   }
     754        45945 : }
     755              : 
     756              : // *****************************************************************************
     757        45945 : void grid_collocate(collocation_integration *const handler,
     758              :                     const bool use_ortho, const double zetp, const double rp[3],
     759              :                     const double radius) {
     760              :   // *** position of the gaussian product
     761              :   //
     762              :   // this is the actual definition of the position on the grid
     763              :   // i.e. a point rp(:) gets here grid coordinates
     764              :   // MODULO(rp(:)/dr(:),npts(:))+1
     765              :   // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid ((1,1,1) on
     766              :   // grid)
     767              : 
     768              :   // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
     769        45945 :   int cubecenter[3];
     770        45945 :   int cube_size[3];
     771        45945 :   int lb_cube[3], ub_cube[3];
     772        45945 :   int pol_offset[3] = {0, 0, 0};
     773        45945 :   double roffset[3];
     774        45945 :   double disr_radius;
     775              :   /* cube : grid containing pointlike product between polynomials
     776              :    *
     777              :    * pol : grid  containing the polynomials in all three directions
     778              :    *
     779              :    * pol_folded : grid containing the polynomials after folding for periodic
     780              :    * boundaries conditions
     781              :    */
     782              : 
     783              :   /* seting up the cube parameters */
     784        91890 :   int cmax = compute_cube_properties(
     785        45945 :       use_ortho, radius, (const double(*)[3])handler->dh,
     786        45945 :       (const double(*)[3])handler->dh_inv, rp, &disr_radius, roffset,
     787              :       cubecenter, lb_cube, ub_cube, cube_size);
     788              : 
     789              :   /* initialize the multidimensional array containing the polynomials */
     790        45945 :   initialize_tensor_3(&handler->pol, 3, handler->coef.size[0], cmax);
     791        45945 :   realloc_tensor(&handler->pol);
     792        45945 :   memset(handler->pol.data, 0, sizeof(double) * handler->pol.alloc_size_);
     793              : 
     794              :   /* compute the polynomials */
     795              : 
     796              :   // WARNING : do not reverse the order in pol otherwise you will have to
     797              :   // reverse the order in collocate_dgemm as well.
     798              : 
     799        45945 :   if (use_ortho) {
     800        21114 :     grid_fill_pol_dgemm(false, handler->dh[0][0], roffset[2], pol_offset[2],
     801        21114 :                         lb_cube[2], ub_cube[2], handler->coef.size[2] - 1, cmax,
     802        21114 :                         zetp, &idx3(handler->pol, 2, 0, 0)); /* i indice */
     803        21114 :     grid_fill_pol_dgemm(false, handler->dh[1][1], roffset[1], pol_offset[1],
     804        21114 :                         lb_cube[1], ub_cube[1], handler->coef.size[1] - 1, cmax,
     805        21114 :                         zetp, &idx3(handler->pol, 1, 0, 0)); /* j indice */
     806        21114 :     grid_fill_pol_dgemm(false, handler->dh[2][2], roffset[0], pol_offset[0],
     807        21114 :                         lb_cube[0], ub_cube[0], handler->coef.size[0] - 1, cmax,
     808              :                         zetp, &idx3(handler->pol, 0, 0, 0)); /* k indice */
     809              :   } else {
     810        24831 :     grid_fill_pol_dgemm(false, 1.0, roffset[0], pol_offset[2], lb_cube[0],
     811        24831 :                         ub_cube[0], handler->coef.size[0] - 1, cmax,
     812        24831 :                         zetp * handler->dx[0],
     813              :                         &idx3(handler->pol, 0, 0, 0)); /* k indice */
     814        24831 :     grid_fill_pol_dgemm(false, 1.0, roffset[1], pol_offset[1], lb_cube[1],
     815        24831 :                         ub_cube[1], handler->coef.size[1] - 1, cmax,
     816        24831 :                         zetp * handler->dx[1],
     817        24831 :                         &idx3(handler->pol, 1, 0, 0)); /* j indice */
     818        24831 :     grid_fill_pol_dgemm(false, 1.0, roffset[2], pol_offset[0], lb_cube[2],
     819        24831 :                         ub_cube[2], handler->coef.size[2] - 1, cmax,
     820        24831 :                         zetp * handler->dx[2],
     821        24831 :                         &idx3(handler->pol, 2, 0, 0)); /* i indice */
     822              : 
     823        24831 :     calculate_non_orthorombic_corrections_tensor(
     824              :         zetp, roffset, (const double(*)[3])handler->dh, lb_cube, ub_cube,
     825        24831 :         handler->orthogonal, &handler->Exp);
     826              : 
     827              :     /* Use a slightly modified version of Ole code */
     828        24831 :     grid_transform_coef_xzy_to_ikj((const double(*)[3])handler->dh,
     829        24831 :                                    &handler->coef);
     830              :   }
     831              : 
     832              :   /* allocate memory for the polynomial and the cube */
     833              : 
     834        45945 :   initialize_tensor_3(&handler->cube, cube_size[0], cube_size[1], cube_size[2]);
     835              : 
     836        45945 :   realloc_tensor(&handler->cube);
     837        45945 :   initialize_W_and_T(handler, &handler->cube, &handler->coef);
     838              : 
     839        45945 :   tensor_reduction_for_collocate_integrate(
     840        45945 :       handler->scratch, // pointer to scratch memory
     841        45945 :       1.0, handler->orthogonal, &handler->Exp, &handler->coef, &handler->pol,
     842              :       &handler->cube);
     843              : 
     844        45945 :   if (handler->apply_cutoff) {
     845            0 :     if (use_ortho) {
     846            0 :       apply_sphere_cutoff_ortho(handler, disr_radius, cmax, lb_cube,
     847              :                                 cubecenter);
     848              :     } else {
     849            0 :       apply_spherical_cutoff_generic(handler, radius, cmax, lb_cube, ub_cube,
     850              :                                      roffset, cubecenter);
     851              :     }
     852            0 :     return;
     853              :   }
     854        45945 :   apply_mapping_cubic(handler, cmax, lb_cube, cubecenter);
     855              : }
     856              : 
     857              : //******************************************************************************
     858            0 : void grid_dgemm_collocate_pgf_product(
     859              :     const bool use_ortho, const int border_mask, const enum grid_func func,
     860              :     const int la_max, const int la_min, const int lb_max, const int lb_min,
     861              :     const double zeta, const double zetb, const double rscale,
     862              :     const double dh[3][3], const double dh_inv[3][3], const double ra[3],
     863              :     const double rab[3], const int grid_global_size[3],
     864              :     const int grid_local_size[3], const int shift_local[3],
     865              :     const int border_width[3], const double radius, const int o1, const int o2,
     866            0 :     const int n1, const int n2, const double pab_[n2][n1],
     867              :     double *const grid_) {
     868            0 :   collocation_integration *handler = collocate_create_handle();
     869              : 
     870            0 :   tensor pab;
     871            0 :   initialize_tensor_2(&pab, n2, n1);
     872            0 :   alloc_tensor(&pab);
     873            0 :   memcpy(pab.data, pab_, sizeof(double) * n1 * n2);
     874              :   // Uncomment this to dump all tasks to file.
     875              :   // #define __GRID_DUMP_TASKS
     876            0 :   int offset[2] = {o1, o2};
     877              : 
     878            0 :   int lmax[2] = {la_max, lb_max};
     879            0 :   int lmin[2] = {la_min, lb_min};
     880              : 
     881            0 :   const double zetp = zeta + zetb;
     882            0 :   const double f = zetb / zetp;
     883            0 :   const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
     884            0 :   const double prefactor = rscale * exp(-zeta * f * rab2);
     885            0 :   const double zeta_pair[2] = {zeta, zetb};
     886            0 :   initialize_basis_vectors(handler, dh, dh_inv);
     887            0 :   verify_orthogonality((const double(*)[3])dh, handler->orthogonal);
     888              : 
     889            0 :   initialize_tensor_3(&(handler->grid), grid_local_size[2], grid_local_size[1],
     890              :                       grid_local_size[0]);
     891              : 
     892            0 :   handler->grid.ld_ = grid_local_size[0];
     893            0 :   handler->grid.data = grid_;
     894              : 
     895            0 :   setup_global_grid_size(&handler->grid, (const int *)grid_global_size);
     896              : 
     897            0 :   initialize_tensor_3(&handler->grid, grid_local_size[2], grid_local_size[1],
     898              :                       grid_local_size[0]);
     899            0 :   handler->grid.ld_ = grid_local_size[0];
     900              : 
     901            0 :   setup_grid_window(&handler->grid, shift_local, border_width, border_mask);
     902              : 
     903            0 :   double rp[3], rb[3];
     904            0 :   for (int i = 0; i < 3; i++) {
     905            0 :     rp[i] = ra[i] + f * rab[i];
     906            0 :     rb[i] = ra[i] + rab[i];
     907              :   }
     908              : 
     909            0 :   int lmin_diff[2], lmax_diff[2];
     910            0 :   grid_prepare_get_ldiffs_dgemm(func, lmin_diff, lmax_diff);
     911              : 
     912            0 :   int lmin_prep[2];
     913            0 :   int lmax_prep[2];
     914              : 
     915            0 :   lmin_prep[0] = imax(lmin[0] + lmin_diff[0], 0);
     916            0 :   lmin_prep[1] = imax(lmin[1] + lmin_diff[1], 0);
     917              : 
     918            0 :   lmax_prep[0] = lmax[0] + lmax_diff[0];
     919            0 :   lmax_prep[1] = lmax[1] + lmax_diff[1];
     920              : 
     921            0 :   const int n1_prep = ncoset(lmax_prep[0]);
     922            0 :   const int n2_prep = ncoset(lmax_prep[1]);
     923              : 
     924              :   /* I really do not like this. This will disappear */
     925            0 :   tensor pab_prep;
     926            0 :   initialize_tensor_2(&pab_prep, n2_prep, n1_prep);
     927            0 :   alloc_tensor(&pab_prep);
     928            0 :   memset(pab_prep.data, 0, pab_prep.alloc_size_ * sizeof(double));
     929              : 
     930            0 :   grid_prepare_pab_dgemm(func, offset, lmax, lmin, &zeta_pair[0], &pab,
     931              :                          &pab_prep);
     932              : 
     933              :   //   *** initialise the coefficient matrix, we transform the sum
     934              :   //
     935              :   // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
     936              :   //         (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
     937              :   //         (z-a_z)**lza
     938              :   //
     939              :   // into
     940              :   //
     941              :   // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
     942              :   //
     943              :   // where p is center of the product gaussian, and lp = la_max + lb_max
     944              :   // (current implementation is l**7)
     945              :   //
     946              : 
     947              :   /* precautionary tail since I probably intitialize data to NULL when I
     948              :    * initialize a new tensor. I want to keep the memory (I put a ridiculous
     949              :    * amount already) */
     950              : 
     951            0 :   initialize_tensor_4(&(handler->alpha), 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
     952            0 :                       lmax_prep[0] + lmax_prep[1] + 1);
     953            0 :   realloc_tensor(&(handler->alpha));
     954              : 
     955            0 :   const int lp = lmax_prep[0] + lmax_prep[1];
     956              : 
     957            0 :   initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
     958            0 :   realloc_tensor(&(handler->coef));
     959              : 
     960              :   // initialy cp2k stores coef_xyz as coef[z][y][x]. this is fine but I
     961              :   // need them to be stored as
     962              : 
     963            0 :   grid_prepare_alpha_dgemm(ra, rb, rp, lmax_prep, &(handler->alpha));
     964              : 
     965              :   //
     966              :   //   compute P_{lxp,lyp,lzp} given P_{lxa,lya,lza,lxb,lyb,lzb} and
     967              :   //   alpha(ls,lxa,lxb,1) use a three step procedure we don't store zeros, so
     968              :   //   counting is done using lxyz,lxy in order to have contiguous memory access
     969              :   //   in collocate_fast.F
     970              :   //
     971              : 
     972              :   // coef[x][z][y]
     973            0 :   grid_compute_coefficients_dgemm(lmin_prep, lmax_prep, lp, prefactor,
     974              :                                   &(handler->alpha), &pab_prep,
     975              :                                   &(handler->coef));
     976              : 
     977            0 :   grid_collocate(handler, use_ortho, zetp, rp, radius);
     978              : 
     979            0 :   collocate_destroy_handle(handler);
     980            0 :   free(pab_prep.data);
     981            0 : }
     982              : 
     983         5100 : void extract_blocks(grid_context *const ctx, const _task *const task,
     984              :                     const offload_buffer *pab_blocks, tensor *const work,
     985              :                     tensor *const pab) {
     986         5100 :   const int iatom = task->iatom;
     987         5100 :   const int jatom = task->jatom;
     988         5100 :   const int iset = task->iset;
     989         5100 :   const int jset = task->jset;
     990         5100 :   const int ikind = ctx->atom_kinds[iatom];
     991         5100 :   const int jkind = ctx->atom_kinds[jatom];
     992         5100 :   const grid_basis_set *ibasis = ctx->basis_sets[ikind];
     993         5100 :   const grid_basis_set *jbasis = ctx->basis_sets[jkind];
     994              : 
     995         5100 :   const int block_num = task->block_num;
     996              : 
     997              :   // Locate current matrix block within the buffer. This block
     998              :   // contains the weights of the gaussian pairs in the spherical
     999              :   // harmonic basis, but we do computation in the cartesian
    1000              :   // harmonic basis so we have to rotate the coefficients. It is nothing
    1001              :   // else than a basis change and it done with two dgemm.
    1002              : 
    1003         5100 :   const int block_offset = ctx->block_offsets[block_num]; // zero based
    1004         5100 :   double *const block = &pab_blocks->host_buffer[block_offset];
    1005              : 
    1006         5100 :   rotate_to_cartesian_harmonics(ibasis, jbasis, iatom, jatom, iset, jset, block,
    1007              :                                 work, pab);
    1008         5100 : }
    1009              : 
    1010        45945 : void compute_coefficients(grid_context *const ctx,
    1011              :                           struct collocation_integration_ *handler,
    1012              :                           const _task *const previous_task,
    1013              :                           const _task *const task,
    1014              :                           const offload_buffer *pab_blocks, tensor *const pab,
    1015              :                           tensor *const work, tensor *const pab_prep) {
    1016              :   // Load subblock from buffer and decontract into Cartesian sublock pab.
    1017              :   // The previous pab can be reused when only ipgf or jpgf has changed.
    1018        45945 :   if (task->update_block_ || (previous_task == NULL)) {
    1019         4542 :     extract_blocks(ctx, task, pab_blocks, work, pab);
    1020              :   }
    1021              : 
    1022        45945 :   int lmin_prep[2];
    1023        45945 :   int lmax_prep[2];
    1024              : 
    1025        45945 :   lmin_prep[0] = imax(task->lmin[0] + handler->lmin_diff[0], 0);
    1026        45945 :   lmin_prep[1] = imax(task->lmin[1] + handler->lmin_diff[1], 0);
    1027              : 
    1028        45945 :   lmax_prep[0] = task->lmax[0] + handler->lmax_diff[0];
    1029        45945 :   lmax_prep[1] = task->lmax[1] + handler->lmax_diff[1];
    1030              : 
    1031        45945 :   const int n1_prep = ncoset(lmax_prep[0]);
    1032        45945 :   const int n2_prep = ncoset(lmax_prep[1]);
    1033              : 
    1034              :   /* we do not reallocate memory. We initialized the structure with the
    1035              :    * maximum lmax of the all list already.
    1036              :    */
    1037        45945 :   initialize_tensor_2(pab_prep, n2_prep, n1_prep);
    1038        45945 :   realloc_tensor(pab_prep);
    1039              : 
    1040        45945 :   grid_prepare_pab_dgemm(handler->func, task->offset, task->lmin, task->lmax,
    1041              :                          &task->zeta[0], pab, pab_prep);
    1042              : 
    1043              :   //   *** initialise the coefficient matrix, we transform the sum
    1044              :   //
    1045              :   // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
    1046              :   //         (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb
    1047              :   //         (y-a_y)**lya (z-a_z)**lza
    1048              :   //
    1049              :   // into
    1050              :   //
    1051              :   // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp
    1052              :   // (z-p_z)**lzp
    1053              :   //
    1054              :   // where p is center of the product gaussian, and lp = la_max + lb_max
    1055              :   // (current implementation is l**7)
    1056              :   //
    1057              : 
    1058              :   /* precautionary tail since I probably intitialize data to NULL when I
    1059              :    * initialize a new tensor. I want to keep the memory (I put a ridiculous
    1060              :    * amount already) */
    1061              : 
    1062        45945 :   initialize_tensor_4(&handler->alpha, 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
    1063        45945 :                       lmax_prep[0] + lmax_prep[1] + 1);
    1064        45945 :   realloc_tensor(&handler->alpha);
    1065              : 
    1066        45945 :   const int lp = lmax_prep[0] + lmax_prep[1];
    1067              : 
    1068        45945 :   initialize_tensor_3(&handler->coef, lp + 1, lp + 1, lp + 1);
    1069        45945 :   realloc_tensor(&handler->coef);
    1070              : 
    1071              :   // these two functions can be done with dgemm again....
    1072              : 
    1073        45945 :   grid_prepare_alpha_dgemm(task->ra, task->rb, task->rp, lmax_prep,
    1074              :                            &handler->alpha);
    1075              : 
    1076              :   // compute the coefficients after applying the function of interest
    1077              :   // coef[x][z][y]
    1078        45945 :   grid_compute_coefficients_dgemm(
    1079              :       lmin_prep, lmax_prep, lp,
    1080        45945 :       task->prefactor * ((task->iatom == task->jatom) ? 1.0 : 2.0),
    1081              :       &handler->alpha, pab_prep, &handler->coef);
    1082        45945 : }
    1083              : 
    1084          648 : void collocate_one_grid_level_dgemm(grid_context *const ctx,
    1085              :                                     const int *const border_width,
    1086              :                                     const int *const shift_local,
    1087              :                                     const enum grid_func func, const int level,
    1088              :                                     const offload_buffer *pab_blocks) {
    1089          648 :   tensor *const grid = &ctx->grid[level];
    1090              :   // Using default(shared) because with GCC 9 the behavior around const changed:
    1091              :   // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
    1092          648 : #pragma omp parallel default(shared)
    1093              :   {
    1094              :     const int num_threads = omp_get_num_threads();
    1095              :     const int thread_id = omp_get_thread_num();
    1096              : 
    1097              :     tensor work, pab, pab_prep;
    1098              : 
    1099              :     struct collocation_integration_ *handler = ctx->handler[thread_id];
    1100              : 
    1101              :     handler->func = func;
    1102              :     grid_prepare_get_ldiffs_dgemm(handler->func, handler->lmin_diff,
    1103              :                                   handler->lmax_diff);
    1104              : 
    1105              :     handler->apply_cutoff = ctx->apply_cutoff;
    1106              : 
    1107              :     // Allocate pab matrix for re-use across tasks.
    1108              :     initialize_tensor_2(&pab, ctx->maxco, ctx->maxco);
    1109              :     alloc_tensor(&pab);
    1110              : 
    1111              :     initialize_tensor_2(&work, ctx->maxco, ctx->maxco);
    1112              :     alloc_tensor(&work);
    1113              : 
    1114              :     initialize_tensor_2(&pab_prep, ctx->maxco, ctx->maxco);
    1115              :     alloc_tensor(&pab_prep);
    1116              : 
    1117              :     initialize_basis_vectors(handler, (const double(*)[3])grid->dh,
    1118              :                              (const double(*)[3])grid->dh_inv);
    1119              : 
    1120              :     /* setup the grid parameters, window parameters (if the grid is split), etc
    1121              :      */
    1122              : 
    1123              :     tensor_copy(&handler->grid, grid);
    1124              : 
    1125              :     for (int d = 0; d < 3; d++)
    1126              :       handler->orthogonal[d] = handler->grid.orthogonal[d];
    1127              : 
    1128              :     if ((thread_id == 0) || (num_threads == 1)) {
    1129              :       // thread id 0 directly store the results in the final storage space
    1130              :       handler->grid.data = ctx->grid[level].data;
    1131              :     }
    1132              : 
    1133              :     if ((num_threads > 1) && (thread_id > 0)) {
    1134              :       handler->grid.data = ((double *)ctx->scratch) +
    1135              :                            (thread_id - 1) * handler->grid.alloc_size_;
    1136              :       memset(handler->grid.data, 0, sizeof(double) * grid->alloc_size_);
    1137              :     }
    1138              : 
    1139              :     /* it is only useful when we split the list over multiple threads. The first
    1140              :      * iteration should load the block whatever status the task->block_update_
    1141              :      * has */
    1142              :     const _task *prev_task = NULL;
    1143              : #pragma omp for schedule(static)
    1144              :     for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
    1145              :       // Define some convenient aliases.
    1146              :       const _task *task = &ctx->tasks[level][itask];
    1147              : 
    1148              :       if (task->level != level) {
    1149              :         printf("level %d, %d\n", task->level, level);
    1150              :         abort();
    1151              :       }
    1152              :       /* the grid is divided over several ranks or not periodic */
    1153              :       if ((handler->grid.size[0] != handler->grid.full_size[0]) ||
    1154              :           (handler->grid.size[1] != handler->grid.full_size[1]) ||
    1155              :           (handler->grid.size[2] != handler->grid.full_size[2])) {
    1156              :         /* unfortunately the window where the gaussian should be added depends
    1157              :          * on the bonds. So I have to adjust the window all the time. */
    1158              : 
    1159              :         setup_grid_window(&handler->grid, shift_local, border_width,
    1160              :                           task->border_mask);
    1161              :       }
    1162              : 
    1163              :       /* this is a three steps procedure. pab_blocks contains the coefficients
    1164              :        * of the operator in the spherical harmonic basis while we do computation
    1165              :        * in the cartesian harmonic basis.
    1166              :        *
    1167              :        * step 1 : rotate the coefficients from the harmonic to the cartesian
    1168              :        * basis
    1169              : 
    1170              :        * step 2 : extract the subblock and apply additional transformations
    1171              :        * corresponding the spatial derivatives of the operator (the last is not
    1172              :        * always done)
    1173              : 
    1174              :        * step 3 : change from (x - x_1)^\alpha (x - x_2)^\beta to (x -
    1175              :        * x_{12})^k. It is a transformation which involves binomial
    1176              :        * coefficients.
    1177              :        *
    1178              :        * \f[ (x - x_1) ^\alpha (x - x_2) ^ beta = \sum_{k_{1} k_{2}} ^
    1179              :        *     {\alpha\beta} \text{Binomial}(\alpha,k_1)
    1180              :        *     \text{Binomial}(\beta,k_2) (x - x_{12})^{k_1 + k_2} (x_12 - x_1)
    1181              :        *     ^{\alpha - k_1} (x_12 - x_2) ^{\beta - k_2} ]
    1182              :        *
    1183              :        * step 1 is done only when necessary, the two remaining steps are done
    1184              :        * for each bond.
    1185              :        */
    1186              : 
    1187              :       compute_coefficients(ctx, handler, prev_task, task, pab_blocks, &pab,
    1188              :                            &work, &pab_prep);
    1189              : 
    1190              :       grid_collocate(handler, ctx->orthorhombic, task->zetp, task->rp,
    1191              :                      task->radius);
    1192              :       prev_task = task;
    1193              :     }
    1194              : 
    1195              :     // Merge thread local grids into shared grid. Could be improved though....
    1196              : 
    1197              :     // thread 0 does nothing since the data are already placed in the final
    1198              :     // destination
    1199              :     if (num_threads > 1) {
    1200              :       if ((grid->alloc_size_ / (num_threads - 1)) >= 2) {
    1201              :         const int block_size = grid->alloc_size_ / (num_threads - 1) +
    1202              :                                (grid->alloc_size_ % (num_threads - 1));
    1203              : 
    1204              :         for (int bk = 0; bk < num_threads; bk++) {
    1205              :           if (thread_id > 0) {
    1206              :             int bk_id = (bk + thread_id - 1) % num_threads;
    1207              :             int begin = bk_id * block_size;
    1208              :             int end = imin((bk_id + 1) * block_size, grid->alloc_size_);
    1209              :             cblas_daxpy(end - begin, 1.0, handler->grid.data + begin, 1,
    1210              :                         grid->data + begin, 1);
    1211              :           }
    1212              : #pragma omp barrier
    1213              :         }
    1214              :       }
    1215              :     } else {
    1216              : #pragma omp critical
    1217              :       if (thread_id > 0)
    1218              :         cblas_daxpy(handler->grid.alloc_size_, 1.0,
    1219              :                     &idx3(handler->grid, 0, 0, 0), 1, &idx3(grid[0], 0, 0, 0),
    1220              :                     1);
    1221              :     }
    1222              :     handler->grid.data = NULL;
    1223              :     free(pab.data);
    1224              :     free(pab_prep.data);
    1225              :     free(work.data);
    1226              :   }
    1227          648 : }
    1228              : 
    1229              : /*******************************************************************************
    1230              :  * \brief Collocate all tasks of a given list onto given grids.
    1231              :  *        See grid_task_list.h for details.
    1232              :  ******************************************************************************/
    1233          162 : void grid_dgemm_collocate_task_list(grid_dgemm_task_list *const ptr,
    1234              :                                     const enum grid_func func,
    1235              :                                     const int nlevels,
    1236              :                                     const offload_buffer *pab_blocks,
    1237              :                                     offload_buffer *grids[nlevels]) {
    1238              : 
    1239          162 :   grid_context *const ctx = (grid_context *)ptr;
    1240              : 
    1241          162 :   assert(ctx->checksum == ctx_checksum);
    1242              : 
    1243          162 :   const int max_threads = omp_get_max_threads();
    1244              : 
    1245          162 :   assert(ctx->nlevels == nlevels);
    1246              : 
    1247              :   // #pragma omp parallel for
    1248          810 :   for (int level = 0; level < ctx->nlevels; level++) {
    1249          648 :     const _layout *layout = &ctx->layouts[level];
    1250          648 :     set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
    1251          648 :                         layout->npts_global, layout->npts_local,
    1252          648 :                         layout->shift_local, layout->border_width, layout->dh,
    1253          648 :                         layout->dh_inv, grids[level]);
    1254          648 :     memset(ctx->grid[level].data, 0,
    1255          648 :            sizeof(double) * ctx->grid[level].alloc_size_);
    1256              :   }
    1257              : 
    1258          162 :   if (ctx->scratch == NULL) {
    1259          162 :     int max_size = ctx->grid[0].alloc_size_;
    1260              : 
    1261              :     /* compute the size of the largest grid. It is used afterwards to allocate
    1262              :      * scratch memory for the grid on each omp thread */
    1263          648 :     for (int x = 1; x < nlevels; x++) {
    1264          486 :       max_size = imax(ctx->grid[x].alloc_size_, max_size);
    1265              :     }
    1266              : 
    1267          162 :     max_size = ((max_size / 4096) + (max_size % 4096 != 0)) * 4096;
    1268              : 
    1269              :     /* scratch is a void pointer !!!!! */
    1270          162 :     ctx->scratch =
    1271          162 :         grid_allocate_scratch(max_size * max_threads * sizeof(double));
    1272              :   }
    1273              : 
    1274          810 :   for (int level = 0; level < ctx->nlevels; level++) {
    1275          648 :     const _layout *layout = &ctx->layouts[level];
    1276          648 :     collocate_one_grid_level_dgemm(ctx, layout->border_width,
    1277          648 :                                    layout->shift_local, func, level,
    1278              :                                    pab_blocks);
    1279              :   }
    1280              : 
    1281          162 :   grid_free_scratch(ctx->scratch);
    1282          162 :   ctx->scratch = NULL;
    1283          162 : }
        

Generated by: LCOV version 2.0-1