LCOV - code coverage report
Current view: top level - src/grid/dgemm - grid_dgemm_collocate.c (source / functions)
Test: CP2K Regtests (git:c5411e0)    Date: 2024-05-16 06:48:40
Coverage:    Lines: 379 / 586 (64.7 %)    Functions: 10 / 13 (76.9 %)

          Line data    Source code
       1             : /*----------------------------------------------------------------------------*/
       2             : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3             : /*  Copyright 2000-2024 CP2K developers group <https://cp2k.org>              */
       4             : /*                                                                            */
       5             : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6             : /*----------------------------------------------------------------------------*/
       7             : 
       8             : #include <assert.h>
       9             : #include <limits.h>
      10             : #include <math.h>
      11             : #include <omp.h>
      12             : #include <stdbool.h>
      13             : #include <stdio.h>
      14             : #include <stdlib.h>
      15             : #include <string.h>
      16             : 
      17             : #ifdef __LIBXSMM
      18             : #include <libxsmm.h>
      19             : #endif
      20             : 
      21             : #include "../common/grid_basis_set.h"
      22             : #include "../common/grid_common.h"
      23             : #include "../common/grid_constants.h"
      24             : #include "grid_dgemm_coefficients.h"
      25             : #include "grid_dgemm_collocate.h"
      26             : #include "grid_dgemm_collocation_integration.h"
      27             : #include "grid_dgemm_non_orthorombic_corrections.h"
      28             : #include "grid_dgemm_prepare_pab.h"
      29             : #include "grid_dgemm_private_header.h"
      30             : #include "grid_dgemm_task_list.h"
      31             : #include "grid_dgemm_tensor_local.h"
      32             : 
      33             : void collocate_l0(double *scratch, const double alpha, const bool orthogonal,
      34             :                   const struct tensor_ *exp_xy,
      35             :                   const struct tensor_ *p_alpha_beta_reduced_,
      36             :                   struct tensor_ *cube);
      37             : 
      38        5040 : void rotate_to_cartesian_harmonics(const grid_basis_set *ibasis,
      39             :                                    const grid_basis_set *jbasis,
      40             :                                    const int iatom, const int jatom,
      41             :                                    const int iset, const int jset,
      42             :                                    double *const block, tensor *work,
      43             :                                    tensor *pab) {
      44        5040 :   dgemm_params m1, m2;
      45        5040 :   memset(&m1, 0, sizeof(dgemm_params));
      46        5040 :   memset(&m2, 0, sizeof(dgemm_params));
      47             : 
      48             :   // Define some more convenient aliases.
      49        5040 :   const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
      50        5040 :   const int nsgf_setb = jbasis->nsgf_set[jset];
      51        5040 :   const int nsgfa = ibasis->nsgf; // size of entire spherical basis
      52        5040 :   const int nsgfb = jbasis->nsgf;
      53        5040 :   const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
      54        5040 :   const int sgfb = jbasis->first_sgf[jset] - 1;
      55        5040 :   const int maxcoa = ibasis->maxco;
      56        5040 :   const int maxcob = jbasis->maxco;
      57        5040 :   const int ncoseta = ncoset(ibasis->lmax[iset]);
      58        5040 :   const int ncosetb = ncoset(jbasis->lmax[jset]);
       59        5040 :   const int ncoa = ibasis->npgf[iset] * ncoseta; // size of cartesian set
      60        5040 :   const int ncob = jbasis->npgf[jset] * ncosetb;
      61             : 
      62        5040 :   initialize_tensor_2(work, nsgf_setb, ncoa);
      63        5040 :   realloc_tensor(work);
      64             : 
      65        5040 :   initialize_tensor_2(pab, ncob, ncoa);
      66        5040 :   realloc_tensor(pab);
      67             : 
      68             :   // the rotations happen here.
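  // [Editor's note] Sketch of what the two dgemm calls below compute (all
  // matrices in row-major layout): work = block_ba * sphi_a followed by
  // pab = sphi_b^T * work, i.e. the (jset, iset) sub-block of the density
  // matrix is rotated from the spherical-harmonic basis to the cartesian
  // Gaussian basis. When iatom > jatom the sub-block is stored transposed,
  // hence the 'T' in the else branch.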
      69        5040 :   if (iatom <= jatom) {
      70        3352 :     m1.op1 = 'N';
      71        3352 :     m1.op2 = 'N';
      72        3352 :     m1.m = work->size[0];
      73        3352 :     m1.n = work->size[1];
      74        3352 :     m1.k = nsgf_seta;
      75        3352 :     m1.alpha = 1.0;
      76        3352 :     m1.beta = 0.0;
      77        3352 :     m1.a = block + sgfb * nsgfa + sgfa;
      78        3352 :     m1.lda = nsgfa;
      79        3352 :     m1.b = &ibasis->sphi[sgfa * maxcoa];
      80        3352 :     m1.ldb = maxcoa;
      81        3352 :     m1.c = work->data;
      82        3352 :     m1.ldc = work->ld_;
      83             :   } else {
      84        1688 :     m1.op1 = 'T';
      85        1688 :     m1.op2 = 'N';
      86        1688 :     m1.m = work->size[0];
      87        1688 :     m1.n = work->size[1];
      88        1688 :     m1.k = nsgf_seta;
      89        1688 :     m1.alpha = 1.0;
      90        1688 :     m1.beta = 0.0;
      91        1688 :     m1.a = block + sgfa * nsgfb + sgfb;
      92        1688 :     m1.lda = nsgfb;
      93        1688 :     m1.b = &ibasis->sphi[sgfa * maxcoa];
      94        1688 :     m1.ldb = maxcoa;
      95        1688 :     m1.c = work->data;
      96        1688 :     m1.ldc = work->ld_;
      97             :   }
      98        5040 :   m1.use_libxsmm = true;
      99        5040 :   dgemm_simplified(&m1);
     100             : 
     101        5040 :   m2.op1 = 'T';
     102        5040 :   m2.op2 = 'N';
     103        5040 :   m2.m = pab->size[0];
     104        5040 :   m2.n = pab->size[1];
     105        5040 :   m2.k = work->size[0];
     106        5040 :   m2.alpha = 1.0;
     107        5040 :   m2.beta = 0.0;
     108        5040 :   m2.a = &jbasis->sphi[sgfb * maxcob];
     109        5040 :   m2.lda = maxcob;
     110        5040 :   m2.b = work->data;
     111        5040 :   m2.ldb = work->ld_;
     112        5040 :   m2.c = pab->data;
     113        5040 :   m2.ldc = pab->ld_;
     114        5040 :   m2.use_libxsmm = true;
     115        5040 :   dgemm_simplified(&m2);
     116        5040 : }
     117             : 
     118             : void tensor_reduction_for_collocate_integrate(
     119             :     double *scratch, const double alpha, const bool *const orthogonal,
     120             :     const struct tensor_ *Exp, const struct tensor_ *co,
     121             :     const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube);
     122             : 
      123             : /* compute the functions (x - x_i)^l exp(-eta (x - x_i)^2) for l = 0..lp using
      124             :  * a recursive relation to avoid computing the exponential on each grid point.
      125             :  * This is probably no longer essential, since the exponential is *not* the
      126             :  * dominating contribution to the cost of collocate and integrate. */
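/* [Editor's illustration, not part of the CP2K source.] A minimal sketch of the
 * recursion used by grid_fill_pol_dgemm below, checked against direct
 * evaluation. It relies only on
 *   exp(-a*(x+d)^2)   = exp(-a*x^2)   * exp(-2*a*x*d) * exp(-a*d^2)
 *   exp(-2*a*(x+d)*d) = exp(-2*a*x*d) * exp(-2*a*d^2)
 * The function name and its arguments are hypothetical. */
static void demo_exp_recursion(const double zetp, const double dr, const int n) {
  const double t_exp_1 = exp(-zetp * dr * dr); /* exp(-a*d^2)   */
  const double t_exp_2 = t_exp_1 * t_exp_1;    /* exp(-2*a*d^2) */
  double e = 1.0;     /* exp(-a*x^2)   at x = 0 */
  double cross = 1.0; /* exp(-2*a*x*d) at x = 0 */
  for (int ig = 1; ig <= n; ig++) {
    e *= cross * t_exp_1; /* now e == exp(-zetp * (ig * dr)^2) */
    cross *= t_exp_2;
    assert(fabs(e - exp(-zetp * (ig * dr) * (ig * dr))) < 1e-12 * (e + 1.0));
  }
}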
     127             : 
     128      264000 : void grid_fill_pol_dgemm(const bool transpose, const double dr,
     129             :                          const double roffset, const int pol_offset,
     130             :                          const int xmin, const int xmax, const int lp,
     131             :                          const int cmax, const double zetp, double *pol_) {
     132      264000 :   tensor pol;
     133      264000 :   const double t_exp_1 = exp(-zetp * dr * dr);
     134      264000 :   const double t_exp_2 = t_exp_1 * t_exp_1;
     135             : 
     136      264000 :   double t_exp_min_1 = exp(-zetp * (dr - roffset) * (dr - roffset));
     137      264000 :   double t_exp_min_2 = exp(2.0 * zetp * (dr - roffset) * dr);
     138             : 
     139      264000 :   double t_exp_plus_1 = exp(-zetp * roffset * roffset);
     140      264000 :   double t_exp_plus_2 = exp(2.0 * zetp * roffset * dr);
     141             : 
     142      264000 :   if (transpose) {
     143      102993 :     initialize_tensor_2(&pol, cmax, lp + 1);
     144      102993 :     pol.data = pol_;
      145             :     /* This is the original Ole code. The polynomials need to be transposed
      146             :      * for the integration routine, which this branch already takes care of. */
     147     1397272 :     for (int ig = 0; ig >= xmin; ig--) {
     148     1294279 :       const double rpg = ig * dr - roffset;
     149     1294279 :       t_exp_min_1 *= t_exp_min_2 * t_exp_1;
     150     1294279 :       t_exp_min_2 *= t_exp_2;
     151     1294279 :       double pg = t_exp_min_1;
     152     5304293 :       for (int icoef = 0; icoef <= lp; icoef++) {
     153     4010014 :         idx2(pol, pol_offset + ig - xmin, icoef) = pg;
     154     4010014 :         pg *= rpg;
     155             :       }
     156             :     }
     157             : 
     158      102993 :     double t_exp_plus_1 = exp(-zetp * roffset * roffset);
     159      102993 :     double t_exp_plus_2 = exp(2 * zetp * roffset * dr);
     160     1343614 :     for (int ig = 1; ig <= xmax; ig++) {
     161     1240621 :       const double rpg = ig * dr - roffset;
     162     1240621 :       t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
     163     1240621 :       t_exp_plus_2 *= t_exp_2;
     164     1240621 :       double pg = t_exp_plus_1;
     165     5085185 :       for (int icoef = 0; icoef <= lp; icoef++) {
     166     3844564 :         idx2(pol, pol_offset + ig - xmin, icoef) = pg;
     167     3844564 :         pg *= rpg;
     168             :       }
     169             :     }
     170             : 
     171             :   } else {
     172      161007 :     initialize_tensor_2(&pol, lp + 1, cmax);
     173      161007 :     pol.data = pol_;
     174             :     /* memset(pol.data, 0, sizeof(double) * pol.alloc_size_); */
     175             :     /*
     176             :      *   compute the values of all (x-xp)**lp*exp(..)
     177             :      *
      178             :      *  uses the usual trick to avoid computing too many exponentials
      179             :      *  (reuse the result from the previous grid point):
      180             :      *    exp(-a*(x+d)**2) = exp(-a*x**2) * exp(-2*a*x*d) * exp(-a*d**2)
      181             :      *    exp(-2*a*(x+d)*d) = exp(-2*a*x*d) * exp(-2*a*d**2)
     182             :      */
     183             : 
     184             :     /* compute the exponential recursively and store the polynomial prefactors
     185             :      * as well */
     186     2162340 :     for (int ig = 0; ig >= xmin; ig--) {
     187     2001333 :       const double rpg = ig * dr - roffset;
     188     2001333 :       t_exp_min_1 *= t_exp_min_2 * t_exp_1;
     189     2001333 :       t_exp_min_2 *= t_exp_2;
     190     2001333 :       double pg = t_exp_min_1;
     191     2001333 :       idx2(pol, 0, pol_offset + ig - xmin) = pg;
     192     2001333 :       if (lp > 0)
     193     1273689 :         idx2(pol, 1, pol_offset + ig - xmin) = rpg;
     194             :     }
     195             : 
     196     2078682 :     for (int ig = 1; ig <= xmax; ig++) {
     197     1917675 :       const double rpg = ig * dr - roffset;
     198     1917675 :       t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
     199     1917675 :       t_exp_plus_2 *= t_exp_2;
     200     1917675 :       double pg = t_exp_plus_1;
     201     1917675 :       idx2(pol, 0, pol_offset + ig - xmin) = pg;
     202     1917675 :       if (lp > 0)
     203     1219605 :         idx2(pol, 1, pol_offset + ig - xmin) = rpg;
     204             :     }
     205             : 
     206             :     /* compute the remaining powers using previously computed stuff */
     207      161007 :     if (lp >= 2) {
     208       56541 :       double *__restrict__ poly = &idx2(pol, 1, 0);
     209       56541 :       double *__restrict__ src1 = &idx2(pol, 0, 0);
     210       56541 :       double *__restrict__ dst = &idx2(pol, 2, 0);
     211             :       // #pragma omp simd linear(dst, src1, poly) simdlen(8)
     212      230814 :       GRID_PRAGMA_SIMD((dst, src1, poly), 8)
     213       56541 :       for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++)
     214     1405620 :         dst[ig] = src1[ig] * poly[ig] * poly[ig];
     215             :     }
     216             : 
     217      174273 :     for (int icoef = 3; icoef <= lp; icoef++) {
     218       13266 :       double *__restrict__ poly = &idx2(pol, 1, 0);
     219       13266 :       double *__restrict__ src1 = &idx2(pol, icoef - 1, 0);
     220       13266 :       double *__restrict__ dst = &idx2(pol, icoef, 0);
     221       13266 :       GRID_PRAGMA_SIMD((dst, src1, poly), 8)
     222       13266 :       for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++) {
     223      372702 :         dst[ig] = src1[ig] * poly[ig];
     224             :       }
     225             :     }
     226             : 
     227             :     //
     228      161007 :     if (lp > 0) {
      229             :       // The src pointer cannot be declared const because that would violate
      230             :       // the OpenMP standard.
     231      101487 :       double *__restrict__ dst = &idx2(pol, 1, 0);
     232      101487 :       double *__restrict__ src = &idx2(pol, 0, 0);
     233      365487 :       GRID_PRAGMA_SIMD((dst, src), 8)
     234      101487 :       for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++) {
     235     2493294 :         dst[ig] *= src[ig];
     236             :       }
     237             :     }
     238             :   }
     239      264000 : }
     240             : 
     241           0 : void apply_sphere_cutoff_ortho(struct collocation_integration_ *const handler,
     242             :                                const double disr_radius, const int cmax,
     243             :                                const int *const lb_cube,
     244             :                                const int *const cube_center) {
     245             :   // a mapping so that the ig corresponds to the right grid point
     246           0 :   int **map = handler->map;
     247           0 :   map[1] = map[0] + 2 * cmax + 1;
     248           0 :   map[2] = map[1] + 2 * cmax + 1;
     249             :   // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
     250             : 
     251           0 :   for (int i = 0; i < 3; i++) {
     252           0 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     253           0 :       map[i][ig] = modulo(cube_center[i] + lb_cube[i] + ig -
     254           0 :                               handler->grid.lower_corner[i],
     255             :                           handler->grid.full_size[i]);
     256             :     }
     257             :   }
     258             : 
     259           0 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     260           0 :                             .xmax = handler->grid.window_size[0]};
     261           0 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     262           0 :                             .xmax = handler->grid.window_size[1]};
     263           0 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     264           0 :                             .xmax = handler->grid.window_size[2]};
     265             : 
     266           0 :   for (int kg = 0; kg < handler->cube.size[0]; kg++) {
     267           0 :     const int k = map[0][kg];
     268           0 :     const int kd =
     269           0 :         (2 * (kg + lb_cube[0]) - 1) / 2; // distance from center in grid points
     270           0 :     const double kr = kd * handler->dh[2][2]; // distance from center in a.u.
     271           0 :     const double kremain = disr_radius * disr_radius - kr * kr;
     272             : 
     273           0 :     if ((kremain >= 0.0) && is_point_in_interval(k, zwindow)) {
     274             : 
     275           0 :       const int jgmin = ceil(-1e-8 - sqrt(kremain) * handler->dh_inv[1][1]);
     276           0 :       for (int jg = jgmin; jg <= (1 - jgmin); jg++) {
     277           0 :         const int j = map[1][jg - lb_cube[1]];
     278           0 :         const double jr = ((2 * jg - 1) >> 1) *
     279           0 :                           handler->dh[1][1]; // distance from center in a.u.
     280           0 :         const double jremain = kremain - jr * jr;
     281             : 
     282           0 :         if ((jremain >= 0.0) && is_point_in_interval(j, ywindow)) {
     283           0 :           const int xmin = ceil(-1e-8 - sqrt(jremain) * handler->dh_inv[0][0]);
     284           0 :           const int xmax = 1 - xmin;
     285             : 
     286             :           // printf("xmin %d, xmax %d\n", xmin, xmax);
     287           0 :           for (int x = xmin - lb_cube[2];
     288           0 :                x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
     289           0 :             const int x1 = map[2][x];
     290             : 
     291           0 :             if (!is_point_in_interval(x1, xwindow))
     292           0 :               continue;
     293             : 
     294           0 :             int lower_corner[3] = {k, j, x1};
     295           0 :             int upper_corner[3] = {k + 1, j + 1, x1 + 1};
     296             : 
     297           0 :             compute_interval(map[2], handler->grid.full_size[2],
     298             :                              handler->grid.size[2], xmax - lb_cube[2], x1, &x,
     299             :                              lower_corner + 2, upper_corner + 2, xwindow);
     300             : 
     301           0 :             if (upper_corner[2] - lower_corner[2]) {
     302           0 :               const int position1[3] = {kg, jg - lb_cube[1], x};
     303             : 
     304           0 :               double *restrict dst = &idx3(handler->grid, lower_corner[0],
     305             :                                            lower_corner[1], lower_corner[2]);
     306           0 :               double *restrict src = &idx3(handler->cube, position1[0],
     307             :                                            position1[1], position1[2]);
     308             : 
     309           0 :               const int sizex = upper_corner[2] - lower_corner[2];
     310           0 :               GRID_PRAGMA_SIMD((dst, src), 8)
     311           0 :               for (int x = 0; x < sizex; x++) {
     312           0 :                 dst[x] += src[x];
     313             :               }
     314             :             }
     315             : 
     316           0 :             if (handler->grid.size[2] == handler->grid.full_size[2])
     317           0 :               update_loop_index(handler->grid.full_size[2], x1, &x);
     318             :             else
     319           0 :               x += upper_corner[2] - lower_corner[2] - 1;
     320             :           }
     321             :         }
     322             :       }
     323             :     }
     324             :   }
     325           0 : }
     326             : 
     327           0 : void apply_spherical_cutoff_generic(
     328             :     struct collocation_integration_ *const handler, const double disr_radius,
     329             :     const int cmax, const int *const lb_cube, const int *const ub_cube,
     330             :     const double *const roffset, const int *const cube_center) {
     331             : 
     332           0 :   const double a = handler->dh[0][0] * handler->dh[0][0] +
     333           0 :                    handler->dh[0][1] * handler->dh[0][1] +
     334           0 :                    handler->dh[0][2] * handler->dh[0][2];
     335           0 :   const double a_inv = 1.0 / a;
     336             :   // a mapping so that the ig corresponds to the right grid point
     337           0 :   int **map = handler->map;
     338           0 :   map[1] = map[0] + 2 * cmax + 1;
     339           0 :   map[2] = map[1] + 2 * cmax + 1;
     340             :   // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
     341             : 
     342           0 :   for (int i = 0; i < 3; i++) {
     343           0 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     344           0 :       map[i][ig] = modulo(cube_center[i] + ig + lb_cube[i] -
     345           0 :                               handler->grid.lower_corner[i],
     346             :                           handler->grid.full_size[i]);
     347             :     }
     348             :   }
     349             : 
     350           0 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     351           0 :                             .xmax = handler->grid.window_size[0]};
     352           0 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     353           0 :                             .xmax = handler->grid.window_size[1]};
     354           0 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     355           0 :                             .xmax = handler->grid.window_size[2]};
     356             : 
     357           0 :   for (int k = 0; k < handler->cube.size[0]; k++) {
     358           0 :     const int iz = map[0][k];
     359             : 
     360           0 :     if (!is_point_in_interval(iz, zwindow))
     361           0 :       continue;
     362             : 
     363           0 :     const double tz = (k + lb_cube[0] - roffset[0]);
     364           0 :     const double z[3] = {tz * handler->dh[2][0], tz * handler->dh[2][1],
     365           0 :                          tz * handler->dh[2][2]};
     366             : 
     367           0 :     for (int j = 0; j < handler->cube.size[1]; j++) {
     368           0 :       const int iy = map[1][j];
     369             : 
     370           0 :       if (!is_point_in_interval(iy, ywindow))
     371           0 :         continue;
     372             : 
     373           0 :       const double ty = (j - roffset[1] + lb_cube[1]);
     374           0 :       const double y[3] = {z[0] + ty * handler->dh[1][0],
     375           0 :                            z[1] + ty * handler->dh[1][1],
     376           0 :                            z[2] + ty * handler->dh[1][2]};
     377             : 
      378             :       /* discriminant of the quadratic a*x^2 + b*x + c evaluated below:
      379             :        * Sqrt[(-2 x1 \[Alpha] - 2 y1 \[Beta] - 2 z1 \[Gamma])^2 -
      380             :        *      4 (x1^2 + y1^2 + z1^2) (\[Alpha]^2 + \[Beta]^2 + \[Gamma]^2)] */
     381             : 
     382           0 :       const double b =
     383           0 :           -2.0 * (handler->dh[0][0] * (roffset[2] * handler->dh[0][0] - y[0]) +
     384           0 :                   handler->dh[0][1] * (roffset[2] * handler->dh[0][1] - y[1]) +
     385           0 :                   handler->dh[0][2] * (roffset[2] * handler->dh[0][2] - y[2]));
     386             : 
     387           0 :       const double c = (roffset[2] * handler->dh[0][0] - y[0]) *
     388           0 :                            (roffset[2] * handler->dh[0][0] - y[0]) +
     389           0 :                        (roffset[2] * handler->dh[0][1] - y[1]) *
     390           0 :                            (roffset[2] * handler->dh[0][1] - y[1]) +
     391           0 :                        (roffset[2] * handler->dh[0][2] - y[2]) *
     392             :                            (roffset[2] * handler->dh[0][2] - y[2]) -
     393           0 :                        disr_radius * disr_radius;
     394             : 
     395           0 :       double delta = b * b - 4.0 * a * c;
     396             : 
     397           0 :       if (delta < 0.0)
     398           0 :         continue;
     399             : 
     400           0 :       delta = sqrt(delta);
     401             : 
     402           0 :       const int xmin = imax(ceil((-b - delta) * 0.5 * a_inv), lb_cube[2]);
     403           0 :       const int xmax = imin(floor((-b + delta) * 0.5 * a_inv), ub_cube[2]);
     404             : 
     405           0 :       int lower_corner[3] = {iz, iy, xmin};
     406           0 :       int upper_corner[3] = {iz + 1, iy + 1, xmin};
     407             : 
     408           0 :       for (int x = xmin - lb_cube[2];
     409           0 :            x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
     410           0 :         const int x1 = map[2][x];
     411             : 
     412           0 :         if (!is_point_in_interval(x1, xwindow))
     413           0 :           continue;
     414             : 
     415           0 :         compute_interval(map[2], handler->grid.full_size[2],
     416             :                          handler->grid.size[2], xmax - lb_cube[2], x1, &x,
     417             :                          lower_corner + 2, upper_corner + 2, xwindow);
     418             : 
     419           0 :         if (upper_corner[2] - lower_corner[2]) {
     420           0 :           const int position1[3] = {k, j, x};
     421             : 
     422             :           /* the function will internally take care of the local vs global
     423             :            * grid */
     424             : 
     425           0 :           double *__restrict__ dst = &idx3(handler->grid, lower_corner[0],
     426             :                                            lower_corner[1], lower_corner[2]);
     427           0 :           double *__restrict__ src =
     428           0 :               &idx3(handler->cube, position1[0], position1[1], position1[2]);
     429             : 
     430           0 :           const int sizex = upper_corner[2] - lower_corner[2];
     431           0 :           GRID_PRAGMA_SIMD((dst, src), 8)
     432           0 :           for (int x = 0; x < sizex; x++) {
     433           0 :             dst[x] += src[x];
     434             :           }
     435             : 
     436           0 :           if (handler->grid.size[0] == handler->grid.full_size[0])
     437           0 :             update_loop_index(handler->grid.full_size[2], x1, &x);
     438             :           else
     439           0 :             x += upper_corner[2] - lower_corner[2] - 1;
     440             :         }
     441             :       }
     442             :     }
     443             :   }
     444           0 : }
     445             : 
     446       10560 : void collocate_l0(double *scratch, const double alpha, const bool orthogonal_xy,
     447             :                   const struct tensor_ *exp_xy,
     448             :                   const struct tensor_ *p_alpha_beta_reduced_,
     449             :                   struct tensor_ *cube) {
     450       10560 :   const double *__restrict pz =
      451             :       &idx3(p_alpha_beta_reduced_[0], 0, 0, 0); /* k index */
      452       10560 :   const double *__restrict py =
      453       10560 :       &idx3(p_alpha_beta_reduced_[0], 1, 0, 0); /* j index */
      454       10560 :   const double *__restrict px =
      455       10560 :       &idx3(p_alpha_beta_reduced_[0], 2, 0, 0); /* i index */
     456             : 
     457       10560 :   memset(&idx3(cube[0], 0, 0, 0), 0, sizeof(double) * cube->alloc_size_);
     458       10560 :   memset(scratch, 0, sizeof(double) * cube->size[1] * cube->ld_);
     459             : 
     460       10560 :   cblas_dger(CblasRowMajor, cube->size[1], cube->size[2], alpha, py, 1, px, 1,
     461             :              scratch, cube->ld_);
     462             : 
     463       10560 :   if (exp_xy && !orthogonal_xy) {
     464           0 :     for (int y = 0; y < cube->size[1]; y++) {
     465           0 :       const double *__restrict src = &idx2(exp_xy[0], y, 0);
     466           0 :       double *__restrict dst = &scratch[y * cube->ld_];
     467             :       // #pragma omp simd linear(dst, src) simdlen(8)
     468           0 :       GRID_PRAGMA_SIMD((dst, src), 8)
     469           0 :       for (int x = 0; x < cube->size[2]; x++) {
     470           0 :         dst[x] *= src[x];
     471             :       }
     472             :     }
     473             :   }
     474             : 
     475       10560 :   cblas_dger(CblasRowMajor, cube->size[0], cube->size[1] * cube->ld_, 1.0, pz,
     476       10560 :              1, scratch, 1, &idx3(cube[0], 0, 0, 0), cube->size[1] * cube->ld_);
     477       10560 : }
     478             : 
      479             : /* compute the following operation (variant); it is a tensor contraction
     480             : 
     481             :                                  V_{kji} = \sum_{\alpha\beta\gamma}
     482             :    C_{\alpha\gamma\beta} T_{2,\alpha,i} T_{1,\beta,j} T_{0,\gamma,k}
     483             : 
     484             : */
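/* [Editor's illustration, not part of the CP2K source.] A naive reference
 * implementation of the contraction above; it only serves to make the index
 * convention explicit, while the routine below does the same work with three
 * dgemm calls. The function name, arguments and flat row-major layout are
 * assumptions. */
static void reference_contraction(const int na, const int nb, const int ng,
                                  const int ni, const int nj, const int nk,
                                  const double *C,  /* [na][ng][nb] */
                                  const double *T2, /* [na][ni]     */
                                  const double *T1, /* [nb][nj]     */
                                  const double *T0, /* [ng][nk]     */
                                  double *V) {      /* [nk][nj][ni] */
  for (int k = 0; k < nk; k++)
    for (int j = 0; j < nj; j++)
      for (int i = 0; i < ni; i++) {
        double s = 0.0;
        for (int a = 0; a < na; a++)
          for (int g = 0; g < ng; g++)
            for (int b = 0; b < nb; b++)
              s += C[(a * ng + g) * nb + b] * T2[a * ni + i] *
                   T1[b * nj + j] * T0[g * nk + k];
        V[(k * nj + j) * ni + i] = s;
      }
}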
     485       78720 : void tensor_reduction_for_collocate_integrate(
     486             :     double *scratch, const double alpha, const bool *const orthogonal,
     487             :     const struct tensor_ *Exp, const struct tensor_ *co,
     488             :     const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube) {
     489       78720 :   if (co->size[0] > 1) {
     490       68160 :     dgemm_params m1, m2, m3;
     491             : 
     492       68160 :     memset(&m1, 0, sizeof(dgemm_params));
     493       68160 :     memset(&m2, 0, sizeof(dgemm_params));
     494       68160 :     memset(&m3, 0, sizeof(dgemm_params));
     495       68160 :     tensor T;
     496       68160 :     tensor W;
     497             : 
     498       68160 :     double *__restrict const pz =
      499             :         &idx3(p_alpha_beta_reduced_[0], 0, 0, 0); /* k index */
      500       68160 :     double *__restrict const py =
      501       68160 :         &idx3(p_alpha_beta_reduced_[0], 1, 0, 0); /* j index */
      502       68160 :     double *__restrict const px =
      503       68160 :         &idx3(p_alpha_beta_reduced_[0], 2, 0, 0); /* i index */
     504             : 
     505       68160 :     initialize_tensor_3(&T, co->size[0] /* alpha */, co->size[1] /* gamma */,
     506             :                         cube->size[1] /* j */);
     507       68160 :     initialize_tensor_3(&W, co->size[1] /* gamma */, cube->size[1] /* j */,
     508             :                         cube->size[2] /* i */);
     509             : 
     510       68160 :     T.data = scratch;
     511       68160 :     W.data = scratch + T.alloc_size_;
      512             :     /* WARNING: we are in row-major layout. cblas allows it, and it is more
      513             :      * natural to read left to right than top to bottom.
      514             :      *
      515             :      * We first compute T_{\alpha,\gamma,j} = \sum_\beta C_{\alpha\gamma\beta}
      516             :      * Y_{\beta, j}.
      517             :      *
      518             :      * Keep in mind that Y_{\beta, j} = p_alpha_beta_reduced(1, \beta, j)
      519             :      * and that the order of the indices matters: the last index is the
      520             :      * fastest one. The whole reduction can be done with one dgemm.
      521             :      */
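    /* [Editor's note] Why one dgemm suffices: the coefficients are stored with
     * beta fastest and every (alpha, gamma) row spaced by co->ld_, so the pair
     * (alpha, gamma) is fused into a single row index of length
     * co->size[0] * co->size[1], turning the reduction over beta into an
     * ordinary (alpha*gamma) x beta times beta x j matrix product. */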
     522             : 
     523       68160 :     m1.op1 = 'N';
     524       68160 :     m1.op2 = 'N';
     525       68160 :     m1.alpha = alpha;
     526       68160 :     m1.beta = 0.0;
     527       68160 :     m1.m = co->size[0] * co->size[1]; /* alpha gamma */
     528       68160 :     m1.n = cube->size[1];             /* j */
     529       68160 :     m1.k = co->size[2];               /* beta */
     530       68160 :     m1.a = co->data;                  // Coef_{alpha,gamma,beta} Coef_xzy
     531       68160 :     m1.lda = co->ld_;
     532       68160 :     m1.b = py; // Y_{beta, j} = p_alpha_beta_reduced(1, beta, j)
     533       68160 :     m1.ldb = p_alpha_beta_reduced_->ld_;
     534       68160 :     m1.c = T.data; // T_{\alpha, \gamma, j} = T(alpha, gamma, j)
     535       68160 :     m1.ldc = T.ld_;
     536       68160 :     m1.use_libxsmm = true;
     537             :     /*
     538             :      * the next step is a reduction along the alpha index.
     539             :      *
      540             :      * We then compute
     541             :      *
     542             :      * W_{gamma, j, i} = sum_{\alpha} T_{\gamma, j, alpha} X_{\alpha, i}
     543             :      *
     544             :      * which means we need to transpose T_{\alpha, \gamma, j} to get
      545             :      * T_{\gamma, j, \alpha}. Fortunately this can be done as part of the
      546             :      * matrix-matrix multiplication.
     547             :      */
     548             : 
     549       68160 :     m2.op1 = 'T';
     550       68160 :     m2.op2 = 'N';
     551       68160 :     m2.alpha = 1.0;
     552       68160 :     m2.beta = 0.0;
     553       68160 :     m2.m = cube->size[1] * co->size[1]; // (\gamma j) direction
     554       68160 :     m2.n = cube->size[2];               // i
     555       68160 :     m2.k = co->size[0];                 // alpha
     556       68160 :     m2.a = T.data;                      // T_{\alpha, \gamma, j}
     557       68160 :     m2.lda = T.ld_ * co->size[1];
     558       68160 :     m2.b = px; // X_{alpha, i}  = p_alpha_beta_reduced(0, alpha, i)
     559       68160 :     m2.ldb = p_alpha_beta_reduced_->ld_;
     560       68160 :     m2.c = W.data; // W_{\gamma, j, i}
     561       68160 :     m2.ldc = W.ld_;
     562       68160 :     m2.use_libxsmm = true;
      563             :     /* the final step is again a reduction, this time along the gamma index.
      564             :      * It can again be done with one dgemm. The operation is simply
      565             :      *
      566             :      * Cube_{k, j, i} = \sum_{\gamma} Z_{k, \gamma} W_{\gamma, j, i}
      567             :      *
      568             :      * which means we need to transpose Z_{\gamma, k}.
      569             :      */
     570             : 
     571       68160 :     m3.op1 = 'T';
     572       68160 :     m3.op2 = 'N';
     573       68160 :     m3.alpha = alpha;
     574       68160 :     m3.beta = 0.0;
     575       68160 :     m3.m = cube->size[0];                 // Z_{k \gamma}
     576       68160 :     m3.n = cube->size[1] * cube->size[2]; // (ji) direction
     577       68160 :     m3.k = co->size[1];                   // \gamma
      578       68160 :     m3.a = pz;                            // p_alpha_beta_reduced(0, gamma, k)
     579       68160 :     m3.lda = p_alpha_beta_reduced_->ld_;
     580       68160 :     m3.b = &idx3(W, 0, 0, 0); // W_{\gamma, j, i}
     581       68160 :     m3.ldb = W.size[1] * W.ld_;
     582       68160 :     m3.c = &idx3(cube[0], 0, 0, 0); // cube_{kji}
     583       68160 :     m3.ldc = cube->ld_ * cube->size[1];
     584       68160 :     m3.use_libxsmm = true;
     585       68160 :     dgemm_simplified(&m1);
     586       68160 :     dgemm_simplified(&m2);
     587             : 
      588             :     // apply the non-orthorhombic corrections in the xy plane
     589       68160 :     if (Exp && !orthogonal[2]) {
     590        3112 :       tensor exp_xy;
     591        3112 :       initialize_tensor_2(&exp_xy, Exp->size[1], Exp->size[2]);
     592        3112 :       exp_xy.data = &idx3(Exp[0], 2, 0, 0);
     593        3112 :       apply_non_orthorombic_corrections_xy_blocked(&exp_xy, &W);
     594             :     }
     595             : 
     596       68160 :     dgemm_simplified(&m3);
     597             :   } else {
     598       10560 :     if (Exp && !orthogonal[2]) {
     599           0 :       tensor exp_xy;
     600           0 :       initialize_tensor_2(&exp_xy, Exp->size[1], Exp->size[2]);
     601             : 
     602           0 :       exp_xy.data = &idx3(Exp[0], 2, 0, 0);
     603           0 :       collocate_l0(scratch, co->data[0] * alpha, orthogonal[2], &exp_xy,
     604             :                    p_alpha_beta_reduced_, cube);
     605             :     } else {
     606       10560 :       collocate_l0(scratch, co->data[0] * alpha, true, NULL,
     607             :                    p_alpha_beta_reduced_, cube);
     608             :     }
     609             :   }
     610             : 
     611       78720 :   if (Exp == NULL)
     612             :     return;
     613             : 
     614       44389 :   if (Exp && (!orthogonal[0] && !orthogonal[1])) {
     615           0 :     tensor exp_yz;
     616           0 :     tensor exp_xz;
     617           0 :     initialize_tensor_2(&exp_xz, Exp->size[1], Exp->size[2]);
     618           0 :     initialize_tensor_2(&exp_yz, Exp->size[1], Exp->size[2]);
     619           0 :     exp_xz.data = &idx3(Exp[0], 0, 0, 0);
     620           0 :     exp_yz.data = &idx3(Exp[0], 1, 0, 0);
     621           0 :     apply_non_orthorombic_corrections_xz_yz_blocked(&exp_xz, &exp_yz, cube);
     622           0 :     return;
     623             :   }
     624             : 
     625       44389 :   if (Exp && (!orthogonal[0] || !orthogonal[1])) {
     626       20163 :     if (!orthogonal[0]) {
     627       20163 :       tensor exp_xz;
     628       20163 :       initialize_tensor_2(&exp_xz, Exp->size[1], Exp->size[2]);
     629       20163 :       exp_xz.data = &idx3(Exp[0], 0, 0, 0);
     630       20163 :       apply_non_orthorombic_corrections_xz_blocked(&exp_xz, cube);
     631             :     }
     632             : 
     633       20163 :     if (!orthogonal[1]) {
     634           0 :       tensor exp_yz;
     635           0 :       initialize_tensor_2(&exp_yz, Exp->size[1], Exp->size[2]);
     636           0 :       exp_yz.data = &idx3(Exp[0], 1, 0, 0);
     637           0 :       apply_non_orthorombic_corrections_yz_blocked(&exp_yz, cube);
     638             :     }
     639             :   }
     640             : 
     641             :   return;
     642             : }
     643             : 
      644             : /* This is a sub-optimal version of the mapping in the case of a cubic cutoff,
      645             :  * but it is very general and does not depend on the grid being orthorhombic.
      646             :  * For orthorhombic grids it is faster to apply PBC directly on the polynomials. */
     647             : 
     648       44389 : void apply_mapping_cubic(struct collocation_integration_ *handler,
     649             :                          const int cmax, const int *const lower_boundaries_cube,
     650             :                          const int *const cube_center) {
     651             : 
     652             :   // a mapping so that the ig corresponds to the right grid point
     653       44389 :   int **map = handler->map;
     654       44389 :   map[1] = map[0] + 2 * cmax + 1;
     655       44389 :   map[2] = map[1] + 2 * cmax + 1;
     656             : 
     657      177556 :   for (int i = 0; i < 3; i++) {
     658     3385300 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     659     3252133 :       map[i][ig] = modulo(cube_center[i] + ig + lower_boundaries_cube[i] -
     660     3252133 :                               handler->grid.lower_corner[i],
     661             :                           handler->grid.full_size[i]);
     662             :     }
     663             :   }
     664             : 
     665       44389 :   int lower_corner[3];
     666       44389 :   int upper_corner[3];
     667             : 
     668       44389 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     669       44389 :                             .xmax = handler->grid.window_size[0]};
     670       44389 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     671       44389 :                             .xmax = handler->grid.window_size[1]};
     672       44389 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     673       44389 :                             .xmax = handler->grid.window_size[2]};
     674             : 
      675             :   /* this code decomposes the cube so that blocks of data can be added to the
      676             :    * grid in a vectorized way. */
      677             : 
      678             :   /* the decomposition unfortunately depends on the way the grid is split over
      679             :    * mpi ranks. If each mpi rank has the full grid then it is simple. A 1D
      680             :    * example explains it best (an editor's sketch follows after this function).
      681             :    * We have an interval [x1, x1 + cube_size - 1] (and a second index x in
      682             :    * [0, cube_size - 1]) and a grid that goes from [0 .. grid_size - 1].
      683             :    *
      684             :    * We start from x1 and compute the largest interval [x1 .. x1 + diff] that
      685             :    * fits into [0 .. grid_size - 1]; the length diff is simply
      686             :    * min(grid_size - x1, cube_size - x). Then we add that chunk in a vectorized
      687             :    * fashion and iterate the process, shrinking the interval [x1, x1 +
      688             :    * cube_size - 1] until it is empty. */
     689             : 
     690      130455 :   for (int z = 0; (z < handler->cube.size[0]); z++) {
     691       86066 :     const int z1 = map[0][z];
     692             : 
     693      172132 :     if (!is_point_in_interval(z1, zwindow))
     694           0 :       continue;
     695             : 
     696       86066 :     compute_interval(map[0], handler->grid.full_size[0], handler->grid.size[0],
     697             :                      handler->cube.size[0], z1, &z, lower_corner, upper_corner,
     698             :                      zwindow);
     699             : 
     700             :     /* // We have a full plane. */
     701       86066 :     if (upper_corner[0] - lower_corner[0]) {
     702      261386 :       for (int y = 0; y < handler->cube.size[1]; y++) {
     703      175320 :         const int y1 = map[1][y];
     704             : 
     705             :         // this check is completely irrelevant when running without MPI.
     706      350640 :         if (!is_point_in_interval(y1, ywindow))
     707           0 :           continue;
     708             : 
     709      175320 :         compute_interval(map[1], handler->grid.full_size[1],
     710             :                          handler->grid.size[1], handler->cube.size[1], y1, &y,
     711             :                          lower_corner + 1, upper_corner + 1, ywindow);
     712             : 
     713      175320 :         if (upper_corner[1] - lower_corner[1]) {
     714      570824 :           for (int x = 0; x < handler->cube.size[2]; x++) {
     715      395504 :             const int x1 = map[2][x];
     716             : 
     717      791008 :             if (!is_point_in_interval(x1, xwindow))
     718           0 :               continue;
     719             : 
     720      395504 :             compute_interval(map[2], handler->grid.full_size[2],
     721             :                              handler->grid.size[2], handler->cube.size[2], x1,
     722             :                              &x, lower_corner + 2, upper_corner + 2, xwindow);
     723             : 
     724      395504 :             if (upper_corner[2] - lower_corner[2]) {
     725      395504 :               const int position1[3] = {z, y, x};
      726             :               /* the function will internally take care of the local vs global
     727             :                * grid */
     728      395504 :               add_sub_grid(
     729             :                   lower_corner, // lower corner of the portion of cube (in the
     730             :                   // full grid)
     731             :                   upper_corner, // upper corner of the portion of cube (in the
     732             :                   // full grid)
     733             :                   position1,       // starting position subblock inside the cube
     734      395504 :                   &handler->cube,  // the cube to extract data from
     735             :                   &handler->grid); // the grid to add data from
     736      395504 :               if (handler->grid.size[2] == handler->grid.full_size[2])
     737      395504 :                 update_loop_index(handler->grid.full_size[2], x1, &x);
     738             :               else
     739           0 :                 x += upper_corner[2] - lower_corner[2] - 1;
     740             :             }
     741             :           }
     742      175320 :           if (handler->grid.size[1] == handler->grid.full_size[1])
     743      175320 :             update_loop_index(handler->grid.full_size[1], y1, &y);
     744             :           else
     745           0 :             y += upper_corner[1] - lower_corner[1] - 1;
     746             :         }
     747             :       }
     748       86066 :       if (handler->grid.size[0] == handler->grid.full_size[0])
     749       86066 :         update_loop_index(handler->grid.full_size[0], z1, &z);
     750             :       else
     751           0 :         z += upper_corner[0] - lower_corner[0] - 1;
     752             :     }
     753             :   }
     754       44389 : }
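/* [Editor's illustration, not part of the CP2K source.] A minimal 1D sketch of
 * the decomposition described inside apply_mapping_cubic above, assuming this
 * rank owns the full periodic grid and that x1_start already lies in
 * [0, grid_size). The function name and arguments are hypothetical. */
static void add_cube_1d(const int grid_size, const int cube_size,
                        const int x1_start, const double *cube, double *grid) {
  int x = 0;         /* position inside the cube           */
  int x1 = x1_start; /* corresponding position on the grid */
  while (x < cube_size) {
    /* largest contiguous chunk that fits both the cube and the grid */
    const int diff =
        (grid_size - x1 < cube_size - x) ? grid_size - x1 : cube_size - x;
    for (int i = 0; i < diff; i++) /* contiguous, vectorizable addition */
      grid[x1 + i] += cube[x + i];
    x += diff;
    x1 = 0; /* wrap around: the next chunk starts at the grid origin */
  }
}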
     755             : 
     756             : // *****************************************************************************
     757       44389 : void grid_collocate(collocation_integration *const handler,
     758             :                     const bool use_ortho, const double zetp, const double rp[3],
     759             :                     const double radius) {
     760             :   // *** position of the gaussian product
     761             :   //
     762             :   // this is the actual definition of the position on the grid
     763             :   // i.e. a point rp(:) gets here grid coordinates
     764             :   // MODULO(rp(:)/dr(:),npts(:))+1
     765             :   // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid ((1,1,1) on
     766             :   // grid)
     767             : 
     768             :   // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
     769       44389 :   int cubecenter[3];
     770       44389 :   int cube_size[3];
     771       44389 :   int lb_cube[3], ub_cube[3];
     772       44389 :   int pol_offset[3] = {0, 0, 0};
     773       44389 :   double roffset[3];
     774       44389 :   double disr_radius;
      775             :   /* cube : grid containing the point-wise product of the polynomials
      776             :    *
      777             :    * pol : grid containing the polynomials in all three directions
      778             :    *
      779             :    * pol_folded : grid containing the polynomials after folding for periodic
      780             :    * boundary conditions
      781             :    */
     782             : 
      783             :   /* setting up the cube parameters */
     784       88778 :   int cmax = compute_cube_properties(
     785       44389 :       use_ortho, radius, (const double(*)[3])handler->dh,
     786       44389 :       (const double(*)[3])handler->dh_inv, rp, &disr_radius, roffset,
     787             :       cubecenter, lb_cube, ub_cube, cube_size);
     788             : 
     789             :   /* initialize the multidimensional array containing the polynomials */
     790       44389 :   initialize_tensor_3(&handler->pol, 3, handler->coef.size[0], cmax);
     791       44389 :   realloc_tensor(&handler->pol);
     792       44389 :   memset(handler->pol.data, 0, sizeof(double) * handler->pol.alloc_size_);
     793             : 
     794             :   /* compute the polynomials */
     795             : 
     796             :   // WARNING : do not reverse the order in pol otherwise you will have to
     797             :   // reverse the order in collocate_dgemm as well.
     798             : 
     799       44389 :   if (use_ortho) {
     800       21114 :     grid_fill_pol_dgemm(false, handler->dh[0][0], roffset[2], pol_offset[2],
     801       21114 :                         lb_cube[2], ub_cube[2], handler->coef.size[2] - 1, cmax,
      802       21114 :                         zetp, &idx3(handler->pol, 2, 0, 0)); /* i index */
     803       21114 :     grid_fill_pol_dgemm(false, handler->dh[1][1], roffset[1], pol_offset[1],
     804       21114 :                         lb_cube[1], ub_cube[1], handler->coef.size[1] - 1, cmax,
      805       21114 :                         zetp, &idx3(handler->pol, 1, 0, 0)); /* j index */
     806       21114 :     grid_fill_pol_dgemm(false, handler->dh[2][2], roffset[0], pol_offset[0],
     807       21114 :                         lb_cube[0], ub_cube[0], handler->coef.size[0] - 1, cmax,
      808             :                         zetp, &idx3(handler->pol, 0, 0, 0)); /* k index */
     809             :   } else {
     810       23275 :     grid_fill_pol_dgemm(false, 1.0, roffset[0], pol_offset[2], lb_cube[0],
     811       23275 :                         ub_cube[0], handler->coef.size[0] - 1, cmax,
     812       23275 :                         zetp * handler->dx[0],
      813             :                         &idx3(handler->pol, 0, 0, 0)); /* k index */
     814       23275 :     grid_fill_pol_dgemm(false, 1.0, roffset[1], pol_offset[1], lb_cube[1],
     815       23275 :                         ub_cube[1], handler->coef.size[1] - 1, cmax,
     816       23275 :                         zetp * handler->dx[1],
      817       23275 :                         &idx3(handler->pol, 1, 0, 0)); /* j index */
     818       23275 :     grid_fill_pol_dgemm(false, 1.0, roffset[2], pol_offset[0], lb_cube[2],
     819       23275 :                         ub_cube[2], handler->coef.size[2] - 1, cmax,
     820       23275 :                         zetp * handler->dx[2],
      821       23275 :                         &idx3(handler->pol, 2, 0, 0)); /* i index */
     822             : 
     823       23275 :     calculate_non_orthorombic_corrections_tensor(
     824             :         zetp, roffset, (const double(*)[3])handler->dh, lb_cube, ub_cube,
     825       23275 :         handler->orthogonal, &handler->Exp);
     826             : 
     827             :     /* Use a slightly modified version of Ole code */
     828       23275 :     grid_transform_coef_xzy_to_ikj((const double(*)[3])handler->dh,
     829       23275 :                                    &handler->coef);
     830             :   }
     831             : 
     832             :   /* allocate memory for the polynomial and the cube */
     833             : 
     834       44389 :   initialize_tensor_3(&handler->cube, cube_size[0], cube_size[1], cube_size[2]);
     835             : 
     836       44389 :   realloc_tensor(&handler->cube);
     837       44389 :   initialize_W_and_T(handler, &handler->cube, &handler->coef);
     838             : 
     839       44389 :   tensor_reduction_for_collocate_integrate(
     840       44389 :       handler->scratch, // pointer to scratch memory
     841       44389 :       1.0, handler->orthogonal, &handler->Exp, &handler->coef, &handler->pol,
     842             :       &handler->cube);
     843             : 
     844       44389 :   if (handler->apply_cutoff) {
     845           0 :     if (use_ortho) {
     846           0 :       apply_sphere_cutoff_ortho(handler, disr_radius, cmax, lb_cube,
     847             :                                 cubecenter);
     848             :     } else {
     849           0 :       apply_spherical_cutoff_generic(handler, radius, cmax, lb_cube, ub_cube,
     850             :                                      roffset, cubecenter);
     851             :     }
     852           0 :     return;
     853             :   }
     854       44389 :   apply_mapping_cubic(handler, cmax, lb_cube, cubecenter);
     855             : }
     856             : 
     857             : //******************************************************************************
     858           0 : void grid_dgemm_collocate_pgf_product(
     859             :     const bool use_ortho, const int border_mask, const enum grid_func func,
     860             :     const int la_max, const int la_min, const int lb_max, const int lb_min,
     861             :     const double zeta, const double zetb, const double rscale,
     862             :     const double dh[3][3], const double dh_inv[3][3], const double ra[3],
     863             :     const double rab[3], const int grid_global_size[3],
     864             :     const int grid_local_size[3], const int shift_local[3],
     865             :     const int border_width[3], const double radius, const int o1, const int o2,
     866           0 :     const int n1, const int n2, const double pab_[n2][n1],
     867             :     double *const grid_) {
     868           0 :   collocation_integration *handler = collocate_create_handle();
     869             : 
     870           0 :   tensor pab;
     871           0 :   initialize_tensor_2(&pab, n2, n1);
     872           0 :   alloc_tensor(&pab);
     873           0 :   memcpy(pab.data, pab_, sizeof(double) * n1 * n2);
     874             :   // Uncomment this to dump all tasks to file.
     875             :   // #define __GRID_DUMP_TASKS
     876           0 :   int offset[2] = {o1, o2};
     877             : 
     878           0 :   int lmax[2] = {la_max, lb_max};
     879           0 :   int lmin[2] = {la_min, lb_min};
     880             : 
     881           0 :   const double zetp = zeta + zetb;
     882           0 :   const double f = zetb / zetp;
     883           0 :   const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
     884           0 :   const double prefactor = rscale * exp(-zeta * f * rab2);
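  /* [Editor's note] This is the Gaussian product theorem:
   *   exp(-zeta*|r-ra|^2) * exp(-zetb*|r-rb|^2)
   *     = exp(-zeta*zetb/(zeta+zetb) * |ra-rb|^2) * exp(-zetp*|r-rp|^2)
   * with zetp = zeta + zetb and rp = ra + f*rab, f = zetb/zetp. This is why the
   * pair of Gaussians can be collocated as a single Gaussian centered at rp. */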
     885           0 :   const double zeta_pair[2] = {zeta, zetb};
     886           0 :   initialize_basis_vectors(handler, dh, dh_inv);
     887           0 :   verify_orthogonality((const double(*)[3])dh, handler->orthogonal);
     888             : 
     889           0 :   initialize_tensor_3(&(handler->grid), grid_local_size[2], grid_local_size[1],
     890             :                       grid_local_size[0]);
     891             : 
     892           0 :   handler->grid.ld_ = grid_local_size[0];
     893           0 :   handler->grid.data = grid_;
     894             : 
     895           0 :   setup_global_grid_size(&handler->grid, (const int *)grid_global_size);
     896             : 
     897           0 :   initialize_tensor_3(&handler->grid, grid_local_size[2], grid_local_size[1],
     898             :                       grid_local_size[0]);
     899           0 :   handler->grid.ld_ = grid_local_size[0];
     900             : 
     901           0 :   setup_grid_window(&handler->grid, shift_local, border_width, border_mask);
     902             : 
     903           0 :   double rp[3], rb[3];
     904           0 :   for (int i = 0; i < 3; i++) {
     905           0 :     rp[i] = ra[i] + f * rab[i];
     906           0 :     rb[i] = ra[i] + rab[i];
     907             :   }
     908             : 
     909           0 :   int lmin_diff[2], lmax_diff[2];
     910           0 :   grid_prepare_get_ldiffs_dgemm(func, lmin_diff, lmax_diff);
     911             : 
     912           0 :   int lmin_prep[2];
     913           0 :   int lmax_prep[2];
     914             : 
     915           0 :   lmin_prep[0] = imax(lmin[0] + lmin_diff[0], 0);
     916           0 :   lmin_prep[1] = imax(lmin[1] + lmin_diff[1], 0);
     917             : 
     918           0 :   lmax_prep[0] = lmax[0] + lmax_diff[0];
     919           0 :   lmax_prep[1] = lmax[1] + lmax_diff[1];
     920             : 
     921           0 :   const int n1_prep = ncoset(lmax_prep[0]);
     922           0 :   const int n2_prep = ncoset(lmax_prep[1]);
     923             : 
     924             :   /* I really do not like this. This will disappear */
     925           0 :   tensor pab_prep;
     926           0 :   initialize_tensor_2(&pab_prep, n2_prep, n1_prep);
     927           0 :   alloc_tensor(&pab_prep);
     928           0 :   memset(pab_prep.data, 0, pab_prep.alloc_size_ * sizeof(double));
     929             : 
     930           0 :   grid_prepare_pab_dgemm(func, offset, lmax, lmin, &zeta_pair[0], &pab,
     931             :                          &pab_prep);
     932             : 
      933             :   //   *** initialise the coefficient matrix: we transform the sum
      934             :   //
      935             :   // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
      936             :   //         (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-b_y)**lyb
      937             :   //         (z-b_z)**lzb
     938             :   //
     939             :   // into
     940             :   //
     941             :   // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
     942             :   //
      943             :   // where p is the center of the product gaussian, and lp = la_max + lb_max
     944             :   // (current implementation is l**7)
     945             :   //
     946             : 
      947             :   /* precautionary call: initializing a new tensor may reset its data
      948             :    * pointer to NULL, and we want to keep the (already generously sized)
      949             :    * memory that was allocated earlier */
     950             : 
     951           0 :   initialize_tensor_4(&(handler->alpha), 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
     952           0 :                       lmax_prep[0] + lmax_prep[1] + 1);
     953           0 :   realloc_tensor(&(handler->alpha));
     954             : 
     955           0 :   const int lp = lmax_prep[0] + lmax_prep[1];
     956             : 
     957           0 :   initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
     958           0 :   realloc_tensor(&(handler->coef));
     959             : 
      960             :   // initially cp2k stores coef_xyz as coef[z][y][x]. this is fine, but here
      961             :   // the coefficients need to be stored as coef[x][z][y]
     962             : 
     963           0 :   grid_prepare_alpha_dgemm(ra, rb, rp, lmax_prep, &(handler->alpha));
     964             : 
     965             :   //
     966             :   //   compute P_{lxp,lyp,lzp} given P_{lxa,lya,lza,lxb,lyb,lzb} and
      967             :   //   alpha(ls,lxa,lxb,1) using a three-step procedure. Zeros are not stored,
      968             :   //   so counting is done with lxyz,lxy to keep memory access contiguous in
      969             :   //   collocate_fast.F
     970             :   //
     971             : 
     972             :   // coef[x][z][y]
     973           0 :   grid_compute_coefficients_dgemm(lmin_prep, lmax_prep, lp, prefactor,
     974             :                                   &(handler->alpha), &pab_prep,
     975             :                                   &(handler->coef));
     976             : 
     977           0 :   grid_collocate(handler, use_ortho, zetp, rp, radius);
     978             : 
     979           0 :   collocate_destroy_handle(handler);
     980           0 :   free(pab_prep.data);
     981           0 : }
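
For reference, the quantities zetp, f, prefactor, and rp computed at the top of
the routine above implement the standard Gaussian product theorem: the product
of two Gaussians centered at ra and rb collapses into a single Gaussian
centered at rp, scaled by an exponential of the squared pair distance rab2,

    \[
      e^{-\zeta_a|\mathbf{r}-\mathbf{r}_a|^2}\,
      e^{-\zeta_b|\mathbf{r}-\mathbf{r}_b|^2}
      = e^{-\frac{\zeta_a\zeta_b}{\zeta_a+\zeta_b}\,|\mathbf{r}_{ab}|^2}\,
        e^{-(\zeta_a+\zeta_b)\,|\mathbf{r}-\mathbf{r}_p|^2},
      \qquad
      \mathbf{r}_p = \mathbf{r}_a + \frac{\zeta_b}{\zeta_a+\zeta_b}\,\mathbf{r}_{ab},
      \quad
      \mathbf{r}_{ab} = \mathbf{r}_b - \mathbf{r}_a,
    \]

which corresponds line by line to zetp = zeta + zetb, f = zetb / zetp,
prefactor = rscale * exp(-zeta * f * rab2), and rp[i] = ra[i] + f * rab[i].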
     982             : 
     983        5040 : void extract_blocks(grid_context *const ctx, const _task *const task,
     984             :                     const offload_buffer *pab_blocks, tensor *const work,
     985             :                     tensor *const pab) {
     986        5040 :   const int iatom = task->iatom;
     987        5040 :   const int jatom = task->jatom;
     988        5040 :   const int iset = task->iset;
     989        5040 :   const int jset = task->jset;
     990        5040 :   const int ikind = ctx->atom_kinds[iatom];
     991        5040 :   const int jkind = ctx->atom_kinds[jatom];
     992        5040 :   const grid_basis_set *ibasis = ctx->basis_sets[ikind];
     993        5040 :   const grid_basis_set *jbasis = ctx->basis_sets[jkind];
     994             : 
     995        5040 :   const int block_num = task->block_num;
     996             : 
      997             :   // Locate the current matrix block within the buffer. This block contains
      998             :   // the weights of the gaussian pairs in the spherical harmonic basis, but
      999             :   // the computation is done in the cartesian harmonic basis, so the
     1000             :   // coefficients have to be rotated. This is nothing more than a basis
     1001             :   // change, done with two dgemm calls (a sketch follows after this function).
    1002             : 
    1003        5040 :   const int block_offset = ctx->block_offsets[block_num]; // zero based
    1004        5040 :   double *const block = &pab_blocks->host_buffer[block_offset];
    1005             : 
    1006        5040 :   rotate_to_cartesian_harmonics(ibasis, jbasis, iatom, jatom, iset, jset, block,
    1007             :                                 work, pab);
    1008        5040 : }
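
The basis change mentioned in the comment above is carried out by
rotate_to_cartesian_harmonics via two dgemm calls. As a rough, self-contained
illustration of that pattern only (not the actual routine: the matrix names,
layouts, and the helper rotate_block below are made up for this sketch), the
two multiplications pab = sphi_a^T * block * sphi_b could be written with
CBLAS as:

    #include <cblas.h> /* header name may differ per BLAS installation */

    /* Illustrative only: rotate a (nsgfa x nsgfb) block from the spherical
     * to the cartesian basis as pab = sphi_a^T * block * sphi_b.
     * Row-major layout is assumed: sphi_a is (nsgfa x ncoa), sphi_b is
     * (nsgfb x ncob), work is (nsgfa x ncob) scratch, pab is (ncoa x ncob). */
    static void rotate_block(const int nsgfa, const int nsgfb, const int ncoa,
                             const int ncob, const double *sphi_a,
                             const double *sphi_b, const double *block,
                             double *work, double *pab) {
      /* work = block * sphi_b */
      cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nsgfa, ncob,
                  nsgfb, 1.0, block, nsgfb, sphi_b, ncob, 0.0, work, ncob);
      /* pab = sphi_a^T * work */
      cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, ncoa, ncob, nsgfa,
                  1.0, sphi_a, ncoa, work, ncob, 0.0, pab, ncob);
    }

Two dense multiplications keep the rotation at BLAS-3 speed, which is why this
file leans on dgemm (or libxsmm, when available) for this step.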
    1009             : 
    1010       44389 : void compute_coefficients(grid_context *const ctx,
    1011             :                           struct collocation_integration_ *handler,
    1012             :                           const _task *const previous_task,
    1013             :                           const _task *const task,
    1014             :                           const offload_buffer *pab_blocks, tensor *const pab,
    1015             :                           tensor *const work, tensor *const pab_prep) {
     1016             :   // Load subblock from buffer and decontract into Cartesian subblock pab.
    1017             :   // The previous pab can be reused when only ipgf or jpgf has changed.
    1018       44389 :   if (task->update_block_ || (previous_task == NULL)) {
    1019        4482 :     extract_blocks(ctx, task, pab_blocks, work, pab);
    1020             :   }
    1021             : 
    1022       44389 :   int lmin_prep[2];
    1023       44389 :   int lmax_prep[2];
    1024             : 
    1025       44389 :   lmin_prep[0] = imax(task->lmin[0] + handler->lmin_diff[0], 0);
    1026       44389 :   lmin_prep[1] = imax(task->lmin[1] + handler->lmin_diff[1], 0);
    1027             : 
    1028       44389 :   lmax_prep[0] = task->lmax[0] + handler->lmax_diff[0];
    1029       44389 :   lmax_prep[1] = task->lmax[1] + handler->lmax_diff[1];
    1030             : 
    1031       44389 :   const int n1_prep = ncoset(lmax_prep[0]);
    1032       44389 :   const int n2_prep = ncoset(lmax_prep[1]);
    1033             : 
     1034             :   /* we do not reallocate memory. The structure was already initialized
     1035             :    * with the maximum lmax of the whole task list.
     1036             :    */
    1037       44389 :   initialize_tensor_2(pab_prep, n2_prep, n1_prep);
    1038       44389 :   realloc_tensor(pab_prep);
    1039             : 
    1040       44389 :   grid_prepare_pab_dgemm(handler->func, task->offset, task->lmin, task->lmax,
    1041             :                          &task->zeta[0], pab, pab_prep);
    1042             : 
     1043             :   //   *** initialise the coefficient matrix: we transform the sum
     1044             :   //
     1045             :   // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
     1046             :   //         (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb
     1047             :   //         (y-b_y)**lyb (z-b_z)**lzb
    1048             :   //
    1049             :   // into
    1050             :   //
    1051             :   // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp
    1052             :   // (z-p_z)**lzp
    1053             :   //
     1054             :   // where p is the center of the product gaussian, and lp = la_max + lb_max
     1055             :   // (current implementation is l**7). A worked 1D form of this change of
     1056             :   // center is sketched right after this function.
    1057             : 
     1058             :   /* precautionary call: initializing a new tensor may reset its data
     1059             :    * pointer to NULL, and we want to keep the (already generously sized)
     1060             :    * memory that was allocated earlier */
    1061             : 
    1062       44389 :   initialize_tensor_4(&handler->alpha, 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
    1063       44389 :                       lmax_prep[0] + lmax_prep[1] + 1);
    1064       44389 :   realloc_tensor(&handler->alpha);
    1065             : 
    1066       44389 :   const int lp = lmax_prep[0] + lmax_prep[1];
    1067             : 
    1068       44389 :   initialize_tensor_3(&handler->coef, lp + 1, lp + 1, lp + 1);
    1069       44389 :   realloc_tensor(&handler->coef);
    1070             : 
    1071             :   // these two functions can be done with dgemm again....
    1072             : 
    1073       44389 :   grid_prepare_alpha_dgemm(task->ra, task->rb, task->rp, lmax_prep,
    1074             :                            &handler->alpha);
    1075             : 
    1076             :   // compute the coefficients after applying the function of interest
    1077             :   // coef[x][z][y]
    1078       44389 :   grid_compute_coefficients_dgemm(
    1079             :       lmin_prep, lmax_prep, lp,
    1080       44389 :       task->prefactor * ((task->iatom == task->jatom) ? 1.0 : 2.0),
    1081             :       &handler->alpha, pab_prep, &handler->coef);
    1082       44389 : }
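
The change of center referenced in the comment inside this function reduces,
for a single coordinate, to two binomial expansions around the product center
p (writing l_a, l_b for the angular exponents of the two factors):

    \[
      (x-a)^{l_a}\,(x-b)^{l_b}
      = \sum_{k_1=0}^{l_a}\sum_{k_2=0}^{l_b}
        \binom{l_a}{k_1}\binom{l_b}{k_2}\,
        (p-a)^{l_a-k_1}\,(p-b)^{l_b-k_2}\,(x-p)^{k_1+k_2}.
    \]

Presumably the alpha tensor filled by grid_prepare_alpha_dgemm holds these
binomial-weighted powers of (p-a) and (p-b) per cartesian direction, and the
contraction over (k_1, k_2) against pab_prep is what
grid_compute_coefficients_dgemm performs to obtain handler->coef.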
    1083             : 
    1084         632 : void collocate_one_grid_level_dgemm(grid_context *const ctx,
    1085             :                                     const int *const border_width,
    1086             :                                     const int *const shift_local,
    1087             :                                     const enum grid_func func, const int level,
    1088             :                                     const offload_buffer *pab_blocks) {
    1089         632 :   tensor *const grid = &ctx->grid[level];
    1090             :   // Using default(shared) because with GCC 9 the behavior around const changed:
    1091             :   // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
    1092         632 : #pragma omp parallel default(shared)
    1093             :   {
    1094             :     const int num_threads = omp_get_num_threads();
    1095             :     const int thread_id = omp_get_thread_num();
    1096             : 
    1097             :     tensor work, pab, pab_prep;
    1098             : 
    1099             :     struct collocation_integration_ *handler = ctx->handler[thread_id];
    1100             : 
    1101             :     handler->func = func;
    1102             :     grid_prepare_get_ldiffs_dgemm(handler->func, handler->lmin_diff,
    1103             :                                   handler->lmax_diff);
    1104             : 
    1105             :     handler->apply_cutoff = ctx->apply_cutoff;
    1106             : 
    1107             :     // Allocate pab matrix for re-use across tasks.
    1108             :     initialize_tensor_2(&pab, ctx->maxco, ctx->maxco);
    1109             :     alloc_tensor(&pab);
    1110             : 
    1111             :     initialize_tensor_2(&work, ctx->maxco, ctx->maxco);
    1112             :     alloc_tensor(&work);
    1113             : 
    1114             :     initialize_tensor_2(&pab_prep, ctx->maxco, ctx->maxco);
    1115             :     alloc_tensor(&pab_prep);
    1116             : 
    1117             :     initialize_basis_vectors(handler, (const double(*)[3])grid->dh,
    1118             :                              (const double(*)[3])grid->dh_inv);
    1119             : 
    1120             :     /* setup the grid parameters, window parameters (if the grid is split), etc
    1121             :      */
    1122             : 
    1123             :     tensor_copy(&handler->grid, grid);
    1124             : 
    1125             :     for (int d = 0; d < 3; d++)
    1126             :       handler->orthogonal[d] = handler->grid.orthogonal[d];
    1127             : 
    1128             :     if ((thread_id == 0) || (num_threads == 1)) {
     1129             :       // thread 0 stores its results directly in the final storage space
    1130             :       handler->grid.data = ctx->grid[level].data;
    1131             :     }
    1132             : 
    1133             :     if ((num_threads > 1) && (thread_id > 0)) {
    1134             :       handler->grid.data = ((double *)ctx->scratch) +
    1135             :                            (thread_id - 1) * handler->grid.alloc_size_;
    1136             :       memset(handler->grid.data, 0, sizeof(double) * grid->alloc_size_);
    1137             :     }
    1138             : 
     1139             :     /* prev_task is only useful when the task list is split over multiple
     1140             :      * threads. The first iteration must load the block regardless of the
     1141             :      * value of task->update_block_ */
    1142             :     const _task *prev_task = NULL;
    1143             : #pragma omp for schedule(static)
    1144             :     for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
    1145             :       // Define some convenient aliases.
    1146             :       const _task *task = &ctx->tasks[level][itask];
    1147             : 
    1148             :       if (task->level != level) {
    1149             :         printf("level %d, %d\n", task->level, level);
    1150             :         abort();
    1151             :       }
     1152             :       /* the grid is distributed over several ranks or is not periodic */
    1153             :       if ((handler->grid.size[0] != handler->grid.full_size[0]) ||
    1154             :           (handler->grid.size[1] != handler->grid.full_size[1]) ||
    1155             :           (handler->grid.size[2] != handler->grid.full_size[2])) {
     1156             :         /* unfortunately the window where the gaussian should be added depends
     1157             :          * on the bond, so the window has to be adjusted for every task. */
    1158             : 
    1159             :         setup_grid_window(&handler->grid, shift_local, border_width,
    1160             :                           task->border_mask);
    1161             :       }
    1162             : 
     1163             :       /* this is a three-step procedure. pab_blocks contains the coefficients
     1164             :        * of the operator in the spherical harmonic basis, while the computation
     1165             :        * is done in the cartesian harmonic basis.
     1166             :        *
     1167             :        * step 1 : rotate the coefficients from the spherical harmonic to the
     1168             :        * cartesian harmonic basis
     1169             : 
     1170             :        * step 2 : extract the subblock and apply additional transformations
     1171             :        * corresponding to the spatial derivatives of the operator (this last
     1172             :        * step is not always needed)
     1173             : 
     1174             :        * step 3 : change from (x - x_1)^\alpha (x - x_2)^\beta to
     1175             :        * (x - x_{12})^k. It is a transformation which involves binomial
     1176             :        * coefficients.
     1177             :        *
     1178             :        * \f[ (x - x_1)^\alpha (x - x_2)^\beta =
     1179             :        *     \sum_{k_1=0}^{\alpha} \sum_{k_2=0}^{\beta}
     1180             :        *     \binom{\alpha}{k_1} \binom{\beta}{k_2} (x - x_{12})^{k_1 + k_2}
     1181             :        *     (x_{12} - x_1)^{\alpha - k_1} (x_{12} - x_2)^{\beta - k_2} \f]
     1182             :        *
     1183             :        * step 1 is done only when necessary; the two remaining steps are done
     1184             :        * for each bond.
     1185             :        */
    1186             : 
    1187             :       compute_coefficients(ctx, handler, prev_task, task, pab_blocks, &pab,
    1188             :                            &work, &pab_prep);
    1189             : 
    1190             :       grid_collocate(handler, ctx->orthorhombic, task->zetp, task->rp,
    1191             :                      task->radius);
    1192             :       prev_task = task;
    1193             :     }
    1194             : 
     1195             :     // Merge thread-local grids into the shared grid (a stand-alone sketch of
     1196             :     // this reduction pattern follows after this function). Could be improved.
     1197             :     // Thread 0 does nothing here since its data are already placed in the
     1198             :     // final destination.
    1199             :     if (num_threads > 1) {
    1200             :       if ((grid->alloc_size_ / (num_threads - 1)) >= 2) {
    1201             :         const int block_size = grid->alloc_size_ / (num_threads - 1) +
    1202             :                                (grid->alloc_size_ % (num_threads - 1));
    1203             : 
    1204             :         for (int bk = 0; bk < num_threads; bk++) {
    1205             :           if (thread_id > 0) {
    1206             :             int bk_id = (bk + thread_id - 1) % num_threads;
    1207             :             int begin = bk_id * block_size;
    1208             :             int end = imin((bk_id + 1) * block_size, grid->alloc_size_);
    1209             :             cblas_daxpy(end - begin, 1.0, handler->grid.data + begin, 1,
    1210             :                         grid->data + begin, 1);
    1211             :           }
    1212             : #pragma omp barrier
    1213             :         }
    1214             :       }
    1215             :     } else {
    1216             : #pragma omp critical
    1217             :       if (thread_id > 0)
    1218             :         cblas_daxpy(handler->grid.alloc_size_, 1.0,
    1219             :                     &idx3(handler->grid, 0, 0, 0), 1, &idx3(grid[0], 0, 0, 0),
    1220             :                     1);
    1221             :     }
    1222             :     handler->grid.data = NULL;
    1223             :     free(pab.data);
    1224             :     free(pab_prep.data);
    1225             :     free(work.data);
    1226             :   }
    1227         632 : }
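
The merge at the end of the parallel region above adds each non-zero thread's
private grid into the shared grid block by block, rotating the block index by
the thread id so that between two barriers no two threads update the same
block. A self-contained sketch of that reduction pattern (simplified: every
thread owns a private copy, sizes and names are invented, and plain loops
stand in for cblas_daxpy) could look like this:

    #include <omp.h>

    /* Illustrative only: reduce per-thread copies priv[t] (each of length n)
     * into shared[]. In round bk, thread t adds block (bk + t) % nt, so the
     * threads sweep disjoint blocks between consecutive barriers. */
    void reduce_private_copies(double *shared, double **priv, const int n) {
    #pragma omp parallel
      {
        const int nt = omp_get_num_threads();
        const int tid = omp_get_thread_num();
        const int bs = n / nt + (n % nt != 0); /* block size, rounded up */
        for (int bk = 0; bk < nt; bk++) {
          const int b = (bk + tid) % nt;
          const int begin = b * bs;
          const int end = (begin + bs < n) ? begin + bs : n;
          for (int i = begin; i < end; i++)
            shared[i] += priv[tid][i];
    #pragma omp barrier
        }
      }
    }

In the real routine thread 0 writes directly into the level grid and only
threads 1..num_threads-1 hold scratch copies, which is why the block size
there is derived from (num_threads - 1).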
    1228             : 
    1229             : /*******************************************************************************
    1230             :  * \brief Collocate all tasks of a given list onto given grids.
    1231             :  *        See grid_task_list.h for details.
    1232             :  ******************************************************************************/
    1233         158 : void grid_dgemm_collocate_task_list(grid_dgemm_task_list *const ptr,
    1234             :                                     const enum grid_func func,
    1235             :                                     const int nlevels,
    1236             :                                     const offload_buffer *pab_blocks,
    1237             :                                     offload_buffer *grids[nlevels]) {
    1238             : 
    1239         158 :   grid_context *const ctx = (grid_context *)ptr;
    1240             : 
    1241         158 :   assert(ctx->checksum == ctx_checksum);
    1242             : 
    1243         158 :   const int max_threads = omp_get_max_threads();
    1244             : 
    1245         158 :   assert(ctx->nlevels == nlevels);
    1246             : 
    1247             :   // #pragma omp parallel for
    1248         790 :   for (int level = 0; level < ctx->nlevels; level++) {
    1249         632 :     const _layout *layout = &ctx->layouts[level];
    1250         632 :     set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
    1251         632 :                         layout->npts_global, layout->npts_local,
    1252         632 :                         layout->shift_local, layout->border_width, layout->dh,
    1253         632 :                         layout->dh_inv, grids[level]);
    1254         632 :     memset(ctx->grid[level].data, 0,
    1255         632 :            sizeof(double) * ctx->grid[level].alloc_size_);
    1256             :   }
    1257             : 
    1258         158 :   if (ctx->scratch == NULL) {
    1259         158 :     int max_size = ctx->grid[0].alloc_size_;
    1260             : 
    1261             :     /* compute the size of the largest grid. It is used afterwards to allocate
    1262             :      * scratch memory for the grid on each omp thread */
    1263         632 :     for (int x = 1; x < nlevels; x++) {
    1264         474 :       max_size = imax(ctx->grid[x].alloc_size_, max_size);
    1265             :     }
    1266             : 
    1267         158 :     max_size = ((max_size / 4096) + (max_size % 4096 != 0)) * 4096;
    1268             : 
     1269             :     /* note: scratch is a void pointer */
    1270         158 :     ctx->scratch =
    1271         158 :         grid_allocate_scratch(max_size * max_threads * sizeof(double));
    1272             :   }
    1273             : 
    1274         790 :   for (int level = 0; level < ctx->nlevels; level++) {
    1275         632 :     const _layout *layout = &ctx->layouts[level];
    1276         632 :     collocate_one_grid_level_dgemm(ctx, layout->border_width,
    1277         632 :                                    layout->shift_local, func, level,
    1278             :                                    pab_blocks);
    1279             :   }
    1280             : 
    1281         158 :   grid_free_scratch(ctx->scratch);
    1282         158 :   ctx->scratch = NULL;
    1283         158 : }
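
A small note on the scratch sizing above: max_size is rounded up to a multiple
of 4096 elements before one slot per OpenMP thread is allocated. The
expression used is the usual round-up-to-multiple idiom; a minimal
illustration (round_up is a made-up helper, not part of this file):

    #include <assert.h>

    /* Round n up to the next multiple of m (m > 0); same arithmetic as the
     * ((max_size / 4096) + (max_size % 4096 != 0)) * 4096 line above. */
    static int round_up(const int n, const int m) {
      return ((n / m) + (n % m != 0)) * m;
    }

    int main(void) {
      assert(round_up(4096, 4096) == 4096);
      assert(round_up(4097, 4096) == 8192);
      assert(round_up(1, 4096) == 4096);
      return 0;
    }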

Generated by: LCOV version 1.15