LCOV - code coverage report
Current view: top level - src/grid/dgemm - grid_dgemm_collocate.c (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:85b8a9b) Lines: 64.4 % 587 378
Test Date: 2026-06-14 06:48:14 Functions: 76.9 % 13 10

            Line data    Source code
       1              : /*----------------------------------------------------------------------------*/
       2              : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3              : /*  Copyright 2000-2026 CP2K developers group <https://cp2k.org>              */
       4              : /*                                                                            */
       5              : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6              : /*----------------------------------------------------------------------------*/
       7              : 
       8              : #include <assert.h>
       9              : #include <limits.h>
      10              : #include <math.h>
      11              : #include <omp.h>
      12              : #include <stdbool.h>
      13              : #include <stdio.h>
      14              : #include <stdlib.h>
      15              : #include <string.h>
      16              : 
      17              : #include "../common/grid_basis_set.h"
      18              : #include "../common/grid_common.h"
      19              : #include "../common/grid_constants.h"
      20              : #include "grid_dgemm_coefficients.h"
      21              : #include "grid_dgemm_collocate.h"
      22              : #include "grid_dgemm_collocation_integration.h"
      23              : #include "grid_dgemm_non_orthorombic_corrections.h"
      24              : #include "grid_dgemm_prepare_pab.h"
      25              : #include "grid_dgemm_private_header.h"
      26              : #include "grid_dgemm_task_list.h"
      27              : #include "grid_dgemm_tensor_local.h"
      28              : 
      29              : void collocate_l0(double *scratch, const double alpha, const bool orthogonal,
      30              :                   const struct tensor_ *exp_xy,
      31              :                   const struct tensor_ *p_alpha_beta_reduced_,
      32              :                   struct tensor_ *cube);
      33              : 
      34         5100 : void rotate_to_cartesian_harmonics(const grid_basis_set *ibasis,
      35              :                                    const grid_basis_set *jbasis,
      36              :                                    const int iatom, const int jatom,
      37              :                                    const int iset, const int jset,
      38              :                                    double *const block, tensor *work,
      39              :                                    tensor *pab) {
      40         5100 :   dgemm_params m1, m2;
      41         5100 :   memset(&m1, 0, sizeof(dgemm_params));
      42         5100 :   memset(&m2, 0, sizeof(dgemm_params));
      43              : 
      44              :   // Define some more convenient aliases.
      45         5100 :   const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
      46         5100 :   const int nsgf_setb = jbasis->nsgf_set[jset];
      47         5100 :   const int nsgfa = ibasis->nsgf; // size of entire spherical basis
      48         5100 :   const int nsgfb = jbasis->nsgf;
      49         5100 :   const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
      50         5100 :   const int sgfb = jbasis->first_sgf[jset] - 1;
      51         5100 :   const int maxcoa = ibasis->maxco;
      52         5100 :   const int maxcob = jbasis->maxco;
      53         5100 :   const int ncoseta = ncoset(ibasis->lmax[iset]);
      54         5100 :   const int ncosetb = ncoset(jbasis->lmax[jset]);
      55         5100 :   const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
      56         5100 :   const int ncob = jbasis->npgf[jset] * ncosetb;
      57              : 
      58         5100 :   initialize_tensor_2(work, nsgf_setb, ncoa);
      59         5100 :   realloc_tensor(work);
      60              : 
      61         5100 :   initialize_tensor_2(pab, ncob, ncoa);
      62         5100 :   realloc_tensor(pab);
      63              : 
      64              :   // the rotations happen here.
      65         5100 :   if (iatom <= jatom) {
      66         3388 :     m1.op1 = 'N';
      67         3388 :     m1.op2 = 'N';
      68         3388 :     m1.m = work->size[0];
      69         3388 :     m1.n = work->size[1];
      70         3388 :     m1.k = nsgf_seta;
      71         3388 :     m1.alpha = 1.0;
      72         3388 :     m1.beta = 0.0;
      73         3388 :     m1.a = block + sgfb * nsgfa + sgfa;
      74         3388 :     m1.lda = nsgfa;
      75         3388 :     m1.b = &ibasis->sphi[sgfa * maxcoa];
      76         3388 :     m1.ldb = maxcoa;
      77         3388 :     m1.c = work->data;
      78         3388 :     m1.ldc = work->ld_;
      79              :   } else {
      80         1712 :     m1.op1 = 'T';
      81         1712 :     m1.op2 = 'N';
      82         1712 :     m1.m = work->size[0];
      83         1712 :     m1.n = work->size[1];
      84         1712 :     m1.k = nsgf_seta;
      85         1712 :     m1.alpha = 1.0;
      86         1712 :     m1.beta = 0.0;
      87         1712 :     m1.a = block + sgfa * nsgfb + sgfb;
      88         1712 :     m1.lda = nsgfb;
      89         1712 :     m1.b = &ibasis->sphi[sgfa * maxcoa];
      90         1712 :     m1.ldb = maxcoa;
      91         1712 :     m1.c = work->data;
      92         1712 :     m1.ldc = work->ld_;
      93              :   }
      94              : 
      95         5100 :   dgemm_simplified(&m1);
      96              : 
      97         5100 :   m2.op1 = 'T';
      98         5100 :   m2.op2 = 'N';
      99         5100 :   m2.m = pab->size[0];
     100         5100 :   m2.n = pab->size[1];
     101         5100 :   m2.k = work->size[0];
     102         5100 :   m2.alpha = 1.0;
     103         5100 :   m2.beta = 0.0;
     104         5100 :   m2.a = &jbasis->sphi[sgfb * maxcob];
     105         5100 :   m2.lda = maxcob;
     106         5100 :   m2.b = work->data;
     107         5100 :   m2.ldb = work->ld_;
     108         5100 :   m2.c = pab->data;
     109         5100 :   m2.ldc = pab->ld_;
     110              : 
     111         5100 :   dgemm_simplified(&m2);
     112         5100 : }
     113              : 
     114              : void tensor_reduction_for_collocate_integrate(
     115              :     double *scratch, const double alpha, const bool *const orthogonal,
     116              :     const struct tensor_ *Exp, const struct tensor_ *co,
     117              :     const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube);
     118              : 
     119              : /* compute the functions (x - x_i)^l exp (-eta (x - x_i)^2) for l = 0..lp using
     120              :  * a recursive relation to avoid computing the exponential on each grid point. I
     121              :  * think it is not really necessary anymore since it is *not* the dominating
     122              :  * contribution to computation of collocate and integrate */
     123              : 
     124       271002 : void grid_fill_pol_dgemm(const bool transpose, const double dr,
     125              :                          const double roffset, const int pol_offset,
     126              :                          const int xmin, const int xmax, const int lp,
     127              :                          const int cmax, const double zetp, double *pol_) {
     128       271002 :   tensor pol;
     129       271002 :   const double t_exp_1 = exp(-zetp * dr * dr);
     130       271002 :   const double t_exp_2 = t_exp_1 * t_exp_1;
     131              : 
     132       271002 :   double t_exp_min_1 = exp(-zetp * (dr - roffset) * (dr - roffset));
     133       271002 :   double t_exp_min_2 = exp(2.0 * zetp * (dr - roffset) * dr);
     134              : 
     135       271002 :   double t_exp_plus_1 = exp(-zetp * roffset * roffset);
     136       271002 :   double t_exp_plus_2 = exp(2.0 * zetp * roffset * dr);
     137              : 
     138       271002 :   if (transpose) {
     139       105327 :     initialize_tensor_2(&pol, cmax, lp + 1);
     140       105327 :     pol.data = pol_;
     141              :     /* It is original Ole code. I need to transpose the polynomials for the
     142              :      * integration routine and Ole code already does it. */
     143      1416424 :     for (int ig = 0; ig >= xmin; ig--) {
     144      1311097 :       const double rpg = ig * dr - roffset;
     145      1311097 :       t_exp_min_1 *= t_exp_min_2 * t_exp_1;
     146      1311097 :       t_exp_min_2 *= t_exp_2;
     147      1311097 :       double pg = t_exp_min_1;
     148      5369183 :       for (int icoef = 0; icoef <= lp; icoef++) {
     149      4058086 :         idx2(pol, pol_offset + ig - xmin, icoef) = pg;
     150      4058086 :         pg *= rpg;
     151              :       }
     152              :     }
     153              : 
     154       105327 :     double t_exp_plus_1 = exp(-zetp * roffset * roffset);
     155       105327 :     double t_exp_plus_2 = exp(2 * zetp * roffset * dr);
     156      1360432 :     for (int ig = 1; ig <= xmax; ig++) {
     157      1255105 :       const double rpg = ig * dr - roffset;
     158      1255105 :       t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
     159      1255105 :       t_exp_plus_2 *= t_exp_2;
     160      1255105 :       double pg = t_exp_plus_1;
     161      5140739 :       for (int icoef = 0; icoef <= lp; icoef++) {
     162      3885634 :         idx2(pol, pol_offset + ig - xmin, icoef) = pg;
     163      3885634 :         pg *= rpg;
     164              :       }
     165              :     }
     166              : 
     167              :   } else {
     168       165675 :     initialize_tensor_2(&pol, lp + 1, cmax);
     169       165675 :     pol.data = pol_;
     170              :     /* memset(pol.data, 0, sizeof(double) * pol.alloc_size_); */
     171              :     /*
     172              :      *   compute the values of all (x-xp)**lp*exp(..)
     173              :      *
     174              :      *  still requires the old trick:
     175              :      *  new trick to avoid to many exps (reuse the result from the previous
     176              :      * gridpoint): exp( -a*(x+d)**2)=exp(-a*x**2)*exp(-2*a*x*d)*exp(-a*d**2)
     177              :      *  exp(-2*a*(x+d)*d)=exp(-2*a*x*d)*exp(-2*a*d**2)
     178              :      */
     179              : 
     180              :     /* compute the exponential recursively and store the polynomial prefactors
     181              :      * as well */
     182      2205408 :     for (int ig = 0; ig >= xmin; ig--) {
     183      2039733 :       const double rpg = ig * dr - roffset;
     184      2039733 :       t_exp_min_1 *= t_exp_min_2 * t_exp_1;
     185      2039733 :       t_exp_min_2 *= t_exp_2;
     186      2039733 :       double pg = t_exp_min_1;
     187      2039733 :       idx2(pol, 0, pol_offset + ig - xmin) = pg;
     188      2039733 :       if (lp > 0)
     189      1312089 :         idx2(pol, 1, pol_offset + ig - xmin) = rpg;
     190              :     }
     191              : 
     192      2117082 :     for (int ig = 1; ig <= xmax; ig++) {
     193      1951407 :       const double rpg = ig * dr - roffset;
     194      1951407 :       t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
     195      1951407 :       t_exp_plus_2 *= t_exp_2;
     196      1951407 :       double pg = t_exp_plus_1;
     197      1951407 :       idx2(pol, 0, pol_offset + ig - xmin) = pg;
     198      1951407 :       if (lp > 0)
     199      1253337 :         idx2(pol, 1, pol_offset + ig - xmin) = rpg;
     200              :     }
     201              : 
     202              :     /* compute the remaining powers using previously computed stuff */
     203       165675 :     if (lp >= 2) {
     204        61209 :       double *__restrict__ poly = &idx2(pol, 1, 0);
     205        61209 :       double *__restrict__ src1 = &idx2(pol, 0, 0);
     206        61209 :       double *__restrict__ dst = &idx2(pol, 2, 0);
     207              :       // #pragma omp simd linear(dst, src1, poly) simdlen(8)
     208       240150 :       GRID_PRAGMA_SIMD((dst, src1, poly), 8)
     209        61209 :       for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++)
     210      1477752 :         dst[ig] = src1[ig] * poly[ig] * poly[ig];
     211              :     }
     212              : 
     213       178941 :     for (int icoef = 3; icoef <= lp; icoef++) {
     214        13266 :       double *__restrict__ poly = &idx2(pol, 1, 0);
     215        13266 :       double *__restrict__ src1 = &idx2(pol, icoef - 1, 0);
     216        13266 :       double *__restrict__ dst = &idx2(pol, icoef, 0);
     217        13266 :       GRID_PRAGMA_SIMD((dst, src1, poly), 8)
     218        13266 :       for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++) {
     219       372702 :         dst[ig] = src1[ig] * poly[ig];
     220              :       }
     221              :     }
     222              : 
     223              :     //
     224       165675 :     if (lp > 0) {
     225              :       // I can not declare src__ variable constant because it breaks openmp
     226              :       // standard.
     227       106155 :       double *__restrict__ dst = &idx2(pol, 1, 0);
     228       106155 :       double *__restrict__ src = &idx2(pol, 0, 0);
     229       377157 :       GRID_PRAGMA_SIMD((dst, src), 8)
     230       106155 :       for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++) {
     231      2565426 :         dst[ig] *= src[ig];
     232              :       }
     233              :     }
     234              :   }
     235       271002 : }
     236              : 
     237            0 : void apply_sphere_cutoff_ortho(struct collocation_integration_ *const handler,
     238              :                                const double disr_radius, const int cmax,
     239              :                                const int *const lb_cube,
     240              :                                const int *const cube_center) {
     241              :   // a mapping so that the ig corresponds to the right grid point
     242            0 :   int **map = handler->map;
     243            0 :   map[1] = map[0] + 2 * cmax + 1;
     244            0 :   map[2] = map[1] + 2 * cmax + 1;
     245              :   // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
     246              : 
     247            0 :   for (int i = 0; i < 3; i++) {
     248            0 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     249            0 :       map[i][ig] = modulo(cube_center[i] + lb_cube[i] + ig -
     250            0 :                               handler->grid.lower_corner[i],
     251              :                           handler->grid.full_size[i]);
     252              :     }
     253              :   }
     254              : 
     255            0 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     256            0 :                             .xmax = handler->grid.window_size[0]};
     257            0 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     258            0 :                             .xmax = handler->grid.window_size[1]};
     259            0 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     260            0 :                             .xmax = handler->grid.window_size[2]};
     261              : 
     262            0 :   for (int kg = 0; kg < handler->cube.size[0]; kg++) {
     263            0 :     const int k = map[0][kg];
     264            0 :     const int kd =
     265            0 :         (2 * (kg + lb_cube[0]) - 1) / 2; // distance from center in grid points
     266            0 :     const double kr = kd * handler->dh[2][2]; // distance from center in a.u.
     267            0 :     const double kremain = disr_radius * disr_radius - kr * kr;
     268              : 
     269            0 :     if ((kremain >= 0.0) && is_point_in_interval(k, zwindow)) {
     270              : 
     271            0 :       const int jgmin = ceil(-1e-8 - sqrt(kremain) * handler->dh_inv[1][1]);
     272            0 :       for (int jg = jgmin; jg <= (1 - jgmin); jg++) {
     273            0 :         const int j = map[1][jg - lb_cube[1]];
     274            0 :         const double jr = ((2 * jg - 1) >> 1) *
     275            0 :                           handler->dh[1][1]; // distance from center in a.u.
     276            0 :         const double jremain = kremain - jr * jr;
     277              : 
     278            0 :         if ((jremain >= 0.0) && is_point_in_interval(j, ywindow)) {
     279            0 :           const int xmin = ceil(-1e-8 - sqrt(jremain) * handler->dh_inv[0][0]);
     280            0 :           const int xmax = 1 - xmin;
     281              : 
     282              :           // printf("xmin %d, xmax %d\n", xmin, xmax);
     283            0 :           for (int x = xmin - lb_cube[2];
     284            0 :                x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
     285            0 :             const int x1 = map[2][x];
     286              : 
     287            0 :             if (!is_point_in_interval(x1, xwindow))
     288            0 :               continue;
     289              : 
     290            0 :             int lower_corner[3] = {k, j, x1};
     291            0 :             int upper_corner[3] = {k + 1, j + 1, x1 + 1};
     292              : 
     293            0 :             compute_interval(map[2], handler->grid.full_size[2],
     294            0 :                              handler->grid.size[2], xmax - lb_cube[2], x1, &x,
     295              :                              lower_corner + 2, upper_corner + 2, xwindow);
     296              : 
     297            0 :             if (upper_corner[2] - lower_corner[2]) {
     298            0 :               const int position1[3] = {kg, jg - lb_cube[1], x};
     299              : 
     300            0 :               double *restrict dst = &idx3(handler->grid, lower_corner[0],
     301              :                                            lower_corner[1], lower_corner[2]);
     302            0 :               double *restrict src = &idx3(handler->cube, position1[0],
     303              :                                            position1[1], position1[2]);
     304              : 
     305            0 :               const int sizex = upper_corner[2] - lower_corner[2];
     306            0 :               GRID_PRAGMA_SIMD((dst, src), 8)
     307            0 :               for (int x = 0; x < sizex; x++) {
     308            0 :                 dst[x] += src[x];
     309              :               }
     310              :             }
     311              : 
     312            0 :             if (handler->grid.size[2] == handler->grid.full_size[2])
     313            0 :               update_loop_index(handler->grid.full_size[2], x1, &x);
     314              :             else
     315            0 :               x += upper_corner[2] - lower_corner[2] - 1;
     316              :           }
     317              :         }
     318              :       }
     319              :     }
     320              :   }
     321            0 : }
     322              : 
     323            0 : void apply_spherical_cutoff_generic(
     324              :     struct collocation_integration_ *const handler, const double disr_radius,
     325              :     const int cmax, const int *const lb_cube, const int *const ub_cube,
     326              :     const double *const roffset, const int *const cube_center) {
     327              : 
     328            0 :   const double a = handler->dh[0][0] * handler->dh[0][0] +
     329            0 :                    handler->dh[0][1] * handler->dh[0][1] +
     330            0 :                    handler->dh[0][2] * handler->dh[0][2];
     331            0 :   const double a_inv = 1.0 / a;
     332              :   // a mapping so that the ig corresponds to the right grid point
     333            0 :   int **map = handler->map;
     334            0 :   map[1] = map[0] + 2 * cmax + 1;
     335            0 :   map[2] = map[1] + 2 * cmax + 1;
     336              :   // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
     337              : 
     338            0 :   for (int i = 0; i < 3; i++) {
     339            0 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     340            0 :       map[i][ig] = modulo(cube_center[i] + ig + lb_cube[i] -
     341            0 :                               handler->grid.lower_corner[i],
     342              :                           handler->grid.full_size[i]);
     343              :     }
     344              :   }
     345              : 
     346            0 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     347            0 :                             .xmax = handler->grid.window_size[0]};
     348            0 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     349            0 :                             .xmax = handler->grid.window_size[1]};
     350            0 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     351            0 :                             .xmax = handler->grid.window_size[2]};
     352              : 
     353            0 :   for (int k = 0; k < handler->cube.size[0]; k++) {
     354            0 :     const int iz = map[0][k];
     355              : 
     356            0 :     if (!is_point_in_interval(iz, zwindow))
     357            0 :       continue;
     358              : 
     359            0 :     const double tz = (k + lb_cube[0] - roffset[0]);
     360            0 :     const double z[3] = {tz * handler->dh[2][0], tz * handler->dh[2][1],
     361            0 :                          tz * handler->dh[2][2]};
     362              : 
     363            0 :     for (int j = 0; j < handler->cube.size[1]; j++) {
     364            0 :       const int iy = map[1][j];
     365              : 
     366            0 :       if (!is_point_in_interval(iy, ywindow))
     367            0 :         continue;
     368              : 
     369            0 :       const double ty = (j - roffset[1] + lb_cube[1]);
     370            0 :       const double y[3] = {z[0] + ty * handler->dh[1][0],
     371            0 :                            z[1] + ty * handler->dh[1][1],
     372            0 :                            z[2] + ty * handler->dh[1][2]};
     373              : 
     374              :       /* Sqrt[(-2 x1 \[Alpha] - 2 y1 \[Beta] - 2 z1 \[Gamma])^2 - */
     375              :       /*                                            4 (x1^2 + y1^2 + z1^2)
     376              :        * (\[Alpha]^2 + \[Beta]^2 + \[Gamma]^2)] */
     377              : 
     378            0 :       const double b =
     379            0 :           -2.0 * (handler->dh[0][0] * (roffset[2] * handler->dh[0][0] - y[0]) +
     380            0 :                   handler->dh[0][1] * (roffset[2] * handler->dh[0][1] - y[1]) +
     381            0 :                   handler->dh[0][2] * (roffset[2] * handler->dh[0][2] - y[2]));
     382              : 
     383            0 :       const double c = (roffset[2] * handler->dh[0][0] - y[0]) *
     384            0 :                            (roffset[2] * handler->dh[0][0] - y[0]) +
     385            0 :                        (roffset[2] * handler->dh[0][1] - y[1]) *
     386            0 :                            (roffset[2] * handler->dh[0][1] - y[1]) +
     387            0 :                        (roffset[2] * handler->dh[0][2] - y[2]) *
     388              :                            (roffset[2] * handler->dh[0][2] - y[2]) -
     389            0 :                        disr_radius * disr_radius;
     390              : 
     391            0 :       double delta = b * b - 4.0 * a * c;
     392              : 
     393            0 :       if (delta < 0.0)
     394            0 :         continue;
     395              : 
     396            0 :       delta = sqrt(delta);
     397              : 
     398            0 :       const int xmin = imax(ceil((-b - delta) * 0.5 * a_inv), lb_cube[2]);
     399            0 :       const int xmax = imin(floor((-b + delta) * 0.5 * a_inv), ub_cube[2]);
     400              : 
     401            0 :       int lower_corner[3] = {iz, iy, xmin};
     402            0 :       int upper_corner[3] = {iz + 1, iy + 1, xmin};
     403              : 
     404            0 :       for (int x = xmin - lb_cube[2];
     405            0 :            x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
     406            0 :         const int x1 = map[2][x];
     407              : 
     408            0 :         if (!is_point_in_interval(x1, xwindow))
     409            0 :           continue;
     410              : 
     411            0 :         compute_interval(map[2], handler->grid.full_size[2],
     412            0 :                          handler->grid.size[2], xmax - lb_cube[2], x1, &x,
     413              :                          lower_corner + 2, upper_corner + 2, xwindow);
     414              : 
     415            0 :         if (upper_corner[2] - lower_corner[2]) {
     416            0 :           const int position1[3] = {k, j, x};
     417              : 
     418              :           /* the function will internally take care of the local vs global
     419              :            * grid */
     420              : 
     421            0 :           double *__restrict__ dst = &idx3(handler->grid, lower_corner[0],
     422              :                                            lower_corner[1], lower_corner[2]);
     423            0 :           double *__restrict__ src =
     424            0 :               &idx3(handler->cube, position1[0], position1[1], position1[2]);
     425              : 
     426            0 :           const int sizex = upper_corner[2] - lower_corner[2];
     427            0 :           GRID_PRAGMA_SIMD((dst, src), 8)
     428            0 :           for (int x = 0; x < sizex; x++) {
     429            0 :             dst[x] += src[x];
     430              :           }
     431              : 
     432            0 :           if (handler->grid.size[0] == handler->grid.full_size[0])
     433            0 :             update_loop_index(handler->grid.full_size[2], x1, &x);
     434              :           else
     435            0 :             x += upper_corner[2] - lower_corner[2] - 1;
     436              :         }
     437              :       }
     438              :     }
     439              :   }
     440            0 : }
     441              : 
     442        10560 : void collocate_l0(double *scratch, const double alpha, const bool orthogonal_xy,
     443              :                   const struct tensor_ *exp_xy,
     444              :                   const struct tensor_ *p_alpha_beta_reduced_,
     445              :                   struct tensor_ *cube) {
     446        10560 :   const double *__restrict pz =
     447              :       &idx3(p_alpha_beta_reduced_[0], 0, 0, 0); /* k indice */
     448        10560 :   const double *__restrict py =
     449        10560 :       &idx3(p_alpha_beta_reduced_[0], 1, 0, 0); /* j indice */
     450        10560 :   const double *__restrict px =
     451        10560 :       &idx3(p_alpha_beta_reduced_[0], 2, 0, 0); /* i indice */
     452              : 
     453        10560 :   memset(&idx3(cube[0], 0, 0, 0), 0, sizeof(double) * cube->alloc_size_);
     454        10560 :   memset(scratch, 0, sizeof(double) * cube->size[1] * cube->ld_);
     455              : 
     456        10560 :   cblas_dger(CblasRowMajor, cube->size[1], cube->size[2], alpha, py, 1, px, 1,
     457        10560 :              scratch, cube->ld_);
     458              : 
     459        10560 :   if (exp_xy && !orthogonal_xy) {
     460            0 :     for (int y = 0; y < cube->size[1]; y++) {
     461            0 :       const double *__restrict src = &idx2(exp_xy[0], y, 0);
     462            0 :       double *__restrict dst = &scratch[y * cube->ld_];
     463              :       // #pragma omp simd linear(dst, src) simdlen(8)
     464            0 :       GRID_PRAGMA_SIMD((dst, src), 8)
     465            0 :       for (int x = 0; x < cube->size[2]; x++) {
     466            0 :         dst[x] *= src[x];
     467              :       }
     468              :     }
     469              :   }
     470              : 
     471        10560 :   cblas_dger(CblasRowMajor, cube->size[0], cube->size[1] * cube->ld_, 1.0, pz,
     472        10560 :              1, scratch, 1, &idx3(cube[0], 0, 0, 0), cube->size[1] * cube->ld_);
     473        10560 : }
     474              : 
     475              : /* compute the following operation (variant) it is a tensor contraction
     476              : 
     477              :                                  V_{kji} = \sum_{\alpha\beta\gamma}
     478              :    C_{\alpha\gamma\beta} T_{2,\alpha,i} T_{1,\beta,j} T_{0,\gamma,k}
     479              : 
     480              : */
     481        81054 : void tensor_reduction_for_collocate_integrate(
     482              :     double *scratch, const double alpha, const bool *const orthogonal,
     483              :     const struct tensor_ *Exp, const struct tensor_ *co,
     484              :     const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube) {
     485        81054 :   if (co->size[0] > 1) {
     486        70494 :     dgemm_params m1, m2, m3;
     487              : 
     488        70494 :     memset(&m1, 0, sizeof(dgemm_params));
     489        70494 :     memset(&m2, 0, sizeof(dgemm_params));
     490        70494 :     memset(&m3, 0, sizeof(dgemm_params));
     491        70494 :     tensor T;
     492        70494 :     tensor W;
     493              : 
     494        70494 :     double *__restrict const pz =
     495              :         &idx3(p_alpha_beta_reduced_[0], 0, 0, 0); /* k indice */
     496        70494 :     double *__restrict const py =
     497        70494 :         &idx3(p_alpha_beta_reduced_[0], 1, 0, 0); /* j indice */
     498        70494 :     double *__restrict const px =
     499        70494 :         &idx3(p_alpha_beta_reduced_[0], 2, 0, 0); /* i indice */
     500              : 
     501        70494 :     initialize_tensor_3(&T, co->size[0] /* alpha */, co->size[1] /* gamma */,
     502              :                         cube->size[1] /* j */);
     503        70494 :     initialize_tensor_3(&W, co->size[1] /* gamma */, cube->size[1] /* j */,
     504              :                         cube->size[2] /* i */);
     505              : 
     506        70494 :     T.data = scratch;
     507        70494 :     W.data = scratch + T.alloc_size_;
     508              :     /* WARNING we are in row major layout. cblas allows it and it is more
     509              :      * natural to read left to right than top to bottom
     510              :      *
     511              :      * we do first T_{\alpha,\gamma,j} = \sum_beta C_{alpha\gamma\beta}
     512              :      * Y_{\beta, j}
     513              :      *
     514              :      * keep in mind that Y_{\beta, j} = p_alpha_beta_reduced(1, \beta, j)
     515              :      * and the order of indices is also important. the last indice is the
     516              :      * fastest one. it can be done with one dgemm.
     517              :      */
     518              : 
     519        70494 :     m1.op1 = 'N';
     520        70494 :     m1.op2 = 'N';
     521        70494 :     m1.alpha = alpha;
     522        70494 :     m1.beta = 0.0;
     523        70494 :     m1.m = co->size[0] * co->size[1]; /* alpha gamma */
     524        70494 :     m1.n = cube->size[1];             /* j */
     525        70494 :     m1.k = co->size[2];               /* beta */
     526        70494 :     m1.a = co->data;                  // Coef_{alpha,gamma,beta} Coef_xzy
     527        70494 :     m1.lda = co->ld_;
     528        70494 :     m1.b = py; // Y_{beta, j} = p_alpha_beta_reduced(1, beta, j)
     529        70494 :     m1.ldb = p_alpha_beta_reduced_->ld_;
     530        70494 :     m1.c = T.data; // T_{\alpha, \gamma, j} = T(alpha, gamma, j)
     531        70494 :     m1.ldc = T.ld_;
     532              : 
     533              :     /*
     534              :      * the next step is a reduction along the alpha index.
     535              :      *
     536              :      * We compute then
     537              :      *
     538              :      * W_{gamma, j, i} = sum_{\alpha} T_{\gamma, j, alpha} X_{\alpha, i}
     539              :      *
     540              :      * which means we need to transpose T_{\alpha, \gamma, j} to get
     541              :      * T_{\gamma, j, \alpha}. Fortunately we can do it while doing the
     542              :      * matrix - matrix multiplication
     543              :      */
     544              : 
     545        70494 :     m2.op1 = 'T';
     546        70494 :     m2.op2 = 'N';
     547        70494 :     m2.alpha = 1.0;
     548        70494 :     m2.beta = 0.0;
     549        70494 :     m2.m = cube->size[1] * co->size[1]; // (\gamma j) direction
     550        70494 :     m2.n = cube->size[2];               // i
     551        70494 :     m2.k = co->size[0];                 // alpha
     552        70494 :     m2.a = T.data;                      // T_{\alpha, \gamma, j}
     553        70494 :     m2.lda = T.ld_ * co->size[1];
     554        70494 :     m2.b = px; // X_{alpha, i}  = p_alpha_beta_reduced(0, alpha, i)
     555        70494 :     m2.ldb = p_alpha_beta_reduced_->ld_;
     556        70494 :     m2.c = W.data; // W_{\gamma, j, i}
     557        70494 :     m2.ldc = W.ld_;
     558              : 
     559              :     /* the final step is again a reduction along the gamma indice. It can
     560              :      * again be done with one dgemm. The operation is simply
     561              :      *
     562              :      * Cube_{k, j, i} = \sum_{alpha} Z_{k, \gamma} W_{\gamma, j, i}
     563              :      *
     564              :      * which means we need to transpose Z_{\gamma, k}.
     565              :      */
     566              : 
     567        70494 :     m3.op1 = 'T';
     568        70494 :     m3.op2 = 'N';
     569        70494 :     m3.alpha = alpha;
     570        70494 :     m3.beta = 0.0;
     571        70494 :     m3.m = cube->size[0];                 // Z_{k \gamma}
     572        70494 :     m3.n = cube->size[1] * cube->size[2]; // (ji) direction
     573        70494 :     m3.k = co->size[1];                   // \gamma
     574        70494 :     m3.a = pz;                            // p_alpha_beta_reduced(0, gamma, i)
     575        70494 :     m3.lda = p_alpha_beta_reduced_->ld_;
     576        70494 :     m3.b = &idx3(W, 0, 0, 0); // W_{\gamma, j, i}
     577        70494 :     m3.ldb = W.size[1] * W.ld_;
     578        70494 :     m3.c = &idx3(cube[0], 0, 0, 0); // cube_{kji}
     579        70494 :     m3.ldc = cube->ld_ * cube->size[1];
     580              : 
     581        70494 :     dgemm_simplified(&m1);
     582        70494 :     dgemm_simplified(&m2);
     583              : 
     584              :     // apply the non orthorombic corrections in the xy plane
     585        70494 :     if (Exp && !orthogonal[2]) {
     586         4668 :       tensor exp_xy;
     587         4668 :       initialize_tensor_2(&exp_xy, Exp->size[1], Exp->size[2]);
     588         4668 :       exp_xy.data = &idx3(Exp[0], 2, 0, 0);
     589         4668 :       apply_non_orthorombic_corrections_xy_blocked(&exp_xy, &W);
     590              :     }
     591              : 
     592        70494 :     dgemm_simplified(&m3);
     593              :   } else {
     594        10560 :     if (Exp && !orthogonal[2]) {
     595            0 :       tensor exp_xy;
     596            0 :       initialize_tensor_2(&exp_xy, Exp->size[1], Exp->size[2]);
     597              : 
     598            0 :       exp_xy.data = &idx3(Exp[0], 2, 0, 0);
     599            0 :       collocate_l0(scratch, co->data[0] * alpha, orthogonal[2], &exp_xy,
     600              :                    p_alpha_beta_reduced_, cube);
     601              :     } else {
     602        10560 :       collocate_l0(scratch, co->data[0] * alpha, true, NULL,
     603              :                    p_alpha_beta_reduced_, cube);
     604              :     }
     605              :   }
     606              : 
     607        81054 :   if (Exp == NULL)
     608              :     return;
     609              : 
     610        45945 :   if (Exp && (!orthogonal[0] && !orthogonal[1])) {
     611            0 :     tensor exp_yz;
     612            0 :     tensor exp_xz;
     613            0 :     initialize_tensor_2(&exp_xz, Exp->size[1], Exp->size[2]);
     614            0 :     initialize_tensor_2(&exp_yz, Exp->size[1], Exp->size[2]);
     615            0 :     exp_xz.data = &idx3(Exp[0], 0, 0, 0);
     616            0 :     exp_yz.data = &idx3(Exp[0], 1, 0, 0);
     617            0 :     apply_non_orthorombic_corrections_xz_yz_blocked(&exp_xz, &exp_yz, cube);
     618            0 :     return;
     619              :   }
     620              : 
     621        45945 :   if (Exp && (!orthogonal[0] || !orthogonal[1])) {
     622        20163 :     if (!orthogonal[0]) {
     623        20163 :       tensor exp_xz;
     624        20163 :       initialize_tensor_2(&exp_xz, Exp->size[1], Exp->size[2]);
     625        20163 :       exp_xz.data = &idx3(Exp[0], 0, 0, 0);
     626        20163 :       apply_non_orthorombic_corrections_xz_blocked(&exp_xz, cube);
     627              :     }
     628              : 
     629        20163 :     if (!orthogonal[1]) {
     630            0 :       tensor exp_yz;
     631            0 :       initialize_tensor_2(&exp_yz, Exp->size[1], Exp->size[2]);
     632            0 :       exp_yz.data = &idx3(Exp[0], 1, 0, 0);
     633            0 :       apply_non_orthorombic_corrections_yz_blocked(&exp_yz, cube);
     634              :     }
     635              :   }
     636              : 
     637              :   return;
     638              : }
     639              : 
     640              : /* It is a sub-optimal version of the mapping in case of a cubic cutoff. But is
     641              :  * very general and does not depend on the orthorombic nature of the grid. for
     642              :  * orthorombic cases, it is faster to apply PCB directly on the polynomials. */
     643              : 
     644        45945 : void apply_mapping_cubic(struct collocation_integration_ *handler,
     645              :                          const int cmax, const int *const lower_boundaries_cube,
     646              :                          const int *const cube_center) {
     647              : 
     648              :   // a mapping so that the ig corresponds to the right grid point
     649        45945 :   int **map = handler->map;
     650        45945 :   map[1] = map[0] + 2 * cmax + 1;
     651        45945 :   map[2] = map[1] + 2 * cmax + 1;
     652              : 
     653       183780 :   for (int i = 0; i < 3; i++) {
     654      3462100 :     for (int ig = 0; ig < handler->cube.size[i]; ig++) {
     655      3324265 :       map[i][ig] = modulo(cube_center[i] + ig + lower_boundaries_cube[i] -
     656      3324265 :                               handler->grid.lower_corner[i],
     657              :                           handler->grid.full_size[i]);
     658              :     }
     659              :   }
     660              : 
     661        45945 :   int lower_corner[3];
     662        45945 :   int upper_corner[3];
     663              : 
     664        45945 :   const Interval zwindow = {.xmin = handler->grid.window_shift[0],
     665        45945 :                             .xmax = handler->grid.window_size[0]};
     666        45945 :   const Interval ywindow = {.xmin = handler->grid.window_shift[1],
     667        45945 :                             .xmax = handler->grid.window_size[1]};
     668        45945 :   const Interval xwindow = {.xmin = handler->grid.window_shift[2],
     669        45945 :                             .xmax = handler->grid.window_size[2]};
     670              : 
     671              :   /* this code makes a decomposition of the cube such that we can add block of
     672              :    * datas in a vectorized way. */
     673              : 
     674              :   /* the decomposition depends unfortunately on the way the grid is split over
     675              :    * mpi ranks. If each mpi rank has the full grid then it is simple. A 1D
     676              :    * example of the decomposition will explain it better. We have an interval
     677              :    * [x1, x1 + cube_size - 1] (and a second index x [0, cube_size -1]) and a
     678              :    * grid that goes from [0.. grid_size - 1].
     679              :    *
     680              :    * We start from x1 and compute the largest interval [x1.. x1 + diff] that fit
     681              :    * to [0.. grid_size - 1]. Computing the difference diff is simply
     682              :    * min(grid_size - x1, cube_size - x). then we add the result in a vectorized
     683              :    * fashion. we itterate the processus by reducing the interval [x1, x1 +
     684              :    * cube_size - 1] until it is empty. */
     685              : 
     686       134487 :   for (int z = 0; (z < handler->cube.size[0]); z++) {
     687        88542 :     const int z1 = map[0][z];
     688              : 
     689        88542 :     if (!is_point_in_interval(z1, zwindow))
     690            0 :       continue;
     691              : 
     692        88542 :     compute_interval(map[0], handler->grid.full_size[0], handler->grid.size[0],
     693              :                      handler->cube.size[0], z1, &z, lower_corner, upper_corner,
     694              :                      zwindow);
     695              : 
     696              :     /* // We have a full plane. */
     697        88542 :     if (upper_corner[0] - lower_corner[0]) {
     698       268410 :       for (int y = 0; y < handler->cube.size[1]; y++) {
     699       179868 :         const int y1 = map[1][y];
     700              : 
     701              :         // this check is completely irrelevant when running without MPI.
     702       179868 :         if (!is_point_in_interval(y1, ywindow))
     703            0 :           continue;
     704              : 
     705       179868 :         compute_interval(map[1], handler->grid.full_size[1],
     706       179868 :                          handler->grid.size[1], handler->cube.size[1], y1, &y,
     707              :                          lower_corner + 1, upper_corner + 1, ywindow);
     708              : 
     709       179868 :         if (upper_corner[1] - lower_corner[1]) {
     710       594034 :           for (int x = 0; x < handler->cube.size[2]; x++) {
     711       414166 :             const int x1 = map[2][x];
     712              : 
     713       414166 :             if (!is_point_in_interval(x1, xwindow))
     714            0 :               continue;
     715              : 
     716       414166 :             compute_interval(map[2], handler->grid.full_size[2],
     717       414166 :                              handler->grid.size[2], handler->cube.size[2], x1,
     718              :                              &x, lower_corner + 2, upper_corner + 2, xwindow);
     719              : 
     720       414166 :             if (upper_corner[2] - lower_corner[2]) {
     721       414166 :               const int position1[3] = {z, y, x};
     722              :               /* the function will internally take care of the local vx global
     723              :                * grid */
     724       414166 :               add_sub_grid(
     725              :                   lower_corner, // lower corner of the portion of cube (in the
     726              :                   // full grid)
     727              :                   upper_corner, // upper corner of the portion of cube (in the
     728              :                   // full grid)
     729              :                   position1,       // starting position subblock inside the cube
     730       414166 :                   &handler->cube,  // the cube to extract data from
     731              :                   &handler->grid); // the grid to add data from
     732       414166 :               if (handler->grid.size[2] == handler->grid.full_size[2])
     733       414166 :                 update_loop_index(handler->grid.full_size[2], x1, &x);
     734              :               else
     735            0 :                 x += upper_corner[2] - lower_corner[2] - 1;
     736              :             }
     737              :           }
     738       179868 :           if (handler->grid.size[1] == handler->grid.full_size[1])
     739       179868 :             update_loop_index(handler->grid.full_size[1], y1, &y);
     740              :           else
     741            0 :             y += upper_corner[1] - lower_corner[1] - 1;
     742              :         }
     743              :       }
     744        88542 :       if (handler->grid.size[0] == handler->grid.full_size[0])
     745        88542 :         update_loop_index(handler->grid.full_size[0], z1, &z);
     746              :       else
     747            0 :         z += upper_corner[0] - lower_corner[0] - 1;
     748              :     }
     749              :   }
     750        45945 : }
     751              : 
     752              : // *****************************************************************************
     753        45945 : void grid_collocate(collocation_integration *const handler,
     754              :                     const bool use_ortho, const double zetp, const double rp[3],
     755              :                     const double radius) {
     756              :   // *** position of the gaussian product
     757              :   //
     758              :   // this is the actual definition of the position on the grid
     759              :   // i.e. a point rp(:) gets here grid coordinates
     760              :   // MODULO(rp(:)/dr(:),npts(:))+1
     761              :   // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid ((1,1,1) on
     762              :   // grid)
     763              : 
     764              :   // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
     765        45945 :   int cubecenter[3];
     766        45945 :   int cube_size[3];
     767        45945 :   int lb_cube[3], ub_cube[3];
     768        45945 :   int pol_offset[3] = {0, 0, 0};
     769        45945 :   double roffset[3];
     770        45945 :   double disr_radius;
     771              :   /* cube : grid containing pointlike product between polynomials
     772              :    *
     773              :    * pol : grid  containing the polynomials in all three directions
     774              :    *
     775              :    * pol_folded : grid containing the polynomials after folding for periodic
     776              :    * boundaries conditions
     777              :    */
     778              : 
     779              :   /* seting up the cube parameters */
     780        91890 :   int cmax = compute_cube_properties(
     781        45945 :       use_ortho, radius, (const double(*)[3])handler->dh,
     782        45945 :       (const double(*)[3])handler->dh_inv, rp, &disr_radius, roffset,
     783              :       cubecenter, lb_cube, ub_cube, cube_size);
     784              : 
     785              :   /* initialize the multidimensional array containing the polynomials */
     786        45945 :   initialize_tensor_3(&handler->pol, 3, handler->coef.size[0], cmax);
     787        45945 :   realloc_tensor(&handler->pol);
     788        45945 :   memset(handler->pol.data, 0, sizeof(double) * handler->pol.alloc_size_);
     789              : 
     790              :   /* compute the polynomials */
     791              : 
     792              :   // WARNING : do not reverse the order in pol otherwise you will have to
     793              :   // reverse the order in collocate_dgemm as well.
     794              : 
     795        45945 :   if (use_ortho) {
     796        21114 :     grid_fill_pol_dgemm(false, handler->dh[0][0], roffset[2], pol_offset[2],
     797        21114 :                         lb_cube[2], ub_cube[2], handler->coef.size[2] - 1, cmax,
     798        21114 :                         zetp, &idx3(handler->pol, 2, 0, 0)); /* i indice */
     799        21114 :     grid_fill_pol_dgemm(false, handler->dh[1][1], roffset[1], pol_offset[1],
     800        21114 :                         lb_cube[1], ub_cube[1], handler->coef.size[1] - 1, cmax,
     801        21114 :                         zetp, &idx3(handler->pol, 1, 0, 0)); /* j indice */
     802        21114 :     grid_fill_pol_dgemm(false, handler->dh[2][2], roffset[0], pol_offset[0],
     803        21114 :                         lb_cube[0], ub_cube[0], handler->coef.size[0] - 1, cmax,
     804              :                         zetp, &idx3(handler->pol, 0, 0, 0)); /* k indice */
     805              :   } else {
     806        24831 :     grid_fill_pol_dgemm(false, 1.0, roffset[0], pol_offset[2], lb_cube[0],
     807        24831 :                         ub_cube[0], handler->coef.size[0] - 1, cmax,
     808        24831 :                         zetp * handler->dx[0],
     809              :                         &idx3(handler->pol, 0, 0, 0)); /* k indice */
     810        24831 :     grid_fill_pol_dgemm(false, 1.0, roffset[1], pol_offset[1], lb_cube[1],
     811        24831 :                         ub_cube[1], handler->coef.size[1] - 1, cmax,
     812        24831 :                         zetp * handler->dx[1],
     813        24831 :                         &idx3(handler->pol, 1, 0, 0)); /* j indice */
     814        24831 :     grid_fill_pol_dgemm(false, 1.0, roffset[2], pol_offset[0], lb_cube[2],
     815        24831 :                         ub_cube[2], handler->coef.size[2] - 1, cmax,
     816        24831 :                         zetp * handler->dx[2],
     817        24831 :                         &idx3(handler->pol, 2, 0, 0)); /* i indice */
     818              : 
     819        24831 :     calculate_non_orthorombic_corrections_tensor(
     820              :         zetp, roffset, (const double(*)[3])handler->dh, lb_cube, ub_cube,
     821        24831 :         handler->orthogonal, &handler->Exp);
     822              : 
     823              :     /* Use a slightly modified version of Ole code */
     824        24831 :     grid_transform_coef_xzy_to_ikj((const double(*)[3])handler->dh,
     825        24831 :                                    &handler->coef);
     826              :   }
     827              : 
     828              :   /* allocate memory for the polynomial and the cube */
     829              : 
     830        45945 :   initialize_tensor_3(&handler->cube, cube_size[0], cube_size[1], cube_size[2]);
     831              : 
     832        45945 :   realloc_tensor(&handler->cube);
     833        45945 :   initialize_W_and_T(handler, &handler->cube, &handler->coef);
     834              : 
     835        45945 :   tensor_reduction_for_collocate_integrate(
     836        45945 :       handler->scratch, // pointer to scratch memory
     837        45945 :       1.0, handler->orthogonal, &handler->Exp, &handler->coef, &handler->pol,
     838              :       &handler->cube);
     839              : 
     840        45945 :   if (handler->apply_cutoff) {
     841            0 :     if (use_ortho) {
     842            0 :       apply_sphere_cutoff_ortho(handler, disr_radius, cmax, lb_cube,
     843              :                                 cubecenter);
     844              :     } else {
     845            0 :       apply_spherical_cutoff_generic(handler, radius, cmax, lb_cube, ub_cube,
     846              :                                      roffset, cubecenter);
     847              :     }
     848            0 :     return;
     849              :   }
     850        45945 :   apply_mapping_cubic(handler, cmax, lb_cube, cubecenter);
     851              : }
     852              : 
     853              : //******************************************************************************
     854            0 : void grid_dgemm_collocate_pgf_product(
     855              :     const bool use_ortho, const int border_mask, const enum grid_func func,
     856              :     const int la_max, const int la_min, const int lb_max, const int lb_min,
     857              :     const double zeta, const double zetb, const double rscale,
     858              :     const double dh[3][3], const double dh_inv[3][3], const double ra[3],
     859              :     const double rab[3], const int grid_global_size[3],
     860              :     const int grid_local_size[3], const int shift_local[3],
     861              :     const int border_width[3], const double radius, const int o1, const int o2,
     862            0 :     const int n1, const int n2, const double pab_[n2][n1],
     863              :     double *const grid_) {
     864            0 :   collocation_integration *handler = collocate_create_handle();
     865              : 
     866            0 :   tensor pab;
     867            0 :   initialize_tensor_2(&pab, n2, n1);
     868            0 :   alloc_tensor(&pab);
     869            0 :   memcpy(pab.data, pab_, sizeof(double) * n1 * n2);
     870              :   // Uncomment this to dump all tasks to file.
     871              :   // #define __GRID_DUMP_TASKS
     872            0 :   int offset[2] = {o1, o2};
     873              : 
     874            0 :   int lmax[2] = {la_max, lb_max};
     875            0 :   int lmin[2] = {la_min, lb_min};
     876              : 
     877            0 :   const double zetp = zeta + zetb;
     878            0 :   const double f = zetb / zetp;
     879            0 :   const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
     880            0 :   const double prefactor = rscale * exp(-zeta * f * rab2);
     881            0 :   const double zeta_pair[2] = {zeta, zetb};
     882            0 :   initialize_basis_vectors(handler, dh, dh_inv);
     883            0 :   verify_orthogonality((const double(*)[3])dh, handler->orthogonal);
     884              : 
     885            0 :   initialize_tensor_3(&(handler->grid), grid_local_size[2], grid_local_size[1],
     886              :                       grid_local_size[0]);
     887              : 
     888            0 :   handler->grid.ld_ = grid_local_size[0];
     889            0 :   handler->grid.data = grid_;
     890              : 
     891            0 :   setup_global_grid_size(&handler->grid, (const int *)grid_global_size);
     892              : 
     893            0 :   initialize_tensor_3(&handler->grid, grid_local_size[2], grid_local_size[1],
     894              :                       grid_local_size[0]);
     895            0 :   handler->grid.ld_ = grid_local_size[0];
     896              : 
     897            0 :   setup_grid_window(&handler->grid, shift_local, border_width, border_mask);
     898              : 
     899            0 :   double rp[3], rb[3];
     900            0 :   for (int i = 0; i < 3; i++) {
     901            0 :     rp[i] = ra[i] + f * rab[i];
     902            0 :     rb[i] = ra[i] + rab[i];
     903              :   }
     904              : 
     905            0 :   int lmin_diff[2], lmax_diff[2];
     906            0 :   grid_prepare_get_ldiffs_dgemm(func, lmin_diff, lmax_diff);
     907              : 
     908            0 :   int lmin_prep[2];
     909            0 :   int lmax_prep[2];
     910              : 
     911            0 :   lmin_prep[0] = imax(lmin[0] + lmin_diff[0], 0);
     912            0 :   lmin_prep[1] = imax(lmin[1] + lmin_diff[1], 0);
     913              : 
     914            0 :   lmax_prep[0] = lmax[0] + lmax_diff[0];
     915            0 :   lmax_prep[1] = lmax[1] + lmax_diff[1];
     916              : 
     917            0 :   const int n1_prep = ncoset(lmax_prep[0]);
     918            0 :   const int n2_prep = ncoset(lmax_prep[1]);
     919              : 
     920              :   /* I really do not like this. This will disappear */
     921            0 :   tensor pab_prep;
     922            0 :   initialize_tensor_2(&pab_prep, n2_prep, n1_prep);
     923            0 :   alloc_tensor(&pab_prep);
     924            0 :   memset(pab_prep.data, 0, pab_prep.alloc_size_ * sizeof(double));
     925              : 
     926            0 :   grid_prepare_pab_dgemm(func, offset, lmax, lmin, &zeta_pair[0], &pab,
     927              :                          &pab_prep);
     928              : 
     929              :   //   *** initialise the coefficient matrix, we transform the sum
     930              :   //
     931              :   // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
     932              :   //         (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
     933              :   //         (z-a_z)**lza
     934              :   //
     935              :   // into
     936              :   //
     937              :   // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
     938              :   //
     939              :   // where p is center of the product gaussian, and lp = la_max + lb_max
     940              :   // (current implementation is l**7)
     941              :   //
     942              : 
     943              :   /* precautionary tail since I probably intitialize data to NULL when I
     944              :    * initialize a new tensor. I want to keep the memory (I put a ridiculous
     945              :    * amount already) */
     946              : 
     947            0 :   initialize_tensor_4(&(handler->alpha), 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
     948            0 :                       lmax_prep[0] + lmax_prep[1] + 1);
     949            0 :   realloc_tensor(&(handler->alpha));
     950              : 
     951            0 :   const int lp = lmax_prep[0] + lmax_prep[1];
     952              : 
     953            0 :   initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
     954            0 :   realloc_tensor(&(handler->coef));
     955              : 
     956              :   // initialy cp2k stores coef_xyz as coef[z][y][x]. this is fine but I
     957              :   // need them to be stored as
     958              : 
     959            0 :   grid_prepare_alpha_dgemm(ra, rb, rp, lmax_prep, &(handler->alpha));
     960              : 
     961              :   //
     962              :   //   compute P_{lxp,lyp,lzp} given P_{lxa,lya,lza,lxb,lyb,lzb} and
     963              :   //   alpha(ls,lxa,lxb,1) use a three step procedure we don't store zeros, so
     964              :   //   counting is done using lxyz,lxy in order to have contiguous memory access
     965              :   //   in collocate_fast.F
     966              :   //
     967              : 
     968              :   // coef[x][z][y]
     969            0 :   grid_compute_coefficients_dgemm(lmin_prep, lmax_prep, lp, prefactor,
     970              :                                   &(handler->alpha), &pab_prep,
     971              :                                   &(handler->coef));
     972              : 
     973            0 :   grid_collocate(handler, use_ortho, zetp, rp, radius);
     974              : 
     975            0 :   collocate_destroy_handle(handler);
     976            0 :   free(pab_prep.data);
     977            0 : }
     978              : 
     979         5100 : void extract_blocks(grid_context *const ctx, const _task *const task,
     980              :                     const offload_buffer *pab_blocks, tensor *const work,
     981              :                     tensor *const pab) {
     982         5100 :   const int iatom = task->iatom;
     983         5100 :   const int jatom = task->jatom;
     984         5100 :   const int iset = task->iset;
     985         5100 :   const int jset = task->jset;
     986         5100 :   const int ikind = ctx->atom_kinds[iatom];
     987         5100 :   const int jkind = ctx->atom_kinds[jatom];
     988         5100 :   const grid_basis_set *ibasis = ctx->basis_sets[ikind];
     989         5100 :   const grid_basis_set *jbasis = ctx->basis_sets[jkind];
     990              : 
     991         5100 :   const int block_num = task->block_num;
     992              : 
     993              :   // Locate current matrix block within the buffer. This block
     994              :   // contains the weights of the gaussian pairs in the spherical
     995              :   // harmonic basis, but we do computation in the cartesian
     996              :   // harmonic basis so we have to rotate the coefficients. It is nothing
     997              :   // else than a basis change and it done with two dgemm.
     998              : 
     999         5100 :   const int block_offset = ctx->block_offsets[block_num]; // zero based
    1000         5100 :   double *const block = &pab_blocks->host_buffer[block_offset];
    1001              : 
    1002         5100 :   rotate_to_cartesian_harmonics(ibasis, jbasis, iatom, jatom, iset, jset, block,
    1003              :                                 work, pab);
    1004         5100 : }
    1005              : 
    1006        45945 : void compute_coefficients(grid_context *const ctx,
    1007              :                           struct collocation_integration_ *handler,
    1008              :                           const _task *const previous_task,
    1009              :                           const _task *const task,
    1010              :                           const offload_buffer *pab_blocks, tensor *const pab,
    1011              :                           tensor *const work, tensor *const pab_prep) {
    1012              :   // Load subblock from buffer and decontract into Cartesian sublock pab.
    1013              :   // The previous pab can be reused when only ipgf or jpgf has changed.
    1014        45945 :   if (task->update_block_ || (previous_task == NULL)) {
    1015         4542 :     extract_blocks(ctx, task, pab_blocks, work, pab);
    1016              :   }
    1017              : 
    1018        45945 :   int lmin_prep[2];
    1019        45945 :   int lmax_prep[2];
    1020              : 
    1021        45945 :   lmin_prep[0] = imax(task->lmin[0] + handler->lmin_diff[0], 0);
    1022        45945 :   lmin_prep[1] = imax(task->lmin[1] + handler->lmin_diff[1], 0);
    1023              : 
    1024        45945 :   lmax_prep[0] = task->lmax[0] + handler->lmax_diff[0];
    1025        45945 :   lmax_prep[1] = task->lmax[1] + handler->lmax_diff[1];
    1026              : 
    1027        45945 :   const int n1_prep = ncoset(lmax_prep[0]);
    1028        45945 :   const int n2_prep = ncoset(lmax_prep[1]);
    1029              : 
    1030              :   /* we do not reallocate memory. We initialized the structure with the
    1031              :    * maximum lmax of the all list already.
    1032              :    */
    1033        45945 :   initialize_tensor_2(pab_prep, n2_prep, n1_prep);
    1034        45945 :   realloc_tensor(pab_prep);
    1035              : 
    1036        45945 :   grid_prepare_pab_dgemm(handler->func, task->offset, task->lmin, task->lmax,
    1037        45945 :                          &task->zeta[0], pab, pab_prep);
    1038              : 
    1039              :   //   *** initialise the coefficient matrix, we transform the sum
    1040              :   //
    1041              :   // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
    1042              :   //         (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb
    1043              :   //         (y-a_y)**lya (z-a_z)**lza
    1044              :   //
    1045              :   // into
    1046              :   //
    1047              :   // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp
    1048              :   // (z-p_z)**lzp
    1049              :   //
    1050              :   // where p is center of the product gaussian, and lp = la_max + lb_max
    1051              :   // (current implementation is l**7)
    1052              :   //
    1053              : 
    1054              :   /* precautionary tail since I probably intitialize data to NULL when I
    1055              :    * initialize a new tensor. I want to keep the memory (I put a ridiculous
    1056              :    * amount already) */
    1057              : 
    1058        45945 :   initialize_tensor_4(&handler->alpha, 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
    1059        45945 :                       lmax_prep[0] + lmax_prep[1] + 1);
    1060        45945 :   realloc_tensor(&handler->alpha);
    1061              : 
    1062        45945 :   const int lp = lmax_prep[0] + lmax_prep[1];
    1063              : 
    1064        45945 :   initialize_tensor_3(&handler->coef, lp + 1, lp + 1, lp + 1);
    1065        45945 :   realloc_tensor(&handler->coef);
    1066              : 
    1067              :   // these two functions can be done with dgemm again....
    1068              : 
    1069        45945 :   grid_prepare_alpha_dgemm(task->ra, task->rb, task->rp, lmax_prep,
    1070              :                            &handler->alpha);
    1071              : 
    1072              :   // compute the coefficients after applying the function of interest
    1073              :   // coef[x][z][y]
    1074        91890 :   grid_compute_coefficients_dgemm(
    1075              :       lmin_prep, lmax_prep, lp,
    1076        69201 :       task->prefactor * ((task->iatom == task->jatom) ? 1.0 : 2.0),
    1077              :       &handler->alpha, pab_prep, &handler->coef);
    1078        45945 : }
    1079              : 
    1080          612 : void collocate_one_grid_level_dgemm(grid_context *const ctx,
    1081              :                                     const int *const border_width,
    1082              :                                     const int *const shift_local,
    1083              :                                     const enum grid_func func, const int level,
    1084              :                                     const offload_buffer *pab_blocks) {
    1085          612 :   tensor *const grid = &ctx->grid[level];
    1086              :   // Using default(shared) because with GCC 9 the behavior around const changed:
    1087              :   // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
    1088          612 : #pragma omp parallel default(shared)
    1089              :   {
    1090              :     const int num_threads = omp_get_num_threads();
    1091              :     const int thread_id = omp_get_thread_num();
    1092              : 
    1093              :     tensor work, pab, pab_prep;
    1094              : 
    1095              :     struct collocation_integration_ *handler = ctx->handler[thread_id];
    1096              : 
    1097              :     handler->func = func;
    1098              :     grid_prepare_get_ldiffs_dgemm(handler->func, handler->lmin_diff,
    1099              :                                   handler->lmax_diff);
    1100              : 
    1101              :     handler->apply_cutoff = ctx->apply_cutoff;
    1102              : 
    1103              :     // Allocate pab matrix for re-use across tasks.
    1104              :     initialize_tensor_2(&pab, ctx->maxco, ctx->maxco);
    1105              :     alloc_tensor(&pab);
    1106              : 
    1107              :     initialize_tensor_2(&work, ctx->maxco, ctx->maxco);
    1108              :     alloc_tensor(&work);
    1109              : 
    1110              :     initialize_tensor_2(&pab_prep, ctx->maxco, ctx->maxco);
    1111              :     alloc_tensor(&pab_prep);
    1112              : 
    1113              :     initialize_basis_vectors(handler, (const double(*)[3])grid->dh,
    1114              :                              (const double(*)[3])grid->dh_inv);
    1115              : 
    1116              :     /* setup the grid parameters, window parameters (if the grid is split), etc
    1117              :      */
    1118              : 
    1119              :     tensor_copy(&handler->grid, grid);
    1120              : 
    1121              :     for (int d = 0; d < 3; d++)
    1122              :       handler->orthogonal[d] = handler->grid.orthogonal[d];
    1123              : 
    1124              :     if ((thread_id == 0) || (num_threads == 1)) {
    1125              :       // thread id 0 directly store the results in the final storage space
    1126              :       handler->grid.data = ctx->grid[level].data;
    1127              :     }
    1128              : 
    1129              :     if ((num_threads > 1) && (thread_id > 0)) {
    1130              :       handler->grid.data = ((double *)ctx->scratch) +
    1131              :                            (thread_id - 1) * handler->grid.alloc_size_;
    1132              :       memset(handler->grid.data, 0, sizeof(double) * grid->alloc_size_);
    1133              :     }
    1134              : 
    1135              :     /* it is only useful when we split the list over multiple threads. The first
    1136              :      * iteration should load the block whatever status the task->block_update_
    1137              :      * has */
    1138              :     const _task *prev_task = NULL;
    1139              : #pragma omp for schedule(static)
    1140              :     for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
    1141              :       // Define some convenient aliases.
    1142              :       const _task *task = &ctx->tasks[level][itask];
    1143              : 
    1144              :       if (task->level != level) {
    1145              :         printf("level %d, %d\n", task->level, level);
    1146              :         abort();
    1147              :       }
    1148              :       /* the grid is divided over several ranks or not periodic */
    1149              :       if ((handler->grid.size[0] != handler->grid.full_size[0]) ||
    1150              :           (handler->grid.size[1] != handler->grid.full_size[1]) ||
    1151              :           (handler->grid.size[2] != handler->grid.full_size[2])) {
    1152              :         /* unfortunately the window where the gaussian should be added depends
    1153              :          * on the bonds. So I have to adjust the window all the time. */
    1154              : 
    1155              :         setup_grid_window(&handler->grid, shift_local, border_width,
    1156              :                           task->border_mask);
    1157              :       }
    1158              : 
    1159              :       /* this is a three steps procedure. pab_blocks contains the coefficients
    1160              :        * of the operator in the spherical harmonic basis while we do computation
    1161              :        * in the cartesian harmonic basis.
    1162              :        *
    1163              :        * step 1 : rotate the coefficients from the harmonic to the cartesian
    1164              :        * basis
    1165              : 
    1166              :        * step 2 : extract the subblock and apply additional transformations
    1167              :        * corresponding the spatial derivatives of the operator (the last is not
    1168              :        * always done)
    1169              : 
    1170              :        * step 3 : change from (x - x_1)^\alpha (x - x_2)^\beta to (x -
    1171              :        * x_{12})^k. It is a transformation which involves binomial
    1172              :        * coefficients.
    1173              :        *
    1174              :        * \f[ (x - x_1) ^\alpha (x - x_2) ^ beta = \sum_{k_{1} k_{2}} ^
    1175              :        *     {\alpha\beta} \text{Binomial}(\alpha,k_1)
    1176              :        *     \text{Binomial}(\beta,k_2) (x - x_{12})^{k_1 + k_2} (x_12 - x_1)
    1177              :        *     ^{\alpha - k_1} (x_12 - x_2) ^{\beta - k_2} ]
    1178              :        *
    1179              :        * step 1 is done only when necessary, the two remaining steps are done
    1180              :        * for each bond.
    1181              :        */
    1182              : 
    1183              :       compute_coefficients(ctx, handler, prev_task, task, pab_blocks, &pab,
    1184              :                            &work, &pab_prep);
    1185              : 
    1186              :       grid_collocate(handler, ctx->orthorhombic, task->zetp, task->rp,
    1187              :                      task->radius);
    1188              :       prev_task = task;
    1189              :     }
    1190              : 
    1191              :     // Merge thread local grids into shared grid. Could be improved though....
    1192              : 
    1193              :     // thread 0 does nothing since the data are already placed in the final
    1194              :     // destination
    1195              :     if (num_threads > 1) {
    1196              :       if ((grid->alloc_size_ / (num_threads - 1)) >= 2) {
    1197              :         const int block_size = grid->alloc_size_ / (num_threads - 1) +
    1198              :                                (grid->alloc_size_ % (num_threads - 1));
    1199              : 
    1200              :         for (int bk = 0; bk < num_threads; bk++) {
    1201              :           if (thread_id > 0) {
    1202              :             int bk_id = (bk + thread_id - 1) % num_threads;
    1203              :             int begin = bk_id * block_size;
    1204              :             int end = imin((bk_id + 1) * block_size, grid->alloc_size_);
    1205              :             cblas_daxpy(end - begin, 1.0, handler->grid.data + begin, 1,
    1206              :                         grid->data + begin, 1);
    1207              :           }
    1208              : #pragma omp barrier
    1209              :         }
    1210              :       }
    1211              :     } else {
    1212              : #pragma omp critical
    1213              :       if (thread_id > 0)
    1214              :         cblas_daxpy(handler->grid.alloc_size_, 1.0,
    1215              :                     &idx3(handler->grid, 0, 0, 0), 1, &idx3(grid[0], 0, 0, 0),
    1216              :                     1);
    1217              :     }
    1218              :     handler->grid.data = NULL;
    1219              :     free(pab.data);
    1220              :     free(pab_prep.data);
    1221              :     free(work.data);
    1222              :   }
    1223          612 : }
    1224              : 
    1225              : /*******************************************************************************
    1226              :  * \brief Collocate all tasks of a given list onto given grids.
    1227              :  *        See grid_task_list.h for details.
    1228              :  ******************************************************************************/
    1229          153 : void grid_dgemm_collocate_task_list(grid_dgemm_task_list *const ptr,
    1230              :                                     const enum grid_func func,
    1231              :                                     const int nlevels,
    1232              :                                     const offload_buffer *pab_blocks,
    1233              :                                     offload_buffer *grids[nlevels]) {
    1234              : 
    1235          153 :   grid_context *const ctx = (grid_context *)ptr;
    1236              : 
    1237          153 :   assert(ctx->checksum == ctx_checksum);
    1238              : 
    1239          153 :   const int max_threads = omp_get_max_threads();
    1240              : 
    1241          153 :   assert(ctx->nlevels == nlevels);
    1242              : 
    1243              :   // #pragma omp parallel for
    1244          765 :   for (int level = 0; level < ctx->nlevels; level++) {
    1245          612 :     const _layout *layout = &ctx->layouts[level];
    1246          612 :     set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
    1247          612 :                         layout->npts_global, layout->npts_local,
    1248          612 :                         layout->shift_local, layout->border_width, layout->dh,
    1249          612 :                         layout->dh_inv, grids[level]);
    1250          612 :     memset(ctx->grid[level].data, 0,
    1251          612 :            sizeof(double) * ctx->grid[level].alloc_size_);
    1252              :   }
    1253              : 
    1254          153 :   if (ctx->scratch == NULL) {
    1255          153 :     int max_size = ctx->grid[0].alloc_size_;
    1256              : 
    1257              :     /* compute the size of the largest grid. It is used afterwards to allocate
    1258              :      * scratch memory for the grid on each omp thread */
    1259          612 :     for (int x = 1; x < nlevels; x++) {
    1260          459 :       max_size = imax(ctx->grid[x].alloc_size_, max_size);
    1261              :     }
    1262              : 
    1263          153 :     max_size = ((max_size / 4096) + (max_size % 4096 != 0)) * 4096;
    1264              : 
    1265              :     /* scratch is a void pointer !!!!! */
    1266          153 :     ctx->scratch =
    1267          153 :         grid_allocate_scratch(max_size * max_threads * sizeof(double));
    1268              :   }
    1269              : 
    1270          765 :   for (int level = 0; level < ctx->nlevels; level++) {
    1271          612 :     const _layout *layout = &ctx->layouts[level];
    1272          612 :     collocate_one_grid_level_dgemm(ctx, layout->border_width,
    1273          612 :                                    layout->shift_local, func, level,
    1274              :                                    pab_blocks);
    1275              :   }
    1276              : 
    1277          153 :   grid_free_scratch(ctx->scratch);
    1278          153 :   ctx->scratch = NULL;
    1279          153 : }
        

Generated by: LCOV version 2.0-1