LCOV - code coverage report
Current view: top level - src/grid/ref - grid_ref_collint.h (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:936074a) Lines: 96.7 % 449 434
Test Date: 2025-12-04 06:27:48 Functions: 100.0 % 17 17

            Line data    Source code
       1              : /*----------------------------------------------------------------------------*/
       2              : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3              : /*  Copyright 2000-2025 CP2K developers group <https://cp2k.org>              */
       4              : /*                                                                            */
       5              : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6              : /*----------------------------------------------------------------------------*/
       7              : 
       8              : #include <assert.h>
       9              : #include <limits.h>
      10              : #include <math.h>
      11              : #include <stdio.h>
      12              : #include <stdlib.h>
      13              : #include <string.h>
      14              : 
      15              : #include "../common/grid_common.h"
      16              : #include "../common/grid_library.h"
      17              : 
      18              : #if (GRID_DO_COLLOCATE)
      19              : #define GRID_CONST_WHEN_COLLOCATE const
      20              : #define GRID_CONST_WHEN_INTEGRATE
      21              : #else
      22              : #define GRID_CONST_WHEN_COLLOCATE
      23              : #define GRID_CONST_WHEN_INTEGRATE const
      24              : #endif
      25              : 
      26              : /*******************************************************************************
      27              :  * \brief Collocates coefficients C_x onto the grid for orthorhombic case.
      28              :  * \author Ole Schuett
      29              :  ******************************************************************************/
      30              : static inline void
      31       221940 : ortho_cx_to_grid(const int lp, const int k1, const int k2, const int j1,
      32              :                  const int j2, const int cmax,
      33       221940 :                  const double pol[3][lp + 1][2 * cmax + 1],
      34              :                  const int map[3][2 * cmax + 1], const double dh[3][3],
      35              :                  const double dh_inv[3][3], const double kremain,
      36              :                  const int npts_local[3], GRID_CONST_WHEN_COLLOCATE double *cx,
      37              :                  GRID_CONST_WHEN_INTEGRATE double *grid) {
      38              : 
      39       221940 :   const int kg1 = map[2][k1 + cmax];
      40       221940 :   const int kg2 = map[2][k2 + cmax];
      41       221940 :   const int jg1 = map[1][j1 + cmax];
      42       221940 :   const int jg2 = map[1][j2 + cmax];
      43              : 
      44              :   // For this innermost dimension we're going sequentially instead of in pairs.
      45              :   // Hence we're processing four points at once, which is well suited for AVX2.
      46       221940 :   const int jd = (2 * j1 - 1) / 2; // distance from center in grid points
      47       221940 :   const double jr = jd * dh[1][1]; // distance from center in a.u.
      48       221940 :   const double jremain = kremain - jr * jr;
      49       221940 :   const int istart = (int)ceil(-1e-8 - sqrt(fmax(0.0, jremain)) * dh_inv[0][0]);
      50       221940 :   const int iend = 1 - istart;
      51              : 
      52      3833244 :   for (int i = istart; i <= iend; i++) {
      53      3611304 :     const int ig = map[0][i + cmax];
      54              : 
      55      3611304 :     const int stride = npts_local[1] * npts_local[0];
      56      3611304 :     const int grid_index_0 = kg1 * stride + jg1 * npts_local[0] + ig;
      57      3611304 :     const int grid_index_1 = kg2 * stride + jg1 * npts_local[0] + ig;
      58      3611304 :     const int grid_index_2 = kg1 * stride + jg2 * npts_local[0] + ig;
      59      3611304 :     const int grid_index_3 = kg2 * stride + jg2 * npts_local[0] + ig;
      60              : 
      61      3611304 :     GRID_CONST_WHEN_INTEGRATE double *grid_0 = &grid[grid_index_0];
      62      3611304 :     GRID_CONST_WHEN_INTEGRATE double *grid_1 = &grid[grid_index_1];
      63      3611304 :     GRID_CONST_WHEN_INTEGRATE double *grid_2 = &grid[grid_index_2];
      64      3611304 :     GRID_CONST_WHEN_INTEGRATE double *grid_3 = &grid[grid_index_3];
      65              : 
      66              : #if (GRID_DO_COLLOCATE)
      67              :     // collocate
      68      1805652 :     double reg[4] = {0.0, 0.0, 0.0, 0.0};
      69      7897392 :     for (int lxp = 0; lxp <= lp; lxp++) {
      70      6091740 :       const double p = pol[0][lxp][i + cmax];
      71      6091740 :       reg[0] += cx[lxp * 4 + 0] * p;
      72      6091740 :       reg[1] += cx[lxp * 4 + 1] * p;
      73      6091740 :       reg[2] += cx[lxp * 4 + 2] * p;
      74      6091740 :       reg[3] += cx[lxp * 4 + 3] * p;
      75              :     }
      76      1805652 :     *grid_0 += reg[0];
      77      1805652 :     *grid_1 += reg[1];
      78      1805652 :     *grid_2 += reg[2];
      79      1805652 :     *grid_3 += reg[3];
      80              : 
      81              : #else
      82              :     // integrate
      83      1805652 :     const double reg[4] = {*grid_0, *grid_1, *grid_2, *grid_3};
      84      8098020 :     for (int lxp = 0; lxp <= lp; lxp++) {
      85      6292368 :       const double p = pol[0][lxp][i + cmax];
      86      6292368 :       cx[lxp * 4 + 0] += reg[0] * p;
      87      6292368 :       cx[lxp * 4 + 1] += reg[1] * p;
      88      6292368 :       cx[lxp * 4 + 2] += reg[2] * p;
      89      6292368 :       cx[lxp * 4 + 3] += reg[3] * p;
      90              :     }
      91              : #endif
      92              :   }
      93       221940 : }
      94              : 
      95              : /*******************************************************************************
      96              :  * \brief Transforms coefficients C_xy into C_x by fixing grid index j.
      97              :  * \author Ole Schuett
      98              :  ******************************************************************************/
      99       221940 : static inline void ortho_cxy_to_cx(const int lp, const int j1, const int j2,
     100              :                                    const int cmax,
     101       221940 :                                    const double pol[3][lp + 1][2 * cmax + 1],
     102              :                                    GRID_CONST_WHEN_COLLOCATE double *cxy,
     103              :                                    GRID_CONST_WHEN_INTEGRATE double *cx) {
     104              : 
     105       973818 :   for (int lyp = 0; lyp <= lp; lyp++) {
     106      2441602 :     for (int lxp = 0; lxp <= lp - lyp; lxp++) {
     107      1689724 :       const double p1 = pol[1][lyp][j1 + cmax];
     108      1689724 :       const double p2 = pol[1][lyp][j2 + cmax];
     109      1689724 :       const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2; // [lyp, lxp, 0]
     110              : 
     111              : #if (GRID_DO_COLLOCATE)
     112              :       // collocate
     113       818154 :       cx[lxp * 4 + 0] += cxy[cxy_index + 0] * p1;
     114       818154 :       cx[lxp * 4 + 1] += cxy[cxy_index + 1] * p1;
     115       818154 :       cx[lxp * 4 + 2] += cxy[cxy_index + 0] * p2;
     116       818154 :       cx[lxp * 4 + 3] += cxy[cxy_index + 1] * p2;
     117              : #else
     118              :       // integrate
     119       871570 :       cxy[cxy_index + 0] += cx[lxp * 4 + 0] * p1;
     120       871570 :       cxy[cxy_index + 1] += cx[lxp * 4 + 1] * p1;
     121       871570 :       cxy[cxy_index + 0] += cx[lxp * 4 + 2] * p2;
     122       871570 :       cxy[cxy_index + 1] += cx[lxp * 4 + 3] * p2;
     123              : #endif
     124              :     }
     125              :   }
     126       221940 : }
     127              : 
     128              : /*******************************************************************************
     129              :  * \brief Collocates coefficients C_xy onto the grid for orthorhombic case.
     130              :  * \author Ole Schuett
     131              :  ******************************************************************************/
     132        28620 : static inline void ortho_cxy_to_grid(
     133              :     const int lp, const int k1, const int k2, const int cmax,
     134        28620 :     const double pol[3][lp + 1][2 * cmax + 1], const int map[3][2 * cmax + 1],
     135              :     const double dh[3][3], const double dh_inv[3][3], const double disr_radius,
     136              :     const int npts_local[3], GRID_CONST_WHEN_COLLOCATE double *cxy,
     137        28620 :     GRID_CONST_WHEN_INTEGRATE double *grid) {
     138              : 
     139              :   // The cube contains an even number of grid points in each direction and
     140              :   // collocation is always performed on a pair of two opposing grid points.
     141              :   // Hence, the points with index 0 and 1 are both assigned distance zero via
     142              :   // the formular distance=(2*index-1)/2.
     143              : 
     144        28620 :   const int kd = (2 * k1 - 1) / 2; // distance from center in grid points
     145        28620 :   const double kr = kd * dh[2][2]; // distance from center in a.u.
     146        28620 :   const double kremain = disr_radius * disr_radius - kr * kr;
     147        28620 :   const int jstart = (int)ceil(-1e-8 - sqrt(fmax(0.0, kremain)) * dh_inv[1][1]);
     148              : 
     149        28620 :   const size_t cx_size = (lp + 1) * 4;
     150        28620 :   double cx[cx_size];
     151       250560 :   for (int j1 = jstart; j1 <= 0; j1++) {
     152       221940 :     const int j2 = 1 - j1;
     153       221940 :     memset(cx, 0, cx_size * sizeof(double));
     154              : 
     155              : #if (GRID_DO_COLLOCATE)
     156              :     // collocate
     157       110970 :     ortho_cxy_to_cx(lp, j1, j2, cmax, pol, cxy, cx);
     158       110970 :     ortho_cx_to_grid(lp, k1, k2, j1, j2, cmax, pol, map, dh, dh_inv, kremain,
     159              :                      npts_local, cx, grid);
     160              : #else
     161              :     // integrate
     162       110970 :     ortho_cx_to_grid(lp, k1, k2, j1, j2, cmax, pol, map, dh, dh_inv, kremain,
     163              :                      npts_local, cx, grid);
     164       110970 :     ortho_cxy_to_cx(lp, j1, j2, cmax, pol, cxy, cx);
     165              : #endif
     166              :   }
     167        28620 : }
     168              : 
     169              : /*******************************************************************************
     170              :  * \brief Transforms coefficients C_xyz into C_xz by fixing grid index k.
     171              :  * \author Ole Schuett
     172              :  ******************************************************************************/
     173        28620 : static inline void ortho_cxyz_to_cxy(const int lp, const int k1, const int k2,
     174              :                                      const int cmax,
     175        28620 :                                      const double pol[3][lp + 1][2 * cmax + 1],
     176              :                                      GRID_CONST_WHEN_COLLOCATE double *cxyz,
     177              :                                      GRID_CONST_WHEN_INTEGRATE double *cxy) {
     178              : 
     179       124350 :   for (int lzp = 0; lzp <= lp; lzp++) {
     180       308074 :     for (int lyp = 0; lyp <= lp - lzp; lyp++) {
     181       603002 :       for (int lxp = 0; lxp <= lp - lzp - lyp; lxp++) {
     182       390658 :         const double p1 = pol[2][lzp][k1 + cmax];
     183       390658 :         const double p2 = pol[2][lzp][k2 + cmax];
     184       390658 :         const int cxyz_index =
     185       390658 :             lzp * (lp + 1) * (lp + 1) + lyp * (lp + 1) + lxp; // [lzp, lyp, lxp]
     186       390658 :         const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2;   // [lyp, lxp, 0]
     187              : 
     188              : #if (GRID_DO_COLLOCATE)
     189              :         // collocate
     190       186210 :         cxy[cxy_index + 0] += cxyz[cxyz_index] * p1;
     191       186210 :         cxy[cxy_index + 1] += cxyz[cxyz_index] * p2;
     192              : #else
     193              :         // integrate
     194       204448 :         cxyz[cxyz_index] += cxy[cxy_index + 0] * p1;
     195       204448 :         cxyz[cxyz_index] += cxy[cxy_index + 1] * p2;
     196              : #endif
     197              :       }
     198              :     }
     199              :   }
     200        28620 : }
     201              : 
     202              : /*******************************************************************************
     203              :  * \brief Collocates coefficients C_xyz onto the grid for orthorhombic case.
     204              :  * \author Ole Schuett
     205              :  ******************************************************************************/
     206              : static inline void
     207         3276 : ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3],
     208              :                    const double dh_inv[3][3], const double rp[3],
     209              :                    const int npts_global[3], const int npts_local[3],
     210              :                    const int shift_local[3], const double radius,
     211              :                    GRID_CONST_WHEN_COLLOCATE double *cxyz,
     212         3276 :                    GRID_CONST_WHEN_INTEGRATE double *grid) {
     213              : 
     214              :   // *** position of the gaussian product
     215              :   //
     216              :   // this is the actual definition of the position on the grid
     217              :   // i.e. a point rp(:) gets here grid coordinates
     218              :   // MODULO(rp(:)/dr(:),npts_global(:))+1
     219              :   // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid in Fortran
     220              :   // and (1,1,1) on grid here in C.
     221              : 
     222              :   // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
     223         3276 :   int cubecenter[3];
     224        13104 :   for (int i = 0; i < 3; i++) {
     225              :     double dh_inv_rp = 0.0;
     226        39312 :     for (int j = 0; j < 3; j++) {
     227        29484 :       dh_inv_rp += dh_inv[j][i] * rp[j];
     228              :     }
     229         9828 :     cubecenter[i] = (int)floor(dh_inv_rp);
     230              :   }
     231              : 
     232              :   double roffset[3];
     233        13104 :   for (int i = 0; i < 3; i++) {
     234         9828 :     roffset[i] = rp[i] - ((double)cubecenter[i]) * dh[i][i];
     235              :   }
     236              : 
     237              :   // Historically, the radius gets discretized to make sphere bounds cacheable.
     238         3276 :   const double drmin = fmin(dh[0][0], fmin(dh[1][1], dh[2][2]));
     239         3276 :   const double disr_radius = drmin * fmax(1.0, ceil(radius / drmin));
     240              : 
     241              :   // Cube bounds.
     242         3276 :   int lb_cube[3], ub_cube[3];
     243        13104 :   for (int i = 0; i < 3; i++) {
     244         9828 :     lb_cube[i] = (int)ceil(-1e-8 - disr_radius * dh_inv[i][i]);
     245         9828 :     ub_cube[i] = 1 - lb_cube[i];
     246              :     // If grid is not period check that cube fits without wrapping.
     247         9828 :     if (npts_global[i] != npts_local[i]) {
     248            0 :       const int offset =
     249            0 :           modulo(cubecenter[i] + lb_cube[i] - shift_local[i], npts_global[i]) -
     250              :           lb_cube[i];
     251            0 :       assert(offset + ub_cube[i] < npts_local[i]);
     252            0 :       assert(offset + lb_cube[i] >= 0);
     253              :     }
     254              :   }
     255              : 
     256              :   // cmax = MAXVAL(ub_cube)
     257         3276 :   const int cmax = imax(imax(ub_cube[0], ub_cube[1]), ub_cube[2]);
     258              : 
     259              :   // Precompute (x-xp)**lp*exp(..) for each direction.
     260         3276 :   double pol_mutable[3][lp + 1][2 * cmax + 1];
     261        13104 :   for (int idir = 0; idir < 3; idir++) {
     262         9828 :     const double dr = dh[idir][idir];
     263         9828 :     const double ro = roffset[idir];
     264              :     //  Reuse the result from the previous gridpoint to avoid to many exps:
     265              :     //  exp( -a*(x+d)**2) = exp(-a*x**2)*exp(-2*a*x*d)*exp(-a*d**2)
     266              :     //  exp(-2*a*(x+d)*d) = exp(-2*a*x*d)*exp(-2*a*d**2)
     267         9828 :     const double t_exp_1 = exp(-zetp * pow(dr, 2));
     268         9828 :     const double t_exp_2 = pow(t_exp_1, 2);
     269         9828 :     double t_exp_min_1 = exp(-zetp * pow(+dr - ro, 2));
     270         9828 :     double t_exp_min_2 = exp(-2 * zetp * (+dr - ro) * (-dr));
     271        95688 :     for (int ig = 0; ig >= lb_cube[idir]; ig--) {
     272        85860 :       const double rpg = ig * dr - ro;
     273        85860 :       t_exp_min_1 *= t_exp_min_2 * t_exp_1;
     274        85860 :       t_exp_min_2 *= t_exp_2;
     275        85860 :       double pg = t_exp_min_1;
     276       373050 :       for (int icoef = 0; icoef <= lp; icoef++) {
     277       287190 :         pol_mutable[idir][icoef][ig + cmax] = pg; // exp(-zetp*rpg**2)
     278       287190 :         pg *= rpg;
     279              :       }
     280              :     }
     281         9828 :     double t_exp_plus_1 = exp(-zetp * pow(-ro, 2));
     282         9828 :     double t_exp_plus_2 = exp(-2 * zetp * (-ro) * (+dr));
     283        95688 :     for (int ig = 0; ig >= lb_cube[idir]; ig--) {
     284        85860 :       const double rpg = (1 - ig) * dr - ro;
     285        85860 :       t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
     286        85860 :       t_exp_plus_2 *= t_exp_2;
     287        85860 :       double pg = t_exp_plus_1;
     288       373050 :       for (int icoef = 0; icoef <= lp; icoef++) {
     289       287190 :         pol_mutable[idir][icoef][1 - ig + cmax] = pg; // exp(-zetp*rpg**2)
     290       287190 :         pg *= rpg;
     291              :       }
     292              :     }
     293              :   }
     294         3276 :   const double(*pol)[lp + 1][2 * cmax + 1] =
     295         3276 :       (const double(*)[lp + 1][2 * cmax + 1]) pol_mutable;
     296              : 
     297              :   // Precompute mapping from cube to grid indices for each direction
     298         3276 :   int map_mutable[3][2 * cmax + 1];
     299        13104 :   for (int i = 0; i < 3; i++) {
     300       191376 :     for (int k = -cmax; k <= +cmax; k++) {
     301       181548 :       map_mutable[i][k + cmax] =
     302       181548 :           modulo(cubecenter[i] + k - shift_local[i], npts_global[i]);
     303              :     }
     304              :   }
     305         3276 :   const int(*map)[2 * cmax + 1] = (const int(*)[2 * cmax + 1]) map_mutable;
     306              : 
     307              :   // Loop over k dimension of the cube.
     308         3276 :   const int kstart = (int)ceil(-1e-8 - disr_radius * dh_inv[2][2]);
     309         3276 :   const size_t cxy_size = (lp + 1) * (lp + 1) * 2;
     310         3276 :   double cxy[cxy_size];
     311        31896 :   for (int k1 = kstart; k1 <= 0; k1++) {
     312        28620 :     const int k2 = 1 - k1;
     313        28620 :     memset(cxy, 0, cxy_size * sizeof(double));
     314              : 
     315              : #if (GRID_DO_COLLOCATE)
     316              :     // collocate
     317        14310 :     ortho_cxyz_to_cxy(lp, k1, k2, cmax, pol, cxyz, cxy);
     318        14310 :     ortho_cxy_to_grid(lp, k1, k2, cmax, pol, map, dh, dh_inv, disr_radius,
     319              :                       npts_local, cxy, grid);
     320              : #else
     321              :     // integrate
     322        14310 :     ortho_cxy_to_grid(lp, k1, k2, cmax, pol, map, dh, dh_inv, disr_radius,
     323              :                       npts_local, cxy, grid);
     324        14310 :     ortho_cxyz_to_cxy(lp, k1, k2, cmax, pol, cxyz, cxy);
     325              : #endif
     326              :   }
     327         3276 : }
     328              : 
     329              : /*******************************************************************************
     330              :  * \brief Collocates coefficients C_i onto the grid for general case.
     331              :  * \author Ole Schuett
     332              :  ******************************************************************************/
     333              : static inline void
     334      2971300 : general_ci_to_grid(const int lp, const int jg, const int kg, const int ismin,
     335              :                    const int ismax, const int npts_local[3],
     336              :                    const int index_min[3], const int index_max[3],
     337              :                    const int map_i[], const double gp[3], const int k,
     338              :                    const int j, const double exp_ij[], const double exp_jk[],
     339              :                    const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *ci,
     340              :                    GRID_CONST_WHEN_INTEGRATE double *grid) {
     341              : 
     342      2971300 :   const int base = kg * npts_local[1] * npts_local[0] + jg * npts_local[0];
     343              : 
     344     33394640 :   for (int i = ismin; i <= ismax; i++) {
     345     30423340 :     const int ig = map_i[i - index_min[0]];
     346     30423340 :     if (ig < 0) {
     347            0 :       continue; // skip over out-of-bounds indicies
     348              :     }
     349     30423340 :     const double di = i - gp[0];
     350              : 
     351     30423340 :     const int stride_i = index_max[0] - index_min[0] + 1;
     352     30423340 :     const int stride_j = index_max[1] - index_min[1] + 1;
     353     30423340 :     const int stride_k = index_max[2] - index_min[2] + 1;
     354     30423340 :     const int idx_ij = (j - index_min[1]) * stride_i + i - index_min[0];
     355     30423340 :     const int idx_jk = (k - index_min[2]) * stride_j + j - index_min[1];
     356     30423340 :     const int idx_ki = (i - index_min[0]) * stride_k + k - index_min[2];
     357              : 
     358              :     // Mathieu's trick: Calculate 3D Gaussian from three precomputed 2D tables
     359              :     //
     360              :     // r   =  (i-gp[0])*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
     361              :     //     =  a                 + b                 + c
     362              :     //
     363              :     // r**2  =  (a + b + c)**2  =  a**2 + b**2 + c**2 + 2ab + 2bc + 2ca
     364              :     //
     365              :     // exp(-r**2)  =  exp(-a(a+2b)) * exp(-b*(b+2c)) * exp(-c*(c+2a))
     366              :     //
     367     30423340 :     const double gaussian = exp_ij[idx_ij] * exp_jk[idx_jk] * exp_ki[idx_ki];
     368              : 
     369     30423340 :     const int grid_index = base + ig; // [kg, jg, ig]
     370     30423340 :     double dip = gaussian;
     371              : 
     372              : #if (GRID_DO_COLLOCATE)
     373              :     // collocate
     374     18254004 :     double reg = 0.0;
     375     73016016 :     for (int il = 0; il <= lp; il++) {
     376     54762012 :       reg += ci[il] * dip;
     377     54762012 :       dip *= di;
     378              :     }
     379     18254004 :     grid[grid_index] += reg;
     380              : #else
     381              :     // integrate
     382     12169336 :     const double reg = grid[grid_index];
     383     51719678 :     for (int il = 0; il <= lp; il++) {
     384     39550342 :       ci[il] += reg * dip;
     385     39550342 :       dip *= di;
     386              :     }
     387              : #endif
     388              :   }
     389      2971300 : }
     390              : 
     391              : /*******************************************************************************
     392              :  * \brief Transforms coefficients C_ij into C_i by fixing grid index j.
     393              :  * \author Ole Schuett
     394              :  ******************************************************************************/
     395      2971300 : static inline void general_cij_to_ci(const int lp, const double dj,
     396              :                                      GRID_CONST_WHEN_COLLOCATE double *cij,
     397              :                                      GRID_CONST_WHEN_INTEGRATE double *ci) {
     398      2971300 :   double djp = 1.0;
     399     12182330 :   for (int jl = 0; jl <= lp; jl++) {
     400     28227350 :     for (int il = 0; il <= lp - jl; il++) {
     401     19016320 :       const int cij_index = jl * (lp + 1) + il; // [jl, il]
     402              : #if (GRID_DO_COLLOCATE)
     403     10696680 :       ci[il] += cij[cij_index] * djp; // collocate
     404              : #else
     405      8319640 :       cij[cij_index] += ci[il] * djp; // integrate
     406              : #endif
     407              :     }
     408      9211030 :     djp *= dj;
     409              :   }
     410      2971300 : }
     411              : 
     412              : /*******************************************************************************
     413              :  * \brief Collocates coefficients C_ij onto the grid for general case.
     414              :  * \author Ole Schuett
     415              :  ******************************************************************************/
     416       251300 : static inline void general_cij_to_grid(
     417              :     const int lp, const int k, const int kg, const int npts_local[3],
     418              :     const int index_min[3], const int index_max[3], const int map_i[],
     419              :     const int map_j[], const double dh[3][3], const double gp[3],
     420              :     const double radius, const double exp_ij[], const double exp_jk[],
     421              :     const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *cij,
     422              :     GRID_CONST_WHEN_INTEGRATE double *grid) {
     423              : 
     424      6562800 :   for (int j = index_min[1]; j <= index_max[1]; j++) {
     425      6311500 :     const int jg = map_j[j - index_min[1]];
     426      6311500 :     if (jg < 0) {
     427            0 :       continue; // skip over out-of-bounds indicies
     428              :     }
     429      6311500 :     const double dj = j - gp[1];
     430              : 
     431              :     //--------------------------------------------------------------------
     432              :     // Find bounds for the inner loop based on a quadratic equation in i.
     433              :     //
     434              :     // The real-space vector from the center of the gaussian to the
     435              :     // grid point i,j,k is given by:
     436              :     //   r = (i-gp[0])*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
     437              :     //
     438              :     // Separating the term that depends on i:
     439              :     //   r = i*dh[0,:] - gp[0]*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
     440              :     //     = i*dh[0,:] + v
     441              :     //
     442              :     // The squared distance works out to:
     443              :     //   r**2 = dh[0,:]**2 * i**2  +  2 * v * dh[0,:] * i  +  v**2
     444              :     //        = a * i**2           +  b * i                +  c
     445              :     //
     446              :     // Solving r**2==radius**2 for i yields:
     447              :     //    d =  b**2  -  4 * a * (c - radius**2)
     448              :     //    i = (-b \pm sqrt(d)) / (2*a)
     449              :     //
     450      6311500 :     double a = 0.0, b = 0.0, c = 0.0;
     451     25246000 :     for (int i = 0; i < 3; i++) {
     452     18934500 :       const double v = (0 - gp[0]) * dh[0][i] + (j - gp[1]) * dh[1][i] +
     453     18934500 :                        (k - gp[2]) * dh[2][i];
     454     18934500 :       a += dh[0][i] * dh[0][i];
     455     18934500 :       b += 2.0 * v * dh[0][i];
     456     18934500 :       c += v * v;
     457              :     }
     458      6311500 :     const double d = b * b - 4.0 * a * (c - radius * radius);
     459      6311500 :     if (0.0 < d) {
     460      2971300 :       const double sqrt_d = sqrt(d);
     461      2971300 :       const double inv_2a = 1.0 / (2.0 * a);
     462      2971300 :       const int ismin = (int)ceil((-b - sqrt_d) * inv_2a);
     463      2971300 :       const int ismax = (int)floor((-b + sqrt_d) * inv_2a);
     464              : 
     465      2971300 :       double ci[lp + 1];
     466      2971300 :       memset(ci, 0, sizeof(ci));
     467              : 
     468              : #if (GRID_DO_COLLOCATE)
     469              :       // collocate
     470      1782780 :       general_cij_to_ci(lp, dj, cij, ci);
     471      1782780 :       general_ci_to_grid(lp, jg, kg, ismin, ismax, npts_local, index_min,
     472              :                          index_max, map_i, gp, k, j, exp_ij, exp_jk, exp_ki, ci,
     473              :                          grid);
     474              : #else
     475              :       // integrate
     476      1188520 :       general_ci_to_grid(lp, jg, kg, ismin, ismax, npts_local, index_min,
     477              :                          index_max, map_i, gp, k, j, exp_ij, exp_jk, exp_ki, ci,
     478              :                          grid);
     479      1188520 :       general_cij_to_ci(lp, dj, cij, ci);
     480              : #endif
     481              :     }
     482              :   }
     483       251300 : }
     484              : 
     485              : /*******************************************************************************
     486              :  * \brief Transforms coefficients C_ijk into C_ij by fixing grid index k.
     487              :  * \author Ole Schuett
     488              :  ******************************************************************************/
     489       251300 : static inline void general_cijk_to_cij(const int lp, const double dk,
     490              :                                        GRID_CONST_WHEN_COLLOCATE double *cijk,
     491              :                                        GRID_CONST_WHEN_INTEGRATE double *cij) {
     492       251300 :   double dkp = 1.0;
     493      1030330 :   for (int kl = 0; kl <= lp; kl++) {
     494      2387350 :     for (int jl = 0; jl <= lp - kl; jl++) {
     495      4372620 :       for (int il = 0; il <= lp - kl - jl; il++) {
     496      2764300 :         const int cij_index = jl * (lp + 1) + il; // [jl, il]
     497      2764300 :         const int cijk_index =
     498      2764300 :             kl * (lp + 1) * (lp + 1) + jl * (lp + 1) + il; // [kl, jl, il]
     499              : #if (GRID_DO_COLLOCATE)
     500      1507800 :         cij[cij_index] += cijk[cijk_index] * dkp; // collocate
     501              : #else
     502      1256500 :         cijk[cijk_index] += cij[cij_index] * dkp; // integrate
     503              : #endif
     504              :       }
     505              :     }
     506       779030 :     dkp *= dk;
     507              :   }
     508       251300 : }
     509              : 
     510              : /*******************************************************************************
     511              :  * \brief Precompute mapping of grid indices for general case.
     512              :  * \author Ole Schuett
     513              :  ******************************************************************************/
     514        46680 : static inline void general_precompute_mapping(const int index_min,
     515              :                                               const int index_max,
     516              :                                               const int shift_local,
     517              :                                               const int npts_global,
     518              :                                               const int bounds[2], int map[]) {
     519              : 
     520              :   // Precompute mapping from continous grid indices to pbc wraped.
     521      1052140 :   for (int k = index_min; k <= index_max; k++) {
     522      1005460 :     const int kg = modulo(k - shift_local, npts_global);
     523      1005460 :     if (bounds[0] <= kg && kg <= bounds[1]) {
     524      1005460 :       map[k - index_min] = kg;
     525              :     } else {
     526            0 :       map[k - index_min] = INT_MIN; // out of bounds - not mapped
     527              :     }
     528              :   }
     529        46680 : }
     530              : 
     531              : /*******************************************************************************
     532              :  * \brief Fill one of the 2D tables that speedup 3D Gaussian (Mathieu's trick).
     533              :  * \author Ole Schuett
     534              :  ******************************************************************************/
     535              : static inline void
     536        46680 : general_fill_exp_table(const int idir, const int jdir, const int index_min[3],
     537              :                        const int index_max[3], const double zetp,
     538              :                        const double dh[3][3], const double gp[3],
     539              :                        double exp_table[]) {
     540              : 
     541        46680 :   const int stride_i = index_max[idir] - index_min[idir] + 1;
     542        46680 :   const double h_ii = dh[idir][0] * dh[idir][0] + dh[idir][1] * dh[idir][1] +
     543        46680 :                       dh[idir][2] * dh[idir][2];
     544        46680 :   const double h_ij = dh[idir][0] * dh[jdir][0] + dh[idir][1] * dh[jdir][1] +
     545        46680 :                       dh[idir][2] * dh[jdir][2];
     546              : 
     547      1052140 :   for (int i = index_min[idir]; i <= index_max[idir]; i++) {
     548      1005460 :     const double di = i - gp[idir];
     549      1005460 :     const double rii = di * di * h_ii;
     550      1005460 :     const double rij_unit = di * h_ij;
     551      1005460 :     const double exp_ij_unit = exp(-zetp * 2.0 * rij_unit);
     552              : 
     553              :     // compute exponentials symmetrically around cube center
     554      1005460 :     const int j_center = (int)gp[jdir];
     555      1005460 :     const double dj_center = j_center - gp[jdir];
     556      1005460 :     const double rij_center = dj_center * rij_unit;
     557      1005460 :     const double exp_ij_center = exp(-zetp * (rii + 2.0 * rij_center));
     558              : 
     559              :     // above center
     560      1005460 :     double exp_ij = exp_ij_center;
     561     12528080 :     for (int j = j_center; j <= index_max[jdir]; j++) {
     562     11522620 :       const int idx = (j - index_min[jdir]) * stride_i + i - index_min[idir];
     563     11522620 :       exp_table[idx] = exp_ij; // exp(-zetp * (di*di*h_ii + 2*di*dj*h_ij));
     564     11522620 :       exp_ij *= exp_ij_unit;
     565              :     }
     566              : 
     567              :     // below center
     568      1005460 :     const double exp_ij_unit_inv = 1.0 / exp_ij_unit;
     569      1005460 :     exp_ij = exp_ij_center * exp_ij_unit_inv;
     570     11644280 :     for (int j = j_center - 1; j >= index_min[jdir]; j--) {
     571     10638820 :       const int idx = (j - index_min[jdir]) * stride_i + i - index_min[idir];
     572     10638820 :       exp_table[idx] = exp_ij; // exp(-zetp * (di*di*h_ii + 2*di*dj*h_ij));
     573     10638820 :       exp_ij *= exp_ij_unit_inv;
     574              :     }
     575              :   }
     576        46680 : }
     577              : 
     578              : /*******************************************************************************
     579              :  * \brief Collocates coefficients C_ijk onto the grid for general case.
     580              :  * \author Ole Schuett
     581              :  ******************************************************************************/
     582              : static inline void
     583        15560 : general_cijk_to_grid(const int border_mask, const int lp, const double zetp,
     584              :                      const double dh[3][3], const double dh_inv[3][3],
     585              :                      const double rp[3], const int npts_global[3],
     586              :                      const int npts_local[3], const int shift_local[3],
     587              :                      const int border_width[3], const double radius,
     588              :                      GRID_CONST_WHEN_COLLOCATE double *cijk,
     589        15560 :                      GRID_CONST_WHEN_INTEGRATE double *grid) {
     590              : 
     591              :   // Default for border_mask == 0.
     592        15560 :   int bounds_i[2] = {0, npts_local[0] - 1};
     593        15560 :   int bounds_j[2] = {0, npts_local[1] - 1};
     594        15560 :   int bounds_k[2] = {0, npts_local[2] - 1};
     595              : 
     596              :   // See also rs_find_node() in task_list_methods.F.
     597              :   // If the bit is set then we need to exclude the border in that direction.
     598        15560 :   if (border_mask & (1 << 0))
     599            0 :     bounds_i[0] += border_width[0];
     600        15560 :   if (border_mask & (1 << 1))
     601            0 :     bounds_i[1] -= border_width[0];
     602        15560 :   if (border_mask & (1 << 2))
     603            0 :     bounds_j[0] += border_width[1];
     604        15560 :   if (border_mask & (1 << 3))
     605            0 :     bounds_j[1] -= border_width[1];
     606        15560 :   if (border_mask & (1 << 4))
     607            0 :     bounds_k[0] += border_width[2];
     608        15560 :   if (border_mask & (1 << 5))
     609            0 :     bounds_k[1] -= border_width[2];
     610              : 
     611              :   // center in grid coords
     612              :   // gp = MATMUL(dh_inv, rp)
     613        15560 :   double gp[3] = {0.0, 0.0, 0.0};
     614        62240 :   for (int i = 0; i < 3; i++) {
     615       186720 :     for (int j = 0; j < 3; j++) {
     616       140040 :       gp[i] += dh_inv[j][i] * rp[j];
     617              :     }
     618              :   }
     619              : 
     620              :   // Get the min max indices that contain at least the cube that contains a
     621              :   // sphere around rp of radius radius if the cell is very non-orthogonal this
     622              :   // implies that many useless points are included this estimate can be improved
     623              :   // (i.e. not box but sphere should be used)
     624        15560 :   int index_min[3] = {INT_MAX, INT_MAX, INT_MAX};
     625        15560 :   int index_max[3] = {INT_MIN, INT_MIN, INT_MIN};
     626        62240 :   for (int i = -1; i <= 1; i++) {
     627       186720 :     for (int j = -1; j <= 1; j++) {
     628       560160 :       for (int k = -1; k <= 1; k++) {
     629       420120 :         const double x = rp[0] + i * radius;
     630       420120 :         const double y = rp[1] + j * radius;
     631       420120 :         const double z = rp[2] + k * radius;
     632      1680480 :         for (int idir = 0; idir < 3; idir++) {
     633      1260360 :           const double resc =
     634      1260360 :               dh_inv[0][idir] * x + dh_inv[1][idir] * y + dh_inv[2][idir] * z;
     635      1260360 :           index_min[idir] = imin(index_min[idir], (int)floor(resc));
     636      1260360 :           index_max[idir] = imax(index_max[idir], (int)ceil(resc));
     637              :         }
     638              :       }
     639              :     }
     640              :   }
     641              : 
     642              :   // Precompute mappings
     643        15560 :   const int range_i = index_max[0] - index_min[0] + 1;
     644        15560 :   int map_i[range_i];
     645        15560 :   general_precompute_mapping(index_min[0], index_max[0], shift_local[0],
     646              :                              npts_global[0], bounds_i, map_i);
     647        15560 :   const int range_j = index_max[1] - index_min[1] + 1;
     648        15560 :   int map_j[range_j];
     649        15560 :   general_precompute_mapping(index_min[1], index_max[1], shift_local[1],
     650              :                              npts_global[1], bounds_j, map_j);
     651        15560 :   const int range_k = index_max[2] - index_min[2] + 1;
     652        15560 :   int map_k[range_k];
     653        15560 :   general_precompute_mapping(index_min[2], index_max[2], shift_local[2],
     654              :                              npts_global[2], bounds_k, map_k);
     655              : 
     656              :   // Precompute exponentials
     657        15560 :   double exp_ij[range_i * range_j];
     658        15560 :   general_fill_exp_table(0, 1, index_min, index_max, zetp, dh, gp, exp_ij);
     659        15560 :   double exp_jk[range_j * range_k];
     660        15560 :   general_fill_exp_table(1, 2, index_min, index_max, zetp, dh, gp, exp_jk);
     661        15560 :   double exp_ki[range_k * range_i];
     662        15560 :   general_fill_exp_table(2, 0, index_min, index_max, zetp, dh, gp, exp_ki);
     663              : 
     664              :   // go over the grid, but cycle if the point is not within the radius
     665        15560 :   const int cij_size = (lp + 1) * (lp + 1);
     666        15560 :   double cij[cij_size];
     667       266860 :   for (int k = index_min[2]; k <= index_max[2]; k++) {
     668       251300 :     const int kg = map_k[k - index_min[2]];
     669       251300 :     if (kg < bounds_k[0] || bounds_k[1] < kg) {
     670            0 :       continue; // skip over out-of-bounds indicies
     671              :     }
     672       251300 :     const double dk = k - gp[2];
     673              : 
     674              :     // zero coef_xyt
     675       251300 :     memset(cij, 0, cij_size * sizeof(double));
     676              : 
     677              : #if (GRID_DO_COLLOCATE)
     678              :     // collocate
     679       150780 :     general_cijk_to_cij(lp, dk, cijk, cij);
     680       150780 :     general_cij_to_grid(lp, k, kg, npts_local, index_min, index_max, map_i,
     681              :                         map_j, dh, gp, radius, exp_ij, exp_jk, exp_ki, cij,
     682              :                         grid);
     683              : #else
     684              :     // integrate
     685       100520 :     general_cij_to_grid(lp, k, kg, npts_local, index_min, index_max, map_i,
     686              :                         map_j, dh, gp, radius, exp_ij, exp_jk, exp_ki, cij,
     687              :                         grid);
     688       100520 :     general_cijk_to_cij(lp, dk, cijk, cij);
     689              : #endif
     690              :   }
     691        15560 : }
     692              : 
     693              : /*******************************************************************************
     694              :  * \brief Transforms coefficients C_xyz into C_ijk.
     695              :  * \author Ole Schuett
     696              :  ******************************************************************************/
     697              : static inline void
     698        15560 : general_cxyz_to_cijk(const int lp, const double dh[3][3],
     699              :                      GRID_CONST_WHEN_COLLOCATE double *cxyz,
     700        15560 :                      GRID_CONST_WHEN_INTEGRATE double *cijk) {
     701              : 
     702              :   // transform P_{lxp,lyp,lzp} into a P_{lip,ljp,lkp} such that
     703              :   // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-x_p)**lxp (y-y_p)**lyp (z-z_p)**lzp =
     704              :   // sum_{lip,ljp,lkp} P_{lip,ljp,lkp} (i-i_p)**lip (j-j_p)**ljp (k-k_p)**lkp
     705              : 
     706              :   // transform using multinomials
     707        15560 :   double hmatgridp[lp + 1][3][3];
     708        62240 :   for (int i = 0; i < 3; i++) {
     709       186720 :     for (int j = 0; j < 3; j++) {
     710       140040 :       hmatgridp[0][j][i] = 1.0;
     711       434124 :       for (int k = 1; k <= lp; k++) {
     712       294084 :         hmatgridp[k][j][i] = hmatgridp[k - 1][j][i] * dh[j][i];
     713              :       }
     714              :     }
     715              :   }
     716              : 
     717              :   const int lpx = lp;
     718        63796 :   for (int klx = 0; klx <= lpx; klx++) {
     719       147820 :     for (int jlx = 0; jlx <= lpx - klx; jlx++) {
     720       270744 :       for (int ilx = 0; ilx <= lpx - klx - jlx; ilx++) {
     721       171160 :         const int lx = ilx + jlx + klx;
     722       171160 :         const int lpy = lp - lx;
     723       435680 :         for (int kly = 0; kly <= lpy; kly++) {
     724       645740 :           for (int jly = 0; jly <= lpy - kly; jly++) {
     725       904036 :             for (int ily = 0; ily <= lpy - kly - jly; ily++) {
     726       522816 :               const int ly = ily + jly + kly;
     727       522816 :               const int lpz = lp - lx - ly;
     728      1213680 :               for (int klz = 0; klz <= lpz; klz++) {
     729      1577784 :                 for (int jlz = 0; jlz <= lpz - klz; jlz++) {
     730      1999460 :                   for (int ilz = 0; ilz <= lpz - klz - jlz; ilz++) {
     731      1112540 :                     const int lz = ilz + jlz + klz;
     732      1112540 :                     const int il = ilx + ily + ilz;
     733      1112540 :                     const int jl = jlx + jly + jlz;
     734      1112540 :                     const int kl = klx + kly + klz;
     735      1112540 :                     const int lp1 = lp + 1;
     736      1112540 :                     const int cijk_index =
     737      1112540 :                         kl * lp1 * lp1 + jl * lp1 + il; // [kl,jl,il]
     738      1112540 :                     const int cxyz_index =
     739      1112540 :                         lz * lp1 * lp1 + ly * lp1 + lx; // [lz,ly,lx]
     740      1112540 :                     const double p =
     741      1112540 :                         hmatgridp[ilx][0][0] * hmatgridp[jlx][1][0] *
     742      1112540 :                         hmatgridp[klx][2][0] * hmatgridp[ily][0][1] *
     743      1112540 :                         hmatgridp[jly][1][1] * hmatgridp[kly][2][1] *
     744      1112540 :                         hmatgridp[ilz][0][2] * hmatgridp[jlz][1][2] *
     745      1112540 :                         hmatgridp[klz][2][2] * fac(lx) * fac(ly) * fac(lz) /
     746      1112540 :                         (fac(ilx) * fac(ily) * fac(ilz) * fac(jlx) * fac(jly) *
     747      1112540 :                          fac(jlz) * fac(klx) * fac(kly) * fac(klz));
     748              : #if (GRID_DO_COLLOCATE)
     749       513480 :                     cijk[cijk_index] += cxyz[cxyz_index] * p; // collocate
     750              : #else
     751       599060 :                     cxyz[cxyz_index] += cijk[cijk_index] * p; // integrate
     752              : #endif
     753              :                   }
     754              :                 }
     755              :               }
     756              :             }
     757              :           }
     758              :         }
     759              :       }
     760              :     }
     761              :   }
     762        15560 : }
     763              : 
     764              : /*******************************************************************************
     765              :  * \brief Collocates coefficients C_xyz onto the grid for general case.
     766              :  * \author Ole Schuett
     767              :  ******************************************************************************/
     768              : static inline void
     769        15560 : general_cxyz_to_grid(const int border_mask, const int lp, const double zetp,
     770              :                      const double dh[3][3], const double dh_inv[3][3],
     771              :                      const double rp[3], const int npts_global[3],
     772              :                      const int npts_local[3], const int shift_local[3],
     773              :                      const int border_width[3], const double radius,
     774              :                      GRID_CONST_WHEN_COLLOCATE double *cxyz,
     775        15560 :                      GRID_CONST_WHEN_INTEGRATE double *grid) {
     776              : 
     777        15560 :   const size_t cijk_size = (lp + 1) * (lp + 1) * (lp + 1);
     778        15560 :   double cijk[cijk_size];
     779        15560 :   memset(cijk, 0, cijk_size * sizeof(double));
     780              : 
     781              : #if (GRID_DO_COLLOCATE)
     782              :   // collocate
     783         9336 :   general_cxyz_to_cijk(lp, dh, cxyz, cijk);
     784         9336 :   general_cijk_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
     785              :                        npts_local, shift_local, border_width, radius, cijk,
     786              :                        grid);
     787              : #else
     788              :   // integrate
     789         6224 :   general_cijk_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
     790              :                        npts_local, shift_local, border_width, radius, cijk,
     791              :                        grid);
     792         6224 :   general_cxyz_to_cijk(lp, dh, cxyz, cijk);
     793              : #endif
     794        15560 : }
     795              : 
     796              : /*******************************************************************************
     797              :  * \brief Collocates coefficients C_xyz onto the grid.
     798              :  * \author Ole Schuett
     799              :  ******************************************************************************/
     800              : static inline void
     801        18836 : cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp,
     802              :              const double zetp, const double dh[3][3],
     803              :              const double dh_inv[3][3], const double rp[3],
     804              :              const int npts_global[3], const int npts_local[3],
     805              :              const int shift_local[3], const int border_width[3],
     806              :              const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz,
     807              :              GRID_CONST_WHEN_INTEGRATE double *grid) {
     808              : 
     809        18836 :   enum grid_library_kernel k;
     810        18836 :   if (orthorhombic && border_mask == 0) {
     811         3276 :     k = (GRID_DO_COLLOCATE) ? GRID_COLLOCATE_ORTHO : GRID_INTEGRATE_ORTHO;
     812         3276 :     ortho_cxyz_to_grid(lp, zetp, dh, dh_inv, rp, npts_global, npts_local,
     813              :                        shift_local, radius, cxyz, grid);
     814              :   } else {
     815        15560 :     k = (GRID_DO_COLLOCATE) ? GRID_COLLOCATE_GENERAL : GRID_INTEGRATE_GENERAL;
     816        15560 :     general_cxyz_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
     817              :                          npts_local, shift_local, border_width, radius, cxyz,
     818              :                          grid);
     819              :   }
     820        18836 :   grid_library_counter_add(lp, GRID_BACKEND_REF, k, 1);
     821        18836 : }
     822              : 
     823              : /*******************************************************************************
     824              :  * \brief Transforms coefficients C_ab into C_xyz.
     825              :  * \author Ole Schuett
     826              :  ******************************************************************************/
     827        18836 : static inline void cab_to_cxyz(const int la_max, const int la_min,
     828              :                                const int lb_max, const int lb_min,
     829              :                                const double prefactor, const double ra[3],
     830              :                                const double rb[3], const double rp[3],
     831              :                                GRID_CONST_WHEN_COLLOCATE double *cab,
     832        18836 :                                GRID_CONST_WHEN_INTEGRATE double *cxyz) {
     833              : 
     834              :   // Computes the polynomial expansion coefficients:
     835              :   //     (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,lxa,lxb,1)*(x-p)**ls
     836        18836 :   const int lp = la_max + lb_max;
     837        18836 :   double alpha[3][lb_max + 1][la_max + 1][lp + 1];
     838        18836 :   memset(alpha, 0, 3 * (lb_max + 1) * (la_max + 1) * (lp + 1) * sizeof(double));
     839              : 
     840        75344 :   for (int i = 0; i < 3; i++) {
     841        56508 :     const double drpa = rp[i] - ra[i];
     842        56508 :     const double drpb = rp[i] - rb[i];
     843       175926 :     for (int lxa = 0; lxa <= la_max; lxa++) {
     844       360804 :       for (int lxb = 0; lxb <= lb_max; lxb++) {
     845              :         double binomial_k_lxa = 1.0;
     846              :         double a = 1.0;
     847       623070 :         for (int k = 0; k <= lxa; k++) {
     848              :           double binomial_l_lxb = 1.0;
     849              :           double b = 1.0;
     850       960348 :           for (int l = 0; l <= lxb; l++) {
     851       578664 :             alpha[i][lxb][lxa][lxa - l + lxb - k] +=
     852       578664 :                 binomial_k_lxa * binomial_l_lxb * a * b;
     853       578664 :             binomial_l_lxb *= ((double)(lxb - l)) / ((double)(l + 1));
     854       578664 :             b *= drpb;
     855              :           }
     856       381684 :           binomial_k_lxa *= ((double)(lxa - k)) / ((double)(k + 1));
     857       381684 :           a *= drpa;
     858              :         }
     859              :       }
     860              :     }
     861              :   }
     862              : 
     863              :   //   *** initialise the coefficient matrix, we transform the sum
     864              :   //
     865              :   // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
     866              :   //         (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
     867              :   //         (z-a_z)**lza
     868              :   //
     869              :   // into
     870              :   //
     871              :   // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
     872              :   //
     873              :   // where p is center of the product gaussian, and lp = la_max + lb_max
     874              :   // (current implementation is l**7)
     875              :   //
     876              : 
     877        56904 :   for (int lzb = 0; lzb <= lb_max; lzb++) {
     878       118530 :     for (int lza = 0; lza <= la_max; lza++) {
     879       202430 :       for (int lyb = 0; lyb <= lb_max - lzb; lyb++) {
     880       314856 :         for (int lya = 0; lya <= la_max - lza; lya++) {
     881       192888 :           const int lxb_min = imax(lb_min - lzb - lyb, 0);
     882       192888 :           const int lxa_min = imax(la_min - lza - lya, 0);
     883       447764 :           for (int lxb = lxb_min; lxb <= lb_max - lzb - lyb; lxb++) {
     884       606066 :             for (int lxa = lxa_min; lxa <= la_max - lza - lya; lxa++) {
     885       351190 :               const int ico = coset(lxa, lya, lza);
     886       351190 :               const int jco = coset(lxb, lyb, lzb);
     887       351190 :               const int cab_index = jco * ncoset(la_max) + ico; // [jco, ico]
     888       904864 :               for (int lzp = 0; lzp <= lza + lzb; lzp++) {
     889      1901454 :                 for (int lyp = 0; lyp <= lp - lza - lzb; lyp++) {
     890      3834814 :                   for (int lxp = 0; lxp <= lp - lza - lzb - lyp; lxp++) {
     891      2487034 :                     const double p = alpha[0][lxb][lxa][lxp] *
     892      2487034 :                                      alpha[1][lyb][lya][lyp] *
     893      2487034 :                                      alpha[2][lzb][lza][lzp] * prefactor;
     894      2487034 :                     const int lp1 = lp + 1;
     895      2487034 :                     const int cxyz_index =
     896      2487034 :                         lzp * lp1 * lp1 + lyp * lp1 + lxp; // [lzp, lyp, lxp]
     897              : #if (GRID_DO_COLLOCATE)
     898      1085292 :                     cxyz[cxyz_index] += cab[cab_index] * p; // collocate
     899              : #else
     900      1401742 :                     cab[cab_index] += cxyz[cxyz_index] * p; // integrate
     901              : #endif
     902              :                   }
     903              :                 }
     904              :               }
     905              :             }
     906              :           }
     907              :         }
     908              :       }
     909              :     }
     910              :   }
     911        18836 : }
     912              : 
     913              : /*******************************************************************************
     914              :  * \brief Collocates coefficients C_ab onto the grid.
     915              :  * \author Ole Schuett
     916              :  ******************************************************************************/
     917              : static inline void
     918        18836 : cab_to_grid(const bool orthorhombic, const int border_mask, const int la_max,
     919              :             const int la_min, const int lb_max, const int lb_min,
     920              :             const double zeta, const double zetb, const double rscale,
     921              :             const double dh[3][3], const double dh_inv[3][3],
     922              :             const double ra[3], const double rab[3], const int npts_global[3],
     923              :             const int npts_local[3], const int shift_local[3],
     924              :             const int border_width[3], const double radius,
     925              :             GRID_CONST_WHEN_COLLOCATE double *cab,
     926        18836 :             GRID_CONST_WHEN_INTEGRATE double *grid) {
     927              : 
     928              :   // Check if radius is too small to be mapped onto grid of given resolution.
     929        18836 :   double dh_max = 0.0;
     930        75344 :   for (int i = 0; i < 3; i++) {
     931       226032 :     for (int j = 0; j < 3; j++) {
     932       169524 :       dh_max = fmax(dh_max, fabs(dh[i][j]));
     933              :     }
     934              :   }
     935        18836 :   if (2.0 * radius < dh_max) {
     936            0 :     return;
     937              :   }
     938              : 
     939        18836 :   const double zetp = zeta + zetb;
     940        18836 :   const double f = zetb / zetp;
     941        18836 :   const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
     942        18836 :   const double prefactor = rscale * exp(-zeta * f * rab2);
     943        18836 :   double rp[3], rb[3];
     944        75344 :   for (int i = 0; i < 3; i++) {
     945        56508 :     rp[i] = ra[i] + f * rab[i];
     946        56508 :     rb[i] = ra[i] + rab[i];
     947              :   }
     948              : 
     949        18836 :   const int lp = la_max + lb_max;
     950        18836 :   const size_t cxyz_size = (lp + 1) * (lp + 1) * (lp + 1);
     951        18836 :   double cxyz[cxyz_size];
     952        18836 :   memset(cxyz, 0, cxyz_size * sizeof(double));
     953              : 
     954              : #if (GRID_DO_COLLOCATE)
     955              :   // collocate
     956        10974 :   cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
     957        10974 :   cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
     958              :                npts_local, shift_local, border_width, radius, cxyz, grid);
     959              : #else
     960              :   // integrate
     961         7862 :   cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
     962              :                npts_local, shift_local, border_width, radius, cxyz, grid);
     963         7862 :   cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
     964              : #endif
     965              : }
     966              : 
     967              : // EOF
        

Generated by: LCOV version 2.0-1