LCOV - code coverage report
Current view: top level - src/grid/cpu - grid_cpu_collint.h (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:b1f098b) Lines: 516 518 99.6 %
Date: 2024-05-05 06:30:09 Functions: 13 13 100.0 %

          Line data    Source code
       1             : /*----------------------------------------------------------------------------*/
       2             : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3             : /*  Copyright 2000-2024 CP2K developers group <https://cp2k.org>              */
       4             : /*                                                                            */
       5             : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6             : /*----------------------------------------------------------------------------*/
       7             : 
       8             : #include <assert.h>
       9             : #include <limits.h>
      10             : #include <math.h>
      11             : #include <stdio.h>
      12             : #include <stdlib.h>
      13             : #include <string.h>
      14             : 
      15             : #if defined(__AVX2__) && defined(__FMA__)
      16             : #include <immintrin.h>
      17             : #endif
      18             : 
      19             : #include "../common/grid_common.h"
      20             : #include "../common/grid_library.h"
      21             : #include "../common/grid_sphere_cache.h"
      22             : 
      23             : #define GRID_MAX_LP_OPTIMIZED 9
      24             : 
      25             : #if (GRID_DO_COLLOCATE)
      26             : #define GRID_CONST_WHEN_COLLOCATE const
      27             : #define GRID_CONST_WHEN_INTEGRATE
      28             : #else
      29             : #define GRID_CONST_WHEN_COLLOCATE
      30             : #define GRID_CONST_WHEN_INTEGRATE const
      31             : #endif
      32             : 
      33             : /*******************************************************************************
      34             :  * \brief Simple loop body for ortho_cx_to_grid using plain C.
      35             :  * \author Ole Schuett
      36             :  ******************************************************************************/
      37             : static inline void __attribute__((always_inline))
      38 27162802082 : ortho_cx_to_grid_scalar(const int lp, const int cmax, const int i,
      39 27162802082 :                         const double pol[3][lp + 1][2 * cmax + 1],
      40             :                         GRID_CONST_WHEN_COLLOCATE double *cx,
      41             :                         GRID_CONST_WHEN_INTEGRATE double *grid_0,
      42             :                         GRID_CONST_WHEN_INTEGRATE double *grid_1,
      43             :                         GRID_CONST_WHEN_INTEGRATE double *grid_2,
      44             :                         GRID_CONST_WHEN_INTEGRATE double *grid_3) {
      45             : 
      46             : #if (GRID_DO_COLLOCATE)
      47             :   // collocate
      48 14277677424 :   double reg[4] = {0.0, 0.0, 0.0, 0.0};
      49 >11422*10^7 : #pragma omp simd reduction(+ : reg)
      50             :   for (int lxp = 0; lxp <= lp; lxp++) {
      51 38068376270 :     const double p = pol[0][lxp][i + cmax];
      52 38068376270 :     reg[0] += cx[lxp * 4 + 0] * p;
      53 38068376270 :     reg[1] += cx[lxp * 4 + 1] * p;
      54 38068376270 :     reg[2] += cx[lxp * 4 + 2] * p;
      55 38068376270 :     reg[3] += cx[lxp * 4 + 3] * p;
      56             :   }
      57 14277677424 :   *grid_0 += reg[0];
      58 14277677424 :   *grid_1 += reg[1];
      59 14277677424 :   *grid_2 += reg[2];
      60 14277677424 :   *grid_3 += reg[3];
      61             : 
      62             : #else
      63             :   // integrate
      64 12885124658 :   const double reg[4] = {*grid_0, *grid_1, *grid_2, *grid_3};
      65             : #pragma omp simd
      66 12885124658 :   for (int lxp = 0; lxp <= lp; lxp++) {
      67 35024897432 :     const double p = pol[0][lxp][i + cmax];
      68 35024897432 :     cx[lxp * 4 + 0] += reg[0] * p;
      69 35024897432 :     cx[lxp * 4 + 1] += reg[1] * p;
      70 35024897432 :     cx[lxp * 4 + 2] += reg[2] * p;
      71 35024897432 :     cx[lxp * 4 + 3] += reg[3] * p;
      72             :   }
      73             : #endif
      74             : }
      75             : 
      76             : /*******************************************************************************
      77             :  * \brief Optimized loop body for ortho_cx_to_grid using AVX2 Intel Intrinsics.
      78             :  *        This routine always processes four consecutive grid elements at once.
      79             :  * \author Ole Schuett
      80             :  ******************************************************************************/
      81             : #if defined(__AVX2__) && defined(__FMA__)
      82             : static inline void __attribute__((always_inline))
      83 42759286079 : ortho_cx_to_grid_avx2(const int lp, const int cmax, const int i,
      84 42759286079 :                       const double pol[3][lp + 1][2 * cmax + 1],
      85             :                       GRID_CONST_WHEN_COLLOCATE double *cx,
      86             :                       GRID_CONST_WHEN_INTEGRATE double *grid_0,
      87             :                       GRID_CONST_WHEN_INTEGRATE double *grid_1,
      88             :                       GRID_CONST_WHEN_INTEGRATE double *grid_2,
      89             :                       GRID_CONST_WHEN_INTEGRATE double *grid_3) {
      90             : 
      91 42759286079 :   const int icmax = i + cmax;
      92             : 
      93             : #if (GRID_DO_COLLOCATE)
      94             :   // collocate
      95             :   // First iteration for lxp == 0 does not need add instructions.
      96 22610366553 :   __m256d p_vec = _mm256_loadu_pd(&pol[0][0][icmax]);
      97 22610366553 :   __m256d r_vec_0 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[0]));
      98 22610366553 :   __m256d r_vec_1 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[1]));
      99 22610366553 :   __m256d r_vec_2 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[2]));
     100 22610366553 :   __m256d r_vec_3 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[3]));
     101             : 
     102             :   // Remaining iterations for lxp > 0 use fused multiply adds.
     103             :   GRID_PRAGMA_UNROLL_UP_TO(GRID_MAX_LP_OPTIMIZED)
     104 59634710758 :   for (int lxp = 1; lxp <= lp; lxp++) {
     105 37024344205 :     const double *cx_base = &cx[lxp * 4];
     106 37024344205 :     p_vec = _mm256_loadu_pd(&pol[0][lxp][icmax]);
     107 37024344205 :     r_vec_0 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[0]), r_vec_0);
     108 37024344205 :     r_vec_1 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[1]), r_vec_1);
     109 37024344205 :     r_vec_2 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[2]), r_vec_2);
     110 37024344205 :     r_vec_3 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[3]), r_vec_3);
     111             :   }
     112             : 
     113             :   // Add vectors to grid one at a time, because they can aliase when cube wraps.
     114 22610366553 :   _mm256_storeu_pd(grid_0, _mm256_add_pd(_mm256_loadu_pd(grid_0), r_vec_0));
     115 22610366553 :   _mm256_storeu_pd(grid_1, _mm256_add_pd(_mm256_loadu_pd(grid_1), r_vec_1));
     116 22610366553 :   _mm256_storeu_pd(grid_2, _mm256_add_pd(_mm256_loadu_pd(grid_2), r_vec_2));
     117 22610366553 :   _mm256_storeu_pd(grid_3, _mm256_add_pd(_mm256_loadu_pd(grid_3), r_vec_3));
     118             : 
     119             : #else
     120             :   // integrate
     121 20148919526 :   __m256d grid_vec_0 = _mm256_loadu_pd(grid_0);
     122 20148919526 :   __m256d grid_vec_1 = _mm256_loadu_pd(grid_1);
     123 20148919526 :   __m256d grid_vec_2 = _mm256_loadu_pd(grid_2);
     124 20148919526 :   __m256d grid_vec_3 = _mm256_loadu_pd(grid_3);
     125             : 
     126             :   GRID_PRAGMA_UNROLL_UP_TO(GRID_MAX_LP_OPTIMIZED + 1)
     127 74217513617 :   for (int lxp = 0; lxp <= lp; lxp++) {
     128 54068594091 :     __m256d p_vec = _mm256_loadu_pd(&pol[0][lxp][icmax]);
     129             : 
     130             :     // Do 4 dot products at once. https://stackoverflow.com/a/10454420
     131 54068594091 :     __m256d xy0 = _mm256_mul_pd(p_vec, grid_vec_0);
     132 54068594091 :     __m256d xy1 = _mm256_mul_pd(p_vec, grid_vec_1);
     133 54068594091 :     __m256d xy2 = _mm256_mul_pd(p_vec, grid_vec_2);
     134 54068594091 :     __m256d xy3 = _mm256_mul_pd(p_vec, grid_vec_3);
     135             : 
     136             :     // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13
     137 54068594091 :     __m256d temp01 = _mm256_hadd_pd(xy0, xy1);
     138             : 
     139             :     // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33
     140 54068594091 :     __m256d temp23 = _mm256_hadd_pd(xy2, xy3);
     141             : 
     142             :     // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31
     143 54068594091 :     __m256d swapped = _mm256_permute2f128_pd(temp01, temp23, 0x21);
     144             : 
     145             :     // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33
     146 54068594091 :     __m256d blended = _mm256_blend_pd(temp01, temp23, 0b1100);
     147             : 
     148 54068594091 :     __m256d r_vec = _mm256_add_pd(swapped, blended);
     149             : 
     150             :     // cx += r_vec
     151 54068594091 :     double *cx_base = &cx[lxp * 4];
     152 54068594091 :     _mm256_storeu_pd(cx_base, _mm256_add_pd(r_vec, _mm256_loadu_pd(cx_base)));
     153             :   }
     154             : #endif
     155             : }
     156             : #endif // __AVX2__ && __FMA__
     157             : 
     158             : /*******************************************************************************
     159             :  * \brief Collocates coefficients C_x onto the grid for orthorhombic case.
     160             :  * \author Ole Schuett
     161             :  ******************************************************************************/
     162             : static inline void __attribute__((always_inline))
     163 10858466778 : ortho_cx_to_grid(const int lp, const int kg1, const int kg2, const int jg1,
     164             :                  const int jg2, const int cmax,
     165 10858466778 :                  const double pol[3][lp + 1][2 * cmax + 1],
     166             :                  const int map[3][2 * cmax + 1],
     167             :                  const int sections[3][2 * cmax + 1], const int npts_local[3],
     168             :                  int **sphere_bounds_iter, GRID_CONST_WHEN_COLLOCATE double *cx,
     169             :                  GRID_CONST_WHEN_INTEGRATE double *grid) {
     170             : 
     171             :   // Lower and upper sphere bounds relative to center, ie. in cube coordinates.
     172 10858466778 :   const int lb = *((*sphere_bounds_iter)++);
     173 10858466778 :   const int ub = 1 - lb;
     174             : 
     175             :   // AVX instructions can only load/store from evenly spaced memory locations.
     176             :   // Since the sphere bounds can wrap around due to the grid's periodicity,
     177             :   // the inner loop runs over sections with homogeneous cube to grid mapping.
     178 30763025182 :   for (int istart = lb; istart <= ub; istart++) {
     179 19904558404 :     const int istop = imin(ub, istart + sections[0][istart + cmax]);
     180 19904558404 :     const int cube2grid = map[0][istart + cmax] - istart;
     181             : 
     182 19904558404 :     const int stride = npts_local[1] * npts_local[0];
     183 19904558404 :     const int grid_index_0 = kg1 * stride + jg1 * npts_local[0];
     184 19904558404 :     const int grid_index_1 = kg2 * stride + jg1 * npts_local[0];
     185 19904558404 :     const int grid_index_2 = kg1 * stride + jg2 * npts_local[0];
     186 19904558404 :     const int grid_index_3 = kg2 * stride + jg2 * npts_local[0];
     187 19904558404 :     GRID_CONST_WHEN_INTEGRATE double *grid_base_0 = &grid[grid_index_0];
     188 19904558404 :     GRID_CONST_WHEN_INTEGRATE double *grid_base_1 = &grid[grid_index_1];
     189 19904558404 :     GRID_CONST_WHEN_INTEGRATE double *grid_base_2 = &grid[grid_index_2];
     190 19904558404 :     GRID_CONST_WHEN_INTEGRATE double *grid_base_3 = &grid[grid_index_3];
     191             : 
     192             :     // Use AVX2 to process grid points in chunks of four, ie. 256 bit vectors.
     193             : #if defined(__AVX2__) && defined(__FMA__)
     194 19904558404 :     const int istop_vec = istart + 4 * ((istop - istart + 1) / 4) - 1;
     195 62663844483 :     for (int i = istart; i <= istop_vec; i += 4) {
     196 42759286079 :       const int ig = i + cube2grid;
     197 42759286079 :       ortho_cx_to_grid_avx2(lp, cmax, i, pol, cx, &grid_base_0[ig],
     198             :                             &grid_base_1[ig], &grid_base_2[ig],
     199 42759286079 :                             &grid_base_3[ig]);
     200             :     }
     201 47067360486 :     istart = istop_vec + 1;
     202             : #endif
     203             : 
     204             :     // Process up to 3 remaining points - or everything if AVX2 isn't available.
     205 47067360486 :     for (int i = istart; i <= istop; i++) {
     206 27162802082 :       const int ig = i + cube2grid;
     207 27162802082 :       ortho_cx_to_grid_scalar(lp, cmax, i, pol, cx, &grid_base_0[ig],
     208             :                               &grid_base_1[ig], &grid_base_2[ig],
     209 27162802082 :                               &grid_base_3[ig]);
     210             :     }
     211 19904558404 :     istart = istop;
     212             :   }
     213             : }
     214             : 
     215             : /*******************************************************************************
     216             :  * \brief Transforms coefficients C_xy into C_x by fixing grid index j.
     217             :  * \author Ole Schuett
     218             :  ******************************************************************************/
     219             : static inline void __attribute__((always_inline))
     220 10858466778 : ortho_cxy_to_cx(const int lp, const int j1, const int j2, const int cmax,
     221 10858466778 :                 const double pol[3][lp + 1][2 * cmax + 1],
     222             :                 GRID_CONST_WHEN_COLLOCATE double *cxy,
     223             :                 GRID_CONST_WHEN_INTEGRATE double *cx) {
     224             : 
     225 39470812314 :   for (int lyp = 0; lyp <= lp; lyp++) {
     226 88587911670 :     for (int lxp = 0; lxp <= lp - lyp; lxp++) {
     227 59975566134 :       const double p1 = pol[1][lyp][j1 + cmax];
     228 59975566134 :       const double p2 = pol[1][lyp][j2 + cmax];
     229 59975566134 :       const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2; // [lyp, lxp, 0]
     230             : 
     231             : #if (GRID_DO_COLLOCATE)
     232             :       // collocate
     233 30845451955 :       cx[lxp * 4 + 0] += cxy[cxy_index + 0] * p1;
     234 30845451955 :       cx[lxp * 4 + 1] += cxy[cxy_index + 1] * p1;
     235 30845451955 :       cx[lxp * 4 + 2] += cxy[cxy_index + 0] * p2;
     236 30845451955 :       cx[lxp * 4 + 3] += cxy[cxy_index + 1] * p2;
     237             : #else
     238             :       // integrate
     239 29130114179 :       cxy[cxy_index + 0] += cx[lxp * 4 + 0] * p1;
     240 29130114179 :       cxy[cxy_index + 1] += cx[lxp * 4 + 1] * p1;
     241 29130114179 :       cxy[cxy_index + 0] += cx[lxp * 4 + 2] * p2;
     242 29130114179 :       cxy[cxy_index + 1] += cx[lxp * 4 + 3] * p2;
     243             : #endif
     244             :     }
     245             :   }
     246             : }
     247             : 
     248             : /*******************************************************************************
     249             :  * \brief Loop body of ortho_cxy_to_grid to be inlined for low values of lp.
     250             :  * \author Ole Schuett
     251             :  ******************************************************************************/
     252 10858466778 : static inline void __attribute__((always_inline)) ortho_cxy_to_grid_low(
     253             :     const int lp, const int j1, const int j2, const int kg1, const int kg2,
     254             :     const int jg1, const int jg2, const int cmax,
     255 10858466778 :     const double pol[3][lp + 1][2 * cmax + 1], const int map[3][2 * cmax + 1],
     256             :     const int sections[3][2 * cmax + 1], const int npts_local[3],
     257             :     int **sphere_bounds_iter, double *cx, GRID_CONST_WHEN_COLLOCATE double *cxy,
     258             :     GRID_CONST_WHEN_INTEGRATE double *grid) {
     259             : 
     260             : #if (GRID_DO_COLLOCATE)
     261             :   // collocate
     262 11343818368 :   ortho_cxy_to_cx(lp, j1, j2, cmax, pol, cxy, cx);
     263  5671909184 :   ortho_cx_to_grid(lp, kg1, kg2, jg1, jg2, cmax, pol, map, sections, npts_local,
     264             :                    sphere_bounds_iter, cx, grid);
     265             : #else
     266             :   // integrate
     267 10373115188 :   ortho_cx_to_grid(lp, kg1, kg2, jg1, jg2, cmax, pol, map, sections, npts_local,
     268             :                    sphere_bounds_iter, cx, grid);
     269  5186557594 :   ortho_cxy_to_cx(lp, j1, j2, cmax, pol, cxy, cx);
     270             : #endif
     271             : }
     272             : 
     273             : /*******************************************************************************
     274             :  * \brief Collocates coefficients C_xy onto the grid for orthorhombic case.
     275             :  * \author Ole Schuett
     276             :  ******************************************************************************/
     277  1226222680 : static inline void ortho_cxy_to_grid(
     278             :     const int lp, const int kg1, const int kg2, const int cmax,
     279  1226222680 :     const double pol[3][lp + 1][2 * cmax + 1], const int map[3][2 * cmax + 1],
     280             :     const int sections[3][2 * cmax + 1], const int npts_local[3],
     281             :     int **sphere_bounds_iter, GRID_CONST_WHEN_COLLOCATE double *cxy,
     282  1226222680 :     GRID_CONST_WHEN_INTEGRATE double *grid) {
     283             : 
     284             :   // The cube contains an even number of grid points in each direction and
     285             :   // collocation is always performed on a pair of two opposing grid points.
     286             :   // Hence, the points with index 0 and 1 are both assigned distance zero via
     287             :   // the formular distance=(2*index-1)/2.
     288             : 
     289  1226222680 :   const int jstart = *((*sphere_bounds_iter)++);
     290  1226222680 :   const size_t cx_size = (lp + 1) * 4;
     291  1226222680 :   double cx[cx_size];
     292 12084689458 :   for (int j1 = jstart; j1 <= 0; j1++) {
     293 10858466778 :     const int j2 = 1 - j1;
     294 10858466778 :     const int jg1 = map[1][j1 + cmax];
     295 10858466778 :     const int jg2 = map[1][j2 + cmax];
     296             : 
     297 10858466778 :     memset(cx, 0, cx_size * sizeof(double));
     298             : 
     299             :     // Generate separate branches for low values of lp gives up to 30% speedup.
     300 10858466778 :     if (lp <= GRID_MAX_LP_OPTIMIZED) {
     301             :       GRID_PRAGMA_UNROLL(GRID_MAX_LP_OPTIMIZED + 1)
     302 >11944*10^7 :       for (int ilp = 0; ilp <= GRID_MAX_LP_OPTIMIZED; ilp++) {
     303 >10858*10^7 :         if (lp == ilp) {
     304 >11944*10^7 :           ortho_cxy_to_grid_low(ilp, j1, j2, kg1, kg2, jg1, jg2, cmax, pol, map,
     305             :                                 sections, npts_local, sphere_bounds_iter, cx,
     306             :                                 cxy, grid);
     307             :         }
     308             :       }
     309             :     } else {
     310 10858470396 :       ortho_cxy_to_grid_low(lp, j1, j2, kg1, kg2, jg1, jg2, cmax, pol, map,
     311             :                             sections, npts_local, sphere_bounds_iter, cx, cxy,
     312             :                             grid);
     313             :     }
     314             :   }
     315  1226222680 : }
     316             : 
     317             : /*******************************************************************************
     318             :  * \brief Transforms coefficients C_xyz into C_xz by fixing grid index k.
     319             :  * \author Ole Schuett
     320             :  ******************************************************************************/
     321  1226222680 : static inline void ortho_cxyz_to_cxy(const int lp, const int k1, const int k2,
     322             :                                      const int cmax,
     323  1226222680 :                                      const double pol[3][lp + 1][2 * cmax + 1],
     324             :                                      GRID_CONST_WHEN_COLLOCATE double *cxyz,
     325             :                                      GRID_CONST_WHEN_INTEGRATE double *cxy) {
     326             : 
     327  4394205836 :   for (int lzp = 0; lzp <= lp; lzp++) {
     328  9730972342 :     for (int lyp = 0; lyp <= lp - lzp; lyp++) {
     329 18589330929 :       for (int lxp = 0; lxp <= lp - lzp - lyp; lxp++) {
     330 12026341743 :         const double p1 = pol[2][lzp][k1 + cmax];
     331 12026341743 :         const double p2 = pol[2][lzp][k2 + cmax];
     332 12026341743 :         const int cxyz_index =
     333 12026341743 :             lzp * (lp + 1) * (lp + 1) + lyp * (lp + 1) + lxp; // [lzp, lyp, lxp]
     334 12026341743 :         const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2;   // [lyp, lxp, 0]
     335             : 
     336             : #if (GRID_DO_COLLOCATE)
     337             :         // collocate
     338  5971923887 :         cxy[cxy_index + 0] += cxyz[cxyz_index] * p1;
     339  5971923887 :         cxy[cxy_index + 1] += cxyz[cxyz_index] * p2;
     340             : #else
     341             :         // integrate
     342  6054417856 :         cxyz[cxyz_index] += cxy[cxy_index + 0] * p1;
     343  6054417856 :         cxyz[cxyz_index] += cxy[cxy_index + 1] * p2;
     344             : #endif
     345             :       }
     346             :     }
     347             :   }
     348  1226222680 : }
     349             : 
     350             : /*******************************************************************************
     351             :  * \brief Collocates coefficients C_xyz onto the grid for orthorhombic case.
     352             :  * \author Ole Schuett
     353             :  ******************************************************************************/
     354             : static inline void
     355   119725623 : ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3],
     356             :                    const double dh_inv[3][3], const double rp[3],
     357             :                    const int npts_global[3], const int npts_local[3],
     358             :                    const int shift_local[3], const double radius,
     359             :                    GRID_CONST_WHEN_COLLOCATE double *cxyz,
     360   119725623 :                    GRID_CONST_WHEN_INTEGRATE double *grid) {
     361             : 
     362             :   // *** position of the gaussian product
     363             :   //
     364             :   // this is the actual definition of the position on the grid
     365             :   // i.e. a point rp(:) gets here grid coordinates
     366             :   // MODULO(rp(:)/dr(:),npts_global(:))+1
     367             :   // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid in Fortran
     368             :   // and (1,1,1) on grid here in C.
     369             : 
     370             :   // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
     371   119725623 :   int cubecenter[3];
     372   478902492 :   for (int i = 0; i < 3; i++) {
     373             :     double dh_inv_rp = 0.0;
     374  1436707476 :     for (int j = 0; j < 3; j++) {
     375  1077530607 :       dh_inv_rp += dh_inv[j][i] * rp[j];
     376             :     }
     377   359176869 :     cubecenter[i] = (int)floor(dh_inv_rp);
     378             :   }
     379             : 
     380             :   double roffset[3];
     381   478902492 :   for (int i = 0; i < 3; i++) {
     382   359176869 :     roffset[i] = rp[i] - ((double)cubecenter[i]) * dh[i][i];
     383             :   }
     384             : 
     385             :   // Lookup loop bounds for spherical cutoff.
     386   119725623 :   int *sphere_bounds;
     387   119725623 :   double disr_radius;
     388   119725623 :   grid_sphere_cache_lookup(radius, dh, dh_inv, &sphere_bounds, &disr_radius);
     389   119725623 :   int **sphere_bounds_iter = &sphere_bounds;
     390             : 
     391             :   // Cube bounds.
     392   119725623 :   int lb_cube[3], ub_cube[3];
     393   478902492 :   for (int i = 0; i < 3; i++) {
     394   359176869 :     lb_cube[i] = (int)ceil(-1e-8 - disr_radius * dh_inv[i][i]);
     395   359176869 :     ub_cube[i] = 1 - lb_cube[i];
     396             :     // If grid is not period check that cube fits without wrapping.
     397   359176869 :     if (npts_global[i] != npts_local[i]) {
     398       17826 :       const int offset =
     399       17826 :           modulo(cubecenter[i] + lb_cube[i] - shift_local[i], npts_global[i]) -
     400             :           lb_cube[i];
     401       17826 :       assert(offset + ub_cube[i] < npts_local[i]);
     402       17826 :       assert(offset + lb_cube[i] >= 0);
     403             :     }
     404             :   }
     405             : 
     406             :   // cmax = MAXVAL(ub_cube)
     407   119725623 :   const int cmax = imax(imax(ub_cube[0], ub_cube[1]), ub_cube[2]);
     408             : 
     409             :   // Precompute (x-xp)**lp*exp(..) for each direction.
     410   119725623 :   double pol_mutable[3][lp + 1][2 * cmax + 1];
     411   478902492 :   for (int idir = 0; idir < 3; idir++) {
     412   359176869 :     const double dr = dh[idir][idir];
     413   359176869 :     const double ro = roffset[idir];
     414             :     //  Reuse the result from the previous gridpoint to avoid to many exps:
     415             :     //  exp( -a*(x+d)**2) = exp(-a*x**2)*exp(-2*a*x*d)*exp(-a*d**2)
     416             :     //  exp(-2*a*(x+d)*d) = exp(-2*a*x*d)*exp(-2*a*d**2)
     417   359176869 :     const double t_exp_1 = exp(-zetp * pow(dr, 2));
     418   359176869 :     const double t_exp_2 = pow(t_exp_1, 2);
     419   359176869 :     double t_exp_min_1 = exp(-zetp * pow(+dr - ro, 2));
     420   359176869 :     double t_exp_min_2 = exp(-2 * zetp * (+dr - ro) * (-dr));
     421  4039323565 :     for (int ig = 0; ig >= lb_cube[idir]; ig--) {
     422  3680146696 :       const double rpg = ig * dr - ro;
     423  3680146696 :       t_exp_min_1 *= t_exp_min_2 * t_exp_1;
     424  3680146696 :       t_exp_min_2 *= t_exp_2;
     425  3680146696 :       double pg = t_exp_min_1;
     426 13189690031 :       for (int icoef = 0; icoef <= lp; icoef++) {
     427  9509543335 :         pol_mutable[idir][icoef][ig + cmax] = pg; // exp(-zetp*rpg**2)
     428  9509543335 :         pg *= rpg;
     429             :       }
     430             :     }
     431   359176869 :     double t_exp_plus_1 = exp(-zetp * pow(-ro, 2));
     432   359176869 :     double t_exp_plus_2 = exp(-2 * zetp * (-ro) * (+dr));
     433  4039323565 :     for (int ig = 0; ig >= lb_cube[idir]; ig--) {
     434  3680146696 :       const double rpg = (1 - ig) * dr - ro;
     435  3680146696 :       t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
     436  3680146696 :       t_exp_plus_2 *= t_exp_2;
     437  3680146696 :       double pg = t_exp_plus_1;
     438 13189690031 :       for (int icoef = 0; icoef <= lp; icoef++) {
     439  9509543335 :         pol_mutable[idir][icoef][1 - ig + cmax] = pg; // exp(-zetp*rpg**2)
     440  9509543335 :         pg *= rpg;
     441             :       }
     442             :     }
     443             :   }
     444   119725623 :   const double(*pol)[lp + 1][2 * cmax + 1] =
     445   119725623 :       (const double(*)[lp + 1][2 * cmax + 1]) pol_mutable;
     446             : 
     447             :   // Precompute mapping from cube to grid indices for each direction
     448   119725623 :   int map_mutable[3][2 * cmax + 1];
     449   478902492 :   for (int i = 0; i < 3; i++) {
     450  8140283940 :     for (int k = -cmax; k <= +cmax; k++) {
     451  7781107071 :       map_mutable[i][k + cmax] =
     452  7781107071 :           modulo(cubecenter[i] + k - shift_local[i], npts_global[i]);
     453             :     }
     454             :   }
     455   119725623 :   const int(*map)[2 * cmax + 1] = (const int(*)[2 * cmax + 1]) map_mutable;
     456             : 
     457             :   // Precompute length of sections with homogeneous cube to grid mapping.
     458   119725623 :   int sections_mutable[3][2 * cmax + 1];
     459   478902492 :   for (int i = 0; i < 3; i++) {
     460  8140283940 :     for (int kg = 2 * cmax; kg >= 0; kg--) {
     461  7781107071 :       if (kg == 2 * cmax || map[i][kg] != map[i][kg + 1] - 1) {
     462   735127490 :         sections_mutable[i][kg] = 0;
     463             :       } else {
     464  7045979581 :         sections_mutable[i][kg] = sections_mutable[i][kg + 1] + 1;
     465             :       }
     466             :     }
     467             :   }
     468   119725623 :   const int(*sections)[2 * cmax + 1] =
     469   119725623 :       (const int(*)[2 * cmax + 1]) sections_mutable;
     470             : 
     471             :   // Loop over k dimension of the cube.
     472   119725623 :   const int kstart = *((*sphere_bounds_iter)++);
     473   119725623 :   const size_t cxy_size = (lp + 1) * (lp + 1) * 2;
     474   119725623 :   double cxy[cxy_size];
     475  1345948303 :   for (int k1 = kstart; k1 <= 0; k1++) {
     476  1226222680 :     const int k2 = 1 - k1;
     477  1226222680 :     const int kg1 = map[2][k1 + cmax];
     478  1226222680 :     const int kg2 = map[2][k2 + cmax];
     479             : 
     480  1226222680 :     memset(cxy, 0, cxy_size * sizeof(double));
     481             : 
     482             : #if (GRID_DO_COLLOCATE)
     483             :     // collocate
     484   627509635 :     ortho_cxyz_to_cxy(lp, k1, k2, cmax, pol, cxyz, cxy);
     485   627509635 :     ortho_cxy_to_grid(lp, kg1, kg2, cmax, pol, map, sections, npts_local,
     486             :                       sphere_bounds_iter, cxy, grid);
     487             : #else
     488             :     // integrate
     489   598713045 :     ortho_cxy_to_grid(lp, kg1, kg2, cmax, pol, map, sections, npts_local,
     490             :                       sphere_bounds_iter, cxy, grid);
     491   598713045 :     ortho_cxyz_to_cxy(lp, k1, k2, cmax, pol, cxyz, cxy);
     492             : #endif
     493             :   }
     494   119725623 : }
     495             : 
     496             : /*******************************************************************************
     497             :  * \brief Collocates coefficients C_i onto the grid for general case.
     498             :  * \author Ole Schuett
     499             :  ******************************************************************************/
     500  2115702617 : static inline void __attribute__((always_inline)) general_ci_to_grid(
     501             :     const int lp, const int jg, const int kg, const int ismin, const int ismax,
     502             :     const int npts_local[3], const int index_min[3], const int index_max[3],
     503             :     const int map_i[], const int sections_i[], const double gp[3], const int k,
     504             :     const int j, const double exp_ij[], const double exp_jk[],
     505             :     const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *ci,
     506             :     GRID_CONST_WHEN_INTEGRATE double *grid) {
     507             : 
     508  2115702617 :   const int base = kg * npts_local[1] * npts_local[0] + jg * npts_local[0];
     509             : 
     510             :   // AVX instructions can only load/store from evenly spaced memory locations.
     511             :   // Since the cube can wrap around due to the grid's periodicity,
     512             :   // the inner loop runs over sections with homogeneous cube to grid mapping.
     513  6246443782 :   for (int istart = ismin; istart <= ismax; istart++) {
     514  4130741165 :     const int istop = imin(ismax, istart + sections_i[istart - index_min[0]]);
     515  4130741165 :     if (map_i[istart - index_min[0]] < 0) {
     516           0 :       istart = istop; // skip over out-of-bounds indicies
     517           0 :       continue;
     518             :     }
     519             : 
     520  4130741165 :     const int cube2grid = map_i[istart - index_min[0]] - istart;
     521 39997123487 :     for (int i = istart; i <= istop; i++) {
     522 35866382322 :       const int ig = i + cube2grid;
     523 35866382322 :       const double di = i - gp[0];
     524             : 
     525 35866382322 :       const int stride_i = index_max[0] - index_min[0] + 1;
     526 35866382322 :       const int stride_j = index_max[1] - index_min[1] + 1;
     527 35866382322 :       const int stride_k = index_max[2] - index_min[2] + 1;
     528 35866382322 :       const int idx_ij = (j - index_min[1]) * stride_i + i - index_min[0];
     529 35866382322 :       const int idx_jk = (k - index_min[2]) * stride_j + j - index_min[1];
     530 35866382322 :       const int idx_ki = (i - index_min[0]) * stride_k + k - index_min[2];
     531             : 
     532             :       // Mathieu's trick: Calculate 3D Gaussian from three precomputed 2D tables
     533             :       //
     534             :       // r   =  (i-gp[0])*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
     535             :       //     =  a                 + b                 + c
     536             :       //
     537             :       // r**2  =  (a + b + c)**2  =  a**2 + b**2 + c**2 + 2ab + 2bc + 2ca
     538             :       //
     539             :       // exp(-r**2)  =  exp(-a(a+2b)) * exp(-b*(b+2c)) * exp(-c*(c+2a))
     540             :       //
     541 35866382322 :       const double gaussian = exp_ij[idx_ij] * exp_jk[idx_jk] * exp_ki[idx_ki];
     542             : 
     543 35866382322 :       const int grid_index = base + ig; // [kg, jg, ig]
     544 35866382322 :       double dip = gaussian;
     545             : 
     546             : #if (GRID_DO_COLLOCATE)
     547             :       // collocate
     548 20204783992 :       double reg = 0.0;
     549 93883410176 :       for (int il = 0; il <= lp; il++) {
     550 73678626184 :         reg += ci[il] * dip;
     551 73678626184 :         dip *= di;
     552             :       }
     553 20204783992 :       grid[grid_index] += reg;
     554             : #else
     555             :       // integrate
     556 15661598330 :       const double reg = grid[grid_index];
     557 72791092482 :       for (int il = 0; il <= lp; il++) {
     558 57129494152 :         ci[il] += reg * dip;
     559 57129494152 :         dip *= di;
     560             :       }
     561             : #endif
     562             :     }
     563             :     istart = istop;
     564             :   }
     565             : }
     566             : 
     567             : /*******************************************************************************
     568             :  * \brief Transforms coefficients C_ij into C_i by fixing grid index j.
     569             :  * \author Ole Schuett
     570             :  ******************************************************************************/
     571             : static inline void __attribute__((always_inline))
     572             : general_cij_to_ci(const int lp, const double dj,
     573             :                   GRID_CONST_WHEN_COLLOCATE double *cij,
     574             :                   GRID_CONST_WHEN_INTEGRATE double *ci) {
     575             :   double djp = 1.0;
     576 13274684586 :   for (int jl = 0; jl <= lp; jl++) {
     577 27415009748 :     for (int il = 0; il <= lp - jl; il++) {
     578 19656612526 :       const int cij_index = jl * (lp + 1) + il; // [jl, il]
     579             : #if (GRID_DO_COLLOCATE)
     580 10937251700 :       ci[il] += cij[cij_index] * djp; // collocate
     581             : #else
     582  8719360826 :       cij[cij_index] += ci[il] * djp; // integrate
     583             : #endif
     584             :     }
     585  7758397222 :     djp *= dj;
     586             :   }
     587             : }
     588             : 
     589             : /*******************************************************************************
     590             :  * \brief Loop body of general_cij_to_grid to be inlined for low values of lp.
     591             :  * \author Ole Schuett
     592             :  ******************************************************************************/
     593   923802665 : static inline void __attribute__((always_inline)) general_cij_to_grid_low(
     594             :     const int lp, const int jg, const int kg, const int ismin, const int ismax,
     595             :     const int npts_local[3], const int index_min[3], const int index_max[3],
     596             :     const int map_i[], const int sections_i[], const double gp[3], const int k,
     597             :     const int j, const double exp_ij[], const double exp_jk[],
     598             :     const double exp_ki[], const double dj, double *ci,
     599             :     GRID_CONST_WHEN_COLLOCATE double *cij,
     600             :     GRID_CONST_WHEN_INTEGRATE double *grid) {
     601             : 
     602             : #if (GRID_DO_COLLOCATE)
     603             :   // collocate
     604  1191899952 :   general_cij_to_ci(lp, dj, cij, ci);
     605  1191899952 :   general_ci_to_grid(lp, jg, kg, ismin, ismax, npts_local, index_min, index_max,
     606             :                      map_i, sections_i, gp, k, j, exp_ij, exp_jk, exp_ki, ci,
     607             :                      grid);
     608             : #else
     609             :   // integrate
     610  4324387412 :   general_ci_to_grid(lp, jg, kg, ismin, ismax, npts_local, index_min, index_max,
     611             :                      map_i, sections_i, gp, k, j, exp_ij, exp_jk, exp_ki, ci,
     612             :                      grid);
     613             :   general_cij_to_ci(lp, dj, cij, ci);
     614             : #endif
     615             : }
     616             : 
     617             : /*******************************************************************************
     618             :  * \brief Collocates coefficients C_ij onto the grid for general case.
     619             :  * \author Ole Schuett
     620             :  ******************************************************************************/
     621   126561049 : static inline void general_cij_to_grid(
     622             :     const int lp, const int k, const int kg, const int npts_local[3],
     623             :     const int index_min[3], const int index_max[3], const int map_i[],
     624             :     const int map_j[], const int sections_i[], const int sections_j[],
     625             :     const double dh[3][3], const double gp[3], const double radius,
     626             :     const double exp_ij[], const double exp_jk[], const double exp_ki[],
     627             :     GRID_CONST_WHEN_COLLOCATE double *cij,
     628             :     GRID_CONST_WHEN_INTEGRATE double *grid) {
     629             : 
     630  4045619101 :   for (int j = index_min[1]; j <= index_max[1]; j++) {
     631  3919058052 :     const int jg = map_j[j - index_min[1]];
     632  3919058052 :     if (jg < 0) {
     633         560 :       j += sections_j[j - index_min[1]]; // skip over out-of-bounds indicies
     634         560 :       continue;
     635             :     }
     636             : 
     637             :     //--------------------------------------------------------------------
     638             :     // Find bounds for the inner loop based on a quadratic equation in i.
     639             :     //
     640             :     // The real-space vector from the center of the gaussian to the
     641             :     // grid point i,j,k is given by:
     642             :     //   r = (i-gp[0])*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
     643             :     //
     644             :     // Separating the term that depends on i:
     645             :     //   r = i*dh[0,:] - gp[0]*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
     646             :     //     = i*dh[0,:] + v
     647             :     //
     648             :     // The squared distance works out to:
     649             :     //   r**2 = dh[0,:]**2 * i**2  +  2 * v * dh[0,:] * i  +  v**2
     650             :     //        = a * i**2           +  b * i                +  c
     651             :     //
     652             :     // Solving r**2==radius**2 for i yields:
     653             :     //    d =  b**2  -  4 * a * (c - radius**2)
     654             :     //    i = (-b \pm sqrt(d)) / (2*a)
     655             :     //
     656             :     double a = 0.0, b = 0.0, c = 0.0;
     657 15676229968 :     for (int i = 0; i < 3; i++) {
     658 11757172476 :       const double v = (0 - gp[0]) * dh[0][i] + (j - gp[1]) * dh[1][i] +
     659 11757172476 :                        (k - gp[2]) * dh[2][i];
     660 11757172476 :       a += dh[0][i] * dh[0][i];
     661 11757172476 :       b += 2.0 * v * dh[0][i];
     662 11757172476 :       c += v * v;
     663             :     }
     664  3919057492 :     const double d = b * b - 4.0 * a * (c - radius * radius);
     665             : 
     666  3919057492 :     if (0.0 < d) {
     667  2115702617 :       const double sqrt_d = sqrt(d);
     668  2115702617 :       const double inv_2a = 1.0 / (2.0 * a);
     669  2115702617 :       const int ismin = (int)ceil((-b - sqrt_d) * inv_2a);
     670  2115702617 :       const int ismax = (int)floor((-b + sqrt_d) * inv_2a);
     671  2115702617 :       const double dj = j - gp[1];
     672             : 
     673  2115702617 :       double ci[lp + 1];
     674  2115702617 :       memset(ci, 0, sizeof(ci));
     675             : 
     676             :       // Generate separate branches for low values of lp.
     677  2115702617 :       if (lp <= GRID_MAX_LP_OPTIMIZED) {
     678             :         GRID_PRAGMA_UNROLL(GRID_MAX_LP_OPTIMIZED + 1)
     679 23272728787 :         for (int ilp = 0; ilp <= GRID_MAX_LP_OPTIMIZED; ilp++) {
     680 21157026170 :           if (lp == ilp) {
     681 22080828835 :             general_cij_to_grid_low(ilp, jg, kg, ismin, ismax, npts_local,
     682             :                                     index_min, index_max, map_i, sections_i, gp,
     683             :                                     k, j, exp_ij, exp_jk, exp_ki, dj, ci, cij,
     684             :                                     grid);
     685             :           }
     686             :         }
     687             :       } else {
     688  2115702617 :         general_cij_to_grid_low(lp, jg, kg, ismin, ismax, npts_local, index_min,
     689             :                                 index_max, map_i, sections_i, gp, k, j, exp_ij,
     690             :                                 exp_jk, exp_ki, dj, ci, cij, grid);
     691             :       }
     692             :     }
     693             :   }
     694   126561049 : }
     695             : 
     696             : /*******************************************************************************
     697             :  * \brief Transforms coefficients C_ijk into C_ij by fixing grid index k.
     698             :  * \author Ole Schuett
     699             :  ******************************************************************************/
     700   126561049 : static inline void general_cijk_to_cij(const int lp, const double dk,
     701             :                                        GRID_CONST_WHEN_COLLOCATE double *cijk,
     702             :                                        GRID_CONST_WHEN_INTEGRATE double *cij) {
     703   126561049 :   double dkp = 1.0;
     704   588411663 :   for (int kl = 0; kl <= lp; kl++) {
     705  1629035284 :     for (int jl = 0; jl <= lp - kl; jl++) {
     706  3640321156 :       for (int il = 0; il <= lp - kl - jl; il++) {
     707  2473136486 :         const int cij_index = jl * (lp + 1) + il; // [jl, il]
     708  2473136486 :         const int cijk_index =
     709  2473136486 :             kl * (lp + 1) * (lp + 1) + jl * (lp + 1) + il; // [kl, jl, il]
     710             : #if (GRID_DO_COLLOCATE)
     711  1338369120 :         cij[cij_index] += cijk[cijk_index] * dkp; // collocate
     712             : #else
     713  1134767366 :         cijk[cijk_index] += cij[cij_index] * dkp; // integrate
     714             : #endif
     715             :       }
     716             :     }
     717   461850614 :     dkp *= dk;
     718             :   }
     719   126561049 : }
     720             : 
     721             : /*******************************************************************************
     722             :  * \brief Precompute mapping of grid indices and its homogeneous sections.
     723             :  * \author Ole Schuett
     724             :  ******************************************************************************/
     725             : static inline void
     726    16002378 : general_precompute_mapping(const int index_min, const int index_max,
     727             :                            const int shift_local, const int npts_global,
     728             :                            const int bounds[2], int map[], int sections[]) {
     729             : 
     730             :   // Precompute mapping from continous grid indices to pbc wraped.
     731   439491535 :   for (int k = index_min; k <= index_max; k++) {
     732   423489157 :     const int kg = modulo(k - shift_local, npts_global);
     733   423489157 :     if (bounds[0] <= kg && kg <= bounds[1]) {
     734   423484736 :       map[k - index_min] = kg;
     735             :     } else {
     736        4421 :       map[k - index_min] = INT_MIN; // out of bounds - not mapped
     737             :     }
     738             :   }
     739             : 
     740             :   // Precompute length of sections with homogeneous cube to grid mapping.
     741    16002378 :   const int range = index_max - index_min + 1;
     742   439491535 :   for (int kg = range - 1; kg >= 0; kg--) {
     743   423489157 :     if (kg == range - 1 || map[kg] != map[kg + 1] - 1) {
     744    40428723 :       sections[kg] = 0;
     745             :     } else {
     746   383060434 :       sections[kg] = sections[kg + 1] + 1;
     747             :     }
     748             :   }
     749    16002378 : }
     750             : 
     751             : /*******************************************************************************
     752             :  * \brief Fill one of the 2D tables that speedup 3D Gaussian (Mathieu's trick).
     753             :  * \author Ole Schuett
     754             :  ******************************************************************************/
     755             : static inline void
     756    16002378 : general_fill_exp_table(const int idir, const int jdir, const int index_min[3],
     757             :                        const int index_max[3], const double zetp,
     758             :                        const double dh[3][3], const double gp[3],
     759             :                        double exp_table[]) {
     760             : 
     761    16002378 :   const int stride_i = index_max[idir] - index_min[idir] + 1;
     762    16002378 :   const double h_ii = dh[idir][0] * dh[idir][0] + dh[idir][1] * dh[idir][1] +
     763    16002378 :                       dh[idir][2] * dh[idir][2];
     764    16002378 :   const double h_ij = dh[idir][0] * dh[jdir][0] + dh[idir][1] * dh[jdir][1] +
     765    16002378 :                       dh[idir][2] * dh[jdir][2];
     766             : 
     767   439491535 :   for (int i = index_min[idir]; i <= index_max[idir]; i++) {
     768   423489157 :     const double di = i - gp[idir];
     769   423489157 :     const double rii = di * di * h_ii;
     770   423489157 :     const double rij_unit = di * h_ij;
     771   423489157 :     const double exp_ij_unit = exp(-zetp * 2.0 * rij_unit);
     772             : 
     773             :     // compute exponentials symmetrically around cube center
     774   423489157 :     const int j_center = (int)gp[jdir];
     775   423489157 :     const double dj_center = j_center - gp[jdir];
     776   423489157 :     const double rij_center = dj_center * rij_unit;
     777   423489157 :     const double exp_ij_center = exp(-zetp * (rii + 2.0 * rij_center));
     778             : 
     779             :     // above center
     780   423489157 :     double exp_ij = exp_ij_center;
     781  6870342192 :     for (int j = j_center; j <= index_max[jdir]; j++) {
     782  6446853035 :       const int idx = (j - index_min[jdir]) * stride_i + i - index_min[idir];
     783  6446853035 :       exp_table[idx] = exp_ij; // exp(-zetp * (di*di*h_ii + 2*di*dj*h_ij));
     784  6446853035 :       exp_ij *= exp_ij_unit;
     785             :     }
     786             : 
     787             :     // below center
     788   423489157 :     const double exp_ij_unit_inv = 1.0 / exp_ij_unit;
     789   423489157 :     exp_ij = exp_ij_center * exp_ij_unit_inv;
     790  6409021220 :     for (int j = j_center - 1; j >= index_min[jdir]; j--) {
     791  5985532063 :       const int idx = (j - index_min[jdir]) * stride_i + i - index_min[idir];
     792  5985532063 :       exp_table[idx] = exp_ij; // exp(-zetp * (di*di*h_ii + 2*di*dj*h_ij));
     793  5985532063 :       exp_ij *= exp_ij_unit_inv;
     794             :     }
     795             :   }
     796    16002378 : }
     797             : 
     798             : /*******************************************************************************
     799             :  * \brief Collocates coefficients C_ijk onto the grid for general case.
     800             :  * \author Ole Schuett
     801             :  ******************************************************************************/
     802             : static inline void
     803     5334126 : general_cijk_to_grid(const int border_mask, const int lp, const double zetp,
     804             :                      const double dh[3][3], const double dh_inv[3][3],
     805             :                      const double rp[3], const int npts_global[3],
     806             :                      const int npts_local[3], const int shift_local[3],
     807             :                      const int border_width[3], const double radius,
     808             :                      GRID_CONST_WHEN_COLLOCATE double *cijk,
     809     5334126 :                      GRID_CONST_WHEN_INTEGRATE double *grid) {
     810             : 
     811             :   // Default for border_mask == 0.
     812     5334126 :   int bounds_i[2] = {0, npts_local[0] - 1};
     813     5334126 :   int bounds_j[2] = {0, npts_local[1] - 1};
     814     5334126 :   int bounds_k[2] = {0, npts_local[2] - 1};
     815             : 
     816             :   // See also rs_find_node() in task_list_methods.F.
     817             :   // If the bit is set then we need to exclude the border in that direction.
     818     5334126 :   if (border_mask & (1 << 0))
     819      102385 :     bounds_i[0] += border_width[0];
     820     5334126 :   if (border_mask & (1 << 1))
     821      102385 :     bounds_i[1] -= border_width[0];
     822     5334126 :   if (border_mask & (1 << 2))
     823      102385 :     bounds_j[0] += border_width[1];
     824     5334126 :   if (border_mask & (1 << 3))
     825      102385 :     bounds_j[1] -= border_width[1];
     826     5334126 :   if (border_mask & (1 << 4))
     827      102377 :     bounds_k[0] += border_width[2];
     828     5334126 :   if (border_mask & (1 << 5))
     829      102385 :     bounds_k[1] -= border_width[2];
     830             : 
     831             :   // center in grid coords
     832             :   // gp = MATMUL(dh_inv, rp)
     833     5334126 :   double gp[3] = {0.0, 0.0, 0.0};
     834    21336504 :   for (int i = 0; i < 3; i++) {
     835    64009512 :     for (int j = 0; j < 3; j++) {
     836    48007134 :       gp[i] += dh_inv[j][i] * rp[j];
     837             :     }
     838             :   }
     839             : 
     840             :   // Get the min max indices that contain at least the cube that contains a
     841             :   // sphere around rp of radius radius if the cell is very non-orthogonal this
     842             :   // implies that many useless points are included this estimate can be improved
     843             :   // (i.e. not box but sphere should be used)
     844     5334126 :   int index_min[3] = {INT_MAX, INT_MAX, INT_MAX};
     845     5334126 :   int index_max[3] = {INT_MIN, INT_MIN, INT_MIN};
     846    21336504 :   for (int i = -1; i <= 1; i++) {
     847    64009512 :     for (int j = -1; j <= 1; j++) {
     848   192028536 :       for (int k = -1; k <= 1; k++) {
     849   144021402 :         const double x = rp[0] + i * radius;
     850   144021402 :         const double y = rp[1] + j * radius;
     851   144021402 :         const double z = rp[2] + k * radius;
     852   576085608 :         for (int idir = 0; idir < 3; idir++) {
     853   432064206 :           const double resc =
     854   432064206 :               dh_inv[0][idir] * x + dh_inv[1][idir] * y + dh_inv[2][idir] * z;
     855   432064206 :           index_min[idir] = imin(index_min[idir], (int)floor(resc));
     856   432064206 :           index_max[idir] = imax(index_max[idir], (int)ceil(resc));
     857             :         }
     858             :       }
     859             :     }
     860             :   }
     861             : 
     862             :   // Precompute mappings
     863     5334126 :   const int range_i = index_max[0] - index_min[0] + 1;
     864     5334126 :   int map_i[range_i], sections_i[range_i];
     865     5334126 :   general_precompute_mapping(index_min[0], index_max[0], shift_local[0],
     866             :                              npts_global[0], bounds_i, map_i, sections_i);
     867     5334126 :   const int range_j = index_max[1] - index_min[1] + 1;
     868     5334126 :   int map_j[range_j], sections_j[range_j];
     869     5334126 :   general_precompute_mapping(index_min[1], index_max[1], shift_local[1],
     870             :                              npts_global[1], bounds_j, map_j, sections_j);
     871     5334126 :   const int range_k = index_max[2] - index_min[2] + 1;
     872     5334126 :   int map_k[range_k], sections_k[range_k];
     873     5334126 :   general_precompute_mapping(index_min[2], index_max[2], shift_local[2],
     874             :                              npts_global[2], bounds_k, map_k, sections_k);
     875             : 
     876             :   // Precompute exponentials
     877     5334126 :   double exp_ij[range_i * range_j];
     878     5334126 :   general_fill_exp_table(0, 1, index_min, index_max, zetp, dh, gp, exp_ij);
     879     5334126 :   double exp_jk[range_j * range_k];
     880     5334126 :   general_fill_exp_table(1, 2, index_min, index_max, zetp, dh, gp, exp_jk);
     881     5334126 :   double exp_ki[range_k * range_i];
     882     5334126 :   general_fill_exp_table(2, 0, index_min, index_max, zetp, dh, gp, exp_ki);
     883             : 
     884             :   // go over the grid, but cycle if the point is not within the radius
     885     5334126 :   const int cij_size = (lp + 1) * (lp + 1);
     886     5334126 :   double cij[cij_size];
     887   131899556 :   for (int k = index_min[2]; k <= index_max[2]; k++) {
     888   126565430 :     const int kg = map_k[k - index_min[2]];
     889   126565430 :     if (kg < 0) {
     890        4381 :       k += sections_k[k - index_min[2]]; // skip over out-of-bounds indicies
     891        4381 :       continue;
     892             :     }
     893             : 
     894             :     // zero coef_xyt
     895   126561049 :     memset(cij, 0, cij_size * sizeof(double));
     896             : 
     897             : #if (GRID_DO_COLLOCATE)
     898             :     // collocate
     899    70726726 :     general_cijk_to_cij(lp, (double)k - gp[2], cijk, cij);
     900    70726726 :     general_cij_to_grid(lp, k, kg, npts_local, index_min, index_max, map_i,
     901             :                         map_j, sections_i, sections_j, dh, gp, radius, exp_ij,
     902             :                         exp_jk, exp_ki, cij, grid);
     903             : #else
     904             :     // integrate
     905    55834323 :     general_cij_to_grid(lp, k, kg, npts_local, index_min, index_max, map_i,
     906             :                         map_j, sections_i, sections_j, dh, gp, radius, exp_ij,
     907             :                         exp_jk, exp_ki, cij, grid);
     908    55834323 :     general_cijk_to_cij(lp, (double)k - gp[2], cijk, cij);
     909             : #endif
     910             :   }
     911     5334126 : }
     912             : 
     913             : /*******************************************************************************
     914             :  * \brief Transforms coefficients C_xyz into C_ijk.
     915             :  * \author Ole Schuett
     916             :  ******************************************************************************/
     917             : static inline void
     918     5334126 : general_cxyz_to_cijk(const int lp, const double dh[3][3],
     919             :                      GRID_CONST_WHEN_COLLOCATE double *cxyz,
     920     5334126 :                      GRID_CONST_WHEN_INTEGRATE double *cijk) {
     921             : 
     922             :   // transform P_{lxp,lyp,lzp} into a P_{lip,ljp,lkp} such that
     923             :   // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-x_p)**lxp (y-y_p)**lyp (z-z_p)**lzp =
     924             :   // sum_{lip,ljp,lkp} P_{lip,ljp,lkp} (i-i_p)**lip (j-j_p)**ljp (k-k_p)**lkp
     925             : 
     926             :   // transform using multinomials
     927     5334126 :   double hmatgridp[lp + 1][3][3];
     928    21336504 :   for (int i = 0; i < 3; i++) {
     929    64009512 :     for (int j = 0; j < 3; j++) {
     930    48007134 :       hmatgridp[0][j][i] = 1.0;
     931   172359540 :       for (int k = 1; k <= lp; k++) {
     932   124352406 :         hmatgridp[k][j][i] = hmatgridp[k - 1][j][i] * dh[j][i];
     933             :       }
     934             :     }
     935             :   }
     936             : 
     937             :   const int lpx = lp;
     938    24485186 :   for (int klx = 0; klx <= lpx; klx++) {
     939    67222056 :     for (int jlx = 0; jlx <= lpx - klx; jlx++) {
     940   149583252 :       for (int ilx = 0; ilx <= lpx - klx - jlx; ilx++) {
     941   101512256 :         const int lx = ilx + jlx + klx;
     942   101512256 :         const int lpy = lp - lx;
     943   294561028 :         for (int kly = 0; kly <= lpy; kly++) {
     944   534824481 :           for (int jly = 0; jly <= lpy - kly; jly++) {
     945   915790728 :             for (int ily = 0; ily <= lpy - kly - jly; ily++) {
     946   574015019 :               const int ly = ily + jly + kly;
     947   574015019 :               const int lpz = lp - lx - ly;
     948  1499417132 :               for (int klz = 0; klz <= lpz; klz++) {
     949  2368797961 :                 for (int jlz = 0; jlz <= lpz - klz; jlz++) {
     950  3633650942 :                   for (int ilz = 0; ilz <= lpz - klz - jlz; ilz++) {
     951  2190255094 :                     const int lz = ilz + jlz + klz;
     952  2190255094 :                     const int il = ilx + ily + ilz;
     953  2190255094 :                     const int jl = jlx + jly + jlz;
     954  2190255094 :                     const int kl = klx + kly + klz;
     955  2190255094 :                     const int lp1 = lp + 1;
     956  2190255094 :                     const int cijk_index =
     957  2190255094 :                         kl * lp1 * lp1 + jl * lp1 + il; // [kl,jl,il]
     958  2190255094 :                     const int cxyz_index =
     959  2190255094 :                         lz * lp1 * lp1 + ly * lp1 + lx; // [lz,ly,lx]
     960  2190255094 :                     const double p =
     961  2190255094 :                         hmatgridp[ilx][0][0] * hmatgridp[jlx][1][0] *
     962  2190255094 :                         hmatgridp[klx][2][0] * hmatgridp[ily][0][1] *
     963  2190255094 :                         hmatgridp[jly][1][1] * hmatgridp[kly][2][1] *
     964  2190255094 :                         hmatgridp[ilz][0][2] * hmatgridp[jlz][1][2] *
     965  2190255094 :                         hmatgridp[klz][2][2] * fac(lx) * fac(ly) * fac(lz) /
     966  2190255094 :                         (fac(ilx) * fac(ily) * fac(ilz) * fac(jlx) * fac(jly) *
     967  2190255094 :                          fac(jlz) * fac(klx) * fac(kly) * fac(klz));
     968             : #if (GRID_DO_COLLOCATE)
     969   959877714 :                     cijk[cijk_index] += cxyz[cxyz_index] * p; // collocate
     970             : #else
     971  1230377380 :                     cxyz[cxyz_index] += cijk[cijk_index] * p; // integrate
     972             : #endif
     973             :                   }
     974             :                 }
     975             :               }
     976             :             }
     977             :           }
     978             :         }
     979             :       }
     980             :     }
     981             :   }
     982     5334126 : }
     983             : 
     984             : /*******************************************************************************
     985             :  * \brief Collocates coefficients C_xyz onto the grid for general case.
     986             :  * \author Ole Schuett
     987             :  ******************************************************************************/
     988             : static inline void
     989     5334126 : general_cxyz_to_grid(const int border_mask, const int lp, const double zetp,
     990             :                      const double dh[3][3], const double dh_inv[3][3],
     991             :                      const double rp[3], const int npts_global[3],
     992             :                      const int npts_local[3], const int shift_local[3],
     993             :                      const int border_width[3], const double radius,
     994             :                      GRID_CONST_WHEN_COLLOCATE double *cxyz,
     995     5334126 :                      GRID_CONST_WHEN_INTEGRATE double *grid) {
     996             : 
     997     5334126 :   const size_t cijk_size = (lp + 1) * (lp + 1) * (lp + 1);
     998     5334126 :   double cijk[cijk_size];
     999     5334126 :   memset(cijk, 0, cijk_size * sizeof(double));
    1000             : 
    1001             : #if (GRID_DO_COLLOCATE)
    1002             :   // collocate
    1003     2954622 :   general_cxyz_to_cijk(lp, dh, cxyz, cijk);
    1004     2954622 :   general_cijk_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
    1005             :                        npts_local, shift_local, border_width, radius, cijk,
    1006             :                        grid);
    1007             : #else
    1008             :   // integrate
    1009     2379504 :   general_cijk_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
    1010             :                        npts_local, shift_local, border_width, radius, cijk,
    1011             :                        grid);
    1012     2379504 :   general_cxyz_to_cijk(lp, dh, cxyz, cijk);
    1013             : #endif
    1014     5334126 : }
    1015             : 
    1016             : /*******************************************************************************
    1017             :  * \brief Collocates coefficients C_xyz onto the grid.
    1018             :  * \author Ole Schuett
    1019             :  ******************************************************************************/
    1020             : static inline void
    1021   125059749 : cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp,
    1022             :              const double zetp, const double dh[3][3],
    1023             :              const double dh_inv[3][3], const double rp[3],
    1024             :              const int npts_global[3], const int npts_local[3],
    1025             :              const int shift_local[3], const int border_width[3],
    1026             :              const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz,
    1027             :              GRID_CONST_WHEN_INTEGRATE double *grid) {
    1028             : 
    1029   125059749 :   enum grid_library_kernel k;
    1030   125059749 :   if (orthorhombic && border_mask == 0) {
    1031   119725623 :     k = (GRID_DO_COLLOCATE) ? GRID_COLLOCATE_ORTHO : GRID_INTEGRATE_ORTHO;
    1032   119725623 :     ortho_cxyz_to_grid(lp, zetp, dh, dh_inv, rp, npts_global, npts_local,
    1033             :                        shift_local, radius, cxyz, grid);
    1034             :   } else {
    1035     5334126 :     k = (GRID_DO_COLLOCATE) ? GRID_COLLOCATE_GENERAL : GRID_INTEGRATE_GENERAL;
    1036     5334126 :     general_cxyz_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
    1037             :                          npts_local, shift_local, border_width, radius, cxyz,
    1038             :                          grid);
    1039             :   }
    1040   125059749 :   grid_library_counter_add(lp, GRID_BACKEND_CPU, k, 1);
    1041   125059749 : }
    1042             : 
    1043             : /*******************************************************************************
    1044             :  * \brief Transforms coefficients C_ab into C_xyz.
    1045             :  * \author Ole Schuett
    1046             :  ******************************************************************************/
    1047   125059749 : static inline void cab_to_cxyz(const int la_max, const int la_min,
    1048             :                                const int lb_max, const int lb_min,
    1049             :                                const double prefactor, const double ra[3],
    1050             :                                const double rb[3], const double rp[3],
    1051             :                                GRID_CONST_WHEN_COLLOCATE double *cab,
    1052   125059749 :                                GRID_CONST_WHEN_INTEGRATE double *cxyz) {
    1053             : 
    1054             :   // Computes the polynomial expansion coefficients:
    1055             :   //     (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,lxa,lxb,1)*(x-p)**ls
    1056   125059749 :   const int lp = la_max + lb_max;
    1057   125059749 :   double alpha[3][lb_max + 1][la_max + 1][lp + 1];
    1058   125059749 :   memset(alpha, 0, 3 * (lb_max + 1) * (la_max + 1) * (lp + 1) * sizeof(double));
    1059             : 
    1060   500238996 :   for (int i = 0; i < 3; i++) {
    1061   375179247 :     const double drpa = rp[i] - ra[i];
    1062   375179247 :     const double drpb = rp[i] - rb[i];
    1063  1058057160 :     for (int lxa = 0; lxa <= la_max; lxa++) {
    1064  1957520067 :       for (int lxb = 0; lxb <= lb_max; lxb++) {
    1065             :         double binomial_k_lxa = 1.0;
    1066             :         double a = 1.0;
    1067  3345978606 :         for (int k = 0; k <= lxa; k++) {
    1068             :           double binomial_l_lxb = 1.0;
    1069             :           double b = 1.0;
    1070  5409507702 :           for (int l = 0; l <= lxb; l++) {
    1071  3338171250 :             alpha[i][lxb][lxa][lxa - l + lxb - k] +=
    1072  3338171250 :                 binomial_k_lxa * binomial_l_lxb * a * b;
    1073  3338171250 :             binomial_l_lxb *= ((double)(lxb - l)) / ((double)(l + 1));
    1074  3338171250 :             b *= drpb;
    1075             :           }
    1076  2071336452 :           binomial_k_lxa *= ((double)(lxa - k)) / ((double)(k + 1));
    1077  2071336452 :           a *= drpa;
    1078             :         }
    1079             :       }
    1080             :     }
    1081             :   }
    1082             : 
    1083             :   //   *** initialise the coefficient matrix, we transform the sum
    1084             :   //
    1085             :   // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
    1086             :   //         (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
    1087             :   //         (z-a_z)**lza
    1088             :   //
    1089             :   // into
    1090             :   //
    1091             :   // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
    1092             :   //
    1093             :   // where p is center of the product gaussian, and lp = la_max + lb_max
    1094             :   // (current implementation is l**7)
    1095             :   //
    1096             : 
    1097   342440474 :   for (int lzb = 0; lzb <= lb_max; lzb++) {
    1098   642261443 :     for (int lza = 0; lza <= la_max; lza++) {
    1099  1091539131 :       for (int lyb = 0; lyb <= lb_max - lzb; lyb++) {
    1100  1779382163 :         for (int lya = 0; lya <= la_max - lza; lya++) {
    1101  1112723750 :           const int lxb_min = imax(lb_min - lzb - lyb, 0);
    1102  1112723750 :           const int lxa_min = imax(la_min - lza - lya, 0);
    1103  2633033005 :           for (int lxb = lxb_min; lxb <= lb_max - lzb - lyb; lxb++) {
    1104  3741174127 :             for (int lxa = lxa_min; lxa <= la_max - lza - lya; lxa++) {
    1105  2220864872 :               const int ico = coset(lxa, lya, lza);
    1106  2220864872 :               const int jco = coset(lxb, lyb, lzb);
    1107  2220864872 :               const int cab_index = jco * ncoset(la_max) + ico; // [jco, ico]
    1108  6411363376 :               for (int lzp = 0; lzp <= lza + lzb; lzp++) {
    1109 18206294258 :                 for (int lyp = 0; lyp <= lp - lza - lzb; lyp++) {
    1110 48907489016 :                   for (int lxp = 0; lxp <= lp - lza - lzb - lyp; lxp++) {
    1111 34891693262 :                     const double p = alpha[0][lxb][lxa][lxp] *
    1112 34891693262 :                                      alpha[1][lyb][lya][lyp] *
    1113 34891693262 :                                      alpha[2][lzb][lza][lzp] * prefactor;
    1114 34891693262 :                     const int lp1 = lp + 1;
    1115 34891693262 :                     const int cxyz_index =
    1116 34891693262 :                         lzp * lp1 * lp1 + lyp * lp1 + lxp; // [lzp, lyp, lxp]
    1117             : #if (GRID_DO_COLLOCATE)
    1118 14733727518 :                     cxyz[cxyz_index] += cab[cab_index] * p; // collocate
    1119             : #else
    1120 20157965744 :                     cab[cab_index] += cxyz[cxyz_index] * p; // integrate
    1121             : #endif
    1122             :                   }
    1123             :                 }
    1124             :               }
    1125             :             }
    1126             :           }
    1127             :         }
    1128             :       }
    1129             :     }
    1130             :   }
    1131   125059749 : }
    1132             : 
    1133             : /*******************************************************************************
    1134             :  * \brief Collocates coefficients C_ab onto the grid.
    1135             :  * \author Ole Schuett
    1136             :  ******************************************************************************/
    1137             : static inline void
    1138   170959915 : cab_to_grid(const bool orthorhombic, const int border_mask, const int la_max,
    1139             :             const int la_min, const int lb_max, const int lb_min,
    1140             :             const double zeta, const double zetb, const double rscale,
    1141             :             const double dh[3][3], const double dh_inv[3][3],
    1142             :             const double ra[3], const double rab[3], const int npts_global[3],
    1143             :             const int npts_local[3], const int shift_local[3],
    1144             :             const int border_width[3], const double radius,
    1145             :             GRID_CONST_WHEN_COLLOCATE double *cab,
    1146   170959915 :             GRID_CONST_WHEN_INTEGRATE double *grid) {
    1147             : 
    1148             :   // Check if radius is too small to be mapped onto grid of given resolution.
    1149   170959915 :   double dh_max = 0.0;
    1150   683839660 :   for (int i = 0; i < 3; i++) {
    1151  2051518980 :     for (int j = 0; j < 3; j++) {
    1152  1538639235 :       dh_max = fmax(dh_max, fabs(dh[i][j]));
    1153             :     }
    1154             :   }
    1155   170959915 :   if (2.0 * radius < dh_max) {
    1156    45900166 :     return;
    1157             :   }
    1158             : 
    1159   125059749 :   const double zetp = zeta + zetb;
    1160   125059749 :   const double f = zetb / zetp;
    1161   125059749 :   const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
    1162   125059749 :   const double prefactor = rscale * exp(-zeta * f * rab2);
    1163   125059749 :   double rp[3], rb[3];
    1164   500238996 :   for (int i = 0; i < 3; i++) {
    1165   375179247 :     rp[i] = ra[i] + f * rab[i];
    1166   375179247 :     rb[i] = ra[i] + rab[i];
    1167             :   }
    1168             : 
    1169   125059749 :   const int lp = la_max + lb_max;
    1170   125059749 :   const size_t cxyz_size = (lp + 1) * (lp + 1) * (lp + 1);
    1171   125059749 :   double cxyz[cxyz_size];
    1172   125059749 :   memset(cxyz, 0, cxyz_size * sizeof(double));
    1173             : 
    1174             : #if (GRID_DO_COLLOCATE)
    1175             :   // collocate
    1176    62193292 :   cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
    1177    62193292 :   cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
    1178             :                npts_local, shift_local, border_width, radius, cxyz, grid);
    1179             : #else
    1180             :   // integrate
    1181    62866457 :   cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
    1182             :                npts_local, shift_local, border_width, radius, cxyz, grid);
    1183    62866457 :   cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
    1184             : #endif
    1185             : }
    1186             : 
    1187             : // EOF

Generated by: LCOV version 1.15