LCOV - code coverage report
Current view: top level - src/grid/cpu - grid_cpu_collint.h (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:936074a) Lines: 99.6 % 518 516
Test Date: 2025-12-04 06:27:48 Functions: 100.0 % 13 13

            Line data    Source code
       1              : /*----------------------------------------------------------------------------*/
       2              : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3              : /*  Copyright 2000-2025 CP2K developers group <https://cp2k.org>              */
       4              : /*                                                                            */
       5              : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6              : /*----------------------------------------------------------------------------*/
       7              : 
       8              : #include <assert.h>
       9              : #include <limits.h>
      10              : #include <math.h>
      11              : #include <stdio.h>
      12              : #include <stdlib.h>
      13              : #include <string.h>
      14              : 
      15              : #if defined(__AVX2__) && defined(__FMA__)
      16              : #include <immintrin.h>
      17              : #endif
      18              : 
      19              : #include "../common/grid_common.h"
      20              : #include "../common/grid_library.h"
      21              : #include "../common/grid_sphere_cache.h"
      22              : 
      23              : #define GRID_MAX_LP_OPTIMIZED 9
      24              : 
      25              : #if (GRID_DO_COLLOCATE)
      26              : #define GRID_CONST_WHEN_COLLOCATE const
      27              : #define GRID_CONST_WHEN_INTEGRATE
      28              : #else
      29              : #define GRID_CONST_WHEN_COLLOCATE
      30              : #define GRID_CONST_WHEN_INTEGRATE const
      31              : #endif
      32              : 
      33              : /*******************************************************************************
      34              :  * \brief Simple loop body for ortho_cx_to_grid using plain C.
      35              :  * \author Ole Schuett
      36              :  ******************************************************************************/
      37              : static inline void __attribute__((always_inline))
      38  29440773050 : ortho_cx_to_grid_scalar(const int lp, const int cmax, const int i,
      39  29440773050 :                         const double pol[3][lp + 1][2 * cmax + 1],
      40              :                         GRID_CONST_WHEN_COLLOCATE double *cx,
      41              :                         GRID_CONST_WHEN_INTEGRATE double *grid_0,
      42              :                         GRID_CONST_WHEN_INTEGRATE double *grid_1,
      43              :                         GRID_CONST_WHEN_INTEGRATE double *grid_2,
      44              :                         GRID_CONST_WHEN_INTEGRATE double *grid_3) {
      45              : 
      46              : #if (GRID_DO_COLLOCATE)
      47              :   // collocate
      48  15577564546 :   double reg[4] = {0.0, 0.0, 0.0, 0.0};
      49  >12462*10^7 : #pragma omp simd reduction(+ : reg)
      50              :   for (int lxp = 0; lxp <= lp; lxp++) {
      51  42662510198 :     const double p = pol[0][lxp][i + cmax];
      52  42662510198 :     reg[0] += cx[lxp * 4 + 0] * p;
      53  42662510198 :     reg[1] += cx[lxp * 4 + 1] * p;
      54  42662510198 :     reg[2] += cx[lxp * 4 + 2] * p;
      55  42662510198 :     reg[3] += cx[lxp * 4 + 3] * p;
      56              :   }
      57  15577564546 :   *grid_0 += reg[0];
      58  15577564546 :   *grid_1 += reg[1];
      59  15577564546 :   *grid_2 += reg[2];
      60  15577564546 :   *grid_3 += reg[3];
      61              : 
      62              : #else
      63              :   // integrate
      64  13863208504 :   const double reg[4] = {*grid_0, *grid_1, *grid_2, *grid_3};
      65              : #pragma omp simd
      66  13863208504 :   for (int lxp = 0; lxp <= lp; lxp++) {
      67  38625332652 :     const double p = pol[0][lxp][i + cmax];
      68  38625332652 :     cx[lxp * 4 + 0] += reg[0] * p;
      69  38625332652 :     cx[lxp * 4 + 1] += reg[1] * p;
      70  38625332652 :     cx[lxp * 4 + 2] += reg[2] * p;
      71  38625332652 :     cx[lxp * 4 + 3] += reg[3] * p;
      72              :   }
      73              : #endif
      74              : }
      75              : 
      76              : /*******************************************************************************
      77              :  * \brief Optimized loop body for ortho_cx_to_grid using AVX2 Intel Intrinsics.
      78              :  *        This routine always processes four consecutive grid elements at once.
      79              :  * \author Ole Schuett
      80              :  ******************************************************************************/
      81              : #if defined(__AVX2__) && defined(__FMA__)
      82              : static inline void __attribute__((always_inline))
      83  46966693447 : ortho_cx_to_grid_avx2(const int lp, const int cmax, const int i,
      84  46966693447 :                       const double pol[3][lp + 1][2 * cmax + 1],
      85              :                       GRID_CONST_WHEN_COLLOCATE double *cx,
      86              :                       GRID_CONST_WHEN_INTEGRATE double *grid_0,
      87              :                       GRID_CONST_WHEN_INTEGRATE double *grid_1,
      88              :                       GRID_CONST_WHEN_INTEGRATE double *grid_2,
      89              :                       GRID_CONST_WHEN_INTEGRATE double *grid_3) {
      90              : 
      91  46966693447 :   const int icmax = i + cmax;
      92              : 
      93              : #if (GRID_DO_COLLOCATE)
      94              :   // collocate
      95              :   // First iteration for lxp == 0 does not need add instructions.
      96  25000778973 :   __m256d p_vec = _mm256_loadu_pd(&pol[0][0][icmax]);
      97  25000778973 :   __m256d r_vec_0 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[0]));
      98  25000778973 :   __m256d r_vec_1 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[1]));
      99  25000778973 :   __m256d r_vec_2 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[2]));
     100  25000778973 :   __m256d r_vec_3 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[3]));
     101              : 
     102              :   // Remaining iterations for lxp > 0 use fused multiply adds.
     103              :   GRID_PRAGMA_UNROLL_UP_TO(GRID_MAX_LP_OPTIMIZED)
     104  67812574041 :   for (int lxp = 1; lxp <= lp; lxp++) {
     105  42811795068 :     const double *cx_base = &cx[lxp * 4];
     106  42811795068 :     p_vec = _mm256_loadu_pd(&pol[0][lxp][icmax]);
     107  42811795068 :     r_vec_0 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[0]), r_vec_0);
     108  42811795068 :     r_vec_1 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[1]), r_vec_1);
     109  42811795068 :     r_vec_2 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[2]), r_vec_2);
     110  42811795068 :     r_vec_3 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[3]), r_vec_3);
     111              :   }
     112              : 
     113              :   // Add vectors to grid one at a time, because they can aliase when cube wraps.
     114  25000778973 :   _mm256_storeu_pd(grid_0, _mm256_add_pd(_mm256_loadu_pd(grid_0), r_vec_0));
     115  25000778973 :   _mm256_storeu_pd(grid_1, _mm256_add_pd(_mm256_loadu_pd(grid_1), r_vec_1));
     116  25000778973 :   _mm256_storeu_pd(grid_2, _mm256_add_pd(_mm256_loadu_pd(grid_2), r_vec_2));
     117  25000778973 :   _mm256_storeu_pd(grid_3, _mm256_add_pd(_mm256_loadu_pd(grid_3), r_vec_3));
     118              : 
     119              : #else
     120              :   // integrate
     121  21965914474 :   __m256d grid_vec_0 = _mm256_loadu_pd(grid_0);
     122  21965914474 :   __m256d grid_vec_1 = _mm256_loadu_pd(grid_1);
     123  21965914474 :   __m256d grid_vec_2 = _mm256_loadu_pd(grid_2);
     124  21965914474 :   __m256d grid_vec_3 = _mm256_loadu_pd(grid_3);
     125              : 
     126              :   GRID_PRAGMA_UNROLL_UP_TO(GRID_MAX_LP_OPTIMIZED + 1)
     127  82443258005 :   for (int lxp = 0; lxp <= lp; lxp++) {
     128  60477343531 :     __m256d p_vec = _mm256_loadu_pd(&pol[0][lxp][icmax]);
     129              : 
     130              :     // Do 4 dot products at once. https://stackoverflow.com/a/10454420
     131  60477343531 :     __m256d xy0 = _mm256_mul_pd(p_vec, grid_vec_0);
     132  60477343531 :     __m256d xy1 = _mm256_mul_pd(p_vec, grid_vec_1);
     133  60477343531 :     __m256d xy2 = _mm256_mul_pd(p_vec, grid_vec_2);
     134  60477343531 :     __m256d xy3 = _mm256_mul_pd(p_vec, grid_vec_3);
     135              : 
     136              :     // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13
     137  60477343531 :     __m256d temp01 = _mm256_hadd_pd(xy0, xy1);
     138              : 
     139              :     // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33
     140  60477343531 :     __m256d temp23 = _mm256_hadd_pd(xy2, xy3);
     141              : 
     142              :     // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31
     143  60477343531 :     __m256d swapped = _mm256_permute2f128_pd(temp01, temp23, 0x21);
     144              : 
     145              :     // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33
     146  60477343531 :     __m256d blended = _mm256_blend_pd(temp01, temp23, 0b1100);
     147              : 
     148  60477343531 :     __m256d r_vec = _mm256_add_pd(swapped, blended);
     149              : 
     150              :     // cx += r_vec
     151  60477343531 :     double *cx_base = &cx[lxp * 4];
     152  60477343531 :     _mm256_storeu_pd(cx_base, _mm256_add_pd(r_vec, _mm256_loadu_pd(cx_base)));
     153              :   }
     154              : #endif
     155              : }
     156              : #endif // __AVX2__ && __FMA__
     157              : 
     158              : /*******************************************************************************
     159              :  * \brief Collocates coefficients C_x onto the grid for orthorhombic case.
     160              :  * \author Ole Schuett
     161              :  ******************************************************************************/
     162              : static inline void __attribute__((always_inline))
     163  11825312012 : ortho_cx_to_grid(const int lp, const int kg1, const int kg2, const int jg1,
     164              :                  const int jg2, const int cmax,
     165  11825312012 :                  const double pol[3][lp + 1][2 * cmax + 1],
     166              :                  const int map[3][2 * cmax + 1],
     167              :                  const int sections[3][2 * cmax + 1], const int npts_local[3],
     168              :                  int **sphere_bounds_iter, GRID_CONST_WHEN_COLLOCATE double *cx,
     169              :                  GRID_CONST_WHEN_INTEGRATE double *grid) {
     170              : 
     171              :   // Lower and upper sphere bounds relative to center, ie. in cube coordinates.
     172  11825312012 :   const int lb = *((*sphere_bounds_iter)++);
     173  11825312012 :   const int ub = 1 - lb;
     174              : 
     175              :   // AVX instructions can only load/store from evenly spaced memory locations.
     176              :   // Since the sphere bounds can wrap around due to the grid's periodicity,
     177              :   // the inner loop runs over sections with homogeneous cube to grid mapping.
     178  33299820122 :   for (int istart = lb; istart <= ub; istart++) {
     179  21474508110 :     const int istop = imin(ub, istart + sections[0][istart + cmax]);
     180  21474508110 :     const int cube2grid = map[0][istart + cmax] - istart;
     181              : 
     182  21474508110 :     const int stride = npts_local[1] * npts_local[0];
     183  21474508110 :     const int grid_index_0 = kg1 * stride + jg1 * npts_local[0];
     184  21474508110 :     const int grid_index_1 = kg2 * stride + jg1 * npts_local[0];
     185  21474508110 :     const int grid_index_2 = kg1 * stride + jg2 * npts_local[0];
     186  21474508110 :     const int grid_index_3 = kg2 * stride + jg2 * npts_local[0];
     187  21474508110 :     GRID_CONST_WHEN_INTEGRATE double *grid_base_0 = &grid[grid_index_0];
     188  21474508110 :     GRID_CONST_WHEN_INTEGRATE double *grid_base_1 = &grid[grid_index_1];
     189  21474508110 :     GRID_CONST_WHEN_INTEGRATE double *grid_base_2 = &grid[grid_index_2];
     190  21474508110 :     GRID_CONST_WHEN_INTEGRATE double *grid_base_3 = &grid[grid_index_3];
     191              : 
     192              :     // Use AVX2 to process grid points in chunks of four, ie. 256 bit vectors.
     193              : #if defined(__AVX2__) && defined(__FMA__)
     194  21474508110 :     const int istop_vec = istart + 4 * ((istop - istart + 1) / 4) - 1;
     195  68441201557 :     for (int i = istart; i <= istop_vec; i += 4) {
     196  46966693447 :       const int ig = i + cube2grid;
     197  46966693447 :       ortho_cx_to_grid_avx2(lp, cmax, i, pol, cx, &grid_base_0[ig],
     198              :                             &grid_base_1[ig], &grid_base_2[ig],
     199  46966693447 :                             &grid_base_3[ig]);
     200              :     }
     201  50915281160 :     istart = istop_vec + 1;
     202              : #endif
     203              : 
     204              :     // Process up to 3 remaining points - or everything if AVX2 isn't available.
     205  50915281160 :     for (int i = istart; i <= istop; i++) {
     206  29440773050 :       const int ig = i + cube2grid;
     207  29440773050 :       ortho_cx_to_grid_scalar(lp, cmax, i, pol, cx, &grid_base_0[ig],
     208              :                               &grid_base_1[ig], &grid_base_2[ig],
     209  29440773050 :                               &grid_base_3[ig]);
     210              :     }
     211  21474508110 :     istart = istop;
     212              :   }
     213              : }
     214              : 
     215              : /*******************************************************************************
     216              :  * \brief Transforms coefficients C_xy into C_x by fixing grid index j.
     217              :  * \author Ole Schuett
     218              :  ******************************************************************************/
     219              : static inline void __attribute__((always_inline))
     220  11825312012 : ortho_cxy_to_cx(const int lp, const int j1, const int j2, const int cmax,
     221  11825312012 :                 const double pol[3][lp + 1][2 * cmax + 1],
     222              :                 GRID_CONST_WHEN_COLLOCATE double *cxy,
     223              :                 GRID_CONST_WHEN_INTEGRATE double *cx) {
     224              : 
     225  43815662973 :   for (int lyp = 0; lyp <= lp; lyp++) {
     226  >10076*10^7 :     for (int lxp = 0; lxp <= lp - lyp; lxp++) {
     227  68774629172 :       const double p1 = pol[1][lyp][j1 + cmax];
     228  68774629172 :       const double p2 = pol[1][lyp][j2 + cmax];
     229  68774629172 :       const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2; // [lyp, lxp, 0]
     230              : 
     231              : #if (GRID_DO_COLLOCATE)
     232              :       // collocate
     233  35756188737 :       cx[lxp * 4 + 0] += cxy[cxy_index + 0] * p1;
     234  35756188737 :       cx[lxp * 4 + 1] += cxy[cxy_index + 1] * p1;
     235  35756188737 :       cx[lxp * 4 + 2] += cxy[cxy_index + 0] * p2;
     236  35756188737 :       cx[lxp * 4 + 3] += cxy[cxy_index + 1] * p2;
     237              : #else
     238              :       // integrate
     239  33018440435 :       cxy[cxy_index + 0] += cx[lxp * 4 + 0] * p1;
     240  33018440435 :       cxy[cxy_index + 1] += cx[lxp * 4 + 1] * p1;
     241  33018440435 :       cxy[cxy_index + 0] += cx[lxp * 4 + 2] * p2;
     242  33018440435 :       cxy[cxy_index + 1] += cx[lxp * 4 + 3] * p2;
     243              : #endif
     244              :     }
     245              :   }
     246              : }
     247              : 
     248              : /*******************************************************************************
     249              :  * \brief Loop body of ortho_cxy_to_grid to be inlined for low values of lp.
     250              :  * \author Ole Schuett
     251              :  ******************************************************************************/
     252  11825312012 : static inline void __attribute__((always_inline)) ortho_cxy_to_grid_low(
     253              :     const int lp, const int j1, const int j2, const int kg1, const int kg2,
     254              :     const int jg1, const int jg2, const int cmax,
     255  11825312012 :     const double pol[3][lp + 1][2 * cmax + 1], const int map[3][2 * cmax + 1],
     256              :     const int sections[3][2 * cmax + 1], const int npts_local[3],
     257              :     int **sphere_bounds_iter, double *cx, GRID_CONST_WHEN_COLLOCATE double *cxy,
     258              :     GRID_CONST_WHEN_INTEGRATE double *grid) {
     259              : 
     260              : #if (GRID_DO_COLLOCATE)
     261              :   // collocate
     262  12434018578 :   ortho_cxy_to_cx(lp, j1, j2, cmax, pol, cxy, cx);
     263   6217009289 :   ortho_cx_to_grid(lp, kg1, kg2, jg1, jg2, cmax, pol, map, sections, npts_local,
     264              :                    sphere_bounds_iter, cx, grid);
     265              : #else
     266              :   // integrate
     267  11216605446 :   ortho_cx_to_grid(lp, kg1, kg2, jg1, jg2, cmax, pol, map, sections, npts_local,
     268              :                    sphere_bounds_iter, cx, grid);
     269   5608302723 :   ortho_cxy_to_cx(lp, j1, j2, cmax, pol, cxy, cx);
     270              : #endif
     271              : }
     272              : 
     273              : /*******************************************************************************
     274              :  * \brief Collocates coefficients C_xy onto the grid for orthorhombic case.
     275              :  * \author Ole Schuett
     276              :  ******************************************************************************/
     277   1323389680 : static inline void ortho_cxy_to_grid(
     278              :     const int lp, const int kg1, const int kg2, const int cmax,
     279   1323389680 :     const double pol[3][lp + 1][2 * cmax + 1], const int map[3][2 * cmax + 1],
     280              :     const int sections[3][2 * cmax + 1], const int npts_local[3],
     281              :     int **sphere_bounds_iter, GRID_CONST_WHEN_COLLOCATE double *cxy,
     282   1323389680 :     GRID_CONST_WHEN_INTEGRATE double *grid) {
     283              : 
     284              :   // The cube contains an even number of grid points in each direction and
     285              :   // collocation is always performed on a pair of two opposing grid points.
     286              :   // Hence, the points with index 0 and 1 are both assigned distance zero via
     287              :   // the formular distance=(2*index-1)/2.
     288              : 
     289   1323389680 :   const int jstart = *((*sphere_bounds_iter)++);
     290   1323389680 :   const size_t cx_size = (lp + 1) * 4;
     291   1323389680 :   double cx[cx_size];
     292  13148701692 :   for (int j1 = jstart; j1 <= 0; j1++) {
     293  11825312012 :     const int j2 = 1 - j1;
     294  11825312012 :     const int jg1 = map[1][j1 + cmax];
     295  11825312012 :     const int jg2 = map[1][j2 + cmax];
     296              : 
     297  11825312012 :     memset(cx, 0, cx_size * sizeof(double));
     298              : 
     299              :     // Generate separate branches for low values of lp gives up to 30% speedup.
     300  11825312012 :     if (lp <= GRID_MAX_LP_OPTIMIZED) {
     301              :       GRID_PRAGMA_UNROLL(GRID_MAX_LP_OPTIMIZED + 1)
     302  >13007*10^7 :       for (int ilp = 0; ilp <= GRID_MAX_LP_OPTIMIZED; ilp++) {
     303  >11825*10^7 :         if (lp == ilp) {
     304  >13007*10^7 :           ortho_cxy_to_grid_low(ilp, j1, j2, kg1, kg2, jg1, jg2, cmax, pol, map,
     305              :                                 sections, npts_local, sphere_bounds_iter, cx,
     306              :                                 cxy, grid);
     307              :         }
     308              :       }
     309              :     } else {
     310  11825315630 :       ortho_cxy_to_grid_low(lp, j1, j2, kg1, kg2, jg1, jg2, cmax, pol, map,
     311              :                             sections, npts_local, sphere_bounds_iter, cx, cxy,
     312              :                             grid);
     313              :     }
     314              :   }
     315   1323389680 : }
     316              : 
     317              : /*******************************************************************************
     318              :  * \brief Transforms coefficients C_xyz into C_xz by fixing grid index k.
     319              :  * \author Ole Schuett
     320              :  ******************************************************************************/
     321   1323389680 : static inline void ortho_cxyz_to_cxy(const int lp, const int k1, const int k2,
     322              :                                      const int cmax,
     323   1323389680 :                                      const double pol[3][lp + 1][2 * cmax + 1],
     324              :                                      GRID_CONST_WHEN_COLLOCATE double *cxyz,
     325              :                                      GRID_CONST_WHEN_INTEGRATE double *cxy) {
     326              : 
     327   4823076588 :   for (int lzp = 0; lzp <= lp; lzp++) {
     328  10915466402 :     for (int lyp = 0; lyp <= lp - lzp; lyp++) {
     329  21299720486 :       for (int lxp = 0; lxp <= lp - lzp - lyp; lxp++) {
     330  13883940992 :         const double p1 = pol[2][lzp][k1 + cmax];
     331  13883940992 :         const double p2 = pol[2][lzp][k2 + cmax];
     332  13883940992 :         const int cxyz_index =
     333  13883940992 :             lzp * (lp + 1) * (lp + 1) + lyp * (lp + 1) + lxp; // [lzp, lyp, lxp]
     334  13883940992 :         const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2;   // [lyp, lxp, 0]
     335              : 
     336              : #if (GRID_DO_COLLOCATE)
     337              :         // collocate
     338   7000178043 :         cxy[cxy_index + 0] += cxyz[cxyz_index] * p1;
     339   7000178043 :         cxy[cxy_index + 1] += cxyz[cxyz_index] * p2;
     340              : #else
     341              :         // integrate
     342   6883762949 :         cxyz[cxyz_index] += cxy[cxy_index + 0] * p1;
     343   6883762949 :         cxyz[cxyz_index] += cxy[cxy_index + 1] * p2;
     344              : #endif
     345              :       }
     346              :     }
     347              :   }
     348   1323389680 : }
     349              : 
     350              : /*******************************************************************************
     351              :  * \brief Collocates coefficients C_xyz onto the grid for orthorhombic case.
     352              :  * \author Ole Schuett
     353              :  ******************************************************************************/
     354              : static inline void
     355    128252625 : ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3],
     356              :                    const double dh_inv[3][3], const double rp[3],
     357              :                    const int npts_global[3], const int npts_local[3],
     358              :                    const int shift_local[3], const double radius,
     359              :                    GRID_CONST_WHEN_COLLOCATE double *cxyz,
     360    128252625 :                    GRID_CONST_WHEN_INTEGRATE double *grid) {
     361              : 
     362              :   // *** position of the gaussian product
     363              :   //
     364              :   // this is the actual definition of the position on the grid
     365              :   // i.e. a point rp(:) gets here grid coordinates
     366              :   // MODULO(rp(:)/dr(:),npts_global(:))+1
     367              :   // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid in Fortran
     368              :   // and (1,1,1) on grid here in C.
     369              : 
     370              :   // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
     371    128252625 :   int cubecenter[3];
     372    513010500 :   for (int i = 0; i < 3; i++) {
     373              :     double dh_inv_rp = 0.0;
     374   1539031500 :     for (int j = 0; j < 3; j++) {
     375   1154273625 :       dh_inv_rp += dh_inv[j][i] * rp[j];
     376              :     }
     377    384757875 :     cubecenter[i] = (int)floor(dh_inv_rp);
     378              :   }
     379              : 
     380              :   double roffset[3];
     381    513010500 :   for (int i = 0; i < 3; i++) {
     382    384757875 :     roffset[i] = rp[i] - ((double)cubecenter[i]) * dh[i][i];
     383              :   }
     384              : 
     385              :   // Lookup loop bounds for spherical cutoff.
     386    128252625 :   int *sphere_bounds;
     387    128252625 :   double disr_radius;
     388    128252625 :   grid_sphere_cache_lookup(radius, dh, dh_inv, &sphere_bounds, &disr_radius);
     389    128252625 :   int **sphere_bounds_iter = &sphere_bounds;
     390              : 
     391              :   // Cube bounds.
     392    128252625 :   int lb_cube[3], ub_cube[3];
     393    513010500 :   for (int i = 0; i < 3; i++) {
     394    384757875 :     lb_cube[i] = (int)ceil(-1e-8 - disr_radius * dh_inv[i][i]);
     395    384757875 :     ub_cube[i] = 1 - lb_cube[i];
     396              :     // If grid is not period check that cube fits without wrapping.
     397    384757875 :     if (npts_global[i] != npts_local[i]) {
     398        17826 :       const int offset =
     399        17826 :           modulo(cubecenter[i] + lb_cube[i] - shift_local[i], npts_global[i]) -
     400              :           lb_cube[i];
     401        17826 :       assert(offset + ub_cube[i] < npts_local[i]);
     402        17826 :       assert(offset + lb_cube[i] >= 0);
     403              :     }
     404              :   }
     405              : 
     406              :   // cmax = MAXVAL(ub_cube)
     407    128252625 :   const int cmax = imax(imax(ub_cube[0], ub_cube[1]), ub_cube[2]);
     408              : 
     409              :   // Precompute (x-xp)**lp*exp(..) for each direction.
     410    128252625 :   double pol_mutable[3][lp + 1][2 * cmax + 1];
     411    513010500 :   for (int idir = 0; idir < 3; idir++) {
     412    384757875 :     const double dr = dh[idir][idir];
     413    384757875 :     const double ro = roffset[idir];
     414              :     //  Reuse the result from the previous gridpoint to avoid to many exps:
     415              :     //  exp( -a*(x+d)**2) = exp(-a*x**2)*exp(-2*a*x*d)*exp(-a*d**2)
     416              :     //  exp(-2*a*(x+d)*d) = exp(-2*a*x*d)*exp(-2*a*d**2)
     417    384757875 :     const double t_exp_1 = exp(-zetp * pow(dr, 2));
     418    384757875 :     const double t_exp_2 = pow(t_exp_1, 2);
     419    384757875 :     double t_exp_min_1 = exp(-zetp * pow(+dr - ro, 2));
     420    384757875 :     double t_exp_min_2 = exp(-2 * zetp * (+dr - ro) * (-dr));
     421   4359599394 :     for (int ig = 0; ig >= lb_cube[idir]; ig--) {
     422   3974841519 :       const double rpg = ig * dr - ro;
     423   3974841519 :       t_exp_min_1 *= t_exp_min_2 * t_exp_1;
     424   3974841519 :       t_exp_min_2 *= t_exp_2;
     425   3974841519 :       double pg = t_exp_min_1;
     426  14489090446 :       for (int icoef = 0; icoef <= lp; icoef++) {
     427  10514248927 :         pol_mutable[idir][icoef][ig + cmax] = pg; // exp(-zetp*rpg**2)
     428  10514248927 :         pg *= rpg;
     429              :       }
     430              :     }
     431    384757875 :     double t_exp_plus_1 = exp(-zetp * pow(-ro, 2));
     432    384757875 :     double t_exp_plus_2 = exp(-2 * zetp * (-ro) * (+dr));
     433   4359599394 :     for (int ig = 0; ig >= lb_cube[idir]; ig--) {
     434   3974841519 :       const double rpg = (1 - ig) * dr - ro;
     435   3974841519 :       t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
     436   3974841519 :       t_exp_plus_2 *= t_exp_2;
     437   3974841519 :       double pg = t_exp_plus_1;
     438  14489090446 :       for (int icoef = 0; icoef <= lp; icoef++) {
     439  10514248927 :         pol_mutable[idir][icoef][1 - ig + cmax] = pg; // exp(-zetp*rpg**2)
     440  10514248927 :         pg *= rpg;
     441              :       }
     442              :     }
     443              :   }
     444    128252625 :   const double(*pol)[lp + 1][2 * cmax + 1] =
     445    128252625 :       (const double(*)[lp + 1][2 * cmax + 1]) pol_mutable;
     446              : 
     447              :   // Precompute mapping from cube to grid indices for each direction
     448    128252625 :   int map_mutable[3][2 * cmax + 1];
     449    513010500 :   for (int i = 0; i < 3; i++) {
     450   8784007002 :     for (int k = -cmax; k <= +cmax; k++) {
     451   8399249127 :       map_mutable[i][k + cmax] =
     452   8399249127 :           modulo(cubecenter[i] + k - shift_local[i], npts_global[i]);
     453              :     }
     454              :   }
     455    128252625 :   const int(*map)[2 * cmax + 1] = (const int(*)[2 * cmax + 1]) map_mutable;
     456              : 
     457              :   // Precompute length of sections with homogeneous cube to grid mapping.
     458    128252625 :   int sections_mutable[3][2 * cmax + 1];
     459    513010500 :   for (int i = 0; i < 3; i++) {
     460   8784007002 :     for (int kg = 2 * cmax; kg >= 0; kg--) {
     461   8399249127 :       if (kg == 2 * cmax || map[i][kg] != map[i][kg + 1] - 1) {
     462    782268276 :         sections_mutable[i][kg] = 0;
     463              :       } else {
     464   7616980851 :         sections_mutable[i][kg] = sections_mutable[i][kg + 1] + 1;
     465              :       }
     466              :     }
     467              :   }
     468    128252625 :   const int(*sections)[2 * cmax + 1] =
     469    128252625 :       (const int(*)[2 * cmax + 1]) sections_mutable;
     470              : 
     471              :   // Loop over k dimension of the cube.
     472    128252625 :   const int kstart = *((*sphere_bounds_iter)++);
     473    128252625 :   const size_t cxy_size = (lp + 1) * (lp + 1) * 2;
     474    128252625 :   double cxy[cxy_size];
     475   1451642305 :   for (int k1 = kstart; k1 <= 0; k1++) {
     476   1323389680 :     const int k2 = 1 - k1;
     477   1323389680 :     const int kg1 = map[2][k1 + cmax];
     478   1323389680 :     const int kg2 = map[2][k2 + cmax];
     479              : 
     480   1323389680 :     memset(cxy, 0, cxy_size * sizeof(double));
     481              : 
     482              : #if (GRID_DO_COLLOCATE)
     483              :     // collocate
     484    681204544 :     ortho_cxyz_to_cxy(lp, k1, k2, cmax, pol, cxyz, cxy);
     485    681204544 :     ortho_cxy_to_grid(lp, kg1, kg2, cmax, pol, map, sections, npts_local,
     486              :                       sphere_bounds_iter, cxy, grid);
     487              : #else
     488              :     // integrate
     489    642185136 :     ortho_cxy_to_grid(lp, kg1, kg2, cmax, pol, map, sections, npts_local,
     490              :                       sphere_bounds_iter, cxy, grid);
     491    642185136 :     ortho_cxyz_to_cxy(lp, k1, k2, cmax, pol, cxyz, cxy);
     492              : #endif
     493              :   }
     494    128252625 : }
     495              : 
     496              : /*******************************************************************************
     497              :  * \brief Collocates coefficients C_i onto the grid for general case.
     498              :  * \author Ole Schuett
     499              :  ******************************************************************************/
     500   2238423681 : static inline void __attribute__((always_inline)) general_ci_to_grid(
     501              :     const int lp, const int jg, const int kg, const int ismin, const int ismax,
     502              :     const int npts_local[3], const int index_min[3], const int index_max[3],
     503              :     const int map_i[], const int sections_i[], const double gp[3], const int k,
     504              :     const int j, const double exp_ij[], const double exp_jk[],
     505              :     const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *ci,
     506              :     GRID_CONST_WHEN_INTEGRATE double *grid) {
     507              : 
     508   2238423681 :   const int base = kg * npts_local[1] * npts_local[0] + jg * npts_local[0];
     509              : 
     510              :   // AVX instructions can only load/store from evenly spaced memory locations.
     511              :   // Since the cube can wrap around due to the grid's periodicity,
     512              :   // the inner loop runs over sections with homogeneous cube to grid mapping.
     513   6606467431 :   for (int istart = ismin; istart <= ismax; istart++) {
     514   4368043750 :     const int istop = imin(ismax, istart + sections_i[istart - index_min[0]]);
     515   4368043750 :     if (map_i[istart - index_min[0]] < 0) {
     516            0 :       istart = istop; // skip over out-of-bounds indicies
     517            0 :       continue;
     518              :     }
     519              : 
     520   4368043750 :     const int cube2grid = map_i[istart - index_min[0]] - istart;
     521  42032985681 :     for (int i = istart; i <= istop; i++) {
     522  37664941931 :       const int ig = i + cube2grid;
     523  37664941931 :       const double di = i - gp[0];
     524              : 
     525  37664941931 :       const int stride_i = index_max[0] - index_min[0] + 1;
     526  37664941931 :       const int stride_j = index_max[1] - index_min[1] + 1;
     527  37664941931 :       const int stride_k = index_max[2] - index_min[2] + 1;
     528  37664941931 :       const int idx_ij = (j - index_min[1]) * stride_i + i - index_min[0];
     529  37664941931 :       const int idx_jk = (k - index_min[2]) * stride_j + j - index_min[1];
     530  37664941931 :       const int idx_ki = (i - index_min[0]) * stride_k + k - index_min[2];
     531              : 
     532              :       // Mathieu's trick: Calculate 3D Gaussian from three precomputed 2D tables
     533              :       //
     534              :       // r   =  (i-gp[0])*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
     535              :       //     =  a                 + b                 + c
     536              :       //
     537              :       // r**2  =  (a + b + c)**2  =  a**2 + b**2 + c**2 + 2ab + 2bc + 2ca
     538              :       //
     539              :       // exp(-r**2)  =  exp(-a(a+2b)) * exp(-b*(b+2c)) * exp(-c*(c+2a))
     540              :       //
     541  37664941931 :       const double gaussian = exp_ij[idx_ij] * exp_jk[idx_jk] * exp_ki[idx_ki];
     542              : 
     543  37664941931 :       const int grid_index = base + ig; // [kg, jg, ig]
     544  37664941931 :       double dip = gaussian;
     545              : 
     546              : #if (GRID_DO_COLLOCATE)
     547              :       // collocate
     548  21274692382 :       double reg = 0.0;
     549  >10021*10^7 :       for (int il = 0; il <= lp; il++) {
     550  78940385479 :         reg += ci[il] * dip;
     551  78940385479 :         dip *= di;
     552              :       }
     553  21274692382 :       grid[grid_index] += reg;
     554              : #else
     555              :       // integrate
     556  16390249549 :       const double reg = grid[grid_index];
     557  76971067576 :       for (int il = 0; il <= lp; il++) {
     558  60580818027 :         ci[il] += reg * dip;
     559  60580818027 :         dip *= di;
     560              :       }
     561              : #endif
     562              :     }
     563              :     istart = istop;
     564              :   }
     565              : }
     566              : 
     567              : /*******************************************************************************
     568              :  * \brief Transforms coefficients C_ij into C_i by fixing grid index j.
     569              :  * \author Ole Schuett
     570              :  ******************************************************************************/
     571              : static inline void __attribute__((always_inline))
     572              : general_cij_to_ci(const int lp, const double dj,
     573              :                   GRID_CONST_WHEN_COLLOCATE double *cij,
     574              :                   GRID_CONST_WHEN_INTEGRATE double *ci) {
     575              :   double djp = 1.0;
     576  14222195864 :   for (int jl = 0; jl <= lp; jl++) {
     577  29715020308 :     for (int il = 0; il <= lp - jl; il++) {
     578  21364425764 :       const int cij_index = jl * (lp + 1) + il; // [jl, il]
     579              : #if (GRID_DO_COLLOCATE)
     580  11984947487 :       ci[il] += cij[cij_index] * djp; // collocate
     581              : #else
     582   9379478277 :       cij[cij_index] += ci[il] * djp; // integrate
     583              : #endif
     584              :     }
     585   8350594544 :     djp *= dj;
     586              :   }
     587              : }
     588              : 
     589              : /*******************************************************************************
     590              :  * \brief Loop body of general_cij_to_grid to be inlined for low values of lp.
     591              :  * \author Ole Schuett
     592              :  ******************************************************************************/
     593    972373361 : static inline void __attribute__((always_inline)) general_cij_to_grid_low(
     594              :     const int lp, const int jg, const int kg, const int ismin, const int ismax,
     595              :     const int npts_local[3], const int index_min[3], const int index_max[3],
     596              :     const int map_i[], const int sections_i[], const double gp[3], const int k,
     597              :     const int j, const double exp_ij[], const double exp_jk[],
     598              :     const double exp_ki[], const double dj, double *ci,
     599              :     GRID_CONST_WHEN_COLLOCATE double *cij,
     600              :     GRID_CONST_WHEN_INTEGRATE double *grid) {
     601              : 
     602              : #if (GRID_DO_COLLOCATE)
     603              :   // collocate
     604   1266050320 :   general_cij_to_ci(lp, dj, cij, ci);
     605   1266050320 :   general_ci_to_grid(lp, jg, kg, ismin, ismax, npts_local, index_min, index_max,
     606              :                      map_i, sections_i, gp, k, j, exp_ij, exp_jk, exp_ki, ci,
     607              :                      grid);
     608              : #else
     609              :   // integrate
     610   4605551000 :   general_ci_to_grid(lp, jg, kg, ismin, ismax, npts_local, index_min, index_max,
     611              :                      map_i, sections_i, gp, k, j, exp_ij, exp_jk, exp_ki, ci,
     612              :                      grid);
     613              :   general_cij_to_ci(lp, dj, cij, ci);
     614              : #endif
     615              : }
     616              : 
     617              : /*******************************************************************************
     618              :  * \brief Collocates coefficients C_ij onto the grid for general case.
     619              :  * \author Ole Schuett
     620              :  ******************************************************************************/
     621    134384836 : static inline void general_cij_to_grid(
     622              :     const int lp, const int k, const int kg, const int npts_local[3],
     623              :     const int index_min[3], const int index_max[3], const int map_i[],
     624              :     const int map_j[], const int sections_i[], const int sections_j[],
     625              :     const double dh[3][3], const double gp[3], const double radius,
     626              :     const double exp_ij[], const double exp_jk[], const double exp_ki[],
     627              :     GRID_CONST_WHEN_COLLOCATE double *cij,
     628              :     GRID_CONST_WHEN_INTEGRATE double *grid) {
     629              : 
     630   4264571486 :   for (int j = index_min[1]; j <= index_max[1]; j++) {
     631   4130186650 :     const int jg = map_j[j - index_min[1]];
     632   4130186650 :     if (jg < 0) {
     633          560 :       j += sections_j[j - index_min[1]]; // skip over out-of-bounds indicies
     634          560 :       continue;
     635              :     }
     636              : 
     637              :     //--------------------------------------------------------------------
     638              :     // Find bounds for the inner loop based on a quadratic equation in i.
     639              :     //
     640              :     // The real-space vector from the center of the gaussian to the
     641              :     // grid point i,j,k is given by:
     642              :     //   r = (i-gp[0])*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
     643              :     //
     644              :     // Separating the term that depends on i:
     645              :     //   r = i*dh[0,:] - gp[0]*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
     646              :     //     = i*dh[0,:] + v
     647              :     //
     648              :     // The squared distance works out to:
     649              :     //   r**2 = dh[0,:]**2 * i**2  +  2 * v * dh[0,:] * i  +  v**2
     650              :     //        = a * i**2           +  b * i                +  c
     651              :     //
     652              :     // Solving r**2==radius**2 for i yields:
     653              :     //    d =  b**2  -  4 * a * (c - radius**2)
     654              :     //    i = (-b \pm sqrt(d)) / (2*a)
     655              :     //
     656              :     double a = 0.0, b = 0.0, c = 0.0;
     657  16520744360 :     for (int i = 0; i < 3; i++) {
     658  12390558270 :       const double v = (0 - gp[0]) * dh[0][i] + (j - gp[1]) * dh[1][i] +
     659  12390558270 :                        (k - gp[2]) * dh[2][i];
     660  12390558270 :       a += dh[0][i] * dh[0][i];
     661  12390558270 :       b += 2.0 * v * dh[0][i];
     662  12390558270 :       c += v * v;
     663              :     }
     664   4130186090 :     const double d = b * b - 4.0 * a * (c - radius * radius);
     665              : 
     666   4130186090 :     if (0.0 < d) {
     667   2238423681 :       const double sqrt_d = sqrt(d);
     668   2238423681 :       const double inv_2a = 1.0 / (2.0 * a);
     669   2238423681 :       const int ismin = (int)ceil((-b - sqrt_d) * inv_2a);
     670   2238423681 :       const int ismax = (int)floor((-b + sqrt_d) * inv_2a);
     671   2238423681 :       const double dj = j - gp[1];
     672              : 
     673   2238423681 :       double ci[lp + 1];
     674   2238423681 :       memset(ci, 0, sizeof(ci));
     675              : 
     676              :       // Generate separate branches for low values of lp.
     677   2238423681 :       if (lp <= GRID_MAX_LP_OPTIMIZED) {
     678              :         GRID_PRAGMA_UNROLL(GRID_MAX_LP_OPTIMIZED + 1)
     679  24622660491 :         for (int ilp = 0; ilp <= GRID_MAX_LP_OPTIMIZED; ilp++) {
     680  22384236810 :           if (lp == ilp) {
     681  23356610171 :             general_cij_to_grid_low(ilp, jg, kg, ismin, ismax, npts_local,
     682              :                                     index_min, index_max, map_i, sections_i, gp,
     683              :                                     k, j, exp_ij, exp_jk, exp_ki, dj, ci, cij,
     684              :                                     grid);
     685              :           }
     686              :         }
     687              :       } else {
     688   2238423681 :         general_cij_to_grid_low(lp, jg, kg, ismin, ismax, npts_local, index_min,
     689              :                                 index_max, map_i, sections_i, gp, k, j, exp_ij,
     690              :                                 exp_jk, exp_ki, dj, ci, cij, grid);
     691              :       }
     692              :     }
     693              :   }
     694    134384836 : }
     695              : 
     696              : /*******************************************************************************
     697              :  * \brief Transforms coefficients C_ijk into C_ij by fixing grid index k.
     698              :  * \author Ole Schuett
     699              :  ******************************************************************************/
     700    134384836 : static inline void general_cijk_to_cij(const int lp, const double dk,
     701              :                                        GRID_CONST_WHEN_COLLOCATE double *cijk,
     702              :                                        GRID_CONST_WHEN_INTEGRATE double *cij) {
     703    134384836 :   double dkp = 1.0;
     704    633299422 :   for (int kl = 0; kl <= lp; kl++) {
     705   1772241192 :     for (int jl = 0; jl <= lp - kl; jl++) {
     706   3985293867 :       for (int il = 0; il <= lp - kl - jl; il++) {
     707   2711967261 :         const int cij_index = jl * (lp + 1) + il; // [jl, il]
     708   2711967261 :         const int cijk_index =
     709   2711967261 :             kl * (lp + 1) * (lp + 1) + jl * (lp + 1) + il; // [kl, jl, il]
     710              : #if (GRID_DO_COLLOCATE)
     711   1485137723 :         cij[cij_index] += cijk[cijk_index] * dkp; // collocate
     712              : #else
     713   1226829538 :         cijk[cijk_index] += cij[cij_index] * dkp; // integrate
     714              : #endif
     715              :       }
     716              :     }
     717    498914586 :     dkp *= dk;
     718              :   }
     719    134384836 : }
     720              : 
     721              : /*******************************************************************************
     722              :  * \brief Precompute mapping of grid indices and its homogeneous sections.
     723              :  * \author Ole Schuett
     724              :  ******************************************************************************/
     725              : static inline void
     726     17161635 : general_precompute_mapping(const int index_min, const int index_max,
     727              :                            const int shift_local, const int npts_global,
     728              :                            const int bounds[2], int map[], int sections[]) {
     729              : 
     730              :   // Precompute mapping from continous grid indices to pbc wraped.
     731    468986773 :   for (int k = index_min; k <= index_max; k++) {
     732    451825138 :     const int kg = modulo(k - shift_local, npts_global);
     733    451825138 :     if (bounds[0] <= kg && kg <= bounds[1]) {
     734    451820717 :       map[k - index_min] = kg;
     735              :     } else {
     736         4421 :       map[k - index_min] = INT_MIN; // out of bounds - not mapped
     737              :     }
     738              :   }
     739              : 
     740              :   // Precompute length of sections with homogeneous cube to grid mapping.
     741     17161635 :   const int range = index_max - index_min + 1;
     742    468986773 :   for (int kg = range - 1; kg >= 0; kg--) {
     743    451825138 :     if (kg == range - 1 || map[kg] != map[kg + 1] - 1) {
     744     43076760 :       sections[kg] = 0;
     745              :     } else {
     746    408748378 :       sections[kg] = sections[kg + 1] + 1;
     747              :     }
     748              :   }
     749     17161635 : }
     750              : 
     751              : /*******************************************************************************
     752              :  * \brief Fill one of the 2D tables that speedup 3D Gaussian (Mathieu's trick).
     753              :  * \author Ole Schuett
     754              :  ******************************************************************************/
     755              : static inline void
     756     17161635 : general_fill_exp_table(const int idir, const int jdir, const int index_min[3],
     757              :                        const int index_max[3], const double zetp,
     758              :                        const double dh[3][3], const double gp[3],
     759              :                        double exp_table[]) {
     760              : 
     761     17161635 :   const int stride_i = index_max[idir] - index_min[idir] + 1;
     762     17161635 :   const double h_ii = dh[idir][0] * dh[idir][0] + dh[idir][1] * dh[idir][1] +
     763     17161635 :                       dh[idir][2] * dh[idir][2];
     764     17161635 :   const double h_ij = dh[idir][0] * dh[jdir][0] + dh[idir][1] * dh[jdir][1] +
     765     17161635 :                       dh[idir][2] * dh[jdir][2];
     766              : 
     767    468986773 :   for (int i = index_min[idir]; i <= index_max[idir]; i++) {
     768    451825138 :     const double di = i - gp[idir];
     769    451825138 :     const double rii = di * di * h_ii;
     770    451825138 :     const double rij_unit = di * h_ij;
     771    451825138 :     const double exp_ij_unit = exp(-zetp * 2.0 * rij_unit);
     772              : 
     773              :     // compute exponentials symmetrically around cube center
     774    451825138 :     const int j_center = (int)gp[jdir];
     775    451825138 :     const double dj_center = j_center - gp[jdir];
     776    451825138 :     const double rij_center = dj_center * rij_unit;
     777    451825138 :     const double exp_ij_center = exp(-zetp * (rii + 2.0 * rij_center));
     778              : 
     779              :     // above center
     780    451825138 :     double exp_ij = exp_ij_center;
     781   7278866800 :     for (int j = j_center; j <= index_max[jdir]; j++) {
     782   6827041662 :       const int idx = (j - index_min[jdir]) * stride_i + i - index_min[idir];
     783   6827041662 :       exp_table[idx] = exp_ij; // exp(-zetp * (di*di*h_ii + 2*di*dj*h_ij));
     784   6827041662 :       exp_ij *= exp_ij_unit;
     785              :     }
     786              : 
     787              :     // below center
     788    451825138 :     const double exp_ij_unit_inv = 1.0 / exp_ij_unit;
     789    451825138 :     exp_ij = exp_ij_center * exp_ij_unit_inv;
     790   6787346931 :     for (int j = j_center - 1; j >= index_min[jdir]; j--) {
     791   6335521793 :       const int idx = (j - index_min[jdir]) * stride_i + i - index_min[idir];
     792   6335521793 :       exp_table[idx] = exp_ij; // exp(-zetp * (di*di*h_ii + 2*di*dj*h_ij));
     793   6335521793 :       exp_ij *= exp_ij_unit_inv;
     794              :     }
     795              :   }
     796     17161635 : }
     797              : 
     798              : /*******************************************************************************
     799              :  * \brief Collocates coefficients C_ijk onto the grid for general case.
     800              :  * \author Ole Schuett
     801              :  ******************************************************************************/
     802              : static inline void
     803      5720545 : general_cijk_to_grid(const int border_mask, const int lp, const double zetp,
     804              :                      const double dh[3][3], const double dh_inv[3][3],
     805              :                      const double rp[3], const int npts_global[3],
     806              :                      const int npts_local[3], const int shift_local[3],
     807              :                      const int border_width[3], const double radius,
     808              :                      GRID_CONST_WHEN_COLLOCATE double *cijk,
     809      5720545 :                      GRID_CONST_WHEN_INTEGRATE double *grid) {
     810              : 
     811              :   // Default for border_mask == 0.
     812      5720545 :   int bounds_i[2] = {0, npts_local[0] - 1};
     813      5720545 :   int bounds_j[2] = {0, npts_local[1] - 1};
     814      5720545 :   int bounds_k[2] = {0, npts_local[2] - 1};
     815              : 
     816              :   // See also rs_find_node() in task_list_methods.F.
     817              :   // If the bit is set then we need to exclude the border in that direction.
     818      5720545 :   if (border_mask & (1 << 0))
     819       110890 :     bounds_i[0] += border_width[0];
     820      5720545 :   if (border_mask & (1 << 1))
     821       110890 :     bounds_i[1] -= border_width[0];
     822      5720545 :   if (border_mask & (1 << 2))
     823       110890 :     bounds_j[0] += border_width[1];
     824      5720545 :   if (border_mask & (1 << 3))
     825       110890 :     bounds_j[1] -= border_width[1];
     826      5720545 :   if (border_mask & (1 << 4))
     827       110882 :     bounds_k[0] += border_width[2];
     828      5720545 :   if (border_mask & (1 << 5))
     829       110890 :     bounds_k[1] -= border_width[2];
     830              : 
     831              :   // center in grid coords
     832              :   // gp = MATMUL(dh_inv, rp)
     833      5720545 :   double gp[3] = {0.0, 0.0, 0.0};
     834     22882180 :   for (int i = 0; i < 3; i++) {
     835     68646540 :     for (int j = 0; j < 3; j++) {
     836     51484905 :       gp[i] += dh_inv[j][i] * rp[j];
     837              :     }
     838              :   }
     839              : 
     840              :   // Get the min max indices that contain at least the cube that contains a
     841              :   // sphere around rp of radius radius if the cell is very non-orthogonal this
     842              :   // implies that many useless points are included this estimate can be improved
     843              :   // (i.e. not box but sphere should be used)
     844      5720545 :   int index_min[3] = {INT_MAX, INT_MAX, INT_MAX};
     845      5720545 :   int index_max[3] = {INT_MIN, INT_MIN, INT_MIN};
     846     22882180 :   for (int i = -1; i <= 1; i++) {
     847     68646540 :     for (int j = -1; j <= 1; j++) {
     848    205939620 :       for (int k = -1; k <= 1; k++) {
     849    154454715 :         const double x = rp[0] + i * radius;
     850    154454715 :         const double y = rp[1] + j * radius;
     851    154454715 :         const double z = rp[2] + k * radius;
     852    617818860 :         for (int idir = 0; idir < 3; idir++) {
     853    463364145 :           const double resc =
     854    463364145 :               dh_inv[0][idir] * x + dh_inv[1][idir] * y + dh_inv[2][idir] * z;
     855    463364145 :           index_min[idir] = imin(index_min[idir], (int)floor(resc));
     856    463364145 :           index_max[idir] = imax(index_max[idir], (int)ceil(resc));
     857              :         }
     858              :       }
     859              :     }
     860              :   }
     861              : 
     862              :   // Precompute mappings
     863      5720545 :   const int range_i = index_max[0] - index_min[0] + 1;
     864      5720545 :   int map_i[range_i], sections_i[range_i];
     865      5720545 :   general_precompute_mapping(index_min[0], index_max[0], shift_local[0],
     866              :                              npts_global[0], bounds_i, map_i, sections_i);
     867      5720545 :   const int range_j = index_max[1] - index_min[1] + 1;
     868      5720545 :   int map_j[range_j], sections_j[range_j];
     869      5720545 :   general_precompute_mapping(index_min[1], index_max[1], shift_local[1],
     870              :                              npts_global[1], bounds_j, map_j, sections_j);
     871      5720545 :   const int range_k = index_max[2] - index_min[2] + 1;
     872      5720545 :   int map_k[range_k], sections_k[range_k];
     873      5720545 :   general_precompute_mapping(index_min[2], index_max[2], shift_local[2],
     874              :                              npts_global[2], bounds_k, map_k, sections_k);
     875              : 
     876              :   // Precompute exponentials
     877      5720545 :   double exp_ij[range_i * range_j];
     878      5720545 :   general_fill_exp_table(0, 1, index_min, index_max, zetp, dh, gp, exp_ij);
     879      5720545 :   double exp_jk[range_j * range_k];
     880      5720545 :   general_fill_exp_table(1, 2, index_min, index_max, zetp, dh, gp, exp_jk);
     881      5720545 :   double exp_ki[range_k * range_i];
     882      5720545 :   general_fill_exp_table(2, 0, index_min, index_max, zetp, dh, gp, exp_ki);
     883              : 
     884              :   // go over the grid, but cycle if the point is not within the radius
     885      5720545 :   const int cij_size = (lp + 1) * (lp + 1);
     886      5720545 :   double cij[cij_size];
     887    140109762 :   for (int k = index_min[2]; k <= index_max[2]; k++) {
     888    134389217 :     const int kg = map_k[k - index_min[2]];
     889    134389217 :     if (kg < 0) {
     890         4381 :       k += sections_k[k - index_min[2]]; // skip over out-of-bounds indicies
     891         4381 :       continue;
     892              :     }
     893              : 
     894              :     // zero coef_xyt
     895    134384836 :     memset(cij, 0, cij_size * sizeof(double));
     896              : 
     897              : #if (GRID_DO_COLLOCATE)
     898              :     // collocate
     899     75505867 :     general_cijk_to_cij(lp, (double)k - gp[2], cijk, cij);
     900     75505867 :     general_cij_to_grid(lp, k, kg, npts_local, index_min, index_max, map_i,
     901              :                         map_j, sections_i, sections_j, dh, gp, radius, exp_ij,
     902              :                         exp_jk, exp_ki, cij, grid);
     903              : #else
     904              :     // integrate
     905     58878969 :     general_cij_to_grid(lp, k, kg, npts_local, index_min, index_max, map_i,
     906              :                         map_j, sections_i, sections_j, dh, gp, radius, exp_ij,
     907              :                         exp_jk, exp_ki, cij, grid);
     908     58878969 :     general_cijk_to_cij(lp, (double)k - gp[2], cijk, cij);
     909              : #endif
     910              :   }
     911      5720545 : }
     912              : 
     913              : /*******************************************************************************
     914              :  * \brief Transforms coefficients C_xyz into C_ijk.
     915              :  * \author Ole Schuett
     916              :  ******************************************************************************/
     917              : static inline void
     918      5720545 : general_cxyz_to_cijk(const int lp, const double dh[3][3],
     919              :                      GRID_CONST_WHEN_COLLOCATE double *cxyz,
     920      5720545 :                      GRID_CONST_WHEN_INTEGRATE double *cijk) {
     921              : 
     922              :   // transform P_{lxp,lyp,lzp} into a P_{lip,ljp,lkp} such that
     923              :   // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-x_p)**lxp (y-y_p)**lyp (z-z_p)**lzp =
     924              :   // sum_{lip,ljp,lkp} P_{lip,ljp,lkp} (i-i_p)**lip (j-j_p)**ljp (k-k_p)**lkp
     925              : 
     926              :   // transform using multinomials
     927      5720545 :   double hmatgridp[lp + 1][3][3];
     928     22882180 :   for (int i = 0; i < 3; i++) {
     929     68646540 :     for (int j = 0; j < 3; j++) {
     930     51484905 :       hmatgridp[0][j][i] = 1.0;
     931    188497071 :       for (int k = 1; k <= lp; k++) {
     932    137012166 :         hmatgridp[k][j][i] = hmatgridp[k - 1][j][i] * dh[j][i];
     933              :       }
     934              :     }
     935              :   }
     936              : 
     937              :   const int lpx = lp;
     938     26664664 :   for (int klx = 0; klx <= lpx; klx++) {
     939     74109920 :     for (int jlx = 0; jlx <= lpx - klx; jlx++) {
     940    166108541 :       for (int ilx = 0; ilx <= lpx - klx - jlx; ilx++) {
     941    112942740 :         const int lx = ilx + jlx + klx;
     942    112942740 :         const int lpy = lp - lx;
     943    328187664 :         for (int kly = 0; kly <= lpy; kly++) {
     944    596062606 :           for (int jly = 0; jly <= lpy - kly; jly++) {
     945   1018701947 :             for (int ily = 0; ily <= lpy - kly - jly; ily++) {
     946    637884265 :               const int ly = ily + jly + kly;
     947    637884265 :               const int lpz = lp - lx - ly;
     948   1662113970 :               for (int klz = 0; klz <= lpz; klz++) {
     949   2613937866 :                 for (int jlz = 0; jlz <= lpz - klz; jlz++) {
     950   3988925190 :                   for (int ilz = 0; ilz <= lpz - klz - jlz; ilz++) {
     951   2399217029 :                     const int lz = ilz + jlz + klz;
     952   2399217029 :                     const int il = ilx + ily + ilz;
     953   2399217029 :                     const int jl = jlx + jly + jlz;
     954   2399217029 :                     const int kl = klx + kly + klz;
     955   2399217029 :                     const int lp1 = lp + 1;
     956   2399217029 :                     const int cijk_index =
     957   2399217029 :                         kl * lp1 * lp1 + jl * lp1 + il; // [kl,jl,il]
     958   2399217029 :                     const int cxyz_index =
     959   2399217029 :                         lz * lp1 * lp1 + ly * lp1 + lx; // [lz,ly,lx]
     960   2399217029 :                     const double p =
     961   2399217029 :                         hmatgridp[ilx][0][0] * hmatgridp[jlx][1][0] *
     962   2399217029 :                         hmatgridp[klx][2][0] * hmatgridp[ily][0][1] *
     963   2399217029 :                         hmatgridp[jly][1][1] * hmatgridp[kly][2][1] *
     964   2399217029 :                         hmatgridp[ilz][0][2] * hmatgridp[jlz][1][2] *
     965   2399217029 :                         hmatgridp[klz][2][2] * fac(lx) * fac(ly) * fac(lz) /
     966   2399217029 :                         (fac(ilx) * fac(ily) * fac(ilz) * fac(jlx) * fac(jly) *
     967   2399217029 :                          fac(jlz) * fac(klx) * fac(kly) * fac(klz));
     968              : #if (GRID_DO_COLLOCATE)
     969   1088885549 :                     cijk[cijk_index] += cxyz[cxyz_index] * p; // collocate
     970              : #else
     971   1310331480 :                     cxyz[cxyz_index] += cijk[cijk_index] * p; // integrate
     972              : #endif
     973              :                   }
     974              :                 }
     975              :               }
     976              :             }
     977              :           }
     978              :         }
     979              :       }
     980              :     }
     981              :   }
     982      5720545 : }
     983              : 
     984              : /*******************************************************************************
     985              :  * \brief Collocates coefficients C_xyz onto the grid for general case.
     986              :  * \author Ole Schuett
     987              :  ******************************************************************************/
     988              : static inline void
     989      5720545 : general_cxyz_to_grid(const int border_mask, const int lp, const double zetp,
     990              :                      const double dh[3][3], const double dh_inv[3][3],
     991              :                      const double rp[3], const int npts_global[3],
     992              :                      const int npts_local[3], const int shift_local[3],
     993              :                      const int border_width[3], const double radius,
     994              :                      GRID_CONST_WHEN_COLLOCATE double *cxyz,
     995      5720545 :                      GRID_CONST_WHEN_INTEGRATE double *grid) {
     996              : 
     997      5720545 :   const size_t cijk_size = (lp + 1) * (lp + 1) * (lp + 1);
     998      5720545 :   double cijk[cijk_size];
     999      5720545 :   memset(cijk, 0, cijk_size * sizeof(double));
    1000              : 
    1001              : #if (GRID_DO_COLLOCATE)
    1002              :   // collocate
    1003      3192359 :   general_cxyz_to_cijk(lp, dh, cxyz, cijk);
    1004      3192359 :   general_cijk_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
    1005              :                        npts_local, shift_local, border_width, radius, cijk,
    1006              :                        grid);
    1007              : #else
    1008              :   // integrate
    1009      2528186 :   general_cijk_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
    1010              :                        npts_local, shift_local, border_width, radius, cijk,
    1011              :                        grid);
    1012      2528186 :   general_cxyz_to_cijk(lp, dh, cxyz, cijk);
    1013              : #endif
    1014      5720545 : }
    1015              : 
    1016              : /*******************************************************************************
    1017              :  * \brief Collocates coefficients C_xyz onto the grid.
    1018              :  * \author Ole Schuett
    1019              :  ******************************************************************************/
    1020              : static inline void
    1021    133973170 : cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp,
    1022              :              const double zetp, const double dh[3][3],
    1023              :              const double dh_inv[3][3], const double rp[3],
    1024              :              const int npts_global[3], const int npts_local[3],
    1025              :              const int shift_local[3], const int border_width[3],
    1026              :              const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz,
    1027              :              GRID_CONST_WHEN_INTEGRATE double *grid) {
    1028              : 
    1029    133973170 :   enum grid_library_kernel k;
    1030    133973170 :   if (orthorhombic && border_mask == 0) {
    1031    128252625 :     k = (GRID_DO_COLLOCATE) ? GRID_COLLOCATE_ORTHO : GRID_INTEGRATE_ORTHO;
    1032    128252625 :     ortho_cxyz_to_grid(lp, zetp, dh, dh_inv, rp, npts_global, npts_local,
    1033              :                        shift_local, radius, cxyz, grid);
    1034              :   } else {
    1035      5720545 :     k = (GRID_DO_COLLOCATE) ? GRID_COLLOCATE_GENERAL : GRID_INTEGRATE_GENERAL;
    1036      5720545 :     general_cxyz_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
    1037              :                          npts_local, shift_local, border_width, radius, cxyz,
    1038              :                          grid);
    1039              :   }
    1040    133973170 :   grid_library_counter_add(lp, GRID_BACKEND_CPU, k, 1);
    1041    133973170 : }
    1042              : 
    1043              : /*******************************************************************************
    1044              :  * \brief Transforms coefficients C_ab into C_xyz.
    1045              :  * \author Ole Schuett
    1046              :  ******************************************************************************/
    1047    133973170 : static inline void cab_to_cxyz(const int la_max, const int la_min,
    1048              :                                const int lb_max, const int lb_min,
    1049              :                                const double prefactor, const double ra[3],
    1050              :                                const double rb[3], const double rp[3],
    1051              :                                GRID_CONST_WHEN_COLLOCATE double *cab,
    1052    133973170 :                                GRID_CONST_WHEN_INTEGRATE double *cxyz) {
    1053              : 
    1054              :   // Computes the polynomial expansion coefficients:
    1055              :   //     (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,lxa,lxb,1)*(x-p)**ls
    1056    133973170 :   const int lp = la_max + lb_max;
    1057    133973170 :   double alpha[3][lb_max + 1][la_max + 1][lp + 1];
    1058    133973170 :   memset(alpha, 0, 3 * (lb_max + 1) * (la_max + 1) * (lp + 1) * sizeof(double));
    1059              : 
    1060    535892680 :   for (int i = 0; i < 3; i++) {
    1061    401919510 :     const double drpa = rp[i] - ra[i];
    1062    401919510 :     const double drpb = rp[i] - rb[i];
    1063   1143160101 :     for (int lxa = 0; lxa <= la_max; lxa++) {
    1064   2153712714 :       for (int lxb = 0; lxb <= lb_max; lxb++) {
    1065              :         double binomial_k_lxa = 1.0;
    1066              :         double a = 1.0;
    1067   3739324671 :         for (int k = 0; k <= lxa; k++) {
    1068              :           double binomial_l_lxb = 1.0;
    1069              :           double b = 1.0;
    1070   6145412760 :           for (int l = 0; l <= lxb; l++) {
    1071   3818560212 :             alpha[i][lxb][lxa][lxa - l + lxb - k] +=
    1072   3818560212 :                 binomial_k_lxa * binomial_l_lxb * a * b;
    1073   3818560212 :             binomial_l_lxb *= ((double)(lxb - l)) / ((double)(l + 1));
    1074   3818560212 :             b *= drpb;
    1075              :           }
    1076   2326852548 :           binomial_k_lxa *= ((double)(lxa - k)) / ((double)(k + 1));
    1077   2326852548 :           a *= drpa;
    1078              :         }
    1079              :       }
    1080              :     }
    1081              :   }
    1082              : 
    1083              :   //   *** initialise the coefficient matrix, we transform the sum
    1084              :   //
    1085              :   // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
    1086              :   //         (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
    1087              :   //         (z-a_z)**lza
    1088              :   //
    1089              :   // into
    1090              :   //
    1091              :   // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
    1092              :   //
    1093              :   // where p is center of the product gaussian, and lp = la_max + lb_max
    1094              :   // (current implementation is l**7)
    1095              :   //
    1096              : 
    1097    370361536 :   for (int lzb = 0; lzb <= lb_max; lzb++) {
    1098    707212407 :     for (int lza = 0; lza <= la_max; lza++) {
    1099   1221525107 :       for (int lyb = 0; lyb <= lb_max - lzb; lyb++) {
    1100   2023554470 :         for (int lya = 0; lya <= la_max - lza; lya++) {
    1101   1272853404 :           const int lxb_min = imax(lb_min - lzb - lyb, 0);
    1102   1272853404 :           const int lxa_min = imax(la_min - lza - lya, 0);
    1103   3049861154 :           for (int lxb = lxb_min; lxb <= lb_max - lzb - lyb; lxb++) {
    1104   4421820807 :             for (int lxa = lxa_min; lxa <= la_max - lza - lya; lxa++) {
    1105   2644813057 :               const int ico = coset(lxa, lya, lza);
    1106   2644813057 :               const int jco = coset(lxb, lyb, lzb);
    1107   2644813057 :               const int cab_index = jco * ncoset(la_max) + ico; // [jco, ico]
    1108   7705185338 :               for (int lzp = 0; lzp <= lza + lzb; lzp++) {
    1109  22351433895 :                 for (int lyp = 0; lyp <= lp - lza - lzb; lyp++) {
    1110  60726451911 :                   for (int lxp = 0; lxp <= lp - lza - lzb - lyp; lxp++) {
    1111  43435390297 :                     const double p = alpha[0][lxb][lxa][lxp] *
    1112  43435390297 :                                      alpha[1][lyb][lya][lyp] *
    1113  43435390297 :                                      alpha[2][lzb][lza][lzp] * prefactor;
    1114  43435390297 :                     const int lp1 = lp + 1;
    1115  43435390297 :                     const int cxyz_index =
    1116  43435390297 :                         lzp * lp1 * lp1 + lyp * lp1 + lxp; // [lzp, lyp, lxp]
    1117              : #if (GRID_DO_COLLOCATE)
    1118  19680386176 :                     cxyz[cxyz_index] += cab[cab_index] * p; // collocate
    1119              : #else
    1120  23755004121 :                     cab[cab_index] += cxyz[cxyz_index] * p; // integrate
    1121              : #endif
    1122              :                   }
    1123              :                 }
    1124              :               }
    1125              :             }
    1126              :           }
    1127              :         }
    1128              :       }
    1129              :     }
    1130              :   }
    1131    133973170 : }
    1132              : 
    1133              : /*******************************************************************************
    1134              :  * \brief Collocates coefficients C_ab onto the grid.
    1135              :  * \author Ole Schuett
    1136              :  ******************************************************************************/
    1137              : static inline void
    1138    181182907 : cab_to_grid(const bool orthorhombic, const int border_mask, const int la_max,
    1139              :             const int la_min, const int lb_max, const int lb_min,
    1140              :             const double zeta, const double zetb, const double rscale,
    1141              :             const double dh[3][3], const double dh_inv[3][3],
    1142              :             const double ra[3], const double rab[3], const int npts_global[3],
    1143              :             const int npts_local[3], const int shift_local[3],
    1144              :             const int border_width[3], const double radius,
    1145              :             GRID_CONST_WHEN_COLLOCATE double *cab,
    1146    181182907 :             GRID_CONST_WHEN_INTEGRATE double *grid) {
    1147              : 
    1148              :   // Check if radius is too small to be mapped onto grid of given resolution.
    1149    181182907 :   double dh_max = 0.0;
    1150    724731628 :   for (int i = 0; i < 3; i++) {
    1151   2174194884 :     for (int j = 0; j < 3; j++) {
    1152   1630646163 :       dh_max = fmax(dh_max, fabs(dh[i][j]));
    1153              :     }
    1154              :   }
    1155    181182907 :   if (2.0 * radius < dh_max) {
    1156     47209737 :     return;
    1157              :   }
    1158              : 
    1159    133973170 :   const double zetp = zeta + zetb;
    1160    133973170 :   const double f = zetb / zetp;
    1161    133973170 :   const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
    1162    133973170 :   const double prefactor = rscale * exp(-zeta * f * rab2);
    1163    133973170 :   double rp[3], rb[3];
    1164    535892680 :   for (int i = 0; i < 3; i++) {
    1165    401919510 :     rp[i] = ra[i] + f * rab[i];
    1166    401919510 :     rb[i] = ra[i] + rab[i];
    1167              :   }
    1168              : 
    1169    133973170 :   const int lp = la_max + lb_max;
    1170    133973170 :   const size_t cxyz_size = (lp + 1) * (lp + 1) * (lp + 1);
    1171    133973170 :   double cxyz[cxyz_size];
    1172    133973170 :   memset(cxyz, 0, cxyz_size * sizeof(double));
    1173              : 
    1174              : #if (GRID_DO_COLLOCATE)
    1175              :   // collocate
    1176     66935277 :   cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
    1177     66935277 :   cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
    1178              :                npts_local, shift_local, border_width, radius, cxyz, grid);
    1179              : #else
    1180              :   // integrate
    1181     67037893 :   cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
    1182              :                npts_local, shift_local, border_width, radius, cxyz, grid);
    1183     67037893 :   cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
    1184              : #endif
    1185              : }
    1186              : 
    1187              : // EOF
        

Generated by: LCOV version 2.0-1