LCOV - CP2K Regtests (git:c5411e0) - src/grid/dgemm/grid_dgemm

LCOV - code coverage report

Current view:	top level - src/grid/dgemm - grid_dgemm_utils.c (source / functions)		Hit	Total	Coverage
Test:	CP2K Regtests (git:c5411e0)	Lines:	140	217	64.5 %
Date:	2024-05-16 06:48:40	Functions:	10	13	76.9 %

          Line data    Source code

       1             : /*----------------------------------------------------------------------------*/
       2             : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3             : /*  Copyright 2000-2024 CP2K developers group <https://cp2k.org>              */
       4             : /*                                                                            */
       5             : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6             : /*----------------------------------------------------------------------------*/
       7             : 
       8             : #include <assert.h>
       9             : #include <limits.h>
      10             : #include <math.h>
      11             : #include <stdbool.h>
      12             : #include <stdio.h>
      13             : #include <string.h>
      14             : 
      15             : #ifdef __LIBXSMM
      16             : #include <libxsmm.h>
      17             : #endif
      18             : 
      19             : #ifdef __MKL
      20             : #include <mkl.h>
      21             : #endif
      22             : 
      23             : #include "../common/grid_common.h"
      24             : #include "grid_dgemm_tensor_local.h"
      25             : #include "grid_dgemm_utils.h"
      26             : 
      27           0 : void convert_to_lattice_coordinates(const double dh_inv_[3][3],
      28             :                                     const double *restrict const rp,
      29             :                                     double *restrict rp_c) {
      30           0 :   rp_c[0] =
      31           0 :       dh_inv_[0][0] * rp[0] + dh_inv_[1][0] * rp[1] + dh_inv_[0][0] * rp[2];
      32           0 :   rp_c[1] =
      33           0 :       dh_inv_[0][1] * rp[0] + dh_inv_[1][1] * rp[1] + dh_inv_[1][1] * rp[2];
      34           0 :   rp_c[2] =
      35           0 :       dh_inv_[0][2] * rp[0] + dh_inv_[1][2] * rp[1] + dh_inv_[2][2] * rp[2];
      36           0 : }
      37             : 
      38             : /* DO NOT CHANGE THIS. */
      39             : 
      40      223464 : void dgemm_simplified(dgemm_params *const m) {
      41      223464 :   if (m == NULL)
      42           0 :     abort();
      43             : 
      44             : #if defined(__LIBXSMM)
      45             : #if LIBXSMM_VERSION2(1, 17) >=                                                 \
      46             :         LIBXSMM_VERSION2(LIBXSMM_VERSION_MAJOR, LIBXSMM_VERSION_MINOR) &&      \
      47             :     (2079 > LIBXSMM_VERSION_PATCH)
      48      223464 :   if (m->use_libxsmm && m->op2 == 'N') {
      49             :     /* we are in row major but xsmm is in column major */
      50      219012 :     m->prefetch = LIBXSMM_PREFETCH_AUTO;
      51             :     /* in the future, more flags can be or'd together (like NONE | TRANS_B,
      52             :      * etc.)*/
      53      438024 :     m->flags =
      54      219012 :         (m->op1 != 'T' ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B);
      55             : 
      56      219012 :     if (m->kernel == NULL) {
      57      219012 :       m->kernel =
      58      219012 :           libxsmm_dmmdispatch(m->n, m->m, m->k, &m->ldb, &m->lda, &m->ldc,
      59      219012 :                               &m->alpha, &m->beta, &m->flags, &m->prefetch);
      60             :     }
      61             : 
      62      219012 :     if (m->kernel) {
      63      219012 :       m->kernel(m->b, m->a, m->c, m->b, m->a, m->c);
      64      219012 :       return;
      65             :     }
      66             :   }
      67             : #endif
      68             : #endif
      69             : 
      70             : #if defined(__MKL)
      71             :   // fall back to mkl
      72             :   if ((m->op1 == 'N') && (m->op2 == 'N'))
      73             :     cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m->m, m->n, m->k,
      74             :                 m->alpha, m->a, m->lda, m->b, m->ldb, m->beta, m->c, m->ldc);
      75             : 
      76             :   if ((m->op1 == 'T') && (m->op2 == 'N'))
      77             :     cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, m->m, m->n, m->k,
      78             :                 m->alpha, m->a, m->lda, m->b, m->ldb, m->beta, m->c, m->ldc);
      79             : 
      80             :   if ((m->op1 == 'N') && (m->op2 == 'T'))
      81             :     cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m->m, m->n, m->k,
      82             :                 m->alpha, m->a, m->lda, m->b, m->ldb, m->beta, m->c, m->ldc);
      83             : 
      84             :   if ((m->op1 == 'T') && (m->op2 == 'T'))
      85             :     cblas_dgemm(CblasRowMajor, CblasTrans, CblasTrans, m->m, m->n, m->k,
      86             :                 m->alpha, m->a, m->lda, m->b, m->ldb, m->beta, m->c, m->ldc);
      87             : 
      88             : #else
      89             : 
      90        4452 :   if ((m->op1 == 'N') && (m->op2 == 'N'))
      91           0 :     dgemm_("N", "N", &m->n, &m->m, &m->k, &m->alpha, m->b, &m->ldb, m->a,
      92           0 :            &m->lda, &m->beta, m->c, &m->ldc);
      93             : 
      94        4452 :   if ((m->op1 == 'T') && (m->op2 == 'N'))
      95           0 :     dgemm_("N", "T", &m->n, &m->m, &m->k, &m->alpha, m->b, &m->ldb, m->a,
      96           0 :            &m->lda, &m->beta, m->c, &m->ldc);
      97             : 
      98        4452 :   if ((m->op1 == 'T') && (m->op2 == 'T'))
      99           0 :     dgemm_("T", "T", &m->n, &m->m, &m->k, &m->alpha, m->b, &m->ldb, m->a,
     100           0 :            &m->lda, &m->beta, m->c, &m->ldc);
     101             : 
     102        4452 :   if ((m->op1 == 'N') && (m->op2 == 'T'))
     103        4452 :     dgemm_("T", "N", &m->n, &m->m, &m->k, &m->alpha, m->b, &m->ldb, m->a,
     104        4452 :            &m->lda, &m->beta, m->c, &m->ldc);
     105             : 
     106             : #endif
     107             : }
     108             : 
     109           0 : void batched_dgemm_simplified(dgemm_params *const m, const int batch_size) {
     110           0 :   assert(m != NULL);
     111           0 :   assert(batch_size > 0);
     112             : 
     113             : #if defined(__LIBXSMM)
     114             : #if LIBXSMM_VERSION2(1, 17) >=                                                 \
     115             :         LIBXSMM_VERSION2(LIBXSMM_VERSION_MAJOR, LIBXSMM_VERSION_MINOR) &&      \
     116             :     (2079 > LIBXSMM_VERSION_PATCH)
     117           0 :   if (m->use_libxsmm && m->op2 == 'N') {
     118             :     /* we are in row major but xsmm is in column major */
     119           0 :     m->prefetch = LIBXSMM_PREFETCH_AUTO;
     120             :     /* in the future, more flags can be or'd together (like NONE | TRANS_B,
     121             :      * etc.)*/
     122           0 :     m->flags =
     123           0 :         (m->op1 != 'T' ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B);
     124             : 
     125           0 :     if (m->kernel == NULL) {
     126           0 :       m->kernel =
     127           0 :           libxsmm_dmmdispatch(m->n, m->m, m->k, &m->ldb, &m->lda, &m->ldc,
     128           0 :                               &m->alpha, &m->beta, &m->flags, &m->prefetch);
     129             :     }
     130             : 
     131           0 :     if (m->kernel) {
     132           0 :       for (int s = 0; s < batch_size - 1; s++) {
     133           0 :         m->kernel(m[s].b, m[s].a, m[s].c, m[s + 1].b, m[s + 1].a, m[s + 1].c);
     134             :       }
     135           0 :       m->kernel(m[batch_size - 1].b, m[batch_size - 1].a, m[batch_size - 1].c,
     136           0 :                 m[batch_size - 1].b, m[batch_size - 1].a, m[batch_size - 1].c);
     137           0 :       return;
     138             :     }
     139             :   }
     140             : #endif
     141             : #endif
     142             : 
     143             : #if defined(__MKL)
     144             :   // fall back to mkl
     145             :   for (int s = 0; s < batch_size; s++) {
     146             :     if ((m[s].op1 == 'N') && (m[s].op2 == 'N'))
     147             :       cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m[s].m, m[s].n,
     148             :                   m[s].k, m[s].alpha, m[s].a, m[s].lda, m[s].b, m[s].ldb,
     149             :                   m[s].beta, m[s].c, m[s].ldc);
     150             : 
     151             :     if ((m[s].op1 == 'T') && (m[s].op2 == 'N'))
     152             :       cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, m[s].m, m[s].n,
     153             :                   m[s].k, m[s].alpha, m[s].a, m[s].lda, m[s].b, m[s].ldb,
     154             :                   m[s].beta, m[s].c, m[s].ldc);
     155             : 
     156             :     if ((m[s].op1 == 'N') && (m[s].op2 == 'T'))
     157             :       cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m[s].m, m[s].n,
     158             :                   m[s].k, m[s].alpha, m[s].a, m[s].lda, m[s].b, m[s].ldb,
     159             :                   m[s].beta, m[s].c, m[s].ldc);
     160             : 
     161             :     if ((m[s].op1 == 'T') && (m[s].op2 == 'T'))
     162             :       cblas_dgemm(CblasRowMajor, CblasTrans, CblasTrans, m[s].m, m[s].n, m[s].k,
     163             :                   m[s].alpha, m[s].a, m[s].lda, m[s].b, m[s].ldb, m[s].beta,
     164             :                   m[s].c, m[s].ldc);
     165             :   }
     166             : #else
     167           0 :   for (int s = 0; s < batch_size; s++) {
     168           0 :     if ((m[s].op1 == 'N') && (m[s].op2 == 'N'))
     169           0 :       dgemm_("N", "N", &m[s].n, &m[s].m, &m[s].k, &m[s].alpha, m[s].b,
     170           0 :              &m[s].ldb, m[s].a, &m[s].lda, &m[s].beta, m[s].c, &m[s].ldc);
     171             : 
     172           0 :     if ((m[s].op1 == 'T') && (m[s].op2 == 'N'))
     173           0 :       dgemm_("N", "T", &m[s].n, &m[s].m, &m[s].k, &m[s].alpha, m[s].b,
     174           0 :              &m[s].ldb, m[s].a, &m[s].lda, &m[s].beta, m[s].c, &m[s].ldc);
     175             : 
     176           0 :     if ((m[s].op1 == 'T') && (m[s].op2 == 'T'))
     177           0 :       dgemm_("T", "T", &m[s].n, &m[s].m, &m[s].k, &m[s].alpha, m[s].b,
     178           0 :              &m[s].ldb, m[s].a, &m[s].lda, &m[s].beta, m[s].c, &m[s].ldc);
     179             : 
     180           0 :     if ((m[s].op1 == 'N') && (m[s].op2 == 'T'))
     181           0 :       dgemm_("T", "N", &m[s].n, &m[s].m, &m[s].k, &m[s].alpha, m[s].b,
     182           0 :              &m[s].ldb, m[s].a, &m[s].lda, &m[s].beta, m[s].c, &m[s].ldc);
     183             :   }
     184             : #endif
     185             : }
     186             : 
     187      375160 : void extract_sub_grid(const int *lower_corner, const int *upper_corner,
     188             :                       const int *position, const tensor *const grid,
     189             :                       tensor *const subgrid) {
     190      375160 :   int position1[3] = {0, 0, 0};
     191             : 
     192      375160 :   if (position) {
     193      375160 :     position1[0] = position[0];
     194      375160 :     position1[1] = position[1];
     195      375160 :     position1[2] = position[2];
     196             :   }
     197             : 
     198      375160 :   const int sizex = upper_corner[2] - lower_corner[2];
     199      375160 :   const int sizey = upper_corner[1] - lower_corner[1];
     200      375160 :   const int sizez = upper_corner[0] - lower_corner[0];
     201             : 
     202     4919976 :   for (int z = 0; z < sizez; z++) {
     203             :     /* maybe use matcopy from libxsmm if possible */
     204    61914362 :     for (int y = 0; y < sizey; y++) {
     205    57369546 :       double *restrict src =
     206    57369546 :           &idx3(grid[0], lower_corner[0] + z - grid->window_shift[0],
     207             :                 lower_corner[1] + y - grid->window_shift[1],
     208             :                 lower_corner[2] - grid->window_shift[2]);
     209    57369546 :       double *restrict dst =
     210    57369546 :           &idx3(subgrid[0], position1[0] + z, position1[1] + y, position1[2]);
     211             : #ifdef __LIBXSMM
     212             :       LIBXSMM_PRAGMA_SIMD
     213             : #else
     214             :       // #pragma omp simd linear(dst, src) simdlen(8)
     215             :       GRID_PRAGMA_SIMD((dst, src), 8)
     216             : #endif
     217    57369546 :       for (int x = 0; x < sizex; x++) {
     218   778390919 :         dst[x] = src[x];
     219             :       }
     220             :     }
     221             :   }
     222             : 
     223      375160 :   return;
     224             : }
     225             : 
     226      395504 : void add_sub_grid(const int *lower_corner, const int *upper_corner,
     227             :                   const int *position, const tensor *subgrid, tensor *grid) {
     228      395504 :   int position1[3] = {0, 0, 0};
     229             : 
     230      395504 :   if (position) {
     231      395504 :     position1[0] = position[0];
     232      395504 :     position1[1] = position[1];
     233      395504 :     position1[2] = position[2];
     234             :   }
     235             : 
     236      395504 :   const int sizex = upper_corner[2] - lower_corner[2];
     237      395504 :   const int sizey = upper_corner[1] - lower_corner[1];
     238      395504 :   const int sizez = upper_corner[0] - lower_corner[0];
     239             : 
     240     5129240 :   for (int z = 0; z < sizez; z++) {
     241     4733736 :     double *restrict dst =
     242     4733736 :         &idx3(grid[0], lower_corner[0] + z, lower_corner[1], lower_corner[2]);
     243     4733736 :     double *restrict src =
     244     4733736 :         &idx3(subgrid[0], position1[0] + z, position1[1], position1[2]);
     245    58574778 :     for (int y = 0; y < sizey - 1; y++) {
     246             :       //__builtin_prefetch(dst + grid->ld_);
     247             : #ifdef __LIBXSMM
     248             :       LIBXSMM_PRAGMA_SIMD
     249             : #else
     250             :       // #pragma omp simd linear(dst, src) simdlen(8)
     251             :       GRID_PRAGMA_SIMD((dst, src), 8)
     252             : #endif
     253             :       for (int x = 0; x < sizex; x++) {
     254   728347222 :         dst[x] += src[x];
     255             :       }
     256             : 
     257    53841042 :       dst += grid->ld_;
     258    53841042 :       src += subgrid->ld_;
     259             :     }
     260             : 
     261             :     // #pragma omp simd linear(dst, src) simdlen(8)
     262     4733736 :     GRID_PRAGMA_SIMD((dst, src), 8)
     263             :     for (int x = 0; x < sizex; x++) {
     264    58292355 :       dst[x] += src[x];
     265             :     }
     266             :   }
     267      395504 :   return;
     268             : }
     269             : 
     270       88000 : int compute_cube_properties(const bool ortho, const double radius,
     271             :                             const double dh[3][3], const double dh_inv[3][3],
     272             :                             const double *rp, double *disr_radius,
     273             :                             double *roffset, int *cubecenter, int *lb_cube,
     274             :                             int *ub_cube, int *cube_size) {
     275       88000 :   int cmax = 0;
     276             : 
     277             :   /* center of the gaussian in the lattice coordinates */
     278       88000 :   double rp1[3];
     279             : 
     280             :   /* it is in the lattice vector frame */
     281      352000 :   for (int i = 0; i < 3; i++) {
     282             :     double dh_inv_rp = 0.0;
     283     1056000 :     for (int j = 0; j < 3; j++) {
     284      792000 :       dh_inv_rp += dh_inv[j][i] * rp[j];
     285             :     }
     286      264000 :     rp1[2 - i] = dh_inv_rp;
     287      264000 :     cubecenter[2 - i] = floor(dh_inv_rp);
     288             :   }
     289             : 
     290       88000 :   if (ortho) {
     291             :     /* seting up the cube parameters */
     292       42228 :     const double dx[3] = {dh[2][2], dh[1][1], dh[0][0]};
     293       42228 :     const double dx_inv[3] = {dh_inv[2][2], dh_inv[1][1], dh_inv[0][0]};
     294             :     /* cube center */
     295             : 
     296             :     /* lower and upper bounds */
     297             : 
     298             :     // Historically, the radius gets discretized.
     299       42228 :     const double drmin = fmin(dh[0][0], fmin(dh[1][1], dh[2][2]));
     300       42228 :     *disr_radius = drmin * fmax(1.0, ceil(radius / drmin));
     301             : 
     302      168912 :     for (int i = 0; i < 3; i++) {
     303      126684 :       roffset[i] = rp[2 - i] - ((double)cubecenter[i]) * dx[i];
     304             :     }
     305             : 
     306      168912 :     for (int i = 0; i < 3; i++) {
     307      126684 :       lb_cube[i] = ceil(-1e-8 - *disr_radius * dx_inv[i]);
     308             :     }
     309             : 
     310             :     // Symmetric interval
     311      168912 :     for (int i = 0; i < 3; i++) {
     312      126684 :       ub_cube[i] = 1 - lb_cube[i];
     313             :     }
     314             : 
     315             :   } else {
     316      183088 :     for (int idir = 0; idir < 3; idir++) {
     317      137316 :       lb_cube[idir] = INT_MAX;
     318      137316 :       ub_cube[idir] = INT_MIN;
     319             :     }
     320             : 
     321             :     /* compute the size of the box. It is a fairly trivial way to compute
     322             :      * the box and it may have far more point than needed */
     323      183088 :     for (int i = -1; i <= 1; i++) {
     324      549264 :       for (int j = -1; j <= 1; j++) {
     325     1647792 :         for (int k = -1; k <= 1; k++) {
     326     1235844 :           double x[3] = {/* rp[0] + */ ((double)i) * radius,
     327     1235844 :                          /* rp[1] + */ ((double)j) * radius,
     328     1235844 :                          /* rp[2] + */ ((double)k) * radius};
     329             :           /* convert_to_lattice_coordinates(dh_inv, x, y); */
     330     4943376 :           for (int idir = 0; idir < 3; idir++) {
     331     3707532 :             const double resc = dh_inv[0][idir] * x[0] +
     332     3707532 :                                 dh_inv[1][idir] * x[1] + dh_inv[2][idir] * x[2];
     333     3707532 :             lb_cube[2 - idir] = imin(lb_cube[2 - idir], floor(resc));
     334     3707532 :             ub_cube[2 - idir] = imax(ub_cube[2 - idir], ceil(resc));
     335             :           }
     336             :         }
     337             :       }
     338             :     }
     339             : 
     340             :     /* compute the offset in lattice coordinates */
     341             : 
     342      183088 :     for (int i = 0; i < 3; i++) {
     343      137316 :       roffset[i] = rp1[i] - cubecenter[i];
     344             :     }
     345             : 
     346       45772 :     *disr_radius = radius;
     347             :   }
     348             : 
     349             :   /* compute the cube size ignoring periodicity */
     350             : 
     351             :   /* the +1 is normal here */
     352       88000 :   cube_size[0] = ub_cube[0] - lb_cube[0] + 1;
     353       88000 :   cube_size[1] = ub_cube[1] - lb_cube[1] + 1;
     354       88000 :   cube_size[2] = ub_cube[2] - lb_cube[2] + 1;
     355             : 
     356      352000 :   for (int i = 0; i < 3; i++) {
     357      264000 :     cmax = imax(cmax, cube_size[i]);
     358             :   }
     359             : 
     360       88000 :   return cmax;
     361             : }
     362             : 
     363           0 : void return_cube_position(const int *const lb_grid,
     364             :                           const int *const cube_center,
     365             :                           const int *const lower_boundaries_cube,
     366             :                           const int *const period, int *const position) {
     367           0 :   for (int i = 0; i < 3; i++)
     368           0 :     position[i] = modulo(cube_center[i] - lb_grid[i] + lower_boundaries_cube[i],
     369           0 :                          period[i]);
     370           0 : }
     371             : 
     372        1256 : void verify_orthogonality(const double dh[3][3], bool orthogonal[3]) {
     373        1256 :   double norm1, norm2, norm3;
     374             : 
     375        1256 :   norm1 = dh[0][0] * dh[0][0] + dh[0][1] * dh[0][1] + dh[0][2] * dh[0][2];
     376        1256 :   norm2 = dh[1][0] * dh[1][0] + dh[1][1] * dh[1][1] + dh[1][2] * dh[1][2];
     377        1256 :   norm3 = dh[2][0] * dh[2][0] + dh[2][1] * dh[2][1] + dh[2][2] * dh[2][2];
     378             : 
     379        1256 :   norm1 = 1.0 / sqrt(norm1);
     380        1256 :   norm2 = 1.0 / sqrt(norm2);
     381        1256 :   norm3 = 1.0 / sqrt(norm3);
     382             : 
     383             :   /* x z */
     384        1256 :   orthogonal[0] =
     385        1256 :       ((fabs(dh[0][0] * dh[2][0] + dh[0][1] * dh[2][1] + dh[0][2] * dh[2][2]) *
     386        1256 :         norm1 * norm3) < 1e-12);
     387             :   /* y z */
     388        1256 :   orthogonal[1] =
     389        1256 :       ((fabs(dh[1][0] * dh[2][0] + dh[1][1] * dh[2][1] + dh[1][2] * dh[2][2]) *
     390        1256 :         norm2 * norm3) < 1e-12);
     391             :   /* x y */
     392        1256 :   orthogonal[2] =
     393        1256 :       ((fabs(dh[0][0] * dh[1][0] + dh[0][1] * dh[1][1] + dh[0][2] * dh[1][2]) *
     394        1256 :         norm1 * norm2) < 1e-12);
     395        1256 : }
     396             : 
     397             : #ifndef __MKL
     398             : extern void dger_(const int *M, const int *N, const double *alpha,
     399             :                   const double *X, const int *incX, const double *Y,
     400             :                   const int *incY, double *A, const int *lda);
     401             : extern void dgemv_(const char *Trans, const int *M, const int *N,
     402             :                    const double *alpha, const double *A, const int *lda,
     403             :                    const double *X, const int *incX, const double *beta,
     404             :                    double *Y, const int *incY);
     405             : 
     406          80 : void cblas_daxpy(const int N, const double alpha, const double *X,
     407             :                  const int incX, double *Y, const int incY) {
     408          80 :   if ((incX == 1) && (incY == 1)) {
     409         776 :     for (int i = 0; i < N; i++)
     410         696 :       Y[i] += alpha * X[i];
     411             :     return;
     412             :   }
     413             : 
     414           0 :   if (incX == 1) {
     415           0 :     for (int i = 0; i < N; i++)
     416           0 :       Y[i + incY] += alpha * X[i];
     417             :     return;
     418             :   }
     419             : 
     420           0 :   if (incY == 1) {
     421           0 :     for (int i = 0; i < N; i++)
     422           0 :       Y[i] += alpha * X[i + incX];
     423             :     return;
     424             :   }
     425             : 
     426           0 :   for (int i = 0; i < N; i++)
     427           0 :     Y[i + incY] += alpha * X[i + incX];
     428             :   return;
     429             : }
     430             : 
     431        9280 : double cblas_ddot(const int N, const double *X, const int incX, const double *Y,
     432             :                   const int incY) {
     433        9280 :   if ((incX == incY) && (incY == 1)) {
     434             :     double res = 0.0;
     435             : 
     436      230673 :     for (int i = 0; i < N; i++) {
     437      221393 :       res += X[i] * Y[i];
     438             :     }
     439             :     return res;
     440             :   }
     441             : 
     442           0 :   if (incX == 1) {
     443             :     double res = 0.0;
     444             : 
     445           0 :     for (int i = 0; i < N; i++) {
     446           0 :       res += X[i] * Y[i + incY];
     447             :     }
     448             :     return res;
     449             :   }
     450             : 
     451           0 :   if (incY == 1) {
     452             :     double res = 0.0;
     453             : 
     454           0 :     for (int i = 0; i < N; i++) {
     455           0 :       res += X[i + incX] * Y[i];
     456             :     }
     457             :     return res;
     458             :   }
     459             : 
     460             :   double res = 0.0;
     461             : 
     462           0 :   for (int i = 0; i < N; i++) {
     463           0 :     res += X[i + incX] * Y[i + incY];
     464             :   }
     465             :   return res;
     466             : }
     467             : 
     468       66892 : void cblas_dger(const CBLAS_LAYOUT Layout, const int M, const int N,
     469             :                 const double alpha, const double *X, const int incX,
     470             :                 const double *Y, const int incY, double *A, const int lda) {
     471       66892 :   if (Layout == CblasRowMajor) {
     472       66892 :     dger_(&N, &M, &alpha, Y, &incY, X, &incX, A, &lda);
     473             :   } else {
     474           0 :     dger_(&N, &M, &alpha, X, &incX, Y, &incY, A, &lda);
     475             :   }
     476       66892 : }
     477             : 
     478             : /* code taken from gsl_cblas. We really need to use a proper cblas interface and
     479             :  * build system.... */
     480       18560 : void cblas_dgemv(const CBLAS_LAYOUT order, const CBLAS_TRANSPOSE TransA,
     481             :                  const int M, const int N, const double alpha, const double *A,
     482             :                  const int lda, const double *X, const int incX,
     483             :                  const double beta, double *Y, const int incY) {
     484             : 
     485       18560 :   if (order == CblasColMajor) {
     486           0 :     if (TransA == CblasTrans)
     487           0 :       dgemv_("T", &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
     488             :     else {
     489           0 :       dgemv_("N", &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
     490             :     }
     491             :   } else {
     492       18560 :     if (TransA == CblasTrans)
     493           0 :       dgemv_("N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
     494             :     else {
     495       18560 :       dgemv_("T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
     496             :     }
     497             :   }
     498       18560 : }
     499             : #endif
     500             : 
     501     1287557 : void compute_interval(const int *const map, const int full_size, const int size,
     502             :                       const int cube_size, const int x1, int *x,
     503             :                       int *const lower_corner, int *const upper_corner,
     504             :                       const Interval window) {
     505     1287557 :   if (size == full_size) {
     506             :     /* we have the full grid in that direction */
     507             :     /* lower boundary is within the window */
     508     1287557 :     *lower_corner = x1;
     509             :     /* now compute the upper corner */
     510             :     /* needs to be as large as possible. basically I take [x1..
     511             :      * min(grid.full_size, cube_size - x)] */
     512             : 
     513     1287557 :     *upper_corner = compute_next_boundaries(x1, *x, full_size, cube_size);
     514             : 
     515             :     /* { */
     516             :     /*   Interval tz = create_interval(*lower_corner, *upper_corner); */
     517             :     /*   Interval res = intersection_interval(tz, window); */
     518             :     /*   *lower_corner = res.xmin; */
     519             :     /*   *upper_corner = res.xmax; */
     520             :     /* } */
     521             :   } else {
     522           0 :     *lower_corner = x1;
     523           0 :     *upper_corner = x1 + 1;
     524             : 
     525             :     // the map is always increasing by 1 except when we cross the boundaries of
     526             :     // the grid and pbc are applied. Since we are only interested in by a
     527             :     // subwindow of the full table we check that the next point is inside the
     528             :     // window of interest and is also equal to the previous point + 1. The last
     529             :     // check is pointless in practice.
     530             : 
     531           0 :     for (int i = *x + 1; (i < cube_size) && (*upper_corner == map[i]) &&
     532           0 :                          is_point_in_interval(map[i], window);
     533           0 :          i++) {
     534           0 :       (*upper_corner)++;
     535             :     }
     536             :   }
     537     1287557 : }

Generated by: LCOV version 1.15