LCOV - code coverage report
Current view: top level - src/grid/dgemm - grid_dgemm_non_orthorombic_corrections.c (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:b195825) Lines: 97 218 44.5 %
Date: 2024-04-20 06:29:22 Functions: 7 10 70.0 %

          Line data    Source code
       1             : /*----------------------------------------------------------------------------*/
       2             : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3             : /*  Copyright 2000-2024 CP2K developers group <https://cp2k.org>              */
       4             : /*                                                                            */
       5             : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6             : /*----------------------------------------------------------------------------*/
       7             : 
       8             : #include "grid_dgemm_non_orthorombic_corrections.h"
       9             : 
      10             : #include <math.h>
      11             : #include <stdio.h>
      12             : #include <stdlib.h>
      13             : #include <string.h>
      14             : 
      15             : #include "../common/grid_common.h"
      16             : #include "grid_dgemm_utils.h"
      17             : 
      18     1108868 : double exp_recursive(const double c_exp, const double c_exp_minus_1,
      19             :                      const int index) {
      20     1108868 :   if (index == -1)
      21             :     return c_exp_minus_1;
      22             : 
      23     1108868 :   if (index == 1)
      24             :     return c_exp;
      25             : 
      26     1108868 :   double res = 1.0;
      27             : 
      28     1108868 :   if (index < 0) {
      29    15250368 :     for (int i = 0; i < -index; i++) {
      30    14141500 :       res *= c_exp_minus_1;
      31             :     }
      32             :     return res;
      33             :   }
      34             : 
      35           0 :   if (index > 0) {
      36           0 :     for (int i = 0; i < index; i++) {
      37           0 :       res *= c_exp;
      38             :     }
      39             :     return res;
      40             :   }
      41             : 
      42             :   return 1.0;
      43             : }
      44             : 
      45       91544 : void exp_i(const double alpha, const int imin, const int imax,
      46             :            double *restrict const res) {
      47       91544 :   const double c_exp_co = exp(alpha);
      48             :   /* const double c_exp_minus_1 = 1/ c_exp; */
      49       91544 :   res[0] = exp(imin * alpha);
      50     2245864 :   for (int i = 1; i < (imax - imin); i++) {
      51     2154320 :     res[i] = res[i - 1] *
      52             :              c_exp_co; // exp_recursive(c_exp_co, 1.0 / c_exp_co, i + imin);
      53             :   }
      54       91544 : }
      55             : 
      56       45772 : void exp_ij(const double alpha, const int offset_i, const int imin,
      57             :             const int imax, const int offset_j, const int jmin, const int jmax,
      58             :             tensor *exp_ij_) {
      59       45772 :   double c_exp = exp(alpha * imin);
      60       45772 :   const double c_exp_co = exp(alpha);
      61             : 
      62     1154640 :   for (int i = 0; i < (imax - imin); i++) {
      63     1108868 :     double *restrict dst = &idx2(exp_ij_[0], i + offset_i, offset_j);
      64     1108868 :     double ctmp = exp_recursive(c_exp, 1.0 / c_exp, jmin);
      65             : 
      66    30500736 :     for (int j = 0; j < (jmax - jmin); j++) {
      67    29391868 :       dst[j] *= ctmp;
      68    29391868 :       ctmp *= c_exp;
      69             :     }
      70     1108868 :     c_exp *= c_exp_co;
      71             :   }
      72       45772 : }
      73             : 
      74       45772 : void calculate_non_orthorombic_corrections_tensor(
      75             :     const double mu_mean, const double *r_ab, const double basis[3][3],
      76             :     const int *const xmin, const int *const xmax, bool plane[3],
      77             :     tensor *const Exp) {
      78             :   // zx, zy, yx
      79       45772 :   const int n[3][2] = {{0, 2}, {0, 1}, {1, 2}};
      80             : 
      81             :   // need to review this
      82       45772 :   const double c[3] = {
      83             :       /* alpha gamma */
      84       45772 :       -2.0 * mu_mean *
      85       45772 :           (basis[0][0] * basis[2][0] + basis[0][1] * basis[2][1] +
      86       45772 :            basis[0][2] * basis[2][2]),
      87             :       /* beta gamma */
      88       45772 :       -2.0 * mu_mean *
      89       45772 :           (basis[1][0] * basis[2][0] + basis[1][1] * basis[2][1] +
      90       45772 :            basis[1][2] * basis[2][2]),
      91             :       /* alpha beta */
      92       45772 :       -2.0 * mu_mean *
      93       45772 :           (basis[0][0] * basis[1][0] + basis[0][1] * basis[1][1] +
      94       45772 :            basis[0][2] * basis[1][2])};
      95             : 
      96             :   /* a naive implementation of the computation of exp(-2 (v_i . v_j) (i
      97             :    * - r_i) (j _ r_j)) requires n m exponentials but we can do much much
      98             :    * better with only 7 exponentials
      99             :    *
     100             :    * first expand it. we get exp(2 (v_i . v_j) i j) exp(2 (v_i . v_j) i r_j)
     101             :    * exp(2 (v_i . v_j) j r_i) exp(2 (v_i . v_j) r_i r_j). we can use the fact
     102             :    * that the sum of two terms in an exponential is equal to the product of
     103             :    * the two exponentials.
     104             :    *
     105             :    * this means that exp (a i) with i integer can be computed recursively with
     106             :    * one exponential only
     107             :    */
     108             : 
     109             :   /* we have a orthorombic case */
     110       45772 :   if (plane[0] && plane[1] && plane[2])
     111           0 :     return;
     112             : 
     113       45772 :   tensor exp_tmp;
     114       45772 :   double *x1, *x2;
     115             :   /* printf("%d %d %d\n", plane[0], plane[1], plane[2]); */
     116       45772 :   const int max_elem =
     117       45772 :       imax(imax(xmax[0] - xmin[0], xmax[1] - xmin[1]), xmax[2] - xmin[2]) + 1;
     118       45772 :   initialize_tensor_3(Exp, 3, max_elem, max_elem);
     119       45772 :   realloc_tensor(Exp);
     120             : 
     121       45772 :   x1 = grid_allocate_scratch(sizeof(double) * max_elem);
     122       45772 :   x2 = grid_allocate_scratch(sizeof(double) * max_elem);
     123       45772 :   initialize_tensor_2(&exp_tmp, Exp->size[1], Exp->size[2]);
     124             : 
     125       45772 :   memset(&idx3(Exp[0], 0, 0, 0), 0, sizeof(double) * Exp->alloc_size_);
     126      183088 :   for (int dir = 0; dir < 3; dir++) {
     127      137316 :     int d1 = n[dir][0];
     128      137316 :     int d2 = n[dir][1];
     129             : 
     130      137316 :     if (!plane[dir]) {
     131       45772 :       const double c_exp_const = exp(c[dir] * r_ab[d1] * r_ab[d2]);
     132             : 
     133       45772 :       exp_i(-r_ab[d2] * c[dir], xmin[d1], xmax[d1] + 1, x1);
     134       45772 :       exp_i(-r_ab[d1] * c[dir], xmin[d2], xmax[d2] + 1, x2);
     135             : 
     136       45772 :       exp_tmp.data = &idx3(Exp[0], dir, 0, 0);
     137       45772 :       cblas_dger(CblasRowMajor, xmax[d1] - xmin[d1] + 1,
     138       45772 :                  xmax[d2] - xmin[d2] + 1, c_exp_const, x1, 1, x2, 1,
     139             :                  &idx2(exp_tmp, 0, 0), exp_tmp.ld_);
     140       45772 :       exp_ij(c[dir], 0, xmin[d1], xmax[d1] + 1, 0, xmin[d2], xmax[d2] + 1,
     141             :              &exp_tmp);
     142             :     }
     143             :   }
     144       45772 :   grid_free_scratch(x1);
     145       45772 :   grid_free_scratch(x2);
     146             : }
     147             : 
     148           0 : void calculate_non_orthorombic_corrections_tensor_blocked(
     149             :     const double mu_mean, const double *r_ab, const double basis[3][3],
     150             :     const int *const lower_corner, const int *const upper_corner,
     151             :     const int *const block_size, const int *const offset, const int *const xmin,
     152             :     const int *const xmax, bool *plane, tensor *const Exp) {
     153             :   // zx, zy, yx
     154           0 :   const int n[3][2] = {{0, 2}, {0, 1}, {1, 2}};
     155             : 
     156             :   // need to review this
     157           0 :   const double c[3] = {
     158             :       /* alpha gamma */
     159           0 :       -2.0 * mu_mean *
     160           0 :           (basis[0][0] * basis[2][0] + basis[0][1] * basis[2][1] +
     161           0 :            basis[0][2] * basis[2][2]),
     162             :       /* beta gamma */
     163           0 :       -2.0 * mu_mean *
     164           0 :           (basis[1][0] * basis[2][0] + basis[1][1] * basis[2][1] +
     165           0 :            basis[1][2] * basis[2][2]),
     166             :       /* alpha beta */
     167           0 :       -2.0 * mu_mean *
     168           0 :           (basis[0][0] * basis[1][0] + basis[0][1] * basis[1][1] +
     169           0 :            basis[0][2] * basis[1][2])};
     170             : 
     171             :   /* a naive implementation of the computation of exp(-2 (v_i . v_j) (i
     172             :    * - r_i) (j _ r_j)) requires n m exponentials but we can do much much
     173             :    * better with only 7 exponentials
     174             :    *
     175             :    * first expand it. we get exp(2 (v_i . v_j) i j) exp(2 (v_i . v_j) i r_j)
     176             :    * exp(2 (v_i . v_j) j r_i) exp(2 (v_i . v_j) r_i r_j). we can use the fact
     177             :    * that the sum of two terms in an exponential is equal to the product of
     178             :    * the two exponentials.
     179             :    *
     180             :    * this means that exp (a i) with i integer can be computed recursively with
     181             :    * one exponential only
     182             :    */
     183             : 
     184             :   /* we have a orthorombic case */
     185           0 :   if (plane[0] && plane[1] && plane[2])
     186           0 :     return;
     187             : 
     188           0 :   tensor exp_blocked;
     189           0 :   double *x1, *x2;
     190             :   /* printf("%d %d %d\n", plane[0], plane[1], plane[2]); */
     191           0 :   initialize_tensor_2(&exp_blocked, imax(block_size[0], block_size[1]),
     192             :                       imax(block_size[1], block_size[2]));
     193             : 
     194           0 :   const int cube_size[3] = {(upper_corner[0] - lower_corner[0]) * block_size[0],
     195           0 :                             (upper_corner[1] - lower_corner[1]) * block_size[1],
     196           0 :                             (upper_corner[2] - lower_corner[2]) *
     197             :                                 block_size[2]};
     198             : 
     199           0 :   const int max_elem = imax(imax(cube_size[0], cube_size[1]), cube_size[2]);
     200           0 :   x1 = grid_allocate_scratch(sizeof(double) * max_elem);
     201           0 :   x2 = grid_allocate_scratch(sizeof(double) * max_elem);
     202             : 
     203           0 :   initialize_tensor_4(Exp, 3,
     204           0 :                       imax(upper_corner[0] - lower_corner[0],
     205             :                            upper_corner[1] - lower_corner[1]),
     206           0 :                       imax(upper_corner[2] - lower_corner[2],
     207           0 :                            upper_corner[1] - lower_corner[1]),
     208           0 :                       exp_blocked.alloc_size_);
     209             : 
     210           0 :   realloc_tensor(Exp);
     211             :   /* memset(Exp->data, 0, sizeof(double) * Exp->alloc_size_); */
     212             : 
     213           0 :   for (int dir = 0; dir < 3; dir++) {
     214           0 :     int d1 = n[dir][0];
     215           0 :     int d2 = n[dir][1];
     216             : 
     217           0 :     if (!plane[dir]) {
     218           0 :       const double c_exp_const = exp(c[dir] * r_ab[d1] * r_ab[d2]);
     219           0 :       memset(x1, 0, sizeof(double) * max_elem);
     220           0 :       memset(x2, 0, sizeof(double) * max_elem);
     221             :       /* memset(exp_tmp.data, 0, sizeof(double) * exp_tmp.alloc_size_); */
     222           0 :       exp_i(-r_ab[d2] * c[dir], xmin[d1], xmax[d1] + 1, x1 + offset[d1]);
     223           0 :       exp_i(-r_ab[d1] * c[dir], xmin[d2], xmax[d2] + 1, x2 + offset[d2]);
     224             : 
     225           0 :       for (int y = 0; y < (upper_corner[d1] - lower_corner[d1]); y++) {
     226           0 :         const int y_1 = y * block_size[d1];
     227           0 :         for (int x = 0; x < (upper_corner[d2] - lower_corner[d2]); x++) {
     228           0 :           const int x_1 = x * block_size[d2];
     229           0 :           exp_blocked.data = &idx4(Exp[0], dir, y, x, 0);
     230             : 
     231           0 :           for (int y2 = 0; y2 < block_size[d1]; y2++) {
     232           0 :             double *restrict dst = &idx2(exp_blocked, y2, 0);
     233           0 :             const double scal = x1[y_1 + y2] * c_exp_const;
     234           0 :             double *restrict src = &x2[x_1];
     235             :             //#pragma omp simd linear(dst, src) simdlen(8)
     236           0 :             GRID_PRAGMA_SIMD((dst, src), 8)
     237           0 :             for (int x3 = 0; x3 < block_size[d2]; x3++) {
     238           0 :               dst[x3] = scal * src[x3];
     239             :             }
     240             :           }
     241             : 
     242           0 :           exp_ij(c[dir], 0, xmin[d1] - offset[d1] + y_1,
     243           0 :                  xmin[d1] - offset[d1] + y_1 + block_size[d1], 0,
     244             :                  xmin[d2] - offset[d2] + x_1,
     245           0 :                  xmin[d2] - offset[d2] + x_1 + block_size[d2], &exp_blocked);
     246             :         }
     247             :       }
     248             :     }
     249             :   }
     250             : 
     251           0 :   grid_free_scratch(x1);
     252           0 :   grid_free_scratch(x2);
     253             :   /* free(exp_tmp.data); */
     254             : }
     255             : 
     256       22497 : void apply_non_orthorombic_corrections(const bool plane[restrict 3],
     257             :                                        const tensor *const Exp,
     258             :                                        tensor *const cube) {
     259             :   // Well we should never call non orthorombic corrections if everything is
     260             :   // orthorombic
     261       22497 :   if (plane[0] && plane[1] && plane[2])
     262             :     return;
     263             : 
     264             :   /*k and i are orthogonal, k and j as well */
     265       22497 :   if (plane[0] && plane[1]) {
     266       39732 :     for (int z = 0; z < cube->size[0]; z++) {
     267      983748 :       for (int y = 0; y < cube->size[1]; y++) {
     268      946350 :         double *restrict yx = &idx3(Exp[0], 2, y, 0);
     269      946350 :         double *restrict dst = &idx3(cube[0], z, y, 0);
     270      946350 :         GRID_PRAGMA_SIMD((dst, yx), 8)
     271      946350 :         for (int x = 0; x < cube->size[2]; x++) {
     272    24745974 :           dst[x] *= yx[x];
     273             :         }
     274             :       }
     275             :     }
     276             :     return;
     277             :   }
     278             : 
     279             :   /* k and i are orhogonal, i and j as well */
     280       20163 :   if (plane[0] && plane[2]) {
     281           0 :     for (int z = 0; z < cube->size[0]; z++) {
     282           0 :       for (int y = 0; y < cube->size[1]; y++) {
     283           0 :         const double zy = idx3(Exp[0], 1, z, y);
     284           0 :         double *restrict dst = &idx3(cube[0], z, y, 0);
     285             : 
     286             :         //#pragma omp simd linear(dst) simdlen(8)
     287           0 :         GRID_PRAGMA_SIMD((dst), 8)
     288           0 :         for (int x = 0; x < cube->size[2]; x++) {
     289           0 :           dst[x] *= zy;
     290             :         }
     291             :       }
     292             :     }
     293             :     return;
     294             :   }
     295             : 
     296             :   /* j, k are orthognal, i and j are orthognal */
     297       20163 :   if (plane[1] && plane[2]) {
     298      508286 :     for (int z = 0; z < cube->size[0]; z++) {
     299      488123 :       double *restrict zx = &idx3(Exp[0], 0, z, 0);
     300    13139192 :       for (int y = 0; y < cube->size[1]; y++) {
     301    12651069 :         double *restrict dst = &idx3(cube[0], z, y, 0);
     302             :         //#pragma omp simd linear(dst, zx) simdlen(8)
     303    12651069 :         GRID_PRAGMA_SIMD((dst, zx), 8)
     304    12651069 :         for (int x = 0; x < cube->size[2]; x++) {
     305   358329545 :           dst[x] *= zx[x];
     306             :         }
     307             :       }
     308             :     }
     309             :     return;
     310             :   }
     311             : 
     312           0 :   if (plane[0]) {
     313             :     // z perpendicular to x. but y non perpendicular to any
     314           0 :     for (int z = 0; z < cube->size[0]; z++) {
     315           0 :       for (int y = 0; y < cube->size[1]; y++) {
     316           0 :         const double zy = idx3(Exp[0], 1, z, y);
     317           0 :         double *restrict yx = &idx3(Exp[0], 2, y, 0);
     318           0 :         double *restrict dst = &idx3(cube[0], z, y, 0);
     319             : 
     320             :         //#pragma omp simd linear(dst, yx) simdlen(8)
     321           0 :         GRID_PRAGMA_SIMD((dst, yx), 8)
     322           0 :         for (int x = 0; x < cube->size[2]; x++) {
     323           0 :           dst[x] *= zy * yx[x];
     324             :         }
     325             :       }
     326             :     }
     327             :     return;
     328             :   }
     329             : 
     330           0 :   if (plane[1]) {
     331             :     // z perpendicular to y, but x and z are not and y and x neither
     332           0 :     for (int z = 0; z < cube->size[0]; z++) {
     333           0 :       double *restrict zx = &idx3(Exp[0], 0, z, 0);
     334           0 :       for (int y = 0; y < cube->size[1]; y++) {
     335           0 :         double *restrict yx = &idx3(Exp[0], 2, y, 0);
     336           0 :         double *restrict dst = &idx3(cube[0], z, y, 0);
     337             :         //#pragma omp simd linear(dst, yx) simdlen(8)
     338           0 :         GRID_PRAGMA_SIMD((dst, yx), 8)
     339           0 :         for (int x = 0; x < cube->size[2]; x++) {
     340           0 :           dst[x] *= zx[x] * yx[x];
     341             :         }
     342             :       }
     343             :     }
     344             :     return;
     345             :   }
     346             : 
     347           0 :   if (plane[2]) {
     348             :     // x perpendicular to y, but x and z are not and y and z neither
     349           0 :     for (int z = 0; z < cube->size[0]; z++) {
     350           0 :       double *restrict zx = &idx3(Exp[0], 0, z, 0);
     351           0 :       for (int y = 0; y < cube->size[1]; y++) {
     352           0 :         const double zy = idx3(Exp[0], 1, z, y);
     353           0 :         double *restrict dst = &idx3(cube[0], z, y, 0);
     354             : 
     355             :         //#pragma omp simd linear(dst) simdlen(8)
     356           0 :         GRID_PRAGMA_SIMD((dst), 8)
     357           0 :         for (int x = 0; x < cube->size[2]; x++) {
     358           0 :           dst[x] *= zx[x] * zy;
     359             :         }
     360             :       }
     361             :     }
     362             :     return;
     363             :   }
     364             : 
     365             :   /* generic  case */
     366             : 
     367           0 :   for (int z = 0; z < cube->size[0]; z++) {
     368           0 :     double *restrict zx = &idx3(Exp[0], 0, z, 0);
     369           0 :     for (int y = 0; y < cube->size[1]; y++) {
     370           0 :       const double zy = idx3(Exp[0], 1, z, y);
     371           0 :       const double *restrict yx = &idx3(Exp[0], 2, y, 0);
     372           0 :       double *restrict dst = &idx3(cube[0], z, y, 0);
     373             : 
     374             :       //#pragma omp simd linear(dst, zx, yx) simdlen(8)
     375           0 :       GRID_PRAGMA_SIMD((dst, zx), 8)
     376           0 :       for (int x = 0; x < cube->size[2]; x++) {
     377           0 :         dst[x] *= zx[x] * zy * yx[x];
     378             :       }
     379             :     }
     380             :   }
     381             :   return;
     382             : }
     383             : 
     384        3112 : void apply_non_orthorombic_corrections_xy_blocked(
     385             :     const struct tensor_ *const Exp, struct tensor_ *const m) {
     386       12448 :   for (int gamma = 0; gamma < m->size[0]; gamma++) {
     387      236688 :     for (int y1 = 0; y1 < m->size[1]; y1++) {
     388      227352 :       double *restrict dst = &idx3(m[0], gamma, y1, 0);
     389      227352 :       double *restrict src = &idx2(Exp[0], y1, 0);
     390             : 
     391             :       //#pragma omp simd linear(dst, src) simdlen(8)
     392      227352 :       GRID_PRAGMA_SIMD((dst, src), 8)
     393      227352 :       for (int x1 = 0; x1 < m->size[2]; x1++) {
     394     5777400 :         dst[x1] *= src[x1];
     395             :       }
     396             :     }
     397             :   }
     398        3112 : }
     399             : 
     400       20163 : void apply_non_orthorombic_corrections_xz_blocked(
     401             :     const struct tensor_ *const Exp, struct tensor_ *const m) {
     402      508286 :   for (int z1 = 0; z1 < m->size[0]; z1++) {
     403      488123 :     double *restrict src = &idx2(Exp[0], z1, 0);
     404    13139192 :     for (int y1 = 0; y1 < m->size[1]; y1++) {
     405    12651069 :       double *restrict dst = &idx3(m[0], z1, y1, 0);
     406             :       //#pragma omp simd linear(dst, src) simdlen(8)
     407    12651069 :       GRID_PRAGMA_SIMD((dst, src), 8)
     408    12651069 :       for (int x1 = 0; x1 < m->size[2]; x1++) {
     409   358329545 :         dst[x1] *= src[x1];
     410             :       }
     411             :     }
     412             :   }
     413       20163 : }
     414             : 
     415           0 : void apply_non_orthorombic_corrections_yz_blocked(
     416             :     const struct tensor_ *const Exp, struct tensor_ *const m) {
     417           0 :   for (int z1 = 0; z1 < m->size[0]; z1++) {
     418           0 :     for (int y1 = 0; y1 < m->size[1]; y1++) {
     419           0 :       const double src = idx2(Exp[0], z1, y1);
     420           0 :       double *restrict dst = &idx3(m[0], z1, y1, 0);
     421             :       //#pragma omp simd linear(dst) simdlen(8)
     422           0 :       GRID_PRAGMA_SIMD((dst), 8)
     423           0 :       for (int x1 = 0; x1 < m->size[2]; x1++) {
     424           0 :         dst[x1] *= src;
     425             :       }
     426             :     }
     427             :   }
     428           0 : }
     429             : 
     430           0 : void apply_non_orthorombic_corrections_xz_yz_blocked(
     431             :     const struct tensor_ *const Exp_xz, const struct tensor_ *const Exp_yz,
     432             :     struct tensor_ *const m) {
     433           0 :   for (int z1 = 0; z1 < m->size[0]; z1++) {
     434           0 :     const double *restrict src_xz = &idx2(Exp_xz[0], z1, 0);
     435           0 :     for (int y1 = 0; y1 < m->size[1]; y1++) {
     436           0 :       const double src = idx2(Exp_yz[0], z1, y1);
     437           0 :       double *restrict dst = &idx3(m[0], z1, y1, 0);
     438             :       //#pragma omp simd linear(dst) simdlen(8)
     439           0 :       GRID_PRAGMA_SIMD((dst), 8)
     440           0 :       for (int x1 = 0; x1 < m->size[2]; x1++) {
     441           0 :         dst[x1] *= src * src_xz[x1];
     442             :       }
     443             :     }
     444             :   }
     445           0 : }

Generated by: LCOV version 1.15