Line data Source code
1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 :
8 : #include <assert.h>
9 : #include <math.h>
10 : #include <omp.h>
11 : #include <stdint.h>
12 : #include <stdio.h>
13 : #include <stdlib.h>
14 : #include <string.h>
15 :
16 : #include "../common/grid_common.h"
17 : #include "grid_ref_collocate.h"
18 : #include "grid_ref_integrate.h"
19 : #include "grid_ref_task_list.h"
20 : #include "grid_ref_task_list_internal.h"
21 :
22 : /*******************************************************************************
23 : * \brief Comperator passed to qsort to compare two tasks.
24 : * \author Ole Schuett
25 : ******************************************************************************/
26 66765726 : static int compare_tasks(const void *a, const void *b) {
27 66765726 : const grid_ref_task *task_a = a, *task_b = b;
28 66765726 : if (task_a->level != task_b->level) {
29 4767882 : return task_a->level - task_b->level;
30 61997844 : } else if (task_a->block_num != task_b->block_num) {
31 34303513 : return task_a->block_num - task_b->block_num;
32 27694331 : } else if (task_a->iset != task_b->iset) {
33 3235392 : return task_a->iset - task_b->iset;
34 : } else {
35 24458939 : return task_a->jset - task_b->jset;
36 : }
37 : }
38 : /*******************************************************************************
39 : * \brief Allocates a task list for the reference backend.
40 : * See grid_task_list.h for details.
41 : * \author Ole Schuett
42 : ******************************************************************************/
43 15913 : void grid_ref_create_task_list(
44 : const bool orthorhombic, const int ntasks, const int nlevels,
45 : const int natoms, const int nkinds, const int nblocks,
46 : const int block_offsets[nblocks], const double atom_positions[natoms][3],
47 : const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds],
48 : const int level_list[ntasks], const int iatom_list[ntasks],
49 : const int jatom_list[ntasks], const int iset_list[ntasks],
50 : const int jset_list[ntasks], const int ipgf_list[ntasks],
51 : const int jpgf_list[ntasks], const int border_mask_list[ntasks],
52 : const int block_num_list[ntasks], const double radius_list[ntasks],
53 : const double rab_list[ntasks][3], const int npts_global[nlevels][3],
54 : const int npts_local[nlevels][3], const int shift_local[nlevels][3],
55 : const int border_width[nlevels][3], const double dh[nlevels][3][3],
56 : const double dh_inv[nlevels][3][3], grid_ref_task_list **task_list_out) {
57 :
58 15913 : if (*task_list_out != NULL) {
59 : // This is actually an opportunity to reuse some buffers.
60 6312 : grid_ref_free_task_list(*task_list_out);
61 : }
62 :
63 15913 : grid_ref_task_list_internal *task_list =
64 15913 : malloc(sizeof(grid_ref_task_list_internal));
65 15913 : assert(task_list != NULL);
66 :
67 15913 : task_list->orthorhombic = orthorhombic;
68 15913 : task_list->ntasks = ntasks;
69 15913 : task_list->nlevels = nlevels;
70 15913 : task_list->natoms = natoms;
71 15913 : task_list->nkinds = nkinds;
72 15913 : task_list->nblocks = nblocks;
73 :
74 15913 : size_t size = nblocks * sizeof(int);
75 15913 : task_list->block_offsets = malloc(size);
76 15913 : assert(task_list->block_offsets != NULL || size == 0);
77 15913 : if (size != 0) {
78 15913 : memcpy(task_list->block_offsets, block_offsets, size);
79 : }
80 :
81 15913 : size = 3 * natoms * sizeof(double);
82 15913 : task_list->atom_positions = malloc(size);
83 15913 : assert(task_list->atom_positions != NULL || size == 0);
84 15913 : if (size != 0) {
85 15913 : memcpy(task_list->atom_positions, atom_positions, size);
86 : }
87 :
88 15913 : size = natoms * sizeof(int);
89 15913 : task_list->atom_kinds = malloc(size);
90 15913 : assert(task_list->atom_kinds != NULL || size == 0);
91 15913 : if (size != 0) {
92 15913 : memcpy(task_list->atom_kinds, atom_kinds, size);
93 : }
94 :
95 15913 : size = nkinds * sizeof(grid_basis_set *);
96 15913 : task_list->basis_sets = malloc(size);
97 15913 : assert(task_list->basis_sets != NULL || size == 0);
98 15913 : if (size != 0) {
99 15913 : memcpy(task_list->basis_sets, basis_sets, size);
100 : }
101 :
102 15913 : size = ntasks * sizeof(grid_ref_task);
103 15913 : task_list->tasks = malloc(size);
104 15913 : assert(task_list->tasks != NULL || size == 0);
105 15913 : #pragma omp parallel for schedule(static) if (ntasks > GRID_OMP_MIN_ITERATIONS)
106 : for (int i = 0; i < ntasks; i++) {
107 : task_list->tasks[i].level = level_list[i];
108 : task_list->tasks[i].iatom = iatom_list[i];
109 : task_list->tasks[i].jatom = jatom_list[i];
110 : task_list->tasks[i].iset = iset_list[i];
111 : task_list->tasks[i].jset = jset_list[i];
112 : task_list->tasks[i].ipgf = ipgf_list[i];
113 : task_list->tasks[i].jpgf = jpgf_list[i];
114 : task_list->tasks[i].border_mask = border_mask_list[i];
115 : task_list->tasks[i].block_num = block_num_list[i];
116 : task_list->tasks[i].radius = radius_list[i];
117 : task_list->tasks[i].rab[0] = rab_list[i][0];
118 : task_list->tasks[i].rab[1] = rab_list[i][1];
119 : task_list->tasks[i].rab[2] = rab_list[i][2];
120 : }
121 :
122 : // Store grid layouts.
123 15913 : size = nlevels * sizeof(grid_ref_layout);
124 15913 : task_list->layouts = malloc(size);
125 15913 : assert(task_list->layouts != NULL || size == 0);
126 78826 : for (int level = 0; level < nlevels; level++) {
127 251652 : for (int i = 0; i < 3; i++) {
128 188739 : task_list->layouts[level].npts_global[i] = npts_global[level][i];
129 188739 : task_list->layouts[level].npts_local[i] = npts_local[level][i];
130 188739 : task_list->layouts[level].shift_local[i] = shift_local[level][i];
131 188739 : task_list->layouts[level].border_width[i] = border_width[level][i];
132 754956 : for (int j = 0; j < 3; j++) {
133 566217 : task_list->layouts[level].dh[i][j] = dh[level][i][j];
134 566217 : task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
135 : }
136 : }
137 : }
138 :
139 : // Sort tasks by level, block_num, iset, and jset.
140 15913 : qsort(task_list->tasks, ntasks, sizeof(grid_ref_task), &compare_tasks);
141 :
142 : // Find first and last task for each level and block.
143 15913 : size = nlevels * nblocks * sizeof(int);
144 15913 : task_list->first_level_block_task = malloc(size);
145 15913 : assert(task_list->first_level_block_task != NULL || size == 0);
146 15913 : task_list->last_level_block_task = malloc(size);
147 15913 : assert(task_list->last_level_block_task != NULL || size == 0);
148 15913 : const int nlevel_blocks = nlevels * nblocks;
149 15913 : #pragma omp parallel for schedule(static) if (nlevel_blocks > \
150 : GRID_OMP_MIN_ITERATIONS)
151 : for (int i = 0; i < nlevel_blocks; i++) {
152 : task_list->first_level_block_task[i] = 0;
153 : task_list->last_level_block_task[i] = -1; // last < first means no tasks
154 : }
155 9913757 : for (int itask = 0; itask < ntasks; itask++) {
156 9897844 : const int level = task_list->tasks[itask].level - 1;
157 9897844 : const int block_num = task_list->tasks[itask].block_num - 1;
158 9897844 : if (itask == 0 || task_list->tasks[itask - 1].level - 1 != level ||
159 9853620 : task_list->tasks[itask - 1].block_num - 1 != block_num) {
160 1121368 : task_list->first_level_block_task[level * nblocks + block_num] = itask;
161 : }
162 9897844 : task_list->last_level_block_task[level * nblocks + block_num] = itask;
163 : }
164 :
165 : // Find largest Cartesian subblock size.
166 15913 : task_list->maxco = 0;
167 44375 : for (int i = 0; i < nkinds; i++) {
168 28462 : task_list->maxco = imax(task_list->maxco, task_list->basis_sets[i]->maxco);
169 : }
170 :
171 : // Initialize thread-local storage.
172 15913 : size = omp_get_max_threads() * sizeof(double *);
173 15913 : task_list->threadlocals = malloc(size);
174 15913 : assert(task_list->threadlocals != NULL);
175 15913 : memset(task_list->threadlocals, 0, size);
176 15913 : size = omp_get_max_threads() * sizeof(size_t);
177 15913 : task_list->threadlocal_sizes = malloc(size);
178 15913 : assert(task_list->threadlocal_sizes != NULL);
179 15913 : memset(task_list->threadlocal_sizes, 0, size);
180 :
181 15913 : *task_list_out = task_list;
182 15913 : }
183 :
184 : /*******************************************************************************
185 : * \brief Deallocates given task list, basis_sets have to be freed separately.
186 : * \author Ole Schuett
187 : ******************************************************************************/
188 15913 : void grid_ref_free_task_list(grid_ref_task_list *ptr) {
189 15913 : if (ptr == NULL)
190 : return;
191 :
192 15913 : grid_ref_task_list_internal *task_list = (grid_ref_task_list_internal *)ptr;
193 :
194 15913 : free(task_list->block_offsets);
195 15913 : free(task_list->atom_positions);
196 15913 : free(task_list->atom_kinds);
197 15913 : free(task_list->basis_sets);
198 15913 : free(task_list->tasks);
199 15913 : free(task_list->layouts);
200 15913 : free(task_list->first_level_block_task);
201 15913 : free(task_list->last_level_block_task);
202 31826 : for (int i = 0; i < omp_get_max_threads(); i++) {
203 15913 : if (task_list->threadlocals[i] != NULL) {
204 6 : free(task_list->threadlocals[i]);
205 : }
206 : }
207 15913 : free(task_list->threadlocals);
208 15913 : free(task_list->threadlocal_sizes);
209 15913 : free(task_list);
210 : }
211 :
212 : /*******************************************************************************
213 : * \brief Prototype for BLAS dgemm.
214 : * \author Ole Schuett
215 : ******************************************************************************/
216 : void dgemm_(const char *transa, const char *transb, const int *m, const int *n,
217 : const int *k, const double *alpha, const double *a, const int *lda,
218 : const double *b, const int *ldb, const double *beta, double *c,
219 : const int *ldc);
220 :
221 : /*******************************************************************************
222 : * \brief Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
223 : * \author Ole Schuett
224 : ******************************************************************************/
225 1928 : static void dgemm(const char transa, const char transb, const int m,
226 : const int n, const int k, const double alpha, const double *a,
227 : const int lda, const double *b, const int ldb,
228 : const double beta, double *c, const int ldc) {
229 1928 : dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
230 : &ldc);
231 1928 : }
232 :
233 : /*******************************************************************************
234 : * \brief Transforms pab from contracted spherical to prim. cartesian basis.
235 : * \author Ole Schuett
236 : ******************************************************************************/
237 580 : static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis,
238 : const int iset, const int jset, const bool transpose,
239 580 : const double *block, double *pab) {
240 :
241 : // Define some more convenient aliases.
242 580 : const int ncoseta = ncoset(ibasis->lmax[iset]);
243 580 : const int ncosetb = ncoset(jbasis->lmax[jset]);
244 580 : const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
245 580 : const int ncob = jbasis->npgf[jset] * ncosetb;
246 :
247 580 : const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
248 580 : const int nsgf_setb = jbasis->nsgf_set[jset];
249 580 : const int nsgfa = ibasis->nsgf; // size of entire spherical basis
250 580 : const int nsgfb = jbasis->nsgf;
251 580 : const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
252 580 : const int sgfb = jbasis->first_sgf[jset] - 1;
253 580 : const int maxcoa = ibasis->maxco;
254 580 : const int maxcob = jbasis->maxco;
255 :
256 580 : double work[nsgf_setb * ncoa];
257 580 : if (transpose) {
258 : // work[nsgf_setb][ncoa] = MATMUL(subblock, ibasis->sphi)
259 412 : dgemm('N', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
260 412 : &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->sphi[sgfa * maxcoa],
261 : maxcoa, 0.0, work, ncoa);
262 : } else {
263 : // work[nsgf_setb][ncoa] = MATMUL(TRANSPOSE(subblock), ibasis->sphi)
264 168 : dgemm('T', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
265 168 : &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->sphi[sgfa * maxcoa],
266 : maxcoa, 0.0, work, ncoa);
267 : }
268 : // pab[ncob][ncoa] = MATMUL(TRANSPOSE(jbasis->sphi), work)
269 580 : dgemm('T', 'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->sphi[sgfb * maxcob],
270 : maxcob, work, ncoa, 0.0, pab, ncoa);
271 580 : }
272 :
273 : /*******************************************************************************
274 : * \brief Collocate a range of tasks which are destined for the same grid level.
275 : * \author Ole Schuett
276 : ******************************************************************************/
277 168 : static void collocate_one_grid_level(
278 : const grid_ref_task_list *ptr, const int *first_block_task,
279 : const int *last_block_task, const enum grid_func func,
280 : const int npts_global[3], const int npts_local[3], const int shift_local[3],
281 : const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
282 : const double *pab_blocks, offload_buffer *grid) {
283 :
284 168 : if (ptr == NULL)
285 : return;
286 :
287 168 : grid_ref_task_list_internal *task_list = (grid_ref_task_list_internal *)ptr;
288 :
289 : // Using default(shared) because with GCC 9 the behavior around const changed:
290 : // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
291 168 : #pragma omp parallel default(shared)
292 : {
293 : const int ithread = omp_get_thread_num();
294 : const int nthreads = omp_get_num_threads();
295 :
296 : // Initialize variables to detect when a new subblock has to be fetched.
297 : int old_offset = -1, old_iset = -1, old_jset = -1;
298 :
299 : // Matrix pab is re-used across tasks.
300 : double pab[imax(task_list->maxco * task_list->maxco, 1)];
301 :
302 : // Ensure that grid can fit into thread-local storage, reallocate if needed.
303 : const int npts_local_total = npts_local[0] * npts_local[1] * npts_local[2];
304 : const size_t grid_size = npts_local_total * sizeof(double);
305 : if (task_list->threadlocal_sizes[ithread] < grid_size) {
306 : if (task_list->threadlocals[ithread] != NULL) {
307 : free(task_list->threadlocals[ithread]);
308 : }
309 : task_list->threadlocals[ithread] = malloc(grid_size);
310 : assert(task_list->threadlocals[ithread] != NULL);
311 : task_list->threadlocal_sizes[ithread] = grid_size;
312 : }
313 :
314 : // Zero thread-local copy of the grid.
315 : double *const my_grid = task_list->threadlocals[ithread];
316 : memset(my_grid, 0, grid_size);
317 :
318 : // Parallelize over blocks to avoid unnecessary calls to load_pab.
319 : const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
320 : #pragma omp for schedule(dynamic, chunk_size)
321 : for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
322 : const int first_task = first_block_task[block_num];
323 : const int last_task = last_block_task[block_num];
324 :
325 : for (int itask = first_task; itask <= last_task; itask++) {
326 : // Define some convenient aliases.
327 : const grid_ref_task *task = &task_list->tasks[itask];
328 : const int iatom = task->iatom - 1;
329 : const int jatom = task->jatom - 1;
330 : const int iset = task->iset - 1;
331 : const int jset = task->jset - 1;
332 : const int ipgf = task->ipgf - 1;
333 : const int jpgf = task->jpgf - 1;
334 : const int ikind = task_list->atom_kinds[iatom] - 1;
335 : const int jkind = task_list->atom_kinds[jatom] - 1;
336 : const grid_basis_set *ibasis = task_list->basis_sets[ikind];
337 : const grid_basis_set *jbasis = task_list->basis_sets[jkind];
338 : const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
339 : const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
340 : const int ncoseta = ncoset(ibasis->lmax[iset]);
341 : const int ncosetb = ncoset(jbasis->lmax[jset]);
342 : const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
343 : const int ncob = jbasis->npgf[jset] * ncosetb;
344 : const int block_num = task->block_num - 1;
345 : const int block_offset = task_list->block_offsets[block_num];
346 : const double *block = &pab_blocks[block_offset];
347 : const bool transpose = (iatom <= jatom);
348 :
349 : // Load subblock from buffer and decontract into Cartesian sublock pab.
350 : // The previous pab can be reused when only ipgf or jpgf has changed.
351 : if (block_offset != old_offset || iset != old_iset ||
352 : jset != old_jset) {
353 : old_offset = block_offset;
354 : old_iset = iset;
355 : old_jset = jset;
356 : load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
357 : }
358 :
359 : grid_ref_collocate_pgf_product(
360 : /*orthorhombic=*/task_list->orthorhombic,
361 : /*border_mask=*/task->border_mask,
362 : /*func=*/func,
363 : /*la_max=*/ibasis->lmax[iset],
364 : /*la_min=*/ibasis->lmin[iset],
365 : /*lb_max=*/jbasis->lmax[jset],
366 : /*lb_min=*/jbasis->lmin[jset],
367 : /*zeta=*/zeta,
368 : /*zetb=*/zetb,
369 : /*rscale=*/(iatom == jatom) ? 1 : 2,
370 : /*dh=*/dh,
371 : /*dh_inv=*/dh_inv,
372 : /*ra=*/&task_list->atom_positions[3 * iatom],
373 : /*rab=*/task->rab,
374 : /*npts_global=*/npts_global,
375 : /*npts_local=*/npts_local,
376 : /*shift_local=*/shift_local,
377 : /*border_width=*/border_width,
378 : /*radius=*/task->radius,
379 : /*o1=*/ipgf * ncoseta,
380 : /*o2=*/jpgf * ncosetb,
381 : /*n1=*/ncoa,
382 : /*n2=*/ncob,
383 : /*pab=*/(const double(*)[ncoa])pab,
384 : /*grid=*/my_grid);
385 :
386 : } // end of task loop
387 : } // end of block loop
388 :
389 : // While there should be an implicit barrier at the end of the block loop, this
390 : // explicit barrier eliminates occasional seg faults with icc compiled binaries.
391 : #pragma omp barrier
392 :
393 : // Merge thread-local grids via an efficient tree reduction.
394 : const int nreduction_cycles = ceil(log(nthreads) / log(2)); // tree depth
395 : for (int icycle = 1; icycle <= nreduction_cycles; icycle++) {
396 : // Threads are divided into groups, whose size doubles with each cycle.
397 : // After a cycle the reduced data is stored at first thread of each group.
398 : const int group_size = 1 << icycle; // 2**icycle
399 : const int igroup = ithread / group_size;
400 : const int dest_thread = igroup * group_size;
401 : const int src_thread = dest_thread + group_size / 2;
402 : // The last group might actually be a bit smaller.
403 : const int actual_group_size = imin(group_size, nthreads - dest_thread);
404 : // Parallelize summation by dividing grid points across group members.
405 : const int rank = modulo(ithread, group_size); // position within the group
406 : const int lb = (npts_local_total * rank) / actual_group_size;
407 : const int ub = (npts_local_total * (rank + 1)) / actual_group_size;
408 : if (src_thread < nthreads) {
409 : GRID_PRAGMA_SIMD_LOOP
410 : for (int i = lb; i < ub; i++) {
411 : task_list->threadlocals[dest_thread][i] +=
412 : task_list->threadlocals[src_thread][i];
413 : }
414 : }
415 : #pragma omp barrier
416 : }
417 :
418 : // Copy final result from first thread into shared grid.
419 : const int lb = (npts_local_total * ithread) / nthreads;
420 : const int ub = (npts_local_total * (ithread + 1)) / nthreads;
421 : GRID_PRAGMA_SIMD_LOOP
422 : for (int i = lb; i < ub; i++) {
423 : grid->host_buffer[i] = task_list->threadlocals[0][i];
424 : }
425 :
426 : } // end of omp parallel region
427 : }
428 :
429 : /*******************************************************************************
430 : * \brief Collocate all tasks of in given list onto given grids.
431 : * See grid_task_list.h for details.
432 : * \author Ole Schuett
433 : ******************************************************************************/
434 42 : void grid_ref_collocate_task_list(const grid_ref_task_list *ptr,
435 : const enum grid_func func, const int nlevels,
436 : const offload_buffer *pab_blocks,
437 : offload_buffer *grids[nlevels]) {
438 :
439 42 : if (ptr == NULL)
440 : return;
441 :
442 42 : grid_ref_task_list_internal *task_list = (grid_ref_task_list_internal *)ptr;
443 :
444 42 : assert(task_list->nlevels == nlevels);
445 :
446 210 : for (int level = 0; level < task_list->nlevels; level++) {
447 168 : const int idx = level * task_list->nblocks;
448 168 : const int *first_block_task = &task_list->first_level_block_task[idx];
449 168 : const int *last_block_task = &task_list->last_level_block_task[idx];
450 168 : const grid_ref_layout *layout = &task_list->layouts[level];
451 168 : collocate_one_grid_level(
452 168 : task_list, first_block_task, last_block_task, func, layout->npts_global,
453 168 : layout->npts_local, layout->shift_local, layout->border_width,
454 168 : layout->dh, layout->dh_inv, pab_blocks->host_buffer, grids[level]);
455 : }
456 : }
457 :
458 : /*******************************************************************************
459 : * \brief Transforms hab from prim. cartesian to contracted spherical basis.
460 : * \author Ole Schuett
461 : ******************************************************************************/
462 384 : static inline void store_hab(const grid_basis_set *ibasis,
463 : const grid_basis_set *jbasis, const int iset,
464 : const int jset, const bool transpose,
465 384 : const double *hab, double *block) {
466 :
467 : // Define some more convenient aliases.
468 384 : const int ncoseta = ncoset(ibasis->lmax[iset]);
469 384 : const int ncosetb = ncoset(jbasis->lmax[jset]);
470 384 : const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
471 384 : const int ncob = jbasis->npgf[jset] * ncosetb;
472 :
473 384 : const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
474 384 : const int nsgf_setb = jbasis->nsgf_set[jset];
475 384 : const int nsgfa = ibasis->nsgf; // size of entire spherical basis
476 384 : const int nsgfb = jbasis->nsgf;
477 384 : const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
478 384 : const int sgfb = jbasis->first_sgf[jset] - 1;
479 384 : const int maxcoa = ibasis->maxco;
480 384 : const int maxcob = jbasis->maxco;
481 :
482 384 : double work[nsgf_setb * ncoa];
483 :
484 : // work[nsgf_setb][ncoa] = MATMUL(jbasis->sphi, hab)
485 384 : dgemm('N', 'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->sphi[sgfb * maxcob],
486 : maxcob, hab, ncoa, 0.0, work, ncoa);
487 :
488 384 : if (transpose) {
489 : // subblock[nsgf_setb][nsgf_seta] += MATMUL(work, TRANSPOSE(ibasis->sphi))
490 288 : dgemm('N', 'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
491 288 : &ibasis->sphi[sgfa * maxcoa], maxcoa, 1.0,
492 288 : &block[sgfb * nsgfa + sgfa], nsgfa);
493 : } else {
494 : // subblock[nsgf_seta][nsgf_setb] += MATMUL(ibasis->sphi, TRANSPOSE(work))
495 96 : dgemm('N', 'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
496 96 : &ibasis->sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
497 96 : &block[sgfa * nsgfb + sgfb], nsgfb);
498 : }
499 384 : }
500 :
501 : /*******************************************************************************
502 : * \brief Integrate a range of tasks that belong to the same grid level.
503 : * \author Ole Schuett
504 : ******************************************************************************/
505 136 : static void integrate_one_grid_level(
506 : const grid_ref_task_list *ptr, const int *first_block_task,
507 : const int *last_block_task, const bool compute_tau, const int natoms,
508 : const int npts_global[3], const int npts_local[3], const int shift_local[3],
509 : const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
510 : const offload_buffer *pab_blocks, const offload_buffer *grid,
511 : offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
512 :
513 136 : if (ptr == NULL)
514 : return;
515 :
516 136 : grid_ref_task_list_internal *task_list = (grid_ref_task_list_internal *)ptr;
517 :
518 : // Using default(shared) because with GCC 9 the behavior around const changed:
519 : // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
520 136 : #pragma omp parallel default(shared)
521 : {
522 : // Initialize variables to detect when a new subblock has to be fetched.
523 : int old_offset = -1, old_iset = -1, old_jset = -1;
524 : grid_basis_set *old_ibasis = NULL, *old_jbasis = NULL;
525 : bool old_transpose = false;
526 :
527 : // Matrix pab and hab are re-used across tasks.
528 : double pab[imax(task_list->maxco * task_list->maxco, 1)];
529 : double hab[imax(task_list->maxco * task_list->maxco, 1)];
530 :
531 : // Parallelize over blocks to avoid concurred access to hab_blocks.
532 : const int nthreads = omp_get_num_threads();
533 : const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
534 : #pragma omp for schedule(dynamic, chunk_size)
535 : for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
536 : const int first_task = first_block_task[block_num];
537 : const int last_task = last_block_task[block_num];
538 :
539 : // Accumulate forces per block as it corresponds to a pair of atoms.
540 : const int iatom = task_list->tasks[first_task].iatom - 1;
541 : const int jatom = task_list->tasks[first_task].jatom - 1;
542 : double my_forces[2][3] = {0};
543 : double my_virials[2][3][3] = {0};
544 :
545 : for (int itask = first_task; itask <= last_task; itask++) {
546 : // Define some convenient aliases.
547 : const grid_ref_task *task = &task_list->tasks[itask];
548 : assert(task->block_num - 1 == block_num);
549 : assert(task->iatom - 1 == iatom && task->jatom - 1 == jatom);
550 : const int ikind = task_list->atom_kinds[iatom] - 1;
551 : const int jkind = task_list->atom_kinds[jatom] - 1;
552 : grid_basis_set *ibasis = task_list->basis_sets[ikind];
553 : grid_basis_set *jbasis = task_list->basis_sets[jkind];
554 : const int iset = task->iset - 1;
555 : const int jset = task->jset - 1;
556 : const int ipgf = task->ipgf - 1;
557 : const int jpgf = task->jpgf - 1;
558 : const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
559 : const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
560 : const int ncoseta = ncoset(ibasis->lmax[iset]);
561 : const int ncosetb = ncoset(jbasis->lmax[jset]);
562 : const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
563 : const int ncob = jbasis->npgf[jset] * ncosetb;
564 : const int block_offset = task_list->block_offsets[block_num];
565 : const bool transpose = (iatom <= jatom);
566 : const bool pab_required = (forces != NULL || virial != NULL);
567 :
568 : // Load pab and store hab subblocks when needed.
569 : // Previous hab and pab can be reused when only ipgf or jpgf changed.
570 : if (block_offset != old_offset || iset != old_iset ||
571 : jset != old_jset) {
572 : if (pab_required) {
573 : load_pab(ibasis, jbasis, iset, jset, transpose,
574 : &pab_blocks->host_buffer[block_offset], pab);
575 : }
576 : if (old_offset >= 0) { // skip first iteration
577 : store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
578 : hab, &hab_blocks->host_buffer[old_offset]);
579 : }
580 : memset(hab, 0, ncoa * ncob * sizeof(double));
581 : old_offset = block_offset;
582 : old_iset = iset;
583 : old_jset = jset;
584 : old_ibasis = ibasis;
585 : old_jbasis = jbasis;
586 : old_transpose = transpose;
587 : }
588 :
589 : grid_ref_integrate_pgf_product(
590 : /*orthorhombic=*/task_list->orthorhombic,
591 : /*compute_tau=*/compute_tau,
592 : /*border_mask=*/task->border_mask,
593 : /*la_max=*/ibasis->lmax[iset],
594 : /*la_min=*/ibasis->lmin[iset],
595 : /*lb_max=*/jbasis->lmax[jset],
596 : /*lb_min=*/jbasis->lmin[jset],
597 : /*zeta=*/zeta,
598 : /*zetb=*/zetb,
599 : /*dh=*/dh,
600 : /*dh_inv=*/dh_inv,
601 : /*ra=*/&task_list->atom_positions[3 * iatom],
602 : /*rab=*/task->rab,
603 : /*npts_global=*/npts_global,
604 : /*npts_local=*/npts_local,
605 : /*shift_local=*/shift_local,
606 : /*border_width=*/border_width,
607 : /*radius=*/task->radius,
608 : /*o1=*/ipgf * ncoseta,
609 : /*o2=*/jpgf * ncosetb,
610 : /*n1=*/ncoa,
611 : /*n2=*/ncob,
612 : /*grid=*/grid->host_buffer,
613 : /*hab=*/(double(*)[ncoa])hab,
614 : /*pab=*/(pab_required) ? (const double(*)[ncoa])pab : NULL,
615 : /*forces=*/(forces != NULL) ? my_forces : NULL,
616 : /*virials=*/(virial != NULL) ? my_virials : NULL,
617 : /*hdab=*/NULL,
618 : /*hadb=*/NULL,
619 : /*a_hdab=*/NULL);
620 :
621 : } // end of task loop
622 :
623 : // Merge thread-local forces and virial into shared ones.
624 : // It does not seem worth the trouble to accumulate them thread-locally.
625 : const double scalef = (iatom == jatom) ? 1.0 : 2.0;
626 : if (forces != NULL) {
627 : #pragma omp critical(forces)
628 : for (int i = 0; i < 3; i++) {
629 : forces[iatom][i] += scalef * my_forces[0][i];
630 : forces[jatom][i] += scalef * my_forces[1][i];
631 : }
632 : }
633 : if (virial != NULL) {
634 : #pragma omp critical(virial)
635 : for (int i = 0; i < 3; i++) {
636 : for (int j = 0; j < 3; j++) {
637 : virial[i][j] += scalef * my_virials[0][i][j];
638 : virial[i][j] += scalef * my_virials[1][i][j];
639 : }
640 : }
641 : }
642 :
643 : } // end of block loop
644 :
645 : // store final hab
646 : if (old_offset >= 0) {
647 : store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
648 : &hab_blocks->host_buffer[old_offset]);
649 : }
650 :
651 : } // end of omp parallel region
652 : }
653 :
654 : /*******************************************************************************
655 : * \brief Integrate all tasks of in given list from given grids.
656 : * See grid_task_list.h for details.
657 : * \author Ole Schuett
658 : ******************************************************************************/
659 34 : void grid_ref_integrate_task_list(
660 : const grid_ref_task_list *ptr, const bool compute_tau, const int natoms,
661 : const int nlevels, const offload_buffer *pab_blocks,
662 : const offload_buffer *grids[nlevels], offload_buffer *hab_blocks,
663 : double forces[natoms][3], double virial[3][3]) {
664 34 : if (ptr == NULL)
665 : return;
666 :
667 34 : grid_ref_task_list_internal *task_list = (grid_ref_task_list_internal *)ptr;
668 34 : assert(task_list->nlevels == nlevels);
669 34 : assert(task_list->natoms == natoms);
670 :
671 : // Zero result arrays.
672 34 : memset(hab_blocks->host_buffer, 0, hab_blocks->size);
673 34 : if (forces != NULL) {
674 6 : memset(forces, 0, natoms * 3 * sizeof(double));
675 : }
676 34 : if (virial != NULL) {
677 0 : memset(virial, 0, 9 * sizeof(double));
678 : }
679 :
680 170 : for (int level = 0; level < task_list->nlevels; level++) {
681 136 : const int idx = level * task_list->nblocks;
682 136 : const int *first_block_task = &task_list->first_level_block_task[idx];
683 136 : const int *last_block_task = &task_list->last_level_block_task[idx];
684 136 : const grid_ref_layout *layout = &task_list->layouts[level];
685 136 : integrate_one_grid_level(
686 : task_list, first_block_task, last_block_task, compute_tau, natoms,
687 136 : layout->npts_global, layout->npts_local, layout->shift_local,
688 136 : layout->border_width, layout->dh, layout->dh_inv, pab_blocks,
689 136 : grids[level], hab_blocks, forces, virial);
690 : }
691 : }
692 :
693 : // EOF
|