Line data Source code
1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 :
8 : #include <assert.h>
9 : #include <math.h>
10 : #include <omp.h>
11 : #include <stdint.h>
12 : #include <stdio.h>
13 : #include <stdlib.h>
14 : #include <string.h>
15 :
16 : #include "../common/grid_common.h"
17 : #include "grid_ref_collocate.h"
18 : #include "grid_ref_integrate.h"
19 : #include "grid_ref_task_list.h"
20 :
21 : /*******************************************************************************
22 : * \brief Comperator passed to qsort to compare two tasks.
23 : * \author Ole Schuett
24 : ******************************************************************************/
25 47203827 : static int compare_tasks(const void *a, const void *b) {
26 47203827 : const grid_ref_task *task_a = a, *task_b = b;
27 47203827 : if (task_a->level != task_b->level) {
28 3564456 : return task_a->level - task_b->level;
29 43639371 : } else if (task_a->block_num != task_b->block_num) {
30 20118487 : return task_a->block_num - task_b->block_num;
31 23520884 : } else if (task_a->iset != task_b->iset) {
32 2808673 : return task_a->iset - task_b->iset;
33 : } else {
34 20712211 : return task_a->jset - task_b->jset;
35 : }
36 : }
37 : /*******************************************************************************
38 : * \brief Allocates a task list for the reference backend.
39 : * See grid_task_list.h for details.
40 : * \author Ole Schuett
41 : ******************************************************************************/
42 13326 : void grid_ref_create_task_list(
43 : const bool orthorhombic, const int ntasks, const int nlevels,
44 : const int natoms, const int nkinds, const int nblocks,
45 : const int block_offsets[nblocks], const double atom_positions[natoms][3],
46 : const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds],
47 : const int level_list[ntasks], const int iatom_list[ntasks],
48 : const int jatom_list[ntasks], const int iset_list[ntasks],
49 : const int jset_list[ntasks], const int ipgf_list[ntasks],
50 : const int jpgf_list[ntasks], const int border_mask_list[ntasks],
51 : const int block_num_list[ntasks], const double radius_list[ntasks],
52 : const double rab_list[ntasks][3], const int npts_global[nlevels][3],
53 : const int npts_local[nlevels][3], const int shift_local[nlevels][3],
54 : const int border_width[nlevels][3], const double dh[nlevels][3][3],
55 : const double dh_inv[nlevels][3][3], grid_ref_task_list **task_list_out) {
56 :
57 13326 : if (*task_list_out != NULL) {
58 : // This is actually an opportunity to reuse some buffers.
59 5356 : grid_ref_free_task_list(*task_list_out);
60 : }
61 :
62 13326 : grid_ref_task_list *task_list = malloc(sizeof(grid_ref_task_list));
63 :
64 13326 : task_list->orthorhombic = orthorhombic;
65 13326 : task_list->ntasks = ntasks;
66 13326 : task_list->nlevels = nlevels;
67 13326 : task_list->natoms = natoms;
68 13326 : task_list->nkinds = nkinds;
69 13326 : task_list->nblocks = nblocks;
70 :
71 13326 : size_t size = nblocks * sizeof(int);
72 13326 : task_list->block_offsets = malloc(size);
73 13326 : memcpy(task_list->block_offsets, block_offsets, size);
74 :
75 13326 : size = 3 * natoms * sizeof(double);
76 13326 : task_list->atom_positions = malloc(size);
77 13326 : memcpy(task_list->atom_positions, atom_positions, size);
78 :
79 13326 : size = natoms * sizeof(int);
80 13326 : task_list->atom_kinds = malloc(size);
81 13326 : memcpy(task_list->atom_kinds, atom_kinds, size);
82 :
83 13326 : size = nkinds * sizeof(grid_basis_set *);
84 13326 : task_list->basis_sets = malloc(size);
85 13326 : memcpy(task_list->basis_sets, basis_sets, size);
86 :
87 13326 : size = ntasks * sizeof(grid_ref_task);
88 13326 : task_list->tasks = malloc(size);
89 7585390 : for (int i = 0; i < ntasks; i++) {
90 7572064 : task_list->tasks[i].level = level_list[i];
91 7572064 : task_list->tasks[i].iatom = iatom_list[i];
92 7572064 : task_list->tasks[i].jatom = jatom_list[i];
93 7572064 : task_list->tasks[i].iset = iset_list[i];
94 7572064 : task_list->tasks[i].jset = jset_list[i];
95 7572064 : task_list->tasks[i].ipgf = ipgf_list[i];
96 7572064 : task_list->tasks[i].jpgf = jpgf_list[i];
97 7572064 : task_list->tasks[i].border_mask = border_mask_list[i];
98 7572064 : task_list->tasks[i].block_num = block_num_list[i];
99 7572064 : task_list->tasks[i].radius = radius_list[i];
100 7572064 : task_list->tasks[i].rab[0] = rab_list[i][0];
101 7572064 : task_list->tasks[i].rab[1] = rab_list[i][1];
102 7572064 : task_list->tasks[i].rab[2] = rab_list[i][2];
103 : }
104 :
105 : // Store grid layouts.
106 13326 : size = nlevels * sizeof(grid_ref_layout);
107 13326 : task_list->layouts = malloc(size);
108 66106 : for (int level = 0; level < nlevels; level++) {
109 211120 : for (int i = 0; i < 3; i++) {
110 158340 : task_list->layouts[level].npts_global[i] = npts_global[level][i];
111 158340 : task_list->layouts[level].npts_local[i] = npts_local[level][i];
112 158340 : task_list->layouts[level].shift_local[i] = shift_local[level][i];
113 158340 : task_list->layouts[level].border_width[i] = border_width[level][i];
114 633360 : for (int j = 0; j < 3; j++) {
115 475020 : task_list->layouts[level].dh[i][j] = dh[level][i][j];
116 475020 : task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
117 : }
118 : }
119 : }
120 :
121 : // Sort tasks by level, block_num, iset, and jset.
122 13326 : qsort(task_list->tasks, ntasks, sizeof(grid_ref_task), &compare_tasks);
123 :
124 : // Find first and last task for each level and block.
125 13326 : size = nlevels * nblocks * sizeof(int);
126 13326 : task_list->first_level_block_task = malloc(size);
127 13326 : task_list->last_level_block_task = malloc(size);
128 1011929 : for (int i = 0; i < nlevels * nblocks; i++) {
129 998603 : task_list->first_level_block_task[i] = 0;
130 998603 : task_list->last_level_block_task[i] = -1; // last < first means no tasks
131 : }
132 7585390 : for (int itask = 0; itask < ntasks; itask++) {
133 7572064 : const int level = task_list->tasks[itask].level - 1;
134 7572064 : const int block_num = task_list->tasks[itask].block_num - 1;
135 7572064 : if (itask == 0 || task_list->tasks[itask - 1].level - 1 != level ||
136 7536014 : task_list->tasks[itask - 1].block_num - 1 != block_num) {
137 505262 : task_list->first_level_block_task[level * nblocks + block_num] = itask;
138 : }
139 7572064 : task_list->last_level_block_task[level * nblocks + block_num] = itask;
140 : }
141 :
142 : // Find largest Cartesian subblock size.
143 13326 : task_list->maxco = 0;
144 37237 : for (int i = 0; i < nkinds; i++) {
145 23911 : task_list->maxco = imax(task_list->maxco, task_list->basis_sets[i]->maxco);
146 : }
147 :
148 : // Initialize thread-local storage.
149 13326 : size = omp_get_max_threads() * sizeof(double *);
150 13326 : task_list->threadlocals = malloc(size);
151 13326 : memset(task_list->threadlocals, 0, size);
152 13326 : size = omp_get_max_threads() * sizeof(size_t);
153 13326 : task_list->threadlocal_sizes = malloc(size);
154 13326 : memset(task_list->threadlocal_sizes, 0, size);
155 :
156 13326 : *task_list_out = task_list;
157 13326 : }
158 :
159 : /*******************************************************************************
160 : * \brief Deallocates given task list, basis_sets have to be freed separately.
161 : * \author Ole Schuett
162 : ******************************************************************************/
163 13326 : void grid_ref_free_task_list(grid_ref_task_list *task_list) {
164 13326 : free(task_list->block_offsets);
165 13326 : free(task_list->atom_positions);
166 13326 : free(task_list->atom_kinds);
167 13326 : free(task_list->basis_sets);
168 13326 : free(task_list->tasks);
169 13326 : free(task_list->layouts);
170 13326 : free(task_list->first_level_block_task);
171 13326 : free(task_list->last_level_block_task);
172 26652 : for (int i = 0; i < omp_get_max_threads(); i++) {
173 13326 : if (task_list->threadlocals[i] != NULL) {
174 8 : free(task_list->threadlocals[i]);
175 : }
176 : }
177 13326 : free(task_list->threadlocals);
178 13326 : free(task_list->threadlocal_sizes);
179 13326 : free(task_list);
180 13326 : }
181 :
182 : /*******************************************************************************
183 : * \brief Prototype for BLAS dgemm.
184 : * \author Ole Schuett
185 : ******************************************************************************/
186 : void dgemm_(const char *transa, const char *transb, const int *m, const int *n,
187 : const int *k, const double *alpha, const double *a, const int *lda,
188 : const double *b, const int *ldb, const double *beta, double *c,
189 : const int *ldc);
190 :
191 : /*******************************************************************************
192 : * \brief Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
193 : * \author Ole Schuett
194 : ******************************************************************************/
195 1568 : static void dgemm(const char transa, const char transb, const int m,
196 : const int n, const int k, const double alpha, const double *a,
197 : const int lda, const double *b, const int ldb,
198 : const double beta, double *c, const int ldc) {
199 1568 : dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
200 : &ldc);
201 1568 : }
202 :
203 : /*******************************************************************************
204 : * \brief Transforms pab from contracted spherical to prim. cartesian basis.
205 : * \author Ole Schuett
206 : ******************************************************************************/
207 460 : static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis,
208 : const int iset, const int jset, const bool transpose,
209 460 : const double *block, double *pab) {
210 :
211 : // Define some more convenient aliases.
212 460 : const int ncoseta = ncoset(ibasis->lmax[iset]);
213 460 : const int ncosetb = ncoset(jbasis->lmax[jset]);
214 460 : const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
215 460 : const int ncob = jbasis->npgf[jset] * ncosetb;
216 :
217 460 : const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
218 460 : const int nsgf_setb = jbasis->nsgf_set[jset];
219 460 : const int nsgfa = ibasis->nsgf; // size of entire spherical basis
220 460 : const int nsgfb = jbasis->nsgf;
221 460 : const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
222 460 : const int sgfb = jbasis->first_sgf[jset] - 1;
223 460 : const int maxcoa = ibasis->maxco;
224 460 : const int maxcob = jbasis->maxco;
225 :
226 460 : double work[nsgf_setb * ncoa];
227 460 : if (transpose) {
228 : // work[nsgf_setb][ncoa] = MATMUL(subblock, ibasis->sphi)
229 340 : dgemm('N', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
230 340 : &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->sphi[sgfa * maxcoa],
231 : maxcoa, 0.0, work, ncoa);
232 : } else {
233 : // work[nsgf_setb][ncoa] = MATMUL(TRANSPOSE(subblock), ibasis->sphi)
234 120 : dgemm('T', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
235 120 : &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->sphi[sgfa * maxcoa],
236 : maxcoa, 0.0, work, ncoa);
237 : }
238 : // pab[ncob][ncoa] = MATMUL(TRANSPOSE(jbasis->sphi), work)
239 460 : dgemm('T', 'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->sphi[sgfb * maxcob],
240 : maxcob, work, ncoa, 0.0, pab, ncoa);
241 460 : }
242 :
243 : /*******************************************************************************
244 : * \brief Collocate a range of tasks which are destined for the same grid level.
245 : * \author Ole Schuett
246 : ******************************************************************************/
247 208 : static void collocate_one_grid_level(
248 : const grid_ref_task_list *task_list, const int *first_block_task,
249 : const int *last_block_task, const enum grid_func func,
250 : const int npts_global[3], const int npts_local[3], const int shift_local[3],
251 : const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
252 : const double *pab_blocks, offload_buffer *grid) {
253 :
254 : // Using default(shared) because with GCC 9 the behavior around const changed:
255 : // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
256 208 : #pragma omp parallel default(shared)
257 : {
258 : const int ithread = omp_get_thread_num();
259 : const int nthreads = omp_get_num_threads();
260 :
261 : // Initialize variables to detect when a new subblock has to be fetched.
262 : int old_offset = -1, old_iset = -1, old_jset = -1;
263 :
264 : // Matrix pab is re-used across tasks.
265 : double pab[task_list->maxco * task_list->maxco];
266 :
267 : // Ensure that grid can fit into thread-local storage, reallocate if needed.
268 : const int npts_local_total = npts_local[0] * npts_local[1] * npts_local[2];
269 : const size_t grid_size = npts_local_total * sizeof(double);
270 : if (task_list->threadlocal_sizes[ithread] < grid_size) {
271 : if (task_list->threadlocals[ithread] != NULL) {
272 : free(task_list->threadlocals[ithread]);
273 : }
274 : task_list->threadlocals[ithread] = malloc(grid_size);
275 : task_list->threadlocal_sizes[ithread] = grid_size;
276 : }
277 :
278 : // Zero thread-local copy of the grid.
279 : double *const my_grid = task_list->threadlocals[ithread];
280 : memset(my_grid, 0, grid_size);
281 :
282 : // Parallelize over blocks to avoid unnecessary calls to load_pab.
283 : const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
284 : #pragma omp for schedule(dynamic, chunk_size)
285 : for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
286 : const int first_task = first_block_task[block_num];
287 : const int last_task = last_block_task[block_num];
288 :
289 : for (int itask = first_task; itask <= last_task; itask++) {
290 : // Define some convenient aliases.
291 : const grid_ref_task *task = &task_list->tasks[itask];
292 : const int iatom = task->iatom - 1;
293 : const int jatom = task->jatom - 1;
294 : const int iset = task->iset - 1;
295 : const int jset = task->jset - 1;
296 : const int ipgf = task->ipgf - 1;
297 : const int jpgf = task->jpgf - 1;
298 : const int ikind = task_list->atom_kinds[iatom] - 1;
299 : const int jkind = task_list->atom_kinds[jatom] - 1;
300 : const grid_basis_set *ibasis = task_list->basis_sets[ikind];
301 : const grid_basis_set *jbasis = task_list->basis_sets[jkind];
302 : const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
303 : const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
304 : const int ncoseta = ncoset(ibasis->lmax[iset]);
305 : const int ncosetb = ncoset(jbasis->lmax[jset]);
306 : const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
307 : const int ncob = jbasis->npgf[jset] * ncosetb;
308 : const int block_num = task->block_num - 1;
309 : const int block_offset = task_list->block_offsets[block_num];
310 : const double *block = &pab_blocks[block_offset];
311 : const bool transpose = (iatom <= jatom);
312 :
313 : // Load subblock from buffer and decontract into Cartesian sublock pab.
314 : // The previous pab can be reused when only ipgf or jpgf has changed.
315 : if (block_offset != old_offset || iset != old_iset ||
316 : jset != old_jset) {
317 : old_offset = block_offset;
318 : old_iset = iset;
319 : old_jset = jset;
320 : load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
321 : }
322 :
323 : grid_ref_collocate_pgf_product(
324 : /*orthorhombic=*/task_list->orthorhombic,
325 : /*border_mask=*/task->border_mask,
326 : /*func=*/func,
327 : /*la_max=*/ibasis->lmax[iset],
328 : /*la_min=*/ibasis->lmin[iset],
329 : /*lb_max=*/jbasis->lmax[jset],
330 : /*lb_min=*/jbasis->lmin[jset],
331 : /*zeta=*/zeta,
332 : /*zetb=*/zetb,
333 : /*rscale=*/(iatom == jatom) ? 1 : 2,
334 : /*dh=*/dh,
335 : /*dh_inv=*/dh_inv,
336 : /*ra=*/&task_list->atom_positions[3 * iatom],
337 : /*rab=*/task->rab,
338 : /*npts_global=*/npts_global,
339 : /*npts_local=*/npts_local,
340 : /*shift_local=*/shift_local,
341 : /*border_width=*/border_width,
342 : /*radius=*/task->radius,
343 : /*o1=*/ipgf * ncoseta,
344 : /*o2=*/jpgf * ncosetb,
345 : /*n1=*/ncoa,
346 : /*n2=*/ncob,
347 : /*pab=*/(const double(*)[ncoa])pab,
348 : /*grid=*/my_grid);
349 :
350 : } // end of task loop
351 : } // end of block loop
352 :
353 : // While there should be an implicit barrier at the end of the block loop, this
354 : // explicit barrier eliminates occasional seg faults with icc compiled binaries.
355 : #pragma omp barrier
356 :
357 : // Merge thread-local grids via an efficient tree reduction.
358 : const int nreduction_cycles = ceil(log(nthreads) / log(2)); // tree depth
359 : for (int icycle = 1; icycle <= nreduction_cycles; icycle++) {
360 : // Threads are divided into groups, whose size doubles with each cycle.
361 : // After a cycle the reduced data is stored at first thread of each group.
362 : const int group_size = 1 << icycle; // 2**icycle
363 : const int igroup = ithread / group_size;
364 : const int dest_thread = igroup * group_size;
365 : const int src_thread = dest_thread + group_size / 2;
366 : // The last group might actually be a bit smaller.
367 : const int actual_group_size = imin(group_size, nthreads - dest_thread);
368 : // Parallelize summation by dividing grid points across group members.
369 : const int rank = modulo(ithread, group_size); // position within the group
370 : const int lb = (npts_local_total * rank) / actual_group_size;
371 : const int ub = (npts_local_total * (rank + 1)) / actual_group_size;
372 : if (src_thread < nthreads) {
373 : for (int i = lb; i < ub; i++) {
374 : task_list->threadlocals[dest_thread][i] +=
375 : task_list->threadlocals[src_thread][i];
376 : }
377 : }
378 : #pragma omp barrier
379 : }
380 :
381 : // Copy final result from first thread into shared grid.
382 : const int lb = (npts_local_total * ithread) / nthreads;
383 : const int ub = (npts_local_total * (ithread + 1)) / nthreads;
384 : for (int i = lb; i < ub; i++) {
385 : grid->host_buffer[i] = task_list->threadlocals[0][i];
386 : }
387 :
388 : } // end of omp parallel region
389 208 : }
390 :
391 : /*******************************************************************************
392 : * \brief Collocate all tasks of in given list onto given grids.
393 : * See grid_task_list.h for details.
394 : * \author Ole Schuett
395 : ******************************************************************************/
396 52 : void grid_ref_collocate_task_list(const grid_ref_task_list *task_list,
397 : const enum grid_func func, const int nlevels,
398 : const offload_buffer *pab_blocks,
399 : offload_buffer *grids[nlevels]) {
400 :
401 52 : assert(task_list->nlevels == nlevels);
402 :
403 260 : for (int level = 0; level < task_list->nlevels; level++) {
404 208 : const int idx = level * task_list->nblocks;
405 208 : const int *first_block_task = &task_list->first_level_block_task[idx];
406 208 : const int *last_block_task = &task_list->last_level_block_task[idx];
407 208 : const grid_ref_layout *layout = &task_list->layouts[level];
408 208 : collocate_one_grid_level(
409 208 : task_list, first_block_task, last_block_task, func, layout->npts_global,
410 208 : layout->npts_local, layout->shift_local, layout->border_width,
411 208 : layout->dh, layout->dh_inv, pab_blocks->host_buffer, grids[level]);
412 : }
413 52 : }
414 :
415 : /*******************************************************************************
416 : * \brief Transforms hab from prim. cartesian to contracted spherical basis.
417 : * \author Ole Schuett
418 : ******************************************************************************/
419 324 : static inline void store_hab(const grid_basis_set *ibasis,
420 : const grid_basis_set *jbasis, const int iset,
421 : const int jset, const bool transpose,
422 324 : const double *hab, double *block) {
423 :
424 : // Define some more convenient aliases.
425 324 : const int ncoseta = ncoset(ibasis->lmax[iset]);
426 324 : const int ncosetb = ncoset(jbasis->lmax[jset]);
427 324 : const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
428 324 : const int ncob = jbasis->npgf[jset] * ncosetb;
429 :
430 324 : const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
431 324 : const int nsgf_setb = jbasis->nsgf_set[jset];
432 324 : const int nsgfa = ibasis->nsgf; // size of entire spherical basis
433 324 : const int nsgfb = jbasis->nsgf;
434 324 : const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
435 324 : const int sgfb = jbasis->first_sgf[jset] - 1;
436 324 : const int maxcoa = ibasis->maxco;
437 324 : const int maxcob = jbasis->maxco;
438 :
439 324 : double work[nsgf_setb * ncoa];
440 :
441 : // work[nsgf_setb][ncoa] = MATMUL(jbasis->sphi, hab)
442 324 : dgemm('N', 'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->sphi[sgfb * maxcob],
443 : maxcob, hab, ncoa, 0.0, work, ncoa);
444 :
445 324 : if (transpose) {
446 : // subblock[nsgf_setb][nsgf_seta] += MATMUL(work, TRANSPOSE(ibasis->sphi))
447 252 : dgemm('N', 'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
448 252 : &ibasis->sphi[sgfa * maxcoa], maxcoa, 1.0,
449 252 : &block[sgfb * nsgfa + sgfa], nsgfa);
450 : } else {
451 : // subblock[nsgf_seta][nsgf_setb] += MATMUL(ibasis->sphi, TRANSPOSE(work))
452 72 : dgemm('N', 'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
453 72 : &ibasis->sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
454 72 : &block[sgfa * nsgfb + sgfb], nsgfb);
455 : }
456 324 : }
457 :
458 : /*******************************************************************************
459 : * \brief Integrate a range of tasks that belong to the same grid level.
460 : * \author Ole Schuett
461 : ******************************************************************************/
462 192 : static void integrate_one_grid_level(
463 : const grid_ref_task_list *task_list, const int *first_block_task,
464 : const int *last_block_task, const bool compute_tau, const int natoms,
465 : const int npts_global[3], const int npts_local[3], const int shift_local[3],
466 : const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
467 : const offload_buffer *pab_blocks, const offload_buffer *grid,
468 : offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
469 :
470 : // Using default(shared) because with GCC 9 the behavior around const changed:
471 : // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
472 192 : #pragma omp parallel default(shared)
473 : {
474 : // Initialize variables to detect when a new subblock has to be fetched.
475 : int old_offset = -1, old_iset = -1, old_jset = -1;
476 : grid_basis_set *old_ibasis = NULL, *old_jbasis = NULL;
477 : bool old_transpose = false;
478 :
479 : // Matrix pab and hab are re-used across tasks.
480 : double pab[task_list->maxco * task_list->maxco];
481 : double hab[task_list->maxco * task_list->maxco];
482 :
483 : // Parallelize over blocks to avoid concurred access to hab_blocks.
484 : const int nthreads = omp_get_num_threads();
485 : const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
486 : #pragma omp for schedule(dynamic, chunk_size)
487 : for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
488 : const int first_task = first_block_task[block_num];
489 : const int last_task = last_block_task[block_num];
490 :
491 : // Accumulate forces per block as it corresponds to a pair of atoms.
492 : const int iatom = task_list->tasks[first_task].iatom - 1;
493 : const int jatom = task_list->tasks[first_task].jatom - 1;
494 : double my_forces[2][3] = {0};
495 : double my_virials[2][3][3] = {0};
496 :
497 : for (int itask = first_task; itask <= last_task; itask++) {
498 : // Define some convenient aliases.
499 : const grid_ref_task *task = &task_list->tasks[itask];
500 : assert(task->block_num - 1 == block_num);
501 : assert(task->iatom - 1 == iatom && task->jatom - 1 == jatom);
502 : const int ikind = task_list->atom_kinds[iatom] - 1;
503 : const int jkind = task_list->atom_kinds[jatom] - 1;
504 : grid_basis_set *ibasis = task_list->basis_sets[ikind];
505 : grid_basis_set *jbasis = task_list->basis_sets[jkind];
506 : const int iset = task->iset - 1;
507 : const int jset = task->jset - 1;
508 : const int ipgf = task->ipgf - 1;
509 : const int jpgf = task->jpgf - 1;
510 : const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
511 : const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
512 : const int ncoseta = ncoset(ibasis->lmax[iset]);
513 : const int ncosetb = ncoset(jbasis->lmax[jset]);
514 : const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
515 : const int ncob = jbasis->npgf[jset] * ncosetb;
516 : const int block_offset = task_list->block_offsets[block_num];
517 : const bool transpose = (iatom <= jatom);
518 : const bool pab_required = (forces != NULL || virial != NULL);
519 :
520 : // Load pab and store hab subblocks when needed.
521 : // Previous hab and pab can be reused when only ipgf or jpgf changed.
522 : if (block_offset != old_offset || iset != old_iset ||
523 : jset != old_jset) {
524 : if (pab_required) {
525 : load_pab(ibasis, jbasis, iset, jset, transpose,
526 : &pab_blocks->host_buffer[block_offset], pab);
527 : }
528 : if (old_offset >= 0) { // skip first iteration
529 : store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
530 : hab, &hab_blocks->host_buffer[old_offset]);
531 : }
532 : memset(hab, 0, ncoa * ncob * sizeof(double));
533 : old_offset = block_offset;
534 : old_iset = iset;
535 : old_jset = jset;
536 : old_ibasis = ibasis;
537 : old_jbasis = jbasis;
538 : old_transpose = transpose;
539 : }
540 :
541 : grid_ref_integrate_pgf_product(
542 : /*orthorhombic=*/task_list->orthorhombic,
543 : /*compute_tau=*/compute_tau,
544 : /*border_mask=*/task->border_mask,
545 : /*la_max=*/ibasis->lmax[iset],
546 : /*la_min=*/ibasis->lmin[iset],
547 : /*lb_max=*/jbasis->lmax[jset],
548 : /*lb_min=*/jbasis->lmin[jset],
549 : /*zeta=*/zeta,
550 : /*zetb=*/zetb,
551 : /*dh=*/dh,
552 : /*dh_inv=*/dh_inv,
553 : /*ra=*/&task_list->atom_positions[3 * iatom],
554 : /*rab=*/task->rab,
555 : /*npts_global=*/npts_global,
556 : /*npts_local=*/npts_local,
557 : /*shift_local=*/shift_local,
558 : /*border_width=*/border_width,
559 : /*radius=*/task->radius,
560 : /*o1=*/ipgf * ncoseta,
561 : /*o2=*/jpgf * ncosetb,
562 : /*n1=*/ncoa,
563 : /*n2=*/ncob,
564 : /*grid=*/grid->host_buffer,
565 : /*hab=*/(double(*)[ncoa])hab,
566 : /*pab=*/(pab_required) ? (const double(*)[ncoa])pab : NULL,
567 : /*forces=*/(forces != NULL) ? my_forces : NULL,
568 : /*virials=*/(virial != NULL) ? my_virials : NULL,
569 : /*hdab=*/NULL,
570 : /*hadb=*/NULL,
571 : /*a_hdab=*/NULL);
572 :
573 : } // end of task loop
574 :
575 : // Merge thread-local forces and virial into shared ones.
576 : // It does not seem worth the trouble to accumulate them thread-locally.
577 : const double scalef = (iatom == jatom) ? 1.0 : 2.0;
578 : if (forces != NULL) {
579 : #pragma omp critical(forces)
580 : for (int i = 0; i < 3; i++) {
581 : forces[iatom][i] += scalef * my_forces[0][i];
582 : forces[jatom][i] += scalef * my_forces[1][i];
583 : }
584 : }
585 : if (virial != NULL) {
586 : #pragma omp critical(virial)
587 : for (int i = 0; i < 3; i++) {
588 : for (int j = 0; j < 3; j++) {
589 : virial[i][j] += scalef * my_virials[0][i][j];
590 : virial[i][j] += scalef * my_virials[1][i][j];
591 : }
592 : }
593 : }
594 :
595 : } // end of block loop
596 :
597 : // store final hab
598 : if (old_offset >= 0) {
599 : store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
600 : &hab_blocks->host_buffer[old_offset]);
601 : }
602 :
603 : } // end of omp parallel region
604 192 : }
605 :
606 : /*******************************************************************************
607 : * \brief Integrate all tasks of in given list from given grids.
608 : * See grid_task_list.h for details.
609 : * \author Ole Schuett
610 : ******************************************************************************/
611 48 : void grid_ref_integrate_task_list(
612 : const grid_ref_task_list *task_list, const bool compute_tau,
613 : const int natoms, const int nlevels, const offload_buffer *pab_blocks,
614 : const offload_buffer *grids[nlevels], offload_buffer *hab_blocks,
615 : double forces[natoms][3], double virial[3][3]) {
616 :
617 48 : assert(task_list->nlevels == nlevels);
618 48 : assert(task_list->natoms == natoms);
619 :
620 : // Zero result arrays.
621 48 : memset(hab_blocks->host_buffer, 0, hab_blocks->size);
622 48 : if (forces != NULL) {
623 8 : memset(forces, 0, natoms * 3 * sizeof(double));
624 : }
625 48 : if (virial != NULL) {
626 0 : memset(virial, 0, 9 * sizeof(double));
627 : }
628 :
629 240 : for (int level = 0; level < task_list->nlevels; level++) {
630 192 : const int idx = level * task_list->nblocks;
631 192 : const int *first_block_task = &task_list->first_level_block_task[idx];
632 192 : const int *last_block_task = &task_list->last_level_block_task[idx];
633 192 : const grid_ref_layout *layout = &task_list->layouts[level];
634 192 : integrate_one_grid_level(
635 : task_list, first_block_task, last_block_task, compute_tau, natoms,
636 192 : layout->npts_global, layout->npts_local, layout->shift_local,
637 192 : layout->border_width, layout->dh, layout->dh_inv, pab_blocks,
638 192 : grids[level], hab_blocks, forces, virial);
639 : }
640 48 : }
641 :
642 : // EOF
|