LCOV - CP2K Regtests (git:71c3ab0) - src/skala_gpw

LCOV - code coverage report

Current view:	top level - src - skala_gpw_functional.F (source / functions)		Coverage	Total	Hit
Test:	CP2K Regtests (git:71c3ab0)	Lines:	76.2 %	1013	772
Test Date:	2026-07-25 06:35:44	Functions:	88.9 %	27	24

            Line data    Source code

       1              : !--------------------------------------------------------------------------------------------------!
       2              : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3              : !   Copyright 2000-2026 CP2K developers group <https://cp2k.org>                                   !
       4              : !                                                                                                  !
       5              : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6              : !--------------------------------------------------------------------------------------------------!
       7              : 
       8              : ! **************************************************************************************************
       9              : !> \brief Experimental CP2K-native GPW real-space-grid path for SKALA TorchScript models.
      10              : ! **************************************************************************************************
      11              : MODULE skala_gpw_functional
      12              :    USE cell_types,                      ONLY: cell_type,&
      13              :                                               pbc
      14              :    USE cp_array_utils,                  ONLY: cp_3d_r_cp_type
      15              :    USE cp_log_handling,                 ONLY: cp_logger_get_default_io_unit
      16              :    USE input_section_types,             ONLY: section_get_rval,&
      17              :                                               section_vals_get_subs_vals,&
      18              :                                               section_vals_get_subs_vals2,&
      19              :                                               section_vals_type,&
      20              :                                               section_vals_val_get
      21              :    USE kinds,                           ONLY: default_path_length,&
      22              :                                               dp,&
      23              :                                               int_8
      24              :    USE message_passing,                 ONLY: mp_comm_type
      25              :    USE offload_api,                     ONLY: offload_set_chosen_device
      26              :    USE particle_types,                  ONLY: particle_type
      27              :    USE pw_grid_types,                   ONLY: pw_grid_type
      28              :    USE pw_methods,                      ONLY: pw_scale,&
      29              :                                               pw_zero
      30              :    USE pw_pool_types,                   ONLY: pw_pool_type
      31              :    USE pw_types,                        ONLY: pw_c1d_gs_type,&
      32              :                                               pw_r3d_rs_type
      33              :    USE qs_grid_atom,                    ONLY: grid_atom_type
      34              :    USE skala_gpw_features,              ONLY: skala_gpw_atom_partition_hard,&
      35              :                                               skala_gpw_atom_partition_smooth,&
      36              :                                               skala_gpw_atom_subchunk_count,&
      37              :                                               skala_gpw_feature_build,&
      38              :                                               skala_gpw_feature_build_atom_subchunk,&
      39              :                                               skala_gpw_feature_release,&
      40              :                                               skala_gpw_feature_type,&
      41              :                                               skala_gpw_smooth_partition_derivatives
      42              :    USE skala_torch_api,                 ONLY: skala_torch_model_get_exc,&
      43              :                                               skala_torch_model_get_exc_density,&
      44              :                                               skala_torch_model_load,&
      45              :                                               skala_torch_model_release,&
      46              :                                               skala_torch_model_type
      47              :    USE string_utilities,                ONLY: uppercase
      48              :    USE torch_api,                       ONLY: &
      49              :         torch_cuda_device_count, torch_cuda_is_available, torch_dict_create, torch_dict_insert, &
      50              :         torch_dict_release, torch_dict_type, torch_tensor_backward_scalar, torch_tensor_data_ptr, &
      51              :         torch_tensor_from_array, torch_tensor_grad, torch_tensor_release, &
      52              :         torch_tensor_to_device_leaf, torch_tensor_type, torch_use_cuda
      53              :    USE xc_rho_cflags_types,             ONLY: xc_rho_cflags_type
      54              :    USE xc_rho_set_types,                ONLY: xc_rho_set_create,&
      55              :                                               xc_rho_set_get,&
      56              :                                               xc_rho_set_release,&
      57              :                                               xc_rho_set_type,&
      58              :                                               xc_rho_set_update
      59              :    USE xc_util,                         ONLY: xc_pw_divergence,&
      60              :                                               xc_requires_tmp_g
      61              : #include "./base/base_uses.f90"
      62              : 
      63              :    IMPLICIT NONE
      64              : 
      65              :    PRIVATE
      66              : 
       67              :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'skala_gpw_functional'
       68              :    INTEGER, PARAMETER, PRIVATE          :: atom_chunk_auto_max_rows = 400000, &
       69              :                                            atom_chunk_auto_min_rows = 100000, &
       70              :                                            atom_chunk_auto_row_quantum = 100000, &
       71              :                                            ncollapsed_grad_per_point = 5, ngrad_per_point = 10
       72              :    INTEGER, PARAMETER, PUBLIC           :: skala_gapw_density_partition_hard_minus_soft = 1, &
       73              :                                            skala_gapw_density_partition_hard_only = 2, &
       74              :                                            skala_gapw_density_partition_soft_only = 3, &
       75              :                                            skala_gapw_density_partition_none = 4
       76              :    ! Public procedures of this module; the default accessibility declared above is PRIVATE.
       77              :    PUBLIC :: ensure_native_skala_grid_scope, get_gauxc_section, skala_gapw_atom_vxc_of_r, &
       78              :              native_skala_gapw_density_partition, skala_gpw_eval, skala_gpw_exc_density, &
       79              :              xc_section_uses_native_skala_grid, xc_section_uses_gauxc_model
       80              :    ! Module variables with the SAVE attribute. NOTE(review): -3 and -1 look like unset markers - confirm.
       81              :    TYPE(skala_torch_model_type), SAVE                  :: cached_model
       82              :    CHARACTER(len=default_path_length), SAVE            :: cached_model_path = ""
       83              :    LOGICAL, SAVE                                       :: cached_model_loaded = .FALSE.
       84              :    INTEGER, SAVE                                       :: cached_model_cuda_device = -3
       85              :    INTEGER, SAVE                                       :: logged_cuda_device = -3, &
       86              :                                                           logged_cuda_device_count = -1, &
       87              :                                                           logged_cuda_nproc = -1, &
       88              :                                                           logged_cuda_request = -3
      89              : 
      90              : CONTAINS
      91              : 
       92              : ! **************************************************************************************************
       93              : !> \brief Return true if the GAUXC subsection requests the CP2K-native GPW grid path.
       94              : !> \param xc_section XC input section; its GAUXC subsection is looked up via get_gauxc_section
       95              : !> \return value of the GAUXC keyword NATIVE_GRID; .FALSE. when no GAUXC subsection is associated
       96              : ! **************************************************************************************************
       97       157281 :    FUNCTION xc_section_uses_native_skala_grid(xc_section) RESULT(uses_native_grid)
       98              :       TYPE(section_vals_type), INTENT(IN), POINTER       :: xc_section
       99              :       LOGICAL                                            :: uses_native_grid
      100              : 
      101              :       TYPE(section_vals_type), POINTER                   :: gauxc_section
      102              :       ! Start from .FALSE. so that an unassociated GAUXC subsection means no native grid.
      103       157281 :       uses_native_grid = .FALSE.
      104       157281 :       gauxc_section => get_gauxc_section(xc_section)
      105       157281 :       IF (ASSOCIATED(gauxc_section)) THEN
      106          998 :          CALL section_vals_val_get(gauxc_section, "NATIVE_GRID", l_val=uses_native_grid)
      107              :       END IF
      108              : 
      109       157281 :    END FUNCTION xc_section_uses_native_skala_grid
     110              : 
      111              : ! **************************************************************************************************
      112              : !> \brief Return true if the GAUXC subsection requests a model evaluation.
      113              : !> \param xc_section XC input section; its GAUXC subsection is looked up via get_gauxc_section
      114              : !> \return .TRUE. if MODEL is non-blank, not NONE and differs from FUNCTIONAL (case-insensitive)
      115              : ! **************************************************************************************************
      116        30902 :    FUNCTION xc_section_uses_gauxc_model(xc_section) RESULT(uses_gauxc_model)
      117              :       TYPE(section_vals_type), INTENT(IN), POINTER       :: xc_section
      118              :       LOGICAL                                            :: uses_gauxc_model
      119              : 
      120              :       CHARACTER(len=default_path_length)                 :: model_key, model_name, xc_key, xc_name
      121              :       TYPE(section_vals_type), POINTER                   :: gauxc_section
      122              :       ! Default is .FALSE.; MODEL and FUNCTIONAL are only read when a GAUXC subsection is associated.
      123        30902 :       uses_gauxc_model = .FALSE.
      124        30902 :       gauxc_section => get_gauxc_section(xc_section)
      125        30902 :       IF (ASSOCIATED(gauxc_section)) THEN
      126          144 :          CALL section_vals_val_get(gauxc_section, "MODEL", c_val=model_name)
      127          144 :          CALL section_vals_val_get(gauxc_section, "FUNCTIONAL", c_val=xc_name)
      128          144 :          model_key = ADJUSTL(model_name)
      129          144 :          xc_key = ADJUSTL(xc_name)
      130          144 :          CALL uppercase(model_key)
      131          144 :          CALL uppercase(xc_key)
      132              :          uses_gauxc_model = (TRIM(model_key) /= "" .AND. TRIM(model_key) /= "NONE" .AND. &
      133          144 :                              TRIM(model_key) /= TRIM(xc_key))
      134              :       END IF
      135              : 
      136        30902 :    END FUNCTION xc_section_uses_gauxc_model
     137              : 
      138              : ! **************************************************************************************************
      139              : !> \brief Return the hard/soft GAPW one-center density partition for native SKALA.
      140              : !> \param xc_section XC input section; its GAUXC subsection is looked up via get_gauxc_section
      141              : !> \return a skala_gapw_density_partition_* value; hard_minus_soft if no GAUXC subsection is found
      142              : ! **************************************************************************************************
      143          144 :    FUNCTION native_skala_gapw_density_partition(xc_section) RESULT(partition)
      144              :       TYPE(section_vals_type), INTENT(IN), POINTER       :: xc_section
      145              :       INTEGER                                            :: partition
      146              : 
      147              :       TYPE(section_vals_type), POINTER                   :: gauxc_section
      148              :       ! Fallback used when no GAUXC subsection is associated: hard_minus_soft.
      149          144 :       partition = skala_gapw_density_partition_hard_minus_soft
      150          144 :       gauxc_section => get_gauxc_section(xc_section)
      151          144 :       IF (ASSOCIATED(gauxc_section)) THEN
      152              :          CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_GAPW_DENSITY_PARTITION", &
      153          144 :                                    i_val=partition)
      154              :       END IF
      155              :       ! Accept only the four partition constants declared in this module; any other value aborts.
      156              :       SELECT CASE (partition)
      157              :       CASE (skala_gapw_density_partition_hard_minus_soft, &
      158              :             skala_gapw_density_partition_hard_only, &
      159              :             skala_gapw_density_partition_soft_only, &
      160              :             skala_gapw_density_partition_none)
      161            0 :          CONTINUE
      162              :       CASE DEFAULT
      163              :          CALL cp_abort(__LOCATION__, &
      164          144 :                        "Unknown GAUXC%NATIVE_GRID_GAPW_DENSITY_PARTITION value.")
      165              :       END SELECT
      166              : 
      167          144 :    END FUNCTION native_skala_gapw_density_partition
     168              : 
      169              : ! **************************************************************************************************
      170              : !> \brief Enforce the currently implemented native SKALA GPW input scope.
      171              : !> \param xc_section XC input section; must hold XC_FUNCTIONAL with GAUXC as its only functional
      172              : ! **************************************************************************************************
      173          580 :    SUBROUTINE ensure_native_skala_grid_scope(xc_section)
      174              :       TYPE(section_vals_type), INTENT(IN), POINTER       :: xc_section
      175              : 
      176              :       CHARACTER(len=default_path_length)                 :: model_key, model_name
      177              :       INTEGER                                            :: ifun, nfun
      178              :       LOGICAL                                            :: native_grid
      179              :       TYPE(section_vals_type), POINTER                   :: functionals, gauxc_section, xc_fun
      180              :       ! Both the XC section and its XC_FUNCTIONAL subsection must be associated.
      181          290 :       NULLIFY (gauxc_section)
      182          290 :       IF (.NOT. ASSOCIATED(xc_section)) THEN
      183            0 :          CPABORT("Native SKALA GPW requires an XC section")
      184              :       END IF
      185              : 
      186          290 :       functionals => section_vals_get_subs_vals(xc_section, "XC_FUNCTIONAL")
      187          290 :       IF (.NOT. ASSOCIATED(functionals)) THEN
      188            0 :          CPABORT("Native SKALA GPW requires an XC_FUNCTIONAL section")
      189              :       END IF
      190              :       ! Count the XC_FUNCTIONAL subsections by index and remember the one named GAUXC.
      191          290 :       nfun = 0
      192          290 :       ifun = 0
      193              :       DO
      194          580 :          ifun = ifun + 1
      195          580 :          xc_fun => section_vals_get_subs_vals2(functionals, i_section=ifun)
      196          580 :          IF (.NOT. ASSOCIATED(xc_fun)) EXIT
      197          290 :          nfun = nfun + 1
      198          580 :          IF (xc_fun%section%name == "GAUXC") gauxc_section => xc_fun
      199              :       END DO
      200              :       ! GAUXC has to be present and has to be the single counted subsection.
      201          290 :       IF (.NOT. ASSOCIATED(gauxc_section)) THEN
      202            0 :          CPABORT("Native SKALA GPW requires an XC_FUNCTIONAL%GAUXC section")
      203              :       END IF
      204          290 :       IF (nfun /= 1) THEN
      205            0 :          CPABORT("Native SKALA GPW requires GAUXC to be the only XC functional")
      206              :       END IF
      207              :       ! The MODEL check below is skipped when NATIVE_GRID is switched off.
      208          290 :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID", l_val=native_grid)
      209          290 :       IF (.NOT. native_grid) RETURN
      210              :       ! A blank or NONE model name (compared case-insensitively) is rejected.
      211          290 :       CALL section_vals_val_get(gauxc_section, "MODEL", c_val=model_name)
      212          290 :       model_key = ADJUSTL(model_name)
      213          290 :       CALL uppercase(model_key)
      214          290 :       IF (TRIM(model_key) == "NONE" .OR. TRIM(model_key) == "") THEN
      215            0 :          CPABORT("Native SKALA GPW requires GAUXC%MODEL SKALA or a TorchScript model path")
      216              :       END IF
      217              : 
      218              :    END SUBROUTINE ensure_native_skala_grid_scope
     219              : 
     220              : ! **************************************************************************************************
     221              : !> \brief Evaluate SKALA energy and first derivatives on a CP2K GPW grid.
     222              : !> \param vxc_rho ...
     223              : !> \param vxc_tau ...
     224              : !> \param exc ...
     225              : !> \param rho_r ...
     226              : !> \param rho_g ...
     227              : !> \param tau ...
     228              : !> \param xc_section ...
     229              : !> \param weights ...
     230              : !> \param pw_pool ...
     231              : !> \param particle_set ...
     232              : !> \param cell ...
     233              : !> \param compute_virial ...
     234              : !> \param virial_xc ...
     235              : !> \param just_energy ...
     236              : !> \param atom_force ...
     237              : ! **************************************************************************************************
     238          290 :    SUBROUTINE skala_gpw_eval(vxc_rho, vxc_tau, exc, rho_r, rho_g, tau, xc_section, &
     239              :                              weights, pw_pool, particle_set, cell, compute_virial, virial_xc, &
     240          290 :                              just_energy, atom_force)
     241              :       TYPE(pw_r3d_rs_type), DIMENSION(:), POINTER        :: vxc_rho, vxc_tau
     242              :       REAL(KIND=dp), INTENT(OUT)                         :: exc
     243              :       TYPE(pw_r3d_rs_type), DIMENSION(:), POINTER        :: rho_r
     244              :       TYPE(pw_c1d_gs_type), DIMENSION(:), POINTER        :: rho_g
     245              :       TYPE(pw_r3d_rs_type), DIMENSION(:), POINTER        :: tau
     246              :       TYPE(section_vals_type), POINTER                   :: xc_section
     247              :       TYPE(pw_r3d_rs_type), POINTER                      :: weights
     248              :       TYPE(pw_pool_type), POINTER                        :: pw_pool
     249              :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     250              :       TYPE(cell_type), POINTER                           :: cell
     251              :       LOGICAL, INTENT(IN)                                :: compute_virial
     252              :       REAL(KIND=dp), DIMENSION(3, 3), INTENT(OUT)        :: virial_xc
     253              :       LOGICAL, INTENT(IN), OPTIONAL                      :: just_energy
     254              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(OUT), &
     255              :          OPTIONAL                                        :: atom_force
     256              : 
     257              :       CHARACTER(len=default_path_length)                 :: model_path
     258              :       INTEGER :: iw, native_grid_atom_chunk_max_rows, native_grid_atom_partition, &
     259              :          native_grid_atom_subchunks, native_grid_cuda_device, nspins, phase_handle, &
     260              :          selected_cuda_device, xc_deriv_method_id, xc_rho_smooth_id
     261              :       LOGICAL :: has_atom_chunk_work, have_atom_coord_grad, lsd, my_just_energy, &
     262              :          native_grid_atom_chunk_routing, native_grid_atom_chunks, native_grid_diagnostics, &
     263              :          native_grid_use_cuda, needs_atom_force, use_atom_subchunks
     264          290 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: density_grad, kin_grad
     265          290 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: grad_grad
     266              :       REAL(KIND=dp), DIMENSION(3, 3)                     :: virial_before
     267              :       TYPE(section_vals_type), POINTER                   :: gauxc_section
     268          290 :       TYPE(skala_gpw_feature_type)                       :: features
     269              :       TYPE(torch_tensor_type)                            :: atom_coord_grad_t, &
     270              :                                                             atomic_grid_weight_grad_t, exc_tensor, &
     271              :                                                             grid_coord_grad_t, grid_weight_grad_t
     272              :       TYPE(xc_rho_cflags_type)                           :: needs
     273              :       TYPE(xc_rho_set_type)                              :: rho_set
     274              : 
     275          290 :       virial_xc = 0.0_dp
     276          290 :       exc = 0.0_dp
     277          290 :       my_just_energy = .FALSE.
     278          290 :       IF (PRESENT(just_energy)) my_just_energy = just_energy
     279          290 :       needs_atom_force = PRESENT(atom_force)
     280          770 :       IF (needs_atom_force) atom_force = 0.0_dp
     281          290 :       have_atom_coord_grad = .FALSE.
     282              : 
     283          290 :       IF (compute_virial .AND. my_just_energy) THEN
     284              :          CALL cp_abort(__LOCATION__, &
     285            0 :                        "Native SKALA GPW stress/virial requires feature gradients.")
     286              :       END IF
     287          290 :       IF (.NOT. ASSOCIATED(rho_g)) THEN
     288              :          CALL cp_abort(__LOCATION__, &
     289            0 :                        "Native SKALA GPW requires the reciprocal-space density to form density gradients.")
     290              :       END IF
     291          290 :       IF (.NOT. ASSOCIATED(tau)) THEN
     292              :          CALL cp_abort(__LOCATION__, &
     293            0 :                        "Native SKALA GPW requires the kinetic-energy density.")
     294              :       END IF
     295              : 
     296          290 :       nspins = SIZE(rho_r)
     297          290 :       lsd = (nspins /= 1)
     298          290 :       CALL get_skala_model_path(xc_section, model_path)
     299          290 :       gauxc_section => get_gauxc_section(xc_section)
     300          290 :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_USE_CUDA", l_val=native_grid_use_cuda)
     301              :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_CUDA_DEVICE", &
     302          290 :                                 i_val=native_grid_cuda_device)
     303              :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_ATOM_CHUNKS", &
     304          290 :                                 l_val=native_grid_atom_chunks)
     305              :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_ATOM_CHUNK_ROUTING", &
     306          290 :                                 l_val=native_grid_atom_chunk_routing)
     307              :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_ATOM_CHUNK_MAX_ROWS", &
     308          290 :                                 i_val=native_grid_atom_chunk_max_rows)
     309              :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_ATOM_PARTITION", &
     310          290 :                                 i_val=native_grid_atom_partition)
     311           26 :       SELECT CASE (native_grid_atom_partition)
     312              :       CASE (1)
     313           26 :          native_grid_atom_partition = skala_gpw_atom_partition_hard
     314              :       CASE (2)
     315          264 :          native_grid_atom_partition = skala_gpw_atom_partition_smooth
     316              :       CASE DEFAULT
     317              :          CALL cp_abort(__LOCATION__, &
     318          290 :                        "Unknown GAUXC%NATIVE_GRID_ATOM_PARTITION value.")
     319              :       END SELECT
     320          290 :       native_grid_atom_chunk_routing = native_grid_atom_chunk_routing .OR. native_grid_atom_chunks
     321          290 :       native_grid_atom_chunks = native_grid_atom_chunks .OR. native_grid_atom_chunk_routing
     322          290 :       IF (native_grid_atom_chunk_max_rows < -1) THEN
     323              :          CALL cp_abort(__LOCATION__, &
     324            0 :                        "GAUXC%NATIVE_GRID_ATOM_CHUNK_MAX_ROWS must be -1, zero, or positive.")
     325              :       END IF
     326          290 :       IF (needs_atom_force .OR. compute_virial) THEN
     327           60 :          IF (native_grid_atom_partition == skala_gpw_atom_partition_hard) THEN
     328            0 :             native_grid_atom_partition = skala_gpw_atom_partition_smooth
     329              :          END IF
     330           60 :          native_grid_atom_chunk_routing = .FALSE.
     331           60 :          native_grid_atom_chunks = .FALSE.
     332              :       END IF
     333              :       ! The portable SKALA export used by the regtests builds ragged-index tensors on CPU.
     334          290 :       CALL torch_use_cuda(native_grid_use_cuda)
     335              :       selected_cuda_device = configure_native_grid_cuda( &
     336          290 :                              native_grid_use_cuda, native_grid_cuda_device, rho_r(1)%pw_grid%para%group)
     337          290 :       CALL ensure_model_loaded(model_path, selected_cuda_device)
     338              : 
     339          290 :       IF (lsd) THEN
     340           48 :          needs%rho_spin = .TRUE.
     341           48 :          needs%drho_spin = .TRUE.
     342           48 :          needs%tau_spin = .TRUE.
     343              :       ELSE
     344          242 :          needs%rho = .TRUE.
     345          242 :          needs%drho = .TRUE.
     346          242 :          needs%tau = .TRUE.
     347              :       END IF
     348              : 
     349          290 :       CALL section_vals_val_get(xc_section, "XC_GRID%XC_DERIV", i_val=xc_deriv_method_id)
     350          290 :       CALL section_vals_val_get(xc_section, "XC_GRID%XC_SMOOTH_RHO", i_val=xc_rho_smooth_id)
     351              : 
     352              :       CALL xc_rho_set_create(rho_set, &
     353              :                              rho_r(1)%pw_grid%bounds_local, &
     354              :                              rho_cutoff=section_get_rval(xc_section, "density_cutoff"), &
     355              :                              drho_cutoff=section_get_rval(xc_section, "gradient_cutoff"), &
     356          290 :                              tau_cutoff=section_get_rval(xc_section, "tau_cutoff"))
     357              :       CALL xc_rho_set_update(rho_set, rho_r, rho_g, tau, needs, &
     358          290 :                              xc_deriv_method_id, xc_rho_smooth_id, pw_pool)
     359              : 
     360              :       CALL skala_gpw_feature_build(features, rho_set, rho_r, particle_set, cell, &
     361              :                                    requires_grad=(.NOT. my_just_energy), weights=weights, &
     362              :                                    requires_coordinate_grad=(needs_atom_force .OR. compute_virial), &
     363              :                                    requires_stress_grad=compute_virial, &
     364              :                                    use_atom_chunks=native_grid_atom_chunks, &
     365              :                                    route_atom_chunks=native_grid_atom_chunk_routing, &
     366          520 :                                    atom_partition=native_grid_atom_partition)
     367          290 :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_DIAGNOSTICS", l_val=native_grid_diagnostics)
     368          290 :       IF (native_grid_diagnostics) THEN
     369           24 :          CALL print_native_grid_diagnostics(features, rho_r(1)%pw_grid%para%group%mepos == 0)
     370              :       END IF
     371              : 
     372          290 :       IF (features%uses_atom_chunks .AND. native_grid_atom_chunk_max_rows == -1) THEN
     373            0 :          IF (native_grid_use_cuda) THEN
     374              :             native_grid_atom_chunk_max_rows = auto_atom_chunk_max_rows(features, &
     375            0 :                                                                        rho_r(1)%pw_grid%para%group)
     376              :          ELSE
     377            0 :             native_grid_atom_chunk_max_rows = 0
     378              :          END IF
     379              :       END IF
     380          290 :       IF (native_grid_diagnostics .AND. features%uses_atom_chunks .AND. &
     381              :           rho_r(1)%pw_grid%para%group%mepos == 0) THEN
     382            1 :          iw = cp_logger_get_default_io_unit()
     383            1 :          IF (iw > 0) THEN
     384              :             WRITE (UNIT=iw, FMT="(T2,A,1X,I0)") &
     385            1 :                "SKALA_GPW| Native grid atom chunk max rows", native_grid_atom_chunk_max_rows
     386              :          END IF
     387              :       END IF
     388          290 :       native_grid_atom_subchunks = 1
     389          290 :       IF (features%uses_atom_chunks .AND. native_grid_atom_chunk_max_rows > 0) THEN
     390            8 :          native_grid_atom_subchunks = skala_gpw_atom_subchunk_count(native_grid_atom_chunk_max_rows)
     391            8 :          CALL rho_r(1)%pw_grid%para%group%max(native_grid_atom_subchunks)
     392              :       END IF
     393          290 :       use_atom_subchunks = features%uses_atom_chunks .AND. native_grid_atom_subchunks > 1
     394          290 :       has_atom_chunk_work = .NOT. features%uses_atom_chunks .OR. features%chunk_feature_count > 0
     395          290 :       exc = 0.0_dp
     396          290 :       IF (use_atom_subchunks) THEN
     397              :          CALL evaluate_atom_subchunks(features, rho_r(1)%pw_grid%para%group, &
     398              :                                       native_grid_atom_chunk_max_rows, &
     399              :                                       compute_grads=(.NOT. my_just_energy), exc=exc, &
     400              :                                       density_grad=density_grad, grad_grad=grad_grad, &
     401            2 :                                       kin_grad=kin_grad, collapse_spin_grads=(nspins == 1))
     402          288 :       ELSE IF (has_atom_chunk_work) THEN
     403              :          CALL skala_torch_model_get_exc(cached_model, features%inputs, &
     404          288 :                                         features%grid_weights_t, exc_tensor, exc)
     405              :       END IF
     406          290 :       IF (features%uses_atom_chunks) CALL rho_r(1)%pw_grid%para%group%sum(exc)
     407              : 
     408          290 :       IF (.NOT. my_just_energy) THEN
     409          290 :          IF (.NOT. use_atom_subchunks) THEN
     410          288 :             IF (has_atom_chunk_work) THEN
     411          288 :                CALL timeset("skala_gpw_backward", phase_handle)
     412          288 :                CALL torch_tensor_backward_scalar(exc_tensor)
     413          288 :                CALL timestop(phase_handle)
     414              : 
     415          288 :                IF (compute_virial) THEN
     416           50 :                   IF (native_grid_diagnostics) virial_before = virial_xc
     417              :                   CALL build_weight_virial(virial_xc, features, exc, grid_weight_grad_t, &
     418              :                                            atomic_grid_weight_grad_t, &
     419              :                                            rho_r(1)%pw_grid%para%group%mepos == 0, &
     420           50 :                                            native_grid_diagnostics)
     421           50 :                   IF (native_grid_diagnostics) THEN
     422              :                      CALL print_virial_delta("weight-residual", virial_xc - virial_before, &
     423            0 :                                              rho_r(1)%pw_grid%para%group%mepos == 0)
     424              :                   END IF
     425              :                END IF
     426              :             END IF
     427              : 
     428          288 :             CALL timeset("skala_gpw_grad_fetch", phase_handle)
     429          288 :             IF (features%uses_atom_chunks) THEN
     430              :                CALL fetch_and_gather_atom_chunk_grads(features, rho_r(1)%pw_grid%para%group, &
     431            6 :                                                       density_grad, grad_grad, kin_grad)
     432              :             ELSE
     433          282 :                CALL fetch_local_feature_grads(features, density_grad, grad_grad, kin_grad)
     434              :             END IF
     435          288 :             CALL timestop(phase_handle)
     436              :          END IF
     437          290 :          IF (needs_atom_force) THEN
     438              :             CALL add_explicit_coordinate_force(atom_force, features, atom_coord_grad_t, &
     439           60 :                                                rho_r(1)%pw_grid%para%group%mepos == 0)
     440           60 :             IF (features%atom_partition == skala_gpw_atom_partition_smooth) THEN
     441              :                CALL add_smooth_partition_force(atom_force, features, particle_set, cell, rho_r, &
     442           60 :                                                grid_weight_grad_t, atomic_grid_weight_grad_t)
     443              :             END IF
     444              :             have_atom_coord_grad = .TRUE.
     445              :          END IF
     446              : 
     447          290 :          CALL timeset("skala_gpw_vxc_unpack", phase_handle)
     448          290 :          IF (compute_virial) THEN
     449           50 :             IF (native_grid_diagnostics) virial_before = virial_xc
     450           50 :             CALL build_virial_from_feature_grads(virial_xc, rho_set, rho_r, grad_grad)
     451           50 :             IF (native_grid_diagnostics) THEN
     452              :                CALL print_virial_delta("feature-gradient", virial_xc - virial_before, &
     453            0 :                                        rho_r(1)%pw_grid%para%group%mepos == 0)
     454            0 :                virial_before = virial_xc
     455              :             END IF
     456           50 :             IF (.NOT. have_atom_coord_grad) THEN
     457            0 :                CALL torch_tensor_grad(features%coarse_0_atomic_coords_t, atom_coord_grad_t)
     458            0 :                have_atom_coord_grad = .TRUE.
     459              :             END IF
     460              :             CALL build_static_coordinate_virial(virial_xc, features, atom_coord_grad_t, &
     461              :                                                 grid_coord_grad_t, &
     462              :                                                 rho_r(1)%pw_grid%para%group%mepos == 0, &
     463           50 :                                                 native_grid_diagnostics)
     464           50 :             IF (native_grid_diagnostics) THEN
     465              :                CALL print_virial_delta("static-coordinates", virial_xc - virial_before, &
     466            0 :                                        rho_r(1)%pw_grid%para%group%mepos == 0)
     467            0 :                virial_before = virial_xc
     468              :             END IF
     469           50 :             IF (features%atom_partition == skala_gpw_atom_partition_smooth) THEN
     470              :                CALL build_smooth_partition_virial(virial_xc, features, particle_set, cell, rho_r, &
     471           50 :                                                   grid_weight_grad_t, atomic_grid_weight_grad_t)
     472           50 :                IF (native_grid_diagnostics) THEN
     473              :                   CALL print_virial_delta("smooth-partition", virial_xc - virial_before, &
     474            0 :                                           rho_r(1)%pw_grid%para%group%mepos == 0)
     475              :                   virial_before = virial_xc
     476              :                END IF
     477              :             END IF
     478              :          END IF
     479              :          CALL build_vxc_from_feature_grads(vxc_rho, vxc_tau, rho_r, pw_pool, &
     480              :                                            density_grad, grad_grad, kin_grad, &
     481          290 :                                            xc_deriv_method_id)
     482          290 :          CALL timestop(phase_handle)
     483              : 
     484          290 :          CALL timeset("skala_gpw_grad_release", phase_handle)
     485          290 :          DEALLOCATE (density_grad, grad_grad, kin_grad)
     486          290 :          IF (have_atom_coord_grad) CALL torch_tensor_release(atom_coord_grad_t)
     487          290 :          CALL timestop(phase_handle)
     488              :       END IF
     489              : 
     490          290 :       CALL timeset("skala_gpw_cleanup", phase_handle)
     491          290 :       IF (.NOT. use_atom_subchunks .AND. has_atom_chunk_work) CALL torch_tensor_release(exc_tensor)
     492          290 :       CALL skala_gpw_feature_release(features)
     493          290 :       CALL xc_rho_set_release(rho_set, pw_pool=pw_pool)
     494          290 :       CALL torch_use_cuda(.TRUE.)
     495          290 :       CALL timestop(phase_handle)
     496              : 
     497         5800 :    END SUBROUTINE skala_gpw_eval
     498              : 
     499              : ! **************************************************************************************************
     500              : !> \brief Evaluate the native SKALA XC energy density on the CP2K PW grid (no gradients requested).
     501              : !> \param exc_r output grid; zeroed first, then exc_r%array is filled point by point
     502              : !> \param rho_r density grids, must be associated; SIZE(rho_r) is taken as the number of spins
     503              : !> \param rho_g must be associated; forwarded to xc_rho_set_update
     504              : !> \param tau must be associated; forwarded to xc_rho_set_update
     505              : !> \param xc_section input section providing model path, GAUXC subsection, XC_GRID keywords and cutoffs
     506              : !> \param weights forwarded to skala_gpw_feature_build as its weights argument
     507              : !> \param pw_pool forwarded to xc_rho_set_update and xc_rho_set_release
     508              : !> \param particle_set forwarded to skala_gpw_feature_build
     509              : !> \param cell forwarded to skala_gpw_feature_build
     510              : ! **************************************************************************************************
     511            0 :    SUBROUTINE skala_gpw_exc_density(exc_r, rho_r, rho_g, tau, xc_section, weights, pw_pool, &
     512              :                                     particle_set, cell)
     513              :       TYPE(pw_r3d_rs_type), INTENT(INOUT)                :: exc_r
     514              :       TYPE(pw_r3d_rs_type), DIMENSION(:), POINTER        :: rho_r
     515              :       TYPE(pw_c1d_gs_type), DIMENSION(:), POINTER        :: rho_g
     516              :       TYPE(pw_r3d_rs_type), DIMENSION(:), POINTER        :: tau
     517              :       TYPE(section_vals_type), POINTER                   :: xc_section
     518              :       TYPE(pw_r3d_rs_type), POINTER                      :: weights
     519              :       TYPE(pw_pool_type), POINTER                        :: pw_pool
     520              :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     521              :       TYPE(cell_type), POINTER                           :: cell
     522              : 
     523              :       CHARACTER(len=default_path_length)                 :: model_path
     524              :       INTEGER :: feature_pos, i, j, k, local_row, native_grid_atom_partition, &
     525              :          native_grid_cuda_device, nspins, row, selected_cuda_device, xc_deriv_method_id, &
     526              :          xc_rho_smooth_id
     527              :       LOGICAL                                            :: lsd, native_grid_atom_chunk_routing, &
     528              :                                                             native_grid_atom_chunks, &
     529              :                                                             native_grid_use_cuda
     530              :       REAL(KIND=dp)                                      :: local_exc
     531            0 :       REAL(KIND=dp), DIMENSION(:), POINTER               :: exc_density
     532              :       TYPE(section_vals_type), POINTER                   :: gauxc_section
     533            0 :       TYPE(skala_gpw_feature_type)                       :: features
     534              :       TYPE(torch_tensor_type)                            :: exc_density_t
     535              :       TYPE(xc_rho_cflags_type)                           :: needs
     536              :       TYPE(xc_rho_set_type)                              :: rho_set
     537              : 
     538            0 :       CPASSERT(ASSOCIATED(rho_r))
     539            0 :       CPASSERT(ASSOCIATED(rho_g))
     540            0 :       CPASSERT(ASSOCIATED(tau))
     541            0 :       CALL pw_zero(exc_r)  ! output grid starts from zero
     542              : 
     543            0 :       nspins = SIZE(rho_r)
     544            0 :       lsd = (nspins /= 1)  ! more than one density grid selects the spin-resolved flags below
     545            0 :       CALL get_skala_model_path(xc_section, model_path)
     546            0 :       gauxc_section => get_gauxc_section(xc_section)
     547            0 :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_USE_CUDA", l_val=native_grid_use_cuda)
     548              :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_CUDA_DEVICE", &
     549            0 :                                 i_val=native_grid_cuda_device)
     550              :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_ATOM_CHUNKS", &
     551            0 :                                 l_val=native_grid_atom_chunks)
     552              :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_ATOM_CHUNK_ROUTING", &
     553            0 :                                 l_val=native_grid_atom_chunk_routing)
     554              :       native_grid_atom_chunks = .FALSE.  ! discards the value read above; not referenced again in this routine
     555              :       native_grid_atom_chunk_routing = .FALSE.  ! likewise discarded; the feature build gets literal .FALSE.
     556              :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_ATOM_PARTITION", &
     557            0 :                                 i_val=native_grid_atom_partition)
     558            0 :       SELECT CASE (native_grid_atom_partition)  ! map input value 1/2 onto the partition constants
     559              :       CASE (1)
     560            0 :          native_grid_atom_partition = skala_gpw_atom_partition_hard
     561              :       CASE (2)
     562            0 :          native_grid_atom_partition = skala_gpw_atom_partition_smooth
     563              :       CASE DEFAULT
     564              :          CALL cp_abort(__LOCATION__, &
     565            0 :                        "Unknown GAUXC%NATIVE_GRID_ATOM_PARTITION value.")
     566              :       END SELECT
     567              : 
     568            0 :       CALL torch_use_cuda(native_grid_use_cuda)
     569              :       selected_cuda_device = configure_native_grid_cuda( &
     570            0 :                              native_grid_use_cuda, native_grid_cuda_device, rho_r(1)%pw_grid%para%group)
     571            0 :       CALL ensure_model_loaded(model_path, selected_cuda_device)  ! presumably fills cached_model, used below - confirm
     572              : 
     573            0 :       IF (lsd) THEN  ! set the flags that are passed to xc_rho_set_update below
     574            0 :          needs%rho_spin = .TRUE.
     575            0 :          needs%drho_spin = .TRUE.
     576            0 :          needs%tau_spin = .TRUE.
     577              :       ELSE
     578            0 :          needs%rho = .TRUE.
     579            0 :          needs%drho = .TRUE.
     580            0 :          needs%tau = .TRUE.
     581              :       END IF
     582              : 
     583            0 :       CALL section_vals_val_get(xc_section, "XC_GRID%XC_DERIV", i_val=xc_deriv_method_id)
     584            0 :       CALL section_vals_val_get(xc_section, "XC_GRID%XC_SMOOTH_RHO", i_val=xc_rho_smooth_id)
     585              : 
     586              :       CALL xc_rho_set_create(rho_set, &
     587              :                              rho_r(1)%pw_grid%bounds_local, &
     588              :                              rho_cutoff=section_get_rval(xc_section, "density_cutoff"), &
     589              :                              drho_cutoff=section_get_rval(xc_section, "gradient_cutoff"), &
     590            0 :                              tau_cutoff=section_get_rval(xc_section, "tau_cutoff"))
     591              :       CALL xc_rho_set_update(rho_set, rho_r, rho_g, tau, needs, &
     592            0 :                              xc_deriv_method_id, xc_rho_smooth_id, pw_pool)
     593              : 
     594              :       CALL skala_gpw_feature_build(features, rho_set, rho_r, particle_set, cell, &
     595              :                                    requires_grad=.FALSE., weights=weights, &
     596              :                                    requires_coordinate_grad=.FALSE., &
     597              :                                    requires_stress_grad=.FALSE., &
     598              :                                    use_atom_chunks=.FALSE., route_atom_chunks=.FALSE., &
     599            0 :                                    atom_partition=native_grid_atom_partition)  ! gradient and atom-chunk options are all off
     600            0 :       CALL skala_torch_model_get_exc_density(cached_model, features%inputs, exc_density_t)
     601            0 :       NULLIFY (exc_density)
     602            0 :       CALL torch_tensor_data_ptr(exc_density_t, exc_density)  ! presumably a view of the tensor data, not a copy - confirm
     603              : 
     604            0 :       local_row = 0  ! running count of visited grid points (i fastest, then j, then k)
     605            0 :       DO k = LBOUND(features%feature_index, 3), UBOUND(features%feature_index, 3)
     606            0 :          DO j = LBOUND(features%feature_index, 2), UBOUND(features%feature_index, 2)
     607            0 :             DO i = LBOUND(features%feature_index, 1), UBOUND(features%feature_index, 1)
     608            0 :                local_row = local_row + 1
     609            0 :                local_exc = 0.0_dp  ! sums exc_density*grid_weights over the feature rows listed for this point
     610            0 :                DO feature_pos = features%local_feature_offsets(local_row), &
     611            0 :                   features%local_feature_offsets(local_row + 1) - 1
     612            0 :                   row = features%local_feature_rows(feature_pos)
     613            0 :                   local_exc = local_exc + exc_density(row)*features%grid_weights(row)
     614              :                END DO
     615            0 :                exc_r%array(i, j, k) = local_exc/rho_r(1)%pw_grid%dvol  ! weighted sum divided by pw_grid%dvol
     616              :             END DO
     617              :          END DO
     618              :       END DO  ! local_row must now equal features%nflat_local (asserted below)
     619            0 :       CPASSERT(local_row == features%nflat_local)
     620              : 
     621            0 :       CALL torch_tensor_release(exc_density_t)  ! exc_density is not read after this release
     622            0 :       CALL skala_gpw_feature_release(features)
     623            0 :       CALL xc_rho_set_release(rho_set, pw_pool=pw_pool)
     624            0 :       CALL torch_use_cuda(.TRUE.)  ! sets the flag to .TRUE. unconditionally; the value on entry is not saved
     625              : 
     626            0 :    END SUBROUTINE skala_gpw_exc_density
     627              : 
     628              : ! **************************************************************************************************
     629              : !> \brief Evaluate SKALA on a GAPW one-center atomic grid.
     630              : !> \param xc_section ...
     631              : !> \param grid_atom ...
     632              : !> \param group ...
     633              : !> \param atom_coord ...
     634              : !> \param rho ...
     635              : !> \param drho ...
     636              : !> \param tau ...
     637              : !> \param weights ...
     638              : !> \param lsd ...
     639              : !> \param nspins ...
     640              : !> \param na ...
     641              : !> \param nr ...
     642              : !> \param exc ...
     643              : !> \param vxc ...
     644              : !> \param vxg ...
     645              : !> \param vtau ...
     646              : !> \param energy_only ...
     647              : !> \param atom_force ...
     648              : !> \param atom_virial ...
     649              : ! **************************************************************************************************
     650           32 :    SUBROUTINE skala_gapw_atom_vxc_of_r(xc_section, grid_atom, group, atom_coord, &
     651           32 :                                        rho, drho, tau, weights, lsd, nspins, na, nr, &
     652              :                                        exc, vxc, vxg, vtau, energy_only, atom_force, atom_virial)
     653              :       TYPE(section_vals_type), POINTER                   :: xc_section
     654              :       TYPE(grid_atom_type), POINTER                      :: grid_atom
     655              : 
     656              :       CLASS(mp_comm_type), INTENT(IN)                    :: group
     657              :       REAL(KIND=dp), DIMENSION(3), INTENT(IN)            :: atom_coord
     658              :       REAL(KIND=dp), DIMENSION(:, :, :), POINTER         :: rho, tau, vxc, vtau
     659              :       REAL(KIND=dp), DIMENSION(:, :, :, :), POINTER      :: drho, vxg
     660              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: weights
     661              :       LOGICAL, INTENT(IN)                                :: lsd
     662              :       INTEGER, INTENT(IN)                                :: nspins, na, nr
     663              :       REAL(KIND=dp), INTENT(OUT)                         :: exc
     664              :       LOGICAL, INTENT(IN), OPTIONAL                      :: energy_only
     665              :       REAL(KIND=dp), DIMENSION(3), INTENT(OUT), &
     666              :          OPTIONAL                                        :: atom_force
     667              :       REAL(KIND=dp), DIMENSION(3, 3), INTENT(OUT), &
     668              :          OPTIONAL                                        :: atom_virial
     669              : 
     670              :       CHARACTER(len=default_path_length)                 :: model_path
     671              :       INTEGER                                            :: ia, idir, ir, native_grid_cuda_device, &
     672              :                                                             jdir, nflat, row, selected_cuda_device
     673           32 :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: atomic_grid_sizes
     674           32 :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:, :)  :: atomic_grid_size_bound_shape
     675              :       LOGICAL                                            :: need_coord_grad, my_energy_only, native_grid_use_cuda
     676              :       REAL(KIND=dp)                                      :: tmp
     677           32 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: atomic_grid_weights, grid_weights
     678           32 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: coarse_0_atomic_coords, density, &
     679           32 :                                                             grid_coords, kin
     680           32 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: grad
     681           32 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: atom_coord_grad, density_grad, &
     682           32 :                                                             grid_coord_grad, kin_grad
     683           32 :       REAL(KIND=dp), DIMENSION(:, :, :), POINTER         :: grad_grad
     684              :       TYPE(section_vals_type), POINTER                   :: gauxc_section
     685              :       TYPE(torch_dict_type)                              :: inputs
     686              :       TYPE(torch_tensor_type)                            :: atomic_grid_size_bound_shape_t, &
     687              :                                                             atomic_grid_sizes_t, &
     688              :                                                             atomic_grid_weights_t, &
     689              :                                                             atom_coord_grad_t, &
     690              :                                                             coarse_0_atomic_coords_t, density_t, &
     691              :                                                             density_grad_t, exc_tensor, grad_t, &
     692              :                                                             grad_grad_t, grid_coord_grad_t, &
     693              :                                                             grid_coords_t, grid_weights_t, kin_t, &
     694              :                                                             kin_grad_t
     695              : 
     696            0 :       CPASSERT(ASSOCIATED(xc_section))
     697           32 :       CPASSERT(ASSOCIATED(grid_atom))
     698           32 :       CPASSERT(ASSOCIATED(rho))
     699           32 :       CPASSERT(ASSOCIATED(drho))
     700           32 :       CPASSERT(ASSOCIATED(tau))
     701              : 
     702           32 :       my_energy_only = .FALSE.
     703           32 :       IF (PRESENT(energy_only)) my_energy_only = energy_only
     704           32 :       need_coord_grad = PRESENT(atom_force) .OR. PRESENT(atom_virial)
     705           32 :       exc = 0.0_dp
     706           32 :       IF (PRESENT(atom_force)) atom_force = 0.0_dp
     707           32 :       IF (PRESENT(atom_virial)) atom_virial = 0.0_dp
     708           32 :       IF (.NOT. my_energy_only) THEN
     709        81664 :          vxc = 0.0_dp
     710       321664 :          vxg = 0.0_dp
     711        81664 :          vtau = 0.0_dp
     712              :       END IF
     713              : 
     714           32 :       CALL get_skala_model_path(xc_section, model_path)
     715           32 :       gauxc_section => get_gauxc_section(xc_section)
     716           32 :       CPASSERT(ASSOCIATED(gauxc_section))
     717           32 :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_USE_CUDA", l_val=native_grid_use_cuda)
     718              :       CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_CUDA_DEVICE", &
     719           32 :                                 i_val=native_grid_cuda_device)
     720           32 :       CALL torch_use_cuda(native_grid_use_cuda)
     721              :       selected_cuda_device = configure_native_grid_cuda( &
     722           32 :                              native_grid_use_cuda, native_grid_cuda_device, group)
     723           32 :       CALL ensure_model_loaded(model_path, selected_cuda_device)
     724              : 
     725           32 :       nflat = na*nr
     726              :       ALLOCATE (density(nflat, 2), grad(nflat, 3, 2), kin(nflat, 2), &
     727              :                 grid_coords(3, nflat), grid_weights(nflat), &
     728              :                 atomic_grid_weights(nflat), atomic_grid_sizes(1), &
     729          416 :                 coarse_0_atomic_coords(3, 1), atomic_grid_size_bound_shape(0, nflat))
     730           32 :       density = 0.0_dp
     731           32 :       grad = 0.0_dp
     732           32 :       kin = 0.0_dp
     733           32 :       grid_coords = 0.0_dp
     734           32 :       grid_weights = 0.0_dp
     735           32 :       atomic_grid_weights = 0.0_dp
     736           32 :       atomic_grid_sizes(1) = INT(nflat, KIND=int_8)
     737              :       atomic_grid_size_bound_shape = 0_int_8
     738          128 :       coarse_0_atomic_coords(:, 1) = atom_coord
     739              : 
     740              :       row = 0
     741         1632 :       DO ir = 1, nr
     742        81632 :          DO ia = 1, na
     743        80000 :             row = row + 1
     744              :             grid_coords(1, row) = atom_coord(1) + grid_atom%rad(ir)* &
     745        80000 :                                   grid_atom%sin_pol(ia)*grid_atom%cos_azi(ia)
     746              :             grid_coords(2, row) = atom_coord(2) + grid_atom%rad(ir)* &
     747        80000 :                                   grid_atom%sin_pol(ia)*grid_atom%sin_azi(ia)
     748        80000 :             grid_coords(3, row) = atom_coord(3) + grid_atom%rad(ir)*grid_atom%cos_pol(ia)
     749        80000 :             grid_weights(row) = weights(ia, ir)
     750        80000 :             atomic_grid_weights(row) = weights(ia, ir)
     751        81600 :             IF (nspins == 1) THEN
     752       240000 :                density(row, :) = 0.5_dp*rho(ia, ir, 1)
     753       320000 :                DO idir = 1, 3
     754       800000 :                   grad(row, idir, :) = 0.5_dp*drho(idir, ia, ir, 1)
     755              :                END DO
     756       240000 :                kin(row, :) = 0.5_dp*tau(ia, ir, 1)
     757              :             ELSE
     758            0 :                density(row, :) = rho(ia, ir, 1:2)
     759            0 :                DO idir = 1, 3
     760            0 :                   grad(row, idir, :) = drho(idir, ia, ir, 1:2)
     761              :                END DO
     762            0 :                kin(row, :) = tau(ia, ir, 1:2)
     763              :             END IF
     764              :          END DO
     765              :       END DO
     766              : 
     767           32 :       CALL torch_tensor_from_array(grid_coords_t, grid_coords)
     768           32 :       CALL torch_tensor_to_device_leaf(grid_coords_t, need_coord_grad)
     769           32 :       CALL torch_tensor_from_array(grid_weights_t, grid_weights)
     770           32 :       CALL torch_tensor_to_device_leaf(grid_weights_t, .FALSE.)
     771           32 :       CALL torch_tensor_from_array(atomic_grid_weights_t, atomic_grid_weights)
     772           32 :       CALL torch_tensor_to_device_leaf(atomic_grid_weights_t, .FALSE.)
     773           32 :       CALL torch_tensor_from_array(atomic_grid_sizes_t, atomic_grid_sizes)
     774           32 :       CALL torch_tensor_to_device_leaf(atomic_grid_sizes_t, .FALSE.)
     775              :       CALL torch_tensor_from_array(atomic_grid_size_bound_shape_t, &
     776           32 :                                    atomic_grid_size_bound_shape)
     777           32 :       CALL torch_tensor_to_device_leaf(atomic_grid_size_bound_shape_t, .FALSE.)
     778           32 :       CALL torch_tensor_from_array(coarse_0_atomic_coords_t, coarse_0_atomic_coords)
     779           32 :       CALL torch_tensor_to_device_leaf(coarse_0_atomic_coords_t, need_coord_grad)
     780           32 :       CALL torch_tensor_from_array(density_t, density)
     781           32 :       CALL torch_tensor_to_device_leaf(density_t,.NOT. my_energy_only)
     782           32 :       CALL torch_tensor_from_array(grad_t, grad)
     783           32 :       CALL torch_tensor_to_device_leaf(grad_t,.NOT. my_energy_only)
     784           32 :       CALL torch_tensor_from_array(kin_t, kin)
     785           32 :       CALL torch_tensor_to_device_leaf(kin_t,.NOT. my_energy_only)
     786              : 
     787           32 :       CALL torch_dict_create(inputs)
     788           32 :       CALL torch_dict_insert(inputs, "grid_coords", grid_coords_t)
     789           32 :       CALL torch_dict_insert(inputs, "grid_weights", grid_weights_t)
     790           32 :       CALL torch_dict_insert(inputs, "atomic_grid_weights", atomic_grid_weights_t)
     791           32 :       CALL torch_dict_insert(inputs, "atomic_grid_sizes", atomic_grid_sizes_t)
     792              :       CALL torch_dict_insert(inputs, "atomic_grid_size_bound_shape", &
     793           32 :                              atomic_grid_size_bound_shape_t)
     794           32 :       CALL torch_dict_insert(inputs, "density", density_t)
     795           32 :       CALL torch_dict_insert(inputs, "grad", grad_t)
     796           32 :       CALL torch_dict_insert(inputs, "kin", kin_t)
     797           32 :       CALL torch_dict_insert(inputs, "coarse_0_atomic_coords", coarse_0_atomic_coords_t)
     798              : 
     799           32 :       CALL skala_torch_model_get_exc(cached_model, inputs, grid_weights_t, exc_tensor, exc)
     800              : 
     801           32 :       IF (.NOT. my_energy_only) THEN
     802           32 :          NULLIFY (atom_coord_grad, density_grad, grad_grad, grid_coord_grad, kin_grad)
     803           32 :          CALL torch_tensor_backward_scalar(exc_tensor)
     804           32 :          IF (need_coord_grad) THEN
     805           32 :             CALL torch_tensor_grad(grid_coords_t, grid_coord_grad_t)
     806           32 :             CALL torch_tensor_grad(coarse_0_atomic_coords_t, atom_coord_grad_t)
     807           32 :             CALL torch_tensor_data_ptr(grid_coord_grad_t, grid_coord_grad)
     808           32 :             CALL torch_tensor_data_ptr(atom_coord_grad_t, atom_coord_grad)
     809           32 :             IF (PRESENT(atom_force)) THEN
     810          128 :                atom_force(:) = atom_coord_grad(:, 1)
     811        80032 :                DO row = 1, nflat
     812       320032 :                   atom_force(:) = atom_force(:) + grid_coord_grad(:, row)
     813              :                END DO
     814              :             END IF
     815           32 :             IF (PRESENT(atom_virial)) THEN
     816        80032 :                DO row = 1, nflat
     817       320032 :                   DO idir = 1, 3
     818      1040000 :                      DO jdir = 1, 3
     819       720000 :                         tmp = grid_coord_grad(idir, row)*coarse_0_atomic_coords(jdir, 1)
     820       960000 :                         atom_virial(idir, jdir) = atom_virial(idir, jdir) + tmp
     821              :                      END DO
     822              :                   END DO
     823              :                END DO
     824          128 :                DO idir = 1, 3
     825          416 :                   DO jdir = 1, 3
     826          288 :                      tmp = atom_coord_grad(idir, 1)*coarse_0_atomic_coords(jdir, 1)
     827          384 :                      atom_virial(idir, jdir) = atom_virial(idir, jdir) + tmp
     828              :                   END DO
     829              :                END DO
     830              :             END IF
     831              :          END IF
     832           32 :          CALL torch_tensor_grad(density_t, density_grad_t)
     833           32 :          CALL torch_tensor_grad(grad_t, grad_grad_t)
     834           32 :          CALL torch_tensor_grad(kin_t, kin_grad_t)
     835           32 :          CALL torch_tensor_data_ptr(density_grad_t, density_grad)
     836           32 :          CALL torch_tensor_data_ptr(grad_grad_t, grad_grad)
     837           32 :          CALL torch_tensor_data_ptr(kin_grad_t, kin_grad)
     838              : 
     839           32 :          row = 0
     840         1632 :          DO ir = 1, nr
     841        81632 :             DO ia = 1, na
     842        80000 :                row = row + 1
     843        81600 :                IF (lsd) THEN
     844            0 :                   vxc(ia, ir, 1:2) = density_grad(row, 1:2)
     845            0 :                   DO idir = 1, 3
     846            0 :                      vxg(idir, ia, ir, 1:2) = grad_grad(row, idir, 1:2)
     847              :                   END DO
     848            0 :                   vtau(ia, ir, 1:2) = kin_grad(row, 1:2)
     849              :                ELSE
     850        80000 :                   vxc(ia, ir, 1) = 0.5_dp*(density_grad(row, 1) + density_grad(row, 2))
     851       320000 :                   DO idir = 1, 3
     852              :                      vxg(idir, ia, ir, 1) = &
     853       320000 :                         0.5_dp*(grad_grad(row, idir, 1) + grad_grad(row, idir, 2))
     854              :                   END DO
     855        80000 :                   vtau(ia, ir, 1) = 0.5_dp*(kin_grad(row, 1) + kin_grad(row, 2))
     856              :                END IF
     857              :             END DO
     858              :          END DO
     859              : 
     860           32 :          CALL torch_tensor_release(density_grad_t)
     861           32 :          CALL torch_tensor_release(grad_grad_t)
     862           32 :          CALL torch_tensor_release(kin_grad_t)
     863           32 :          IF (need_coord_grad) THEN
     864           32 :             CALL torch_tensor_release(grid_coord_grad_t)
     865           32 :             CALL torch_tensor_release(atom_coord_grad_t)
     866              :          END IF
     867              :       END IF
     868              : 
     869           32 :       CALL torch_tensor_release(exc_tensor)
     870           32 :       CALL torch_tensor_release(density_t)
     871           32 :       CALL torch_tensor_release(grad_t)
     872           32 :       CALL torch_tensor_release(kin_t)
     873           32 :       CALL torch_tensor_release(grid_coords_t)
     874           32 :       CALL torch_tensor_release(grid_weights_t)
     875           32 :       CALL torch_tensor_release(atomic_grid_weights_t)
     876           32 :       CALL torch_tensor_release(atomic_grid_sizes_t)
     877           32 :       CALL torch_tensor_release(atomic_grid_size_bound_shape_t)
     878           32 :       CALL torch_tensor_release(coarse_0_atomic_coords_t)
     879           32 :       CALL torch_dict_release(inputs)
     880            0 :       DEALLOCATE (atomic_grid_size_bound_shape, atomic_grid_sizes, atomic_grid_weights, &
     881           32 :                   coarse_0_atomic_coords, density, grad, grid_coords, grid_weights, kin)
     882           32 :       CALL torch_use_cuda(.TRUE.)
     883              : 
     884           96 :    END SUBROUTINE skala_gapw_atom_vxc_of_r
     885              : 
     886              : ! **************************************************************************************************
     887              : !> \brief Add the explicit SKALA derivative with respect to atom-center coordinates.
     888              : !> \param atom_force accumulator; on root_rank the gradient (same shape, asserted) is added to it
     889              : !> \param features feature set; the gradient of its coarse_0_atomic_coords_t tensor is queried
     890              : !> \param atom_coord_grad_t receives the gradient tensor handle; it is not released in this routine
     891              : !> \param root_rank if .TRUE. the gradient data is read and added; otherwise it is only queried
     892              : ! **************************************************************************************************
     893           60 :    SUBROUTINE add_explicit_coordinate_force(atom_force, features, atom_coord_grad_t, root_rank)
     894              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: atom_force
     895              :       TYPE(skala_gpw_feature_type), INTENT(IN)           :: features
     896              :       TYPE(torch_tensor_type), INTENT(INOUT)             :: atom_coord_grad_t
     897              :       LOGICAL, INTENT(IN)                                :: root_rank
     898              : 
     899           60 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: atom_coord_grad
     900              :       ! torch_tensor_grad runs on every caller; the data pointer is only taken when root_rank is set
     901           60 :       NULLIFY (atom_coord_grad)
     902           60 :       CALL torch_tensor_grad(features%coarse_0_atomic_coords_t, atom_coord_grad_t)
     903           60 :       IF (root_rank) THEN
     904           30 :          CALL torch_tensor_data_ptr(atom_coord_grad_t, atom_coord_grad)
     905           30 :          CPASSERT(SIZE(atom_force, 1) == SIZE(atom_coord_grad, 1))
     906           30 :          CPASSERT(SIZE(atom_force, 2) == SIZE(atom_coord_grad, 2))
     907          270 :          atom_force(:, :) = atom_force(:, :) + atom_coord_grad(:, :)
     908              :       END IF
     909              : 
     910           60 :    END SUBROUTINE add_explicit_coordinate_force
     911              : 
     912              : ! **************************************************************************************************
     913              : !> \brief Add the force from SMOOTH native-grid atom partition weights.
     914              : !> \param atom_force (3, natom) accumulator (shape asserted); terms are added for every atom
     915              : !> \param features supplies grid_weights(_t), atomic_grid_weights_t and the local row -> feature maps
     916              : !> \param particle_set atoms; positions are wrapped with pbc(..., positive_range=.TRUE.) before use
     917              : !> \param cell cell passed to pbc and to skala_gpw_smooth_partition_derivatives
     918              : !> \param rho_r only rho_r(1)%pw_grid is used (local bounds and grid point coordinates)
     919              : !> \param grid_weight_grad_t receives the gradient of features%grid_weights_t; released before return
     920              : !> \param atomic_grid_weight_grad_t receives the gradient of features%atomic_grid_weights_t; released
     921              : ! **************************************************************************************************
     922           60 :    SUBROUTINE add_smooth_partition_force(atom_force, features, particle_set, cell, rho_r, &
     923              :                                          grid_weight_grad_t, atomic_grid_weight_grad_t)
     924              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: atom_force
     925              :       TYPE(skala_gpw_feature_type), INTENT(IN)           :: features
     926              :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     927              :       TYPE(cell_type), POINTER                           :: cell
     928              :       TYPE(pw_r3d_rs_type), DIMENSION(:), POINTER        :: rho_r
     929              :       TYPE(torch_tensor_type), INTENT(INOUT)             :: grid_weight_grad_t, &
     930              :                                                             atomic_grid_weight_grad_t
     931              : 
     932              :       INTEGER                                            :: feature_begin, feature_end, feature_pos, &
     933              :                                                             i, iatom, j, jatom, k, local_row, &
     934              :                                                             natom, row
     935              :       INTEGER, DIMENSION(2, 3)                           :: bo
     936              :       LOGICAL, ALLOCATABLE, DIMENSION(:)                 :: included
     937              :       REAL(KIND=dp)                                      :: base_weight, weight_grad
     938              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: weights
     939              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: atom_coords_pbc
     940              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: dweights_datom, dweights_dstrain
     941              :       REAL(KIND=dp), DIMENSION(3)                        :: grid_point
     942           60 :       REAL(KIND=dp), DIMENSION(:), POINTER               :: atomic_grid_weight_grad, grid_weight_grad
     943              :       ! Term per included atom i and atom j: grid_weight_grad(row_i)*base_weight*dweights_datom(:, j, i)
     944           60 :       NULLIFY (atomic_grid_weight_grad, grid_weight_grad)
     945           60 :       CALL torch_tensor_grad(features%grid_weights_t, grid_weight_grad_t)
     946           60 :       CALL torch_tensor_grad(features%atomic_grid_weights_t, atomic_grid_weight_grad_t)
     947           60 :       CALL torch_tensor_data_ptr(grid_weight_grad_t, grid_weight_grad)
     948           60 :       CALL torch_tensor_data_ptr(atomic_grid_weight_grad_t, atomic_grid_weight_grad)
     949              :       ! NOTE(review): atomic_grid_weight_grad is fetched but never referenced below - confirm intended
     950           60 :       natom = SIZE(particle_set)
     951           60 :       CPASSERT(SIZE(atom_force, 1) == 3)
     952           60 :       CPASSERT(SIZE(atom_force, 2) == natom)
     953              :       ALLOCATE (atom_coords_pbc(3, natom), included(natom), weights(natom), &
     954          720 :                 dweights_datom(3, natom, natom), dweights_dstrain(3, 3, natom))
     955          180 :       DO iatom = 1, natom
     956          180 :          atom_coords_pbc(:, iatom) = pbc(particle_set(iatom)%r, cell, positive_range=.TRUE.)
     957              :       END DO
     958              :       ! Visit rank-local grid points (i fastest); base_weight sums grid_weights over the point's rows
     959          600 :       bo = rho_r(1)%pw_grid%bounds_local
     960           60 :       local_row = 0
     961         1308 :       DO k = bo(1, 3), bo(2, 3)
     962        28140 :          DO j = bo(1, 2), bo(2, 2)
     963       324264 :             DO i = bo(1, 1), bo(2, 1)
     964       296184 :                local_row = local_row + 1
     965      1184736 :                grid_point = native_grid_coordinate(rho_r(1)%pw_grid, [i, j, k])
     966              :                CALL skala_gpw_smooth_partition_derivatives(grid_point, atom_coords_pbc, cell, &
     967              :                                                            weights, included, dweights_datom, &
     968       296184 :                                                            dweights_dstrain)
     969       296184 :                feature_begin = features%local_feature_offsets(local_row)
     970       296184 :                feature_end = features%local_feature_offsets(local_row + 1) - 1
     971       888552 :                CPASSERT(feature_end - feature_begin + 1 == COUNT(included))
     972       296184 :                base_weight = 0.0_dp
     973       887144 :                DO feature_pos = feature_begin, feature_end
     974       590960 :                   row = features%local_feature_rows(feature_pos)
     975       887144 :                   base_weight = base_weight + features%grid_weights(row)
     976              :                END DO
     977              :                feature_pos = feature_begin
     978       888552 :                DO iatom = 1, natom
     979       592368 :                   IF (.NOT. included(iatom)) CYCLE
     980       590960 :                   row = features%local_feature_rows(feature_pos)
     981       590960 :                   weight_grad = grid_weight_grad(row)
     982      1772880 :                   DO jatom = 1, natom
     983              :                      atom_force(:, jatom) = atom_force(:, jatom) + &
     984              :                                             weight_grad*base_weight* &
     985      5318640 :                                             dweights_datom(:, jatom, iatom)
     986              :                   END DO
     987       888552 :                   feature_pos = feature_pos + 1
     988              :                END DO
     989       323016 :                CPASSERT(feature_pos == feature_end + 1)
     990              :             END DO
     991              :          END DO
     992              :       END DO
     993           60 :       CPASSERT(local_row == features%nflat_local)
     994              :       ! dweights_dstrain and weights are filled by the derivative helper but are not read in this routine
     995           60 :       DEALLOCATE (atom_coords_pbc, dweights_datom, dweights_dstrain, included, weights)
     996           60 :       CALL torch_tensor_release(grid_weight_grad_t)
     997           60 :       CALL torch_tensor_release(atomic_grid_weight_grad_t)
     998              : 
     999           60 :    END SUBROUTINE add_smooth_partition_force
    1000              : 
    1001              : ! **************************************************************************************************
    1002              : !> \brief Add the virial from SMOOTH native-grid atom partition weights.
    1003              : !> \param virial_xc 3x3 accumulator; each term is added to (jdir, idir) and mirrored to (idir, jdir)
    1004              : !> \param features supplies grid_weights(_t), atomic_grid_weights_t and the local row -> feature maps
    1005              : !> \param particle_set atoms; positions are wrapped with pbc(..., positive_range=.TRUE.) before use
    1006              : !> \param cell cell passed to pbc and to skala_gpw_smooth_partition_derivatives
    1007              : !> \param rho_r only rho_r(1)%pw_grid is used (local bounds and grid point coordinates)
    1008              : !> \param grid_weight_grad_t receives the gradient of features%grid_weights_t; released before return
    1009              : !> \param atomic_grid_weight_grad_t receives the gradient of features%atomic_grid_weights_t; released
    1010              : ! **************************************************************************************************
    1011           50 :    SUBROUTINE build_smooth_partition_virial(virial_xc, features, particle_set, cell, rho_r, &
    1012              :                                             grid_weight_grad_t, atomic_grid_weight_grad_t)
    1013              :       REAL(KIND=dp), DIMENSION(3, 3), INTENT(INOUT)      :: virial_xc
    1014              :       TYPE(skala_gpw_feature_type), INTENT(IN)           :: features
    1015              :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
    1016              :       TYPE(cell_type), POINTER                           :: cell
    1017              :       TYPE(pw_r3d_rs_type), DIMENSION(:), POINTER        :: rho_r
    1018              :       TYPE(torch_tensor_type), INTENT(INOUT)             :: grid_weight_grad_t, &
    1019              :                                                             atomic_grid_weight_grad_t
    1020              : 
    1021              :       INTEGER                                            :: feature_begin, feature_end, feature_pos, &
    1022              :                                                             i, iatom, idir, j, jdir, k, local_row, &
    1023              :                                                             natom, row
    1024              :       INTEGER, DIMENSION(2, 3)                           :: bo
    1025              :       LOGICAL, ALLOCATABLE, DIMENSION(:)                 :: included
    1026              :       REAL(KIND=dp)                                      :: base_weight, tmp, weight_grad
    1027              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: weights
    1028              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: atom_coords_pbc
    1029              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: dweights_datom, dweights_dstrain
    1030              :       REAL(KIND=dp), DIMENSION(3)                        :: grid_point
    1031           50 :       REAL(KIND=dp), DIMENSION(:), POINTER               :: atomic_grid_weight_grad, grid_weight_grad
    1032              :       ! Term per included atom i: grid_weight_grad(row_i)*base_weight*dweights_dstrain(idir, jdir, i)
    1033           50 :       NULLIFY (atomic_grid_weight_grad, grid_weight_grad)
    1034           50 :       CALL torch_tensor_grad(features%grid_weights_t, grid_weight_grad_t)
    1035           50 :       CALL torch_tensor_grad(features%atomic_grid_weights_t, atomic_grid_weight_grad_t)
    1036           50 :       CALL torch_tensor_data_ptr(grid_weight_grad_t, grid_weight_grad)
    1037           50 :       CALL torch_tensor_data_ptr(atomic_grid_weight_grad_t, atomic_grid_weight_grad)
    1038              :       ! NOTE(review): atomic_grid_weight_grad is fetched but never referenced below - confirm intended
    1039           50 :       natom = SIZE(particle_set)
    1040              :       ALLOCATE (atom_coords_pbc(3, natom), included(natom), weights(natom), &
    1041          600 :                 dweights_datom(3, natom, natom), dweights_dstrain(3, 3, natom))
    1042          150 :       DO iatom = 1, natom
    1043          150 :          atom_coords_pbc(:, iatom) = pbc(particle_set(iatom)%r, cell, positive_range=.TRUE.)
    1044              :       END DO
    1045              :       ! Visit rank-local grid points (i fastest); only the jdir <= idir entries of dweights_dstrain are read
    1046          500 :       bo = rho_r(1)%pw_grid%bounds_local
    1047           50 :       local_row = 0
    1048         1112 :       DO k = bo(1, 3), bo(2, 3)
    1049        24290 :          DO j = bo(1, 2), bo(2, 2)
    1050       282651 :             DO i = bo(1, 1), bo(2, 1)
    1051       258411 :                local_row = local_row + 1
    1052      1033644 :                grid_point = native_grid_coordinate(rho_r(1)%pw_grid, [i, j, k])
    1053              :                CALL skala_gpw_smooth_partition_derivatives(grid_point, atom_coords_pbc, cell, &
    1054              :                                                            weights, included, dweights_datom, &
    1055       258411 :                                                            dweights_dstrain)
    1056       258411 :                feature_begin = features%local_feature_offsets(local_row)
    1057       258411 :                feature_end = features%local_feature_offsets(local_row + 1) - 1
    1058       775233 :                CPASSERT(feature_end - feature_begin + 1 == COUNT(included))
    1059       258411 :                base_weight = 0.0_dp
    1060       774049 :                DO feature_pos = feature_begin, feature_end
    1061       515638 :                   row = features%local_feature_rows(feature_pos)
    1062       774049 :                   base_weight = base_weight + features%grid_weights(row)
    1063              :                END DO
    1064              :                feature_pos = feature_begin
    1065       775233 :                DO iatom = 1, natom
    1066       516822 :                   IF (.NOT. included(iatom)) CYCLE
    1067       515638 :                   row = features%local_feature_rows(feature_pos)
    1068       515638 :                   weight_grad = grid_weight_grad(row)
    1069      2062552 :                   DO idir = 1, 3
    1070      5156380 :                      DO jdir = 1, idir
    1071      3093828 :                         tmp = weight_grad*base_weight*dweights_dstrain(idir, jdir, iatom)
    1072      3093828 :                         virial_xc(jdir, idir) = virial_xc(jdir, idir) + tmp
    1073      4640742 :                         IF (idir /= jdir) virial_xc(idir, jdir) = virial_xc(idir, jdir) + tmp
    1074              :                      END DO
    1075              :                   END DO
    1076       775233 :                   feature_pos = feature_pos + 1
    1077              :                END DO
    1078       281589 :                CPASSERT(feature_pos == feature_end + 1)
    1079              :             END DO
    1080              :          END DO
    1081              :       END DO
    1082           50 :       CPASSERT(local_row == features%nflat_local)
    1083              :       ! dweights_datom and weights are filled by the derivative helper but are not read in this routine
    1084           50 :       DEALLOCATE (atom_coords_pbc, dweights_datom, dweights_dstrain, included, weights)
    1085           50 :       CALL torch_tensor_release(grid_weight_grad_t)
    1086           50 :       CALL torch_tensor_release(atomic_grid_weight_grad_t)
    1087              : 
    1088           50 :    END SUBROUTINE build_smooth_partition_virial
    1089              : 
    1090              : ! **************************************************************************************************
    1091              : !> \brief Return the Cartesian coordinate of a regular GPW grid point.
    1092              : !> \param pw_grid grid providing the lower index bounds, bounds(1, :), and the columns dh(:, 1:3)
    1093              : !> \param index integer grid index triple of the point
    1094              : !> \return sum over d of (index(d) - bounds(1, d))*dh(:, d); the zero vector if index == bounds(1, :)
    1095              : ! **************************************************************************************************
    1096       554595 :    FUNCTION native_grid_coordinate(pw_grid, index) RESULT(coord)
    1097              :       TYPE(pw_grid_type), POINTER                        :: pw_grid
    1098              :       INTEGER, DIMENSION(3), INTENT(IN)                  :: index
    1099              :       REAL(KIND=dp), DIMENSION(3)                        :: coord
    1100              : 
    1101              :       INTEGER, DIMENSION(3)                              :: relative_index
    1102              :       ! The offset is taken against pw_grid%bounds, not bounds_local, and no periodic wrapping is applied
    1103      2218380 :       relative_index = index - pw_grid%bounds(1, :)
    1104              :       coord = REAL(relative_index(1), KIND=dp)*pw_grid%dh(:, 1) + &
    1105              :               REAL(relative_index(2), KIND=dp)*pw_grid%dh(:, 2) + &
    1106      2218380 :               REAL(relative_index(3), KIND=dp)*pw_grid%dh(:, 3)
    1107              : 
    1108       554595 :    END FUNCTION native_grid_coordinate
    1109              : 
    1110              : ! **************************************************************************************************
    1111              : !> \brief Evaluate a rank-local atom chunk as multiple atom-contiguous Torch subchunks.
    1112              : !> \param features atom-chunked features (uses_atom_chunks asserted; routing asserted if compute_grads)
    1113              : !> \param group communicator for the alltoall that returns gradients along the reversed feature route
    1114              : !> \param max_rows positive (asserted) row cap used for the subchunk count and each subchunk build
    1115              : !> \param compute_grads if .TRUE., run backward per subchunk and return the routed gradients
    1116              : !> \param exc sum of the subchunk energies; no reduction over group is done in this routine
    1117              : !> \param density_grad (nflat_local, 2); allocated and filled only if compute_grads
    1118              : !> \param grad_grad (nflat_local, 3, 2); allocated and filled only if compute_grads
    1119              : !> \param kin_grad (nflat_local, 2); allocated and filled only if compute_grads
    1120              : !> \param collapse_spin_grads if .TRUE., 5 values per point are read and added to both spin columns
    1121              : ! **************************************************************************************************
    1122            2 :    SUBROUTINE evaluate_atom_subchunks(features, group, max_rows, compute_grads, exc, &
    1123              :                                       density_grad, grad_grad, kin_grad, collapse_spin_grads)
    1124              :       TYPE(skala_gpw_feature_type), INTENT(IN)           :: features
    1125              : 
    1126              :       CLASS(mp_comm_type), INTENT(IN)                    :: group
    1127              :       INTEGER, INTENT(IN)                                :: max_rows
    1128              :       LOGICAL, INTENT(IN)                                :: compute_grads, collapse_spin_grads
    1129              :       REAL(KIND=dp), INTENT(OUT)                         :: exc
    1130              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
    1131              :          INTENT(OUT)                                     :: density_grad, kin_grad
    1132              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
    1133              :          INTENT(OUT)                                     :: grad_grad
    1134              : 
    1135              :       INTEGER                                            :: base, isubchunk, local_row, nflat_local, &
    1136              :                                                             nroute_grad_per_point, nroute_points, &
    1137              :                                                             nsubchunks, phase_handle, point_pos, &
    1138              :                                                             subphase_handle
    1139            2 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: route_grad_return_recv_counts, &
    1140            2 :                                                             route_grad_return_recv_displs, &
    1141            2 :                                                             route_grad_return_send_counts, &
    1142            2 :                                                             route_grad_return_send_displs
    1143              :       REAL(KIND=dp)                                      :: subchunk_exc
    1144            2 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: recv_grad_buffer, send_grad_buffer
    1145            2 :       TYPE(skala_gpw_feature_type)                       :: subchunk
    1146              :       TYPE(torch_tensor_type)                            :: subchunk_exc_tensor
    1147              : 
    1148            0 :       CPASSERT(features%uses_atom_chunks)
    1149            2 :       CPASSERT(max_rows > 0)
    1150            2 :       nflat_local = features%nflat_local
    1151            2 :       nsubchunks = skala_gpw_atom_subchunk_count(max_rows)
    1152              :       ! Return-route counts/displs are the forward ones swapped (send <- recv, recv <- send) and scaled
    1153            2 :       exc = 0.0_dp
    1154            2 :       IF (compute_grads) THEN
    1155            2 :          CPASSERT(features%uses_atom_chunk_routing)
    1156            6 :          CPASSERT(SUM(features%route_point_recv_counts) == features%chunk_feature_count)
    1157            2 :          nroute_points = SIZE(features%route_send_local_rows)
    1158            6 :          CPASSERT(SUM(features%route_point_send_counts) == nroute_points)
    1159            2 :          nroute_grad_per_point = ngrad_per_point
    1160            2 :          IF (collapse_spin_grads) nroute_grad_per_point = ncollapsed_grad_per_point
    1161              :          ALLOCATE (send_grad_buffer(MAX(1, nroute_grad_per_point*features%chunk_feature_count)), &
    1162              :                    recv_grad_buffer(MAX(1, nroute_grad_per_point*nroute_points)), &
    1163              :                    route_grad_return_send_counts(SIZE(features%route_point_recv_counts)), &
    1164              :                    route_grad_return_send_displs(SIZE(features%route_point_recv_displs)), &
    1165              :                    route_grad_return_recv_counts(SIZE(features%route_point_send_counts)), &
    1166           26 :                    route_grad_return_recv_displs(SIZE(features%route_point_send_displs)))
    1167              :          route_grad_return_send_counts(:) = &
    1168            6 :             nroute_grad_per_point*features%route_point_recv_counts
    1169              :          route_grad_return_send_displs(:) = &
    1170            6 :             nroute_grad_per_point*features%route_point_recv_displs
    1171              :          route_grad_return_recv_counts(:) = &
    1172            6 :             nroute_grad_per_point*features%route_point_send_counts
    1173              :          route_grad_return_recv_displs(:) = &
    1174            6 :             nroute_grad_per_point*features%route_point_send_displs
    1175              :       END IF
    1176              :       ! Each subchunk is built, evaluated and released in turn; gradients are packed once afterwards
    1177            2 :       CALL timeset("skala_gpw_atom_subchunks", phase_handle)
    1178            6 :       DO isubchunk = 1, nsubchunks
    1179            4 :          CALL timeset("skala_gpw_atom_subchunk_build", subphase_handle)
    1180              :          CALL skala_gpw_feature_build_atom_subchunk(features, subchunk, isubchunk, &
    1181            4 :                                                     max_rows, compute_grads)
    1182            4 :          CALL timestop(subphase_handle)
    1183            4 :          CALL timeset("skala_gpw_atom_subchunk_forward", subphase_handle)
    1184              :          CALL skala_torch_model_get_exc(cached_model, subchunk%inputs, &
    1185              :                                         subchunk%grid_weights_t, subchunk_exc_tensor, &
    1186            4 :                                         subchunk_exc)
    1187            4 :          CALL timestop(subphase_handle)
    1188            4 :          exc = exc + subchunk_exc
    1189            4 :          IF (compute_grads) THEN
    1190            4 :             CALL timeset("skala_gpw_atom_subchunk_backward", subphase_handle)
    1191            4 :             CALL torch_tensor_backward_scalar(subchunk_exc_tensor)
    1192            4 :             CALL timestop(subphase_handle)
    1193              :          END IF
    1194            4 :          CALL timeset("skala_gpw_atom_subchunk_release", subphase_handle)
    1195            4 :          CALL torch_tensor_release(subchunk_exc_tensor)
    1196            4 :          CALL skala_gpw_feature_release(subchunk)
    1197           18 :          CALL timestop(subphase_handle)
    1198              :       END DO
    1199            2 :       IF (compute_grads .AND. features%chunk_feature_count > 0) THEN
    1200            2 :          CALL timeset("skala_gpw_atom_subchunk_grad_pack", subphase_handle)
    1201            2 :          CALL pack_atom_chunk_grads(features, send_grad_buffer, .TRUE., collapse_spin_grads)
    1202            2 :          CALL timestop(subphase_handle)
    1203              :       END IF
    1204            2 :       CALL timestop(phase_handle)
    1205              :       ! The alltoall is entered whenever compute_grads is set, even if nothing was packed on this rank
    1206            2 :       IF (compute_grads) THEN
    1207            2 :          CALL timeset("skala_gpw_grad_route_comm", phase_handle)
    1208              :          CALL group%alltoall(send_grad_buffer, route_grad_return_send_counts, &
    1209              :                              route_grad_return_send_displs, recv_grad_buffer, &
    1210            2 :                              route_grad_return_recv_counts, route_grad_return_recv_displs)
    1211            2 :          CALL timestop(phase_handle)
    1212              :          ! Scatter-add into zeroed arrays; full layout per point: rho(2), grad(3) spin 1, grad(3) spin 2, kin(2)
    1213            2 :          CALL timeset("skala_gpw_grad_route_scatter", phase_handle)
    1214            0 :          ALLOCATE (density_grad(nflat_local, 2), grad_grad(nflat_local, 3, 2), &
    1215           14 :                    kin_grad(nflat_local, 2))
    1216            2 :          density_grad = 0.0_dp
    1217            2 :          grad_grad = 0.0_dp
    1218            2 :          kin_grad = 0.0_dp
    1219        64002 :          DO point_pos = 1, nroute_points
    1220        64000 :             local_row = features%route_send_local_rows(point_pos)
    1221        64000 :             CPASSERT(local_row >= 1 .AND. local_row <= nflat_local)
    1222        64000 :             base = nroute_grad_per_point*(point_pos - 1)
    1223        64002 :             IF (collapse_spin_grads) THEN
    1224              :                density_grad(local_row, :) = density_grad(local_row, :) + &
    1225       192000 :                                             recv_grad_buffer(base + 1)
    1226              :                grad_grad(local_row, 1, :) = grad_grad(local_row, 1, :) + &
    1227       192000 :                                             recv_grad_buffer(base + 2)
    1228              :                grad_grad(local_row, 2, :) = grad_grad(local_row, 2, :) + &
    1229       192000 :                                             recv_grad_buffer(base + 3)
    1230              :                grad_grad(local_row, 3, :) = grad_grad(local_row, 3, :) + &
    1231       192000 :                                             recv_grad_buffer(base + 4)
    1232       192000 :                kin_grad(local_row, :) = kin_grad(local_row, :) + recv_grad_buffer(base + 5)
    1233              :             ELSE
    1234              :                density_grad(local_row, :) = density_grad(local_row, :) + &
    1235            0 :                                             recv_grad_buffer(base + 1:base + 2)
    1236              :                grad_grad(local_row, 1, 1) = grad_grad(local_row, 1, 1) + &
    1237            0 :                                             recv_grad_buffer(base + 3)
    1238              :                grad_grad(local_row, 2, 1) = grad_grad(local_row, 2, 1) + &
    1239            0 :                                             recv_grad_buffer(base + 4)
    1240              :                grad_grad(local_row, 3, 1) = grad_grad(local_row, 3, 1) + &
    1241            0 :                                             recv_grad_buffer(base + 5)
    1242              :                grad_grad(local_row, 1, 2) = grad_grad(local_row, 1, 2) + &
    1243            0 :                                             recv_grad_buffer(base + 6)
    1244              :                grad_grad(local_row, 2, 2) = grad_grad(local_row, 2, 2) + &
    1245            0 :                                             recv_grad_buffer(base + 7)
    1246              :                grad_grad(local_row, 3, 2) = grad_grad(local_row, 3, 2) + &
    1247            0 :                                             recv_grad_buffer(base + 8)
    1248              :                kin_grad(local_row, :) = kin_grad(local_row, :) + &
    1249            0 :                                         recv_grad_buffer(base + 9:base + 10)
    1250              :             END IF
    1251              :          END DO
    1252            2 :          CALL timestop(phase_handle)
    1253              : 
    1254            0 :          DEALLOCATE (recv_grad_buffer, route_grad_return_recv_counts, &
    1255            0 :                      route_grad_return_recv_displs, route_grad_return_send_counts, &
    1256            6 :                      route_grad_return_send_displs, send_grad_buffer)
    1257              :       END IF
    1258              : 
    1259            4 :    END SUBROUTINE evaluate_atom_subchunks
    1260              : 
    1261              : ! **************************************************************************************************
    1262              : !> \brief Select an automatic CUDA atom-subchunk row cap from the chunk row count reduced over the group.
    1263              : !> \param features feature container; only features%chunk_feature_count is read
    1264              : !> \param group communicator; group%max is applied to the row count and group%num_pe selects the formula
    1265              : !> \return 0 if the reduced row count is <= 0, else the quantum-rounded cap after the min/max row clamp
    1266              : ! **************************************************************************************************
    1267            0 :    FUNCTION auto_atom_chunk_max_rows(features, group) RESULT(max_rows)
    1268              :       TYPE(skala_gpw_feature_type), INTENT(IN)           :: features
    1269              : 
    1270              :       CLASS(mp_comm_type), INTENT(IN)                    :: group
    1271              :       INTEGER                                            :: max_rows
    1272              : 
    1273              :       INTEGER                                            :: local_rows_max, target_rows
    1274              :       ! Start from this rank's count; group%max presumably leaves the group-wide maximum on every rank (confirm)
    1275            0 :       local_rows_max = features%chunk_feature_count
    1276            0 :       CALL group%max(local_rows_max)
    1277            0 :       IF (local_rows_max <= 0) THEN
    1278            0 :          max_rows = 0
    1279              :          RETURN
    1280              :       END IF
    1281              :       ! >1 rank: CEILING(max/2) rounded up to a quantum multiple; 1 rank: NINT(max/4) rounded to nearest (>= 1 quantum)
    1282            0 :       IF (group%num_pe > 1) THEN
    1283            0 :          target_rows = CEILING(REAL(local_rows_max, KIND=dp)/2.0_dp)
    1284              :          max_rows = atom_chunk_auto_row_quantum* &
    1285            0 :                     ((target_rows + atom_chunk_auto_row_quantum - 1)/atom_chunk_auto_row_quantum)
    1286              :       ELSE
    1287            0 :          target_rows = NINT(REAL(local_rows_max, KIND=dp)/4.0_dp)
    1288              :          max_rows = atom_chunk_auto_row_quantum* &
    1289              :                     MAX(1, NINT(REAL(target_rows, KIND=dp)/ &
    1290            0 :                                 REAL(atom_chunk_auto_row_quantum, KIND=dp)))
    1291              :       END IF
    1292            0 :       max_rows = MAX(atom_chunk_auto_min_rows, MIN(atom_chunk_auto_max_rows, max_rows))
    1293              : 
    1294            0 :    END FUNCTION auto_atom_chunk_max_rows
    1295              : 
    1296              : ! **************************************************************************************************
    1297              : !> \brief Map full Torch feature gradients back to this rank's local grid order.
    1298              : !> \param features feature container; supplies nflat, nflat_local, feature_index bounds and the row offset tables
    1299              : !> \param density_grad allocated here as (nflat_local, 2); per local row the sum of its mapped gradient rows
    1300              : !> \param grad_grad allocated here as (nflat_local, 3, 2); accumulated the same way as density_grad
    1301              : !> \param kin_grad allocated here as (nflat_local, 2); accumulated the same way as density_grad
    1302              : ! **************************************************************************************************
    1303          282 :    SUBROUTINE fetch_local_feature_grads(features, density_grad, grad_grad, kin_grad)
    1304              :       TYPE(skala_gpw_feature_type), INTENT(IN)           :: features
    1305              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
    1306              :          INTENT(OUT)                                     :: density_grad
    1307              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
    1308              :          INTENT(OUT)                                     :: grad_grad
    1309              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
    1310              :          INTENT(OUT)                                     :: kin_grad
    1311              : 
    1312              :       INTEGER                                            :: feature_pos, i, j, k, local_row, row
    1313          282 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: density_grad_all, kin_grad_all
    1314          282 :       REAL(KIND=dp), DIMENSION(:, :, :), POINTER         :: grad_grad_all
    1315              :       TYPE(torch_tensor_type)                            :: density_grad_t, grad_grad_t, kin_grad_t
    1316              :       ! Fetch the gradient tensors with pointer views of their data, then check the views cover all nflat rows
    1317          282 :       NULLIFY (density_grad_all, grad_grad_all, kin_grad_all)
    1318              :       CALL get_feature_grad_views(features, density_grad_t, grad_grad_t, kin_grad_t, &
    1319          282 :                                   density_grad_all, grad_grad_all, kin_grad_all)
    1320          282 :       CPASSERT(SIZE(density_grad_all, 1) == features%nflat)
    1321          282 :       CPASSERT(SIZE(density_grad_all, 2) == 2)
    1322          282 :       CPASSERT(SIZE(grad_grad_all, 1) == features%nflat)
    1323          282 :       CPASSERT(SIZE(grad_grad_all, 2) == 3)
    1324          282 :       CPASSERT(SIZE(grad_grad_all, 3) == 2)
    1325          282 :       CPASSERT(SIZE(kin_grad_all, 1) == features%nflat)
    1326          282 :       CPASSERT(SIZE(kin_grad_all, 2) == 2)
    1327              :       ! local_row counts grid points with i running fastest; rows offsets(local_row)..offsets(local_row+1)-1 are summed
    1328            0 :       ALLOCATE (density_grad(features%nflat_local, 2), &
    1329            0 :                 grad_grad(features%nflat_local, 3, 2), &
    1330         1974 :                 kin_grad(features%nflat_local, 2))
    1331          282 :       density_grad = 0.0_dp
    1332          282 :       grad_grad = 0.0_dp
    1333          282 :       kin_grad = 0.0_dp
    1334          282 :       local_row = 0
    1335         6408 :       DO k = LBOUND(features%feature_index, 3), UBOUND(features%feature_index, 3)
    1336       144114 :          DO j = LBOUND(features%feature_index, 2), UBOUND(features%feature_index, 2)
    1337      2124981 :             DO i = LBOUND(features%feature_index, 1), UBOUND(features%feature_index, 1)
    1338      1737981 :                local_row = local_row + 1
    1339      4286402 :                DO feature_pos = features%local_feature_offsets(local_row), &
    1340      1865127 :                   features%local_feature_offsets(local_row + 1) - 1
    1341      2548421 :                   row = features%local_feature_rows(feature_pos)
    1342      2548421 :                   CPASSERT(row >= 1 .AND. row <= features%nflat)
    1343              :                   density_grad(local_row, :) = density_grad(local_row, :) + &
    1344      7645263 :                                                density_grad_all(row, :)
    1345              :                   grad_grad(local_row, :, :) = grad_grad(local_row, :, :) + &
    1346     22935789 :                                                grad_grad_all(row, :, :)
    1347      9383244 :                   kin_grad(local_row, :) = kin_grad(local_row, :) + kin_grad_all(row, :)
    1348              :                END DO
    1349              :             END DO
    1350              :          END DO
    1351              :       END DO
    1352          282 :       CPASSERT(local_row == features%nflat_local)
    1353              :       ! Release the gradient tensor handles; the *_all pointer views are not referenced after this point
    1354          282 :       CALL torch_tensor_release(density_grad_t)
    1355          282 :       CALL torch_tensor_release(grad_grad_t)
    1356          282 :       CALL torch_tensor_release(kin_grad_t)
    1357              : 
    1358          282 :    END SUBROUTINE fetch_local_feature_grads
    1359              : 
    1360              : ! **************************************************************************************************
    1361              : !> \brief Pack atom-chunk Torch gradients into CP2K communication buffers.
    1362              : !> \param features feature container; supplies the gradient tensors, chunk_feature_count and chunk_return_positions
    1363              : !> \param TARGET flat buffer of per-point value groups; its size must be a multiple of the per-point value count
    1364              : !> \param route_to_return_positions if true row irow goes to point chunk_return_positions(irow), else to point irow
    1365              : !> \param collapse_spin_grads optional, default .FALSE.; if true pack ncollapsed_grad_per_point averaged values
    1366              : ! **************************************************************************************************
    1367            8 :    SUBROUTINE pack_atom_chunk_grads(features, TARGET, route_to_return_positions, &
    1368              :                                     collapse_spin_grads)
    1369              :       TYPE(skala_gpw_feature_type), INTENT(IN)           :: features
    1370              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
    1371              :          INTENT(INOUT)                                   :: target
    1372              :       LOGICAL, INTENT(IN)                                :: route_to_return_positions
    1373              :       LOGICAL, INTENT(IN), OPTIONAL                      :: collapse_spin_grads
    1374              : 
    1375              :       INTEGER                                            :: base, irow, ngrad_buffer_per_point, &
    1376              :                                                             point_pos, target_points
    1377              :       LOGICAL                                            :: my_collapse_spin_grads
    1378            8 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: chunk_density_grad, chunk_kin_grad
    1379            8 :       REAL(KIND=dp), DIMENSION(:, :, :), POINTER         :: chunk_grad_grad
    1380              :       TYPE(torch_tensor_type)                            :: density_grad_t, grad_grad_t, kin_grad_t
    1381              :       ! Without the optional flag the uncollapsed layout with ngrad_per_point values per point is used
    1382            8 :       my_collapse_spin_grads = .FALSE.
    1383           16 :       IF (PRESENT(collapse_spin_grads)) my_collapse_spin_grads = collapse_spin_grads
    1384            8 :       ngrad_buffer_per_point = ngrad_per_point
    1385            8 :       IF (my_collapse_spin_grads) ngrad_buffer_per_point = ncollapsed_grad_per_point
    1386              :       ! Fetch gradient tensors and pointer views, then check buffer capacity and view shapes
    1387            8 :       NULLIFY (chunk_density_grad, chunk_grad_grad, chunk_kin_grad)
    1388              :       CALL get_feature_grad_views(features, density_grad_t, grad_grad_t, kin_grad_t, &
    1389            8 :                                   chunk_density_grad, chunk_grad_grad, chunk_kin_grad)
    1390            8 :       CPASSERT(MOD(SIZE(TARGET), ngrad_buffer_per_point) == 0)
    1391            8 :       target_points = SIZE(TARGET)/ngrad_buffer_per_point
    1392            8 :       CPASSERT(target_points >= features%chunk_feature_count)
    1393            8 :       CPASSERT(SIZE(chunk_density_grad, 1) == features%chunk_feature_count)
    1394            8 :       CPASSERT(SIZE(chunk_grad_grad, 1) == features%chunk_feature_count)
    1395            8 :       CPASSERT(SIZE(chunk_grad_grad, 2) == 3)
    1396            8 :       CPASSERT(SIZE(chunk_kin_grad, 1) == features%chunk_feature_count)
    1397            8 :       IF (features%uses_collapsed_rks_dynamic) THEN
    1398            8 :          CPASSERT(my_collapse_spin_grads)
    1399            8 :          CPASSERT(SIZE(chunk_density_grad, 2) == 1)
    1400            8 :          CPASSERT(SIZE(chunk_grad_grad, 3) == 1)
    1401            8 :          CPASSERT(SIZE(chunk_kin_grad, 2) == 1)
    1402              :       ELSE
    1403            0 :          CPASSERT(SIZE(chunk_density_grad, 2) == 2)
    1404            0 :          CPASSERT(SIZE(chunk_grad_grad, 3) == 2)
    1405            0 :          CPASSERT(SIZE(chunk_kin_grad, 2) == 2)
    1406              :       END IF
    1407              :       ! Collapsed slots 1..5: density, grad(1:3), kin, each 0.5*(one column or sum of two); full slots 1..10: both columns
    1408       146742 :       DO irow = 1, features%chunk_feature_count
    1409       146734 :          IF (route_to_return_positions) THEN
    1410       146734 :             point_pos = features%chunk_return_positions(irow)
    1411       146734 :             CPASSERT(point_pos >= 1 .AND. point_pos <= target_points)
    1412              :          ELSE
    1413              :             point_pos = irow
    1414              :          END IF
    1415       146734 :          base = ngrad_buffer_per_point*(point_pos - 1)
    1416       146742 :          IF (my_collapse_spin_grads) THEN
    1417       146734 :             IF (features%uses_collapsed_rks_dynamic) THEN
    1418       146734 :                TARGET(base + 1) = 0.5_dp*chunk_density_grad(irow, 1)
    1419       146734 :                TARGET(base + 2) = 0.5_dp*chunk_grad_grad(irow, 1, 1)
    1420       146734 :                TARGET(base + 3) = 0.5_dp*chunk_grad_grad(irow, 2, 1)
    1421       146734 :                TARGET(base + 4) = 0.5_dp*chunk_grad_grad(irow, 3, 1)
    1422       146734 :                TARGET(base + 5) = 0.5_dp*chunk_kin_grad(irow, 1)
    1423              :             ELSE
    1424              :                TARGET(base + 1) = 0.5_dp*(chunk_density_grad(irow, 1) + &
    1425            0 :                                           chunk_density_grad(irow, 2))
    1426              :                TARGET(base + 2) = 0.5_dp*(chunk_grad_grad(irow, 1, 1) + &
    1427            0 :                                           chunk_grad_grad(irow, 1, 2))
    1428              :                TARGET(base + 3) = 0.5_dp*(chunk_grad_grad(irow, 2, 1) + &
    1429            0 :                                           chunk_grad_grad(irow, 2, 2))
    1430              :                TARGET(base + 4) = 0.5_dp*(chunk_grad_grad(irow, 3, 1) + &
    1431            0 :                                           chunk_grad_grad(irow, 3, 2))
    1432            0 :                TARGET(base + 5) = 0.5_dp*(chunk_kin_grad(irow, 1) + chunk_kin_grad(irow, 2))
    1433              :             END IF
    1434              :          ELSE
    1435            0 :             TARGET(base + 1:base + 2) = chunk_density_grad(irow, :)
    1436            0 :             TARGET(base + 3) = chunk_grad_grad(irow, 1, 1)
    1437            0 :             TARGET(base + 4) = chunk_grad_grad(irow, 2, 1)
    1438            0 :             TARGET(base + 5) = chunk_grad_grad(irow, 3, 1)
    1439            0 :             TARGET(base + 6) = chunk_grad_grad(irow, 1, 2)
    1440            0 :             TARGET(base + 7) = chunk_grad_grad(irow, 2, 2)
    1441            0 :             TARGET(base + 8) = chunk_grad_grad(irow, 3, 2)
    1442            0 :             TARGET(base + 9:base + 10) = chunk_kin_grad(irow, :)
    1443              :          END IF
    1444              :       END DO
    1445              :       ! Release the gradient tensor handles; the chunk_* pointer views are not referenced after this point
    1446            8 :       CALL torch_tensor_release(density_grad_t)
    1447            8 :       CALL torch_tensor_release(grad_grad_t)
    1448            8 :       CALL torch_tensor_release(kin_grad_t)
    1449              : 
    1450            8 :    END SUBROUTINE pack_atom_chunk_grads
    1451              : 
    1452              : ! **************************************************************************************************
    1453              : !> \brief Return CPU views of autograd outputs for the SKALA dynamic feature tensors.
    1454              : !> \param features feature container; the gradients of its density_t, grad_t and kin_t tensors are requested
    1455              : !> \param density_grad_t tensor handle set by torch_tensor_grad for density_t; callers in this file release it
    1456              : !> \param grad_grad_t tensor handle set by torch_tensor_grad for grad_t; callers in this file release it
    1457              : !> \param kin_grad_t tensor handle set by torch_tensor_grad for kin_t; callers in this file release it
    1458              : !> \param density_grad pointer nullified, then associated by torch_tensor_data_ptr with density_grad_t
    1459              : !> \param grad_grad pointer nullified, then associated by torch_tensor_data_ptr with grad_grad_t
    1460              : !> \param kin_grad pointer nullified, then associated by torch_tensor_data_ptr with kin_grad_t
    1461              : ! **************************************************************************************************
    1462          290 :    SUBROUTINE get_feature_grad_views(features, density_grad_t, grad_grad_t, kin_grad_t, &
    1463              :                                      density_grad, grad_grad, kin_grad)
    1464              :       TYPE(skala_gpw_feature_type), INTENT(IN)           :: features
    1465              :       TYPE(torch_tensor_type), INTENT(INOUT)             :: density_grad_t, grad_grad_t, kin_grad_t
    1466              :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: density_grad
    1467              :       REAL(KIND=dp), DIMENSION(:, :, :), POINTER         :: grad_grad
    1468              :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: kin_grad
    1469              :       ! Presumably the views stay valid only while the *_t handles are unreleased - confirm against torch_api
    1470          290 :       NULLIFY (density_grad, grad_grad, kin_grad)
    1471          290 :       CALL torch_tensor_grad(features%density_t, density_grad_t)
    1472          290 :       CALL torch_tensor_grad(features%grad_t, grad_grad_t)
    1473          290 :       CALL torch_tensor_grad(features%kin_t, kin_grad_t)
    1474          290 :       CALL torch_tensor_data_ptr(density_grad_t, density_grad)
    1475          290 :       CALL torch_tensor_data_ptr(grad_grad_t, grad_grad)
    1476          290 :       CALL torch_tensor_data_ptr(kin_grad_t, kin_grad)
    1477              : 
    1478          290 :    END SUBROUTINE get_feature_grad_views
    1479              : 
    1480              : ! **************************************************************************************************
    1481              : !> \brief Fetch atom-chunk gradients and route them back to their local grid owners.
    1482              : !> \param features feature container; must have uses_atom_chunks set; supplies routing tables and chunk counts
    1483              : !> \param group communicator used for the alltoall (routed path) or allgatherv (unrouted path) exchange
    1484              : !> \param density_grad allocated here as (nflat_local, 2); accumulated gradient values per local row
    1485              : !> \param grad_grad allocated here as (nflat_local, 3, 2); accumulated gradient values per local row
    1486              : !> \param kin_grad allocated here as (nflat_local, 2); accumulated gradient values per local row
    1487              : ! **************************************************************************************************
    1488            6 :    SUBROUTINE fetch_and_gather_atom_chunk_grads(features, group, density_grad, grad_grad, &
    1489              :                                                 kin_grad)
    1490              :       TYPE(skala_gpw_feature_type), INTENT(IN)           :: features
    1491              : 
    1492              :       CLASS(mp_comm_type), INTENT(IN)                    :: group
    1493              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
    1494              :          INTENT(OUT)                                     :: density_grad, kin_grad
    1495              :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
    1496              :          INTENT(OUT)                                     :: grad_grad
    1497              : 
    1498              :       INTEGER                                            :: base, feature_pos, i, j, k, local_row, &
    1499              :                                                             nflat_local, nroute_grad_per_point, &
    1500              :                                                             nroute_points, phase_handle, point_pos, row
    1501            6 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: route_grad_return_recv_counts, &
    1502            6 :                                                             route_grad_return_recv_displs, &
    1503            6 :                                                             route_grad_return_send_counts, &
    1504            6 :                                                             route_grad_return_send_displs
    1505            6 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: chunk_grad_buffer, global_grad_buffer, &
    1506            6 :                                                             recv_grad_buffer, send_grad_buffer
    1507              : 
    1508            6 :       CPASSERT(features%uses_atom_chunks)
    1509              : 
    1510            6 :       nflat_local = features%nflat_local
    1511            6 :       IF (features%uses_atom_chunk_routing) THEN
    1512           18 :          CPASSERT(SUM(features%route_point_recv_counts) == features%chunk_feature_count)
    1513            6 :          nroute_points = SIZE(features%route_send_local_rows)
    1514           18 :          CPASSERT(SUM(features%route_point_send_counts) == nroute_points)
    1515              :          ! Values per routed point: ngrad_per_point, or ncollapsed_grad_per_point for collapsed RKS features
    1516            6 :          nroute_grad_per_point = ngrad_per_point
    1517            6 :          IF (features%uses_collapsed_rks_dynamic) THEN
    1518            6 :             nroute_grad_per_point = ncollapsed_grad_per_point
    1519              :          END IF
    1520              :          ALLOCATE (send_grad_buffer(MAX(1, nroute_grad_per_point*features%chunk_feature_count)), &
    1521              :                    recv_grad_buffer(MAX(1, nroute_grad_per_point*nroute_points)), &
    1522              :                    route_grad_return_send_counts(SIZE(features%route_point_recv_counts)), &
    1523              :                    route_grad_return_send_displs(SIZE(features%route_point_recv_displs)), &
    1524              :                    route_grad_return_recv_counts(SIZE(features%route_point_send_counts)), &
    1525           78 :                    route_grad_return_recv_displs(SIZE(features%route_point_send_displs)))
    1526              :          route_grad_return_send_counts(:) = &
    1527           18 :             nroute_grad_per_point*features%route_point_recv_counts
    1528              :          route_grad_return_send_displs(:) = &
    1529           18 :             nroute_grad_per_point*features%route_point_recv_displs
    1530              :          route_grad_return_recv_counts(:) = &
    1531           18 :             nroute_grad_per_point*features%route_point_send_counts
    1532              :          route_grad_return_recv_displs(:) = &
    1533           18 :             nroute_grad_per_point*features%route_point_send_displs
    1534              :          ! Packing is skipped without chunk rows; send_grad_buffer then stays unset while its send counts sum to zero
    1535            6 :          IF (features%chunk_feature_count > 0) THEN
    1536            6 :             CALL timeset("skala_gpw_grad_torch_pack", phase_handle)
    1537              :             CALL pack_atom_chunk_grads(features, send_grad_buffer, .TRUE., &
    1538            6 :                                        features%uses_collapsed_rks_dynamic)
    1539            6 :             CALL timestop(phase_handle)
    1540              :          END IF
    1541              :          ! Return exchange: send counts/displs are the route recv tables and vice versa, scaled by values per point
    1542            6 :          CALL timeset("skala_gpw_grad_route_comm", phase_handle)
    1543              :          CALL group%alltoall(send_grad_buffer, route_grad_return_send_counts, &
    1544              :                              route_grad_return_send_displs, recv_grad_buffer, &
    1545            6 :                              route_grad_return_recv_counts, route_grad_return_recv_displs)
    1546            6 :          CALL timestop(phase_handle)
    1547              :          ! Add each received point to its local row; a collapsed value is added to both entries of the last dimension
    1548            6 :          CALL timeset("skala_gpw_grad_route_scatter", phase_handle)
    1549            0 :          ALLOCATE (density_grad(nflat_local, 2), grad_grad(nflat_local, 3, 2), &
    1550           42 :                    kin_grad(nflat_local, 2))
    1551            6 :          density_grad = 0.0_dp
    1552            6 :          grad_grad = 0.0_dp
    1553            6 :          kin_grad = 0.0_dp
    1554        82740 :          DO point_pos = 1, nroute_points
    1555        82734 :             local_row = features%route_send_local_rows(point_pos)
    1556        82734 :             CPASSERT(local_row >= 1 .AND. local_row <= nflat_local)
    1557        82734 :             base = nroute_grad_per_point*(point_pos - 1)
    1558        82740 :             IF (features%uses_collapsed_rks_dynamic) THEN
    1559              :                density_grad(local_row, :) = density_grad(local_row, :) + &
    1560       248202 :                                             recv_grad_buffer(base + 1)
    1561              :                grad_grad(local_row, 1, :) = grad_grad(local_row, 1, :) + &
    1562       248202 :                                             recv_grad_buffer(base + 2)
    1563              :                grad_grad(local_row, 2, :) = grad_grad(local_row, 2, :) + &
    1564       248202 :                                             recv_grad_buffer(base + 3)
    1565              :                grad_grad(local_row, 3, :) = grad_grad(local_row, 3, :) + &
    1566       248202 :                                             recv_grad_buffer(base + 4)
    1567       248202 :                kin_grad(local_row, :) = kin_grad(local_row, :) + recv_grad_buffer(base + 5)
    1568              :             ELSE
    1569              :                density_grad(local_row, :) = density_grad(local_row, :) + &
    1570            0 :                                             recv_grad_buffer(base + 1:base + 2)
    1571              :                grad_grad(local_row, 1, 1) = grad_grad(local_row, 1, 1) + &
    1572            0 :                                             recv_grad_buffer(base + 3)
    1573              :                grad_grad(local_row, 2, 1) = grad_grad(local_row, 2, 1) + &
    1574            0 :                                             recv_grad_buffer(base + 4)
    1575              :                grad_grad(local_row, 3, 1) = grad_grad(local_row, 3, 1) + &
    1576            0 :                                             recv_grad_buffer(base + 5)
    1577              :                grad_grad(local_row, 1, 2) = grad_grad(local_row, 1, 2) + &
    1578            0 :                                             recv_grad_buffer(base + 6)
    1579              :                grad_grad(local_row, 2, 2) = grad_grad(local_row, 2, 2) + &
    1580            0 :                                             recv_grad_buffer(base + 7)
    1581              :                grad_grad(local_row, 3, 2) = grad_grad(local_row, 3, 2) + &
    1582            0 :                                             recv_grad_buffer(base + 8)
    1583              :                kin_grad(local_row, :) = kin_grad(local_row, :) + &
    1584            0 :                                         recv_grad_buffer(base + 9:base + 10)
    1585              :             END IF
    1586              :          END DO
    1587            6 :          CALL timestop(phase_handle)
    1588              : 
    1589            0 :          DEALLOCATE (recv_grad_buffer, route_grad_return_recv_counts, &
    1590            0 :                      route_grad_return_recv_displs, route_grad_return_send_counts, &
    1591           18 :                      route_grad_return_send_displs, send_grad_buffer)
    1592              :       ELSE
    1593              :          ALLOCATE (chunk_grad_buffer(MAX(1, ngrad_per_point*features%chunk_feature_count)), &
    1594            0 :                    global_grad_buffer(ngrad_per_point*features%nflat))
    1595            0 :          IF (features%chunk_feature_count > 0) THEN
    1596            0 :             CALL timeset("skala_gpw_grad_torch_pack", phase_handle)
    1597            0 :             CALL pack_atom_chunk_grads(features, chunk_grad_buffer, .FALSE.)
    1598            0 :             CALL timestop(phase_handle)
    1599              :          END IF
    1600              :          ! NOTE(review): no collapse flag is passed above, yet the packer asserts it for collapsed RKS features - confirm
    1601            0 :          CALL timeset("skala_gpw_grad_allgatherv", phase_handle)
    1602              :          CALL group%allgatherv(chunk_grad_buffer, global_grad_buffer, &
    1603            0 :                                features%chunk_grad_counts, features%chunk_grad_displs)
    1604            0 :          CALL timestop(phase_handle)
    1605              :          ! Sum, per local grid point (i fastest), the gathered rows listed in local_feature_offsets/local_feature_rows
    1606            0 :          CALL timeset("skala_gpw_grad_scatter", phase_handle)
    1607            0 :          ALLOCATE (density_grad(nflat_local, 2), grad_grad(nflat_local, 3, 2), &
    1608            0 :                    kin_grad(nflat_local, 2))
    1609            0 :          density_grad = 0.0_dp
    1610            0 :          grad_grad = 0.0_dp
    1611            0 :          kin_grad = 0.0_dp
    1612            0 :          local_row = 0
    1613            0 :          DO k = LBOUND(features%feature_index, 3), UBOUND(features%feature_index, 3)
    1614            0 :             DO j = LBOUND(features%feature_index, 2), UBOUND(features%feature_index, 2)
    1615            0 :                DO i = LBOUND(features%feature_index, 1), UBOUND(features%feature_index, 1)
    1616            0 :                   local_row = local_row + 1
    1617            0 :                   DO feature_pos = features%local_feature_offsets(local_row), &
    1618            0 :                      features%local_feature_offsets(local_row + 1) - 1
    1619            0 :                      row = features%local_feature_rows(feature_pos)
    1620            0 :                      CPASSERT(row >= 1 .AND. row <= features%nflat)
    1621            0 :                      base = ngrad_per_point*(row - 1)
    1622              :                      density_grad(local_row, :) = density_grad(local_row, :) + &
    1623            0 :                                                   global_grad_buffer(base + 1:base + 2)
    1624              :                      grad_grad(local_row, 1, 1) = grad_grad(local_row, 1, 1) + &
    1625            0 :                                                   global_grad_buffer(base + 3)
    1626              :                      grad_grad(local_row, 2, 1) = grad_grad(local_row, 2, 1) + &
    1627            0 :                                                   global_grad_buffer(base + 4)
    1628              :                      grad_grad(local_row, 3, 1) = grad_grad(local_row, 3, 1) + &
    1629            0 :                                                   global_grad_buffer(base + 5)
    1630              :                      grad_grad(local_row, 1, 2) = grad_grad(local_row, 1, 2) + &
    1631            0 :                                                   global_grad_buffer(base + 6)
    1632              :                      grad_grad(local_row, 2, 2) = grad_grad(local_row, 2, 2) + &
    1633            0 :                                                   global_grad_buffer(base + 7)
    1634              :                      grad_grad(local_row, 3, 2) = grad_grad(local_row, 3, 2) + &
    1635            0 :                                                   global_grad_buffer(base + 8)
    1636              :                      kin_grad(local_row, :) = kin_grad(local_row, :) + &
    1637            0 :                                               global_grad_buffer(base + 9:base + 10)
    1638              :                   END DO
    1639              :                END DO
    1640              :             END DO
    1641              :          END DO
    1642            0 :          CALL timestop(phase_handle)
    1643            0 :          DEALLOCATE (chunk_grad_buffer, global_grad_buffer)
    1644              : 
    1645              :       END IF
    1646              : 
    1647            6 :    END SUBROUTINE fetch_and_gather_atom_chunk_grads
    1648              : 
    1649              : ! **************************************************************************************************
    1650              : !> \brief Build the native SKALA XC virial from feature gradients.
    1651              : !> \param virial_xc 3x3 XC virial; contributions are accumulated into it in place
    1652              : !> \param rho_set queried via xc_rho_set_get for drho (one spin) or drhoa/drhob (two spins)
    1653              : !> \param rho_r real-space densities; SIZE gives nspins, rho_r(1)%pw_grid the local bounds
    1654              : !> \param grad_grad feature-gradient array indexed as (local point, direction, spin)
    1655              : ! **************************************************************************************************
    1656           50 :    SUBROUTINE build_virial_from_feature_grads(virial_xc, rho_set, rho_r, grad_grad)
    1657              :       REAL(KIND=dp), DIMENSION(3, 3), INTENT(INOUT)      :: virial_xc
    1658              :       TYPE(xc_rho_set_type), INTENT(IN)                  :: rho_set
    1659              :       TYPE(pw_r3d_rs_type), DIMENSION(:), POINTER        :: rho_r
    1660              :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN)      :: grad_grad
    1661              : 
    1662              :       INTEGER                                            :: i, idir, ipt, ispin, j, jdir, k, nspins
    1663              :       INTEGER, DIMENSION(2, 3)                           :: bo
    1664              :       REAL(KIND=dp)                                      :: grad_i, tmp
    1665          600 :       TYPE(cp_3d_r_cp_type), DIMENSION(3)                :: drho, drhoa, drhob
    1666              : 
    1667           50 :       nspins = SIZE(rho_r)
    1668          500 :       bo = rho_r(1)%pw_grid%bounds_local
    1669           50 :       ipt = 0  ! flat counter of local grid points; i runs fastest, then j, then k
    1670              : 
    1671           50 :       IF (nspins == 1) THEN
    1672           50 :          CALL xc_rho_set_get(rho_set, drho=drho)
    1673         1112 :          DO k = bo(1, 3), bo(2, 3)
    1674        24290 :             DO j = bo(1, 2), bo(2, 2)
    1675       282651 :                DO i = bo(1, 1), bo(2, 1)
    1676       258411 :                   ipt = ipt + 1
    1677      1056822 :                   DO idir = 1, 3
    1678       775233 :                      grad_i = 0.5_dp*(grad_grad(ipt, idir, 1) + grad_grad(ipt, idir, 2))  ! spin average
    1679      2584110 :                      DO jdir = 1, idir  ! only jdir <= idir is computed
    1680      1550466 :                         tmp = -grad_i*drho(jdir)%array(i, j, k)
    1681      1550466 :                         virial_xc(jdir, idir) = virial_xc(jdir, idir) + tmp
    1682      2325699 :                         virial_xc(idir, jdir) = virial_xc(jdir, idir)  ! copy to the transposed element
    1683              :                      END DO
    1684              :                   END DO
    1685              :                END DO
    1686              :             END DO
    1687              :          END DO
    1688              :       ELSE
    1689            0 :          CALL xc_rho_set_get(rho_set, drhoa=drhoa, drhob=drhob)
    1690            0 :          DO k = bo(1, 3), bo(2, 3)
    1691            0 :             DO j = bo(1, 2), bo(2, 2)
    1692            0 :                DO i = bo(1, 1), bo(2, 1)
    1693            0 :                   ipt = ipt + 1
    1694            0 :                   DO idir = 1, 3
    1695            0 :                      DO jdir = 1, idir  ! only jdir <= idir is computed
    1696              :                         tmp = 0.0_dp
    1697            0 :                         DO ispin = 1, 2  ! spin 1 pairs with drhoa, spin 2 with drhob
    1698            0 :                            IF (ispin == 1) THEN
    1699            0 :                               tmp = tmp - grad_grad(ipt, idir, ispin)*drhoa(jdir)%array(i, j, k)
    1700              :                            ELSE
    1701            0 :                               tmp = tmp - grad_grad(ipt, idir, ispin)*drhob(jdir)%array(i, j, k)
    1702              :                            END IF
    1703              :                         END DO
    1704            0 :                         virial_xc(jdir, idir) = virial_xc(jdir, idir) + tmp
    1705            0 :                         virial_xc(idir, jdir) = virial_xc(jdir, idir)  ! copy to the transposed element
    1706              :                      END DO
    1707              :                   END DO
    1708              :                END DO
    1709              :             END DO
    1710              :          END DO
    1711              :       END IF
    1712              : 
    1713           50 :    END SUBROUTINE build_virial_from_feature_grads
    1714              : 
    1715              : ! **************************************************************************************************
    1716              : !> \brief Print a native SKALA XC virial contribution for diagnostics.
    1717              : !> \param label text appended to the heading line
    1718              : !> \param delta 3x3 matrix, written one row per output line
    1719              : !> \param root_rank nothing is printed unless this is true
    1720              : ! **************************************************************************************************
    1721            0 :    SUBROUTINE print_virial_delta(label, delta, root_rank)
    1722              :       CHARACTER(LEN=*), INTENT(IN)                       :: label
    1723              :       REAL(KIND=dp), DIMENSION(3, 3), INTENT(IN)         :: delta
    1724              :       LOGICAL, INTENT(IN)                                :: root_rank
    1725              : 
    1726              :       INTEGER                                            :: i, iw
    1727              : 
    1728            0 :       IF (.NOT. root_rank) RETURN
    1729            0 :       iw = cp_logger_get_default_io_unit()
    1730            0 :       IF (iw <= 0) RETURN  ! non-positive unit: skip the output
    1731            0 :       WRITE (iw, "(T2,A,1X,A)") "SKALA_GPW| XC virial contribution", TRIM(label)
    1732            0 :       DO i = 1, 3
    1733            0 :          WRITE (iw, "(T2,A,1X,3ES20.10)") "SKALA_GPW|", delta(i, 1:3)
    1734              :       END DO
    1735              : 
    1736              :    END SUBROUTINE print_virial_delta
    1737              : 
    1738              : ! **************************************************************************************************
    1739              : !> \brief Add explicit SKALA coordinate-feature contributions to the XC virial.
    1740              : !> \param virial_xc 3x3 XC virial; contributions are accumulated into it in place
    1741              : !> \param features feature block supplying coordinates, tensors and local row bookkeeping
    1742              : !> \param atom_coord_grad_t tensor read as atom_coord_grad(direction, atom); not released here
    1743              : !> \param grid_coord_grad_t filled by torch_tensor_grad from features%grid_coords_t; released on exit
    1744              : !> \param root_rank the atomic-coordinate term is added only where this is true
    1745              : !> \param print_components optional (default .FALSE.); print grid and atom parts on root_rank
    1746              : ! **************************************************************************************************
    1747           50 :    SUBROUTINE build_static_coordinate_virial(virial_xc, features, atom_coord_grad_t, &
    1748              :                                              grid_coord_grad_t, root_rank, print_components)
    1749              :       REAL(KIND=dp), DIMENSION(3, 3), INTENT(INOUT)      :: virial_xc
    1750              :       TYPE(skala_gpw_feature_type), INTENT(IN)           :: features
    1751              :       TYPE(torch_tensor_type), INTENT(INOUT)             :: atom_coord_grad_t, grid_coord_grad_t
    1752              :       LOGICAL, INTENT(IN)                                :: root_rank
    1753              :       LOGICAL, INTENT(IN), OPTIONAL                      :: print_components
    1754              : 
    1755              :       INTEGER                                            :: feature_pos, i, iatom, idir, iw, j, &
    1756              :                                                             jdir, k, local_row, row
    1757              :       LOGICAL                                            :: my_print_components
    1758              :       REAL(KIND=dp)                                      :: tmp
    1759              :       REAL(KIND=dp), DIMENSION(3, 3)                     :: atom_virial, grid_virial
    1760           50 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: atom_coord_grad, grid_coord_grad
    1761              : 
    1762           50 :       my_print_components = .FALSE.
    1763           50 :       IF (PRESENT(print_components)) my_print_components = print_components
    1764              : 
    1765           50 :       NULLIFY (atom_coord_grad, grid_coord_grad)
    1766           50 :       CALL torch_tensor_grad(features%grid_coords_t, grid_coord_grad_t)  ! NOTE(review): presumably d/d(grid_coords); confirm
    1767           50 :       CALL torch_tensor_data_ptr(grid_coord_grad_t, grid_coord_grad)
    1768           50 :       CALL torch_tensor_data_ptr(atom_coord_grad_t, atom_coord_grad)
    1769              : 
    1770           50 :       grid_virial = 0.0_dp  ! grid_virial/atom_virial only feed the optional printout
    1771           50 :       atom_virial = 0.0_dp
    1772           50 :       local_row = 0
    1773         1212 :       DO k = LBOUND(features%feature_index, 3), UBOUND(features%feature_index, 3)
    1774        26414 :          DO j = LBOUND(features%feature_index, 2), UBOUND(features%feature_index, 2)
    1775       329007 :             DO i = LBOUND(features%feature_index, 1), UBOUND(features%feature_index, 1)
    1776       258411 :                local_row = local_row + 1  ! one local row per feature_index element
    1777       774049 :                DO feature_pos = features%local_feature_offsets(local_row), &
    1778       281589 :                   features%local_feature_offsets(local_row + 1) - 1
    1779       515638 :                   row = features%local_feature_rows(feature_pos)
    1780      2320963 :                   DO idir = 1, 3
    1781      6703294 :                      DO jdir = 1, 3
    1782      4640742 :                         tmp = grid_coord_grad(idir, row)*features%grid_coords(jdir, row)
    1783      4640742 :                         grid_virial(idir, jdir) = grid_virial(idir, jdir) + tmp
    1784      6187656 :                         virial_xc(idir, jdir) = virial_xc(idir, jdir) + tmp
    1785              :                      END DO
    1786              :                   END DO
    1787              :                END DO
    1788              :             END DO
    1789              :          END DO
    1790              :       END DO
    1791           50 :       CPASSERT(local_row == features%nflat_local)
    1792              : 
    1793           50 :       IF (root_rank) THEN  ! atom term is added on root_rank only
    1794           75 :          DO iatom = 1, SIZE(features%coarse_0_atomic_coords, 2)
    1795          225 :             DO idir = 1, 3
    1796          650 :                DO jdir = 1, 3
    1797          450 :                   tmp = atom_coord_grad(idir, iatom)*features%coarse_0_atomic_coords(jdir, iatom)
    1798          450 :                   atom_virial(idir, jdir) = atom_virial(idir, jdir) + tmp
    1799          600 :                   virial_xc(idir, jdir) = virial_xc(idir, jdir) + tmp
    1800              :                END DO
    1801              :             END DO
    1802              :          END DO
    1803              :       END IF
    1804              : 
    1805           50 :       IF (my_print_components .AND. root_rank) THEN
    1806            0 :          iw = cp_logger_get_default_io_unit()
    1807            0 :          IF (iw > 0) THEN
    1808            0 :             CALL print_virial_delta("static-grid", grid_virial, .TRUE.)
    1809            0 :             CALL print_virial_delta("static-atom", atom_virial, .TRUE.)
    1810              :          END IF
    1811              :       END IF
    1812              : 
    1813           50 :       CALL torch_tensor_release(grid_coord_grad_t)  ! atom_coord_grad_t is left to the caller
    1814              : 
    1815           50 :    END SUBROUTINE build_static_coordinate_virial
    1816              : 
    1817              : ! **************************************************************************************************
    1818              : !> \brief Add residual SKALA weight-feature contributions to the XC virial.
    1819              : !> \param virial_xc 3x3 XC virial; one scalar is added to its three diagonal elements
    1820              : !> \param features feature block supplying weights, weight tensors and local row bookkeeping
    1821              : !> \param exc value whose negative enters the scalar on root_rank only
    1822              : !> \param grid_weight_grad_t filled by torch_tensor_grad from features%grid_weights_t; released on exit
    1823              : !> \param atomic_grid_weight_grad_t same, from features%atomic_grid_weights_t; released on exit
    1824              : !> \param root_rank selects where -exc is added and where printing may happen
    1825              : !> \param print_components optional (default .FALSE.); print the individual terms on root_rank
    1826              : ! **************************************************************************************************
    1827           50 :    SUBROUTINE build_weight_virial(virial_xc, features, exc, grid_weight_grad_t, &
    1828              :                                   atomic_grid_weight_grad_t, root_rank, print_components)
    1829              :       REAL(KIND=dp), DIMENSION(3, 3), INTENT(INOUT)      :: virial_xc
    1830              :       TYPE(skala_gpw_feature_type), INTENT(IN)           :: features
    1831              :       REAL(KIND=dp), INTENT(IN)                          :: exc
    1832              :       TYPE(torch_tensor_type), INTENT(INOUT)             :: grid_weight_grad_t, &
    1833              :                                                             atomic_grid_weight_grad_t
    1834              :       LOGICAL, INTENT(IN)                                :: root_rank
    1835              :       LOGICAL, INTENT(IN), OPTIONAL                      :: print_components
    1836              : 
    1837              :       INTEGER                                            :: feature_pos, i, idir, iw, j, k, &
    1838              :                                                             local_row, row
    1839              :       LOGICAL                                            :: my_print_components
    1840              :       REAL(KIND=dp)                                      :: atomic_tmp, exc_tmp, grid_tmp, tmp
    1841           50 :       REAL(KIND=dp), DIMENSION(:), POINTER               :: atomic_grid_weight_grad, grid_weight_grad
    1842              : 
    1843           50 :       my_print_components = .FALSE.
    1844           50 :       IF (PRESENT(print_components)) my_print_components = print_components
    1845              : 
    1846           50 :       NULLIFY (atomic_grid_weight_grad, grid_weight_grad)
    1847           50 :       CALL torch_tensor_grad(features%grid_weights_t, grid_weight_grad_t)
    1848           50 :       CALL torch_tensor_grad(features%atomic_grid_weights_t, atomic_grid_weight_grad_t)
    1849           50 :       CALL torch_tensor_data_ptr(grid_weight_grad_t, grid_weight_grad)
    1850           50 :       CALL torch_tensor_data_ptr(atomic_grid_weight_grad_t, atomic_grid_weight_grad)
    1851              : 
    1852           50 :       grid_tmp = 0.0_dp
    1853           50 :       atomic_tmp = 0.0_dp
    1854           50 :       local_row = 0
    1855         1212 :       DO k = LBOUND(features%feature_index, 3), UBOUND(features%feature_index, 3)
    1856        26414 :          DO j = LBOUND(features%feature_index, 2), UBOUND(features%feature_index, 2)
    1857       329007 :             DO i = LBOUND(features%feature_index, 1), UBOUND(features%feature_index, 1)
    1858       258411 :                local_row = local_row + 1  ! one local row per feature_index element
    1859       774049 :                DO feature_pos = features%local_feature_offsets(local_row), &
    1860       281589 :                   features%local_feature_offsets(local_row + 1) - 1
    1861       515638 :                   row = features%local_feature_rows(feature_pos)
    1862       515638 :                   grid_tmp = grid_tmp + grid_weight_grad(row)*features%grid_weights(row)
    1863              :                   atomic_tmp = atomic_tmp + &
    1864       774049 :                                atomic_grid_weight_grad(row)*features%atomic_grid_weights(row)
    1865              :                END DO
    1866              :             END DO
    1867              :          END DO
    1868              :       END DO
    1869           50 :       CPASSERT(local_row == features%nflat_local)
    1870           50 :       exc_tmp = 0.0_dp
    1871           50 :       IF (root_rank) exc_tmp = -exc  ! -exc enters on root_rank only
    1872           50 :       tmp = grid_tmp + atomic_tmp + exc_tmp
    1873              : 
    1874           50 :       IF (my_print_components .AND. root_rank) THEN
    1875            0 :          iw = cp_logger_get_default_io_unit()
    1876            0 :          IF (iw > 0) THEN
    1877            0 :             WRITE (iw, "(T2,A,1X,ES20.10)") "SKALA_GPW| XC virial weight grid", grid_tmp
    1878            0 :             WRITE (iw, "(T2,A,1X,ES20.10)") "SKALA_GPW| XC virial weight atomic", atomic_tmp
    1879            0 :             WRITE (iw, "(T2,A,1X,ES20.10)") "SKALA_GPW| XC virial weight final", exc_tmp
    1880            0 :             WRITE (iw, "(T2,A,1X,ES20.10)") "SKALA_GPW| XC virial weight residual", tmp
    1881              :          END IF
    1882              :       END IF
    1883              : 
    1884          200 :       DO idir = 1, 3  ! same scalar on each diagonal element
    1885          200 :          virial_xc(idir, idir) = virial_xc(idir, idir) + tmp
    1886              :       END DO
    1887              : 
    1888           50 :       CALL torch_tensor_release(grid_weight_grad_t)
    1889           50 :       CALL torch_tensor_release(atomic_grid_weight_grad_t)
    1890              : 
    1891           50 :    END SUBROUTINE build_weight_virial
    1892              : 
    1893              : ! **************************************************************************************************
    1894              : !> \brief Fill CP2K VXC real-space arrays from Torch feature gradients.
    1895              : !> \param vxc_rho allocated here with nspins grids; set from density_grad, then given to xc_pw_divergence
    1896              : !> \param vxc_tau allocated here with nspins grids; set from kin_grad
    1897              : !> \param rho_r real-space densities; SIZE gives nspins, rho_r(1)%pw_grid the bounds and dvol
    1898              : !> \param pw_pool pool that creates all grids; the temporaries are given back to it
    1899              : !> \param density_grad indexed as (local point, spin)
    1900              : !> \param grad_grad indexed as (local point, direction, spin)
    1901              : !> \param kin_grad indexed as (local point, spin)
    1902              : !> \param xc_deriv_method_id forwarded to xc_requires_tmp_g and xc_pw_divergence
    1903              : ! **************************************************************************************************
    1904          290 :    SUBROUTINE build_vxc_from_feature_grads(vxc_rho, vxc_tau, rho_r, pw_pool, &
    1905          290 :                                            density_grad, grad_grad, kin_grad, &
    1906              :                                            xc_deriv_method_id)
    1907              :       TYPE(pw_r3d_rs_type), DIMENSION(:), POINTER        :: vxc_rho, vxc_tau, rho_r
    1908              :       TYPE(pw_pool_type), POINTER                        :: pw_pool
    1909              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: density_grad
    1910              :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN)      :: grad_grad
    1911              :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: kin_grad
    1912              :       INTEGER, INTENT(IN)                                :: xc_deriv_method_id
    1913              : 
    1914              :       INTEGER                                            :: i, ipt, ispin, j, k, nspins
    1915              :       INTEGER, DIMENSION(2, 3)                           :: bo
    1916              :       REAL(KIND=dp)                                      :: dvol_inv
    1917              :       TYPE(pw_c1d_gs_type)                               :: tmp_g, vxc_g
    1918         1160 :       TYPE(pw_r3d_rs_type), DIMENSION(3)                 :: grad_pw
    1919              : 
    1920          290 :       nspins = SIZE(rho_r)
    1921         2900 :       bo = rho_r(1)%pw_grid%bounds_local
    1922          290 :       dvol_inv = 1.0_dp/rho_r(1)%pw_grid%dvol  ! every grid value below is scaled by 1/dvol
    1923              : 
    1924         1836 :       ALLOCATE (vxc_rho(nspins), vxc_tau(nspins))
    1925          628 :       DO ispin = 1, nspins
    1926          338 :          CALL pw_pool%create_pw(vxc_rho(ispin))
    1927          338 :          CALL pw_pool%create_pw(vxc_tau(ispin))
    1928          338 :          CALL pw_zero(vxc_rho(ispin))
    1929          628 :          CALL pw_zero(vxc_tau(ispin))
    1930              :       END DO
    1931              : 
    1932          290 :       IF (xc_requires_tmp_g(xc_deriv_method_id) .OR. rho_r(1)%pw_grid%spherical) THEN
    1933          290 :          CALL pw_pool%create_pw(vxc_g)
    1934          290 :          IF (.NOT. rho_r(1)%pw_grid%spherical) CALL pw_pool%create_pw(tmp_g)
    1935              :       END IF
    1936              : 
    1937          628 :       DO ispin = 1, nspins
    1938         1352 :          DO i = 1, 3  ! fresh zeroed work grids for each spin
    1939         1014 :             CALL pw_pool%create_pw(grad_pw(i))
    1940         1352 :             CALL pw_zero(grad_pw(i))
    1941              :          END DO
    1942              : 
    1943          338 :          ipt = 0  ! flat counter of local grid points; i runs fastest, then j, then k
    1944         7024 :          DO k = bo(1, 3), bo(2, 3)
    1945       162426 :             DO j = bo(1, 2), bo(2, 2)
    1946      2349791 :                DO i = bo(1, 1), bo(2, 1)
    1947      2187703 :                   ipt = ipt + 1
    1948      2343105 :                   IF (nspins == 1) THEN  ! single spin: average of both input columns
    1949              :                      vxc_rho(1)%array(i, j, k) = 0.5_dp*dvol_inv* &
    1950      1499203 :                                                  (density_grad(ipt, 1) + density_grad(ipt, 2))
    1951              :                      vxc_tau(1)%array(i, j, k) = 0.5_dp*dvol_inv* &
    1952      1499203 :                                                  (kin_grad(ipt, 1) + kin_grad(ipt, 2))
    1953              :                      grad_pw(1)%array(i, j, k) = 0.5_dp*dvol_inv* &
    1954      1499203 :                                                  (grad_grad(ipt, 1, 1) + grad_grad(ipt, 1, 2))
    1955              :                      grad_pw(2)%array(i, j, k) = 0.5_dp*dvol_inv* &
    1956      1499203 :                                                  (grad_grad(ipt, 2, 1) + grad_grad(ipt, 2, 2))
    1957              :                      grad_pw(3)%array(i, j, k) = 0.5_dp*dvol_inv* &
    1958      1499203 :                                                  (grad_grad(ipt, 3, 1) + grad_grad(ipt, 3, 2))
    1959              :                   ELSE  ! two spins: column ispin is used directly
    1960       688500 :                      vxc_rho(ispin)%array(i, j, k) = dvol_inv*density_grad(ipt, ispin)
    1961       688500 :                      vxc_tau(ispin)%array(i, j, k) = dvol_inv*kin_grad(ipt, ispin)
    1962       688500 :                      grad_pw(1)%array(i, j, k) = dvol_inv*grad_grad(ipt, 1, ispin)
    1963       688500 :                      grad_pw(2)%array(i, j, k) = dvol_inv*grad_grad(ipt, 2, ispin)
    1964       688500 :                      grad_pw(3)%array(i, j, k) = dvol_inv*grad_grad(ipt, 3, ispin)
    1965              :                   END IF
    1966              :                END DO
    1967              :             END DO
    1968              :          END DO
    1969              : 
    1970         1352 :          DO i = 1, 3
    1971         1352 :             CALL pw_scale(grad_pw(i), -1.0_dp)  ! sign flip before the divergence call
    1972              :          END DO
    1973          338 :          CALL xc_pw_divergence(xc_deriv_method_id, grad_pw, tmp_g, vxc_g, vxc_rho(ispin))
    1974              : 
    1975         1642 :          DO i = 1, 3
    1976         1352 :             CALL pw_pool%give_back_pw(grad_pw(i))
    1977              :          END DO
    1978              :       END DO
    1979              : 
    1980          290 :       IF (ASSOCIATED(vxc_g%pw_grid)) CALL pw_pool%give_back_pw(vxc_g)  ! only if created above
    1981          290 :       IF (ASSOCIATED(tmp_g%pw_grid)) CALL pw_pool%give_back_pw(tmp_g)  ! only if created above
    1982              : 
    1983          290 :    END SUBROUTINE build_vxc_from_feature_grads
    1984              : 
    1985              : ! **************************************************************************************************
    1986              : !> \brief Print optional diagnostics for the CP2K-native SKALA GPW feature block.
    1987              : !> \param features feature block whose counters and size arrays are printed
    1988              : !> \param print_active nothing is printed unless this is true
    1989              : ! **************************************************************************************************
    1990           24 :    SUBROUTINE print_native_grid_diagnostics(features, print_active)
    1991              :       TYPE(skala_gpw_feature_type), INTENT(IN)           :: features
    1992              :       LOGICAL, INTENT(IN)                                :: print_active
    1993              : 
    1994              :       INTEGER                                            :: atom_rows_max, atom_rows_min, &
    1995              :                                                             chunk_rows_max, chunk_rows_min, iw
    1996              :       REAL(KIND=dp)                                      :: chunk_imbalance
    1997              : 
    1998           24 :       IF (.NOT. print_active) RETURN
    1999              : 
    2000           12 :       iw = cp_logger_get_default_io_unit()
    2001           12 :       IF (iw <= 0) RETURN  ! non-positive unit: skip the output
    2002              :       WRITE (UNIT=iw, FMT="(/,T2,A,1X,ES19.11)") &
    2003           12 :          "SKALA_GPW| Native grid feature electrons", features%electron_count
    2004              :       WRITE (UNIT=iw, FMT="(T2,A,1X,ES19.11)") &
    2005           12 :          "SKALA_GPW| Native grid feature spin moment", features%spin_moment
    2006              :       WRITE (UNIT=iw, FMT="(T2,A,1X,ES19.11)") &
    2007           12 :          "SKALA_GPW| Native grid feature weight sum", features%grid_weight_sum
    2008           12 :       IF (ALLOCATED(features%atomic_grid_sizes)) THEN
    2009           49 :          atom_rows_min = INT(MINVAL(features%atomic_grid_sizes))
    2010           49 :          atom_rows_max = INT(MAXVAL(features%atomic_grid_sizes))
    2011              :          WRITE (UNIT=iw, FMT="(T2,A,1X,I0,1X,A,1X,I0,1X,A,1X,I0)") &
    2012           12 :             "SKALA_GPW| Native grid atom row range", atom_rows_min, "to", &
    2013           61 :             atom_rows_max, "sum", INT(SUM(features%atomic_grid_sizes))
    2014              :       END IF
    2015           12 :       IF (features%uses_atom_chunks) THEN
    2016              :          WRITE (UNIT=iw, FMT="(T2,A,1X,I0,1X,A,1X,I0)") &
    2017            1 :             "SKALA_GPW| Native grid atom chunk rows", features%chunk_feature_count, &
    2018            2 :             "of", features%nflat
    2019            1 :          IF (ALLOCATED(features%chunk_grad_counts)) THEN
    2020            3 :             chunk_rows_min = MINVAL(features%chunk_grad_counts)/ngrad_per_point  ! counts -> rows
    2021            3 :             chunk_rows_max = MAXVAL(features%chunk_grad_counts)/ngrad_per_point  ! counts -> rows
    2022            1 :             chunk_imbalance = REAL(chunk_rows_max, KIND=dp)/REAL(MAX(1, chunk_rows_min), KIND=dp)
    2023              :             WRITE (UNIT=iw, FMT="(T2,A,1X,I0,1X,A,1X,I0,1X,A,1X,ES12.5)") &
    2024            1 :                "SKALA_GPW| Native grid atom chunk row range", chunk_rows_min, &
    2025            2 :                "to", chunk_rows_max, "imbalance", chunk_imbalance
    2026              :          END IF
    2027              :       END IF
    2028              : 
    2029              :    END SUBROUTINE print_native_grid_diagnostics
    2030              : 
    2031              : ! **************************************************************************************************
    2032              : !> \brief Configure CUDA device selection for the native SKALA GPW Torch path.
    2033              : !> \param use_cuda if false, -1 is returned before any other work is done
    2034              : !> \param requested_device device index to use; a negative value selects MOD(rank, device count)
    2035              : !> \param group communicator supplying the rank (mepos), the rank count (num_pe) and the allgather
    2036              : !> \return selected CUDA device, or -1 for CPU fallback/no visible CUDA device
    2037              : ! **************************************************************************************************
    2038          322 :    FUNCTION configure_native_grid_cuda(use_cuda, requested_device, group) RESULT(selected_device)
    2039              :       LOGICAL, INTENT(IN)                                :: use_cuda
    2040              :       INTEGER, INTENT(IN)                                :: requested_device
    2041              : 
    2042              :       CLASS(mp_comm_type), INTENT(IN)                    :: group
    2043              : 
    2044              :       INTEGER                                            :: cuda_device_count, iw, pe, selected_device
    2045          322 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: selected_devices
    2046              : 
    2047          322 :       selected_device = -1
    2048              : 
    2049          322 :       IF (.NOT. use_cuda) RETURN
    2050              : 
    2051            0 :       IF (.NOT. torch_cuda_is_available()) THEN
    2052            0 :          cuda_device_count = 0
    2053              :       ELSE
    2054            0 :          cuda_device_count = torch_cuda_device_count()
    2055              :       END IF
    2056            0 :       IF (cuda_device_count > 0) THEN  ! with no device, selected_device stays -1
    2057            0 :          IF (requested_device < 0) THEN
    2058            0 :             selected_device = MOD(group%mepos, cuda_device_count)
    2059              :          ELSE
    2060            0 :             selected_device = requested_device
    2061              :          END IF
    2062              :       END IF
    2063            0 :       IF (selected_device >= cuda_device_count) THEN
    2064              :          CALL cp_abort(__LOCATION__, &
    2065              :                        "GAUXC%NATIVE_GRID_CUDA_DEVICE selects a CUDA device outside the visible "// &
    2066            0 :                        "Torch CUDA device range.")
    2067              :       END IF
    2068            0 :       IF (selected_device >= 0) CALL offload_set_chosen_device(selected_device)
    2069              : 
    2070            0 :       ALLOCATE (selected_devices(group%num_pe))
    2071            0 :       CALL group%allgather(selected_device, selected_devices)  ! one entry per rank, used for the log below
    2072              : 
    2073            0 :       IF (group%mepos /= 0) RETURN  ! only rank 0 writes the log
    2074              :       IF (selected_device == logged_cuda_device .AND. &
    2075              :           cuda_device_count == logged_cuda_device_count .AND. &
    2076            0 :           group%num_pe == logged_cuda_nproc .AND. &
    2077              :           requested_device == logged_cuda_request) RETURN  ! same values as last logged
    2078              : 
    2079            0 :       iw = cp_logger_get_default_io_unit()
    2080            0 :       IF (iw <= 0) RETURN  ! returns before the logged_cuda_* values are updated
    2081            0 :       IF (selected_device >= 0) THEN
    2082              :          WRITE (UNIT=iw, FMT="(/,T2,A,1X,I0,1X,A,1X,I0,1X,A,1X,I0)") &
    2083            0 :             "SKALA_GPW| Native grid Torch CUDA device", selected_device, &
    2084            0 :             "of", cuda_device_count, "requested", requested_device
    2085              :       ELSE
    2086              :          WRITE (UNIT=iw, FMT="(/,T2,A)") &
    2087            0 :             "SKALA_GPW| Native grid Torch CUDA requested, but no Torch CUDA device is visible"
    2088              :       END IF
    2089              :       WRITE (UNIT=iw, FMT="(T2,A)", ADVANCE="NO") &
    2090            0 :          "SKALA_GPW| Native grid Torch CUDA rank devices"
    2091            0 :       DO pe = 1, group%num_pe
    2092            0 :          WRITE (UNIT=iw, FMT="(1X,I0,A,I0)", ADVANCE="NO") pe - 1, ":", selected_devices(pe)
    2093              :       END DO
    2094            0 :       WRITE (UNIT=iw, FMT=*)  ! ends the line built with ADVANCE="NO"
    2095              : 
    2096            0 :       logged_cuda_device = selected_device  ! remember what was logged to skip repeats
    2097            0 :       logged_cuda_device_count = cuda_device_count
    2098            0 :       logged_cuda_nproc = group%num_pe
    2099            0 :       logged_cuda_request = requested_device
    2100              : 
    2101          322 :    END FUNCTION configure_native_grid_cuda
    2102              : 
    2103              : ! **************************************************************************************************
    2104              : !> \brief Load and cache the TorchScript SKALA model; a matching cached model is reused.
    2105              : !> \param model_path path passed (TRIMmed) to skala_torch_model_load; part of the cache key
    2106              : !> \param cuda_device device id stored and compared as cache key; not passed to the loader here
    2107              : ! **************************************************************************************************
    2108          322 :    SUBROUTINE ensure_model_loaded(model_path, cuda_device)
    2109              :       CHARACTER(len=*), INTENT(IN)                       :: model_path
    2110              :       INTEGER, INTENT(IN)                                :: cuda_device
    2111              : 
    2112          322 :       IF (cached_model_loaded) THEN
    2113          234 :          IF (TRIM(cached_model_path) == TRIM(model_path) .AND. &
    2114              :              cached_model_cuda_device == cuda_device) RETURN ! cache hit: same path and device
    2115            0 :          CALL skala_torch_model_release(cached_model) ! key changed: release before reloading
    2116            0 :          cached_model_loaded = .FALSE.
    2117              :       END IF
    2118              : 
    2119           88 :       CALL skala_torch_model_load(cached_model, TRIM(model_path))
    2120           88 :       cached_model_path = model_path ! remember the key the cached model was loaded for
    2121           88 :       cached_model_cuda_device = cuda_device
    2122           88 :       cached_model_loaded = .TRUE.
    2123              : 
    2124          322 :    END SUBROUTINE ensure_model_loaded
    2125              : 
    2126              : ! **************************************************************************************************
    2127              : !> \brief Resolve the SKALA TorchScript model path from the GAUXC subsection.
    2128              : !> \param xc_section XC input section; aborts if get_gauxc_section finds no GAUXC subsection in it
    2129              : !> \param model_path GAUXC%MODEL value, or for MODEL SKALA the path read from the environment
    2130              : ! **************************************************************************************************
    2131          322 :    SUBROUTINE get_skala_model_path(xc_section, model_path)
    2132              :       TYPE(section_vals_type), INTENT(IN), POINTER       :: xc_section
    2133              :       CHARACTER(len=default_path_length), INTENT(OUT)    :: model_path
    2134              : 
    2135              :       CHARACTER(len=default_path_length)                 :: model_key
    2136              :       INTEGER                                            :: env_status
    2137              :       LOGICAL                                            :: native_grid_use_cuda
    2138              :       TYPE(section_vals_type), POINTER                   :: gauxc_section
    2139              : 
    2140          322 :       gauxc_section => get_gauxc_section(xc_section)
    2141          322 :       IF (.NOT. ASSOCIATED(gauxc_section)) THEN
    2142            0 :          CPABORT("Native SKALA GPW requires an XC_FUNCTIONAL%GAUXC section")
    2143              :       END IF
    2144              : 
    2145          322 :       CALL section_vals_val_get(gauxc_section, "MODEL", c_val=model_path)
    2146          322 :       model_key = ADJUSTL(model_path) ! left-justified copy, upper-cased below, for keyword matching
    2147          322 :       CALL uppercase(model_key)
    2148          322 :       IF (TRIM(model_key) == "NONE" .OR. TRIM(model_key) == "") THEN
    2149            0 :          CPABORT("Native SKALA GPW requires GAUXC%MODEL SKALA or a TorchScript model path")
    2150          322 :       ELSE IF (TRIM(model_key) == "SKALA") THEN
    2151          322 :          CALL section_vals_val_get(gauxc_section, "NATIVE_GRID_USE_CUDA", l_val=native_grid_use_cuda)
    2152          322 :          IF (native_grid_use_cuda) THEN
    2153            0 :             CALL GET_ENVIRONMENT_VARIABLE("GAUXC_SKALA_CUDA_MODEL", model_path, STATUS=env_status)
    2154            0 :             IF (env_status == 0 .AND. LEN_TRIM(model_path) > 0) RETURN ! CUDA variable takes precedence
    2155              :          END IF
    2156          322 :          CALL GET_ENVIRONMENT_VARIABLE("GAUXC_SKALA_MODEL", model_path, STATUS=env_status)
    2157          322 :          IF (env_status /= 0 .OR. LEN_TRIM(model_path) == 0) THEN ! any non-zero STATUS counts as unset
    2158            0 :             IF (native_grid_use_cuda) THEN
    2159              :                CALL cp_abort(__LOCATION__, &
    2160            0 :                              "MODEL SKALA CUDA path requires GAUXC_SKALA_CUDA_MODEL or GAUXC_SKALA_MODEL")
    2161              :             ELSE
    2162              :                CALL cp_abort(__LOCATION__, &
    2163            0 :                              "MODEL SKALA requires the GAUXC_SKALA_MODEL environment variable")
    2164              :             END IF
    2165              :          END IF
    2166              :       END IF ! any other MODEL value is returned unchanged in model_path
    2167              : 
    2168              :    END SUBROUTINE get_skala_model_path
    2169              : 
    2170              : ! **************************************************************************************************
    2171              : !> \brief Return the first GAUXC functional subsection, if present.
    2172              : !> \param xc_section XC input section searched under XC_FUNCTIONAL; may be unassociated
    2173              : !> \return first XC_FUNCTIONAL subsection whose section name is GAUXC, or NULL if none is found
    2174              : ! **************************************************************************************************
    2175       189079 :    FUNCTION get_gauxc_section(xc_section) RESULT(gauxc_section)
    2176              :       TYPE(section_vals_type), INTENT(IN), POINTER       :: xc_section
    2177              :       TYPE(section_vals_type), POINTER                   :: gauxc_section
    2178              : 
    2179              :       INTEGER                                            :: ifun
    2180              :       TYPE(section_vals_type), POINTER                   :: functionals, xc_fun
    2181              : 
    2182       189079 :       NULLIFY (gauxc_section) ! result stays NULL on every early RETURN/EXIT below
    2183       189079 :       IF (.NOT. ASSOCIATED(xc_section)) RETURN
    2184              : 
    2185       189079 :       functionals => section_vals_get_subs_vals(xc_section, "XC_FUNCTIONAL")
    2186       189079 :       IF (.NOT. ASSOCIATED(functionals)) RETURN
    2187              : 
    2188       189079 :       ifun = 0
    2189              :       DO
    2190       379208 :          ifun = ifun + 1
    2191       379208 :          xc_fun => section_vals_get_subs_vals2(functionals, i_section=ifun)
    2192       379208 :          IF (.NOT. ASSOCIATED(xc_fun)) EXIT ! presumably index past last subsection - TODO confirm
    2193       379208 :          IF (xc_fun%section%name == "GAUXC") THEN
    2194              :             gauxc_section => xc_fun
    2195              :             EXIT
    2196              :          END IF
    2197              :       END DO
    2198              : 
    2199              :    END FUNCTION get_gauxc_section
    2200              : 
    2201              : END MODULE skala_gpw_functional

Generated by: LCOV version 2.0-1