StartDate: 2024-03-06 08:06:09+00:00
CpuId: 12x Intel Xeon W 2000 / D-2100 (Skylake / Cascade Lake) {Skylake}, 14nm
GpuId: 1x Tesla V100-SXM2-16GB

CommitSHA: 0833382a821d2a82ff86511df71b97b2f014239a
CommitTime: 2024-03-05 11:23:13 +0100
CommitAuthor: marcella
CommitSubject: projection on reference in RTP (#3298)


#################### Building Image cp2k-perf-cuda-volta ####################
Dockerfile: /tools/docker/Dockerfile.test_performance_cuda_V100
Build-Path: /
Build-Args: GIT_COMMIT_SHA=0833382a821d2a82ff86511df71b97b2f014239a
Build-Cache: Yes

Populating docker build cache... done.
DEPRECATED: The legacy builder is deprecated and will be removed in a future release.
            BuildKit is currently disabled; enable it by removing the DOCKER_BUILDKIT=0
            environment-variable.

Sending build context to Docker daemon  393.7MB

Step 1/46 : FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
11.8.0-devel-ubuntu22.04: Pulling from nvidia/cuda
aece8493d397: Already exists
5e3b7ee77381: Already exists
5bd037f007fd: Already exists
4cda774ad2ec: Already exists
775f22adee62: Already exists
263fc748118f: Already exists
16c36d0187d0: Already exists
e7a56570655c: Already exists
507fc9045cba: Already exists
23b7d8e07c16: Already exists
922ac8fcb889: Already exists
Digest: sha256:94fd755736cb58979173d491504f0b573247b1745250249415b07fefc738e41f
Status: Downloaded newer image for nvidia/cuda:11.8.0-devel-ubuntu22.04
 ---> 6f9cc9f1ba9e
Step 2/46 : ENV CUDA_PATH /usr/local/cuda
 ---> Using cache
 ---> ea6c9bc4eda6
Step 3/46 : ENV LD_LIBRARY_PATH /usr/local/cuda/lib64
 ---> Using cache
 ---> 223d787cdd89
Step 4/46 : ENV CUDA_CACHE_DISABLE 1
 ---> Using cache
 ---> 1774168f85a8
Step 5/46 : RUN apt-get update -qq && apt-get install -qq --no-install-recommends     gfortran                                                              mpich                                                                 libmpich-dev                                                         && rm -rf /var/lib/apt/lists/*
 ---> Using cache
 ---> cd35350707cc
Step 6/46 : WORKDIR /opt/cp2k-toolchain
 ---> Using cache
 ---> ca16f8d162c0
Step 7/46 : COPY ./tools/toolchain/install_requirements*.sh ./
 ---> Using cache
 ---> a24633b5dbbf
Step 8/46 : RUN ./install_requirements.sh ubuntu
 ---> Using cache
 ---> 07030e2616fd
Step 9/46 : RUN mkdir scripts
 ---> Using cache
 ---> 0550a65eda27
Step 10/46 : COPY ./tools/toolchain/scripts/VERSION      ./tools/toolchain/scripts/parse_if.py      ./tools/toolchain/scripts/tool_kit.sh      ./tools/toolchain/scripts/common_vars.sh      ./tools/toolchain/scripts/signal_trap.sh      ./tools/toolchain/scripts/get_openblas_arch.sh      ./scripts/
 ---> Using cache
 ---> 5c8537531ebc
Step 11/46 : COPY ./tools/toolchain/install_cp2k_toolchain.sh .
 ---> Using cache
 ---> 899829448a26
Step 12/46 : RUN ./install_cp2k_toolchain.sh     --mpi-mode=mpich     --enable-cuda=yes     --gpu-ver=V100     --dry-run
 ---> Using cache
 ---> 7d25ffc2dd66
Step 13/46 : COPY ./tools/toolchain/scripts/stage0/ ./scripts/stage0/
 ---> Using cache
 ---> 48de2d0e812f
Step 14/46 : RUN  ./scripts/stage0/install_stage0.sh && rm -rf ./build
 ---> Using cache
 ---> 2de33e608cf9
Step 15/46 : COPY ./tools/toolchain/scripts/stage1/ ./scripts/stage1/
 ---> Using cache
 ---> fc54d5881560
Step 16/46 : RUN  ./scripts/stage1/install_stage1.sh && rm -rf ./build
 ---> Using cache
 ---> 1acd0d5dd321
Step 17/46 : COPY ./tools/toolchain/scripts/stage2/ ./scripts/stage2/
 ---> Using cache
 ---> b561717c4bd2
Step 18/46 : RUN  ./scripts/stage2/install_stage2.sh && rm -rf ./build
 ---> Using cache
 ---> f0a3db32f288
Step 19/46 : COPY ./tools/toolchain/scripts/stage3/ ./scripts/stage3/
 ---> Using cache
 ---> 0a02b97385e5
Step 20/46 : RUN  ./scripts/stage3/install_stage3.sh && rm -rf ./build
 ---> Using cache
 ---> ed8a510bafda
Step 21/46 : COPY ./tools/toolchain/scripts/stage4/ ./scripts/stage4/
 ---> Using cache
 ---> aacbf7c039f4
Step 22/46 : RUN  ./scripts/stage4/install_stage4.sh && rm -rf ./build
 ---> Using cache
 ---> 093df72cee90
Step 23/46 : COPY ./tools/toolchain/scripts/stage5/ ./scripts/stage5/
 ---> Using cache
 ---> f941008502f3
Step 24/46 : RUN  ./scripts/stage5/install_stage5.sh && rm -rf ./build
 ---> Using cache
 ---> 7ac1d6edcafe
Step 25/46 : COPY ./tools/toolchain/scripts/stage6/ ./scripts/stage6/
 ---> Using cache
 ---> 9f00baa45625
Step 26/46 : RUN  ./scripts/stage6/install_stage6.sh && rm -rf ./build
 ---> Using cache
 ---> a04df3bc4ca9
Step 27/46 : COPY ./tools/toolchain/scripts/stage7/ ./scripts/stage7/
 ---> Using cache
 ---> ab14c7b62979
Step 28/46 : RUN  ./scripts/stage7/install_stage7.sh && rm -rf ./build
 ---> Using cache
 ---> 175ff3f3ff45
Step 29/46 : COPY ./tools/toolchain/scripts/stage8/ ./scripts/stage8/
 ---> Using cache
 ---> 42d8f5e5ed6b
Step 30/46 : RUN  ./scripts/stage8/install_stage8.sh && rm -rf ./build
 ---> Using cache
 ---> 795f298dbbf6
Step 31/46 : COPY ./tools/toolchain/scripts/arch_base.tmpl      ./tools/toolchain/scripts/generate_arch_files.sh      ./scripts/
 ---> Using cache
 ---> 0daf7f83387a
Step 32/46 : RUN ./scripts/generate_arch_files.sh && rm -rf ./build
 ---> Using cache
 ---> c3e44bbde9a1
Step 33/46 : WORKDIR /opt/cp2k
 ---> Using cache
 ---> b7a099eb4fe0
Step 34/46 : COPY ./Makefile .
 ---> Using cache
 ---> 1b70cd75d243
Step 35/46 : COPY ./src ./src
 ---> 93defcf1adda
Step 36/46 : COPY ./exts ./exts
 ---> 5547d8e84366
Step 37/46 : COPY ./tools/build_utils ./tools/build_utils
 ---> 3797626fdebe
Step 38/46 : RUN /bin/bash -c "     mkdir -p arch &&     ln -vs /opt/cp2k-toolchain/install/arch/local_cuda.psmp ./arch/"
 ---> Running in 1c5fc0856171
'./arch/local_cuda.psmp' -> '/opt/cp2k-toolchain/install/arch/local_cuda.psmp'
Removing intermediate container 1c5fc0856171
 ---> 0391fe540412
Step 39/46 : COPY ./data ./data
 ---> 482b93d5bf53
Step 40/46 : COPY ./tests ./tests
 ---> c697bc67080c
Step 41/46 : COPY ./tools/regtesting ./tools/regtesting
 ---> 4619198eef4f
Step 42/46 : COPY ./benchmarks ./benchmarks
 ---> cff455ec0b77
Step 43/46 : COPY ./tools/docker/scripts/test_performance.sh       ./tools/docker/scripts/plot_performance.py       ./
 ---> d82aa156bb42
Step 44/46 : RUN ./test_performance.sh "local_cuda" 2>&1 | tee report.log
 ---> Running in 4e2b3835c5de

========== Compiling CP2K ==========
Compiling cp2k... done.

Checking benchmark inputs... Found 75 input files and 0 errors.

========== Running Performance Test ==========
Running H2O-64.inp with 3 threads and 2 ranks... done.


From /workspace/artifacts/H2O-64_6cpu_1gpu.out:
-------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.030    0.031  118.959  118.960
 qs_mol_dyn_low                       1  2.0    0.004    0.004  118.509  118.513
 qs_forces                           11  3.9    0.002    0.002  118.447  118.447
 qs_energies                         11  4.9    0.001    0.001   99.129   99.130
 velocity_verlet                     10  3.0    0.001    0.002   76.360   76.383
 scf_env_do_scf                      11  5.9    0.001    0.001   75.461   75.462
 scf_env_do_scf_inner_loop          108  6.5    0.006    0.008   64.518   64.518
 rebuild_ks_matrix                  119  8.3    0.001    0.001   35.049   35.054
 qs_ks_build_kohn_sham_matrix       119  9.3    0.018    0.018   35.048   35.053
 qs_ks_update_qs_env                119  7.6    0.001    0.001   31.868   31.874
 dbcsr_multiply_generic            2286 12.5    0.131    0.132   26.914   26.954
 qs_scf_new_mos                     108  7.5    0.001    0.001   20.869   20.870
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   20.868   20.869
 ot_scf_mini                        108  9.5    0.003    0.003   18.892   18.898
 multiply_cannon                   2286 13.5    0.365    0.368   16.700   16.724
 qs_rho_update_rho_low              119  7.7    0.001    0.001   16.684   16.699
 calculate_rho_elec                 119  8.7    1.115    1.123   16.683   16.698
 fft_wrap_pw1pw2                   1201 11.6    0.024    0.025   16.075   16.129
 multiply_cannon_loop              2286 14.5    0.230    0.230   15.383   15.417
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.001   14.792   15.092
 qs_energies_init_hamiltonians       11  5.9    0.000    0.000   15.066   15.066
 sum_up_and_integrate               119 10.3    0.002    0.002   14.953   15.024
 integrate_v_rspace                 119 11.3    0.392    0.396   14.835   14.906
 fft_wrap_pw1pw2_140                487 12.2    0.003    0.003   13.972   14.050
 ot_mini                            108 10.5    0.001    0.001   11.319   11.323
 init_scf_loop                       11  6.9    0.000    0.000   10.856   10.856
 density_rs2pw                      119  9.7    0.008    0.008   10.345   10.462
 multiply_cannon_multrec           4572 15.5    2.582    2.597    9.184    9.194
 grid_integrate_task_list           119 12.3    8.367    8.440    8.367    8.440
 make_m2s                          4572 13.5    0.048    0.049    8.402    8.405
 pw_gpu_r3dc1d_3d_ps                606 13.1    2.440    2.454    8.214    8.227
 make_images                       4572 14.5    1.607    1.612    8.197    8.201
 init_scf_run                        11  5.9    0.001    0.001    7.958    7.958
 scf_env_initial_rho_setup           11  6.9    0.000    0.001    7.958    7.958
 pw_gpu_c1dr3d_3d_ps                595 14.2    2.386    2.403    7.830    7.872
 build_core_ppl_forces               11  5.9    7.597    7.835    7.597    7.835
 qs_env_update_s_mstruct             11  6.9    0.000    0.000    7.570    7.573
 qs_ot_get_derivative               108 11.5    0.001    0.001    7.474    7.477
 build_core_hamiltonian_matrix       11  6.9    0.001    0.001    6.996    7.154
 pw_poisson_solve                   119 10.3    0.003    0.003    7.133    7.143
 prepare_preconditioner              11  7.9    0.000    0.000    6.991    6.992
 make_preconditioner                 11  8.9    0.000    0.000    6.991    6.992
 qs_create_task_list                 11  7.9    0.000    0.000    6.650    6.800
 generate_qs_task_list               11  8.9    2.222    2.235    6.650    6.800
 dbcsr_mm_accdrv_process           9594 16.2    1.113    1.552    6.196    6.217
 make_full_inverse_cholesky          11  9.9    0.000    0.000    5.949    6.215
 potential_pw2rs                    119 12.3    0.046    0.046    6.076    6.076
 grid_collocate_task_list           119  9.7    5.183    5.264    5.183    5.264
 qs_vxc_create                      119 10.3    0.002    0.002    5.123    5.148
 xc_vxc_pw_create                   119 11.3    0.891    0.897    5.121    5.146
 jit_kernel_multiply                 12 15.8    4.366    4.786    4.366    4.786
 pw_poisson_set                     120 11.2    0.005    0.005    4.626    4.636
 calculate_dm_sparse                119  9.5    0.001    0.001    4.293    4.298
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    4.218    4.218
 pw_derive                          357 12.3    4.204    4.215    4.204    4.215
 calculate_first_density_matrix       1  7.0    0.000    0.000    4.213    4.213
 xc_rho_set_and_dset_create         119 12.3    0.007    0.007    4.097    4.125
 build_core_ppl                      11  7.9    3.971    4.101    3.971    4.101
 grid_create_task_list               11  9.9    3.947    4.081    3.947    4.081
 xc_functional_eval                 119 13.3    4.018    4.048    4.018    4.048
 build_kinetic_matrix_low            22  6.9    3.737    3.772    3.826    3.863
 ot_diis_step                       108 11.5    0.005    0.006    3.825    3.825
 build_overlap_matrix_low            22  6.9    3.504    3.524    3.590    3.609
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    3.604    3.604
 multiply_cannon_sync_h2d          4572 15.5    3.500    3.544    3.500    3.544
 wfi_extrapolate                     11  7.9    0.001    0.001    3.512    3.513
 qs_ot_get_p                        119 10.4    0.001    0.001    3.345    3.361
 mp_alltoall_z22v                  1201 15.6    3.256    3.303    3.256    3.303
 dbcsr_complete_redistribute        329 12.2    1.305    1.321    3.045    3.291
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    3.227    3.231
 apply_single                       119 13.6    0.001    0.001    3.227    3.230
 hybrid_alltoall_any               4725 16.4    1.809    1.811    3.131    3.136
 make_images_data                  4572 15.5    0.066    0.066    3.062    3.069
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    3.055    3.055
 mp_waitall_1                     64495 16.9    2.729    2.758    2.729    2.758
 transfer_rs2pw                     487 10.6    0.008    0.008    2.505    2.757
 qs_ot_get_derivative_taylor         59 13.0    0.002    0.002    2.742    2.744
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.001    2.651    2.652
 -------------------------------------------------------------------------------

Plot: name="H2O-64_timings_6cpu_1gpu", title="Timings of H2O-64 with 6 CPU Cores and 1 GPU", ylabel="time [s]"
PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="rest", label="rest", y=89.242, yerr=0.0
PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="grid_integrate_task_list", label="grid_integrate_task_list", y=8.367, yerr=0.0
PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="build_core_ppl_forces", label="build_core_ppl_forces", y=7.597, yerr=0.0
PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="grid_collocate_task_list", label="grid_collocate_task_list", y=5.183, yerr=0.0
PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="jit_kernel_multiply", label="jit_kernel_multiply", y=4.366, yerr=0.0
PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="pw_derive", label="pw_derive", y=4.204, yerr=0.0


Running H2O-64_nonortho.inp with 3 threads and 2 ranks... done.


From /workspace/artifacts/H2O-64_nonortho_6cpu_1gpu.out:
-------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.030    0.033  115.988  115.988
 qs_mol_dyn_low                       1  2.0    0.004    0.004  115.465  115.468
 qs_forces                           11  3.9    0.002    0.002  115.404  115.404
 qs_energies                         11  4.9    0.001    0.001   95.730   95.731
 velocity_verlet                     10  3.0    0.002    0.002   75.584   75.607
 scf_env_do_scf                      11  5.9    0.001    0.001   71.635   71.635
 scf_env_do_scf_inner_loop           96  6.5    0.005    0.008   60.415   60.415
 rebuild_ks_matrix                  107  8.3    0.001    0.001   33.682   33.684
 qs_ks_build_kohn_sham_matrix       107  9.3    0.016    0.016   33.681   33.684
 qs_ks_update_qs_env                107  7.6    0.001    0.001   30.100   30.102
 dbcsr_multiply_generic            1966 12.4    0.115    0.116   24.643   24.733
 qs_scf_new_mos                      96  7.5    0.001    0.001   18.624   18.632
 qs_scf_loop_do_ot                   96  8.5    0.001    0.001   18.624   18.632
 qs_rho_update_rho_low              107  7.7    0.001    0.001   17.000   17.012
 calculate_rho_elec                 107  8.7    0.997    1.004   16.999   17.011
 ot_scf_mini                         96  9.5    0.003    0.003   16.851   16.851
 sum_up_and_integrate               107 10.3    0.002    0.002   15.500   15.600
 integrate_v_rspace                 107 11.3    0.362    0.363   15.393   15.493
 multiply_cannon                   1966 13.4    0.311    0.313   15.424   15.442
 qs_energies_init_hamiltonians       11  5.9    0.000    0.000   15.429   15.429
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.001   14.798   15.150
 fft_wrap_pw1pw2                   1081 11.6    0.022    0.023   14.415   14.464
 multiply_cannon_loop              1966 14.4    0.202    0.204   14.311   14.317
 fft_wrap_pw1pw2_140                439 12.2    0.003    0.003   12.524   12.593
 init_scf_loop                       11  6.9    0.000    0.000   11.134   11.134
 ot_mini                             96 10.5    0.001    0.001   10.123   10.126
 grid_integrate_task_list           107 12.3    9.583    9.684    9.583    9.684
 density_rs2pw                      107  9.7    0.007    0.007    9.285    9.417
 multiply_cannon_multrec           3932 15.4    2.281    2.282    8.527    8.531
 qs_env_update_s_mstruct             11  6.9    0.000    0.000    7.911    8.157
 init_scf_run                        11  5.9    0.001    0.001    8.030    8.031
 scf_env_initial_rho_setup           11  6.9    0.000    0.001    8.030    8.030
 build_core_ppl_forces               11  5.9    7.594    7.886    7.594    7.886
 make_m2s                          3932 13.4    0.042    0.042    7.563    7.569
 make_images                       3932 14.4    1.448    1.459    7.382    7.387
 pw_gpu_r3dc1d_3d_ps                546 13.1    2.191    2.219    7.370    7.382
 build_core_hamiltonian_matrix       11  6.9    0.001    0.001    7.035    7.229
 prepare_preconditioner              11  7.9    0.000    0.000    7.087    7.089
 make_preconditioner                 11  8.9    0.000    0.000    7.087    7.089
 pw_gpu_c1dr3d_3d_ps                535 14.2    2.150    2.161    7.017    7.055
 qs_create_task_list                 11  7.9    0.000    0.000    6.961    7.008
 generate_qs_task_list               11  8.9    2.703    2.733    6.961    7.008
 grid_collocate_task_list           107  9.7    6.684    6.789    6.684    6.789
 qs_ot_get_derivative                96 11.5    0.001    0.001    6.703    6.703
 pw_poisson_solve                   107 10.3    0.003    0.003    6.466    6.477
 make_full_inverse_cholesky          11  9.9    0.000    0.000    6.078    6.346
 dbcsr_mm_accdrv_process           8450 16.1    0.605    0.823    5.886    5.891
 potential_pw2rs                    107 12.3    0.041    0.041    5.447    5.447
 jit_kernel_multiply                 13 15.9    4.638    4.854    4.638    4.854
 qs_vxc_create                      107 10.3    0.002    0.002    4.589    4.610
 xc_vxc_pw_create                   107 11.3    0.800    0.807    4.587    4.608
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    4.517    4.518
 pw_poisson_set                     108 11.2    0.004    0.004    4.204    4.214
 build_core_ppl                      11  7.9    4.013    4.169    4.013    4.169
 calculate_first_density_matrix       1  7.0    0.000    0.000    4.083    4.083
 calculate_dm_sparse                107  9.5    0.001    0.001    4.060    4.066
 grid_create_task_list               11  9.9    3.782    3.853    3.782    3.853
 build_kinetic_matrix_low            22  6.9    3.738    3.750    3.830    3.845
 pw_derive                          321 12.3    3.819    3.829    3.819    3.829
 wfi_extrapolate                     11  7.9    0.001    0.001    3.696    3.696
 xc_rho_set_and_dset_create         107 12.3    0.006    0.006    3.670    3.694
 build_overlap_matrix_low            22  6.9    3.540    3.568    3.627    3.655
 xc_functional_eval                 107 13.3    3.598    3.623    3.598    3.623
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    3.493    3.496
 ot_diis_step                        96 11.5    0.005    0.005    3.403    3.403
 multiply_cannon_sync_h2d          3932 15.4    3.313    3.367    3.313    3.367
 dbcsr_complete_redistribute        317 12.2    1.324    1.352    3.065    3.334
 mp_alltoall_z22v                  1081 15.6    2.910    2.961    2.910    2.961
 apply_preconditioner_dbcsr         107 12.6    0.000    0.000    2.948    2.952
 apply_single                       107 13.6    0.000    0.000    2.948    2.951
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.944    2.946
 qs_ot_get_p                        107 10.4    0.001    0.001    2.933    2.938
 hybrid_alltoall_any               4079 16.3    1.639    1.646    2.842    2.846
 make_images_data                  3932 15.4    0.056    0.056    2.763    2.768
 transfer_rs2pw                     439 10.6    0.007    0.008    2.345    2.657
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.001    2.616    2.619
 mp_waitall_1                     55487 16.8    2.467    2.515    2.467    2.515
 qs_ot_get_derivative_taylor         53 13.0    0.002    0.002    2.400    2.404
 copy_dbcsr_to_fm                   147 11.2    0.003    0.003    2.306    2.320
 -------------------------------------------------------------------------------

Plot: name="H2O-64_nonortho_timings_6cpu_1gpu", title="Timings of H2O-64_nonortho with 6 CPU Cores and 1 GPU", ylabel="time [s]"
PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="rest", label="rest", y=83.476, yerr=0.0
PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="grid_integrate_task_list", label="grid_integrate_task_list", y=9.583, yerr=0.0
PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="build_core_ppl_forces", label="build_core_ppl_forces", y=7.594, yerr=0.0
PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="grid_collocate_task_list", label="grid_collocate_task_list", y=6.684, yerr=0.0
PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="jit_kernel_multiply", label="jit_kernel_multiply", y=4.638, yerr=0.0
PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="build_core_ppl", label="build_core_ppl", y=4.013, yerr=0.0


Running GW_PBE_4benzene.inp with 3 threads and 2 ranks... done.


From /workspace/artifacts/GW_PBE_4benzene_6cpu_1gpu.out:
-------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.020    0.023  128.134  128.135
 qs_energies                          1  2.0    0.000    0.000  127.786  127.790
 mp2_main                             1  3.0    0.000    0.000  119.185  119.190
 mp2_gpw_main                         1  4.0    0.000    0.000  115.766  115.770
 rpa_ri_compute_en                    1  5.0    0.000    0.000  107.034  107.038
 rpa_num_int                          1  6.0    0.000    0.001  107.026  107.030
 compute_mat_P_omega                  1  7.0    0.002    0.002   90.077   90.081
 compute_mat_P_omega_contract        10  8.0    6.035    6.099   89.654   89.666
 dbt_total                         2336  9.6    0.023    0.023   85.244   85.245
 dbt_contract                       787 11.0    0.051    0.052   59.036   59.175
 dbt_tas_total                     1149 12.2    0.741    0.943   45.942   45.942
 dbt_tas_multiply                   807 12.1    0.003    0.003   45.100   45.100
 compute_mat_P_omega_calc_M_occ     250  9.0    6.054    6.121   31.149   31.149
 dbt_tas_dbm                        807 14.1    0.004    0.005   30.620   30.620
 dbm_multiply                       807 16.1   30.085   30.159   30.085   30.159
 dbt_copy                          1107 10.7    0.062    0.062   24.688   24.824
 compute_mat_P_omega_calc_M_vir     250  9.0    0.001    0.001   20.157   20.157
 dbt_tas_mm_1N                      524 15.1    0.002    0.002   19.094   19.227
 dbt_reshape                        594 11.8   10.071   10.072   17.281   17.465
 dbt_tas_reserve_blocks_index      3266 14.3    5.284    5.325   14.305   14.865
 dbt_crop                          1042 12.0    8.044    8.148   12.470   12.858
 compute_mat_P_omega_calc_P_t       250  9.0    0.001    0.001   12.027   12.028
 dbt_reserve_blocks_index          2347 13.0    0.177    0.178   11.173   11.773
 dbt_reserve_blocks_index_array    2289 12.1    0.011    0.011   11.023   11.624
 compute_QP_energies                  1  7.0    0.000    0.000   11.014   11.014
 compute_self_energy_cubic_gw         1  8.0    0.044    0.044   11.012   11.013
 dbt_tas_mm_2                       251 15.0    0.002    0.002    9.993    9.993
 dbm_reserve_blocks                3634 15.3    9.337    9.903    9.337    9.903
 dbcsr_multiply_generic              30  8.1    0.002    0.002    8.713    8.745
 mp2_ri_gpw_compute_in                1  5.0    0.001    0.001    8.721    8.721
 multiply_cannon                     30  9.1    0.009    0.010    8.531    8.536
 multiply_cannon_loop                30 10.1    0.004    0.004    8.475    8.480
 scf_env_do_scf                       1  3.0    0.000    0.000    8.411    8.411
 scf_env_do_scf_inner_loop           17  4.0    0.001    0.001    8.411    8.411
 compute_mat_P_omega_copy_M_vir     250  9.0    0.002    0.002    7.397    7.420
 convert_to_new_pgrid              2421 14.1    0.037    0.037    5.539    6.977
 dbm_copy                          1614 15.1    5.502    6.939    5.502    6.939
 compute_mat_P_omega_copy_M_occ     250  9.0    0.001    0.001    6.752    6.754
 multiply_cannon_multrec             60 11.1    0.263    0.266    6.213    6.247
 contract_cubic_gw                   21  9.0    0.000    0.000    6.170    6.170
 dbcsr_mm_accdrv_process            328 12.3    0.024    0.025    5.682    5.725
 jit_kernel_multiply                 17 11.6    5.651    5.694    5.651    5.694
 dbt_tas_copy                       511 11.5    2.495    2.520    5.501    5.564
 mp_waitall_2                      2656 15.9    5.362    5.377    5.362    5.377
 mp_sync                           8688 11.6    3.536    5.242    3.536    5.242
 qs_scf_new_mos                      17  5.0    0.000    0.000    4.805    4.837
 dbt_communicate_buffer             594 12.8    0.010    0.010    4.629    4.666
 get_2c_integrals                     1  6.0    0.000    0.000    4.065    4.065
 calculate_dm_sparse                 17  6.0    0.000    0.000    3.548    3.570
 cp_dbcsr_plus_fm_fm_t_native        17  7.0    0.000    0.000    3.548    3.570
 compute_vec_Sigma_x_minus_vxc_       1  4.0    0.000    0.000    3.337    3.337
 compute_2c_integrals                 1  7.0    0.000    0.000    3.210    3.210
 dbt_tas_reserve_blocks_templat     551 12.6    0.045    0.045    3.123    3.161
 qs_ks_build_kohn_sham_matrix        18  6.9    0.002    0.002    3.138    3.147
 qs_ks_update_qs_env                 17  5.0    0.000    0.000    3.098    3.108
 rebuild_ks_matrix                   17  6.0    0.000    0.000    3.094    3.104
 trace_sigma_gw                      21  9.0    0.499    0.509    3.021    3.021
 build_3c_integrals                   5  6.0    1.925    1.930    2.494    2.655
 -------------------------------------------------------------------------------

Plot: name="GW_PBE_4benzene_timings_6cpu_1gpu", title="Timings of GW_PBE_4benzene with 6 CPU Cores and 1 GPU", ylabel="time [s]"
PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="rest", label="rest", y=64.54299999999999, yerr=0.0
PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="dbm_multiply", label="dbm_multiply", y=30.085, yerr=0.0
PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="dbt_reshape", label="dbt_reshape", y=10.071, yerr=0.0
PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="dbm_reserve_blocks", label="dbm_reserve_blocks", y=9.337, yerr=0.0
PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="dbt_crop", label="dbt_crop", y=8.044, yerr=0.0
PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="compute_mat_P_omega_calc_M_occ", label="compute_mat_P_omega_calc_M_occ", y=6.054, yerr=0.0


Running RI-HFX_H2O-32.inp with 3 threads and 2 ranks... done.


From /workspace/artifacts/RI-HFX_H2O-32_6cpu_1gpu.out:
-------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.023    0.025  238.683  238.684
 qs_forces                            1  2.0    0.000    0.000  238.169  238.169
 rebuild_ks_matrix                    7  6.6    0.000    0.000  234.050  234.050
 qs_ks_build_kohn_sham_matrix         7  7.6    0.001    0.001  234.050  234.050
 hfx_ks_matrix                        7  8.6    0.000    0.000  229.465  229.466
 dbt_total                          849 11.0    0.009    0.009  162.153  162.153
 hfx_ri_update_ks                     7  9.6    0.000    0.000  144.779  144.779
 hfx_ri_update_ks_Pmat                7 10.6   32.345   32.354  144.764  144.768
 qs_energies                          1  3.0    0.000    0.000  134.330  134.330
 scf_env_do_scf                       1  4.0    0.000    0.000  132.698  132.698
 qs_ks_update_qs_env                  8  6.0    0.000    0.000  130.278  130.279
 qs_ks_update_qs_env_forces           1  3.0    0.000    0.000  103.774  103.774
 dbt_contract                       207 12.4    0.051    0.051   98.697   98.706
 hfx_ri_update_forces                 1  7.0    1.216    1.217   84.685   84.685
 scf_env_do_scf_inner_loop            6  5.0    0.000    0.001   74.901   74.901
 dbt_tas_total                      369 13.4    3.314    3.337   74.832   74.832
 dbt_tas_multiply                   216 13.5    0.001    0.001   71.436   71.436
 dbt_copy                           423 11.8    0.046    0.047   58.435   58.971
 init_scf_loop                        2  5.0    0.000    0.000   57.795   57.796
 hfx_ri_forces_Pmat_3c                1  8.0    3.647    3.647   48.262   48.272
 dbt_tas_dbm                        216 15.5    0.001    0.001   47.495   47.495
 dbm_multiply                       216 17.5   45.433   45.974   45.433   45.974
 dbt_reshape                        175 13.2   22.660   23.007   41.945   42.148
 hfx_ri_update_ks_Pmat_KS            63 11.6    0.001    0.001   41.529   41.529
 dbt_tas_reserve_blocks_index      1323 15.4   11.140   11.177   30.410   30.618
 precalc_derivatives                  1  8.0    2.232    2.247   29.177   29.177
 dbt_crop                           372 13.7   15.999   16.035   25.637   25.643
 dbt_reserve_blocks_index           889 14.5    0.337    0.338   23.956   23.983
 dbt_reserve_blocks_index_array     859 13.5    0.006    0.007   23.597   23.625
 hfx_ri_update_ks_Pmat_Px3C          63 11.6    0.000    0.000   21.873   21.874
 dbm_reserve_blocks                1491 16.3   20.097   20.304   20.097   20.304
 hfx_ri_pre_scf_Pmat                  1 12.0    0.000    0.000   19.121   19.121
 dbt_tas_mm_2                        91 16.5    0.001    0.001   18.900   18.900
 dbt_tas_mm_3T                       77 17.1    0.000    0.000   17.803   18.192
 hfx_ri_update_ks_Pmat_copy_2        63 11.6    0.000    0.000   16.442   16.442
 mp_waitall_2                      1022 16.5   16.005   16.321   16.005   16.321
 build_3c_derivatives                 3  9.0    4.041    4.054   15.819   15.821
 dbt_communicate_buffer             175 14.2    0.004    0.004   13.263   13.443
 dbt_tas_copy                       248 12.5    4.286    4.360   10.499   10.801
 convert_to_new_pgrid               648 15.5    0.015    0.016    9.103    9.189
 dbm_copy                           459 16.3    9.088    9.173    9.088    9.173
 dbt_tas_mm_3N                       37 15.4    0.000    0.000    8.896    8.918
 mp_sync                           2901 12.8    6.579    7.660    6.579    7.660
 dbt_tas_reserve_blocks_templat     266 13.6    0.071    0.072    6.329    6.555
 hfx_ri_pre_scf_Pmat_copy_2           9 13.0    2.999    3.004    6.231    6.236
 hfx_ri_pre_scf_Pmat_int              1 13.0    0.000    0.000    5.148    5.148
 dbt_tas_replicate                  168 15.1    2.424    2.490    4.967    4.977
 -------------------------------------------------------------------------------

Plot: name="RI-HFX_H2O-32_timings_6cpu_1gpu", title="Timings of RI-HFX_H2O-32 with 6 CPU Cores and 1 GPU", ylabel="time [s]"
PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="rest", label="rest", y=102.143, yerr=0.0
PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="dbm_multiply", label="dbm_multiply", y=45.433, yerr=0.0
PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="hfx_ri_update_ks_Pmat", label="hfx_ri_update_ks_Pmat", y=32.345, yerr=0.0
PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="dbt_reshape", label="dbt_reshape", y=22.66, yerr=0.0
PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="dbm_reserve_blocks", label="dbm_reserve_blocks", y=20.097, yerr=0.0
PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="mp_waitall_2", label="mp_waitall_2", y=16.005, yerr=0.0


Running RI-MP2_ammonia.inp with 3 threads and 2 ranks... done.


From /workspace/artifacts/RI-MP2_ammonia_6cpu_1gpu.out:
-------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.014    0.016  112.665  112.666
 qs_energies                          1  2.0    0.000    0.000  112.445  112.445
 mp2_main                             1  3.0    0.000    0.000  104.974  104.974
 mp2_gpw_main                         1  4.0    0.001    0.002  104.571  104.571
 mp2_ri_gpw_compute_in                1  5.0    0.542    0.542   62.558   62.583
 mp2_ri_gpw_compute_in_loop           1  6.0    0.012    0.012   51.692   51.717
 mp2_ri_gpw_compute_en                1  5.0    0.097    0.097   41.952   41.977
 mp2_ri_gpw_compute_en_RI_loop        1  6.0   12.781   12.800   39.352   39.353
 mp2_eri_3c_integrate_gpw          1328  7.0    0.015    0.015   25.597   28.486
 dbcsr_multiply_generic            2666  8.0    0.144    0.148   22.416   25.257
 ao_to_mo_and_store_B_mult_1       1328  7.0    0.011    0.012   20.469   23.310
 multiply_cannon                   2666  9.0    0.392    0.413   13.937   16.748
 mp2_ri_gpw_compute_en_expansio    1040  7.0    0.710    0.710   16.071   16.169
 local_gemm                        1040  8.0   15.362   15.459   15.362   15.459
 multiply_cannon_loop              2666 10.0    0.160    0.160   12.615   15.411
 calculate_wavefunction            1328  8.0    7.674    7.689   12.084   14.792
 fft_wrap_pw1pw2                  26668 10.4    0.155    0.158    9.329   12.061
 integrate_v_rspace                1338  8.0    1.003    1.008   11.343   11.568
 get_2c_integrals                     1  6.0    0.002    0.004   10.307   10.323
 compute_2c_integrals                 1  7.0    0.005    0.006    9.612    9.612
 compute_2c_integrals_loop_lm         1  8.0    0.008    0.011    9.474    9.517
 mp2_eri_2c_integrate_gpw             1  9.0    3.096    3.126    9.466    9.512
 grid_integrate_task_list          1338  9.0    9.187    9.400    9.187    9.400
 multiply_cannon_multrec           2676 11.0    3.824    5.460    7.282    8.941
 pw_gpu_r3dc1d_3d                 13282 12.2    5.950    8.687    5.950    8.687
 make_m2s                          5332  9.0    0.053    0.055    7.787    7.810
 fft_wrap_pw1pw2_20               10647 11.4    0.017    0.017    5.823    7.774
 make_images                       5332 10.0    2.750    2.757    7.596    7.622
 scf_env_do_scf                       1  3.0    0.000    0.000    6.739    6.741
 scf_env_do_scf_inner_loop           10  4.0    0.001    0.001    6.739    6.741
 qs_scf_new_mos                      10  5.0    0.000    0.000    5.360    5.364
 ao_to_mo_and_store_B_E_Ex_1       1328  7.0    3.392    3.423    5.267    5.322
 calc_potential_gpw                2656  9.5    0.015    0.015    4.881    4.951
 mp2_ri_gpw_compute_en_ener        1040  7.0    4.903    4.934    4.903    4.934
 mp2_ri_gpw_compute_en_comm         221  7.0    1.028    1.028    4.505    4.591
 pw_poisson_solve                  2666 10.5    0.026    0.026    4.100    4.171
 mp2_eri_2c_integrate_gpw_pot_l    1328 10.0    0.004    0.004    3.874    3.921
 fft_wrap_pw1pw2_10               15957 11.5    0.020    0.021    2.864    3.646
 multiply_cannon_sync_h2d          2676 11.0    2.463    3.580    2.463    3.580
 pw_poisson_set                    2669 11.5    0.035    0.036    3.334    3.398
 hybrid_alltoall_any               6683 11.6    2.940    2.970    3.202    3.231
 dbcsr_mm_accdrv_process           5392 12.0    0.264    0.265    3.219    3.227
 pw_derive                         7998 12.5    3.088    3.149    3.088    3.149
 make_images_data                  5332 11.0    0.075    0.076    3.122    3.148
 eigensolver                         11  5.8    0.001    0.001    3.070    3.070
 jit_kernel_multiply                  8 13.0    2.852    2.861    2.852    2.861
 pw_gpu_c1dr3d_3d                 13280 12.7    2.668    2.673    2.668    2.673
 mp_sendrecv_dm3                    442  8.0    2.518    2.607    2.518    2.607
 potential_pw2rs                   2666 10.0    0.108    0.108    2.548    2.550
 cp_fm_diag_elpa                     11  6.8    0.000    0.000    2.386    2.386
 cp_fm_diag_elpa_base                11  7.8    2.252    2.267    2.384    2.385
 copy_dbcsr_to_fm                  1351  8.0    0.026    0.027    2.278    2.301
 -------------------------------------------------------------------------------

Plot: name="RI-MP2_ammonia_timings_6cpu_1gpu", title="Timings of RI-MP2_ammonia with 6 CPU Cores and 1 GPU", ylabel="time [s]"
PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="rest", label="rest", y=61.711000000000006, yerr=0.0
PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="local_gemm", label="local_gemm", y=15.362, yerr=0.0
PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="mp2_ri_gpw_compute_en_RI_loop", label="mp2_ri_gpw_compute_en_RI_loop", y=12.781, yerr=0.0
PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="grid_integrate_task_list", label="grid_integrate_task_list", y=9.187, yerr=0.0
PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="calculate_wavefunction", label="calculate_wavefunction", y=7.674, yerr=0.0
PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="pw_gpu_r3dc1d_3d", label="pw_gpu_r3dc1d_3d", y=5.95, yerr=0.0


Running diag_cu144_broy.inp with 3 threads and 2 ranks... done.


From /workspace/artifacts/diag_cu144_broy_6cpu_1gpu.out:
-------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.077    0.079  208.541  208.542
 qs_energies                          1  2.0    0.000    0.000  207.242  207.243
 scf_env_do_scf                       1  3.0    0.000    0.000  191.422  191.422
 scf_env_do_scf_inner_loop           15  4.0    0.001    0.002  191.421  191.422
 qs_ks_update_qs_env                 15  5.0    0.000    0.000  108.246  108.296
 rebuild_ks_matrix                   15  6.0    0.000    0.000  108.065  108.115
 qs_ks_build_kohn_sham_matrix        15  7.0    0.002    0.002  108.065  108.114
 qs_vxc_create                       15  8.0    0.035    0.069   70.240   70.295
 calculate_dispersion_nonloc         15  9.0   13.046   13.067   57.839   57.863
 qs_scf_new_mos                      15  5.0    0.000    0.001   56.432   56.507
 eigensolver                         15  6.0    0.002    0.002   47.297   47.348
 sum_up_and_integrate                15  8.0    0.000    0.000   35.533   35.641
 integrate_v_rspace                  15  9.0    0.048    0.049   35.505   35.615
 grid_integrate_task_list            15 10.0   33.930   33.981   33.930   33.981
 fft_wrap_pw1pw2                   1086 10.0    0.024    0.025   32.706   32.775
 cp_fm_diag_elpa                     15  7.0    0.000    0.000   28.047   28.052
 cp_fm_diag_elpa_base                15  8.0   25.390   26.040   28.041   28.042
 qs_rho_update_rho_low               16  5.0    0.000    0.000   23.221   23.221
 calculate_rho_elec                  16  6.0    0.182    0.182   23.221   23.221
 fft_wrap_pw1pw2_150                765 11.0    0.004    0.004   22.249   22.267
 grid_collocate_task_list            16  7.0   20.337   20.361   20.337   20.361
 pw_gpu_c1dr3d_3d_ps                585 12.1    5.497    5.584   17.738   17.749
 cp_fm_cholesky_restore              45  7.0   16.523   17.317   16.523   17.317
 vdW_energy                          15 10.0   17.217   17.273   17.217   17.273
 pw_gpu_r3dc1d_3d_ps                501 11.9    4.844    5.185   14.937   14.995
 qs_energies_init_hamiltonians        1  3.0    0.000    0.000   12.873   12.873
 xc_vxc_pw_create                    15  9.0    0.190    0.192   12.366   12.370
 build_core_hamiltonian_matrix        1  4.0    0.000    0.000   11.245   11.279
 fft_wrap_pw1pw2_200                197 11.3    0.001    0.001    9.838    9.839
 xc_rho_set_and_dset_create          15 10.0    0.141    0.143    8.404    8.413
 mp_alltoall_z22v                  1086 14.0    6.817    7.347    6.817    7.347
 cp_fm_upper_to_full                 30  8.0    5.376    6.764    5.376    6.764
 copy_dbcsr_to_fm                    16  5.9    0.001    0.001    5.885    5.971
 xc_pw_derive                        90 11.0    0.001    0.001    5.949    5.961
 build_core_ppnl                      1  5.0    5.774    5.801    5.774    5.801
 dbcsr_complete_redistribute         46  8.3    1.762    1.778    5.255    5.279
 xc_functional_eval                  15 11.0    0.000    0.000    5.180    5.188
 pbe_lda_eval                        15 12.0    5.180    5.188    5.180    5.188
 x_to_yz                            585 13.1    1.285    1.293    4.926    5.026
 gspace_mixing                       14  5.0    0.131    0.131    4.808    4.808
 yz_to_x                            501 12.9    0.880    0.884    4.057    4.473
 pw_gpu_sf                          585 13.1    4.198    4.201    4.198    4.201
 -------------------------------------------------------------------------------

Plot: name="diag_cu144_broy_timings_6cpu_1gpu", title="Timings of diag_cu144_broy with 6 CPU Cores and 1 GPU", ylabel="time [s]"
PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="rest", label="rest", y=95.144, yerr=0.0
PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="grid_integrate_task_list", label="grid_integrate_task_list", y=33.93, yerr=0.0
PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="cp_fm_diag_elpa_base", label="cp_fm_diag_elpa_base", y=25.39, yerr=0.0
PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="grid_collocate_task_list", label="grid_collocate_task_list", y=20.337, yerr=0.0
PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="vdW_energy", label="vdW_energy", y=17.217, yerr=0.0
PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="cp_fm_cholesky_restore", label="cp_fm_cholesky_restore", y=16.523, yerr=0.0


Running bench_dftb.inp with 3 threads and 2 ranks... done.


From /workspace/artifacts/bench_dftb_6cpu_1gpu.out:
-------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.043    0.044  293.188  293.188
 qs_energies                          1  2.0    0.000    0.000  293.071  293.071
 ls_scf                               1  3.0    0.000    0.000  292.187  292.188
 ls_scf_main                          1  4.0    0.001    0.001  281.306  281.307
 density_matrix_trs4                 11  5.0    0.008    0.010  237.204  237.255
 dbcsr_multiply_generic             185  6.1    0.160    0.185  204.562  204.587
 multiply_cannon                    185  7.1    1.642    1.681  152.044  152.573
 multiply_cannon_loop               185  8.1    0.262    0.267  133.476  133.795
 multiply_cannon_multrec            370  9.1  100.683  100.822  117.537  117.544
 make_m2s                           370  7.1    0.042    0.042   43.974   44.066
 make_images                        370  8.1   15.329   15.730   42.776   42.864
 ls_scf_dm_to_ks                     11  5.0    0.000    0.000   39.717   39.757
 matrix_ls_to_qs                     11  6.0    0.000    0.000   37.424   37.498
 dbcsr_complete_redistribute         23  7.5   22.908   22.993   32.098   32.188
 matrix_decluster                    11  7.0    0.000    0.000   29.459   29.548
 arnoldi_extremal                    12  6.1    0.000    0.000   22.627   22.629
 arnoldi_normal_ev                   12  7.1    0.140    0.141   22.627   22.629
 build_subspace                      23  8.1    0.064    0.065   21.988   21.989
 dbcsr_matrix_vector_mult           652  9.0    0.147    0.148   20.481   20.559
 dbcsr_matrix_vector_mult_local     652 10.0   19.460   19.540   19.466   19.546
 dbcsr_finalize                     646  7.5    0.192    0.195   14.844   15.193
 calculate_norms                    740  9.1   14.568   14.897   14.568   14.897
 dbcsr_mm_accdrv_process          14501 10.0    0.941    1.040   14.626   14.719
 dbcsr_merge_all                    597  8.5    2.288    2.587   13.579   13.929
 setup_rec_index_2d                 370  8.1   13.860   13.926   13.860   13.926
 dbcsr_mm_accdrv_process_sort     14501 11.0   13.555   13.618   13.555   13.618
 dbcsr_special_finalize             555  9.1    0.008    0.008   10.619   10.692
 dbcsr_sort_indices                1287 10.0   10.408   10.499   10.408   10.499
 make_images_data                   370  9.1    0.012    0.012    9.570   10.024
 hybrid_alltoall_any                393  9.9    4.177    4.230    8.905    9.368
 ls_scf_init_scf                      1  4.0    0.000    0.000    9.151    9.152
 ls_scf_init_matrix_S                 1  5.0    0.000    0.000    8.910    8.913
 dbcsr_add_d                        280  6.0    0.001    0.001    8.034    8.373
 dbcsr_add_anytype                  280  7.0    3.697    3.713    8.033    8.372
 matrix_sqrt_Newton_Schulz            1  6.0    0.000    0.001    8.141    8.144
 dbcsr_copy_into_existing            11  7.0    7.965    7.980    7.965    7.980
 tree_to_linear_d                   110  9.4    7.838    7.847    7.838    7.847
 dbcsr_merge_single_wm              370 10.1    0.511    0.525    6.545    6.555
 -------------------------------------------------------------------------------

Plot: name="bench_dftb_timings_6cpu_1gpu", title="Timings of bench_dftb with 6 CPU Cores and 1 GPU", ylabel="time [s]"
PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="rest", label="rest", y=120.23999999999995, yerr=0.0
PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="multiply_cannon_multrec", label="multiply_cannon_multrec", y=100.683, yerr=0.0
PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="dbcsr_complete_redistribute", label="dbcsr_complete_redistribute", y=22.908, yerr=0.0
PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="dbcsr_matrix_vector_mult_local", label="dbcsr_matrix_vector_mult_local", y=19.46, yerr=0.0
PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="make_images", label="make_images", y=15.329, yerr=0.0
PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="calculate_norms", label="calculate_norms", y=14.568, yerr=0.0


Running dbcsr.inp with 3 threads and 2 ranks... done.


From /workspace/artifacts/dbcsr_6cpu_1gpu.out:
-------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.004    0.006   68.002   68.003
 lib_test                             1  2.0    0.000    0.000   67.980   67.995
 dbcsr_run_tests                      3  3.0    0.001    0.001   67.980   67.993
 test_multiplies_multiproc            3  4.0    0.001    0.001   55.828   55.884
 dbcsr_multiply_generic               9  5.0    0.001    0.001   41.495   41.515
 multiply_cannon                      9  6.0    0.271    0.345   33.232   33.431
 multiply_cannon_loop                 9  7.0    0.003    0.003   31.723   31.804
 multiply_cannon_multrec             18  8.0   16.627   16.682   29.604   29.687
 dbcsr_mm_accdrv_process           8199  9.0    1.278    1.457   12.745   12.769
 dbcsr_make_random_matrix             9  4.0    8.874    8.917   12.026   12.081
 dbcsr_redistribute                   9  5.0    7.649    7.653    9.859    9.868
 dbcsr_mm_accdrv_process_sort      8199 10.0    8.910    8.916    8.910    8.916
 dbcsr_finalize                      27  5.7    0.001    0.001    7.292    7.294
 dbcsr_merge_all                     18  6.5    3.643    3.654    7.168    7.173
 make_m2s                            18  6.0    0.001    0.001    3.600    3.603
 make_images                         18  7.0    0.513    0.515    3.562    3.565
 dbcsr_checksum                       6  5.0    2.858    2.893    2.895    2.895
 jit_kernel_multiply                  8 10.0    2.556    2.753    2.556    2.753
 tree_to_linear_d                     9  7.0    1.923    1.937    1.923    1.937
 mp_alltoall_d11v                    27  6.0    1.908    1.913    1.908    1.913
 dbcsr_data_release                 507  7.7    1.491    1.508    1.491    1.508
 dbcsr_data_copy_aa2                 18  7.5    1.451    1.453    1.451    1.453
 -------------------------------------------------------------------------------

Plot: name="dbcsr_timings_6cpu_1gpu", title="Timings of dbcsr with 6 CPU Cores and 1 GPU", ylabel="time [s]"
PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="rest", label="rest", y=22.298999999999992, yerr=0.0
PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="multiply_cannon_multrec", label="multiply_cannon_multrec", y=16.627, yerr=0.0
PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="dbcsr_mm_accdrv_process_sort", label="dbcsr_mm_accdrv_process_sort", y=8.91, yerr=0.0
PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="dbcsr_make_random_matrix", label="dbcsr_make_random_matrix", y=8.874, yerr=0.0
PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="dbcsr_redistribute", label="dbcsr_redistribute", y=7.649, yerr=0.0
PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="dbcsr_merge_all", label="dbcsr_merge_all", y=3.643, yerr=0.0


Running MQAE_single_node.inp with 3 threads and 2 ranks... done.


From /workspace/artifacts/MQAE_single_node_6cpu_1gpu.out:
-------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.055    0.057  327.878  327.879
 qs_mol_dyn_low                       1  2.0    0.005    0.005  326.010  326.069
 velocity_verlet                      5  3.0    0.004    0.004  183.493  183.580
 qs_forces                            6  3.8    0.001    0.001  164.512  164.512
 qs_energies                          6  4.8    0.001    0.001  155.359  155.360
 scf_env_do_scf                       6  5.8    0.001    0.001  145.219  145.219
 scf_env_do_scf_inner_loop          113  6.2    0.005    0.008  134.549  134.550
 rebuild_ks_matrix                  119  8.1    0.000    0.000  119.404  119.407
 qs_ks_build_kohn_sham_matrix       119  9.1    0.018    0.018  119.403  119.407
 qs_ks_update_qs_env                119  7.3    0.001    0.001  113.425  113.429
 qmmm_forces                          6  3.8    0.002    0.002   80.409   80.409
 qmmm_forces_with_gaussian            6  4.8    0.023    0.023   79.793   80.033
 qs_vxc_create                      119 10.1    0.002    0.002   78.802   78.804
 xc_vxc_pw_create                   119 11.1    1.638    1.676   78.800   78.803
 qmmm_el_coupling                     6  3.8    0.000    0.000   77.514   77.522
 qmmm_elec_with_gaussian              6  4.8    0.022    0.022   77.508   77.516
 qmmm_force_with_gaussian_low         6  5.8    0.000    0.000   76.457   76.703
 qmmm_elec_with_gaussian_low          6  5.8    0.000    0.000   73.599   74.049
 fft_wrap_pw1pw2                   2059 12.4    0.043    0.045   65.390   65.614
 qmmm_elec_gaussian_low_G             6  6.8   64.549   65.011   64.549   65.011
 fft_wrap_pw1pw2_150               1321 13.9    0.009    0.009   63.235   63.465
 xc_pw_derive                       714 13.1    0.009    0.010   49.091   49.186
 qmmm_forces_gaussian_low_G           6  6.8   47.831   48.010   47.831   48.010
 xc_rho_set_and_dset_create         119 12.1    4.323    4.324   45.907   45.946
 pw_gpu_c1dr3d_3d_ps               1095 14.8   10.761   10.946   35.263   35.295
 xc_pw_divergence                   119 12.1    0.006    0.006   30.746   30.760
 pw_gpu_r3dc1d_3d_ps                964 14.0    9.483    9.600   30.072   30.262
 qmmm_forces_gaussian_low_R           6  6.8    0.000    0.000   28.626   28.693
 qmmm_forces_with_gaussian_LG         6  7.8   28.626   28.693   28.626   28.693
 pw_derive                         1089 13.4   21.945   22.208   21.945   22.208
 qs_rho_update_rho_low              119  7.3    0.001    0.001   20.066   20.295
 calculate_rho_elec                 119  8.3    1.190    1.231   20.066   20.294
 dbcsr_multiply_generic            2588 12.3    0.105    0.106   18.041   18.066
 xc_functional_eval                 238 13.1    0.002    0.002   16.368   16.368
 multiply_cannon                   2588 13.3    0.248    0.249   16.195   16.315
 sum_up_and_integrate               119 10.1    0.002    0.002   16.078   16.199
 density_rs2pw                      119  9.3    0.008    0.008   15.875   16.087
 integrate_v_rspace                 119 11.1    0.022    0.022   15.868   15.990
 multiply_cannon_loop              2588 14.3    0.229    0.233   15.600   15.722
 mp_alltoall_z22v                  2059 16.4   13.589   14.191   13.589   14.191
 pw_poisson_solve                   125  9.9    0.003    0.003   11.922   11.964
 multiply_cannon_multrec           5176 15.3    4.383    4.436   11.591   11.715
 init_scf_loop                        6  6.8    0.000    0.000   10.666   10.666
 lyp_lda_eval                       119 14.1   10.315   10.321   10.315   10.321
 x_to_yz                           1095 15.8    2.724    2.739    9.923   10.217
 qs_ks_ddapc                        119 10.1    0.002    0.002   10.028   10.081
 potential_pw2rs                    119 12.1    0.035    0.035    9.680    9.684
 qmmm_elec_gaussian_low_R             6  6.8    0.000    0.000    9.050    9.062
 qmmm_elec_with_gaussian_LG           6  7.8    9.050    9.062    9.050    9.062
 qs_scf_new_mos                     113  7.2    0.001    0.001    8.501    8.504
 qs_scf_loop_do_ot                  113  8.2    0.001    0.001    8.500    8.504
 yz_to_x                            964 15.0    1.777    1.785    8.167    8.451
 pw_gpu_sf                         1095 15.8    8.368    8.407    8.368    8.407
 ot_scf_mini                        113  9.2    0.002    0.002    8.187    8.191
 pw_poisson_set                     128 10.8    0.005    0.005    8.015    8.056
 pw_gpu_fg                          964 15.0    7.790    7.797    7.790    7.797
 init_scf_run                         6  5.8    0.000    0.000    7.583    7.583
 scf_env_initial_rho_setup            6  6.8    0.000    0.000    7.583    7.583
 dbcsr_mm_accdrv_process          13832 16.0    1.668    1.751    7.133    7.201
 -------------------------------------------------------------------------------

Plot: name="MQAE_single_node_timings_6cpu_1gpu", title="Timings of MQAE_single_node with 6 CPU Cores and 1 GPU", ylabel="time [s]"
PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="rest", label="rest", y=151.338, yerr=0.0
PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="qmmm_elec_gaussian_low_G", label="qmmm_elec_gaussian_low_G", y=64.549, yerr=0.0
PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="qmmm_forces_gaussian_low_G", label="qmmm_forces_gaussian_low_G", y=47.831, yerr=0.0
PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="qmmm_forces_with_gaussian_LG", label="qmmm_forces_with_gaussian_LG", y=28.626, yerr=0.0
PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="pw_derive", label="pw_derive", y=21.945, yerr=0.0
PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="mp_alltoall_z22v", label="mp_alltoall_z22v", y=13.589, yerr=0.0



Summary: Performance test took 27 minutes.
Status: OK

Removing intermediate container 4e2b3835c5de
 ---> 3da8721dc66e
Step 45/46 : CMD cat $(find ./report.log -mmin +10) | sed '/^Summary:/ s/$/ (cached)/'
 ---> Running in d464d7c12238
Removing intermediate container d464d7c12238
 ---> 934c9e86d4a7
Step 46/46 : ENTRYPOINT []
 ---> Running in d7637065ee0f
Removing intermediate container d7637065ee0f
 ---> e0a741ad3bb0
[Warning] One or more build-args [GIT_COMMIT_SHA] were not consumed
Successfully built e0a741ad3bb0
Successfully tagged us-central1-docker.pkg.dev/cp2k-org-project/cp2kci/img_cp2k-perf-cuda-volta:master

Pushing new image... done.

#################### Running Image cp2k-perf-cuda-volta ####################

Uploading artifacts... done

EndDate: 2024-03-06 09:07:26+00:00