StartDate: 2024-03-06 08:06:09+00:00 CpuId: 12x Intel Xeon W 2000 / D-2100 (Skylake / Cascade Lake) {Skylake}, 14nm GpuId: 1x Tesla V100-SXM2-16GB CommitSHA: 0833382a821d2a82ff86511df71b97b2f014239a CommitTime: 2024-03-05 11:23:13 +0100 CommitAuthor: marcella CommitSubject: projection on reference in RTP (#3298) #################### Building Image cp2k-perf-cuda-volta #################### Dockerfile: /tools/docker/Dockerfile.test_performance_cuda_V100 Build-Path: / Build-Args: GIT_COMMIT_SHA=0833382a821d2a82ff86511df71b97b2f014239a Build-Cache: Yes Populating docker build cache... done. DEPRECATED: The legacy builder is deprecated and will be removed in a future release. BuildKit is currently disabled; enable it by removing the DOCKER_BUILDKIT=0 environment-variable. Sending build context to Docker daemon 393.7MB Step 1/46 : FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 11.8.0-devel-ubuntu22.04: Pulling from nvidia/cuda aece8493d397: Already exists 5e3b7ee77381: Already exists 5bd037f007fd: Already exists 4cda774ad2ec: Already exists 775f22adee62: Already exists 263fc748118f: Already exists 16c36d0187d0: Already exists e7a56570655c: Already exists 507fc9045cba: Already exists 23b7d8e07c16: Already exists 922ac8fcb889: Already exists Digest: sha256:94fd755736cb58979173d491504f0b573247b1745250249415b07fefc738e41f Status: Downloaded newer image for nvidia/cuda:11.8.0-devel-ubuntu22.04 ---> 6f9cc9f1ba9e Step 2/46 : ENV CUDA_PATH /usr/local/cuda ---> Using cache ---> ea6c9bc4eda6 Step 3/46 : ENV LD_LIBRARY_PATH /usr/local/cuda/lib64 ---> Using cache ---> 223d787cdd89 Step 4/46 : ENV CUDA_CACHE_DISABLE 1 ---> Using cache ---> 1774168f85a8 Step 5/46 : RUN apt-get update -qq && apt-get install -qq --no-install-recommends gfortran mpich libmpich-dev && rm -rf /var/lib/apt/lists/* ---> Using cache ---> cd35350707cc Step 6/46 : WORKDIR /opt/cp2k-toolchain ---> Using cache ---> ca16f8d162c0 Step 7/46 : COPY ./tools/toolchain/install_requirements*.sh ./ ---> Using cache ---> a24633b5dbbf Step 8/46 : RUN ./install_requirements.sh ubuntu ---> Using cache ---> 07030e2616fd Step 9/46 : RUN mkdir scripts ---> Using cache ---> 0550a65eda27 Step 10/46 : COPY ./tools/toolchain/scripts/VERSION ./tools/toolchain/scripts/parse_if.py ./tools/toolchain/scripts/tool_kit.sh ./tools/toolchain/scripts/common_vars.sh ./tools/toolchain/scripts/signal_trap.sh ./tools/toolchain/scripts/get_openblas_arch.sh ./scripts/ ---> Using cache ---> 5c8537531ebc Step 11/46 : COPY ./tools/toolchain/install_cp2k_toolchain.sh . ---> Using cache ---> 899829448a26 Step 12/46 : RUN ./install_cp2k_toolchain.sh --mpi-mode=mpich --enable-cuda=yes --gpu-ver=V100 --dry-run ---> Using cache ---> 7d25ffc2dd66 Step 13/46 : COPY ./tools/toolchain/scripts/stage0/ ./scripts/stage0/ ---> Using cache ---> 48de2d0e812f Step 14/46 : RUN ./scripts/stage0/install_stage0.sh && rm -rf ./build ---> Using cache ---> 2de33e608cf9 Step 15/46 : COPY ./tools/toolchain/scripts/stage1/ ./scripts/stage1/ ---> Using cache ---> fc54d5881560 Step 16/46 : RUN ./scripts/stage1/install_stage1.sh && rm -rf ./build ---> Using cache ---> 1acd0d5dd321 Step 17/46 : COPY ./tools/toolchain/scripts/stage2/ ./scripts/stage2/ ---> Using cache ---> b561717c4bd2 Step 18/46 : RUN ./scripts/stage2/install_stage2.sh && rm -rf ./build ---> Using cache ---> f0a3db32f288 Step 19/46 : COPY ./tools/toolchain/scripts/stage3/ ./scripts/stage3/ ---> Using cache ---> 0a02b97385e5 Step 20/46 : RUN ./scripts/stage3/install_stage3.sh && rm -rf ./build ---> Using cache ---> ed8a510bafda Step 21/46 : COPY ./tools/toolchain/scripts/stage4/ ./scripts/stage4/ ---> Using cache ---> aacbf7c039f4 Step 22/46 : RUN ./scripts/stage4/install_stage4.sh && rm -rf ./build ---> Using cache ---> 093df72cee90 Step 23/46 : COPY ./tools/toolchain/scripts/stage5/ ./scripts/stage5/ ---> Using cache ---> f941008502f3 Step 24/46 : RUN ./scripts/stage5/install_stage5.sh && rm -rf ./build ---> Using cache ---> 7ac1d6edcafe Step 25/46 : COPY ./tools/toolchain/scripts/stage6/ ./scripts/stage6/ ---> Using cache ---> 9f00baa45625 Step 26/46 : RUN ./scripts/stage6/install_stage6.sh && rm -rf ./build ---> Using cache ---> a04df3bc4ca9 Step 27/46 : COPY ./tools/toolchain/scripts/stage7/ ./scripts/stage7/ ---> Using cache ---> ab14c7b62979 Step 28/46 : RUN ./scripts/stage7/install_stage7.sh && rm -rf ./build ---> Using cache ---> 175ff3f3ff45 Step 29/46 : COPY ./tools/toolchain/scripts/stage8/ ./scripts/stage8/ ---> Using cache ---> 42d8f5e5ed6b Step 30/46 : RUN ./scripts/stage8/install_stage8.sh && rm -rf ./build ---> Using cache ---> 795f298dbbf6 Step 31/46 : COPY ./tools/toolchain/scripts/arch_base.tmpl ./tools/toolchain/scripts/generate_arch_files.sh ./scripts/ ---> Using cache ---> 0daf7f83387a Step 32/46 : RUN ./scripts/generate_arch_files.sh && rm -rf ./build ---> Using cache ---> c3e44bbde9a1 Step 33/46 : WORKDIR /opt/cp2k ---> Using cache ---> b7a099eb4fe0 Step 34/46 : COPY ./Makefile . ---> Using cache ---> 1b70cd75d243 Step 35/46 : COPY ./src ./src ---> 93defcf1adda Step 36/46 : COPY ./exts ./exts ---> 5547d8e84366 Step 37/46 : COPY ./tools/build_utils ./tools/build_utils ---> 3797626fdebe Step 38/46 : RUN /bin/bash -c " mkdir -p arch && ln -vs /opt/cp2k-toolchain/install/arch/local_cuda.psmp ./arch/" ---> Running in 1c5fc0856171 './arch/local_cuda.psmp' -> '/opt/cp2k-toolchain/install/arch/local_cuda.psmp' Removing intermediate container 1c5fc0856171 ---> 0391fe540412 Step 39/46 : COPY ./data ./data ---> 482b93d5bf53 Step 40/46 : COPY ./tests ./tests ---> c697bc67080c Step 41/46 : COPY ./tools/regtesting ./tools/regtesting ---> 4619198eef4f Step 42/46 : COPY ./benchmarks ./benchmarks ---> cff455ec0b77 Step 43/46 : COPY ./tools/docker/scripts/test_performance.sh ./tools/docker/scripts/plot_performance.py ./ ---> d82aa156bb42 Step 44/46 : RUN ./test_performance.sh "local_cuda" 2>&1 | tee report.log ---> Running in 4e2b3835c5de ========== Compiling CP2K ========== Compiling cp2k... done. Checking benchmark inputs... Found 75 input files and 0 errors. ========== Running Performance Test ========== Running H2O-64.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/H2O-64_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.030 0.031 118.959 118.960 qs_mol_dyn_low 1 2.0 0.004 0.004 118.509 118.513 qs_forces 11 3.9 0.002 0.002 118.447 118.447 qs_energies 11 4.9 0.001 0.001 99.129 99.130 velocity_verlet 10 3.0 0.001 0.002 76.360 76.383 scf_env_do_scf 11 5.9 0.001 0.001 75.461 75.462 scf_env_do_scf_inner_loop 108 6.5 0.006 0.008 64.518 64.518 rebuild_ks_matrix 119 8.3 0.001 0.001 35.049 35.054 qs_ks_build_kohn_sham_matrix 119 9.3 0.018 0.018 35.048 35.053 qs_ks_update_qs_env 119 7.6 0.001 0.001 31.868 31.874 dbcsr_multiply_generic 2286 12.5 0.131 0.132 26.914 26.954 qs_scf_new_mos 108 7.5 0.001 0.001 20.869 20.870 qs_scf_loop_do_ot 108 8.5 0.001 0.001 20.868 20.869 ot_scf_mini 108 9.5 0.003 0.003 18.892 18.898 multiply_cannon 2286 13.5 0.365 0.368 16.700 16.724 qs_rho_update_rho_low 119 7.7 0.001 0.001 16.684 16.699 calculate_rho_elec 119 8.7 1.115 1.123 16.683 16.698 fft_wrap_pw1pw2 1201 11.6 0.024 0.025 16.075 16.129 multiply_cannon_loop 2286 14.5 0.230 0.230 15.383 15.417 build_core_hamiltonian_matrix_ 11 4.9 0.001 0.001 14.792 15.092 qs_energies_init_hamiltonians 11 5.9 0.000 0.000 15.066 15.066 sum_up_and_integrate 119 10.3 0.002 0.002 14.953 15.024 integrate_v_rspace 119 11.3 0.392 0.396 14.835 14.906 fft_wrap_pw1pw2_140 487 12.2 0.003 0.003 13.972 14.050 ot_mini 108 10.5 0.001 0.001 11.319 11.323 init_scf_loop 11 6.9 0.000 0.000 10.856 10.856 density_rs2pw 119 9.7 0.008 0.008 10.345 10.462 multiply_cannon_multrec 4572 15.5 2.582 2.597 9.184 9.194 grid_integrate_task_list 119 12.3 8.367 8.440 8.367 8.440 make_m2s 4572 13.5 0.048 0.049 8.402 8.405 pw_gpu_r3dc1d_3d_ps 606 13.1 2.440 2.454 8.214 8.227 make_images 4572 14.5 1.607 1.612 8.197 8.201 init_scf_run 11 5.9 0.001 0.001 7.958 7.958 scf_env_initial_rho_setup 11 6.9 0.000 0.001 7.958 7.958 pw_gpu_c1dr3d_3d_ps 595 14.2 2.386 2.403 7.830 7.872 build_core_ppl_forces 11 5.9 7.597 7.835 7.597 7.835 qs_env_update_s_mstruct 11 6.9 0.000 0.000 7.570 7.573 qs_ot_get_derivative 108 11.5 0.001 0.001 7.474 7.477 build_core_hamiltonian_matrix 11 6.9 0.001 0.001 6.996 7.154 pw_poisson_solve 119 10.3 0.003 0.003 7.133 7.143 prepare_preconditioner 11 7.9 0.000 0.000 6.991 6.992 make_preconditioner 11 8.9 0.000 0.000 6.991 6.992 qs_create_task_list 11 7.9 0.000 0.000 6.650 6.800 generate_qs_task_list 11 8.9 2.222 2.235 6.650 6.800 dbcsr_mm_accdrv_process 9594 16.2 1.113 1.552 6.196 6.217 make_full_inverse_cholesky 11 9.9 0.000 0.000 5.949 6.215 potential_pw2rs 119 12.3 0.046 0.046 6.076 6.076 grid_collocate_task_list 119 9.7 5.183 5.264 5.183 5.264 qs_vxc_create 119 10.3 0.002 0.002 5.123 5.148 xc_vxc_pw_create 119 11.3 0.891 0.897 5.121 5.146 jit_kernel_multiply 12 15.8 4.366 4.786 4.366 4.786 pw_poisson_set 120 11.2 0.005 0.005 4.626 4.636 calculate_dm_sparse 119 9.5 0.001 0.001 4.293 4.298 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 4.218 4.218 pw_derive 357 12.3 4.204 4.215 4.204 4.215 calculate_first_density_matrix 1 7.0 0.000 0.000 4.213 4.213 xc_rho_set_and_dset_create 119 12.3 0.007 0.007 4.097 4.125 build_core_ppl 11 7.9 3.971 4.101 3.971 4.101 grid_create_task_list 11 9.9 3.947 4.081 3.947 4.081 xc_functional_eval 119 13.3 4.018 4.048 4.018 4.048 build_kinetic_matrix_low 22 6.9 3.737 3.772 3.826 3.863 ot_diis_step 108 11.5 0.005 0.006 3.825 3.825 build_overlap_matrix_low 22 6.9 3.504 3.524 3.590 3.609 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 3.604 3.604 multiply_cannon_sync_h2d 4572 15.5 3.500 3.544 3.500 3.544 wfi_extrapolate 11 7.9 0.001 0.001 3.512 3.513 qs_ot_get_p 119 10.4 0.001 0.001 3.345 3.361 mp_alltoall_z22v 1201 15.6 3.256 3.303 3.256 3.303 dbcsr_complete_redistribute 329 12.2 1.305 1.321 3.045 3.291 apply_preconditioner_dbcsr 119 12.6 0.000 0.000 3.227 3.231 apply_single 119 13.6 0.001 0.001 3.227 3.230 hybrid_alltoall_any 4725 16.4 1.809 1.811 3.131 3.136 make_images_data 4572 15.5 0.066 0.066 3.062 3.069 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 3.055 3.055 mp_waitall_1 64495 16.9 2.729 2.758 2.729 2.758 transfer_rs2pw 487 10.6 0.008 0.008 2.505 2.757 qs_ot_get_derivative_taylor 59 13.0 0.002 0.002 2.742 2.744 cp_dbcsr_plus_fm_fm_t_native 22 8.9 0.001 0.001 2.651 2.652 ------------------------------------------------------------------------------- Plot: name="H2O-64_timings_6cpu_1gpu", title="Timings of H2O-64 with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="rest", label="rest", y=89.242, yerr=0.0 PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="grid_integrate_task_list", label="grid_integrate_task_list", y=8.367, yerr=0.0 PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="build_core_ppl_forces", label="build_core_ppl_forces", y=7.597, yerr=0.0 PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="grid_collocate_task_list", label="grid_collocate_task_list", y=5.183, yerr=0.0 PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="jit_kernel_multiply", label="jit_kernel_multiply", y=4.366, yerr=0.0 PlotPoint: plot="H2O-64_timings_6cpu_1gpu", name="pw_derive", label="pw_derive", y=4.204, yerr=0.0 Running H2O-64_nonortho.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/H2O-64_nonortho_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.030 0.033 115.988 115.988 qs_mol_dyn_low 1 2.0 0.004 0.004 115.465 115.468 qs_forces 11 3.9 0.002 0.002 115.404 115.404 qs_energies 11 4.9 0.001 0.001 95.730 95.731 velocity_verlet 10 3.0 0.002 0.002 75.584 75.607 scf_env_do_scf 11 5.9 0.001 0.001 71.635 71.635 scf_env_do_scf_inner_loop 96 6.5 0.005 0.008 60.415 60.415 rebuild_ks_matrix 107 8.3 0.001 0.001 33.682 33.684 qs_ks_build_kohn_sham_matrix 107 9.3 0.016 0.016 33.681 33.684 qs_ks_update_qs_env 107 7.6 0.001 0.001 30.100 30.102 dbcsr_multiply_generic 1966 12.4 0.115 0.116 24.643 24.733 qs_scf_new_mos 96 7.5 0.001 0.001 18.624 18.632 qs_scf_loop_do_ot 96 8.5 0.001 0.001 18.624 18.632 qs_rho_update_rho_low 107 7.7 0.001 0.001 17.000 17.012 calculate_rho_elec 107 8.7 0.997 1.004 16.999 17.011 ot_scf_mini 96 9.5 0.003 0.003 16.851 16.851 sum_up_and_integrate 107 10.3 0.002 0.002 15.500 15.600 integrate_v_rspace 107 11.3 0.362 0.363 15.393 15.493 multiply_cannon 1966 13.4 0.311 0.313 15.424 15.442 qs_energies_init_hamiltonians 11 5.9 0.000 0.000 15.429 15.429 build_core_hamiltonian_matrix_ 11 4.9 0.001 0.001 14.798 15.150 fft_wrap_pw1pw2 1081 11.6 0.022 0.023 14.415 14.464 multiply_cannon_loop 1966 14.4 0.202 0.204 14.311 14.317 fft_wrap_pw1pw2_140 439 12.2 0.003 0.003 12.524 12.593 init_scf_loop 11 6.9 0.000 0.000 11.134 11.134 ot_mini 96 10.5 0.001 0.001 10.123 10.126 grid_integrate_task_list 107 12.3 9.583 9.684 9.583 9.684 density_rs2pw 107 9.7 0.007 0.007 9.285 9.417 multiply_cannon_multrec 3932 15.4 2.281 2.282 8.527 8.531 qs_env_update_s_mstruct 11 6.9 0.000 0.000 7.911 8.157 init_scf_run 11 5.9 0.001 0.001 8.030 8.031 scf_env_initial_rho_setup 11 6.9 0.000 0.001 8.030 8.030 build_core_ppl_forces 11 5.9 7.594 7.886 7.594 7.886 make_m2s 3932 13.4 0.042 0.042 7.563 7.569 make_images 3932 14.4 1.448 1.459 7.382 7.387 pw_gpu_r3dc1d_3d_ps 546 13.1 2.191 2.219 7.370 7.382 build_core_hamiltonian_matrix 11 6.9 0.001 0.001 7.035 7.229 prepare_preconditioner 11 7.9 0.000 0.000 7.087 7.089 make_preconditioner 11 8.9 0.000 0.000 7.087 7.089 pw_gpu_c1dr3d_3d_ps 535 14.2 2.150 2.161 7.017 7.055 qs_create_task_list 11 7.9 0.000 0.000 6.961 7.008 generate_qs_task_list 11 8.9 2.703 2.733 6.961 7.008 grid_collocate_task_list 107 9.7 6.684 6.789 6.684 6.789 qs_ot_get_derivative 96 11.5 0.001 0.001 6.703 6.703 pw_poisson_solve 107 10.3 0.003 0.003 6.466 6.477 make_full_inverse_cholesky 11 9.9 0.000 0.000 6.078 6.346 dbcsr_mm_accdrv_process 8450 16.1 0.605 0.823 5.886 5.891 potential_pw2rs 107 12.3 0.041 0.041 5.447 5.447 jit_kernel_multiply 13 15.9 4.638 4.854 4.638 4.854 qs_vxc_create 107 10.3 0.002 0.002 4.589 4.610 xc_vxc_pw_create 107 11.3 0.800 0.807 4.587 4.608 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 4.517 4.518 pw_poisson_set 108 11.2 0.004 0.004 4.204 4.214 build_core_ppl 11 7.9 4.013 4.169 4.013 4.169 calculate_first_density_matrix 1 7.0 0.000 0.000 4.083 4.083 calculate_dm_sparse 107 9.5 0.001 0.001 4.060 4.066 grid_create_task_list 11 9.9 3.782 3.853 3.782 3.853 build_kinetic_matrix_low 22 6.9 3.738 3.750 3.830 3.845 pw_derive 321 12.3 3.819 3.829 3.819 3.829 wfi_extrapolate 11 7.9 0.001 0.001 3.696 3.696 xc_rho_set_and_dset_create 107 12.3 0.006 0.006 3.670 3.694 build_overlap_matrix_low 22 6.9 3.540 3.568 3.627 3.655 xc_functional_eval 107 13.3 3.598 3.623 3.598 3.623 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 3.493 3.496 ot_diis_step 96 11.5 0.005 0.005 3.403 3.403 multiply_cannon_sync_h2d 3932 15.4 3.313 3.367 3.313 3.367 dbcsr_complete_redistribute 317 12.2 1.324 1.352 3.065 3.334 mp_alltoall_z22v 1081 15.6 2.910 2.961 2.910 2.961 apply_preconditioner_dbcsr 107 12.6 0.000 0.000 2.948 2.952 apply_single 107 13.6 0.000 0.000 2.948 2.951 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 2.944 2.946 qs_ot_get_p 107 10.4 0.001 0.001 2.933 2.938 hybrid_alltoall_any 4079 16.3 1.639 1.646 2.842 2.846 make_images_data 3932 15.4 0.056 0.056 2.763 2.768 transfer_rs2pw 439 10.6 0.007 0.008 2.345 2.657 cp_dbcsr_plus_fm_fm_t_native 22 8.9 0.001 0.001 2.616 2.619 mp_waitall_1 55487 16.8 2.467 2.515 2.467 2.515 qs_ot_get_derivative_taylor 53 13.0 0.002 0.002 2.400 2.404 copy_dbcsr_to_fm 147 11.2 0.003 0.003 2.306 2.320 ------------------------------------------------------------------------------- Plot: name="H2O-64_nonortho_timings_6cpu_1gpu", title="Timings of H2O-64_nonortho with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="rest", label="rest", y=83.476, yerr=0.0 PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="grid_integrate_task_list", label="grid_integrate_task_list", y=9.583, yerr=0.0 PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="build_core_ppl_forces", label="build_core_ppl_forces", y=7.594, yerr=0.0 PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="grid_collocate_task_list", label="grid_collocate_task_list", y=6.684, yerr=0.0 PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="jit_kernel_multiply", label="jit_kernel_multiply", y=4.638, yerr=0.0 PlotPoint: plot="H2O-64_nonortho_timings_6cpu_1gpu", name="build_core_ppl", label="build_core_ppl", y=4.013, yerr=0.0 Running GW_PBE_4benzene.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/GW_PBE_4benzene_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.020 0.023 128.134 128.135 qs_energies 1 2.0 0.000 0.000 127.786 127.790 mp2_main 1 3.0 0.000 0.000 119.185 119.190 mp2_gpw_main 1 4.0 0.000 0.000 115.766 115.770 rpa_ri_compute_en 1 5.0 0.000 0.000 107.034 107.038 rpa_num_int 1 6.0 0.000 0.001 107.026 107.030 compute_mat_P_omega 1 7.0 0.002 0.002 90.077 90.081 compute_mat_P_omega_contract 10 8.0 6.035 6.099 89.654 89.666 dbt_total 2336 9.6 0.023 0.023 85.244 85.245 dbt_contract 787 11.0 0.051 0.052 59.036 59.175 dbt_tas_total 1149 12.2 0.741 0.943 45.942 45.942 dbt_tas_multiply 807 12.1 0.003 0.003 45.100 45.100 compute_mat_P_omega_calc_M_occ 250 9.0 6.054 6.121 31.149 31.149 dbt_tas_dbm 807 14.1 0.004 0.005 30.620 30.620 dbm_multiply 807 16.1 30.085 30.159 30.085 30.159 dbt_copy 1107 10.7 0.062 0.062 24.688 24.824 compute_mat_P_omega_calc_M_vir 250 9.0 0.001 0.001 20.157 20.157 dbt_tas_mm_1N 524 15.1 0.002 0.002 19.094 19.227 dbt_reshape 594 11.8 10.071 10.072 17.281 17.465 dbt_tas_reserve_blocks_index 3266 14.3 5.284 5.325 14.305 14.865 dbt_crop 1042 12.0 8.044 8.148 12.470 12.858 compute_mat_P_omega_calc_P_t 250 9.0 0.001 0.001 12.027 12.028 dbt_reserve_blocks_index 2347 13.0 0.177 0.178 11.173 11.773 dbt_reserve_blocks_index_array 2289 12.1 0.011 0.011 11.023 11.624 compute_QP_energies 1 7.0 0.000 0.000 11.014 11.014 compute_self_energy_cubic_gw 1 8.0 0.044 0.044 11.012 11.013 dbt_tas_mm_2 251 15.0 0.002 0.002 9.993 9.993 dbm_reserve_blocks 3634 15.3 9.337 9.903 9.337 9.903 dbcsr_multiply_generic 30 8.1 0.002 0.002 8.713 8.745 mp2_ri_gpw_compute_in 1 5.0 0.001 0.001 8.721 8.721 multiply_cannon 30 9.1 0.009 0.010 8.531 8.536 multiply_cannon_loop 30 10.1 0.004 0.004 8.475 8.480 scf_env_do_scf 1 3.0 0.000 0.000 8.411 8.411 scf_env_do_scf_inner_loop 17 4.0 0.001 0.001 8.411 8.411 compute_mat_P_omega_copy_M_vir 250 9.0 0.002 0.002 7.397 7.420 convert_to_new_pgrid 2421 14.1 0.037 0.037 5.539 6.977 dbm_copy 1614 15.1 5.502 6.939 5.502 6.939 compute_mat_P_omega_copy_M_occ 250 9.0 0.001 0.001 6.752 6.754 multiply_cannon_multrec 60 11.1 0.263 0.266 6.213 6.247 contract_cubic_gw 21 9.0 0.000 0.000 6.170 6.170 dbcsr_mm_accdrv_process 328 12.3 0.024 0.025 5.682 5.725 jit_kernel_multiply 17 11.6 5.651 5.694 5.651 5.694 dbt_tas_copy 511 11.5 2.495 2.520 5.501 5.564 mp_waitall_2 2656 15.9 5.362 5.377 5.362 5.377 mp_sync 8688 11.6 3.536 5.242 3.536 5.242 qs_scf_new_mos 17 5.0 0.000 0.000 4.805 4.837 dbt_communicate_buffer 594 12.8 0.010 0.010 4.629 4.666 get_2c_integrals 1 6.0 0.000 0.000 4.065 4.065 calculate_dm_sparse 17 6.0 0.000 0.000 3.548 3.570 cp_dbcsr_plus_fm_fm_t_native 17 7.0 0.000 0.000 3.548 3.570 compute_vec_Sigma_x_minus_vxc_ 1 4.0 0.000 0.000 3.337 3.337 compute_2c_integrals 1 7.0 0.000 0.000 3.210 3.210 dbt_tas_reserve_blocks_templat 551 12.6 0.045 0.045 3.123 3.161 qs_ks_build_kohn_sham_matrix 18 6.9 0.002 0.002 3.138 3.147 qs_ks_update_qs_env 17 5.0 0.000 0.000 3.098 3.108 rebuild_ks_matrix 17 6.0 0.000 0.000 3.094 3.104 trace_sigma_gw 21 9.0 0.499 0.509 3.021 3.021 build_3c_integrals 5 6.0 1.925 1.930 2.494 2.655 ------------------------------------------------------------------------------- Plot: name="GW_PBE_4benzene_timings_6cpu_1gpu", title="Timings of GW_PBE_4benzene with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="rest", label="rest", y=64.54299999999999, yerr=0.0 PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="dbm_multiply", label="dbm_multiply", y=30.085, yerr=0.0 PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="dbt_reshape", label="dbt_reshape", y=10.071, yerr=0.0 PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="dbm_reserve_blocks", label="dbm_reserve_blocks", y=9.337, yerr=0.0 PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="dbt_crop", label="dbt_crop", y=8.044, yerr=0.0 PlotPoint: plot="GW_PBE_4benzene_timings_6cpu_1gpu", name="compute_mat_P_omega_calc_M_occ", label="compute_mat_P_omega_calc_M_occ", y=6.054, yerr=0.0 Running RI-HFX_H2O-32.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/RI-HFX_H2O-32_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.023 0.025 238.683 238.684 qs_forces 1 2.0 0.000 0.000 238.169 238.169 rebuild_ks_matrix 7 6.6 0.000 0.000 234.050 234.050 qs_ks_build_kohn_sham_matrix 7 7.6 0.001 0.001 234.050 234.050 hfx_ks_matrix 7 8.6 0.000 0.000 229.465 229.466 dbt_total 849 11.0 0.009 0.009 162.153 162.153 hfx_ri_update_ks 7 9.6 0.000 0.000 144.779 144.779 hfx_ri_update_ks_Pmat 7 10.6 32.345 32.354 144.764 144.768 qs_energies 1 3.0 0.000 0.000 134.330 134.330 scf_env_do_scf 1 4.0 0.000 0.000 132.698 132.698 qs_ks_update_qs_env 8 6.0 0.000 0.000 130.278 130.279 qs_ks_update_qs_env_forces 1 3.0 0.000 0.000 103.774 103.774 dbt_contract 207 12.4 0.051 0.051 98.697 98.706 hfx_ri_update_forces 1 7.0 1.216 1.217 84.685 84.685 scf_env_do_scf_inner_loop 6 5.0 0.000 0.001 74.901 74.901 dbt_tas_total 369 13.4 3.314 3.337 74.832 74.832 dbt_tas_multiply 216 13.5 0.001 0.001 71.436 71.436 dbt_copy 423 11.8 0.046 0.047 58.435 58.971 init_scf_loop 2 5.0 0.000 0.000 57.795 57.796 hfx_ri_forces_Pmat_3c 1 8.0 3.647 3.647 48.262 48.272 dbt_tas_dbm 216 15.5 0.001 0.001 47.495 47.495 dbm_multiply 216 17.5 45.433 45.974 45.433 45.974 dbt_reshape 175 13.2 22.660 23.007 41.945 42.148 hfx_ri_update_ks_Pmat_KS 63 11.6 0.001 0.001 41.529 41.529 dbt_tas_reserve_blocks_index 1323 15.4 11.140 11.177 30.410 30.618 precalc_derivatives 1 8.0 2.232 2.247 29.177 29.177 dbt_crop 372 13.7 15.999 16.035 25.637 25.643 dbt_reserve_blocks_index 889 14.5 0.337 0.338 23.956 23.983 dbt_reserve_blocks_index_array 859 13.5 0.006 0.007 23.597 23.625 hfx_ri_update_ks_Pmat_Px3C 63 11.6 0.000 0.000 21.873 21.874 dbm_reserve_blocks 1491 16.3 20.097 20.304 20.097 20.304 hfx_ri_pre_scf_Pmat 1 12.0 0.000 0.000 19.121 19.121 dbt_tas_mm_2 91 16.5 0.001 0.001 18.900 18.900 dbt_tas_mm_3T 77 17.1 0.000 0.000 17.803 18.192 hfx_ri_update_ks_Pmat_copy_2 63 11.6 0.000 0.000 16.442 16.442 mp_waitall_2 1022 16.5 16.005 16.321 16.005 16.321 build_3c_derivatives 3 9.0 4.041 4.054 15.819 15.821 dbt_communicate_buffer 175 14.2 0.004 0.004 13.263 13.443 dbt_tas_copy 248 12.5 4.286 4.360 10.499 10.801 convert_to_new_pgrid 648 15.5 0.015 0.016 9.103 9.189 dbm_copy 459 16.3 9.088 9.173 9.088 9.173 dbt_tas_mm_3N 37 15.4 0.000 0.000 8.896 8.918 mp_sync 2901 12.8 6.579 7.660 6.579 7.660 dbt_tas_reserve_blocks_templat 266 13.6 0.071 0.072 6.329 6.555 hfx_ri_pre_scf_Pmat_copy_2 9 13.0 2.999 3.004 6.231 6.236 hfx_ri_pre_scf_Pmat_int 1 13.0 0.000 0.000 5.148 5.148 dbt_tas_replicate 168 15.1 2.424 2.490 4.967 4.977 ------------------------------------------------------------------------------- Plot: name="RI-HFX_H2O-32_timings_6cpu_1gpu", title="Timings of RI-HFX_H2O-32 with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="rest", label="rest", y=102.143, yerr=0.0 PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="dbm_multiply", label="dbm_multiply", y=45.433, yerr=0.0 PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="hfx_ri_update_ks_Pmat", label="hfx_ri_update_ks_Pmat", y=32.345, yerr=0.0 PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="dbt_reshape", label="dbt_reshape", y=22.66, yerr=0.0 PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="dbm_reserve_blocks", label="dbm_reserve_blocks", y=20.097, yerr=0.0 PlotPoint: plot="RI-HFX_H2O-32_timings_6cpu_1gpu", name="mp_waitall_2", label="mp_waitall_2", y=16.005, yerr=0.0 Running RI-MP2_ammonia.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/RI-MP2_ammonia_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.014 0.016 112.665 112.666 qs_energies 1 2.0 0.000 0.000 112.445 112.445 mp2_main 1 3.0 0.000 0.000 104.974 104.974 mp2_gpw_main 1 4.0 0.001 0.002 104.571 104.571 mp2_ri_gpw_compute_in 1 5.0 0.542 0.542 62.558 62.583 mp2_ri_gpw_compute_in_loop 1 6.0 0.012 0.012 51.692 51.717 mp2_ri_gpw_compute_en 1 5.0 0.097 0.097 41.952 41.977 mp2_ri_gpw_compute_en_RI_loop 1 6.0 12.781 12.800 39.352 39.353 mp2_eri_3c_integrate_gpw 1328 7.0 0.015 0.015 25.597 28.486 dbcsr_multiply_generic 2666 8.0 0.144 0.148 22.416 25.257 ao_to_mo_and_store_B_mult_1 1328 7.0 0.011 0.012 20.469 23.310 multiply_cannon 2666 9.0 0.392 0.413 13.937 16.748 mp2_ri_gpw_compute_en_expansio 1040 7.0 0.710 0.710 16.071 16.169 local_gemm 1040 8.0 15.362 15.459 15.362 15.459 multiply_cannon_loop 2666 10.0 0.160 0.160 12.615 15.411 calculate_wavefunction 1328 8.0 7.674 7.689 12.084 14.792 fft_wrap_pw1pw2 26668 10.4 0.155 0.158 9.329 12.061 integrate_v_rspace 1338 8.0 1.003 1.008 11.343 11.568 get_2c_integrals 1 6.0 0.002 0.004 10.307 10.323 compute_2c_integrals 1 7.0 0.005 0.006 9.612 9.612 compute_2c_integrals_loop_lm 1 8.0 0.008 0.011 9.474 9.517 mp2_eri_2c_integrate_gpw 1 9.0 3.096 3.126 9.466 9.512 grid_integrate_task_list 1338 9.0 9.187 9.400 9.187 9.400 multiply_cannon_multrec 2676 11.0 3.824 5.460 7.282 8.941 pw_gpu_r3dc1d_3d 13282 12.2 5.950 8.687 5.950 8.687 make_m2s 5332 9.0 0.053 0.055 7.787 7.810 fft_wrap_pw1pw2_20 10647 11.4 0.017 0.017 5.823 7.774 make_images 5332 10.0 2.750 2.757 7.596 7.622 scf_env_do_scf 1 3.0 0.000 0.000 6.739 6.741 scf_env_do_scf_inner_loop 10 4.0 0.001 0.001 6.739 6.741 qs_scf_new_mos 10 5.0 0.000 0.000 5.360 5.364 ao_to_mo_and_store_B_E_Ex_1 1328 7.0 3.392 3.423 5.267 5.322 calc_potential_gpw 2656 9.5 0.015 0.015 4.881 4.951 mp2_ri_gpw_compute_en_ener 1040 7.0 4.903 4.934 4.903 4.934 mp2_ri_gpw_compute_en_comm 221 7.0 1.028 1.028 4.505 4.591 pw_poisson_solve 2666 10.5 0.026 0.026 4.100 4.171 mp2_eri_2c_integrate_gpw_pot_l 1328 10.0 0.004 0.004 3.874 3.921 fft_wrap_pw1pw2_10 15957 11.5 0.020 0.021 2.864 3.646 multiply_cannon_sync_h2d 2676 11.0 2.463 3.580 2.463 3.580 pw_poisson_set 2669 11.5 0.035 0.036 3.334 3.398 hybrid_alltoall_any 6683 11.6 2.940 2.970 3.202 3.231 dbcsr_mm_accdrv_process 5392 12.0 0.264 0.265 3.219 3.227 pw_derive 7998 12.5 3.088 3.149 3.088 3.149 make_images_data 5332 11.0 0.075 0.076 3.122 3.148 eigensolver 11 5.8 0.001 0.001 3.070 3.070 jit_kernel_multiply 8 13.0 2.852 2.861 2.852 2.861 pw_gpu_c1dr3d_3d 13280 12.7 2.668 2.673 2.668 2.673 mp_sendrecv_dm3 442 8.0 2.518 2.607 2.518 2.607 potential_pw2rs 2666 10.0 0.108 0.108 2.548 2.550 cp_fm_diag_elpa 11 6.8 0.000 0.000 2.386 2.386 cp_fm_diag_elpa_base 11 7.8 2.252 2.267 2.384 2.385 copy_dbcsr_to_fm 1351 8.0 0.026 0.027 2.278 2.301 ------------------------------------------------------------------------------- Plot: name="RI-MP2_ammonia_timings_6cpu_1gpu", title="Timings of RI-MP2_ammonia with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="rest", label="rest", y=61.711000000000006, yerr=0.0 PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="local_gemm", label="local_gemm", y=15.362, yerr=0.0 PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="mp2_ri_gpw_compute_en_RI_loop", label="mp2_ri_gpw_compute_en_RI_loop", y=12.781, yerr=0.0 PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="grid_integrate_task_list", label="grid_integrate_task_list", y=9.187, yerr=0.0 PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="calculate_wavefunction", label="calculate_wavefunction", y=7.674, yerr=0.0 PlotPoint: plot="RI-MP2_ammonia_timings_6cpu_1gpu", name="pw_gpu_r3dc1d_3d", label="pw_gpu_r3dc1d_3d", y=5.95, yerr=0.0 Running diag_cu144_broy.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/diag_cu144_broy_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.077 0.079 208.541 208.542 qs_energies 1 2.0 0.000 0.000 207.242 207.243 scf_env_do_scf 1 3.0 0.000 0.000 191.422 191.422 scf_env_do_scf_inner_loop 15 4.0 0.001 0.002 191.421 191.422 qs_ks_update_qs_env 15 5.0 0.000 0.000 108.246 108.296 rebuild_ks_matrix 15 6.0 0.000 0.000 108.065 108.115 qs_ks_build_kohn_sham_matrix 15 7.0 0.002 0.002 108.065 108.114 qs_vxc_create 15 8.0 0.035 0.069 70.240 70.295 calculate_dispersion_nonloc 15 9.0 13.046 13.067 57.839 57.863 qs_scf_new_mos 15 5.0 0.000 0.001 56.432 56.507 eigensolver 15 6.0 0.002 0.002 47.297 47.348 sum_up_and_integrate 15 8.0 0.000 0.000 35.533 35.641 integrate_v_rspace 15 9.0 0.048 0.049 35.505 35.615 grid_integrate_task_list 15 10.0 33.930 33.981 33.930 33.981 fft_wrap_pw1pw2 1086 10.0 0.024 0.025 32.706 32.775 cp_fm_diag_elpa 15 7.0 0.000 0.000 28.047 28.052 cp_fm_diag_elpa_base 15 8.0 25.390 26.040 28.041 28.042 qs_rho_update_rho_low 16 5.0 0.000 0.000 23.221 23.221 calculate_rho_elec 16 6.0 0.182 0.182 23.221 23.221 fft_wrap_pw1pw2_150 765 11.0 0.004 0.004 22.249 22.267 grid_collocate_task_list 16 7.0 20.337 20.361 20.337 20.361 pw_gpu_c1dr3d_3d_ps 585 12.1 5.497 5.584 17.738 17.749 cp_fm_cholesky_restore 45 7.0 16.523 17.317 16.523 17.317 vdW_energy 15 10.0 17.217 17.273 17.217 17.273 pw_gpu_r3dc1d_3d_ps 501 11.9 4.844 5.185 14.937 14.995 qs_energies_init_hamiltonians 1 3.0 0.000 0.000 12.873 12.873 xc_vxc_pw_create 15 9.0 0.190 0.192 12.366 12.370 build_core_hamiltonian_matrix 1 4.0 0.000 0.000 11.245 11.279 fft_wrap_pw1pw2_200 197 11.3 0.001 0.001 9.838 9.839 xc_rho_set_and_dset_create 15 10.0 0.141 0.143 8.404 8.413 mp_alltoall_z22v 1086 14.0 6.817 7.347 6.817 7.347 cp_fm_upper_to_full 30 8.0 5.376 6.764 5.376 6.764 copy_dbcsr_to_fm 16 5.9 0.001 0.001 5.885 5.971 xc_pw_derive 90 11.0 0.001 0.001 5.949 5.961 build_core_ppnl 1 5.0 5.774 5.801 5.774 5.801 dbcsr_complete_redistribute 46 8.3 1.762 1.778 5.255 5.279 xc_functional_eval 15 11.0 0.000 0.000 5.180 5.188 pbe_lda_eval 15 12.0 5.180 5.188 5.180 5.188 x_to_yz 585 13.1 1.285 1.293 4.926 5.026 gspace_mixing 14 5.0 0.131 0.131 4.808 4.808 yz_to_x 501 12.9 0.880 0.884 4.057 4.473 pw_gpu_sf 585 13.1 4.198 4.201 4.198 4.201 ------------------------------------------------------------------------------- Plot: name="diag_cu144_broy_timings_6cpu_1gpu", title="Timings of diag_cu144_broy with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="rest", label="rest", y=95.144, yerr=0.0 PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="grid_integrate_task_list", label="grid_integrate_task_list", y=33.93, yerr=0.0 PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="cp_fm_diag_elpa_base", label="cp_fm_diag_elpa_base", y=25.39, yerr=0.0 PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="grid_collocate_task_list", label="grid_collocate_task_list", y=20.337, yerr=0.0 PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="vdW_energy", label="vdW_energy", y=17.217, yerr=0.0 PlotPoint: plot="diag_cu144_broy_timings_6cpu_1gpu", name="cp_fm_cholesky_restore", label="cp_fm_cholesky_restore", y=16.523, yerr=0.0 Running bench_dftb.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/bench_dftb_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.043 0.044 293.188 293.188 qs_energies 1 2.0 0.000 0.000 293.071 293.071 ls_scf 1 3.0 0.000 0.000 292.187 292.188 ls_scf_main 1 4.0 0.001 0.001 281.306 281.307 density_matrix_trs4 11 5.0 0.008 0.010 237.204 237.255 dbcsr_multiply_generic 185 6.1 0.160 0.185 204.562 204.587 multiply_cannon 185 7.1 1.642 1.681 152.044 152.573 multiply_cannon_loop 185 8.1 0.262 0.267 133.476 133.795 multiply_cannon_multrec 370 9.1 100.683 100.822 117.537 117.544 make_m2s 370 7.1 0.042 0.042 43.974 44.066 make_images 370 8.1 15.329 15.730 42.776 42.864 ls_scf_dm_to_ks 11 5.0 0.000 0.000 39.717 39.757 matrix_ls_to_qs 11 6.0 0.000 0.000 37.424 37.498 dbcsr_complete_redistribute 23 7.5 22.908 22.993 32.098 32.188 matrix_decluster 11 7.0 0.000 0.000 29.459 29.548 arnoldi_extremal 12 6.1 0.000 0.000 22.627 22.629 arnoldi_normal_ev 12 7.1 0.140 0.141 22.627 22.629 build_subspace 23 8.1 0.064 0.065 21.988 21.989 dbcsr_matrix_vector_mult 652 9.0 0.147 0.148 20.481 20.559 dbcsr_matrix_vector_mult_local 652 10.0 19.460 19.540 19.466 19.546 dbcsr_finalize 646 7.5 0.192 0.195 14.844 15.193 calculate_norms 740 9.1 14.568 14.897 14.568 14.897 dbcsr_mm_accdrv_process 14501 10.0 0.941 1.040 14.626 14.719 dbcsr_merge_all 597 8.5 2.288 2.587 13.579 13.929 setup_rec_index_2d 370 8.1 13.860 13.926 13.860 13.926 dbcsr_mm_accdrv_process_sort 14501 11.0 13.555 13.618 13.555 13.618 dbcsr_special_finalize 555 9.1 0.008 0.008 10.619 10.692 dbcsr_sort_indices 1287 10.0 10.408 10.499 10.408 10.499 make_images_data 370 9.1 0.012 0.012 9.570 10.024 hybrid_alltoall_any 393 9.9 4.177 4.230 8.905 9.368 ls_scf_init_scf 1 4.0 0.000 0.000 9.151 9.152 ls_scf_init_matrix_S 1 5.0 0.000 0.000 8.910 8.913 dbcsr_add_d 280 6.0 0.001 0.001 8.034 8.373 dbcsr_add_anytype 280 7.0 3.697 3.713 8.033 8.372 matrix_sqrt_Newton_Schulz 1 6.0 0.000 0.001 8.141 8.144 dbcsr_copy_into_existing 11 7.0 7.965 7.980 7.965 7.980 tree_to_linear_d 110 9.4 7.838 7.847 7.838 7.847 dbcsr_merge_single_wm 370 10.1 0.511 0.525 6.545 6.555 ------------------------------------------------------------------------------- Plot: name="bench_dftb_timings_6cpu_1gpu", title="Timings of bench_dftb with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="rest", label="rest", y=120.23999999999995, yerr=0.0 PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="multiply_cannon_multrec", label="multiply_cannon_multrec", y=100.683, yerr=0.0 PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="dbcsr_complete_redistribute", label="dbcsr_complete_redistribute", y=22.908, yerr=0.0 PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="dbcsr_matrix_vector_mult_local", label="dbcsr_matrix_vector_mult_local", y=19.46, yerr=0.0 PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="make_images", label="make_images", y=15.329, yerr=0.0 PlotPoint: plot="bench_dftb_timings_6cpu_1gpu", name="calculate_norms", label="calculate_norms", y=14.568, yerr=0.0 Running dbcsr.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/dbcsr_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.004 0.006 68.002 68.003 lib_test 1 2.0 0.000 0.000 67.980 67.995 dbcsr_run_tests 3 3.0 0.001 0.001 67.980 67.993 test_multiplies_multiproc 3 4.0 0.001 0.001 55.828 55.884 dbcsr_multiply_generic 9 5.0 0.001 0.001 41.495 41.515 multiply_cannon 9 6.0 0.271 0.345 33.232 33.431 multiply_cannon_loop 9 7.0 0.003 0.003 31.723 31.804 multiply_cannon_multrec 18 8.0 16.627 16.682 29.604 29.687 dbcsr_mm_accdrv_process 8199 9.0 1.278 1.457 12.745 12.769 dbcsr_make_random_matrix 9 4.0 8.874 8.917 12.026 12.081 dbcsr_redistribute 9 5.0 7.649 7.653 9.859 9.868 dbcsr_mm_accdrv_process_sort 8199 10.0 8.910 8.916 8.910 8.916 dbcsr_finalize 27 5.7 0.001 0.001 7.292 7.294 dbcsr_merge_all 18 6.5 3.643 3.654 7.168 7.173 make_m2s 18 6.0 0.001 0.001 3.600 3.603 make_images 18 7.0 0.513 0.515 3.562 3.565 dbcsr_checksum 6 5.0 2.858 2.893 2.895 2.895 jit_kernel_multiply 8 10.0 2.556 2.753 2.556 2.753 tree_to_linear_d 9 7.0 1.923 1.937 1.923 1.937 mp_alltoall_d11v 27 6.0 1.908 1.913 1.908 1.913 dbcsr_data_release 507 7.7 1.491 1.508 1.491 1.508 dbcsr_data_copy_aa2 18 7.5 1.451 1.453 1.451 1.453 ------------------------------------------------------------------------------- Plot: name="dbcsr_timings_6cpu_1gpu", title="Timings of dbcsr with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="rest", label="rest", y=22.298999999999992, yerr=0.0 PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="multiply_cannon_multrec", label="multiply_cannon_multrec", y=16.627, yerr=0.0 PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="dbcsr_mm_accdrv_process_sort", label="dbcsr_mm_accdrv_process_sort", y=8.91, yerr=0.0 PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="dbcsr_make_random_matrix", label="dbcsr_make_random_matrix", y=8.874, yerr=0.0 PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="dbcsr_redistribute", label="dbcsr_redistribute", y=7.649, yerr=0.0 PlotPoint: plot="dbcsr_timings_6cpu_1gpu", name="dbcsr_merge_all", label="dbcsr_merge_all", y=3.643, yerr=0.0 Running MQAE_single_node.inp with 3 threads and 2 ranks... done. From /workspace/artifacts/MQAE_single_node_6cpu_1gpu.out: ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.055 0.057 327.878 327.879 qs_mol_dyn_low 1 2.0 0.005 0.005 326.010 326.069 velocity_verlet 5 3.0 0.004 0.004 183.493 183.580 qs_forces 6 3.8 0.001 0.001 164.512 164.512 qs_energies 6 4.8 0.001 0.001 155.359 155.360 scf_env_do_scf 6 5.8 0.001 0.001 145.219 145.219 scf_env_do_scf_inner_loop 113 6.2 0.005 0.008 134.549 134.550 rebuild_ks_matrix 119 8.1 0.000 0.000 119.404 119.407 qs_ks_build_kohn_sham_matrix 119 9.1 0.018 0.018 119.403 119.407 qs_ks_update_qs_env 119 7.3 0.001 0.001 113.425 113.429 qmmm_forces 6 3.8 0.002 0.002 80.409 80.409 qmmm_forces_with_gaussian 6 4.8 0.023 0.023 79.793 80.033 qs_vxc_create 119 10.1 0.002 0.002 78.802 78.804 xc_vxc_pw_create 119 11.1 1.638 1.676 78.800 78.803 qmmm_el_coupling 6 3.8 0.000 0.000 77.514 77.522 qmmm_elec_with_gaussian 6 4.8 0.022 0.022 77.508 77.516 qmmm_force_with_gaussian_low 6 5.8 0.000 0.000 76.457 76.703 qmmm_elec_with_gaussian_low 6 5.8 0.000 0.000 73.599 74.049 fft_wrap_pw1pw2 2059 12.4 0.043 0.045 65.390 65.614 qmmm_elec_gaussian_low_G 6 6.8 64.549 65.011 64.549 65.011 fft_wrap_pw1pw2_150 1321 13.9 0.009 0.009 63.235 63.465 xc_pw_derive 714 13.1 0.009 0.010 49.091 49.186 qmmm_forces_gaussian_low_G 6 6.8 47.831 48.010 47.831 48.010 xc_rho_set_and_dset_create 119 12.1 4.323 4.324 45.907 45.946 pw_gpu_c1dr3d_3d_ps 1095 14.8 10.761 10.946 35.263 35.295 xc_pw_divergence 119 12.1 0.006 0.006 30.746 30.760 pw_gpu_r3dc1d_3d_ps 964 14.0 9.483 9.600 30.072 30.262 qmmm_forces_gaussian_low_R 6 6.8 0.000 0.000 28.626 28.693 qmmm_forces_with_gaussian_LG 6 7.8 28.626 28.693 28.626 28.693 pw_derive 1089 13.4 21.945 22.208 21.945 22.208 qs_rho_update_rho_low 119 7.3 0.001 0.001 20.066 20.295 calculate_rho_elec 119 8.3 1.190 1.231 20.066 20.294 dbcsr_multiply_generic 2588 12.3 0.105 0.106 18.041 18.066 xc_functional_eval 238 13.1 0.002 0.002 16.368 16.368 multiply_cannon 2588 13.3 0.248 0.249 16.195 16.315 sum_up_and_integrate 119 10.1 0.002 0.002 16.078 16.199 density_rs2pw 119 9.3 0.008 0.008 15.875 16.087 integrate_v_rspace 119 11.1 0.022 0.022 15.868 15.990 multiply_cannon_loop 2588 14.3 0.229 0.233 15.600 15.722 mp_alltoall_z22v 2059 16.4 13.589 14.191 13.589 14.191 pw_poisson_solve 125 9.9 0.003 0.003 11.922 11.964 multiply_cannon_multrec 5176 15.3 4.383 4.436 11.591 11.715 init_scf_loop 6 6.8 0.000 0.000 10.666 10.666 lyp_lda_eval 119 14.1 10.315 10.321 10.315 10.321 x_to_yz 1095 15.8 2.724 2.739 9.923 10.217 qs_ks_ddapc 119 10.1 0.002 0.002 10.028 10.081 potential_pw2rs 119 12.1 0.035 0.035 9.680 9.684 qmmm_elec_gaussian_low_R 6 6.8 0.000 0.000 9.050 9.062 qmmm_elec_with_gaussian_LG 6 7.8 9.050 9.062 9.050 9.062 qs_scf_new_mos 113 7.2 0.001 0.001 8.501 8.504 qs_scf_loop_do_ot 113 8.2 0.001 0.001 8.500 8.504 yz_to_x 964 15.0 1.777 1.785 8.167 8.451 pw_gpu_sf 1095 15.8 8.368 8.407 8.368 8.407 ot_scf_mini 113 9.2 0.002 0.002 8.187 8.191 pw_poisson_set 128 10.8 0.005 0.005 8.015 8.056 pw_gpu_fg 964 15.0 7.790 7.797 7.790 7.797 init_scf_run 6 5.8 0.000 0.000 7.583 7.583 scf_env_initial_rho_setup 6 6.8 0.000 0.000 7.583 7.583 dbcsr_mm_accdrv_process 13832 16.0 1.668 1.751 7.133 7.201 ------------------------------------------------------------------------------- Plot: name="MQAE_single_node_timings_6cpu_1gpu", title="Timings of MQAE_single_node with 6 CPU Cores and 1 GPU", ylabel="time [s]" PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="rest", label="rest", y=151.338, yerr=0.0 PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="qmmm_elec_gaussian_low_G", label="qmmm_elec_gaussian_low_G", y=64.549, yerr=0.0 PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="qmmm_forces_gaussian_low_G", label="qmmm_forces_gaussian_low_G", y=47.831, yerr=0.0 PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="qmmm_forces_with_gaussian_LG", label="qmmm_forces_with_gaussian_LG", y=28.626, yerr=0.0 PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="pw_derive", label="pw_derive", y=21.945, yerr=0.0 PlotPoint: plot="MQAE_single_node_timings_6cpu_1gpu", name="mp_alltoall_z22v", label="mp_alltoall_z22v", y=13.589, yerr=0.0 Summary: Performance test took 27 minutes. Status: OK Removing intermediate container 4e2b3835c5de ---> 3da8721dc66e Step 45/46 : CMD cat $(find ./report.log -mmin +10) | sed '/^Summary:/ s/$/ (cached)/' ---> Running in d464d7c12238 Removing intermediate container d464d7c12238 ---> 934c9e86d4a7 Step 46/46 : ENTRYPOINT [] ---> Running in d7637065ee0f Removing intermediate container d7637065ee0f ---> e0a741ad3bb0 [Warning] One or more build-args [GIT_COMMIT_SHA] were not consumed Successfully built e0a741ad3bb0 Successfully tagged us-central1-docker.pkg.dev/cp2k-org-project/cp2kci/img_cp2k-perf-cuda-volta:master Pushing new image... done. #################### Running Image cp2k-perf-cuda-volta #################### Uploading artifacts... done EndDate: 2024-03-06 09:07:26+00:00