=== This is the CP2K Performance-Test === Already up to date. Current branch master is up to date. Already up to date. Current branch master is up to date. GIT Revision: cb0f57ca944f9b314758deba717f9b13e7d43235 ################# ARCHITECTURE FILE ################## #!/bin/bash # # CP2K arch file for Cray-XC50 (Piz Daint, CSCS, GPU partition) # # Tested with: GNU 9.3.0, Cray-MPICH 7.7.18, Cray-libsci 20.09.1, Cray-FFTW 3.3.8.10, # COSMA 2.6.2, ELPA 2021.11.002, LIBINT 2.6.0, LIBPEXSI 1.2.0, # LIBXC 6.0.0, LIBVORI 220621, LIBXSMM 1.17, PLUMED 2.8.0, # SIRIUS 7.3.2, SPGLIB 1.16.2 # # Usage: Source this arch file and then run make as instructed. # A full toolchain installation is performed as default. # Replace or adapt the "module add" commands below if needed. # # Author: Matthias Krack (19.10.2022) # # \ if [ "${0}" = "${BASH_SOURCE}" ]; then \ echo "ERROR: Script ${0##*/} must be sourced"; \ echo "Usage: source ${0##*/}"; \ exit 1; \ fi; \ this_file=${BASH_SOURCE##*/}; \ if [ -n "${1}" ]; then \ gcc_version="${1}"; \ else \ gcc_version="9.3.0"; \ fi; \ module add daint-gpu; \ module rm PrgEnv-cray; \ module add PrgEnv-gnu; \ module rm gcc; \ module add gcc/${gcc_version}; \ module add cray-fftw/3.3.8.10; \ module add cudatoolkit; \ echo "Expected setup:"; \ echo " cray-mpich/7.7.18"; \ echo " craype-haswell"; \ echo " daint-gpu/21.09"; \ echo " craype/2.7.10"; \ echo " cray-libsci/20.09.1"; \ echo " PrgEnv-gnu/6.0.10"; \ echo " gcc/${gcc_version}"; \ echo " cray-fftw/3.3.8.10"; \ echo " cudatoolkit/11.0.2_3.38-8.1__g5b73779"; \ module list; \ module -f save cp2k_gpu_gnu_psmp; \ echo "To load the required modules in your batch job script, use:"; \ echo " module restore cp2k_gpu_gnu_psmp"; \ cd tools/toolchain; \ ./install_cp2k_toolchain.sh --enable-cuda=yes --gpu-ver=P100 --no-arch-files --with-gcc=system --with-libvdwxc --with-pexsi --with-plumed; \ cd ../..; \ printf "Sourcing ${PWD}/tools/toolchain/install/setup ... "; \ source ${PWD}/tools/toolchain/install/setup; \ printf "done\n"; \ echo "Check the output above for error messages and consistency!"; \ echo "If everything is OK, you can build a CP2K production binary with"; \ echo " make -j ARCH=${this_file%.*} VERSION=${this_file##*.}"; \ echo "Alternatively, you can add further checks, e.g. for regression testing, with"; \ echo " make -j ARCH=${this_file%.*} VERSION=${this_file##*.} DO_CHECKS=yes"; \ echo "or build CP2K as a library with"; \ echo " make -j ARCH=${this_file%.*} VERSION=${this_file##*.} libcp2k"; \ return # Set options DO_CHECKS := no USE_ACC := yes USE_COSMA := 2.6.2 USE_ELPA := 2021.11.002 USE_LIBINT := 2.6.0 USE_LIBPEXSI := 1.2.0 USE_LIBVORI := 220621 USE_LIBXC := 6.0.0 USE_LIBXSMM := 1.17 USE_PLUMED := 2.8.0 #USE_QUIP := b4336484fb65b0e73211a8f920ae4361c7c353fd USE_SIRIUS := 7.3.2 USE_SPGLIB := 1.16.2 # Only needed for SIRIUS LIBVDWXC_VER := 0.4.0 SPFFT_VER := 1.0.6 SPLA_VER := 1.5.4 HDF5_VER := 1.12.0 # Only needed for LIBPEXSI SCOTCH_VER := 6.0.0 SUPERLU_VER := 6.1.0 LMAX := 5 MAX_CONTR := 4 GPUVER := P100 OFFLOAD_TARGET := cuda CC := cc CXX := CC OFFLOAD_CC := nvcc FC := ftn LD := ftn AR := ar -r # cc, CC, and ftn include already the proper -march flag CFLAGS := -O2 -fopenmp -fopenmp-simd -ftree-vectorize -funroll-loops -g DFLAGS := -D__parallel DFLAGS += -D__SCALAPACK DFLAGS += -D__FFTW3 DFLAGS += -D__MPI_VERSION=3 DFLAGS += -D__MAX_CONTR=$(strip $(MAX_CONTR)) INSTALL_PATH := $(PWD)/tools/toolchain/install ifeq ($(DO_CHECKS), yes) DFLAGS += -D__CHECK_DIAG endif ifeq ($(USE_ACC), yes) DFLAGS += -D__DBCSR_ACC DFLAGS += -D__OFFLOAD_CUDA # Possibly no performance gain with PW_CUDA currently DFLAGS += -D__NO_OFFLOAD_PW endif ifneq ($(USE_PLUMED),) USE_PLUMED := $(strip $(USE_PLUMED)) PLUMED_LIB := $(INSTALL_PATH)/plumed-$(USE_PLUMED)/lib DFLAGS += -D__PLUMED2 USE_GSL := 2.7 LIBS += $(PLUMED_LIB)/libplumed.a endif ifneq ($(USE_ELPA),) USE_ELPA := $(strip $(USE_ELPA)) TARGET := nvidia ELPA_INC := $(INSTALL_PATH)/elpa-$(USE_ELPA)/$(TARGET)/include/elpa-$(USE_ELPA) ELPA_LIB := $(INSTALL_PATH)/elpa-$(USE_ELPA)/$(TARGET)/lib CFLAGS += -I$(ELPA_INC)/elpa -I$(ELPA_INC)/modules DFLAGS += -D__ELPA ifeq ($(TARGET), nvidia) DFLAGS += -D__ELPA_NVIDIA_GPU endif LIBS += $(ELPA_LIB)/libelpa.a endif ifneq ($(USE_QUIP),) USE_QUIP := $(strip $(USE_QUIP)) QUIP_INC := $(INSTALL_PATH)/quip-$(USE_QUIP)/include QUIP_LIB := $(INSTALL_PATH)/quip-$(USE_QUIP)/lib CFLAGS += -I$(QUIP_INC) DFLAGS += -D__QUIP LIBS += $(QUIP_LIB)/libquip_core.a LIBS += $(QUIP_LIB)/libatoms.a LIBS += $(QUIP_LIB)/libFoX_sax.a LIBS += $(QUIP_LIB)/libFoX_common.a LIBS += $(QUIP_LIB)/libFoX_utils.a LIBS += $(QUIP_LIB)/libFoX_fsys.a endif ifneq ($(USE_LIBPEXSI),) USE_LIBPEXSI := $(strip $(USE_LIBPEXSI)) SCOTCH_VER := $(strip $(SCOTCH_VER)) SUPERLU_VER := $(strip $(SUPERLU_VER)) LIBPEXSI_INC := $(INSTALL_PATH)/pexsi-$(USE_LIBPEXSI)/include LIBPEXSI_LIB := $(INSTALL_PATH)/pexsi-$(USE_LIBPEXSI)/lib SCOTCH_INC := $(INSTALL_PATH)/scotch-$(SCOTCH_VER)/include SCOTCH_LIB := $(INSTALL_PATH)/scotch-$(SCOTCH_VER)/lib SUPERLU_INC := $(INSTALL_PATH)/superlu_dist-$(SUPERLU_VER)/include SUPERLU_LIB := $(INSTALL_PATH)/superlu_dist-$(SUPERLU_VER)/lib CFLAGS += -I$(LIBPEXSI_INC) -I$(SCOTCH_INC) -I$(SUPERLU_INC) DFLAGS += -D__LIBPEXSI LIBS += $(LIBPEXSI_LIB)/libpexsi.a LIBS += $(SUPERLU_LIB)/libsuperlu_dist.a LIBS += $(SCOTCH_LIB)/libptscotchparmetis.a LIBS += $(SCOTCH_LIB)/libptscotch.a LIBS += $(SCOTCH_LIB)/libptscotcherr.a LIBS += $(SCOTCH_LIB)/libscotchmetis.a LIBS += $(SCOTCH_LIB)/libscotch.a endif ifneq ($(USE_LIBVORI),) USE_LIBVORI := $(strip $(USE_LIBVORI)) LIBVORI_LIB := $(INSTALL_PATH)/libvori-$(USE_LIBVORI)/lib DFLAGS += -D__LIBVORI LIBS += $(LIBVORI_LIB)/libvori.a endif ifneq ($(USE_LIBXC),) USE_LIBXC := $(strip $(USE_LIBXC)) LIBXC_INC := $(INSTALL_PATH)/libxc-$(USE_LIBXC)/include LIBXC_LIB := $(INSTALL_PATH)/libxc-$(USE_LIBXC)/lib CFLAGS += -I$(LIBXC_INC) DFLAGS += -D__LIBXC LIBS += $(LIBXC_LIB)/libxcf03.a LIBS += $(LIBXC_LIB)/libxc.a endif ifneq ($(USE_LIBINT),) USE_LIBINT := $(strip $(USE_LIBINT)) LMAX := $(strip $(LMAX)) LIBINT_INC := $(INSTALL_PATH)/libint-v$(USE_LIBINT)-cp2k-lmax-$(LMAX)/include LIBINT_LIB := $(INSTALL_PATH)/libint-v$(USE_LIBINT)-cp2k-lmax-$(LMAX)/lib CFLAGS += -I$(LIBINT_INC) DFLAGS += -D__LIBINT LIBS += $(LIBINT_LIB)/libint2.a endif ifneq ($(USE_SPGLIB),) USE_SPGLIB := $(strip $(USE_SPGLIB)) SPGLIB_INC := $(INSTALL_PATH)/spglib-$(USE_SPGLIB)/include SPGLIB_LIB := $(INSTALL_PATH)/spglib-$(USE_SPGLIB)/lib CFLAGS += -I$(SPGLIB_INC) DFLAGS += -D__SPGLIB LIBS += $(SPGLIB_LIB)/libsymspg.a endif ifneq ($(USE_LIBXSMM),) USE_LIBXSMM := $(strip $(USE_LIBXSMM)) LIBXSMM_INC := $(INSTALL_PATH)/libxsmm-$(USE_LIBXSMM)/include LIBXSMM_LIB := $(INSTALL_PATH)/libxsmm-$(USE_LIBXSMM)/lib CFLAGS += -I$(LIBXSMM_INC) DFLAGS += -D__LIBXSMM LIBS += $(LIBXSMM_LIB)/libxsmmf.a LIBS += $(LIBXSMM_LIB)/libxsmm.a endif ifneq ($(USE_SIRIUS),) USE_SIRIUS := $(strip $(USE_SIRIUS)) HDF5_VER := $(strip $(HDF5_VER)) HDF5_LIB := $(INSTALL_PATH)/hdf5-$(HDF5_VER)/lib LIBVDWXC_VER := $(strip $(LIBVDWXC_VER)) LIBVDWXC_INC := $(INSTALL_PATH)/libvdwxc-$(LIBVDWXC_VER)/include LIBVDWXC_LIB := $(INSTALL_PATH)/libvdwxc-$(LIBVDWXC_VER)/lib SPFFT_VER := $(strip $(SPFFT_VER)) SPFFT_INC := $(INSTALL_PATH)/SpFFT-$(SPFFT_VER)/include SPLA_VER := $(strip $(SPLA_VER)) SPLA_INC := $(INSTALL_PATH)/SpLA-$(SPLA_VER)/include/spla ifeq ($(USE_ACC), yes) DFLAGS += -D__OFFLOAD_GEMM SPFFT_LIB := $(INSTALL_PATH)/SpFFT-$(SPFFT_VER)/lib/cuda SPLA_LIB := $(INSTALL_PATH)/SpLA-$(SPLA_VER)/lib/cuda SIRIUS_INC := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/include/cuda SIRIUS_LIB := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/lib/cuda else SPFFT_LIB := $(INSTALL_PATH)/SpFFT-$(SPFFT_VER)/lib SPLA_LIB := $(INSTALL_PATH)/SpLA-$(SPLA_VER)/lib SIRIUS_INC := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/include SIRIUS_LIB := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/lib endif CFLAGS += -I$(LIBVDWXC_INC) CFLAGS += -I$(SPFFT_INC) CFLAGS += -I$(SPLA_INC) CFLAGS += -I$(SIRIUS_INC) DFLAGS += -D__HDF5 DFLAGS += -D__LIBVDWXC DFLAGS += -D__SPFFT DFLAGS += -D__SPLA DFLAGS += -D__SIRIUS LIBS += $(SIRIUS_LIB)/libsirius.a LIBS += $(SPLA_LIB)/libspla.a LIBS += $(SPFFT_LIB)/libspfft.a LIBS += $(LIBVDWXC_LIB)/libvdwxc.a LIBS += $(HDF5_LIB)/libhdf5.a endif ifneq ($(USE_COSMA),) USE_COSMA := $(strip $(USE_COSMA)) ifeq ($(USE_ACC), yes) USE_COSMA := $(USE_COSMA)-cuda endif COSMA_INC := $(INSTALL_PATH)/COSMA-$(USE_COSMA)/include COSMA_LIB := $(INSTALL_PATH)/COSMA-$(USE_COSMA)/lib CFLAGS += -I$(COSMA_INC) DFLAGS += -D__COSMA LIBS += $(COSMA_LIB)/libcosma_prefixed_pxgemm.a LIBS += $(COSMA_LIB)/libcosma.a LIBS += $(COSMA_LIB)/libcosta_prefixed_scalapack.a LIBS += $(COSMA_LIB)/libcosta.a LIBS += $(COSMA_LIB)/libTiled-MM.a endif ifneq ($(USE_GSL),) USE_GSL := $(strip $(USE_GSL)) GSL_INC := $(INSTALL_PATH)/gsl-$(USE_GSL)/include GSL_LIB := $(INSTALL_PATH)/gsl-$(USE_GSL)/lib CFLAGS += -I$(GSL_INC) DFLAGS += -D__GSL LIBS += $(GSL_LIB)/libgsl.a endif CFLAGS += $(DFLAGS) CXXFLAGS := $(CFLAGS) -std=c++11 OFFLOAD_FLAGS := $(DFLAGS) -O3 -Xcompiler="-fopenmp" -arch sm_60 --std=c++11 FCFLAGS := $(CFLAGS) ifeq ($(shell [ $(shell gcc -dumpversion | cut -d. -f1) -gt 9 ] && echo yes), yes) FCFLAGS += -fallow-argument-mismatch endif FCFLAGS += -fbacktrace FCFLAGS += -ffree-form FCFLAGS += -ffree-line-length-none FCFLAGS += -fno-omit-frame-pointer FCFLAGS += -std=f2008 ifneq ($(CUDA_HOME),) CUDA_LIB := $(CUDA_HOME)/lib64 LDFLAGS := $(FCFLAGS) -L$(CUDA_LIB) -Wl,-rpath=$(CUDA_LIB) else LDFLAGS := $(FCFLAGS) endif LIBS += -lcusolver -lcudart -lnvrtc -lcuda -lcufft -lcublas -lrt LIBS += -lz -ldl -lpthread -lstdc++ # End ############### END ARCHITECTURE FILE ################ ===== TESTS (description) ===== ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 RI-RPA/RI-MP2 correlation energy input file: benchmarks/QS_mp2_rpa/32-H2O/RI-RPA.inp required files: ['benchmarks/QS_mp2_rpa/32-H2O/BASIS_H2O', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32.xyz', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-PBE-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-RI-dRPA-TZ.inp'] output file: result.log # nodes = 8 # ranks/node = 2 # threads/rank = 6 nrepeat = 1 time[min] = 15 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/01 job id: 42240555 --- Point --- name: 10 plot: h2o_32_ri_rpa_mp2 regex: Total RI-RPA Time= label: RI-RPA (8n/2r/6t) --- Point --- name: 11 plot: h2o_32_ri_rpa_mp2_mem regex: Estimated peak process memory label: RI-RPA (8n/2r/6t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 RI-RPA/RI-MP2 correlation energy input file: benchmarks/QS_mp2_rpa/32-H2O/RI-MP2.inp required files: ['benchmarks/QS_mp2_rpa/32-H2O/BASIS_H2O', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32.xyz', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-PBE-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-HF-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-RI-MP2-TZ.inp'] output file: result.log # nodes = 8 # ranks/node = 6 # threads/rank = 2 nrepeat = 1 time[min] = 15 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/02 job id: 42240556 --- Point --- name: 20 plot: h2o_32_ri_rpa_mp2 regex: Total MP2 Time= label: RI-MP2 (8n/6r/2t) --- Point --- name: 21 plot: h2o_32_ri_rpa_mp2_mem regex: Estimated peak process memory label: RI-MP2 (8n/6r/2t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 12 # threads/rank = 1 nrepeat = 1 time[min] = 5 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/03 job id: 42240558 --- Point --- name: 100 plot: h2o_64_md regex: CP2K label: (8n/12r/1t) --- Point --- name: 101 plot: h2o_64_md_mem regex: Estimated peak process memory label: (8n/12r/1t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 6 # threads/rank = 2 nrepeat = 1 time[min] = 5 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/04 job id: 42240559 --- Point --- name: 102 plot: h2o_64_md regex: CP2K label: (8n/6r/2t) --- Point --- name: 103 plot: h2o_64_md_mem regex: Estimated peak process memory label: (8n/6r/2t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 4 # threads/rank = 3 nrepeat = 1 time[min] = 5 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/05 job id: 42240561 --- Point --- name: 104 plot: h2o_64_md regex: CP2K label: (8n/4r/3t) --- Point --- name: 105 plot: h2o_64_md_mem regex: Estimated peak process memory label: (8n/4r/3t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 3 # threads/rank = 4 nrepeat = 1 time[min] = 5 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/06 job id: 42240562 --- Point --- name: 106 plot: h2o_64_md regex: CP2K label: (8n/3r/4t) --- Point --- name: 107 plot: h2o_64_md_mem regex: Estimated peak process memory label: (8n/3r/4t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 2 # threads/rank = 6 nrepeat = 1 time[min] = 5 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/07 job id: 42240564 --- Point --- name: 108 plot: h2o_64_md regex: CP2K label: (8n/2r/6t) --- Point --- name: 109 plot: h2o_64_md_mem regex: Estimated peak process memory label: (8n/2r/6t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 1 # threads/rank = 12 nrepeat = 1 time[min] = 5 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/08 job id: 42240565 --- Point --- name: 110 plot: h2o_64_md regex: CP2K label: (8n/1r/12t) --- Point --- name: 111 plot: h2o_64_md_mem regex: Estimated peak process memory label: (8n/1r/12t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 12 # threads/rank = 1 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/09 job id: 42240567 --- Point --- name: 200 plot: h2o_128_md regex: CP2K label: (8n/12r/1t) --- Point --- name: 201 plot: h2o_128_md_mem regex: Estimated peak process memory label: (8n/12r/1t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 6 # threads/rank = 2 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/10 job id: 42240568 --- Point --- name: 202 plot: h2o_128_md regex: CP2K label: (8n/6r/2t) --- Point --- name: 203 plot: h2o_128_md_mem regex: Estimated peak process memory label: (8n/6r/2t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 4 # threads/rank = 3 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/11 job id: 42240569 --- Point --- name: 204 plot: h2o_128_md regex: CP2K label: (8n/4r/3t) --- Point --- name: 205 plot: h2o_128_md_mem regex: Estimated peak process memory label: (8n/4r/3t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 3 # threads/rank = 4 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/12 job id: 42240571 --- Point --- name: 206 plot: h2o_128_md regex: CP2K label: (8n/3r/4t) --- Point --- name: 207 plot: h2o_128_md_mem regex: Estimated peak process memory label: (8n/3r/4t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 2 # threads/rank = 6 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/13 job id: 42240572 --- Point --- name: 208 plot: h2o_128_md regex: CP2K label: (8n/2r/6t) --- Point --- name: 209 plot: h2o_128_md_mem regex: Estimated peak process memory label: (8n/2r/6t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 1 # threads/rank = 12 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/14 job id: 42240574 --- Point --- name: 210 plot: h2o_128_md regex: CP2K label: (8n/1r/12t) --- Point --- name: 211 plot: h2o_128_md_mem regex: Estimated peak process memory label: (8n/1r/12t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 12 # threads/rank = 1 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/15 job id: 42240575 --- Point --- name: 400 plot: h2o_256_md regex: CP2K label: (8n/12r/1t) --- Point --- name: 401 plot: h2o_256_md_mem regex: Estimated peak process memory label: (8n/12r/1t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 6 # threads/rank = 2 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/16 job id: 42240576 --- Point --- name: 402 plot: h2o_256_md regex: CP2K label: (8n/6r/2t) --- Point --- name: 403 plot: h2o_256_md_mem regex: Estimated peak process memory label: (8n/6r/2t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 4 # threads/rank = 3 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/17 job id: 42240578 --- Point --- name: 404 plot: h2o_256_md regex: CP2K label: (8n/4r/3t) --- Point --- name: 405 plot: h2o_256_md_mem regex: Estimated peak process memory label: (8n/4r/3t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 3 # threads/rank = 4 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/18 job id: 42240579 --- Point --- name: 406 plot: h2o_256_md regex: CP2K label: (8n/3r/4t) --- Point --- name: 407 plot: h2o_256_md_mem regex: Estimated peak process memory label: (8n/3r/4t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 2 # threads/rank = 6 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/19 job id: 42240581 --- Point --- name: 408 plot: h2o_256_md regex: CP2K label: (8n/2r/6t) --- Point --- name: 409 plot: h2o_256_md_mem regex: Estimated peak process memory label: (8n/2r/6t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 1 # threads/rank = 12 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/20 job id: 42240582 --- Point --- name: 410 plot: h2o_256_md regex: CP2K label: (8n/1r/12t) --- Point --- name: 411 plot: h2o_256_md_mem regex: Estimated peak process memory label: (8n/1r/12t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 (NREP 3) linear scaling test (864 H2O) input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 12 # threads/rank = 1 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/21 job id: 42240584 --- Point --- name: 500 plot: h2o_32_nrep3_ls regex: CP2K label: (8n/12r/1t) --- Point --- name: 501 plot: h2o_32_nrep3_ls_mem regex: Estimated peak process memory label: (8n/12r/1t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 (NREP 3) linear scaling test (864 H2O) input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 6 # threads/rank = 2 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/22 job id: 42240585 --- Point --- name: 502 plot: h2o_32_nrep3_ls regex: CP2K label: (8n/6r/2t) --- Point --- name: 503 plot: h2o_32_nrep3_ls_mem regex: Estimated peak process memory label: (8n/6r/2t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 (NREP 3) linear scaling test (864 H2O) input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 4 # threads/rank = 3 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/23 job id: 42240587 --- Point --- name: 504 plot: h2o_32_nrep3_ls regex: CP2K label: (8n/4r/3t) --- Point --- name: 505 plot: h2o_32_nrep3_ls_mem regex: Estimated peak process memory label: (8n/4r/3t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 (NREP 3) linear scaling test (864 H2O) input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 3 # threads/rank = 4 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/24 job id: 42240588 --- Point --- name: 506 plot: h2o_32_nrep3_ls regex: CP2K label: (8n/3r/4t) --- Point --- name: 507 plot: h2o_32_nrep3_ls_mem regex: Estimated peak process memory label: (8n/3r/4t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 (NREP 3) linear scaling test (864 H2O) input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 2 # threads/rank = 6 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/25 job id: 42240589 --- Point --- name: 508 plot: h2o_32_nrep3_ls regex: CP2K label: (8n/2r/6t) --- Point --- name: 509 plot: h2o_32_nrep3_ls_mem regex: Estimated peak process memory label: (8n/2r/6t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 (NREP 3) linear scaling test (864 H2O) input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp required files: [] output file: result.log # nodes = 8 # ranks/node = 1 # threads/rank = 12 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/26 job id: 42240591 --- Point --- name: 510 plot: h2o_32_nrep3_ls regex: CP2K label: (8n/1r/12t) --- Point --- name: 511 plot: h2o_32_nrep3_ls_mem regex: Estimated peak process memory label: (8n/1r/12t) ~~~~~~~ END TEST ~~~~~~~ === END TESTS (description) === ===== PLOTS (description) ===== ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_32_ri_rpa_mp2", title="32 H2O molecules (RI-MP2, RI-RPA)", xlabel="Revision", ylabel="Time [s]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_32_ri_rpa_mp2_mem", title="32 H2O molecules (RI-MP2, RI-RPA)", xlabel="Revision", ylabel="Est. peak process memory [MiB]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_64_md", title="64 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_64_md_mem", title="64 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_128_md", title="128 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_128_md_mem", title="128 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_256_md", title="256 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_256_md_mem", title="256 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_32_nrep3_ls", title="864 H2O molecules (LS SCF)", xlabel="Revision", ylabel="Time [s]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_32_nrep3_ls_mem", title="864 H2O molecules (LS SCF)", xlabel="Revision", ylabel="Est. peak process memory [MiB]" === END PLOTS (description) === ============ RESULTS ============ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/01/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 0.000000E+00 0.0% 0.0% 0.0% flops max/rank 0.000000E+00 0.0% 0.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 0 0.0% 0.0% 0.0% number of processed stacks 0 0.0% 0.0% 0.0% average stack size 0.0 0.0 0.0 marketing flops 0.000000E+00 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 1 12. MP_Allreduce 19 21. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 15 177869. MP_Allreduce 344 9. MP_Sync 3 MP_comm_split 1 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.022 0.034 132.470 132.471 farming_run 1 2.0 132.024 132.026 132.442 132.445 ------------------------------------------------------------------------------- @@@@@@@@@@ Run number: 2 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 4194304 0.0% 0.0% 100.0% flops 14 x 32 x 32 154140672 0.0% 0.0% 100.0% flops 29 x 32 x 32 159645696 0.0% 0.0% 100.0% flops 14 x 14 x 32 208732160 0.0% 0.0% 100.0% flops 29 x 14 x 32 212860928 0.0% 0.0% 100.0% flops 14 x 29 x 32 212860928 0.0% 0.0% 100.0% flops 29 x 29 x 32 227352576 0.0% 0.0% 100.0% flops 14 x 32 x 14 896801644032 0.0% 0.0% 100.0% flops 29 x 32 x 14 928925089792 0.0% 0.0% 100.0% flops 14 x 32 x 29 928925089792 0.0% 0.0% 100.0% flops 29 x 32 x 29 962100985856 0.0% 0.0% 100.0% flops 32 x 32 x 14 1693169221632 0.0% 0.0% 100.0% flops 32 x 32 x 29 1753639550976 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 7.164741E+12 0.0% 0.0% 100.0% flops max/rank 447.801317E+09 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 249492158 0.0% 0.0% 100.0% number of processed stacks 164328 0.0% 0.0% 100.0% average stack size 0.0 0.0 1518.3 marketing flops 7.165779E+12 ------------------------------------------------------------------------------- # multiplications 1160 max memory usage/rank 1.459610E+09 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 2592 MPI messages size (bytes): total size 1.140326E+09 min size 0.000000E+00 max size 1.663488E+06 average size 439.940750E+03 MPI breakdown and total messages size (bytes): size <= 128 132 0 128 < size <= 8192 348 2850816 8192 < size <= 32768 0 0 32768 < size <= 131072 1536 179306496 131072 < size <= 4194304 576 958169088 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 14 12. MP_Allreduce 2308 54. MP_Alltoall 4670 822215. MP_ISend 2604 90577. MP_IRecv 2604 90574. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 12 MP_Bcast 228 1113141. MP_Allreduce 489 2263609. MP_Sync 27 MP_Alltoall 38 9316958. MP_SendRecv 30 829726. MP_ISendRecv 135 235435. MP_Wait 281 MP_comm_split 8 MP_ISend 127 3867574. MP_IRecv 127 3866554. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.011 0.024 115.679 115.680 qs_energies 1 2.0 0.000 0.000 115.455 115.458 mp2_main 1 3.0 0.000 0.000 113.569 113.571 mp2_gpw_main 1 4.0 0.019 0.025 112.725 112.728 mp2_ri_gpw_compute_in 1 5.0 0.171 0.172 93.608 94.203 mp2_ri_gpw_compute_in_loop 1 6.0 0.005 0.005 55.413 56.008 mp2_eri_3c_integrate_gpw 272 7.0 0.153 0.172 41.733 47.293 get_2c_integrals 1 6.0 0.000 0.000 37.483 38.022 integrate_v_rspace 273 8.0 0.434 0.465 25.084 30.332 pw_transfer 6555 10.6 0.374 0.395 27.441 28.331 fft_wrap_pw1pw2 5465 11.4 0.045 0.047 26.070 26.813 grid_integrate_task_list 273 9.0 20.906 26.645 20.906 26.645 fft_wrap_pw1pw2_100 2178 12.4 1.228 1.433 23.629 24.365 compute_2c_integrals 1 7.0 0.002 0.002 19.727 19.729 compute_2c_integrals_loop_lm 1 8.0 0.003 0.003 19.004 19.450 mp2_eri_2c_integrate_gpw 1 9.0 2.378 2.441 19.001 19.450 rpa_ri_compute_en 1 5.0 0.000 0.000 19.006 19.171 cp_fm_cholesky_decompose 12 8.2 17.702 18.220 17.702 18.220 cholesky_decomp 1 7.0 0.000 0.000 16.597 17.118 fft3d_s 5443 13.4 16.177 16.714 16.199 16.736 ao_to_mo_and_store_B_mult_1 272 7.0 10.849 15.558 10.849 15.558 calculate_wavefunction 272 8.0 5.427 5.581 12.560 13.187 rpa_num_int 1 6.0 0.000 0.000 10.575 10.576 rpa_num_int_RPA_matrix_operati 8 7.0 0.000 0.000 10.509 10.529 calc_mat_Q 8 8.0 0.000 0.000 9.355 9.457 contract_S_to_Q 8 9.0 0.000 0.000 8.776 8.878 calc_potential_gpw 544 9.5 0.005 0.006 8.290 8.698 mp2_eri_2c_integrate_gpw_pot_l 272 10.0 0.001 0.002 8.286 8.585 potential_pw2rs 545 10.0 0.107 0.110 7.722 8.493 parallel_gemm_fm 14 9.1 0.000 0.000 8.350 8.446 parallel_gemm_fm_cosma 14 10.1 8.350 8.446 8.350 8.446 collocate_single_gaussian 272 10.0 0.040 0.043 7.519 7.765 create_integ_mat 1 6.0 0.022 0.027 7.742 7.742 array2fm 1 7.0 0.000 0.000 6.619 7.187 pw_scatter_s 2720 13.7 4.420 4.599 4.420 4.599 pw_gather_s 2722 13.2 3.871 4.267 3.871 4.267 array2fm_buffer_send 1 8.0 2.977 3.184 2.977 3.184 pw_poisson_solve 545 10.5 1.133 1.184 2.210 2.357 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="10", plot="h2o_32_ri_rpa_mp2", label="RI-RPA (8n/2r/6t)", y=112.727866, yerr=0.000000 PlotPoint: name="11", plot="h2o_32_ri_rpa_mp2_mem", label="RI-RPA (8n/2r/6t)", y=2731.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/02/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 0.000000E+00 0.0% 0.0% 0.0% flops max/rank 0.000000E+00 0.0% 0.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 0 0.0% 0.0% 0.0% number of processed stacks 0 0.0% 0.0% 0.0% average stack size 0.0 0.0 0.0 marketing flops 0.000000E+00 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 1 12. MP_Allreduce 19 21. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 22 205321. MP_Allreduce 344 10. MP_Sync 4 MP_comm_split 1 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.028 0.040 527.928 527.929 farming_run 1 2.0 527.190 527.194 527.886 527.888 ------------------------------------------------------------------------------- @@@@@@@@@@ Run number: 2 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 16777216 0.0% 0.0% 100.0% flops 14 x 32 x 32 565182464 0.0% 0.0% 100.0% flops 29 x 32 x 32 585367552 0.0% 0.0% 100.0% flops 14 x 14 x 32 626196480 0.0% 0.0% 100.0% flops 29 x 14 x 32 638582784 0.0% 0.0% 100.0% flops 14 x 29 x 32 638582784 0.0% 0.0% 100.0% flops 29 x 29 x 32 682057728 0.0% 0.0% 100.0% flops 14 x 32 x 14 897827141120 0.0% 0.0% 100.0% flops 29 x 32 x 14 929989394432 0.0% 0.0% 100.0% flops 14 x 32 x 29 929989394432 0.0% 0.0% 100.0% flops 29 x 32 x 29 963203301376 0.0% 0.0% 100.0% flops 32 x 32 x 14 1693481172992 0.0% 0.0% 100.0% flops 32 x 32 x 29 1753962643456 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 7.172206E+12 0.0% 0.0% 100.0% flops max/rank 150.696064E+09 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 249788822 0.0% 0.0% 100.0% number of processed stacks 98736 0.0% 0.0% 100.0% average stack size 0.0 0.0 2529.9 marketing flops 7.174951E+12 ------------------------------------------------------------------------------- # multiplications 1140 max memory usage/rank 1.227370E+09 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 61440 MPI messages size (bytes): total size 6.073508E+09 min size 0.000000E+00 max size 642.960000E+03 average size 98.852664E+03 MPI breakdown and total messages size (bytes): size <= 128 32004 0 128 < size <= 8192 1820 14909440 8192 < size <= 32768 0 0 32768 < size <= 131072 18640 1081442304 131072 < size <= 4194304 8976 4977156096 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 14 12. MP_Allreduce 1003 44. MP_Alltoall 1797 713538. MP_ISend 3686 54943. MP_IRecv 3622 54292. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 12 MP_Bcast 703 408373. MP_Allreduce 1825 23678. MP_Sync 38 MP_Alltoall 77 MP_SendRecv 2171 2843495. MP_ISendRecv 1739 144022. MP_Wait 2051 MP_comm_split 7 MP_ISend 264 362227. MP_IRecv 264 362718. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.014 0.049 209.105 209.106 qs_energies 1 2.0 0.000 0.000 208.885 208.897 scf_env_do_scf 1 3.0 0.000 0.000 106.183 106.183 qs_ks_update_qs_env 5 5.0 0.000 0.000 105.316 105.326 rebuild_ks_matrix 4 6.0 0.000 0.000 105.315 105.325 qs_ks_build_kohn_sham_matrix 4 7.0 0.055 0.066 105.315 105.325 hfx_ks_matrix 4 8.0 0.001 0.001 104.939 104.942 integrate_four_center 4 9.0 0.144 0.455 104.938 104.941 mp2_main 1 3.0 0.000 0.001 102.418 102.430 mp2_gpw_main 1 4.0 0.034 0.053 101.536 101.551 integrate_four_center_main 4 10.0 0.123 0.576 96.652 99.065 integrate_four_center_bin 267 11.0 96.528 99.048 96.528 99.048 init_scf_loop 1 4.0 0.000 0.000 91.727 91.727 mp2_ri_gpw_compute_in 1 5.0 0.068 0.098 74.928 75.929 mp2_ri_gpw_compute_in_loop 1 6.0 0.002 0.003 54.525 55.506 mp2_eri_3c_integrate_gpw 91 7.0 0.143 0.161 42.236 47.266 integrate_v_rspace 95 8.0 0.396 0.562 28.595 33.483 pw_transfer 2240 10.6 0.145 0.169 29.856 30.338 fft_wrap_pw1pw2 1868 11.4 0.018 0.021 28.882 29.377 grid_integrate_task_list 95 9.0 23.884 28.964 23.884 28.964 ao_to_mo_and_store_B_mult_1 91 7.0 10.595 28.219 10.595 28.219 mp2_ri_gpw_compute_en 1 5.0 0.063 0.101 26.441 28.182 fft_wrap_pw1pw2_100 730 12.4 1.263 1.445 26.623 27.083 mp2_ri_gpw_compute_en_RI_loop 1 6.0 1.853 1.917 24.756 24.766 get_2c_integrals 1 6.0 0.002 0.016 20.308 20.335 compute_2c_integrals 1 7.0 0.003 0.009 19.283 19.287 compute_2c_integrals_loop_lm 1 8.0 0.002 0.010 18.807 19.155 mp2_eri_2c_integrate_gpw 1 9.0 1.757 1.965 18.805 19.154 fft3d_s 1823 13.4 18.400 18.694 18.413 18.707 scf_env_do_scf_inner_loop 4 4.0 0.000 0.000 14.454 14.454 calculate_wavefunction 91 8.0 2.019 2.068 9.756 9.981 mp2_ri_gpw_compute_en_expansio 172 7.0 0.555 0.594 8.772 9.461 potential_pw2rs 186 10.0 0.033 0.035 8.611 9.208 local_gemm 172 8.0 8.216 8.868 8.216 8.868 mp2_eri_2c_integrate_gpw_pot_l 91 10.0 0.001 0.004 8.198 8.532 mp2_ri_gpw_compute_en_comm 22 7.0 0.495 0.523 7.742 8.163 calc_potential_gpw 182 9.5 0.002 0.002 7.896 8.130 collocate_single_gaussian 91 10.0 0.022 0.074 7.854 8.100 mp2_ri_gpw_compute_en_ener 172 7.0 6.343 6.427 6.343 6.427 mp_sendrecv_dm3 2068 8.0 5.791 6.228 5.791 6.228 mp_sync 38 10.4 2.977 5.670 2.977 5.670 pw_gather_s 912 13.2 4.924 5.356 4.924 5.356 pw_scatter_s 910 13.7 3.933 4.201 3.933 4.201 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="20", plot="h2o_32_ri_rpa_mp2", label="RI-MP2 (8n/6r/2t)", y=101.529406, yerr=0.000000 PlotPoint: name="21", plot="h2o_32_ri_rpa_mp2_mem", label="RI-MP2 (8n/6r/2t)", y=1512.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/03/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 26877100032 0.0% 0.0% 100.0% flops 9 x 9 x 32 44168260608 0.0% 0.0% 100.0% flops 22 x 9 x 32 53835724800 0.0% 0.0% 100.0% flops 9 x 22 x 32 53885500416 0.0% 0.0% 100.0% flops 32 x 32 x 9 63568871424 0.0% 0.0% 100.0% flops 22 x 22 x 32 67007283200 0.0% 0.0% 100.0% flops 32 x 32 x 22 77695287296 0.0% 0.0% 100.0% flops 9 x 32 x 32 78422999040 0.0% 0.0% 100.0% flops 22 x 32 x 32 95850332160 0.0% 0.0% 100.0% flops 9 x 32 x 9 266263676928 0.0% 0.0% 100.0% flops 22 x 32 x 9 326697440256 0.0% 0.0% 100.0% flops 9 x 32 x 22 326697440256 0.0% 0.0% 100.0% flops 22 x 32 x 22 399918497792 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 1.880888E+12 0.0% 0.0% 100.0% flops max/rank 29.277748E+09 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 146984760 0.0% 0.0% 100.0% number of processed stacks 5055360 0.0% 0.0% 100.0% average stack size 0.0 0.0 29.1 marketing flops 2.107592E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 451.358720E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 9436608 MPI messages size (bytes): total size 333.233553E+09 min size 0.000000E+00 max size 315.840000E+03 average size 35.312852E+03 MPI breakdown and total messages size (bytes): size <= 128 4913240 0 128 < size <= 8192 1155432 9465298944 8192 < size <= 32768 1984512 54190407680 32768 < size <= 131072 551296 42776657920 131072 < size <= 4194304 832128 226802306368 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3683 62385. MP_Allreduce 10249 271. MP_Sync 580 MP_Alltoall 2083 592243. MP_ISendRecv 45220 5520. MP_Wait 60486 MP_comm_split 50 MP_ISend 20771 42672. MP_IRecv 20771 42672. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.031 0.119 48.165 48.167 qs_mol_dyn_low 1 2.0 0.003 0.007 47.784 47.791 qs_forces 11 3.9 0.006 0.035 47.255 47.256 qs_energies 11 4.9 0.003 0.017 45.756 45.771 scf_env_do_scf 11 5.9 0.000 0.001 39.932 39.933 scf_env_do_scf_inner_loop 108 6.5 0.002 0.008 37.938 37.939 dbcsr_multiply_generic 2286 12.5 0.092 0.098 30.060 30.500 qs_scf_new_mos 108 7.5 0.000 0.001 28.128 28.409 qs_scf_loop_do_ot 108 8.5 0.001 0.001 28.128 28.408 ot_scf_mini 108 9.5 0.002 0.003 26.607 26.789 multiply_cannon 2286 13.5 0.186 0.193 24.337 25.887 multiply_cannon_loop 2286 14.5 1.420 1.487 23.631 25.223 velocity_verlet 10 3.0 0.001 0.002 23.349 23.354 ot_mini 108 10.5 0.001 0.001 16.644 16.890 qs_ot_get_derivative 108 11.5 0.001 0.002 13.875 14.047 mp_waitall_1 267858 16.1 6.967 12.817 6.967 12.817 multiply_cannon_metrocomm3 54864 15.5 0.069 0.073 4.900 11.515 multiply_cannon_multrec 54864 15.5 4.470 6.932 7.602 11.021 rebuild_ks_matrix 119 8.3 0.000 0.000 7.840 7.959 qs_ks_build_kohn_sham_matrix 119 9.3 0.011 0.019 7.840 7.959 qs_ks_update_qs_env 119 7.6 0.001 0.001 6.922 7.033 multiply_cannon_sync_h2d 54864 15.5 6.211 6.701 6.211 6.701 qs_ot_get_p 119 10.4 0.001 0.001 5.675 5.941 mp_sum_l 7207 12.9 3.778 5.267 3.778 5.267 qs_ot_get_derivative_diag 49 12.0 0.001 0.001 4.882 4.981 qs_ot_get_derivative_taylor 59 13.0 0.001 0.001 4.334 4.768 sum_up_and_integrate 119 10.3 0.013 0.018 4.513 4.520 integrate_v_rspace 119 11.3 0.002 0.004 4.500 4.509 init_scf_run 11 5.9 0.000 0.001 4.493 4.493 scf_env_initial_rho_setup 11 6.9 0.001 0.005 4.493 4.493 dbcsr_mm_accdrv_process 76910 16.1 1.055 1.632 3.054 4.241 qs_rho_update_rho_low 119 7.7 0.001 0.001 3.927 4.013 calculate_rho_elec 119 8.7 0.011 0.017 3.926 4.013 qs_ot_p2m_diag 50 11.0 0.004 0.007 3.283 3.311 apply_preconditioner_dbcsr 119 12.6 0.000 0.000 2.860 3.058 apply_single 119 13.6 0.000 0.000 2.860 3.058 rs_pw_transfer 974 11.9 0.012 0.013 2.818 2.944 calculate_dm_sparse 119 9.5 0.000 0.000 2.743 2.885 ot_diis_step 108 11.5 0.006 0.009 2.611 2.612 cp_dbcsr_syevd 50 12.0 0.002 0.003 2.579 2.580 jit_kernel_multiply 13 15.8 1.942 2.524 1.942 2.524 calculate_first_density_matrix 1 7.0 0.000 0.004 2.438 2.445 cp_fm_diag_elpa 50 13.0 0.000 0.000 2.364 2.364 cp_fm_redistribute_end 50 14.0 2.147 2.342 2.151 2.343 cp_fm_diag_elpa_base 50 14.0 0.190 2.298 0.191 2.310 density_rs2pw 119 9.7 0.004 0.004 2.169 2.284 qs_ot_get_orbitals 108 10.5 0.000 0.000 2.067 2.132 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.002 2.116 2.118 grid_integrate_task_list 119 12.3 2.023 2.107 2.023 2.107 multiply_cannon_metrocomm1 54864 15.5 0.054 0.059 1.176 2.063 acc_transpose_blocks 54864 15.5 0.204 0.222 1.606 2.003 wfi_extrapolate 11 7.9 0.001 0.003 1.995 1.996 init_scf_loop 11 6.9 0.001 0.005 1.976 1.977 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 1.913 1.956 potential_pw2rs 119 12.3 0.004 0.004 1.884 1.901 mp_sum_d 4129 12.0 1.164 1.821 1.164 1.821 make_m2s 4572 13.5 0.054 0.056 1.700 1.749 pw_transfer 1439 11.6 0.051 0.055 1.666 1.734 make_images 4572 14.5 0.132 0.137 1.618 1.665 fft_wrap_pw1pw2 1201 12.6 0.006 0.007 1.591 1.662 mp_waitany 12084 13.8 1.294 1.473 1.294 1.473 fft3d_ps 1201 14.6 0.357 0.460 1.370 1.432 cp_dbcsr_plus_fm_fm_t_native 22 8.9 0.000 0.000 1.378 1.397 grid_collocate_task_list 119 9.7 1.286 1.362 1.286 1.362 mp_alltoall_d11v 2130 13.8 1.180 1.321 1.180 1.321 fft_wrap_pw1pw2_140 487 13.2 0.079 0.093 1.219 1.291 dbcsr_dot_sd 1205 11.9 0.047 0.058 0.603 1.004 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 0.976 0.989 acc_transpose_blocks_kernels 54864 16.5 0.235 0.356 0.726 0.982 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="100", plot="h2o_64_md", label="(8n/12r/1t)", y=48.167000, yerr=0.000000 PlotPoint: name="101", plot="h2o_64_md_mem", label="(8n/12r/1t)", y=430.000000, yerr=1.348400 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/04/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 26877100032 0.0% 0.0% 100.0% flops 9 x 9 x 32 44168260608 0.0% 0.0% 100.0% flops 22 x 9 x 32 53835724800 0.0% 0.0% 100.0% flops 9 x 22 x 32 53885500416 0.0% 0.0% 100.0% flops 32 x 32 x 9 63568871424 0.0% 0.0% 100.0% flops 22 x 22 x 32 67007283200 0.0% 0.0% 100.0% flops 32 x 32 x 22 77695287296 0.0% 0.0% 100.0% flops 9 x 32 x 32 78422999040 0.0% 0.0% 100.0% flops 22 x 32 x 32 95850332160 0.0% 0.0% 100.0% flops 9 x 32 x 9 266263676928 0.0% 0.0% 100.0% flops 22 x 32 x 9 326697440256 0.0% 0.0% 100.0% flops 9 x 32 x 22 326697440256 0.0% 0.0% 100.0% flops 22 x 32 x 22 399918497792 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 1.880888E+12 0.0% 0.0% 100.0% flops max/rank 57.173320E+09 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 146984760 0.0% 0.0% 100.0% number of processed stacks 3066240 0.0% 0.0% 100.0% average stack size 0.0 0.0 47.9 marketing flops 2.107592E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 486.944768E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 2194560 MPI messages size (bytes): total size 310.646604E+09 min size 0.000000E+00 max size 1.145520E+06 average size 141.553031E+03 MPI breakdown and total messages size (bytes): size <= 128 724648 0 128 < size <= 8192 253512 2076770304 8192 < size <= 32768 281952 4619501568 32768 < size <= 131072 494448 39143342080 131072 < size <= 4194304 440000 264807943488 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3672 62664. MP_Allreduce 10226 305. MP_Sync 104 MP_Alltoall 2060 1594595. MP_ISendRecv 33558 37093. MP_Wait 40318 MP_comm_split 50 MP_ISend 5720 128509. MP_IRecv 5720 128509. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.020 0.035 38.429 38.431 qs_mol_dyn_low 1 2.0 0.003 0.003 38.164 38.172 qs_forces 11 3.9 0.011 0.013 38.049 38.051 qs_energies 11 4.9 0.002 0.006 36.336 36.345 scf_env_do_scf 11 5.9 0.001 0.001 31.273 31.274 scf_env_do_scf_inner_loop 108 6.5 0.003 0.008 28.777 28.779 dbcsr_multiply_generic 2286 12.5 0.100 0.104 21.326 21.712 qs_scf_new_mos 108 7.5 0.001 0.001 19.783 20.037 qs_scf_loop_do_ot 108 8.5 0.001 0.001 19.783 20.036 ot_scf_mini 108 9.5 0.002 0.003 18.886 19.067 velocity_verlet 10 3.0 0.001 0.002 18.277 18.279 multiply_cannon 2286 13.5 0.208 0.220 16.241 17.757 multiply_cannon_loop 2286 14.5 0.897 0.978 15.122 16.579 ot_mini 108 10.5 0.001 0.001 11.804 12.046 mp_waitall_1 217478 16.2 6.026 11.081 6.026 11.081 qs_ot_get_derivative 108 11.5 0.001 0.001 9.347 9.529 multiply_cannon_metrocomm3 27432 15.5 0.068 0.070 4.129 9.465 multiply_cannon_multrec 27432 15.5 1.983 4.564 5.856 8.742 rebuild_ks_matrix 119 8.3 0.000 0.000 7.182 7.319 qs_ks_build_kohn_sham_matrix 119 9.3 0.012 0.014 7.182 7.318 qs_ks_update_qs_env 119 7.6 0.001 0.001 6.329 6.453 dbcsr_mm_accdrv_process 47894 16.0 2.847 4.586 3.805 5.639 qs_ot_get_derivative_taylor 59 13.0 0.001 0.001 3.625 4.470 qs_ot_get_p 119 10.4 0.001 0.001 4.169 4.394 sum_up_and_integrate 119 10.3 0.024 0.027 4.209 4.219 integrate_v_rspace 119 11.3 0.002 0.002 4.185 4.195 apply_preconditioner_dbcsr 119 12.6 0.000 0.000 3.044 4.120 apply_single 119 13.6 0.000 0.000 3.044 4.120 mp_sum_l 7207 12.9 2.104 4.099 2.104 4.099 init_scf_run 11 5.9 0.000 0.001 3.830 3.831 scf_env_initial_rho_setup 11 6.9 0.001 0.001 3.830 3.831 qs_rho_update_rho_low 119 7.7 0.001 0.001 3.635 3.676 calculate_rho_elec 119 8.7 0.021 0.024 3.634 3.675 make_m2s 4572 13.5 0.052 0.055 2.542 2.781 multiply_cannon_sync_h2d 27432 15.5 2.190 2.764 2.190 2.764 qs_ot_p2m_diag 50 11.0 0.008 0.013 2.673 2.692 make_images 4572 14.5 0.200 0.239 2.453 2.690 rs_pw_transfer 974 11.9 0.010 0.011 2.589 2.653 init_scf_loop 11 6.9 0.001 0.003 2.475 2.484 ot_diis_step 108 11.5 0.011 0.011 2.405 2.406 calculate_first_density_matrix 1 7.0 0.000 0.002 2.299 2.300 cp_dbcsr_syevd 50 12.0 0.003 0.003 2.241 2.241 qs_ot_get_derivative_diag 49 12.0 0.001 0.001 2.147 2.240 calculate_dm_sparse 119 9.5 0.000 0.001 2.039 2.113 density_rs2pw 119 9.7 0.004 0.004 1.991 2.059 potential_pw2rs 119 12.3 0.006 0.006 1.945 1.959 grid_integrate_task_list 119 12.3 1.838 1.938 1.838 1.938 jit_kernel_multiply 10 16.1 0.906 1.930 0.906 1.930 cp_fm_diag_elpa 50 13.0 0.000 0.000 1.901 1.902 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 1.880 1.882 cp_fm_redistribute_end 50 14.0 1.567 1.875 1.570 1.876 pw_transfer 1439 11.6 0.063 0.067 1.816 1.846 cp_fm_diag_elpa_base 50 14.0 0.297 1.816 0.305 1.845 fft_wrap_pw1pw2 1201 12.6 0.007 0.008 1.726 1.758 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 1.665 1.703 prepare_preconditioner 11 7.9 0.000 0.000 1.547 1.575 make_preconditioner 11 8.9 0.000 0.002 1.547 1.575 make_images_data 4572 15.5 0.045 0.051 1.160 1.572 make_full_inverse_cholesky 11 9.9 0.000 0.000 1.448 1.503 wfi_extrapolate 11 7.9 0.001 0.001 1.483 1.483 acc_transpose_blocks 27432 15.5 0.106 0.111 1.178 1.479 hybrid_alltoall_any 4725 16.4 0.050 0.111 1.010 1.465 fft3d_ps 1201 14.6 0.498 0.552 1.433 1.460 fft_wrap_pw1pw2_140 487 13.2 0.076 0.082 1.341 1.375 mp_alltoall_d11v 2130 13.8 1.219 1.341 1.219 1.341 grid_collocate_task_list 119 9.7 1.242 1.330 1.242 1.330 mp_allgather_i34 2286 14.5 0.562 1.310 0.562 1.310 cp_dbcsr_plus_fm_fm_t_native 22 8.9 0.000 0.000 1.250 1.258 qs_ot_get_orbitals 108 10.5 0.000 0.000 1.208 1.254 mp_sum_d 4129 12.0 0.605 1.036 0.605 1.036 qs_energies_init_hamiltonians 11 5.9 0.000 0.002 0.960 0.967 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 0.911 0.923 acc_transpose_blocks_kernels 27432 16.5 0.183 0.274 0.648 0.858 make_images_sizes 4572 15.5 0.005 0.005 0.570 0.814 mp_alltoall_i44 4572 16.5 0.565 0.809 0.565 0.809 rs_pw_transfer_PW2RS_50 119 14.3 0.590 0.608 0.748 0.794 mp_alltoall_z22v 1201 16.6 0.714 0.784 0.714 0.784 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="102", plot="h2o_64_md", label="(8n/6r/2t)", y=38.431000, yerr=0.000000 PlotPoint: name="103", plot="h2o_64_md_mem", label="(8n/6r/2t)", y=464.181818, yerr=1.402477 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/05/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 26877100032 0.0% 0.0% 100.0% flops 9 x 9 x 32 44168260608 0.0% 0.0% 100.0% flops 22 x 9 x 32 53835724800 0.0% 0.0% 100.0% flops 9 x 22 x 32 53885500416 0.0% 0.0% 100.0% flops 32 x 32 x 9 63568871424 0.0% 0.0% 100.0% flops 22 x 22 x 32 67007283200 0.0% 0.0% 100.0% flops 32 x 32 x 22 77695287296 0.0% 0.0% 100.0% flops 9 x 32 x 32 78422999040 0.0% 0.0% 100.0% flops 22 x 32 x 32 95850332160 0.0% 0.0% 100.0% flops 9 x 32 x 9 266263676928 0.0% 0.0% 100.0% flops 22 x 32 x 9 326697440256 0.0% 0.0% 100.0% flops 9 x 32 x 22 326697440256 0.0% 0.0% 100.0% flops 22 x 32 x 22 399918497792 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 1.880888E+12 0.0% 0.0% 100.0% flops max/rank 59.051995E+09 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 146984760 0.0% 0.0% 100.0% number of processed stacks 3143552 0.0% 0.0% 100.0% average stack size 0.0 0.0 46.8 marketing flops 2.107587E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 521.187328E+06 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 950976 MPI messages size (bytes): total size 203.844256E+09 min size 0.000000E+00 max size 1.638400E+06 average size 214.352688E+03 MPI breakdown and total messages size (bytes): size <= 128 6424 0 128 < size <= 8192 253512 2076770304 8192 < size <= 32768 179424 2939682816 32768 < size <= 131072 181440 14863564800 131072 < size <= 4194304 330176 183964913216 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3672 62660. MP_Allreduce 10225 303. MP_Sync 104 MP_Alltoall 1821 1607811. MP_ISendRecv 22134 57667. MP_Wait 33054 MP_comm_split 50 MP_ISend 9880 92618. MP_IRecv 9880 92618. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.013 0.037 31.944 31.946 qs_mol_dyn_low 1 2.0 0.003 0.004 31.662 31.669 qs_forces 11 3.9 0.002 0.003 31.590 31.592 qs_energies 11 4.9 0.001 0.001 30.032 30.034 scf_env_do_scf 11 5.9 0.000 0.001 25.436 25.436 scf_env_do_scf_inner_loop 108 6.5 0.002 0.006 22.855 22.855 dbcsr_multiply_generic 2286 12.5 0.093 0.095 16.446 16.542 velocity_verlet 10 3.0 0.003 0.004 15.288 15.289 qs_scf_new_mos 108 7.5 0.001 0.001 14.871 14.895 qs_scf_loop_do_ot 108 8.5 0.001 0.001 14.871 14.895 ot_scf_mini 108 9.5 0.002 0.002 14.141 14.160 multiply_cannon 2286 13.5 0.196 0.201 13.069 13.832 multiply_cannon_loop 2286 14.5 0.637 0.661 12.262 13.072 ot_mini 108 10.5 0.001 0.001 8.850 8.868 qs_ot_get_derivative 108 11.5 0.001 0.001 7.321 7.338 multiply_cannon_multrec 18288 15.5 1.932 2.830 6.755 7.111 rebuild_ks_matrix 119 8.3 0.000 0.000 6.326 6.345 qs_ks_build_kohn_sham_matrix 119 9.3 0.012 0.013 6.326 6.344 dbcsr_mm_accdrv_process 38222 16.0 3.999 5.564 4.740 5.620 qs_ks_update_qs_env 119 7.6 0.001 0.001 5.596 5.614 mp_waitall_1 169478 16.3 3.014 4.133 3.014 4.133 sum_up_and_integrate 119 10.3 0.030 0.031 3.945 3.951 integrate_v_rspace 119 11.3 0.002 0.003 3.915 3.925 init_scf_run 11 5.9 0.000 0.001 3.429 3.429 scf_env_initial_rho_setup 11 6.9 0.001 0.001 3.429 3.429 qs_ot_get_derivative_taylor 59 13.0 0.001 0.001 2.762 3.378 qs_rho_update_rho_low 119 7.7 0.001 0.001 3.259 3.270 calculate_rho_elec 119 8.7 0.030 0.031 3.259 3.270 qs_ot_get_p 119 10.4 0.001 0.001 3.099 3.115 init_scf_loop 11 6.9 0.000 0.000 2.563 2.563 multiply_cannon_metrocomm3 18288 15.5 0.045 0.046 1.444 2.478 apply_preconditioner_dbcsr 119 12.6 0.000 0.000 2.050 2.361 apply_single 119 13.6 0.000 0.000 2.049 2.361 rs_pw_transfer 974 11.9 0.009 0.010 2.277 2.358 calculate_first_density_matrix 1 7.0 0.000 0.001 2.200 2.201 qs_ot_p2m_diag 50 11.0 0.012 0.012 1.999 2.005 make_m2s 4572 13.5 0.044 0.045 1.851 1.970 density_rs2pw 119 9.7 0.004 0.004 1.841 1.909 grid_integrate_task_list 119 12.3 1.803 1.903 1.803 1.903 make_images 4572 14.5 0.190 0.203 1.765 1.884 jit_kernel_multiply 10 16.1 0.690 1.858 0.690 1.858 calculate_dm_sparse 119 9.5 0.000 0.000 1.816 1.827 prepare_preconditioner 11 7.9 0.000 0.000 1.773 1.776 make_preconditioner 11 8.9 0.000 0.000 1.773 1.776 pw_transfer 1439 11.6 0.063 0.065 1.734 1.748 potential_pw2rs 119 12.3 0.007 0.008 1.729 1.735 cp_dbcsr_syevd 50 12.0 0.003 0.003 1.730 1.730 make_full_inverse_cholesky 11 9.9 0.000 0.000 1.626 1.707 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.002 1.669 1.671 fft_wrap_pw1pw2 1201 12.6 0.007 0.008 1.644 1.660 qs_ot_get_derivative_diag 49 12.0 0.001 0.001 1.599 1.608 multiply_cannon_sync_h2d 18288 15.5 1.387 1.523 1.387 1.523 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 1.514 1.519 mp_sum_l 7207 12.9 1.144 1.518 1.144 1.518 ot_diis_step 108 11.5 0.011 0.011 1.503 1.503 cp_fm_diag_elpa 50 13.0 0.000 0.000 1.490 1.491 cp_fm_redistribute_end 50 14.0 1.103 1.465 1.104 1.465 cp_fm_diag_elpa_base 50 14.0 0.345 1.401 0.359 1.439 fft3d_ps 1201 14.6 0.507 0.523 1.331 1.349 fft_wrap_pw1pw2_140 487 13.2 0.086 0.089 1.292 1.305 grid_collocate_task_list 119 9.7 1.203 1.295 1.203 1.295 acc_transpose_blocks 18288 15.5 0.074 0.076 1.227 1.251 cp_dbcsr_plus_fm_fm_t_native 22 8.9 0.000 0.000 1.181 1.187 wfi_extrapolate 11 7.9 0.001 0.001 1.180 1.180 make_images_data 4572 15.5 0.045 0.049 0.839 0.979 qs_energies_init_hamiltonians 11 5.9 0.000 0.001 0.940 0.942 qs_ot_get_orbitals 108 10.5 0.000 0.000 0.888 0.912 hybrid_alltoall_any 4725 16.4 0.054 0.113 0.727 0.888 mp_alltoall_d11v 2130 13.8 0.728 0.824 0.728 0.824 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 0.802 0.803 acc_transpose_blocks_kernels 18288 16.5 0.212 0.221 0.786 0.800 cp_fm_cholesky_invert 11 10.9 0.744 0.748 0.744 0.748 mp_alltoall_z22v 1201 16.6 0.665 0.736 0.665 0.736 build_core_hamiltonian_matrix_ 11 4.9 0.000 0.000 0.657 0.732 mp_allgather_i34 2286 14.5 0.268 0.652 0.268 0.652 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="104", plot="h2o_64_md", label="(8n/4r/3t)", y=31.946000, yerr=0.000000 PlotPoint: name="105", plot="h2o_64_md_mem", label="(8n/4r/3t)", y=496.545455, yerr=1.499311 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/06/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 26877100032 0.0% 0.0% 100.0% flops 9 x 9 x 32 44168260608 0.0% 0.0% 100.0% flops 22 x 9 x 32 53835724800 0.0% 0.0% 100.0% flops 9 x 22 x 32 53885500416 0.0% 0.0% 100.0% flops 32 x 32 x 9 63568871424 0.0% 0.0% 100.0% flops 22 x 22 x 32 67007283200 0.0% 0.0% 100.0% flops 32 x 32 x 22 77695287296 0.0% 0.0% 100.0% flops 9 x 32 x 32 78422999040 0.0% 0.0% 100.0% flops 22 x 32 x 32 95850332160 0.0% 0.0% 100.0% flops 9 x 32 x 9 266263676928 0.0% 0.0% 100.0% flops 22 x 32 x 9 326697440256 0.0% 0.0% 100.0% flops 9 x 32 x 22 326697440256 0.0% 0.0% 100.0% flops 22 x 32 x 22 399918497792 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 1.880888E+12 0.0% 0.0% 100.0% flops max/rank 114.044384E+09 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 146984760 0.0% 0.0% 100.0% number of processed stacks 3805952 0.0% 0.0% 100.0% average stack size 0.0 0.0 38.6 marketing flops 2.107592E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 560.988160E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 1042416 MPI messages size (bytes): total size 150.443262E+09 min size 0.000000E+00 max size 1.188816E+06 average size 144.321719E+03 MPI breakdown and total messages size (bytes): size <= 128 228256 0 128 < size <= 8192 126888 1039466496 8192 < size <= 32768 191472 3137077248 32768 < size <= 131072 295800 25899827200 131072 < size <= 4194304 200000 120367247040 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3672 62659. MP_Allreduce 10224 344. MP_Sync 104 MP_Alltoall 1582 2412273. MP_ISendRecv 16422 74133. MP_Wait 24482 MP_comm_split 50 MP_ISend 7280 135929. MP_IRecv 7280 135929. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.028 0.060 34.699 34.701 qs_mol_dyn_low 1 2.0 0.003 0.003 34.374 34.381 qs_forces 11 3.9 0.002 0.003 34.255 34.257 qs_energies 11 4.9 0.002 0.005 32.561 32.565 scf_env_do_scf 11 5.9 0.001 0.001 27.594 27.595 scf_env_do_scf_inner_loop 108 6.5 0.002 0.006 24.156 24.156 dbcsr_multiply_generic 2286 12.5 0.101 0.106 18.023 18.149 velocity_verlet 10 3.0 0.001 0.002 17.578 17.582 qs_scf_new_mos 108 7.5 0.001 0.001 16.075 16.119 qs_scf_loop_do_ot 108 8.5 0.001 0.001 16.074 16.118 multiply_cannon 2286 13.5 0.231 0.268 14.387 15.224 ot_scf_mini 108 9.5 0.003 0.003 15.131 15.186 multiply_cannon_loop 2286 14.5 0.935 0.963 13.440 14.210 ot_mini 108 10.5 0.001 0.001 9.261 9.328 multiply_cannon_multrec 27432 15.5 2.334 3.030 8.581 8.963 qs_ot_get_derivative 108 11.5 0.001 0.001 7.446 7.501 dbcsr_mm_accdrv_process 47916 15.9 5.210 7.138 6.151 7.380 rebuild_ks_matrix 119 8.3 0.000 0.000 6.410 6.463 qs_ks_build_kohn_sham_matrix 119 9.3 0.012 0.014 6.410 6.462 qs_ks_update_qs_env 119 7.6 0.001 0.001 5.701 5.748 sum_up_and_integrate 119 10.3 0.035 0.038 3.699 3.707 integrate_v_rspace 119 11.3 0.002 0.003 3.664 3.673 init_scf_run 11 5.9 0.000 0.001 3.602 3.603 scf_env_initial_rho_setup 11 6.9 0.001 0.001 3.602 3.602 init_scf_loop 11 6.9 0.001 0.004 3.416 3.417 qs_ot_get_p 119 10.4 0.001 0.001 3.263 3.338 qs_rho_update_rho_low 119 7.7 0.001 0.001 3.276 3.310 calculate_rho_elec 119 8.7 0.040 0.046 3.275 3.310 qs_ot_get_derivative_taylor 59 13.0 0.001 0.001 2.761 3.165 apply_preconditioner_dbcsr 119 12.6 0.000 0.000 2.051 2.604 apply_single 119 13.6 0.000 0.000 2.051 2.604 prepare_preconditioner 11 7.9 0.000 0.000 2.580 2.588 make_preconditioner 11 8.9 0.000 0.001 2.580 2.588 mp_waitall_1 145218 16.4 2.056 2.584 2.056 2.584 make_full_inverse_cholesky 11 9.9 0.000 0.000 2.176 2.515 make_m2s 4572 13.5 0.054 0.057 2.218 2.329 calculate_first_density_matrix 1 7.0 0.001 0.003 2.244 2.246 make_images 4572 14.5 0.273 0.336 2.111 2.220 calculate_dm_sparse 119 9.5 0.000 0.000 2.033 2.079 rs_pw_transfer 974 11.9 0.009 0.009 1.956 2.058 qs_ot_p2m_diag 50 11.0 0.015 0.023 1.989 1.998 qs_ot_get_derivative_diag 49 12.0 0.001 0.001 1.957 1.986 grid_integrate_task_list 119 12.3 1.838 1.909 1.838 1.909 density_rs2pw 119 9.7 0.003 0.004 1.747 1.864 jit_kernel_multiply 10 15.7 0.882 1.812 0.882 1.812 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 1.808 1.810 pw_transfer 1439 11.6 0.062 0.065 1.753 1.786 ot_diis_step 108 11.5 0.012 0.013 1.776 1.777 fft_wrap_pw1pw2 1201 12.6 0.008 0.008 1.664 1.700 mp_sum_l 7207 12.9 1.014 1.673 1.014 1.673 cp_dbcsr_syevd 50 12.0 0.003 0.003 1.670 1.670 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 1.619 1.636 multiply_cannon_metrocomm3 27432 15.5 0.038 0.039 0.808 1.495 potential_pw2rs 119 12.3 0.008 0.009 1.490 1.495 acc_transpose_blocks 27432 15.5 0.110 0.113 1.453 1.468 cp_fm_diag_elpa 50 13.0 0.000 0.000 1.424 1.425 cp_fm_redistribute_end 50 14.0 0.941 1.401 0.942 1.401 fft_wrap_pw1pw2_140 487 13.2 0.084 0.093 1.354 1.392 cp_fm_diag_elpa_base 50 14.0 0.437 1.336 0.457 1.375 fft3d_ps 1201 14.6 0.533 0.588 1.346 1.369 grid_collocate_task_list 119 9.7 1.220 1.333 1.220 1.333 wfi_extrapolate 11 7.9 0.001 0.001 1.310 1.310 cp_dbcsr_plus_fm_fm_t_native 22 8.9 0.000 0.000 1.208 1.219 cp_fm_upper_to_full 72 13.5 0.823 1.146 0.823 1.146 qs_ot_get_orbitals 108 10.5 0.000 0.000 1.101 1.121 qs_energies_init_hamiltonians 11 5.9 0.001 0.001 1.110 1.112 dbcsr_complete_redistribute 329 12.2 0.126 0.162 0.799 1.081 multiply_cannon_sync_h2d 27432 15.5 1.001 1.066 1.001 1.066 make_images_data 4572 15.5 0.045 0.048 0.864 0.983 hybrid_alltoall_any 4725 16.4 0.062 0.151 0.730 0.906 build_core_hamiltonian_matrix_ 11 4.9 0.000 0.000 0.794 0.875 copy_fm_to_dbcsr 176 11.2 0.001 0.001 0.597 0.871 mp_alltoall_d11v 2130 13.8 0.747 0.848 0.747 0.848 acc_transpose_blocks_kernels 27432 16.5 0.272 0.280 0.824 0.834 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 0.799 0.805 cp_fm_cholesky_invert 11 10.9 0.756 0.759 0.756 0.759 mp_alltoall_i22 627 13.8 0.444 0.746 0.444 0.746 mp_alltoall_z22v 1201 16.6 0.695 0.717 0.695 0.717 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="106", plot="h2o_64_md", label="(8n/3r/4t)", y=34.701000, yerr=0.000000 PlotPoint: name="107", plot="h2o_64_md_mem", label="(8n/3r/4t)", y=531.272727, yerr=2.987578 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/07/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 26877100032 0.0% 0.0% 100.0% flops 9 x 9 x 32 44168260608 0.0% 0.0% 100.0% flops 22 x 9 x 32 53835724800 0.0% 0.0% 100.0% flops 9 x 22 x 32 53885500416 0.0% 0.0% 100.0% flops 32 x 32 x 9 63568871424 0.0% 0.0% 100.0% flops 22 x 22 x 32 67007283200 0.0% 0.0% 100.0% flops 32 x 32 x 22 77695287296 0.0% 0.0% 100.0% flops 9 x 32 x 32 78422999040 0.0% 0.0% 100.0% flops 22 x 32 x 32 95850332160 0.0% 0.0% 100.0% flops 9 x 32 x 9 266263676928 0.0% 0.0% 100.0% flops 22 x 32 x 9 326697440256 0.0% 0.0% 100.0% flops 9 x 32 x 22 326697440256 0.0% 0.0% 100.0% flops 22 x 32 x 22 399918497792 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 1.880888E+12 0.0% 0.0% 100.0% flops max/rank 117.977176E+09 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 146984760 0.0% 0.0% 100.0% number of processed stacks 1384136 0.0% 0.0% 100.0% average stack size 0.0 0.0 106.2 marketing flops 2.107587E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 605.483008E+06 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 219456 MPI messages size (bytes): total size 97.042514E+09 min size 0.000000E+00 max size 3.276800E+06 average size 442.195750E+03 MPI breakdown and total messages size (bytes): size <= 128 1452 0 128 < size <= 8192 0 0 8192 < size <= 32768 101892 3336634368 32768 < size <= 131072 0 0 131072 < size <= 4194304 116112 93705670464 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 14 12. MP_Allreduce 8156 20. MP_Alltoall 8655 64935. MP_ISend 36532 168375. MP_IRecv 36532 168349. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3672 62658. MP_Allreduce 10224 344. MP_Sync 104 MP_Alltoall 1582 3682667. MP_ISendRecv 10710 94533. MP_Wait 16690 MP_comm_split 50 MP_ISend 5200 225425. MP_IRecv 5200 225425. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.020 0.037 28.196 28.196 qs_mol_dyn_low 1 2.0 0.003 0.003 27.954 27.961 qs_forces 11 3.9 0.018 0.019 27.806 27.807 qs_energies 11 4.9 0.001 0.001 26.096 26.098 scf_env_do_scf 11 5.9 0.000 0.001 21.357 21.357 scf_env_do_scf_inner_loop 108 6.5 0.002 0.006 18.861 18.862 velocity_verlet 10 3.0 0.002 0.002 14.358 14.366 dbcsr_multiply_generic 2286 12.5 0.092 0.095 12.517 12.644 qs_scf_new_mos 108 7.5 0.001 0.001 11.293 11.318 qs_scf_loop_do_ot 108 8.5 0.001 0.001 11.293 11.317 ot_scf_mini 108 9.5 0.002 0.002 10.615 10.643 multiply_cannon 2286 13.5 0.231 0.237 9.900 10.350 multiply_cannon_loop 2286 14.5 0.331 0.343 8.988 9.228 multiply_cannon_multrec 9144 15.5 1.657 2.031 5.954 6.292 ot_mini 108 10.5 0.001 0.001 6.042 6.076 rebuild_ks_matrix 119 8.3 0.000 0.000 5.790 5.812 qs_ks_build_kohn_sham_matrix 119 9.3 0.012 0.013 5.790 5.812 qs_ks_update_qs_env 119 7.6 0.001 0.001 5.163 5.182 qs_ot_get_derivative 108 11.5 0.001 0.001 4.727 4.754 dbcsr_mm_accdrv_process 12550 15.8 3.250 4.208 4.196 4.272 sum_up_and_integrate 119 10.3 0.037 0.041 3.558 3.563 integrate_v_rspace 119 11.3 0.003 0.003 3.520 3.526 init_scf_run 11 5.9 0.000 0.001 3.310 3.310 scf_env_initial_rho_setup 11 6.9 0.001 0.001 3.310 3.310 qs_rho_update_rho_low 119 7.7 0.001 0.001 3.231 3.245 calculate_rho_elec 119 8.7 0.059 0.061 3.230 3.244 qs_ot_get_p 119 10.4 0.001 0.001 2.778 2.820 init_scf_loop 11 6.9 0.000 0.000 2.476 2.478 mp_waitall_1 121218 16.5 1.814 2.374 1.814 2.374 calculate_first_density_matrix 1 7.0 0.000 0.000 2.182 2.185 make_m2s 4572 13.5 0.034 0.035 1.791 1.960 grid_integrate_task_list 119 12.3 1.852 1.948 1.852 1.948 jit_kernel_multiply 10 15.7 0.908 1.896 0.908 1.896 make_images 4572 14.5 0.267 0.298 1.702 1.869 qs_ot_p2m_diag 50 11.0 0.022 0.023 1.815 1.817 calculate_dm_sparse 119 9.5 0.000 0.000 1.788 1.809 prepare_preconditioner 11 7.9 0.000 0.000 1.748 1.753 make_preconditioner 11 8.9 0.000 0.000 1.748 1.753 pw_transfer 1439 11.6 0.063 0.065 1.743 1.751 rs_pw_transfer 974 11.9 0.008 0.008 1.648 1.723 density_rs2pw 119 9.7 0.004 0.004 1.635 1.715 make_full_inverse_cholesky 11 9.9 0.000 0.000 1.638 1.663 fft_wrap_pw1pw2 1201 12.6 0.008 0.008 1.653 1.660 qs_ot_get_derivative_taylor 59 13.0 0.001 0.001 1.606 1.622 cp_dbcsr_syevd 50 12.0 0.003 0.003 1.619 1.620 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 1.574 1.576 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 1.403 1.413 grid_collocate_task_list 119 9.7 1.271 1.366 1.271 1.366 potential_pw2rs 119 12.3 0.010 0.010 1.339 1.344 fft_wrap_pw1pw2_140 487 13.2 0.082 0.087 1.331 1.341 cp_fm_diag_elpa 50 13.0 0.000 0.000 1.328 1.329 fft3d_ps 1201 14.6 0.538 0.551 1.319 1.328 cp_fm_redistribute_end 50 14.0 0.663 1.309 0.664 1.310 ot_diis_step 108 11.5 0.012 0.013 1.303 1.303 cp_fm_diag_elpa_base 50 14.0 0.601 1.237 0.644 1.291 qs_energies_init_hamiltonians 11 5.9 0.000 0.001 1.215 1.215 cp_dbcsr_plus_fm_fm_t_native 22 8.9 0.000 0.000 1.205 1.213 apply_preconditioner_dbcsr 119 12.6 0.000 0.000 1.173 1.193 apply_single 119 13.6 0.000 0.000 1.173 1.193 qs_ot_get_derivative_diag 49 12.0 0.001 0.001 1.152 1.165 wfi_extrapolate 11 7.9 0.001 0.001 1.082 1.082 hybrid_alltoall_any 4725 16.4 0.062 0.174 0.811 1.051 make_images_data 4572 15.5 0.039 0.042 0.827 1.030 acc_transpose_blocks 9144 15.5 0.038 0.039 0.944 0.949 multiply_cannon_metrocomm3 9144 15.5 0.018 0.019 0.446 0.934 build_core_hamiltonian_matrix_ 11 4.9 0.000 0.000 0.870 0.919 mp_alltoall_d11v 2130 13.8 0.813 0.912 0.813 0.912 cp_fm_cholesky_invert 11 10.9 0.889 0.891 0.889 0.891 multiply_cannon_sync_h2d 9144 15.5 0.711 0.789 0.711 0.789 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 0.750 0.752 qs_ot_get_orbitals 108 10.5 0.000 0.000 0.738 0.747 qs_env_update_s_mstruct 11 6.9 0.000 0.000 0.670 0.721 acc_transpose_blocks_kernels 9144 16.5 0.118 0.121 0.703 0.705 mp_alltoall_z22v 1201 16.6 0.655 0.689 0.655 0.689 mp_allgather_i34 2286 14.5 0.244 0.668 0.244 0.668 jit_kernel_transpose 5 15.6 0.584 0.587 0.584 0.587 mp_sum_l 7207 12.9 0.397 0.584 0.397 0.584 qs_create_task_list 11 7.9 0.000 0.001 0.547 0.573 generate_qs_task_list 11 8.9 0.190 0.213 0.547 0.573 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="108", plot="h2o_64_md", label="(8n/2r/6t)", y=28.196000, yerr=0.000000 PlotPoint: name="109", plot="h2o_64_md_mem", label="(8n/2r/6t)", y=572.000000, yerr=5.187397 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/08/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 26877100032 0.0% 0.0% 100.0% flops 9 x 9 x 32 44168260608 0.0% 0.0% 100.0% flops 22 x 9 x 32 53835724800 0.0% 0.0% 100.0% flops 9 x 22 x 32 53885500416 0.0% 0.0% 100.0% flops 32 x 32 x 9 63568871424 0.0% 0.0% 100.0% flops 22 x 22 x 32 67007283200 0.0% 0.0% 100.0% flops 32 x 32 x 22 77695287296 0.0% 0.0% 100.0% flops 9 x 32 x 32 78422999040 0.0% 0.0% 100.0% flops 22 x 32 x 32 95850332160 0.0% 0.0% 100.0% flops 9 x 32 x 9 266263676928 0.0% 0.0% 100.0% flops 22 x 32 x 9 326697440256 0.0% 0.0% 100.0% flops 9 x 32 x 22 326697440256 0.0% 0.0% 100.0% flops 22 x 32 x 22 399918497792 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 1.880888E+12 0.0% 0.0% 100.0% flops max/rank 235.585836E+09 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 146984760 0.0% 0.0% 100.0% number of processed stacks 1388964 0.0% 0.0% 100.0% average stack size 0.0 0.0 105.8 marketing flops 2.107587E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 739.733504E+06 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 91440 MPI messages size (bytes): total size 85.748679E+09 min size 0.000000E+00 max size 6.553600E+06 average size 937.758938E+03 MPI breakdown and total messages size (bytes): size <= 128 572 0 128 < size <= 8192 0 0 8192 < size <= 32768 21148 692256768 32768 < size <= 131072 19224 1259864064 131072 < size <= 4194304 41040 21941452800 4194304 < size <= 16777216 9456 61855174464 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3622 63729. MP_Allreduce 10074 433. MP_Sync 54 MP_Alltoall 1582 7383731. MP_ISendRecv 4998 189067. MP_Wait 8898 MP_ISend 3120 546875. MP_IRecv 3120 546875. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.015 0.034 40.606 40.607 qs_mol_dyn_low 1 2.0 0.003 0.003 40.401 40.408 qs_forces 11 3.9 0.002 0.003 40.328 40.330 qs_energies 11 4.9 0.001 0.002 38.390 38.394 scf_env_do_scf 11 5.9 0.000 0.001 32.697 32.697 scf_env_do_scf_inner_loop 108 6.5 0.003 0.006 24.703 24.704 velocity_verlet 10 3.0 0.002 0.002 23.005 23.010 dbcsr_multiply_generic 2286 12.5 0.101 0.103 17.411 17.589 qs_scf_new_mos 108 7.5 0.001 0.001 15.848 15.949 qs_scf_loop_do_ot 108 8.5 0.001 0.001 15.847 15.949 ot_scf_mini 108 9.5 0.002 0.002 14.773 14.877 multiply_cannon 2286 13.5 0.302 0.310 13.503 14.428 multiply_cannon_loop 2286 14.5 0.343 0.348 12.255 13.172 ot_mini 108 10.5 0.001 0.001 8.793 8.912 multiply_cannon_multrec 9144 15.5 3.368 4.787 8.546 8.670 init_scf_loop 11 6.9 0.000 0.000 7.967 7.969 prepare_preconditioner 11 7.9 0.000 0.000 7.050 7.063 make_preconditioner 11 8.9 0.000 0.000 7.050 7.063 rebuild_ks_matrix 119 8.3 0.000 0.000 6.799 6.947 qs_ks_build_kohn_sham_matrix 119 9.3 0.013 0.013 6.799 6.947 make_full_inverse_cholesky 11 9.9 0.000 0.000 5.559 6.932 qs_ot_get_derivative 108 11.5 0.001 0.001 6.754 6.856 dbcsr_mm_accdrv_process 12550 15.8 4.119 5.637 5.053 6.388 qs_ks_update_qs_env 119 7.6 0.001 0.001 6.159 6.292 cp_fm_upper_to_full 72 14.2 3.274 4.699 3.274 4.699 sum_up_and_integrate 119 10.3 0.064 0.066 3.667 3.674 init_scf_run 11 5.9 0.000 0.001 3.667 3.667 scf_env_initial_rho_setup 11 6.9 0.001 0.001 3.667 3.667 qs_rho_update_rho_low 119 7.7 0.001 0.001 3.660 3.664 calculate_rho_elec 119 8.7 0.118 0.121 3.659 3.664 integrate_v_rspace 119 11.3 0.003 0.003 3.603 3.610 mp_waitall_1 97218 16.6 2.509 3.410 2.509 3.410 qs_ot_get_p 119 10.4 0.001 0.001 3.243 3.380 dbcsr_complete_redistribute 329 12.2 0.289 0.292 2.056 2.896 qs_ot_get_derivative_taylor 59 13.0 0.001 0.001 2.456 2.887 copy_fm_to_dbcsr 176 11.2 0.001 0.001 1.724 2.565 make_m2s 4572 13.5 0.038 0.038 2.302 2.461 apply_preconditioner_dbcsr 119 12.6 0.000 0.000 2.203 2.446 apply_single 119 13.6 0.000 0.000 2.203 2.446 mp_alltoall_i22 627 13.8 1.526 2.364 1.526 2.364 make_images 4572 14.5 0.352 0.382 2.181 2.340 transfer_fm_to_dbcsr 11 9.9 0.000 0.000 1.486 2.320 calculate_first_density_matrix 1 7.0 0.000 0.000 2.241 2.242 calculate_dm_sparse 119 9.5 0.000 0.000 2.183 2.201 multiply_cannon_metrocomm3 9144 15.5 0.019 0.020 1.262 2.132 pw_transfer 1439 11.6 0.066 0.067 2.042 2.044 grid_integrate_task_list 119 12.3 2.004 2.029 2.004 2.029 ot_diis_step 108 11.5 0.014 0.014 2.011 2.012 qs_ot_p2m_diag 50 11.0 0.043 0.044 1.965 1.966 fft_wrap_pw1pw2 1201 12.6 0.008 0.009 1.947 1.949 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 1.938 1.939 mp_sum_l 7207 12.9 1.069 1.789 1.069 1.789 density_rs2pw 119 9.7 0.003 0.003 1.765 1.784 qs_energies_init_hamiltonians 11 5.9 0.001 0.001 1.747 1.749 qs_ot_get_derivative_diag 49 12.0 0.001 0.001 1.696 1.748 jit_kernel_multiply 10 15.6 0.908 1.717 0.908 1.717 cp_dbcsr_syevd 50 12.0 0.003 0.003 1.692 1.693 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 1.611 1.633 fft_wrap_pw1pw2_140 487 13.2 0.087 0.090 1.614 1.617 fft3d_ps 1201 14.6 0.567 0.575 1.582 1.584 cp_fm_cholesky_invert 11 10.9 1.536 1.539 1.536 1.539 grid_collocate_task_list 119 9.7 1.446 1.462 1.446 1.462 hybrid_alltoall_any 4725 16.4 0.087 0.149 1.158 1.406 cp_fm_diag_elpa 50 13.0 0.000 0.000 1.396 1.397 cp_fm_diag_elpa_base 50 14.0 1.248 1.302 1.394 1.395 rs_pw_transfer 974 11.9 0.009 0.009 1.356 1.391 wfi_extrapolate 11 7.9 0.001 0.001 1.367 1.367 make_images_data 4572 15.5 0.042 0.045 1.113 1.320 cp_dbcsr_plus_fm_fm_t_native 22 8.9 0.000 0.000 1.259 1.266 potential_pw2rs 119 12.3 0.014 0.014 1.212 1.214 mp_alltoall_d11v 2130 13.8 1.159 1.171 1.159 1.171 qs_ot_get_orbitals 108 10.5 0.000 0.000 1.141 1.161 qs_env_update_s_mstruct 11 6.9 0.000 0.000 1.083 1.097 build_core_hamiltonian_matrix_ 11 4.9 0.000 0.000 0.990 1.049 multiply_cannon_sync_h2d 9144 15.5 1.039 1.043 1.039 1.043 qs_create_task_list 11 7.9 0.000 0.000 0.941 0.951 generate_qs_task_list 11 8.9 0.372 0.392 0.940 0.950 acc_transpose_blocks 9144 15.5 0.038 0.039 0.916 0.927 mp_alltoall_z22v 1201 16.6 0.880 0.901 0.880 0.901 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 0.853 0.867 copy_dbcsr_to_fm 153 11.3 0.002 0.002 0.783 0.827 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="110", plot="h2o_64_md", label="(8n/1r/12t)", y=40.607000, yerr=0.000000 PlotPoint: name="111", plot="h2o_64_md_mem", label="(8n/1r/12t)", y=696.363636, yerr=11.475450 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/09/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 184415158272 0.0% 0.0% 100.0% flops 9 x 9 x 32 269180485632 0.0% 0.0% 100.0% flops 9 x 22 x 32 349395425280 0.0% 0.0% 100.0% flops 22 x 9 x 32 350042406912 0.0% 0.0% 100.0% flops 22 x 22 x 32 453581815808 0.0% 0.0% 100.0% flops 32 x 32 x 9 465064427520 0.0% 0.0% 100.0% flops 32 x 32 x 22 568412078080 0.0% 0.0% 100.0% flops 9 x 32 x 32 572195340288 0.0% 0.0% 100.0% flops 22 x 32 x 32 699349860352 0.0% 0.0% 100.0% flops 9 x 32 x 9 1735942275072 0.0% 0.0% 100.0% flops 22 x 32 x 9 2216407818240 0.0% 0.0% 100.0% flops 9 x 32 x 22 2216407818240 0.0% 0.0% 100.0% flops 22 x 32 x 22 2803661053952 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 12.884056E+12 0.0% 0.0% 100.0% flops max/rank 198.287135E+09 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 984178160 0.0% 0.0% 100.0% number of processed stacks 8410880 0.0% 0.0% 100.0% average stack size 0.0 0.0 117.0 marketing flops 15.646302E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 499.634176E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 8483040 MPI messages size (bytes): total size 1.160510E+12 min size 0.000000E+00 max size 1.161504E+06 average size 136.803609E+03 MPI breakdown and total messages size (bytes): size <= 128 1836752 0 128 < size <= 8192 1040592 8524529664 8192 < size <= 32768 1486976 24362614784 32768 < size <= 131072 2491776 216971345920 131072 < size <= 4194304 1626944 910632720448 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3521 65372. MP_Allreduce 9840 486. MP_Sync 100 MP_Alltoall 1938 1402082. MP_ISendRecv 41800 9096. MP_Wait 58168 MP_comm_split 48 MP_ISend 14300 82312. MP_IRecv 14300 82312. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.028 0.095 81.040 81.073 qs_mol_dyn_low 1 2.0 0.003 0.003 80.670 80.689 qs_forces 11 3.9 0.015 0.019 80.280 80.281 qs_energies 11 4.9 0.001 0.002 77.473 77.492 scf_env_do_scf 11 5.9 0.000 0.001 68.557 68.560 scf_env_do_scf_inner_loop 99 6.5 0.002 0.008 63.036 63.036 dbcsr_multiply_generic 2055 12.4 0.104 0.108 50.936 51.220 qs_scf_new_mos 99 7.5 0.000 0.001 46.356 46.499 qs_scf_loop_do_ot 99 8.5 0.000 0.001 46.356 46.499 ot_scf_mini 99 9.5 0.002 0.002 43.961 44.105 multiply_cannon 2055 13.4 0.181 0.192 42.034 42.864 multiply_cannon_loop 2055 14.4 1.522 1.565 41.072 41.964 velocity_verlet 10 3.0 0.002 0.002 40.548 40.552 ot_mini 99 10.5 0.001 0.001 26.538 26.627 qs_ot_get_derivative 99 11.5 0.001 0.001 19.804 19.907 multiply_cannon_multrec 49320 15.4 12.377 13.478 17.355 18.251 rebuild_ks_matrix 110 8.3 0.000 0.001 14.263 14.400 qs_ks_build_kohn_sham_matrix 110 9.3 0.011 0.013 14.263 14.399 qs_ks_update_qs_env 110 7.6 0.001 0.001 12.533 12.657 mp_waitall_1 241148 16.1 10.767 11.481 10.767 11.481 multiply_cannon_sync_h2d 49320 15.4 10.263 10.923 10.263 10.923 qs_ot_get_p 110 10.4 0.001 0.001 9.006 9.095 multiply_cannon_metrocomm3 49320 15.4 0.079 0.083 6.436 7.861 apply_preconditioner_dbcsr 110 12.6 0.000 0.000 7.160 7.583 apply_single 110 13.6 0.000 0.000 7.160 7.583 qs_ot_get_derivative_taylor 52 13.0 0.001 0.001 6.380 6.914 sum_up_and_integrate 110 10.3 0.037 0.044 6.898 6.912 integrate_v_rspace 110 11.3 0.002 0.003 6.861 6.884 qs_ot_get_derivative_diag 47 12.0 0.001 0.001 6.671 6.728 init_scf_run 11 5.9 0.000 0.001 6.710 6.711 scf_env_initial_rho_setup 11 6.9 0.001 0.001 6.710 6.710 ot_diis_step 99 11.5 0.005 0.006 6.522 6.522 qs_rho_update_rho_low 110 7.6 0.001 0.001 6.081 6.206 calculate_rho_elec 110 8.6 0.020 0.024 6.081 6.205 qs_ot_p2m_diag 48 11.0 0.012 0.019 5.912 5.943 init_scf_loop 11 6.9 0.000 0.000 5.494 5.495 mp_sum_l 6514 12.8 4.637 5.376 4.637 5.376 dbcsr_mm_accdrv_process 87628 16.1 1.970 2.060 4.855 5.146 cp_dbcsr_syevd 48 12.0 0.002 0.003 4.981 4.982 cp_fm_diag_elpa 48 13.0 0.000 0.000 4.521 4.522 cp_fm_redistribute_end 48 14.0 3.940 4.497 3.944 4.498 cp_fm_diag_elpa_base 48 14.0 0.549 4.408 0.552 4.430 wfi_extrapolate 11 7.9 0.001 0.001 3.977 3.977 make_m2s 4110 13.4 0.061 0.065 3.827 3.943 rs_pw_transfer 902 11.9 0.012 0.014 3.723 3.914 calculate_dm_sparse 110 9.5 0.000 0.001 3.761 3.866 make_images 4110 14.4 0.177 0.192 3.732 3.848 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 3.587 3.591 density_rs2pw 110 9.6 0.004 0.004 3.194 3.421 grid_integrate_task_list 110 12.3 3.260 3.405 3.260 3.405 multiply_cannon_metrocomm1 49320 15.4 0.061 0.064 2.323 3.404 prepare_preconditioner 11 7.9 0.000 0.000 3.342 3.354 make_preconditioner 11 8.9 0.000 0.000 3.342 3.354 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 3.236 3.295 qs_ot_get_orbitals 99 10.5 0.000 0.001 3.170 3.209 make_full_inverse_cholesky 11 9.9 0.000 0.000 3.115 3.159 pw_transfer 1331 11.6 0.053 0.062 2.772 2.830 jit_kernel_multiply 13 15.9 2.609 2.775 2.609 2.775 fft_wrap_pw1pw2 1111 12.6 0.007 0.008 2.685 2.746 calculate_first_density_matrix 1 7.0 0.000 0.000 2.650 2.655 potential_pw2rs 110 12.3 0.006 0.007 2.515 2.535 mp_alltoall_d11v 2046 13.8 2.030 2.515 2.030 2.515 fft_wrap_pw1pw2_140 451 13.1 0.169 0.190 2.258 2.319 fft3d_ps 1111 14.6 0.750 0.833 2.223 2.274 grid_collocate_task_list 110 9.6 2.086 2.216 2.086 2.216 acc_transpose_blocks 49320 15.4 0.200 0.211 2.085 2.153 mp_waitany 14300 13.8 1.878 2.139 1.878 2.139 mp_sum_d 3883 11.9 1.377 1.911 1.377 1.911 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 1.869 1.890 make_images_data 4110 15.4 0.044 0.046 1.745 1.876 cp_fm_cholesky_invert 11 10.9 1.833 1.837 1.833 1.837 hybrid_alltoall_any 4261 16.3 0.082 0.483 1.508 1.808 cp_dbcsr_plus_fm_fm_t_native 22 8.9 0.001 0.001 1.633 1.658 qs_energies_init_hamiltonians 11 5.9 0.000 0.001 1.616 1.623 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="200", plot="h2o_128_md", label="(8n/12r/1t)", y=81.073000, yerr=0.000000 PlotPoint: name="201", plot="h2o_128_md_mem", label="(8n/12r/1t)", y=475.545455, yerr=1.876342 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/10/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 184415158272 0.0% 0.0% 100.0% flops 9 x 9 x 32 269180485632 0.0% 0.0% 100.0% flops 9 x 22 x 32 349395425280 0.0% 0.0% 100.0% flops 22 x 9 x 32 350042406912 0.0% 0.0% 100.0% flops 22 x 22 x 32 453581815808 0.0% 0.0% 100.0% flops 32 x 32 x 9 465064427520 0.0% 0.0% 100.0% flops 32 x 32 x 22 568412078080 0.0% 0.0% 100.0% flops 9 x 32 x 32 572195340288 0.0% 0.0% 100.0% flops 22 x 32 x 32 699349860352 0.0% 0.0% 100.0% flops 9 x 32 x 9 1735942275072 0.0% 0.0% 100.0% flops 22 x 32 x 9 2216407818240 0.0% 0.0% 100.0% flops 9 x 32 x 22 2216407818240 0.0% 0.0% 100.0% flops 22 x 32 x 22 2803661053952 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 12.884056E+12 0.0% 0.0% 100.0% flops max/rank 390.715586E+09 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 984178160 0.0% 0.0% 100.0% number of processed stacks 5019072 0.0% 0.0% 100.0% average stack size 0.0 0.0 196.1 marketing flops 15.646302E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 584.409088E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 1972800 MPI messages size (bytes): total size 1.077520E+12 min size 0.000000E+00 max size 4.537280E+06 average size 546.188250E+03 MPI breakdown and total messages size (bytes): size <= 128 14916 0 128 < size <= 8192 222984 1826684928 8192 < size <= 32768 520356 13399818240 32768 < size <= 131072 372336 35386294272 131072 < size <= 4194304 787758 788321309808 4194304 < size <= 16777216 54450 238588003280 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3521 65587. MP_Allreduce 9839 562. MP_Sync 100 MP_Alltoall 1717 3583165. MP_ISendRecv 20680 26400. MP_Wait 32692 MP_comm_split 48 MP_ISend 10164 155761. MP_IRecv 10164 155761. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.018 0.037 68.004 68.005 qs_mol_dyn_low 1 2.0 0.003 0.005 67.721 67.730 qs_forces 11 3.9 0.002 0.003 67.519 67.521 qs_energies 11 4.9 0.001 0.003 64.224 64.228 scf_env_do_scf 11 5.9 0.000 0.001 55.695 55.698 scf_env_do_scf_inner_loop 99 6.5 0.002 0.007 48.091 48.092 dbcsr_multiply_generic 2055 12.4 0.113 0.117 37.538 37.748 velocity_verlet 10 3.0 0.001 0.001 35.940 35.944 qs_scf_new_mos 99 7.5 0.001 0.001 32.331 32.467 qs_scf_loop_do_ot 99 8.5 0.001 0.001 32.330 32.467 multiply_cannon 2055 13.4 0.221 0.242 30.962 32.125 ot_scf_mini 99 9.5 0.003 0.003 30.676 30.817 multiply_cannon_loop 2055 14.4 0.929 0.948 29.665 30.459 ot_mini 99 10.5 0.001 0.001 18.104 18.251 multiply_cannon_multrec 24660 15.4 7.670 9.135 13.924 15.529 rebuild_ks_matrix 110 8.3 0.000 0.001 13.568 13.644 qs_ks_build_kohn_sham_matrix 110 9.3 0.012 0.014 13.568 13.644 qs_ot_get_derivative 99 11.5 0.001 0.001 12.323 12.466 qs_ks_update_qs_env 110 7.6 0.001 0.001 11.968 12.043 mp_waitall_1 186928 16.3 7.706 9.950 7.706 9.950 multiply_cannon_sync_h2d 24660 15.4 7.028 8.288 7.028 8.288 multiply_cannon_metrocomm3 24660 15.4 0.071 0.073 5.062 7.740 init_scf_loop 11 6.9 0.000 0.000 7.570 7.570 apply_preconditioner_dbcsr 110 12.6 0.000 0.000 6.483 7.179 apply_single 110 13.6 0.000 0.001 6.483 7.179 sum_up_and_integrate 110 10.3 0.052 0.060 6.491 6.504 integrate_v_rspace 110 11.3 0.002 0.003 6.438 6.453 dbcsr_mm_accdrv_process 52282 16.1 4.604 5.427 6.093 6.421 qs_ot_get_p 110 10.4 0.001 0.001 5.944 6.121 init_scf_run 11 5.9 0.000 0.001 6.117 6.118 scf_env_initial_rho_setup 11 6.9 0.001 0.001 6.117 6.118 ot_diis_step 99 11.5 0.010 0.010 5.733 5.733 qs_rho_update_rho_low 110 7.6 0.001 0.001 5.721 5.730 calculate_rho_elec 110 8.6 0.039 0.047 5.720 5.730 prepare_preconditioner 11 7.9 0.000 0.000 5.560 5.579 make_preconditioner 11 8.9 0.000 0.000 5.560 5.579 qs_ot_get_derivative_taylor 52 13.0 0.001 0.001 4.640 5.336 make_full_inverse_cholesky 11 9.9 0.000 0.000 5.121 5.277 make_m2s 4110 13.4 0.057 0.060 4.176 4.657 make_images 4110 14.4 0.399 0.445 4.067 4.545 qs_ot_p2m_diag 48 11.0 0.028 0.044 4.071 4.091 cp_dbcsr_syevd 48 12.0 0.003 0.003 3.647 3.648 wfi_extrapolate 11 7.9 0.001 0.001 3.511 3.511 pw_transfer 1331 11.6 0.065 0.070 3.266 3.403 grid_integrate_task_list 110 12.3 3.160 3.318 3.160 3.318 fft_wrap_pw1pw2 1111 12.6 0.008 0.009 3.160 3.295 density_rs2pw 110 9.6 0.004 0.004 3.010 3.254 qs_ot_get_derivative_diag 47 12.0 0.001 0.001 3.161 3.239 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.003 3.232 3.234 rs_pw_transfer 902 11.9 0.012 0.013 2.951 3.146 cp_fm_diag_elpa 48 13.0 0.000 0.000 3.122 3.124 cp_fm_redistribute_end 48 14.0 2.330 3.098 2.332 3.098 cp_fm_diag_elpa_base 48 14.0 0.733 2.979 0.764 3.061 calculate_dm_sparse 110 9.5 0.001 0.001 3.006 3.035 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 2.886 2.930 make_images_data 4110 15.4 0.047 0.051 2.340 2.841 fft_wrap_pw1pw2_140 451 13.1 0.200 0.218 2.667 2.802 hybrid_alltoall_any 4261 16.3 0.101 0.442 2.041 2.782 fft3d_ps 1111 14.6 1.073 1.268 2.517 2.637 cp_fm_cholesky_invert 11 10.9 2.580 2.586 2.580 2.586 calculate_first_density_matrix 1 7.0 0.000 0.002 2.520 2.522 mp_sum_l 6514 12.8 1.782 2.411 1.782 2.411 potential_pw2rs 110 12.3 0.008 0.009 2.354 2.368 jit_kernel_multiply 12 16.3 1.140 2.363 1.140 2.363 grid_collocate_task_list 110 9.6 2.121 2.260 2.121 2.260 mp_alltoall_d11v 2046 13.8 1.783 2.041 1.783 2.041 qs_ot_get_orbitals 99 10.5 0.001 0.001 1.951 1.965 qs_energies_init_hamiltonians 11 5.9 0.000 0.001 1.900 1.902 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 1.763 1.776 multiply_cannon_metrocomm4 22605 15.4 0.074 0.077 0.780 1.726 mp_allgather_i34 2055 14.4 0.626 1.636 0.626 1.636 mp_irecv_dv 57340 16.2 0.654 1.606 0.654 1.606 cp_dbcsr_plus_fm_fm_t_native 22 8.9 0.001 0.001 1.584 1.594 acc_transpose_blocks 24660 15.4 0.106 0.110 1.512 1.532 build_core_hamiltonian_matrix_ 11 4.9 0.000 0.001 1.373 1.484 dbcsr_complete_redistribute 325 12.2 0.251 0.320 1.172 1.440 mp_waitany 10164 13.8 1.204 1.416 1.204 1.416 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="202", plot="h2o_128_md", label="(8n/6r/2t)", y=68.005000, yerr=0.000000 PlotPoint: name="203", plot="h2o_128_md_mem", label="(8n/6r/2t)", y=553.545455, yerr=6.358439 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/11/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 184415158272 0.0% 0.0% 100.0% flops 9 x 9 x 32 269180485632 0.0% 0.0% 100.0% flops 9 x 22 x 32 349395425280 0.0% 0.0% 100.0% flops 22 x 9 x 32 350042406912 0.0% 0.0% 100.0% flops 22 x 22 x 32 453581815808 0.0% 0.0% 100.0% flops 32 x 32 x 9 465064427520 0.0% 0.0% 100.0% flops 32 x 32 x 22 568412078080 0.0% 0.0% 100.0% flops 9 x 32 x 32 572195340288 0.0% 0.0% 100.0% flops 22 x 32 x 32 699349860352 0.0% 0.0% 100.0% flops 9 x 32 x 9 1735942275072 0.0% 0.0% 100.0% flops 22 x 32 x 9 2216407818240 0.0% 0.0% 100.0% flops 9 x 32 x 22 2216407818240 0.0% 0.0% 100.0% flops 22 x 32 x 22 2803661053952 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 12.884056E+12 0.0% 0.0% 100.0% flops max/rank 404.681598E+09 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 984178160 0.0% 0.0% 100.0% number of processed stacks 3346752 0.0% 0.0% 100.0% average stack size 0.0 0.0 294.1 marketing flops 15.646297E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 659.628032E+06 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 854880 MPI messages size (bytes): total size 708.322787E+09 min size 0.000000E+00 max size 6.553600E+06 average size 828.564000E+03 MPI breakdown and total messages size (bytes): size <= 128 6424 0 128 < size <= 8192 0 0 8192 < size <= 32768 222984 7302414336 32768 < size <= 131072 153888 10085203968 131072 < size <= 4194304 389376 200257044480 4194304 < size <= 16777216 82208 490679162176 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3521 65578. MP_Allreduce 9838 559. MP_Sync 100 MP_Alltoall 1496 4511006. MP_ISendRecv 13640 27424. MP_Wait 32318 MP_comm_split 48 MP_ISend 17072 115022. MP_IRecv 17072 115022. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.022 0.037 58.375 58.376 qs_mol_dyn_low 1 2.0 0.003 0.003 58.062 58.072 qs_forces 11 3.9 0.002 0.002 57.997 57.998 qs_energies 11 4.9 0.001 0.002 54.866 54.870 scf_env_do_scf 11 5.9 0.000 0.001 47.066 47.066 scf_env_do_scf_inner_loop 99 6.5 0.002 0.006 38.697 38.697 velocity_verlet 10 3.0 0.001 0.002 31.850 31.851 dbcsr_multiply_generic 2055 12.4 0.108 0.111 28.030 28.265 qs_scf_new_mos 99 7.5 0.001 0.001 24.426 24.530 qs_scf_loop_do_ot 99 8.5 0.001 0.001 24.425 24.530 ot_scf_mini 99 9.5 0.002 0.003 23.219 23.337 multiply_cannon 2055 13.4 0.212 0.220 21.890 23.174 multiply_cannon_loop 2055 14.4 0.618 0.634 20.733 21.641 ot_mini 99 10.5 0.001 0.001 13.265 13.381 rebuild_ks_matrix 110 8.3 0.000 0.000 12.045 12.177 qs_ks_build_kohn_sham_matrix 110 9.3 0.012 0.013 12.045 12.177 qs_ks_update_qs_env 110 7.6 0.001 0.001 10.621 10.740 multiply_cannon_multrec 16440 15.4 3.958 5.027 9.696 10.677 mp_waitall_1 146766 16.3 6.884 9.741 6.884 9.741 qs_ot_get_derivative 99 11.5 0.001 0.001 8.868 8.988 init_scf_loop 11 6.9 0.000 0.000 8.334 8.334 multiply_cannon_metrocomm3 16440 15.4 0.043 0.044 4.143 6.825 prepare_preconditioner 11 7.9 0.000 0.000 6.579 6.596 make_preconditioner 11 8.9 0.000 0.000 6.579 6.596 sum_up_and_integrate 110 10.3 0.060 0.061 6.314 6.328 integrate_v_rspace 110 11.3 0.002 0.003 6.254 6.268 make_full_inverse_cholesky 11 9.9 0.000 0.000 5.915 6.250 dbcsr_mm_accdrv_process 34862 16.1 4.651 5.270 5.591 5.820 qs_rho_update_rho_low 110 7.6 0.001 0.001 5.416 5.424 calculate_rho_elec 110 8.6 0.058 0.058 5.415 5.424 init_scf_run 11 5.9 0.000 0.001 5.359 5.359 scf_env_initial_rho_setup 11 6.9 0.001 0.001 5.359 5.359 apply_preconditioner_dbcsr 110 12.6 0.000 0.000 4.911 5.334 apply_single 110 13.6 0.000 0.000 4.911 5.333 qs_ot_get_p 110 10.4 0.001 0.001 5.182 5.328 make_m2s 4110 13.4 0.050 0.052 4.116 4.482 make_images 4110 14.4 0.391 0.511 4.001 4.366 ot_diis_step 99 11.5 0.011 0.011 4.364 4.364 multiply_cannon_sync_h2d 16440 15.4 3.714 4.293 3.714 4.293 qs_ot_get_derivative_taylor 52 13.0 0.001 0.001 3.036 3.669 qs_ot_p2m_diag 48 11.0 0.041 0.044 3.623 3.628 grid_integrate_task_list 110 12.3 3.188 3.370 3.188 3.370 cp_dbcsr_syevd 48 12.0 0.003 0.003 3.293 3.294 pw_transfer 1331 11.6 0.064 0.070 3.128 3.136 fft_wrap_pw1pw2 1111 12.6 0.008 0.008 3.022 3.034 density_rs2pw 110 9.6 0.004 0.004 2.712 2.964 wfi_extrapolate 11 7.9 0.001 0.001 2.923 2.923 make_images_data 4110 15.4 0.044 0.048 2.422 2.868 rs_pw_transfer 902 11.9 0.010 0.011 2.581 2.821 cp_fm_diag_elpa 48 13.0 0.000 0.000 2.797 2.798 hybrid_alltoall_any 4261 16.3 0.105 0.373 2.147 2.791 cp_fm_redistribute_end 48 14.0 1.740 2.775 1.742 2.775 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 2.749 2.751 cp_fm_diag_elpa_base 48 14.0 0.972 2.645 1.030 2.749 cp_fm_cholesky_invert 11 10.9 2.606 2.612 2.606 2.612 fft_wrap_pw1pw2_140 451 13.1 0.211 0.226 2.596 2.606 calculate_dm_sparse 110 9.5 0.001 0.001 2.483 2.518 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 2.428 2.472 grid_collocate_task_list 110 9.6 2.179 2.383 2.179 2.383 calculate_first_density_matrix 1 7.0 0.000 0.000 2.354 2.355 fft3d_ps 1111 14.6 1.066 1.077 2.336 2.348 qs_ot_get_derivative_diag 47 12.0 0.001 0.001 2.267 2.336 multiply_cannon_metrocomm4 14385 15.4 0.045 0.049 0.867 2.306 mp_irecv_dv 48980 15.7 0.796 2.180 0.796 2.180 potential_pw2rs 110 12.3 0.011 0.011 2.151 2.158 mp_alltoall_d11v 2046 13.8 1.732 2.064 1.732 2.064 qs_energies_init_hamiltonians 11 5.9 0.000 0.001 2.013 2.014 mp_sum_l 6514 12.8 1.413 1.993 1.413 1.993 dbcsr_complete_redistribute 325 12.2 0.328 0.362 1.378 1.831 cp_fm_upper_to_full 70 13.6 1.367 1.826 1.367 1.826 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 1.623 1.638 mp_allgather_i34 2055 14.4 0.478 1.575 0.478 1.575 cp_fm_cholesky_decompose 22 10.9 1.522 1.539 1.522 1.539 cp_dbcsr_plus_fm_fm_t_native 22 8.9 0.001 0.001 1.456 1.468 build_core_hamiltonian_matrix_ 11 4.9 0.000 0.001 1.366 1.466 copy_fm_to_dbcsr 174 11.2 0.001 0.001 0.964 1.399 mp_waitany 17072 13.8 1.150 1.337 1.150 1.337 qs_ot_get_orbitals 99 10.5 0.001 0.001 1.236 1.244 acc_transpose_blocks 16440 15.4 0.072 0.074 1.204 1.215 qs_env_update_s_mstruct 11 6.9 0.000 0.000 1.112 1.209 rs_gather_matrices 110 12.3 0.138 0.150 0.866 1.183 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="204", plot="h2o_128_md", label="(8n/4r/3t)", y=58.376000, yerr=0.000000 PlotPoint: name="205", plot="h2o_128_md_mem", label="(8n/4r/3t)", y=624.545455, yerr=8.886800 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/12/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 184415158272 0.0% 0.0% 100.0% flops 9 x 9 x 32 269180485632 0.0% 0.0% 100.0% flops 9 x 22 x 32 349395425280 0.0% 0.0% 100.0% flops 22 x 9 x 32 350042406912 0.0% 0.0% 100.0% flops 22 x 22 x 32 453581815808 0.0% 0.0% 100.0% flops 32 x 32 x 9 465064427520 0.0% 0.0% 100.0% flops 32 x 32 x 22 568412078080 0.0% 0.0% 100.0% flops 9 x 32 x 32 572195340288 0.0% 0.0% 100.0% flops 22 x 32 x 32 699349860352 0.0% 0.0% 100.0% flops 9 x 32 x 9 1735942275072 0.0% 0.0% 100.0% flops 22 x 32 x 9 2216407818240 0.0% 0.0% 100.0% flops 9 x 32 x 22 2216407818240 0.0% 0.0% 100.0% flops 22 x 32 x 22 2803661053952 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 12.884056E+12 0.0% 0.0% 100.0% flops max/rank 601.317074E+09 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 984178160 0.0% 0.0% 100.0% number of processed stacks 4916280 0.0% 0.0% 100.0% average stack size 0.0 0.0 200.2 marketing flops 15.646302E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 726.630400E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 937080 MPI messages size (bytes): total size 523.723932E+09 min size 0.000000E+00 max size 4.537280E+06 average size 558.889250E+03 MPI breakdown and total messages size (bytes): size <= 128 6996 0 128 < size <= 8192 264 2162688 8192 < size <= 32768 304932 8165326848 32768 < size <= 131072 110640 6338641920 131072 < size <= 4194304 489498 400769458320 4194304 < size <= 16777216 24750 108449092400 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3521 65576. MP_Allreduce 9838 600. MP_Sync 100 MP_Alltoall 1496 5863162. MP_ISendRecv 10120 43184. MP_Wait 25102 MP_comm_split 48 MP_ISend 13376 163145. MP_IRecv 13376 163145. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.053 0.167 64.598 64.599 qs_mol_dyn_low 1 2.0 0.003 0.003 64.053 64.062 qs_forces 11 3.9 0.002 0.004 63.989 63.990 qs_energies 11 4.9 0.004 0.020 60.605 60.609 scf_env_do_scf 11 5.9 0.001 0.001 51.980 51.983 scf_env_do_scf_inner_loop 99 6.5 0.003 0.006 40.293 40.293 velocity_verlet 10 3.0 0.002 0.002 36.574 36.576 dbcsr_multiply_generic 2055 12.4 0.118 0.126 29.454 29.669 qs_scf_new_mos 99 7.5 0.001 0.001 25.953 26.061 qs_scf_loop_do_ot 99 8.5 0.001 0.001 25.952 26.061 ot_scf_mini 99 9.5 0.003 0.004 24.341 24.447 multiply_cannon 2055 13.4 0.245 0.263 22.552 23.792 multiply_cannon_loop 2055 14.4 0.879 0.903 20.981 21.537 ot_mini 99 10.5 0.001 0.001 13.964 14.094 multiply_cannon_multrec 24660 15.4 4.195 6.654 12.606 13.885 rebuild_ks_matrix 110 8.3 0.000 0.000 11.888 12.016 qs_ks_build_kohn_sham_matrix 110 9.3 0.014 0.027 11.888 12.015 init_scf_loop 11 6.9 0.000 0.001 11.645 11.646 qs_ks_update_qs_env 110 7.6 0.001 0.001 10.508 10.621 prepare_preconditioner 11 7.9 0.000 0.000 9.925 9.940 make_preconditioner 11 8.9 0.000 0.001 9.925 9.940 qs_ot_get_derivative 99 11.5 0.001 0.001 9.809 9.920 make_full_inverse_cholesky 11 9.9 0.000 0.000 8.112 9.610 dbcsr_mm_accdrv_process 52304 16.0 6.926 8.544 8.264 9.219 mp_waitall_1 126806 16.4 4.312 6.412 4.312 6.412 sum_up_and_integrate 110 10.3 0.068 0.071 6.262 6.279 integrate_v_rspace 110 11.3 0.003 0.003 6.193 6.209 make_m2s 4110 13.4 0.059 0.062 5.291 5.800 make_images 4110 14.4 0.578 0.700 5.152 5.657 qs_rho_update_rho_low 110 7.6 0.001 0.001 5.589 5.602 calculate_rho_elec 110 8.6 0.077 0.081 5.589 5.602 init_scf_run 11 5.9 0.000 0.001 5.563 5.563 scf_env_initial_rho_setup 11 6.9 0.001 0.002 5.562 5.563 qs_ot_get_p 110 10.4 0.001 0.001 5.347 5.508 cp_fm_upper_to_full 70 13.8 3.365 4.784 3.365 4.784 ot_diis_step 99 11.5 0.011 0.012 4.119 4.119 apply_preconditioner_dbcsr 110 12.6 0.000 0.000 4.026 4.114 apply_single 110 13.6 0.000 0.000 4.026 4.113 dbcsr_complete_redistribute 325 12.2 0.420 0.473 2.748 3.914 qs_ot_p2m_diag 48 11.0 0.055 0.064 3.573 3.587 grid_integrate_task_list 110 12.3 3.280 3.427 3.280 3.427 copy_fm_to_dbcsr 174 11.2 0.004 0.029 2.205 3.352 qs_ot_get_derivative_taylor 52 13.0 0.001 0.001 3.273 3.332 multiply_cannon_sync_h2d 24660 15.4 3.200 3.324 3.200 3.324 pw_transfer 1331 11.6 0.064 0.072 3.209 3.232 hybrid_alltoall_any 4261 16.3 0.120 0.456 2.273 3.160 make_images_data 4110 15.4 0.046 0.050 2.642 3.136 multiply_cannon_metrocomm3 24660 15.4 0.035 0.036 1.368 3.131 fft_wrap_pw1pw2 1111 12.6 0.008 0.008 3.104 3.131 cp_dbcsr_syevd 48 12.0 0.003 0.003 3.101 3.102 qs_ot_get_derivative_diag 47 12.0 0.001 0.001 2.998 3.060 wfi_extrapolate 11 7.9 0.001 0.001 2.954 2.954 transfer_fm_to_dbcsr 11 9.9 0.000 0.000 1.805 2.939 calculate_dm_sparse 110 9.5 0.001 0.001 2.900 2.933 mp_alltoall_i22 605 13.7 1.708 2.914 1.708 2.914 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 2.902 2.904 density_rs2pw 110 9.6 0.004 0.004 2.751 2.864 fft_wrap_pw1pw2_140 451 13.1 0.202 0.212 2.639 2.669 rs_pw_transfer 902 11.9 0.010 0.011 2.472 2.659 cp_fm_cholesky_invert 11 10.9 2.642 2.651 2.642 2.651 cp_fm_diag_elpa 48 13.0 0.000 0.000 2.579 2.581 cp_fm_redistribute_end 48 14.0 1.285 2.554 1.286 2.555 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 2.494 2.540 cp_fm_diag_elpa_base 48 14.0 1.186 2.425 1.265 2.535 qs_energies_init_hamiltonians 11 5.9 0.035 0.073 2.529 2.529 calculate_first_density_matrix 1 7.0 0.001 0.003 2.524 2.528 fft3d_ps 1111 14.6 1.062 1.097 2.408 2.423 grid_collocate_task_list 110 9.6 2.223 2.421 2.223 2.421 potential_pw2rs 110 12.3 0.012 0.013 2.035 2.043 jit_kernel_multiply 11 15.8 1.011 2.013 1.011 2.013 mp_alltoall_d11v 2046 13.8 1.772 1.958 1.772 1.958 qs_ot_get_orbitals 99 10.5 0.001 0.001 1.690 1.726 build_core_hamiltonian_matrix_ 11 4.9 0.000 0.001 1.605 1.715 cp_fm_cholesky_decompose 22 10.9 1.631 1.679 1.631 1.679 mp_allgather_i34 2055 14.4 0.670 1.670 0.670 1.670 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 1.624 1.636 acc_transpose_blocks 24660 15.4 0.104 0.106 1.560 1.603 qs_env_update_s_mstruct 11 6.9 0.000 0.001 1.380 1.546 cp_dbcsr_plus_fm_fm_t_native 22 8.9 0.001 0.001 1.524 1.542 multiply_cannon_metrocomm4 20550 15.4 0.056 0.060 0.848 1.534 mp_irecv_dv 62702 16.1 0.749 1.457 0.749 1.457 mp_sum_l 6514 12.8 0.962 1.453 0.962 1.453 mp_waitany 13376 13.8 1.138 1.377 1.138 1.377 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="206", plot="h2o_128_md", label="(8n/3r/4t)", y=64.599000, yerr=0.000000 PlotPoint: name="207", plot="h2o_128_md_mem", label="(8n/3r/4t)", y=685.727273, yerr=7.556946 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/13/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 184415158272 0.0% 0.0% 100.0% flops 9 x 9 x 32 269180485632 0.0% 0.0% 100.0% flops 9 x 22 x 32 349395425280 0.0% 0.0% 100.0% flops 22 x 9 x 32 350042406912 0.0% 0.0% 100.0% flops 22 x 22 x 32 453581815808 0.0% 0.0% 100.0% flops 32 x 32 x 9 465064427520 0.0% 0.0% 100.0% flops 32 x 32 x 22 568412078080 0.0% 0.0% 100.0% flops 9 x 32 x 32 572195340288 0.0% 0.0% 100.0% flops 22 x 32 x 32 699349860352 0.0% 0.0% 100.0% flops 9 x 32 x 9 1735942275072 0.0% 0.0% 100.0% flops 22 x 32 x 9 2216407818240 0.0% 0.0% 100.0% flops 9 x 32 x 22 2216407818240 0.0% 0.0% 100.0% flops 22 x 32 x 22 2803661053952 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 12.884056E+12 0.0% 0.0% 100.0% flops max/rank 807.299199E+09 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 984178160 0.0% 0.0% 100.0% number of processed stacks 1438408 0.0% 0.0% 100.0% average stack size 0.0 0.0 684.2 marketing flops 15.646297E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 837.070848E+06 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 197280 MPI messages size (bytes): total size 339.125567E+09 min size 0.000000E+00 max size 13.107200E+06 average size 1.719006E+06 MPI breakdown and total messages size (bytes): size <= 128 1452 0 128 < size <= 8192 0 0 8192 < size <= 32768 132 4325376 32768 < size <= 131072 88656 11620319232 131072 < size <= 4194304 89424 117209825280 4194304 < size <= 16777216 17616 210291069504 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 14 12. MP_Allreduce 7346 33. MP_Alltoall 8043 263767. MP_ISend 32836 654203. MP_IRecv 32836 654587. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3521 65574. MP_Allreduce 9838 640. MP_Sync 100 MP_Alltoall 1496 8504061. MP_ISendRecv 6600 54848. MP_Wait 17226 MP_comm_split 48 MP_ISend 9240 278857. MP_IRecv 9240 278857. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.014 0.029 54.919 54.920 qs_mol_dyn_low 1 2.0 0.003 0.003 54.598 54.607 qs_forces 11 3.9 0.003 0.003 54.532 54.533 qs_energies 11 4.9 0.015 0.027 50.913 50.917 scf_env_do_scf 11 5.9 0.000 0.001 42.436 42.436 scf_env_do_scf_inner_loop 99 6.5 0.002 0.007 34.723 34.723 velocity_verlet 10 3.0 0.002 0.002 30.457 30.460 dbcsr_multiply_generic 2055 12.4 0.104 0.106 22.939 23.125 qs_scf_new_mos 99 7.5 0.001 0.001 20.671 20.725 qs_scf_loop_do_ot 99 8.5 0.001 0.001 20.670 20.724 ot_scf_mini 99 9.5 0.002 0.002 19.433 19.478 multiply_cannon 2055 13.4 0.247 0.256 17.475 18.628 multiply_cannon_loop 2055 14.4 0.323 0.335 16.128 16.394 rebuild_ks_matrix 110 8.3 0.000 0.000 11.446 11.491 qs_ks_build_kohn_sham_matrix 110 9.3 0.012 0.013 11.446 11.490 ot_mini 99 10.5 0.001 0.001 10.552 10.591 qs_ks_update_qs_env 110 7.6 0.001 0.001 10.192 10.232 multiply_cannon_multrec 8220 15.4 3.222 4.646 7.455 8.594 mp_waitall_1 106626 16.5 6.337 7.966 6.337 7.966 init_scf_loop 11 6.9 0.000 0.000 7.664 7.666 qs_ot_get_derivative 99 11.5 0.001 0.001 6.713 6.760 sum_up_and_integrate 110 10.3 0.079 0.081 6.173 6.187 integrate_v_rspace 110 11.3 0.003 0.003 6.094 6.107 prepare_preconditioner 11 7.9 0.000 0.000 6.049 6.053 make_preconditioner 11 8.9 0.000 0.000 6.049 6.053 make_full_inverse_cholesky 11 9.9 0.000 0.000 5.641 5.709 qs_rho_update_rho_low 110 7.6 0.001 0.001 5.561 5.573 calculate_rho_elec 110 8.6 0.115 0.116 5.561 5.572 init_scf_run 11 5.9 0.000 0.001 5.186 5.186 scf_env_initial_rho_setup 11 6.9 0.013 0.021 5.186 5.186 dbcsr_mm_accdrv_process 17442 15.9 2.902 3.974 4.103 5.029 qs_ot_get_p 110 10.4 0.001 0.001 4.935 4.989 multiply_cannon_metrocomm3 8220 15.4 0.017 0.017 3.257 4.843 make_m2s 4110 13.4 0.039 0.040 4.192 4.443 make_images 4110 14.4 0.638 0.692 4.062 4.311 ot_diis_step 99 11.5 0.012 0.012 3.816 3.816 apply_preconditioner_dbcsr 110 12.6 0.000 0.000 3.734 3.780 apply_single 110 13.6 0.000 0.000 3.733 3.779 qs_ot_p2m_diag 48 11.0 0.081 0.084 3.580 3.583 grid_integrate_task_list 110 12.3 3.373 3.488 3.373 3.488 cp_dbcsr_syevd 48 12.0 0.003 0.003 3.266 3.267 pw_transfer 1331 11.6 0.064 0.069 3.220 3.232 fft_wrap_pw1pw2 1111 12.6 0.008 0.008 3.114 3.129 multiply_cannon_sync_h2d 8220 15.4 2.884 3.070 2.884 3.070 cp_fm_cholesky_invert 11 10.9 2.906 2.910 2.906 2.910 make_images_data 4110 15.4 0.038 0.043 2.388 2.798 density_rs2pw 110 9.6 0.004 0.004 2.605 2.787 qs_energies_init_hamiltonians 11 5.9 0.045 0.089 2.729 2.730 cp_fm_diag_elpa 48 13.0 0.000 0.000 2.724 2.725 hybrid_alltoall_any 4261 16.3 0.199 0.862 2.346 2.710 cp_fm_redistribute_end 48 14.0 0.693 2.695 0.697 2.696 fft_wrap_pw1pw2_140 451 13.1 0.213 0.217 2.674 2.692 wfi_extrapolate 11 7.9 0.001 0.001 2.678 2.679 cp_fm_diag_elpa_base 48 14.0 1.820 2.502 1.991 2.660 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 2.611 2.613 calculate_dm_sparse 110 9.5 0.001 0.001 2.483 2.524 grid_collocate_task_list 110 9.6 2.320 2.524 2.320 2.524 calculate_first_density_matrix 1 7.0 0.000 0.000 2.409 2.420 fft3d_ps 1111 14.6 1.113 1.155 2.371 2.391 rs_pw_transfer 902 11.9 0.010 0.010 2.142 2.356 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 2.144 2.161 build_core_hamiltonian_matrix_ 11 4.9 0.000 0.001 1.775 2.004 potential_pw2rs 110 12.3 0.015 0.015 1.914 1.919 qs_ot_get_derivative_taylor 52 13.0 0.001 0.001 1.865 1.894 qs_ot_get_derivative_diag 47 12.0 0.001 0.001 1.817 1.836 mp_alltoall_d11v 2046 13.8 1.682 1.811 1.682 1.811 cp_fm_cholesky_decompose 22 10.9 1.668 1.685 1.668 1.685 dbcsr_complete_redistribute 325 12.2 0.626 0.686 1.563 1.648 mp_allgather_i34 2055 14.4 0.530 1.620 0.530 1.620 qs_env_update_s_mstruct 11 6.9 0.000 0.000 1.488 1.607 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 1.586 1.591 cp_dbcsr_plus_fm_fm_t_native 22 8.9 0.001 0.001 1.445 1.457 multiply_cannon_metrocomm1 8220 15.4 0.021 0.022 0.758 1.398 multiply_cannon_metrocomm4 6165 15.4 0.017 0.019 0.485 1.363 mp_irecv_dv 24056 15.7 0.460 1.320 0.460 1.320 qs_create_task_list 11 7.9 0.001 0.001 1.220 1.319 generate_qs_task_list 11 8.9 0.379 0.446 1.219 1.318 mp_waitany 9240 13.8 1.051 1.280 1.051 1.280 copy_dbcsr_to_fm 151 11.3 0.003 0.003 1.180 1.214 jit_kernel_multiply 7 15.6 0.895 1.190 0.895 1.190 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="208", plot="h2o_128_md", label="(8n/2r/6t)", y=54.920000, yerr=0.000000 PlotPoint: name="209", plot="h2o_128_md_mem", label="(8n/2r/6t)", y=788.454545, yerr=10.705679 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/14/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 184415158272 0.0% 0.0% 100.0% flops 9 x 9 x 32 269180485632 0.0% 0.0% 100.0% flops 9 x 22 x 32 349395425280 0.0% 0.0% 100.0% flops 22 x 9 x 32 350042406912 0.0% 0.0% 100.0% flops 22 x 22 x 32 453581815808 0.0% 0.0% 100.0% flops 32 x 32 x 9 465064427520 0.0% 0.0% 100.0% flops 32 x 32 x 22 568412078080 0.0% 0.0% 100.0% flops 9 x 32 x 32 572195340288 0.0% 0.0% 100.0% flops 22 x 32 x 32 699349860352 0.0% 0.0% 100.0% flops 9 x 32 x 9 1735942275072 0.0% 0.0% 100.0% flops 22 x 32 x 9 2216407818240 0.0% 0.0% 100.0% flops 9 x 32 x 22 2216407818240 0.0% 0.0% 100.0% flops 22 x 32 x 22 2803661053952 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 12.884056E+12 0.0% 0.0% 100.0% flops max/rank 1.612391E+12 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 984178160 0.0% 0.0% 100.0% number of processed stacks 1464624 0.0% 0.0% 100.0% average stack size 0.0 0.0 672.0 marketing flops 15.646297E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 1.319678E+09 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 82200 MPI messages size (bytes): total size 297.640985E+09 min size 0.000000E+00 max size 26.214400E+06 average size 3.620936E+06 MPI breakdown and total messages size (bytes): size <= 128 572 0 128 < size <= 8192 0 0 8192 < size <= 32768 44 1441792 32768 < size <= 131072 18560 2432696320 131072 < size <= 4194304 54216 84915781632 4194304 < size <= 16777216 0 0 16777216 < size 8808 210291069504 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3462 67104. MP_Allreduce 9672 819. MP_Sync 52 MP_Alltoall 1474 16505187. MP_ISendRecv 4620 360267. MP_Wait 7524 MP_ISend 2420 1187840. MP_IRecv 2420 1187840. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.015 0.035 86.090 86.091 qs_mol_dyn_low 1 2.0 0.003 0.003 85.791 85.799 qs_forces 11 3.9 0.002 0.003 85.718 85.719 qs_energies 11 4.9 0.007 0.007 81.678 81.681 scf_env_do_scf 11 5.9 0.001 0.001 71.902 71.902 velocity_verlet 10 3.0 0.017 0.018 55.395 55.401 scf_env_do_scf_inner_loop 99 6.5 0.002 0.007 42.811 42.813 init_scf_loop 11 6.9 0.000 0.000 29.019 29.021 dbcsr_multiply_generic 2055 12.4 0.120 0.121 28.621 28.760 prepare_preconditioner 11 7.9 0.000 0.000 27.058 27.071 make_preconditioner 11 8.9 0.000 0.000 27.058 27.071 make_full_inverse_cholesky 11 9.9 0.000 0.000 20.915 26.528 qs_scf_new_mos 99 7.5 0.001 0.001 26.413 26.499 qs_scf_loop_do_ot 99 8.5 0.001 0.001 26.412 26.498 ot_scf_mini 99 9.5 0.002 0.002 24.662 24.734 multiply_cannon 2055 13.4 0.347 0.366 21.521 22.294 multiply_cannon_loop 2055 14.4 0.341 0.344 19.678 19.993 cp_fm_upper_to_full 70 14.2 13.055 18.932 13.055 18.932 ot_mini 99 10.5 0.001 0.001 13.694 13.763 rebuild_ks_matrix 110 8.3 0.000 0.001 13.176 13.257 qs_ks_build_kohn_sham_matrix 110 9.3 0.013 0.013 13.175 13.256 qs_ks_update_qs_env 110 7.6 0.001 0.001 11.990 12.063 dbcsr_complete_redistribute 325 12.2 1.022 1.052 7.729 11.235 copy_fm_to_dbcsr 174 11.2 0.001 0.001 6.699 10.217 multiply_cannon_multrec 8220 15.4 4.367 4.555 9.547 9.650 transfer_fm_to_dbcsr 11 9.9 0.000 0.000 6.128 9.614 mp_alltoall_i22 605 13.7 5.767 9.315 5.767 9.315 qs_ot_get_derivative 99 11.5 0.001 0.001 9.002 9.078 mp_waitall_1 87304 16.6 7.809 8.890 7.809 8.890 sum_up_and_integrate 110 10.3 0.150 0.152 6.632 6.648 integrate_v_rspace 110 11.3 0.004 0.004 6.482 6.498 qs_rho_update_rho_low 110 7.6 0.001 0.001 6.435 6.472 calculate_rho_elec 110 8.6 0.227 0.227 6.434 6.472 qs_ot_get_p 110 10.4 0.001 0.001 5.782 5.861 make_m2s 4110 13.4 0.043 0.044 5.279 5.835 make_images 4110 14.4 0.879 0.932 5.092 5.646 init_scf_run 11 5.9 0.000 0.001 5.528 5.528 scf_env_initial_rho_setup 11 6.9 0.001 0.001 5.528 5.528 cp_fm_cholesky_invert 11 10.9 5.347 5.352 5.347 5.352 apply_preconditioner_dbcsr 110 12.6 0.000 0.000 4.775 5.302 apply_single 110 13.6 0.000 0.000 4.775 5.302 dbcsr_mm_accdrv_process 11614 15.7 3.304 3.657 5.037 5.274 multiply_cannon_metrocomm3 8220 15.4 0.018 0.018 4.795 5.099 ot_diis_step 99 11.5 0.015 0.016 4.670 4.670 qs_ot_p2m_diag 48 11.0 0.151 0.155 4.179 4.186 multiply_cannon_sync_h2d 8220 15.4 3.947 3.955 3.947 3.955 pw_transfer 1331 11.6 0.073 0.073 3.747 3.752 cp_dbcsr_syevd 48 12.0 0.003 0.003 3.734 3.734 grid_integrate_task_list 110 12.3 3.657 3.711 3.657 3.711 hybrid_alltoall_any 4261 16.3 0.256 0.554 2.888 3.655 qs_energies_init_hamiltonians 11 5.9 0.001 0.001 3.641 3.642 fft_wrap_pw1pw2 1111 12.6 0.009 0.009 3.631 3.637 make_images_data 4110 15.4 0.041 0.044 2.884 3.542 qs_ot_get_derivative_taylor 52 13.0 0.001 0.001 2.880 3.342 wfi_extrapolate 11 7.9 0.001 0.001 3.195 3.195 fft_wrap_pw1pw2_140 451 13.1 0.215 0.217 3.148 3.155 cp_fm_diag_elpa 48 13.0 0.000 0.000 3.125 3.126 cp_fm_diag_elpa_base 48 14.0 2.582 2.792 3.123 3.123 calculate_dm_sparse 110 9.5 0.001 0.001 3.105 3.123 density_rs2pw 110 9.6 0.004 0.004 2.929 2.947 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 2.844 2.847 fft3d_ps 1111 14.6 1.267 1.287 2.839 2.845 grid_collocate_task_list 110 9.6 2.626 2.648 2.626 2.648 qs_ot_get_derivative_diag 47 12.0 0.001 0.001 2.413 2.452 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 2.337 2.361 qs_env_update_s_mstruct 11 6.9 0.000 0.000 2.173 2.239 calculate_first_density_matrix 1 7.0 0.000 0.000 2.229 2.230 build_core_hamiltonian_matrix_ 11 4.9 0.001 0.001 2.101 2.194 rs_pw_transfer 902 11.9 0.010 0.011 2.113 2.171 mp_alltoall_d11v 2046 13.8 2.059 2.104 2.059 2.104 potential_pw2rs 110 12.3 0.021 0.021 2.068 2.073 cp_fm_cholesky_decompose 22 10.9 1.980 2.004 1.980 2.004 qs_create_task_list 11 7.9 0.001 0.001 1.897 1.941 generate_qs_task_list 11 8.9 0.738 0.790 1.896 1.941 copy_dbcsr_to_fm 151 11.3 0.003 0.003 1.781 1.836 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 1.790 1.798 jit_kernel_multiply 10 15.3 1.534 1.761 1.534 1.761 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="210", plot="h2o_128_md", label="(8n/1r/12t)", y=86.091000, yerr=0.000000 PlotPoint: name="211", plot="h2o_128_md_mem", label="(8n/1r/12t)", y=1195.909091, yerr=48.274696 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/15/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 9 x 32 1420242647040 0.0% 0.0% 100.0% flops 32 x 32 x 32 1943472701440 0.0% 0.0% 100.0% flops 22 x 9 x 32 1972057190400 0.0% 0.0% 100.0% flops 9 x 22 x 32 1977770336256 0.0% 0.0% 100.0% flops 22 x 22 x 32 2734287699968 0.0% 0.0% 100.0% flops 32 x 32 x 9 4416300122112 0.0% 0.0% 100.0% flops 32 x 32 x 22 5397700149248 0.0% 0.0% 100.0% flops 9 x 32 x 32 5443971710976 0.0% 0.0% 100.0% flops 22 x 32 x 32 6653743202304 0.0% 0.0% 100.0% flops 9 x 32 x 9 11528903135232 0.0% 0.0% 100.0% flops 22 x 32 x 9 15129160814592 0.0% 0.0% 100.0% flops 9 x 32 x 22 15129160814592 0.0% 0.0% 100.0% flops 22 x 32 x 22 19767995056128 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 93.514766E+12 0.0% 0.0% 100.0% flops max/rank 1.094965E+12 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 6755941440 0.0% 0.0% 100.0% number of processed stacks 11950464 0.0% 0.0% 100.0% average stack size 0.0 0.0 565.3 marketing flops 144.580175E+12 ------------------------------------------------------------------------------- # multiplications 2507 max memory usage/rank 629.133312E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 10348896 MPI messages size (bytes): total size 4.491514E+12 min size 0.000000E+00 max size 4.537280E+06 average size 434.009000E+03 MPI breakdown and total messages size (bytes): size <= 128 65736 0 128 < size <= 8192 1232 10092544 8192 < size <= 32768 3576680 95640223744 32768 < size <= 131072 1294784 74079797248 131072 < size <= 4194304 5148576 3175955383376 4194304 < size <= 16777216 261888 1145794321408 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 4085 56760. MP_Allreduce 11253 785. MP_Sync 170 MP_Alltoall 2226 1653942. MP_ISendRecv 48640 18752. MP_Wait 66796 MP_comm_split 83 MP_ISend 16020 108028. MP_IRecv 16020 108028. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.012 0.030 200.159 200.160 qs_mol_dyn_low 1 2.0 0.003 0.003 199.744 199.757 qs_forces 11 3.9 0.011 0.012 199.606 199.607 qs_energies 11 4.9 0.001 0.002 194.128 194.140 scf_env_do_scf 11 5.9 0.001 0.001 177.727 177.731 scf_env_do_scf_inner_loop 117 6.6 0.003 0.007 157.179 157.181 dbcsr_multiply_generic 2507 12.6 0.175 0.179 122.124 122.506 velocity_verlet 10 3.0 0.001 0.002 120.789 120.791 qs_scf_new_mos 117 7.6 0.001 0.001 118.578 118.731 qs_scf_loop_do_ot 117 8.6 0.001 0.001 118.577 118.730 ot_scf_mini 117 9.6 0.003 0.003 112.035 112.137 multiply_cannon 2507 13.6 0.240 0.249 100.040 101.508 multiply_cannon_loop 2507 14.6 2.026 2.125 97.877 98.941 ot_mini 117 10.6 0.001 0.001 64.190 64.311 multiply_cannon_multrec 60168 15.6 33.664 36.020 41.844 43.996 qs_ot_get_derivative 117 11.6 0.001 0.001 39.308 39.434 rebuild_ks_matrix 128 8.3 0.001 0.001 33.187 33.391 qs_ks_build_kohn_sham_matrix 128 9.3 0.015 0.017 33.187 33.391 multiply_cannon_sync_h2d 60168 15.6 28.136 30.494 28.136 30.494 qs_ks_update_qs_env 128 7.6 0.001 0.001 29.873 30.054 mp_waitall_1 291448 16.2 27.035 29.434 27.035 29.434 qs_ot_get_p 128 10.4 0.001 0.001 26.161 26.418 apply_preconditioner_dbcsr 128 12.6 0.000 0.001 24.336 25.038 apply_single 128 13.6 0.001 0.001 24.336 25.038 ot_diis_step 117 11.6 0.007 0.008 24.547 24.548 init_scf_loop 11 6.9 0.000 0.000 20.473 20.474 qs_ot_p2m_diag 83 11.4 0.078 0.091 19.583 19.643 qs_ot_get_derivative_diag 77 12.4 0.002 0.002 18.043 18.138 cp_dbcsr_syevd 83 12.4 0.004 0.005 17.369 17.371 multiply_cannon_metrocomm3 60168 15.6 0.115 0.119 14.868 16.725 prepare_preconditioner 11 7.9 0.000 0.000 15.881 15.918 make_preconditioner 11 8.9 0.000 0.000 15.881 15.918 make_full_inverse_cholesky 11 9.9 0.000 0.000 15.124 15.281 cp_fm_diag_elpa 83 13.4 0.000 0.001 14.495 14.501 cp_fm_redistribute_end 83 14.4 11.440 14.414 11.451 14.416 cp_fm_diag_elpa_base 83 14.4 2.923 14.120 2.955 14.234 make_m2s 5014 13.6 0.104 0.110 13.789 14.093 make_images 5014 14.6 0.402 0.422 13.608 13.926 sum_up_and_integrate 128 10.3 0.090 0.109 13.889 13.904 integrate_v_rspace 128 11.3 0.003 0.004 13.799 13.816 qs_rho_update_rho_low 128 7.7 0.001 0.001 12.560 12.634 calculate_rho_elec 128 8.7 0.045 0.064 12.560 12.634 init_scf_run 11 5.9 0.000 0.001 12.258 12.258 scf_env_initial_rho_setup 11 6.9 0.001 0.001 12.257 12.258 wfi_extrapolate 11 7.9 0.001 0.001 9.067 9.067 cp_fm_cholesky_invert 11 10.9 9.060 9.067 9.060 9.067 calculate_dm_sparse 128 9.5 0.001 0.001 8.347 8.440 dbcsr_mm_accdrv_process 124484 16.2 3.063 3.289 7.721 8.281 mp_sum_l 7870 13.0 7.021 8.010 7.021 8.010 qs_ot_get_derivative_taylor 40 13.0 0.001 0.001 7.687 7.794 qs_ot_get_orbitals 117 10.6 0.001 0.001 7.697 7.752 make_images_data 5014 15.6 0.070 0.076 6.804 7.672 grid_integrate_task_list 128 12.3 7.116 7.483 7.116 7.483 hybrid_alltoall_any 5200 16.5 0.290 2.252 5.957 7.293 pw_transfer 1547 11.6 0.074 0.102 6.775 7.067 density_rs2pw 128 9.7 0.006 0.007 6.390 6.892 multiply_cannon_metrocomm1 60168 15.6 0.087 0.092 5.277 6.873 fft_wrap_pw1pw2 1291 12.7 0.010 0.013 6.572 6.841 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.003 6.619 6.628 rs_pw_transfer 1046 11.9 0.017 0.019 5.582 6.160 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 5.768 5.871 fft_wrap_pw1pw2_140 523 13.2 0.445 0.509 5.667 5.838 fft3d_ps 1291 14.7 2.091 2.570 5.366 5.578 mp_alltoall_d11v 2415 14.1 4.113 5.158 4.113 5.158 grid_collocate_task_list 128 9.7 4.727 5.051 4.727 5.051 cp_fm_cholesky_decompose 22 10.9 4.665 4.680 4.665 4.680 potential_pw2rs 128 12.3 0.009 0.010 4.406 4.432 mp_sum_d 4464 12.1 3.535 4.173 3.535 4.173 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="400", plot="h2o_256_md", label="(8n/12r/1t)", y=200.160000, yerr=0.000000 PlotPoint: name="401", plot="h2o_256_md_mem", label="(8n/12r/1t)", y=593.272727, yerr=7.249964 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/16/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 9 x 32 1420239992832 0.0% 0.0% 100.0% flops 32 x 32 x 32 1943472701440 0.0% 0.0% 100.0% flops 22 x 9 x 32 1972057190400 0.0% 0.0% 100.0% flops 9 x 22 x 32 1977770336256 0.0% 0.0% 100.0% flops 22 x 22 x 32 2734287699968 0.0% 0.0% 100.0% flops 32 x 32 x 9 4416300122112 0.0% 0.0% 100.0% flops 32 x 32 x 22 5397700149248 0.0% 0.0% 100.0% flops 9 x 32 x 32 5443971710976 0.0% 0.0% 100.0% flops 22 x 32 x 32 6653743202304 0.0% 0.0% 100.0% flops 9 x 32 x 9 11528891191296 0.0% 0.0% 100.0% flops 22 x 32 x 9 15129160814592 0.0% 0.0% 100.0% flops 9 x 32 x 22 15129160814592 0.0% 0.0% 100.0% flops 22 x 32 x 22 19767995056128 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 93.514751E+12 0.0% 0.0% 100.0% flops max/rank 2.183246E+12 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 6755938624 0.0% 0.0% 100.0% number of processed stacks 5975232 0.0% 0.0% 100.0% average stack size 0.0 0.0 1130.7 marketing flops 144.580175E+12 ------------------------------------------------------------------------------- # multiplications 2507 max memory usage/rank 832.040960E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 2406720 MPI messages size (bytes): total size 4.100942E+12 min size 0.000000E+00 max size 17.653760E+06 average size 1.703955E+06 MPI breakdown and total messages size (bytes): size <= 128 14916 0 128 < size <= 8192 0 0 8192 < size <= 32768 70860 2317615104 32768 < size <= 131072 722992 55511613440 131072 < size <= 4194304 1375664 1398181724160 4194304 < size <= 16777216 154704 1463834332048 16777216 < size 67584 1181116006400 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 4075 57341. MP_Allreduce 11227 947. MP_Sync 170 MP_Alltoall 1969 5001687. MP_ISendRecv 24064 47072. MP_Wait 37948 MP_comm_split 83 MP_ISend 11748 212467. MP_IRecv 11748 212467. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.022 0.052 188.927 188.927 qs_mol_dyn_low 1 2.0 0.003 0.003 188.351 188.364 qs_forces 11 3.9 0.010 0.019 188.267 188.268 qs_energies 11 4.9 0.002 0.005 181.564 181.583 scf_env_do_scf 11 5.9 0.001 0.002 165.277 165.288 scf_env_do_scf_inner_loop 117 6.6 0.003 0.009 132.457 132.459 velocity_verlet 10 3.0 0.001 0.002 118.679 118.681 dbcsr_multiply_generic 2507 12.6 0.185 0.191 96.873 98.132 qs_scf_new_mos 117 7.6 0.001 0.001 94.114 94.648 qs_scf_loop_do_ot 117 8.6 0.001 0.002 94.113 94.647 ot_scf_mini 117 9.6 0.004 0.005 89.295 89.861 multiply_cannon 2507 13.6 0.477 0.531 76.900 80.701 multiply_cannon_loop 2507 14.6 1.259 1.292 73.916 76.467 ot_mini 117 10.6 0.001 0.001 49.676 50.276 mp_waitall_1 226760 16.4 24.698 38.104 24.698 38.104 multiply_cannon_multrec 30084 15.6 21.991 26.374 31.601 36.206 rebuild_ks_matrix 128 8.3 0.001 0.001 32.524 33.124 qs_ks_build_kohn_sham_matrix 128 9.3 0.017 0.021 32.523 33.123 init_scf_loop 11 6.9 0.001 0.004 32.732 32.733 qs_ks_update_qs_env 128 7.6 0.001 0.001 29.313 29.855 multiply_cannon_metrocomm3 30084 15.6 0.096 0.103 15.642 28.902 prepare_preconditioner 11 7.9 0.000 0.000 28.388 28.450 make_preconditioner 11 8.9 0.000 0.002 28.388 28.450 qs_ot_get_derivative 117 11.6 0.001 0.001 27.669 28.241 make_full_inverse_cholesky 11 9.9 0.000 0.000 27.073 27.618 apply_preconditioner_dbcsr 128 12.6 0.000 0.001 22.032 23.152 apply_single 128 13.6 0.001 0.001 22.032 23.151 qs_ot_get_p 128 10.4 0.001 0.001 21.552 22.441 ot_diis_step 117 11.6 0.014 0.015 21.832 21.834 multiply_cannon_sync_h2d 30084 15.6 19.241 21.301 19.241 21.301 qs_ot_p2m_diag 83 11.4 0.187 0.215 16.729 16.768 cp_fm_cholesky_invert 11 10.9 16.564 16.577 16.564 16.577 cp_dbcsr_syevd 83 12.4 0.005 0.006 15.585 15.586 make_m2s 5014 13.6 0.090 0.095 14.201 15.482 make_images 5014 14.6 1.160 1.337 13.991 15.272 sum_up_and_integrate 128 10.3 0.116 0.132 14.306 14.336 integrate_v_rspace 128 11.3 0.003 0.003 14.190 14.225 qs_rho_update_rho_low 128 7.7 0.001 0.001 12.842 12.871 calculate_rho_elec 128 8.7 0.088 0.105 12.842 12.871 cp_fm_diag_elpa 83 13.4 0.000 0.001 12.494 12.506 cp_fm_redistribute_end 83 14.4 7.298 12.438 7.308 12.439 cp_fm_diag_elpa_base 83 14.4 4.890 11.908 5.108 12.340 init_scf_run 11 5.9 0.000 0.001 11.370 11.372 scf_env_initial_rho_setup 11 6.9 0.001 0.001 11.370 11.372 qs_ot_get_derivative_diag 77 12.4 0.002 0.002 10.903 11.308 multiply_cannon_metrocomm4 27577 15.6 0.096 0.113 3.749 10.594 make_images_data 5014 15.6 0.066 0.073 8.648 10.226 mp_irecv_dv 69486 16.3 3.554 10.204 3.554 10.204 dbcsr_mm_accdrv_process 62242 16.2 4.615 5.171 9.069 9.669 hybrid_alltoall_any 5200 16.5 0.343 1.520 7.343 9.385 wfi_extrapolate 11 7.9 0.001 0.001 8.288 8.288 pw_transfer 1547 11.6 0.084 0.103 7.714 7.787 fft_wrap_pw1pw2 1291 12.7 0.010 0.011 7.491 7.561 grid_integrate_task_list 128 12.3 7.174 7.546 7.174 7.546 density_rs2pw 128 9.7 0.006 0.007 6.630 7.111 cp_fm_cholesky_decompose 22 10.9 6.944 7.025 6.944 7.025 qs_ot_get_derivative_taylor 40 13.0 0.001 0.001 6.161 6.868 calculate_dm_sparse 128 9.5 0.001 0.001 6.491 6.646 fft_wrap_pw1pw2_140 523 13.2 0.469 0.515 6.556 6.624 mp_sum_l 7870 13.0 3.997 6.119 3.997 6.119 rs_pw_transfer 1046 11.9 0.014 0.017 5.583 6.118 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 6.072 6.082 fft3d_ps 1291 14.7 2.779 2.927 5.861 5.906 qs_ot_get_orbitals 117 10.6 0.001 0.001 5.345 5.416 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 5.198 5.361 grid_collocate_task_list 128 9.7 4.911 5.289 4.911 5.289 mp_alltoall_d11v 2415 14.1 4.137 4.857 4.137 4.857 potential_pw2rs 128 12.3 0.015 0.017 4.760 4.777 mp_allgather_i34 2507 14.6 1.558 4.490 1.558 4.490 dbcsr_complete_redistribute 395 12.7 0.780 0.874 3.148 4.002 mp_sum_d 4459 12.1 2.662 3.972 2.662 3.972 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="402", plot="h2o_256_md", label="(8n/6r/2t)", y=188.927000, yerr=0.000000 PlotPoint: name="403", plot="h2o_256_md_mem", label="(8n/6r/2t)", y=792.272727, yerr=2.987578 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/17/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 9 x 32 1420242647040 0.0% 0.0% 100.0% flops 32 x 32 x 32 1943472701440 0.0% 0.0% 100.0% flops 22 x 9 x 32 1972057190400 0.0% 0.0% 100.0% flops 9 x 22 x 32 1977770336256 0.0% 0.0% 100.0% flops 22 x 22 x 32 2734287699968 0.0% 0.0% 100.0% flops 32 x 32 x 9 4416300122112 0.0% 0.0% 100.0% flops 32 x 32 x 22 5397700149248 0.0% 0.0% 100.0% flops 9 x 32 x 32 5443971710976 0.0% 0.0% 100.0% flops 22 x 32 x 32 6653743202304 0.0% 0.0% 100.0% flops 9 x 32 x 9 11528903135232 0.0% 0.0% 100.0% flops 22 x 32 x 9 15129160814592 0.0% 0.0% 100.0% flops 9 x 32 x 22 15129160814592 0.0% 0.0% 100.0% flops 22 x 32 x 22 19767995056128 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 93.514766E+12 0.0% 0.0% 100.0% flops max/rank 2.928533E+12 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 6755941440 0.0% 0.0% 100.0% number of processed stacks 3984192 0.0% 0.0% 100.0% average stack size 0.0 0.0 1695.7 marketing flops 144.579337E+12 ------------------------------------------------------------------------------- # multiplications 2507 max memory usage/rank 932.749312E+06 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 1042912 MPI messages size (bytes): total size 2.716210E+12 min size 0.000000E+00 max size 26.214400E+06 average size 2.604448E+06 MPI breakdown and total messages size (bytes): size <= 128 6424 0 128 < size <= 8192 0 0 8192 < size <= 32768 264 8650752 32768 < size <= 131072 281856 36943429632 131072 < size <= 4194304 660064 996105256960 4194304 < size <= 16777216 65632 931531265168 16777216 < size 28672 751619276800 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 4103 56951. MP_Allreduce 11296 984. MP_Sync 170 MP_Alltoall 1712 9388896. MP_ISendRecv 15872 75008. MP_Wait 29756 MP_comm_split 83 MP_ISend 11748 275205. MP_IRecv 11748 275205. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.022 0.040 175.995 175.996 qs_mol_dyn_low 1 2.0 0.003 0.003 175.450 175.463 qs_forces 11 3.9 0.003 0.004 175.350 175.355 qs_energies 11 4.9 0.018 0.069 168.800 168.809 scf_env_do_scf 11 5.9 0.002 0.012 153.327 153.335 scf_env_do_scf_inner_loop 117 6.6 0.003 0.008 118.399 118.401 velocity_verlet 10 3.0 0.001 0.002 112.591 112.593 dbcsr_multiply_generic 2507 12.6 0.192 0.195 81.537 82.517 qs_scf_new_mos 117 7.6 0.001 0.001 81.862 82.203 qs_scf_loop_do_ot 117 8.6 0.001 0.002 81.861 82.202 ot_scf_mini 117 9.6 0.004 0.005 77.714 78.087 multiply_cannon 2507 13.6 0.498 0.520 61.407 65.899 multiply_cannon_loop 2507 14.6 0.859 0.884 58.544 60.870 ot_mini 117 10.6 0.001 0.001 42.609 42.998 mp_waitall_1 178456 16.5 25.821 35.472 25.821 35.472 init_scf_loop 11 6.9 0.001 0.004 34.822 34.823 prepare_preconditioner 11 7.9 0.000 0.000 30.825 30.871 make_preconditioner 11 8.9 0.000 0.002 30.825 30.871 rebuild_ks_matrix 128 8.3 0.001 0.001 30.065 30.514 qs_ks_build_kohn_sham_matrix 128 9.3 0.016 0.019 30.064 30.513 make_full_inverse_cholesky 11 9.9 0.000 0.000 28.471 29.847 qs_ks_update_qs_env 128 7.6 0.001 0.001 27.145 27.547 multiply_cannon_multrec 20056 15.6 13.354 16.728 21.974 25.438 multiply_cannon_metrocomm3 20056 15.6 0.059 0.062 15.450 25.038 qs_ot_get_derivative 117 11.6 0.001 0.002 22.733 23.116 apply_preconditioner_dbcsr 128 12.6 0.000 0.000 19.989 21.010 apply_single 128 13.6 0.001 0.001 19.989 21.009 qs_ot_get_p 128 10.4 0.001 0.001 20.524 21.003 ot_diis_step 117 11.6 0.018 0.018 19.766 19.766 qs_ot_p2m_diag 83 11.4 0.265 0.272 16.285 16.293 make_m2s 5014 13.6 0.081 0.086 15.131 15.894 multiply_cannon_sync_h2d 20056 15.6 14.223 15.872 14.223 15.872 make_images 5014 14.6 1.186 1.284 14.897 15.660 cp_dbcsr_syevd 83 12.4 0.005 0.005 15.275 15.276 cp_fm_cholesky_invert 11 10.9 14.444 14.453 14.444 14.453 sum_up_and_integrate 128 10.3 0.134 0.145 14.197 14.224 integrate_v_rspace 128 11.3 0.003 0.004 14.063 14.091 qs_rho_update_rho_low 128 7.7 0.001 0.001 13.018 13.058 calculate_rho_elec 128 8.7 0.132 0.146 13.018 13.058 cp_fm_diag_elpa 83 13.4 0.000 0.001 12.119 12.120 cp_fm_redistribute_end 83 14.4 4.607 12.057 4.621 12.060 cp_fm_diag_elpa_base 83 14.4 7.012 11.398 7.418 11.951 make_images_data 5014 15.6 0.061 0.068 9.476 10.646 init_scf_run 11 5.9 0.000 0.001 10.366 10.366 scf_env_initial_rho_setup 11 6.9 0.002 0.005 10.366 10.366 hybrid_alltoall_any 5200 16.5 0.431 1.986 8.174 9.670 multiply_cannon_metrocomm4 17549 15.6 0.063 0.074 3.524 9.348 qs_ot_get_derivative_diag 77 12.4 0.002 0.002 8.934 9.209 mp_irecv_dv 50230 16.2 3.399 9.091 3.399 9.091 dbcsr_mm_accdrv_process 41502 16.2 4.469 5.204 8.077 8.258 pw_transfer 1547 11.6 0.084 0.103 7.773 7.881 grid_integrate_task_list 128 12.3 7.329 7.846 7.329 7.846 fft_wrap_pw1pw2 1291 12.7 0.010 0.011 7.550 7.665 cp_fm_upper_to_full 105 14.5 5.673 7.366 5.673 7.366 wfi_extrapolate 11 7.9 0.001 0.001 7.344 7.344 cp_fm_cholesky_decompose 22 10.9 7.299 7.334 7.299 7.334 density_rs2pw 128 9.7 0.006 0.006 6.465 6.830 fft_wrap_pw1pw2_140 523 13.2 0.478 0.532 6.622 6.750 dbcsr_complete_redistribute 395 12.7 1.176 1.219 4.630 6.360 fft3d_ps 1291 14.7 2.691 2.906 5.819 5.881 calculate_dm_sparse 128 9.5 0.001 0.001 5.741 5.833 grid_collocate_task_list 128 9.7 5.081 5.496 5.081 5.496 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 5.476 5.480 rs_pw_transfer 1046 11.9 0.014 0.014 5.080 5.413 qs_ot_get_derivative_taylor 40 13.0 0.001 0.001 4.562 5.252 copy_fm_to_dbcsr 209 11.7 0.002 0.002 3.436 5.164 mp_alltoall_d11v 2415 14.1 4.292 4.724 4.292 4.724 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 4.543 4.670 mp_allgather_i34 2507 14.6 1.391 4.598 1.391 4.598 potential_pw2rs 128 12.3 0.020 0.022 4.555 4.573 mp_sum_l 7870 13.0 3.174 4.521 3.174 4.521 qs_ot_get_orbitals 117 10.6 0.001 0.001 4.054 4.091 transfer_fm_to_dbcsr 11 9.9 0.000 0.000 2.334 4.041 mp_alltoall_i22 716 14.1 1.949 3.799 1.949 3.799 qs_energies_init_hamiltonians 11 5.9 0.013 0.026 3.747 3.748 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="404", plot="h2o_256_md", label="(8n/4r/3t)", y=175.996000, yerr=0.000000 PlotPoint: name="405", plot="h2o_256_md_mem", label="(8n/4r/3t)", y=885.363636, yerr=9.383474 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/18/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 9 x 32 1420242647040 0.0% 0.0% 100.0% flops 32 x 32 x 32 1943472701440 0.0% 0.0% 100.0% flops 22 x 9 x 32 1972057190400 0.0% 0.0% 100.0% flops 9 x 22 x 32 1977770336256 0.0% 0.0% 100.0% flops 22 x 22 x 32 2734287699968 0.0% 0.0% 100.0% flops 32 x 32 x 9 4416300122112 0.0% 0.0% 100.0% flops 32 x 32 x 22 5397700149248 0.0% 0.0% 100.0% flops 9 x 32 x 32 5443971710976 0.0% 0.0% 100.0% flops 22 x 32 x 32 6653743202304 0.0% 0.0% 100.0% flops 9 x 32 x 9 11528903135232 0.0% 0.0% 100.0% flops 22 x 32 x 9 15129160814592 0.0% 0.0% 100.0% flops 9 x 32 x 22 15129160814592 0.0% 0.0% 100.0% flops 22 x 32 x 22 19767995056128 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 93.514766E+12 0.0% 0.0% 100.0% flops max/rank 4.353791E+12 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 6755941440 0.0% 0.0% 100.0% number of processed stacks 5977344 0.0% 0.0% 100.0% average stack size 0.0 0.0 1130.3 marketing flops 144.580175E+12 ------------------------------------------------------------------------------- # multiplications 2507 max memory usage/rank 1.141502E+09 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 1143192 MPI messages size (bytes): total size 2.023815E+12 min size 0.000000E+00 max size 17.653760E+06 average size 1.770320E+06 MPI breakdown and total messages size (bytes): size <= 128 6996 0 128 < size <= 8192 0 0 8192 < size <= 32768 396 8650752 32768 < size <= 131072 319024 36042702848 131072 < size <= 4194304 715736 785529176064 4194304 < size <= 16777216 70320 665379475120 16777216 < size 30720 536870912000 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 4085 57194. MP_Allreduce 11251 1067. MP_Sync 170 MP_Alltoall 1712 12503107. MP_ISendRecv 11776 75008. MP_Wait 28330 MP_comm_split 83 MP_ISend 14952 244818. MP_IRecv 14952 244818. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.031 0.073 187.754 187.756 qs_mol_dyn_low 1 2.0 0.003 0.004 187.320 187.333 qs_forces 11 3.9 0.010 0.060 187.169 187.208 qs_energies 11 4.9 0.003 0.006 180.088 180.100 scf_env_do_scf 11 5.9 0.001 0.001 163.509 163.523 velocity_verlet 10 3.0 0.002 0.003 124.340 124.343 scf_env_do_scf_inner_loop 117 6.6 0.003 0.008 116.460 116.462 qs_scf_new_mos 117 7.6 0.001 0.001 81.005 81.339 qs_scf_loop_do_ot 117 8.6 0.001 0.001 81.004 81.338 dbcsr_multiply_generic 2507 12.6 0.192 0.197 79.315 80.011 ot_scf_mini 117 9.6 0.004 0.005 76.500 76.792 multiply_cannon 2507 13.6 0.560 0.584 54.653 57.586 multiply_cannon_loop 2507 14.6 1.180 1.222 50.921 52.679 init_scf_loop 11 6.9 0.001 0.004 46.926 46.930 prepare_preconditioner 11 7.9 0.000 0.000 42.688 42.718 make_preconditioner 11 8.9 0.001 0.003 42.688 42.718 ot_mini 117 10.6 0.001 0.002 42.393 42.688 make_full_inverse_cholesky 11 9.9 0.000 0.000 36.117 41.329 multiply_cannon_multrec 30084 15.6 14.071 19.166 25.967 30.559 rebuild_ks_matrix 128 8.3 0.001 0.001 28.997 29.282 qs_ks_build_kohn_sham_matrix 128 9.3 0.017 0.020 28.996 29.281 mp_waitall_1 153770 16.5 17.590 27.740 17.590 27.740 qs_ks_update_qs_env 128 7.6 0.001 0.001 26.184 26.440 qs_ot_get_derivative 117 11.6 0.001 0.002 22.638 22.940 make_m2s 5014 13.6 0.095 0.099 20.303 21.382 make_images 5014 14.6 1.966 2.262 20.003 21.082 qs_ot_get_p 128 10.4 0.002 0.012 20.434 20.737 ot_diis_step 117 11.6 0.024 0.070 19.626 19.628 apply_preconditioner_dbcsr 128 12.6 0.000 0.001 19.120 19.619 apply_single 128 13.6 0.001 0.001 19.119 19.618 cp_fm_upper_to_full 105 14.7 11.311 16.772 11.311 16.772 qs_ot_p2m_diag 83 11.4 0.342 0.390 16.235 16.287 cp_fm_cholesky_invert 11 10.9 16.153 16.162 16.153 16.162 cp_dbcsr_syevd 83 12.4 0.005 0.005 14.951 14.952 multiply_cannon_metrocomm3 30084 15.6 0.047 0.049 6.512 14.878 sum_up_and_integrate 128 10.3 0.141 0.155 14.083 14.109 integrate_v_rspace 128 11.3 0.004 0.005 13.942 13.971 dbcsr_complete_redistribute 395 12.7 1.516 1.621 9.210 13.170 qs_rho_update_rho_low 128 7.7 0.001 0.001 13.086 13.121 calculate_rho_elec 128 8.7 0.175 0.190 13.085 13.120 make_images_data 5014 15.6 0.064 0.068 10.945 12.856 multiply_cannon_sync_h2d 30084 15.6 11.764 12.420 11.764 12.420 dbcsr_mm_accdrv_process 62264 16.2 7.385 8.263 11.474 11.980 cp_fm_diag_elpa 83 13.4 0.000 0.001 11.828 11.830 copy_fm_to_dbcsr 209 11.7 0.002 0.002 7.817 11.762 cp_fm_redistribute_end 83 14.4 2.034 11.747 2.051 11.752 hybrid_alltoall_any 5200 16.5 0.527 2.210 9.845 11.751 cp_fm_diag_elpa_base 83 14.4 9.084 11.136 9.668 11.616 init_scf_run 11 5.9 0.000 0.001 10.655 10.656 scf_env_initial_rho_setup 11 6.9 0.001 0.003 10.655 10.656 transfer_fm_to_dbcsr 11 9.9 0.000 0.000 6.548 10.325 mp_alltoall_i22 716 14.1 5.775 9.619 5.775 9.619 qs_ot_get_derivative_diag 77 12.4 0.002 0.002 9.349 9.566 pw_transfer 1547 11.6 0.084 0.099 7.872 7.950 grid_integrate_task_list 128 12.3 7.541 7.925 7.541 7.925 fft_wrap_pw1pw2 1291 12.7 0.010 0.011 7.647 7.732 cp_fm_cholesky_decompose 22 10.9 7.540 7.672 7.540 7.672 wfi_extrapolate 11 7.9 0.001 0.001 7.587 7.587 multiply_cannon_metrocomm4 25070 15.6 0.075 0.084 2.780 6.983 fft_wrap_pw1pw2_140 523 13.2 0.482 0.493 6.759 6.863 mp_irecv_dv 76098 16.2 2.635 6.720 2.635 6.720 density_rs2pw 128 9.7 0.006 0.006 6.237 6.571 calculate_dm_sparse 128 9.5 0.001 0.001 6.167 6.244 fft3d_ps 1291 14.7 2.793 2.865 5.876 5.951 grid_collocate_task_list 128 9.7 5.211 5.570 5.211 5.570 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 5.453 5.530 mp_alltoall_d11v 2415 14.1 4.782 5.315 4.782 5.315 rs_pw_transfer 1046 11.9 0.013 0.014 4.615 4.971 qs_energies_init_hamiltonians 11 5.9 0.001 0.002 4.447 4.449 qs_ot_get_derivative_taylor 40 13.0 0.001 0.001 4.359 4.429 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 4.377 4.417 potential_pw2rs 128 12.3 0.023 0.023 4.296 4.314 qs_ot_get_orbitals 117 10.6 0.001 0.001 4.232 4.303 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="406", plot="h2o_256_md", label="(8n/3r/4t)", y=187.756000, yerr=0.000000 PlotPoint: name="407", plot="h2o_256_md_mem", label="(8n/3r/4t)", y=1075.727273, yerr=20.298190 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/19/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 9 x 32 1430457200640 0.0% 0.0% 100.0% flops 32 x 32 x 32 1962800054272 0.0% 0.0% 100.0% flops 22 x 9 x 32 1986252263424 0.0% 0.0% 100.0% flops 9 x 22 x 32 1992001093632 0.0% 0.0% 100.0% flops 22 x 22 x 32 2753958699008 0.0% 0.0% 100.0% flops 32 x 32 x 9 4454954827776 0.0% 0.0% 100.0% flops 32 x 32 x 22 5444944789504 0.0% 0.0% 100.0% flops 9 x 32 x 32 5492290093056 0.0% 0.0% 100.0% flops 22 x 32 x 32 6712799002624 0.0% 0.0% 100.0% flops 9 x 32 x 9 11613077360640 0.0% 0.0% 100.0% flops 22 x 32 x 9 15239162695680 0.0% 0.0% 100.0% flops 9 x 32 x 22 15239162695680 0.0% 0.0% 100.0% flops 22 x 32 x 22 19911132921856 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 94.232994E+12 0.0% 0.0% 100.0% flops max/rank 5.910120E+12 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 6806382528 0.0% 0.0% 100.0% number of processed stacks 1976928 0.0% 0.0% 100.0% average stack size 0.0 0.0 3442.9 marketing flops 145.650931E+12 ------------------------------------------------------------------------------- # multiplications 2529 max memory usage/rank 1.495142E+09 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 242784 MPI messages size (bytes): total size 1.341806E+12 min size 0.000000E+00 max size 52.428800E+06 average size 5.526746E+06 MPI breakdown and total messages size (bytes): size <= 128 1452 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 132 8650752 131072 < size <= 4194304 115008 60297314304 4194304 < size <= 16777216 105840 554906419200 16777216 < size 20352 726592214928 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 14 12. MP_Allreduce 9010 51. MP_Alltoall 9724 794507. MP_ISend 40420 2100460. MP_IRecv 40420 2099564. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 4043 57630. MP_Allreduce 11104 1171. MP_Sync 88 MP_Alltoall 1724 18848081. MP_ISendRecv 7740 122880. MP_Wait 20114 MP_ISend 10760 423501. MP_IRecv 10760 423501. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.024 0.040 172.406 172.406 qs_mol_dyn_low 1 2.0 0.003 0.003 171.943 171.955 qs_forces 11 3.9 0.003 0.003 171.692 171.696 qs_energies 11 4.9 0.002 0.002 164.394 164.404 scf_env_do_scf 11 5.9 0.001 0.001 147.158 147.174 velocity_verlet 10 3.0 0.008 0.011 113.407 113.420 scf_env_do_scf_inner_loop 118 6.6 0.003 0.008 111.438 111.439 dbcsr_multiply_generic 2529 12.6 0.183 0.188 75.478 76.050 qs_scf_new_mos 118 7.6 0.001 0.001 75.715 75.819 qs_scf_loop_do_ot 118 8.6 0.001 0.001 75.714 75.818 ot_scf_mini 118 9.6 0.004 0.004 71.261 71.357 multiply_cannon 2529 13.6 0.592 0.624 55.534 59.923 multiply_cannon_loop 2529 14.6 0.453 0.465 50.636 51.683 ot_mini 118 10.6 0.001 0.001 40.245 40.345 init_scf_loop 11 6.9 0.000 0.000 35.567 35.568 mp_waitall_1 130746 16.6 26.622 34.228 26.622 34.228 prepare_preconditioner 11 7.9 0.000 0.000 31.714 31.743 make_preconditioner 11 8.9 0.000 0.000 31.713 31.743 make_full_inverse_cholesky 11 9.9 0.000 0.000 29.589 29.845 rebuild_ks_matrix 129 8.3 0.001 0.001 28.676 28.760 qs_ks_build_kohn_sham_matrix 129 9.3 0.017 0.017 28.675 28.760 qs_ks_update_qs_env 129 7.6 0.001 0.001 26.184 26.257 multiply_cannon_multrec 10116 15.6 10.486 16.207 18.093 22.149 ot_diis_step 118 11.6 0.020 0.021 20.177 20.178 apply_preconditioner_dbcsr 129 12.6 0.000 0.000 19.721 20.086 apply_single 129 13.6 0.001 0.001 19.721 20.086 qs_ot_get_derivative 118 11.6 0.002 0.002 19.994 20.084 multiply_cannon_metrocomm3 10116 15.6 0.022 0.024 12.868 19.968 make_m2s 5058 13.6 0.067 0.071 16.295 18.740 make_images 5058 14.6 2.340 2.936 15.986 18.428 cp_fm_cholesky_invert 11 10.9 18.207 18.214 18.207 18.214 qs_ot_get_p 129 10.4 0.001 0.001 17.827 17.978 sum_up_and_integrate 129 10.3 0.182 0.193 14.071 14.118 qs_ot_p2m_diag 84 11.4 0.501 0.507 14.062 14.079 integrate_v_rspace 129 11.3 0.004 0.004 13.889 13.947 qs_rho_update_rho_low 129 7.7 0.001 0.001 13.146 13.183 calculate_rho_elec 129 8.7 0.260 0.271 13.146 13.182 cp_dbcsr_syevd 84 12.4 0.005 0.005 12.921 12.922 make_images_data 5058 15.6 0.053 0.060 9.897 12.728 hybrid_alltoall_any 5245 16.5 0.785 3.577 9.721 12.296 multiply_cannon_sync_h2d 10116 15.6 11.628 12.229 11.628 12.229 init_scf_run 11 5.9 0.000 0.001 10.461 10.461 scf_env_initial_rho_setup 11 6.9 0.001 0.001 10.461 10.461 cp_fm_diag_elpa 84 13.4 0.000 0.000 9.871 9.879 cp_fm_diag_elpa_base 84 14.4 9.622 9.700 9.857 9.865 grid_integrate_task_list 129 12.3 7.819 8.260 7.819 8.260 qs_ot_get_derivative_diag 78 12.4 0.002 0.003 8.020 8.089 cp_fm_cholesky_decompose 22 10.9 7.875 7.990 7.875 7.990 dbcsr_mm_accdrv_process 20934 16.1 2.788 3.639 7.236 7.962 pw_transfer 1559 11.6 0.083 0.091 7.739 7.764 fft_wrap_pw1pw2 1301 12.7 0.010 0.010 7.517 7.542 mp_allgather_i34 2529 14.6 3.089 7.385 3.089 7.385 multiply_cannon_metrocomm1 10116 15.6 0.029 0.030 4.357 7.363 wfi_extrapolate 11 7.9 0.001 0.001 7.350 7.351 fft_wrap_pw1pw2_140 527 13.2 0.506 0.527 6.628 6.654 calculate_dm_sparse 129 9.5 0.001 0.001 6.215 6.291 density_rs2pw 129 9.7 0.005 0.006 5.922 6.107 grid_collocate_task_list 129 9.7 5.530 5.797 5.530 5.797 fft3d_ps 1301 14.7 2.748 2.808 5.664 5.702 multiply_cannon_metrocomm4 7587 15.6 0.024 0.027 1.896 5.537 mp_alltoall_d11v 2429 14.1 4.744 5.531 4.744 5.531 dbcsr_complete_redistribute 397 12.7 2.120 2.230 5.216 5.497 mp_irecv_dv 29102 15.9 1.859 5.454 1.859 5.454 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 5.349 5.383 qs_energies_init_hamiltonians 11 5.9 0.001 0.001 5.226 5.228 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 4.271 4.307 rs_pw_transfer 1054 12.0 0.013 0.013 4.070 4.263 potential_pw2rs 129 12.3 0.027 0.028 4.056 4.072 build_core_hamiltonian_matrix_ 11 4.9 0.001 0.001 3.560 3.840 qs_ot_get_orbitals 118 10.6 0.001 0.001 3.779 3.823 copy_fm_to_dbcsr 210 11.7 0.002 0.002 3.401 3.733 copy_dbcsr_to_fm 187 11.8 0.004 0.004 3.546 3.623 qs_ot_get_derivative_taylor 40 13.0 0.001 0.001 3.451 3.479 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="408", plot="h2o_256_md", label="(8n/2r/6t)", y=172.406000, yerr=0.000000 PlotPoint: name="409", plot="h2o_256_md_mem", label="(8n/2r/6t)", y=1399.818182, yerr=46.853385 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/20/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 9 x 32 1410022950912 0.0% 0.0% 100.0% flops 32 x 32 x 32 1924145348608 0.0% 0.0% 100.0% flops 22 x 9 x 32 1957871443968 0.0% 0.0% 100.0% flops 9 x 22 x 32 1963542011904 0.0% 0.0% 100.0% flops 22 x 22 x 32 2714615709696 0.0% 0.0% 100.0% flops 32 x 32 x 9 4377645416448 0.0% 0.0% 100.0% flops 32 x 32 x 22 5350455508992 0.0% 0.0% 100.0% flops 9 x 32 x 32 5395653328896 0.0% 0.0% 100.0% flops 22 x 32 x 32 6594687401984 0.0% 0.0% 100.0% flops 9 x 32 x 9 11444706349056 0.0% 0.0% 100.0% flops 22 x 32 x 9 15019182452736 0.0% 0.0% 100.0% flops 9 x 32 x 22 15019182452736 0.0% 0.0% 100.0% flops 22 x 32 x 22 19624853225472 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 92.796564E+12 0.0% 0.0% 100.0% flops max/rank 11.606412E+12 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 6705499488 0.0% 0.0% 100.0% number of processed stacks 1947808 0.0% 0.0% 100.0% average stack size 0.0 0.0 3442.6 marketing flops 143.507742E+12 ------------------------------------------------------------------------------- # multiplications 2485 max memory usage/rank 3.035910E+09 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 99400 MPI messages size (bytes): total size 1.127422E+12 min size 0.000000E+00 max size 104.857600E+06 average size 11.342272E+06 MPI breakdown and total messages size (bytes): size <= 128 572 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 44 2883584 131072 < size <= 4194304 44768 34745614336 4194304 < size <= 16777216 43984 376564613120 16777216 < size 10032 716108490000 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3995 59242. MP_Allreduce 10985 1515. MP_Sync 86 MP_Alltoall 1700 36954339. MP_ISendRecv 3556 218624. MP_Wait 11506 MP_ISend 6360 1080477. MP_IRecv 6360 1080477. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.022 0.050 286.420 286.420 qs_mol_dyn_low 1 2.0 0.003 0.003 285.902 285.912 qs_forces 11 3.9 0.003 0.006 285.803 285.813 qs_energies 11 4.9 0.002 0.002 277.242 277.256 scf_env_do_scf 11 5.9 0.001 0.001 255.347 255.365 velocity_verlet 10 3.0 0.002 0.002 207.291 207.298 scf_env_do_scf_inner_loop 116 6.6 0.003 0.008 130.409 130.411 init_scf_loop 11 6.9 0.000 0.000 124.682 124.684 prepare_preconditioner 11 7.9 0.000 0.000 119.987 120.015 make_preconditioner 11 8.9 0.000 0.000 119.987 120.015 make_full_inverse_cholesky 11 9.9 0.000 0.000 95.682 117.109 qs_scf_new_mos 116 7.6 0.001 0.001 89.288 89.386 qs_scf_loop_do_ot 116 8.6 0.001 0.001 89.287 89.385 ot_scf_mini 116 9.6 0.004 0.004 84.561 84.606 dbcsr_multiply_generic 2485 12.5 0.212 0.218 82.566 83.211 cp_fm_upper_to_full 104 14.8 53.258 76.499 53.258 76.499 multiply_cannon 2485 13.5 0.698 0.744 58.166 59.154 multiply_cannon_loop 2485 14.5 0.469 0.477 54.396 55.836 ot_mini 116 10.6 0.001 0.001 43.993 44.032 dbcsr_complete_redistribute 393 12.7 4.002 4.050 29.932 43.461 copy_fm_to_dbcsr 208 11.6 0.001 0.002 26.570 40.070 transfer_fm_to_dbcsr 11 9.9 0.000 0.000 24.259 37.664 mp_alltoall_i22 712 14.1 22.072 35.459 22.072 35.459 cp_fm_cholesky_invert 11 10.9 33.297 33.303 33.297 33.303 rebuild_ks_matrix 127 8.3 0.001 0.001 32.966 33.020 qs_ks_build_kohn_sham_matrix 127 9.3 0.017 0.017 32.965 33.019 mp_waitall_1 104546 16.7 28.268 31.993 28.268 31.993 qs_ks_update_qs_env 127 7.6 0.001 0.001 30.735 30.794 qs_ot_get_p 127 10.4 0.001 0.001 25.243 25.302 qs_ot_get_derivative 116 11.6 0.002 0.002 23.760 23.807 qs_ot_p2m_diag 82 11.4 0.867 0.872 21.321 21.351 make_m2s 4970 13.5 0.076 0.078 19.870 20.944 make_images 4970 14.5 3.717 3.888 19.395 20.472 ot_diis_step 116 11.6 0.022 0.022 20.200 20.201 multiply_cannon_metrocomm3 9940 15.5 0.023 0.023 18.580 19.998 apply_preconditioner_dbcsr 127 12.6 0.000 0.000 19.494 19.688 apply_single 127 13.6 0.001 0.001 19.494 19.687 cp_dbcsr_syevd 82 12.4 0.005 0.006 19.626 19.627 multiply_cannon_multrec 9940 15.5 10.348 12.072 17.686 17.779 cp_fm_diag_elpa 82 13.4 0.000 0.000 16.498 16.499 cp_fm_diag_elpa_base 82 14.4 12.189 13.791 16.494 16.495 multiply_cannon_sync_h2d 9940 15.5 15.528 15.566 15.528 15.566 sum_up_and_integrate 127 10.3 0.318 0.321 15.414 15.504 integrate_v_rspace 127 11.3 0.004 0.004 15.095 15.185 qs_rho_update_rho_low 127 7.7 0.001 0.001 14.709 14.995 calculate_rho_elec 127 8.7 0.479 0.479 14.709 14.994 hybrid_alltoall_any 5155 16.4 1.290 2.994 10.914 12.759 make_images_data 4970 15.5 0.059 0.064 10.783 12.626 init_scf_run 11 5.9 0.000 0.001 11.982 11.982 scf_env_initial_rho_setup 11 6.9 0.001 0.001 11.982 11.982 qs_ot_get_derivative_diag 76 12.4 0.002 0.002 9.205 9.245 dbcsr_mm_accdrv_process 20590 16.0 3.783 5.703 7.104 8.933 cp_fm_cholesky_decompose 22 10.9 8.797 8.831 8.797 8.831 grid_integrate_task_list 127 12.3 8.478 8.658 8.478 8.658 wfi_extrapolate 11 7.9 0.001 0.001 8.562 8.562 pw_transfer 1535 11.6 0.089 0.090 8.130 8.138 qs_energies_init_hamiltonians 11 5.9 0.001 0.001 7.953 7.961 fft_wrap_pw1pw2 1281 12.7 0.011 0.011 7.899 7.906 fft_wrap_pw1pw2_140 519 13.2 0.536 0.539 7.023 7.038 calculate_dm_sparse 127 9.5 0.001 0.001 6.869 6.970 mp_alltoall_d11v 2401 14.1 6.810 6.960 6.810 6.960 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 6.248 6.326 grid_collocate_task_list 127 9.7 6.284 6.318 6.284 6.318 copy_dbcsr_to_fm 185 11.7 0.004 0.004 6.013 6.059 fft3d_ps 1281 14.7 2.739 2.747 5.955 5.970 density_rs2pw 127 9.7 0.005 0.005 5.837 5.871 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="410", plot="h2o_256_md", label="(8n/1r/12t)", y=286.420000, yerr=0.000000 PlotPoint: name="411", plot="h2o_256_md_mem", label="(8n/1r/12t)", y=2661.181818, yerr=186.977791 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/21/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 23 x 23 x 23 234439235724792 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 234.439236E+12 0.0% 0.0% 100.0% flops max/rank 2.766000E+12 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 9634225188 0.0% 0.0% 100.0% number of processed stacks 419739 0.0% 0.0% 100.0% average stack size 0.0 0.0 22952.9 marketing flops 1.742116E+15 ------------------------------------------------------------------------------- # multiplications 111 max memory usage/rank 1.261548E+09 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 458208 MPI messages size (bytes): total size 3.456111E+12 min size 0.000000E+00 max size 18.735064E+06 average size 7.542668E+06 MPI breakdown and total messages size (bytes): size <= 128 112896 0 128 < size <= 8192 0 0 8192 < size <= 32768 224 5687808 32768 < size <= 131072 10528 813356544 131072 < size <= 4194304 36422 76284728544 4194304 < size <= 16777216 294266 3312457683808 16777216 < size 3872 66548597808 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 1026 255669. MP_Allreduce 3059 6274. MP_Sync 4 MP_Alltoall 54 MP_ISendRecv 570 19200. MP_Wait 1302 MP_ISend 642 197829. MP_IRecv 642 197607. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.021 0.050 84.922 84.931 qs_energies 1 2.0 0.000 0.000 84.522 84.529 ls_scf 1 3.0 0.000 0.000 83.628 83.635 dbcsr_multiply_generic 111 6.7 0.014 0.015 72.854 73.047 multiply_cannon 111 7.7 0.017 0.021 55.963 57.360 multiply_cannon_loop 111 8.7 0.210 0.226 52.516 54.157 ls_scf_main 1 4.0 0.000 0.000 52.413 52.414 density_matrix_trs4 2 5.0 0.002 0.003 46.915 46.992 ls_scf_init_scf 1 4.0 0.000 0.001 28.129 28.130 ls_scf_init_matrix_S 1 5.0 0.000 0.000 27.082 27.133 mp_waitall_1 11316 10.9 22.832 25.637 22.832 25.637 matrix_sqrt_Newton_Schulz 2 6.5 0.001 0.001 24.985 25.008 multiply_cannon_multrec 2664 9.7 8.152 9.058 15.451 17.295 multiply_cannon_sync_h2d 2664 9.7 13.552 15.297 13.552 15.297 make_m2s 222 7.7 0.008 0.010 13.181 13.658 make_images 222 8.7 0.099 0.109 13.159 13.638 multiply_cannon_metrocomm1 2664 9.7 0.010 0.011 9.962 12.717 multiply_cannon_metrocomm3 2664 9.7 0.009 0.011 5.389 9.512 make_images_data 222 9.7 0.004 0.005 7.809 8.436 hybrid_alltoall_any 227 10.6 0.215 1.831 6.608 8.024 dbcsr_mm_accdrv_process 4760 10.4 0.507 0.602 6.916 7.893 dbcsr_mm_accdrv_process_sort 4760 11.4 6.209 7.102 6.209 7.102 calculate_norms 4752 9.8 5.510 6.290 5.510 6.290 apply_matrix_preconditioner 6 5.3 0.000 0.000 5.114 5.274 mp_sum_l 807 5.4 3.170 4.605 3.170 4.605 dbcsr_multiply_generic_mpsum_f 86 7.8 0.000 0.000 2.385 3.492 make_images_sizes 222 9.7 0.000 0.000 0.789 3.360 mp_alltoall_i44 222 10.7 0.789 3.360 0.789 3.360 multiply_cannon_metrocomm4 2442 9.7 0.012 0.014 2.073 3.358 mp_irecv_dv 6231 10.9 2.055 3.340 2.055 3.340 arnoldi_extremal 4 6.8 0.000 0.000 3.206 3.231 arnoldi_normal_ev 4 7.8 0.001 0.002 3.206 3.231 build_subspace 16 8.4 0.009 0.012 3.104 3.106 ls_scf_post 1 4.0 0.000 0.000 3.085 3.092 ls_scf_store_result 1 5.0 0.000 0.000 2.894 2.944 dbcsr_special_finalize 555 9.7 0.005 0.005 2.228 2.699 dbcsr_merge_single_wm 555 10.7 0.453 0.568 2.221 2.691 make_images_pack 222 9.7 2.209 2.635 2.210 2.637 dbcsr_matrix_vector_mult 304 9.0 0.003 0.010 2.311 2.553 dbcsr_matrix_vector_mult_local 304 10.0 2.064 2.460 2.066 2.462 dbcsr_sort_data 658 11.4 2.026 2.424 2.026 2.424 ls_scf_dm_to_ks 2 5.0 0.000 0.000 2.198 2.294 buffer_matrices_ensure_size 222 8.7 1.745 2.089 1.745 2.089 qs_ks_update_qs_env 3 6.3 0.000 0.000 1.719 1.721 rebuild_ks_matrix 3 7.3 0.000 0.000 1.710 1.711 qs_ks_build_kohn_sham_matrix 3 8.3 0.001 0.003 1.710 1.711 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="500", plot="h2o_32_nrep3_ls", label="(8n/12r/1t)", y=84.931000, yerr=0.000000 PlotPoint: name="501", plot="h2o_32_nrep3_ls_mem", label="(8n/12r/1t)", y=1141.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/22/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 23 x 23 x 23 234439235724792 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 234.439236E+12 0.0% 0.0% 100.0% flops max/rank 5.588524E+12 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 9634225188 0.0% 0.0% 100.0% number of processed stacks 368848 0.0% 0.0% 100.0% average stack size 0.0 0.0 26119.8 marketing flops 1.742116E+15 ------------------------------------------------------------------------------- # multiplications 111 max memory usage/rank 2.082890E+09 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 106560 MPI messages size (bytes): total size 2.699093E+12 min size 0.000000E+00 max size 72.286792E+06 average size 25.329324E+06 MPI breakdown and total messages size (bytes): size <= 128 23040 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 3264 325830144 131072 < size <= 4194304 5280 3328561104 4194304 < size <= 16777216 12709 156766962056 16777216 < size 62267 2538670978840 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 1026 266696. MP_Allreduce 3058 10339. MP_Sync 4 MP_Alltoall 47 15335933. MP_ISendRecv 282 57600. MP_Wait 828 MP_ISend 462 414589. MP_IRecv 462 413870. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.024 0.044 90.152 90.154 qs_energies 1 2.0 0.000 0.000 89.720 89.724 ls_scf 1 3.0 0.000 0.000 88.416 88.420 dbcsr_multiply_generic 111 6.7 0.015 0.016 74.729 75.085 multiply_cannon 111 7.7 0.028 0.040 52.928 56.429 ls_scf_main 1 4.0 0.000 0.000 54.431 54.436 multiply_cannon_loop 111 8.7 0.116 0.124 49.664 52.540 density_matrix_trs4 2 5.0 0.002 0.003 48.875 49.048 mp_waitall_1 9246 10.9 20.763 30.695 20.763 30.695 ls_scf_init_scf 1 4.0 0.000 0.001 30.493 30.494 ls_scf_init_matrix_S 1 5.0 0.000 0.000 29.363 29.461 multiply_cannon_multrec 1332 9.7 13.105 16.905 22.322 27.300 matrix_sqrt_Newton_Schulz 2 6.5 0.001 0.001 26.969 26.979 multiply_cannon_metrocomm3 1332 9.7 0.007 0.008 11.479 21.472 make_m2s 222 7.7 0.006 0.007 15.263 16.009 make_images 222 8.7 1.567 1.915 15.233 15.980 dbcsr_mm_accdrv_process 4041 10.4 0.267 0.445 8.817 10.340 dbcsr_mm_accdrv_process_sort 4041 11.4 8.411 9.912 8.411 9.912 make_images_data 222 9.7 0.004 0.004 8.705 9.731 hybrid_alltoall_any 227 10.6 0.523 2.452 8.102 9.117 mp_sum_l 807 5.4 5.241 8.534 5.241 8.534 multiply_cannon_metrocomm4 1221 9.7 0.006 0.008 3.232 7.750 mp_irecv_dv 3311 11.0 3.212 7.700 3.212 7.700 dbcsr_multiply_generic_mpsum_f 86 7.8 0.000 0.000 4.027 6.899 calculate_norms 2376 9.8 6.019 6.841 6.019 6.841 multiply_cannon_sync_h2d 1332 9.7 4.841 6.091 4.841 6.091 apply_matrix_preconditioner 6 5.3 0.000 0.000 4.981 5.155 arnoldi_extremal 4 6.8 0.000 0.000 4.643 4.669 arnoldi_normal_ev 4 7.8 0.001 0.004 4.643 4.669 build_subspace 16 8.4 0.014 0.021 4.386 4.389 ls_scf_post 1 4.0 0.000 0.000 3.493 3.497 dbcsr_matrix_vector_mult 304 9.0 0.005 0.017 3.117 3.361 ls_scf_store_result 1 5.0 0.000 0.000 3.247 3.318 dbcsr_matrix_vector_mult_local 304 10.0 2.731 3.220 2.733 3.222 multiply_cannon_metrocomm1 1332 9.7 0.003 0.004 1.207 2.850 ls_scf_dm_to_ks 2 5.0 0.000 0.000 2.490 2.575 mp_allgather_i34 111 8.7 0.974 2.450 0.974 2.450 make_images_pack 222 9.7 2.027 2.382 2.029 2.384 dbcsr_sort_data 436 11.2 1.834 2.105 1.834 2.105 dbcsr_data_new 4174 10.1 1.614 1.846 1.614 1.846 qs_ks_update_qs_env 3 6.3 0.000 0.000 1.807 1.809 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="502", plot="h2o_32_nrep3_ls", label="(8n/6r/2t)", y=90.154000, yerr=0.000000 PlotPoint: name="503", plot="h2o_32_nrep3_ls_mem", label="(8n/6r/2t)", y=1791.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/23/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 23 x 23 x 23 234439235724792 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 234.439236E+12 0.0% 0.0% 100.0% flops max/rank 8.404608E+12 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 9634225188 0.0% 0.0% 100.0% number of processed stacks 353133 0.0% 0.0% 100.0% average stack size 0.0 0.0 27282.1 marketing flops 1.742118E+15 ------------------------------------------------------------------------------- # multiplications 111 max memory usage/rank 2.753712E+09 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 46176 MPI messages size (bytes): total size 1.924064E+12 min size 0.000000E+00 max size 108.059888E+06 average size 41.668048E+06 MPI breakdown and total messages size (bytes): size <= 128 9984 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 0 0 131072 < size <= 4194304 3328 1170063360 4194304 < size <= 16777216 1870 19378539600 16777216 < size 30994 1903514987232 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 1026 265470. MP_Allreduce 3058 11181. MP_Sync 4 MP_Alltoall 47 23526250. MP_ISendRecv 186 57600. MP_Wait 732 MP_ISend 462 560046. MP_IRecv 462 560662. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.040 0.088 94.250 94.251 qs_energies 1 2.0 0.000 0.000 93.599 93.633 ls_scf 1 3.0 0.001 0.009 92.167 92.201 dbcsr_multiply_generic 111 6.7 0.016 0.020 76.964 77.257 ls_scf_main 1 4.0 0.000 0.000 57.783 57.790 multiply_cannon 111 7.7 0.040 0.144 53.047 56.540 multiply_cannon_loop 111 8.7 0.100 0.106 49.538 53.611 density_matrix_trs4 2 5.0 0.002 0.003 51.917 52.104 mp_waitall_1 7374 11.0 24.128 34.296 24.128 34.296 ls_scf_init_scf 1 4.0 0.000 0.002 30.777 30.779 ls_scf_init_matrix_S 1 5.0 0.000 0.001 29.536 29.591 matrix_sqrt_Newton_Schulz 2 6.5 0.001 0.001 27.155 27.171 multiply_cannon_multrec 888 9.7 12.622 15.381 21.124 24.464 multiply_cannon_metrocomm3 888 9.7 0.004 0.004 11.181 23.849 make_m2s 222 7.7 0.006 0.007 17.057 18.299 make_images 222 8.7 1.967 2.284 17.019 18.261 hybrid_alltoall_any 227 10.6 0.620 2.852 9.397 10.804 make_images_data 222 9.7 0.004 0.004 9.738 10.691 mp_sum_l 807 5.4 5.311 9.354 5.311 9.354 dbcsr_mm_accdrv_process 3754 10.4 0.262 0.432 8.039 9.275 dbcsr_mm_accdrv_process_sort 3754 11.4 7.664 8.843 7.664 8.843 multiply_cannon_sync_h2d 888 9.7 6.046 7.544 6.046 7.544 dbcsr_multiply_generic_mpsum_f 86 7.8 0.000 0.000 4.073 7.399 multiply_cannon_metrocomm4 777 9.7 0.004 0.005 2.477 7.202 mp_irecv_dv 2335 11.1 2.461 7.146 2.461 7.146 multiply_cannon_metrocomm1 888 9.7 0.003 0.003 3.830 6.857 apply_matrix_preconditioner 6 5.3 0.000 0.000 4.959 5.166 arnoldi_extremal 4 6.8 0.000 0.000 5.051 5.076 arnoldi_normal_ev 4 7.8 0.001 0.005 5.051 5.076 build_subspace 16 8.4 0.014 0.020 4.742 4.750 calculate_norms 1584 9.8 4.269 4.604 4.269 4.604 mp_allgather_i34 111 8.7 1.349 3.793 1.349 3.793 dbcsr_matrix_vector_mult 304 9.0 0.005 0.016 3.400 3.735 ls_scf_post 1 4.0 0.002 0.012 3.606 3.641 dbcsr_matrix_vector_mult_local 304 10.0 3.002 3.581 3.004 3.583 ls_scf_store_result 1 5.0 0.000 0.000 3.334 3.429 ls_scf_dm_to_ks 2 5.0 0.000 0.001 2.721 2.813 make_images_sizes 222 9.7 0.000 0.000 1.000 2.208 mp_alltoall_i44 222 10.7 1.000 2.207 1.000 2.207 dbcsr_sort_data 325 11.1 1.876 2.126 1.876 2.126 make_images_pack 222 9.7 1.810 2.113 1.813 2.116 dbcsr_data_release 9322 10.9 1.309 2.046 1.309 2.046 qs_ks_update_qs_env 3 6.3 0.000 0.000 1.935 1.937 rebuild_ks_matrix 3 7.3 0.000 0.000 1.917 1.919 qs_ks_build_kohn_sham_matrix 3 8.3 0.003 0.011 1.917 1.919 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="504", plot="h2o_32_nrep3_ls", label="(8n/4r/3t)", y=94.251000, yerr=0.000000 PlotPoint: name="505", plot="h2o_32_nrep3_ls_mem", label="(8n/4r/3t)", y=2190.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/24/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 23 x 23 x 23 234439235724792 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 234.439236E+12 0.0% 0.0% 100.0% flops max/rank 10.747127E+12 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 9634225188 0.0% 0.0% 100.0% number of processed stacks 369794 0.0% 0.0% 100.0% average stack size 0.0 0.0 26053.0 marketing flops 1.742116E+15 ------------------------------------------------------------------------------- # multiplications 111 max memory usage/rank 3.320594E+09 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 50616 MPI messages size (bytes): total size 1.536549E+12 min size 0.000000E+00 max size 72.286792E+06 average size 30.356986E+06 MPI breakdown and total messages size (bytes): size <= 128 10368 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 1056 104411904 131072 < size <= 4194304 3168 831638784 4194304 < size <= 16777216 3103 33613273640 16777216 < size 32921 1501999894888 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 1026 266696. MP_Allreduce 3058 13371. MP_Sync 4 MP_Alltoall 47 30278988. MP_ISendRecv 138 86400. MP_Wait 600 MP_ISend 378 823502. MP_IRecv 378 823753. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.031 0.048 97.114 97.116 qs_energies 1 2.0 0.000 0.000 96.498 96.504 ls_scf 1 3.0 0.000 0.000 94.860 94.865 dbcsr_multiply_generic 111 6.7 0.016 0.017 78.846 79.142 ls_scf_main 1 4.0 0.000 0.000 59.021 59.022 multiply_cannon 111 7.7 0.057 0.129 51.774 56.385 density_matrix_trs4 2 5.0 0.002 0.003 53.052 53.191 multiply_cannon_loop 111 8.7 0.115 0.125 46.735 49.937 ls_scf_init_scf 1 4.0 0.000 0.001 32.587 32.589 ls_scf_init_matrix_S 1 5.0 0.000 0.000 31.438 31.520 matrix_sqrt_Newton_Schulz 2 6.5 0.001 0.001 28.966 28.978 mp_waitall_1 6438 11.0 23.109 28.931 23.109 28.931 multiply_cannon_multrec 1332 9.7 14.135 17.222 21.970 24.763 make_m2s 222 7.7 0.007 0.008 21.136 22.533 make_images 222 8.7 3.130 3.590 21.086 22.485 multiply_cannon_metrocomm3 1332 9.7 0.003 0.003 9.456 17.535 make_images_data 222 9.7 0.004 0.004 11.869 13.415 hybrid_alltoall_any 227 10.6 0.797 3.794 11.226 12.986 dbcsr_mm_accdrv_process 3641 10.4 0.229 0.409 7.470 8.981 dbcsr_mm_accdrv_process_sort 3641 11.4 7.096 8.565 7.096 8.565 mp_sum_l 807 5.4 4.356 8.433 4.356 8.433 dbcsr_multiply_generic_mpsum_f 86 7.8 0.000 0.000 3.383 6.713 multiply_cannon_sync_h2d 1332 9.7 5.524 6.324 5.524 6.324 multiply_cannon_metrocomm4 1110 9.7 0.004 0.006 2.061 6.004 mp_irecv_dv 3229 10.9 2.038 5.926 2.038 5.926 arnoldi_extremal 4 6.8 0.000 0.000 5.193 5.218 arnoldi_normal_ev 4 7.8 0.001 0.005 5.193 5.218 multiply_cannon_metrocomm1 1332 9.7 0.003 0.003 2.611 5.047 build_subspace 16 8.4 0.014 0.021 4.844 4.849 apply_matrix_preconditioner 6 5.3 0.000 0.000 4.514 4.656 calculate_norms 2376 9.8 4.180 4.537 4.180 4.537 mp_allgather_i34 111 8.7 2.164 4.475 2.164 4.475 dbcsr_matrix_vector_mult 304 9.0 0.006 0.016 3.553 3.844 dbcsr_matrix_vector_mult_local 304 10.0 3.161 3.665 3.163 3.667 dbcsr_sort_data 658 11.4 3.083 3.363 3.083 3.363 ls_scf_post 1 4.0 0.000 0.000 3.252 3.259 dbcsr_special_finalize 555 9.7 0.006 0.008 2.826 3.144 dbcsr_merge_single_wm 555 10.7 0.537 0.655 2.818 3.136 ls_scf_store_result 1 5.0 0.000 0.000 2.968 3.057 ls_scf_dm_to_ks 2 5.0 0.000 0.000 2.884 2.938 dbcsr_data_release 10477 10.7 1.573 2.423 1.573 2.423 dbcsr_finalize 304 7.8 0.049 0.061 1.798 1.980 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="506", plot="h2o_32_nrep3_ls", label="(8n/3r/4t)", y=97.116000, yerr=0.000000 PlotPoint: name="507", plot="h2o_32_nrep3_ls_mem", label="(8n/3r/4t)", y=2724.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/25/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 23 x 23 x 23 234439235724792 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 234.439236E+12 0.0% 0.0% 100.0% flops max/rank 15.383312E+12 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 9634225188 0.0% 0.0% 100.0% number of processed stacks 336818 0.0% 0.0% 100.0% average stack size 0.0 0.0 28603.7 marketing flops 1.742118E+15 ------------------------------------------------------------------------------- # multiplications 111 max memory usage/rank 4.623303E+09 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 10656 MPI messages size (bytes): total size 1.149035E+12 min size 0.000000E+00 max size 203.538048E+06 average size 107.829832E+06 MPI breakdown and total messages size (bytes): size <= 128 2304 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 0 0 131072 < size <= 4194304 768 702038016 4194304 < size <= 16777216 0 0 16777216 < size 7584 1148332810224 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 2 12. MP_Allreduce 705 128. MP_Alltoall 310 12920694. MP_ISend 1776 40180424. MP_IRecv 1776 40465030. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 1026 265558. MP_Allreduce 3049 15663. MP_Sync 4 MP_Alltoall 47 46208988. MP_ISendRecv 90 115200. MP_Wait 573 MP_ISend 420 924980. MP_IRecv 420 924528. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.038 0.056 99.127 99.129 qs_energies 1 2.0 0.000 0.000 98.336 98.344 ls_scf 1 3.0 0.000 0.000 96.425 96.432 dbcsr_multiply_generic 111 6.7 0.017 0.019 78.339 78.581 ls_scf_main 1 4.0 0.000 0.000 62.177 62.178 multiply_cannon 111 7.7 0.077 0.139 55.520 60.088 density_matrix_trs4 2 5.0 0.002 0.002 55.225 55.297 multiply_cannon_loop 111 8.7 0.069 0.077 51.105 53.684 mp_waitall_1 5481 11.0 27.033 32.457 27.033 32.457 ls_scf_init_scf 1 4.0 0.000 0.001 30.642 30.644 ls_scf_init_matrix_S 1 5.0 0.000 0.000 29.483 29.517 matrix_sqrt_Newton_Schulz 2 6.5 0.001 0.001 27.316 27.326 multiply_cannon_multrec 444 9.7 14.094 16.736 21.126 23.760 make_m2s 222 7.7 0.004 0.005 17.809 20.220 make_images 222 8.7 3.711 4.395 17.748 20.160 multiply_cannon_metrocomm1 444 9.7 0.002 0.002 11.669 16.623 multiply_cannon_metrocomm3 444 9.7 0.001 0.001 5.992 15.488 make_images_data 222 9.7 0.003 0.004 10.169 12.420 hybrid_alltoall_any 227 10.6 0.788 3.763 9.918 12.356 multiply_cannon_sync_h2d 444 9.7 6.571 8.130 6.571 8.130 dbcsr_mm_accdrv_process 3003 10.4 0.165 0.343 6.726 7.867 dbcsr_mm_accdrv_process_sort 3003 11.4 6.411 7.524 6.411 7.524 mp_allgather_i34 111 8.7 2.647 6.995 2.647 6.995 arnoldi_extremal 4 6.8 0.000 0.000 5.753 5.762 arnoldi_normal_ev 4 7.8 0.001 0.004 5.753 5.762 build_subspace 16 8.4 0.015 0.020 5.361 5.372 mp_sum_l 807 5.4 3.061 4.864 3.061 4.864 apply_matrix_preconditioner 6 5.3 0.000 0.000 4.578 4.750 dbcsr_matrix_vector_mult 304 9.0 0.007 0.017 4.122 4.320 dbcsr_matrix_vector_mult_local 304 10.0 3.653 4.125 3.655 4.127 multiply_cannon_metrocomm4 333 9.7 0.001 0.002 1.585 3.808 mp_irecv_dv 1241 11.2 1.563 3.757 1.563 3.757 calculate_norms 792 9.8 3.533 3.695 3.533 3.695 dbcsr_multiply_generic_mpsum_f 86 7.8 0.000 0.000 2.125 3.659 ls_scf_post 1 4.0 0.000 0.000 3.606 3.613 make_images_sizes 222 9.7 0.000 0.000 1.011 3.546 mp_alltoall_i44 222 10.7 1.011 3.546 1.011 3.546 ls_scf_dm_to_ks 2 5.0 0.000 0.000 3.360 3.456 ls_scf_store_result 1 5.0 0.000 0.000 3.389 3.429 dbcsr_finalize 304 7.8 0.062 0.078 2.192 2.253 dbcsr_merge_all 275 8.9 0.473 0.520 2.041 2.086 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="508", plot="h2o_32_nrep3_ls", label="(8n/2r/6t)", y=99.129000, yerr=0.000000 PlotPoint: name="509", plot="h2o_32_nrep3_ls_mem", label="(8n/2r/6t)", y=3585.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cb0f57ca944f9b314758deba717f9b13e7d43235_performance_tests/26/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 23 x 23 x 23 234439235724792 0.0% 0.0% 100.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 234.439236E+12 0.0% 0.0% 100.0% flops max/rank 30.358840E+12 0.0% 0.0% 100.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 9634225188 0.0% 0.0% 100.0% number of processed stacks 339931 0.0% 0.0% 100.0% average stack size 0.0 0.0 28341.7 marketing flops 1.742118E+15 ------------------------------------------------------------------------------- # multiplications 111 max memory usage/rank 8.767099E+09 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 4440 MPI messages size (bytes): total size 770.525954E+09 min size 0.000000E+00 max size 399.069120E+06 average size 173.541888E+06 MPI breakdown and total messages size (bytes): size <= 128 640 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 0 0 131072 < size <= 4194304 640 468025344 4194304 < size <= 16777216 0 0 16777216 < size 3160 770057961712 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 1026 284111. MP_Allreduce 3043 21950. MP_Sync 4 MP_Alltoall 47 88727262. MP_ISendRecv 84 732600. MP_Wait 309 MP_ISend 180 3337386. MP_IRecv 180 3339494. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.078 0.104 107.358 107.362 qs_energies 1 2.0 0.000 0.000 105.956 105.970 ls_scf 1 3.0 0.000 0.000 103.061 103.075 dbcsr_multiply_generic 111 6.7 0.023 0.028 77.426 77.529 ls_scf_main 1 4.0 0.000 0.000 65.580 65.581 density_matrix_trs4 2 5.0 0.002 0.003 56.731 56.784 multiply_cannon 111 7.7 0.136 0.202 49.903 51.580 multiply_cannon_loop 111 8.7 0.067 0.070 46.451 47.530 ls_scf_init_scf 1 4.0 0.001 0.001 33.830 33.831 ls_scf_init_matrix_S 1 5.0 0.000 0.000 32.503 32.511 matrix_sqrt_Newton_Schulz 2 6.5 0.001 0.001 29.656 29.665 mp_waitall_1 4569 11.1 22.131 25.915 22.131 25.915 make_m2s 222 7.7 0.005 0.005 23.717 24.674 make_images 222 8.7 4.588 4.956 23.611 24.565 multiply_cannon_multrec 444 9.7 17.848 18.696 22.431 23.073 hybrid_alltoall_any 227 10.6 1.663 3.635 12.771 15.435 make_images_data 222 9.7 0.003 0.004 12.970 15.322 multiply_cannon_metrocomm3 444 9.7 0.001 0.001 10.565 10.995 multiply_cannon_sync_h2d 444 9.7 8.848 8.892 8.848 8.892 arnoldi_extremal 4 6.8 0.000 0.000 7.242 7.260 arnoldi_normal_ev 4 7.8 0.002 0.008 7.242 7.260 build_subspace 16 8.4 0.026 0.036 6.672 6.679 dbcsr_matrix_vector_mult 304 9.0 0.009 0.025 5.340 5.486 apply_matrix_preconditioner 6 5.3 0.000 0.000 5.062 5.309 dbcsr_matrix_vector_mult_local 304 10.0 4.933 5.220 4.935 5.222 ls_scf_dm_to_ks 2 5.0 0.000 0.000 4.800 4.890 dbcsr_mm_accdrv_process 1814 10.4 0.231 0.316 4.418 4.542 dbcsr_mm_accdrv_process_sort 1814 11.4 4.118 4.253 4.118 4.253 ls_scf_post 1 4.0 0.000 0.000 3.650 3.664 make_images_sizes 222 9.7 0.000 0.000 1.422 3.459 mp_alltoall_i44 222 10.7 1.422 3.459 1.422 3.459 ls_scf_store_result 1 5.0 0.000 0.000 3.404 3.413 mp_allgather_i34 111 8.7 1.076 3.407 1.076 3.407 calculate_norms 792 9.8 3.239 3.278 3.239 3.278 dbcsr_finalize 304 7.8 0.082 0.089 3.083 3.140 dbcsr_merge_all 275 8.9 0.891 0.917 2.869 2.920 dbcsr_complete_redistribute 5 7.6 1.433 1.475 2.763 2.906 qs_energies_init_hamiltonians 1 3.0 0.001 0.001 2.865 2.865 dbcsr_data_release 12724 10.6 2.316 2.839 2.316 2.839 matrix_ls_to_qs 2 6.0 0.000 0.000 2.414 2.571 dbcsr_sort_data 325 11.1 2.442 2.494 2.442 2.494 dbcsr_new_transposed 4 7.5 0.242 0.250 2.293 2.305 mp_sum_l 807 5.4 1.479 2.282 1.479 2.282 dbcsr_frobenius_norm 74 6.6 2.056 2.136 2.199 2.240 dbcsr_add_d 103 6.2 0.000 0.000 2.132 2.206 dbcsr_add_anytype 103 7.2 0.859 0.893 2.132 2.206 qs_ks_update_qs_env 3 6.3 0.000 0.000 2.198 2.200 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="510", plot="h2o_32_nrep3_ls", label="(8n/1r/12t)", y=107.362000, yerr=0.000000 PlotPoint: name="511", plot="h2o_32_nrep3_ls_mem", label="(8n/1r/12t)", y=6919.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ========= END RESULTS =========== CommitSHA: cb0f57ca944f9b314758deba717f9b13e7d43235 Summary: empty Status: OK