=== This is the CP2K Performance-Test === Already up to date. Current branch master is up to date. Already up to date. Current branch master is up to date. GIT Revision: aa4adac34cf79714cddcdea931a90aba3810da90 ################# ARCHITECTURE FILE ################## #!/bin/bash # # CP2K arch file for Cray-XC40 (Piz Daint, CSCS, multi-core partition) # # Tested with: GNU 11.2.0, Cray-MPICH 7.7.18, # Cray-libsci 20.09.1, Cray-FFTW 3.3.8.10, # COSMA 2.6.6, ELPA 2023.05.001, HDF5 1.14.2, # LIBINT 2.6.0, LIBPEXSI 1.2.0, LIBXC 6.2.2, # LIBVORI 220621, LIBXSMM 1.17, PLUMED 2.9.0, # SIRIUS 7.5.2, SPGLIB 1.16.2, LIBGRPP 20231215, # SPFFT 1.0.6, SPLA 1.5.5 # # Usage: Source this arch file and then run make as instructed. # A full toolchain installation is performed as default. # Replace or adapt the "module add" commands below if needed. # # Last update: 12.03.2024 # # \ if [ "${0}" = "${BASH_SOURCE}" ]; then \ echo "ERROR: Script ${0##*/} must be sourced"; \ echo "Usage: source ${0##*/}"; \ exit 1; \ fi; \ this_file=${BASH_SOURCE##*/}; \ if [ -n "${1}" ]; then \ gcc_version="${1}"; \ else \ gcc_version="11.2.0"; \ fi; \ module add daint-mc; \ module rm PrgEnv-cray; \ module add PrgEnv-gnu; \ module rm gcc; \ module add gcc/${gcc_version}; \ module add cray-fftw/3.3.8.10; \ echo "Expected setup:"; \ echo " cray-mpich/7.7.18"; \ echo " craype-broadwell"; \ echo " daint-mc/21.09"; \ echo " craype/2.7.10"; \ echo " cray-libsci/20.09.1"; \ echo " PrgEnv-gnu/6.0.10"; \ echo " gcc/${gcc_version}"; \ echo " cray-fftw/3.3.8.10"; \ module list; \ module -f save cp2k_mc_gnu_psmp; \ echo "To load the required modules in your batch job script, use:"; \ echo " module restore cp2k_mc_gnu_psmp"; \ cd tools/toolchain; \ ./install_cp2k_toolchain.sh -j${maxtasks} --no-arch-files --with-gcc=system --with-libvdwxc --with-pexsi --with-plumed; \ cd ../..; \ printf "Sourcing ${PWD}/tools/toolchain/install/setup ... "; \ source ${PWD}/tools/toolchain/install/setup; \ printf "done\n"; \ echo "Check the output above for error messages and consistency!"; \ echo; \ echo "If everything is OK, you can build a CP2K production binary with"; \ echo " make -j ARCH=${this_file%.*} VERSION=${this_file##*.}"; \ echo; \ echo "Alternatively, you can add further checks, e.g. for regression testing, with"; \ echo " make -j ARCH=${this_file%.*} VERSION=${this_file##*.} DO_CHECKS=yes"; \ echo "or build CP2K as a library with"; \ echo " make -j ARCH=${this_file%.*} VERSION=${this_file##*.} libcp2k"; \ echo; \ return # Set options DO_CHECKS := no USE_COSMA := 2.6.6 USE_ELPA := 2023.05.001 USE_HDF5 := 1.14.2 USE_LIBGRPP := 20231215 USE_LIBINT := 2.6.0 USE_LIBPEXSI := 1.2.0 USE_LIBVORI := 220621 USE_LIBXC := 6.2.2 USE_LIBXSMM := 1.17 USE_PLUMED := 2.9.0 USE_SPFFT := 1.0.6 USE_SPLA := 1.5.5 #USE_QUIP := 0.9.10 USE_SIRIUS := 7.5.2 USE_SPGLIB := 1.16.2 # Only needed for SIRIUS LIBVDWXC_VER := 0.4.0 # Only needed for LIBPEXSI SCOTCH_VER := 6.0.0 SUPERLU_VER := 6.1.0 LMAX := 5 MAX_CONTR := 4 CC := cc FC := ftn LD := ftn AR := ar -r # cc, CC, and ftn include already the proper -march flag CFLAGS := -O2 -fopenmp -fopenmp-simd -ftree-vectorize -funroll-loops -g DFLAGS := -D__parallel DFLAGS += -D__SCALAPACK DFLAGS += -D__FFTW3 DFLAGS += -D__MAX_CONTR=$(strip $(MAX_CONTR)) INSTALL_PATH := $(PWD)/tools/toolchain/install ifeq ($(DO_CHECKS), yes) DFLAGS += -D__CHECK_DIAG endif ifneq ($(USE_PLUMED),) USE_PLUMED := $(strip $(USE_PLUMED)) PLUMED_LIB := $(INSTALL_PATH)/plumed-$(USE_PLUMED)/lib DFLAGS += -D__PLUMED2 USE_GSL := 2.7 LIBS += $(PLUMED_LIB)/libplumed.a endif ifneq ($(USE_ELPA),) USE_ELPA := $(strip $(USE_ELPA)) TARGET := cpu ELPA_INC := $(INSTALL_PATH)/elpa-$(USE_ELPA)/$(TARGET)/include/elpa-$(USE_ELPA) ELPA_LIB := $(INSTALL_PATH)/elpa-$(USE_ELPA)/$(TARGET)/lib CFLAGS += -I$(ELPA_INC)/elpa -I$(ELPA_INC)/modules DFLAGS += -D__ELPA LIBS += $(ELPA_LIB)/libelpa.a endif ifneq ($(USE_QUIP),) USE_QUIP := $(strip $(USE_QUIP)) QUIP_INC := $(INSTALL_PATH)/quip-$(USE_QUIP)/include QUIP_LIB := $(INSTALL_PATH)/quip-$(USE_QUIP)/lib CFLAGS += -I$(QUIP_INC) DFLAGS += -D__QUIP LIBS += $(QUIP_LIB)/libquip_core.a LIBS += $(QUIP_LIB)/libatoms.a LIBS += $(QUIP_LIB)/libFoX_sax.a LIBS += $(QUIP_LIB)/libFoX_common.a LIBS += $(QUIP_LIB)/libFoX_utils.a LIBS += $(QUIP_LIB)/libFoX_fsys.a endif ifneq ($(USE_LIBPEXSI),) USE_LIBPEXSI := $(strip $(USE_LIBPEXSI)) SCOTCH_VER := $(strip $(SCOTCH_VER)) SUPERLU_VER := $(strip $(SUPERLU_VER)) LIBPEXSI_INC := $(INSTALL_PATH)/pexsi-$(USE_LIBPEXSI)/include LIBPEXSI_LIB := $(INSTALL_PATH)/pexsi-$(USE_LIBPEXSI)/lib SCOTCH_INC := $(INSTALL_PATH)/scotch-$(SCOTCH_VER)/include SCOTCH_LIB := $(INSTALL_PATH)/scotch-$(SCOTCH_VER)/lib SUPERLU_INC := $(INSTALL_PATH)/superlu_dist-$(SUPERLU_VER)/include SUPERLU_LIB := $(INSTALL_PATH)/superlu_dist-$(SUPERLU_VER)/lib CFLAGS += -I$(LIBPEXSI_INC) -I$(SCOTCH_INC) -I$(SUPERLU_INC) DFLAGS += -D__LIBPEXSI LIBS += $(LIBPEXSI_LIB)/libpexsi.a LIBS += $(SUPERLU_LIB)/libsuperlu_dist.a LIBS += $(SCOTCH_LIB)/libptscotchparmetis.a LIBS += $(SCOTCH_LIB)/libptscotch.a LIBS += $(SCOTCH_LIB)/libptscotcherr.a LIBS += $(SCOTCH_LIB)/libscotchmetis.a LIBS += $(SCOTCH_LIB)/libscotch.a endif ifneq ($(USE_LIBVORI),) USE_LIBVORI := $(strip $(USE_LIBVORI)) LIBVORI_LIB := $(INSTALL_PATH)/libvori-$(USE_LIBVORI)/lib DFLAGS += -D__LIBVORI LIBS += $(LIBVORI_LIB)/libvori.a endif ifneq ($(USE_LIBXC),) USE_LIBXC := $(strip $(USE_LIBXC)) LIBXC_INC := $(INSTALL_PATH)/libxc-$(USE_LIBXC)/include LIBXC_LIB := $(INSTALL_PATH)/libxc-$(USE_LIBXC)/lib CFLAGS += -I$(LIBXC_INC) DFLAGS += -D__LIBXC LIBS += $(LIBXC_LIB)/libxcf03.a LIBS += $(LIBXC_LIB)/libxc.a endif ifneq ($(USE_LIBGRPP),) USE_LIBGRPP := $(strip $(USE_LIBGRPP)) LIBGRPP_INC := $(INSTALL_PATH)/libgrpp-main-$(USE_LIBGRPP)/include LIBGRPP_LIB := $(INSTALL_PATH)/libgrpp-main-$(USE_LIBGRPP)/lib CFLAGS += -I$(LIBGRPP_INC) DFLAGS += -D__LIBGRPP LIBS += $(LIBGRPP_LIB)/liblibgrpp.a endif ifneq ($(USE_LIBINT),) USE_LIBINT := $(strip $(USE_LIBINT)) LMAX := $(strip $(LMAX)) LIBINT_INC := $(INSTALL_PATH)/libint-v$(USE_LIBINT)-cp2k-lmax-$(LMAX)/include LIBINT_LIB := $(INSTALL_PATH)/libint-v$(USE_LIBINT)-cp2k-lmax-$(LMAX)/lib CFLAGS += -I$(LIBINT_INC) DFLAGS += -D__LIBINT LIBS += $(LIBINT_LIB)/libint2.a endif ifneq ($(USE_SPGLIB),) USE_SPGLIB := $(strip $(USE_SPGLIB)) SPGLIB_INC := $(INSTALL_PATH)/spglib-$(USE_SPGLIB)/include SPGLIB_LIB := $(INSTALL_PATH)/spglib-$(USE_SPGLIB)/lib CFLAGS += -I$(SPGLIB_INC) DFLAGS += -D__SPGLIB LIBS += $(SPGLIB_LIB)/libsymspg.a endif ifneq ($(USE_LIBXSMM),) USE_LIBXSMM := $(strip $(USE_LIBXSMM)) LIBXSMM_INC := $(INSTALL_PATH)/libxsmm-$(USE_LIBXSMM)/include LIBXSMM_LIB := $(INSTALL_PATH)/libxsmm-$(USE_LIBXSMM)/lib CFLAGS += -I$(LIBXSMM_INC) DFLAGS += -D__LIBXSMM LIBS += $(LIBXSMM_LIB)/libxsmmf.a LIBS += $(LIBXSMM_LIB)/libxsmm.a endif ifneq ($(USE_SIRIUS),) USE_SIRIUS := $(strip $(USE_SIRIUS)) LIBVDWXC_VER := $(strip $(LIBVDWXC_VER)) LIBVDWXC_INC := $(INSTALL_PATH)/libvdwxc-$(LIBVDWXC_VER)/include LIBVDWXC_LIB := $(INSTALL_PATH)/libvdwxc-$(LIBVDWXC_VER)/lib SIRIUS_INC := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/include SIRIUS_LIB := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/lib CFLAGS += -I$(LIBVDWXC_INC) CFLAGS += -I$(SIRIUS_INC) DFLAGS += -D__LIBVDWXC DFLAGS += -D__SIRIUS LIBS += $(SIRIUS_LIB)/libsirius.a LIBS += $(LIBVDWXC_LIB)/libvdwxc.a endif ifneq ($(USE_SPFFT),) USE_SPFFT := $(strip $(USE_SPFFT)) SPFFT_INC := $(INSTALL_PATH)/SpFFT-$(USE_SPFFT)/include SPFFT_LIB := $(INSTALL_PATH)/SpFFT-$(USE_SPFFT)/lib CFLAGS += -I$(SPFFT_INC) DFLAGS += -D__SPFFT LIBS += $(SPFFT_LIB)/libspfft.a endif ifneq ($(USE_SPLA),) USE_SPLA := $(strip $(USE_SPLA)) SPLA_INC := $(INSTALL_PATH)/SpLA-$(USE_SPLA)/include/spla SPLA_LIB := $(INSTALL_PATH)/SpLA-$(USE_SPLA)/lib CFLAGS += -I$(SPLA_INC) DFLAGS += -D__SPLA LIBS += $(SPLA_LIB)/libspla.a endif ifneq ($(USE_HDF5),) USE_HDF5 := $(strip $(USE_HDF5)) HDF5_INC := $(INSTALL_PATH)/hdf5-$(USE_HDF5)/include HDF5_LIB := $(INSTALL_PATH)/hdf5-$(USE_HDF5)/lib CFLAGS += -I$(HDF5_INC) DFLAGS += -D__HDF5 LIBS += $(HDF5_LIB)/libhdf5_fortran.a LIBS += $(HDF5_LIB)/libhdf5_hl.a LIBS += $(HDF5_LIB)/libhdf5.a endif ifneq ($(USE_COSMA),) USE_COSMA := $(strip $(USE_COSMA)) COSMA_INC := $(INSTALL_PATH)/COSMA-$(USE_COSMA)/include COSMA_LIB := $(INSTALL_PATH)/COSMA-$(USE_COSMA)/lib CFLAGS += -I$(COSMA_INC) DFLAGS += -D__COSMA LIBS += $(COSMA_LIB)/libcosma_prefixed_pxgemm.a LIBS += $(COSMA_LIB)/libcosma.a LIBS += $(COSMA_LIB)/libcosta.a endif ifneq ($(USE_GSL),) USE_GSL := $(strip $(USE_GSL)) GSL_INC := $(INSTALL_PATH)/gsl-$(USE_GSL)/include GSL_LIB := $(INSTALL_PATH)/gsl-$(USE_GSL)/lib CFLAGS += -I$(GSL_INC) DFLAGS += -D__GSL LIBS += $(GSL_LIB)/libgsl.a endif CFLAGS += $(DFLAGS) FCFLAGS := $(CFLAGS) ifeq ($(shell [ $(shell gcc -dumpversion | cut -d. -f1) -gt 9 ] && echo yes), yes) FCFLAGS += -fallow-argument-mismatch endif FCFLAGS += -fbacktrace FCFLAGS += -ffree-form FCFLAGS += -ffree-line-length-none FCFLAGS += -fno-omit-frame-pointer FCFLAGS += -std=f2008 LDFLAGS := $(FCFLAGS) -static LIBS += -lz -ldl -lstdc++ # End ############### END ARCHITECTURE FILE ################ ===== TESTS (description) ===== ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 RI-RPA/RI-MP2 correlation energy input file: benchmarks/QS_mp2_rpa/32-H2O/RI-RPA.inp required files: ['benchmarks/QS_mp2_rpa/32-H2O/BASIS_H2O', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32.xyz', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-PBE-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-RI-dRPA-TZ.inp'] output file: result.log # nodes = 4 # ranks/node = 4 # threads/rank = 9 nrepeat = 1 time[min] = 15 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/01 job id: 52243929 --- Point --- name: 10 plot: h2o_32_ri_rpa_mp2 regex: Total RI-RPA Time= label: RI-RPA (4n/4r/9t) --- Point --- name: 11 plot: h2o_32_ri_rpa_mp2_mem regex: Estimated peak process memory label: RI-RPA (4n/4r/9t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 RI-RPA/RI-MP2 correlation energy input file: benchmarks/QS_mp2_rpa/32-H2O/RI-MP2.inp required files: ['benchmarks/QS_mp2_rpa/32-H2O/BASIS_H2O', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32.xyz', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-PBE-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-HF-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-RI-MP2-TZ.inp'] output file: result.log # nodes = 4 # ranks/node = 12 # threads/rank = 3 nrepeat = 1 time[min] = 15 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/02 job id: 52243930 --- Point --- name: 20 plot: h2o_32_ri_rpa_mp2 regex: Total MP2 Time= label: RI-MP2 (4n/12r/3t) --- Point --- name: 21 plot: h2o_32_ri_rpa_mp2_mem regex: Estimated peak process memory label: RI-MP2 (4n/12r/3t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 36 # threads/rank = 1 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/03 job id: 52243931 --- Point --- name: 100 plot: h2o_64_md regex: CP2K label: (4n/36r/1t) --- Point --- name: 101 plot: h2o_64_md_mem regex: Estimated peak process memory label: (4n/36r/1t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 18 # threads/rank = 2 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/04 job id: 52243933 --- Point --- name: 102 plot: h2o_64_md regex: CP2K label: (4n/18r/2t) --- Point --- name: 103 plot: h2o_64_md_mem regex: Estimated peak process memory label: (4n/18r/2t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 12 # threads/rank = 3 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/05 job id: 52243935 --- Point --- name: 104 plot: h2o_64_md regex: CP2K label: (4n/12r/3t) --- Point --- name: 105 plot: h2o_64_md_mem regex: Estimated peak process memory label: (4n/12r/3t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 9 # threads/rank = 4 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/06 job id: 52243936 --- Point --- name: 106 plot: h2o_64_md regex: CP2K label: (4n/9r/4t) --- Point --- name: 107 plot: h2o_64_md_mem regex: Estimated peak process memory label: (4n/9r/4t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 6 # threads/rank = 6 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/07 job id: 52243937 --- Point --- name: 108 plot: h2o_64_md regex: CP2K label: (4n/6r/6t) --- Point --- name: 109 plot: h2o_64_md_mem regex: Estimated peak process memory label: (4n/6r/6t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 4 # threads/rank = 9 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/08 job id: 52243938 --- Point --- name: 110 plot: h2o_64_md regex: CP2K label: (4n/4r/9t) --- Point --- name: 111 plot: h2o_64_md_mem regex: Estimated peak process memory label: (4n/4r/9t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 3 # threads/rank = 12 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/09 job id: 52243939 --- Point --- name: 112 plot: h2o_64_md regex: CP2K label: (4n/3r/12t) --- Point --- name: 113 plot: h2o_64_md_mem regex: Estimated peak process memory label: (4n/3r/12t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 2 # threads/rank = 18 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/10 job id: 52243940 --- Point --- name: 114 plot: h2o_64_md regex: CP2K label: (4n/2r/18t) --- Point --- name: 115 plot: h2o_64_md_mem regex: Estimated peak process memory label: (4n/2r/18t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-64 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-64.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 1 # threads/rank = 36 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/11 job id: 52243941 --- Point --- name: 116 plot: h2o_64_md regex: CP2K label: (4n/1r/36t) --- Point --- name: 117 plot: h2o_64_md_mem regex: Estimated peak process memory label: (4n/1r/36t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 36 # threads/rank = 1 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/12 job id: 52243942 --- Point --- name: 200 plot: h2o_128_md regex: CP2K label: (4n/36r/1t) --- Point --- name: 201 plot: h2o_128_md_mem regex: Estimated peak process memory label: (4n/36r/1t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 18 # threads/rank = 2 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/13 job id: 52243943 --- Point --- name: 202 plot: h2o_128_md regex: CP2K label: (4n/18r/2t) --- Point --- name: 203 plot: h2o_128_md_mem regex: Estimated peak process memory label: (4n/18r/2t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 12 # threads/rank = 3 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/14 job id: 52243944 --- Point --- name: 204 plot: h2o_128_md regex: CP2K label: (4n/12r/3t) --- Point --- name: 205 plot: h2o_128_md_mem regex: Estimated peak process memory label: (4n/12r/3t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 9 # threads/rank = 4 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/15 job id: 52243945 --- Point --- name: 206 plot: h2o_128_md regex: CP2K label: (4n/9r/4t) --- Point --- name: 207 plot: h2o_128_md_mem regex: Estimated peak process memory label: (4n/9r/4t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 6 # threads/rank = 6 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/16 job id: 52243946 --- Point --- name: 208 plot: h2o_128_md regex: CP2K label: (4n/6r/6t) --- Point --- name: 209 plot: h2o_128_md_mem regex: Estimated peak process memory label: (4n/6r/6t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 4 # threads/rank = 9 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/17 job id: 52243947 --- Point --- name: 210 plot: h2o_128_md regex: CP2K label: (4n/4r/9t) --- Point --- name: 211 plot: h2o_128_md_mem regex: Estimated peak process memory label: (4n/4r/9t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 3 # threads/rank = 12 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/18 job id: 52243948 --- Point --- name: 212 plot: h2o_128_md regex: CP2K label: (4n/3r/12t) --- Point --- name: 213 plot: h2o_128_md_mem regex: Estimated peak process memory label: (4n/3r/12t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 2 # threads/rank = 18 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/19 job id: 52243949 --- Point --- name: 214 plot: h2o_128_md regex: CP2K label: (4n/2r/18t) --- Point --- name: 215 plot: h2o_128_md_mem regex: Estimated peak process memory label: (4n/2r/18t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-128 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-128.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 1 # threads/rank = 36 nrepeat = 1 time[min] = 10 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/20 job id: 52243950 --- Point --- name: 216 plot: h2o_128_md regex: CP2K label: (4n/1r/36t) --- Point --- name: 217 plot: h2o_128_md_mem regex: Estimated peak process memory label: (4n/1r/36t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 36 # threads/rank = 1 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/21 job id: 52243951 --- Point --- name: 400 plot: h2o_256_md regex: CP2K label: (4n/36r/1t) --- Point --- name: 401 plot: h2o_256_md_mem regex: Estimated peak process memory label: (4n/36r/1t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 18 # threads/rank = 2 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/22 job id: 52243953 --- Point --- name: 402 plot: h2o_256_md regex: CP2K label: (4n/18r/2t) --- Point --- name: 403 plot: h2o_256_md_mem regex: Estimated peak process memory label: (4n/18r/2t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 12 # threads/rank = 3 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/23 job id: 52243954 --- Point --- name: 404 plot: h2o_256_md regex: CP2K label: (4n/12r/3t) --- Point --- name: 405 plot: h2o_256_md_mem regex: Estimated peak process memory label: (4n/12r/3t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 9 # threads/rank = 4 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/24 job id: 52243955 --- Point --- name: 406 plot: h2o_256_md regex: CP2K label: (4n/9r/4t) --- Point --- name: 407 plot: h2o_256_md_mem regex: Estimated peak process memory label: (4n/9r/4t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 6 # threads/rank = 6 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/25 job id: 52243956 --- Point --- name: 408 plot: h2o_256_md regex: CP2K label: (4n/6r/6t) --- Point --- name: 409 plot: h2o_256_md_mem regex: Estimated peak process memory label: (4n/6r/6t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 4 # threads/rank = 9 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/26 job id: 52243957 --- Point --- name: 410 plot: h2o_256_md regex: CP2K label: (4n/4r/9t) --- Point --- name: 411 plot: h2o_256_md_mem regex: Estimated peak process memory label: (4n/4r/9t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 3 # threads/rank = 12 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/27 job id: 52243958 --- Point --- name: 412 plot: h2o_256_md regex: CP2K label: (4n/3r/12t) --- Point --- name: 413 plot: h2o_256_md_mem regex: Estimated peak process memory label: (4n/3r/12t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 2 # threads/rank = 18 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/28 job id: 52243959 --- Point --- name: 414 plot: h2o_256_md regex: CP2K label: (4n/2r/18t) --- Point --- name: 415 plot: h2o_256_md_mem regex: Estimated peak process memory label: (4n/2r/18t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-256 test - DBCSR dominated (MPI/OMP) input file: benchmarks/QS/H2O-256.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 1 # threads/rank = 36 nrepeat = 1 time[min] = 30 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/29 job id: 52243960 --- Point --- name: 416 plot: h2o_256_md regex: CP2K label: (4n/1r/36t) --- Point --- name: 417 plot: h2o_256_md_mem regex: Estimated peak process memory label: (4n/1r/36t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 (NREP 3) linear scaling test (864 H2O) input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 36 # threads/rank = 1 nrepeat = 1 time[min] = 15 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/30 job id: 52243961 --- Point --- name: 500 plot: h2o_32_nrep3_ls regex: CP2K label: (4n/36r/1t) --- Point --- name: 501 plot: h2o_32_nrep3_ls_mem regex: Estimated peak process memory label: (4n/36r/1t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 (NREP 3) linear scaling test (864 H2O) input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 18 # threads/rank = 2 nrepeat = 1 time[min] = 15 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/31 job id: 52243962 --- Point --- name: 502 plot: h2o_32_nrep3_ls regex: CP2K label: (4n/18r/2t) --- Point --- name: 503 plot: h2o_32_nrep3_ls_mem regex: Estimated peak process memory label: (4n/18r/2t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 (NREP 3) linear scaling test (864 H2O) input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 9 # threads/rank = 4 nrepeat = 1 time[min] = 15 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/32 job id: 52243963 --- Point --- name: 504 plot: h2o_32_nrep3_ls regex: CP2K label: (4n/9r/4t) --- Point --- name: 505 plot: h2o_32_nrep3_ls_mem regex: Estimated peak process memory label: (4n/9r/4t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 (NREP 3) linear scaling test (864 H2O) input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 6 # threads/rank = 6 nrepeat = 1 time[min] = 15 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/33 job id: 52243964 --- Point --- name: 506 plot: h2o_32_nrep3_ls regex: CP2K label: (4n/6r/6t) --- Point --- name: 507 plot: h2o_32_nrep3_ls_mem regex: Estimated peak process memory label: (4n/6r/6t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 (NREP 3) linear scaling test (864 H2O) input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 4 # threads/rank = 9 nrepeat = 1 time[min] = 15 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/34 job id: 52243965 --- Point --- name: 508 plot: h2o_32_nrep3_ls regex: CP2K label: (4n/4r/9t) --- Point --- name: 509 plot: h2o_32_nrep3_ls_mem regex: Estimated peak process memory label: (4n/4r/9t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: H2O-32 (NREP 3) linear scaling test (864 H2O) input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp required files: [] output file: result.log # nodes = 4 # ranks/node = 3 # threads/rank = 12 nrepeat = 1 time[min] = 15 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/35 job id: 52243966 --- Point --- name: 510 plot: h2o_32_nrep3_ls regex: CP2K label: (4n/3r/12t) --- Point --- name: 511 plot: h2o_32_nrep3_ls_mem regex: Estimated peak process memory label: (4n/3r/12t) ~~~~~~~ END TEST ~~~~~~~ ~~~~~~~~~ TEST ~~~~~~~~~ description: 512 H2O (4 NVE MD steps on 16 nodes) input file: benchmarks/QS/00512_H2O/H2O-512_md.inp required files: [] output file: result.log # nodes = 16 # ranks/node = 36 # threads/rank = 1 nrepeat = 1 time[min] = 20 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/36 job id: 52243967 --- Point --- name: 601 plot: h2o_512_md regex: CP2K label: (16n/36r/1t) --- Point --- name: 602 plot: h2o_512_md_mem regex: Estimated peak process memory label: (16n/36r/1t) ~~~~~~~ END TEST ~~~~~~~ === END TESTS (description) === ===== PLOTS (description) ===== ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_32_ri_rpa_mp2", title="32 H2O molecules (RI-MP2, RI-RPA)", xlabel="Revision", ylabel="Time [s]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_32_ri_rpa_mp2_mem", title="32 H2O molecules (RI-MP2, RI-RPA)", xlabel="Revision", ylabel="Est. peak process memory [MiB]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_64_md", title="64 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_64_md_mem", title="64 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_128_md", title="128 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_128_md_mem", title="128 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_256_md", title="256 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_256_md_mem", title="256 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_32_nrep3_ls", title="864 H2O molecules (LS SCF)", xlabel="Revision", ylabel="Time [s]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_32_nrep3_ls_mem", title="864 H2O molecules (LS SCF)", xlabel="Revision", ylabel="Est. peak process memory [MiB]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_512_md", title="512 H2O (4 NVE MD steps on 16 nodes)", xlabel="Revision", ylabel="Time [s]" ~~~~~~~~~ PLOT ~~~~~~~~~ Plot: name="h2o_512_md_mem", title="512 H2O (4 NVE MD steps on 16 nodes)", xlabel="Revision", ylabel="Est. peak process memory [MiB]" === END PLOTS (description) === ============ RESULTS ============ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/01/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 0.000000E+00 0.0% 0.0% 0.0% flops max/rank 0.000000E+00 0.0% 0.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 0 0.0% 0.0% 0.0% number of processed stacks 0 0.0% 0.0% 0.0% average stack size 0.0 0.0 0.0 marketing flops 0.000000E+00 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 1 12. MP_Allreduce 19 21. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 15 172669. MP_Allreduce 424 8. MP_Sync 3 MP_comm_split 1 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.030 0.060 261.257 261.265 farming_run 1 2.0 258.677 258.720 260.877 261.125 ------------------------------------------------------------------------------- @@@@@@@@@@ Run number: 2 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 4194304 0.0% 100.0% 0.0% flops 29 x 32 x 32 7602176 0.0% 100.0% 0.0% flops 14 x 32 x 32 14221312 0.0% 100.0% 0.0% flops 28 x 32 x 32 27525120 0.0% 100.0% 0.0% flops 43 x 32 x 32 28180480 0.0% 100.0% 0.0% flops 86 x 32 x 32 28180480 0.0% 100.0% 0.0% flops 14 x 32 x 456 78446592 0.0% 100.0% 0.0% flops 57 x 32 x 32 102727680 0.0% 100.0% 0.0% flops 14 x 14 x 32 208732160 0.0% 100.0% 0.0% flops 29 x 14 x 32 212860928 0.0% 100.0% 0.0% flops 14 x 29 x 32 212860928 0.0% 100.0% 0.0% flops 29 x 29 x 32 227352576 0.0% 100.0% 0.0% flops 32 x 32 x 456 298844160 0.0% 100.0% 0.0% flops 28 x 32 x 456 313786368 0.0% 100.0% 0.0% flops 43 x 32 x 456 321257472 0.0% 100.0% 0.0% flops 86 x 32 x 456 321257472 0.0% 100.0% 0.0% flops 57 x 32 x 456 1171095552 0.0% 100.0% 0.0% flops 14 x 32 x 14 895979560448 0.0% 100.0% 0.0% flops 29 x 32 x 14 928073646080 0.0% 100.0% 0.0% flops 14 x 32 x 29 928073646080 0.0% 100.0% 0.0% flops 29 x 32 x 29 961219133440 0.0% 100.0% 0.0% flops 32 x 32 x 14 1693022420992 0.0% 100.0% 0.0% flops 32 x 32 x 29 1753487507456 0.0% 100.0% 0.0% flops inhomo. stacks 1804075008 100.0% 0.0% 0.0% flops total 7.165239E+12 0.0% 100.0% 0.0% flops max/rank 447.990765E+09 0.0% 100.0% 0.0% matmuls inhomo. stacks 1440 100.0% 0.0% 0.0% matmuls total 249334846 0.0% 100.0% 0.0% number of processed stacks 368972 0.4% 99.6% 0.0% average stack size 1.0 678.4 0.0 marketing flops 7.165779E+12 ------------------------------------------------------------------------------- # multiplications 1160 max memory usage/rank 1.391923E+09 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 2592 MPI messages size (bytes): total size 1.140326E+09 min size 0.000000E+00 max size 1.663488E+06 average size 439.940750E+03 MPI breakdown and total messages size (bytes): size <= 128 132 0 128 < size <= 8192 348 2850816 8192 < size <= 32768 0 0 32768 < size <= 131072 1536 179306496 131072 < size <= 4194304 576 958169088 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 24 12. MP_Allreduce 2365 53. MP_Alltoall 4670 822089. MP_ISend 2604 90540. MP_IRecv 2604 90537. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 12 MP_Bcast 230 1134128. MP_Allreduce 571 1938539. MP_Sync 25 MP_Alltoall 38 9316958. MP_SendRecv 120 384007. MP_ISendRecv 45 235435. MP_Wait 191 MP_comm_split 10 MP_ISend 127 3867574. MP_IRecv 127 3866554. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.006 0.028 234.244 234.251 qs_energies 1 2.0 0.002 0.002 234.050 234.074 mp2_main 1 3.0 0.056 0.083 231.472 231.495 mp2_gpw_main 1 4.0 0.014 0.045 230.224 230.232 mp2_ri_gpw_compute_in 1 5.0 0.310 0.323 150.689 152.848 mp2_ri_gpw_compute_in_loop 1 6.0 0.012 0.030 98.115 100.267 mp2_eri_3c_integrate_gpw 272 7.0 0.175 0.185 83.025 84.721 rpa_ri_compute_en 1 5.0 0.053 0.103 79.414 83.070 rpa_num_int 1 6.0 0.074 0.168 67.971 67.978 rpa_num_int_RPA_matrix_operati 8 7.0 0.013 0.034 67.111 67.974 calc_mat_Q 8 8.0 0.017 0.041 65.789 66.748 contract_S_to_Q 8 9.0 0.011 0.017 64.559 65.546 parallel_gemm_fm 14 9.1 0.000 0.000 64.143 65.115 parallel_gemm_fm_cosma 14 10.1 64.143 65.115 64.143 65.115 integrate_v_rspace 273 8.0 0.428 0.468 60.927 62.313 grid_integrate_task_list 273 9.0 55.656 56.938 55.656 56.938 get_2c_integrals 1 6.0 0.089 0.138 48.087 52.259 fft_wrap_pw1pw2 5465 10.4 0.084 0.091 40.483 44.884 fft_wrap_pw1pw2_100 2178 11.4 4.522 5.390 36.587 40.258 compute_2c_integrals 1 7.0 0.031 0.041 31.761 31.772 compute_2c_integrals_loop_lm 1 8.0 0.006 0.008 30.287 31.333 mp2_eri_2c_integrate_gpw 1 9.0 2.104 2.373 30.281 31.330 fft3d_s 5443 12.4 20.535 22.374 20.562 22.402 cp_fm_cholesky_decompose 12 8.2 16.496 20.669 16.496 20.669 cholesky_decomp 1 7.0 0.005 0.071 15.263 19.400 calculate_wavefunction 272 8.0 5.835 5.956 15.723 17.530 mp2_eri_2c_integrate_gpw_pot_l 272 10.0 0.002 0.002 14.785 15.953 calc_potential_gpw 544 9.5 0.005 0.006 14.679 15.604 ao_to_mo_and_store_B_mult_1 272 7.0 12.108 14.006 12.108 14.006 collocate_single_gaussian 272 10.0 0.055 0.068 12.718 13.813 potential_pw2rs 545 10.0 0.146 0.160 11.234 12.442 pw_scatter_s 2720 12.7 9.047 9.576 9.047 9.576 create_integ_mat 1 6.0 0.030 0.049 9.046 9.050 array2fm 1 7.0 0.000 0.000 7.551 8.066 mp_sync 25 8.8 4.932 7.367 4.932 7.367 pw_poisson_solve 545 10.5 0.011 0.012 5.012 5.838 mp_min_d 1 6.0 2.154 5.813 2.154 5.813 pw_gather_s 2722 12.2 3.991 4.767 3.991 4.767 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="10", plot="h2o_32_ri_rpa_mp2", label="RI-RPA (4n/4r/9t)", y=230.231824, yerr=0.000000 PlotPoint: name="11", plot="h2o_32_ri_rpa_mp2_mem", label="RI-RPA (4n/4r/9t)", y=2732.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/02/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 0.000000E+00 0.0% 0.0% 0.0% flops max/rank 0.000000E+00 0.0% 0.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 0 0.0% 0.0% 0.0% number of processed stacks 0 0.0% 0.0% 0.0% average stack size 0.0 0.0 0.0 marketing flops 0.000000E+00 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 1 12. MP_Allreduce 19 21. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 22 200775. MP_Allreduce 424 9. MP_Sync 4 MP_comm_split 1 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.025 0.037 537.145 537.150 farming_run 1 2.0 535.547 535.573 536.723 537.055 ------------------------------------------------------------------------------- @@@@@@@@@@ Run number: 2 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 32 x 32 x 32 16777216 0.0% 100.0% 0.0% flops 100 x 32 x 143 36608000 0.0% 100.0% 0.0% flops 14 x 32 x 32 36700160 0.0% 100.0% 0.0% flops 29 x 32 x 32 38010880 0.0% 100.0% 0.0% flops 128 x 32 x 143 46858240 0.0% 100.0% 0.0% flops 157 x 32 x 143 57474560 0.0% 100.0% 0.0% flops 100 x 32 x 32 58982400 0.0% 100.0% 0.0% flops 171 x 32 x 143 62599680 0.0% 100.0% 0.0% flops 186 x 32 x 143 68090880 0.0% 100.0% 0.0% flops 100 x 32 x 142 72704000 0.0% 100.0% 0.0% flops 200 x 32 x 143 73216000 0.0% 100.0% 0.0% flops 128 x 32 x 32 75497472 0.0% 100.0% 0.0% flops 100 x 32 x 157 80384000 0.0% 100.0% 0.0% flops 157 x 32 x 32 92602368 0.0% 100.0% 0.0% flops 128 x 32 x 142 93061120 0.0% 100.0% 0.0% flops 171 x 32 x 32 100859904 0.0% 100.0% 0.0% flops 128 x 32 x 157 102891520 0.0% 100.0% 0.0% flops 142 x 32 x 143 103966720 0.0% 100.0% 0.0% flops 143 x 32 x 143 104698880 0.0% 100.0% 0.0% flops 186 x 32 x 32 109707264 0.0% 100.0% 0.0% flops 157 x 32 x 142 114145280 0.0% 100.0% 0.0% flops 156 x 32 x 143 114216960 0.0% 100.0% 0.0% flops 200 x 32 x 32 117964800 0.0% 100.0% 0.0% flops 171 x 32 x 142 124323840 0.0% 100.0% 0.0% flops 157 x 32 x 157 126202880 0.0% 100.0% 0.0% flops 186 x 32 x 142 135229440 0.0% 100.0% 0.0% flops 171 x 32 x 157 137456640 0.0% 100.0% 0.0% flops 200 x 32 x 142 145408000 0.0% 100.0% 0.0% flops 32 x 32 x 143 146432000 0.0% 100.0% 0.0% flops 186 x 32 x 157 149514240 0.0% 100.0% 0.0% flops 200 x 32 x 157 160768000 0.0% 100.0% 0.0% flops 142 x 32 x 32 167510016 0.0% 100.0% 0.0% flops 143 x 32 x 32 168689664 0.0% 100.0% 0.0% flops 156 x 32 x 32 184025088 0.0% 100.0% 0.0% flops 142 x 32 x 142 206479360 0.0% 100.0% 0.0% flops 143 x 32 x 142 207933440 0.0% 100.0% 0.0% flops 156 x 32 x 142 226836480 0.0% 100.0% 0.0% flops 142 x 32 x 157 228290560 0.0% 100.0% 0.0% flops 143 x 32 x 157 229898240 0.0% 100.0% 0.0% flops 156 x 32 x 157 250798080 0.0% 100.0% 0.0% flops 32 x 32 x 142 290816000 0.0% 100.0% 0.0% flops 32 x 32 x 157 321536000 0.0% 100.0% 0.0% flops 14 x 14 x 32 626196480 0.0% 100.0% 0.0% flops 29 x 14 x 32 638582784 0.0% 100.0% 0.0% flops 14 x 29 x 32 638582784 0.0% 100.0% 0.0% flops 29 x 29 x 32 682057728 0.0% 100.0% 0.0% flops 14 x 32 x 14 896799524096 0.0% 100.0% 0.0% flops 29 x 32 x 14 928925089792 0.0% 100.0% 0.0% flops 14 x 32 x 29 928925089792 0.0% 100.0% 0.0% flops 29 x 32 x 29 962100985856 0.0% 100.0% 0.0% flops 32 x 32 x 14 1693022420992 0.0% 100.0% 0.0% flops 32 x 32 x 29 1753487507456 0.0% 100.0% 0.0% flops inhomo. stacks 1112785920 100.0% 0.0% 0.0% flops total 7.172345E+12 0.0% 100.0% 0.0% flops max/rank 150.710992E+09 0.1% 99.9% 0.0% matmuls inhomo. stacks 980 100.0% 0.0% 0.0% matmuls total 249562189 0.0% 100.0% 0.0% number of processed stacks 347432 0.3% 99.7% 0.0% average stack size 1.0 720.3 0.0 marketing flops 7.174951E+12 ------------------------------------------------------------------------------- # multiplications 1140 max memory usage/rank 1.212305E+09 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 61440 MPI messages size (bytes): total size 6.073508E+09 min size 0.000000E+00 max size 642.960000E+03 average size 98.852664E+03 MPI breakdown and total messages size (bytes): size <= 128 32004 0 128 < size <= 8192 1820 14909440 8192 < size <= 32768 0 0 32768 < size <= 131072 18640 1081442304 131072 < size <= 4194304 8976 4977156096 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 53 12. MP_Allreduce 1182 39. MP_Alltoall 1797 713945. MP_ISend 3686 54897. MP_IRecv 3622 54246. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 12 MP_Bcast 757 478553. MP_Allreduce 2021 21391. MP_Sync 37 MP_Alltoall 77 28382042. MP_SendRecv 4192 1987179. MP_ISendRecv 1034 172713. MP_Wait 1346 MP_comm_split 7 MP_ISend 264 362227. MP_IRecv 264 362718. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.018 0.041 343.585 343.586 qs_energies 1 2.0 0.000 0.000 343.388 343.402 mp2_main 1 3.0 0.000 0.000 239.539 239.553 mp2_gpw_main 1 4.0 0.005 0.008 238.550 238.567 mp2_ri_gpw_compute_en 1 5.0 0.054 0.065 130.839 143.773 mp2_ri_gpw_compute_en_RI_loop 1 6.0 2.397 2.479 125.045 125.091 mp2_ri_gpw_compute_in 1 5.0 0.097 0.104 107.574 112.471 scf_env_do_scf 1 3.0 0.000 0.000 103.535 103.536 qs_ks_update_qs_env 5 5.0 0.000 0.000 102.590 102.593 rebuild_ks_matrix 4 6.0 0.000 0.000 102.588 102.592 qs_ks_build_kohn_sham_matrix 4 7.0 0.022 0.025 102.588 102.592 hfx_ks_matrix 4 8.0 0.001 0.001 102.098 102.102 integrate_four_center 4 9.0 0.260 0.603 102.098 102.101 mp2_ri_gpw_compute_en_expansio 172 7.0 0.704 0.811 91.388 98.474 local_gemm 172 8.0 90.684 97.663 90.684 97.663 mp2_ri_gpw_compute_in_loop 1 6.0 0.001 0.002 80.336 85.235 integrate_four_center_main 4 10.0 0.146 0.381 79.232 82.008 integrate_four_center_bin 214 11.0 79.085 81.861 79.085 81.861 init_scf_loop 1 4.0 0.000 0.000 81.139 81.139 mp2_eri_3c_integrate_gpw 91 7.0 0.135 0.168 70.297 74.409 integrate_v_rspace 95 8.0 0.288 0.407 53.373 55.754 grid_integrate_task_list 95 9.0 48.616 50.021 48.616 50.021 mp2_ri_gpw_compute_en_comm 36 7.0 1.071 1.357 27.035 40.977 fft_wrap_pw1pw2 1868 10.4 0.031 0.037 33.601 38.982 mp_sendrecv_dm3 3384 8.0 24.213 38.092 24.213 38.092 fft_wrap_pw1pw2_100 730 11.4 1.689 2.279 30.975 36.174 get_2c_integrals 1 6.0 0.000 0.000 27.126 27.144 compute_2c_integrals 1 7.0 0.003 0.003 26.319 26.333 compute_2c_integrals_loop_lm 1 8.0 0.001 0.002 23.408 26.130 mp2_eri_2c_integrate_gpw 1 9.0 1.564 1.853 23.406 26.129 fft3d_s 1823 12.4 20.845 24.691 20.862 24.709 scf_env_do_scf_inner_loop 4 4.0 0.000 0.001 22.393 22.393 integrate_four_center_load 4 10.0 0.000 0.000 18.055 18.096 hfx_load_balance 1 11.0 0.001 0.001 18.055 18.096 mp_min_d 2 7.0 4.955 18.004 4.955 18.004 mp2_ri_get_integ_group_size 1 6.0 0.000 0.000 4.899 17.833 calc_potential_gpw 182 9.5 0.002 0.003 13.385 16.217 mp2_eri_2c_integrate_gpw_pot_l 91 10.0 0.001 0.001 11.911 14.108 calculate_wavefunction 91 8.0 2.084 2.432 10.792 13.809 potential_pw2rs 186 10.0 0.042 0.045 9.364 12.370 collocate_single_gaussian 91 10.0 0.019 0.024 9.155 11.624 mp_sum_l 425 2.2 5.508 10.868 5.508 10.868 hfx_load_balance_dist 1 12.0 0.000 0.000 5.488 10.861 mp_comm_split_direct 6 7.2 2.737 10.447 2.737 10.447 ao_to_mo_and_store_B_mult_1 91 7.0 8.521 9.740 8.521 9.740 hfx_load_balance_count 1 12.0 6.282 9.034 6.282 9.034 hfx_load_balance_bin 1 12.0 6.263 8.998 6.263 8.998 mp_sync 37 10.5 3.340 8.371 3.340 8.371 pw_poisson_solve 186 10.4 0.006 0.008 5.417 7.178 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="20", plot="h2o_32_ri_rpa_mp2", label="RI-MP2 (4n/12r/3t)", y=238.566609, yerr=0.000000 PlotPoint: name="21", plot="h2o_32_ri_rpa_mp2_mem", label="RI-MP2 (4n/12r/3t)", y=1324.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/03/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 207618048 0.0% 100.0% 0.0% flops 22 x 32 x 32 253755392 0.0% 100.0% 0.0% flops 32 x 32 x 32 26877100032 0.0% 100.0% 0.0% flops 209 x 32 x 209 42582335488 0.0% 100.0% 0.0% flops 209 x 32 x 213 43397308416 0.0% 100.0% 0.0% flops 213 x 32 x 209 43397308416 0.0% 100.0% 0.0% flops 9 x 9 x 32 44168260608 0.0% 100.0% 0.0% flops 213 x 32 x 213 44227878912 0.0% 100.0% 0.0% flops 209 x 32 x 218 44416024576 0.0% 100.0% 0.0% flops 218 x 32 x 209 44416024576 0.0% 100.0% 0.0% flops 213 x 32 x 218 45266092032 0.0% 100.0% 0.0% flops 218 x 32 x 213 45266092032 0.0% 100.0% 0.0% flops 32 x 32 x 209 46131576832 0.0% 100.0% 0.0% flops 218 x 32 x 218 46328676352 0.0% 100.0% 0.0% flops 32 x 32 x 213 47014477824 0.0% 100.0% 0.0% flops 32 x 32 x 218 48118104064 0.0% 100.0% 0.0% flops 22 x 9 x 32 53835724800 0.0% 100.0% 0.0% flops 9 x 22 x 32 53885500416 0.0% 100.0% 0.0% flops 209 x 32 x 32 56760467456 0.0% 100.0% 0.0% flops 213 x 32 x 32 57846792192 0.0% 100.0% 0.0% flops 218 x 32 x 32 59204698112 0.0% 100.0% 0.0% flops 22 x 22 x 32 67007283200 0.0% 100.0% 0.0% flops 9 x 32 x 9 185405884416 0.0% 100.0% 0.0% flops 22 x 32 x 9 227871249408 0.0% 100.0% 0.0% flops 9 x 32 x 22 227871249408 0.0% 100.0% 0.0% flops 22 x 32 x 22 279130931200 0.0% 100.0% 0.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 1.880888E+12 0.0% 100.0% 0.0% flops max/rank 20.325101E+09 0.0% 100.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 101210040 0.0% 100.0% 0.0% number of processed stacks 3134624 0.0% 100.0% 0.0% average stack size 0.0 32.3 0.0 marketing flops 2.107629E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 178.757632E+06 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 7242048 MPI messages size (bytes): total size 355.819487E+09 min size 0.000000E+00 max size 380.192000E+03 average size 49.132441E+03 MPI breakdown and total messages size (bytes): size <= 128 2986104 0 128 < size <= 8192 1493448 12234326016 8192 < size <= 32768 0 0 32768 < size <= 131072 2138400 116785152000 131072 < size <= 4194304 624096 226802306368 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 27 12. MP_Allreduce 12193 16. MP_Alltoall 8655 34121. MP_ISend 109684 25393. MP_IRecv 109684 24883. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3683 62395. MP_Allreduce 10330 309. MP_Sync 1482 MP_Alltoall 2094 25181027. MP_SendRecv 34034 3780. MP_ISendRecv 34034 3780. MP_Wait 45572 MP_comm_split 50 MP_ISend 23112 34348. MP_IRecv 23112 34348. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.079 0.208 30.731 30.734 qs_mol_dyn_low 1 2.0 0.067 0.150 29.236 29.244 qs_forces 11 3.9 0.013 0.046 28.729 28.843 qs_energies 11 4.9 0.068 0.269 27.313 27.428 scf_env_do_scf 11 5.9 0.001 0.001 22.240 22.243 scf_env_do_scf_inner_loop 108 6.5 0.003 0.007 19.298 19.299 velocity_verlet 10 3.0 0.002 0.003 14.007 14.008 rebuild_ks_matrix 119 8.3 0.000 0.001 8.942 9.022 qs_ks_build_kohn_sham_matrix 119 9.3 0.027 0.077 8.942 9.022 qs_ks_update_qs_env 119 7.6 0.001 0.001 8.038 8.113 qs_scf_new_mos 108 7.5 0.001 0.001 7.619 7.704 qs_scf_loop_do_ot 108 8.5 0.001 0.001 7.618 7.703 dbcsr_multiply_generic 2286 12.5 0.103 0.153 7.164 7.334 ot_scf_mini 108 9.5 0.002 0.003 7.240 7.299 sum_up_and_integrate 119 10.3 0.004 0.014 6.998 7.015 integrate_v_rspace 119 11.3 0.005 0.011 6.989 7.010 qs_rho_update_rho_low 119 7.7 0.001 0.004 6.094 6.139 calculate_rho_elec 119 8.7 0.011 0.014 6.093 6.139 multiply_cannon 2286 13.5 0.170 0.203 3.663 4.183 ot_mini 108 10.5 0.001 0.003 4.014 4.078 mp_waitall_1 294200 16.4 2.667 3.711 2.667 3.711 multiply_cannon_loop 2286 14.5 0.150 0.260 2.904 3.522 init_scf_run 11 5.9 0.000 0.001 3.505 3.506 scf_env_initial_rho_setup 11 6.9 0.038 0.150 3.505 3.506 grid_integrate_task_list 119 12.3 2.948 3.503 2.948 3.503 qs_ot_get_derivative 108 11.5 0.001 0.002 3.182 3.242 density_rs2pw 119 9.7 0.004 0.008 2.952 3.137 init_scf_loop 11 6.9 0.007 0.027 2.915 2.920 mp_waitany 7404 13.9 2.543 2.874 2.543 2.874 potential_pw2rs 119 12.3 0.004 0.007 2.828 2.848 grid_collocate_task_list 119 9.7 2.464 2.632 2.464 2.632 transfer_rs2pw 487 10.6 0.005 0.006 2.242 2.590 mp_alltoall_d11v 2130 13.8 2.135 2.449 2.135 2.449 make_m2s 4572 13.5 0.065 0.089 2.127 2.345 multiply_cannon_metrocomm3 27432 15.5 0.074 0.167 0.799 2.330 transfer_pw2rs 487 13.2 0.005 0.005 2.279 2.296 qs_ot_get_p 119 10.4 0.001 0.001 2.046 2.136 fft_wrap_pw1pw2 1201 11.6 0.008 0.010 2.048 2.119 calculate_first_density_matrix 1 7.0 0.011 0.045 2.027 2.032 make_images 4572 14.5 0.135 0.188 1.845 1.989 mp_sum_d 4139 12.0 1.414 1.887 1.414 1.887 fft_wrap_pw1pw2_140 487 12.2 0.037 0.044 1.481 1.626 multiply_cannon_multrec 27432 15.5 0.908 1.616 0.916 1.624 fft3d_pb 487 13.2 0.391 0.575 1.354 1.487 rs_gather_matrices 119 12.3 0.041 0.055 1.154 1.448 mp_sum_l 11298 13.2 0.941 1.407 0.941 1.407 transfer_pw2rs_50 119 14.3 0.072 0.083 1.169 1.259 transfer_rs2pw_50 119 11.7 0.104 0.117 1.222 1.256 multiply_cannon_metrocomm1 27432 15.5 0.078 0.173 0.691 1.227 qs_ot_get_derivative_diag 49 12.0 0.001 0.002 1.142 1.174 qs_energies_init_hamiltonians 11 5.9 0.028 0.111 1.040 1.150 dbcsr_dot_sd 1205 11.9 0.046 0.055 0.823 1.149 wfi_extrapolate 11 7.9 0.002 0.005 1.146 1.146 qs_ot_p2m_diag 50 11.0 0.004 0.006 1.110 1.139 qs_ot_get_derivative_taylor 59 13.0 0.001 0.002 1.078 1.108 mp_alltoall_z22v 1688 15.5 0.936 1.095 0.936 1.095 prepare_preconditioner 11 7.9 0.000 0.001 1.042 1.054 make_preconditioner 11 8.9 0.000 0.001 1.042 1.054 make_images_data 4572 15.5 0.051 0.083 0.728 1.050 transfer_rs2pw_140 130 11.5 0.158 0.194 0.698 1.035 make_basis_sm 11 9.8 0.000 0.000 0.988 0.990 hybrid_alltoall_any 4725 16.4 0.049 0.109 0.600 0.982 make_images_sizes 4572 15.5 0.005 0.012 0.745 0.980 make_full_inverse_cholesky 11 9.9 0.000 0.000 0.943 0.979 mp_alltoall_i44 4572 16.5 0.740 0.975 0.740 0.975 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 0.957 0.964 cp_dbcsr_syevd 50 12.0 0.005 0.005 0.920 0.939 mp_allgather_i34 2286 14.5 0.451 0.894 0.451 0.894 parallel_gemm_fm 81 9.0 0.000 0.000 0.775 0.856 parallel_gemm_fm_cosma 81 10.0 0.775 0.856 0.775 0.856 calculate_atomic_block_dm 1 8.0 0.030 0.190 0.805 0.821 qs_env_update_s_mstruct 11 6.9 0.001 0.002 0.677 0.810 dbcsr_complete_redistribute 329 12.2 0.100 0.254 0.759 0.799 ot_diis_step 108 11.5 0.014 0.038 0.788 0.789 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 0.768 0.770 rs_scatter_matrices 130 9.7 0.028 0.036 0.685 0.731 transfer_pw2rs_140 130 13.9 0.228 0.264 0.647 0.698 apply_preconditioner_dbcsr 119 12.6 0.001 0.002 0.622 0.697 apply_single 119 13.6 0.000 0.000 0.621 0.697 cp_fm_diag_elpa 50 13.0 0.000 0.000 0.654 0.654 fft3d_ps 714 14.0 0.045 0.087 0.539 0.636 create_qs_kind_set 1 2.0 0.000 0.001 0.388 0.633 read_qs_kind 2 3.0 0.034 0.102 0.388 0.633 cp_fm_cholesky_decompose 22 10.9 0.608 0.628 0.608 0.628 calculate_rho_core 11 7.9 0.020 0.036 0.467 0.626 cp_fm_redistribute_end 50 14.0 0.318 0.622 0.324 0.626 parser_read_line 2821 4.0 0.001 0.004 0.353 0.622 parser_read_line_low 5 5.0 0.004 0.083 0.352 0.621 broadcast_input_information 5 6.0 0.016 0.018 0.348 0.619 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="100", plot="h2o_64_md", label="(4n/36r/1t)", y=30.734000, yerr=0.000000 PlotPoint: name="101", plot="h2o_64_md_mem", label="(4n/36r/1t)", y=170.454545, yerr=0.782030 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/04/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 207618048 0.0% 100.0% 0.0% flops 22 x 32 x 32 253755392 0.0% 100.0% 0.0% flops 142 x 32 x 213 3685656576 0.0% 100.0% 0.0% flops 142 x 32 x 218 3772174336 0.0% 100.0% 0.0% flops 182 x 32 x 213 4723869696 0.0% 100.0% 0.0% flops 182 x 32 x 218 4834758656 0.0% 100.0% 0.0% flops 187 x 32 x 213 4853646336 0.0% 100.0% 0.0% flops 191 x 32 x 213 4957467648 0.0% 100.0% 0.0% flops 187 x 32 x 218 4967581696 0.0% 100.0% 0.0% flops 191 x 32 x 218 5073840128 0.0% 100.0% 0.0% flops 196 x 32 x 213 5087244288 0.0% 100.0% 0.0% flops 196 x 32 x 218 5206663168 0.0% 100.0% 0.0% flops 209 x 32 x 213 5424663552 0.0% 100.0% 0.0% flops 209 x 32 x 218 5552003072 0.0% 100.0% 0.0% flops 218 x 32 x 213 5658261504 0.0% 100.0% 0.0% flops 218 x 32 x 218 5791084544 0.0% 100.0% 0.0% flops 240 x 32 x 213 6229278720 0.0% 100.0% 0.0% flops 240 x 32 x 218 6375505920 0.0% 100.0% 0.0% flops 249 x 32 x 213 6462876672 0.0% 100.0% 0.0% flops 249 x 32 x 218 6614587392 0.0% 100.0% 0.0% flops 284 x 32 x 213 7371313152 0.0% 100.0% 0.0% flops 284 x 32 x 218 7544348672 0.0% 100.0% 0.0% flops 142 x 32 x 32 9641132032 0.0% 100.0% 0.0% flops 142 x 32 x 209 10849327104 0.0% 100.0% 0.0% flops 231 x 32 x 213 11991361536 0.0% 100.0% 0.0% flops 231 x 32 x 218 12272848896 0.0% 100.0% 0.0% flops 182 x 32 x 32 12356943872 0.0% 100.0% 0.0% flops 187 x 32 x 32 12696420352 0.0% 100.0% 0.0% flops 191 x 32 x 32 12968001536 0.0% 100.0% 0.0% flops 196 x 32 x 32 13307478016 0.0% 100.0% 0.0% flops 182 x 32 x 209 13905475584 0.0% 100.0% 0.0% flops 209 x 32 x 32 14190116864 0.0% 100.0% 0.0% flops 187 x 32 x 209 14287494144 0.0% 100.0% 0.0% flops 191 x 32 x 209 14593108992 0.0% 100.0% 0.0% flops 218 x 32 x 32 14801174528 0.0% 100.0% 0.0% flops 196 x 32 x 209 14975127552 0.0% 100.0% 0.0% flops 209 x 32 x 209 15968375808 0.0% 100.0% 0.0% flops 240 x 32 x 32 16294871040 0.0% 100.0% 0.0% flops 218 x 32 x 209 16656009216 0.0% 100.0% 0.0% flops 249 x 32 x 32 16905928704 0.0% 100.0% 0.0% flops 240 x 32 x 209 18336890880 0.0% 100.0% 0.0% flops 249 x 32 x 209 19024524288 0.0% 100.0% 0.0% flops 284 x 32 x 32 19282264064 0.0% 100.0% 0.0% flops 284 x 32 x 209 21698654208 0.0% 100.0% 0.0% flops 32 x 32 x 213 23507238912 0.0% 100.0% 0.0% flops 32 x 32 x 218 24059052032 0.0% 100.0% 0.0% flops 32 x 32 x 32 26877100032 0.0% 100.0% 0.0% flops 231 x 32 x 32 31367626752 0.0% 100.0% 0.0% flops 231 x 32 x 209 35298514944 0.0% 100.0% 0.0% flops 9 x 9 x 32 44168260608 0.0% 100.0% 0.0% flops 22 x 9 x 32 53835724800 0.0% 100.0% 0.0% flops 9 x 22 x 32 53885500416 0.0% 100.0% 0.0% flops 22 x 22 x 32 67007283200 0.0% 100.0% 0.0% flops 32 x 32 x 209 69197365248 0.0% 100.0% 0.0% flops 9 x 32 x 9 185405884416 0.0% 100.0% 0.0% flops 22 x 32 x 9 227871249408 0.0% 100.0% 0.0% flops 9 x 32 x 22 227871249408 0.0% 100.0% 0.0% flops 22 x 32 x 22 279130931200 0.0% 100.0% 0.0% flops inhomo. stacks 103113707520 100.0% 0.0% 0.0% flops total 1.890248E+12 5.5% 94.5% 0.0% flops max/rank 40.294274E+09 6.1% 93.9% 0.0% matmuls inhomo. stacks 76736 100.0% 0.0% 0.0% matmuls total 101210040 0.1% 99.9% 0.0% number of processed stacks 3136704 2.4% 97.6% 0.0% average stack size 1.0 33.1 0.0 marketing flops 2.107629E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 202.665984E+06 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 3456432 MPI messages size (bytes): total size 321.940816E+09 min size 0.000000E+00 max size 765.456000E+03 average size 93.142531E+03 MPI breakdown and total messages size (bytes): size <= 128 1163952 0 128 < size <= 8192 704472 5771034624 8192 < size <= 32768 140976 2309750784 32768 < size <= 131072 1134984 87058022400 131072 < size <= 4194304 312048 226802306368 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3683 62487. MP_Allreduce 10328 308. MP_Sync 54 MP_Alltoall 2082 575825. MP_SendRecv 16898 6600. MP_ISendRecv 16898 6600. MP_Wait 35258 MP_comm_split 50 MP_ISend 15892 63460. MP_IRecv 15892 63460. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.116 0.255 43.201 43.211 qs_mol_dyn_low 1 2.0 0.179 0.293 41.877 41.894 qs_forces 11 3.9 0.004 0.010 41.242 41.645 qs_energies 11 4.9 0.001 0.003 39.072 39.478 scf_env_do_scf 11 5.9 0.001 0.004 34.517 34.525 scf_env_do_scf_inner_loop 108 6.5 0.003 0.034 30.906 30.911 velocity_verlet 10 3.0 0.002 0.006 22.945 22.965 qs_scf_new_mos 108 7.5 0.001 0.001 13.242 13.403 qs_scf_loop_do_ot 108 8.5 0.001 0.001 13.241 13.402 ot_scf_mini 108 9.5 0.003 0.004 12.578 12.679 dbcsr_multiply_generic 2286 12.5 0.131 0.140 11.827 12.388 rebuild_ks_matrix 119 8.3 0.001 0.001 11.726 11.960 qs_ks_build_kohn_sham_matrix 119 9.3 0.015 0.018 11.726 11.959 qs_ks_update_qs_env 119 7.6 0.001 0.001 10.424 10.633 qs_rho_update_rho_low 119 7.7 0.003 0.009 9.372 9.398 calculate_rho_elec 119 8.7 0.021 0.030 9.369 9.398 sum_up_and_integrate 119 10.3 0.002 0.004 9.035 9.045 integrate_v_rspace 119 11.3 0.003 0.004 9.021 9.031 multiply_cannon 2286 13.5 0.225 0.233 6.286 7.559 ot_mini 108 10.5 0.001 0.002 6.734 6.868 mp_waitall_1 220534 16.5 4.375 6.731 4.375 6.731 multiply_cannon_loop 2286 14.5 0.212 0.228 5.026 6.555 qs_ot_get_derivative 108 11.5 0.001 0.002 5.276 5.379 grid_collocate_task_list 119 9.7 4.704 4.985 4.704 4.985 multiply_cannon_metrocomm3 27432 15.5 0.089 0.094 1.952 4.880 grid_integrate_task_list 119 12.3 4.630 4.832 4.630 4.832 density_rs2pw 119 9.7 0.006 0.007 3.961 4.195 qs_ot_get_p 119 10.4 0.001 0.001 3.903 4.041 make_m2s 4572 13.5 0.087 0.090 3.494 3.835 init_scf_loop 11 6.9 0.000 0.000 3.566 3.569 potential_pw2rs 119 12.3 0.008 0.009 3.501 3.552 fft_wrap_pw1pw2 1201 11.6 0.016 0.019 3.287 3.369 make_images 4572 14.5 0.223 0.275 3.029 3.334 transfer_rs2pw 487 10.6 0.007 0.009 2.599 3.216 init_scf_run 11 5.9 0.000 0.008 3.176 3.177 scf_env_initial_rho_setup 11 6.9 0.005 0.020 3.176 3.177 fft3d_ps 1201 13.6 0.656 0.750 2.856 2.940 mp_waitany 15892 13.8 2.297 2.933 2.297 2.933 multiply_cannon_multrec 27432 15.5 1.745 2.700 1.757 2.712 transfer_pw2rs 487 13.2 0.006 0.007 2.580 2.631 fft_wrap_pw1pw2_140 487 12.2 0.101 0.110 2.545 2.629 mp_alltoall_d11v 2130 13.8 2.265 2.513 2.265 2.513 mp_sum_l 11298 13.2 1.427 2.192 1.427 2.192 qs_ot_p2m_diag 50 11.0 0.006 0.014 2.129 2.148 prepare_preconditioner 11 7.9 0.000 0.000 2.021 2.041 make_preconditioner 11 8.9 0.000 0.000 2.021 2.041 wfi_extrapolate 11 7.9 0.001 0.003 1.990 1.991 mp_alltoall_z22v 1201 15.6 1.785 1.990 1.785 1.990 cp_dbcsr_syevd 50 12.0 0.004 0.004 1.862 1.878 qs_ot_get_derivative_diag 49 12.0 0.001 0.001 1.819 1.867 make_full_inverse_cholesky 11 9.9 0.000 0.000 1.817 1.862 make_images_data 4572 15.5 0.063 0.072 1.200 1.834 qs_ot_get_derivative_taylor 59 13.0 0.002 0.002 1.762 1.817 transfer_rs2pw_140 130 11.5 0.221 0.264 1.058 1.730 hybrid_alltoall_any 4725 16.4 0.064 0.127 1.045 1.720 multiply_cannon_metrocomm1 27432 15.5 0.047 0.052 0.598 1.615 make_images_sizes 4572 15.5 0.006 0.006 1.147 1.563 mp_alltoall_i44 4572 16.5 1.141 1.557 1.141 1.557 dbcsr_complete_redistribute 329 12.2 0.147 0.245 1.380 1.484 qs_energies_init_hamiltonians 11 5.9 0.001 0.002 1.075 1.478 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 1.398 1.423 ot_diis_step 108 11.5 0.013 0.015 1.390 1.391 cp_fm_diag_elpa 50 13.0 0.000 0.000 1.347 1.348 apply_preconditioner_dbcsr 119 12.6 0.000 0.001 1.144 1.322 apply_single 119 13.6 0.000 0.000 1.144 1.322 cp_fm_redistribute_end 50 14.0 0.662 1.291 0.674 1.299 mp_allgather_i34 2286 14.5 0.781 1.265 0.781 1.265 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 1.247 1.251 cp_fm_diag_elpa_base 50 14.0 0.600 1.207 0.611 1.236 yz_to_x 368 14.5 0.056 0.067 1.086 1.230 mp_sum_d 4139 12.0 0.698 1.200 0.698 1.200 qs_env_update_s_mstruct 11 6.9 0.000 0.001 0.705 1.145 transfer_pw2rs_50 119 14.3 0.100 0.115 0.920 1.075 rs_gather_matrices 119 12.3 0.066 0.074 0.828 1.067 transfer_pw2rs_140 130 13.9 0.313 0.355 0.979 1.065 copy_fm_to_dbcsr 176 11.2 0.001 0.001 0.924 1.033 calculate_first_density_matrix 1 7.0 0.000 0.000 0.980 0.981 calculate_rho_core 11 7.9 0.021 0.023 0.474 0.921 cp_fm_cholesky_invert 11 10.9 0.903 0.915 0.903 0.915 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="102", plot="h2o_64_md", label="(4n/18r/2t)", y=43.211000, yerr=0.000000 PlotPoint: name="103", plot="h2o_64_md_mem", label="(4n/18r/2t)", y=193.545455, yerr=0.890724 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/05/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 207618048 0.0% 100.0% 0.0% flops 22 x 32 x 32 253755392 0.0% 100.0% 0.0% flops 142 x 32 x 200 3460710400 0.0% 100.0% 0.0% flops 164 x 32 x 200 3996876800 0.0% 100.0% 0.0% flops 209 x 32 x 200 5093580800 0.0% 100.0% 0.0% flops 213 x 32 x 200 5191065600 0.0% 100.0% 0.0% flops 64 x 32 x 200 5518131200 0.0% 100.0% 0.0% flops 231 x 32 x 200 5629747200 0.0% 100.0% 0.0% flops 262 x 32 x 200 6385254400 0.0% 100.0% 0.0% flops 64 x 32 x 32 6719275008 0.0% 100.0% 0.0% flops 293 x 32 x 200 7140761600 0.0% 100.0% 0.0% flops 142 x 32 x 209 7232884736 0.0% 100.0% 0.0% flops 142 x 32 x 222 7682777088 0.0% 100.0% 0.0% flops 164 x 32 x 209 8353472512 0.0% 100.0% 0.0% flops 164 x 32 x 222 8873066496 0.0% 100.0% 0.0% flops 196 x 32 x 200 9553510400 0.0% 100.0% 0.0% flops 142 x 32 x 32 9641132032 0.0% 100.0% 0.0% flops 209 x 32 x 209 10645583872 0.0% 100.0% 0.0% flops 213 x 32 x 209 10849327104 0.0% 100.0% 0.0% flops 164 x 32 x 32 11134828544 0.0% 100.0% 0.0% flops 209 x 32 x 222 11307749376 0.0% 100.0% 0.0% flops 213 x 32 x 222 11524165632 0.0% 100.0% 0.0% flops 64 x 32 x 209 11532894208 0.0% 100.0% 0.0% flops 231 x 32 x 209 11766171648 0.0% 100.0% 0.0% flops 64 x 32 x 222 12250251264 0.0% 100.0% 0.0% flops 231 x 32 x 222 12498038784 0.0% 100.0% 0.0% flops 262 x 32 x 209 13345181696 0.0% 100.0% 0.0% flops 262 x 32 x 222 14175264768 0.0% 100.0% 0.0% flops 209 x 32 x 32 14190116864 0.0% 100.0% 0.0% flops 213 x 32 x 32 14461698048 0.0% 100.0% 0.0% flops 293 x 32 x 209 14924191744 0.0% 100.0% 0.0% flops 231 x 32 x 32 15683813376 0.0% 100.0% 0.0% flops 293 x 32 x 222 15852490752 0.0% 100.0% 0.0% flops 218 x 32 x 200 15938764800 0.0% 100.0% 0.0% flops 32 x 32 x 200 16554393600 0.0% 100.0% 0.0% flops 262 x 32 x 32 17788567552 0.0% 100.0% 0.0% flops 293 x 32 x 32 19893321728 0.0% 100.0% 0.0% flops 196 x 32 x 209 19966836736 0.0% 100.0% 0.0% flops 32 x 32 x 32 20157825024 0.0% 100.0% 0.0% flops 196 x 32 x 222 21208793088 0.0% 100.0% 0.0% flops 196 x 32 x 32 26614956032 0.0% 100.0% 0.0% flops 218 x 32 x 209 33312018432 0.0% 100.0% 0.0% flops 32 x 32 x 209 34598682624 0.0% 100.0% 0.0% flops 218 x 32 x 222 35384057856 0.0% 100.0% 0.0% flops 32 x 32 x 222 36750753792 0.0% 100.0% 0.0% flops 9 x 9 x 32 44168260608 0.0% 100.0% 0.0% flops 218 x 32 x 32 44403523584 0.0% 100.0% 0.0% flops 22 x 9 x 32 53835724800 0.0% 100.0% 0.0% flops 9 x 22 x 32 53885500416 0.0% 100.0% 0.0% flops 22 x 22 x 32 67007283200 0.0% 100.0% 0.0% flops 9 x 32 x 9 185405884416 0.0% 100.0% 0.0% flops 22 x 32 x 9 227871249408 0.0% 100.0% 0.0% flops 9 x 32 x 22 227871249408 0.0% 100.0% 0.0% flops 22 x 32 x 22 279130931200 0.0% 100.0% 0.0% flops inhomo. stacks 105981222912 100.0% 0.0% 0.0% flops total 1.894805E+12 5.6% 94.4% 0.0% flops max/rank 58.021006E+09 6.4% 93.6% 0.0% matmuls inhomo. stacks 70000 100.0% 0.0% 0.0% matmuls total 101118360 0.1% 99.9% 0.0% number of processed stacks 3045024 2.3% 97.7% 0.0% average stack size 1.0 34.0 0.0 marketing flops 2.107592E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 220.942336E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 2194560 MPI messages size (bytes): total size 310.646604E+09 min size 0.000000E+00 max size 1.145520E+06 average size 141.553031E+03 MPI breakdown and total messages size (bytes): size <= 128 724648 0 128 < size <= 8192 253512 2076770304 8192 < size <= 32768 281952 4619501568 32768 < size <= 131072 494448 39143342080 131072 < size <= 4194304 440000 264807943488 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3672 62658. MP_Allreduce 10306 303. MP_Sync 54 MP_Alltoall 2060 1571200. MP_SendRecv 16779 37093. MP_ISendRecv 16779 37093. MP_Wait 23539 MP_comm_split 50 MP_ISend 5720 128509. MP_IRecv 5720 128509. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.037 0.093 36.509 36.511 qs_mol_dyn_low 1 2.0 0.135 0.321 35.891 35.897 qs_forces 11 3.9 0.007 0.022 35.261 35.308 qs_energies 11 4.9 0.001 0.001 33.572 33.622 scf_env_do_scf 11 5.9 0.001 0.002 27.849 27.850 scf_env_do_scf_inner_loop 108 6.5 0.003 0.018 24.770 24.782 velocity_verlet 10 3.0 0.001 0.002 18.440 18.457 qs_scf_new_mos 108 7.5 0.001 0.001 11.333 11.415 qs_scf_loop_do_ot 108 8.5 0.001 0.001 11.332 11.415 ot_scf_mini 108 9.5 0.003 0.006 10.791 10.878 dbcsr_multiply_generic 2286 12.5 0.131 0.193 10.429 10.743 rebuild_ks_matrix 119 8.3 0.001 0.001 8.931 9.073 qs_ks_build_kohn_sham_matrix 119 9.3 0.014 0.016 8.930 9.072 qs_ks_update_qs_env 119 7.6 0.001 0.002 7.952 8.081 qs_rho_update_rho_low 119 7.7 0.001 0.002 7.036 7.044 calculate_rho_elec 119 8.7 0.026 0.031 7.035 7.044 sum_up_and_integrate 119 10.3 0.001 0.002 6.514 6.540 integrate_v_rspace 119 11.3 0.003 0.003 6.503 6.529 multiply_cannon 2286 13.5 0.225 0.259 5.485 6.235 ot_mini 108 10.5 0.009 0.036 5.800 5.890 mp_waitall_1 200699 16.5 2.874 5.164 2.874 5.164 multiply_cannon_loop 2286 14.5 0.225 0.335 4.505 5.141 grid_collocate_task_list 119 9.7 4.236 4.697 4.236 4.697 qs_ot_get_derivative 108 11.5 0.001 0.003 4.320 4.411 grid_integrate_task_list 119 12.3 3.904 4.133 3.904 4.133 multiply_cannon_metrocomm3 27432 15.5 0.093 0.195 1.648 3.978 init_scf_run 11 5.9 0.000 0.002 3.791 3.791 scf_env_initial_rho_setup 11 6.9 0.000 0.002 3.790 3.791 multiply_cannon_multrec 27432 15.5 1.845 3.640 1.858 3.653 make_m2s 4572 13.5 0.089 0.127 3.232 3.387 qs_ot_get_p 119 10.4 0.001 0.001 3.164 3.306 density_rs2pw 119 9.7 0.005 0.008 2.503 3.270 transfer_rs2pw 487 10.6 0.005 0.007 2.097 3.078 init_scf_loop 11 6.9 0.000 0.000 3.057 3.070 make_images 4572 14.5 0.234 0.295 2.651 2.869 mp_waitany 5720 13.7 1.515 2.555 1.515 2.555 fft_wrap_pw1pw2 1201 11.6 0.012 0.015 2.375 2.441 transfer_rs2pw_140 130 11.5 0.151 0.170 1.453 2.433 fft3d_ps 1201 13.6 0.584 0.700 2.083 2.228 calculate_first_density_matrix 1 7.0 0.000 0.000 2.168 2.169 potential_pw2rs 119 12.3 0.007 0.010 2.078 2.105 fft_wrap_pw1pw2_140 487 12.2 0.060 0.073 1.927 1.983 prepare_preconditioner 11 7.9 0.000 0.000 1.905 1.916 make_preconditioner 11 8.9 0.000 0.000 1.905 1.916 make_full_inverse_cholesky 11 9.9 0.000 0.000 1.757 1.797 qs_ot_p2m_diag 50 11.0 0.008 0.014 1.721 1.782 qs_energies_init_hamiltonians 11 5.9 0.003 0.012 1.713 1.758 qs_ot_get_derivative_diag 49 12.0 0.001 0.002 1.544 1.604 cp_dbcsr_syevd 50 12.0 0.003 0.004 1.467 1.521 mp_alltoall_z22v 1201 15.6 1.248 1.520 1.248 1.520 qs_ot_get_derivative_taylor 59 13.0 0.002 0.003 1.477 1.513 mp_sum_l 11298 13.2 1.097 1.508 1.097 1.508 make_images_sizes 4572 15.5 0.006 0.014 1.084 1.498 wfi_extrapolate 11 7.9 0.001 0.001 1.495 1.495 mp_alltoall_i44 4572 16.5 1.078 1.492 1.078 1.492 mp_sum_d 4139 12.0 0.841 1.480 0.841 1.480 transfer_pw2rs 487 13.2 0.004 0.005 1.441 1.454 ot_diis_step 108 11.5 0.017 0.026 1.435 1.445 mp_alltoall_d11v 2130 13.8 1.120 1.410 1.120 1.410 make_basis_sm 11 9.8 0.000 0.000 1.388 1.392 qs_env_update_s_mstruct 11 6.9 0.000 0.001 1.157 1.388 apply_preconditioner_dbcsr 119 12.6 0.001 0.002 1.176 1.328 apply_single 119 13.6 0.000 0.001 1.175 1.327 make_images_data 4572 15.5 0.063 0.117 0.901 1.182 calculate_rho_core 11 7.9 0.023 0.025 0.906 1.167 cp_fm_diag_elpa 50 13.0 0.000 0.000 1.135 1.136 cp_fm_redistribute_end 50 14.0 0.561 1.104 0.566 1.107 cp_fm_diag_elpa_base 50 14.0 0.492 1.022 0.536 1.079 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 1.052 1.066 hybrid_alltoall_any 4725 16.4 0.066 0.122 0.778 1.033 dbcsr_dot_sd 1205 11.9 0.087 0.110 0.562 1.001 build_core_hamiltonian_matrix 11 6.9 0.039 0.157 0.440 0.970 multiply_cannon_metrocomm4 25146 15.5 0.087 0.191 0.457 0.929 calculate_atomic_block_dm 1 8.0 0.001 0.001 0.926 0.926 yz_to_x 368 14.5 0.039 0.048 0.750 0.914 parallel_gemm_fm 81 9.0 0.000 0.000 0.843 0.849 parallel_gemm_fm_cosma 81 10.0 0.843 0.849 0.843 0.849 cp_fm_cholesky_invert 11 10.9 0.829 0.841 0.829 0.841 mp_irecv_dv 59094 16.3 0.334 0.804 0.334 0.804 cp_fm_cholesky_decompose 22 10.9 0.762 0.798 0.762 0.798 dbcsr_complete_redistribute 329 12.2 0.102 0.127 0.717 0.790 mp_allgather_i34 2286 14.5 0.472 0.780 0.472 0.780 qs_ot_get_orbitals 108 10.5 0.001 0.001 0.744 0.767 rs_gather_matrices 119 12.3 0.037 0.044 0.494 0.753 transfer_pw2rs_50 119 14.3 0.351 0.392 0.634 0.731 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="104", plot="h2o_64_md", label="(4n/12r/3t)", y=36.511000, yerr=0.000000 PlotPoint: name="105", plot="h2o_64_md_mem", label="(4n/12r/3t)", y=210.454545, yerr=0.497930 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/06/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 207618048 0.0% 100.0% 0.0% flops 22 x 32 x 32 253755392 0.0% 100.0% 0.0% flops 80 x 32 x 32 1357905920 0.0% 100.0% 0.0% flops 80 x 32 x 64 1357905920 0.0% 100.0% 0.0% flops 80 x 64 x 32 1357905920 0.0% 100.0% 0.0% flops 80 x 64 x 64 1357905920 0.0% 100.0% 0.0% flops 89 x 32 x 32 1510670336 0.0% 100.0% 0.0% flops 89 x 32 x 64 1510670336 0.0% 100.0% 0.0% flops 89 x 64 x 32 1510670336 0.0% 100.0% 0.0% flops 89 x 64 x 64 1510670336 0.0% 100.0% 0.0% flops 64 x 64 x 64 1679818752 0.0% 100.0% 0.0% flops 64 x 64 x 32 1679818752 0.0% 100.0% 0.0% flops 64 x 32 x 32 1679818752 0.0% 100.0% 0.0% flops 64 x 32 x 64 1679818752 0.0% 100.0% 0.0% flops 80 x 32 x 422 2056929280 0.0% 100.0% 0.0% flops 80 x 64 x 422 2056929280 0.0% 100.0% 0.0% flops 80 x 32 x 427 2081300480 0.0% 100.0% 0.0% flops 80 x 64 x 427 2081300480 0.0% 100.0% 0.0% flops 80 x 32 x 431 2100797440 0.0% 100.0% 0.0% flops 80 x 64 x 431 2100797440 0.0% 100.0% 0.0% flops 89 x 32 x 422 2288333824 0.0% 100.0% 0.0% flops 89 x 64 x 422 2288333824 0.0% 100.0% 0.0% flops 89 x 32 x 427 2315446784 0.0% 100.0% 0.0% flops 89 x 64 x 427 2315446784 0.0% 100.0% 0.0% flops 89 x 32 x 431 2337137152 0.0% 100.0% 0.0% flops 89 x 64 x 431 2337137152 0.0% 100.0% 0.0% flops 71 x 64 x 64 3615424512 0.0% 100.0% 0.0% flops 71 x 64 x 32 3615424512 0.0% 100.0% 0.0% flops 71 x 32 x 32 3615424512 0.0% 100.0% 0.0% flops 71 x 32 x 64 3615424512 0.0% 100.0% 0.0% flops 32 x 32 x 32 5039456256 0.0% 100.0% 0.0% flops 32 x 32 x 64 5039456256 0.0% 100.0% 0.0% flops 32 x 64 x 64 5039456256 0.0% 100.0% 0.0% flops 32 x 64 x 32 5039456256 0.0% 100.0% 0.0% flops 71 x 64 x 422 5476574208 0.0% 100.0% 0.0% flops 71 x 32 x 422 5476574208 0.0% 100.0% 0.0% flops 71 x 64 x 427 5541462528 0.0% 100.0% 0.0% flops 71 x 32 x 427 5541462528 0.0% 100.0% 0.0% flops 71 x 64 x 431 5593373184 0.0% 100.0% 0.0% flops 71 x 32 x 431 5593373184 0.0% 100.0% 0.0% flops 64 x 64 x 422 5821628416 0.0% 100.0% 0.0% flops 64 x 32 x 422 5821628416 0.0% 100.0% 0.0% flops 64 x 64 x 427 5890605056 0.0% 100.0% 0.0% flops 64 x 32 x 427 5890605056 0.0% 100.0% 0.0% flops 64 x 64 x 431 5945786368 0.0% 100.0% 0.0% flops 64 x 32 x 431 5945786368 0.0% 100.0% 0.0% flops 111 x 64 x 64 9420472320 0.0% 100.0% 0.0% flops 111 x 64 x 32 9420472320 0.0% 100.0% 0.0% flops 111 x 32 x 32 9420472320 0.0% 100.0% 0.0% flops 111 x 32 x 64 9420472320 0.0% 100.0% 0.0% flops 98 x 64 x 64 9980608512 0.0% 100.0% 0.0% flops 98 x 64 x 32 9980608512 0.0% 100.0% 0.0% flops 98 x 32 x 32 9980608512 0.0% 100.0% 0.0% flops 98 x 32 x 64 9980608512 0.0% 100.0% 0.0% flops 120 x 32 x 32 10184294400 0.0% 100.0% 0.0% flops 120 x 32 x 64 10184294400 0.0% 100.0% 0.0% flops 120 x 64 x 64 10184294400 0.0% 100.0% 0.0% flops 120 x 64 x 32 10184294400 0.0% 100.0% 0.0% flops 111 x 64 x 422 14269946880 0.0% 100.0% 0.0% flops 111 x 32 x 422 14269946880 0.0% 100.0% 0.0% flops 111 x 64 x 427 14439022080 0.0% 100.0% 0.0% flops 111 x 32 x 427 14439022080 0.0% 100.0% 0.0% flops 111 x 64 x 431 14574282240 0.0% 100.0% 0.0% flops 111 x 32 x 431 14574282240 0.0% 100.0% 0.0% flops 98 x 64 x 422 15118430208 0.0% 100.0% 0.0% flops 98 x 32 x 422 15118430208 0.0% 100.0% 0.0% flops 98 x 64 x 427 15297558528 0.0% 100.0% 0.0% flops 98 x 32 x 427 15297558528 0.0% 100.0% 0.0% flops 120 x 32 x 422 15426969600 0.0% 100.0% 0.0% flops 120 x 64 x 422 15426969600 0.0% 100.0% 0.0% flops 98 x 64 x 431 15440861184 0.0% 100.0% 0.0% flops 98 x 32 x 431 15440861184 0.0% 100.0% 0.0% flops 120 x 32 x 427 15609753600 0.0% 100.0% 0.0% flops 120 x 64 x 427 15609753600 0.0% 100.0% 0.0% flops 120 x 32 x 431 15755980800 0.0% 100.0% 0.0% flops 120 x 64 x 431 15755980800 0.0% 100.0% 0.0% flops 32 x 32 x 422 17464885248 0.0% 100.0% 0.0% flops 32 x 64 x 422 17464885248 0.0% 100.0% 0.0% flops 32 x 32 x 427 17671815168 0.0% 100.0% 0.0% flops 32 x 64 x 427 17671815168 0.0% 100.0% 0.0% flops 32 x 32 x 431 17837359104 0.0% 100.0% 0.0% flops 32 x 64 x 431 17837359104 0.0% 100.0% 0.0% flops 9 x 9 x 64 22084130304 0.0% 100.0% 0.0% flops 9 x 9 x 32 22084130304 0.0% 100.0% 0.0% flops 22 x 9 x 64 26917862400 0.0% 100.0% 0.0% flops 22 x 9 x 32 26917862400 0.0% 100.0% 0.0% flops 9 x 22 x 64 26942750208 0.0% 100.0% 0.0% flops 9 x 22 x 32 26942750208 0.0% 100.0% 0.0% flops 22 x 22 x 64 33503641600 0.0% 100.0% 0.0% flops 22 x 22 x 32 33503641600 0.0% 100.0% 0.0% flops 9 x 32 x 9 185405884416 0.0% 100.0% 0.0% flops 22 x 32 x 9 227871249408 0.0% 100.0% 0.0% flops 9 x 32 x 22 227871249408 0.0% 100.0% 0.0% flops 22 x 32 x 22 279130931200 0.0% 100.0% 0.0% flops inhomo. stacks 112840197120 100.0% 0.0% 0.0% flops total 1.896345E+12 6.0% 94.0% 0.0% flops max/rank 83.755100E+09 11.7% 88.3% 0.0% matmuls inhomo. stacks 68796 100.0% 0.0% 0.0% matmuls total 96003990 0.1% 99.9% 0.0% number of processed stacks 2257260 3.0% 97.0% 0.0% average stack size 1.0 43.8 0.0 marketing flops 2.107629E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 234.516480E+06 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 822960 MPI messages size (bytes): total size 161.737343E+09 min size 0.000000E+00 max size 1.486088E+06 average size 196.531234E+03 MPI breakdown and total messages size (bytes): size <= 128 5610 0 128 < size <= 8192 169820 1391165440 8192 < size <= 32768 212110 4169891840 32768 < size <= 131072 243000 26542080000 131072 < size <= 4194304 192420 129634037440 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 95 12. MP_Allreduce 12329 16. MP_Alltoall 8655 36603. MP_ISend 54820 93714. MP_IRecv 54820 91356. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3683 62471. MP_Allreduce 10327 343. MP_Sync 54 MP_Alltoall 1843 2366062. MP_SendRecv 8330 18700. MP_ISendRecv 8330 18700. MP_Wait 31172 MP_comm_split 50 MP_ISend 20872 59666. MP_IRecv 20872 59666. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.255 0.382 37.701 37.711 qs_mol_dyn_low 1 2.0 0.145 0.205 36.296 36.330 qs_forces 11 3.9 0.047 0.108 35.398 35.494 qs_energies 11 4.9 0.003 0.006 33.361 33.482 scf_env_do_scf 11 5.9 0.001 0.003 28.202 28.204 scf_env_do_scf_inner_loop 108 6.5 0.006 0.028 24.743 24.793 velocity_verlet 10 3.0 0.001 0.002 18.377 18.381 qs_scf_new_mos 108 7.5 0.001 0.001 10.159 10.315 qs_scf_loop_do_ot 108 8.5 0.001 0.001 10.158 10.314 rebuild_ks_matrix 119 8.3 0.001 0.001 9.840 10.024 qs_ks_build_kohn_sham_matrix 119 9.3 0.016 0.021 9.840 10.023 ot_scf_mini 108 9.5 0.003 0.004 9.662 9.826 dbcsr_multiply_generic 2286 12.5 0.125 0.172 8.798 9.117 qs_ks_update_qs_env 119 7.6 0.001 0.001 8.775 8.941 qs_rho_update_rho_low 119 7.7 0.001 0.002 8.002 8.017 calculate_rho_elec 119 8.7 0.035 0.041 8.001 8.017 sum_up_and_integrate 119 10.3 0.002 0.005 7.470 7.477 integrate_v_rspace 119 11.3 0.003 0.004 7.457 7.464 multiply_cannon 2286 13.5 0.208 0.252 4.354 5.610 grid_collocate_task_list 119 9.7 5.347 5.562 5.347 5.562 ot_mini 108 10.5 0.001 0.001 5.024 5.180 grid_integrate_task_list 119 12.3 4.886 5.108 4.886 5.108 multiply_cannon_loop 2286 14.5 0.126 0.187 3.494 4.453 qs_ot_get_derivative 108 11.5 0.001 0.002 3.731 3.896 init_scf_run 11 5.9 0.000 0.006 3.888 3.889 scf_env_initial_rho_setup 11 6.9 0.011 0.022 3.888 3.889 init_scf_loop 11 6.9 0.001 0.005 3.434 3.486 qs_ot_get_p 119 10.4 0.001 0.001 3.270 3.437 multiply_cannon_multrec 13716 15.5 2.040 3.320 2.053 3.334 mp_waitall_1 156604 16.6 1.897 2.819 1.897 2.819 make_m2s 4572 13.5 0.074 0.099 2.677 2.804 fft_wrap_pw1pw2 1201 11.6 0.014 0.017 2.413 2.433 density_rs2pw 119 9.7 0.006 0.009 2.190 2.369 make_images 4572 14.5 0.260 0.313 2.196 2.311 fft3d_ps 1201 13.6 0.673 0.717 1.964 1.987 fft_wrap_pw1pw2_140 487 12.2 0.111 0.118 1.959 1.976 calculate_first_density_matrix 1 7.0 0.001 0.002 1.972 1.974 prepare_preconditioner 11 7.9 0.000 0.000 1.951 1.964 make_preconditioner 11 8.9 0.000 0.001 1.951 1.963 potential_pw2rs 119 12.3 0.009 0.012 1.901 1.911 make_full_inverse_cholesky 11 9.9 0.000 0.000 1.813 1.843 qs_ot_p2m_diag 50 11.0 0.010 0.023 1.794 1.823 mp_alltoall_d11v 2130 13.8 1.413 1.629 1.413 1.629 cp_dbcsr_syevd 50 12.0 0.003 0.004 1.594 1.615 transfer_rs2pw 487 10.6 0.006 0.007 1.200 1.497 mp_sum_l 11298 13.2 1.110 1.467 1.110 1.467 wfi_extrapolate 11 7.9 0.002 0.005 1.427 1.427 qs_ot_get_derivative_taylor 59 13.0 0.002 0.003 1.311 1.414 mp_waitany 20872 13.8 1.050 1.399 1.050 1.399 qs_ot_get_derivative_diag 49 12.0 0.001 0.002 1.295 1.355 cp_fm_diag_elpa 50 13.0 0.000 0.000 1.273 1.273 ot_diis_step 108 11.5 0.035 0.056 1.267 1.269 multiply_cannon_metrocomm3 13716 15.5 0.043 0.092 0.516 1.257 cp_fm_redistribute_end 50 14.0 0.627 1.245 0.631 1.247 cp_fm_diag_elpa_base 50 14.0 0.572 1.154 0.603 1.227 transfer_pw2rs 487 13.2 0.005 0.006 1.212 1.220 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 1.170 1.189 qs_energies_init_hamiltonians 11 5.9 0.001 0.002 1.042 1.162 mp_alltoall_z22v 1201 15.6 1.107 1.156 1.107 1.156 apply_preconditioner_dbcsr 119 12.6 0.000 0.001 0.989 1.121 apply_single 119 13.6 0.000 0.001 0.989 1.121 make_images_data 4572 15.5 0.060 0.102 0.942 1.098 transfer_rs2pw_140 130 11.5 0.148 0.174 0.751 1.066 cp_fm_cholesky_invert 11 10.9 1.032 1.039 1.032 1.039 make_basis_sm 11 9.8 0.000 0.000 1.006 1.010 hybrid_alltoall_any 4725 16.4 0.071 0.203 0.800 0.964 calculate_atomic_block_dm 1 8.0 0.020 0.064 0.914 0.916 mp_allgather_i34 2286 14.5 0.388 0.869 0.388 0.869 dbcsr_complete_redistribute 329 12.2 0.170 0.239 0.824 0.862 arnoldi_extremal 119 11.4 0.002 0.003 0.738 0.837 arnoldi_normal_ev 119 12.4 0.002 0.004 0.736 0.835 rs_gather_matrices 119 12.3 0.056 0.062 0.630 0.829 cp_fm_cholesky_decompose 22 10.9 0.771 0.787 0.771 0.787 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 0.783 0.785 multiply_cannon_metrocomm1 13716 15.5 0.046 0.095 0.460 0.784 build_core_hamiltonian_matrix_ 11 4.9 0.001 0.001 0.688 0.762 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="106", plot="h2o_64_md", label="(4n/9r/4t)", y=37.711000, yerr=0.000000 PlotPoint: name="107", plot="h2o_64_md_mem", label="(4n/9r/4t)", y=222.454545, yerr=1.372697 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/07/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 207618048 0.0% 100.0% 0.0% flops 22 x 32 x 32 253755392 0.0% 100.0% 0.0% flops 40 x 32 x 200 487424000 0.0% 100.0% 0.0% flops 40 x 64 x 200 487424000 0.0% 100.0% 0.0% flops 62 x 32 x 200 755507200 0.0% 100.0% 0.0% flops 62 x 64 x 200 755507200 0.0% 100.0% 0.0% flops 76 x 64 x 200 926105600 0.0% 100.0% 0.0% flops 76 x 32 x 200 926105600 0.0% 100.0% 0.0% flops 40 x 32 x 209 1018716160 0.0% 100.0% 0.0% flops 40 x 64 x 209 1018716160 0.0% 100.0% 0.0% flops 40 x 32 x 222 1082081280 0.0% 100.0% 0.0% flops 40 x 64 x 222 1082081280 0.0% 100.0% 0.0% flops 111 x 32 x 200 1352601600 0.0% 100.0% 0.0% flops 111 x 64 x 200 1352601600 0.0% 100.0% 0.0% flops 40 x 32 x 32 1357905920 0.0% 100.0% 0.0% flops 40 x 64 x 32 1357905920 0.0% 100.0% 0.0% flops 62 x 32 x 209 1579010048 0.0% 100.0% 0.0% flops 62 x 64 x 209 1579010048 0.0% 100.0% 0.0% flops 62 x 32 x 222 1677225984 0.0% 100.0% 0.0% flops 62 x 64 x 222 1677225984 0.0% 100.0% 0.0% flops 76 x 64 x 209 1935560704 0.0% 100.0% 0.0% flops 76 x 32 x 209 1935560704 0.0% 100.0% 0.0% flops 76 x 64 x 222 2055954432 0.0% 100.0% 0.0% flops 76 x 32 x 222 2055954432 0.0% 100.0% 0.0% flops 85 x 64 x 200 2071552000 0.0% 100.0% 0.0% flops 85 x 32 x 200 2071552000 0.0% 100.0% 0.0% flops 62 x 32 x 32 2104754176 0.0% 100.0% 0.0% flops 62 x 64 x 32 2104754176 0.0% 100.0% 0.0% flops 98 x 32 x 200 2388377600 0.0% 100.0% 0.0% flops 98 x 64 x 200 2388377600 0.0% 100.0% 0.0% flops 76 x 64 x 32 2580021248 0.0% 100.0% 0.0% flops 76 x 32 x 32 2580021248 0.0% 100.0% 0.0% flops 64 x 64 x 200 2759065600 0.0% 100.0% 0.0% flops 64 x 32 x 200 2759065600 0.0% 100.0% 0.0% flops 111 x 32 x 209 2826937344 0.0% 100.0% 0.0% flops 111 x 64 x 209 2826937344 0.0% 100.0% 0.0% flops 120 x 32 x 200 2924544000 0.0% 100.0% 0.0% flops 120 x 64 x 200 2924544000 0.0% 100.0% 0.0% flops 111 x 32 x 222 3002775552 0.0% 100.0% 0.0% flops 111 x 64 x 222 3002775552 0.0% 100.0% 0.0% flops 64 x 64 x 32 3359637504 0.0% 100.0% 0.0% flops 64 x 32 x 32 3359637504 0.0% 100.0% 0.0% flops 111 x 32 x 32 3768188928 0.0% 100.0% 0.0% flops 111 x 64 x 32 3768188928 0.0% 100.0% 0.0% flops 85 x 64 x 209 4329543680 0.0% 100.0% 0.0% flops 85 x 32 x 209 4329543680 0.0% 100.0% 0.0% flops 89 x 64 x 200 4338073600 0.0% 100.0% 0.0% flops 89 x 32 x 200 4338073600 0.0% 100.0% 0.0% flops 85 x 64 x 222 4598845440 0.0% 100.0% 0.0% flops 85 x 32 x 222 4598845440 0.0% 100.0% 0.0% flops 98 x 32 x 209 4991709184 0.0% 100.0% 0.0% flops 98 x 64 x 209 4991709184 0.0% 100.0% 0.0% flops 98 x 32 x 222 5302198272 0.0% 100.0% 0.0% flops 98 x 64 x 222 5302198272 0.0% 100.0% 0.0% flops 64 x 64 x 209 5766447104 0.0% 100.0% 0.0% flops 64 x 32 x 209 5766447104 0.0% 100.0% 0.0% flops 85 x 64 x 32 5771100160 0.0% 100.0% 0.0% flops 85 x 32 x 32 5771100160 0.0% 100.0% 0.0% flops 120 x 32 x 209 6112296960 0.0% 100.0% 0.0% flops 120 x 64 x 209 6112296960 0.0% 100.0% 0.0% flops 64 x 64 x 222 6125125632 0.0% 100.0% 0.0% flops 64 x 32 x 222 6125125632 0.0% 100.0% 0.0% flops 120 x 32 x 222 6492487680 0.0% 100.0% 0.0% flops 120 x 64 x 222 6492487680 0.0% 100.0% 0.0% flops 98 x 32 x 32 6653739008 0.0% 100.0% 0.0% flops 98 x 64 x 32 6653739008 0.0% 100.0% 0.0% flops 120 x 32 x 32 8147435520 0.0% 100.0% 0.0% flops 120 x 64 x 32 8147435520 0.0% 100.0% 0.0% flops 32 x 32 x 200 8277196800 0.0% 100.0% 0.0% flops 32 x 64 x 200 8277196800 0.0% 100.0% 0.0% flops 89 x 64 x 209 9066573824 0.0% 100.0% 0.0% flops 89 x 32 x 209 9066573824 0.0% 100.0% 0.0% flops 89 x 64 x 222 9630523392 0.0% 100.0% 0.0% flops 89 x 32 x 222 9630523392 0.0% 100.0% 0.0% flops 32 x 32 x 32 10078912512 0.0% 100.0% 0.0% flops 32 x 64 x 32 10078912512 0.0% 100.0% 0.0% flops 89 x 64 x 32 12085362688 0.0% 100.0% 0.0% flops 89 x 32 x 32 12085362688 0.0% 100.0% 0.0% flops 32 x 32 x 209 17299341312 0.0% 100.0% 0.0% flops 32 x 64 x 209 17299341312 0.0% 100.0% 0.0% flops 32 x 32 x 222 18375376896 0.0% 100.0% 0.0% flops 32 x 64 x 222 18375376896 0.0% 100.0% 0.0% flops 9 x 9 x 32 44168260608 0.0% 100.0% 0.0% flops 22 x 9 x 32 53835724800 0.0% 100.0% 0.0% flops 9 x 22 x 32 53885500416 0.0% 100.0% 0.0% flops 22 x 22 x 32 67007283200 0.0% 100.0% 0.0% flops 9 x 32 x 9 185405884416 0.0% 100.0% 0.0% flops 22 x 32 x 9 227871249408 0.0% 100.0% 0.0% flops 9 x 32 x 22 227871249408 0.0% 100.0% 0.0% flops 22 x 32 x 22 279130931200 0.0% 100.0% 0.0% flops inhomo. stacks 409643995136 100.0% 0.0% 0.0% flops total 1.940194E+12 21.1% 78.9% 0.0% flops max/rank 126.888791E+09 24.8% 75.2% 0.0% matmuls inhomo. stacks 389676 100.0% 0.0% 0.0% matmuls total 101225376 0.4% 99.6% 0.0% number of processed stacks 3787752 10.3% 89.7% 0.0% average stack size 1.0 29.7 0.0 marketing flops 2.107592E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 268.959744E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 1042416 MPI messages size (bytes): total size 150.443262E+09 min size 0.000000E+00 max size 1.188816E+06 average size 144.321719E+03 MPI breakdown and total messages size (bytes): size <= 128 228256 0 128 < size <= 8192 126888 1039466496 8192 < size <= 32768 191472 3137077248 32768 < size <= 131072 295800 25899827200 131072 < size <= 4194304 200000 120367247040 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3672 62653. MP_Allreduce 10304 342. MP_Sync 54 MP_Alltoall 1582 2412273. MP_SendRecv 8211 74133. MP_ISendRecv 8211 74133. MP_Wait 16271 MP_comm_split 50 MP_ISend 7280 135929. MP_IRecv 7280 135929. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.108 0.261 42.221 42.225 qs_mol_dyn_low 1 2.0 0.043 0.101 40.883 40.891 qs_forces 11 3.9 0.002 0.003 40.328 40.430 qs_energies 11 4.9 0.001 0.001 38.308 38.414 scf_env_do_scf 11 5.9 0.001 0.003 32.913 32.914 scf_env_do_scf_inner_loop 108 6.5 0.003 0.028 27.811 27.837 velocity_verlet 10 3.0 0.001 0.002 21.608 21.616 qs_scf_new_mos 108 7.5 0.001 0.001 11.919 12.057 qs_scf_loop_do_ot 108 8.5 0.001 0.001 11.918 12.057 ot_scf_mini 108 9.5 0.003 0.004 11.286 11.410 dbcsr_multiply_generic 2286 12.5 0.145 0.208 10.716 11.143 rebuild_ks_matrix 119 8.3 0.001 0.001 9.688 9.853 qs_ks_build_kohn_sham_matrix 119 9.3 0.014 0.016 9.688 9.852 qs_rho_update_rho_low 119 7.7 0.001 0.001 9.353 9.358 calculate_rho_elec 119 8.7 0.051 0.058 9.353 9.357 qs_ks_update_qs_env 119 7.6 0.001 0.001 8.659 8.808 sum_up_and_integrate 119 10.3 0.001 0.002 7.176 7.184 integrate_v_rspace 119 11.3 0.003 0.003 7.164 7.172 grid_collocate_task_list 119 9.7 6.694 6.848 6.694 6.848 ot_mini 108 10.5 0.001 0.001 6.193 6.331 multiply_cannon 2286 13.5 0.228 0.267 5.025 6.111 init_scf_loop 11 6.9 0.000 0.000 5.076 5.103 multiply_cannon_loop 2286 14.5 0.239 0.340 3.926 5.050 grid_integrate_task_list 119 12.3 4.962 5.047 4.962 5.047 qs_ot_get_derivative 108 11.5 0.001 0.001 4.780 4.907 init_scf_run 11 5.9 0.000 0.005 4.009 4.010 scf_env_initial_rho_setup 11 6.9 0.000 0.004 4.008 4.009 make_m2s 4572 13.5 0.096 0.135 3.628 3.735 prepare_preconditioner 11 7.9 0.000 0.000 3.688 3.697 make_preconditioner 11 8.9 0.000 0.000 3.688 3.697 make_full_inverse_cholesky 11 9.9 0.000 0.000 3.224 3.582 qs_ot_get_p 119 10.4 0.001 0.001 3.441 3.572 multiply_cannon_multrec 27432 15.5 2.430 3.513 2.444 3.538 mp_waitall_1 137007 16.6 1.799 3.080 1.799 3.080 make_images 4572 14.5 0.338 0.393 2.835 2.900 density_rs2pw 119 9.7 0.005 0.009 2.361 2.502 fft_wrap_pw1pw2 1201 11.6 0.014 0.017 2.353 2.383 calculate_first_density_matrix 1 7.0 0.000 0.000 1.996 1.996 fft_wrap_pw1pw2_140 487 12.2 0.110 0.115 1.900 1.932 fft3d_ps 1201 13.6 0.708 0.762 1.903 1.927 multiply_cannon_metrocomm3 27432 15.5 0.052 0.108 0.546 1.887 potential_pw2rs 119 12.3 0.012 0.015 1.821 1.827 qs_ot_p2m_diag 50 11.0 0.014 0.023 1.759 1.787 mp_sum_l 11298 13.2 1.268 1.751 1.268 1.751 transfer_rs2pw 487 10.6 0.005 0.007 1.466 1.727 qs_ot_get_derivative_diag 49 12.0 0.001 0.002 1.651 1.703 qs_ot_get_derivative_taylor 59 13.0 0.002 0.003 1.591 1.667 wfi_extrapolate 11 7.9 0.001 0.001 1.597 1.598 cp_dbcsr_syevd 50 12.0 0.003 0.004 1.473 1.480 cp_fm_cholesky_invert 11 10.9 1.438 1.448 1.438 1.448 dbcsr_complete_redistribute 329 12.2 0.185 0.257 1.133 1.426 ot_diis_step 108 11.5 0.015 0.017 1.358 1.359 apply_preconditioner_dbcsr 119 12.6 0.000 0.000 1.137 1.296 apply_single 119 13.6 0.000 0.001 1.137 1.295 qs_energies_init_hamiltonians 11 5.9 0.000 0.000 1.174 1.275 cp_fm_upper_to_full 72 13.8 0.893 1.223 0.893 1.223 cp_fm_diag_elpa 50 13.0 0.000 0.000 1.181 1.182 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 1.146 1.163 cp_fm_redistribute_end 50 14.0 0.583 1.155 0.588 1.157 transfer_pw2rs 487 13.2 0.004 0.005 1.147 1.153 cp_fm_diag_elpa_base 50 14.0 0.526 1.071 0.567 1.136 make_images_data 4572 15.5 0.065 0.120 1.035 1.127 mp_waitany 7280 13.7 0.859 1.123 0.859 1.123 copy_fm_to_dbcsr 176 11.2 0.001 0.002 0.798 1.096 make_basis_sm 11 9.8 0.000 0.000 1.071 1.074 mp_alltoall_z22v 1201 15.6 1.048 1.071 1.048 1.071 transfer_rs2pw_140 130 11.5 0.122 0.139 0.796 1.045 mp_alltoall_d11v 2130 13.8 0.894 1.037 0.894 1.037 hybrid_alltoall_any 4725 16.4 0.078 0.195 0.893 1.032 cp_fm_cholesky_decompose 22 10.9 1.003 1.016 1.003 1.016 make_images_sizes 4572 15.5 0.006 0.014 0.820 0.983 mp_alltoall_i44 4572 16.5 0.814 0.977 0.814 0.977 dbcsr_make_images_dense 3978 14.8 0.066 0.103 0.575 0.937 arnoldi_extremal 119 11.4 0.002 0.003 0.855 0.930 arnoldi_normal_ev 119 12.4 0.002 0.004 0.852 0.928 mp_alltoall_i22 627 13.8 0.580 0.917 0.580 0.917 calculate_atomic_block_dm 1 8.0 0.001 0.001 0.908 0.909 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="108", plot="h2o_64_md", label="(4n/6r/6t)", y=42.225000, yerr=0.000000 PlotPoint: name="109", plot="h2o_64_md_mem", label="(4n/6r/6t)", y=255.636364, yerr=2.143605 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/08/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 207618048 0.0% 100.0% 0.0% flops 22 x 32 x 32 253755392 0.0% 100.0% 0.0% flops 40 x 64 x 64 10863247360 0.0% 100.0% 0.0% flops 40 x 64 x 640 24956108800 0.0% 100.0% 0.0% flops 32 x 64 x 64 26877100032 0.0% 100.0% 0.0% flops 9 x 9 x 64 44168260608 0.0% 100.0% 0.0% flops 71 x 64 x 64 48205660160 0.0% 100.0% 0.0% flops 22 x 9 x 64 53835724800 0.0% 100.0% 0.0% flops 9 x 22 x 64 53885500416 0.0% 100.0% 0.0% flops 80 x 64 x 64 54316236800 0.0% 100.0% 0.0% flops 22 x 22 x 64 67007283200 0.0% 100.0% 0.0% flops 71 x 64 x 640 110742732800 0.0% 100.0% 0.0% flops 80 x 64 x 640 124780544000 0.0% 100.0% 0.0% flops 32 x 64 x 640 141264158720 0.0% 100.0% 0.0% flops 9 x 32 x 9 185405884416 0.0% 100.0% 0.0% flops 22 x 32 x 9 227871249408 0.0% 100.0% 0.0% flops 9 x 32 x 22 227871249408 0.0% 100.0% 0.0% flops 22 x 32 x 22 279130931200 0.0% 100.0% 0.0% flops inhomo. stacks 261929041920 100.0% 0.0% 0.0% flops total 1.943572E+12 13.5% 86.5% 0.0% flops max/rank 122.902337E+09 13.9% 86.1% 0.0% matmuls inhomo. stacks 122304 100.0% 0.0% 0.0% matmuls total 90872996 0.1% 99.9% 0.0% number of processed stacks 1449216 8.4% 91.6% 0.0% average stack size 1.0 68.4 0.0 marketing flops 2.107587E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 328.048640E+06 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 219456 MPI messages size (bytes): total size 97.042514E+09 min size 0.000000E+00 max size 3.276800E+06 average size 442.195750E+03 MPI breakdown and total messages size (bytes): size <= 128 1452 0 128 < size <= 8192 0 0 8192 < size <= 32768 101892 3336634368 32768 < size <= 131072 0 0 131072 < size <= 4194304 116112 93705670464 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 19 12. MP_Allreduce 12177 16. MP_Alltoall 8655 62672. MP_ISend 36532 167957. MP_IRecv 36532 167930. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3622 63488. MP_Allreduce 10154 346. MP_Sync 54 MP_Alltoall 1582 3682667. MP_SendRecv 5355 94533. MP_ISendRecv 5355 94533. MP_Wait 11335 MP_ISend 5200 225425. MP_IRecv 5200 225425. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.194 0.380 39.019 39.029 qs_mol_dyn_low 1 2.0 0.271 0.436 37.510 37.580 qs_forces 11 3.9 0.013 0.024 36.176 36.360 qs_energies 11 4.9 0.018 0.033 34.235 34.428 scf_env_do_scf 11 5.9 0.001 0.004 28.872 28.873 scf_env_do_scf_inner_loop 108 6.5 0.005 0.023 25.154 25.155 velocity_verlet 10 3.0 0.015 0.029 19.703 19.740 qs_scf_new_mos 108 7.5 0.001 0.001 9.592 9.620 qs_scf_loop_do_ot 108 8.5 0.004 0.014 9.591 9.619 rebuild_ks_matrix 119 8.3 0.001 0.001 9.468 9.520 qs_ks_build_kohn_sham_matrix 119 9.3 0.016 0.023 9.468 9.520 ot_scf_mini 108 9.5 0.010 0.017 9.096 9.124 qs_rho_update_rho_low 119 7.7 0.002 0.004 9.038 9.056 calculate_rho_elec 119 8.7 0.079 0.092 9.036 9.055 qs_ks_update_qs_env 119 7.6 0.001 0.001 8.503 8.554 dbcsr_multiply_generic 2286 12.5 0.142 0.159 7.895 7.990 sum_up_and_integrate 119 10.3 0.001 0.002 6.867 6.880 integrate_v_rspace 119 11.3 0.003 0.003 6.855 6.869 grid_collocate_task_list 119 9.7 5.909 6.258 5.909 6.258 grid_integrate_task_list 119 12.3 4.544 4.708 4.544 4.708 ot_mini 108 10.5 0.001 0.002 4.456 4.487 multiply_cannon 2286 13.5 0.216 0.221 3.833 4.387 init_scf_loop 11 6.9 0.003 0.005 3.691 3.693 init_scf_run 11 5.9 0.001 0.005 3.482 3.482 scf_env_initial_rho_setup 11 6.9 0.008 0.017 3.481 3.482 density_rs2pw 119 9.7 0.005 0.005 2.750 3.461 qs_ot_get_p 119 10.4 0.001 0.001 3.302 3.344 multiply_cannon_loop 2286 14.5 0.094 0.110 3.097 3.293 qs_ot_get_derivative 108 11.5 0.001 0.002 3.076 3.102 fft_wrap_pw1pw2 1201 11.6 0.013 0.015 2.916 2.983 make_m2s 4572 13.5 0.070 0.081 2.697 2.828 transfer_rs2pw 487 10.6 0.005 0.007 1.523 2.544 fft_wrap_pw1pw2_140 487 12.2 0.147 0.163 2.371 2.508 fft3d_ps 1201 13.6 0.827 0.984 2.392 2.433 multiply_cannon_multrec 9144 15.5 2.231 2.401 2.245 2.415 prepare_preconditioner 11 7.9 0.000 0.000 2.390 2.400 make_preconditioner 11 8.9 0.003 0.011 2.390 2.400 make_full_inverse_cholesky 11 9.9 0.000 0.000 2.229 2.285 make_images 4572 14.5 0.322 0.357 2.089 2.220 mp_waitany 5200 13.7 1.021 1.993 1.021 1.993 transfer_rs2pw_140 130 11.5 0.113 0.133 0.945 1.956 qs_ot_p2m_diag 50 11.0 0.021 0.024 1.894 1.922 qs_energies_init_hamiltonians 11 5.9 0.001 0.004 1.609 1.800 potential_pw2rs 119 12.3 0.014 0.015 1.750 1.753 cp_dbcsr_syevd 50 12.0 0.003 0.004 1.715 1.744 calculate_first_density_matrix 1 7.0 0.015 0.024 1.725 1.729 mp_waitall_1 115863 16.7 1.509 1.723 1.509 1.723 wfi_extrapolate 11 7.9 0.001 0.001 1.538 1.538 mp_alltoall_z22v 1201 15.6 1.382 1.500 1.382 1.500 mp_alltoall_d11v 2130 13.8 1.123 1.412 1.123 1.412 cp_fm_diag_elpa 50 13.0 0.000 0.000 1.401 1.401 cp_fm_diag_elpa_base 50 14.0 1.354 1.373 1.398 1.398 ot_diis_step 108 11.5 0.062 0.157 1.364 1.366 qs_env_update_s_mstruct 11 6.9 0.001 0.002 0.927 1.281 make_images_data 4572 15.5 0.056 0.060 0.982 1.146 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 1.118 1.121 cp_fm_cholesky_invert 11 10.9 1.094 1.104 1.094 1.104 make_basis_sm 11 9.8 0.000 0.000 1.099 1.104 hybrid_alltoall_any 4725 16.4 0.083 0.209 0.922 1.096 qs_ot_get_derivative_taylor 59 13.0 0.002 0.002 1.031 1.055 apply_preconditioner_dbcsr 119 12.6 0.007 0.014 1.014 1.050 apply_single 119 13.6 0.000 0.000 1.007 1.049 dbcsr_complete_redistribute 329 12.2 0.244 0.325 0.958 1.031 qs_ot_get_derivative_diag 49 12.0 0.001 0.001 1.003 1.022 transfer_pw2rs 487 13.2 0.004 0.004 0.963 0.965 yz_to_x 606 14.1 0.070 0.095 0.855 0.919 mp_sum_d 4137 12.0 0.610 0.889 0.610 0.889 arnoldi_extremal 119 11.4 0.002 0.002 0.742 0.874 arnoldi_normal_ev 119 12.4 0.003 0.004 0.740 0.872 calculate_rho_core 11 7.9 0.037 0.051 0.465 0.861 rs_gather_matrices 119 12.3 0.053 0.065 0.524 0.834 mp_allgather_i34 2286 14.5 0.296 0.822 0.296 0.822 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="110", plot="h2o_64_md", label="(4n/4r/9t)", y=39.029000, yerr=0.000000 PlotPoint: name="111", plot="h2o_64_md_mem", label="(4n/4r/9t)", y=309.090909, yerr=4.737751 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/09/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 207618048 0.0% 100.0% 0.0% flops 22 x 32 x 32 253755392 0.0% 100.0% 0.0% flops 89 x 32 x 409 2217840128 0.0% 100.0% 0.0% flops 89 x 64 x 409 2217840128 0.0% 100.0% 0.0% flops 89 x 32 x 418 2266643456 0.0% 100.0% 0.0% flops 89 x 64 x 418 2266643456 0.0% 100.0% 0.0% flops 107 x 64 x 409 2666392064 0.0% 100.0% 0.0% flops 107 x 32 x 409 2666392064 0.0% 100.0% 0.0% flops 107 x 64 x 418 2725065728 0.0% 100.0% 0.0% flops 107 x 32 x 418 2725065728 0.0% 100.0% 0.0% flops 120 x 32 x 409 2990346240 0.0% 100.0% 0.0% flops 120 x 64 x 409 2990346240 0.0% 100.0% 0.0% flops 89 x 32 x 32 3021340672 0.0% 100.0% 0.0% flops 89 x 32 x 64 3021340672 0.0% 100.0% 0.0% flops 89 x 64 x 32 3021340672 0.0% 100.0% 0.0% flops 89 x 64 x 64 3021340672 0.0% 100.0% 0.0% flops 120 x 32 x 418 3056148480 0.0% 100.0% 0.0% flops 120 x 64 x 418 3056148480 0.0% 100.0% 0.0% flops 142 x 64 x 409 3538576384 0.0% 100.0% 0.0% flops 142 x 32 x 409 3538576384 0.0% 100.0% 0.0% flops 142 x 64 x 418 3616442368 0.0% 100.0% 0.0% flops 142 x 32 x 418 3616442368 0.0% 100.0% 0.0% flops 107 x 64 x 64 3632398336 0.0% 100.0% 0.0% flops 107 x 64 x 32 3632398336 0.0% 100.0% 0.0% flops 107 x 32 x 32 3632398336 0.0% 100.0% 0.0% flops 107 x 32 x 64 3632398336 0.0% 100.0% 0.0% flops 120 x 32 x 32 4073717760 0.0% 100.0% 0.0% flops 120 x 32 x 64 4073717760 0.0% 100.0% 0.0% flops 120 x 64 x 32 4073717760 0.0% 100.0% 0.0% flops 120 x 64 x 64 4073717760 0.0% 100.0% 0.0% flops 142 x 64 x 64 4820566016 0.0% 100.0% 0.0% flops 142 x 64 x 32 4820566016 0.0% 100.0% 0.0% flops 142 x 32 x 32 4820566016 0.0% 100.0% 0.0% flops 142 x 32 x 64 4820566016 0.0% 100.0% 0.0% flops 111 x 64 x 409 5532140544 0.0% 100.0% 0.0% flops 111 x 32 x 409 5532140544 0.0% 100.0% 0.0% flops 111 x 64 x 418 5653874688 0.0% 100.0% 0.0% flops 111 x 32 x 418 5653874688 0.0% 100.0% 0.0% flops 32 x 64 x 64 6719275008 0.0% 100.0% 0.0% flops 32 x 64 x 32 6719275008 0.0% 100.0% 0.0% flops 32 x 32 x 32 6719275008 0.0% 100.0% 0.0% flops 32 x 32 x 64 6719275008 0.0% 100.0% 0.0% flops 89 x 32 x 431 7011411456 0.0% 100.0% 0.0% flops 89 x 64 x 431 7011411456 0.0% 100.0% 0.0% flops 111 x 64 x 64 7536377856 0.0% 100.0% 0.0% flops 111 x 64 x 32 7536377856 0.0% 100.0% 0.0% flops 111 x 32 x 32 7536377856 0.0% 100.0% 0.0% flops 111 x 32 x 64 7536377856 0.0% 100.0% 0.0% flops 107 x 64 x 431 8429449728 0.0% 100.0% 0.0% flops 107 x 32 x 431 8429449728 0.0% 100.0% 0.0% flops 120 x 32 x 431 9453588480 0.0% 100.0% 0.0% flops 120 x 64 x 431 9453588480 0.0% 100.0% 0.0% flops 142 x 64 x 431 11186746368 0.0% 100.0% 0.0% flops 142 x 32 x 431 11186746368 0.0% 100.0% 0.0% flops 32 x 64 x 409 11284578304 0.0% 100.0% 0.0% flops 32 x 32 x 409 11284578304 0.0% 100.0% 0.0% flops 32 x 64 x 418 11532894208 0.0% 100.0% 0.0% flops 32 x 32 x 418 11532894208 0.0% 100.0% 0.0% flops 111 x 64 x 431 17489138688 0.0% 100.0% 0.0% flops 111 x 32 x 431 17489138688 0.0% 100.0% 0.0% flops 9 x 9 x 64 22084130304 0.0% 100.0% 0.0% flops 9 x 9 x 32 22084130304 0.0% 100.0% 0.0% flops 22 x 9 x 64 26917862400 0.0% 100.0% 0.0% flops 22 x 9 x 32 26917862400 0.0% 100.0% 0.0% flops 9 x 22 x 64 26942750208 0.0% 100.0% 0.0% flops 9 x 22 x 32 26942750208 0.0% 100.0% 0.0% flops 22 x 22 x 64 33503641600 0.0% 100.0% 0.0% flops 22 x 22 x 32 33503641600 0.0% 100.0% 0.0% flops 32 x 64 x 431 35674718208 0.0% 100.0% 0.0% flops 32 x 32 x 431 35674718208 0.0% 100.0% 0.0% flops 9 x 32 x 9 185405884416 0.0% 100.0% 0.0% flops 22 x 32 x 9 227871249408 0.0% 100.0% 0.0% flops 9 x 32 x 22 227871249408 0.0% 100.0% 0.0% flops 22 x 32 x 22 279130931200 0.0% 100.0% 0.0% flops inhomo. stacks 555803951104 100.0% 0.0% 0.0% flops total 2.107308E+12 26.4% 73.6% 0.0% flops max/rank 262.290333E+09 28.3% 71.7% 0.0% matmuls inhomo. stacks 303960 100.0% 0.0% 0.0% matmuls total 96047982 0.3% 99.7% 0.0% number of processed stacks 2272806 13.4% 86.6% 0.0% average stack size 1.0 48.6 0.0 marketing flops 2.107592E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 396.177408E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 219456 MPI messages size (bytes): total size 139.149754E+09 min size 0.000000E+00 max size 4.537280E+06 average size 634.066750E+03 MPI breakdown and total messages size (bytes): size <= 128 1386 0 128 < size <= 8192 21148 173244416 8192 < size <= 32768 58442 1568899072 32768 < size <= 131072 38700 3527147520 131072 < size <= 4194304 78110 38969991440 4194304 < size <= 16777216 21670 94910778800 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3672 62884. MP_Allreduce 10304 424. MP_Sync 54 MP_Alltoall 1582 4823651. MP_SendRecv 3927 131600. MP_ISendRecv 3927 131600. MP_Wait 8867 MP_comm_split 50 MP_ISend 4160 325000. MP_IRecv 4160 325000. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.276 0.338 47.037 47.054 qs_mol_dyn_low 1 2.0 0.196 0.240 45.754 45.773 qs_forces 11 3.9 0.094 0.157 44.605 44.649 qs_energies 11 4.9 0.020 0.052 42.267 42.346 scf_env_do_scf 11 5.9 0.001 0.003 36.340 36.341 scf_env_do_scf_inner_loop 108 6.5 0.005 0.027 29.270 29.307 velocity_verlet 10 3.0 0.033 0.074 25.063 25.129 qs_scf_new_mos 108 7.5 0.001 0.001 12.177 12.255 qs_scf_loop_do_ot 108 8.5 0.001 0.001 12.176 12.254 ot_scf_mini 108 9.5 0.007 0.020 11.561 11.657 dbcsr_multiply_generic 2286 12.5 0.139 0.145 10.930 11.051 rebuild_ks_matrix 119 8.3 0.001 0.001 10.481 10.579 qs_ks_build_kohn_sham_matrix 119 9.3 0.017 0.031 10.480 10.578 qs_rho_update_rho_low 119 7.7 0.002 0.004 10.166 10.174 calculate_rho_elec 119 8.7 0.101 0.108 10.163 10.173 qs_ks_update_qs_env 119 7.6 0.001 0.001 9.484 9.590 grid_collocate_task_list 119 9.7 7.310 7.441 7.310 7.441 sum_up_and_integrate 119 10.3 0.002 0.003 7.221 7.228 integrate_v_rspace 119 11.3 0.003 0.003 7.209 7.217 init_scf_loop 11 6.9 0.021 0.050 7.039 7.052 multiply_cannon 2286 13.5 0.232 0.247 5.807 6.772 ot_mini 108 10.5 0.020 0.076 5.773 5.864 prepare_preconditioner 11 7.9 0.000 0.000 5.331 5.336 make_preconditioner 11 8.9 0.001 0.004 5.331 5.336 multiply_cannon_loop 2286 14.5 0.135 0.141 4.764 5.234 make_full_inverse_cholesky 11 9.9 0.000 0.000 4.583 5.203 grid_integrate_task_list 119 12.3 4.957 5.082 4.957 5.082 mp_waitall_1 114435 16.7 3.093 4.424 3.093 4.424 qs_ot_get_p 119 10.4 0.001 0.002 3.810 3.916 make_m2s 4572 13.5 0.083 0.087 3.531 3.862 init_scf_run 11 5.9 0.001 0.004 3.735 3.736 scf_env_initial_rho_setup 11 6.9 0.003 0.007 3.735 3.736 qs_ot_get_derivative 108 11.5 0.014 0.019 3.598 3.685 multiply_cannon_multrec 13716 15.5 2.292 3.543 2.309 3.562 multiply_cannon_metrocomm3 13716 15.5 0.044 0.045 1.752 3.038 make_images 4572 14.5 0.443 0.488 2.610 2.925 fft_wrap_pw1pw2 1201 11.6 0.014 0.016 2.648 2.693 density_rs2pw 119 9.7 0.005 0.005 2.346 2.602 cp_fm_upper_to_full 72 13.8 1.846 2.398 1.846 2.398 qs_ot_p2m_diag 50 11.0 0.027 0.043 2.281 2.299 fft_wrap_pw1pw2_140 487 12.2 0.174 0.176 2.249 2.291 ot_diis_step 108 11.5 0.051 0.081 2.140 2.159 fft3d_ps 1201 13.6 0.888 0.948 2.083 2.125 cp_dbcsr_syevd 50 12.0 0.003 0.004 2.077 2.078 qs_energies_init_hamiltonians 11 5.9 0.001 0.002 1.897 1.972 dbcsr_complete_redistribute 329 12.2 0.260 0.294 1.413 1.877 apply_preconditioner_dbcsr 119 12.6 0.001 0.002 1.754 1.872 apply_single 119 13.6 0.000 0.000 1.752 1.870 wfi_extrapolate 11 7.9 0.005 0.016 1.848 1.849 cp_fm_diag_elpa 50 13.0 0.000 0.000 1.776 1.777 cp_fm_cholesky_invert 11 10.9 1.755 1.771 1.755 1.771 cp_fm_redistribute_end 50 14.0 0.884 1.754 0.886 1.756 cp_fm_diag_elpa_base 50 14.0 0.689 1.438 0.863 1.728 make_images_data 4572 15.5 0.062 0.065 1.373 1.682 hybrid_alltoall_any 4725 16.4 0.096 0.157 1.286 1.643 transfer_rs2pw 487 10.6 0.006 0.006 1.265 1.607 potential_pw2rs 119 12.3 0.016 0.018 1.566 1.568 mp_alltoall_d11v 2130 13.8 1.354 1.487 1.354 1.487 copy_fm_to_dbcsr 176 11.2 0.001 0.001 0.996 1.467 calculate_first_density_matrix 1 7.0 0.001 0.003 1.423 1.424 qs_env_update_s_mstruct 11 6.9 0.001 0.001 1.220 1.402 mp_waitany 4160 13.7 0.926 1.252 0.926 1.252 qs_ot_get_derivative_diag 49 12.0 0.001 0.001 1.201 1.239 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 1.219 1.225 mp_alltoall_i22 627 13.8 0.732 1.216 0.732 1.216 qs_ot_get_derivative_taylor 59 13.0 0.002 0.002 1.161 1.214 make_basis_sm 11 9.8 0.000 0.000 1.184 1.186 transfer_rs2pw_140 130 11.5 0.111 0.118 0.839 1.185 transfer_fm_to_dbcsr 11 9.9 0.000 0.000 0.707 1.181 multiply_cannon_metrocomm4 11430 15.5 0.042 0.045 0.472 1.133 mp_alltoall_z22v 1201 15.6 1.056 1.101 1.056 1.101 cp_fm_cholesky_decompose 22 10.9 1.027 1.048 1.027 1.048 mp_irecv_dv 29167 16.4 0.405 1.039 0.405 1.039 mp_allgather_i34 2286 14.5 0.410 0.994 0.410 0.994 build_core_hamiltonian_matrix_ 11 4.9 0.001 0.001 0.888 0.959 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="112", plot="h2o_64_md", label="(4n/3r/12t)", y=47.054000, yerr=0.000000 PlotPoint: name="113", plot="h2o_64_md_mem", label="(4n/3r/12t)", y=374.545455, yerr=5.630495 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/10/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 207618048 0.0% 100.0% 0.0% flops 22 x 32 x 32 253755392 0.0% 100.0% 0.0% flops 58 x 64 x 64 15751708672 0.0% 100.0% 0.0% flops 32 x 64 x 64 26877100032 0.0% 100.0% 0.0% flops 58 x 64 x 640 36186357760 0.0% 100.0% 0.0% flops 71 x 64 x 64 38564528128 0.0% 100.0% 0.0% flops 80 x 64 x 64 43452989440 0.0% 100.0% 0.0% flops 9 x 9 x 64 44168260608 0.0% 100.0% 0.0% flops 22 x 9 x 64 53835724800 0.0% 100.0% 0.0% flops 9 x 22 x 64 53885500416 0.0% 100.0% 0.0% flops 22 x 22 x 64 67007283200 0.0% 100.0% 0.0% flops 71 x 64 x 640 88594186240 0.0% 100.0% 0.0% flops 80 x 64 x 640 99824435200 0.0% 100.0% 0.0% flops 32 x 64 x 640 141264158720 0.0% 100.0% 0.0% flops 9 x 32 x 9 185405884416 0.0% 100.0% 0.0% flops 22 x 32 x 9 227871249408 0.0% 100.0% 0.0% flops 9 x 32 x 22 227871249408 0.0% 100.0% 0.0% flops 22 x 32 x 22 279130931200 0.0% 100.0% 0.0% flops inhomo. stacks 342522593280 100.0% 0.0% 0.0% flops total 1.972676E+12 17.4% 82.6% 0.0% flops max/rank 253.383579E+09 18.0% 82.0% 0.0% matmuls inhomo. stacks 152880 100.0% 0.0% 0.0% matmuls total 90862804 0.2% 99.8% 0.0% number of processed stacks 1405732 10.9% 89.1% 0.0% average stack size 1.0 72.4 0.0 marketing flops 2.107587E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 982.827008E+06 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 91440 MPI messages size (bytes): total size 85.748679E+09 min size 0.000000E+00 max size 6.553600E+06 average size 937.758938E+03 MPI breakdown and total messages size (bytes): size <= 128 572 0 128 < size <= 8192 0 0 8192 < size <= 32768 21148 692256768 32768 < size <= 131072 19224 1259864064 131072 < size <= 4194304 41040 21941452800 4194304 < size <= 16777216 9456 61855174464 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3622 63723. MP_Allreduce 10154 429. MP_Sync 54 MP_Alltoall 1582 7383731. MP_SendRecv 2499 189067. MP_ISendRecv 2499 189067. MP_Wait 6399 MP_ISend 3120 546875. MP_IRecv 3120 546875. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.286 0.316 51.826 51.832 qs_mol_dyn_low 1 2.0 0.267 0.290 49.655 49.671 qs_forces 11 3.9 0.072 0.108 48.776 48.807 qs_energies 11 4.9 0.070 0.076 46.310 46.360 scf_env_do_scf 11 5.9 0.005 0.011 39.982 39.983 scf_env_do_scf_inner_loop 108 6.5 0.021 0.048 30.250 30.254 velocity_verlet 10 3.0 0.003 0.004 28.609 28.629 qs_scf_new_mos 108 7.5 0.001 0.001 11.964 12.036 qs_scf_loop_do_ot 108 8.5 0.002 0.002 11.963 12.035 ot_scf_mini 108 9.5 0.007 0.009 11.300 11.365 qs_rho_update_rho_low 119 7.7 0.001 0.001 10.931 10.943 calculate_rho_elec 119 8.7 0.149 0.152 10.930 10.942 dbcsr_multiply_generic 2286 12.5 0.141 0.148 10.698 10.840 rebuild_ks_matrix 119 8.3 0.000 0.000 10.724 10.811 qs_ks_build_kohn_sham_matrix 119 9.3 0.014 0.014 10.724 10.810 qs_ks_update_qs_env 119 7.6 0.001 0.001 9.681 9.746 init_scf_loop 11 6.9 0.068 0.084 9.696 9.699 prepare_preconditioner 11 7.9 0.000 0.000 8.042 8.062 make_preconditioner 11 8.9 0.001 0.002 8.042 8.062 grid_collocate_task_list 119 9.7 7.764 7.955 7.764 7.955 make_full_inverse_cholesky 11 9.9 0.000 0.000 6.585 7.893 sum_up_and_integrate 119 10.3 0.001 0.001 7.226 7.238 integrate_v_rspace 119 11.3 0.003 0.003 7.215 7.226 multiply_cannon 2286 13.5 0.241 0.250 5.454 6.006 ot_mini 108 10.5 0.010 0.011 5.791 5.863 grid_integrate_task_list 119 12.3 5.073 5.181 5.073 5.181 multiply_cannon_loop 2286 14.5 0.099 0.110 4.550 4.743 cp_fm_upper_to_full 72 14.2 3.153 4.507 3.153 4.507 mp_waitall_1 94719 16.7 2.993 4.030 2.993 4.030 make_m2s 4572 13.5 0.072 0.074 3.659 3.967 init_scf_run 11 5.9 0.002 0.005 3.896 3.896 scf_env_initial_rho_setup 11 6.9 0.022 0.037 3.894 3.896 qs_ot_get_derivative 108 11.5 0.001 0.002 3.735 3.806 qs_ot_get_p 119 10.4 0.001 0.001 3.511 3.555 dbcsr_complete_redistribute 329 12.2 0.396 0.454 2.333 3.147 fft_wrap_pw1pw2 1201 11.6 0.013 0.015 3.059 3.101 make_images 4572 14.5 0.542 0.564 2.688 2.993 multiply_cannon_multrec 9144 15.5 2.469 2.830 2.491 2.850 density_rs2pw 119 9.7 0.005 0.005 2.596 2.772 fft_wrap_pw1pw2_140 487 12.2 0.244 0.247 2.622 2.664 copy_fm_to_dbcsr 176 11.2 0.001 0.001 1.839 2.653 fft3d_ps 1201 13.6 1.066 1.082 2.381 2.425 multiply_cannon_metrocomm3 9144 15.5 0.026 0.027 1.568 2.316 transfer_fm_to_dbcsr 11 9.9 0.000 0.001 1.449 2.262 mp_alltoall_i22 627 13.8 1.465 2.251 1.465 2.251 qs_ot_p2m_diag 50 11.0 0.039 0.043 2.128 2.140 qs_energies_init_hamiltonians 11 5.9 0.003 0.006 2.080 2.127 wfi_extrapolate 11 7.9 0.001 0.001 2.052 2.052 ot_diis_step 108 11.5 0.071 0.096 2.036 2.036 cp_fm_cholesky_invert 11 10.9 2.025 2.032 2.025 2.032 cp_dbcsr_syevd 50 12.0 0.003 0.003 1.922 1.931 hybrid_alltoall_any 4725 16.4 0.111 0.184 1.430 1.798 make_images_data 4572 15.5 0.059 0.062 1.428 1.764 apply_preconditioner_dbcsr 119 12.6 0.024 0.069 1.720 1.757 apply_single 119 13.6 0.000 0.000 1.696 1.757 calculate_first_density_matrix 1 7.0 0.008 0.019 1.665 1.678 cp_fm_diag_elpa 50 13.0 0.000 0.000 1.604 1.605 cp_fm_diag_elpa_base 50 14.0 1.478 1.495 1.603 1.603 potential_pw2rs 119 12.3 0.017 0.018 1.571 1.574 mp_alltoall_d11v 2130 13.8 1.414 1.472 1.414 1.472 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 1.302 1.315 qs_env_update_s_mstruct 11 6.9 0.001 0.001 1.218 1.280 qs_ot_get_derivative_taylor 59 13.0 0.002 0.002 1.229 1.272 make_basis_sm 11 9.8 0.000 0.000 1.220 1.222 qs_ot_get_derivative_diag 49 12.0 0.001 0.001 1.188 1.219 cp_fm_cholesky_decompose 22 10.9 1.195 1.200 1.195 1.200 mp_alltoall_z22v 1201 15.6 1.143 1.175 1.143 1.175 transfer_rs2pw 487 10.6 0.006 0.007 1.006 1.136 cp_dbcsr_sm_fm_multiply 37 9.5 0.001 0.001 1.052 1.053 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="114", plot="h2o_64_md", label="(4n/2r/18t)", y=51.832000, yerr=0.000000 PlotPoint: name="115", plot="h2o_64_md_mem", label="(4n/2r/18t)", y=818.363636, yerr=108.800295 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/11/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 207618048 0.0% 100.0% 0.0% flops 22 x 32 x 32 253755392 0.0% 100.0% 0.0% flops 18 x 128 x 128 8554807296 0.0% 100.0% 0.0% flops 18 x 128 x 1280 19652935680 0.0% 100.0% 0.0% flops 53 x 128 x 128 21590704128 0.0% 100.0% 0.0% flops 32 x 128 x 128 26877100032 0.0% 100.0% 0.0% flops 31 x 128 x 128 31571312640 0.0% 100.0% 0.0% flops 9 x 9 x 128 44168260608 0.0% 100.0% 0.0% flops 40 x 128 x 128 46168801280 0.0% 100.0% 0.0% flops 53 x 128 x 1280 49600266240 0.0% 100.0% 0.0% flops 22 x 9 x 128 53835724800 0.0% 100.0% 0.0% flops 9 x 22 x 128 53885500416 0.0% 100.0% 0.0% flops 22 x 22 x 128 67007283200 0.0% 100.0% 0.0% flops 31 x 128 x 1280 72528691200 0.0% 100.0% 0.0% flops 40 x 128 x 1280 106063462400 0.0% 100.0% 0.0% flops 32 x 128 x 1280 141264158720 0.0% 100.0% 0.0% flops 9 x 32 x 9 185405884416 0.0% 100.0% 0.0% flops 22 x 32 x 9 227871249408 0.0% 100.0% 0.0% flops 9 x 32 x 22 227871249408 0.0% 100.0% 0.0% flops 22 x 32 x 22 279130931200 0.0% 100.0% 0.0% flops inhomo. stacks 320807108608 100.0% 0.0% 0.0% flops total 1.984317E+12 16.2% 83.8% 0.0% flops max/rank 515.157433E+09 16.2% 83.8% 0.0% matmuls inhomo. stacks 63700 100.0% 0.0% 0.0% matmuls total 85771122 0.1% 99.9% 0.0% number of processed stacks 614366 10.4% 89.6% 0.0% average stack size 1.0 155.6 0.0 marketing flops 2.107587E+12 ------------------------------------------------------------------------------- # multiplications 2286 max memory usage/rank 4.621623E+09 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 18288 MPI messages size (bytes): total size 32.347431E+09 min size 0.000000E+00 max size 13.107200E+06 average size 1.768779E+06 MPI breakdown and total messages size (bytes): size <= 128 110 0 128 < size <= 8192 0 0 8192 < size <= 32768 22 720896 32768 < size <= 131072 8480 1111490560 131072 < size <= 4194304 8100 10616832000 4194304 < size <= 16777216 1576 20618391488 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 21 12. MP_Allreduce 12181 16. MP_Alltoall 8655 275585. MP_ISend 18244 454716. MP_IRecv 18244 454782. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3611 63917. MP_Allreduce 10132 514. MP_Sync 54 MP_Alltoall 1201 14305296. MP_SendRecv 1461 1332738. MP_ISendRecv 1461 1332738. MP_Wait 1461 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.281 0.378 64.420 64.432 qs_mol_dyn_low 1 2.0 0.158 0.166 62.389 62.394 qs_forces 11 3.9 0.141 0.173 60.233 60.291 qs_energies 11 4.9 0.040 0.040 56.579 56.641 scf_env_do_scf 11 5.9 0.022 0.040 48.462 48.462 scf_env_do_scf_inner_loop 108 6.5 0.014 0.050 40.300 40.309 velocity_verlet 10 3.0 0.001 0.003 36.303 36.693 qs_rho_update_rho_low 119 7.7 0.001 0.002 17.119 17.121 calculate_rho_elec 119 8.7 0.318 0.323 17.117 17.120 qs_scf_new_mos 108 7.5 0.001 0.001 15.195 15.221 qs_scf_loop_do_ot 108 8.5 0.001 0.001 15.194 15.220 ot_scf_mini 108 9.5 0.026 0.041 14.365 14.401 dbcsr_multiply_generic 2286 12.5 0.158 0.163 12.352 12.765 grid_collocate_task_list 119 9.7 12.446 12.534 12.446 12.534 rebuild_ks_matrix 119 8.3 0.001 0.001 12.246 12.285 qs_ks_build_kohn_sham_matrix 119 9.3 0.015 0.015 12.246 12.284 qs_ks_update_qs_env 119 7.6 0.001 0.001 11.140 11.188 init_scf_loop 11 6.9 0.057 0.073 8.088 8.092 sum_up_and_integrate 119 10.3 0.002 0.002 7.716 7.954 integrate_v_rspace 119 11.3 0.108 0.113 7.701 7.939 ot_mini 108 10.5 0.021 0.021 7.518 7.544 prepare_preconditioner 11 7.9 0.000 0.000 6.297 6.304 make_preconditioner 11 8.9 0.001 0.001 6.297 6.304 multiply_cannon 2286 13.5 0.531 0.562 5.146 6.101 make_full_inverse_cholesky 11 9.9 0.000 0.000 5.855 6.055 grid_integrate_task_list 119 12.3 5.121 5.347 5.121 5.347 qs_ot_get_derivative 108 11.5 0.006 0.017 5.182 5.205 qs_ot_get_p 119 10.4 0.001 0.001 4.880 4.903 make_m2s 4572 13.5 0.064 0.065 4.426 4.855 fft_wrap_pw1pw2 1201 11.6 0.014 0.014 4.751 4.782 init_scf_run 11 5.9 0.001 0.004 4.520 4.520 scf_env_initial_rho_setup 11 6.9 0.003 0.003 4.518 4.520 density_rs2pw 119 9.7 0.005 0.005 4.343 4.436 fft_wrap_pw1pw2_140 487 12.2 0.468 0.470 4.073 4.106 multiply_cannon_loop 2286 14.5 0.065 0.066 3.764 3.913 fft3d_ps 1201 13.6 1.720 1.746 3.671 3.705 make_images 4572 14.5 0.709 0.730 3.132 3.555 cp_fm_cholesky_invert 11 10.9 3.332 3.341 3.332 3.341 qs_energies_init_hamiltonians 11 5.9 0.001 0.002 3.132 3.194 multiply_cannon_multrec 4572 15.5 2.899 3.031 2.929 3.060 qs_ot_p2m_diag 50 11.0 0.075 0.080 2.851 2.871 wfi_extrapolate 11 7.9 0.001 0.001 2.822 2.824 mp_waitall_1 74613 16.8 2.279 2.773 2.279 2.773 cp_dbcsr_syevd 50 12.0 0.004 0.004 2.546 2.563 potential_pw2rs 119 12.3 0.028 0.028 2.472 2.486 ot_diis_step 108 11.5 0.086 0.107 2.303 2.305 transfer_rs2pw 487 10.6 0.008 0.008 2.057 2.256 hybrid_alltoall_any 4725 16.4 0.165 0.318 1.695 2.127 cp_fm_diag_elpa 50 13.0 0.000 0.000 2.061 2.062 cp_fm_diag_elpa_base 50 14.0 1.899 1.930 2.060 2.060 qs_env_update_s_mstruct 11 6.9 0.001 0.001 1.793 1.985 make_images_data 4572 15.5 0.055 0.058 1.437 1.956 qs_ot_get_derivative_taylor 59 13.0 0.002 0.002 1.824 1.848 mp_alltoall_z22v 1201 15.6 1.767 1.803 1.767 1.803 transfer_rs2pw_140 130 11.5 0.923 0.942 1.537 1.753 build_core_hamiltonian_matrix_ 11 4.9 0.001 0.001 1.697 1.747 dbcsr_complete_redistribute 329 12.2 0.587 0.596 1.655 1.736 qs_ot_get_derivative_diag 49 12.0 0.001 0.001 1.667 1.678 apply_preconditioner_dbcsr 119 12.6 0.009 0.016 1.607 1.666 md_write_output 11 3.9 0.438 1.603 0.477 1.653 apply_single 119 13.6 0.000 0.000 1.598 1.650 mp_sum_dm 438 4.9 1.216 1.637 1.216 1.637 md_output 10 3.0 0.012 0.013 0.458 1.627 update_particle_set 20 4.0 0.010 0.014 1.183 1.577 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 1.539 1.544 copy_dbcsr_to_fm 153 11.3 0.003 0.003 1.450 1.481 cp_fm_cholesky_decompose 22 10.9 1.435 1.440 1.435 1.440 calculate_first_density_matrix 1 7.0 0.001 0.002 1.404 1.404 mp_allgather_i34 2286 14.5 0.502 1.303 0.502 1.303 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="116", plot="h2o_64_md", label="(4n/1r/36t)", y=64.432000, yerr=0.000000 PlotPoint: name="117", plot="h2o_64_md_mem", label="(4n/1r/36t)", y=3162.181818, yerr=958.895844 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/12/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 830472192 0.0% 100.0% 0.0% flops 22 x 32 x 32 1015021568 0.0% 100.0% 0.0% flops 64 x 64 x 64 23051894784 0.0% 100.0% 0.0% flops 64 x 64 x 32 23051894784 0.0% 100.0% 0.0% flops 32 x 32 x 32 23051894784 0.0% 100.0% 0.0% flops 32 x 32 x 64 23051894784 0.0% 100.0% 0.0% flops 32 x 64 x 64 23051894784 0.0% 100.0% 0.0% flops 32 x 64 x 32 23051894784 0.0% 100.0% 0.0% flops 64 x 32 x 64 23051894784 0.0% 100.0% 0.0% flops 64 x 32 x 32 23051894784 0.0% 100.0% 0.0% flops 64 x 64 x 422 85181071360 0.0% 100.0% 0.0% flops 32 x 32 x 422 85181071360 0.0% 100.0% 0.0% flops 32 x 64 x 422 85181071360 0.0% 100.0% 0.0% flops 64 x 32 x 422 85181071360 0.0% 100.0% 0.0% flops 64 x 64 x 427 86190325760 0.0% 100.0% 0.0% flops 32 x 32 x 427 86190325760 0.0% 100.0% 0.0% flops 32 x 64 x 427 86190325760 0.0% 100.0% 0.0% flops 64 x 32 x 427 86190325760 0.0% 100.0% 0.0% flops 64 x 64 x 431 86997729280 0.0% 100.0% 0.0% flops 32 x 32 x 431 86997729280 0.0% 100.0% 0.0% flops 32 x 64 x 431 86997729280 0.0% 100.0% 0.0% flops 64 x 32 x 431 86997729280 0.0% 100.0% 0.0% flops 422 x 32 x 32 104651030528 0.0% 100.0% 0.0% flops 422 x 32 x 64 104651030528 0.0% 100.0% 0.0% flops 422 x 64 x 64 104651030528 0.0% 100.0% 0.0% flops 422 x 64 x 32 104651030528 0.0% 100.0% 0.0% flops 427 x 32 x 32 105890971648 0.0% 100.0% 0.0% flops 427 x 32 x 64 105890971648 0.0% 100.0% 0.0% flops 427 x 64 x 64 105890971648 0.0% 100.0% 0.0% flops 427 x 64 x 32 105890971648 0.0% 100.0% 0.0% flops 431 x 64 x 64 106882924544 0.0% 100.0% 0.0% flops 431 x 64 x 32 106882924544 0.0% 100.0% 0.0% flops 431 x 32 x 32 106882924544 0.0% 100.0% 0.0% flops 431 x 32 x 64 106882924544 0.0% 100.0% 0.0% flops 9 x 9 x 64 134590242816 0.0% 100.0% 0.0% flops 9 x 9 x 32 134590242816 0.0% 100.0% 0.0% flops 422 x 32 x 422 160475054080 0.0% 100.0% 0.0% flops 422 x 64 x 422 160475054080 0.0% 100.0% 0.0% flops 427 x 32 x 422 162376417280 0.0% 100.0% 0.0% flops 427 x 64 x 422 162376417280 0.0% 100.0% 0.0% flops 422 x 32 x 427 162376417280 0.0% 100.0% 0.0% flops 422 x 64 x 427 162376417280 0.0% 100.0% 0.0% flops 431 x 64 x 422 163897507840 0.0% 100.0% 0.0% flops 422 x 32 x 431 163897507840 0.0% 100.0% 0.0% flops 422 x 64 x 431 163897507840 0.0% 100.0% 0.0% flops 431 x 32 x 422 163897507840 0.0% 100.0% 0.0% flops 427 x 32 x 427 164300308480 0.0% 100.0% 0.0% flops 427 x 64 x 427 164300308480 0.0% 100.0% 0.0% flops 431 x 64 x 427 165839421440 0.0% 100.0% 0.0% flops 427 x 32 x 431 165839421440 0.0% 100.0% 0.0% flops 427 x 64 x 431 165839421440 0.0% 100.0% 0.0% flops 431 x 32 x 427 165839421440 0.0% 100.0% 0.0% flops 431 x 64 x 431 167392952320 0.0% 100.0% 0.0% flops 431 x 32 x 431 167392952320 0.0% 100.0% 0.0% flops 9 x 22 x 64 174697712640 0.0% 100.0% 0.0% flops 9 x 22 x 32 174697712640 0.0% 100.0% 0.0% flops 22 x 9 x 64 175021203456 0.0% 100.0% 0.0% flops 22 x 9 x 32 175021203456 0.0% 100.0% 0.0% flops 22 x 22 x 64 226790907904 0.0% 100.0% 0.0% flops 22 x 22 x 32 226790907904 0.0% 100.0% 0.0% flops 9 x 32 x 9 1138002296832 0.0% 100.0% 0.0% flops 22 x 32 x 9 1485592289280 0.0% 100.0% 0.0% flops 9 x 32 x 22 1485592289280 0.0% 100.0% 0.0% flops 22 x 32 x 22 1910442074112 0.0% 100.0% 0.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 12.884056E+12 0.0% 100.0% 0.0% flops max/rank 137.578432E+09 0.0% 100.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 609915708 0.0% 100.0% 0.0% number of processed stacks 5472840 0.0% 100.0% 0.0% average stack size 0.0 111.4 0.0 marketing flops 15.646547E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 214.614016E+06 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 6510240 MPI messages size (bytes): total size 1.243517E+12 min size 0.000000E+00 max size 1.486088E+06 average size 191.009359E+03 MPI breakdown and total messages size (bytes): size <= 128 50820 0 128 < size <= 8192 1301256 10659889152 8192 < size <= 32768 1625844 31963807744 32768 < size <= 131072 1967328 214884679680 131072 < size <= 4194304 1564992 985951934528 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 65 12. MP_Allreduce 11033 25. MP_Alltoall 8043 61599. MP_ISend 98596 100033. MP_IRecv 98596 98808. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3521 65382. MP_Allreduce 9921 489. MP_Sync 492 MP_Alltoall 1939 944079. MP_SendRecv 31460 6552. MP_ISendRecv 31460 6552. MP_Wait 47872 MP_comm_split 48 MP_ISend 26114 37861. MP_IRecv 26114 37861. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.137 0.288 59.220 59.231 qs_mol_dyn_low 1 2.0 0.118 0.186 57.137 57.162 qs_forces 11 3.9 0.005 0.012 56.540 56.849 qs_energies 11 4.9 0.001 0.001 53.679 54.000 scf_env_do_scf 11 5.9 0.000 0.001 46.712 46.713 scf_env_do_scf_inner_loop 99 6.5 0.002 0.007 40.458 40.459 velocity_verlet 10 3.0 0.001 0.002 31.720 31.738 dbcsr_multiply_generic 2055 12.4 0.117 0.149 19.651 20.343 qs_scf_new_mos 99 7.5 0.001 0.001 18.047 18.445 qs_scf_loop_do_ot 99 8.5 0.001 0.001 18.046 18.445 ot_scf_mini 99 9.5 0.002 0.003 16.879 17.239 rebuild_ks_matrix 110 8.3 0.000 0.001 16.231 16.632 qs_ks_build_kohn_sham_matrix 110 9.3 0.012 0.014 16.231 16.631 multiply_cannon 2055 13.4 0.166 0.199 12.368 14.906 qs_ks_update_qs_env 110 7.6 0.001 0.001 14.326 14.686 multiply_cannon_loop 2055 14.4 0.172 0.280 11.007 12.856 sum_up_and_integrate 110 10.3 0.001 0.002 11.150 11.180 integrate_v_rspace 110 11.3 0.003 0.004 11.131 11.162 qs_rho_update_rho_low 110 7.6 0.001 0.001 10.883 10.912 calculate_rho_elec 110 8.6 0.018 0.031 10.882 10.912 mp_waitall_1 264348 16.4 7.741 10.160 7.741 10.160 ot_mini 99 10.5 0.001 0.001 9.184 9.577 multiply_cannon_multrec 24660 15.4 5.839 8.938 5.848 8.947 qs_ot_get_derivative 99 11.5 0.001 0.001 6.052 6.414 init_scf_loop 11 6.9 0.000 0.000 6.210 6.214 grid_integrate_task_list 110 12.3 5.160 5.436 5.160 5.436 make_m2s 4110 13.4 0.072 0.097 4.946 5.339 density_rs2pw 110 9.6 0.005 0.008 5.070 5.305 init_scf_run 11 5.9 0.000 0.001 5.215 5.216 scf_env_initial_rho_setup 11 6.9 0.000 0.001 5.215 5.216 grid_collocate_task_list 110 9.6 4.593 4.823 4.593 4.823 make_images 4110 14.4 0.158 0.204 4.462 4.809 multiply_cannon_metrocomm3 24660 15.4 0.074 0.156 1.850 4.801 qs_ot_get_p 110 10.4 0.001 0.001 4.083 4.570 fft_wrap_pw1pw2 1111 11.6 0.012 0.015 4.341 4.565 potential_pw2rs 110 12.3 0.005 0.008 4.453 4.501 fft3d_ps 1111 13.6 0.700 1.007 3.911 4.093 prepare_preconditioner 11 7.9 0.000 0.000 3.997 4.047 make_preconditioner 11 8.9 0.000 0.000 3.997 4.047 mp_waitany 13684 13.7 3.261 3.771 3.261 3.771 fft_wrap_pw1pw2_140 451 12.1 0.098 0.108 3.492 3.730 transfer_rs2pw 451 10.6 0.005 0.007 3.265 3.725 mp_alltoall_d11v 2046 13.8 3.480 3.701 3.480 3.701 make_full_inverse_cholesky 11 9.9 0.000 0.000 3.623 3.700 multiply_cannon_metrocomm1 24660 15.4 0.084 0.169 2.401 3.643 transfer_pw2rs 451 13.1 0.005 0.005 3.266 3.313 wfi_extrapolate 11 7.9 0.001 0.001 3.247 3.247 apply_preconditioner_dbcsr 110 12.6 0.000 0.000 2.846 3.178 apply_single 110 13.6 0.000 0.001 2.846 3.178 make_images_data 4110 15.4 0.052 0.082 2.618 3.114 ot_diis_step 99 11.5 0.006 0.008 3.075 3.075 hybrid_alltoall_any 4261 16.3 0.077 0.470 2.093 2.882 mp_sum_l 10179 13.1 1.680 2.841 1.680 2.841 mp_alltoall_z22v 1111 15.6 2.374 2.476 2.374 2.476 qs_ot_get_derivative_diag 47 12.0 0.001 0.002 2.120 2.316 qs_ot_p2m_diag 48 11.0 0.008 0.017 2.275 2.292 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 2.037 2.078 cp_dbcsr_syevd 48 12.0 0.003 0.004 2.047 2.059 mp_allgather_i34 2055 14.4 0.708 1.997 0.708 1.997 qs_ot_get_derivative_taylor 52 13.0 0.001 0.002 1.801 1.966 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 1.884 1.894 mp_sum_d 3893 11.9 1.271 1.891 1.271 1.891 transfer_pw2rs_50 110 14.3 0.086 0.105 1.516 1.879 calculate_first_density_matrix 1 7.0 0.000 0.000 1.769 1.770 cp_fm_cholesky_decompose 22 10.9 1.689 1.719 1.689 1.719 dbcsr_complete_redistribute 325 12.2 0.195 0.277 1.584 1.704 make_images_sizes 4110 15.4 0.005 0.011 1.064 1.629 mp_alltoall_i44 4110 16.4 1.059 1.625 1.059 1.625 rs_gather_matrices 110 12.3 0.097 0.129 1.400 1.616 qs_energies_init_hamiltonians 11 5.9 0.001 0.002 1.255 1.566 cp_fm_cholesky_invert 11 10.9 1.553 1.560 1.553 1.560 transfer_rs2pw_50 110 11.6 0.132 0.149 1.520 1.560 transfer_rs2pw_140 121 11.5 0.254 0.299 1.079 1.528 calculate_dm_sparse 110 9.5 0.001 0.001 1.426 1.501 yz_to_x 231 14.8 0.064 0.069 1.416 1.487 cp_fm_diag_elpa 48 13.0 0.000 0.000 1.399 1.400 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 1.226 1.360 cp_fm_redistribute_end 48 14.0 0.683 1.351 0.695 1.357 cp_fm_diag_elpa_base 48 14.0 0.648 1.304 0.657 1.319 dbcsr_dot_sd 1091 11.9 0.158 0.179 0.799 1.308 make_basis_sm 11 9.8 0.000 0.000 1.243 1.267 rs_scatter_matrices 121 9.7 0.085 0.109 1.205 1.242 qs_env_update_s_mstruct 11 6.9 0.000 0.000 0.856 1.220 multiply_cannon_metrocomm4 22605 15.4 0.070 0.154 0.501 1.187 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="200", plot="h2o_128_md", label="(4n/36r/1t)", y=59.231000, yerr=0.000000 PlotPoint: name="201", plot="h2o_128_md_mem", label="(4n/36r/1t)", y=202.000000, yerr=2.044949 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/13/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 830472192 0.0% 100.0% 0.0% flops 22 x 32 x 32 1015021568 0.0% 100.0% 0.0% flops 360 x 32 x 422 8556134400 0.0% 100.0% 0.0% flops 360 x 64 x 422 8556134400 0.0% 100.0% 0.0% flops 378 x 32 x 422 8983941120 0.0% 100.0% 0.0% flops 378 x 64 x 422 8983941120 0.0% 100.0% 0.0% flops 382 x 64 x 422 9079009280 0.0% 100.0% 0.0% flops 382 x 32 x 422 9079009280 0.0% 100.0% 0.0% flops 458 x 32 x 422 10885304320 0.0% 100.0% 0.0% flops 458 x 64 x 422 10885304320 0.0% 100.0% 0.0% flops 471 x 32 x 422 11194275840 0.0% 100.0% 0.0% flops 471 x 64 x 422 11194275840 0.0% 100.0% 0.0% flops 480 x 64 x 422 11408179200 0.0% 100.0% 0.0% flops 480 x 32 x 422 11408179200 0.0% 100.0% 0.0% flops 493 x 32 x 422 11717150720 0.0% 100.0% 0.0% flops 493 x 64 x 422 11717150720 0.0% 100.0% 0.0% flops 32 x 64 x 422 21295267840 0.0% 100.0% 0.0% flops 64 x 64 x 422 21295267840 0.0% 100.0% 0.0% flops 32 x 32 x 422 21295267840 0.0% 100.0% 0.0% flops 64 x 32 x 422 21295267840 0.0% 100.0% 0.0% flops 449 x 32 x 422 21342801920 0.0% 100.0% 0.0% flops 449 x 64 x 422 21342801920 0.0% 100.0% 0.0% flops 360 x 32 x 32 22318940160 0.0% 100.0% 0.0% flops 360 x 32 x 64 22318940160 0.0% 100.0% 0.0% flops 360 x 64 x 32 22318940160 0.0% 100.0% 0.0% flops 360 x 64 x 64 22318940160 0.0% 100.0% 0.0% flops 32 x 64 x 64 23051894784 0.0% 100.0% 0.0% flops 32 x 64 x 32 23051894784 0.0% 100.0% 0.0% flops 64 x 64 x 64 23051894784 0.0% 100.0% 0.0% flops 64 x 64 x 32 23051894784 0.0% 100.0% 0.0% flops 32 x 32 x 32 23051894784 0.0% 100.0% 0.0% flops 32 x 32 x 64 23051894784 0.0% 100.0% 0.0% flops 64 x 32 x 32 23051894784 0.0% 100.0% 0.0% flops 64 x 32 x 64 23051894784 0.0% 100.0% 0.0% flops 378 x 32 x 32 23434887168 0.0% 100.0% 0.0% flops 378 x 32 x 64 23434887168 0.0% 100.0% 0.0% flops 378 x 64 x 32 23434887168 0.0% 100.0% 0.0% flops 378 x 64 x 64 23434887168 0.0% 100.0% 0.0% flops 382 x 64 x 64 23682875392 0.0% 100.0% 0.0% flops 382 x 64 x 32 23682875392 0.0% 100.0% 0.0% flops 382 x 32 x 32 23682875392 0.0% 100.0% 0.0% flops 382 x 32 x 64 23682875392 0.0% 100.0% 0.0% flops 458 x 32 x 32 28394651648 0.0% 100.0% 0.0% flops 458 x 32 x 64 28394651648 0.0% 100.0% 0.0% flops 458 x 64 x 32 28394651648 0.0% 100.0% 0.0% flops 458 x 64 x 64 28394651648 0.0% 100.0% 0.0% flops 400 x 32 x 422 28520448000 0.0% 100.0% 0.0% flops 400 x 64 x 422 28520448000 0.0% 100.0% 0.0% flops 471 x 32 x 32 29200613376 0.0% 100.0% 0.0% flops 471 x 32 x 64 29200613376 0.0% 100.0% 0.0% flops 471 x 64 x 32 29200613376 0.0% 100.0% 0.0% flops 471 x 64 x 64 29200613376 0.0% 100.0% 0.0% flops 480 x 64 x 64 29758586880 0.0% 100.0% 0.0% flops 480 x 64 x 32 29758586880 0.0% 100.0% 0.0% flops 480 x 32 x 32 29758586880 0.0% 100.0% 0.0% flops 480 x 32 x 64 29758586880 0.0% 100.0% 0.0% flops 493 x 32 x 32 30564548608 0.0% 100.0% 0.0% flops 493 x 32 x 64 30564548608 0.0% 100.0% 0.0% flops 493 x 64 x 32 30564548608 0.0% 100.0% 0.0% flops 493 x 64 x 64 30564548608 0.0% 100.0% 0.0% flops 360 x 32 x 418 33900134400 0.0% 100.0% 0.0% flops 360 x 64 x 418 33900134400 0.0% 100.0% 0.0% flops 378 x 32 x 418 35595141120 0.0% 100.0% 0.0% flops 378 x 64 x 418 35595141120 0.0% 100.0% 0.0% flops 382 x 64 x 418 35971809280 0.0% 100.0% 0.0% flops 382 x 32 x 418 35971809280 0.0% 100.0% 0.0% flops 458 x 32 x 418 43128504320 0.0% 100.0% 0.0% flops 458 x 64 x 418 43128504320 0.0% 100.0% 0.0% flops 471 x 32 x 418 44352675840 0.0% 100.0% 0.0% flops 471 x 64 x 418 44352675840 0.0% 100.0% 0.0% flops 480 x 64 x 418 45200179200 0.0% 100.0% 0.0% flops 480 x 32 x 418 45200179200 0.0% 100.0% 0.0% flops 493 x 32 x 418 46424350720 0.0% 100.0% 0.0% flops 493 x 64 x 418 46424350720 0.0% 100.0% 0.0% flops 360 x 32 x 431 52431667200 0.0% 100.0% 0.0% flops 360 x 64 x 431 52431667200 0.0% 100.0% 0.0% flops 378 x 32 x 431 55053250560 0.0% 100.0% 0.0% flops 378 x 64 x 431 55053250560 0.0% 100.0% 0.0% flops 382 x 64 x 431 55635824640 0.0% 100.0% 0.0% flops 382 x 32 x 431 55635824640 0.0% 100.0% 0.0% flops 449 x 32 x 32 55673356288 0.0% 100.0% 0.0% flops 449 x 32 x 64 55673356288 0.0% 100.0% 0.0% flops 449 x 64 x 32 55673356288 0.0% 100.0% 0.0% flops 449 x 64 x 64 55673356288 0.0% 100.0% 0.0% flops 458 x 32 x 431 66704732160 0.0% 100.0% 0.0% flops 458 x 64 x 431 66704732160 0.0% 100.0% 0.0% flops 471 x 32 x 431 68598097920 0.0% 100.0% 0.0% flops 471 x 64 x 431 68598097920 0.0% 100.0% 0.0% flops 480 x 64 x 431 69908889600 0.0% 100.0% 0.0% flops 480 x 32 x 431 69908889600 0.0% 100.0% 0.0% flops 493 x 32 x 431 71802255360 0.0% 100.0% 0.0% flops 493 x 64 x 431 71802255360 0.0% 100.0% 0.0% flops 400 x 32 x 32 74396467200 0.0% 100.0% 0.0% flops 400 x 32 x 64 74396467200 0.0% 100.0% 0.0% flops 400 x 64 x 32 74396467200 0.0% 100.0% 0.0% flops 400 x 64 x 64 74396467200 0.0% 100.0% 0.0% flops 32 x 64 x 418 84373667840 0.0% 100.0% 0.0% flops 64 x 64 x 418 84373667840 0.0% 100.0% 0.0% flops 32 x 32 x 418 84373667840 0.0% 100.0% 0.0% flops 64 x 32 x 418 84373667840 0.0% 100.0% 0.0% flops 449 x 32 x 418 84562001920 0.0% 100.0% 0.0% flops 449 x 64 x 418 84562001920 0.0% 100.0% 0.0% flops 400 x 32 x 418 113000448000 0.0% 100.0% 0.0% flops 400 x 64 x 418 113000448000 0.0% 100.0% 0.0% flops 32 x 64 x 431 130496593920 0.0% 100.0% 0.0% flops 64 x 64 x 431 130496593920 0.0% 100.0% 0.0% flops 32 x 32 x 431 130496593920 0.0% 100.0% 0.0% flops 64 x 32 x 431 130496593920 0.0% 100.0% 0.0% flops 449 x 32 x 431 130787880960 0.0% 100.0% 0.0% flops 449 x 64 x 431 130787880960 0.0% 100.0% 0.0% flops 9 x 9 x 64 134590242816 0.0% 100.0% 0.0% flops 9 x 9 x 32 134590242816 0.0% 100.0% 0.0% flops 9 x 22 x 64 174697712640 0.0% 100.0% 0.0% flops 9 x 22 x 32 174697712640 0.0% 100.0% 0.0% flops 400 x 32 x 431 174772224000 0.0% 100.0% 0.0% flops 400 x 64 x 431 174772224000 0.0% 100.0% 0.0% flops 22 x 9 x 64 175021203456 0.0% 100.0% 0.0% flops 22 x 9 x 32 175021203456 0.0% 100.0% 0.0% flops 22 x 22 x 64 226790907904 0.0% 100.0% 0.0% flops 22 x 22 x 32 226790907904 0.0% 100.0% 0.0% flops 9 x 32 x 9 1138002296832 0.0% 100.0% 0.0% flops 22 x 32 x 9 1485592289280 0.0% 100.0% 0.0% flops 9 x 32 x 22 1485592289280 0.0% 100.0% 0.0% flops 22 x 32 x 22 1910442074112 0.0% 100.0% 0.0% flops inhomo. stacks 388562944000 100.0% 0.0% 0.0% flops total 12.930049E+12 3.0% 97.0% 0.0% flops max/rank 265.896255E+09 3.3% 96.7% 0.0% matmuls inhomo. stacks 71280 100.0% 0.0% 0.0% matmuls total 609915708 0.0% 100.0% 0.0% number of processed stacks 5475744 1.3% 98.7% 0.0% average stack size 1.0 112.8 0.0 marketing flops 15.646547E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 258.293760E+06 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 3107160 MPI messages size (bytes): total size 1.118966E+12 min size 0.000000E+00 max size 3.034240E+06 average size 360.125031E+03 MPI breakdown and total messages size (bytes): size <= 128 23892 0 128 < size <= 8192 371640 3044474880 8192 < size <= 32768 887412 19031326720 32768 < size <= 131072 539952 51713409024 131072 < size <= 4194304 1284264 1045217450048 4194304 < size <= 16777216 0 0 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3521 65473. MP_Allreduce 9920 520. MP_Sync 52 MP_Alltoall 1938 1921907. MP_SendRecv 15620 11120. MP_ISendRecv 15620 11120. MP_Wait 31988 MP_comm_split 48 MP_ISend 14300 93624. MP_IRecv 14300 93624. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.085 0.133 81.727 81.736 qs_mol_dyn_low 1 2.0 0.125 0.184 80.037 80.097 qs_forces 11 3.9 0.003 0.004 79.261 79.393 qs_energies 11 4.9 0.001 0.002 75.053 75.192 scf_env_do_scf 11 5.9 0.001 0.004 66.035 66.036 scf_env_do_scf_inner_loop 99 6.5 0.003 0.033 57.540 57.548 velocity_verlet 10 3.0 0.033 0.128 45.215 45.261 dbcsr_multiply_generic 2055 12.4 0.152 0.159 31.682 32.726 qs_scf_new_mos 99 7.5 0.001 0.002 27.462 28.189 qs_scf_loop_do_ot 99 8.5 0.001 0.001 27.461 28.188 ot_scf_mini 99 9.5 0.003 0.004 25.742 26.326 multiply_cannon 2055 13.4 0.233 0.264 21.679 24.977 rebuild_ks_matrix 110 8.3 0.001 0.001 22.137 22.737 qs_ks_build_kohn_sham_matrix 110 9.3 0.015 0.019 22.136 22.736 multiply_cannon_loop 2055 14.4 0.299 0.316 19.562 21.937 qs_ks_update_qs_env 110 7.6 0.001 0.001 19.557 20.093 multiply_cannon_multrec 24660 15.4 14.032 19.138 14.051 19.158 ot_mini 99 10.5 0.001 0.001 14.384 14.993 qs_rho_update_rho_low 110 7.6 0.001 0.001 14.186 14.215 calculate_rho_elec 110 8.6 0.034 0.045 14.186 14.214 sum_up_and_integrate 110 10.3 0.002 0.005 13.743 13.769 integrate_v_rspace 110 11.3 0.003 0.004 13.696 13.726 mp_waitall_1 198528 16.4 6.868 13.121 6.868 13.121 multiply_cannon_metrocomm3 24660 15.4 0.098 0.112 3.025 9.535 grid_collocate_task_list 110 9.6 8.821 9.169 8.821 9.169 grid_integrate_task_list 110 12.3 8.488 8.958 8.488 8.958 init_scf_loop 11 6.9 0.000 0.000 8.441 8.457 qs_ot_get_derivative 99 11.5 0.001 0.001 7.546 8.132 apply_preconditioner_dbcsr 110 12.6 0.000 0.000 6.455 7.063 apply_single 110 13.6 0.001 0.001 6.455 7.063 make_m2s 4110 13.4 0.098 0.103 6.633 6.928 init_scf_run 11 5.9 0.000 0.009 6.803 6.803 scf_env_initial_rho_setup 11 6.9 0.000 0.008 6.802 6.803 ot_diis_step 99 11.5 0.012 0.012 6.765 6.765 make_images 4110 14.4 0.471 0.551 5.682 6.088 qs_ot_get_p 110 10.4 0.001 0.001 5.036 5.743 prepare_preconditioner 11 7.9 0.000 0.000 5.365 5.446 make_preconditioner 11 8.9 0.000 0.000 5.365 5.446 make_full_inverse_cholesky 11 9.9 0.000 0.000 4.961 5.126 fft_wrap_pw1pw2 1111 11.6 0.019 0.027 4.901 5.071 density_rs2pw 110 9.6 0.007 0.008 4.373 4.706 wfi_extrapolate 11 7.9 0.001 0.001 4.481 4.481 fft_wrap_pw1pw2_140 451 12.1 0.189 0.195 4.188 4.400 fft3d_ps 1111 13.6 1.448 1.755 4.033 4.243 mp_sum_l 10179 13.1 2.303 3.990 2.303 3.990 potential_pw2rs 110 12.3 0.010 0.011 3.859 3.893 multiply_cannon_metrocomm4 22605 15.4 0.099 0.109 1.400 3.845 make_images_data 4110 15.4 0.065 0.071 3.314 3.813 mp_irecv_dv 61724 16.0 1.313 3.720 1.313 3.720 hybrid_alltoall_any 4261 16.3 0.138 0.650 2.806 3.457 mp_alltoall_d11v 2046 13.8 2.978 3.258 2.978 3.258 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.003 2.969 2.975 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 2.833 2.907 qs_ot_get_derivative_diag 47 12.0 0.001 0.002 2.512 2.831 qs_ot_p2m_diag 48 11.0 0.018 0.034 2.627 2.657 mp_waitany 14300 13.8 1.884 2.554 1.884 2.554 transfer_pw2rs 451 13.1 0.007 0.010 2.524 2.550 transfer_rs2pw 451 10.6 0.007 0.010 2.122 2.515 cp_fm_cholesky_invert 11 10.9 2.449 2.460 2.449 2.460 qs_ot_get_derivative_taylor 52 13.0 0.002 0.002 2.149 2.424 cp_dbcsr_syevd 48 12.0 0.004 0.004 2.390 2.403 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 2.116 2.307 mp_alltoall_z22v 1111 15.6 2.108 2.264 2.108 2.264 mp_sum_d 3893 11.9 1.439 2.230 1.439 2.230 calculate_dm_sparse 110 9.5 0.001 0.001 1.983 2.139 calculate_first_density_matrix 1 7.0 0.000 0.000 2.053 2.055 mp_allgather_i34 2055 14.4 0.635 2.034 0.635 2.034 dbcsr_dot_sd 1091 11.9 0.316 0.352 1.245 1.938 dbcsr_complete_redistribute 325 12.2 0.265 0.326 1.713 1.908 make_basis_sm 11 9.8 0.000 0.000 1.785 1.792 cp_fm_cholesky_decompose 22 10.9 1.749 1.789 1.749 1.789 qs_energies_init_hamiltonians 11 5.9 0.000 0.001 1.618 1.750 cp_fm_diag_elpa 48 13.0 0.000 0.000 1.702 1.703 cp_fm_redistribute_end 48 14.0 0.845 1.668 0.852 1.672 transfer_rs2pw_140 121 11.5 0.339 0.409 1.264 1.645 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="202", plot="h2o_128_md", label="(4n/18r/2t)", y=81.736000, yerr=0.000000 PlotPoint: name="203", plot="h2o_128_md_mem", label="(4n/18r/2t)", y=244.909091, yerr=2.466302 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/14/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 830472192 0.0% 100.0% 0.0% flops 22 x 32 x 32 1015021568 0.0% 100.0% 0.0% flops 436 x 64 x 409 10043207680 0.0% 100.0% 0.0% flops 436 x 32 x 409 10043207680 0.0% 100.0% 0.0% flops 440 x 32 x 409 10135347200 0.0% 100.0% 0.0% flops 440 x 64 x 409 10135347200 0.0% 100.0% 0.0% flops 449 x 32 x 409 10342661120 0.0% 100.0% 0.0% flops 449 x 64 x 409 10342661120 0.0% 100.0% 0.0% flops 458 x 32 x 409 10549975040 0.0% 100.0% 0.0% flops 458 x 64 x 409 10549975040 0.0% 100.0% 0.0% flops 462 x 32 x 409 10642114560 0.0% 100.0% 0.0% flops 462 x 64 x 409 10642114560 0.0% 100.0% 0.0% flops 493 x 64 x 409 11356195840 0.0% 100.0% 0.0% flops 493 x 32 x 409 11356195840 0.0% 100.0% 0.0% flops 32 x 64 x 409 20639252480 0.0% 100.0% 0.0% flops 64 x 64 x 409 20639252480 0.0% 100.0% 0.0% flops 32 x 32 x 409 20639252480 0.0% 100.0% 0.0% flops 64 x 32 x 409 20639252480 0.0% 100.0% 0.0% flops 471 x 32 x 409 21698856960 0.0% 100.0% 0.0% flops 471 x 64 x 409 21698856960 0.0% 100.0% 0.0% flops 32 x 64 x 64 23051894784 0.0% 100.0% 0.0% flops 32 x 64 x 32 23051894784 0.0% 100.0% 0.0% flops 64 x 64 x 64 23051894784 0.0% 100.0% 0.0% flops 64 x 64 x 32 23051894784 0.0% 100.0% 0.0% flops 32 x 32 x 32 23051894784 0.0% 100.0% 0.0% flops 32 x 32 x 64 23051894784 0.0% 100.0% 0.0% flops 64 x 32 x 32 23051894784 0.0% 100.0% 0.0% flops 64 x 32 x 64 23051894784 0.0% 100.0% 0.0% flops 436 x 64 x 64 27030716416 0.0% 100.0% 0.0% flops 436 x 64 x 32 27030716416 0.0% 100.0% 0.0% flops 436 x 32 x 32 27030716416 0.0% 100.0% 0.0% flops 436 x 32 x 64 27030716416 0.0% 100.0% 0.0% flops 440 x 32 x 32 27278704640 0.0% 100.0% 0.0% flops 440 x 32 x 64 27278704640 0.0% 100.0% 0.0% flops 440 x 64 x 64 27278704640 0.0% 100.0% 0.0% flops 440 x 64 x 32 27278704640 0.0% 100.0% 0.0% flops 449 x 32 x 32 27836678144 0.0% 100.0% 0.0% flops 449 x 32 x 64 27836678144 0.0% 100.0% 0.0% flops 449 x 64 x 64 27836678144 0.0% 100.0% 0.0% flops 449 x 64 x 32 27836678144 0.0% 100.0% 0.0% flops 458 x 32 x 32 28394651648 0.0% 100.0% 0.0% flops 458 x 32 x 64 28394651648 0.0% 100.0% 0.0% flops 458 x 64 x 64 28394651648 0.0% 100.0% 0.0% flops 458 x 64 x 32 28394651648 0.0% 100.0% 0.0% flops 462 x 32 x 32 28642639872 0.0% 100.0% 0.0% flops 462 x 32 x 64 28642639872 0.0% 100.0% 0.0% flops 462 x 64 x 64 28642639872 0.0% 100.0% 0.0% flops 462 x 64 x 32 28642639872 0.0% 100.0% 0.0% flops 493 x 64 x 64 30564548608 0.0% 100.0% 0.0% flops 493 x 64 x 32 30564548608 0.0% 100.0% 0.0% flops 493 x 32 x 32 30564548608 0.0% 100.0% 0.0% flops 493 x 32 x 64 30564548608 0.0% 100.0% 0.0% flops 436 x 64 x 418 30792622080 0.0% 100.0% 0.0% flops 436 x 32 x 418 30792622080 0.0% 100.0% 0.0% flops 440 x 32 x 418 31075123200 0.0% 100.0% 0.0% flops 440 x 64 x 418 31075123200 0.0% 100.0% 0.0% flops 449 x 32 x 418 31710750720 0.0% 100.0% 0.0% flops 449 x 64 x 418 31710750720 0.0% 100.0% 0.0% flops 458 x 32 x 418 32346378240 0.0% 100.0% 0.0% flops 458 x 64 x 418 32346378240 0.0% 100.0% 0.0% flops 462 x 32 x 418 32628879360 0.0% 100.0% 0.0% flops 462 x 64 x 418 32628879360 0.0% 100.0% 0.0% flops 360 x 64 x 409 33170227200 0.0% 100.0% 0.0% flops 360 x 32 x 409 33170227200 0.0% 100.0% 0.0% flops 493 x 64 x 418 34818263040 0.0% 100.0% 0.0% flops 493 x 32 x 418 34818263040 0.0% 100.0% 0.0% flops 471 x 32 x 32 58401226752 0.0% 100.0% 0.0% flops 471 x 32 x 64 58401226752 0.0% 100.0% 0.0% flops 471 x 64 x 64 58401226752 0.0% 100.0% 0.0% flops 471 x 64 x 32 58401226752 0.0% 100.0% 0.0% flops 32 x 64 x 418 63280250880 0.0% 100.0% 0.0% flops 64 x 64 x 418 63280250880 0.0% 100.0% 0.0% flops 32 x 32 x 418 63280250880 0.0% 100.0% 0.0% flops 64 x 32 x 418 63280250880 0.0% 100.0% 0.0% flops 471 x 32 x 418 66529013760 0.0% 100.0% 0.0% flops 471 x 64 x 418 66529013760 0.0% 100.0% 0.0% flops 436 x 64 x 431 74084003840 0.0% 100.0% 0.0% flops 436 x 32 x 431 74084003840 0.0% 100.0% 0.0% flops 440 x 32 x 431 74763673600 0.0% 100.0% 0.0% flops 440 x 64 x 431 74763673600 0.0% 100.0% 0.0% flops 449 x 32 x 431 76292930560 0.0% 100.0% 0.0% flops 449 x 64 x 431 76292930560 0.0% 100.0% 0.0% flops 458 x 32 x 431 77822187520 0.0% 100.0% 0.0% flops 458 x 64 x 431 77822187520 0.0% 100.0% 0.0% flops 462 x 32 x 431 78501857280 0.0% 100.0% 0.0% flops 462 x 64 x 431 78501857280 0.0% 100.0% 0.0% flops 493 x 64 x 431 83769297920 0.0% 100.0% 0.0% flops 493 x 32 x 431 83769297920 0.0% 100.0% 0.0% flops 360 x 64 x 64 89275760640 0.0% 100.0% 0.0% flops 360 x 64 x 32 89275760640 0.0% 100.0% 0.0% flops 360 x 32 x 32 89275760640 0.0% 100.0% 0.0% flops 360 x 32 x 64 89275760640 0.0% 100.0% 0.0% flops 360 x 64 x 418 101700403200 0.0% 100.0% 0.0% flops 360 x 32 x 418 101700403200 0.0% 100.0% 0.0% flops 9 x 9 x 64 134590242816 0.0% 100.0% 0.0% flops 9 x 9 x 32 134590242816 0.0% 100.0% 0.0% flops 32 x 64 x 431 152246026240 0.0% 100.0% 0.0% flops 64 x 64 x 431 152246026240 0.0% 100.0% 0.0% flops 32 x 32 x 431 152246026240 0.0% 100.0% 0.0% flops 64 x 32 x 431 152246026240 0.0% 100.0% 0.0% flops 471 x 32 x 431 160062228480 0.0% 100.0% 0.0% flops 471 x 64 x 431 160062228480 0.0% 100.0% 0.0% flops 9 x 22 x 64 174697712640 0.0% 100.0% 0.0% flops 9 x 22 x 32 174697712640 0.0% 100.0% 0.0% flops 22 x 9 x 64 175021203456 0.0% 100.0% 0.0% flops 22 x 9 x 32 175021203456 0.0% 100.0% 0.0% flops 22 x 22 x 64 226790907904 0.0% 100.0% 0.0% flops 22 x 22 x 32 226790907904 0.0% 100.0% 0.0% flops 360 x 64 x 431 244681113600 0.0% 100.0% 0.0% flops 360 x 32 x 431 244681113600 0.0% 100.0% 0.0% flops 9 x 32 x 9 1138002296832 0.0% 100.0% 0.0% flops 22 x 32 x 9 1485592289280 0.0% 100.0% 0.0% flops 9 x 32 x 22 1485592289280 0.0% 100.0% 0.0% flops 22 x 32 x 22 1910442074112 0.0% 100.0% 0.0% flops inhomo. stacks 415276646400 100.0% 0.0% 0.0% flops total 12.956763E+12 3.2% 96.8% 0.0% flops max/rank 393.280399E+09 3.4% 96.6% 0.0% matmuls inhomo. stacks 71280 100.0% 0.0% 0.0% matmuls total 609915708 0.0% 100.0% 0.0% number of processed stacks 5475744 1.3% 98.7% 0.0% average stack size 1.0 112.8 0.0 marketing flops 15.646302E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 305.598464E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 1972800 MPI messages size (bytes): total size 1.077520E+12 min size 0.000000E+00 max size 4.537280E+06 average size 546.188250E+03 MPI breakdown and total messages size (bytes): size <= 128 14916 0 128 < size <= 8192 222984 1826684928 8192 < size <= 32768 520356 13399818240 32768 < size <= 131072 372336 35386294272 131072 < size <= 4194304 787758 788321309808 4194304 < size <= 16777216 54450 238588003280 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3521 65581. MP_Allreduce 9919 558. MP_Sync 52 MP_Alltoall 1717 2801539. MP_SendRecv 10340 26400. MP_ISendRecv 10340 26400. MP_Wait 22352 MP_comm_split 48 MP_ISend 10164 155761. MP_IRecv 10164 155761. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.037 0.067 75.517 75.522 qs_mol_dyn_low 1 2.0 0.050 0.097 74.164 74.200 qs_forces 11 3.9 0.003 0.003 73.638 73.776 qs_energies 11 4.9 0.001 0.001 69.998 70.149 scf_env_do_scf 11 5.9 0.000 0.001 61.835 61.836 scf_env_do_scf_inner_loop 99 6.5 0.003 0.018 52.704 52.722 velocity_verlet 10 3.0 0.001 0.001 42.247 42.258 dbcsr_multiply_generic 2055 12.4 0.146 0.159 30.630 31.761 qs_scf_new_mos 99 7.5 0.001 0.001 27.602 28.140 qs_scf_loop_do_ot 99 8.5 0.001 0.001 27.601 28.139 ot_scf_mini 99 9.5 0.003 0.003 25.867 26.398 multiply_cannon 2055 13.4 0.225 0.251 20.763 24.557 multiply_cannon_loop 2055 14.4 0.266 0.289 19.004 21.968 multiply_cannon_multrec 24660 15.4 12.470 19.834 12.488 19.853 rebuild_ks_matrix 110 8.3 0.001 0.001 18.729 19.412 qs_ks_build_kohn_sham_matrix 110 9.3 0.013 0.016 18.729 19.412 qs_ks_update_qs_env 110 7.6 0.001 0.001 16.564 17.186 mp_waitall_1 176588 16.5 7.474 15.225 7.474 15.225 ot_mini 99 10.5 0.001 0.001 14.066 14.634 qs_rho_update_rho_low 110 7.6 0.001 0.001 11.701 11.718 calculate_rho_elec 110 8.6 0.048 0.059 11.700 11.717 multiply_cannon_metrocomm3 24660 15.4 0.091 0.098 4.145 11.570 sum_up_and_integrate 110 10.3 0.001 0.002 11.328 11.384 integrate_v_rspace 110 11.3 0.003 0.004 11.300 11.368 init_scf_loop 11 6.9 0.000 0.000 9.084 9.135 qs_ot_get_derivative 99 11.5 0.001 0.001 7.818 8.351 grid_integrate_task_list 110 12.3 7.103 7.486 7.103 7.486 grid_collocate_task_list 110 9.6 6.977 7.459 6.977 7.459 make_m2s 4110 13.4 0.096 0.103 6.299 6.654 prepare_preconditioner 11 7.9 0.000 0.000 6.259 6.325 make_preconditioner 11 8.9 0.000 0.000 6.259 6.325 apply_preconditioner_dbcsr 110 12.6 0.000 0.000 5.807 6.302 apply_single 110 13.6 0.000 0.001 5.807 6.302 qs_ot_get_p 110 10.4 0.001 0.001 5.515 6.149 ot_diis_step 99 11.5 0.013 0.014 6.141 6.141 make_full_inverse_cholesky 11 9.9 0.000 0.000 5.819 6.019 init_scf_run 11 5.9 0.000 0.001 5.741 5.742 scf_env_initial_rho_setup 11 6.9 0.000 0.001 5.741 5.742 make_images 4110 14.4 0.462 0.535 5.320 5.547 density_rs2pw 110 9.6 0.005 0.006 3.787 4.564 fft_wrap_pw1pw2 1111 11.6 0.013 0.016 4.225 4.548 mp_sum_l 10179 13.1 2.557 4.149 2.557 4.149 fft3d_ps 1111 13.6 1.094 1.398 3.557 4.123 wfi_extrapolate 11 7.9 0.001 0.001 4.084 4.084 multiply_cannon_metrocomm4 22605 15.4 0.091 0.102 1.703 3.942 mp_irecv_dv 57340 16.2 1.583 3.805 1.583 3.805 fft_wrap_pw1pw2_140 451 12.1 0.138 0.167 3.571 3.780 make_images_data 4110 15.4 0.063 0.071 3.097 3.452 mp_alltoall_d11v 2046 13.8 2.629 3.253 2.629 3.253 mp_waitany 10164 13.8 2.002 3.215 2.002 3.215 potential_pw2rs 110 12.3 0.010 0.011 3.043 3.120 hybrid_alltoall_any 4261 16.3 0.115 0.434 2.646 3.095 mp_alltoall_z22v 1111 15.6 2.193 2.994 2.193 2.994 transfer_rs2pw 451 10.6 0.006 0.007 2.099 2.988 qs_ot_p2m_diag 48 11.0 0.026 0.044 2.901 2.940 qs_ot_get_derivative_diag 47 12.0 0.001 0.002 2.605 2.883 cp_fm_cholesky_invert 11 10.9 2.753 2.771 2.753 2.771 cp_dbcsr_syevd 48 12.0 0.003 0.004 2.645 2.666 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 2.607 2.611 qs_ot_get_derivative_taylor 52 13.0 0.001 0.002 2.235 2.493 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 2.379 2.446 mp_sum_d 3893 11.9 1.486 2.354 1.486 2.354 transfer_rs2pw_140 121 11.5 0.226 0.262 1.382 2.264 calculate_dm_sparse 110 9.5 0.001 0.001 1.958 2.085 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 1.848 2.063 cp_fm_diag_elpa 48 13.0 0.000 0.000 1.986 1.987 cp_fm_redistribute_end 48 14.0 0.988 1.955 0.995 1.958 cp_fm_cholesky_decompose 22 10.9 1.853 1.914 1.853 1.914 cp_fm_diag_elpa_base 48 14.0 0.868 1.808 0.951 1.911 dbcsr_complete_redistribute 325 12.2 0.337 0.462 1.634 1.893 transfer_pw2rs 451 13.1 0.005 0.006 1.850 1.863 qs_energies_init_hamiltonians 11 5.9 0.000 0.000 1.617 1.755 dbcsr_dot_sd 1091 11.9 0.277 0.349 1.219 1.714 yz_to_x 451 14.2 0.084 0.091 1.261 1.679 qs_ot_get_orbitals 99 10.5 0.001 0.001 1.525 1.612 rs_gather_matrices 110 12.3 0.124 0.143 1.070 1.607 mp_allgather_i34 2055 14.4 0.656 1.574 0.656 1.574 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="204", plot="h2o_128_md", label="(4n/12r/3t)", y=75.522000, yerr=0.000000 PlotPoint: name="205", plot="h2o_128_md_mem", label="(4n/12r/3t)", y=287.272727, yerr=4.002066 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/15/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 830472192 0.0% 100.0% 0.0% flops 22 x 32 x 32 1015021568 0.0% 100.0% 0.0% flops 64 x 64 x 64 1440743424 0.0% 100.0% 0.0% flops 151 x 64 x 64 2340388864 0.0% 100.0% 0.0% flops 173 x 64 x 64 2681372672 0.0% 100.0% 0.0% flops 178 x 64 x 64 2758868992 0.0% 100.0% 0.0% flops 271 x 64 x 64 4200300544 0.0% 100.0% 0.0% flops 64 x 96 x 64 4322230272 0.0% 100.0% 0.0% flops 64 x 64 x 96 4322230272 0.0% 100.0% 0.0% flops 151 x 64 x 96 7021166592 0.0% 100.0% 0.0% flops 151 x 96 x 64 7021166592 0.0% 100.0% 0.0% flops 151 x 64 x 849 7220167680 0.0% 100.0% 0.0% flops 151 x 64 x 853 7254184960 0.0% 100.0% 0.0% flops 151 x 64 x 858 7296706560 0.0% 100.0% 0.0% flops 169 x 64 x 64 7858126848 0.0% 100.0% 0.0% flops 173 x 64 x 96 8044118016 0.0% 100.0% 0.0% flops 173 x 96 x 64 8044118016 0.0% 100.0% 0.0% flops 173 x 64 x 849 8272112640 0.0% 100.0% 0.0% flops 178 x 64 x 96 8276606976 0.0% 100.0% 0.0% flops 178 x 96 x 64 8276606976 0.0% 100.0% 0.0% flops 173 x 64 x 853 8311086080 0.0% 100.0% 0.0% flops 173 x 64 x 858 8359802880 0.0% 100.0% 0.0% flops 178 x 64 x 849 8511191040 0.0% 100.0% 0.0% flops 178 x 64 x 853 8551290880 0.0% 100.0% 0.0% flops 178 x 64 x 858 8601415680 0.0% 100.0% 0.0% flops 32 x 64 x 64 10085203968 0.0% 100.0% 0.0% flops 218 x 64 x 64 10136518656 0.0% 100.0% 0.0% flops 64 x 64 x 849 10710712320 0.0% 100.0% 0.0% flops 64 x 64 x 853 10761175040 0.0% 100.0% 0.0% flops 64 x 64 x 858 10824253440 0.0% 100.0% 0.0% flops 271 x 96 x 64 12600901632 0.0% 100.0% 0.0% flops 271 x 64 x 96 12600901632 0.0% 100.0% 0.0% flops 209 x 64 x 64 12957384704 0.0% 100.0% 0.0% flops 271 x 64 x 849 12958049280 0.0% 100.0% 0.0% flops 64 x 96 x 96 12966690816 0.0% 100.0% 0.0% flops 271 x 64 x 853 13019100160 0.0% 100.0% 0.0% flops 271 x 64 x 858 13095413760 0.0% 100.0% 0.0% flops 151 x 96 x 96 21063499776 0.0% 100.0% 0.0% flops 151 x 96 x 849 21660503040 0.0% 100.0% 0.0% flops 151 x 96 x 853 21762554880 0.0% 100.0% 0.0% flops 151 x 96 x 858 21890119680 0.0% 100.0% 0.0% flops 169 x 96 x 64 23574380544 0.0% 100.0% 0.0% flops 169 x 64 x 96 23574380544 0.0% 100.0% 0.0% flops 173 x 96 x 96 24132354048 0.0% 100.0% 0.0% flops 169 x 64 x 849 24242549760 0.0% 100.0% 0.0% flops 169 x 64 x 853 24356766720 0.0% 100.0% 0.0% flops 169 x 64 x 858 24499537920 0.0% 100.0% 0.0% flops 173 x 96 x 849 24816337920 0.0% 100.0% 0.0% flops 178 x 96 x 96 24829820928 0.0% 100.0% 0.0% flops 173 x 96 x 853 24933258240 0.0% 100.0% 0.0% flops 231 x 64 x 64 25062309888 0.0% 100.0% 0.0% flops 173 x 96 x 858 25079408640 0.0% 100.0% 0.0% flops 178 x 96 x 849 25533573120 0.0% 100.0% 0.0% flops 178 x 96 x 853 25653872640 0.0% 100.0% 0.0% flops 178 x 96 x 858 25804247040 0.0% 100.0% 0.0% flops 32 x 96 x 64 30255611904 0.0% 100.0% 0.0% flops 32 x 64 x 96 30255611904 0.0% 100.0% 0.0% flops 218 x 64 x 96 30409555968 0.0% 100.0% 0.0% flops 218 x 96 x 64 30409555968 0.0% 100.0% 0.0% flops 218 x 64 x 849 31271454720 0.0% 100.0% 0.0% flops 218 x 64 x 853 31418787840 0.0% 100.0% 0.0% flops 218 x 64 x 858 31602954240 0.0% 100.0% 0.0% flops 64 x 96 x 849 32132136960 0.0% 100.0% 0.0% flops 64 x 96 x 853 32283525120 0.0% 100.0% 0.0% flops 64 x 96 x 858 32472760320 0.0% 100.0% 0.0% flops 271 x 96 x 96 37802704896 0.0% 100.0% 0.0% flops 209 x 96 x 64 38872154112 0.0% 100.0% 0.0% flops 209 x 64 x 96 38872154112 0.0% 100.0% 0.0% flops 271 x 96 x 849 38874147840 0.0% 100.0% 0.0% flops 271 x 96 x 853 39057300480 0.0% 100.0% 0.0% flops 271 x 96 x 858 39286241280 0.0% 100.0% 0.0% flops 209 x 64 x 849 39973908480 0.0% 100.0% 0.0% flops 209 x 64 x 853 40162242560 0.0% 100.0% 0.0% flops 209 x 64 x 858 40397660160 0.0% 100.0% 0.0% flops 9 x 9 x 64 67295121408 0.0% 100.0% 0.0% flops 169 x 96 x 96 70723141632 0.0% 100.0% 0.0% flops 169 x 96 x 849 72727649280 0.0% 100.0% 0.0% flops 169 x 96 x 853 73070300160 0.0% 100.0% 0.0% flops 169 x 96 x 858 73498613760 0.0% 100.0% 0.0% flops 32 x 64 x 849 74974986240 0.0% 100.0% 0.0% flops 231 x 64 x 96 75186929664 0.0% 100.0% 0.0% flops 231 x 96 x 64 75186929664 0.0% 100.0% 0.0% flops 32 x 64 x 853 75328225280 0.0% 100.0% 0.0% flops 32 x 64 x 858 75769774080 0.0% 100.0% 0.0% flops 231 x 64 x 849 77317954560 0.0% 100.0% 0.0% flops 231 x 64 x 853 77682232320 0.0% 100.0% 0.0% flops 231 x 64 x 858 78137579520 0.0% 100.0% 0.0% flops 9 x 22 x 64 87348856320 0.0% 100.0% 0.0% flops 22 x 9 x 64 87510601728 0.0% 100.0% 0.0% flops 32 x 96 x 96 90766835712 0.0% 100.0% 0.0% flops 218 x 96 x 96 91228667904 0.0% 100.0% 0.0% flops 218 x 96 x 849 93814364160 0.0% 100.0% 0.0% flops 218 x 96 x 853 94256363520 0.0% 100.0% 0.0% flops 218 x 96 x 858 94808862720 0.0% 100.0% 0.0% flops 22 x 22 x 64 113395453952 0.0% 100.0% 0.0% flops 209 x 96 x 96 116616462336 0.0% 100.0% 0.0% flops 209 x 96 x 849 119921725440 0.0% 100.0% 0.0% flops 209 x 96 x 853 120486727680 0.0% 100.0% 0.0% flops 209 x 96 x 858 121192980480 0.0% 100.0% 0.0% flops 9 x 9 x 96 201885364224 0.0% 100.0% 0.0% flops 32 x 96 x 849 224924958720 0.0% 100.0% 0.0% flops 231 x 96 x 96 225560788992 0.0% 100.0% 0.0% flops 32 x 96 x 853 225984675840 0.0% 100.0% 0.0% flops 32 x 96 x 858 227309322240 0.0% 100.0% 0.0% flops 231 x 96 x 849 231953863680 0.0% 100.0% 0.0% flops 231 x 96 x 853 233046696960 0.0% 100.0% 0.0% flops 231 x 96 x 858 234412738560 0.0% 100.0% 0.0% flops 9 x 22 x 96 262046568960 0.0% 100.0% 0.0% flops 22 x 9 x 96 262531805184 0.0% 100.0% 0.0% flops 22 x 22 x 96 340186361856 0.0% 100.0% 0.0% flops 9 x 32 x 9 1138002296832 0.0% 100.0% 0.0% flops 22 x 32 x 9 1485592289280 0.0% 100.0% 0.0% flops 9 x 32 x 22 1485592289280 0.0% 100.0% 0.0% flops 22 x 32 x 22 1910442074112 0.0% 100.0% 0.0% flops inhomo. stacks 629542526976 100.0% 0.0% 0.0% flops total 12.909090E+12 4.9% 95.1% 0.0% flops max/rank 451.741913E+09 9.0% 91.0% 0.0% matmuls inhomo. stacks 62964 100.0% 0.0% 0.0% matmuls total 562477038 0.0% 100.0% 0.0% number of processed stacks 2497330 2.5% 97.5% 0.0% average stack size 1.0 231.0 0.0 marketing flops 15.646547E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 352.178176E+06 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 739800 MPI messages size (bytes): total size 565.223162E+09 min size 0.000000E+00 max size 5.889312E+06 average size 764.021562E+03 MPI breakdown and total messages size (bytes): size <= 128 5610 0 128 < size <= 8192 0 0 8192 < size <= 32768 37270 1217658880 32768 < size <= 131072 295520 18156748800 131072 < size <= 4194304 335340 195349708800 4194304 < size <= 16777216 66060 350485115840 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 115 12. MP_Allreduce 11133 25. MP_Alltoall 8043 133840. MP_ISend 49276 334981. MP_IRecv 49276 330552. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3473 66308. MP_Allreduce 9775 564. MP_Sync 52 MP_Alltoall 1717 4416898. MP_SendRecv 7700 27936. MP_ISendRecv 7700 27936. MP_Wait 17864 MP_ISend 8316 219755. MP_IRecv 8316 219755. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.062 0.118 80.983 80.986 qs_mol_dyn_low 1 2.0 0.071 0.145 79.357 79.392 qs_forces 11 3.9 0.003 0.004 78.604 78.762 qs_energies 11 4.9 0.001 0.002 74.205 74.381 scf_env_do_scf 11 5.9 0.001 0.003 64.930 64.940 scf_env_do_scf_inner_loop 99 6.5 0.003 0.023 56.486 56.496 velocity_verlet 10 3.0 0.001 0.001 45.542 45.573 dbcsr_multiply_generic 2055 12.4 0.168 0.213 29.512 30.444 qs_scf_new_mos 99 7.5 0.001 0.001 26.882 27.306 qs_scf_loop_do_ot 99 8.5 0.001 0.002 26.881 27.305 ot_scf_mini 99 9.5 0.003 0.004 24.955 25.301 rebuild_ks_matrix 110 8.3 0.001 0.001 20.959 21.504 qs_ks_build_kohn_sham_matrix 110 9.3 0.016 0.018 20.958 21.503 multiply_cannon 2055 13.4 0.217 0.245 18.953 20.891 qs_ks_update_qs_env 110 7.6 0.001 0.001 18.572 19.049 multiply_cannon_loop 2055 14.4 0.158 0.201 16.572 18.574 qs_rho_update_rho_low 110 7.6 0.001 0.001 14.685 14.790 calculate_rho_elec 110 8.6 0.065 0.073 14.685 14.789 multiply_cannon_multrec 12330 15.4 12.070 14.425 12.089 14.444 ot_mini 99 10.5 0.001 0.001 13.303 13.688 sum_up_and_integrate 110 10.3 0.003 0.006 13.185 13.207 integrate_v_rspace 110 11.3 0.003 0.005 13.139 13.162 mp_waitall_1 141068 16.5 6.707 11.215 6.707 11.215 grid_collocate_task_list 110 9.6 9.537 9.831 9.537 9.831 grid_integrate_task_list 110 12.3 8.671 9.055 8.671 9.055 init_scf_loop 11 6.9 0.000 0.000 8.391 8.396 qs_ot_get_derivative 99 11.5 0.001 0.002 7.412 7.766 make_m2s 4110 13.4 0.083 0.105 7.310 7.736 make_images 4110 14.4 0.668 0.870 6.424 6.826 init_scf_run 11 5.9 0.000 0.007 6.595 6.595 scf_env_initial_rho_setup 11 6.9 0.000 0.005 6.594 6.595 qs_ot_get_p 110 10.4 0.001 0.001 5.834 6.310 apply_preconditioner_dbcsr 110 12.6 0.000 0.000 5.593 6.016 apply_single 110 13.6 0.001 0.001 5.592 6.015 ot_diis_step 99 11.5 0.014 0.030 5.852 5.852 multiply_cannon_metrocomm3 12330 15.4 0.042 0.089 1.739 5.463 prepare_preconditioner 11 7.9 0.000 0.000 5.345 5.398 make_preconditioner 11 8.9 0.000 0.000 5.345 5.398 make_full_inverse_cholesky 11 9.9 0.000 0.000 4.899 4.980 fft_wrap_pw1pw2 1111 11.6 0.017 0.021 4.736 4.801 wfi_extrapolate 11 7.9 0.001 0.002 4.400 4.400 density_rs2pw 110 9.6 0.006 0.010 4.035 4.312 make_images_data 4110 15.4 0.056 0.087 3.585 4.228 fft_wrap_pw1pw2_140 451 12.1 0.211 0.234 4.075 4.178 hybrid_alltoall_any 4261 16.3 0.182 0.993 3.400 4.123 fft3d_ps 1111 13.6 1.562 1.658 3.790 3.856 mp_alltoall_d11v 2046 13.8 3.372 3.635 3.372 3.635 potential_pw2rs 110 12.3 0.014 0.019 3.221 3.251 qs_ot_p2m_diag 48 11.0 0.034 0.048 3.204 3.249 mp_sum_l 10179 13.1 1.912 3.023 1.912 3.023 cp_dbcsr_syevd 48 12.0 0.003 0.004 2.944 2.971 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 2.692 2.763 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.003 2.753 2.759 mp_allgather_i34 2055 14.4 1.037 2.742 1.037 2.742 qs_ot_get_derivative_diag 47 12.0 0.001 0.002 2.392 2.634 qs_ot_get_derivative_taylor 52 13.0 0.002 0.002 2.370 2.537 multiply_cannon_metrocomm1 12330 15.4 0.048 0.091 1.464 2.493 cp_fm_cholesky_invert 11 10.9 2.433 2.444 2.433 2.444 transfer_rs2pw 451 10.6 0.006 0.008 1.954 2.440 calculate_dm_sparse 110 9.5 0.001 0.001 2.177 2.321 mp_waitany 8316 13.8 1.842 2.205 1.842 2.205 qs_energies_init_hamiltonians 11 5.9 0.000 0.000 2.013 2.171 cp_fm_diag_elpa 48 13.0 0.000 0.000 2.135 2.136 cp_fm_diag_elpa_base 48 14.0 2.091 2.095 2.129 2.129 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 1.869 2.028 mp_alltoall_z22v 1111 15.6 1.847 1.999 1.847 1.999 mp_irecv_dv 30428 16.1 0.923 1.985 0.923 1.985 calculate_first_density_matrix 1 7.0 0.000 0.000 1.963 1.965 transfer_pw2rs 451 13.1 0.006 0.008 1.934 1.948 dbcsr_complete_redistribute 325 12.2 0.437 0.552 1.807 1.896 mp_sum_d 3893 11.9 1.307 1.894 1.307 1.894 multiply_cannon_metrocomm4 10275 15.4 0.040 0.083 0.900 1.845 dbcsr_dot_sd 1091 11.9 0.376 0.427 1.188 1.793 make_basis_sm 11 9.8 0.000 0.000 1.779 1.782 transfer_rs2pw_140 121 11.5 0.237 0.268 1.269 1.758 make_images_sizes 4110 15.4 0.006 0.013 1.194 1.696 mp_alltoall_i44 4110 16.4 1.188 1.690 1.188 1.690 cp_fm_cholesky_decompose 22 10.9 1.656 1.686 1.656 1.686 copy_dbcsr_to_fm 151 11.3 0.003 0.004 1.548 1.666 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="206", plot="h2o_128_md", label="(4n/9r/4t)", y=80.986000, yerr=0.000000 PlotPoint: name="207", plot="h2o_128_md_mem", label="(4n/9r/4t)", y=330.909091, yerr=4.679832 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/16/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 830472192 0.0% 100.0% 0.0% flops 22 x 32 x 32 1015021568 0.0% 100.0% 0.0% flops 129 x 64 x 409 1485749760 0.0% 100.0% 0.0% flops 151 x 64 x 409 1739133440 0.0% 100.0% 0.0% flops 174 x 64 x 409 2004034560 0.0% 100.0% 0.0% flops 200 x 64 x 409 2303488000 0.0% 100.0% 0.0% flops 209 x 64 x 409 2407144960 0.0% 100.0% 0.0% flops 218 x 64 x 409 2510801920 0.0% 100.0% 0.0% flops 160 x 64 x 409 3685580800 0.0% 100.0% 0.0% flops 129 x 64 x 32 3998810112 0.0% 100.0% 0.0% flops 129 x 64 x 64 3998810112 0.0% 100.0% 0.0% flops 187 x 64 x 409 4307522560 0.0% 100.0% 0.0% flops 129 x 96 x 409 4457249280 0.0% 100.0% 0.0% flops 129 x 64 x 418 4555330560 0.0% 100.0% 0.0% flops 151 x 64 x 32 4680777728 0.0% 100.0% 0.0% flops 151 x 64 x 64 4680777728 0.0% 100.0% 0.0% flops 64 x 64 x 409 5159813120 0.0% 100.0% 0.0% flops 151 x 96 x 409 5217400320 0.0% 100.0% 0.0% flops 231 x 64 x 409 5321057280 0.0% 100.0% 0.0% flops 151 x 64 x 418 5332208640 0.0% 100.0% 0.0% flops 174 x 64 x 32 5393743872 0.0% 100.0% 0.0% flops 174 x 64 x 64 5393743872 0.0% 100.0% 0.0% flops 64 x 64 x 32 5762973696 0.0% 100.0% 0.0% flops 64 x 64 x 64 5762973696 0.0% 100.0% 0.0% flops 174 x 96 x 409 6012103680 0.0% 100.0% 0.0% flops 262 x 64 x 409 6035138560 0.0% 100.0% 0.0% flops 174 x 64 x 418 6144399360 0.0% 100.0% 0.0% flops 200 x 64 x 32 6199705600 0.0% 100.0% 0.0% flops 200 x 64 x 64 6199705600 0.0% 100.0% 0.0% flops 209 x 64 x 32 6478692352 0.0% 100.0% 0.0% flops 209 x 64 x 64 6478692352 0.0% 100.0% 0.0% flops 218 x 64 x 32 6757679104 0.0% 100.0% 0.0% flops 218 x 64 x 64 6757679104 0.0% 100.0% 0.0% flops 200 x 96 x 409 6910464000 0.0% 100.0% 0.0% flops 200 x 64 x 418 7062528000 0.0% 100.0% 0.0% flops 209 x 96 x 409 7221434880 0.0% 100.0% 0.0% flops 209 x 64 x 418 7380341760 0.0% 100.0% 0.0% flops 218 x 96 x 409 7532405760 0.0% 100.0% 0.0% flops 218 x 64 x 418 7698155520 0.0% 100.0% 0.0% flops 160 x 64 x 32 9919528960 0.0% 100.0% 0.0% flops 160 x 64 x 64 9919528960 0.0% 100.0% 0.0% flops 129 x 64 x 431 10959674880 0.0% 100.0% 0.0% flops 160 x 96 x 409 11056742400 0.0% 100.0% 0.0% flops 160 x 64 x 418 11300044800 0.0% 100.0% 0.0% flops 187 x 64 x 32 11593449472 0.0% 100.0% 0.0% flops 187 x 64 x 64 11593449472 0.0% 100.0% 0.0% flops 129 x 96 x 64 11996430336 0.0% 100.0% 0.0% flops 129 x 96 x 32 11996430336 0.0% 100.0% 0.0% flops 151 x 64 x 431 12828766720 0.0% 100.0% 0.0% flops 187 x 96 x 409 12922567680 0.0% 100.0% 0.0% flops 187 x 64 x 418 13206927360 0.0% 100.0% 0.0% flops 129 x 96 x 418 13665991680 0.0% 100.0% 0.0% flops 151 x 96 x 64 14042333184 0.0% 100.0% 0.0% flops 151 x 96 x 32 14042333184 0.0% 100.0% 0.0% flops 231 x 64 x 32 14321319936 0.0% 100.0% 0.0% flops 231 x 64 x 64 14321319936 0.0% 100.0% 0.0% flops 174 x 64 x 431 14782817280 0.0% 100.0% 0.0% flops 32 x 64 x 409 15479439360 0.0% 100.0% 0.0% flops 64 x 96 x 409 15479439360 0.0% 100.0% 0.0% flops 64 x 64 x 418 15820062720 0.0% 100.0% 0.0% flops 231 x 96 x 409 15963171840 0.0% 100.0% 0.0% flops 151 x 96 x 418 15996625920 0.0% 100.0% 0.0% flops 174 x 96 x 64 16181231616 0.0% 100.0% 0.0% flops 174 x 96 x 32 16181231616 0.0% 100.0% 0.0% flops 262 x 64 x 32 16243228672 0.0% 100.0% 0.0% flops 262 x 64 x 64 16243228672 0.0% 100.0% 0.0% flops 231 x 64 x 418 16314439680 0.0% 100.0% 0.0% flops 200 x 64 x 431 16991744000 0.0% 100.0% 0.0% flops 32 x 64 x 32 17288921088 0.0% 100.0% 0.0% flops 32 x 64 x 64 17288921088 0.0% 100.0% 0.0% flops 64 x 96 x 64 17288921088 0.0% 100.0% 0.0% flops 64 x 96 x 32 17288921088 0.0% 100.0% 0.0% flops 209 x 64 x 431 17756372480 0.0% 100.0% 0.0% flops 262 x 96 x 409 18105415680 0.0% 100.0% 0.0% flops 174 x 96 x 418 18433198080 0.0% 100.0% 0.0% flops 262 x 64 x 418 18503823360 0.0% 100.0% 0.0% flops 218 x 64 x 431 18521000960 0.0% 100.0% 0.0% flops 200 x 96 x 64 18599116800 0.0% 100.0% 0.0% flops 200 x 96 x 32 18599116800 0.0% 100.0% 0.0% flops 209 x 96 x 64 19436077056 0.0% 100.0% 0.0% flops 209 x 96 x 32 19436077056 0.0% 100.0% 0.0% flops 218 x 96 x 64 20273037312 0.0% 100.0% 0.0% flops 218 x 96 x 32 20273037312 0.0% 100.0% 0.0% flops 200 x 96 x 418 21187584000 0.0% 100.0% 0.0% flops 209 x 96 x 418 22141025280 0.0% 100.0% 0.0% flops 218 x 96 x 418 23094466560 0.0% 100.0% 0.0% flops 160 x 64 x 431 27186790400 0.0% 100.0% 0.0% flops 160 x 96 x 64 29758586880 0.0% 100.0% 0.0% flops 160 x 96 x 32 29758586880 0.0% 100.0% 0.0% flops 187 x 64 x 431 31774561280 0.0% 100.0% 0.0% flops 129 x 96 x 431 32879024640 0.0% 100.0% 0.0% flops 160 x 96 x 418 33900134400 0.0% 100.0% 0.0% flops 187 x 96 x 64 34780348416 0.0% 100.0% 0.0% flops 187 x 96 x 32 34780348416 0.0% 100.0% 0.0% flops 64 x 64 x 431 38061506560 0.0% 100.0% 0.0% flops 151 x 96 x 431 38486300160 0.0% 100.0% 0.0% flops 231 x 64 x 431 39250928640 0.0% 100.0% 0.0% flops 187 x 96 x 418 39620782080 0.0% 100.0% 0.0% flops 231 x 96 x 64 42963959808 0.0% 100.0% 0.0% flops 231 x 96 x 32 42963959808 0.0% 100.0% 0.0% flops 174 x 96 x 431 44348451840 0.0% 100.0% 0.0% flops 262 x 64 x 431 44518369280 0.0% 100.0% 0.0% flops 32 x 96 x 409 46438318080 0.0% 100.0% 0.0% flops 32 x 64 x 418 47460188160 0.0% 100.0% 0.0% flops 64 x 96 x 418 47460188160 0.0% 100.0% 0.0% flops 262 x 96 x 64 48729686016 0.0% 100.0% 0.0% flops 262 x 96 x 32 48729686016 0.0% 100.0% 0.0% flops 231 x 96 x 418 48943319040 0.0% 100.0% 0.0% flops 200 x 96 x 431 50975232000 0.0% 100.0% 0.0% flops 32 x 96 x 64 51866763264 0.0% 100.0% 0.0% flops 32 x 96 x 32 51866763264 0.0% 100.0% 0.0% flops 209 x 96 x 431 53269117440 0.0% 100.0% 0.0% flops 262 x 96 x 418 55511470080 0.0% 100.0% 0.0% flops 218 x 96 x 431 55563002880 0.0% 100.0% 0.0% flops 160 x 96 x 431 81560371200 0.0% 100.0% 0.0% flops 187 x 96 x 431 95323683840 0.0% 100.0% 0.0% flops 32 x 64 x 431 114184519680 0.0% 100.0% 0.0% flops 64 x 96 x 431 114184519680 0.0% 100.0% 0.0% flops 231 x 96 x 431 117752785920 0.0% 100.0% 0.0% flops 262 x 96 x 431 133555107840 0.0% 100.0% 0.0% flops 9 x 9 x 64 134590242816 0.0% 100.0% 0.0% flops 9 x 9 x 32 134590242816 0.0% 100.0% 0.0% flops 32 x 96 x 418 142380564480 0.0% 100.0% 0.0% flops 9 x 22 x 64 174697712640 0.0% 100.0% 0.0% flops 9 x 22 x 32 174697712640 0.0% 100.0% 0.0% flops 22 x 9 x 64 175021203456 0.0% 100.0% 0.0% flops 22 x 9 x 32 175021203456 0.0% 100.0% 0.0% flops 22 x 22 x 64 226790907904 0.0% 100.0% 0.0% flops 22 x 22 x 32 226790907904 0.0% 100.0% 0.0% flops 32 x 96 x 431 342553559040 0.0% 100.0% 0.0% flops 9 x 32 x 9 1138002296832 0.0% 100.0% 0.0% flops 22 x 32 x 9 1485592289280 0.0% 100.0% 0.0% flops 9 x 32 x 22 1485592289280 0.0% 100.0% 0.0% flops 22 x 32 x 22 1910442074112 0.0% 100.0% 0.0% flops inhomo. stacks 2770741067776 100.0% 0.0% 0.0% flops total 13.483664E+12 20.5% 79.5% 0.0% flops max/rank 678.224219E+09 23.3% 76.7% 0.0% matmuls inhomo. stacks 461340 100.0% 0.0% 0.0% matmuls total 609143868 0.1% 99.9% 0.0% number of processed stacks 4707072 9.8% 90.2% 0.0% average stack size 1.0 143.4 0.0 marketing flops 15.646302E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 455.569408E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 937080 MPI messages size (bytes): total size 523.723932E+09 min size 0.000000E+00 max size 4.537280E+06 average size 558.889250E+03 MPI breakdown and total messages size (bytes): size <= 128 6996 0 128 < size <= 8192 264 2162688 8192 < size <= 32768 304932 8165326848 32768 < size <= 131072 110640 6338641920 131072 < size <= 4194304 489498 400769458320 4194304 < size <= 16777216 24750 108449092400 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3473 66419. MP_Allreduce 9774 603. MP_Sync 52 MP_Alltoall 1496 5863162. MP_SendRecv 5060 43184. MP_ISendRecv 5060 43184. MP_Wait 20042 MP_ISend 13376 163145. MP_IRecv 13376 163145. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.229 0.444 95.310 95.323 qs_mol_dyn_low 1 2.0 0.132 0.249 93.344 93.352 qs_forces 11 3.9 0.012 0.041 90.025 90.284 qs_energies 11 4.9 0.003 0.010 85.490 85.752 scf_env_do_scf 11 5.9 0.001 0.003 75.626 75.641 scf_env_do_scf_inner_loop 99 6.5 0.004 0.026 60.962 60.964 velocity_verlet 10 3.0 0.006 0.020 55.714 55.837 dbcsr_multiply_generic 2055 12.4 0.182 0.196 33.451 34.203 qs_scf_new_mos 99 7.5 0.001 0.001 29.963 30.415 qs_scf_loop_do_ot 99 8.5 0.001 0.001 29.962 30.414 ot_scf_mini 99 9.5 0.003 0.005 27.877 28.312 rebuild_ks_matrix 110 8.3 0.001 0.001 21.951 22.433 qs_ks_build_kohn_sham_matrix 110 9.3 0.014 0.015 21.950 22.433 multiply_cannon 2055 13.4 0.239 0.260 20.301 22.045 qs_ks_update_qs_env 110 7.6 0.001 0.001 19.537 19.977 multiply_cannon_loop 2055 14.4 0.277 0.297 17.900 19.532 multiply_cannon_multrec 24660 15.4 13.774 16.417 13.795 16.438 qs_rho_update_rho_low 110 7.6 0.001 0.001 15.396 15.424 calculate_rho_elec 110 8.6 0.097 0.102 15.395 15.423 ot_mini 99 10.5 0.001 0.003 14.905 15.369 init_scf_loop 11 6.9 0.005 0.018 14.610 14.613 sum_up_and_integrate 110 10.3 0.002 0.003 13.212 13.227 integrate_v_rspace 110 11.3 0.003 0.004 13.164 13.178 prepare_preconditioner 11 7.9 0.000 0.000 11.356 11.406 make_preconditioner 11 8.9 0.002 0.012 11.355 11.406 make_full_inverse_cholesky 11 9.9 0.000 0.000 9.317 10.899 grid_collocate_task_list 110 9.6 10.374 10.823 10.374 10.823 make_m2s 4110 13.4 0.101 0.105 9.432 9.800 grid_integrate_task_list 110 12.3 8.974 9.221 8.974 9.221 mp_waitall_1 121746 16.5 6.261 8.748 6.261 8.748 make_images 4110 14.4 0.896 1.164 8.088 8.417 qs_ot_get_derivative 99 11.5 0.002 0.004 7.898 8.348 apply_preconditioner_dbcsr 110 12.6 0.001 0.002 6.656 7.113 apply_single 110 13.6 0.001 0.001 6.655 7.113 ot_diis_step 99 11.5 0.015 0.019 6.967 6.968 qs_ot_get_p 110 10.4 0.001 0.001 6.055 6.565 init_scf_run 11 5.9 0.000 0.005 6.302 6.303 scf_env_initial_rho_setup 11 6.9 0.001 0.004 6.302 6.303 cp_fm_upper_to_full 70 14.2 3.700 5.336 3.700 5.336 make_images_data 4110 15.4 0.063 0.068 4.501 5.060 fft_wrap_pw1pw2 1111 11.6 0.016 0.021 4.824 4.869 hybrid_alltoall_any 4261 16.3 0.147 0.479 4.024 4.763 dbcsr_complete_redistribute 325 12.2 0.505 0.605 3.303 4.655 wfi_extrapolate 11 7.9 0.001 0.001 4.635 4.636 density_rs2pw 110 9.6 0.006 0.006 3.940 4.298 fft_wrap_pw1pw2_140 451 12.1 0.244 0.259 4.208 4.275 multiply_cannon_metrocomm3 24660 15.4 0.049 0.052 1.870 4.030 copy_fm_to_dbcsr 174 11.2 0.001 0.001 2.566 3.887 fft3d_ps 1111 13.6 1.596 1.757 3.803 3.838 mp_sum_l 10179 13.1 2.243 3.727 2.243 3.727 mp_alltoall_i22 605 13.7 2.049 3.454 2.049 3.454 qs_ot_p2m_diag 48 11.0 0.050 0.062 3.384 3.416 transfer_fm_to_dbcsr 11 9.9 0.002 0.008 2.021 3.322 mp_alltoall_d11v 2046 13.8 3.001 3.209 3.001 3.209 cp_dbcsr_syevd 48 12.0 0.003 0.003 3.097 3.124 multiply_cannon_metrocomm4 20550 15.4 0.075 0.080 1.509 3.057 mp_sum_dm 438 4.9 2.863 3.044 2.863 3.044 qs_ot_get_derivative_diag 47 12.0 0.001 0.002 2.714 3.017 potential_pw2rs 110 12.3 0.018 0.019 3.000 3.012 qs_energies_init_hamiltonians 11 5.9 0.000 0.001 2.749 3.006 cp_fm_cholesky_invert 11 10.9 2.959 2.974 2.959 2.974 mp_irecv_dv 62702 16.1 1.424 2.966 1.424 2.966 md_output 10 3.0 0.001 0.001 0.192 2.948 update_particle_set 20 4.0 0.010 0.038 2.768 2.899 md_write_output 11 3.9 0.121 2.850 0.145 2.893 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 2.748 2.797 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 2.729 2.735 mp_waitany 13376 13.8 2.009 2.659 2.009 2.659 transfer_rs2pw 451 10.6 0.006 0.006 2.044 2.632 qs_ot_get_derivative_taylor 52 13.0 0.002 0.002 2.311 2.493 calculate_dm_sparse 110 9.5 0.001 0.001 2.352 2.470 cp_fm_diag_elpa 48 13.0 0.000 0.000 2.394 2.396 cp_fm_diag_elpa_base 48 14.0 2.283 2.318 2.391 2.391 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 2.053 2.198 transfer_rs2pw_140 121 11.5 0.205 0.236 1.494 2.069 mp_alltoall_z22v 1111 15.6 1.903 2.036 1.903 2.036 cp_fm_cholesky_decompose 22 10.9 1.983 2.028 1.983 2.028 mp_sum_d 3891 11.9 1.380 2.022 1.380 2.022 mp_allgather_i34 2055 14.4 0.869 2.019 0.869 2.019 qs_env_update_s_mstruct 11 6.9 0.001 0.003 1.602 2.001 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="208", plot="h2o_128_md", label="(4n/6r/6t)", y=95.323000, yerr=0.000000 PlotPoint: name="209", plot="h2o_128_md_mem", label="(4n/6r/6t)", y=428.454545, yerr=10.138707 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/17/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 830472192 0.0% 100.0% 0.0% flops 22 x 32 x 32 1015021568 0.0% 100.0% 0.0% flops 89 x 128 x 128 22070951936 0.0% 100.0% 0.0% flops 89 x 128 x 1280 51327795200 0.0% 100.0% 0.0% flops 80 x 128 x 128 59517173760 0.0% 100.0% 0.0% flops 151 x 128 x 128 74892443648 0.0% 100.0% 0.0% flops 182 x 128 x 128 90267713536 0.0% 100.0% 0.0% flops 32 x 128 x 128 92207579136 0.0% 100.0% 0.0% flops 64 x 128 x 128 92207579136 0.0% 100.0% 0.0% flops 80 x 128 x 1280 138412032000 0.0% 100.0% 0.0% flops 151 x 128 x 1280 174168473600 0.0% 100.0% 0.0% flops 160 x 128 x 128 198390579200 0.0% 100.0% 0.0% flops 182 x 128 x 1280 209924915200 0.0% 100.0% 0.0% flops 129 x 128 x 128 255923847168 0.0% 100.0% 0.0% flops 9 x 9 x 128 269180485632 0.0% 100.0% 0.0% flops 9 x 22 x 128 349395425280 0.0% 100.0% 0.0% flops 22 x 9 x 128 350042406912 0.0% 100.0% 0.0% flops 22 x 22 x 128 453581815808 0.0% 100.0% 0.0% flops 160 x 128 x 1280 461373440000 0.0% 100.0% 0.0% flops 32 x 128 x 1280 516738252800 0.0% 100.0% 0.0% flops 64 x 128 x 1280 516738252800 0.0% 100.0% 0.0% flops 129 x 128 x 1280 595171737600 0.0% 100.0% 0.0% flops 9 x 32 x 9 1138002296832 0.0% 100.0% 0.0% flops 22 x 32 x 9 1485592289280 0.0% 100.0% 0.0% flops 9 x 32 x 22 1485592289280 0.0% 100.0% 0.0% flops 22 x 32 x 22 1910442074112 0.0% 100.0% 0.0% flops inhomo. stacks 2199488299008 100.0% 0.0% 0.0% flops total 13.192496E+12 16.7% 83.3% 0.0% flops max/rank 860.633972E+09 17.4% 82.6% 0.0% matmuls inhomo. stacks 139920 100.0% 0.0% 0.0% matmuls total 546715604 0.0% 100.0% 0.0% number of processed stacks 1575496 8.9% 91.1% 0.0% average stack size 1.0 380.7 0.0 marketing flops 15.646297E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 602.476544E+06 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 197280 MPI messages size (bytes): total size 339.125567E+09 min size 0.000000E+00 max size 13.107200E+06 average size 1.719006E+06 MPI breakdown and total messages size (bytes): size <= 128 1452 0 128 < size <= 8192 0 0 8192 < size <= 32768 132 4325376 32768 < size <= 131072 88656 11620319232 131072 < size <= 4194304 89424 117209825280 4194304 < size <= 16777216 17616 210291069504 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 27 12. MP_Allreduce 10957 25. MP_Alltoall 8043 258767. MP_ISend 32836 652428. MP_IRecv 32836 652812. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3473 66417. MP_Allreduce 9774 644. MP_Sync 52 MP_Alltoall 1496 8504061. MP_SendRecv 3300 54848. MP_ISendRecv 3300 54848. MP_Wait 13926 MP_ISend 9240 278857. MP_IRecv 9240 278857. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.172 0.242 89.956 89.964 qs_mol_dyn_low 1 2.0 0.121 0.228 84.948 87.980 qs_forces 11 3.9 0.024 0.058 83.526 83.714 qs_energies 11 4.9 0.019 0.073 78.983 79.181 scf_env_do_scf 11 5.9 0.001 0.002 68.775 68.776 scf_env_do_scf_inner_loop 99 6.5 0.004 0.018 57.744 57.748 velocity_verlet 10 3.0 0.042 0.134 48.259 48.328 dbcsr_multiply_generic 2055 12.4 0.164 0.211 29.456 29.647 qs_scf_new_mos 99 7.5 0.001 0.002 27.749 27.834 qs_scf_loop_do_ot 99 8.5 0.001 0.001 27.749 27.833 ot_scf_mini 99 9.5 0.004 0.004 25.790 25.872 rebuild_ks_matrix 110 8.3 0.001 0.001 20.753 20.870 qs_ks_build_kohn_sham_matrix 110 9.3 0.014 0.015 20.752 20.869 multiply_cannon 2055 13.4 0.224 0.257 18.715 20.128 qs_ks_update_qs_env 110 7.6 0.001 0.001 18.426 18.546 multiply_cannon_loop 2055 14.4 0.153 0.221 16.752 17.139 qs_rho_update_rho_low 110 7.6 0.001 0.001 15.489 15.509 calculate_rho_elec 110 8.6 0.145 0.146 15.488 15.509 multiply_cannon_multrec 8220 15.4 13.373 14.206 13.394 14.226 ot_mini 99 10.5 0.001 0.002 13.754 13.853 sum_up_and_integrate 110 10.3 0.002 0.002 12.741 12.797 integrate_v_rspace 110 11.3 0.003 0.004 12.697 12.753 init_scf_loop 11 6.9 0.001 0.003 10.968 10.974 grid_collocate_task_list 110 9.6 9.883 10.371 9.883 10.371 grid_integrate_task_list 110 12.3 8.321 8.570 8.321 8.570 prepare_preconditioner 11 7.9 0.000 0.000 7.879 7.902 make_preconditioner 11 8.9 0.000 0.001 7.879 7.902 qs_ot_get_derivative 99 11.5 0.001 0.002 7.767 7.854 make_m2s 4110 13.4 0.079 0.114 7.620 7.827 make_full_inverse_cholesky 11 9.9 0.000 0.000 7.114 7.243 init_scf_run 11 5.9 0.001 0.005 6.697 6.698 scf_env_initial_rho_setup 11 6.9 0.001 0.002 6.697 6.698 qs_ot_get_p 110 10.4 0.001 0.002 6.595 6.651 make_images 4110 14.4 0.950 1.039 6.262 6.512 fft_wrap_pw1pw2 1111 11.6 0.015 0.019 5.996 6.177 mp_waitall_1 103326 16.6 5.412 6.176 5.412 6.176 ot_diis_step 99 11.5 0.028 0.042 5.941 5.942 apply_preconditioner_dbcsr 110 12.6 0.000 0.001 5.684 5.805 apply_single 110 13.6 0.000 0.001 5.684 5.804 density_rs2pw 110 9.6 0.005 0.008 4.556 5.591 fft_wrap_pw1pw2_140 451 12.1 0.300 0.315 5.130 5.482 fft3d_ps 1111 13.6 1.906 2.210 4.859 4.983 wfi_extrapolate 11 7.9 0.001 0.001 4.605 4.605 qs_ot_p2m_diag 48 11.0 0.072 0.081 3.948 3.966 make_images_data 4110 15.4 0.055 0.086 3.383 3.825 hybrid_alltoall_any 4261 16.3 0.333 1.325 3.332 3.802 md_output 10 3.0 0.009 0.013 0.321 3.668 cp_dbcsr_syevd 48 12.0 0.003 0.004 3.606 3.618 mp_max_l 33 2.8 3.166 3.523 3.166 3.523 cp_fm_cholesky_invert 11 10.9 3.466 3.479 3.466 3.479 mp_alltoall_d11v 2046 13.8 2.893 3.259 2.893 3.259 potential_pw2rs 110 12.3 0.021 0.025 3.077 3.099 qs_energies_init_hamiltonians 11 5.9 0.001 0.002 2.771 2.972 transfer_rs2pw 451 10.6 0.006 0.007 1.847 2.933 cp_fm_diag_elpa 48 13.0 0.000 0.000 2.915 2.916 cp_fm_diag_elpa_base 48 14.0 2.820 2.856 2.911 2.911 mp_alltoall_z22v 1111 15.6 2.570 2.793 2.570 2.793 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 2.735 2.758 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 2.739 2.745 mp_waitany 9240 13.8 1.757 2.740 1.757 2.740 qs_ot_get_derivative_diag 47 12.0 0.001 0.002 2.594 2.658 dbcsr_complete_redistribute 325 12.2 0.803 0.975 2.520 2.652 write_restart 10 4.0 0.162 2.548 0.199 2.550 transfer_rs2pw_140 121 11.5 0.192 0.225 1.341 2.423 calculate_dm_sparse 110 9.5 0.001 0.001 2.248 2.323 mp_allgather_i34 2055 14.4 0.952 2.307 0.952 2.307 qs_ot_get_derivative_taylor 52 13.0 0.002 0.002 2.118 2.155 cp_fm_cholesky_decompose 22 10.9 2.090 2.108 2.090 2.108 mp_sum_l 10179 13.1 1.632 2.024 1.632 2.024 multiply_cannon_metrocomm1 8220 15.4 0.030 0.058 1.403 2.024 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 1.982 2.022 copy_dbcsr_to_fm 151 11.3 0.003 0.004 1.976 2.021 qs_env_update_s_mstruct 11 6.9 0.000 0.001 1.664 1.975 make_basis_sm 11 9.8 0.000 0.000 1.810 1.813 calculate_first_density_matrix 1 7.0 0.000 0.001 1.805 1.807 yz_to_x 561 14.1 0.122 0.169 1.645 1.802 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="210", plot="h2o_128_md", label="(4n/4r/9t)", y=89.964000, yerr=0.000000 PlotPoint: name="211", plot="h2o_128_md_mem", label="(4n/4r/9t)", y=558.000000, yerr=17.766158 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/18/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 830472192 0.0% 100.0% 0.0% flops 22 x 32 x 32 1015021568 0.0% 100.0% 0.0% flops 71 x 64 x 64 1100447744 0.0% 100.0% 0.0% flops 64 x 64 x 64 1440743424 0.0% 100.0% 0.0% flops 71 x 64 x 840 1679462400 0.0% 100.0% 0.0% flops 71 x 64 x 849 1697456640 0.0% 100.0% 0.0% flops 129 x 64 x 64 1999405056 0.0% 100.0% 0.0% flops 129 x 64 x 840 3051417600 0.0% 100.0% 0.0% flops 129 x 64 x 849 3084111360 0.0% 100.0% 0.0% flops 71 x 64 x 96 3301343232 0.0% 100.0% 0.0% flops 71 x 96 x 64 3301343232 0.0% 100.0% 0.0% flops 71 x 64 x 858 3430901760 0.0% 100.0% 0.0% flops 64 x 96 x 64 4322230272 0.0% 100.0% 0.0% flops 64 x 64 x 96 4322230272 0.0% 100.0% 0.0% flops 71 x 96 x 840 5038387200 0.0% 100.0% 0.0% flops 71 x 96 x 849 5092369920 0.0% 100.0% 0.0% flops 64 x 64 x 840 5298585600 0.0% 100.0% 0.0% flops 64 x 64 x 849 5355356160 0.0% 100.0% 0.0% flops 129 x 96 x 64 5998215168 0.0% 100.0% 0.0% flops 129 x 64 x 96 5998215168 0.0% 100.0% 0.0% flops 129 x 64 x 858 6233610240 0.0% 100.0% 0.0% flops 209 x 64 x 64 6478692352 0.0% 100.0% 0.0% flops 222 x 64 x 64 6881673216 0.0% 100.0% 0.0% flops 129 x 96 x 840 9154252800 0.0% 100.0% 0.0% flops 129 x 96 x 849 9252334080 0.0% 100.0% 0.0% flops 209 x 64 x 840 9887539200 0.0% 100.0% 0.0% flops 71 x 96 x 96 9904029696 0.0% 100.0% 0.0% flops 209 x 64 x 849 9993477120 0.0% 100.0% 0.0% flops 32 x 64 x 64 10085203968 0.0% 100.0% 0.0% flops 71 x 96 x 858 10292705280 0.0% 100.0% 0.0% flops 222 x 64 x 840 10502553600 0.0% 100.0% 0.0% flops 222 x 64 x 849 10615080960 0.0% 100.0% 0.0% flops 231 x 64 x 64 10740989952 0.0% 100.0% 0.0% flops 64 x 64 x 858 10824253440 0.0% 100.0% 0.0% flops 240 x 64 x 64 11159470080 0.0% 100.0% 0.0% flops 64 x 96 x 96 12966690816 0.0% 100.0% 0.0% flops 64 x 96 x 840 15895756800 0.0% 100.0% 0.0% flops 64 x 96 x 849 16066068480 0.0% 100.0% 0.0% flops 231 x 64 x 840 16392499200 0.0% 100.0% 0.0% flops 231 x 64 x 849 16568133120 0.0% 100.0% 0.0% flops 240 x 64 x 840 17031168000 0.0% 100.0% 0.0% flops 240 x 64 x 849 17213644800 0.0% 100.0% 0.0% flops 129 x 96 x 96 17994645504 0.0% 100.0% 0.0% flops 129 x 96 x 858 18700830720 0.0% 100.0% 0.0% flops 209 x 96 x 64 19436077056 0.0% 100.0% 0.0% flops 209 x 64 x 96 19436077056 0.0% 100.0% 0.0% flops 209 x 64 x 858 20198830080 0.0% 100.0% 0.0% flops 222 x 64 x 96 20645019648 0.0% 100.0% 0.0% flops 222 x 96 x 64 20645019648 0.0% 100.0% 0.0% flops 222 x 64 x 858 21455216640 0.0% 100.0% 0.0% flops 209 x 96 x 840 29662617600 0.0% 100.0% 0.0% flops 209 x 96 x 849 29980431360 0.0% 100.0% 0.0% flops 32 x 96 x 64 30255611904 0.0% 100.0% 0.0% flops 32 x 64 x 96 30255611904 0.0% 100.0% 0.0% flops 222 x 96 x 840 31507660800 0.0% 100.0% 0.0% flops 222 x 96 x 849 31845242880 0.0% 100.0% 0.0% flops 231 x 64 x 96 32222969856 0.0% 100.0% 0.0% flops 231 x 96 x 64 32222969856 0.0% 100.0% 0.0% flops 64 x 96 x 858 32472760320 0.0% 100.0% 0.0% flops 240 x 96 x 64 33478410240 0.0% 100.0% 0.0% flops 240 x 64 x 96 33478410240 0.0% 100.0% 0.0% flops 231 x 64 x 858 33487534080 0.0% 100.0% 0.0% flops 240 x 64 x 858 34792243200 0.0% 100.0% 0.0% flops 32 x 64 x 840 37090099200 0.0% 100.0% 0.0% flops 32 x 64 x 849 37487493120 0.0% 100.0% 0.0% flops 231 x 96 x 840 49177497600 0.0% 100.0% 0.0% flops 231 x 96 x 849 49704399360 0.0% 100.0% 0.0% flops 240 x 96 x 840 51093504000 0.0% 100.0% 0.0% flops 240 x 96 x 849 51640934400 0.0% 100.0% 0.0% flops 209 x 96 x 96 58308231168 0.0% 100.0% 0.0% flops 209 x 96 x 858 60596490240 0.0% 100.0% 0.0% flops 222 x 96 x 96 61935058944 0.0% 100.0% 0.0% flops 222 x 96 x 858 64365649920 0.0% 100.0% 0.0% flops 9 x 9 x 64 67295121408 0.0% 100.0% 0.0% flops 32 x 64 x 858 75769774080 0.0% 100.0% 0.0% flops 9 x 22 x 64 87348856320 0.0% 100.0% 0.0% flops 22 x 9 x 64 87510601728 0.0% 100.0% 0.0% flops 32 x 96 x 96 90766835712 0.0% 100.0% 0.0% flops 231 x 96 x 96 96668909568 0.0% 100.0% 0.0% flops 240 x 96 x 96 100435230720 0.0% 100.0% 0.0% flops 231 x 96 x 858 100462602240 0.0% 100.0% 0.0% flops 240 x 96 x 858 104376729600 0.0% 100.0% 0.0% flops 32 x 96 x 840 111270297600 0.0% 100.0% 0.0% flops 32 x 96 x 849 112462479360 0.0% 100.0% 0.0% flops 22 x 22 x 64 113395453952 0.0% 100.0% 0.0% flops 9 x 9 x 96 201885364224 0.0% 100.0% 0.0% flops 32 x 96 x 858 227309322240 0.0% 100.0% 0.0% flops 9 x 22 x 96 262046568960 0.0% 100.0% 0.0% flops 22 x 9 x 96 262531805184 0.0% 100.0% 0.0% flops 22 x 22 x 96 340186361856 0.0% 100.0% 0.0% flops 9 x 32 x 9 1138002296832 0.0% 100.0% 0.0% flops 22 x 32 x 9 1485592289280 0.0% 100.0% 0.0% flops 9 x 32 x 22 1485592289280 0.0% 100.0% 0.0% flops 22 x 32 x 22 1910442074112 0.0% 100.0% 0.0% flops inhomo. stacks 3766099836928 100.0% 0.0% 0.0% flops total 13.644522E+12 27.6% 72.4% 0.0% flops max/rank 1.282868E+12 28.0% 72.0% 0.0% matmuls inhomo. stacks 336996 100.0% 0.0% 0.0% matmuls total 562477038 0.1% 99.9% 0.0% number of processed stacks 2521520 13.4% 86.6% 0.0% average stack size 1.0 257.3 0.0 marketing flops 15.646302E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 796.508160E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 197280 MPI messages size (bytes): total size 482.240758E+09 min size 0.000000E+00 max size 17.688240E+06 average size 2.444448E+06 MPI breakdown and total messages size (bytes): size <= 128 1386 0 128 < size <= 8192 0 0 8192 < size <= 32768 4706 153485312 32768 < size <= 131072 50860 4081582080 131072 < size <= 4194304 118308 127519948800 4194304 < size <= 16777216 15420 235141755840 16777216 < size 6600 115343360000 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3473 66888. MP_Allreduce 9774 769. MP_Sync 52 MP_Alltoall 1496 10935805. MP_SendRecv 2420 70608. MP_ISendRecv 2420 70608. MP_Wait 11198 MP_ISend 7392 401928. MP_IRecv 7392 401928. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.258 0.339 126.008 126.010 qs_mol_dyn_low 1 2.0 0.153 0.168 122.598 123.771 qs_forces 11 3.9 0.230 0.438 121.556 121.694 qs_energies 11 4.9 0.124 0.127 115.653 115.757 scf_env_do_scf 11 5.9 0.004 0.011 102.494 102.508 scf_env_do_scf_inner_loop 99 6.5 0.009 0.028 79.283 79.342 velocity_verlet 10 3.0 0.006 0.009 73.710 73.760 dbcsr_multiply_generic 2055 12.4 0.178 0.232 49.885 51.954 qs_scf_new_mos 99 7.5 0.001 0.001 43.629 44.760 qs_scf_loop_do_ot 99 8.5 0.004 0.012 43.628 44.759 ot_scf_mini 99 9.5 0.014 0.017 41.482 42.667 multiply_cannon 2055 13.4 0.248 0.291 31.755 35.962 multiply_cannon_loop 2055 14.4 0.207 0.241 29.321 33.874 rebuild_ks_matrix 110 8.3 0.001 0.001 25.713 27.354 qs_ks_build_kohn_sham_matrix 110 9.3 0.015 0.017 25.713 27.353 mp_waitall_1 102446 16.6 17.067 26.896 17.067 26.896 qs_ks_update_qs_env 110 7.6 0.001 0.001 23.033 24.480 multiply_cannon_multrec 12330 15.4 15.276 23.831 15.305 23.858 ot_mini 99 10.5 0.001 0.002 22.582 23.800 init_scf_loop 11 6.9 0.025 0.029 23.132 23.187 multiply_cannon_metrocomm3 12330 15.4 0.047 0.090 11.774 21.430 prepare_preconditioner 11 7.9 0.000 0.001 18.719 18.853 make_preconditioner 11 8.9 0.001 0.002 18.719 18.853 make_full_inverse_cholesky 11 9.9 0.000 0.000 15.401 17.962 qs_rho_update_rho_low 110 7.6 0.002 0.002 17.233 17.281 calculate_rho_elec 110 8.6 0.190 0.200 17.232 17.280 sum_up_and_integrate 110 10.3 0.002 0.005 13.258 13.283 integrate_v_rspace 110 11.3 0.003 0.004 13.208 13.234 qs_ot_get_derivative 99 11.5 0.006 0.018 10.943 12.141 grid_collocate_task_list 110 9.6 11.931 12.095 11.931 12.095 apply_preconditioner_dbcsr 110 12.6 0.006 0.012 10.636 11.886 apply_single 110 13.6 0.000 0.001 10.630 11.874 make_m2s 4110 13.4 0.093 0.125 11.333 11.873 ot_diis_step 99 11.5 0.039 0.058 11.597 11.598 qs_ot_get_p 110 10.4 0.001 0.002 8.788 10.283 make_images 4110 14.4 1.223 1.424 9.543 10.072 cp_fm_upper_to_full 70 14.2 7.063 9.658 7.063 9.658 grid_integrate_task_list 110 12.3 9.135 9.233 9.135 9.233 init_scf_run 11 5.9 0.001 0.004 8.625 8.626 scf_env_initial_rho_setup 11 6.9 0.015 0.025 8.624 8.625 multiply_cannon_metrocomm4 10275 15.4 0.048 0.087 1.898 8.554 mp_irecv_dv 29063 16.1 1.816 8.478 1.816 8.478 mp_sum_l 10179 13.1 4.721 7.659 4.721 7.659 hybrid_alltoall_any 4261 16.3 0.205 0.449 5.681 7.082 dbcsr_complete_redistribute 325 12.2 0.929 1.129 5.198 7.042 make_images_data 4110 15.4 0.062 0.105 5.961 6.941 wfi_extrapolate 11 7.9 0.001 0.001 6.682 6.682 copy_fm_to_dbcsr 174 11.2 0.001 0.002 4.053 5.937 mp_alltoall_i22 605 13.7 3.500 5.699 3.500 5.699 fft_wrap_pw1pw2 1111 11.6 0.016 0.019 5.471 5.538 transfer_fm_to_dbcsr 11 9.9 0.002 0.005 3.301 5.159 fft_wrap_pw1pw2_140 451 12.1 0.356 0.366 4.767 4.824 cp_fm_cholesky_invert 11 10.9 4.809 4.822 4.809 4.822 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 4.644 4.661 qs_ot_get_derivative_diag 47 12.0 0.002 0.002 3.694 4.393 fft3d_ps 1111 13.6 1.912 2.038 4.270 4.350 density_rs2pw 110 9.6 0.006 0.009 4.047 4.244 qs_ot_p2m_diag 48 11.0 0.096 0.116 4.090 4.130 qs_ot_get_derivative_taylor 52 13.0 0.002 0.002 3.340 4.056 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 3.273 3.938 dbcsr_dot_sd 1091 11.9 0.816 0.887 2.544 3.881 cp_dbcsr_syevd 48 12.0 0.003 0.004 3.697 3.721 mp_alltoall_d11v 2046 13.8 3.510 3.635 3.510 3.635 qs_energies_init_hamiltonians 11 5.9 0.006 0.020 3.518 3.627 mp_sum_d 3891 11.9 2.199 3.522 2.199 3.522 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 3.196 3.395 copy_dbcsr_to_fm 151 11.3 0.003 0.004 2.885 3.391 cp_fm_diag_elpa 48 13.0 0.000 0.000 3.014 3.017 cp_fm_diag_elpa_base 48 14.0 2.732 2.756 3.005 3.005 potential_pw2rs 110 12.3 0.021 0.024 2.847 2.858 calculate_dm_sparse 110 9.5 0.001 0.001 2.445 2.616 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="212", plot="h2o_128_md", label="(4n/3r/12t)", y=126.010000, yerr=0.000000 PlotPoint: name="213", plot="h2o_128_md_mem", label="(4n/3r/12t)", y=721.909091, yerr=31.741726 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/19/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 830472192 0.0% 100.0% 0.0% flops 22 x 32 x 32 1015021568 0.0% 100.0% 0.0% flops 142 x 128 x 128 70428655616 0.0% 100.0% 0.0% flops 129 x 128 x 128 127961923584 0.0% 100.0% 0.0% flops 142 x 128 x 1280 163787571200 0.0% 100.0% 0.0% flops 32 x 128 x 128 184415158272 0.0% 100.0% 0.0% flops 160 x 128 x 128 238068695040 0.0% 100.0% 0.0% flops 138 x 128 x 128 239556624384 0.0% 100.0% 0.0% flops 9 x 9 x 128 269180485632 0.0% 100.0% 0.0% flops 129 x 128 x 1280 297585868800 0.0% 100.0% 0.0% flops 9 x 22 x 128 349395425280 0.0% 100.0% 0.0% flops 22 x 9 x 128 350042406912 0.0% 100.0% 0.0% flops 22 x 22 x 128 453581815808 0.0% 100.0% 0.0% flops 160 x 128 x 1280 553648128000 0.0% 100.0% 0.0% flops 138 x 128 x 1280 557108428800 0.0% 100.0% 0.0% flops 32 x 128 x 1280 1033476505600 0.0% 100.0% 0.0% flops 9 x 32 x 9 1138002296832 0.0% 100.0% 0.0% flops 22 x 32 x 9 1485592289280 0.0% 100.0% 0.0% flops 9 x 32 x 22 1485592289280 0.0% 100.0% 0.0% flops 22 x 32 x 22 1910442074112 0.0% 100.0% 0.0% flops inhomo. stacks 2588749070336 100.0% 0.0% 0.0% flops total 13.498461E+12 19.2% 80.8% 0.0% flops max/rank 1.745688E+12 20.7% 79.3% 0.0% matmuls inhomo. stacks 158576 100.0% 0.0% 0.0% matmuls total 546784212 0.0% 100.0% 0.0% number of processed stacks 1648032 9.6% 90.4% 0.0% average stack size 1.0 367.0 0.0 marketing flops 15.646297E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 2.007859E+09 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 82200 MPI messages size (bytes): total size 297.640985E+09 min size 0.000000E+00 max size 26.214400E+06 average size 3.620936E+06 MPI breakdown and total messages size (bytes): size <= 128 572 0 128 < size <= 8192 0 0 8192 < size <= 32768 44 1441792 32768 < size <= 131072 18560 2432696320 131072 < size <= 4194304 54216 84915781632 4194304 < size <= 16777216 0 0 16777216 < size 8808 210291069504 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3462 67098. MP_Allreduce 9752 812. MP_Sync 52 MP_Alltoall 1474 16505187. MP_SendRecv 2310 360267. MP_ISendRecv 2310 360267. MP_Wait 5214 MP_ISend 2420 1187840. MP_IRecv 2420 1187840. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.274 0.391 125.123 125.128 qs_mol_dyn_low 1 2.0 0.116 0.143 123.121 123.131 qs_forces 11 3.9 0.123 0.168 119.430 119.544 qs_energies 11 4.9 0.053 0.075 113.922 114.058 scf_env_do_scf 11 5.9 0.001 0.003 101.220 101.221 velocity_verlet 10 3.0 0.004 0.011 78.329 78.724 scf_env_do_scf_inner_loop 99 6.5 0.017 0.077 68.745 68.747 dbcsr_multiply_generic 2055 12.4 0.176 0.183 33.288 33.630 init_scf_loop 11 6.9 0.020 0.047 32.387 32.388 qs_scf_new_mos 99 7.5 0.001 0.001 31.011 31.435 qs_scf_loop_do_ot 99 8.5 0.001 0.001 31.011 31.434 ot_scf_mini 99 9.5 0.004 0.005 28.766 29.035 prepare_preconditioner 11 7.9 0.008 0.031 28.915 28.966 make_preconditioner 11 8.9 0.001 0.002 28.907 28.966 make_full_inverse_cholesky 11 9.9 0.000 0.000 23.019 28.132 rebuild_ks_matrix 110 8.3 0.001 0.001 23.171 23.378 qs_ks_build_kohn_sham_matrix 110 9.3 0.015 0.017 23.170 23.378 multiply_cannon 2055 13.4 0.251 0.261 21.254 22.869 qs_rho_update_rho_low 110 7.6 0.001 0.001 21.828 21.886 calculate_rho_elec 110 8.6 0.279 0.280 21.827 21.885 qs_ks_update_qs_env 110 7.6 0.001 0.001 20.840 21.022 multiply_cannon_loop 2055 14.4 0.137 0.150 19.538 20.104 cp_fm_upper_to_full 70 14.2 12.288 17.538 12.288 17.538 grid_collocate_task_list 110 9.6 15.870 16.166 15.870 16.166 ot_mini 99 10.5 0.001 0.001 15.149 15.445 multiply_cannon_multrec 8220 15.4 12.589 14.003 12.616 14.031 sum_up_and_integrate 110 10.3 0.002 0.002 13.480 13.505 integrate_v_rspace 110 11.3 0.003 0.003 13.429 13.454 mp_waitall_1 84994 16.7 9.286 12.505 9.286 12.505 dbcsr_complete_redistribute 325 12.2 1.254 1.336 8.095 11.206 copy_fm_to_dbcsr 174 11.2 0.001 0.001 6.622 9.740 grid_integrate_task_list 110 12.3 9.342 9.419 9.342 9.419 make_m2s 4110 13.4 0.078 0.080 8.881 9.348 transfer_fm_to_dbcsr 11 9.9 0.002 0.002 5.866 8.969 mp_alltoall_i22 605 13.7 5.501 8.743 5.501 8.743 multiply_cannon_metrocomm3 8220 15.4 0.026 0.026 5.480 8.410 qs_ot_get_derivative 99 11.5 0.002 0.002 7.734 8.012 init_scf_run 11 5.9 0.001 0.005 7.965 7.965 scf_env_initial_rho_setup 11 6.9 0.017 0.036 7.964 7.965 make_images 4110 14.4 1.489 1.599 6.915 7.397 ot_diis_step 99 11.5 0.048 0.060 7.376 7.376 qs_ot_get_p 110 10.4 0.001 0.001 6.916 7.304 apply_preconditioner_dbcsr 110 12.6 0.001 0.001 7.104 7.198 apply_single 110 13.6 0.000 0.000 7.104 7.197 cp_fm_cholesky_invert 11 10.9 6.484 6.492 6.484 6.492 fft_wrap_pw1pw2 1111 11.6 0.016 0.019 6.171 6.210 wfi_extrapolate 11 7.9 0.001 0.001 5.823 5.823 fft_wrap_pw1pw2_140 451 12.1 0.501 0.509 5.468 5.507 density_rs2pw 110 9.6 0.005 0.006 4.758 5.010 fft3d_ps 1111 13.6 2.491 2.513 4.748 4.810 hybrid_alltoall_any 4261 16.3 0.413 0.934 4.069 4.751 make_images_data 4110 15.4 0.056 0.060 4.002 4.686 qs_ot_p2m_diag 48 11.0 0.139 0.149 4.657 4.667 cp_dbcsr_syevd 48 12.0 0.003 0.003 4.292 4.292 qs_energies_init_hamiltonians 11 5.9 0.008 0.021 3.837 3.958 cp_fm_diag_elpa 48 13.0 0.000 0.000 3.618 3.618 cp_fm_diag_elpa_base 48 14.0 3.161 3.266 3.616 3.617 multiply_cannon_metrocomm4 6165 15.4 0.025 0.028 1.203 3.549 mp_irecv_dv 17923 16.3 1.155 3.473 1.155 3.473 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 3.383 3.389 mp_sum_dm 438 4.9 2.875 3.301 2.875 3.301 md_write_output 11 3.9 0.460 3.157 0.485 3.225 md_output 10 3.0 0.012 0.013 0.477 3.209 update_particle_set 20 4.0 0.012 0.016 2.758 3.159 cp_fm_cholesky_decompose 22 10.9 3.134 3.154 3.134 3.154 potential_pw2rs 110 12.3 0.029 0.029 3.059 3.085 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 3.019 3.050 mp_alltoall_d11v 2046 13.8 2.903 3.012 2.903 3.012 copy_dbcsr_to_fm 151 11.3 0.003 0.003 2.638 2.894 calculate_dm_sparse 110 9.5 0.001 0.001 2.577 2.773 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 2.486 2.652 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="214", plot="h2o_128_md", label="(4n/2r/18t)", y=125.128000, yerr=0.000000 PlotPoint: name="215", plot="h2o_128_md_mem", label="(4n/2r/18t)", y=1618.909091, yerr=234.393009 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/20/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 830472192 0.0% 100.0% 0.0% flops 22 x 32 x 32 1015021568 0.0% 100.0% 0.0% flops 32 x 256 x 256 184415158272 0.0% 100.0% 0.0% flops 49 x 256 x 256 194422767616 0.0% 100.0% 0.0% flops 71 x 256 x 256 246500294656 0.0% 100.0% 0.0% flops 9 x 9 x 256 269180485632 0.0% 100.0% 0.0% flops 9 x 22 x 256 349395425280 0.0% 100.0% 0.0% flops 22 x 9 x 256 350042406912 0.0% 100.0% 0.0% flops 80 x 256 x 256 396781158400 0.0% 100.0% 0.0% flops 49 x 256 x 2560 452145971200 0.0% 100.0% 0.0% flops 22 x 22 x 256 453581815808 0.0% 100.0% 0.0% flops 71 x 256 x 2560 573256499200 0.0% 100.0% 0.0% flops 80 x 256 x 2560 922746880000 0.0% 100.0% 0.0% flops 32 x 256 x 2560 1033476505600 0.0% 100.0% 0.0% flops 9 x 32 x 9 1138002296832 0.0% 100.0% 0.0% flops 9 x 32 x 22 1485592289280 0.0% 100.0% 0.0% flops 22 x 32 x 9 1485592289280 0.0% 100.0% 0.0% flops 22 x 32 x 22 1910442074112 0.0% 100.0% 0.0% flops inhomo. stacks 1682398248960 100.0% 0.0% 0.0% flops total 13.129818E+12 12.8% 87.2% 0.0% flops max/rank 3.360079E+12 13.8% 86.2% 0.0% matmuls inhomo. stacks 46640 100.0% 0.0% 0.0% matmuls total 531185346 0.0% 100.0% 0.0% number of processed stacks 996500 4.7% 95.3% 0.0% average stack size 1.0 559.2 0.0 marketing flops 15.646297E+12 ------------------------------------------------------------------------------- # multiplications 2055 max memory usage/rank 8.442036E+09 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 16440 MPI messages size (bytes): total size 113.041801E+09 min size 0.000000E+00 max size 52.428800E+06 average size 6.876022E+06 MPI breakdown and total messages size (bytes): size <= 128 110 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 22 1441792 131072 < size <= 4194304 7388 3873439744 4194304 < size <= 16777216 7452 39069941760 16777216 < size 1468 70097023168 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 19 12. MP_Allreduce 10941 25. MP_Alltoall 8043 1122409. MP_ISend 16396 1767280. MP_IRecv 16396 1767297. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3462 67097. MP_Allreduce 9752 980. MP_Sync 52 MP_Alltoall 1474 32339426. MP_SendRecv 990 720533. MP_ISendRecv 990 720533. MP_Wait 2926 MP_ISend 1452 2662400. MP_IRecv 1452 2662400. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.238 0.253 142.520 142.522 qs_mol_dyn_low 1 2.0 0.176 0.184 140.306 140.313 qs_forces 11 3.9 0.106 0.136 138.963 138.988 qs_energies 11 4.9 0.102 0.123 130.492 130.520 scf_env_do_scf 11 5.9 0.005 0.020 112.608 112.611 velocity_verlet 10 3.0 0.001 0.001 86.995 87.107 scf_env_do_scf_inner_loop 99 6.5 0.008 0.023 86.373 86.380 dbcsr_multiply_generic 2055 12.4 0.205 0.213 44.091 44.294 qs_scf_new_mos 99 7.5 0.001 0.001 41.709 41.838 qs_scf_loop_do_ot 99 8.5 0.001 0.001 41.708 41.837 ot_scf_mini 99 9.5 0.004 0.007 38.800 38.897 rebuild_ks_matrix 110 8.3 0.001 0.001 27.775 27.952 qs_ks_build_kohn_sham_matrix 110 9.3 0.014 0.014 27.774 27.951 multiply_cannon 2055 13.4 0.505 0.541 24.020 26.950 init_scf_loop 11 6.9 0.038 0.043 26.045 26.050 qs_rho_update_rho_low 110 7.6 0.001 0.001 25.694 25.701 calculate_rho_elec 110 8.6 0.489 0.489 25.693 25.700 qs_ks_update_qs_env 110 7.6 0.001 0.001 25.195 25.344 ot_mini 99 10.5 0.002 0.002 21.117 21.216 prepare_preconditioner 11 7.9 0.000 0.000 20.992 21.009 make_preconditioner 11 8.9 0.001 0.001 20.992 21.009 multiply_cannon_loop 2055 14.4 0.135 0.140 20.488 20.855 make_full_inverse_cholesky 11 9.9 0.023 0.026 18.868 19.273 grid_collocate_task_list 110 9.6 17.803 17.934 17.803 17.934 make_m2s 4110 13.4 0.074 0.076 15.133 16.701 sum_up_and_integrate 110 10.3 0.002 0.002 15.039 15.059 integrate_v_rspace 110 11.3 0.003 0.003 14.986 15.006 multiply_cannon_multrec 4110 15.4 14.677 14.927 14.747 14.999 mp_waitall_1 67234 16.8 10.770 14.081 10.770 14.081 make_images 4110 14.4 1.977 2.035 10.581 12.053 qs_ot_get_derivative 99 11.5 0.006 0.007 11.084 11.183 ot_diis_step 99 11.5 0.085 0.086 10.013 10.013 init_scf_run 11 5.9 0.001 0.003 9.977 9.978 scf_env_initial_rho_setup 11 6.9 0.003 0.004 9.976 9.977 grid_integrate_task_list 110 12.3 9.891 9.974 9.891 9.974 cp_fm_cholesky_invert 11 10.9 9.909 9.917 9.909 9.917 apply_preconditioner_dbcsr 110 12.6 0.014 0.014 9.580 9.646 apply_single 110 13.6 0.000 0.000 9.566 9.632 qs_ot_get_p 110 10.4 0.001 0.001 9.353 9.395 hybrid_alltoall_any 4261 16.3 0.862 2.128 7.058 9.365 fft_wrap_pw1pw2 1111 11.6 0.015 0.015 8.754 8.770 make_images_data 4110 15.4 0.053 0.058 6.349 8.605 wfi_extrapolate 11 7.9 0.001 0.001 8.048 8.049 fft_wrap_pw1pw2_140 451 12.1 0.954 0.959 7.739 7.758 fft3d_ps 1111 13.6 3.679 3.709 6.619 6.634 qs_energies_init_hamiltonians 11 5.9 0.004 0.004 6.397 6.419 dbcsr_complete_redistribute 325 12.2 2.400 2.415 6.109 6.389 density_rs2pw 110 9.6 0.005 0.005 6.055 6.184 qs_ot_p2m_diag 48 11.0 0.242 0.252 5.782 5.808 mp_alltoall_d11v 2046 13.8 5.413 5.473 5.413 5.473 cp_dbcsr_syevd 48 12.0 0.004 0.004 5.246 5.260 mp_allgather_i34 2055 14.4 2.120 5.241 2.120 5.241 copy_dbcsr_to_fm 151 11.3 0.003 0.003 5.079 5.212 multiply_cannon_metrocomm3 4110 15.4 0.008 0.009 4.231 5.096 dbcsr_make_dense_low 5207 15.5 0.054 0.054 4.599 4.725 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 4.643 4.679 make_dense_data 5207 16.5 4.172 4.242 4.526 4.651 cp_fm_diag_elpa 48 13.0 0.000 0.000 4.472 4.472 cp_fm_diag_elpa_base 48 14.0 4.136 4.201 4.469 4.469 dbcsr_make_images_dense 3552 14.7 0.027 0.027 4.167 4.271 build_core_hamiltonian_matrix_ 11 4.9 0.001 0.001 3.999 4.073 cp_fm_cholesky_decompose 22 10.9 4.007 4.024 4.007 4.024 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 3.847 3.875 copy_fm_to_dbcsr 174 11.2 0.001 0.001 3.490 3.718 transfer_dbcsr_to_fm 11 10.9 0.000 0.000 3.622 3.640 qs_env_update_s_mstruct 11 6.9 0.006 0.017 3.577 3.638 potential_pw2rs 110 12.3 0.045 0.045 3.521 3.524 calculate_dm_sparse 110 9.5 0.001 0.001 3.411 3.448 qs_ot_get_derivative_diag 47 12.0 0.002 0.002 3.324 3.443 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 3.134 3.244 qs_ot_get_derivative_taylor 52 13.0 0.002 0.002 3.148 3.167 dbcsr_copy 1918 11.9 0.301 0.303 2.996 3.022 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="216", plot="h2o_128_md", label="(4n/1r/36t)", y=142.522000, yerr=0.000000 PlotPoint: name="217", plot="h2o_128_md_mem", label="(4n/1r/36t)", y=5973.636364, yerr=1578.445166 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/21/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 3321888768 0.0% 100.0% 0.0% flops 22 x 32 x 32 4060086272 0.0% 100.0% 0.0% flops 64 x 64 x 64 30064771072 0.0% 100.0% 0.0% flops 64 x 64 x 96 90194313216 0.0% 100.0% 0.0% flops 64 x 96 x 64 90194313216 0.0% 100.0% 0.0% flops 96 x 64 x 64 90194313216 0.0% 100.0% 0.0% flops 64 x 64 x 849 201639591936 0.0% 100.0% 0.0% flops 64 x 64 x 853 202589601792 0.0% 100.0% 0.0% flops 64 x 64 x 858 203777114112 0.0% 100.0% 0.0% flops 849 x 64 x 64 248377245696 0.0% 100.0% 0.0% flops 853 x 64 x 64 249547456512 0.0% 100.0% 0.0% flops 858 x 64 x 64 251010220032 0.0% 100.0% 0.0% flops 96 x 96 x 64 270582939648 0.0% 100.0% 0.0% flops 64 x 96 x 96 270582939648 0.0% 100.0% 0.0% flops 96 x 64 x 96 270582939648 0.0% 100.0% 0.0% flops 9 x 9 x 64 352505530368 0.0% 100.0% 0.0% flops 22 x 9 x 64 489467860992 0.0% 100.0% 0.0% flops 9 x 22 x 64 490886212608 0.0% 100.0% 0.0% flops 64 x 96 x 849 604918775808 0.0% 100.0% 0.0% flops 96 x 64 x 849 604918775808 0.0% 100.0% 0.0% flops 64 x 96 x 853 607768805376 0.0% 100.0% 0.0% flops 96 x 64 x 853 607768805376 0.0% 100.0% 0.0% flops 64 x 96 x 858 611331342336 0.0% 100.0% 0.0% flops 96 x 64 x 858 611331342336 0.0% 100.0% 0.0% flops 22 x 22 x 64 678653927424 0.0% 100.0% 0.0% flops 849 x 64 x 96 745131737088 0.0% 100.0% 0.0% flops 849 x 96 x 64 745131737088 0.0% 100.0% 0.0% flops 853 x 96 x 64 748642369536 0.0% 100.0% 0.0% flops 853 x 64 x 96 748642369536 0.0% 100.0% 0.0% flops 849 x 64 x 849 749909827584 0.0% 100.0% 0.0% flops 858 x 64 x 96 753030660096 0.0% 100.0% 0.0% flops 858 x 96 x 64 753030660096 0.0% 100.0% 0.0% flops 849 x 64 x 853 753442971648 0.0% 100.0% 0.0% flops 853 x 64 x 849 753442971648 0.0% 100.0% 0.0% flops 853 x 64 x 853 756992761856 0.0% 100.0% 0.0% flops 858 x 64 x 849 757859401728 0.0% 100.0% 0.0% flops 849 x 64 x 858 757859401728 0.0% 100.0% 0.0% flops 858 x 64 x 853 761429999616 0.0% 100.0% 0.0% flops 853 x 64 x 858 761429999616 0.0% 100.0% 0.0% flops 858 x 64 x 858 765893246976 0.0% 100.0% 0.0% flops 96 x 96 x 96 811748818944 0.0% 100.0% 0.0% flops 9 x 9 x 96 1057516591104 0.0% 100.0% 0.0% flops 22 x 9 x 96 1468403582976 0.0% 100.0% 0.0% flops 9 x 22 x 96 1472658637824 0.0% 100.0% 0.0% flops 96 x 96 x 849 1814756327424 0.0% 100.0% 0.0% flops 96 x 96 x 853 1823306416128 0.0% 100.0% 0.0% flops 96 x 96 x 858 1833994027008 0.0% 100.0% 0.0% flops 22 x 22 x 96 2035961782272 0.0% 100.0% 0.0% flops 849 x 96 x 96 2235395211264 0.0% 100.0% 0.0% flops 853 x 96 x 96 2245927108608 0.0% 100.0% 0.0% flops 849 x 96 x 849 2249729482752 0.0% 100.0% 0.0% flops 858 x 96 x 96 2259091980288 0.0% 100.0% 0.0% flops 853 x 96 x 849 2260328914944 0.0% 100.0% 0.0% flops 849 x 96 x 853 2260328914944 0.0% 100.0% 0.0% flops 853 x 96 x 853 2270978285568 0.0% 100.0% 0.0% flops 858 x 96 x 849 2273578205184 0.0% 100.0% 0.0% flops 849 x 96 x 858 2273578205184 0.0% 100.0% 0.0% flops 853 x 96 x 858 2284289998848 0.0% 100.0% 0.0% flops 858 x 96 x 853 2284289998848 0.0% 100.0% 0.0% flops 858 x 96 x 858 2297679740928 0.0% 100.0% 0.0% flops 9 x 32 x 9 5921911627776 0.0% 100.0% 0.0% flops 9 x 32 x 22 8269110153216 0.0% 100.0% 0.0% flops 22 x 32 x 9 8269110153216 0.0% 100.0% 0.0% flops 22 x 32 x 22 11374757920768 0.0% 100.0% 0.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 92.796573E+12 0.0% 100.0% 0.0% flops max/rank 738.067995E+09 0.0% 100.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 3069347940 0.0% 100.0% 0.0% number of processed stacks 8078828 0.0% 100.0% 0.0% average stack size 0.0 379.9 0.0 marketing flops 143.511165E+12 ------------------------------------------------------------------------------- # multiplications 2485 max memory usage/rank 300.191744E+06 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 7872480 MPI messages size (bytes): total size 4.844067E+12 min size 0.000000E+00 max size 5.889312E+06 average size 615.316500E+03 MPI breakdown and total messages size (bytes): size <= 128 50820 0 128 < size <= 8192 0 0 8192 < size <= 32768 415052 13584564224 32768 < size <= 131072 3308800 203292672000 131072 < size <= 4194304 3896640 3455280906176 4194304 < size <= 16777216 201168 1171888537600 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 65 12. MP_Allreduce 13331 37. MP_Alltoall 9584 123391. MP_ISend 119236 295303. MP_IRecv 119236 293568. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 4001 57672. MP_Allreduce 11083 760. MP_Sync 86 MP_Alltoall 2465 1586187. MP_SendRecv 36322 12928. MP_ISendRecv 36322 12928. MP_Wait 53082 MP_ISend 14640 95640. MP_IRecv 14640 95640. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.096 0.142 171.649 171.777 qs_mol_dyn_low 1 2.0 0.126 0.197 169.634 169.844 qs_forces 11 3.9 0.005 0.006 169.036 169.266 qs_energies 11 4.9 0.001 0.002 163.187 163.418 scf_env_do_scf 11 5.9 0.001 0.001 148.537 148.636 scf_env_do_scf_inner_loop 116 6.6 0.003 0.009 126.408 126.480 velocity_verlet 10 3.0 0.001 0.002 106.980 107.085 dbcsr_multiply_generic 2485 12.5 0.187 0.242 82.341 83.709 qs_scf_new_mos 116 7.6 0.001 0.001 74.364 76.125 qs_scf_loop_do_ot 116 8.6 0.001 0.001 74.363 76.124 ot_scf_mini 116 9.6 0.003 0.005 68.579 69.714 multiply_cannon 2485 13.5 0.230 0.302 61.380 66.156 multiply_cannon_loop 2485 14.5 0.312 0.464 57.155 60.878 multiply_cannon_multrec 29820 15.5 38.939 46.401 38.956 46.420 ot_mini 116 10.6 0.001 0.001 39.167 40.392 rebuild_ks_matrix 127 8.3 0.001 0.001 39.577 40.254 qs_ks_build_kohn_sham_matrix 127 9.3 0.016 0.021 39.576 40.253 qs_ks_update_qs_env 127 7.6 0.001 0.002 35.728 36.360 mp_waitall_1 316762 16.5 24.179 31.525 24.179 31.525 sum_up_and_integrate 127 10.3 0.002 0.005 22.078 22.146 integrate_v_rspace 127 11.3 0.004 0.005 22.020 22.101 init_scf_loop 11 6.9 0.000 0.001 22.023 22.057 qs_rho_update_rho_low 127 7.7 0.001 0.001 21.552 21.813 calculate_rho_elec 127 8.7 0.038 0.054 21.552 21.812 apply_preconditioner_dbcsr 127 12.6 0.000 0.001 19.730 20.771 apply_single 127 13.6 0.001 0.002 19.730 20.770 qs_ot_get_derivative 116 11.6 0.001 0.002 19.338 20.496 ot_diis_step 116 11.6 0.008 0.011 19.659 19.672 make_m2s 4970 13.5 0.114 0.137 16.413 17.386 multiply_cannon_metrocomm3 29820 15.5 0.107 0.209 4.898 17.365 prepare_preconditioner 11 7.9 0.000 0.000 17.003 17.138 make_preconditioner 11 8.9 0.000 0.000 17.003 17.138 qs_ot_get_p 127 10.4 0.001 0.002 14.084 15.448 make_images 4970 14.5 0.303 0.356 14.310 15.343 make_full_inverse_cholesky 11 9.9 0.000 0.000 14.546 14.908 multiply_cannon_metrocomm1 29820 15.5 0.138 0.243 9.947 13.998 grid_integrate_task_list 127 12.3 12.432 12.792 12.432 12.792 grid_collocate_task_list 127 9.7 10.701 11.211 10.701 11.211 make_images_data 4970 15.5 0.070 0.106 8.745 10.937 init_scf_run 11 5.9 0.000 0.001 10.854 10.862 scf_env_initial_rho_setup 11 6.9 0.000 0.001 10.854 10.862 hybrid_alltoall_any 5155 16.4 0.266 2.692 7.156 10.230 qs_ot_p2m_diag 82 11.4 0.044 0.058 9.447 9.520 qs_ot_get_derivative_diag 76 12.4 0.002 0.003 8.507 9.324 wfi_extrapolate 11 7.9 0.001 0.001 9.118 9.124 fft_wrap_pw1pw2 1281 11.7 0.017 0.022 8.684 8.965 cp_dbcsr_syevd 82 12.4 0.005 0.006 8.836 8.887 mp_alltoall_d11v 2401 14.1 8.190 8.627 8.190 8.627 density_rs2pw 127 9.7 0.006 0.009 8.125 8.527 fft3d_ps 1281 13.7 1.861 2.345 7.726 7.953 fft_wrap_pw1pw2_140 519 12.2 0.211 0.228 7.282 7.608 calculate_dm_sparse 127 9.5 0.001 0.001 6.428 7.104 potential_pw2rs 127 12.3 0.007 0.011 6.781 6.814 cp_fm_cholesky_invert 11 10.9 6.657 6.669 6.657 6.669 mp_sum_l 12261 13.2 2.945 6.602 2.945 6.602 mp_irecv_dv 71358 16.2 2.619 6.577 2.619 6.577 cp_fm_cholesky_decompose 22 10.9 6.397 6.463 6.397 6.463 multiply_cannon_metrocomm4 27335 15.5 0.102 0.206 2.332 6.392 dbcsr_complete_redistribute 393 12.7 0.576 0.782 5.093 5.573 cp_fm_diag_elpa 82 13.4 0.000 0.001 5.362 5.368 cp_fm_diag_elpa_base 82 14.4 5.305 5.326 5.351 5.354 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.003 5.216 5.253 mp_allgather_i34 2485 14.5 1.731 5.147 1.731 5.147 mp_waitany 14640 13.8 4.370 5.007 4.370 5.007 make_images_sizes 4970 15.5 0.006 0.014 2.644 4.993 mp_alltoall_i44 4970 16.5 2.637 4.986 2.637 4.986 mp_alltoall_z22v 1281 15.7 4.418 4.728 4.418 4.728 transfer_rs2pw 519 10.6 0.006 0.008 4.154 4.626 transfer_pw2rs 519 13.2 0.006 0.008 4.334 4.363 copy_fm_to_dbcsr 208 11.6 0.001 0.002 3.927 4.328 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 4.215 4.271 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 4.018 4.203 mp_sum_d 4455 12.1 2.384 4.102 2.384 4.102 dbcsr_dot_sd 1305 12.0 0.513 0.564 1.883 3.629 qs_ot_get_derivative_taylor 40 13.0 0.001 0.002 3.185 3.530 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="400", plot="h2o_256_md", label="(4n/36r/1t)", y=171.777000, yerr=0.000000 PlotPoint: name="401", plot="h2o_256_md_mem", label="(4n/36r/1t)", y=284.818182, yerr=3.214122 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/22/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 3321888768 0.0% 100.0% 0.0% flops 22 x 32 x 32 4060086272 0.0% 100.0% 0.0% flops 64 x 64 x 64 30366760960 0.0% 100.0% 0.0% flops 778 x 64 x 64 57411371008 0.0% 100.0% 0.0% flops 782 x 64 x 64 57706545152 0.0% 100.0% 0.0% flops 800 x 64 x 64 59034828800 0.0% 100.0% 0.0% flops 907 x 64 x 64 66930737152 0.0% 100.0% 0.0% flops 920 x 64 x 64 67890053120 0.0% 100.0% 0.0% flops 929 x 64 x 64 68554194944 0.0% 100.0% 0.0% flops 942 x 64 x 64 69513510912 0.0% 100.0% 0.0% flops 778 x 64 x 840 85658173440 0.0% 100.0% 0.0% flops 782 x 64 x 840 86098575360 0.0% 100.0% 0.0% flops 800 x 64 x 840 88080384000 0.0% 100.0% 0.0% flops 64 x 64 x 96 91100282880 0.0% 100.0% 0.0% flops 96 x 64 x 64 91100282880 0.0% 100.0% 0.0% flops 64 x 96 x 64 91100282880 0.0% 100.0% 0.0% flops 907 x 64 x 840 99861135360 0.0% 100.0% 0.0% flops 64 x 64 x 840 100631838720 0.0% 100.0% 0.0% flops 920 x 64 x 840 101292441600 0.0% 100.0% 0.0% flops 929 x 64 x 840 102283345920 0.0% 100.0% 0.0% flops 942 x 64 x 840 103714652160 0.0% 100.0% 0.0% flops 951 x 64 x 64 140355305472 0.0% 100.0% 0.0% flops 760 x 64 x 64 168249262080 0.0% 100.0% 0.0% flops 778 x 64 x 96 172234113024 0.0% 100.0% 0.0% flops 778 x 96 x 64 172234113024 0.0% 100.0% 0.0% flops 782 x 64 x 96 173119635456 0.0% 100.0% 0.0% flops 782 x 96 x 64 173119635456 0.0% 100.0% 0.0% flops 778 x 64 x 849 173151879168 0.0% 100.0% 0.0% flops 782 x 64 x 849 174042120192 0.0% 100.0% 0.0% flops 778 x 64 x 862 175803203584 0.0% 100.0% 0.0% flops 782 x 64 x 862 176707076096 0.0% 100.0% 0.0% flops 800 x 64 x 96 177104486400 0.0% 100.0% 0.0% flops 800 x 96 x 64 177104486400 0.0% 100.0% 0.0% flops 800 x 64 x 849 178048204800 0.0% 100.0% 0.0% flops 800 x 64 x 862 180774502400 0.0% 100.0% 0.0% flops 907 x 64 x 96 200792211456 0.0% 100.0% 0.0% flops 907 x 96 x 64 200792211456 0.0% 100.0% 0.0% flops 907 x 64 x 849 201862152192 0.0% 100.0% 0.0% flops 64 x 64 x 849 203420073984 0.0% 100.0% 0.0% flops 920 x 64 x 96 203670159360 0.0% 100.0% 0.0% flops 920 x 96 x 64 203670159360 0.0% 100.0% 0.0% flops 920 x 64 x 849 204755435520 0.0% 100.0% 0.0% flops 907 x 64 x 862 204953092096 0.0% 100.0% 0.0% flops 929 x 64 x 96 205662584832 0.0% 100.0% 0.0% flops 929 x 96 x 64 205662584832 0.0% 100.0% 0.0% flops 64 x 64 x 862 206534868992 0.0% 100.0% 0.0% flops 929 x 64 x 849 206758477824 0.0% 100.0% 0.0% flops 920 x 64 x 862 207890677760 0.0% 100.0% 0.0% flops 942 x 96 x 64 208540532736 0.0% 100.0% 0.0% flops 942 x 64 x 96 208540532736 0.0% 100.0% 0.0% flops 951 x 64 x 840 209411112960 0.0% 100.0% 0.0% flops 942 x 64 x 849 209651761152 0.0% 100.0% 0.0% flops 929 x 64 x 862 209924390912 0.0% 100.0% 0.0% flops 942 x 64 x 862 212861976576 0.0% 100.0% 0.0% flops 760 x 64 x 840 251029094400 0.0% 100.0% 0.0% flops 778 x 96 x 840 256974520320 0.0% 100.0% 0.0% flops 782 x 96 x 840 258295726080 0.0% 100.0% 0.0% flops 800 x 96 x 840 264241152000 0.0% 100.0% 0.0% flops 96 x 96 x 64 273300848640 0.0% 100.0% 0.0% flops 96 x 64 x 96 273300848640 0.0% 100.0% 0.0% flops 64 x 96 x 96 273300848640 0.0% 100.0% 0.0% flops 907 x 96 x 840 299583406080 0.0% 100.0% 0.0% flops 96 x 64 x 840 301895516160 0.0% 100.0% 0.0% flops 64 x 96 x 840 301895516160 0.0% 100.0% 0.0% flops 920 x 96 x 840 303877324800 0.0% 100.0% 0.0% flops 929 x 96 x 840 306850037760 0.0% 100.0% 0.0% flops 942 x 96 x 840 311143956480 0.0% 100.0% 0.0% flops 9 x 9 x 64 355059998208 0.0% 100.0% 0.0% flops 951 x 64 x 96 421065916416 0.0% 100.0% 0.0% flops 951 x 96 x 64 421065916416 0.0% 100.0% 0.0% flops 951 x 64 x 849 423309606912 0.0% 100.0% 0.0% flops 951 x 64 x 862 429791379456 0.0% 100.0% 0.0% flops 22 x 9 x 64 493014297600 0.0% 100.0% 0.0% flops 9 x 22 x 64 494442584064 0.0% 100.0% 0.0% flops 760 x 96 x 64 504747786240 0.0% 100.0% 0.0% flops 760 x 64 x 96 504747786240 0.0% 100.0% 0.0% flops 760 x 64 x 849 507437383680 0.0% 100.0% 0.0% flops 760 x 64 x 862 515207331840 0.0% 100.0% 0.0% flops 778 x 96 x 96 516702339072 0.0% 100.0% 0.0% flops 782 x 96 x 96 519358906368 0.0% 100.0% 0.0% flops 778 x 96 x 849 519455637504 0.0% 100.0% 0.0% flops 782 x 96 x 849 522126360576 0.0% 100.0% 0.0% flops 778 x 96 x 862 527409610752 0.0% 100.0% 0.0% flops 782 x 96 x 862 530121228288 0.0% 100.0% 0.0% flops 800 x 96 x 96 531313459200 0.0% 100.0% 0.0% flops 800 x 96 x 849 534144614400 0.0% 100.0% 0.0% flops 800 x 96 x 862 542323507200 0.0% 100.0% 0.0% flops 907 x 96 x 96 602376634368 0.0% 100.0% 0.0% flops 907 x 96 x 849 605586456576 0.0% 100.0% 0.0% flops 96 x 64 x 849 610260221952 0.0% 100.0% 0.0% flops 64 x 96 x 849 610260221952 0.0% 100.0% 0.0% flops 920 x 96 x 96 611010478080 0.0% 100.0% 0.0% flops 920 x 96 x 849 614266306560 0.0% 100.0% 0.0% flops 907 x 96 x 862 614859276288 0.0% 100.0% 0.0% flops 929 x 96 x 96 616987754496 0.0% 100.0% 0.0% flops 96 x 64 x 862 619604606976 0.0% 100.0% 0.0% flops 64 x 96 x 862 619604606976 0.0% 100.0% 0.0% flops 929 x 96 x 849 620275433472 0.0% 100.0% 0.0% flops 920 x 96 x 862 623672033280 0.0% 100.0% 0.0% flops 942 x 96 x 96 625621598208 0.0% 100.0% 0.0% flops 951 x 96 x 840 628233338880 0.0% 100.0% 0.0% flops 942 x 96 x 849 628955283456 0.0% 100.0% 0.0% flops 929 x 96 x 862 629773172736 0.0% 100.0% 0.0% flops 942 x 96 x 862 638585929728 0.0% 100.0% 0.0% flops 22 x 22 x 64 683571924992 0.0% 100.0% 0.0% flops 760 x 96 x 840 753087283200 0.0% 100.0% 0.0% flops 96 x 96 x 96 819902545920 0.0% 100.0% 0.0% flops 96 x 96 x 840 905686548480 0.0% 100.0% 0.0% flops 9 x 9 x 96 1065179994624 0.0% 100.0% 0.0% flops 951 x 96 x 96 1263197749248 0.0% 100.0% 0.0% flops 951 x 96 x 849 1269928820736 0.0% 100.0% 0.0% flops 951 x 96 x 862 1289374138368 0.0% 100.0% 0.0% flops 22 x 9 x 96 1479042892800 0.0% 100.0% 0.0% flops 9 x 22 x 96 1483327752192 0.0% 100.0% 0.0% flops 760 x 96 x 96 1514243358720 0.0% 100.0% 0.0% flops 760 x 96 x 849 1522312151040 0.0% 100.0% 0.0% flops 760 x 96 x 862 1545621995520 0.0% 100.0% 0.0% flops 96 x 96 x 849 1830780665856 0.0% 100.0% 0.0% flops 96 x 96 x 862 1858813820928 0.0% 100.0% 0.0% flops 22 x 22 x 96 2050715774976 0.0% 100.0% 0.0% flops 9 x 32 x 9 5962613575680 0.0% 100.0% 0.0% flops 22 x 32 x 9 8325932617728 0.0% 100.0% 0.0% flops 9 x 32 x 22 8325932617728 0.0% 100.0% 0.0% flops 22 x 32 x 22 11452938371072 0.0% 100.0% 0.0% flops inhomo. stacks 6920499888128 100.0% 0.0% 0.0% flops total 94.184287E+12 7.3% 92.7% 0.0% flops max/rank 1.486220E+12 7.4% 92.6% 0.0% matmuls inhomo. stacks 168480 100.0% 0.0% 0.0% matmuls total 3090542384 0.0% 100.0% 0.0% number of processed stacks 8217740 2.1% 97.9% 0.0% average stack size 1.0 383.9 0.0 marketing flops 144.582793E+12 ------------------------------------------------------------------------------- # multiplications 2507 max memory usage/rank 409.513984E+06 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 3790584 MPI messages size (bytes): total size 4.296164E+12 min size 0.000000E+00 max size 11.799056E+06 average size 1.133378E+06 MPI breakdown and total messages size (bytes): size <= 128 23892 0 128 < size <= 8192 0 0 8192 < size <= 32768 118100 3862691840 32768 < size <= 131072 1497000 117745909760 131072 < size <= 4194304 1818160 1529661358080 4194304 < size <= 16777216 333432 2644950338448 16777216 < size 0 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3992 58131. MP_Allreduce 11058 879. MP_Sync 87 MP_Alltoall 1969 4864969. MP_SendRecv 18176 24736. MP_ISendRecv 18176 24736. MP_Wait 36332 MP_ISend 16020 128768. MP_IRecv 16020 128768. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.036 0.101 271.694 271.788 qs_mol_dyn_low 1 2.0 0.102 0.218 270.834 270.891 qs_forces 11 3.9 0.024 0.083 270.216 270.373 qs_energies 11 4.9 0.002 0.003 260.900 261.081 scf_env_do_scf 11 5.9 0.001 0.005 239.697 239.762 scf_env_do_scf_inner_loop 117 6.6 0.004 0.038 209.149 209.232 velocity_verlet 10 3.0 0.001 0.001 170.053 170.127 dbcsr_multiply_generic 2507 12.6 0.268 0.336 143.596 147.196 qs_scf_new_mos 117 7.6 0.001 0.002 128.742 130.610 qs_scf_loop_do_ot 117 8.6 0.001 0.002 128.740 130.609 ot_scf_mini 117 9.6 0.004 0.006 119.669 121.368 multiply_cannon 2507 13.6 0.334 0.415 108.689 115.754 multiply_cannon_loop 2507 14.6 0.603 0.761 102.061 111.216 multiply_cannon_multrec 30084 15.6 82.954 96.384 82.993 96.423 ot_mini 117 10.6 0.001 0.002 71.640 73.804 rebuild_ks_matrix 128 8.3 0.001 0.001 60.685 62.109 qs_ks_build_kohn_sham_matrix 128 9.3 0.021 0.027 60.685 62.109 mp_waitall_1 240928 16.6 22.985 56.958 22.985 56.958 qs_ks_update_qs_env 128 7.6 0.001 0.002 54.838 56.145 multiply_cannon_metrocomm3 30084 15.6 0.159 0.295 9.863 43.467 apply_preconditioner_dbcsr 128 12.6 0.001 0.001 36.794 39.057 apply_single 128 13.6 0.001 0.002 36.794 39.057 ot_diis_step 117 11.6 0.018 0.021 37.648 37.661 qs_ot_get_derivative 117 11.6 0.002 0.003 33.676 35.465 qs_rho_update_rho_low 128 7.7 0.001 0.001 33.527 33.780 calculate_rho_elec 128 8.7 0.075 0.081 33.526 33.779 sum_up_and_integrate 128 10.3 0.004 0.008 31.228 31.265 integrate_v_rspace 128 11.3 0.005 0.006 31.123 31.168 init_scf_loop 11 6.9 0.001 0.001 30.403 30.420 make_m2s 5014 13.6 0.159 0.190 24.374 25.267 qs_ot_get_p 128 10.4 0.001 0.002 20.161 22.999 prepare_preconditioner 11 7.9 0.000 0.000 22.473 22.716 make_preconditioner 11 8.9 0.000 0.000 22.473 22.716 grid_collocate_task_list 128 9.7 21.718 22.635 21.718 22.635 grid_integrate_task_list 128 12.3 20.854 21.824 20.854 21.824 make_images 5014 14.6 1.356 1.719 20.613 21.738 make_full_inverse_cholesky 11 9.9 0.000 0.001 21.082 21.508 qs_ot_get_derivative_diag 77 12.4 0.003 0.004 14.640 15.936 init_scf_run 11 5.9 0.000 0.010 15.877 15.883 scf_env_initial_rho_setup 11 6.9 0.000 0.008 15.877 15.883 multiply_cannon_metrocomm4 27577 15.6 0.170 0.284 6.072 15.439 mp_irecv_dv 74558 16.2 6.134 15.164 6.134 15.164 mp_sum_l 12367 13.3 7.070 14.924 7.070 14.924 make_images_data 5014 15.6 0.092 0.152 11.863 13.726 wfi_extrapolate 11 7.9 0.001 0.002 13.582 13.587 hybrid_alltoall_any 5200 16.5 0.460 2.813 10.283 12.983 qs_ot_p2m_diag 83 11.4 0.114 0.151 12.130 12.203 fft_wrap_pw1pw2 1291 11.7 0.029 0.044 11.689 11.850 cp_fm_cholesky_invert 11 10.9 11.391 11.418 11.391 11.418 cp_dbcsr_syevd 83 12.4 0.006 0.007 11.266 11.305 calculate_dm_sparse 128 9.5 0.001 0.002 10.042 10.997 fft_wrap_pw1pw2_140 523 12.2 0.525 0.555 10.481 10.678 density_rs2pw 128 9.7 0.009 0.013 9.276 10.168 fft3d_ps 1291 13.7 4.282 4.574 9.317 9.447 cp_dbcsr_sm_fm_multiply 37 9.5 0.003 0.004 8.261 8.282 mp_sum_d 4469 12.1 4.327 8.204 4.327 8.204 mp_alltoall_d11v 2415 14.1 6.924 7.937 6.924 7.937 dbcsr_dot_sd 1318 12.0 1.038 1.131 4.307 7.565 cp_fm_diag_elpa 83 13.4 0.001 0.001 7.549 7.559 cp_fm_diag_elpa_base 83 14.4 7.399 7.423 7.536 7.538 qs_ot_get_orbitals 117 10.6 0.001 0.002 7.312 7.537 potential_pw2rs 128 12.3 0.018 0.022 7.260 7.326 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 6.643 7.058 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 6.480 6.642 cp_fm_cholesky_decompose 22 10.9 6.455 6.517 6.455 6.517 qs_ot_get_derivative_taylor 40 13.0 0.002 0.002 5.450 5.962 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="402", plot="h2o_256_md", label="(4n/18r/2t)", y=271.788000, yerr=0.000000 PlotPoint: name="403", plot="h2o_256_md_mem", label="(4n/18r/2t)", y=384.909091, yerr=4.010317 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/23/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 3321888768 0.0% 100.0% 0.0% flops 22 x 32 x 32 4060086272 0.0% 100.0% 0.0% flops 128 x 64 x 64 15183380480 0.0% 100.0% 0.0% flops 64 x 64 x 64 37958451200 0.0% 100.0% 0.0% flops 128 x 96 x 64 45550141440 0.0% 100.0% 0.0% flops 128 x 64 x 96 45550141440 0.0% 100.0% 0.0% flops 96 x 64 x 64 68325212160 0.0% 100.0% 0.0% flops 128 x 64 x 862 103267434496 0.0% 100.0% 0.0% flops 64 x 96 x 64 113875353600 0.0% 100.0% 0.0% flops 64 x 64 x 96 113875353600 0.0% 100.0% 0.0% flops 876 x 64 x 64 129286275072 0.0% 100.0% 0.0% flops 911 x 64 x 64 134451822592 0.0% 100.0% 0.0% flops 128 x 96 x 96 136650424320 0.0% 100.0% 0.0% flops 929 x 64 x 64 137108389888 0.0% 100.0% 0.0% flops 964 x 64 x 64 142273937408 0.0% 100.0% 0.0% flops 128 x 64 x 849 203420073984 0.0% 100.0% 0.0% flops 96 x 64 x 96 204975636480 0.0% 100.0% 0.0% flops 96 x 96 x 64 204975636480 0.0% 100.0% 0.0% flops 720 x 64 x 64 212525383680 0.0% 100.0% 0.0% flops 64 x 64 x 862 258168586240 0.0% 100.0% 0.0% flops 128 x 96 x 862 309802303488 0.0% 100.0% 0.0% flops 64 x 96 x 96 341626060800 0.0% 100.0% 0.0% flops 9 x 9 x 64 355059998208 0.0% 100.0% 0.0% flops 876 x 64 x 96 387858825216 0.0% 100.0% 0.0% flops 876 x 96 x 64 387858825216 0.0% 100.0% 0.0% flops 876 x 64 x 862 395896160256 0.0% 100.0% 0.0% flops 911 x 96 x 64 403355467776 0.0% 100.0% 0.0% flops 911 x 64 x 96 403355467776 0.0% 100.0% 0.0% flops 929 x 96 x 64 411325169664 0.0% 100.0% 0.0% flops 929 x 64 x 96 411325169664 0.0% 100.0% 0.0% flops 911 x 64 x 862 411713929216 0.0% 100.0% 0.0% flops 929 x 64 x 862 419848781824 0.0% 100.0% 0.0% flops 964 x 64 x 96 426821812224 0.0% 100.0% 0.0% flops 964 x 96 x 64 426821812224 0.0% 100.0% 0.0% flops 964 x 64 x 862 435666550784 0.0% 100.0% 0.0% flops 96 x 64 x 862 464703455232 0.0% 100.0% 0.0% flops 22 x 9 x 64 493014297600 0.0% 100.0% 0.0% flops 9 x 22 x 64 494442584064 0.0% 100.0% 0.0% flops 64 x 64 x 849 508550184960 0.0% 100.0% 0.0% flops 128 x 96 x 849 610260221952 0.0% 100.0% 0.0% flops 96 x 96 x 96 614926909440 0.0% 100.0% 0.0% flops 720 x 96 x 64 637576151040 0.0% 100.0% 0.0% flops 720 x 64 x 96 637576151040 0.0% 100.0% 0.0% flops 720 x 64 x 862 650788208640 0.0% 100.0% 0.0% flops 22 x 22 x 64 683571924992 0.0% 100.0% 0.0% flops 64 x 96 x 862 774505758720 0.0% 100.0% 0.0% flops 876 x 64 x 849 779851137024 0.0% 100.0% 0.0% flops 911 x 64 x 849 811009572864 0.0% 100.0% 0.0% flops 929 x 64 x 849 827033911296 0.0% 100.0% 0.0% flops 964 x 64 x 849 858192347136 0.0% 100.0% 0.0% flops 96 x 64 x 849 915390332928 0.0% 100.0% 0.0% flops 9 x 9 x 96 1065179994624 0.0% 100.0% 0.0% flops 876 x 96 x 96 1163576475648 0.0% 100.0% 0.0% flops 876 x 96 x 862 1187688480768 0.0% 100.0% 0.0% flops 911 x 96 x 96 1210066403328 0.0% 100.0% 0.0% flops 929 x 96 x 96 1233975508992 0.0% 100.0% 0.0% flops 911 x 96 x 862 1235141787648 0.0% 100.0% 0.0% flops 929 x 96 x 862 1259546345472 0.0% 100.0% 0.0% flops 964 x 96 x 96 1280465436672 0.0% 100.0% 0.0% flops 720 x 64 x 849 1281947074560 0.0% 100.0% 0.0% flops 964 x 96 x 862 1306999652352 0.0% 100.0% 0.0% flops 96 x 96 x 862 1394110365696 0.0% 100.0% 0.0% flops 22 x 9 x 96 1479042892800 0.0% 100.0% 0.0% flops 9 x 22 x 96 1483327752192 0.0% 100.0% 0.0% flops 64 x 96 x 849 1525650554880 0.0% 100.0% 0.0% flops 720 x 96 x 96 1912728453120 0.0% 100.0% 0.0% flops 720 x 96 x 862 1952364625920 0.0% 100.0% 0.0% flops 22 x 22 x 96 2050715774976 0.0% 100.0% 0.0% flops 876 x 96 x 849 2339553411072 0.0% 100.0% 0.0% flops 911 x 96 x 849 2433028718592 0.0% 100.0% 0.0% flops 929 x 96 x 849 2481101733888 0.0% 100.0% 0.0% flops 964 x 96 x 849 2574577041408 0.0% 100.0% 0.0% flops 96 x 96 x 849 2746170998784 0.0% 100.0% 0.0% flops 720 x 96 x 849 3845841223680 0.0% 100.0% 0.0% flops 9 x 32 x 9 5962613575680 0.0% 100.0% 0.0% flops 22 x 32 x 9 8325932617728 0.0% 100.0% 0.0% flops 9 x 32 x 22 8325932617728 0.0% 100.0% 0.0% flops 22 x 32 x 22 11452938371072 0.0% 100.0% 0.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 93.514751E+12 0.0% 100.0% 0.0% flops max/rank 2.183246E+12 0.0% 100.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 3090542384 0.0% 100.0% 0.0% number of processed stacks 8274452 0.0% 100.0% 0.0% average stack size 0.0 373.5 0.0 marketing flops 144.580175E+12 ------------------------------------------------------------------------------- # multiplications 2507 max memory usage/rank 521.355264E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 2406720 MPI messages size (bytes): total size 4.100942E+12 min size 0.000000E+00 max size 17.653760E+06 average size 1.703955E+06 MPI breakdown and total messages size (bytes): size <= 128 14916 0 128 < size <= 8192 0 0 8192 < size <= 32768 70860 2317615104 32768 < size <= 131072 722992 55511613440 131072 < size <= 4194304 1375664 1398181724160 4194304 < size <= 16777216 154704 1463834332048 16777216 < size 67584 1181116006400 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3992 58357. MP_Allreduce 11058 960. MP_Sync 87 MP_Alltoall 1969 4570534. MP_SendRecv 12032 47072. MP_ISendRecv 12032 47072. MP_Wait 25916 MP_ISend 11748 212467. MP_IRecv 11748 212467. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.054 0.074 249.763 249.767 qs_mol_dyn_low 1 2.0 0.030 0.110 248.484 248.516 qs_forces 11 3.9 0.004 0.005 248.106 248.224 qs_energies 11 4.9 0.001 0.002 239.890 240.069 scf_env_do_scf 11 5.9 0.001 0.001 219.908 219.936 scf_env_do_scf_inner_loop 117 6.6 0.003 0.013 187.889 187.896 velocity_verlet 10 3.0 0.001 0.001 157.419 157.425 dbcsr_multiply_generic 2507 12.6 0.229 0.250 132.752 136.541 qs_scf_new_mos 117 7.6 0.001 0.001 120.989 122.361 qs_scf_loop_do_ot 117 8.6 0.001 0.001 120.988 122.360 ot_scf_mini 117 9.6 0.004 0.004 112.827 114.652 multiply_cannon 2507 13.6 0.313 0.342 100.205 105.876 multiply_cannon_loop 2507 14.6 0.568 0.654 95.252 102.566 multiply_cannon_multrec 30084 15.6 74.866 91.580 74.901 91.618 ot_mini 117 10.6 0.001 0.001 65.538 67.501 rebuild_ks_matrix 128 8.3 0.001 0.001 51.502 53.005 qs_ks_build_kohn_sham_matrix 128 9.3 0.017 0.021 51.501 53.004 mp_waitall_1 214728 16.6 23.395 50.640 23.395 50.640 qs_ks_update_qs_env 128 7.6 0.001 0.002 46.495 47.905 multiply_cannon_metrocomm3 30084 15.6 0.128 0.143 12.418 39.884 apply_preconditioner_dbcsr 128 12.6 0.000 0.001 35.571 38.234 apply_single 128 13.6 0.001 0.001 35.570 38.234 ot_diis_step 117 11.6 0.021 0.022 36.435 36.437 init_scf_loop 11 6.9 0.000 0.000 31.893 31.897 qs_ot_get_derivative 117 11.6 0.002 0.002 28.818 30.691 qs_rho_update_rho_low 128 7.7 0.001 0.001 27.033 27.246 calculate_rho_elec 128 8.7 0.109 0.132 27.033 27.246 sum_up_and_integrate 128 10.3 0.002 0.006 25.364 25.455 integrate_v_rspace 128 11.3 0.004 0.005 25.294 25.419 prepare_preconditioner 11 7.9 0.000 0.000 24.899 25.079 make_preconditioner 11 8.9 0.000 0.000 24.899 25.079 make_full_inverse_cholesky 11 9.9 0.000 0.001 23.128 23.803 qs_ot_get_p 128 10.4 0.001 0.001 20.446 23.081 make_m2s 5014 13.6 0.147 0.163 21.901 23.046 make_images 5014 14.6 1.379 1.982 18.493 19.746 grid_integrate_task_list 128 12.3 16.867 17.632 16.867 17.632 grid_collocate_task_list 128 9.7 16.435 17.585 16.435 17.585 multiply_cannon_metrocomm4 27577 15.6 0.136 0.159 6.803 17.527 mp_irecv_dv 69486 16.3 6.615 17.268 6.615 17.268 init_scf_run 11 5.9 0.000 0.001 15.039 15.040 scf_env_initial_rho_setup 11 6.9 0.000 0.001 15.038 15.040 qs_ot_get_derivative_diag 77 12.4 0.003 0.003 12.413 13.750 mp_sum_l 12367 13.3 7.107 13.341 7.107 13.341 qs_ot_p2m_diag 83 11.4 0.164 0.195 13.019 13.136 wfi_extrapolate 11 7.9 0.001 0.001 12.378 12.378 make_images_data 5014 15.6 0.081 0.100 11.136 12.341 cp_dbcsr_syevd 83 12.4 0.006 0.007 12.105 12.196 cp_fm_cholesky_invert 11 10.9 11.961 11.988 11.961 11.988 hybrid_alltoall_any 5200 16.5 0.330 1.418 9.849 11.735 fft_wrap_pw1pw2 1291 11.7 0.020 0.027 10.084 10.564 density_rs2pw 128 9.7 0.007 0.008 8.241 10.288 calculate_dm_sparse 128 9.5 0.001 0.001 9.037 9.782 fft3d_ps 1291 13.7 3.036 3.867 8.453 9.482 mp_sum_d 4469 12.1 4.788 9.140 4.788 9.140 fft_wrap_pw1pw2_140 523 12.2 0.329 0.393 8.826 8.997 cp_fm_diag_elpa 83 13.4 0.001 0.001 8.254 8.261 cp_fm_diag_elpa_base 83 14.4 8.008 8.047 8.238 8.238 mp_alltoall_d11v 2415 14.1 6.648 7.868 6.648 7.868 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.003 7.750 7.761 dbcsr_dot_sd 1318 12.0 0.921 1.065 4.362 7.476 mp_alltoall_z22v 1291 15.7 4.721 7.221 4.721 7.221 cp_fm_cholesky_decompose 22 10.9 6.612 6.796 6.612 6.796 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 6.161 6.613 qs_ot_get_orbitals 117 10.6 0.001 0.001 6.259 6.435 mp_waitany 11748 13.9 3.371 6.052 3.371 6.052 potential_pw2rs 128 12.3 0.020 0.026 5.789 5.984 transfer_rs2pw 523 10.6 0.007 0.008 3.537 5.906 dbcsr_complete_redistribute 395 12.7 0.978 1.173 4.616 5.728 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 5.547 5.696 mp_allgather_i34 2507 14.6 2.035 5.671 2.035 5.671 qs_ot_get_derivative_taylor 40 13.0 0.001 0.002 4.696 5.240 make_images_sizes 5014 15.6 0.008 0.009 2.398 5.018 mp_alltoall_i44 5014 16.6 2.390 5.011 2.390 5.011 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="404", plot="h2o_256_md", label="(4n/12r/3t)", y=249.767000, yerr=0.000000 PlotPoint: name="405", plot="h2o_256_md_mem", label="(4n/12r/3t)", y=481.818182, yerr=2.166614 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/24/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 3321888768 0.0% 100.0% 0.0% flops 22 x 32 x 32 4060086272 0.0% 100.0% 0.0% flops 96 x 192 x 192 25621954560 0.0% 100.0% 0.0% flops 96 x 160 x 192 42703257600 0.0% 100.0% 0.0% flops 96 x 192 x 160 42703257600 0.0% 100.0% 0.0% flops 347 x 192 x 192 57614303232 0.0% 100.0% 0.0% flops 369 x 192 x 192 61267083264 0.0% 100.0% 0.0% flops 96 x 160 x 160 71172096000 0.0% 100.0% 0.0% flops 347 x 160 x 192 96023838720 0.0% 100.0% 0.0% flops 347 x 192 x 160 96023838720 0.0% 100.0% 0.0% flops 369 x 160 x 192 102111805440 0.0% 100.0% 0.0% flops 369 x 192 x 160 102111805440 0.0% 100.0% 0.0% flops 64 x 192 x 192 102487818240 0.0% 100.0% 0.0% flops 320 x 192 x 192 106262691840 0.0% 100.0% 0.0% flops 342 x 192 x 192 113568251904 0.0% 100.0% 0.0% flops 96 x 192 x 1702 114693341184 0.0% 100.0% 0.0% flops 96 x 192 x 1707 115030278144 0.0% 100.0% 0.0% flops 96 x 192 x 1711 115299827712 0.0% 100.0% 0.0% flops 347 x 192 x 1702 116115505152 0.0% 100.0% 0.0% flops 347 x 192 x 1707 116456620032 0.0% 100.0% 0.0% flops 347 x 192 x 1711 116729511936 0.0% 100.0% 0.0% flops 369 x 192 x 1702 123477295104 0.0% 100.0% 0.0% flops 369 x 192 x 1707 123840036864 0.0% 100.0% 0.0% flops 369 x 192 x 1711 124130230272 0.0% 100.0% 0.0% flops 32 x 192 x 192 145191075840 0.0% 100.0% 0.0% flops 440 x 192 x 192 146111201280 0.0% 100.0% 0.0% flops 462 x 192 x 192 153416761344 0.0% 100.0% 0.0% flops 471 x 192 x 192 156405399552 0.0% 100.0% 0.0% flops 347 x 160 x 160 160039731200 0.0% 100.0% 0.0% flops 369 x 160 x 160 170186342400 0.0% 100.0% 0.0% flops 64 x 192 x 160 170813030400 0.0% 100.0% 0.0% flops 64 x 160 x 192 170813030400 0.0% 100.0% 0.0% flops 320 x 192 x 160 177104486400 0.0% 100.0% 0.0% flops 320 x 160 x 192 177104486400 0.0% 100.0% 0.0% flops 342 x 160 x 192 189280419840 0.0% 100.0% 0.0% flops 342 x 192 x 160 189280419840 0.0% 100.0% 0.0% flops 96 x 160 x 1702 191155568640 0.0% 100.0% 0.0% flops 96 x 160 x 1707 191717130240 0.0% 100.0% 0.0% flops 96 x 160 x 1711 192166379520 0.0% 100.0% 0.0% flops 347 x 160 x 1702 193525841920 0.0% 100.0% 0.0% flops 347 x 160 x 1707 194094366720 0.0% 100.0% 0.0% flops 347 x 160 x 1711 194549186560 0.0% 100.0% 0.0% flops 369 x 160 x 1702 205795491840 0.0% 100.0% 0.0% flops 369 x 160 x 1707 206400061440 0.0% 100.0% 0.0% flops 369 x 160 x 1711 206883717120 0.0% 100.0% 0.0% flops 320 x 192 x 1702 214161162240 0.0% 100.0% 0.0% flops 320 x 192 x 1707 214790307840 0.0% 100.0% 0.0% flops 320 x 192 x 1711 215293624320 0.0% 100.0% 0.0% flops 342 x 192 x 1702 228884742144 0.0% 100.0% 0.0% flops 342 x 192 x 1707 229557141504 0.0% 100.0% 0.0% flops 342 x 192 x 1711 230095060992 0.0% 100.0% 0.0% flops 32 x 192 x 160 241985126400 0.0% 100.0% 0.0% flops 32 x 160 x 192 241985126400 0.0% 100.0% 0.0% flops 440 x 160 x 192 243518668800 0.0% 100.0% 0.0% flops 440 x 192 x 160 243518668800 0.0% 100.0% 0.0% flops 462 x 192 x 160 255694602240 0.0% 100.0% 0.0% flops 462 x 160 x 192 255694602240 0.0% 100.0% 0.0% flops 471 x 160 x 192 260675665920 0.0% 100.0% 0.0% flops 471 x 192 x 160 260675665920 0.0% 100.0% 0.0% flops 64 x 160 x 160 284688384000 0.0% 100.0% 0.0% flops 440 x 192 x 1702 294471598080 0.0% 100.0% 0.0% flops 320 x 160 x 160 295174144000 0.0% 100.0% 0.0% flops 440 x 192 x 1707 295336673280 0.0% 100.0% 0.0% flops 440 x 192 x 1711 296028733440 0.0% 100.0% 0.0% flops 462 x 192 x 1702 309195177984 0.0% 100.0% 0.0% flops 462 x 192 x 1707 310103506944 0.0% 100.0% 0.0% flops 462 x 192 x 1711 310830170112 0.0% 100.0% 0.0% flops 471 x 192 x 1702 315218460672 0.0% 100.0% 0.0% flops 342 x 160 x 160 315467366400 0.0% 100.0% 0.0% flops 471 x 192 x 1707 316144484352 0.0% 100.0% 0.0% flops 471 x 192 x 1711 316885303296 0.0% 100.0% 0.0% flops 320 x 160 x 1702 356935270400 0.0% 100.0% 0.0% flops 320 x 160 x 1707 357983846400 0.0% 100.0% 0.0% flops 320 x 160 x 1711 358822707200 0.0% 100.0% 0.0% flops 342 x 160 x 1702 381474570240 0.0% 100.0% 0.0% flops 342 x 160 x 1707 382595235840 0.0% 100.0% 0.0% flops 342 x 160 x 1711 383491768320 0.0% 100.0% 0.0% flops 32 x 160 x 160 403308544000 0.0% 100.0% 0.0% flops 440 x 160 x 160 405864448000 0.0% 100.0% 0.0% flops 462 x 160 x 160 426157670400 0.0% 100.0% 0.0% flops 471 x 160 x 160 434459443200 0.0% 100.0% 0.0% flops 64 x 192 x 1702 458773364736 0.0% 100.0% 0.0% flops 64 x 192 x 1707 460121112576 0.0% 100.0% 0.0% flops 64 x 192 x 1711 461199310848 0.0% 100.0% 0.0% flops 440 x 160 x 1702 490785996800 0.0% 100.0% 0.0% flops 440 x 160 x 1707 492227788800 0.0% 100.0% 0.0% flops 440 x 160 x 1711 493381222400 0.0% 100.0% 0.0% flops 462 x 160 x 1702 515325296640 0.0% 100.0% 0.0% flops 462 x 160 x 1707 516839178240 0.0% 100.0% 0.0% flops 462 x 160 x 1711 518050283520 0.0% 100.0% 0.0% flops 471 x 160 x 1702 525364101120 0.0% 100.0% 0.0% flops 471 x 160 x 1707 526907473920 0.0% 100.0% 0.0% flops 471 x 160 x 1711 528142172160 0.0% 100.0% 0.0% flops 9 x 9 x 192 532589997312 0.0% 100.0% 0.0% flops 32 x 192 x 1702 649928933376 0.0% 100.0% 0.0% flops 32 x 192 x 1707 651838242816 0.0% 100.0% 0.0% flops 32 x 192 x 1711 653365690368 0.0% 100.0% 0.0% flops 22 x 9 x 192 739521446400 0.0% 100.0% 0.0% flops 9 x 22 x 192 741663876096 0.0% 100.0% 0.0% flops 449 x 192 x 192 745499197440 0.0% 100.0% 0.0% flops 64 x 160 x 1702 764622274560 0.0% 100.0% 0.0% flops 64 x 160 x 1707 766868520960 0.0% 100.0% 0.0% flops 64 x 160 x 1711 768665518080 0.0% 100.0% 0.0% flops 9 x 9 x 160 887649995520 0.0% 100.0% 0.0% flops 22 x 22 x 192 1025357887488 0.0% 100.0% 0.0% flops 32 x 160 x 1702 1083214888960 0.0% 100.0% 0.0% flops 32 x 160 x 1707 1086397071360 0.0% 100.0% 0.0% flops 32 x 160 x 1711 1088942817280 0.0% 100.0% 0.0% flops 22 x 9 x 160 1232535744000 0.0% 100.0% 0.0% flops 9 x 22 x 160 1236106460160 0.0% 100.0% 0.0% flops 449 x 192 x 160 1242498662400 0.0% 100.0% 0.0% flops 449 x 160 x 192 1242498662400 0.0% 100.0% 0.0% flops 449 x 192 x 1702 1502474403840 0.0% 100.0% 0.0% flops 449 x 192 x 1707 1506888253440 0.0% 100.0% 0.0% flops 449 x 192 x 1711 1510419333120 0.0% 100.0% 0.0% flops 22 x 22 x 160 1708929812480 0.0% 100.0% 0.0% flops 449 x 160 x 160 2070831104000 0.0% 100.0% 0.0% flops 449 x 160 x 1702 2504124006400 0.0% 100.0% 0.0% flops 449 x 160 x 1707 2511480422400 0.0% 100.0% 0.0% flops 449 x 160 x 1711 2517365555200 0.0% 100.0% 0.0% flops 9 x 32 x 9 5962613575680 0.0% 100.0% 0.0% flops 9 x 32 x 22 8325932617728 0.0% 100.0% 0.0% flops 22 x 32 x 9 8325932617728 0.0% 100.0% 0.0% flops 22 x 32 x 22 11452938371072 0.0% 100.0% 0.0% flops inhomo. stacks 3874753609728 100.0% 0.0% 0.0% flops total 93.663595E+12 4.1% 95.9% 0.0% flops max/rank 3.319256E+12 11.2% 88.8% 0.0% matmuls inhomo. stacks 49752 100.0% 0.0% 0.0% matmuls total 2962405880 0.0% 100.0% 0.0% number of processed stacks 5617614 0.9% 99.1% 0.0% average stack size 1.0 532.0 0.0 marketing flops 144.582793E+12 ------------------------------------------------------------------------------- # multiplications 2507 max memory usage/rank 665.788416E+06 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 902520 MPI messages size (bytes): total size 2.219095E+12 min size 0.000000E+00 max size 23.420168E+06 average size 2.458777E+06 MPI breakdown and total messages size (bytes): size <= 128 5610 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 330 14417920 131072 < size <= 4194304 820800 1016825118720 4194304 < size <= 16777216 52740 665379241840 16777216 < size 23040 536870912000 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 125 12. MP_Allreduce 13570 37. MP_Alltoall 9654 365242. MP_ISend 60124 1071152. MP_IRecv 60124 1064037. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 3992 58120. MP_Allreduce 11057 1001. MP_Sync 87 MP_Alltoall 1712 8598467. MP_SendRecv 8960 54272. MP_ISendRecv 8960 54272. MP_Wait 20708 MP_ISend 9612 307844. MP_IRecv 9612 307844. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.141 0.209 263.050 263.080 qs_mol_dyn_low 1 2.0 0.116 0.157 261.648 261.669 qs_forces 11 3.9 0.050 0.115 261.055 261.129 qs_energies 11 4.9 0.005 0.019 251.431 251.533 scf_env_do_scf 11 5.9 0.001 0.004 230.183 230.201 scf_env_do_scf_inner_loop 117 6.6 0.006 0.023 199.577 199.621 velocity_verlet 10 3.0 0.010 0.018 164.891 164.932 dbcsr_multiply_generic 2507 12.6 0.245 0.303 130.382 133.311 qs_scf_new_mos 117 7.6 0.001 0.002 121.400 123.071 qs_scf_loop_do_ot 117 8.6 0.001 0.002 121.399 123.069 ot_scf_mini 117 9.6 0.004 0.006 112.430 113.657 multiply_cannon 2507 13.6 0.324 0.353 93.797 102.182 multiply_cannon_loop 2507 14.6 0.300 0.380 86.190 93.583 multiply_cannon_multrec 15042 15.6 73.896 83.636 73.931 83.670 ot_mini 117 10.6 0.002 0.002 66.692 67.987 rebuild_ks_matrix 128 8.3 0.001 0.001 56.022 57.853 qs_ks_build_kohn_sham_matrix 128 9.3 0.019 0.022 56.021 57.852 qs_ks_update_qs_env 128 7.6 0.001 0.001 50.712 52.477 qs_rho_update_rho_low 128 7.7 0.001 0.001 35.247 35.605 calculate_rho_elec 128 8.7 0.147 0.157 35.247 35.604 apply_preconditioner_dbcsr 128 12.6 0.001 0.001 32.251 34.938 apply_single 128 13.6 0.001 0.001 32.250 34.938 mp_waitall_1 171544 16.6 19.400 34.009 19.400 34.009 ot_diis_step 117 11.6 0.039 0.068 33.834 33.836 qs_ot_get_derivative 117 11.6 0.002 0.003 32.549 33.782 init_scf_loop 11 6.9 0.005 0.022 30.450 30.492 sum_up_and_integrate 128 10.3 0.004 0.008 29.623 29.703 integrate_v_rspace 128 11.3 0.004 0.006 29.517 29.597 make_m2s 5014 13.6 0.133 0.161 24.615 25.846 grid_collocate_task_list 128 9.7 24.141 24.710 24.141 24.710 prepare_preconditioner 11 7.9 0.000 0.000 22.781 22.953 make_preconditioner 11 8.9 0.001 0.013 22.780 22.953 qs_ot_get_p 128 10.4 0.001 0.002 21.060 22.845 make_images 5014 14.6 2.187 2.798 20.952 22.059 grid_integrate_task_list 128 12.3 20.919 21.422 20.919 21.422 make_full_inverse_cholesky 11 9.9 0.000 0.002 21.053 21.413 multiply_cannon_metrocomm3 15042 15.6 0.060 0.113 3.581 17.589 init_scf_run 11 5.9 0.000 0.004 15.300 15.301 scf_env_initial_rho_setup 11 6.9 0.001 0.003 15.299 15.301 qs_ot_get_derivative_diag 77 12.4 0.003 0.004 14.288 15.228 make_images_data 5014 15.6 0.077 0.122 12.542 14.355 hybrid_alltoall_any 5200 16.5 0.750 4.753 10.677 13.676 qs_ot_p2m_diag 83 11.4 0.217 0.283 12.674 12.777 wfi_extrapolate 11 7.9 0.001 0.002 12.776 12.776 fft_wrap_pw1pw2 1291 11.7 0.025 0.033 11.649 11.856 cp_dbcsr_syevd 83 12.4 0.006 0.007 11.670 11.688 mp_sum_l 12367 13.3 7.985 11.557 7.985 11.557 cp_fm_cholesky_invert 11 10.9 11.347 11.361 11.347 11.361 calculate_dm_sparse 128 9.5 0.001 0.002 9.954 10.997 fft_wrap_pw1pw2_140 523 12.2 0.552 0.579 10.497 10.769 mp_irecv_dv 36752 16.2 4.098 10.386 4.098 10.386 multiply_cannon_metrocomm4 12535 15.6 0.062 0.118 3.767 9.394 fft3d_ps 1291 13.7 4.441 4.904 9.185 9.355 density_rs2pw 128 9.7 0.008 0.012 8.477 9.027 cp_fm_diag_elpa 83 13.4 0.001 0.001 7.920 7.932 cp_fm_diag_elpa_base 83 14.4 7.766 7.795 7.908 7.908 cp_dbcsr_sm_fm_multiply 37 9.5 0.003 0.004 7.545 7.565 mp_sum_d 4467 12.1 4.969 7.149 4.969 7.149 dbcsr_dot_sd 1318 12.0 1.294 1.383 4.827 7.116 mp_alltoall_d11v 2415 14.1 6.585 7.111 6.585 7.111 cp_fm_cholesky_decompose 22 10.9 6.914 7.026 6.914 7.026 multiply_cannon_metrocomm1 15042 15.6 0.076 0.128 3.925 6.813 qs_ot_get_orbitals 117 10.6 0.001 0.001 6.117 6.341 potential_pw2rs 128 12.3 0.028 0.031 6.236 6.293 mp_allgather_i34 2507 14.6 2.499 6.289 2.499 6.289 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 6.058 6.253 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 5.742 6.203 qs_ot_get_derivative_taylor 40 13.0 0.001 0.002 5.358 5.697 dbcsr_complete_redistribute 395 12.7 1.384 1.568 5.001 5.455 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="406", plot="h2o_256_md", label="(4n/9r/4t)", y=263.080000, yerr=0.000000 PlotPoint: name="407", plot="h2o_256_md_mem", label="(4n/9r/4t)", y=621.727273, yerr=10.401843 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/25/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 3321888768 0.0% 100.0% 0.0% flops 22 x 32 x 32 4060086272 0.0% 100.0% 0.0% flops 96 x 192 x 64 17081303040 0.0% 100.0% 0.0% flops 96 x 160 x 64 28468838400 0.0% 100.0% 0.0% flops 267 x 192 x 64 29554311168 0.0% 100.0% 0.0% flops 289 x 192 x 64 31989497856 0.0% 100.0% 0.0% flops 311 x 192 x 64 34424684544 0.0% 100.0% 0.0% flops 342 x 192 x 64 37856083968 0.0% 100.0% 0.0% flops 409 x 192 x 64 45272334336 0.0% 100.0% 0.0% flops 418 x 192 x 64 46268547072 0.0% 100.0% 0.0% flops 427 x 192 x 64 47264759808 0.0% 100.0% 0.0% flops 436 x 192 x 64 48260972544 0.0% 100.0% 0.0% flops 267 x 160 x 64 49257185280 0.0% 100.0% 0.0% flops 96 x 192 x 96 51243909120 0.0% 100.0% 0.0% flops 289 x 160 x 64 53315829760 0.0% 100.0% 0.0% flops 64 x 192 x 64 56937676800 0.0% 100.0% 0.0% flops 311 x 160 x 64 57374474240 0.0% 100.0% 0.0% flops 342 x 160 x 64 63093473280 0.0% 100.0% 0.0% flops 409 x 160 x 64 75453890560 0.0% 100.0% 0.0% flops 418 x 160 x 64 77114245120 0.0% 100.0% 0.0% flops 427 x 160 x 64 78774599680 0.0% 100.0% 0.0% flops 436 x 160 x 64 80434954240 0.0% 100.0% 0.0% flops 96 x 160 x 96 85406515200 0.0% 100.0% 0.0% flops 267 x 192 x 96 88662933504 0.0% 100.0% 0.0% flops 267 x 192 x 862 90500235264 0.0% 100.0% 0.0% flops 32 x 192 x 64 91100282880 0.0% 100.0% 0.0% flops 64 x 160 x 64 94896128000 0.0% 100.0% 0.0% flops 289 x 192 x 96 95968493568 0.0% 100.0% 0.0% flops 289 x 192 x 862 97957183488 0.0% 100.0% 0.0% flops 453 x 192 x 64 100285415424 0.0% 100.0% 0.0% flops 311 x 192 x 96 103274053632 0.0% 100.0% 0.0% flops 311 x 192 x 862 105414131712 0.0% 100.0% 0.0% flops 342 x 192 x 96 113568251904 0.0% 100.0% 0.0% flops 342 x 192 x 862 115921649664 0.0% 100.0% 0.0% flops 96 x 192 x 862 116175863808 0.0% 100.0% 0.0% flops 409 x 192 x 96 135817003008 0.0% 100.0% 0.0% flops 409 x 192 x 862 138631446528 0.0% 100.0% 0.0% flops 418 x 192 x 96 138805641216 0.0% 100.0% 0.0% flops 418 x 192 x 862 141682016256 0.0% 100.0% 0.0% flops 427 x 192 x 96 141794279424 0.0% 100.0% 0.0% flops 427 x 192 x 862 144732585984 0.0% 100.0% 0.0% flops 436 x 192 x 96 144782917632 0.0% 100.0% 0.0% flops 440 x 192 x 64 146111201280 0.0% 100.0% 0.0% flops 267 x 160 x 96 147771555840 0.0% 100.0% 0.0% flops 436 x 192 x 862 147783155712 0.0% 100.0% 0.0% flops 267 x 160 x 862 150833725440 0.0% 100.0% 0.0% flops 32 x 160 x 64 151833804800 0.0% 100.0% 0.0% flops 289 x 160 x 96 159947489280 0.0% 100.0% 0.0% flops 289 x 160 x 862 163261972480 0.0% 100.0% 0.0% flops 453 x 160 x 64 167142359040 0.0% 100.0% 0.0% flops 64 x 192 x 96 170813030400 0.0% 100.0% 0.0% flops 311 x 160 x 96 172123422720 0.0% 100.0% 0.0% flops 311 x 160 x 862 175690219520 0.0% 100.0% 0.0% flops 267 x 192 x 849 178270765056 0.0% 100.0% 0.0% flops 342 x 160 x 96 189280419840 0.0% 100.0% 0.0% flops 289 x 192 x 849 192959741952 0.0% 100.0% 0.0% flops 342 x 160 x 862 193202749440 0.0% 100.0% 0.0% flops 96 x 160 x 862 193626439680 0.0% 100.0% 0.0% flops 311 x 192 x 849 207648718848 0.0% 100.0% 0.0% flops 409 x 160 x 96 226361671680 0.0% 100.0% 0.0% flops 342 x 192 x 849 228346822656 0.0% 100.0% 0.0% flops 96 x 192 x 849 228847583232 0.0% 100.0% 0.0% flops 409 x 160 x 862 231052410880 0.0% 100.0% 0.0% flops 418 x 160 x 96 231342735360 0.0% 100.0% 0.0% flops 418 x 160 x 862 236136693760 0.0% 100.0% 0.0% flops 427 x 160 x 96 236323799040 0.0% 100.0% 0.0% flops 427 x 160 x 862 241220976640 0.0% 100.0% 0.0% flops 436 x 160 x 96 241304862720 0.0% 100.0% 0.0% flops 440 x 160 x 64 243518668800 0.0% 100.0% 0.0% flops 436 x 160 x 862 246305259520 0.0% 100.0% 0.0% flops 409 x 192 x 849 273081434112 0.0% 100.0% 0.0% flops 32 x 192 x 96 273300848640 0.0% 100.0% 0.0% flops 418 x 192 x 849 279090561024 0.0% 100.0% 0.0% flops 64 x 160 x 96 284688384000 0.0% 100.0% 0.0% flops 427 x 192 x 849 285099687936 0.0% 100.0% 0.0% flops 436 x 192 x 849 291108814848 0.0% 100.0% 0.0% flops 267 x 160 x 849 297117941760 0.0% 100.0% 0.0% flops 453 x 192 x 96 300856246272 0.0% 100.0% 0.0% flops 453 x 192 x 862 307090685952 0.0% 100.0% 0.0% flops 289 x 160 x 849 321599569920 0.0% 100.0% 0.0% flops 311 x 160 x 849 346081198080 0.0% 100.0% 0.0% flops 9 x 9 x 64 355060661760 0.0% 100.0% 0.0% flops 342 x 160 x 849 380578037760 0.0% 100.0% 0.0% flops 96 x 160 x 849 381412638720 0.0% 100.0% 0.0% flops 64 x 192 x 862 387252879360 0.0% 100.0% 0.0% flops 440 x 192 x 96 438333603840 0.0% 100.0% 0.0% flops 440 x 192 x 862 447416893440 0.0% 100.0% 0.0% flops 409 x 160 x 849 455135723520 0.0% 100.0% 0.0% flops 32 x 160 x 96 455501414400 0.0% 100.0% 0.0% flops 418 x 160 x 849 465150935040 0.0% 100.0% 0.0% flops 427 x 160 x 849 475166146560 0.0% 100.0% 0.0% flops 436 x 160 x 849 485181358080 0.0% 100.0% 0.0% flops 22 x 9 x 64 493014297600 0.0% 100.0% 0.0% flops 9 x 22 x 64 494442584064 0.0% 100.0% 0.0% flops 453 x 160 x 96 501427077120 0.0% 100.0% 0.0% flops 453 x 160 x 862 511817809920 0.0% 100.0% 0.0% flops 453 x 192 x 849 604918775808 0.0% 100.0% 0.0% flops 32 x 192 x 862 619604606976 0.0% 100.0% 0.0% flops 64 x 160 x 862 645421465600 0.0% 100.0% 0.0% flops 22 x 22 x 64 683571924992 0.0% 100.0% 0.0% flops 440 x 160 x 96 730556006400 0.0% 100.0% 0.0% flops 440 x 160 x 862 745694822400 0.0% 100.0% 0.0% flops 64 x 192 x 849 762825277440 0.0% 100.0% 0.0% flops 440 x 192 x 849 881338613760 0.0% 100.0% 0.0% flops 453 x 160 x 849 1008197959680 0.0% 100.0% 0.0% flops 32 x 160 x 862 1032674344960 0.0% 100.0% 0.0% flops 9 x 9 x 96 1065181985280 0.0% 100.0% 0.0% flops 32 x 192 x 849 1220520443904 0.0% 100.0% 0.0% flops 64 x 160 x 849 1271375462400 0.0% 100.0% 0.0% flops 440 x 160 x 849 1468897689600 0.0% 100.0% 0.0% flops 22 x 9 x 96 1479042892800 0.0% 100.0% 0.0% flops 9 x 22 x 96 1483327752192 0.0% 100.0% 0.0% flops 32 x 160 x 849 2034200739840 0.0% 100.0% 0.0% flops 22 x 22 x 96 2050715774976 0.0% 100.0% 0.0% flops 9 x 32 x 9 5962625519616 0.0% 100.0% 0.0% flops 22 x 32 x 9 8325932617728 0.0% 100.0% 0.0% flops 9 x 32 x 22 8325932617728 0.0% 100.0% 0.0% flops 22 x 32 x 22 11452938371072 0.0% 100.0% 0.0% flops inhomo. stacks 22380222873600 100.0% 0.0% 0.0% flops total 95.022989E+12 23.6% 76.4% 0.0% flops max/rank 5.030957E+12 27.1% 72.9% 0.0% matmuls inhomo. stacks 645336 100.0% 0.0% 0.0% matmuls total 3090446816 0.0% 100.0% 0.0% number of processed stacks 8282084 7.8% 92.2% 0.0% average stack size 1.0 404.6 0.0 marketing flops 144.580175E+12 ------------------------------------------------------------------------------- # multiplications 2507 max memory usage/rank 875.356160E+06 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 1143192 MPI messages size (bytes): total size 2.023815E+12 min size 0.000000E+00 max size 17.653760E+06 average size 1.770319E+06 MPI breakdown and total messages size (bytes): size <= 128 6996 0 128 < size <= 8192 0 0 8192 < size <= 32768 396 8650752 32768 < size <= 131072 319024 36042702848 131072 < size <= 4194304 715736 785529176064 4194304 < size <= 16777216 70320 665379475120 16777216 < size 30720 536870912000 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 4002 58205. MP_Allreduce 11082 1082. MP_Sync 87 MP_Alltoall 1712 12503107. MP_SendRecv 5888 75008. MP_ISendRecv 5888 75008. MP_Wait 22442 MP_ISend 14952 244818. MP_IRecv 14952 244818. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.086 0.234 324.112 324.118 qs_mol_dyn_low 1 2.0 0.064 0.244 322.938 322.952 qs_forces 11 3.9 0.026 0.108 322.327 322.474 qs_energies 11 4.9 0.017 0.065 312.209 312.368 scf_env_do_scf 11 5.9 0.004 0.013 287.806 287.830 scf_env_do_scf_inner_loop 117 6.6 0.006 0.031 229.953 229.978 velocity_verlet 10 3.0 0.005 0.017 211.633 211.658 dbcsr_multiply_generic 2507 12.6 0.262 0.318 160.206 163.488 qs_scf_new_mos 117 7.6 0.001 0.002 146.110 147.554 qs_scf_loop_do_ot 117 8.6 0.001 0.002 146.109 147.553 ot_scf_mini 117 9.6 0.004 0.005 136.634 138.156 multiply_cannon 2507 13.6 0.359 0.407 104.164 114.747 multiply_cannon_loop 2507 14.6 0.534 0.656 96.633 106.396 multiply_cannon_multrec 30084 15.6 85.945 98.435 85.985 98.476 ot_mini 117 10.6 0.001 0.002 83.850 85.625 rebuild_ks_matrix 128 8.3 0.001 0.001 61.535 63.362 qs_ks_build_kohn_sham_matrix 128 9.3 0.020 0.022 61.534 63.362 init_scf_loop 11 6.9 0.001 0.005 57.661 57.696 qs_ks_update_qs_env 128 7.6 0.001 0.001 55.924 57.648 prepare_preconditioner 11 7.9 0.000 0.001 49.282 49.398 make_preconditioner 11 8.9 0.000 0.002 49.282 49.398 make_full_inverse_cholesky 11 9.9 0.021 0.031 40.662 47.448 apply_preconditioner_dbcsr 128 12.6 0.001 0.001 43.937 45.923 apply_single 128 13.6 0.001 0.001 43.937 45.922 ot_diis_step 117 11.6 0.028 0.032 43.984 43.986 qs_ot_get_derivative 117 11.6 0.002 0.003 39.641 41.243 make_m2s 5014 13.6 0.160 0.196 39.812 40.996 qs_rho_update_rho_low 128 7.7 0.002 0.007 36.371 36.528 calculate_rho_elec 128 8.7 0.213 0.230 36.369 36.521 make_images 5014 14.6 3.151 3.913 33.152 33.954 sum_up_and_integrate 128 10.3 0.003 0.007 30.372 30.439 integrate_v_rspace 128 11.3 0.004 0.005 30.264 30.332 mp_waitall_1 147882 16.7 21.779 28.338 21.779 28.338 qs_ot_get_p 128 10.4 0.001 0.002 23.472 25.959 grid_collocate_task_list 128 9.7 24.904 25.332 24.904 25.332 cp_fm_upper_to_full 105 14.8 15.786 22.771 15.786 22.771 make_images_data 5014 15.6 0.086 0.133 19.989 22.514 grid_integrate_task_list 128 12.3 21.427 21.922 21.427 21.922 hybrid_alltoall_any 5200 16.5 0.702 2.539 18.353 20.290 dbcsr_complete_redistribute 395 12.7 1.847 1.984 12.643 18.124 qs_ot_get_derivative_diag 77 12.4 0.003 0.004 16.280 17.622 init_scf_run 11 5.9 0.001 0.002 17.412 17.417 scf_env_initial_rho_setup 11 6.9 0.007 0.027 17.412 17.416 copy_fm_to_dbcsr 209 11.7 0.002 0.006 10.560 16.020 cp_fm_cholesky_invert 11 10.9 15.096 15.117 15.096 15.117 wfi_extrapolate 11 7.9 0.001 0.002 15.050 15.050 qs_ot_p2m_diag 83 11.4 0.307 0.350 14.441 14.519 multiply_cannon_metrocomm4 25070 15.6 0.116 0.219 5.908 14.474 mp_sum_l 12367 13.3 9.058 14.356 9.058 14.356 mp_irecv_dv 76098 16.2 5.778 14.348 5.778 14.348 mp_alltoall_i22 716 14.1 8.317 14.306 8.317 14.306 transfer_fm_to_dbcsr 11 9.9 0.031 0.047 8.583 13.901 cp_dbcsr_syevd 83 12.4 0.006 0.007 13.197 13.214 fft_wrap_pw1pw2 1291 11.7 0.022 0.029 12.029 12.081 calculate_dm_sparse 128 9.5 0.001 0.002 10.574 11.148 fft_wrap_pw1pw2_140 523 12.2 0.581 0.598 10.801 10.898 cp_fm_diag_elpa 83 13.4 0.001 0.001 9.583 9.593 cp_fm_diag_elpa_base 83 14.4 8.704 8.970 9.573 9.573 fft3d_ps 1291 13.7 4.529 4.636 9.413 9.463 cp_dbcsr_sm_fm_multiply 37 9.5 0.003 0.003 9.349 9.446 density_rs2pw 128 9.7 0.008 0.012 8.719 9.145 qs_ot_get_derivative_taylor 40 13.0 0.001 0.002 8.486 8.837 mp_alltoall_d11v 2415 14.1 7.625 8.255 7.625 8.255 cp_fm_cholesky_decompose 22 10.9 7.763 7.893 7.763 7.893 multiply_cannon_metrocomm3 30084 15.6 0.077 0.138 3.253 7.893 dbcsr_dot_sd 1318 12.0 1.674 1.784 4.214 7.734 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 7.087 7.711 make_images_sizes 5014 15.6 0.009 0.017 3.498 7.345 mp_alltoall_i44 5014 16.6 3.489 7.337 3.489 7.337 dbcsr_make_dense_low 13013 15.7 0.131 0.192 6.727 7.255 qs_ot_get_orbitals 117 10.6 0.001 0.001 6.903 7.080 dbcsr_make_images_dense 4384 14.8 0.092 0.130 6.230 7.023 make_dense_data 13013 16.7 6.172 6.655 6.539 7.013 mp_sum_d 4472 12.1 3.572 6.989 3.572 6.989 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 6.448 6.600 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="408", plot="h2o_256_md", label="(4n/6r/6t)", y=324.118000, yerr=0.000000 PlotPoint: name="409", plot="h2o_256_md_mem", label="(4n/6r/6t)", y=755.818182, yerr=14.160240 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/26/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 3321888768 0.0% 100.0% 0.0% flops 22 x 32 x 32 4060086272 0.0% 100.0% 0.0% flops 160 x 256 x 256 190589173760 0.0% 100.0% 0.0% flops 182 x 256 x 256 216795185152 0.0% 100.0% 0.0% flops 267 x 256 x 256 318045683712 0.0% 100.0% 0.0% flops 169 x 256 x 256 402619629568 0.0% 100.0% 0.0% flops 160 x 256 x 2560 432852172800 0.0% 100.0% 0.0% flops 182 x 256 x 2560 492369346560 0.0% 100.0% 0.0% flops 64 x 256 x 256 616730460160 0.0% 100.0% 0.0% flops 302 x 256 x 256 719474130944 0.0% 100.0% 0.0% flops 267 x 256 x 2560 722322063360 0.0% 100.0% 0.0% flops 324 x 256 x 256 771886153728 0.0% 100.0% 0.0% flops 169 x 256 x 2560 914400215040 0.0% 100.0% 0.0% flops 32 x 256 x 256 1356807012352 0.0% 100.0% 0.0% flops 9 x 9 x 256 1430456039424 0.0% 100.0% 0.0% flops 302 x 256 x 2560 1634016952320 0.0% 100.0% 0.0% flops 324 x 256 x 2560 1753051299840 0.0% 100.0% 0.0% flops 22 x 9 x 256 1986255912960 0.0% 100.0% 0.0% flops 9 x 22 x 256 1992003932160 0.0% 100.0% 0.0% flops 311 x 256 x 256 2222746238976 0.0% 100.0% 0.0% flops 22 x 22 x 256 2753958699008 0.0% 100.0% 0.0% flops 64 x 256 x 2560 3093718630400 0.0% 100.0% 0.0% flops 289 x 256 x 256 3442516951040 0.0% 100.0% 0.0% flops 311 x 256 x 2560 5048138465280 0.0% 100.0% 0.0% flops 9 x 32 x 9 6003307892736 0.0% 100.0% 0.0% flops 32 x 256 x 2560 6806180986880 0.0% 100.0% 0.0% flops 289 x 256 x 2560 7818392371200 0.0% 100.0% 0.0% flops 9 x 32 x 22 8382797660160 0.0% 100.0% 0.0% flops 22 x 32 x 9 8382797660160 0.0% 100.0% 0.0% flops 22 x 32 x 22 11531114856448 0.0% 100.0% 0.0% flops inhomo. stacks 13914431553536 100.0% 0.0% 0.0% flops total 95.358159E+12 14.6% 85.4% 0.0% flops max/rank 7.230947E+12 22.4% 77.6% 0.0% matmuls inhomo. stacks 122672 100.0% 0.0% 0.0% matmuls total 2939463668 0.0% 100.0% 0.0% number of processed stacks 4570836 2.7% 97.3% 0.0% average stack size 1.0 660.8 0.0 marketing flops 145.661668E+12 ------------------------------------------------------------------------------- # multiplications 2534 max memory usage/rank 1.173254E+09 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 243264 MPI messages size (bytes): total size 1.342058E+12 min size 0.000000E+00 max size 52.428800E+06 average size 5.516877E+06 MPI breakdown and total messages size (bytes): size <= 128 1452 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 132 8650752 131072 < size <= 4194304 115488 60548972544 4194304 < size <= 16777216 105840 554906419200 16777216 < size 20352 726592466352 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 31 12. MP_Allreduce 13526 37. MP_Alltoall 9734 749570. MP_ISend 40500 2089810. MP_IRecv 40500 2088916. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 4043 57624. MP_Allreduce 11184 1163. MP_Sync 88 MP_Alltoall 1724 18848034. MP_SendRecv 3870 122880. MP_ISendRecv 3870 122880. MP_Wait 16244 MP_ISend 10760 423501. MP_IRecv 10760 423501. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.136 0.216 319.253 319.309 qs_mol_dyn_low 1 2.0 0.084 0.128 317.800 317.858 qs_forces 11 3.9 0.065 0.127 317.348 317.455 qs_energies 11 4.9 0.037 0.116 307.140 307.221 scf_env_do_scf 11 5.9 0.002 0.019 280.131 280.164 scf_env_do_scf_inner_loop 118 6.6 0.009 0.032 235.094 235.134 velocity_verlet 10 3.0 0.002 0.003 203.221 203.255 dbcsr_multiply_generic 2534 12.6 0.267 0.288 163.455 166.659 qs_scf_new_mos 118 7.6 0.001 0.001 149.649 151.536 qs_scf_loop_do_ot 118 8.6 0.002 0.002 149.648 151.535 ot_scf_mini 118 9.6 0.009 0.021 139.609 141.627 multiply_cannon 2534 13.6 0.355 0.400 113.050 122.320 multiply_cannon_loop 2534 14.6 0.345 0.449 102.497 108.158 multiply_cannon_multrec 10136 15.6 83.696 95.406 83.737 95.448 ot_mini 118 10.6 0.012 0.016 82.908 84.978 rebuild_ks_matrix 129 8.3 0.001 0.001 62.987 64.883 qs_ks_build_kohn_sham_matrix 129 9.3 0.020 0.023 62.987 64.883 qs_ks_update_qs_env 129 7.6 0.001 0.001 57.567 59.323 apply_preconditioner_dbcsr 129 12.6 0.001 0.001 46.910 49.061 apply_single 129 13.6 0.001 0.001 46.909 49.061 ot_diis_step 118 11.6 0.090 0.127 47.408 47.420 mp_waitall_1 127116 16.7 31.684 46.603 31.684 46.603 init_scf_loop 11 6.9 0.023 0.071 44.818 44.832 make_m2s 5068 13.6 0.128 0.137 37.004 43.207 qs_ot_get_derivative 118 11.6 0.002 0.003 35.382 37.447 qs_rho_update_rho_low 129 7.7 0.002 0.002 36.748 36.897 calculate_rho_elec 129 8.7 0.318 0.330 36.746 36.897 prepare_preconditioner 11 7.9 0.000 0.001 35.940 36.082 make_preconditioner 11 8.9 0.001 0.001 35.940 36.082 make_images 5068 14.6 3.490 4.778 30.003 35.256 make_full_inverse_cholesky 11 9.9 0.018 0.036 31.022 33.533 sum_up_and_integrate 129 10.3 0.003 0.003 29.634 29.685 integrate_v_rspace 129 11.3 0.004 0.004 29.533 29.585 qs_ot_get_p 129 10.4 0.001 0.001 25.809 28.311 make_images_data 5068 15.6 0.071 0.087 19.538 26.231 grid_collocate_task_list 129 9.7 23.491 24.954 23.491 24.954 hybrid_alltoall_any 5255 16.5 1.109 5.007 18.318 23.666 grid_integrate_task_list 129 12.3 20.066 20.572 20.066 20.572 init_scf_run 11 5.9 0.001 0.003 18.865 18.867 scf_env_initial_rho_setup 11 6.9 0.007 0.017 18.865 18.866 multiply_cannon_metrocomm1 10136 15.6 0.045 0.049 10.654 18.461 cp_fm_cholesky_invert 11 10.9 17.580 17.595 17.580 17.595 qs_ot_get_derivative_diag 78 12.4 0.003 0.003 15.700 17.220 qs_ot_p2m_diag 84 11.4 0.434 0.443 16.701 16.738 mp_allgather_i34 2534 14.6 7.427 16.619 7.427 16.619 wfi_extrapolate 11 7.9 0.001 0.001 15.686 15.689 cp_dbcsr_syevd 84 12.4 0.006 0.007 15.418 15.448 fft_wrap_pw1pw2 1301 11.7 0.021 0.025 13.738 14.062 density_rs2pw 129 9.7 0.007 0.008 10.332 13.412 fft_wrap_pw1pw2_140 527 12.2 0.712 0.761 12.156 12.634 cp_fm_diag_elpa 84 13.4 0.000 0.001 11.787 11.792 cp_fm_diag_elpa_base 84 14.4 11.405 11.536 11.781 11.783 calculate_dm_sparse 129 9.5 0.001 0.002 11.253 11.709 fft3d_ps 1301 13.7 4.922 5.121 10.775 10.846 multiply_cannon_metrocomm4 7602 15.6 0.033 0.037 5.110 10.692 dbcsr_complete_redistribute 397 12.7 2.597 2.726 9.910 10.683 mp_irecv_dv 29142 15.9 5.093 10.654 5.093 10.654 mp_sum_l 12498 13.3 6.244 10.495 6.244 10.495 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 10.059 10.162 mp_alltoall_d11v 2429 14.1 8.251 9.345 8.251 9.345 dbcsr_make_dense_low 6511 15.6 0.075 0.079 7.473 9.230 make_dense_data 6511 16.6 7.165 8.843 7.371 9.125 dbcsr_make_images_dense 4434 14.8 0.047 0.049 6.635 8.508 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 7.744 8.276 cp_fm_cholesky_decompose 22 10.9 8.063 8.260 8.063 8.260 multiply_cannon_metrocomm3 10136 15.6 0.034 0.036 2.466 8.128 dbcsr_data_release 191253 16.1 4.440 7.970 4.440 7.970 copy_fm_to_dbcsr 210 11.7 0.002 0.002 7.151 7.888 dbcsr_dot_sd 1331 12.0 2.329 2.364 5.010 7.092 qs_ot_get_orbitals 118 10.6 0.001 0.001 6.731 6.795 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 6.560 6.708 mp_waitany 10760 13.9 3.682 6.613 3.682 6.613 transfer_rs2pw 527 10.6 0.007 0.008 3.705 6.576 copy_dbcsr_to_fm 187 11.8 0.004 0.004 5.820 6.468 potential_pw2rs 129 12.3 0.036 0.039 6.449 6.462 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="410", plot="h2o_256_md", label="(4n/4r/9t)", y=319.309000, yerr=0.000000 PlotPoint: name="411", plot="h2o_256_md_mem", label="(4n/4r/9t)", y=1063.636364, yerr=13.719745 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/27/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 3321888768 0.0% 100.0% 0.0% flops 22 x 32 x 32 4060086272 0.0% 100.0% 0.0% flops 32 x 192 x 192 121815171072 0.0% 100.0% 0.0% flops 462 x 192 x 192 154779254784 0.0% 100.0% 0.0% flops 462 x 192 x 1702 155805382656 0.0% 100.0% 0.0% flops 64 x 192 x 192 156619505664 0.0% 100.0% 0.0% flops 32 x 192 x 160 203025285120 0.0% 100.0% 0.0% flops 32 x 160 x 192 203025285120 0.0% 100.0% 0.0% flops 440 x 192 x 192 221113221120 0.0% 100.0% 0.0% flops 440 x 192 x 1702 222579118080 0.0% 100.0% 0.0% flops 462 x 160 x 192 257965424640 0.0% 100.0% 0.0% flops 462 x 192 x 160 257965424640 0.0% 100.0% 0.0% flops 462 x 160 x 1702 259675637760 0.0% 100.0% 0.0% flops 64 x 192 x 160 261032509440 0.0% 100.0% 0.0% flops 64 x 160 x 192 261032509440 0.0% 100.0% 0.0% flops 32 x 192 x 1702 269960183808 0.0% 100.0% 0.0% flops 453 x 192 x 192 303528148992 0.0% 100.0% 0.0% flops 453 x 192 x 1702 305540425728 0.0% 100.0% 0.0% flops 462 x 192 x 1698 310878425088 0.0% 100.0% 0.0% flops 462 x 192 x 1711 313258530816 0.0% 100.0% 0.0% flops 32 x 160 x 160 338375475200 0.0% 100.0% 0.0% flops 64 x 192 x 1702 347091664896 0.0% 100.0% 0.0% flops 440 x 192 x 160 368522035200 0.0% 100.0% 0.0% flops 440 x 160 x 192 368522035200 0.0% 100.0% 0.0% flops 440 x 160 x 1702 370965196800 0.0% 100.0% 0.0% flops 449 x 192 x 192 376059985920 0.0% 100.0% 0.0% flops 449 x 192 x 1702 378553121280 0.0% 100.0% 0.0% flops 462 x 160 x 160 429942374400 0.0% 100.0% 0.0% flops 64 x 160 x 160 435054182400 0.0% 100.0% 0.0% flops 440 x 192 x 1698 444112035840 0.0% 100.0% 0.0% flops 440 x 192 x 1711 447512186880 0.0% 100.0% 0.0% flops 32 x 160 x 1702 449933639680 0.0% 100.0% 0.0% flops 453 x 192 x 160 505880248320 0.0% 100.0% 0.0% flops 453 x 160 x 192 505880248320 0.0% 100.0% 0.0% flops 453 x 160 x 1702 509234042880 0.0% 100.0% 0.0% flops 462 x 160 x 1698 518130708480 0.0% 100.0% 0.0% flops 462 x 160 x 1711 522097551360 0.0% 100.0% 0.0% flops 9 x 9 x 192 536420454912 0.0% 100.0% 0.0% flops 32 x 192 x 1698 538651459584 0.0% 100.0% 0.0% flops 32 x 192 x 1711 542775410688 0.0% 100.0% 0.0% flops 64 x 160 x 1702 578486108160 0.0% 100.0% 0.0% flops 453 x 192 x 1698 609644703744 0.0% 100.0% 0.0% flops 440 x 160 x 160 614203392000 0.0% 100.0% 0.0% flops 453 x 192 x 1711 614312183808 0.0% 100.0% 0.0% flops 449 x 192 x 160 626766643200 0.0% 100.0% 0.0% flops 449 x 160 x 192 626766643200 0.0% 100.0% 0.0% flops 449 x 160 x 1702 630921868800 0.0% 100.0% 0.0% flops 64 x 192 x 1698 692551876608 0.0% 100.0% 0.0% flops 64 x 192 x 1711 697854099456 0.0% 100.0% 0.0% flops 440 x 160 x 1698 740186726400 0.0% 100.0% 0.0% flops 22 x 9 x 192 744845967360 0.0% 100.0% 0.0% flops 440 x 160 x 1711 745853644800 0.0% 100.0% 0.0% flops 9 x 22 x 192 747002539008 0.0% 100.0% 0.0% flops 449 x 192 x 1698 755326909440 0.0% 100.0% 0.0% flops 449 x 192 x 1711 761109742080 0.0% 100.0% 0.0% flops 453 x 160 x 160 843133747200 0.0% 100.0% 0.0% flops 9 x 9 x 160 894034091520 0.0% 100.0% 0.0% flops 32 x 160 x 1698 897752432640 0.0% 100.0% 0.0% flops 32 x 160 x 1711 904625684480 0.0% 100.0% 0.0% flops 453 x 160 x 1698 1016074506240 0.0% 100.0% 0.0% flops 453 x 160 x 1711 1023853639680 0.0% 100.0% 0.0% flops 22 x 22 x 192 1032734512128 0.0% 100.0% 0.0% flops 449 x 160 x 160 1044611072000 0.0% 100.0% 0.0% flops 64 x 160 x 1698 1154253127680 0.0% 100.0% 0.0% flops 64 x 160 x 1711 1163090165760 0.0% 100.0% 0.0% flops 22 x 9 x 160 1241409945600 0.0% 100.0% 0.0% flops 9 x 22 x 160 1245004231680 0.0% 100.0% 0.0% flops 449 x 160 x 1698 1258878182400 0.0% 100.0% 0.0% flops 449 x 160 x 1711 1268516236800 0.0% 100.0% 0.0% flops 22 x 22 x 160 1721224186880 0.0% 100.0% 0.0% flops 9 x 32 x 9 6003301257216 0.0% 100.0% 0.0% flops 22 x 32 x 9 8382804148224 0.0% 100.0% 0.0% flops 9 x 32 x 22 8382804148224 0.0% 100.0% 0.0% flops 22 x 32 x 22 11531114856448 0.0% 100.0% 0.0% flops inhomo. stacks 24540172943360 100.0% 0.0% 0.0% flops total 98.915922E+12 24.8% 75.2% 0.0% flops max/rank 9.800669E+12 26.8% 73.2% 0.0% matmuls inhomo. stacks 325374 100.0% 0.0% 0.0% matmuls total 2982637094 0.0% 100.0% 0.0% number of processed stacks 5606974 5.8% 94.2% 0.0% average stack size 1.0 564.7 0.0 marketing flops 145.669034E+12 ------------------------------------------------------------------------------- # multiplications 2537 max memory usage/rank 1.535021E+09 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 243552 MPI messages size (bytes): total size 1.843076E+12 min size 0.000000E+00 max size 70.506240E+06 average size 7.567484E+06 MPI breakdown and total messages size (bytes): size <= 128 1386 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 66 2883584 131072 < size <= 4194304 145860 137153740800 4194304 < size <= 16777216 70800 494927872000 16777216 < size 25440 1210987567760 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 4042 58574. MP_Allreduce 11182 1416. MP_Sync 87 MP_Alltoall 1724 25011495. MP_SendRecv 2838 150016. MP_ISendRecv 2838 150016. MP_Wait 13060 MP_ISend 8608 623442. MP_IRecv 8608 623442. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.217 0.401 420.247 420.274 qs_mol_dyn_low 1 2.0 0.077 0.144 418.331 418.380 qs_forces 11 3.9 0.013 0.027 417.831 417.910 qs_energies 11 4.9 0.004 0.008 406.037 406.114 scf_env_do_scf 11 5.9 0.001 0.003 374.010 374.051 scf_env_do_scf_inner_loop 118 6.6 0.005 0.023 292.531 292.559 velocity_verlet 10 3.0 0.031 0.080 275.327 275.375 dbcsr_multiply_generic 2537 12.6 0.290 0.306 215.909 219.043 qs_scf_new_mos 118 7.6 0.001 0.002 193.010 194.700 qs_scf_loop_do_ot 118 8.6 0.001 0.001 193.009 194.699 ot_scf_mini 118 9.6 0.004 0.005 182.997 184.060 multiply_cannon 2537 13.6 0.387 0.413 154.956 161.308 multiply_cannon_loop 2537 14.6 0.369 0.390 144.413 150.936 ot_mini 118 10.6 0.001 0.002 111.185 112.458 multiply_cannon_multrec 15222 15.6 85.265 109.273 85.318 109.320 mp_waitall_1 126228 16.7 63.306 99.635 63.306 99.635 init_scf_loop 11 6.9 0.002 0.006 81.198 81.222 multiply_cannon_metrocomm3 15222 15.6 0.064 0.067 42.508 81.178 rebuild_ks_matrix 129 8.3 0.001 0.001 74.367 76.546 qs_ks_build_kohn_sham_matrix 129 9.3 0.019 0.020 74.366 76.545 apply_preconditioner_dbcsr 129 12.6 0.001 0.001 69.079 71.871 apply_single 129 13.6 0.001 0.001 69.078 71.870 prepare_preconditioner 11 7.9 0.000 0.000 70.487 70.582 make_preconditioner 11 8.9 0.000 0.001 70.487 70.581 qs_ks_update_qs_env 129 7.6 0.001 0.001 68.224 70.220 ot_diis_step 118 11.6 0.061 0.093 68.462 68.464 make_full_inverse_cholesky 11 9.9 0.039 0.050 58.621 67.651 make_m2s 5074 13.6 0.149 0.156 44.748 46.794 qs_ot_get_derivative 118 11.6 0.002 0.003 42.444 43.582 qs_rho_update_rho_low 129 7.7 0.001 0.001 41.344 41.566 calculate_rho_elec 129 8.7 0.414 0.425 41.343 41.566 make_images 5074 14.6 4.522 5.161 34.497 36.606 multiply_cannon_metrocomm4 12685 15.6 0.068 0.078 14.355 35.974 mp_irecv_dv 35533 16.2 14.240 35.801 14.240 35.801 cp_fm_upper_to_full 105 14.8 25.988 35.577 25.988 35.577 sum_up_and_integrate 129 10.3 0.003 0.006 31.350 31.480 integrate_v_rspace 129 11.3 0.004 0.005 31.241 31.377 qs_ot_get_p 129 10.4 0.001 0.001 29.545 31.349 grid_collocate_task_list 129 9.7 28.774 29.194 28.774 29.194 hybrid_alltoall_any 5260 16.5 1.358 3.787 21.943 25.413 make_images_data 5074 15.6 0.079 0.093 22.641 24.985 dbcsr_complete_redistribute 395 12.7 3.330 3.515 18.063 24.391 init_scf_run 11 5.9 0.001 0.004 23.196 23.200 scf_env_initial_rho_setup 11 6.9 0.001 0.004 23.195 23.199 grid_integrate_task_list 129 12.3 22.055 22.386 22.055 22.386 cp_fm_cholesky_invert 11 10.9 21.383 21.418 21.383 21.418 copy_fm_to_dbcsr 209 11.7 0.002 0.002 14.605 21.070 qs_ot_p2m_diag 83 11.4 0.536 0.593 19.137 19.228 wfi_extrapolate 11 7.9 0.001 0.002 19.196 19.197 qs_ot_get_derivative_diag 77 12.4 0.003 0.003 17.561 18.886 transfer_fm_to_dbcsr 11 9.9 0.037 0.039 11.821 17.989 cp_dbcsr_syevd 83 12.4 0.006 0.006 17.635 17.637 mp_alltoall_i22 718 14.1 10.476 16.798 10.476 16.798 mp_sum_l 12513 13.3 9.838 14.347 9.838 14.347 cp_fm_diag_elpa 83 13.4 0.000 0.000 14.139 14.157 cp_fm_diag_elpa_base 83 14.4 12.078 12.616 14.126 14.127 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 13.633 13.866 fft_wrap_pw1pw2 1301 11.7 0.022 0.031 13.369 13.553 fft_wrap_pw1pw2_140 527 12.2 0.824 0.838 11.986 12.121 calculate_dm_sparse 129 9.5 0.001 0.001 11.294 11.940 dbcsr_make_dense_low 10960 15.7 0.113 0.116 10.584 11.026 make_dense_data 10960 16.7 10.065 10.485 10.427 10.866 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 10.451 10.859 fft3d_ps 1301 13.7 5.178 5.578 10.349 10.589 density_rs2pw 129 9.7 0.007 0.008 9.383 10.216 dbcsr_make_images_dense 4440 14.8 0.070 0.072 9.813 9.975 mp_alltoall_d11v 2423 14.1 9.143 9.458 9.143 9.458 qs_ot_get_orbitals 118 10.6 0.001 0.001 8.957 9.209 dbcsr_dot_sd 1331 12.0 2.923 3.133 6.271 9.169 cp_fm_cholesky_decompose 22 10.9 8.570 8.697 8.570 8.697 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="412", plot="h2o_256_md", label="(4n/3r/12t)", y=420.274000, yerr=0.000000 PlotPoint: name="413", plot="h2o_256_md_mem", label="(4n/3r/12t)", y=1297.000000, yerr=31.740424 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/28/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 3321888768 0.0% 100.0% 0.0% flops 22 x 32 x 32 4060086272 0.0% 100.0% 0.0% flops 258 x 256 x 256 614650085376 0.0% 100.0% 0.0% flops 32 x 256 x 256 989989961728 0.0% 100.0% 0.0% flops 64 x 256 x 256 989989961728 0.0% 100.0% 0.0% flops 280 x 256 x 256 1000593162240 0.0% 100.0% 0.0% flops 258 x 256 x 2560 1395948257280 0.0% 100.0% 0.0% flops 9 x 9 x 256 1430454546432 0.0% 100.0% 0.0% flops 22 x 9 x 256 1986255912960 0.0% 100.0% 0.0% flops 9 x 22 x 256 1992006770688 0.0% 100.0% 0.0% flops 280 x 256 x 2560 2272473907200 0.0% 100.0% 0.0% flops 289 x 256 x 256 2409761865728 0.0% 100.0% 0.0% flops 22 x 22 x 256 2753958699008 0.0% 100.0% 0.0% flops 311 x 256 x 256 3334119358464 0.0% 100.0% 0.0% flops 32 x 256 x 2560 4949949808640 0.0% 100.0% 0.0% flops 64 x 256 x 2560 4949949808640 0.0% 100.0% 0.0% flops 289 x 256 x 2560 5472874659840 0.0% 100.0% 0.0% flops 9 x 32 x 9 6003301257216 0.0% 100.0% 0.0% flops 311 x 256 x 2560 7572207697920 0.0% 100.0% 0.0% flops 22 x 32 x 9 8382804148224 0.0% 100.0% 0.0% flops 9 x 32 x 22 8382804148224 0.0% 100.0% 0.0% flops 22 x 32 x 22 11531114856448 0.0% 100.0% 0.0% flops inhomo. stacks 19217579507712 100.0% 0.0% 0.0% flops total 97.640170E+12 19.7% 80.3% 0.0% flops max/rank 12.576240E+12 20.6% 79.4% 0.0% matmuls inhomo. stacks 167280 100.0% 0.0% 0.0% matmuls total 2939398316 0.0% 100.0% 0.0% number of processed stacks 4490812 3.7% 96.3% 0.0% average stack size 1.0 679.8 0.0 marketing flops 145.668111E+12 ------------------------------------------------------------------------------- # multiplications 2537 max memory usage/rank 3.892720E+09 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 101480 MPI messages size (bytes): total size 1.145238E+12 min size 0.000000E+00 max size 104.857600E+06 average size 11.285357E+06 MPI breakdown and total messages size (bytes): size <= 128 572 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 44 2883584 131072 < size <= 4194304 45968 35701915648 4194304 < size <= 16777216 44720 382939955200 16777216 < size 10176 726592540656 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 4050 58461. MP_Allreduce 11202 1500. MP_Sync 87 MP_Alltoall 1724 36993632. MP_SendRecv 1806 218624. MP_ISendRecv 1806 218624. MP_Wait 9876 MP_ISend 6456 1080169. MP_IRecv 6456 1080169. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.246 0.292 466.505 466.520 qs_mol_dyn_low 1 2.0 0.106 0.148 464.627 464.650 qs_forces 11 3.9 0.018 0.032 464.150 464.209 qs_energies 11 4.9 0.036 0.050 451.283 451.324 scf_env_do_scf 11 5.9 0.041 0.070 417.783 417.823 velocity_verlet 10 3.0 0.007 0.009 316.257 316.266 scf_env_do_scf_inner_loop 118 6.6 0.024 0.044 285.940 285.984 dbcsr_multiply_generic 2537 12.6 0.308 0.324 197.186 198.438 qs_scf_new_mos 118 7.6 0.001 0.001 183.002 183.749 qs_scf_loop_do_ot 118 8.6 0.020 0.033 183.001 183.748 ot_scf_mini 118 9.6 0.026 0.053 172.521 172.830 multiply_cannon 2537 13.6 0.399 0.426 134.446 138.440 init_scf_loop 11 6.9 0.052 0.079 131.415 131.424 multiply_cannon_loop 2537 14.6 0.292 0.325 128.265 130.716 prepare_preconditioner 11 7.9 0.000 0.000 120.124 120.312 make_preconditioner 11 8.9 0.001 0.001 120.124 120.312 make_full_inverse_cholesky 11 9.9 0.048 0.053 96.103 116.249 ot_mini 118 10.6 0.002 0.002 98.441 98.786 multiply_cannon_multrec 10148 15.6 82.999 98.149 83.057 98.204 mp_waitall_1 104900 16.8 57.825 81.923 57.825 81.923 rebuild_ks_matrix 129 8.3 0.001 0.001 74.285 75.066 qs_ks_build_kohn_sham_matrix 129 9.3 0.021 0.032 74.284 75.065 cp_fm_upper_to_full 105 14.8 51.274 73.160 51.274 73.160 qs_ks_update_qs_env 129 7.6 0.001 0.001 68.259 68.948 apply_preconditioner_dbcsr 129 12.6 0.043 0.053 60.415 61.557 apply_single 129 13.6 0.001 0.001 60.372 61.540 ot_diis_step 118 11.6 0.068 0.105 59.676 59.677 multiply_cannon_metrocomm3 10148 15.6 0.038 0.039 37.058 57.700 make_m2s 5074 13.6 0.133 0.137 48.774 49.746 qs_rho_update_rho_low 129 7.7 0.004 0.006 45.567 45.839 calculate_rho_elec 129 8.7 0.582 0.584 45.563 45.835 dbcsr_complete_redistribute 395 12.7 4.528 4.575 31.775 44.366 copy_fm_to_dbcsr 209 11.7 0.002 0.002 27.134 39.762 qs_ot_get_derivative 118 11.6 0.017 0.018 38.667 38.992 make_images 5074 14.6 6.058 6.203 35.698 36.757 transfer_fm_to_dbcsr 11 9.9 0.034 0.042 23.971 36.449 qs_ot_get_p 129 10.4 0.001 0.001 34.403 35.539 mp_alltoall_i22 718 14.1 21.721 34.442 21.721 34.442 sum_up_and_integrate 129 10.3 0.002 0.002 32.628 32.735 integrate_v_rspace 129 11.3 0.004 0.004 32.514 32.620 grid_collocate_task_list 129 9.7 31.677 31.979 31.677 31.979 cp_fm_cholesky_invert 11 10.9 29.986 29.998 29.986 29.998 make_images_data 5074 15.6 0.075 0.083 23.139 26.629 hybrid_alltoall_any 5260 16.5 2.041 4.626 23.328 26.541 qs_ot_p2m_diag 83 11.4 0.707 0.714 24.348 24.359 cp_dbcsr_syevd 83 12.4 0.006 0.006 22.783 22.788 grid_integrate_task_list 129 12.3 22.437 22.758 22.437 22.758 init_scf_run 11 5.9 0.001 0.003 22.598 22.608 scf_env_initial_rho_setup 11 6.9 0.018 0.035 22.598 22.608 cp_fm_diag_elpa 83 13.4 0.000 0.001 19.285 19.287 cp_fm_diag_elpa_base 83 14.4 15.039 16.362 19.280 19.281 wfi_extrapolate 11 7.9 0.001 0.001 19.051 19.051 multiply_cannon_metrocomm4 7611 15.6 0.037 0.040 6.738 16.489 mp_irecv_dv 24053 16.2 6.676 16.371 6.676 16.371 qs_ot_get_derivative_diag 77 12.4 0.003 0.003 15.548 15.835 fft_wrap_pw1pw2 1301 11.7 0.021 0.024 14.444 14.510 dbcsr_make_dense_low 8740 15.6 0.097 0.098 13.718 13.856 make_dense_data 8740 16.6 13.221 13.359 13.584 13.722 fft_wrap_pw1pw2_140 527 12.2 1.155 1.195 12.967 13.086 dbcsr_make_images_dense 4440 14.8 0.056 0.057 12.575 12.708 calculate_dm_sparse 129 9.5 0.001 0.002 11.944 12.533 cp_fm_cholesky_decompose 22 10.9 12.425 12.502 12.425 12.502 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 12.311 12.441 mp_alltoall_d11v 2423 14.1 10.935 11.473 10.935 11.473 density_rs2pw 129 9.7 0.007 0.008 9.995 10.864 fft3d_ps 1301 13.7 5.772 5.871 10.799 10.860 copy_dbcsr_to_fm 186 11.8 0.004 0.004 9.328 9.483 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="414", plot="h2o_256_md", label="(4n/2r/18t)", y=466.520000, yerr=0.000000 PlotPoint: name="415", plot="h2o_256_md_mem", label="(4n/2r/18t)", y=3129.545455, yerr=430.641459 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/29/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 9 x 32 x 32 3321888768 0.0% 100.0% 0.0% flops 22 x 32 x 32 4060086272 0.0% 100.0% 0.0% flops 138 x 512 x 512 951570137088 0.0% 100.0% 0.0% flops 160 x 512 x 512 1287148011520 0.0% 100.0% 0.0% flops 9 x 9 x 512 1389588037632 0.0% 100.0% 0.0% flops 32 x 512 x 512 1887638126592 0.0% 100.0% 0.0% flops 22 x 9 x 512 1929499951104 0.0% 100.0% 0.0% flops 9 x 22 x 512 1935094689792 0.0% 100.0% 0.0% flops 138 x 512 x 5120 2170552320000 0.0% 100.0% 0.0% flops 129 x 512 x 512 2520281776128 0.0% 100.0% 0.0% flops 22 x 22 x 512 2675276685312 0.0% 100.0% 0.0% flops 160 x 512 x 5120 2936012800000 0.0% 100.0% 0.0% flops 151 x 512 x 512 4858983743488 0.0% 100.0% 0.0% flops 129 x 512 x 5120 5748817920000 0.0% 100.0% 0.0% flops 9 x 32 x 9 5840513703936 0.0% 100.0% 0.0% flops 22 x 32 x 9 8155468062720 0.0% 100.0% 0.0% flops 9 x 32 x 22 8155468062720 0.0% 100.0% 0.0% flops 32 x 512 x 5120 9556302233600 0.0% 100.0% 0.0% flops 151 x 512 x 5120 11083448320000 0.0% 100.0% 0.0% flops 22 x 32 x 22 11218416844800 0.0% 100.0% 0.0% flops inhomo. stacks 7631854895104 100.0% 0.0% 0.0% flops total 91.939318E+12 8.3% 91.7% 0.0% flops max/rank 24.417707E+12 10.7% 89.3% 0.0% matmuls inhomo. stacks 32304 100.0% 0.0% 0.0% matmuls total 2817810780 0.0% 100.0% 0.0% number of processed stacks 3433412 0.9% 99.1% 0.0% average stack size 1.0 828.5 0.0 marketing flops 141.366701E+12 ------------------------------------------------------------------------------- # multiplications 2442 max memory usage/rank 15.159882E+09 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 19536 MPI messages size (bytes): total size 433.484333E+09 min size 0.000000E+00 max size 209.715200E+06 average size 22.189002E+06 MPI breakdown and total messages size (bytes): size <= 128 110 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 22 2883584 131072 < size <= 4194304 9224 19344130048 4194304 < size <= 16777216 0 0 16777216 < size 10180 414137650336 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 21 12. MP_Allreduce 13010 38. MP_Alltoall 9470 3350624. MP_ISend 19492 5687502. MP_IRecv 19492 5682102. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 4000 59167. MP_Allreduce 11083 1846. MP_Sync 86 MP_Alltoall 1654 67291489. MP_SendRecv 1125 1443499. MP_ISendRecv 1125 1443499. MP_Wait 3301 MP_ISend 1632 4586667. MP_IRecv 1632 4586667. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.229 0.265 412.737 412.744 qs_mol_dyn_low 1 2.0 0.260 0.280 410.810 410.825 qs_forces 11 3.9 0.069 0.094 409.864 409.879 qs_energies 11 4.9 0.009 0.010 391.257 391.284 scf_env_do_scf 11 5.9 0.012 0.018 347.452 347.476 velocity_verlet 10 3.0 0.002 0.002 274.457 274.475 scf_env_do_scf_inner_loop 114 6.6 0.014 0.048 258.848 258.870 qs_scf_new_mos 114 7.6 0.001 0.001 153.681 153.821 qs_scf_loop_do_ot 114 8.6 0.001 0.002 153.680 153.820 dbcsr_multiply_generic 2442 12.5 0.364 0.377 146.925 148.105 ot_scf_mini 114 9.6 0.009 0.012 142.260 142.588 multiply_cannon 2442 13.5 0.928 0.989 86.367 92.058 init_scf_loop 11 6.9 0.029 0.035 87.920 87.941 multiply_cannon_loop 2442 14.5 0.165 0.175 78.285 80.289 ot_mini 114 10.6 0.017 0.017 77.914 78.232 prepare_preconditioner 11 7.9 0.000 0.000 75.744 75.844 make_preconditioner 11 8.9 0.013 0.033 75.744 75.844 multiply_cannon_multrec 4884 15.5 71.960 74.442 72.245 74.735 make_full_inverse_cholesky 11 9.9 0.072 0.076 67.990 68.984 rebuild_ks_matrix 125 8.3 0.001 0.001 67.595 67.945 qs_ks_build_kohn_sham_matrix 125 9.3 0.018 0.018 67.595 67.944 qs_ks_update_qs_env 125 7.6 0.001 0.001 62.205 62.507 qs_rho_update_rho_low 125 7.7 0.001 0.001 56.010 56.207 calculate_rho_elec 125 8.7 1.001 1.002 56.009 56.206 make_m2s 4884 13.5 0.129 0.132 47.736 52.037 apply_preconditioner_dbcsr 125 12.6 0.047 0.052 40.014 40.746 apply_single 125 13.6 0.001 0.001 39.967 40.694 ot_diis_step 114 11.6 0.109 0.150 40.150 40.152 grid_collocate_task_list 125 9.7 38.669 39.107 38.669 39.107 qs_ot_get_p 125 10.4 0.001 0.001 38.023 38.150 qs_ot_get_derivative 114 11.6 0.013 0.014 37.618 37.947 cp_fm_cholesky_invert 11 10.9 36.905 36.913 36.905 36.913 make_images 4884 14.5 7.045 7.338 31.245 35.758 sum_up_and_integrate 125 10.3 0.002 0.002 34.433 34.466 integrate_v_rspace 125 11.3 0.004 0.004 34.320 34.352 qs_ot_p2m_diag 82 11.4 1.220 1.226 31.124 31.132 mp_waitall_1 79813 16.9 21.134 30.527 21.134 30.527 cp_dbcsr_syevd 82 12.4 0.007 0.008 28.832 28.836 hybrid_alltoall_any 5069 16.4 3.512 8.043 20.310 27.104 make_images_data 4884 15.5 0.071 0.082 19.072 25.581 cp_fm_diag_elpa 82 13.4 0.000 0.000 25.169 25.169 cp_fm_diag_elpa_base 82 14.4 23.949 24.162 25.164 25.165 init_scf_run 11 5.9 0.001 0.002 25.102 25.105 scf_env_initial_rho_setup 11 6.9 0.014 0.029 25.101 25.104 grid_integrate_task_list 125 12.3 22.926 23.034 22.926 23.034 dbcsr_complete_redistribute 393 12.7 8.747 8.934 21.260 22.105 wfi_extrapolate 11 7.9 0.001 0.001 21.575 21.575 fft_wrap_pw1pw2 1261 11.7 0.020 0.020 18.868 18.895 dbcsr_make_dense_low 6263 15.5 0.085 0.087 18.139 18.280 make_dense_data 6263 16.5 17.409 17.682 18.025 18.166 fft_wrap_pw1pw2_140 511 12.2 2.092 2.102 16.871 16.889 dbcsr_make_images_dense 4266 14.8 0.043 0.043 15.743 15.914 copy_dbcsr_to_fm 185 11.7 0.004 0.004 15.168 15.503 cp_fm_cholesky_decompose 22 10.9 15.044 15.119 15.044 15.119 qs_ot_get_derivative_diag 76 12.4 0.003 0.003 14.652 15.033 fft3d_ps 1261 13.7 7.900 7.948 13.777 13.868 qs_energies_init_hamiltonians 11 5.9 0.001 0.001 13.737 13.756 mp_alltoall_d11v 2385 14.1 13.430 13.628 13.430 13.628 copy_fm_to_dbcsr 208 11.6 0.002 0.002 12.774 13.462 calculate_dm_sparse 125 9.5 0.001 0.001 13.148 13.357 cp_dbcsr_sm_fm_multiply 37 9.5 0.002 0.002 13.154 13.174 density_rs2pw 125 9.7 0.006 0.006 12.581 12.908 mp_allgather_i34 2442 14.5 4.809 12.791 4.809 12.791 transfer_dbcsr_to_fm 11 10.9 0.000 0.000 11.027 11.188 qs_ks_update_qs_env_forces 11 4.9 0.000 0.000 8.903 8.965 build_core_hamiltonian_matrix_ 11 4.9 0.001 0.001 8.864 8.958 dbcsr_dot_sd 1279 12.0 7.566 7.578 8.429 8.922 cp_dbcsr_sm_fm_multiply_core 37 10.5 0.000 0.000 8.276 8.487 transfer_fm_to_dbcsr 11 9.9 0.056 0.057 7.660 8.263 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="416", plot="h2o_256_md", label="(4n/1r/36t)", y=412.744000, yerr=0.000000 PlotPoint: name="417", plot="h2o_256_md_mem", label="(4n/1r/36t)", y=11473.727273, yerr=2330.005895 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/30/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 23 x 23 x 23 234439235724792 0.0% 100.0% 0.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 234.439236E+12 0.0% 100.0% 0.0% flops max/rank 1.924147E+12 0.0% 100.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 9634225188 0.0% 100.0% 0.0% number of processed stacks 9697789 0.0% 100.0% 0.0% average stack size 0.0 993.4 0.0 marketing flops 1.742118E+15 ------------------------------------------------------------------------------- # multiplications 111 max memory usage/rank 687.923200E+06 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 351648 MPI messages size (bytes): total size 4.213128E+12 min size 0.000000E+00 max size 25.408928E+06 average size 11.981093E+06 MPI breakdown and total messages size (bytes): size <= 128 92928 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 0 0 131072 < size <= 4194304 14784 17360358048 4194304 < size <= 16777216 77033 1000974278216 16777216 < size 166903 3194793140616 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 2 12. MP_Allreduce 716 49. MP_Alltoall 310 1558616. MP_ISend 5328 5698210. MP_IRecv 5328 5713852. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 1026 253283. MP_Allreduce 3140 5695. MP_Sync 4 MP_Alltoall 61 4925786. MP_SendRecv 429 12000. MP_ISendRecv 429 12000. MP_Wait 1251 MP_ISend 726 139430. MP_IRecv 726 139314. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.059 0.094 206.516 206.523 qs_energies 1 2.0 0.000 0.001 205.535 205.560 ls_scf 1 3.0 0.000 0.000 204.747 204.762 dbcsr_multiply_generic 111 6.7 0.015 0.070 188.266 190.851 multiply_cannon 111 7.7 0.018 0.022 156.378 173.430 multiply_cannon_loop 111 8.7 0.056 0.066 153.090 169.926 multiply_cannon_multrec 1332 9.7 122.794 146.142 122.918 146.265 ls_scf_main 1 4.0 0.000 0.000 119.393 119.396 density_matrix_trs4 2 5.0 0.002 0.003 112.433 113.196 ls_scf_init_scf 1 4.0 0.000 0.000 77.114 77.117 ls_scf_init_matrix_S 1 5.0 0.000 0.000 74.900 75.438 matrix_sqrt_Newton_Schulz 2 6.5 0.001 0.001 69.761 69.786 mp_waitall_1 12957 10.9 30.609 45.484 30.609 45.484 mp_sum_l 898 5.1 18.080 31.646 18.080 31.646 multiply_cannon_metrocomm1 1332 9.7 0.011 0.017 16.728 28.116 dbcsr_multiply_generic_mpsum_f 86 7.8 0.000 0.000 13.457 24.645 multiply_cannon_metrocomm3 1332 9.7 0.007 0.011 4.112 22.107 make_m2s 222 7.7 0.007 0.008 12.946 13.251 make_images 222 8.7 0.063 0.071 12.930 13.235 mp_irecv_dv 3257 11.0 4.301 12.452 4.301 12.452 make_images_data 222 9.7 0.004 0.006 9.951 10.677 hybrid_alltoall_any 227 10.6 0.163 2.331 7.746 9.941 ls_scf_post 1 4.0 0.000 0.000 8.240 8.255 ls_scf_store_result 1 5.0 0.000 0.000 7.697 8.092 multiply_cannon_metrocomm4 1221 9.7 0.008 0.014 2.644 7.065 multiply_cannon_metrocomm2 1221 9.7 0.008 0.015 1.711 7.026 make_images_sizes 222 9.7 0.000 0.001 0.666 6.682 mp_alltoall_i44 222 10.7 0.666 6.681 0.666 6.681 calculate_norms 2376 9.8 4.922 6.169 4.922 6.169 apply_matrix_preconditioner 6 5.3 0.000 0.000 4.838 5.486 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="500", plot="h2o_32_nrep3_ls", label="(4n/36r/1t)", y=206.523000, yerr=0.000000 PlotPoint: name="501", plot="h2o_32_nrep3_ls_mem", label="(4n/36r/1t)", y=645.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/31/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 23 x 23 x 23 234439235724792 0.0% 100.0% 0.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 234.439236E+12 0.0% 100.0% 0.0% flops max/rank 3.668084E+12 0.0% 100.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 9634225188 0.0% 100.0% 0.0% number of processed stacks 9699679 0.0% 100.0% 0.0% average stack size 0.0 993.3 0.0 marketing flops 1.742118E+15 ------------------------------------------------------------------------------- # multiplications 111 max memory usage/rank 1.136574E+09 # max total images/rank 2 # max 3D layers 1 # MPI messages exchanged 167832 MPI messages size (bytes): total size 3.077601E+12 min size 0.000000E+00 max size 46.966736E+06 average size 18.337394E+06 MPI breakdown and total messages size (bytes): size <= 128 40320 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 320 37241600 131072 < size <= 4194304 9250 5469089776 4194304 < size <= 16777216 22076 261636692280 16777216 < size 95866 2810458169672 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 1026 259847. MP_Allreduce 3139 7842. MP_Sync 4 MP_Alltoall 54 45289358. MP_SendRecv 213 26880. MP_ISendRecv 213 26880. MP_Wait 945 MP_ISend 642 238274. MP_IRecv 642 237962. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.095 0.157 343.936 343.955 qs_energies 1 2.0 0.000 0.001 342.716 342.743 ls_scf 1 3.0 0.003 0.014 341.366 341.384 dbcsr_multiply_generic 111 6.7 0.020 0.048 316.392 318.892 multiply_cannon 111 7.7 0.026 0.030 272.110 294.917 multiply_cannon_loop 111 8.7 0.088 0.102 266.385 290.371 multiply_cannon_multrec 1332 9.7 238.230 268.174 238.389 268.318 ls_scf_main 1 4.0 0.004 0.017 196.998 197.010 density_matrix_trs4 2 5.0 0.003 0.009 186.059 187.118 ls_scf_init_scf 1 4.0 0.001 0.003 130.549 130.579 ls_scf_init_matrix_S 1 5.0 0.000 0.000 127.227 127.972 matrix_sqrt_Newton_Schulz 2 6.5 0.001 0.005 118.336 118.356 mp_waitall_1 10071 10.9 24.634 53.233 24.634 53.233 mp_sum_l 898 5.1 23.253 43.224 23.253 43.224 multiply_cannon_metrocomm3 1332 9.7 0.009 0.012 7.968 38.148 dbcsr_multiply_generic_mpsum_f 86 7.8 0.000 0.000 18.574 35.198 multiply_cannon_metrocomm1 1332 9.7 0.006 0.007 5.642 21.213 make_m2s 222 7.7 0.009 0.011 19.462 20.134 make_images 222 8.7 1.619 2.079 19.432 20.104 mp_irecv_dv 3391 11.0 6.379 17.570 6.379 17.570 multiply_cannon_metrocomm4 1221 9.7 0.010 0.013 5.269 14.943 ls_scf_post 1 4.0 0.004 0.017 13.815 13.835 ls_scf_store_result 1 5.0 0.000 0.000 13.084 13.622 make_images_data 222 9.7 0.006 0.006 11.536 12.626 hybrid_alltoall_any 227 10.6 0.416 2.888 10.081 11.448 calculate_norms 2376 9.8 7.667 8.504 7.667 8.504 apply_matrix_preconditioner 6 5.3 0.000 0.000 6.970 8.091 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="502", plot="h2o_32_nrep3_ls", label="(4n/18r/2t)", y=343.955000, yerr=0.000000 PlotPoint: name="503", plot="h2o_32_nrep3_ls_mem", label="(4n/18r/2t)", y=1021.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/32/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 23 x 23 x 23 234439235724792 0.0% 100.0% 0.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 234.439236E+12 0.0% 100.0% 0.0% flops max/rank 7.107463E+12 0.0% 100.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 9634225188 0.0% 100.0% 0.0% number of processed stacks 9667097 0.0% 100.0% 0.0% average stack size 0.0 996.6 0.0 marketing flops 1.742118E+15 ------------------------------------------------------------------------------- # multiplications 111 max memory usage/rank 2.014024E+09 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 39960 MPI messages size (bytes): total size 1.915058E+12 min size 0.000000E+00 max size 93.908080E+06 average size 47.924364E+06 MPI breakdown and total messages size (bytes): size <= 128 9600 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 0 0 131072 < size <= 4194304 1920 1170063360 4194304 < size <= 16777216 720 6721008480 16777216 < size 27720 1907167008560 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 2 12. MP_Allreduce 716 87. MP_Alltoall 310 5824960. MP_ISend 2664 20322846. MP_IRecv 2664 20175729. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 1026 259409. MP_Allreduce 3138 10459. MP_Sync 4 MP_Alltoall 47 20667983. MP_SendRecv 105 57600. MP_ISendRecv 105 57600. MP_Wait 567 MP_ISend 378 618054. MP_IRecv 378 618834. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.117 0.221 352.999 353.010 qs_energies 1 2.0 0.000 0.001 351.787 351.802 ls_scf 1 3.0 0.007 0.028 350.278 350.298 dbcsr_multiply_generic 111 6.7 0.019 0.023 322.131 325.956 multiply_cannon 111 7.7 0.026 0.030 266.768 295.070 multiply_cannon_loop 111 8.7 0.065 0.075 259.042 288.374 multiply_cannon_multrec 666 9.7 233.043 267.215 233.180 267.327 ls_scf_main 1 4.0 0.001 0.015 202.232 202.249 density_matrix_trs4 2 5.0 0.004 0.039 190.215 191.565 ls_scf_init_scf 1 4.0 0.002 0.009 133.537 133.545 ls_scf_init_matrix_S 1 5.0 0.004 0.019 129.759 130.788 matrix_sqrt_Newton_Schulz 2 6.5 0.003 0.015 120.943 120.970 mp_waitall_1 7293 11.0 28.952 57.742 28.952 57.742 mp_sum_l 898 5.1 27.457 46.094 27.457 46.094 dbcsr_multiply_generic_mpsum_f 86 7.8 0.000 0.000 22.311 38.671 multiply_cannon_metrocomm1 666 9.7 0.004 0.007 10.724 33.119 make_m2s 222 7.7 0.007 0.007 24.782 26.288 make_images 222 8.7 3.118 3.810 24.741 26.249 make_images_data 222 9.7 0.004 0.006 16.346 19.263 hybrid_alltoall_any 227 10.6 0.655 4.802 15.740 18.276 mp_irecv_dv 1601 11.0 6.605 17.578 6.605 17.578 multiply_cannon_metrocomm3 666 9.7 0.003 0.005 2.523 16.450 ls_scf_post 1 4.0 0.005 0.022 14.502 14.530 multiply_cannon_metrocomm4 555 9.7 0.003 0.004 4.102 14.092 ls_scf_store_result 1 5.0 0.000 0.000 13.427 14.054 multiply_cannon_metrocomm2 555 9.7 0.004 0.006 3.345 12.492 make_images_sizes 222 9.7 0.000 0.001 2.125 8.661 mp_alltoall_i44 222 10.7 2.124 8.661 2.124 8.661 apply_matrix_preconditioner 6 5.3 0.000 0.000 7.426 8.424 mp_allgather_i34 111 8.7 2.053 7.272 2.053 7.272 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="504", plot="h2o_32_nrep3_ls", label="(4n/9r/4t)", y=353.010000, yerr=0.000000 PlotPoint: name="505", plot="h2o_32_nrep3_ls_mem", label="(4n/9r/4t)", y=1377.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/33/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 23 x 23 x 23 234439235724792 0.0% 100.0% 0.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 234.439236E+12 0.0% 100.0% 0.0% flops max/rank 10.747127E+12 0.0% 100.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 9634225188 0.0% 100.0% 0.0% number of processed stacks 9703792 0.0% 100.0% 0.0% average stack size 0.0 992.8 0.0 marketing flops 1.742116E+15 ------------------------------------------------------------------------------- # multiplications 111 max memory usage/rank 2.883928E+09 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 50616 MPI messages size (bytes): total size 1.536549E+12 min size 0.000000E+00 max size 72.286792E+06 average size 30.356988E+06 MPI breakdown and total messages size (bytes): size <= 128 10368 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 1056 104411904 131072 < size <= 4194304 3168 831638784 4194304 < size <= 16777216 3103 33613273640 16777216 < size 32921 1501999894888 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 1026 266673. MP_Allreduce 3138 13030. MP_Sync 4 MP_Alltoall 47 30278988. MP_SendRecv 69 86400. MP_ISendRecv 69 86400. MP_Wait 531 MP_ISend 378 823502. MP_IRecv 378 823753. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.148 0.213 353.921 353.925 qs_energies 1 2.0 0.003 0.016 352.720 352.755 ls_scf 1 3.0 0.002 0.008 350.891 350.912 dbcsr_multiply_generic 111 6.7 0.020 0.022 322.088 324.350 multiply_cannon 111 7.7 0.029 0.031 265.512 283.233 multiply_cannon_loop 111 8.7 0.103 0.113 257.398 275.448 multiply_cannon_multrec 1332 9.7 242.281 262.438 242.400 262.554 ls_scf_main 1 4.0 0.002 0.015 203.285 203.296 density_matrix_trs4 2 5.0 0.020 0.107 191.459 192.332 ls_scf_init_scf 1 4.0 0.021 0.081 133.156 133.158 ls_scf_init_matrix_S 1 5.0 0.000 0.001 129.884 130.500 matrix_sqrt_Newton_Schulz 2 6.5 0.001 0.005 120.811 120.829 mp_sum_l 898 5.1 19.644 38.941 19.644 38.941 make_m2s 222 7.7 0.009 0.010 33.013 33.845 make_images 222 8.7 3.701 4.138 32.954 33.786 dbcsr_multiply_generic_mpsum_f 86 7.8 0.000 0.000 15.895 32.008 mp_waitall_1 6369 11.0 23.176 29.517 23.176 29.517 make_images_data 222 9.7 0.005 0.006 19.217 21.058 hybrid_alltoall_any 227 10.6 0.930 3.970 18.066 20.847 multiply_cannon_metrocomm4 1110 9.7 0.006 0.008 4.616 16.133 mp_irecv_dv 3229 10.9 4.609 16.106 4.609 16.106 ls_scf_post 1 4.0 0.031 0.070 14.450 14.470 ls_scf_store_result 1 5.0 0.000 0.000 13.661 14.163 multiply_cannon_metrocomm1 1332 9.7 0.004 0.004 3.447 9.062 apply_matrix_preconditioner 6 5.3 0.000 0.000 7.157 8.494 dbcsr_data_release 10900 10.7 7.242 8.036 7.242 8.036 arnoldi_extremal 4 6.8 0.012 0.049 7.387 7.419 arnoldi_normal_ev 4 7.8 0.002 0.009 7.375 7.419 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="506", plot="h2o_32_nrep3_ls", label="(4n/6r/6t)", y=353.925000, yerr=0.000000 PlotPoint: name="507", plot="h2o_32_nrep3_ls_mem", label="(4n/6r/6t)", y=1970.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/34/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 23 x 23 x 23 234439235724792 0.0% 100.0% 0.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 234.439236E+12 0.0% 100.0% 0.0% flops max/rank 15.383312E+12 0.0% 100.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 9634225188 0.0% 100.0% 0.0% number of processed stacks 9657067 0.0% 100.0% 0.0% average stack size 0.0 997.6 0.0 marketing flops 1.742118E+15 ------------------------------------------------------------------------------- # multiplications 111 max memory usage/rank 4.273308E+09 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 10656 MPI messages size (bytes): total size 1.149035E+12 min size 0.000000E+00 max size 203.538048E+06 average size 107.829832E+06 MPI breakdown and total messages size (bytes): size <= 128 2304 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 0 0 131072 < size <= 4194304 768 702038016 4194304 < size <= 16777216 0 0 16777216 < size 7584 1148332810224 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 2 12. MP_Allreduce 716 126. MP_Alltoall 310 12920694. MP_ISend 1776 40180426. MP_IRecv 1776 40465032. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 1026 265536. MP_Allreduce 3129 15263. MP_Sync 4 MP_Alltoall 47 46208988. MP_SendRecv 45 115200. MP_ISendRecv 45 115200. MP_Wait 528 MP_ISend 420 924980. MP_IRecv 420 924528. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.069 0.132 356.539 356.542 qs_energies 1 2.0 0.002 0.008 355.427 355.464 ls_scf 1 3.0 0.000 0.001 353.518 353.552 dbcsr_multiply_generic 111 6.7 0.022 0.024 321.776 323.223 multiply_cannon 111 7.7 0.029 0.033 268.857 284.131 multiply_cannon_loop 111 8.7 0.074 0.083 258.298 267.265 multiply_cannon_multrec 444 9.7 238.728 249.871 238.841 249.984 ls_scf_main 1 4.0 0.000 0.001 206.945 206.950 density_matrix_trs4 2 5.0 0.005 0.040 193.578 194.242 ls_scf_init_scf 1 4.0 0.001 0.006 132.466 132.476 ls_scf_init_matrix_S 1 5.0 0.001 0.011 129.276 129.750 matrix_sqrt_Newton_Schulz 2 6.5 0.005 0.020 120.006 120.018 mp_waitall_1 5436 11.0 34.000 39.798 34.000 39.798 make_m2s 222 7.7 0.006 0.006 34.563 38.012 make_images 222 8.7 3.681 4.406 34.493 37.936 make_images_data 222 9.7 0.004 0.006 24.662 29.287 hybrid_alltoall_any 227 10.6 1.177 5.554 22.905 28.436 mp_sum_l 898 5.1 13.152 25.393 13.152 25.393 dbcsr_multiply_generic_mpsum_f 86 7.8 0.000 0.000 10.250 20.182 ls_scf_post 1 4.0 0.009 0.036 14.107 14.143 mp_allgather_i34 111 8.7 3.557 14.016 3.557 14.016 ls_scf_store_result 1 5.0 0.000 0.000 13.549 13.868 make_images_sizes 222 9.7 0.000 0.000 2.469 11.834 mp_alltoall_i44 222 10.7 2.469 11.834 2.469 11.834 multiply_cannon_metrocomm1 444 9.7 0.003 0.003 8.915 11.584 dbcsr_data_release 10924 10.7 8.548 9.637 8.548 9.637 apply_matrix_preconditioner 6 5.3 0.000 0.000 7.931 8.820 arnoldi_extremal 4 6.8 0.006 0.026 8.522 8.546 arnoldi_normal_ev 4 7.8 0.002 0.007 8.515 8.546 multiply_cannon_metrocomm4 333 9.7 0.002 0.002 3.057 8.122 mp_irecv_dv 1241 11.2 3.059 8.116 3.059 8.116 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="508", plot="h2o_32_nrep3_ls", label="(4n/4r/9t)", y=356.542000, yerr=0.000000 PlotPoint: name="509", plot="h2o_32_nrep3_ls_mem", label="(4n/4r/9t)", y=2808.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/35/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 23 x 23 x 23 234439235724792 0.0% 100.0% 0.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 234.439236E+12 0.0% 100.0% 0.0% flops max/rank 20.557908E+12 0.0% 100.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 9634225188 0.0% 100.0% 0.0% number of processed stacks 9673530 0.0% 100.0% 0.0% average stack size 0.0 995.9 0.0 marketing flops 1.742116E+15 ------------------------------------------------------------------------------- # multiplications 111 max memory usage/rank 5.549216E+09 # max total images/rank 3 # max 3D layers 1 # MPI messages exchanged 10656 MPI messages size (bytes): total size 1.158041E+12 min size 0.000000E+00 max size 265.321008E+06 average size 108.674984E+06 MPI breakdown and total messages size (bytes): size <= 128 1536 0 128 < size <= 8192 0 0 8192 < size <= 32768 0 0 32768 < size <= 131072 0 0 131072 < size <= 4194304 1536 702038016 4194304 < size <= 16777216 72 672100848 16777216 < size 7512 1156666219168 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 1026 284877. MP_Allreduce 3129 18839. MP_Sync 4 MP_Alltoall 47 60354741. MP_SendRecv 33 144000. MP_ISendRecv 33 144000. MP_Wait 432 MP_ISend 336 1403879. MP_IRecv 336 1404443. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.126 0.211 368.012 368.017 qs_energies 1 2.0 0.000 0.000 366.314 366.324 ls_scf 1 3.0 0.000 0.001 363.440 363.467 dbcsr_multiply_generic 111 6.7 0.024 0.026 328.182 328.965 multiply_cannon 111 7.7 0.032 0.035 271.442 287.978 multiply_cannon_loop 111 8.7 0.090 0.095 260.923 276.297 multiply_cannon_multrec 666 9.7 228.950 235.378 229.041 235.470 ls_scf_main 1 4.0 0.002 0.014 218.396 218.397 density_matrix_trs4 2 5.0 0.013 0.043 203.049 203.450 ls_scf_init_scf 1 4.0 0.001 0.002 130.819 130.820 ls_scf_init_matrix_S 1 5.0 0.000 0.001 127.526 127.853 matrix_sqrt_Newton_Schulz 2 6.5 0.001 0.004 118.340 118.347 mp_waitall_1 5424 11.0 39.403 57.877 39.403 57.877 make_m2s 222 7.7 0.007 0.007 37.200 40.381 make_images 222 8.7 4.568 5.066 37.111 40.290 multiply_cannon_metrocomm3 666 9.7 0.003 0.004 16.502 36.218 multiply_cannon_metrocomm4 555 9.7 0.003 0.004 8.310 29.021 mp_irecv_dv 1779 11.1 8.272 28.811 8.272 28.811 hybrid_alltoall_any 227 10.6 1.741 3.775 23.710 27.942 make_images_data 222 9.7 0.004 0.005 23.877 27.185 mp_sum_l 898 5.1 16.084 25.263 16.084 25.263 dbcsr_multiply_generic_mpsum_f 86 7.8 0.000 0.000 13.059 21.907 ls_scf_post 1 4.0 0.011 0.023 14.224 14.250 ls_scf_store_result 1 5.0 0.000 0.000 13.720 13.936 dbcsr_data_release 12835 10.6 6.956 11.812 6.956 11.812 apply_matrix_preconditioner 6 5.3 0.000 0.000 9.316 9.668 arnoldi_extremal 4 6.8 0.016 0.062 8.523 8.586 arnoldi_normal_ev 4 7.8 0.003 0.011 8.507 8.584 ls_scf_dm_to_ks 2 5.0 0.000 0.000 7.608 7.664 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="510", plot="h2o_32_nrep3_ls", label="(4n/3r/12t)", y=368.017000, yerr=0.000000 PlotPoint: name="511", plot="h2o_32_nrep3_ls_mem", label="(4n/3r/12t)", y=3568.000000, yerr=0.000000 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ~~~~~~~~~ RESULT ~~~~~~~~~ RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC40-gnu/aa4adac34cf79714cddcdea931a90aba3810da90_performance_tests/36/result.log @@@@@@@@@@ Run number: 1 @@@@@@@@@@ ------------------------------------------------------------------------------- - - - DBCSR STATISTICS - - - ------------------------------------------------------------------------------- COUNTER TOTAL BLAS SMM ACC flops 64 x 64 x 64 109521666048 0.0% 100.0% 0.0% flops 32 x 32 x 849 202529832960 0.0% 100.0% 0.0% flops 32 x 32 x 853 203484037120 0.0% 100.0% 0.0% flops 32 x 32 x 858 204676792320 0.0% 100.0% 0.0% flops 64 x 64 x 96 328564998144 0.0% 100.0% 0.0% flops 64 x 96 x 64 328564998144 0.0% 100.0% 0.0% flops 96 x 64 x 64 328564998144 0.0% 100.0% 0.0% flops 9 x 32 x 32 549621596160 0.0% 100.0% 0.0% flops 22 x 32 x 32 671759728640 0.0% 100.0% 0.0% flops 64 x 64 x 849 936533557248 0.0% 100.0% 0.0% flops 64 x 64 x 853 940945965056 0.0% 100.0% 0.0% flops 64 x 64 x 858 946461474816 0.0% 100.0% 0.0% flops 96 x 96 x 64 985694994432 0.0% 100.0% 0.0% flops 64 x 96 x 96 985694994432 0.0% 100.0% 0.0% flops 96 x 64 x 96 985694994432 0.0% 100.0% 0.0% flops 849 x 64 x 64 1285508038656 0.0% 100.0% 0.0% flops 853 x 64 x 64 1291564613632 0.0% 100.0% 0.0% flops 858 x 64 x 64 1299135332352 0.0% 100.0% 0.0% flops 9 x 9 x 64 1833777211392 0.0% 100.0% 0.0% flops 9 x 22 x 64 2466560397312 0.0% 100.0% 0.0% flops 22 x 9 x 64 2471027226624 0.0% 100.0% 0.0% flops 64 x 96 x 849 2809600671744 0.0% 100.0% 0.0% flops 96 x 64 x 849 2809600671744 0.0% 100.0% 0.0% flops 64 x 96 x 853 2822837895168 0.0% 100.0% 0.0% flops 96 x 64 x 853 2822837895168 0.0% 100.0% 0.0% flops 64 x 96 x 858 2839384424448 0.0% 100.0% 0.0% flops 96 x 64 x 858 2839384424448 0.0% 100.0% 0.0% flops 849 x 64 x 849 2928781688832 0.0% 100.0% 0.0% flops 849 x 64 x 853 2942580424704 0.0% 100.0% 0.0% flops 853 x 64 x 849 2942580424704 0.0% 100.0% 0.0% flops 853 x 64 x 853 2956444172288 0.0% 100.0% 0.0% flops 96 x 96 x 96 2957084983296 0.0% 100.0% 0.0% flops 849 x 64 x 858 2959828844544 0.0% 100.0% 0.0% flops 858 x 64 x 849 2959828844544 0.0% 100.0% 0.0% flops 853 x 64 x 858 2973773856768 0.0% 100.0% 0.0% flops 858 x 64 x 853 2973773856768 0.0% 100.0% 0.0% flops 858 x 64 x 858 2991205122048 0.0% 100.0% 0.0% flops 22 x 22 x 64 3338610130944 0.0% 100.0% 0.0% flops 849 x 64 x 96 3856524115968 0.0% 100.0% 0.0% flops 849 x 96 x 64 3856524115968 0.0% 100.0% 0.0% flops 853 x 64 x 96 3874693840896 0.0% 100.0% 0.0% flops 853 x 96 x 64 3874693840896 0.0% 100.0% 0.0% flops 858 x 96 x 64 3897405997056 0.0% 100.0% 0.0% flops 858 x 64 x 96 3897405997056 0.0% 100.0% 0.0% flops 9 x 9 x 96 5501331634176 0.0% 100.0% 0.0% flops 9 x 22 x 96 7399681191936 0.0% 100.0% 0.0% flops 22 x 9 x 96 7413081679872 0.0% 100.0% 0.0% flops 96 x 96 x 849 8428802015232 0.0% 100.0% 0.0% flops 96 x 96 x 853 8468513685504 0.0% 100.0% 0.0% flops 96 x 96 x 858 8518153273344 0.0% 100.0% 0.0% flops 849 x 96 x 849 8786345066496 0.0% 100.0% 0.0% flops 849 x 96 x 853 8827741274112 0.0% 100.0% 0.0% flops 853 x 96 x 849 8827741274112 0.0% 100.0% 0.0% flops 853 x 96 x 853 8869332516864 0.0% 100.0% 0.0% flops 858 x 96 x 849 8879486533632 0.0% 100.0% 0.0% flops 849 x 96 x 858 8879486533632 0.0% 100.0% 0.0% flops 858 x 96 x 853 8921321570304 0.0% 100.0% 0.0% flops 853 x 96 x 858 8921321570304 0.0% 100.0% 0.0% flops 858 x 96 x 858 8973615366144 0.0% 100.0% 0.0% flops 22 x 22 x 96 10015830392832 0.0% 100.0% 0.0% flops 849 x 96 x 96 11569572347904 0.0% 100.0% 0.0% flops 853 x 96 x 96 11624081522688 0.0% 100.0% 0.0% flops 858 x 96 x 96 11692217991168 0.0% 100.0% 0.0% flops 9 x 32 x 9 21312216612864 0.0% 100.0% 0.0% flops 22 x 32 x 9 29317892972544 0.0% 100.0% 0.0% flops 9 x 32 x 22 29317892972544 0.0% 100.0% 0.0% flops 22 x 32 x 22 40107728764928 0.0% 100.0% 0.0% flops inhomo. stacks 0 0.0% 0.0% 0.0% flops total 383.054662E+12 0.0% 100.0% 0.0% flops max/rank 769.048094E+09 0.0% 100.0% 0.0% matmuls inhomo. stacks 0 0.0% 0.0% 0.0% matmuls total 11370092824 0.0% 100.0% 0.0% number of processed stacks 36472128 0.0% 100.0% 0.0% average stack size 0.0 311.7 0.0 marketing flops 780.451392E+12 ------------------------------------------------------------------------------- # multiplications 1445 max memory usage/rank 365.891584E+06 # max total images/rank 1 # max 3D layers 1 # MPI messages exchanged 38286720 MPI messages size (bytes): total size 22.066386E+12 min size 0.000000E+00 max size 5.889312E+06 average size 576.345688E+03 MPI breakdown and total messages size (bytes): size <= 128 274344 0 128 < size <= 8192 0 0 8192 < size <= 32768 1746712 57194053632 32768 < size <= 131072 13942784 856644648960 131072 < size <= 4194304 21501504 16367441085440 4194304 < size <= 16777216 821376 4784862003200 16777216 < size 0 0 ------------------------------------------------------------------------------- - - - DBCSR MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Bcast 68 12. MP_Allreduce 7415 50. MP_Alltoall 5329 496642. MP_ISend 138692 290073. MP_IRecv 138692 288852. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - MESSAGE PASSING PERFORMANCE - - - ------------------------------------------------------------------------------- ROUTINE CALLS AVE VOLUME [Bytes] MP_Group 4 MP_Bcast 4640 77325. MP_Allreduce 13232 2300. MP_Sync 1064 MP_Alltoall 2588 5068187. MP_SendRecv 126500 14304. MP_ISendRecv 69000 14304. MP_Wait 78200 MP_comm_split 40 MP_ISend 24680 99620. MP_IRecv 36480 68885. ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- - - - T I M I N G - - - ------------------------------------------------------------------------------- SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM CP2K 1 1.0 0.127 0.324 274.450 274.505 qs_mol_dyn_low 1 2.0 0.164 0.364 271.871 271.908 qs_forces 5 3.8 0.007 0.043 270.972 271.138 qs_energies 5 4.8 0.002 0.016 266.057 266.220 scf_env_do_scf 5 5.8 0.000 0.001 251.640 251.671 scf_env_do_scf_inner_loop 105 6.6 0.004 0.026 204.502 204.525 velocity_verlet 4 3.0 0.001 0.001 133.720 133.754 qs_scf_new_mos 105 7.6 0.001 0.001 123.137 124.132 qs_scf_loop_do_ot 105 8.6 0.001 0.001 123.136 124.132 ot_scf_mini 105 9.6 0.004 0.006 111.408 112.031 dbcsr_multiply_generic 1445 12.2 0.205 0.246 106.581 109.101 multiply_cannon 1445 13.2 0.186 0.230 86.987 94.885 multiply_cannon_loop 1445 14.2 0.336 0.512 83.870 89.203 mp_waitall_1 372490 16.1 52.453 65.805 52.453 65.805 rebuild_ks_matrix 110 8.4 0.001 0.001 51.175 51.689 qs_ks_build_kohn_sham_matrix 110 9.4 0.013 0.016 51.175 51.689 qs_ks_update_qs_env 112 7.6 0.001 0.001 47.146 47.609 init_scf_loop 7 6.6 0.000 0.000 47.076 47.101 qs_ot_get_p 112 10.4 0.001 0.002 45.296 47.066 multiply_cannon_metrocomm3 34680 15.2 0.125 0.247 9.634 46.946 multiply_cannon_multrec 34680 15.2 39.380 46.692 39.392 46.702 ot_mini 105 10.6 0.001 0.001 45.877 46.635 qs_rho_update_rho_low 110 7.6 0.001 0.001 41.743 41.892 calculate_rho_elec 110 8.6 0.047 0.048 41.742 41.891 multiply_cannon_metrocomm1 34680 15.2 0.149 0.281 30.794 40.693 prepare_preconditioner 7 7.6 0.000 0.000 39.585 39.700 make_preconditioner 7 8.6 0.000 0.000 39.585 39.700 make_full_inverse_cholesky 7 9.6 0.000 0.000 30.955 31.332 qs_ot_p2m_diag 40 11.0 0.022 0.029 31.101 31.192 cp_dbcsr_syevd 40 12.0 0.003 0.003 30.237 30.307 qs_ot_get_derivative 55 11.6 0.001 0.001 28.673 29.347 fft_wrap_pw1pw2 1425 12.5 0.019 0.023 23.796 24.212 cp_fm_syevd 40 13.0 0.000 0.001 23.706 23.840 grid_collocate_task_list 110 9.6 20.473 22.243 20.473 22.243 fft_wrap_pw1pw2_240 915 14.0 0.436 0.488 21.357 22.025 density_rs2pw 110 9.6 0.006 0.009 18.839 20.968 sum_up_and_integrate 60 10.3 0.001 0.002 20.896 20.924 integrate_v_rspace 60 11.3 0.002 0.003 20.862 20.892 fft3d_pb 915 15.0 4.994 5.350 19.234 19.946 apply_preconditioner_dbcsr 62 12.6 0.000 0.000 17.937 19.062 apply_single 62 13.6 0.000 0.000 17.937 19.062 qs_vxc_create 110 10.4 0.002 0.003 18.604 18.670 cp_fm_redistribute_end 40 14.0 8.937 17.811 8.944 17.816 cp_fm_syevd_base 40 14.0 8.862 17.742 8.862 17.742 ot_new_cg_direction 55 11.6 0.001 0.002 17.006 17.009 cp_fm_cholesky_invert 7 10.6 16.178 16.189 16.178 16.189 make_m2s 2890 13.2 0.098 0.114 14.527 15.572 transfer_rs2pw 445 10.6 0.007 0.010 12.736 14.692 make_images 2890 14.2 0.238 0.271 13.310 14.478 mp_alltoall_z22v 2340 16.7 12.993 14.115 12.993 14.115 cp_fm_cholesky_decompose 14 10.2 13.595 13.646 13.595 13.646 xc_pw_derive 510 13.4 0.006 0.010 12.788 13.090 calculate_dm_sparse 110 9.5 0.001 0.001 12.423 13.031 xc_vxc_pw_create 60 11.3 0.081 0.093 12.806 12.872 xc_rho_set_and_dset_create 110 12.4 0.138 0.171 12.575 12.631 mp_waitany 6270 13.5 10.665 12.617 10.665 12.617 check_diag 80 13.5 10.237 10.488 11.568 11.695 make_images_data 2890 15.2 0.069 0.091 9.773 11.141 qs_ot_get_derivative_taylor 37 12.8 0.001 0.002 10.148 10.583 grid_integrate_task_list 60 12.3 9.946 10.404 9.946 10.404 init_scf_run 5 5.8 0.000 0.000 10.305 10.307 scf_env_initial_rho_setup 5 6.8 0.001 0.008 10.305 10.306 hybrid_alltoall_any 2983 16.1 0.102 1.298 7.273 9.571 potential_pw2rs 60 12.3 0.003 0.005 9.147 9.186 mp_sum_l 7231 12.6 4.006 7.966 4.006 7.966 mp_irecv_dv 72697 16.1 3.206 7.842 3.206 7.842 make_full_single_inverse 7 9.6 0.001 0.001 7.566 7.677 multiply_cannon_metrocomm4 33235 15.2 0.115 0.256 3.037 7.155 transfer_pw2rs 245 13.2 0.003 0.004 7.091 7.117 cube_transpose_3 560 16.1 0.547 0.757 6.495 7.113 qs_ot_get_derivative_diag 18 12.0 0.001 0.001 6.453 6.712 mp_alltoall_d11v 1300 13.8 5.877 6.533 5.877 6.533 transfer_rs2pw_240 115 11.5 1.076 1.520 4.118 6.090 make_images_sizes 2890 15.2 0.004 0.008 1.821 6.013 mp_alltoall_i44 2890 16.2 1.817 6.008 1.817 6.008 transfer_rs2pw_80 110 11.6 0.243 0.316 5.523 5.967 mp_allgather_i34 1445 14.2 1.802 5.826 1.802 5.826 xc_pw_divergence 60 12.3 0.002 0.003 5.700 5.824 xc_exc_calc 50 11.5 0.019 0.019 5.796 5.802 qs_ot_get_orbitals 105 10.6 0.001 0.001 5.345 5.600 wfi_extrapolate 5 7.8 0.000 0.001 5.570 5.571 ------------------------------------------------------------------------------- ~ ~ ~ ~ DATA POINTS ~ ~ ~ ~ PlotPoint: name="601", plot="h2o_512_md", label="(16n/36r/1t)", y=274.505000, yerr=0.000000 PlotPoint: name="602", plot="h2o_512_md_mem", label="(16n/36r/1t)", y=344.800000, yerr=1.939072 ~ ~ ~ ~ END DATA POINTS ~ ~ ~ ~~~~~~ END RESULT ~~~~~~~~ ========= END RESULTS =========== CommitSHA: aa4adac34cf79714cddcdea931a90aba3810da90 Summary: empty Status: OK