=== This is the CP2K Performance-Test ===


Updating ce4b5920f..cd366a402
Fast-forward
 INSTALL.md                                       |    7 +-
 arch/Linux-x86-64-mingw64-minimal.sopt           |    2 +-
 src/CMakeLists.txt                               |    4 +
 src/cp2k_info.F                                  |    8 +-
 src/hdf5_wrapper.F                               |  762 ++++++++
 src/input_constants.F                            |    6 +-
 src/input_cp2k_dft.F                             |  125 +-
 src/ipi_driver.F                                 |  314 +---
 src/qcschema.F                                   | 1036 +++++++++++
 src/qs_active_space_methods.F                    | 2017 ++++++++++++----------
 src/qs_active_space_types.F                      |   53 +-
 src/qs_active_space_utils.F                      |  158 ++
 src/qs_energy.F                                  |    4 +
 src/qs_scf_post_gpw.F                            |    4 -
 src/sockets.c                                    |  119 +-
 src/sockets_interface.F                          |  304 ++++
 tests/QS/regtest-as-1/TEST_FILES                 |   24 +-
 tests/QS/regtest-as-1/ch2_gapw_2-3.inp           |   31 +-
 tests/QS/regtest-as-1/ch2_gapw_pp_2-3.inp        |   31 +-
 tests/QS/regtest-as-1/ch2_gpw_pp_2-3.inp         |   31 +-
 tests/QS/regtest-as-1/h2_gapw_2-2.inp            |   30 +-
 tests/QS/regtest-as-1/h2_gapw_2-3.inp            |   30 +-
 tests/QS/regtest-as-1/h2_gapw_2-4.inp            |   30 +-
 tests/QS/regtest-as-1/h2_gapw_pp_2-2.inp         |   30 +-
 tests/QS/regtest-as-1/h2_gapw_pp_2-3.inp         |   30 +-
 tests/QS/regtest-as-1/h2_gapw_pp_2-4.inp         |   30 +-
 tests/QS/regtest-as-1/h2_gpw_pp_2-2.inp          |   30 +-
 tests/QS/regtest-as-1/h2_gpw_pp_2-3.inp          |   30 +-
 tests/QS/regtest-as-1/h2_gpw_pp_2-4.inp          |   30 +-
 tests/QS/regtest-as-1/h2o_gapw_2-2.inp           |   30 +-
 tests/QS/regtest-as-dft/h2_2-2.inp               |   38 +-
 tests/QS/regtest-as-dft/h2_iterative_density.inp |   42 +-
 tests/QS/regtest-as-dft/h2_manual_1+2.inp        |   42 +-
 tests/QS/regtest-as-dft/h2_manual_1+2_1+3.inp    |   43 +-
 tests/QS/regtest-as-dft/h2_manual_1+3.inp        |   42 +-
 tests/QS/regtest-as-dft/h2_manual_1+3_1+2.inp    |   43 +-
 tests/QS/regtest-as-dft/h2_manual_1+4.inp        |   42 +-
 tests/QS/regtest-as-dft/h2_manual_UKS_1+2.inp    |   43 +-
 tests/QS/regtest-as/be.inp                       |   36 +-
 tests/QS/regtest-as/h2o.inp                      |   44 +-
 tools/toolchain/scripts/stage7/install_hdf5.sh   |   15 +-
 41 files changed, 4085 insertions(+), 1685 deletions(-)
 create mode 100644 src/hdf5_wrapper.F
 create mode 100644 src/qcschema.F
 create mode 100644 src/qs_active_space_utils.F
 create mode 100644 src/sockets_interface.F
Current branch master is up to date.


Already up to date.
Current branch master is up to date.

 GIT Revision: cd366a4022ad0ce9b8ea778ed4e993fe588c3c12


################# ARCHITECTURE FILE ##################
#!/bin/bash
#
# CP2K arch file for Cray-XC50 (Piz Daint, CSCS, GPU partition)
#
# Tested with: GNU 9.3.0, Cray-MPICH 7.7.18, Cray-libsci 20.09.1, Cray-FFTW 3.3.8.10,
#              COSMA 2.6.6, ELPA 2023.05.001, LIBINT 2.6.0, LIBPEXSI 1.2.0,
#              LIBXC 6.2.2, LIBVORI 220621, LIBXSMM 1.17, PLUMED 2.8.2,
#              SIRIUS 7.4.3, SPGLIB 1.16.2
#
# Usage: Source this arch file and then run make as instructed.
#        A full toolchain installation is performed as default.
#        Replace or adapt the "module add" commands below if needed.
#
# Last update: 21.06.2023
#
# Shell preamble (bash only). make reads everything from the "# \" line below
# down to "return" as one continued comment because every line ends with a
# backslash; bash ends the comment at the first newline and executes the rest.
# Keep the trailing backslash on every line up to (but excluding) "return".
#
# When this file is sourced, the preamble
#   1. aborts if the file is executed instead of sourced,
#   2. picks the GCC version (optional first argument, default 9.3.0),
#   3. loads the modules and saves them as collection cp2k_gpu_gnu_psmp,
#   4. runs the toolchain installer in tools/toolchain and sources its setup,
#   5. prints the make commands for building CP2K.
# It returns with status 1 if tools/toolchain cannot be entered, so that the
# later "cd ../.." never moves the caller's shell out of its start directory.
# NOTE(review): maxtasks is not set in this file; it has to come from the
# caller's environment, otherwise the installer receives a bare "-j".
#
# \
   if [ "${0}" = "${BASH_SOURCE}" ]; then \
      echo "ERROR: Script ${0##*/} must be sourced"; \
      echo "Usage: source ${0##*/}"; \
      exit 1; \
   fi; \
   this_file=${BASH_SOURCE##*/}; \
   gcc_version="${1:-9.3.0}"; \
   module add daint-gpu; \
   module rm PrgEnv-cray; \
   module add PrgEnv-gnu; \
   module rm gcc; \
   module add "gcc/${gcc_version}"; \
   module add cray-fftw/3.3.8.10; \
   module add cudatoolkit; \
   echo "Expected setup:"; \
   echo "   cray-mpich/7.7.18"; \
   echo "   craype-haswell"; \
   echo "   daint-gpu/21.09"; \
   echo "   craype/2.7.10"; \
   echo "   cray-libsci/20.09.1"; \
   echo "   PrgEnv-gnu/6.0.10"; \
   echo "   gcc/${gcc_version}"; \
   echo "   cray-fftw/3.3.8.10"; \
   echo "   cudatoolkit/11.0.2_3.38-8.1__g5b73779"; \
   module list; \
   module -f save cp2k_gpu_gnu_psmp; \
   echo "To load the required modules in your batch job script, use:"; \
   echo "   module restore cp2k_gpu_gnu_psmp"; \
   cd tools/toolchain || { echo "ERROR: Cannot enter tools/toolchain; source ${this_file} from the CP2K root directory"; return 1; }; \
   ./install_cp2k_toolchain.sh --enable-cuda=yes --gpu-ver=P100 -j${maxtasks} --no-arch-files --with-gcc=system --with-libvdwxc --with-pexsi --with-plumed; \
   cd ../..; \
   printf "Sourcing %s ... " "${PWD}/tools/toolchain/install/setup"; \
   source "${PWD}/tools/toolchain/install/setup"; \
   printf "done\n"; \
   echo "Check the output above for error messages and consistency!"; \
   echo; \
   echo "If everything is OK, you can build a CP2K production binary with"; \
   echo "   make -j ARCH=${this_file%.*} VERSION=${this_file##*.}"; \
   echo; \
   echo "Alternatively, you can add further checks, e.g. for regression testing, with"; \
   echo "   make -j ARCH=${this_file%.*} VERSION=${this_file##*.} DO_CHECKS=yes"; \
   echo "or build CP2K as a library with"; \
   echo "   make -j ARCH=${this_file%.*} VERSION=${this_file##*.} libcp2k"; \
   echo; \
   return

# Set options
# DO_CHECKS and USE_ACC are yes/no switches. Each USE_<PACKAGE> variable holds
# the version of the package to link against; leaving it empty or commented out
# (like USE_QUIP) skips the matching "ifneq" section further down.
DO_CHECKS      := no
USE_ACC        := yes
USE_COSMA      := 2.6.6
USE_ELPA       := 2023.05.001
USE_LIBINT     := 2.6.0
USE_LIBPEXSI   := 1.2.0
USE_LIBVORI    := 220621
USE_LIBXC      := 6.2.2
USE_LIBXSMM    := 1.17
USE_PLUMED     := 2.8.2
#USE_QUIP       := 0.9.10
USE_SIRIUS     := 7.4.3
USE_SPGLIB     := 1.16.2
# Only needed for SIRIUS
LIBVDWXC_VER   := 0.4.0
SPFFT_VER      := 1.0.6
SPLA_VER       := 1.5.5
HDF5_VER       := 1.12.0
# Only needed for LIBPEXSI
SCOTCH_VER     := 6.0.0
SUPERLU_VER    := 6.1.0

# LMAX is part of the libint install directory name (suffix lmax-<LMAX>);
# MAX_CONTR is passed to the preprocessor as -D__MAX_CONTR=<MAX_CONTR>.
LMAX           := 5
MAX_CONTR      := 4

# NOTE(review): GPUVER and OFFLOAD_TARGET are not referenced again in this file;
# presumably they are read by the Makefile that includes it - confirm.
GPUVER         := P100
OFFLOAD_TARGET := cuda

# Compiler, linker, and archiver commands
CC             := cc
CXX            := CC
OFFLOAD_CC     := nvcc
FC             := ftn
LD             := ftn
AR             := ar -r

# cc, CC, and ftn already include the proper -march flag
CFLAGS         := -O2 -fopenmp -fopenmp-simd -ftree-vectorize -funroll-loops -g

# Preprocessor defines that are always set; the package sections below append more
DFLAGS         := -D__parallel
DFLAGS         += -D__SCALAPACK
DFLAGS         += -D__FFTW3
DFLAGS         += -D__MAX_CONTR=$(strip $(MAX_CONTR))

# Root of the toolchain installation; every package path below is built from it
INSTALL_PATH   := $(PWD)/tools/toolchain/install

# Additional define that is only set when DO_CHECKS is yes
ifeq ($(DO_CHECKS), yes)
   DFLAGS         += -D__CHECK_DIAG
endif

# Offload-related defines that are only set when USE_ACC is yes
ifeq ($(USE_ACC), yes)
   DFLAGS         += -D__DBCSR_ACC
   DFLAGS         += -D__OFFLOAD_CUDA
# Possibly no performance gain with PW_CUDA currently
   DFLAGS         += -D__NO_OFFLOAD_PW
endif

# PLUMED: add the define and the static library. This section also sets USE_GSL,
# which activates the GSL section further down in this file.
ifneq ($(USE_PLUMED),)
   USE_PLUMED     := $(strip $(USE_PLUMED))
   PLUMED_LIB     := $(INSTALL_PATH)/plumed-$(USE_PLUMED)/lib
   DFLAGS         += -D__PLUMED2
   USE_GSL        := 2.7
   LIBS           += $(PLUMED_LIB)/libplumed.a
endif

# ELPA: TARGET names the sub-directory of the ELPA installation that is used.
# With TARGET set to nvidia, -D__ELPA_NVIDIA_GPU is defined in addition to -D__ELPA.
ifneq ($(USE_ELPA),)
   USE_ELPA       := $(strip $(USE_ELPA))
   TARGET         := nvidia
   ELPA_INC       := $(INSTALL_PATH)/elpa-$(USE_ELPA)/$(TARGET)/include/elpa-$(USE_ELPA)
   ELPA_LIB       := $(INSTALL_PATH)/elpa-$(USE_ELPA)/$(TARGET)/lib
   CFLAGS         += -I$(ELPA_INC)/elpa -I$(ELPA_INC)/modules
   DFLAGS         += -D__ELPA
   ifeq ($(TARGET), nvidia)
      DFLAGS         += -D__ELPA_NVIDIA_GPU
   endif
   LIBS           += $(ELPA_LIB)/libelpa.a
endif

# QUIP: inactive as long as USE_QUIP stays commented out in the options above.
# When enabled, adds the include path, the define, and the QUIP and FoX static libraries.
ifneq ($(USE_QUIP),)
   USE_QUIP       := $(strip $(USE_QUIP))
   QUIP_INC       := $(INSTALL_PATH)/quip-$(USE_QUIP)/include
   QUIP_LIB       := $(INSTALL_PATH)/quip-$(USE_QUIP)/lib
   CFLAGS         += -I$(QUIP_INC)
   DFLAGS         += -D__QUIP
   LIBS           += $(QUIP_LIB)/libquip_core.a
   LIBS           += $(QUIP_LIB)/libatoms.a
   LIBS           += $(QUIP_LIB)/libFoX_sax.a
   LIBS           += $(QUIP_LIB)/libFoX_common.a
   LIBS           += $(QUIP_LIB)/libFoX_utils.a
   LIBS           += $(QUIP_LIB)/libFoX_fsys.a
endif

# LIBPEXSI: besides PEXSI itself this section pulls in the SuperLU_DIST and
# SCOTCH installations selected by SUPERLU_VER and SCOTCH_VER.
# The libraries are appended to LIBS in the order listed here.
ifneq ($(USE_LIBPEXSI),)
   USE_LIBPEXSI   := $(strip $(USE_LIBPEXSI))
   SCOTCH_VER     := $(strip $(SCOTCH_VER))
   SUPERLU_VER    := $(strip $(SUPERLU_VER))
   LIBPEXSI_INC   := $(INSTALL_PATH)/pexsi-$(USE_LIBPEXSI)/include
   LIBPEXSI_LIB   := $(INSTALL_PATH)/pexsi-$(USE_LIBPEXSI)/lib
   SCOTCH_INC     := $(INSTALL_PATH)/scotch-$(SCOTCH_VER)/include
   SCOTCH_LIB     := $(INSTALL_PATH)/scotch-$(SCOTCH_VER)/lib
   SUPERLU_INC    := $(INSTALL_PATH)/superlu_dist-$(SUPERLU_VER)/include
   SUPERLU_LIB    := $(INSTALL_PATH)/superlu_dist-$(SUPERLU_VER)/lib
   CFLAGS         += -I$(LIBPEXSI_INC) -I$(SCOTCH_INC) -I$(SUPERLU_INC)
   DFLAGS         += -D__LIBPEXSI
   LIBS           += $(LIBPEXSI_LIB)/libpexsi.a
   LIBS           += $(SUPERLU_LIB)/libsuperlu_dist.a
   LIBS           += $(SCOTCH_LIB)/libptscotchparmetis.a
   LIBS           += $(SCOTCH_LIB)/libptscotch.a
   LIBS           += $(SCOTCH_LIB)/libptscotcherr.a
   LIBS           += $(SCOTCH_LIB)/libscotchmetis.a
   LIBS           += $(SCOTCH_LIB)/libscotch.a
endif

# LIBVORI: add the define and the static library (no include path is needed here)
ifneq ($(USE_LIBVORI),)
   USE_LIBVORI    := $(strip $(USE_LIBVORI))
   LIBVORI_LIB    := $(INSTALL_PATH)/libvori-$(USE_LIBVORI)/lib
   DFLAGS         += -D__LIBVORI
   LIBS           += $(LIBVORI_LIB)/libvori.a
endif

# LIBXC: add the include path, the define, and the two static libraries
# (libxcf03.a is listed before libxc.a)
ifneq ($(USE_LIBXC),)
   USE_LIBXC      := $(strip $(USE_LIBXC))
   LIBXC_INC      := $(INSTALL_PATH)/libxc-$(USE_LIBXC)/include
   LIBXC_LIB      := $(INSTALL_PATH)/libxc-$(USE_LIBXC)/lib
   CFLAGS         += -I$(LIBXC_INC)
   DFLAGS         += -D__LIBXC
   LIBS           += $(LIBXC_LIB)/libxcf03.a
   LIBS           += $(LIBXC_LIB)/libxc.a
endif

# LIBINT: the install directory name contains both the version and LMAX,
# so LMAX must match the libint build that is present in INSTALL_PATH
ifneq ($(USE_LIBINT),)
   USE_LIBINT     := $(strip $(USE_LIBINT))
   LMAX           := $(strip $(LMAX))
   LIBINT_INC     := $(INSTALL_PATH)/libint-v$(USE_LIBINT)-cp2k-lmax-$(LMAX)/include
   LIBINT_LIB     := $(INSTALL_PATH)/libint-v$(USE_LIBINT)-cp2k-lmax-$(LMAX)/lib
   CFLAGS         += -I$(LIBINT_INC)
   DFLAGS         += -D__LIBINT
   LIBS           += $(LIBINT_LIB)/libint2.a
endif

# SPGLIB: add the include path, the define, and the static library libsymspg.a
ifneq ($(USE_SPGLIB),)
   USE_SPGLIB     := $(strip $(USE_SPGLIB))
   SPGLIB_INC     := $(INSTALL_PATH)/spglib-$(USE_SPGLIB)/include
   SPGLIB_LIB     := $(INSTALL_PATH)/spglib-$(USE_SPGLIB)/lib
   CFLAGS         += -I$(SPGLIB_INC)
   DFLAGS         += -D__SPGLIB
   LIBS           += $(SPGLIB_LIB)/libsymspg.a
endif

# LIBXSMM: add the include path, the define, and the two static libraries
# (libxsmmf.a is listed before libxsmm.a)
ifneq ($(USE_LIBXSMM),)
   USE_LIBXSMM    := $(strip $(USE_LIBXSMM))
   LIBXSMM_INC    := $(INSTALL_PATH)/libxsmm-$(USE_LIBXSMM)/include
   LIBXSMM_LIB    := $(INSTALL_PATH)/libxsmm-$(USE_LIBXSMM)/lib
   CFLAGS         += -I$(LIBXSMM_INC)
   DFLAGS         += -D__LIBXSMM
   LIBS           += $(LIBXSMM_LIB)/libxsmmf.a
   LIBS           += $(LIBXSMM_LIB)/libxsmm.a
endif

# SIRIUS: also enables HDF5, libvdwxc, SpFFT, and SpLA using the *_VER variables
# from the options above. With USE_ACC set to yes the "cuda" sub-directories of
# SpFFT, SpLA, and SIRIUS are used and -D__OFFLOAD_GEMM is defined; otherwise
# the plain lib/include directories are used.
ifneq ($(USE_SIRIUS),)
   USE_SIRIUS     := $(strip $(USE_SIRIUS))
   HDF5_VER       := $(strip $(HDF5_VER))
   HDF5_LIB       := $(INSTALL_PATH)/hdf5-$(HDF5_VER)/lib
   LIBVDWXC_VER   := $(strip $(LIBVDWXC_VER))
   LIBVDWXC_INC   := $(INSTALL_PATH)/libvdwxc-$(LIBVDWXC_VER)/include
   LIBVDWXC_LIB   := $(INSTALL_PATH)/libvdwxc-$(LIBVDWXC_VER)/lib
   SPFFT_VER      := $(strip $(SPFFT_VER))
   SPFFT_INC      := $(INSTALL_PATH)/SpFFT-$(SPFFT_VER)/include
   SPLA_VER       := $(strip $(SPLA_VER))
   SPLA_INC       := $(INSTALL_PATH)/SpLA-$(SPLA_VER)/include/spla
   ifeq ($(USE_ACC), yes)
      DFLAGS         += -D__OFFLOAD_GEMM
      SPFFT_LIB      := $(INSTALL_PATH)/SpFFT-$(SPFFT_VER)/lib/cuda
      SPLA_LIB       := $(INSTALL_PATH)/SpLA-$(SPLA_VER)/lib/cuda
      SIRIUS_INC     := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/include/cuda
      SIRIUS_LIB     := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/lib/cuda
   else
      SPFFT_LIB      := $(INSTALL_PATH)/SpFFT-$(SPFFT_VER)/lib
      SPLA_LIB       := $(INSTALL_PATH)/SpLA-$(SPLA_VER)/lib
      SIRIUS_INC     := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/include
      SIRIUS_LIB     := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/lib
   endif
   CFLAGS         += -I$(LIBVDWXC_INC)
   CFLAGS         += -I$(SPFFT_INC)
   CFLAGS         += -I$(SPLA_INC)
   CFLAGS         += -I$(SIRIUS_INC)
   DFLAGS         += -D__HDF5
   DFLAGS         += -D__LIBVDWXC
   DFLAGS         += -D__SPFFT
   DFLAGS         += -D__SPLA
   DFLAGS         += -D__SIRIUS
   LIBS           += $(SIRIUS_LIB)/libsirius.a
   LIBS           += $(SPLA_LIB)/libspla.a
   LIBS           += $(SPFFT_LIB)/libspfft.a
   LIBS           += $(LIBVDWXC_LIB)/libvdwxc.a
   LIBS           += $(HDF5_LIB)/libhdf5.a
endif

# COSMA: with USE_ACC set to yes the suffix "-cuda" is appended to the version,
# so the paths point to the COSMA-<version>-cuda installation directory
ifneq ($(USE_COSMA),)
   USE_COSMA      := $(strip $(USE_COSMA))
   ifeq ($(USE_ACC), yes)
      USE_COSMA      := $(USE_COSMA)-cuda
   endif
   COSMA_INC      := $(INSTALL_PATH)/COSMA-$(USE_COSMA)/include
   COSMA_LIB      := $(INSTALL_PATH)/COSMA-$(USE_COSMA)/lib
   CFLAGS         += -I$(COSMA_INC)
   DFLAGS         += -D__COSMA
   LIBS           += $(COSMA_LIB)/libcosma_prefixed_pxgemm.a
   LIBS           += $(COSMA_LIB)/libcosma.a
   LIBS           += $(COSMA_LIB)/libcosta.a
   LIBS           += $(COSMA_LIB)/libTiled-MM.a
endif

# GSL: USE_GSL is not one of the options at the top of this file; it is set by
# the PLUMED section above, so this section is active whenever PLUMED is enabled
ifneq ($(USE_GSL),)
   USE_GSL        := $(strip $(USE_GSL))
   GSL_INC        := $(INSTALL_PATH)/gsl-$(USE_GSL)/include
   GSL_LIB        := $(INSTALL_PATH)/gsl-$(USE_GSL)/lib
   CFLAGS         += -I$(GSL_INC)
   DFLAGS         += -D__GSL
   LIBS           += $(GSL_LIB)/libgsl.a
endif

# All preprocessor defines are appended to CFLAGS; CXXFLAGS and FCFLAGS are
# derived from CFLAGS below and therefore carry the defines and include paths too
CFLAGS         += $(DFLAGS)

CXXFLAGS       := $(CFLAGS) -std=c++11

# Flags for the offload compiler (OFFLOAD_CC).
# NOTE(review): the architecture sm_60 is hard-coded here and not derived from GPUVER.
OFFLOAD_FLAGS  := $(DFLAGS) -O3 -Xcompiler="-fopenmp" -arch sm_60 --std=c++11

FCFLAGS        := $(CFLAGS)
# -fallow-argument-mismatch is added only if the major version printed by
# "gcc -dumpversion" is greater than 9
ifeq ($(shell [ $(shell gcc -dumpversion | cut -d. -f1) -gt 9 ] && echo yes), yes)
   FCFLAGS        += -fallow-argument-mismatch
endif
FCFLAGS        += -fbacktrace
FCFLAGS        += -ffree-form
FCFLAGS        += -ffree-line-length-none
FCFLAGS        += -fno-omit-frame-pointer
FCFLAGS        += -std=f2008

# The linker flags reuse FCFLAGS; if CUDA_HOME is set, its lib64 directory is
# added to both the link-time search path and the run-time search path (rpath)
ifneq ($(CUDA_HOME),)
   CUDA_LIB       := $(CUDA_HOME)/lib64
   LDFLAGS        := $(FCFLAGS) -L$(CUDA_LIB) -Wl,-rpath=$(CUDA_LIB)
else
   LDFLAGS        := $(FCFLAGS)
endif

# Shared libraries appended after all static package libraries collected above
LIBS           += -lcusolver -lcudart -lnvrtc -lcuda -lcufft -lcublas -lrt
LIBS           += -lz -ldl -lpthread -lstdc++

# End
############### END ARCHITECTURE FILE ################


===== TESTS (description) =====
 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 RI-RPA/RI-MP2 correlation energy
 input file: benchmarks/QS_mp2_rpa/32-H2O/RI-RPA.inp
 required files: ['benchmarks/QS_mp2_rpa/32-H2O/BASIS_H2O', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32.xyz', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-PBE-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-RI-dRPA-TZ.inp']
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 15
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/01
 job id: 48544414
 --- Point ---
 name: 10
 plot: h2o_32_ri_rpa_mp2
 regex: Total RI-RPA Time= 
 label: RI-RPA (8n/2r/6t)
 --- Point ---
 name: 11
 plot: h2o_32_ri_rpa_mp2_mem
 regex: Estimated peak process memory 
 label: RI-RPA (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 RI-RPA/RI-MP2 correlation energy
 input file: benchmarks/QS_mp2_rpa/32-H2O/RI-MP2.inp
 required files: ['benchmarks/QS_mp2_rpa/32-H2O/BASIS_H2O', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32.xyz', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-PBE-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-HF-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-RI-MP2-TZ.inp']
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 15
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/02
 job id: 48544416
 --- Point ---
 name: 20
 plot: h2o_32_ri_rpa_mp2
 regex: Total MP2 Time= 
 label: RI-MP2 (8n/6r/2t)
 --- Point ---
 name: 21
 plot: h2o_32_ri_rpa_mp2_mem
 regex: Estimated peak process memory 
 label: RI-MP2 (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 12
 # threads/rank = 1
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/03
 job id: 48544419
 --- Point ---
 name: 100
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/12r/1t)
 --- Point ---
 name: 101
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/12r/1t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/04
 job id: 48544421
 --- Point ---
 name: 102
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/6r/2t)
 --- Point ---
 name: 103
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 4
 # threads/rank = 3
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/05
 job id: 48544423
 --- Point ---
 name: 104
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/4r/3t)
 --- Point ---
 name: 105
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/4r/3t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 3
 # threads/rank = 4
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/06
 job id: 48544425
 --- Point ---
 name: 106
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/3r/4t)
 --- Point ---
 name: 107
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/3r/4t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/07
 job id: 48544429
 --- Point ---
 name: 108
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/2r/6t)
 --- Point ---
 name: 109
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 1
 # threads/rank = 12
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/08
 job id: 48544431
 --- Point ---
 name: 110
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/1r/12t)
 --- Point ---
 name: 111
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/1r/12t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 12
 # threads/rank = 1
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/09
 job id: 48544435
 --- Point ---
 name: 200
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/12r/1t)
 --- Point ---
 name: 201
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/12r/1t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/10
 job id: 48544439
 --- Point ---
 name: 202
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/6r/2t)
 --- Point ---
 name: 203
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 4
 # threads/rank = 3
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/11
 job id: 48544445
 --- Point ---
 name: 204
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/4r/3t)
 --- Point ---
 name: 205
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/4r/3t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 3
 # threads/rank = 4
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/12
 job id: 48544451
 --- Point ---
 name: 206
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/3r/4t)
 --- Point ---
 name: 207
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/3r/4t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/13
 job id: 48544455
 --- Point ---
 name: 208
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/2r/6t)
 --- Point ---
 name: 209
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 1
 # threads/rank = 12
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/14
 job id: 48544459
 --- Point ---
 name: 210
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/1r/12t)
 --- Point ---
 name: 211
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/1r/12t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 12
 # threads/rank = 1
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/15
 job id: 48544465
 --- Point ---
 name: 400
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/12r/1t)
 --- Point ---
 name: 401
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/12r/1t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/16
 job id: 48544471
 --- Point ---
 name: 402
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/6r/2t)
 --- Point ---
 name: 403
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 4
 # threads/rank = 3
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/17
 job id: 48544473
 --- Point ---
 name: 404
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/4r/3t)
 --- Point ---
 name: 405
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/4r/3t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 3
 # threads/rank = 4
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/18
 job id: 48544475
 --- Point ---
 name: 406
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/3r/4t)
 --- Point ---
 name: 407
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/3r/4t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/19
 job id: 48544477
 --- Point ---
 name: 408
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/2r/6t)
 --- Point ---
 name: 409
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 1
 # threads/rank = 12
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/20
 job id: 48544479
 --- Point ---
 name: 410
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/1r/12t)
 --- Point ---
 name: 411
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/1r/12t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 12
 # threads/rank = 1
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/21
 job id: 48544480
 --- Point ---
 name: 500
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/12r/1t)
 --- Point ---
 name: 501
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/12r/1t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/22
 job id: 48544481
 --- Point ---
 name: 502
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/6r/2t)
 --- Point ---
 name: 503
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 4
 # threads/rank = 3
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/23
 job id: 48544482
 --- Point ---
 name: 504
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/4r/3t)
 --- Point ---
 name: 505
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/4r/3t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 3
 # threads/rank = 4
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/24
 job id: 48544484
 --- Point ---
 name: 506
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/3r/4t)
 --- Point ---
 name: 507
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/3r/4t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/25
 job id: 48544485
 --- Point ---
 name: 508
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/2r/6t)
 --- Point ---
 name: 509
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 1
 # threads/rank = 12
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/26
 job id: 48544486
 --- Point ---
 name: 510
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/1r/12t)
 --- Point ---
 name: 511
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/1r/12t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: 512 H2O (4 NVE MD steps on 64 nodes)
 input file: benchmarks/QS/00512_H2O/H2O-512_md.inp
 required files: []
 output file: result.log
 # nodes = 64
 # ranks/node = 12
 # threads/rank = 1
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/27
 job id: 48544488
 --- Point ---
 name: 601
 plot: h2o_512_md
 regex: CP2K  
 label: (64n/12r/1t)
 --- Point ---
 name: 602
 plot: h2o_512_md_mem
 regex: Estimated peak process memory 
 label: (64n/12r/1t)
 ~~~~~~~ END TEST ~~~~~~~

=== END TESTS (description) ===


===== PLOTS (description) =====
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_32_ri_rpa_mp2", title="32 H2O molecules (RI-MP2, RI-RPA)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_32_ri_rpa_mp2_mem", title="32 H2O molecules (RI-MP2, RI-RPA)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_64_md", title="64 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_64_md_mem", title="64 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_128_md", title="128 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_128_md_mem", title="128 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_256_md", title="256 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_256_md_mem", title="256 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_32_nrep3_ls", title="864 H2O molecules (LS SCF)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_32_nrep3_ls_mem", title="864 H2O molecules (LS SCF)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_512_md", title="512 H2O (4 NVE MD steps on 64 nodes)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_512_md_mem", title="512 H2O (4 NVE MD steps on 64 nodes)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
=== END PLOTS (description) ===


============ RESULTS ============
 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/01/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         0.000000E+00       0.0%      0.0%      0.0%
 flops max/rank                      0.000000E+00       0.0%      0.0%      0.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                                  0       0.0%      0.0%      0.0%
 number of processed stacks                     0       0.0%      0.0%      0.0%
 average stack size                                     0.0       0.0       0.0
 marketing flops                     0.000000E+00
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast                1                     12.
 MP_Allreduce           19                     21.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               15                 177869.
 MP_Allreduce          424                      8.
 MP_Sync                 3
 MP_comm_split           1
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.026    0.036  138.170  138.171
 farming_run                          1  2.0  137.691  137.692  138.125  138.129
 -------------------------------------------------------------------------------


 @@@@@@@@@@ Run number: 2 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32              4194304       0.0%      0.0%    100.0%
 flops    14 x    32 x    32            154140672       0.0%      0.0%    100.0%
 flops    29 x    32 x    32            159645696       0.0%      0.0%    100.0%
 flops    14 x    14 x    32            208732160       0.0%      0.0%    100.0%
 flops    29 x    14 x    32            212860928       0.0%      0.0%    100.0%
 flops    14 x    29 x    32            212860928       0.0%      0.0%    100.0%
 flops    29 x    29 x    32            227352576       0.0%      0.0%    100.0%
 flops    14 x    32 x    14         896801644032       0.0%      0.0%    100.0%
 flops    29 x    32 x    14         928925089792       0.0%      0.0%    100.0%
 flops    14 x    32 x    29         928925089792       0.0%      0.0%    100.0%
 flops    29 x    32 x    29         962100985856       0.0%      0.0%    100.0%
 flops    32 x    32 x    14        1693169221632       0.0%      0.0%    100.0%
 flops    32 x    32 x    29        1753639550976       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         7.164741E+12       0.0%      0.0%    100.0%
 flops max/rank                    447.801317E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          249492158       0.0%      0.0%    100.0%
 number of processed stacks                164328       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    1518.3
 marketing flops                     7.165779E+12
 -------------------------------------------------------------------------------
 # multiplications                           1160
 max memory usage/rank               1.488605E+09
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                    2592
 MPI messages size (bytes):
  total size                         1.140326E+09
  min size                           0.000000E+00
  max size                           1.663488E+06
  average size                     439.940750E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                 132                        0
       128 < size <=     8192                 348                  2850816
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                1536                179306496
    131072 < size <=  4194304                 576                958169088
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         2308                     54.
 MP_Alltoall          4670                 822215.
 MP_ISend             2604                  90577.
 MP_IRecv             2604                  90574.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group               12
 MP_Bcast              230                1134128.
 MP_Allreduce          571                1938539.
 MP_Sync                25
 MP_Alltoall            38                9316958.
 MP_SendRecv           120                 384007.
 MP_ISendRecv           45                 235435.
 MP_Wait               191
 MP_comm_split          10
 MP_ISend              127                3867574.
 MP_IRecv              127                3866554.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.009    0.030  116.644  116.644
 qs_energies                          1  2.0    0.000    0.000  116.434  116.436
 mp2_main                             1  3.0    0.000    0.000  113.834  113.837
 mp2_gpw_main                         1  4.0    0.028    0.034  112.514  112.516
 mp2_ri_gpw_compute_in                1  5.0    0.173    0.178   93.023   93.383
 mp2_ri_gpw_compute_in_loop           1  6.0    0.004    0.005   54.707   55.066
 mp2_eri_3c_integrate_gpw           272  7.0    0.151    0.163   41.089   46.151
 get_2c_integrals                     1  6.0    0.008    0.009   37.300   38.143
 integrate_v_rspace                 273  8.0    0.438    0.453   24.620   29.439
 pw_transfer                       6555 10.6    0.375    0.393   26.894   27.412
 fft_wrap_pw1pw2                   5465 11.4    0.044    0.046   25.505   25.981
 grid_integrate_task_list           273  9.0   20.520   25.799   20.520   25.799
 fft_wrap_pw1pw2_100               2178 12.4    1.123    1.343   23.048   23.523
 compute_2c_integrals                 1  7.0    0.010    0.012   19.577   19.579
 rpa_ri_compute_en                    1  5.0    0.025    0.027   19.371   19.507
 compute_2c_integrals_loop_lm         1  8.0    0.003    0.004   18.630   19.184
 mp2_eri_2c_integrate_gpw             1  9.0    2.359    2.404   18.627   19.181
 cp_fm_cholesky_decompose            12  8.2   17.926   18.796   17.926   18.796
 cholesky_decomp                      1  7.0    0.000    0.000   16.570   17.458
 fft3d_s                           5443 13.4   16.155   16.391   16.177   16.413
 ao_to_mo_and_store_B_mult_1        272  7.0   10.783   15.321   10.783   15.321
 calculate_wavefunction             272  8.0    5.421    5.575   12.363   13.053
 rpa_num_int                          1  6.0    0.001    0.011   10.974   10.975
 rpa_num_int_RPA_matrix_operati       8  7.0    0.000    0.000   10.941   10.960
 calc_mat_Q                           8  8.0    0.000    0.000    9.554    9.635
 contract_S_to_Q                      8  9.0    0.000    0.000    8.976    9.061
 calc_potential_gpw                 544  9.5    0.005    0.006    8.263    8.821
 parallel_gemm_fm                    14  9.1    0.000    0.000    8.570    8.677
 parallel_gemm_fm_cosma              14 10.1    8.570    8.677    8.570    8.677
 mp2_eri_2c_integrate_gpw_pot_l     272 10.0    0.001    0.001    8.102    8.466
 potential_pw2rs                    545 10.0    0.106    0.108    7.505    8.171
 create_integ_mat                     1  6.0    0.022    0.028    7.936    7.936
 collocate_single_gaussian          272 10.0    0.039    0.042    7.360    7.661
 array2fm                             1  7.0    0.000    0.000    6.658    7.162
 pw_scatter_s                      2720 13.7    4.342    4.458    4.342    4.458
 pw_gather_s                       2722 13.2    3.470    3.806    3.470    3.806
 array2fm_buffer_send                 1  8.0    2.899    3.111    2.899    3.111
 pw_poisson_solve                   545 10.5    1.116    1.172    2.212    2.503
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="10", plot="h2o_32_ri_rpa_mp2", label="RI-RPA (8n/2r/6t)", y=112.516264, yerr=0.000000
PlotPoint: name="11", plot="h2o_32_ri_rpa_mp2_mem", label="RI-RPA (8n/2r/6t)", y=2793.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/02/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         0.000000E+00       0.0%      0.0%      0.0%
 flops max/rank                      0.000000E+00       0.0%      0.0%      0.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                                  0       0.0%      0.0%      0.0%
 number of processed stacks                     0       0.0%      0.0%      0.0%
 average stack size                                     0.0       0.0       0.0
 marketing flops                     0.000000E+00
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast                1                     12.
 MP_Allreduce           19                     21.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               22                 205321.
 MP_Allreduce          424                      9.
 MP_Sync                 4
 MP_comm_split           1
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.029    0.038  420.265  420.266
 farming_run                          1  2.0  419.369  419.381  420.222  420.225
 -------------------------------------------------------------------------------


 @@@@@@@@@@ Run number: 2 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32             16777216       0.0%      0.0%    100.0%
 flops    14 x    32 x    32            565182464       0.0%      0.0%    100.0%
 flops    29 x    32 x    32            585367552       0.0%      0.0%    100.0%
 flops    14 x    14 x    32            626196480       0.0%      0.0%    100.0%
 flops    29 x    14 x    32            638582784       0.0%      0.0%    100.0%
 flops    14 x    29 x    32            638582784       0.0%      0.0%    100.0%
 flops    29 x    29 x    32            682057728       0.0%      0.0%    100.0%
 flops    14 x    32 x    14         897827128576       0.0%      0.0%    100.0%
 flops    29 x    32 x    14         929989394432       0.0%      0.0%    100.0%
 flops    14 x    32 x    29         929989394432       0.0%      0.0%    100.0%
 flops    29 x    32 x    29         963203301376       0.0%      0.0%    100.0%
 flops    32 x    32 x    14        1693481172992       0.0%      0.0%    100.0%
 flops    32 x    32 x    29        1753962643456       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         7.172206E+12       0.0%      0.0%    100.0%
 flops max/rank                    150.696064E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          249788821       0.0%      0.0%    100.0%
 number of processed stacks                 98736       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    2529.9
 marketing flops                     7.174951E+12
 -------------------------------------------------------------------------------
 # multiplications                           1140
 max memory usage/rank               1.242403E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                   61440
 MPI messages size (bytes):
  total size                         6.073508E+09
  min size                           0.000000E+00
  max size                         642.960000E+03
  average size                      98.852664E+03
 MPI breakdown and total messages size (bytes):
             size <=      128               32004                        0
       128 < size <=     8192                1820                 14909440
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072               18640               1081442304
    131072 < size <=  4194304                8976               4977156096
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         1003                     44.
 MP_Alltoall          1797                 713538.
 MP_ISend             3686                  54943.
 MP_IRecv             3622                  54292.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group               12
 MP_Bcast              757                 478553.
 MP_Allreduce         2021                  21391.
 MP_Sync                37
 MP_Alltoall            77
 MP_SendRecv          2876                2171486.
 MP_ISendRecv         1034                 172620.
 MP_Wait              1346
 MP_comm_split           7
 MP_ISend              264                 362227.
 MP_IRecv              264                 362718.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.012    0.034  217.153  217.153
 qs_energies                          1  2.0    0.000    0.000  216.934  216.941
 scf_env_do_scf                       1  3.0    0.000    0.000  114.940  114.940
 qs_ks_update_qs_env                  5  5.0    0.000    0.000  114.043  114.051
 rebuild_ks_matrix                    4  6.0    0.000    0.000  114.042  114.050
 qs_ks_build_kohn_sham_matrix         4  7.0    0.054    0.063  114.041  114.050
 hfx_ks_matrix                        4  8.0    0.001    0.001  113.697  113.701
 integrate_four_center                4  9.0    0.153    0.458  113.696  113.700
 integrate_four_center_main           4 10.0    0.146    0.606  102.221  104.914
 integrate_four_center_bin          265 11.0  102.074  104.912  102.074  104.912
 mp2_main                             1  3.0    0.000    0.000  101.703  101.710
 mp2_gpw_main                         1  4.0    0.047    0.071  100.862  100.872
 init_scf_loop                        1  4.0    0.000    0.000   97.053   97.053
 mp2_ri_gpw_compute_in                1  5.0    0.064    0.067   73.998   75.115
 mp2_ri_gpw_compute_in_loop           1  6.0    0.002    0.002   53.861   54.978
 mp2_eri_3c_integrate_gpw            91  7.0    0.143    0.160   41.400   46.479
 integrate_v_rspace                  95  8.0    0.396    0.567   27.869   32.767
 pw_transfer                       2240 10.6    0.147    0.162   29.448   29.825
 fft_wrap_pw1pw2                   1868 11.4    0.018    0.020   28.464   28.850
 mp2_ri_gpw_compute_en                1  5.0    0.055    0.064   26.707   28.423
 grid_integrate_task_list            95  9.0   23.293   28.423   23.293   28.423
 ao_to_mo_and_store_B_mult_1         91  7.0   10.781   28.109   10.781   28.109
 fft_wrap_pw1pw2_100                730 12.4    1.259    1.471   26.227   26.614
 mp2_ri_gpw_compute_en_RI_loop        1  6.0    1.842    1.905   24.931   24.941
 get_2c_integrals                     1  6.0    0.000    0.000   20.040   20.073
 compute_2c_integrals                 1  7.0    0.003    0.004   19.030   19.033
 fft3d_s                           1823 13.4   18.486   18.915   18.499   18.929
 compute_2c_integrals_loop_lm         1  8.0    0.001    0.002   18.541   18.907
 mp2_eri_2c_integrate_gpw             1  9.0    1.734    1.874   18.540   18.907
 scf_env_do_scf_inner_loop            4  4.0    0.000    0.001   17.886   17.886
 calculate_wavefunction              91  8.0    2.009    2.035    9.620    9.823
 mp2_ri_gpw_compute_en_expansio     172  7.0    0.555    0.574    8.821    9.138
 potential_pw2rs                    186 10.0    0.033    0.034    8.389    9.005
 local_gemm                         172  8.0    8.266    8.580    8.266    8.580
 mp2_eri_2c_integrate_gpw_pot_l      91 10.0    0.001    0.001    8.104    8.518
 mp2_ri_gpw_compute_en_comm          22  7.0    0.499    0.512    7.892    8.245
 mp_sync                             37 10.5    3.440    8.102    3.440    8.102
 calc_potential_gpw                 182  9.5    0.002    0.002    7.880    8.068
 collocate_single_gaussian           91 10.0    0.016    0.019    7.717    7.941
 integrate_four_center_load           4 10.0    0.000    0.000    6.782    6.786
 hfx_load_balance                     1 11.0    0.000    0.000    6.782    6.786
 mp2_ri_gpw_compute_en_ener         172  7.0    6.330    6.387    6.330    6.387
 mp_sendrecv_dm3                   2068  8.0    5.923    6.276    5.923    6.276
 pw_gather_s                        912 13.2    4.475    5.036    4.475    5.036
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="20", plot="h2o_32_ri_rpa_mp2", label="RI-MP2 (8n/6r/2t)", y=100.864213, yerr=0.000000
PlotPoint: name="21", plot="h2o_32_ri_rpa_mp2_mem", label="RI-MP2 (8n/6r/2t)", y=1502.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/03/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                     29.277748E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               5055360       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0      29.1
 marketing flops                     2.107592E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             451.944448E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 9436608
 MPI messages size (bytes):
  total size                       333.233553E+09
  min size                           0.000000E+00
  max size                         315.840000E+03
  average size                      35.312852E+03
 MPI breakdown and total messages size (bytes):
             size <=      128             4913240                        0
       128 < size <=     8192             1155432               9465298944
      8192 < size <=    32768             1984512              54190407680
     32768 < size <=   131072              551296              42776657920
    131072 < size <=  4194304              832128             226802306368
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3683                  62379.
 MP_Allreduce        10329                    270.
 MP_Sync               530
 MP_Alltoall          2083                 592243.
 MP_SendRecv         22610                   5520.
 MP_ISendRecv        22610                   5520.
 MP_Wait             37876
 MP_comm_split          50
 MP_ISend            20771                  42672.
 MP_IRecv            20771                  42672.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.051    0.140   55.506   55.510
 qs_mol_dyn_low                       1  2.0    0.016    0.040   54.785   54.946
 qs_forces                           11  3.9    0.003    0.004   54.567   54.569
 qs_energies                         11  4.9    0.003    0.012   53.090   53.104
 scf_env_do_scf                      11  5.9    0.000    0.001   47.116   47.117
 scf_env_do_scf_inner_loop          108  6.5    0.002    0.008   45.113   45.113
 qs_scf_new_mos                     108  7.5    0.000    0.000   35.204   35.500
 qs_scf_loop_do_ot                  108  8.5    0.000    0.001   35.204   35.499
 dbcsr_multiply_generic            2286 12.5    0.094    0.099   34.825   35.346
 ot_scf_mini                        108  9.5    0.002    0.002   33.461   33.641
 multiply_cannon                   2286 13.5    0.188    0.195   26.838   28.721
 multiply_cannon_loop              2286 14.5    1.836    1.945   26.204   28.107
 velocity_verlet                     10  3.0    0.001    0.002   26.772   26.773
 ot_mini                            108 10.5    0.001    0.002   20.660   20.908
 qs_ot_get_derivative               108 11.5    0.001    0.002   17.584   17.793
 mp_waitall_1                    245248 16.5    9.088   15.368    9.088   15.368
 multiply_cannon_metrocomm3       54864 15.5    0.073    0.080    6.457   13.437
 multiply_cannon_multrec          54864 15.5    3.621    5.740    7.557   10.915
 qs_ot_get_p                        119 10.4    0.001    0.001    8.055    8.339
 mp_sum_l                          7287 12.8    6.158    7.926    6.158    7.926
 rebuild_ks_matrix                  119  8.3    0.000    0.000    7.770    7.915
 qs_ks_build_kohn_sham_matrix       119  9.3    0.011    0.016    7.770    7.914
 qs_ks_update_qs_env                119  7.6    0.001    0.001    6.837    6.975
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    6.434    6.905
 multiply_cannon_sync_h2d         54864 15.5    5.103    6.281    5.103    6.281
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    5.664    5.777
 dbcsr_mm_accdrv_process          76910 16.1    1.829    2.870    3.850    5.478
 qs_ot_p2m_diag                      50 11.0    0.004    0.007    5.210    5.264
 init_scf_run                        11  5.9    0.000    0.001    4.589    4.589
 scf_env_initial_rho_setup           11  6.9    0.001    0.003    4.588    4.589
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    4.420    4.421
 sum_up_and_integrate               119 10.3    0.012    0.017    4.388    4.395
 integrate_v_rspace                 119 11.3    0.002    0.003    4.376    4.384
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    4.232    4.232
 qs_rho_update_rho_low              119  7.7    0.001    0.002    4.056    4.214
 calculate_rho_elec                 119  8.7    0.011    0.016    4.055    4.214
 cp_fm_redistribute_end              50 14.0    2.160    4.209    2.165    4.212
 cp_fm_diag_elpa_base                50 14.0    2.041    4.104    2.045    4.113
 apply_preconditioner_dbcsr         119 12.6    0.000    0.003    3.021    3.563
 apply_single                       119 13.6    0.000    0.000    3.021    3.563
 multiply_cannon_metrocomm1       54864 15.5    0.057    0.063    2.020    3.258
 calculate_dm_sparse                119  9.5    0.000    0.000    2.952    3.111
 acc_transpose_blocks             54864 15.5    0.229    0.262    2.304    3.006
 ot_diis_step                       108 11.5    0.015    0.051    2.797    2.798
 jit_kernel_multiply                 13 15.8    1.958    2.608    1.958    2.608
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    2.533    2.592
 calculate_first_density_matrix       1  7.0    0.000    0.002    2.425    2.429
 density_rs2pw                      119  9.7    0.004    0.004    2.085    2.211
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.005    2.195    2.196
 grid_integrate_task_list           119 12.3    2.021    2.123    2.021    2.123
 wfi_extrapolate                     11  7.9    0.001    0.003    2.095    2.095
 mp_sum_d                          4135 12.0    1.402    2.048    1.402    2.048
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    1.919    1.990
 init_scf_loop                       11  6.9    0.000    0.001    1.988    1.988
 potential_pw2rs                    119 12.3    0.004    0.004    1.762    1.774
 pw_transfer                       1439 11.6    0.052    0.059    1.601    1.669
 make_m2s                          4572 13.5    0.054    0.056    1.576    1.621
 fft_wrap_pw1pw2                   1201 12.6    0.007    0.007    1.526    1.597
 make_images                       4572 14.5    0.133    0.139    1.494    1.538
 mp_alltoall_d11v                  2130 13.8    1.282    1.519    1.282    1.519
 transfer_rs2pw                     487 10.6    0.005    0.006    1.391    1.506
 mp_waitany                       12084 13.8    1.312    1.473    1.312    1.473
 acc_transpose_blocks_sync       164592 16.5    1.210    1.452    1.210    1.452
 grid_collocate_task_list           119  9.7    1.345    1.405    1.345    1.405
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.378    1.402
 acc_transpose_blocks_kernels     54864 16.5    0.247    0.378    0.819    1.331
 fft3d_ps                          1201 14.6    0.369    0.475    1.240    1.309
 transfer_pw2rs                     487 13.2    0.006    0.007    1.293    1.301
 fft_wrap_pw1pw2_140                487 13.2    0.141    0.154    1.185    1.255
 dbcsr_dot_sd                      1205 11.9    0.048    0.059    0.790    1.192
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="100", plot="h2o_64_md", label="(8n/12r/1t)", y=55.510000, yerr=0.000000
PlotPoint: name="101", plot="h2o_64_md_mem", label="(8n/12r/1t)", y=431.272727, yerr=1.052349
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/04/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                     57.173320E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               3066240       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0      47.9
 marketing flops                     2.107592E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             488.550400E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 2194560
 MPI messages size (bytes):
  total size                       310.646604E+09
  min size                           0.000000E+00
  max size                           1.145520E+06
  average size                     141.553031E+03
 MPI breakdown and total messages size (bytes):
             size <=      128              724648                        0
       128 < size <=     8192              253512               2076770304
      8192 < size <=    32768              281952               4619501568
     32768 < size <=   131072              494448              39143342080
    131072 < size <=  4194304              440000             264807943488
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3672                  62658.
 MP_Allreduce        10306                    303.
 MP_Sync                54
 MP_Alltoall          2060                 922657.
 MP_SendRecv         16779                  37093.
 MP_ISendRecv        16779                  37093.
 MP_Wait             23539
 MP_comm_split          50
 MP_ISend             5720                 128509.
 MP_IRecv             5720                 128509.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.028    0.051   40.140   40.147
 qs_mol_dyn_low                       1  2.0    0.003    0.003   39.771   39.809
 qs_forces                           11  3.9    0.003    0.004   39.446   39.447
 qs_energies                         11  4.9    0.002    0.002   37.774   37.780
 scf_env_do_scf                      11  5.9    0.000    0.001   32.648   32.649
 scf_env_do_scf_inner_loop          108  6.5    0.003    0.007   30.160   30.160
 dbcsr_multiply_generic            2286 12.5    0.100    0.102   22.927   23.281
 qs_scf_new_mos                     108  7.5    0.001    0.001   21.544   21.771
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   21.543   21.771
 ot_scf_mini                        108  9.5    0.002    0.003   20.628   20.793
 multiply_cannon                   2286 13.5    0.209    0.218   17.302   19.055
 velocity_verlet                     10  3.0    0.002    0.003   18.595   18.602
 multiply_cannon_loop              2286 14.5    1.195    1.266   16.135   18.086
 ot_mini                            108 10.5    0.001    0.001   13.060   13.292
 mp_waitall_1                    200699 16.5    5.794   11.051    5.794   11.051
 qs_ot_get_derivative               108 11.5    0.001    0.001   10.686   10.855
 multiply_cannon_metrocomm3       27432 15.5    0.071    0.074    4.310    9.829
 multiply_cannon_multrec          27432 15.5    1.803    3.993    6.187    9.056
 rebuild_ks_matrix                  119  8.3    0.000    0.000    6.856    6.995
 qs_ks_build_kohn_sham_matrix       119  9.3    0.012    0.014    6.856    6.995
 dbcsr_mm_accdrv_process          47894 16.0    3.534    6.244    4.302    6.853
 qs_ks_update_qs_env                119  7.6    0.001    0.001    6.037    6.164
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    4.055    5.484
 mp_sum_l                          7287 12.8    2.614    4.958    2.614    4.958
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    3.589    4.953
 apply_single                       119 13.6    0.000    0.000    3.589    4.952
 qs_ot_get_p                        119 10.4    0.001    0.001    4.648    4.877
 init_scf_run                        11  5.9    0.000    0.001    3.932    3.932
 scf_env_initial_rho_setup           11  6.9    0.001    0.001    3.931    3.932
 sum_up_and_integrate               119 10.3    0.025    0.028    3.705    3.710
 integrate_v_rspace                 119 11.3    0.002    0.003    3.680    3.688
 qs_rho_update_rho_low              119  7.7    0.001    0.001    3.590    3.616
 calculate_rho_elec                 119  8.7    0.021    0.024    3.589    3.616
 qs_ot_p2m_diag                      50 11.0    0.009    0.013    3.004    3.022
 make_m2s                          4572 13.5    0.052    0.054    2.577    2.898
 make_images                       4572 14.5    0.208    0.250    2.490    2.814
 acc_transpose_blocks             27432 15.5    0.113    0.120    1.710    2.777
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    2.607    2.607
 init_scf_loop                       11  6.9    0.001    0.001    2.467    2.467
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.424    2.426
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    2.315    2.402
 ot_diis_step                       108 11.5    0.011    0.011    2.326    2.326
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    2.291    2.291
 cp_fm_redistribute_end              50 14.0    1.162    2.266    1.165    2.269
 multiply_cannon_sync_h2d         27432 15.5    1.708    2.246    1.708    2.246
 cp_fm_diag_elpa_base                50 14.0    1.072    2.175    1.100    2.210
 jit_kernel_multiply                  9 16.4    0.710    2.201    0.710    2.201
 calculate_dm_sparse                119  9.5    0.000    0.001    2.094    2.171
 density_rs2pw                      119  9.7    0.004    0.004    1.888    1.992
 grid_integrate_task_list           119 12.3    1.833    1.938    1.833    1.938
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    1.932    1.934
 pw_transfer                       1439 11.6    0.065    0.068    1.832    1.861
 fft_wrap_pw1pw2                   1201 12.6    0.008    0.008    1.741    1.772
 acc_transpose_blocks_kernels     27432 16.5    0.188    0.274    0.755    1.722
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    1.673    1.714
 make_images_data                  4572 15.5    0.047    0.054    1.231    1.566
 prepare_preconditioner              11  7.9    0.000    0.000    1.517    1.543
 make_preconditioner                 11  8.9    0.000    0.000    1.517    1.543
 make_full_inverse_cholesky          11  9.9    0.000    0.000    1.420    1.475
 jit_kernel_transpose                 5 15.5    0.567    1.454    0.567    1.454
 potential_pw2rs                    119 12.3    0.006    0.006    1.442    1.452
 wfi_extrapolate                     11  7.9    0.001    0.001    1.450    1.450
 hybrid_alltoall_any               4725 16.4    0.054    0.116    1.098    1.445
 mp_alltoall_d11v                  2130 13.8    1.214    1.410    1.214    1.410
 fft_wrap_pw1pw2_140                487 13.2    0.160    0.168    1.372    1.405
 fft3d_ps                          1201 14.6    0.516    0.571    1.377    1.403
 grid_collocate_task_list           119  9.7    1.289    1.351    1.289    1.351
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.288    1.297
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    1.203    1.252
 transfer_rs2pw                     487 10.6    0.005    0.005    1.068    1.140
 mp_sum_d                          4135 12.0    0.591    1.084    0.591    1.084
 mp_allgather_i34                  2286 14.5    0.591    0.987    0.591    0.987
 acc_transpose_blocks_sync        82296 16.5    0.817    0.943    0.817    0.943
 qs_energies_init_hamiltonians       11  5.9    0.000    0.001    0.926    0.927
 transfer_pw2rs                     487 13.2    0.005    0.006    0.897    0.901
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    0.875    0.888
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="102", plot="h2o_64_md", label="(8n/6r/2t)", y=40.147000, yerr=0.000000
PlotPoint: name="103", plot="h2o_64_md_mem", label="(8n/6r/2t)", y=465.181818, yerr=1.113404
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/05/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                     59.051995E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               3143552       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0      46.8
 marketing flops                     2.107587E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             519.483392E+06
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                  950976
 MPI messages size (bytes):
  total size                       203.844256E+09
  min size                           0.000000E+00
  max size                           1.638400E+06
  average size                     214.352688E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                6424                        0
       128 < size <=     8192              253512               2076770304
      8192 < size <=    32768              179424               2939682816
     32768 < size <=   131072              181440              14863564800
    131072 < size <=  4194304              330176             183964913216
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3622                  63490.
 MP_Allreduce        10155                    305.
 MP_Sync                54
 MP_Alltoall          1821                1607811.
 MP_SendRecv         11067                  57667.
 MP_ISendRecv        11067                  57667.
 MP_Wait             21987
 MP_ISend             9880                  92618.
 MP_IRecv             9880                  92618.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.028    0.031   32.899   32.899
 qs_mol_dyn_low                       1  2.0    0.003    0.003   32.606   32.614
 qs_forces                           11  3.9    0.003    0.004   32.392   32.393
 qs_energies                         11  4.9    0.001    0.001   30.851   30.853
 scf_env_do_scf                      11  5.9    0.000    0.001   25.519   25.519
 scf_env_do_scf_inner_loop          108  6.5    0.002    0.006   23.129   23.129
 dbcsr_multiply_generic            2286 12.5    0.095    0.096   17.515   17.612
 qs_scf_new_mos                     108  7.5    0.001    0.001   15.439   15.458
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   15.438   15.458
 velocity_verlet                     10  3.0    0.002    0.003   15.272   15.278
 multiply_cannon                   2286 13.5    0.193    0.198   14.278   15.137
 ot_scf_mini                        108  9.5    0.002    0.002   14.681   14.698
 multiply_cannon_loop              2286 14.5    0.862    0.914   13.504   14.393
 ot_mini                            108 10.5    0.001    0.001    9.049    9.068
 multiply_cannon_multrec          18288 15.5    1.872    2.836    7.478    7.779
 qs_ot_get_derivative               108 11.5    0.001    0.001    7.564    7.579
 dbcsr_mm_accdrv_process          38222 16.0    5.455    6.400    5.513    6.465
 rebuild_ks_matrix                  119  8.3    0.000    0.000    5.958    5.973
 qs_ks_build_kohn_sham_matrix       119  9.3    0.012    0.013    5.957    5.973
 qs_ks_update_qs_env                119  7.6    0.001    0.001    5.247    5.262
 init_scf_run                        11  5.9    0.000    0.001    4.174    4.174
 scf_env_initial_rho_setup           11  6.9    0.001    0.002    4.174    4.174
 mp_waitall_1                    158411 16.6    2.676    3.678    2.676    3.678
 sum_up_and_integrate               119 10.3    0.030    0.031    3.559    3.564
 integrate_v_rspace                 119 11.3    0.002    0.003    3.529    3.537
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    2.844    3.448
 qs_ot_get_p                        119 10.4    0.001    0.001    3.400    3.426
 qs_rho_update_rho_low              119  7.7    0.001    0.001    3.286    3.291
 calculate_rho_elec                 119  8.7    0.031    0.031    3.285    3.290
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.951    2.953
 calculate_dm_sparse                119  9.5    0.000    0.000    2.428    2.438
 init_scf_loop                       11  6.9    0.000    0.001    2.373    2.374
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    2.002    2.351
 apply_single                       119 13.6    0.000    0.000    2.002    2.350
 qs_ot_p2m_diag                      50 11.0    0.012    0.013    2.274    2.282
 multiply_cannon_metrocomm3       18288 15.5    0.047    0.048    1.476    2.249
 density_rs2pw                      119  9.7    0.004    0.004    1.854    1.980
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    1.974    1.975
 make_m2s                          4572 13.5    0.044    0.045    1.794    1.940
 grid_integrate_task_list           119 12.3    1.804    1.905    1.804    1.905
 make_images                       4572 14.5    0.194    0.207    1.710    1.854
 pw_transfer                       1439 11.6    0.065    0.069    1.820    1.831
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    1.778    1.779
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.767    1.770
 acc_transpose_blocks             18288 15.5    0.080    0.082    1.653    1.753
 fft_wrap_pw1pw2                   1201 12.6    0.008    0.008    1.728    1.741
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    1.720    1.732
 cp_fm_diag_elpa_base                50 14.0    1.696    1.709    1.718    1.731
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    1.718    1.727
 prepare_preconditioner              11  7.9    0.000    0.000    1.625    1.627
 make_preconditioner                 11  8.9    0.000    0.000    1.625    1.627
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    1.591    1.596
 make_full_inverse_cholesky          11  9.9    0.000    0.000    1.486    1.566
 ot_diis_step                       108 11.5    0.011    0.011    1.466    1.467
 mp_sum_l                          7287 12.8    1.059    1.420    1.059    1.420
 fft_wrap_pw1pw2_140                487 13.2    0.212    0.218    1.386    1.398
 potential_pw2rs                    119 12.3    0.007    0.008    1.342    1.346
 grid_collocate_task_list           119  9.7    1.234    1.325    1.234    1.325
 fft3d_ps                          1201 14.6    0.526    0.542    1.296    1.308
 multiply_cannon_sync_h2d         18288 15.5    1.048    1.200    1.048    1.200
 wfi_extrapolate                     11  7.9    0.001    0.001    1.171    1.171
 transfer_rs2pw                     487 10.6    0.005    0.005    1.008    1.141
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    0.943    0.961
 qs_energies_init_hamiltonians       11  5.9    0.000    0.001    0.936    0.937
 make_images_data                  4572 15.5    0.047    0.051    0.756    0.923
 hybrid_alltoall_any               4725 16.4    0.058    0.115    0.651    0.858
 acc_transpose_blocks_sync        54864 16.5    0.740    0.836    0.740    0.836
 acc_transpose_blocks_kernels     18288 16.5    0.217    0.223    0.815    0.824
 transfer_pw2rs                     487 13.2    0.004    0.004    0.818    0.821
 multiply_cannon_metrocomm1       18288 15.5    0.030    0.031    0.463    0.797
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    0.780    0.781
 mp_alltoall_d11v                  2130 13.8    0.664    0.776    0.664    0.776
 build_core_hamiltonian_matrix_      11  4.9    0.000    0.000    0.676    0.725
 cp_fm_cholesky_invert               11 10.9    0.697    0.701    0.697    0.701
 mp_alltoall_z22v                  1201 16.6    0.618    0.681    0.618    0.681
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="104", plot="h2o_64_md", label="(8n/4r/3t)", y=32.899000, yerr=0.000000
PlotPoint: name="105", plot="h2o_64_md_mem", label="(8n/4r/3t)", y=495.000000, yerr=1.044466
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/06/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                    114.044384E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               3805952       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0      38.6
 marketing flops                     2.107592E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             558.137344E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 1042416
 MPI messages size (bytes):
  total size                       150.443262E+09
  min size                           0.000000E+00
  max size                           1.188816E+06
  average size                     144.321719E+03
 MPI breakdown and total messages size (bytes):
             size <=      128              228256                        0
       128 < size <=     8192              126888               1039466496
      8192 < size <=    32768              191472               3137077248
     32768 < size <=   131072              295800              25899827200
    131072 < size <=  4194304              200000             120367247040
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3622                  63489.
 MP_Allreduce        10154                    346.
 MP_Sync                54
 MP_Alltoall          1582                2412273.
 MP_SendRecv          8211                  74133.
 MP_ISendRecv         8211                  74133.
 MP_Wait             16271
 MP_ISend             7280                 135929.
 MP_IRecv             7280                 135929.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.015    0.033   36.678   36.679
 qs_mol_dyn_low                       1  2.0    0.003    0.003   36.500   36.509
 qs_forces                           11  3.9    0.002    0.003   36.442   36.443
 qs_energies                         11  4.9    0.002    0.002   34.735   34.740
 scf_env_do_scf                      11  5.9    0.000    0.001   29.204   29.206
 scf_env_do_scf_inner_loop          108  6.5    0.003    0.007   25.867   25.867
 dbcsr_multiply_generic            2286 12.5    0.098    0.101   20.290   20.415
 velocity_verlet                     10  3.0    0.002    0.002   18.283   18.285
 qs_scf_new_mos                     108  7.5    0.001    0.001   17.900   17.953
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   17.899   17.953
 multiply_cannon                   2286 13.5    0.217    0.226   16.601   16.962
 ot_scf_mini                        108  9.5    0.002    0.003   16.867   16.920
 multiply_cannon_loop              2286 14.5    1.552    1.626   15.672   15.977
 ot_mini                            108 10.5    0.001    0.001   10.375   10.440
 multiply_cannon_multrec          27432 15.5    2.464    3.156    9.151    9.581
 qs_ot_get_derivative               108 11.5    0.001    0.001    8.516    8.570
 dbcsr_mm_accdrv_process          47916 15.9    5.987    7.658    6.585    8.090
 rebuild_ks_matrix                  119  8.3    0.000    0.000    6.287    6.346
 qs_ks_build_kohn_sham_matrix       119  9.3    0.012    0.013    6.287    6.346
 qs_ks_update_qs_env                119  7.6    0.001    0.001    5.568    5.621
 init_scf_run                        11  5.9    0.000    0.001    4.197    4.198
 scf_env_initial_rho_setup           11  6.9    0.001    0.001    4.197    4.197
 qs_ot_get_p                        119 10.4    0.001    0.001    3.644    3.721
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    3.116    3.521
 sum_up_and_integrate               119 10.3    0.035    0.037    3.469    3.478
 integrate_v_rspace                 119 11.3    0.003    0.004    3.434    3.443
 init_scf_loop                       11  6.9    0.001    0.001    3.319    3.319
 qs_rho_update_rho_low              119  7.7    0.001    0.001    3.282    3.313
 calculate_rho_elec                 119  8.7    0.040    0.046    3.282    3.312
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.818    2.821
 mp_waitall_1                    137007 16.6    2.056    2.641    2.056    2.641
 acc_transpose_blocks             27432 15.5    0.121    0.126    2.442    2.553
 prepare_preconditioner              11  7.9    0.000    0.000    2.487    2.495
 make_preconditioner                 11  8.9    0.000    0.000    2.487    2.495
 make_full_inverse_cholesky          11  9.9    0.000    0.000    2.099    2.424
 make_m2s                          4572 13.5    0.054    0.056    2.280    2.385
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    2.346    2.347
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    2.318    2.346
 make_images                       4572 14.5    0.288    0.372    2.172    2.275
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    2.106    2.246
 apply_single                       119 13.6    0.000    0.000    2.106    2.246
 qs_ot_p2m_diag                      50 11.0    0.016    0.023    2.234    2.245
 calculate_dm_sparse                119  9.5    0.000    0.000    2.145    2.204
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.007    2.028
 pw_transfer                       1439 11.6    0.065    0.069    1.894    1.928
 grid_integrate_task_list           119 12.3    1.836    1.915    1.836    1.915
 fft_wrap_pw1pw2                   1201 12.6    0.008    0.008    1.802    1.839
 density_rs2pw                      119  9.7    0.004    0.004    1.748    1.822
 ot_diis_step                       108 11.5    0.012    0.012    1.819    1.819
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    1.792    1.792
 multiply_cannon_metrocomm3       27432 15.5    0.040    0.042    0.971    1.722
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    1.554    1.563
 cp_fm_diag_elpa_base                50 14.0    1.520    1.537    1.552    1.560
 acc_transpose_blocks_sync        82296 16.5    1.453    1.555    1.453    1.555
 fft_wrap_pw1pw2_140                487 13.2    0.246    0.259    1.491    1.530
 fft3d_ps                          1201 14.6    0.554    0.604    1.320    1.346
 jit_kernel_multiply                  6 16.4    0.532    1.346    0.532    1.346
 grid_collocate_task_list           119  9.7    1.248    1.342    1.248    1.342
 wfi_extrapolate                     11  7.9    0.001    0.001    1.325    1.325
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    1.286    1.304
 potential_pw2rs                    119 12.3    0.008    0.009    1.252    1.256
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.238    1.247
 mp_sum_l                          7287 12.8    0.984    1.244    0.984    1.244
 dbcsr_complete_redistribute        329 12.2    0.225    0.314    0.921    1.190
 cp_fm_upper_to_full                 72 14.2    0.806    1.151    0.806    1.151
 qs_energies_init_hamiltonians       11  5.9    0.001    0.001    1.074    1.075
 copy_fm_to_dbcsr                   176 11.2    0.001    0.001    0.720    0.984
 make_images_data                  4572 15.5    0.047    0.051    0.849    0.974
 transfer_rs2pw                     487 10.6    0.005    0.005    0.868    0.953
 multiply_cannon_metrocomm1       27432 15.5    0.035    0.037    0.289    0.921
 hybrid_alltoall_any               4725 16.4    0.065    0.154    0.717    0.913
 build_core_hamiltonian_matrix_      11  4.9    0.000    0.001    0.801    0.880
 acc_transpose_blocks_kernels     27432 16.5    0.270    0.276    0.840    0.855
 mp_alltoall_i22                    627 13.8    0.431    0.827    0.431    0.827
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    0.807    0.812
 mp_alltoall_d11v                  2130 13.8    0.693    0.812    0.693    0.812
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="106", plot="h2o_64_md", label="(8n/3r/4t)", y=36.679000, yerr=0.000000
PlotPoint: name="107", plot="h2o_64_md_mem", label="(8n/3r/4t)", y=529.181818, yerr=4.130115
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/07/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                    117.977176E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               1384136       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     106.2
 marketing flops                     2.107587E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             621.228032E+06
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                  219456
 MPI messages size (bytes):
  total size                        97.042514E+09
  min size                           0.000000E+00
  max size                           3.276800E+06
  average size                     442.195750E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                1452                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768              101892               3336634368
     32768 < size <=   131072                   0                        0
    131072 < size <=  4194304              116112              93705670464
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         8156                     20.
 MP_Alltoall          8655                  64935.
 MP_ISend            36532                 168375.
 MP_IRecv            36532                 168349.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3622                  63488.
 MP_Allreduce        10154                    346.
 MP_Sync                54
 MP_Alltoall          1582                3682667.
 MP_SendRecv          5355                  94533.
 MP_ISendRecv         5355                  94533.
 MP_Wait             11335
 MP_ISend             5200                 225425.
 MP_IRecv             5200                 225425.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.027    0.033   29.955   29.956
 qs_mol_dyn_low                       1  2.0    0.003    0.003   29.707   29.715
 qs_forces                           11  3.9    0.002    0.003   29.143   29.144
 qs_energies                         11  4.9    0.001    0.002   27.452   27.454
 scf_env_do_scf                      11  5.9    0.000    0.001   21.926   21.927
 scf_env_do_scf_inner_loop          108  6.5    0.003    0.007   19.505   19.506
 velocity_verlet                     10  3.0    0.002    0.002   15.060   15.095
 dbcsr_multiply_generic            2286 12.5    0.092    0.096   13.962   14.044
 qs_scf_new_mos                     108  7.5    0.001    0.001   11.947   11.972
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   11.947   11.972
 multiply_cannon                   2286 13.5    0.223    0.228   11.396   11.839
 ot_scf_mini                        108  9.5    0.002    0.002   11.238   11.263
 multiply_cannon_loop              2286 14.5    0.645    0.668   10.479   10.680
 multiply_cannon_multrec           9144 15.5    1.823    2.169    6.688    7.130
 ot_mini                            108 10.5    0.001    0.001    6.580    6.612
 rebuild_ks_matrix                  119  8.3    0.000    0.000    5.734    5.758
 qs_ks_build_kohn_sham_matrix       119  9.3    0.013    0.013    5.733    5.758
 qs_ot_get_derivative               108 11.5    0.001    0.001    5.302    5.326
 qs_ks_update_qs_env                119  7.6    0.001    0.001    5.102    5.124
 dbcsr_mm_accdrv_process          12550 15.8    3.930    4.809    4.755    4.955
 init_scf_run                        11  5.9    0.000    0.001    4.087    4.087
 scf_env_initial_rho_setup           11  6.9    0.001    0.001    4.087    4.087
 sum_up_and_integrate               119 10.3    0.037    0.041    3.423    3.428
 integrate_v_rspace                 119 11.3    0.003    0.003    3.386    3.390
 qs_rho_update_rho_low              119  7.7    0.001    0.001    3.296    3.308
 calculate_rho_elec                 119  8.7    0.060    0.061    3.296    3.308
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.924    2.926
 qs_ot_get_p                        119 10.4    0.001    0.001    2.782    2.819
 init_scf_loop                       11  6.9    0.001    0.001    2.402    2.403
 calculate_dm_sparse                119  9.5    0.000    0.000    2.325    2.343
 mp_waitall_1                    115863 16.7    1.557    2.082    1.557    2.082
 pw_transfer                       1439 11.6    0.065    0.068    1.948    1.959
 make_m2s                          4572 13.5    0.034    0.035    1.789    1.957
 grid_integrate_task_list           119 12.3    1.864    1.928    1.864    1.928
 make_images                       4572 14.5    0.273    0.305    1.701    1.867
 fft_wrap_pw1pw2                   1201 12.6    0.008    0.008    1.855    1.867
 qs_ot_p2m_diag                      50 11.0    0.022    0.023    1.804    1.807
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    1.772    1.790
 density_rs2pw                      119  9.7    0.003    0.004    1.690    1.778
 jit_kernel_multiply                  8 15.7    0.785    1.763    0.785    1.763
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    1.730    1.731
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.713    1.720
 prepare_preconditioner              11  7.9    0.000    0.000    1.676    1.680
 make_preconditioner                 11  8.9    0.000    0.000    1.676    1.680
 make_full_inverse_cholesky          11  9.9    0.000    0.000    1.566    1.593
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    1.562    1.563
 fft_wrap_pw1pw2_140                487 13.2    0.322    0.333    1.529    1.542
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    1.524    1.533
 acc_transpose_blocks              9144 15.5    0.041    0.042    1.473    1.504
 grid_collocate_task_list           119  9.7    1.297    1.373    1.297    1.373
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    1.342    1.356
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    1.293    1.301
 cp_fm_diag_elpa_base                50 14.0    1.266    1.283    1.292    1.300
 fft3d_ps                          1201 14.6    0.558    0.569    1.269    1.276
 ot_diis_step                       108 11.5    0.013    0.013    1.266    1.266
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    1.205    1.242
 apply_single                       119 13.6    0.000    0.000    1.205    1.241
 qs_energies_init_hamiltonians       11  5.9    0.001    0.001    1.228    1.229
 potential_pw2rs                    119 12.3    0.010    0.011    1.208    1.211
 wfi_extrapolate                     11  7.9    0.001    0.001    1.112    1.113
 hybrid_alltoall_any               4725 16.4    0.065    0.174    0.747    0.986
 make_images_data                  4572 15.5    0.042    0.045    0.774    0.961
 multiply_cannon_metrocomm3        9144 15.5    0.020    0.021    0.437    0.941
 build_core_hamiltonian_matrix_      11  4.9    0.000    0.000    0.875    0.920
 cp_fm_cholesky_invert               11 10.9    0.846    0.849    0.846    0.849
 transfer_rs2pw                     487 10.6    0.004    0.004    0.773    0.848
 mp_alltoall_d11v                  2130 13.8    0.765    0.834    0.765    0.834
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    0.816    0.823
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    0.753    0.756
 acc_transpose_blocks_sync        27432 16.5    0.722    0.752    0.722    0.752
 qs_env_update_s_mstruct             11  6.9    0.001    0.002    0.675    0.723
 acc_transpose_blocks_kernels      9144 16.5    0.117    0.119    0.694    0.696
 mp_allgather_i34                  2286 14.5    0.235    0.661    0.235    0.661
 transfer_pw2rs                     487 13.2    0.003    0.004    0.648    0.651
 multiply_cannon_metrocomm1        9144 15.5    0.023    0.024    0.377    0.639
 mp_alltoall_z22v                  1201 16.6    0.584    0.613    0.584    0.613
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="108", plot="h2o_64_md", label="(8n/2r/6t)", y=29.956000, yerr=0.000000
PlotPoint: name="109", plot="h2o_64_md_mem", label="(8n/2r/6t)", y=589.818182, yerr=4.386305
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/08/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                    235.585836E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               1388964       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     105.8
 marketing flops                     2.107587E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             777.605120E+06
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                   91440
 MPI messages size (bytes):
  total size                        85.748679E+09
  min size                           0.000000E+00
  max size                           6.553600E+06
  average size                     937.758938E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                 572                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768               21148                692256768
     32768 < size <=   131072               19224               1259864064
    131072 < size <=  4194304               41040              21941452800
   4194304 < size <= 16777216                9456              61855174464
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3622                  63723.
 MP_Allreduce        10154                    429.
 MP_Sync                54
 MP_Alltoall          1582                7383731.
 MP_SendRecv          2499                 189067.
 MP_ISendRecv         2499                 189067.
 MP_Wait              6399
 MP_ISend             3120                 546875.
 MP_IRecv             3120                 546875.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.019    0.039  104.659  104.660
 qs_mol_dyn_low                       1  2.0    0.003    0.003  104.429  104.456
 velocity_verlet                     10  3.0    0.002    0.002   78.114   85.895
 mp_sum_dm                          438  4.9   54.485   62.279   54.485   62.279
 md_output                           10  3.0    0.000    0.000    7.784   62.253
 md_write_output                     11  3.9    0.008    0.060    7.783   62.251
 update_particle_set                 20  4.0    0.000    0.000   54.466   62.247
 write_trajectory                    44  4.9    0.001    0.007    7.774   62.191
 write_particle_coordinates          11  5.9    7.773   62.184    7.773   62.184
 qs_forces                           11  3.9    0.002    0.003   42.161   42.163
 qs_energies                         11  4.9    0.001    0.002   40.160   40.163
 scf_env_do_scf                      11  5.9    0.001    0.001   34.332   34.332
 scf_env_do_scf_inner_loop          108  6.5    0.003    0.007   26.639   26.642
 dbcsr_multiply_generic            2286 12.5    0.099    0.102   19.065   19.117
 qs_scf_new_mos                     108  7.5    0.001    0.001   17.325   17.427
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   17.324   17.427
 ot_scf_mini                        108  9.5    0.002    0.002   16.172   16.272
 multiply_cannon                   2286 13.5    0.296    0.299   15.251   15.880
 multiply_cannon_loop              2286 14.5    0.871    0.894   14.014   14.674
 ot_mini                            108 10.5    0.001    0.001    9.975   10.091
 multiply_cannon_multrec           9144 15.5    3.505    5.002    8.854    8.967
 qs_ot_get_derivative               108 11.5    0.001    0.001    7.945    8.045
 init_scf_loop                       11  6.9    0.001    0.001    7.667    7.670
 rebuild_ks_matrix                  119  8.3    0.000    0.000    7.073    7.209
 qs_ks_build_kohn_sham_matrix       119  9.3    0.013    0.014    7.072    7.209
 prepare_preconditioner              11  7.9    0.000    0.000    6.713    6.726
 make_preconditioner                 11  8.9    0.000    0.000    6.713    6.725
 make_full_inverse_cholesky          11  9.9    0.000    0.000    5.274    6.608
 dbcsr_mm_accdrv_process          12550 15.8    4.660    6.523    5.213    6.556
 qs_ks_update_qs_env                119  7.6    0.001    0.001    6.370    6.495
 cp_fm_upper_to_full                 72 14.2    3.171    4.555    3.171    4.555
 qs_rho_update_rho_low              119  7.7    0.001    0.001    4.034    4.106
 calculate_rho_elec                 119  8.7    0.118    0.121    4.034    4.105
 sum_up_and_integrate               119 10.3    0.064    0.066    3.793    3.799
 init_scf_run                        11  5.9    0.000    0.001    3.773    3.773
 scf_env_initial_rho_setup           11  6.9    0.001    0.001    3.773    3.773
 integrate_v_rspace                 119 11.3    0.004    0.004    3.729    3.736
 qs_ot_get_p                        119 10.4    0.001    0.001    3.292    3.429
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    2.793    3.186
 mp_waitall_1                     94719 16.7    2.350    3.138    2.350    3.138
 dbcsr_complete_redistribute        329 12.2    0.287    0.293    1.949    2.767
 pw_transfer                       1439 11.6    0.069    0.069    2.535    2.541
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    2.173    2.481
 apply_single                       119 13.6    0.000    0.000    2.173    2.481
 copy_fm_to_dbcsr                   176 11.2    0.001    0.001    1.658    2.465
 fft_wrap_pw1pw2                   1201 12.6    0.009    0.009    2.437    2.443
 make_m2s                          4572 13.5    0.038    0.038    2.262    2.407
 calculate_dm_sparse                119  9.5    0.000    0.000    2.308    2.378
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.272    2.342
 mp_alltoall_i22                    627 13.8    1.470    2.299    1.470    2.299
 make_images                       4572 14.5    0.361    0.396    2.143    2.286
 transfer_fm_to_dbcsr                11  9.9    0.000    0.000    1.434    2.233
 multiply_cannon_metrocomm3        9144 15.5    0.021    0.021    1.499    2.192
 grid_integrate_task_list           119 12.3    2.097    2.117    2.097    2.117
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="110", plot="h2o_64_md", label="(8n/1r/12t)", y=104.660000, yerr=0.000000
PlotPoint: name="111", plot="h2o_64_md_mem", label="(8n/1r/12t)", y=727.727273, yerr=17.705114
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/09/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    198.287135E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               8410880       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     117.0
 marketing flops                    15.646302E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             501.379072E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 8483040
 MPI messages size (bytes):
  total size                         1.160510E+12
  min size                           0.000000E+00
  max size                           1.161504E+06
  average size                     136.803609E+03
 MPI breakdown and total messages size (bytes):
             size <=      128             1836752                        0
       128 < size <=     8192             1040592               8524529664
      8192 < size <=    32768             1486976              24362614784
     32768 < size <=   131072             2491776             216971345920
    131072 < size <=  4194304             1626944             910632720448
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3473                  66212.
 MP_Allreduce         9776                    488.
 MP_Sync                52
 MP_Alltoall          1938                1383689.
 MP_SendRecv         20900                   9096.
 MP_ISendRecv        20900                   9096.
 MP_Wait             37268
 MP_ISend            14300                  82312.
 MP_IRecv            14300                  82312.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.032    0.106   82.244   82.247
 qs_mol_dyn_low                       1  2.0    0.003    0.003   81.843   81.854
 qs_forces                           11  3.9    0.003    0.004   81.775   81.776
 qs_energies                         11  4.9    0.003    0.015   78.909   78.930
 scf_env_do_scf                      11  5.9    0.001    0.002   70.084   70.088
 scf_env_do_scf_inner_loop           99  6.5    0.002    0.007   64.462   64.462
 dbcsr_multiply_generic            2055 12.4    0.106    0.108   50.914   51.174
 qs_scf_new_mos                      99  7.5    0.000    0.001   47.344   47.461
 qs_scf_loop_do_ot                   99  8.5    0.000    0.001   47.343   47.460
 ot_scf_mini                         99  9.5    0.002    0.002   44.951   45.046
 velocity_verlet                     10  3.0    0.001    0.002   43.510   43.510
 multiply_cannon                   2055 13.4    0.184    0.187   42.528   43.198
 multiply_cannon_loop              2055 14.4    1.803    1.843   41.562   42.222
 ot_mini                             99 10.5    0.001    0.001   26.594   26.678
 qs_ot_get_derivative                99 11.5    0.001    0.001   19.744   19.823
 multiply_cannon_multrec          49320 15.4   11.361   12.040   17.339   18.190
 rebuild_ks_matrix                  110  8.3    0.000    0.001   14.542   14.644
 qs_ks_build_kohn_sham_matrix       110  9.3    0.011    0.014   14.542   14.643
 qs_ks_update_qs_env                110  7.6    0.001    0.001   12.854   12.956
 mp_waitall_1                    220248 16.4   10.816   11.453   10.816   11.453
 multiply_cannon_sync_h2d         49320 15.4    9.581   10.092    9.581   10.092
 qs_ot_get_p                        110 10.4    0.001    0.001    9.861    9.985
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    7.612    8.129
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    7.178    7.671
 apply_single                       110 13.6    0.000    0.000    7.178    7.670
 multiply_cannon_metrocomm3       49320 15.4    0.084    0.087    6.562    7.396
 sum_up_and_integrate               110 10.3    0.036    0.043    7.060    7.074
 integrate_v_rspace                 110 11.3    0.003    0.004    7.024    7.047
 init_scf_run                        11  5.9    0.000    0.001    6.759    6.760
 scf_env_initial_rho_setup           11  6.9    0.001    0.002    6.759    6.759
 qs_ot_p2m_diag                      48 11.0    0.013    0.019    6.608    6.623
 ot_diis_step                        99 11.5    0.006    0.006    6.557    6.557
 qs_rho_update_rho_low              110  7.6    0.000    0.001    6.375    6.524
 calculate_rho_elec                 110  8.6    0.021    0.024    6.375    6.524
 dbcsr_mm_accdrv_process          87628 16.1    3.017    3.104    5.849    6.136
 cp_dbcsr_syevd                      48 12.0    0.002    0.003    5.812    5.812
 init_scf_loop                       11  6.9    0.002    0.011    5.596    5.598
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    5.264    5.316
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    5.216    5.239
 cp_fm_diag_elpa_base                48 14.0    5.203    5.226    5.215    5.237
 mp_sum_l                          6594 12.7    4.084    4.744    4.084    4.744
 wfi_extrapolate                     11  7.9    0.001    0.001    4.030    4.030
 make_m2s                          4110 13.4    0.061    0.066    3.863    3.971
 make_images                       4110 14.4    0.176    0.188    3.767    3.879
 calculate_dm_sparse                110  9.5    0.001    0.001    3.771    3.859
 density_rs2pw                      110  9.6    0.004    0.005    3.387    3.578
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    3.539    3.544
 grid_integrate_task_list           110 12.3    3.263    3.413    3.263    3.413
 multiply_cannon_metrocomm1       49320 15.4    0.066    0.069    2.380    3.398
 qs_ot_get_orbitals                  99 10.5    0.000    0.001    3.242    3.279
 prepare_preconditioner              11  7.9    0.000    0.000    3.238    3.252
 make_preconditioner                 11  8.9    0.000    0.000    3.238    3.252
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    3.195    3.240
 pw_transfer                       1331 11.6    0.055    0.065    3.063    3.151
 make_full_inverse_cholesky          11  9.9    0.000    0.000    3.027    3.071
 fft_wrap_pw1pw2                   1111 12.6    0.008    0.008    2.974    3.065
 potential_pw2rs                    110 12.3    0.006    0.007    2.638    2.670
 fft_wrap_pw1pw2_140                451 13.1    0.388    0.430    2.538    2.635
 calculate_first_density_matrix       1  7.0    0.000    0.003    2.630    2.633
 acc_transpose_blocks             49320 15.4    0.213    0.219    2.572    2.629
 jit_kernel_multiply                 13 15.9    2.553    2.574    2.553    2.574
 mp_alltoall_d11v                  2046 13.8    2.063    2.526    2.063    2.526
 fft3d_ps                          1111 14.6    0.790    0.876    2.271    2.329
 grid_collocate_task_list           110  9.6    2.162    2.276    2.162    2.276
 mp_waitany                       14300 13.8    1.845    2.159    1.845    2.159
 transfer_rs2pw                     451 10.6    0.005    0.006    1.939    2.118
 mp_sum_d                          3889 11.9    1.558    2.072    1.558    2.072
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    1.913    1.928
 make_images_data                  4110 15.4    0.043    0.047    1.739    1.861
 transfer_pw2rs                     451 13.1    0.006    0.007    1.812    1.831
 cp_fm_cholesky_invert               11 10.9    1.783    1.786    1.783    1.786
 hybrid_alltoall_any               4261 16.3    0.085    0.484    1.500    1.768
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.001    1.664    1.701
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="200", plot="h2o_128_md", label="(8n/12r/1t)", y=82.247000, yerr=0.000000
PlotPoint: name="201", plot="h2o_128_md_mem", label="(8n/12r/1t)", y=476.090909, yerr=2.712079
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/10/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    390.715586E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               5019072       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     196.1
 marketing flops                    15.646302E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             593.002496E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 1972800
 MPI messages size (bytes):
  total size                         1.077520E+12
  min size                           0.000000E+00
  max size                           4.537280E+06
  average size                     546.188250E+03
 MPI breakdown and total messages size (bytes):
             size <=      128               14916                        0
       128 < size <=     8192              222984               1826684928
      8192 < size <=    32768              520356              13399818240
     32768 < size <=   131072              372336              35386294272
    131072 < size <=  4194304              787758             788321309808
   4194304 < size <= 16777216               54450             238588003280
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3473                  66430.
 MP_Allreduce         9775                    566.
 MP_Sync                52
 MP_Alltoall          1717                2769658.
 MP_SendRecv         10340                  26400.
 MP_ISendRecv        10340                  26400.
 MP_Wait             22352
 MP_ISend            10164                 155761.
 MP_IRecv            10164                 155761.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.026    0.105  140.453  140.455
 qs_mol_dyn_low                       1  2.0    0.004    0.008  140.068  140.070
 velocity_verlet                     10  3.0    0.001    0.002  105.225  106.702
 mp_sum_dm                          438  4.9   69.461   70.957   69.461   70.957
 md_write_output                     11  3.9    1.477   70.884    1.479   70.903
 update_particle_set                 20  4.0    0.000    0.000   69.421   70.899
 md_output                           10  3.0    0.000    0.000    1.480   70.887
 qs_forces                           11  3.9    0.003    0.003   69.141   69.148
 qs_energies                         11  4.9    0.001    0.002   65.823   65.834
 scf_env_do_scf                      11  5.9    0.000    0.001   57.004   57.008
 scf_env_do_scf_inner_loop           99  6.5    0.003    0.008   49.471   49.472
 dbcsr_multiply_generic            2055 12.4    0.114    0.118   38.616   38.747
 qs_scf_new_mos                      99  7.5    0.001    0.001   33.704   33.824
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   33.704   33.823
 multiply_cannon                   2055 13.4    0.224    0.246   31.756   32.586
 ot_scf_mini                         99  9.5    0.003    0.003   32.037   32.156
 multiply_cannon_loop              2055 14.4    1.169    1.202   30.618   31.683
 ot_mini                             99 10.5    0.001    0.001   18.953   19.063
 multiply_cannon_multrec          24660 15.4    6.990    8.340   14.046   15.434
 rebuild_ks_matrix                  110  8.3    0.000    0.000   13.491   13.583
 qs_ks_build_kohn_sham_matrix       110  9.3    0.013    0.014   13.490   13.583
 qs_ot_get_derivative                99 11.5    0.001    0.001   13.166   13.290
 qs_ks_update_qs_env                110  7.6    0.001    0.001   11.861   11.940
 mp_waitall_1                    176588 16.5    7.851   10.389    7.851   10.389
 multiply_cannon_metrocomm3       24660 15.4    0.072    0.074    5.463    8.182
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    6.789    7.790
 apply_single                       110 13.6    0.000    0.001    6.789    7.789
 init_scf_loop                       11  6.9    0.000    0.001    7.499    7.499
 multiply_cannon_sync_h2d         24660 15.4    6.443    7.272    6.443    7.272
 dbcsr_mm_accdrv_process          52282 16.1    5.428    6.455    6.885    7.189
 qs_ot_get_p                        110 10.4    0.001    0.001    6.440    6.573
 sum_up_and_integrate               110 10.3    0.052    0.059    6.351    6.366
 init_scf_run                        11  5.9    0.000    0.001    6.319    6.320
 scf_env_initial_rho_setup           11  6.9    0.001    0.001    6.319    6.319
 integrate_v_rspace                 110 11.3    0.003    0.003    6.299    6.312
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    5.044    6.031
 qs_rho_update_rho_low              110  7.6    0.001    0.001    5.844    5.852
 calculate_rho_elec                 110  8.6    0.039    0.047    5.843    5.851
 ot_diis_step                        99 11.5    0.010    0.010    5.738    5.739
 prepare_preconditioner              11  7.9    0.000    0.000    5.504    5.524
 make_preconditioner                 11  8.9    0.000    0.000    5.504    5.524
 make_full_inverse_cholesky          11  9.9    0.000    0.000    5.109    5.265
 qs_ot_p2m_diag                      48 11.0    0.029    0.044    4.564    4.583
 make_m2s                          4110 13.4    0.057    0.060    4.208    4.533
 make_images                       4110 14.4    0.414    0.475    4.100    4.422
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    4.097    4.097
 pw_transfer                       1331 11.6    0.066    0.071    3.497    3.638
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    3.532    3.548
 cp_fm_diag_elpa_base                48 14.0    3.486    3.503    3.530    3.545
 fft_wrap_pw1pw2                   1111 12.6    0.008    0.009    3.391    3.533
 wfi_extrapolate                     11  7.9    0.001    0.001    3.503    3.504
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    3.397    3.400
 grid_integrate_task_list           110 12.3    3.160    3.337    3.160    3.337
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    3.262    3.320
 density_rs2pw                      110  9.6    0.004    0.004    3.102    3.298
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    3.067    3.098
 fft_wrap_pw1pw2_140                451 13.1    0.459    0.472    2.918    3.070
 calculate_dm_sparse                110  9.5    0.001    0.001    2.978    3.010
 mp_sum_l                          6594 12.7    2.040    2.835    2.040    2.835
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="202", plot="h2o_128_md", label="(8n/6r/2t)", y=140.455000, yerr=0.000000
PlotPoint: name="203", plot="h2o_128_md_mem", label="(8n/6r/2t)", y=560.545455, yerr=6.358439
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/11/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    404.681598E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               3346752       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     294.1
 marketing flops                    15.646297E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             666.341376E+06
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                  854880
 MPI messages size (bytes):
  total size                       708.322787E+09
  min size                           0.000000E+00
  max size                           6.553600E+06
  average size                     828.564000E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                6424                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768              222984               7302414336
     32768 < size <=   131072              153888              10085203968
    131072 < size <=  4194304              389376             200257044480
   4194304 < size <= 16777216               82208             490679162176
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3473                  66421.
 MP_Allreduce         9774                    562.
 MP_Sync                52
 MP_Alltoall          1496                4511006.
 MP_SendRecv          6820                  27424.
 MP_ISendRecv         6820                  27424.
 MP_Wait             25498
 MP_ISend            17072                 115022.
 MP_IRecv            17072                 115022.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.027    0.049   61.442   61.445
 qs_mol_dyn_low                       1  2.0    0.003    0.003   61.100   61.111
 qs_forces                           11  3.9    0.009    0.013   61.035   61.037
 qs_energies                         11  4.9    0.002    0.005   57.853   57.862
 scf_env_do_scf                      11  5.9    0.001    0.001   49.151   49.151
 scf_env_do_scf_inner_loop           99  6.5    0.003    0.007   40.660   40.661
 velocity_verlet                     10  3.0    0.005    0.009   32.473   32.474
 dbcsr_multiply_generic            2055 12.4    0.106    0.111   29.809   30.050
 qs_scf_new_mos                      99  7.5    0.001    0.001   26.023   26.125
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   26.023   26.124
 ot_scf_mini                         99  9.5    0.002    0.003   24.778   24.898
 multiply_cannon                   2055 13.4    0.213    0.223   22.992   24.638
 multiply_cannon_loop              2055 14.4    0.814    0.864   21.840   23.405
 ot_mini                             99 10.5    0.001    0.001   14.383   14.498
 rebuild_ks_matrix                  110  8.3    0.000    0.000   12.172   12.301
 qs_ks_build_kohn_sham_matrix       110  9.3    0.012    0.014   12.171   12.301
 multiply_cannon_multrec          16440 15.4    3.707    4.805    9.960   10.938
 qs_ks_update_qs_env                110  7.6    0.001    0.001   10.714   10.833
 qs_ot_get_derivative                99 11.5    0.001    0.001    9.975   10.094
 mp_waitall_1                    139946 16.5    7.012    9.836    7.012    9.836
 init_scf_loop                       11  6.9    0.001    0.002    8.456    8.457
 multiply_cannon_metrocomm3       16440 15.4    0.045    0.048    4.382    7.160
 prepare_preconditioner              11  7.9    0.000    0.000    6.682    6.697
 make_preconditioner                 11  8.9    0.000    0.001    6.682    6.697
 make_full_inverse_cholesky          11  9.9    0.000    0.000    6.020    6.354
 sum_up_and_integrate               110 10.3    0.060    0.061    6.304    6.320
 dbcsr_mm_accdrv_process          34862 16.1    5.379    5.879    6.097    6.294
 integrate_v_rspace                 110 11.3    0.003    0.003    6.243    6.259
 init_scf_run                        11  5.9    0.000    0.001    6.098    6.098
 scf_env_initial_rho_setup           11  6.9    0.003    0.014    6.097    6.098
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    5.414    5.998
 apply_single                       110 13.6    0.000    0.000    5.413    5.998
 qs_rho_update_rho_low              110  7.6    0.000    0.001    5.727    5.739
 calculate_rho_elec                 110  8.6    0.058    0.059    5.727    5.739
 qs_ot_get_p                        110 10.4    0.001    0.001    5.556    5.704
 make_m2s                          4110 13.4    0.049    0.051    4.221    4.583
 make_images                       4110 14.4    0.401    0.522    4.108    4.470
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    3.322    4.459
 ot_diis_step                        99 11.5    0.011    0.011    4.378    4.378
 multiply_cannon_sync_h2d         16440 15.4    3.269    4.030    3.269    4.030
 qs_ot_p2m_diag                      48 11.0    0.042    0.045    3.921    3.924
 pw_transfer                       1331 11.6    0.065    0.073    3.557    3.563
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    3.553    3.554
 fft_wrap_pw1pw2                   1111 12.6    0.008    0.009    3.450    3.460
 grid_integrate_task_list           110 12.3    3.207    3.393    3.207    3.393
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.002    3.315    3.319
 density_rs2pw                      110  9.6    0.004    0.004    2.975    3.177
 calculate_first_density_matrix       1  7.0    0.009    0.071    3.013    3.017
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    3.003    3.012
 cp_fm_diag_elpa_base                48 14.0    2.937    2.970    3.001    3.010
 wfi_extrapolate                     11  7.9    0.001    0.001    2.991    2.991
 fft_wrap_pw1pw2_140                451 13.1    0.578    0.584    2.975    2.989
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.001    2.868    2.922
 make_images_data                  4110 15.4    0.045    0.049    2.391    2.876
 hybrid_alltoall_any               4261 16.3    0.109    0.382    2.162    2.863
 mp_sum_l                          6594 12.7    1.979    2.755    1.979    2.755
 cp_fm_cholesky_invert               11 10.9    2.608    2.614    2.608    2.614
 calculate_dm_sparse                110  9.5    0.001    0.001    2.519    2.551
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    2.428    2.494
 acc_transpose_blocks             16440 15.4    0.078    0.079    1.928    2.478
 multiply_cannon_metrocomm4       14385 15.4    0.048    0.052    0.882    2.436
 grid_collocate_task_list           110  9.6    2.212    2.405    2.212    2.405
 fft3d_ps                          1111 14.6    1.093    1.104    2.391    2.400
 mp_irecv_dv                      48980 15.7    0.807    2.302    0.807    2.302
 qs_energies_init_hamiltonians       11  5.9    0.010    0.046    2.167    2.174
 potential_pw2rs                    110 12.3    0.011    0.011    2.125    2.132
 mp_alltoall_d11v                  2046 13.8    1.810    2.109    1.810    2.109
 dbcsr_complete_redistribute        325 12.2    0.348    0.423    1.516    1.971
 cp_fm_upper_to_full                 70 14.2    1.358    1.723    1.358    1.723
 acc_transpose_blocks_kernels     16440 16.4    0.216    0.220    1.149    1.698
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    1.651    1.665
 transfer_rs2pw                     451 10.6    0.005    0.006    1.381    1.616
 mp_allgather_i34                  2055 14.4    0.465    1.602    0.465    1.602
 cp_fm_cholesky_decompose            22 10.9    1.574    1.592    1.574    1.592
 copy_fm_to_dbcsr                   174 11.2    0.002    0.008    1.077    1.511
 jit_kernel_transpose                 5 15.6    0.934    1.482    0.934    1.482
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.001    1.459    1.471
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.001    1.364    1.468
 mp_waitany                       17072 13.8    1.215    1.468    1.215    1.468
 qs_ot_get_orbitals                  99 10.5    0.000    0.001    1.265    1.276
 qs_env_update_s_mstruct             11  6.9    0.012    0.024    1.174    1.247
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="204", plot="h2o_128_md", label="(8n/4r/3t)", y=61.445000, yerr=0.000000
PlotPoint: name="205", plot="h2o_128_md_mem", label="(8n/4r/3t)", y=630.818182, yerr=8.767898
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/12/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    601.317074E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               4916280       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     200.2
 marketing flops                    15.646302E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             729.169920E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                  937080
 MPI messages size (bytes):
  total size                       523.723932E+09
  min size                           0.000000E+00
  max size                           4.537280E+06
  average size                     558.889250E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                6996                        0
       128 < size <=     8192                 264                  2162688
      8192 < size <=    32768              304932               8165326848
     32768 < size <=   131072              110640               6338641920
    131072 < size <=  4194304              489498             400769458320
   4194304 < size <= 16777216               24750             108449092400
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3473                  66419.
 MP_Allreduce         9774                    603.
 MP_Sync                52
 MP_Alltoall          1496                5863162.
 MP_SendRecv          5060                  43184.
 MP_ISendRecv         5060                  43184.
 MP_Wait             20042
 MP_ISend            13376                 163145.
 MP_IRecv            13376                 163145.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.019    0.048   67.833   67.834
 qs_mol_dyn_low                       1  2.0    0.003    0.003   67.549   67.559
 qs_forces                           11  3.9    0.003    0.004   67.141   67.141
 qs_energies                         11  4.9    0.002    0.010   63.734   63.741
 scf_env_do_scf                      11  5.9    0.001    0.002   54.939   54.942
 scf_env_do_scf_inner_loop           99  6.5    0.003    0.008   43.216   43.217
 velocity_verlet                     10  3.0    0.003    0.003   37.668   37.684
 dbcsr_multiply_generic            2055 12.4    0.113    0.124   32.070   32.358
 qs_scf_new_mos                      99  7.5    0.001    0.001   28.643   28.757
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   28.643   28.756
 ot_scf_mini                         99  9.5    0.003    0.004   26.953   27.048
 multiply_cannon                   2055 13.4    0.239    0.258   24.872   26.092
 multiply_cannon_loop              2055 14.4    1.421    1.501   23.502   24.224
 ot_mini                             99 10.5    0.001    0.001   15.861   15.984
 multiply_cannon_multrec          24660 15.4    4.177    7.111   13.246   14.910
 rebuild_ks_matrix                  110  8.3    0.000    0.000   11.971   12.086
 qs_ks_build_kohn_sham_matrix       110  9.3    0.013    0.017   11.971   12.086
 qs_ot_get_derivative                99 11.5    0.001    0.001   11.686   11.785
 init_scf_loop                       11  6.9    0.001    0.004   11.682   11.683
 qs_ks_update_qs_env                110  7.6    0.001    0.001   10.577   10.679
 dbcsr_mm_accdrv_process          52304 16.0    7.855    9.400    8.911   10.424
 prepare_preconditioner              11  7.9    0.000    0.001    9.934    9.951
 make_preconditioner                 11  8.9    0.000    0.001    9.934    9.951
 make_full_inverse_cholesky          11  9.9    0.000    0.000    8.157    9.648
 mp_waitall_1                    121746 16.5    4.987    6.849    4.987    6.849
 sum_up_and_integrate               110 10.3    0.067    0.070    6.259    6.272
 integrate_v_rspace                 110 11.3    0.003    0.003    6.192    6.205
 qs_ot_get_p                        110 10.4    0.001    0.001    5.963    6.101
 init_scf_run                        11  5.9    0.000    0.001    5.978    5.978
 scf_env_initial_rho_setup           11  6.9    0.001    0.002    5.977    5.978
 qs_rho_update_rho_low              110  7.6    0.001    0.001    5.803    5.817
 calculate_rho_elec                 110  8.6    0.078    0.081    5.802    5.816
 make_m2s                          4110 13.4    0.060    0.061    5.508    5.794
 make_images                       4110 14.4    0.586    0.707    5.367    5.647
 cp_fm_upper_to_full                 70 14.2    3.336    4.868    3.336    4.868
 ot_diis_step                        99 11.5    0.011    0.011    4.138    4.139
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    4.058    4.111
 apply_single                       110 13.6    0.000    0.000    4.058    4.111
 qs_ot_p2m_diag                      48 11.0    0.055    0.064    4.073    4.094
 dbcsr_complete_redistribute        325 12.2    0.425    0.493    2.701    3.872
 multiply_cannon_metrocomm3       24660 15.4    0.038    0.039    1.850    3.860
 pw_transfer                       1331 11.6    0.065    0.075    3.635    3.670
 fft_wrap_pw1pw2                   1111 12.6    0.008    0.008    3.528    3.567
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    3.499    3.548
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    3.504    3.505
 grid_integrate_task_list           110 12.3    3.297    3.425    3.297    3.425
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    3.353    3.410
 copy_fm_to_dbcsr                   174 11.2    0.001    0.002    2.141    3.302
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    3.250    3.251
 fft_wrap_pw1pw2_140                451 13.1    0.608    0.630    3.027    3.069
 density_rs2pw                      110  9.6    0.004    0.004    2.928    3.059
 calculate_dm_sparse                110  9.5    0.001    0.001    3.012    3.053
 make_images_data                  4110 15.4    0.048    0.053    2.725    3.042
 hybrid_alltoall_any               4261 16.3    0.123    0.460    2.284    3.024
 wfi_extrapolate                     11  7.9    0.001    0.001    3.000    3.001
 acc_transpose_blocks             24660 15.4    0.114    0.117    2.813    2.989
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    2.955    2.968
 cp_fm_diag_elpa_base                48 14.0    2.808    2.872    2.953    2.966
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.001    2.872    2.921
 calculate_first_density_matrix       1  7.0    0.000    0.003    2.879    2.882
 transfer_fm_to_dbcsr                11  9.9    0.000    0.000    1.767    2.881
 mp_alltoall_i22                    605 13.7    1.684    2.857    1.684    2.857
 cp_fm_cholesky_invert               11 10.9    2.694    2.702    2.694    2.702
 multiply_cannon_sync_h2d         24660 15.4    2.381    2.517    2.381    2.517
 grid_collocate_task_list           110  9.6    2.266    2.426    2.266    2.426
 fft3d_ps                          1111 14.6    1.086    1.113    2.411    2.426
 qs_energies_init_hamiltonians       11  5.9    0.001    0.003    2.279    2.280
 potential_pw2rs                    110 12.3    0.012    0.013    2.034    2.042
 mp_alltoall_d11v                  2046 13.8    1.722    1.873    1.722    1.873
 qs_ot_get_orbitals                  99 10.5    0.001    0.001    1.795    1.831
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.001    1.614    1.722
 cp_fm_cholesky_decompose            22 10.9    1.677    1.714    1.677    1.714
 mp_sum_l                          6594 12.7    1.011    1.661    1.011    1.661
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    1.637    1.652
 mp_allgather_i34                  2055 14.4    0.452    1.587    0.452    1.587
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.002    1.565    1.585
 acc_transpose_blocks_sync        73980 16.4    1.414    1.566    1.414    1.566
 multiply_cannon_metrocomm4       20550 15.4    0.062    0.066    0.840    1.505
 multiply_cannon_metrocomm1       24660 15.4    0.035    0.036    0.527    1.475
 jit_kernel_multiply                  8 15.7    0.709    1.424    0.709    1.424
 mp_irecv_dv                      62702 16.1    0.735    1.421    0.735    1.421
 transfer_rs2pw                     451 10.6    0.005    0.006    1.223    1.419
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="206", plot="h2o_128_md", label="(8n/3r/4t)", y=67.834000, yerr=0.000000
PlotPoint: name="207", plot="h2o_128_md_mem", label="(8n/3r/4t)", y=692.727273, yerr=6.689081
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/13/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    807.299199E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               1438408       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     684.2
 marketing flops                    15.646297E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             851.025920E+06
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                  197280
 MPI messages size (bytes):
  total size                       339.125567E+09
  min size                           0.000000E+00
  max size                          13.107200E+06
  average size                       1.719006E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                1452                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                 132                  4325376
     32768 < size <=   131072               88656              11620319232
    131072 < size <=  4194304               89424             117209825280
   4194304 < size <= 16777216               17616             210291069504
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         7346                     33.
 MP_Alltoall          8043                 263767.
 MP_ISend            32836                 654203.
 MP_IRecv            32836                 654587.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3473                  66417.
 MP_Allreduce         9774                    644.
 MP_Sync                52
 MP_Alltoall          1496                8504061.
 MP_SendRecv          3300                  54848.
 MP_ISendRecv         3300                  54848.
 MP_Wait             13926
 MP_ISend             9240                 278857.
 MP_IRecv             9240                 278857.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.017    0.036   56.009   56.010
 qs_mol_dyn_low                       1  2.0    0.003    0.003   55.732   55.742
 qs_forces                           11  3.9    0.003    0.003   55.188   55.189
 qs_energies                         11  4.9    0.001    0.001   51.520   51.524
 scf_env_do_scf                      11  5.9    0.000    0.001   43.274   43.274
 scf_env_do_scf_inner_loop           99  6.5    0.003    0.007   35.570   35.571
 velocity_verlet                     10  3.0    0.003    0.008   31.565   31.598
 dbcsr_multiply_generic            2055 12.4    0.105    0.109   23.533   23.670
 qs_scf_new_mos                      99  7.5    0.001    0.001   21.016   21.059
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   21.016   21.059
 ot_scf_mini                         99  9.5    0.002    0.002   19.760   19.778
 multiply_cannon                   2055 13.4    0.237    0.245   17.827   19.098
 multiply_cannon_loop              2055 14.4    0.606    0.628   16.524   16.771
 rebuild_ks_matrix                  110  8.3    0.000    0.000   11.676   11.688
 qs_ks_build_kohn_sham_matrix       110  9.3    0.013    0.014   11.675   11.687
 ot_mini                             99 10.5    0.001    0.001   10.948   10.959
 qs_ks_update_qs_env                110  7.6    0.001    0.001   10.374   10.382
 multiply_cannon_multrec           8220 15.4    3.173    4.487    7.585    8.515
 init_scf_loop                       11  6.9    0.001    0.001    7.652    7.653
 mp_waitall_1                    103326 16.6    5.806    7.390    5.806    7.390
 qs_ot_get_derivative                99 11.5    0.001    0.001    7.268    7.284
 sum_up_and_integrate               110 10.3    0.080    0.082    6.274    6.289
 integrate_v_rspace                 110 11.3    0.003    0.004    6.194    6.208
 prepare_preconditioner              11  7.9    0.000    0.000    5.999    6.002
 make_preconditioner                 11  8.9    0.000    0.000    5.999    6.002
 qs_rho_update_rho_low              110  7.6    0.001    0.001    5.952    5.968
 calculate_rho_elec                 110  8.6    0.114    0.114    5.951    5.967
 make_full_inverse_cholesky          11  9.9    0.000    0.000    5.573    5.651
 dbcsr_mm_accdrv_process          17442 15.9    3.117    4.179    4.272    5.225
 init_scf_run                        11  5.9    0.000    0.001    5.098    5.098
 scf_env_initial_rho_setup           11  6.9    0.001    0.001    5.098    5.098
 qs_ot_get_p                        110 10.4    0.001    0.001    4.831    4.859
 make_m2s                          4110 13.4    0.038    0.040    4.320    4.607
 make_images                       4110 14.4    0.654    0.716    4.191    4.476
 multiply_cannon_metrocomm3        8220 15.4    0.019    0.019    2.941    4.331
 pw_transfer                       1331 11.6    0.066    0.072    3.861    3.886
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    3.784    3.878
 apply_single                       110 13.6    0.000    0.000    3.783    3.878
 fft_wrap_pw1pw2                   1111 12.6    0.008    0.008    3.753    3.783
 ot_diis_step                        99 11.5    0.012    0.012    3.658    3.658
 grid_integrate_task_list           110 12.3    3.391    3.481    3.391    3.481
 qs_ot_p2m_diag                      48 11.0    0.081    0.084    3.420    3.424
 fft_wrap_pw1pw2_140                451 13.1    0.773    0.787    3.239    3.280
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    3.076    3.077
 density_rs2pw                      110  9.6    0.004    0.004    2.940    3.024
 cp_fm_cholesky_invert               11 10.9    2.870    2.874    2.870    2.874
 hybrid_alltoall_any               4261 16.3    0.201    0.852    2.303    2.848
 make_images_data                  4110 15.4    0.040    0.046    2.311    2.727
 wfi_extrapolate                     11  7.9    0.001    0.001    2.695    2.695
 qs_energies_init_hamiltonians       11  5.9    0.001    0.001    2.683    2.684
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    2.551    2.552
 calculate_dm_sparse                110  9.5    0.001    0.001    2.485    2.524
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    2.503    2.510
 cp_fm_diag_elpa_base                48 14.0    2.446    2.469    2.501    2.508
 grid_collocate_task_list           110  9.6    2.369    2.494    2.369    2.494
 multiply_cannon_sync_h2d          8220 15.4    2.379    2.474    2.379    2.474
 fft3d_ps                          1111 14.6    1.136    1.182    2.422    2.437
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    2.297    2.379
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.302    2.304
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.191    2.204
 potential_pw2rs                    110 12.3    0.015    0.015    1.998    2.003
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.001    1.786    1.997
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    1.947    1.956
 mp_alltoall_d11v                  2046 13.8    1.710    1.798    1.710    1.798
 acc_transpose_blocks              8220 15.4    0.039    0.040    1.621    1.756
 cp_fm_cholesky_decompose            22 10.9    1.685    1.701    1.685    1.701
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    1.631    1.635
 qs_env_update_s_mstruct             11  6.9    0.001    0.002    1.512    1.631
 mp_allgather_i34                  2055 14.4    0.452    1.575    0.452    1.575
 dbcsr_complete_redistribute        325 12.2    0.562    0.580    1.443    1.539
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.001    1.432    1.445
 jit_kernel_multiply                  8 15.6    0.837    1.428    0.837    1.428
 qs_create_task_list                 11  7.9    0.000    0.001    1.231    1.333
 generate_qs_task_list               11  8.9    0.389    0.452    1.231    1.333
 transfer_rs2pw                     451 10.6    0.005    0.005    1.141    1.239
 copy_dbcsr_to_fm                   151 11.3    0.003    0.003    1.193    1.214
 mp_waitany                        9240 13.8    1.053    1.144    1.053    1.144
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="208", plot="h2o_128_md", label="(8n/2r/6t)", y=56.010000, yerr=0.000000
PlotPoint: name="209", plot="h2o_128_md_mem", label="(8n/2r/6t)", y=805.181818, yerr=11.519297
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/14/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                      1.612391E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               1464624       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     672.0
 marketing flops                    15.646297E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank               1.426985E+09
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                   82200
 MPI messages size (bytes):
  total size                       297.640985E+09
  min size                           0.000000E+00
  max size                          26.214400E+06
  average size                       3.620936E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                 572                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                  44                  1441792
     32768 < size <=   131072               18560               2432696320
    131072 < size <=  4194304               54216              84915781632
   4194304 < size <= 16777216                   0                        0
  16777216 < size                            8808             210291069504
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3462                  67098.
 MP_Allreduce         9752                    812.
 MP_Sync                52
 MP_Alltoall          1474               16505187.
 MP_SendRecv          2310                 360267.
 MP_ISendRecv         2310                 360267.
 MP_Wait              5214
 MP_ISend             2420                1187840.
 MP_IRecv             2420                1187840.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.029    0.043   86.661   86.662
 qs_mol_dyn_low                       1  2.0    0.003    0.003   86.330   86.342
 qs_forces                           11  3.9    0.003    0.003   86.246   86.247
 qs_energies                         11  4.9    0.001    0.001   82.106   82.107
 scf_env_do_scf                      11  5.9    0.001    0.001   72.128   72.128
 velocity_verlet                     10  3.0    0.002    0.002   55.546   55.554
 scf_env_do_scf_inner_loop           99  6.5    0.003    0.007   43.810   43.812
 dbcsr_multiply_generic            2055 12.4    0.120    0.125   29.284   29.381
 init_scf_loop                       11  6.9    0.001    0.001   28.247   28.249
 qs_scf_new_mos                      99  7.5    0.001    0.001   26.586   26.620
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   26.585   26.619
 prepare_preconditioner              11  7.9    0.000    0.000   26.243   26.250
 make_preconditioner                 11  8.9    0.000    0.000   26.243   26.250
 make_full_inverse_cholesky          11  9.9    0.000    0.000   20.276   25.713
 ot_scf_mini                         99  9.5    0.002    0.002   24.787   24.811
 multiply_cannon                   2055 13.4    0.335    0.363   22.311   23.064
 multiply_cannon_loop              2055 14.4    0.808    0.817   20.529   20.802
 cp_fm_upper_to_full                 70 14.2   12.746   18.424   12.746   18.424
 ot_mini                             99 10.5    0.001    0.001   13.976   14.003
 rebuild_ks_matrix                  110  8.3    0.000    0.001   13.628   13.661
 qs_ks_build_kohn_sham_matrix       110  9.3    0.014    0.014   13.627   13.660
 qs_ks_update_qs_env                110  7.6    0.001    0.001   12.363   12.393
 dbcsr_complete_redistribute        325 12.2    1.022    1.048    7.516   10.848
 multiply_cannon_multrec           8220 15.4    4.070    4.226    9.748    9.879
 copy_fm_to_dbcsr                   174 11.2    0.001    0.001    6.511    9.840
 qs_ot_get_derivative                99 11.5    0.001    0.001    9.417    9.444
 transfer_fm_to_dbcsr                11  9.9    0.000    0.000    5.951    9.247
 mp_alltoall_i22                    605 13.7    5.569    8.923    5.569    8.923
 mp_waitall_1                     84994 16.7    7.347    8.148    7.347    8.148
 qs_rho_update_rho_low              110  7.6    0.001    0.001    7.025    7.060
 calculate_rho_elec                 110  8.6    0.225    0.225    7.024    7.059
 sum_up_and_integrate               110 10.3    0.150    0.151    6.804    6.820
 integrate_v_rspace                 110 11.3    0.004    0.004    6.654    6.671
 make_m2s                          4110 13.4    0.044    0.044    5.296    5.793
 dbcsr_mm_accdrv_process          11614 15.7    3.839    4.100    5.531    5.737
 init_scf_run                        11  5.9    0.000    0.001    5.681    5.681
 scf_env_initial_rho_setup           11  6.9    0.001    0.001    5.681    5.681
 make_images                       4110 14.4    0.895    0.949    5.108    5.604
 qs_ot_get_p                        110 10.4    0.001    0.001    5.542    5.577
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    4.717    5.182
 apply_single                       110 13.6    0.000    0.000    4.717    5.181
 cp_fm_cholesky_invert               11 10.9    5.077    5.081    5.077    5.081
 multiply_cannon_metrocomm3        8220 15.4    0.019    0.020    4.762    5.073
 pw_transfer                       1331 11.6    0.075    0.075    4.880    4.888
 fft_wrap_pw1pw2                   1111 12.6    0.009    0.009    4.763    4.772
 ot_diis_step                        99 11.5    0.015    0.015    4.543    4.543
 fft_wrap_pw1pw2_140                451 13.1    1.280    1.286    4.165    4.177
 qs_ot_p2m_diag                      48 11.0    0.151    0.156    4.011    4.019
 grid_integrate_task_list           110 12.3    3.675    3.751    3.675    3.751
 qs_energies_init_hamiltonians       11  5.9    0.001    0.001    3.690    3.690
 density_rs2pw                      110  9.6    0.004    0.004    3.488    3.506
 hybrid_alltoall_any               4261 16.3    0.262    0.562    2.774    3.503
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    3.498    3.498
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    2.978    3.428
 make_images_data                  4110 15.4    0.045    0.048    2.760    3.371
 wfi_extrapolate                     11  7.9    0.001    0.001    3.300    3.300
 calculate_dm_sparse                110  9.5    0.001    0.001    3.176    3.200
 multiply_cannon_sync_h2d          8220 15.4    3.154    3.172    3.154    3.172
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    2.917    2.917
 cp_fm_diag_elpa_base                48 14.0    2.382    2.580    2.916    2.916
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    2.881    2.883
 fft3d_ps                          1111 14.6    1.299    1.310    2.814    2.827
 grid_collocate_task_list           110  9.6    2.666    2.679    2.666    2.679
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    2.623    2.640
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.374    2.389
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.268    2.270
 qs_env_update_s_mstruct             11  6.9    0.001    0.002    2.206    2.263
 potential_pw2rs                    110 12.3    0.021    0.021    2.211    2.215
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.001    2.104    2.200
 mp_alltoall_d11v                  2046 13.8    2.019    2.095    2.019    2.095
 cp_fm_cholesky_decompose            22 10.9    1.953    1.977    1.953    1.977
 qs_create_task_list                 11  7.9    0.010    0.011    1.904    1.947
 generate_qs_task_list               11  8.9    0.749    0.798    1.894    1.937
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    1.862    1.865
 copy_dbcsr_to_fm                   151 11.3    0.003    0.003    1.721    1.758
 jit_kernel_multiply                 10 15.2    1.494    1.740    1.494    1.740
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="210", plot="h2o_128_md", label="(8n/1r/12t)", y=86.662000, yerr=0.000000
PlotPoint: name="211", plot="h2o_128_md_mem", label="(8n/1r/12t)", y=1279.909091, yerr=66.267302
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/15/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1420239992832       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1943472701440       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1972057190400       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1977770336256       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2734287699968       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4416300122112       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5397700149248       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5443971710976       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6653743202304       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11528891191296       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15129160814592       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15129160814592       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19767995056128       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        93.514751E+12       0.0%      0.0%    100.0%
 flops max/rank                      1.094965E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6755938624       0.0%      0.0%    100.0%
 number of processed stacks              11950464       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     565.3
 marketing flops                   144.580175E+12
 -------------------------------------------------------------------------------
 # multiplications                           2507
 max memory usage/rank             630.571008E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                10348896
 MPI messages size (bytes):
  total size                         4.491514E+12
  min size                           0.000000E+00
  max size                           4.537280E+06
  average size                     434.009000E+03
 MPI breakdown and total messages size (bytes):
             size <=      128               65736                        0
       128 < size <=     8192                1232                 10092544
      8192 < size <=    32768             3576680              95640223744
     32768 < size <=   131072             1294784              74079797248
    131072 < size <=  4194304             5148576            3175954870160
   4194304 < size <= 16777216              261888            1145794321408
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3992                  57905.
 MP_Allreduce        11059                    797.
 MP_Sync                87
 MP_Alltoall          2226                1595625.
 MP_SendRecv         24320                  18752.
 MP_ISendRecv        24320                  18752.
 MP_Wait             42476
 MP_ISend            16020                 108028.
 MP_IRecv            16020                 108028.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.046    0.075  208.924  208.926
 qs_mol_dyn_low                       1  2.0    0.003    0.003  208.389  208.402
 qs_forces                           11  3.9    0.005    0.005  208.304  208.305
 qs_energies                         11  4.9    0.001    0.002  202.690  202.707
 scf_env_do_scf                      11  5.9    0.001    0.001  185.623  185.627
 scf_env_do_scf_inner_loop          117  6.6    0.003    0.008  164.910  164.912
 dbcsr_multiply_generic            2507 12.6    0.180    0.184  126.322  127.263
 qs_scf_new_mos                     117  7.6    0.001    0.001  125.384  125.665
 qs_scf_loop_do_ot                  117  8.6    0.001    0.001  125.383  125.664
 velocity_verlet                     10  3.0    0.001    0.002  124.717  124.718
 ot_scf_mini                        117  9.6    0.003    0.003  118.747  118.989
 multiply_cannon                   2507 13.6    0.239    0.247  102.201  104.163
 multiply_cannon_loop              2507 14.6    2.414    2.464  100.022  101.917
 ot_mini                            117 10.6    0.001    0.001   66.301   66.549
 multiply_cannon_multrec          60168 15.6   31.951   34.106   42.068   43.929
 qs_ot_get_derivative               117 11.6    0.001    0.001   41.349   41.595
 rebuild_ks_matrix                  128  8.3    0.001    0.001   33.683   33.993
 qs_ks_build_kohn_sham_matrix       128  9.3    0.016    0.018   33.683   33.992
 mp_waitall_1                    267128 16.5   28.917   31.737   28.917   31.737
 qs_ks_update_qs_env                128  7.6    0.001    0.001   30.256   30.537
 qs_ot_get_p                        128 10.4    0.001    0.001   30.161   30.366
 multiply_cannon_sync_h2d         60168 15.6   26.438   28.341   26.438   28.341
 apply_preconditioner_dbcsr         128 12.6    0.000    0.001   24.422   25.534
 apply_single                       128 13.6    0.001    0.001   24.422   25.534
 ot_diis_step                       117 11.6    0.008    0.008   24.723   24.724
 qs_ot_p2m_diag                      83 11.4    0.079    0.091   23.548   23.606
 cp_dbcsr_syevd                      83 12.4    0.005    0.005   20.682   20.683
 init_scf_loop                       11  6.9    0.000    0.001   20.637   20.639
 qs_ot_get_derivative_diag           77 12.4    0.002    0.002   19.825   20.023
 multiply_cannon_metrocomm3       60168 15.6    0.120    0.126   16.235   18.261
 cp_fm_diag_elpa                     83 13.4    0.000    0.000   17.670   17.702
 cp_fm_diag_elpa_base                83 14.4   17.602   17.642   17.666   17.700
 prepare_preconditioner              11  7.9    0.000    0.000   16.018   16.061
 make_preconditioner                 11  8.9    0.000    0.000   16.018   16.061
 make_full_inverse_cholesky          11  9.9    0.000    0.000   15.260   15.428
 make_m2s                          5014 13.6    0.105    0.115   13.864   14.249
 make_images                       5014 14.6    0.397    0.417   13.681   14.078
 sum_up_and_integrate               128 10.3    0.089    0.107   14.057   14.076
 integrate_v_rspace                 128 11.3    0.004    0.005   13.967   13.991
 qs_rho_update_rho_low              128  7.7    0.001    0.001   13.220   13.343
 calculate_rho_elec                 128  8.7    0.045    0.064   13.220   13.342
 init_scf_run                        11  5.9    0.000    0.001   12.918   12.919
 scf_env_initial_rho_setup           11  6.9    0.001    0.003   12.918   12.918
 dbcsr_mm_accdrv_process         124484 16.2    4.683    4.787    9.684   10.272
 mp_sum_l                          7950 12.9    9.022   10.035    9.022   10.035
 cp_fm_cholesky_invert               11 10.9    9.184    9.192    9.184    9.192
 wfi_extrapolate                     11  7.9    0.001    0.001    9.172    9.172
 calculate_dm_sparse                128  9.5    0.001    0.001    8.958    9.058
 multiply_cannon_metrocomm1       60168 15.6    0.096    0.101    6.153    8.702
 qs_ot_get_derivative_taylor         40 13.0    0.001    0.001    8.095    8.219
 qs_ot_get_orbitals                 117 10.6    0.001    0.001    8.126    8.214
 pw_transfer                       1547 11.6    0.075    0.087    7.681    7.872
 make_images_data                  5014 15.6    0.068    0.073    6.807    7.698
 fft_wrap_pw1pw2                   1291 12.7    0.010    0.011    7.478    7.666
 grid_integrate_task_list           128 12.3    7.047    7.509    7.047    7.509
 density_rs2pw                      128  9.7    0.006    0.007    6.873    7.392
 hybrid_alltoall_any               5200 16.5    0.297    2.279    5.952    7.141
 fft_wrap_pw1pw2_140                523 13.2    1.133    1.180    6.526    6.703
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.003    6.687    6.698
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    5.790    5.869
 fft3d_ps                          1291 14.7    2.192    2.836    5.477    5.815
 mp_alltoall_d11v                  2415 14.1    4.292    5.560    4.292    5.560
 grid_collocate_task_list           128  9.7    4.847    5.272    4.847    5.272
 cp_fm_cholesky_decompose            22 10.9    4.663    4.677    4.663    4.677
 potential_pw2rs                    128 12.3    0.009    0.010    4.541    4.565
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="400", plot="h2o_256_md", label="(8n/12r/1t)", y=208.926000, yerr=0.000000
PlotPoint: name="401", plot="h2o_256_md_mem", label="(8n/12r/1t)", y=596.909091, yerr=6.316710
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/16/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1430460020736       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1958505086976       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1986244964352       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1992000282624       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2753956716544       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4454954827776       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5444944789504       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5492290093056       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6712799002624       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11613089636352       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15239146475520       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15239146475520       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19911124992000       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        94.228663E+12       0.0%      0.0%    100.0%
 flops max/rank                      2.199914E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6806316384       0.0%      0.0%    100.0%
 number of processed stacks               6022464       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    1130.2
 marketing flops                   145.647559E+12
 -------------------------------------------------------------------------------
 # multiplications                           2527
 max memory usage/rank             845.172736E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 2425920
 MPI messages size (bytes):
  total size                         4.132350E+12
  min size                           0.000000E+00
  max size                          17.653760E+06
  average size                       1.703416E+06
 MPI breakdown and total messages size (bytes):
             size <=      128               14916                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768               71436               2336489472
     32768 < size <=   131072              728832              55956209664
    131072 < size <=  4194304             1386864            1409906900992
   4194304 < size <= 16777216              155760            1473826772352
  16777216 < size                           68112            1190343475200
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4024                  57903.
 MP_Allreduce        11138                    958.
 MP_Sync                87
 MP_Alltoall          1983                5056073.
 MP_SendRecv         12126                  47072.
 MP_ISendRecv        12126                  47072.
 MP_Wait             26114
 MP_ISend            11836                 212447.
 MP_IRecv            11836                 212447.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.017    0.033  194.234  194.236
 qs_mol_dyn_low                       1  2.0    0.003    0.003  193.839  193.854
 qs_forces                           11  3.9    0.004    0.005  193.424  193.425
 qs_energies                         11  4.9    0.002    0.007  186.636  186.649
 scf_env_do_scf                      11  5.9    0.001    0.001  169.552  169.562
 scf_env_do_scf_inner_loop          118  6.6    0.003    0.008  136.067  136.069
 velocity_verlet                     10  3.0    0.001    0.002  122.236  122.245
 dbcsr_multiply_generic            2527 12.6    0.190    0.196   98.786  100.000
 qs_scf_new_mos                     118  7.6    0.001    0.001   96.841   97.384
 qs_scf_loop_do_ot                  118  8.6    0.001    0.001   96.840   97.383
 ot_scf_mini                        118  9.6    0.004    0.005   91.970   92.594
 multiply_cannon                   2527 13.6    0.505    0.559   78.159   82.927
 multiply_cannon_loop              2527 14.6    1.594    1.673   75.079   77.896
 ot_mini                            118 10.6    0.001    0.001   50.819   51.364
 mp_waitall_1                    216438 16.6   24.963   38.931   24.963   38.931
 multiply_cannon_multrec          30324 15.6   21.181   26.099   31.976   37.644
 init_scf_loop                       11  6.9    0.001    0.006   33.395   33.398
 rebuild_ks_matrix                  129  8.3    0.001    0.001   32.751   33.349
 qs_ks_build_kohn_sham_matrix       129  9.3    0.017    0.020   32.751   33.348
 qs_ks_update_qs_env                129  7.6    0.001    0.001   29.487   30.037
 multiply_cannon_metrocomm3       30324 15.6    0.097    0.103   16.084   29.315
 qs_ot_get_derivative               118 11.6    0.001    0.002   28.545   29.157
 prepare_preconditioner              11  7.9    0.000    0.001   29.014   29.075
 make_preconditioner                 11  8.9    0.000    0.002   29.013   29.075
 make_full_inverse_cholesky          11  9.9    0.000    0.000   27.673   28.223
 qs_ot_get_p                        129 10.4    0.001    0.002   23.053   23.593
 apply_preconditioner_dbcsr         129 12.6    0.000    0.000   22.338   23.496
 apply_single                       129 13.6    0.001    0.001   22.338   23.496
 ot_diis_step                       118 11.6    0.015    0.015   22.094   22.096
 multiply_cannon_sync_h2d         30324 15.6   18.167   20.323   18.167   20.323
 qs_ot_p2m_diag                      83 11.4    0.189    0.217   18.033   18.075
 cp_fm_cholesky_invert               11 10.9   16.894   16.906   16.894   16.906
 cp_dbcsr_syevd                      83 12.4    0.006    0.006   16.802   16.803
 make_m2s                          5054 13.6    0.091    0.097   14.549   15.875
 make_images                       5054 14.6    1.198    1.394   14.338   15.663
 sum_up_and_integrate               129 10.3    0.117    0.133   14.345   14.374
 integrate_v_rspace                 129 11.3    0.004    0.004   14.228   14.264
 qs_rho_update_rho_low              129  7.7    0.001    0.001   13.664   13.695
 calculate_rho_elec                 129  8.7    0.088    0.104   13.663   13.695
 cp_fm_diag_elpa                     83 13.4    0.000    0.000   13.555   13.592
 cp_fm_diag_elpa_base                83 14.4   13.298   13.396   13.549   13.580
 init_scf_run                        11  5.9    0.000    0.001   12.246   12.247
 scf_env_initial_rho_setup           11  6.9    0.001    0.001   12.245   12.247
 qs_ot_get_derivative_diag           77 12.4    0.002    0.002   11.292   11.736
 dbcsr_mm_accdrv_process          62734 16.2    5.409    6.141   10.247   11.108
 multiply_cannon_metrocomm4       27797 15.6    0.106    0.121    3.869   10.857
 mp_irecv_dv                      70031 16.3    3.664   10.453    3.664   10.453
 make_images_data                  5054 15.6    0.068    0.076    8.611   10.215
 hybrid_alltoall_any               5240 16.5    0.354    1.551    7.270    9.636
 pw_transfer                       1559 11.6    0.085    0.097    8.792    8.848
 fft_wrap_pw1pw2                   1301 12.7    0.010    0.011    8.567    8.625
 wfi_extrapolate                     11  7.9    0.001    0.001    8.418    8.418
 density_rs2pw                      129  9.7    0.006    0.007    7.293    7.750
 grid_integrate_task_list           129 12.3    7.242    7.630    7.242    7.630
 fft_wrap_pw1pw2_140                527 13.2    1.234    1.270    7.525    7.601
 qs_ot_get_derivative_taylor         41 13.0    0.001    0.001    6.411    7.183
 cp_fm_cholesky_decompose            22 10.9    7.017    7.100    7.017    7.100
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    6.768    6.777
 calculate_dm_sparse                129  9.5    0.001    0.001    6.543    6.694
 fft3d_ps                          1301 14.7    2.857    3.018    6.094    6.131
 mp_sum_l                          8010 12.9    4.266    6.100    4.266    6.100
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    5.851    5.995
 qs_ot_get_orbitals                 118 10.6    0.001    0.001    5.461    5.551
 grid_collocate_task_list           129  9.7    5.043    5.389    5.043    5.389
 mp_alltoall_d11v                  2423 14.1    4.294    5.060    4.294    5.060
 mp_allgather_i34                  2527 14.6    1.594    4.760    1.594    4.760
 potential_pw2rs                    129 12.3    0.015    0.017    4.696    4.716
 dbcsr_complete_redistribute        395 12.7    0.781    0.901    3.424    4.277
 mp_sum_d                          4499 12.1    2.701    4.025    2.701    4.025
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="402", plot="h2o_256_md", label="(8n/6r/2t)", y=194.236000, yerr=0.000000
PlotPoint: name="403", plot="h2o_256_md_mem", label="(8n/6r/2t)", y=802.545455, yerr=6.035706
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/17/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1420239992832       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1943472701440       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1972057190400       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1977770336256       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2734287699968       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4416300122112       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5397700149248       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5443971710976       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6653743202304       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11528891191296       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15129160814592       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15129160814592       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19767995056128       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        93.514751E+12       0.0%      0.0%    100.0%
 flops max/rank                      2.928533E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6755938624       0.0%      0.0%    100.0%
 number of processed stacks               3984192       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    1695.7
 marketing flops                   144.579337E+12
 -------------------------------------------------------------------------------
 # multiplications                           2507
 max memory usage/rank             948.224000E+06
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                 1042912
 MPI messages size (bytes):
  total size                         2.716210E+12
  min size                           0.000000E+00
  max size                          26.214400E+06
  average size                       2.604448E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                6424                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                 264                  8650752
     32768 < size <=   131072              281856              36943429632
    131072 < size <=  4194304              660064             996105256960
   4194304 < size <= 16777216               65632             931530938576
  16777216 < size                           28672             751619276800
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3992                  58351.
 MP_Allreduce        11057                   1000.
 MP_Sync                87
 MP_Alltoall          1712                9388896.
 MP_SendRecv          7936                  75008.
 MP_ISendRecv         7936                  75008.
 MP_Wait             21820
 MP_ISend            11748                 275205.
 MP_IRecv            11748                 275205.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.038    0.074  177.084  177.087
 qs_mol_dyn_low                       1  2.0    0.003    0.003  176.295  176.569
 qs_forces                           11  3.9    0.006    0.013  176.117  176.121
 qs_energies                         11  4.9    0.004    0.013  169.574  169.584
 scf_env_do_scf                      11  5.9    0.001    0.002  153.970  153.972
 scf_env_do_scf_inner_loop          117  6.6    0.003    0.010  118.919  118.920
 velocity_verlet                     10  3.0    0.001    0.002  112.914  112.919
 dbcsr_multiply_generic            2507 12.6    0.179    0.184   81.952   83.122
 qs_scf_new_mos                     117  7.6    0.001    0.001   81.757   82.227
 qs_scf_loop_do_ot                  117  8.6    0.001    0.001   81.756   82.226
 ot_scf_mini                        117  9.6    0.004    0.005   77.528   78.039
 multiply_cannon                   2507 13.6    0.504    0.531   61.970   66.303
 multiply_cannon_loop              2507 14.6    1.132    1.203   59.209   62.080
 ot_mini                            117 10.6    0.001    0.001   42.740   43.249
 mp_waitall_1                    170520 16.6   25.087   35.177   25.087   35.177
 init_scf_loop                       11  6.9    0.006    0.023   34.952   34.954
 prepare_preconditioner              11  7.9    0.000    0.000   30.927   30.981
 make_preconditioner                 11  8.9    0.000    0.001   30.927   30.981
 rebuild_ks_matrix                  128  8.3    0.001    0.001   30.265   30.786
 qs_ks_build_kohn_sham_matrix       128  9.3    0.016    0.020   30.265   30.786
 make_full_inverse_cholesky          11  9.9    0.000    0.000   28.620   29.965
 qs_ks_update_qs_env                128  7.6    0.001    0.001   27.287   27.772
 multiply_cannon_multrec          20056 15.6   12.989   16.804   22.311   26.133
 multiply_cannon_metrocomm3       20056 15.6    0.062    0.066   15.596   25.138
 qs_ot_get_derivative               117 11.6    0.001    0.002   22.995   23.504
 apply_preconditioner_dbcsr         128 12.6    0.000    0.000   19.739   20.766
 apply_single                       128 13.6    0.001    0.001   19.738   20.765
 qs_ot_get_p                        128 10.4    0.001    0.001   20.115   20.608
 ot_diis_step                       117 11.6    0.018    0.018   19.633   19.634
 qs_ot_p2m_diag                      83 11.4    0.266    0.274   15.817   15.826
 make_m2s                          5014 13.6    0.080    0.085   14.926   15.723
 multiply_cannon_sync_h2d         20056 15.6   13.459   15.566   13.459   15.566
 make_images                       5014 14.6    1.158    1.252   14.694   15.489
 cp_dbcsr_syevd                      83 12.4    0.005    0.006   14.736   14.737
 cp_fm_cholesky_invert               11 10.9   14.595   14.604   14.595   14.604
 sum_up_and_integrate               128 10.3    0.132    0.146   14.097   14.120
 integrate_v_rspace                 128 11.3    0.004    0.004   13.965   13.989
 qs_rho_update_rho_low              128  7.7    0.001    0.001   13.605   13.633
 calculate_rho_elec                 128  8.7    0.130    0.145   13.605   13.632
 cp_fm_diag_elpa                     83 13.4    0.000    0.001   11.443   11.464
 cp_fm_diag_elpa_base                83 14.4   11.034   11.185   11.440   11.460
 init_scf_run                        11  5.9    0.000    0.001   10.543   10.544
 scf_env_initial_rho_setup           11  6.9    0.001    0.003   10.543   10.544
 make_images_data                  5014 15.6    0.063    0.072    8.944   10.180
 hybrid_alltoall_any               5200 16.5    0.451    2.043    7.623    9.679
 qs_ot_get_derivative_diag           77 12.4    0.002    0.002    9.105    9.473
 multiply_cannon_metrocomm4       17549 15.6    0.067    0.078    3.533    9.454
 mp_irecv_dv                      50230 16.2    3.405    9.204    3.405    9.204
 dbcsr_mm_accdrv_process          41502 16.2    5.593    5.910    8.786    8.926
 pw_transfer                       1547 11.6    0.085    0.104    8.743    8.843
 fft_wrap_pw1pw2                   1291 12.7    0.010    0.012    8.519    8.623
 grid_integrate_task_list           128 12.3    7.314    7.791    7.314    7.791
 fft_wrap_pw1pw2_140                523 13.2    1.313    1.351    7.539    7.653
 wfi_extrapolate                     11  7.9    0.001    0.001    7.477    7.478
 cp_fm_cholesky_decompose            22 10.9    7.340    7.371    7.340    7.371
 density_rs2pw                      128  9.7    0.006    0.006    6.998    7.369
 cp_fm_upper_to_full                105 14.8    5.612    7.044    5.612    7.044
 dbcsr_complete_redistribute        395 12.7    1.169    1.198    4.591    6.295
 fft3d_ps                          1291 14.7    2.754    2.979    5.884    5.954
 calculate_dm_sparse                128  9.5    0.001    0.001    5.827    5.894
 grid_collocate_task_list           128  9.7    5.155    5.542    5.155    5.542
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    5.462    5.466
 qs_ot_get_derivative_taylor         40 13.0    0.001    0.001    4.706    5.355
 copy_fm_to_dbcsr                   209 11.7    0.002    0.002    3.395    5.094
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    4.551    4.704
 mp_alltoall_d11v                  2415 14.1    4.334    4.695    4.334    4.695
 mp_sum_l                          7950 12.9    3.241    4.563    3.241    4.563
 mp_allgather_i34                  2507 14.6    1.260    4.562    1.260    4.562
 potential_pw2rs                    128 12.3    0.020    0.022    4.456    4.473
 qs_ot_get_orbitals                 117 10.6    0.001    0.001    4.013    4.040
 transfer_fm_to_dbcsr                11  9.9    0.019    0.024    2.287    3.939
 qs_energies_init_hamiltonians       11  5.9    0.001    0.004    3.765    3.767
 mp_alltoall_i22                    716 14.1    1.883    3.695    1.883    3.695
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="404", plot="h2o_256_md", label="(8n/4r/3t)", y=177.087000, yerr=0.000000
PlotPoint: name="405", plot="h2o_256_md_mem", label="(8n/4r/3t)", y=902.363636, yerr=8.336914
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/18/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1420242647040       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1943472701440       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1972057190400       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1977770336256       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2734287699968       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4416300122112       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5397700149248       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5443971710976       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6653743202304       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11528903135232       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15129160814592       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15129160814592       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19767995056128       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        93.514766E+12       0.0%      0.0%    100.0%
 flops max/rank                      4.353791E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6755941440       0.0%      0.0%    100.0%
 number of processed stacks               5977344       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    1130.3
 marketing flops                   144.580175E+12
 -------------------------------------------------------------------------------
 # multiplications                           2507
 max memory usage/rank               1.151242E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 1143192
 MPI messages size (bytes):
  total size                         2.023815E+12
  min size                           0.000000E+00
  max size                          17.653760E+06
  average size                       1.770320E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                6996                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                 396                  8650752
     32768 < size <=   131072              319024              36042702848
    131072 < size <=  4194304              715736             785529176064
   4194304 < size <= 16777216               70320             665379475120
  16777216 < size                           30720             536870912000
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4020                  57949.
 MP_Allreduce        11127                   1081.
 MP_Sync                87
 MP_Alltoall          1712               12503107.
 MP_SendRecv          5888                  75008.
 MP_ISendRecv         5888                  75008.
 MP_Wait             22442
 MP_ISend            14952                 244818.
 MP_IRecv            14952                 244818.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.023    0.052  190.256  190.256
 qs_mol_dyn_low                       1  2.0    0.003    0.004  189.647  189.660
 qs_forces                           11  3.9    0.004    0.005  188.467  188.475
 qs_energies                         11  4.9    0.003    0.007  181.334  181.345
 scf_env_do_scf                      11  5.9    0.001    0.002  163.207  163.219
 velocity_verlet                     10  3.0    0.014    0.028  125.986  126.033
 scf_env_do_scf_inner_loop          117  6.6    0.003    0.010  116.999  117.000
 dbcsr_multiply_generic            2507 12.6    0.187    0.198   81.926   82.740
 qs_scf_new_mos                     117  7.6    0.001    0.001   80.430   80.763
 qs_scf_loop_do_ot                  117  8.6    0.001    0.001   80.429   80.762
 ot_scf_mini                        117  9.6    0.003    0.004   75.869   76.227
 multiply_cannon                   2507 13.6    0.552    0.591   57.134   60.537
 multiply_cannon_loop              2507 14.6    1.847    1.941   53.305   55.114
 init_scf_loop                       11  6.9    0.001    0.002   46.086   46.087
 ot_mini                            117 10.6    0.001    0.001   43.099   43.455
 prepare_preconditioner              11  7.9    0.000    0.000   41.992   42.038
 make_preconditioner                 11  8.9    0.000    0.002   41.992   42.038
 make_full_inverse_cholesky          11  9.9    0.011    0.021   35.607   40.660
 multiply_cannon_multrec          30084 15.6   13.537   19.242   27.409   32.699
 rebuild_ks_matrix                  128  8.3    0.001    0.001   29.374   29.694
 qs_ks_build_kohn_sham_matrix       128  9.3    0.017    0.019   29.373   29.694
 mp_waitall_1                    147882 16.7   17.403   26.817   17.403   26.817
 qs_ks_update_qs_env                128  7.6    0.001    0.001   26.482   26.761
 qs_ot_get_derivative               117 11.6    0.002    0.002   23.250   23.612
 make_m2s                          5014 13.6    0.097    0.102   20.430   21.752
 make_images                       5014 14.6    1.964    2.323   20.121   21.441
 apply_preconditioner_dbcsr         128 12.6    0.000    0.001   19.283   19.935
 apply_single                       128 13.6    0.001    0.001   19.283   19.934
 ot_diis_step                       117 11.6    0.018    0.018   19.735   19.737
 qs_ot_get_p                        128 10.4    0.001    0.001   18.852   19.255
 cp_fm_upper_to_full                105 14.8   11.392   16.825   11.392   16.825
 cp_fm_cholesky_invert               11 10.9   16.074   16.083   16.074   16.083
 multiply_cannon_metrocomm3       30084 15.6    0.049    0.053    6.555   15.534
 qs_ot_p2m_diag                      83 11.4    0.343    0.390   14.711   14.762
 sum_up_and_integrate               128 10.3    0.140    0.153   14.087   14.116
 dbcsr_mm_accdrv_process          62264 16.2    8.445    9.330   13.439   13.992
 integrate_v_rspace                 128 11.3    0.004    0.004   13.947   13.980
 qs_rho_update_rho_low              128  7.7    0.001    0.001   13.692   13.733
 calculate_rho_elec                 128  8.7    0.173    0.188   13.691   13.733
 cp_dbcsr_syevd                      83 12.4    0.005    0.006   13.301   13.302
 make_images_data                  5014 15.6    0.076    0.153   10.956   13.059
 dbcsr_complete_redistribute        395 12.7    1.499    1.633    8.982   12.844
 hybrid_alltoall_any               5200 16.5    0.520    2.080    9.829   12.285
 init_scf_run                        11  5.9    0.000    0.001   11.959   11.960
 scf_env_initial_rho_setup           11  6.9    0.002    0.003   11.958   11.960
 copy_fm_to_dbcsr                   209 11.7    0.001    0.002    7.608   11.396
 multiply_cannon_sync_h2d         30084 15.6   10.487   11.280   10.487   11.280
 cp_fm_diag_elpa                     83 13.4    0.000    0.000   10.237   10.250
 cp_fm_diag_elpa_base                83 14.4    9.278    9.589   10.231   10.242
 qs_ot_get_derivative_diag           77 12.4    0.002    0.002    9.771   10.034
 transfer_fm_to_dbcsr                11  9.9    0.001    0.005    6.363   10.024
 mp_alltoall_i22                    716 14.1    5.659    9.393    5.659    9.393
 pw_transfer                       1547 11.6    0.086    0.102    8.920    9.002
 fft_wrap_pw1pw2                   1291 12.7    0.010    0.011    8.694    8.785
 fft_wrap_pw1pw2_140                523 13.2    1.453    1.490    7.749    7.841
 grid_integrate_task_list           128 12.3    7.559    7.835    7.559    7.835
 wfi_extrapolate                     11  7.9    0.001    0.001    7.652    7.652
 cp_fm_cholesky_decompose            22 10.9    7.392    7.504    7.392    7.504
 multiply_cannon_metrocomm4       25070 15.6    0.085    0.098    2.817    7.197
 density_rs2pw                      128  9.7    0.006    0.006    6.717    7.108
 mp_irecv_dv                      76098 16.2    2.661    6.914    2.661    6.914
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    6.531    6.611
 calculate_dm_sparse                128  9.5    0.001    0.001    6.387    6.480
 fft3d_ps                          1291 14.7    2.847    2.943    5.860    5.947
 grid_collocate_task_list           128  9.7    5.277    5.703    5.277    5.703
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.001    5.453    5.566
 mp_alltoall_d11v                  2415 14.1    4.782    5.315    4.782    5.315
 qs_energies_init_hamiltonians       11  5.9    0.001    0.001    4.628    4.628
 qs_ot_get_derivative_taylor         40 13.0    0.001    0.001    4.465    4.571
 qs_ot_get_orbitals                 117 10.6    0.001    0.001    4.252    4.294
 potential_pw2rs                    128 12.3    0.023    0.024    4.269    4.281
 mp_allgather_i34                  2507 14.6    1.910    4.199    1.910    4.199
 calculate_first_density_matrix       1  7.0    0.000    0.002    4.087    4.090
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="406", plot="h2o_256_md", label="(8n/3r/4t)", y=190.256000, yerr=0.000000
PlotPoint: name="407", plot="h2o_256_md_mem", label="(8n/3r/4t)", y=1088.727273, yerr=16.954972
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/19/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1420242647040       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1943472701440       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1972057190400       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1977770336256       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2734287699968       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4416300122112       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5397700149248       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5443971710976       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6653743202304       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11528903135232       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15129160814592       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15129160814592       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19767995056128       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        93.514766E+12       0.0%      0.0%    100.0%
 flops max/rank                      5.865089E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6755941440       0.0%      0.0%    100.0%
 number of processed stacks               1960712       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    3445.7
 marketing flops                   144.579337E+12
 -------------------------------------------------------------------------------
 # multiplications                           2507
 max memory usage/rank               1.525473E+09
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                  240672
 MPI messages size (bytes):
  total size                         1.331455E+12
  min size                           0.000000E+00
  max size                          52.428800E+06
  average size                       5.532238E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                1452                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                 132                  8650752
    131072 < size <=  4194304              113904              59718500352
   4194304 < size <= 16777216              104976             550376570880
  16777216 < size                           20208             721350232272
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         8931                     51.
 MP_Alltoall          9654                 799394.
 MP_ISend            40068                2102573.
 MP_IRecv            40068                2101676.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4002                  58203.
 MP_Allreduce        11082                   1166.
 MP_Sync                87
 MP_Alltoall          1712               18838222.
 MP_SendRecv          3840                 122880.
 MP_ISendRecv         3840                 122880.
 MP_Wait             16122
 MP_ISend            10680                 423556.
 MP_IRecv            10680                 423556.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.024    0.038  170.358  170.360
 qs_mol_dyn_low                       1  2.0    0.003    0.003  169.925  169.939
 qs_forces                           11  3.9    0.004    0.005  169.812  169.815
 qs_energies                         11  4.9    0.002    0.002  162.404  162.411
 scf_env_do_scf                      11  5.9    0.001    0.001  145.039  145.050
 velocity_verlet                     10  3.0    0.005    0.005  111.843  111.848
 scf_env_do_scf_inner_loop          117  6.6    0.003    0.009  109.394  109.395
 qs_scf_new_mos                     117  7.6    0.001    0.001   72.556   72.714
 qs_scf_loop_do_ot                  117  8.6    0.001    0.001   72.555   72.713
 dbcsr_multiply_generic            2507 12.6    0.181    0.192   72.139   72.436
 ot_scf_mini                        117  9.6    0.003    0.004   68.088   68.196
 multiply_cannon                   2507 13.6    0.561    0.593   53.367   56.985
 multiply_cannon_loop              2507 14.6    0.811    0.846   50.361   51.037
 ot_mini                            117 10.6    0.001    0.001   37.452   37.555
 init_scf_loop                       11  6.9    0.001    0.001   35.493   35.494
 prepare_preconditioner              11  7.9    0.000    0.000   31.599   31.628
 make_preconditioner                 11  8.9    0.000    0.000   31.599   31.628
 mp_waitall_1                    125778 16.7   24.661   30.883   24.661   30.883
 make_full_inverse_cholesky          11  9.9    0.013    0.027   29.473   29.751
 rebuild_ks_matrix                  128  8.3    0.001    0.001   29.171   29.298
 qs_ks_build_kohn_sham_matrix       128  9.3    0.017    0.018   29.170   29.298
 qs_ks_update_qs_env                128  7.6    0.001    0.001   26.601   26.726
 multiply_cannon_multrec          10028 15.6   10.405   14.517   18.015   20.932
 qs_ot_get_derivative               117 11.6    0.001    0.002   20.534   20.641
 multiply_cannon_metrocomm3       10028 15.6    0.026    0.027   12.418   19.613
 qs_ot_get_p                        128 10.4    0.001    0.001   17.887   17.979
 cp_fm_cholesky_invert               11 10.9   17.939   17.946   17.939   17.946
 apply_preconditioner_dbcsr         128 12.6    0.000    0.000   16.933   17.155
 apply_single                       128 13.6    0.001    0.001   16.933   17.155
 ot_diis_step                       117 11.6    0.020    0.020   16.848   16.848
 make_m2s                          5014 13.6    0.066    0.072   14.870   15.738
 make_images                       5014 14.6    2.184    2.618   14.565   15.437
 sum_up_and_integrate               128 10.3    0.182    0.194   14.220   14.273
 qs_ot_p2m_diag                      83 11.4    0.496    0.502   14.130   14.149
 integrate_v_rspace                 128 11.3    0.004    0.004   14.038   14.101
 qs_rho_update_rho_low              128  7.7    0.001    0.001   14.011   14.049
 calculate_rho_elec                 128  8.7    0.256    0.267   14.011   14.048
 cp_dbcsr_syevd                      83 12.4    0.005    0.006   12.934   12.935
 multiply_cannon_sync_h2d         10028 15.6   10.893   11.122   10.893   11.122
 init_scf_run                        11  5.9    0.000    0.001   10.486   10.487
 scf_env_initial_rho_setup           11  6.9    0.001    0.001   10.486   10.487
 cp_fm_diag_elpa                     83 13.4    0.000    0.000   10.013   10.023
 cp_fm_diag_elpa_base                83 14.4    9.775    9.851   10.009   10.019
 make_images_data                  5014 15.6    0.056    0.064    8.562    9.903
 hybrid_alltoall_any               5200 16.5    0.844    3.785    8.391    9.845
 pw_transfer                       1547 11.6    0.085    0.093    9.317    9.342
 fft_wrap_pw1pw2                   1291 12.7    0.010    0.011    9.096    9.128
 qs_ot_get_derivative_diag           77 12.4    0.002    0.003    8.071    8.154
 fft_wrap_pw1pw2_140                523 13.2    1.817    1.854    8.041    8.079
 cp_fm_cholesky_decompose            22 10.9    7.967    8.075    7.967    8.075
 grid_integrate_task_list           128 12.3    7.764    7.987    7.764    7.987
 dbcsr_mm_accdrv_process          20762 16.1    3.148    4.420    7.258    7.932
 multiply_cannon_metrocomm1       10028 15.6    0.030    0.030    4.429    7.524
 wfi_extrapolate                     11  7.9    0.001    0.001    7.362    7.362
 density_rs2pw                      128  9.7    0.005    0.006    6.742    7.023
 calculate_dm_sparse                128  9.5    0.001    0.001    6.160    6.220
 fft3d_ps                          1291 14.7    2.762    2.830    5.807    5.857
 grid_collocate_task_list           128  9.7    5.569    5.784    5.569    5.784
 dbcsr_complete_redistribute        395 12.7    2.115    2.160    5.291    5.680
 qs_energies_init_hamiltonians       11  5.9    0.010    0.015    5.319    5.320
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    5.252    5.261
 mp_alltoall_d11v                  2415 14.1    4.625    4.886    4.625    4.886
 mp_allgather_i34                  2507 14.6    1.188    4.615    1.188    4.615
 potential_pw2rs                    128 12.3    0.026    0.026    4.251    4.264
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    4.150    4.202
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.001    3.586    3.866
 multiply_cannon_metrocomm4        7521 15.6    0.027    0.029    1.732    3.749
 copy_fm_to_dbcsr                   209 11.7    0.001    0.002    3.408    3.739
 qs_ot_get_derivative_taylor         40 13.0    0.001    0.001    3.644    3.727
 mp_irecv_dv                      28860 15.9    1.692    3.672    1.692    3.672
 qs_ot_get_orbitals                 117 10.6    0.001    0.001    3.595    3.627
 copy_dbcsr_to_fm                   186 11.8    0.004    0.004    3.482    3.556
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    3.475    3.481
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="408", plot="h2o_256_md", label="(8n/2r/6t)", y=170.360000, yerr=0.000000
PlotPoint: name="409", plot="h2o_256_md_mem", label="(8n/2r/6t)", y=1438.818182, yerr=21.565200
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/20/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1410022121472       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1924145348608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1957871443968       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1963544850432       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2714615709696       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4377645416448       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5350455508992       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5395653328896       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6594687401984       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11444702699520       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15019188129792       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15019188129792       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19624853225472       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        92.796573E+12       0.0%      0.0%    100.0%
 flops max/rank                     11.606413E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6705499744       0.0%      0.0%    100.0%
 number of processed stacks               1947808       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    3442.6
 marketing flops                   143.507742E+12
 -------------------------------------------------------------------------------
 # multiplications                           2485
 max memory usage/rank               3.080552E+09
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                   99400
 MPI messages size (bytes):
  total size                         1.127422E+12
  min size                           0.000000E+00
  max size                         104.857600E+06
  average size                      11.342275E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                 572                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                  44                  2883584
    131072 < size <=  4194304               44768              34745614336
   4194304 < size <= 16777216               43984             376564613120
  16777216 < size                           10032             716108580288
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4003                  59121.
 MP_Allreduce        11085                   1504.
 MP_Sync                86
 MP_Alltoall          1700               36954383.
 MP_SendRecv          1778                 218624.
 MP_ISendRecv         1778                 218624.
 MP_Wait              9728
 MP_ISend             6360                1080477.
 MP_IRecv             6360                1080477.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.027    0.045  289.849  289.851
 qs_mol_dyn_low                       1  2.0    0.003    0.003  289.283  289.329
 qs_forces                           11  3.9    0.005    0.005  289.184  289.187
 qs_energies                         11  4.9    0.002    0.003  280.386  280.398
 scf_env_do_scf                      11  5.9    0.001    0.001  257.532  257.542
 velocity_verlet                     10  3.0    0.005    0.006  209.377  209.385
 scf_env_do_scf_inner_loop          116  6.6    0.004    0.009  133.061  133.063
 init_scf_loop                       11  6.9    0.023    0.053  124.197  124.200
 prepare_preconditioner              11  7.9    0.000    0.000  119.336  119.376
 make_preconditioner                 11  8.9    0.000    0.000  119.336  119.376
 make_full_inverse_cholesky          11  9.9    0.035    0.039   95.204  116.530
 qs_scf_new_mos                     116  7.6    0.001    0.001   89.823   89.900
 qs_scf_loop_do_ot                  116  8.6    0.001    0.001   89.822   89.900
 ot_scf_mini                        116  9.6    0.004    0.004   84.958   84.994
 dbcsr_multiply_generic            2485 12.5    0.210    0.222   81.524   81.919
 cp_fm_upper_to_full                104 14.8   52.146   75.240   52.146   75.240
 multiply_cannon                   2485 13.5    0.680    0.729   58.922   59.987
 multiply_cannon_loop              2485 14.5    1.042    1.053   54.950   56.067
 ot_mini                            116 10.6    0.001    0.001   44.032   44.067
 dbcsr_complete_redistribute        393 12.7    4.014    4.062   30.157   43.499
 copy_fm_to_dbcsr                   208 11.6    0.001    0.002   26.603   39.856
 transfer_fm_to_dbcsr                11  9.9    0.030    0.031   24.089   37.127
 mp_alltoall_i22                    712 14.1   21.887   35.309   21.887   35.309
 rebuild_ks_matrix                  127  8.3    0.001    0.001   33.863   33.906
 qs_ks_build_kohn_sham_matrix       127  9.3    0.018    0.018   33.862   33.905
 cp_fm_cholesky_invert               11 10.9   33.675   33.681   33.675   33.681
 mp_waitall_1                    102768 16.8   27.854   32.196   27.854   32.196
 qs_ks_update_qs_env                127  7.6    0.001    0.001   31.393   31.434
 qs_ot_get_p                        127 10.4    0.001    0.001   26.010   26.063
 qs_ot_get_derivative               116 11.6    0.002    0.002   24.547   24.583
 qs_ot_p2m_diag                      82 11.4    0.868    0.873   21.922   21.951
 multiply_cannon_metrocomm3        9940 15.5    0.025    0.026   19.128   20.293
 cp_dbcsr_syevd                      82 12.4    0.006    0.006   20.133   20.135
 make_m2s                          4970 13.5    0.076    0.078   18.046   19.458
 ot_diis_step                       116 11.6    0.021    0.022   19.452   19.452
 make_images                       4970 14.5    3.060    3.245   17.570   18.987
 apply_preconditioner_dbcsr         127 12.6    0.000    0.000   18.795   18.926
 apply_single                       127 13.6    0.001    0.001   18.795   18.926
 multiply_cannon_multrec           9940 15.5   10.233   12.012   17.969   18.056
 cp_fm_diag_elpa                     82 13.4    0.000    0.000   16.954   16.955
 cp_fm_diag_elpa_base                82 14.4   12.634   14.218   16.948   16.949
 qs_rho_update_rho_low              127  7.7    0.001    0.001   16.329   16.347
 calculate_rho_elec                 127  8.7    0.477    0.478   16.329   16.346
 sum_up_and_integrate               127 10.3    0.318    0.320   16.199   16.285
 integrate_v_rspace                 127 11.3    0.004    0.005   15.880   15.966
 multiply_cannon_sync_h2d          9940 15.5   14.196   14.234   14.196   14.234
 init_scf_run                        11  5.9    0.000    0.001   12.261   12.261
 scf_env_initial_rho_setup           11  6.9    0.001    0.001   12.261   12.261
 hybrid_alltoall_any               5155 16.4    1.301    3.036   10.190   12.241
 make_images_data                  4970 15.5    0.064    0.070    9.982   12.076
 pw_transfer                       1535 11.6    0.092    0.092   11.282   11.285
 fft_wrap_pw1pw2                   1281 12.7    0.011    0.011   11.049   11.051
 fft_wrap_pw1pw2_140                519 13.2    3.027    3.060    9.845    9.849
 qs_ot_get_derivative_diag           76 12.4    0.002    0.002    9.662    9.690
 dbcsr_mm_accdrv_process          20590 16.0    4.143    5.947    7.489    9.362
 cp_fm_cholesky_decompose            22 10.9    9.136    9.231    9.136    9.231
 wfi_extrapolate                     11  7.9    0.001    0.001    9.027    9.027
 grid_integrate_task_list           127 12.3    8.499    8.701    8.499    8.701
 qs_energies_init_hamiltonians       11  5.9    0.006    0.014    8.389    8.390
 density_rs2pw                      127  9.7    0.005    0.006    7.432    7.483
 mp_alltoall_d11v                  2401 14.1    6.922    7.058    6.922    7.058
 calculate_dm_sparse                127  9.5    0.001    0.001    6.734    6.810
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    6.538    6.596
 grid_collocate_task_list           127  9.7    6.364    6.448    6.364    6.448
 fft3d_ps                          1281 14.7    2.828    2.834    6.328    6.361
 copy_dbcsr_to_fm                   185 11.7    0.004    0.004    6.223    6.301
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="410", plot="h2o_256_md", label="(8n/1r/12t)", y=289.851000, yerr=0.000000
PlotPoint: name="411", plot="h2o_256_md_mem", label="(8n/1r/12t)", y=2821.818182, yerr=139.561533
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/21/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                      2.766000E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                419739       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   22952.9
 marketing flops                     1.742116E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               1.259442E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                  458208
 MPI messages size (bytes):
  total size                         3.456111E+12
  min size                           0.000000E+00
  max size                          18.735064E+06
  average size                       7.542668E+06
 MPI breakdown and total messages size (bytes):
             size <=      128              112896                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                 224                  5687808
     32768 < size <=   131072               10528                813356544
    131072 < size <=  4194304               36422              76284728544
   4194304 < size <= 16777216              294266            3312457683808
  16777216 < size                            3872              66548597808
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 255646.
 MP_Allreduce         3139                   6114.
 MP_Sync                 4
 MP_Alltoall            54
 MP_SendRecv           285                  19200.
 MP_ISendRecv          285                  19200.
 MP_Wait              1017
 MP_ISend              642                 197829.
 MP_IRecv              642                 197607.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.022    0.044   85.641   85.642
 qs_energies                          1  2.0    0.000    0.000   85.172   85.178
 ls_scf                               1  3.0    0.000    0.000   84.281   84.287
 dbcsr_multiply_generic             111  6.7    0.014    0.015   73.397   73.586
 multiply_cannon                    111  7.7    0.017    0.020   56.631   57.671
 multiply_cannon_loop               111  8.7    0.227    0.244   53.200   54.480
 ls_scf_main                          1  4.0    0.000    0.000   52.365   52.366
 density_matrix_trs4                  2  5.0    0.002    0.003   46.863   46.972
 ls_scf_init_scf                      1  4.0    0.000    0.000   28.877   28.879
 ls_scf_init_matrix_S                 1  5.0    0.000    0.000   27.744   27.803
 mp_waitall_1                     11031 10.9   22.571   25.834   22.571   25.834
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.001   25.669   25.691
 multiply_cannon_multrec           2664  9.7    8.123    8.949   15.562   17.388
 multiply_cannon_sync_h2d          2664  9.7   13.554   15.605   13.554   15.605
 make_m2s                           222  7.7    0.009    0.012   13.200   13.753
 make_images                        222  8.7    0.098    0.108   13.178   13.732
 multiply_cannon_metrocomm1        2664  9.7    0.010    0.011    9.674   13.260
 multiply_cannon_metrocomm3        2664  9.7    0.009    0.011    5.457    8.492
 make_images_data                   222  9.7    0.004    0.005    7.775    8.284
 dbcsr_mm_accdrv_process           4760 10.4    0.585    0.701    7.056    8.002
 hybrid_alltoall_any                227 10.6    0.215    1.839    6.644    7.988
 dbcsr_mm_accdrv_process_sort      4760 11.4    6.272    7.145    6.272    7.145
 calculate_norms                   4752  9.8    5.509    6.228    5.509    6.228
 apply_matrix_preconditioner          6  5.3    0.000    0.000    5.007    5.127
 mp_sum_l                           887  5.1    3.029    4.260    3.029    4.260
 multiply_cannon_metrocomm4        2442  9.7    0.012    0.015    2.044    3.753
 mp_irecv_dv                       6231 10.9    2.027    3.735    2.027    3.735
 make_images_sizes                  222  9.7    0.000    0.000    0.680    3.412
 mp_alltoall_i44                    222 10.7    0.680    3.412    0.680    3.412
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    2.273    3.260
 arnoldi_extremal                     4  6.8    0.000    0.000    3.167    3.201
 arnoldi_normal_ev                    4  7.8    0.001    0.003    3.167    3.201
 build_subspace                      16  8.4    0.009    0.012    3.075    3.077
 ls_scf_post                          1  4.0    0.000    0.000    3.038    3.045
 ls_scf_store_result                  1  5.0    0.000    0.000    2.865    2.901
 dbcsr_special_finalize             555  9.7    0.005    0.006    2.391    2.839
 dbcsr_merge_single_wm              555 10.7    0.458    0.589    2.382    2.830
 make_images_pack                   222  9.7    2.208    2.633    2.210    2.635
 dbcsr_sort_data                    658 11.4    2.181    2.575    2.181    2.575
 dbcsr_matrix_vector_mult           304  9.0    0.006    0.013    2.306    2.552
 dbcsr_matrix_vector_mult_local     304 10.0    2.066    2.462    2.068    2.464
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    2.285    2.352
 buffer_matrices_ensure_size        222  8.7    1.757    2.069    1.757    2.069
 compute_matrix_preconditioner        1  6.0    0.000    0.001    1.796    1.800
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    1.767    1.768
 rebuild_ks_matrix                    3  7.3    0.000    0.000    1.757    1.758
 qs_ks_build_kohn_sham_matrix         3  8.3    0.004    0.007    1.757    1.758
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="500", plot="h2o_32_nrep3_ls", label="(8n/12r/1t)", y=85.642000, yerr=0.000000
PlotPoint: name="501", plot="h2o_32_nrep3_ls_mem", label="(8n/12r/1t)", y=1140.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/22/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                      5.588524E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                368848       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   26119.8
 marketing flops                     1.742116E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               2.143425E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                  106560
 MPI messages size (bytes):
  total size                         2.699093E+12
  min size                           0.000000E+00
  max size                          72.286792E+06
  average size                      25.329324E+06
 MPI breakdown and total messages size (bytes):
             size <=      128               23040                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                3264                325830144
    131072 < size <=  4194304                5280               3328561104
   4194304 < size <= 16777216               12709             156766962056
  16777216 < size                           62267            2538670978840
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 266673.
 MP_Allreduce         3138                  10075.
 MP_Sync                 4
 MP_Alltoall            47               15335933.
 MP_SendRecv           141                  57600.
 MP_ISendRecv          141                  57600.
 MP_Wait               687
 MP_ISend              462                 414589.
 MP_IRecv              462                 413870.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.034    0.054   89.939   89.942
 qs_energies                          1  2.0    0.000    0.001   89.405   89.409
 ls_scf                               1  3.0    0.000    0.001   88.044   88.047
 dbcsr_multiply_generic             111  6.7    0.016    0.016   74.274   74.507
 multiply_cannon                    111  7.7    0.027    0.042   52.846   57.082
 ls_scf_main                          1  4.0    0.004    0.056   54.487   54.489
 multiply_cannon_loop               111  8.7    0.135    0.147   50.030   53.138
 density_matrix_trs4                  2  5.0    0.002    0.008   48.773   49.039
 ls_scf_init_scf                      1  4.0    0.000    0.002   29.848   29.850
 mp_waitall_1                      9105 10.9   20.777   29.677   20.777   29.677
 ls_scf_init_matrix_S                 1  5.0    0.000    0.001   28.644   28.748
 multiply_cannon_multrec           1332  9.7   13.302   17.113   22.636   27.591
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.002   26.286   26.295
 multiply_cannon_metrocomm3        1332  9.7    0.007    0.008   11.522   20.033
 make_m2s                           222  7.7    0.006    0.008   14.975   15.563
 make_images                        222  8.7    1.372    1.703   14.944   15.534
 dbcsr_mm_accdrv_process           4041 10.4    0.348    0.540    8.932   10.476
 dbcsr_mm_accdrv_process_sort      4041 11.4    8.452    9.936    8.452    9.936
 make_images_data                   222  9.7    0.004    0.005    8.703    9.633
 hybrid_alltoall_any                227 10.6    0.541    2.556    8.076    9.447
 mp_sum_l                           887  5.1    4.979    7.887    4.979    7.887
 multiply_cannon_metrocomm4        1221  9.7    0.007    0.009    3.194    7.752
 mp_irecv_dv                       3311 11.0    3.174    7.694    3.174    7.694
 multiply_cannon_sync_h2d          1332  9.7    4.905    6.861    4.905    6.861
 calculate_norms                   2376  9.8    6.072    6.808    6.072    6.808
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    3.739    6.062
 apply_matrix_preconditioner          6  5.3    0.000    0.000    4.970    5.203
 arnoldi_extremal                     4  6.8    0.000    0.000    4.583    4.600
 arnoldi_normal_ev                    4  7.8    0.001    0.005    4.583    4.600
 build_subspace                      16  8.4    0.014    0.021    4.331    4.334
 ls_scf_post                          1  4.0    0.000    0.002    3.709    3.712
 ls_scf_store_result                  1  5.0    0.000    0.000    3.410    3.534
 dbcsr_matrix_vector_mult           304  9.0    0.010    0.022    3.118    3.351
 dbcsr_matrix_vector_mult_local     304 10.0    2.738    3.221    2.740    3.222
 mp_allgather_i34                   111  8.7    0.799    2.976    0.799    2.976
 multiply_cannon_metrocomm1        1332  9.7    0.003    0.004    1.200    2.893
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    2.617    2.700
 dbcsr_data_new                    4174 10.1    2.115    2.398    2.115    2.398
 make_images_pack                   222  9.7    1.818    2.123    1.820    2.125
 dbcsr_sort_data                    436 11.2    1.792    2.001    1.792    2.001
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    1.859    1.861
 rebuild_ks_matrix                    3  7.3    0.000    0.000    1.846    1.848
 qs_ks_build_kohn_sham_matrix         3  8.3    0.003    0.004    1.846    1.848
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="502", plot="h2o_32_nrep3_ls", label="(8n/6r/2t)", y=89.942000, yerr=0.000000
PlotPoint: name="503", plot="h2o_32_nrep3_ls_mem", label="(8n/6r/2t)", y=1774.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/23/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                      8.404608E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                353133       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   27282.1
 marketing flops                     1.742118E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               2.881233E+09
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                   46176
 MPI messages size (bytes):
  total size                         1.924064E+12
  min size                           0.000000E+00
  max size                         108.059888E+06
  average size                      41.668048E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                9984                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                   0                        0
    131072 < size <=  4194304                3328               1170063360
   4194304 < size <= 16777216                1870              19378539600
  16777216 < size                           30994            1903514987232
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 265448.
 MP_Allreduce         3138                  10896.
 MP_Sync                 4
 MP_Alltoall            47               23526250.
 MP_SendRecv            93                  57600.
 MP_ISendRecv           93                  57600.
 MP_Wait               639
 MP_ISend              462                 560046.
 MP_IRecv              462                 560662.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.038    0.063   93.613   93.614
 qs_energies                          1  2.0    0.000    0.001   92.953   92.976
 ls_scf                               1  3.0    0.000    0.001   91.525   91.546
 dbcsr_multiply_generic             111  6.7    0.017    0.021   76.201   76.583
 ls_scf_main                          1  4.0    0.000    0.005   57.006   57.009
 multiply_cannon                    111  7.7    0.034    0.077   52.584   56.800
 multiply_cannon_loop               111  8.7    0.117    0.130   49.805   54.039
 density_matrix_trs4                  2  5.0    0.002    0.004   51.096   51.299
 mp_waitall_1                      7281 11.0   24.037   33.944   24.037   33.944
 ls_scf_init_scf                      1  4.0    0.000    0.001   30.846   30.849
 ls_scf_init_matrix_S                 1  5.0    0.000    0.000   29.564   29.686
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.002   27.211   27.228
 multiply_cannon_multrec            888  9.7   12.712   15.365   21.488   24.856
 multiply_cannon_metrocomm3         888  9.7    0.004    0.004   11.201   23.478
 make_m2s                           222  7.7    0.006    0.008   16.580   17.373
 make_images                        222  8.7    1.590    1.857   16.542   17.331
 make_images_data                   222  9.7    0.004    0.005    9.740   10.779
 hybrid_alltoall_any                227 10.6    0.642    2.950    9.181   10.436
 dbcsr_mm_accdrv_process           3754 10.4    0.438    1.069    8.299    9.848
 mp_sum_l                           887  5.1    5.336    8.979    5.336    8.979
 dbcsr_mm_accdrv_process_sort      3754 11.4    7.724    8.904    7.724    8.904
 multiply_cannon_metrocomm1         888  9.7    0.003    0.003    3.741    7.309
 multiply_cannon_sync_h2d           888  9.7    6.131    7.121    6.131    7.121
 multiply_cannon_metrocomm4         777  9.7    0.004    0.005    2.430    7.088
 mp_irecv_dv                       2335 11.1    2.415    7.046    2.415    7.046
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    4.140    7.042
 arnoldi_extremal                     4  6.8    0.000    0.000    5.048    5.070
 arnoldi_normal_ev                    4  7.8    0.001    0.006    5.048    5.070
 apply_matrix_preconditioner          6  5.3    0.000    0.000    4.766    5.017
 calculate_norms                   1584  9.8    4.339    4.755    4.339    4.755
 build_subspace                      16  8.4    0.014    0.021    4.742    4.748
 mp_allgather_i34                   111  8.7    0.905    3.833    0.905    3.833
 dbcsr_matrix_vector_mult           304  9.0    0.010    0.021    3.423    3.754
 ls_scf_post                          1  4.0    0.000    0.000    3.673    3.695
 dbcsr_matrix_vector_mult_local     304 10.0    3.021    3.589    3.023    3.591
 ls_scf_store_result                  1  5.0    0.000    0.000    3.411    3.495
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    2.841    2.931
 dbcsr_data_new                    4116  9.9    2.105    2.449    2.105    2.449
 dbcsr_sort_data                    325 11.1    1.901    2.167    1.901    2.167
 make_images_sizes                  222  9.7    0.000    0.000    0.990    2.133
 mp_alltoall_i44                    222 10.7    0.989    2.132    0.989    2.132
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    1.930    1.932
 dbcsr_finalize                     304  7.8    0.026    0.032    1.618    1.917
 rebuild_ks_matrix                    3  7.3    0.000    0.000    1.911    1.913
 qs_ks_build_kohn_sham_matrix         3  8.3    0.001    0.004    1.911    1.913
 make_images_pack                   222  9.7    1.624    1.882    1.627    1.885
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="504", plot="h2o_32_nrep3_ls", label="(8n/4r/3t)", y=93.614000, yerr=0.000000
PlotPoint: name="505", plot="h2o_32_nrep3_ls_mem", label="(8n/4r/3t)", y=2294.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/24/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                     10.747127E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                369794       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   26053.0
 marketing flops                     1.742116E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               3.293651E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                   50616
 MPI messages size (bytes):
  total size                         1.536549E+12
  min size                           0.000000E+00
  max size                          72.286792E+06
  average size                      30.356986E+06
 MPI breakdown and total messages size (bytes):
             size <=      128               10368                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                1056                104411904
    131072 < size <=  4194304                3168                831638784
   4194304 < size <= 16777216                3103              33613273640
  16777216 < size                           32921            1501999894888
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 266673.
 MP_Allreduce         3138                  13030.
 MP_Sync                 4
 MP_Alltoall            47               30278988.
 MP_SendRecv            69                  86400.
 MP_ISendRecv           69                  86400.
 MP_Wait               531
 MP_ISend              378                 823502.
 MP_IRecv              378                 823753.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.042    0.054   98.093   98.096
 qs_energies                          1  2.0    0.000    0.001   97.395   97.402
 ls_scf                               1  3.0    0.000    0.001   95.631   95.635
 dbcsr_multiply_generic             111  6.7    0.018    0.021   79.432   79.672
 ls_scf_main                          1  4.0    0.001    0.007   59.417   59.418
 multiply_cannon                    111  7.7    0.056    0.131   52.250   56.503
 density_matrix_trs4                  2  5.0    0.002    0.005   53.323   53.445
 multiply_cannon_loop               111  8.7    0.153    0.165   47.250   50.763
 ls_scf_init_scf                      1  4.0    0.000    0.001   32.946   32.948
 ls_scf_init_matrix_S                 1  5.0    0.000    0.001   31.731   31.802
 mp_waitall_1                      6369 11.0   23.286   29.867   23.286   29.867
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.001   28.906   28.927
 multiply_cannon_multrec           1332  9.7   14.104   17.370   22.177   24.793
 make_m2s                           222  7.7    0.007    0.008   21.322   22.679
 make_images                        222  8.7    3.152    3.630   21.271   22.630
 multiply_cannon_metrocomm3        1332  9.7    0.003    0.004    9.606   17.400
 make_images_data                   222  9.7    0.004    0.004   11.872   13.395
 hybrid_alltoall_any                227 10.6    0.803    3.761   11.262   12.869
 dbcsr_mm_accdrv_process           3641 10.4    0.285    0.480    7.698    9.142
 dbcsr_mm_accdrv_process_sort      3641 11.4    7.157    8.642    7.157    8.642
 mp_sum_l                           887  5.1    4.271    8.209    4.271    8.209
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    3.341    6.548
 multiply_cannon_metrocomm4        1110  9.7    0.005    0.007    2.082    6.173
 mp_irecv_dv                       3229 10.9    2.056    6.100    2.056    6.100
 multiply_cannon_sync_h2d          1332  9.7    5.458    5.960    5.458    5.960
 multiply_cannon_metrocomm1        1332  9.7    0.003    0.003    2.640    5.335
 arnoldi_extremal                     4  6.8    0.000    0.000    5.204    5.223
 arnoldi_normal_ev                    4  7.8    0.001    0.005    5.204    5.223
 apply_matrix_preconditioner          6  5.3    0.000    0.000    4.872    5.046
 build_subspace                      16  8.4    0.015    0.021    4.864    4.872
 calculate_norms                   2376  9.8    4.184    4.540    4.184    4.540
 mp_allgather_i34                   111  8.7    2.119    4.303    2.119    4.303
 dbcsr_matrix_vector_mult           304  9.0    0.010    0.021    3.577    3.867
 dbcsr_matrix_vector_mult_local     304 10.0    3.193    3.680    3.195    3.682
 dbcsr_sort_data                    658 11.4    3.101    3.500    3.101    3.500
 ls_scf_post                          1  4.0    0.000    0.001    3.268    3.275
 dbcsr_special_finalize             555  9.7    0.006    0.007    2.845    3.252
 dbcsr_merge_single_wm              555 10.7    0.537    0.666    2.836    3.243
 ls_scf_store_result                  1  5.0    0.000    0.000    3.005    3.070
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    3.001    3.056
 dbcsr_data_release               10477 10.7    1.595    2.401    1.595    2.401
 dbcsr_finalize                     304  7.8    0.050    0.061    1.814    2.002
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="506", plot="h2o_32_nrep3_ls", label="(8n/3r/4t)", y=98.096000, yerr=0.000000
PlotPoint: name="507", plot="h2o_32_nrep3_ls_mem", label="(8n/3r/4t)", y=2711.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/25/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                     15.383312E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                336818       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   28603.7
 marketing flops                     1.742118E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               4.759831E+09
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                   10656
 MPI messages size (bytes):
  total size                         1.149035E+12
  min size                           0.000000E+00
  max size                         203.538048E+06
  average size                     107.829832E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                2304                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                   0                        0
    131072 < size <=  4194304                 768                702038016
   4194304 < size <= 16777216                   0                        0
  16777216 < size                            7584            1148332810224
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast                2                     12.
 MP_Allreduce          705                    128.
 MP_Alltoall           310               12920694.
 MP_ISend             1776               40180424.
 MP_IRecv             1776               40465030.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 265536.
 MP_Allreduce         3129                  15263.
 MP_Sync                 4
 MP_Alltoall            47               46208988.
 MP_SendRecv            45                 115200.
 MP_ISendRecv           45                 115200.
 MP_Wait               528
 MP_ISend              420                 924980.
 MP_IRecv              420                 924528.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.041    0.056   91.631   91.632
 qs_energies                          1  2.0    0.000    0.000   90.891   90.893
 ls_scf                               1  3.0    0.000    0.000   88.878   88.879
 dbcsr_multiply_generic             111  6.7    0.017    0.019   70.488   70.682
 ls_scf_main                          1  4.0    0.000    0.000   56.314   56.315
 multiply_cannon                    111  7.7    0.075    0.167   52.192   55.817
 multiply_cannon_loop               111  8.7    0.088    0.095   49.634   51.164
 density_matrix_trs4                  2  5.0    0.002    0.003   49.324   49.421
 ls_scf_init_scf                      1  4.0    0.000    0.000   29.138   29.138
 mp_waitall_1                      5436 11.0   24.079   28.395   24.079   28.395
 ls_scf_init_matrix_S                 1  5.0    0.000    0.000   27.908   27.943
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.001   25.806   25.824
 multiply_cannon_multrec            444  9.7   13.686   16.100   20.818   22.958
 multiply_cannon_metrocomm1         444  9.7    0.002    0.002   10.249   15.149
 make_m2s                           222  7.7    0.005    0.005   13.580   14.496
 make_images                        222  8.7    2.044    2.483   13.512   14.428
 multiply_cannon_metrocomm3         444  9.7    0.001    0.002    6.326   14.427
 hybrid_alltoall_any                227 10.6    0.803    3.847    8.101    9.726
 make_images_data                   222  9.7    0.003    0.004    8.324    9.605
 multiply_cannon_sync_h2d           444  9.7    6.723    7.984    6.723    7.984
 dbcsr_mm_accdrv_process           3003 10.4    0.362    0.399    6.829    7.966
 dbcsr_mm_accdrv_process_sort      3003 11.4    6.468    7.569    6.468    7.569
 arnoldi_extremal                     4  6.8    0.000    0.000    5.843    5.861
 arnoldi_normal_ev                    4  7.8    0.002    0.005    5.843    5.861
 build_subspace                      16  8.4    0.015    0.019    5.445    5.453
 mp_sum_l                           887  5.1    2.715    4.886    2.715    4.886
 apply_matrix_preconditioner          6  5.3    0.000    0.000    4.481    4.679
 dbcsr_matrix_vector_mult           304  9.0    0.011    0.021    4.179    4.397
 dbcsr_matrix_vector_mult_local     304 10.0    3.703    4.178    3.705    4.180
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    1.901    3.869
 multiply_cannon_metrocomm4         333  9.7    0.001    0.002    1.555    3.798
 mp_irecv_dv                       1241 11.2    1.541    3.772    1.541    3.772
 calculate_norms                    792  9.8    3.621    3.723    3.621    3.723
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    3.531    3.623
 mp_allgather_i34                   111  8.7    1.135    3.603    1.135    3.603
 ls_scf_post                          1  4.0    0.000    0.000    3.426    3.428
 make_images_sizes                  222  9.7    0.000    0.000    0.875    3.335
 mp_alltoall_i44                    222 10.7    0.875    3.335    0.875    3.335
 ls_scf_store_result                  1  5.0    0.000    0.000    3.209    3.244
 dbcsr_finalize                     304  7.8    0.062    0.077    2.194    2.288
 dbcsr_data_new                    4608  9.7    1.790    2.217    1.790    2.217
 dbcsr_merge_all                    275  8.9    0.479    0.525    2.052    2.129
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    2.022    2.023
 qs_energies_init_hamiltonians        1  3.0    0.005    0.026    1.997    1.997
 rebuild_ks_matrix                    3  7.3    0.000    0.000    1.989    1.990
 qs_ks_build_kohn_sham_matrix         3  8.3    0.001    0.002    1.989    1.990
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="508", plot="h2o_32_nrep3_ls", label="(8n/2r/6t)", y=91.632000, yerr=0.000000
PlotPoint: name="509", plot="h2o_32_nrep3_ls_mem", label="(8n/2r/6t)", y=3691.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/26/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                     30.358840E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                339931       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   28341.7
 marketing flops                     1.742118E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               8.840839E+09
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                    4440
 MPI messages size (bytes):
  total size                       770.525954E+09
  min size                           0.000000E+00
  max size                         399.069120E+06
  average size                     173.541888E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                 640                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                   0                        0
    131072 < size <=  4194304                 640                468025344
   4194304 < size <= 16777216                   0                        0
  16777216 < size                            3160             770057961712
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 284089.
 MP_Allreduce         3123                  21388.
 MP_Sync                 4
 MP_Alltoall            47               88727262.
 MP_SendRecv            42                 732600.
 MP_ISendRecv           42                 732600.
 MP_Wait               267
 MP_ISend              180                3337386.
 MP_IRecv              180                3339494.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.098    0.109  105.534  105.538
 qs_energies                          1  2.0    0.000    0.000  104.082  104.095
 ls_scf                               1  3.0    0.000    0.000  101.149  101.161
 dbcsr_multiply_generic             111  6.7    0.024    0.026   74.881   75.023
 ls_scf_main                          1  4.0    0.000    0.000   63.820   63.821
 density_matrix_trs4                  2  5.0    0.002    0.003   54.869   54.936
 multiply_cannon                    111  7.7    0.115    0.157   48.686   50.985
 multiply_cannon_loop               111  8.7    0.098    0.100   45.700   46.078
 ls_scf_init_scf                      1  4.0    0.000    0.000   33.533   33.534
 ls_scf_init_matrix_S                 1  5.0    0.000    0.000   32.030   32.042
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.001   29.324   29.335
 mp_waitall_1                      4527 11.1   22.013   25.803   22.013   25.803
 make_m2s                           222  7.7    0.005    0.005   22.593   23.690
 make_images                        222  8.7    3.579    3.889   22.486   23.580
 multiply_cannon_multrec            444  9.7   17.849   18.427   22.540   23.170
 hybrid_alltoall_any                227 10.6    1.656    3.629   12.759   15.669
 make_images_data                   222  9.7    0.003    0.004   12.996   15.128
 multiply_cannon_metrocomm3         444  9.7    0.001    0.001   10.434   10.844
 multiply_cannon_sync_h2d           444  9.7    8.793    8.831    8.793    8.831
 arnoldi_extremal                     4  6.8    0.000    0.000    7.412    7.424
 arnoldi_normal_ev                    4  7.8    0.003    0.009    7.412    7.424
 build_subspace                      16  8.4    0.026    0.035    6.856    6.868
 dbcsr_matrix_vector_mult           304  9.0    0.017    0.034    5.497    5.644
 dbcsr_matrix_vector_mult_local     304 10.0    5.062    5.355    5.065    5.358
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    5.094    5.189
 apply_matrix_preconditioner          6  5.3    0.000    0.000    4.854    5.099
 dbcsr_mm_accdrv_process           1814 10.4    0.294    0.366    4.502    4.624
 dbcsr_mm_accdrv_process_sort      1814 11.4    4.162    4.291    4.162    4.291
 ls_scf_post                          1  4.0    0.000    0.000    3.796    3.809
 make_images_sizes                  222  9.7    0.000    0.000    1.460    3.619
 mp_alltoall_i44                    222 10.7    1.460    3.619    1.460    3.619
 mp_allgather_i34                   111  8.7    1.106    3.604    1.106    3.604
 ls_scf_store_result                  1  5.0    0.000    0.000    3.516    3.551
 calculate_norms                    792  9.8    3.230    3.275    3.230    3.275
 dbcsr_finalize                     304  7.8    0.082    0.089    3.081    3.184
 dbcsr_merge_all                    275  8.9    0.887    0.917    2.866    2.959
 qs_energies_init_hamiltonians        1  3.0    0.001    0.001    2.903    2.903
 dbcsr_complete_redistribute          5  7.6    1.437    1.478    2.754    2.871
 matrix_ls_to_qs                      2  6.0    0.000    0.000    2.399    2.523
 dbcsr_sort_data                    325 11.1    2.440    2.513    2.440    2.513
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    2.420    2.422
 dbcsr_data_new                    6591  9.6    1.880    2.360    1.880    2.360
 rebuild_ks_matrix                    3  7.3    0.000    0.000    2.354    2.356
 qs_ks_build_kohn_sham_matrix         3  8.3    0.002    0.002    2.354    2.356
 dbcsr_new_transposed                 4  7.5    0.240    0.252    2.305    2.328
 dbcsr_frobenius_norm                74  6.6    2.055    2.131    2.182    2.203
 dbcsr_add_d                        103  6.2    0.000    0.000    2.126    2.203
 dbcsr_add_anytype                  103  7.2    0.859    0.890    2.126    2.202
 dbcsr_data_release               12724 10.6    1.983    2.195    1.983    2.195
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="510", plot="h2o_32_nrep3_ls", label="(8n/1r/12t)", y=105.538000, yerr=0.000000
PlotPoint: name="511", plot="h2o_32_nrep3_ls_mem", label="(8n/1r/12t)", y=6957.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/cd366a4022ad0ce9b8ea778ed4e993fe588c3c12_performance_tests/27/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32        7009386627072       0.0%      0.0%    100.0%
 flops     9 x     9 x    32        7335108845568       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        9866241589248       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        9884108906496       0.0%      0.0%    100.0%
 flops    22 x    22 x    32       13354440523776       0.0%      0.0%    100.0%
 flops    32 x    32 x     9       20607185977344       0.0%      0.0%    100.0%
 flops    32 x    32 x    22       25186560638976       0.0%      0.0%    100.0%
 flops     9 x    32 x    32       28458319085568       0.0%      0.0%    100.0%
 flops    22 x    32 x    32       34782389993472       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       42881542373376       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       55680402235392       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       55680402235392       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       72328573419520       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       383.054662E+12       0.0%      0.0%    100.0%
 flops max/rank                    733.641090E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                        26899403712       0.0%      0.0%    100.0%
 number of processed stacks             118860288       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     226.3
 marketing flops                   780.439111E+12
 -------------------------------------------------------------------------------
 # multiplications                           1445
 max memory usage/rank             592.891904E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged               102097920
 MPI messages size (bytes):
  total size                        37.227590E+12
  min size                           0.000000E+00
  max size                           4.551360E+06
  average size                     364.626312E+03
 MPI breakdown and total messages size (bytes):
             size <=      128              731472                        0
       128 < size <=     8192            11922720              97670922240
      8192 < size <=    32768            24718992             614677610496
     32768 < size <=   131072            20000256            1970081366016
    131072 < size <=  4194304            42515668           24886801223040
   4194304 < size <= 16777216             2208812            9656099886720
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4640                  78072.
 MP_Allreduce        13232                   2081.
 MP_Sync              1064
 MP_Alltoall          2588                4037197.
 MP_SendRecv        168740                  11136.
 MP_ISendRecv        92040                  11136.
 MP_Wait            102830
 MP_comm_split          40
 MP_ISend            26090                  85106.
 MP_IRecv            37890                  59644.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.070    0.229  230.120  230.153
 qs_mol_dyn_low                       1  2.0    0.005    0.030  228.840  228.874
 qs_forces                            5  3.8    0.005    0.026  228.732  228.750
 qs_energies                          5  4.8    0.003    0.033  225.663  225.684
 scf_env_do_scf                       5  5.8    0.001    0.003  212.472  212.475
 scf_env_do_scf_inner_loop          105  6.6    0.002    0.006  183.947  183.948
 qs_scf_new_mos                     105  7.6    0.000    0.001  143.135  143.288
 qs_scf_loop_do_ot                  105  8.6    0.001    0.001  143.135  143.288
 ot_scf_mini                        105  9.6    0.003    0.004  133.279  133.421
 dbcsr_multiply_generic            1445 12.2    0.126    0.137  132.862  133.303
 multiply_cannon                   1445 13.2    0.274    0.287  113.507  115.470
 multiply_cannon_loop              1445 14.2    2.855    3.014  111.838  113.022
 velocity_verlet                      4  3.0    0.001    0.013  107.788  107.789
 ot_mini                            105 10.6    0.001    0.003   60.705   60.826
 multiply_cannon_multrec          69360 15.2   29.782   34.868   39.737   45.079
 qs_ot_get_p                        112 10.4    0.001    0.001   42.868   43.206
 mp_waitall_1                    488190 16.1   35.560   42.358   35.560   42.358
 qs_ot_get_derivative                55 11.6    0.001    0.001   38.997   39.139
 multiply_cannon_metrocomm3       69360 15.2    0.201    0.210   26.228   33.358
 multiply_cannon_sync_h2d         69360 15.2   29.035   33.198   29.035   33.198
 qs_ot_p2m_diag                      40 11.0    0.020    0.031   31.754   31.844
 rebuild_ks_matrix                  110  8.4    0.000    0.000   29.409   29.607
 qs_ks_build_kohn_sham_matrix       110  9.4    0.012    0.015   29.409   29.606
 cp_dbcsr_syevd                      40 12.0    0.002    0.005   28.532   28.533
 init_scf_loop                        7  6.6    0.001    0.006   28.488   28.490
 qs_ks_update_qs_env                112  7.6    0.001    0.001   26.953   27.135
 cp_fm_syevd                         40 13.0    0.000    0.002   23.377   23.529
 prepare_preconditioner               7  7.6    0.000    0.000   23.479   23.510
 make_preconditioner                  7  8.6    0.000    0.004   23.479   23.510
 apply_preconditioner_dbcsr          62 12.6    0.000    0.001   23.004   23.251
 apply_single                        62 13.6    0.000    0.000   23.004   23.250
 ot_new_cg_direction                 55 11.6    0.001    0.003   21.035   21.035
 cp_fm_redistribute_end              40 14.0    9.332   18.615    9.337   18.616
 cp_fm_syevd_base                    40 14.0    9.272   18.557    9.272   18.557
 qs_rho_update_rho_low              110  7.6    0.001    0.001   17.415   17.758
 calculate_rho_elec                 110  8.6    0.029    0.032   17.414   17.758
 make_full_inverse_cholesky           7  9.6    0.000    0.001   15.233   15.297
 qs_ot_get_orbitals                 105 10.6    0.001    0.001   14.522   14.651
 qs_ot_get_derivative_taylor         37 12.8    0.001    0.001   13.982   14.075
 mp_sum_l                          4764 12.2   12.342   13.085   12.342   13.085
 pw_transfer                       1645 12.4    0.080    0.094   12.249   12.448
 fft_wrap_pw1pw2                   1425 13.5    0.012    0.015   12.110   12.313
 density_rs2pw                      110  9.6    0.005    0.007   11.670   11.991
 calculate_dm_sparse                110  9.5    0.000    0.001   11.428   11.622
 dbcsr_mm_accdrv_process         154766 15.8    6.177    6.365    9.822   11.257
 fft_wrap_pw1pw2_240                915 15.0    1.149    1.224   10.630   10.816
 qs_vxc_create                      110 10.4    0.003    0.009   10.726   10.761
 cp_fm_cholesky_invert                7 10.6   10.696   10.704   10.696   10.704
 qs_ot_get_derivative_diag           18 12.0    0.000    0.001   10.590   10.667
 init_scf_run                         5  5.8    0.000    0.001   10.501   10.502
 scf_env_initial_rho_setup            5  6.8    0.001    0.003   10.501   10.502
 check_diag                          80 13.5    8.594    8.892    9.538    9.689
 fft3d_pb                           915 16.0    2.384    2.577    8.832    9.026
 sum_up_and_integrate                60 10.3    0.028    0.034    8.660    8.672
 integrate_v_rspace                  60 11.3    0.002    0.002    8.632    8.645
 transfer_rs2pw                     445 10.6    0.007    0.008    8.209    8.608
 make_full_single_inverse             7  9.6    0.001    0.007    7.889    7.919
 xc_rho_set_and_dset_create         110 12.4    0.077    0.099    7.495    7.736
 multiply_cannon_metrocomm1       69360 15.2    0.099    0.105    4.830    7.540
 xc_vxc_pw_create                    60 11.3    0.039    0.050    7.156    7.192
 make_m2s                          2890 13.2    0.078    0.088    6.600    7.174
 make_images                       2890 14.2    0.240    0.259    6.494    7.070
 acc_transpose_blocks             69360 15.2    0.356    0.377    6.053    6.573
 xc_pw_derive                       510 13.4    0.005    0.006    6.436    6.508
 mp_alltoall_z22v                  2340 17.7    5.855    6.156    5.855    6.156
 cp_dbcsr_sm_fm_multiply             15  9.3    0.001    0.002    6.019    6.036
 calculate_first_density_matrix       1  7.0    0.000    0.003    5.562    5.580
 cp_dbcsr_sm_fm_multiply_core        15 10.3    0.000    0.001    5.502    5.551
 multiply_cannon_metrocomm4       67915 15.2    0.185    0.202    2.016    4.994
 mp_waitany                        7680 13.5    4.461    4.948    4.461    4.948
 potential_pw2rs                     60 12.3    0.003    0.004    4.845    4.878
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="601", plot="h2o_512_md", label="(64n/12r/1t)", y=230.153000, yerr=0.000000
PlotPoint: name="602", plot="h2o_512_md_mem", label="(64n/12r/1t)", y=562.600000, yerr=3.322650
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


========= END RESULTS ===========

CommitSHA: cd366a4022ad0ce9b8ea778ed4e993fe588c3c12
Summary: empty
Status: OK